// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corporation, 2021
 *
 * Author: Mike Rapoport <rppt@linux.ibm.com>
 */
| 7 | |
| 8 | #include <linux/mm.h> |
| 9 | #include <linux/fs.h> |
| 10 | #include <linux/swap.h> |
| 11 | #include <linux/mount.h> |
| 12 | #include <linux/memfd.h> |
| 13 | #include <linux/bitops.h> |
| 14 | #include <linux/printk.h> |
| 15 | #include <linux/pagemap.h> |
| 16 | #include <linux/syscalls.h> |
| 17 | #include <linux/pseudo_fs.h> |
| 18 | #include <linux/secretmem.h> |
| 19 | #include <linux/set_memory.h> |
| 20 | #include <linux/sched/signal.h> |
| 21 | |
| 22 | #include <uapi/linux/magic.h> |
| 23 | |
| 24 | #include <asm/tlbflush.h> |
| 25 | |
| 26 | #include "internal.h" |
| 27 | |
/* Prefix every message printed through the pr_*() helpers in this file. */
#undef pr_fmt
#define pr_fmt(fmt) "secretmem: " fmt
| 30 | |
/*
 * Masks used to validate the arguments of the system call.  The mode mask
 * is currently empty, and the flags mask is defined to be the same value.
 */
#define SECRETMEM_MODE_MASK	(0x0)
#define SECRETMEM_FLAGS_MASK	SECRETMEM_MODE_MASK
| 37 | |
Mike Rapoport (IBM) | b758fe6 | 2023-05-15 11:34:00 +0300 | [diff] [blame] | 38 | static bool secretmem_enable __ro_after_init = 1; |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 39 | module_param_named(enable, secretmem_enable, bool, 0400); |
| 40 | MODULE_PARM_DESC(secretmem_enable, |
| 41 | "Enable secretmem and memfd_secret(2) system call"); |
| 42 | |
Linus Torvalds | 87066fd | 2021-10-24 09:48:33 -1000 | [diff] [blame] | 43 | static atomic_t secretmem_users; |
Mike Rapoport | 9a436f8 | 2021-07-07 18:08:07 -0700 | [diff] [blame] | 44 | |
| 45 | bool secretmem_active(void) |
| 46 | { |
Linus Torvalds | 87066fd | 2021-10-24 09:48:33 -1000 | [diff] [blame] | 47 | return !!atomic_read(&secretmem_users); |
Mike Rapoport | 9a436f8 | 2021-07-07 18:08:07 -0700 | [diff] [blame] | 48 | } |
| 49 | |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 50 | static vm_fault_t secretmem_fault(struct vm_fault *vmf) |
| 51 | { |
| 52 | struct address_space *mapping = vmf->vma->vm_file->f_mapping; |
| 53 | struct inode *inode = file_inode(vmf->vma->vm_file); |
| 54 | pgoff_t offset = vmf->pgoff; |
| 55 | gfp_t gfp = vmf->gfp_mask; |
| 56 | unsigned long addr; |
| 57 | struct page *page; |
ZhangPeng | 7e2fca5 | 2023-08-12 14:26:12 +0800 | [diff] [blame] | 58 | struct folio *folio; |
Mike Rapoport | 84ac0130 | 2022-07-07 19:56:50 +0300 | [diff] [blame] | 59 | vm_fault_t ret; |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 60 | int err; |
| 61 | |
| 62 | if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) |
| 63 | return vmf_error(-EINVAL); |
| 64 | |
Mike Rapoport | 84ac0130 | 2022-07-07 19:56:50 +0300 | [diff] [blame] | 65 | filemap_invalidate_lock_shared(mapping); |
| 66 | |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 67 | retry: |
| 68 | page = find_lock_page(mapping, offset); |
| 69 | if (!page) { |
ZhangPeng | 7e2fca5 | 2023-08-12 14:26:12 +0800 | [diff] [blame] | 70 | folio = folio_alloc(gfp | __GFP_ZERO, 0); |
| 71 | if (!folio) { |
Mike Rapoport | 84ac0130 | 2022-07-07 19:56:50 +0300 | [diff] [blame] | 72 | ret = VM_FAULT_OOM; |
| 73 | goto out; |
| 74 | } |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 75 | |
ZhangPeng | 7e2fca5 | 2023-08-12 14:26:12 +0800 | [diff] [blame] | 76 | page = &folio->page; |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 77 | err = set_direct_map_invalid_noflush(page); |
| 78 | if (err) { |
ZhangPeng | 7e2fca5 | 2023-08-12 14:26:12 +0800 | [diff] [blame] | 79 | folio_put(folio); |
Mike Rapoport | 84ac0130 | 2022-07-07 19:56:50 +0300 | [diff] [blame] | 80 | ret = vmf_error(err); |
| 81 | goto out; |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 82 | } |
| 83 | |
ZhangPeng | 7e2fca5 | 2023-08-12 14:26:12 +0800 | [diff] [blame] | 84 | __folio_mark_uptodate(folio); |
| 85 | err = filemap_add_folio(mapping, folio, offset, gfp); |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 86 | if (unlikely(err)) { |
ZhangPeng | 7e2fca5 | 2023-08-12 14:26:12 +0800 | [diff] [blame] | 87 | folio_put(folio); |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 88 | /* |
| 89 | * If a split of large page was required, it |
| 90 | * already happened when we marked the page invalid |
| 91 | * which guarantees that this call won't fail |
| 92 | */ |
| 93 | set_direct_map_default_noflush(page); |
| 94 | if (err == -EEXIST) |
| 95 | goto retry; |
| 96 | |
Mike Rapoport | 84ac0130 | 2022-07-07 19:56:50 +0300 | [diff] [blame] | 97 | ret = vmf_error(err); |
| 98 | goto out; |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 99 | } |
| 100 | |
| 101 | addr = (unsigned long)page_address(page); |
| 102 | flush_tlb_kernel_range(addr, addr + PAGE_SIZE); |
| 103 | } |
| 104 | |
| 105 | vmf->page = page; |
Mike Rapoport | 84ac0130 | 2022-07-07 19:56:50 +0300 | [diff] [blame] | 106 | ret = VM_FAULT_LOCKED; |
| 107 | |
| 108 | out: |
| 109 | filemap_invalidate_unlock_shared(mapping); |
| 110 | return ret; |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 111 | } |
| 112 | |
| 113 | static const struct vm_operations_struct secretmem_vm_ops = { |
| 114 | .fault = secretmem_fault, |
| 115 | }; |
| 116 | |
Mike Rapoport | 9a436f8 | 2021-07-07 18:08:07 -0700 | [diff] [blame] | 117 | static int secretmem_release(struct inode *inode, struct file *file) |
| 118 | { |
Linus Torvalds | 87066fd | 2021-10-24 09:48:33 -1000 | [diff] [blame] | 119 | atomic_dec(&secretmem_users); |
Mike Rapoport | 9a436f8 | 2021-07-07 18:08:07 -0700 | [diff] [blame] | 120 | return 0; |
| 121 | } |
| 122 | |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 123 | static int secretmem_mmap(struct file *file, struct vm_area_struct *vma) |
| 124 | { |
| 125 | unsigned long len = vma->vm_end - vma->vm_start; |
| 126 | |
| 127 | if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) |
| 128 | return -EINVAL; |
| 129 | |
Andrew Morton | b0cc5e8 | 2023-05-22 13:52:10 -0700 | [diff] [blame] | 130 | if (!mlock_future_ok(vma->vm_mm, vma->vm_flags | VM_LOCKED, len)) |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 131 | return -EAGAIN; |
| 132 | |
Suren Baghdasaryan | 1c71222 | 2023-01-26 11:37:49 -0800 | [diff] [blame] | 133 | vm_flags_set(vma, VM_LOCKED | VM_DONTDUMP); |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 134 | vma->vm_ops = &secretmem_vm_ops; |
| 135 | |
| 136 | return 0; |
| 137 | } |
| 138 | |
| 139 | bool vma_is_secretmem(struct vm_area_struct *vma) |
| 140 | { |
| 141 | return vma->vm_ops == &secretmem_vm_ops; |
| 142 | } |
| 143 | |
| 144 | static const struct file_operations secretmem_fops = { |
Mike Rapoport | 9a436f8 | 2021-07-07 18:08:07 -0700 | [diff] [blame] | 145 | .release = secretmem_release, |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 146 | .mmap = secretmem_mmap, |
| 147 | }; |
| 148 | |
Matthew Wilcox (Oracle) | 5409548 | 2022-06-06 11:30:43 -0400 | [diff] [blame] | 149 | static int secretmem_migrate_folio(struct address_space *mapping, |
| 150 | struct folio *dst, struct folio *src, enum migrate_mode mode) |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 151 | { |
| 152 | return -EBUSY; |
| 153 | } |
| 154 | |
Matthew Wilcox (Oracle) | 6612ed2 | 2022-05-02 01:47:42 -0400 | [diff] [blame] | 155 | static void secretmem_free_folio(struct folio *folio) |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 156 | { |
Matthew Wilcox (Oracle) | 6612ed2 | 2022-05-02 01:47:42 -0400 | [diff] [blame] | 157 | set_direct_map_default_noflush(&folio->page); |
| 158 | folio_zero_segment(folio, 0, folio_size(folio)); |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 159 | } |
| 160 | |
| 161 | const struct address_space_operations secretmem_aops = { |
Matthew Wilcox (Oracle) | 46de8b97 | 2022-02-09 20:22:13 +0000 | [diff] [blame] | 162 | .dirty_folio = noop_dirty_folio, |
Matthew Wilcox (Oracle) | 6612ed2 | 2022-05-02 01:47:42 -0400 | [diff] [blame] | 163 | .free_folio = secretmem_free_folio, |
Matthew Wilcox (Oracle) | 5409548 | 2022-06-06 11:30:43 -0400 | [diff] [blame] | 164 | .migrate_folio = secretmem_migrate_folio, |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 165 | }; |
| 166 | |
Christian Brauner | c1632a0 | 2023-01-13 12:49:11 +0100 | [diff] [blame] | 167 | static int secretmem_setattr(struct mnt_idmap *idmap, |
Axel Rasmussen | f9b141f | 2022-04-14 19:13:31 -0700 | [diff] [blame] | 168 | struct dentry *dentry, struct iattr *iattr) |
| 169 | { |
| 170 | struct inode *inode = d_inode(dentry); |
Mike Rapoport | 84ac0130 | 2022-07-07 19:56:50 +0300 | [diff] [blame] | 171 | struct address_space *mapping = inode->i_mapping; |
Axel Rasmussen | f9b141f | 2022-04-14 19:13:31 -0700 | [diff] [blame] | 172 | unsigned int ia_valid = iattr->ia_valid; |
Mike Rapoport | 84ac0130 | 2022-07-07 19:56:50 +0300 | [diff] [blame] | 173 | int ret; |
| 174 | |
| 175 | filemap_invalidate_lock(mapping); |
Axel Rasmussen | f9b141f | 2022-04-14 19:13:31 -0700 | [diff] [blame] | 176 | |
| 177 | if ((ia_valid & ATTR_SIZE) && inode->i_size) |
Mike Rapoport | 84ac0130 | 2022-07-07 19:56:50 +0300 | [diff] [blame] | 178 | ret = -EINVAL; |
| 179 | else |
Christian Brauner | c1632a0 | 2023-01-13 12:49:11 +0100 | [diff] [blame] | 180 | ret = simple_setattr(idmap, dentry, iattr); |
Axel Rasmussen | f9b141f | 2022-04-14 19:13:31 -0700 | [diff] [blame] | 181 | |
Mike Rapoport | 84ac0130 | 2022-07-07 19:56:50 +0300 | [diff] [blame] | 182 | filemap_invalidate_unlock(mapping); |
| 183 | |
| 184 | return ret; |
Axel Rasmussen | f9b141f | 2022-04-14 19:13:31 -0700 | [diff] [blame] | 185 | } |
| 186 | |
| 187 | static const struct inode_operations secretmem_iops = { |
| 188 | .setattr = secretmem_setattr, |
| 189 | }; |
| 190 | |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 191 | static struct vfsmount *secretmem_mnt; |
| 192 | |
| 193 | static struct file *secretmem_file_create(unsigned long flags) |
| 194 | { |
Colin Ian King | 98001fd | 2023-01-16 16:43:32 +0000 | [diff] [blame] | 195 | struct file *file; |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 196 | struct inode *inode; |
Christian Göttsche | 2bfe15c | 2022-01-25 15:33:04 +0100 | [diff] [blame] | 197 | const char *anon_name = "[secretmem]"; |
| 198 | const struct qstr qname = QSTR_INIT(anon_name, strlen(anon_name)); |
| 199 | int err; |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 200 | |
| 201 | inode = alloc_anon_inode(secretmem_mnt->mnt_sb); |
| 202 | if (IS_ERR(inode)) |
| 203 | return ERR_CAST(inode); |
| 204 | |
Christian Göttsche | 2bfe15c | 2022-01-25 15:33:04 +0100 | [diff] [blame] | 205 | err = security_inode_init_security_anon(inode, &qname, NULL); |
| 206 | if (err) { |
| 207 | file = ERR_PTR(err); |
| 208 | goto err_free_inode; |
| 209 | } |
| 210 | |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 211 | file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem", |
| 212 | O_RDWR, &secretmem_fops); |
| 213 | if (IS_ERR(file)) |
| 214 | goto err_free_inode; |
| 215 | |
| 216 | mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); |
| 217 | mapping_set_unevictable(inode->i_mapping); |
| 218 | |
Axel Rasmussen | f9b141f | 2022-04-14 19:13:31 -0700 | [diff] [blame] | 219 | inode->i_op = &secretmem_iops; |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 220 | inode->i_mapping->a_ops = &secretmem_aops; |
| 221 | |
| 222 | /* pretend we are a normal file with zero size */ |
| 223 | inode->i_mode |= S_IFREG; |
| 224 | inode->i_size = 0; |
| 225 | |
| 226 | return file; |
| 227 | |
| 228 | err_free_inode: |
| 229 | iput(inode); |
| 230 | return file; |
| 231 | } |
| 232 | |
| 233 | SYSCALL_DEFINE1(memfd_secret, unsigned int, flags) |
| 234 | { |
| 235 | struct file *file; |
| 236 | int fd, err; |
| 237 | |
| 238 | /* make sure local flags do not confict with global fcntl.h */ |
| 239 | BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC); |
| 240 | |
| 241 | if (!secretmem_enable) |
| 242 | return -ENOSYS; |
| 243 | |
| 244 | if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC)) |
| 245 | return -EINVAL; |
Matthew Wilcox (Oracle) | cb68543 | 2021-10-25 19:16:34 +0100 | [diff] [blame] | 246 | if (atomic_read(&secretmem_users) < 0) |
| 247 | return -ENFILE; |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 248 | |
| 249 | fd = get_unused_fd_flags(flags & O_CLOEXEC); |
| 250 | if (fd < 0) |
| 251 | return fd; |
| 252 | |
| 253 | file = secretmem_file_create(flags); |
| 254 | if (IS_ERR(file)) { |
| 255 | err = PTR_ERR(file); |
| 256 | goto err_put_fd; |
| 257 | } |
| 258 | |
| 259 | file->f_flags |= O_LARGEFILE; |
| 260 | |
Linus Torvalds | 87066fd | 2021-10-24 09:48:33 -1000 | [diff] [blame] | 261 | atomic_inc(&secretmem_users); |
Kees Cook | 855d444 | 2021-10-28 14:36:21 -0700 | [diff] [blame] | 262 | fd_install(fd, file); |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 263 | return fd; |
| 264 | |
| 265 | err_put_fd: |
| 266 | put_unused_fd(fd); |
| 267 | return err; |
| 268 | } |
| 269 | |
| 270 | static int secretmem_init_fs_context(struct fs_context *fc) |
| 271 | { |
| 272 | return init_pseudo(fc, SECRETMEM_MAGIC) ? 0 : -ENOMEM; |
| 273 | } |
| 274 | |
| 275 | static struct file_system_type secretmem_fs = { |
| 276 | .name = "secretmem", |
| 277 | .init_fs_context = secretmem_init_fs_context, |
| 278 | .kill_sb = kill_anon_super, |
| 279 | }; |
| 280 | |
Xiu Jianfeng | 1ea4159 | 2022-09-15 09:16:02 +0800 | [diff] [blame] | 281 | static int __init secretmem_init(void) |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 282 | { |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 283 | if (!secretmem_enable) |
Xiu Jianfeng | f7c5b1a | 2022-09-20 09:22:05 +0800 | [diff] [blame] | 284 | return 0; |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 285 | |
| 286 | secretmem_mnt = kern_mount(&secretmem_fs); |
| 287 | if (IS_ERR(secretmem_mnt)) |
Binyi Han | 4eb5bbd | 2022-09-04 00:46:47 -0700 | [diff] [blame] | 288 | return PTR_ERR(secretmem_mnt); |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 289 | |
| 290 | /* prevent secretmem mappings from ever getting PROT_EXEC */ |
| 291 | secretmem_mnt->mnt_flags |= MNT_NOEXEC; |
| 292 | |
Xiu Jianfeng | f7c5b1a | 2022-09-20 09:22:05 +0800 | [diff] [blame] | 293 | return 0; |
Mike Rapoport | 1507f51 | 2021-07-07 18:08:03 -0700 | [diff] [blame] | 294 | } |
| 295 | fs_initcall(secretmem_init); |