| // SPDX-License-Identifier: GPL-2.0 |
| #include <linux/backing-dev.h> |
| #include <linux/falloc.h> |
| #include <linux/kvm_host.h> |
| #include <linux/pagemap.h> |
| #include <linux/anon_inodes.h> |
| |
| #include "kvm_mm.h" |
| |
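| /* |
| * Per-file state for a guest_memfd instance: |
| * @kvm:      the VM that owns this file (a reference is held from creation |
| *            until kvm_gmem_release()). |
| * @bindings: xarray keyed by page offset into the file, pointing at the |
| *            memslot (if any) bound to that offset. |
| * @entry:    links this instance into the backing inode's |
| *            i_mapping->i_private_list, which the invalidation paths walk. |
| */ |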
| struct kvm_gmem { |
| struct kvm *kvm; |
| struct xarray bindings; |
| struct list_head entry; |
| }; |
| |
| /** |
| * folio_file_pfn - like folio_file_page, but return a pfn. |
| * @folio: The folio which contains this index. |
| * @index: The index we want to look up. |
| * |
| * Return: The pfn for this index. |
| */ |
| static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index) |
| { |
| return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1)); |
| } |
| |
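| /* |
| * Give the architecture a chance to prepare the page at @index for guest |
| * use, e.g. by updating platform-specific page-ownership state. |
| * |
| * Worked example (hypothetical numbers): with slot->base_gfn = 0x100 and |
| * slot->gmem.pgoff = 0x10, index 0x33 corresponds to gfn |
| * 0x100 + 0x33 - 0x10 = 0x123; for an order-0 folio, folio_file_pfn() |
| * simply returns the folio's own pfn. |
| */ |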
| static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, |
| pgoff_t index, struct folio *folio) |
| { |
| #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE |
| kvm_pfn_t pfn = folio_file_pfn(folio, index); |
| gfn_t gfn = slot->base_gfn + index - slot->gmem.pgoff; |
| int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio)); |
| if (rc) { |
| pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n", |
| index, gfn, pfn, rc); |
| return rc; |
| } |
| #endif |
| |
| return 0; |
| } |
| |
| static inline void kvm_gmem_mark_prepared(struct folio *folio) |
| { |
| folio_mark_uptodate(folio); |
| } |
| |
| /* |
| * Process @folio, which contains @gfn, so that the guest can use it. |
| * The folio must be locked and the gfn must be contained in @slot. |
| * On successful return the guest sees a zero page, so no host data is |
| * leaked, and the folio's up-to-date flag is set. |
| */ |
| static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, |
| gfn_t gfn, struct folio *folio) |
| { |
| unsigned long nr_pages, i; |
| pgoff_t index; |
| int r; |
| |
| nr_pages = folio_nr_pages(folio); |
| for (i = 0; i < nr_pages; i++) |
| clear_highpage(folio_page(folio, i)); |
| |
| /* |
| * Preparing huge folios should always be safe, since it should |
| * be possible to split them later if needed. |
| * |
| * Right now the folio order is always going to be zero, but the |
| * code is ready for huge folios. The only assumption is that |
| * the base pgoff of memslots is naturally aligned with the |
| * requested page order, ensuring that huge folios can also use |
| * huge page table entries for GPA->HPA mapping. |
| * |
| * The order will be passed when creating the guest_memfd, and |
| * checked when creating memslots. |
| */ |
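| /* |
| * For example (hypothetical numbers): gfn 0x123 in a slot with |
| * base_gfn = 0x100 and gmem.pgoff = 0x10 gives index |
| * 0x123 - 0x100 + 0x10 = 0x33; for an order-0 folio ALIGN_DOWN() is a |
| * no-op, while a huge folio would round the index down to its first page. |
| */ |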
| WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, 1 << folio_order(folio))); |
| index = gfn - slot->base_gfn + slot->gmem.pgoff; |
| index = ALIGN_DOWN(index, 1 << folio_order(folio)); |
| r = __kvm_gmem_prepare_folio(kvm, slot, index, folio); |
| if (!r) |
| kvm_gmem_mark_prepared(folio); |
| |
| return r; |
| } |
| |
| /* |
| * Returns a locked folio on success. The caller is responsible for |
| * setting the up-to-date flag before the memory is mapped into the guest. |
| * There is no backing storage for the memory, so the folio will remain |
| * up-to-date until it's removed. |
| * |
| * Ignore accessed, referenced, and dirty flags. The memory is |
| * unevictable and there is no storage to write back to. |
| */ |
| static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) |
| { |
| /* TODO: Support huge pages. */ |
| return filemap_grab_folio(inode->i_mapping, index); |
| } |
| |
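| /* |
| * Zap the secondary page-table entries (SPTEs) for every memslot bound to |
| * the file range [start, end) and open an MMU invalidation window so that |
| * concurrent faults retry.  Each call must be paired with a call to |
| * kvm_gmem_invalidate_end(). |
| */ |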
| static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start, |
| pgoff_t end) |
| { |
| bool flush = false, found_memslot = false; |
| struct kvm_memory_slot *slot; |
| struct kvm *kvm = gmem->kvm; |
| unsigned long index; |
| |
| xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) { |
| pgoff_t pgoff = slot->gmem.pgoff; |
| |
| struct kvm_gfn_range gfn_range = { |
| .start = slot->base_gfn + max(pgoff, start) - pgoff, |
| .end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff, |
| .slot = slot, |
| .may_block = true, |
| }; |
| |
| if (!found_memslot) { |
| found_memslot = true; |
| |
| KVM_MMU_LOCK(kvm); |
| kvm_mmu_invalidate_begin(kvm); |
| } |
| |
| flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range); |
| } |
| |
| if (flush) |
| kvm_flush_remote_tlbs(kvm); |
| |
| if (found_memslot) |
| KVM_MMU_UNLOCK(kvm); |
| } |
| |
| static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start, |
| pgoff_t end) |
| { |
| struct kvm *kvm = gmem->kvm; |
| |
| if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) { |
| KVM_MMU_LOCK(kvm); |
| kvm_mmu_invalidate_end(kvm); |
| KVM_MMU_UNLOCK(kvm); |
| } |
| } |
| |
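| /* |
| * FALLOC_FL_PUNCH_HOLE: zap guest mappings for the affected range in every |
| * gmem instance backed by this inode, then free the backing pages.  A |
| * punched range is re-allocated, zeroed and re-prepared on its next use. |
| */ |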
| static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len) |
| { |
| struct list_head *gmem_list = &inode->i_mapping->i_private_list; |
| pgoff_t start = offset >> PAGE_SHIFT; |
| pgoff_t end = (offset + len) >> PAGE_SHIFT; |
| struct kvm_gmem *gmem; |
| |
| /* |
| * Bindings must be stable across invalidation to ensure the start+end |
| * are balanced. |
| */ |
| filemap_invalidate_lock(inode->i_mapping); |
| |
| list_for_each_entry(gmem, gmem_list, entry) |
| kvm_gmem_invalidate_begin(gmem, start, end); |
| |
| truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1); |
| |
| list_for_each_entry(gmem, gmem_list, entry) |
| kvm_gmem_invalidate_end(gmem, start, end); |
| |
| filemap_invalidate_unlock(inode->i_mapping); |
| |
| return 0; |
| } |
| |
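| /* |
| * Default fallocate() mode, i.e. preallocation: populate the page cache |
| * for the requested byte range.  The range must fit within i_size, as a |
| * guest_memfd file can never grow. |
| */ |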
| static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len) |
| { |
| struct address_space *mapping = inode->i_mapping; |
| pgoff_t start, index, end; |
| int r; |
| |
| /* guest_memfd files are fixed-size; allocation cannot extend the file. */ |
| if (offset + len > i_size_read(inode)) |
| return -EINVAL; |
| |
| filemap_invalidate_lock_shared(mapping); |
| |
| start = offset >> PAGE_SHIFT; |
| end = (offset + len) >> PAGE_SHIFT; |
| |
| r = 0; |
| for (index = start; index < end; ) { |
| struct folio *folio; |
| |
| if (signal_pending(current)) { |
| r = -EINTR; |
| break; |
| } |
| |
| folio = kvm_gmem_get_folio(inode, index); |
| if (IS_ERR(folio)) { |
| r = PTR_ERR(folio); |
| break; |
| } |
| |
| index = folio_next_index(folio); |
| |
| folio_unlock(folio); |
| folio_put(folio); |
| |
| /* 64-bit only, wrapping the index should be impossible. */ |
| if (WARN_ON_ONCE(!index)) |
| break; |
| |
| cond_resched(); |
| } |
| |
| filemap_invalidate_unlock_shared(mapping); |
| |
| return r; |
| } |
| |
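| /* |
| * Illustrative userspace usage (hypothetical fd/size/offset values), given |
| * a guest_memfd obtained via the KVM_CREATE_GUEST_MEMFD ioctl: |
| * |
| *   fallocate(gmem_fd, FALLOC_FL_KEEP_SIZE, 0, size);      // preallocate |
| *   fallocate(gmem_fd, FALLOC_FL_KEEP_SIZE | |
| *             FALLOC_FL_PUNCH_HOLE, offset, len);          // free a range |
| * |
| * FALLOC_FL_KEEP_SIZE is mandatory in both cases, and offset/len must be |
| * page-aligned. |
| */ |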
| static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset, |
| loff_t len) |
| { |
| int ret; |
| |
| if (!(mode & FALLOC_FL_KEEP_SIZE)) |
| return -EOPNOTSUPP; |
| |
| if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) |
| return -EOPNOTSUPP; |
| |
| if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len)) |
| return -EINVAL; |
| |
| if (mode & FALLOC_FL_PUNCH_HOLE) |
| ret = kvm_gmem_punch_hole(file_inode(file), offset, len); |
| else |
| ret = kvm_gmem_allocate(file_inode(file), offset, len); |
| |
| if (!ret) |
| file_modified(file); |
| return ret; |
| } |
| |
| static int kvm_gmem_release(struct inode *inode, struct file *file) |
| { |
| struct kvm_gmem *gmem = file->private_data; |
| struct kvm_memory_slot *slot; |
| struct kvm *kvm = gmem->kvm; |
| unsigned long index; |
| |
| /* |
| * Prevent concurrent attempts to *unbind* a memslot. This is the last |
| * reference to the file and thus no new bindings can be created, but |
| * dereferencing the slot for existing bindings needs to be protected |
| * against memslot updates, specifically so that unbind doesn't race |
| * and free the memslot (kvm_gmem_get_file() will return NULL). |
| */ |
| mutex_lock(&kvm->slots_lock); |
| |
| filemap_invalidate_lock(inode->i_mapping); |
| |
| xa_for_each(&gmem->bindings, index, slot) |
| rcu_assign_pointer(slot->gmem.file, NULL); |
| |
| synchronize_rcu(); |
| |
| /* |
| * All in-flight operations are gone and new bindings can be created. |
| * Zap all SPTEs pointed at by this file. Do not free the backing |
| * memory, as its lifetime is associated with the inode, not the file. |
| */ |
| kvm_gmem_invalidate_begin(gmem, 0, -1ul); |
| kvm_gmem_invalidate_end(gmem, 0, -1ul); |
| |
| list_del(&gmem->entry); |
| |
| filemap_invalidate_unlock(inode->i_mapping); |
| |
| mutex_unlock(&kvm->slots_lock); |
| |
| xa_destroy(&gmem->bindings); |
| kfree(gmem); |
| |
| kvm_put_kvm(kvm); |
| |
| return 0; |
| } |
| |
| static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot) |
| { |
| /* |
| * Do not return slot->gmem.file if it has already been closed; |
| * there might be some time between the last fput() and when |
| * kvm_gmem_release() clears slot->gmem.file, and callers should |
| * not spin in the meantime. |
| */ |
| return get_file_active(&slot->gmem.file); |
| } |
| |
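| /* |
| * Intentionally not const: kvm_gmem_init() fills in .owner at runtime so |
| * that an open guest_memfd keeps the module providing these ops loaded. |
| */ |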
| static struct file_operations kvm_gmem_fops = { |
| .open = generic_file_open, |
| .release = kvm_gmem_release, |
| .fallocate = kvm_gmem_fallocate, |
| }; |
| |
| void kvm_gmem_init(struct module *module) |
| { |
| kvm_gmem_fops.owner = module; |
| } |
| |
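| /* guest_memfd folios are unmovable, so migration should never be attempted. */ |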
| static int kvm_gmem_migrate_folio(struct address_space *mapping, |
| struct folio *dst, struct folio *src, |
| enum migrate_mode mode) |
| { |
| WARN_ON_ONCE(1); |
| return -EINVAL; |
| } |
| |
| static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio) |
| { |
| struct list_head *gmem_list = &mapping->i_private_list; |
| struct kvm_gmem *gmem; |
| pgoff_t start, end; |
| |
| filemap_invalidate_lock_shared(mapping); |
| |
| start = folio->index; |
| end = start + folio_nr_pages(folio); |
| |
| list_for_each_entry(gmem, gmem_list, entry) |
| kvm_gmem_invalidate_begin(gmem, start, end); |
| |
| /* |
| * Do not truncate the range, what action is taken in response to the |
| * error is userspace's decision (assuming the architecture supports |
| * gracefully handling memory errors). If/when the guest attempts to |
| * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON, |
| * at which point KVM can either terminate the VM or propagate the |
| * error to userspace. |
| */ |
| |
| list_for_each_entry(gmem, gmem_list, entry) |
| kvm_gmem_invalidate_end(gmem, start, end); |
| |
| filemap_invalidate_unlock_shared(mapping); |
| |
| return MF_DELAYED; |
| } |
| |
| #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE |
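| /* |
| * Called as a folio is freed back to the allocator.  Gives the architecture |
| * a chance to return the memory to a host-usable state first, e.g. by |
| * undoing platform-specific "private" page state. |
| */ |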
| static void kvm_gmem_free_folio(struct folio *folio) |
| { |
| struct page *page = folio_page(folio, 0); |
| kvm_pfn_t pfn = page_to_pfn(page); |
| int order = folio_order(folio); |
| |
| kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order)); |
| } |
| #endif |
| |
| static const struct address_space_operations kvm_gmem_aops = { |
| .dirty_folio = noop_dirty_folio, |
| .migrate_folio = kvm_gmem_migrate_folio, |
| .error_remove_folio = kvm_gmem_error_folio, |
| #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE |
| .free_folio = kvm_gmem_free_folio, |
| #endif |
| }; |
| |
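| /* |
| * guest_memfd inodes expose only generic attributes, and all attribute |
| * changes (including truncation) are rejected: the size is fixed at |
| * creation time. |
| */ |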
| static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path, |
| struct kstat *stat, u32 request_mask, |
| unsigned int query_flags) |
| { |
| struct inode *inode = path->dentry->d_inode; |
| |
| generic_fillattr(idmap, request_mask, inode, stat); |
| return 0; |
| } |
| |
| static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry, |
| struct iattr *attr) |
| { |
| return -EINVAL; |
| } |
| |
| static const struct inode_operations kvm_gmem_iops = { |
| .getattr = kvm_gmem_getattr, |
| .setattr = kvm_gmem_setattr, |
| }; |
| |
| static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) |
| { |
| const char *anon_name = "[kvm-gmem]"; |
| struct kvm_gmem *gmem; |
| struct inode *inode; |
| struct file *file; |
| int fd, err; |
| |
| fd = get_unused_fd_flags(0); |
| if (fd < 0) |
| return fd; |
| |
| gmem = kzalloc(sizeof(*gmem), GFP_KERNEL); |
| if (!gmem) { |
| err = -ENOMEM; |
| goto err_fd; |
| } |
| |
| file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem, |
| O_RDWR, NULL); |
| if (IS_ERR(file)) { |
| err = PTR_ERR(file); |
| goto err_gmem; |
| } |
| |
| file->f_flags |= O_LARGEFILE; |
| |
| inode = file->f_inode; |
| WARN_ON(file->f_mapping != inode->i_mapping); |
| |
| inode->i_private = (void *)(unsigned long)flags; |
| inode->i_op = &kvm_gmem_iops; |
| inode->i_mapping->a_ops = &kvm_gmem_aops; |
| inode->i_mode |= S_IFREG; |
| inode->i_size = size; |
| mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); |
| mapping_set_inaccessible(inode->i_mapping); |
| /* Unmovable mappings are supposed to be marked unevictable as well. */ |
| WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping)); |
| |
| kvm_get_kvm(kvm); |
| gmem->kvm = kvm; |
| xa_init(&gmem->bindings); |
| list_add(&gmem->entry, &inode->i_mapping->i_private_list); |
| |
| fd_install(fd, file); |
| return fd; |
| |
| err_gmem: |
| kfree(gmem); |
| err_fd: |
| put_unused_fd(fd); |
| return err; |
| } |
| |
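| /* |
| * Handler for the KVM_CREATE_GUEST_MEMFD ioctl: validate the arguments (no |
| * flags are currently supported, and the size must be a non-zero multiple |
| * of PAGE_SIZE) and create the backing file. |
| */ |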
| int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args) |
| { |
| loff_t size = args->size; |
| u64 flags = args->flags; |
| u64 valid_flags = 0; |
| |
| if (flags & ~valid_flags) |
| return -EINVAL; |
| |
| if (size <= 0 || !PAGE_ALIGNED(size)) |
| return -EINVAL; |
| |
| return __kvm_gmem_create(kvm, size, flags); |
| } |
| |
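| /* |
| * Bind @slot to the range of the guest_memfd given by @fd starting at byte |
| * @offset.  The file must belong to @kvm, the range must fit within the |
| * file, and any given range of a file can be bound to at most one memslot. |
| */ |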
| int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, |
| unsigned int fd, loff_t offset) |
| { |
| loff_t size = slot->npages << PAGE_SHIFT; |
| unsigned long start, end; |
| struct kvm_gmem *gmem; |
| struct inode *inode; |
| struct file *file; |
| int r = -EINVAL; |
| |
| BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff)); |
| |
| file = fget(fd); |
| if (!file) |
| return -EBADF; |
| |
| if (file->f_op != &kvm_gmem_fops) |
| goto err; |
| |
| gmem = file->private_data; |
| if (gmem->kvm != kvm) |
| goto err; |
| |
| inode = file_inode(file); |
| |
| if (offset < 0 || !PAGE_ALIGNED(offset) || |
| offset + size > i_size_read(inode)) |
| goto err; |
| |
| filemap_invalidate_lock(inode->i_mapping); |
| |
| start = offset >> PAGE_SHIFT; |
| end = start + slot->npages; |
| |
| if (!xa_empty(&gmem->bindings) && |
| xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) { |
| filemap_invalidate_unlock(inode->i_mapping); |
| goto err; |
| } |
| |
| /* |
| * No synchronize_rcu() needed: any in-flight readers are guaranteed to |
| * see either a NULL file or this new file; there is no need to wait |
| * for them to go away. |
| */ |
| rcu_assign_pointer(slot->gmem.file, file); |
| slot->gmem.pgoff = start; |
| |
| xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL); |
| filemap_invalidate_unlock(inode->i_mapping); |
| |
| /* |
| * Drop the reference to the file, even on success. The file pins KVM, |
| * not the other way 'round. Active bindings are invalidated if the |
| * file is closed before memslots are destroyed. |
| */ |
| r = 0; |
| err: |
| fput(file); |
| return r; |
| } |
| |
| void kvm_gmem_unbind(struct kvm_memory_slot *slot) |
| { |
| unsigned long start = slot->gmem.pgoff; |
| unsigned long end = start + slot->npages; |
| struct kvm_gmem *gmem; |
| struct file *file; |
| |
| /* |
| * Nothing to do if the underlying file was already closed (or is being |
| * closed right now), as kvm_gmem_release() invalidates all bindings. |
| */ |
| file = kvm_gmem_get_file(slot); |
| if (!file) |
| return; |
| |
| gmem = file->private_data; |
| |
| filemap_invalidate_lock(file->f_mapping); |
| xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL); |
| rcu_assign_pointer(slot->gmem.file, NULL); |
| synchronize_rcu(); |
| filemap_invalidate_unlock(file->f_mapping); |
| |
| fput(file); |
| } |
| |
| /* Returns a locked folio on success. */ |
| static struct folio * |
| __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, |
| gfn_t gfn, kvm_pfn_t *pfn, bool *is_prepared, |
| int *max_order) |
| { |
| pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff; |
| struct kvm_gmem *gmem = file->private_data; |
| struct folio *folio; |
| |
| if (file != slot->gmem.file) { |
| WARN_ON_ONCE(slot->gmem.file); |
| return ERR_PTR(-EFAULT); |
| } |
| |
| if (xa_load(&gmem->bindings, index) != slot) { |
| WARN_ON_ONCE(xa_load(&gmem->bindings, index)); |
| return ERR_PTR(-EIO); |
| } |
| |
| folio = kvm_gmem_get_folio(file_inode(file), index); |
| if (IS_ERR(folio)) |
| return folio; |
| |
| if (folio_test_hwpoison(folio)) { |
| folio_unlock(folio); |
| folio_put(folio); |
| return ERR_PTR(-EHWPOISON); |
| } |
| |
| *pfn = folio_file_pfn(folio, index); |
| if (max_order) |
| *max_order = 0; |
| |
| *is_prepared = folio_test_uptodate(folio); |
| return folio; |
| } |
| |
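| /* |
| * Resolve the pfn backing @gfn in @slot's guest_memfd, allocating and |
| * preparing (zeroing plus arch-specific preparation) the folio on first |
| * use.  On success the caller holds a reference on the backing page and, |
| * if @max_order is non-NULL, receives the maximum mapping order (currently |
| * always 0). |
| */ |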
| int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, |
| gfn_t gfn, kvm_pfn_t *pfn, int *max_order) |
| { |
| struct file *file = kvm_gmem_get_file(slot); |
| struct folio *folio; |
| bool is_prepared = false; |
| int r = 0; |
| |
| if (!file) |
| return -EFAULT; |
| |
| folio = __kvm_gmem_get_pfn(file, slot, gfn, pfn, &is_prepared, max_order); |
| if (IS_ERR(folio)) { |
| r = PTR_ERR(folio); |
| goto out; |
| } |
| |
| if (!is_prepared) |
| r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio); |
| |
| folio_unlock(folio); |
| if (r < 0) |
| folio_put(folio); |
| |
| out: |
| fput(file); |
| return r; |
| } |
| EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn); |
| |
| #ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM |
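| /* |
| * Populate up to @npages pages of guest_memfd starting at @start_gfn: for |
| * each chunk, look up the backing pfn and hand it, together with the |
| * matching portion of @src (if any), to @post_populate.  Intended for |
| * architecture code that builds the initial guest image (e.g. SEV-SNP |
| * launch updates).  Requires kvm->slots_lock, and only ranges already |
| * marked KVM_MEMORY_ATTRIBUTE_PRIVATE are populated.  Returns the number |
| * of pages processed, or a negative error if no progress was made. |
| */ |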
| long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages, |
| kvm_gmem_populate_cb post_populate, void *opaque) |
| { |
| struct file *file; |
| struct kvm_memory_slot *slot; |
| void __user *p; |
| |
| int ret = 0, max_order; |
| long i; |
| |
| lockdep_assert_held(&kvm->slots_lock); |
| if (npages < 0) |
| return -EINVAL; |
| |
| slot = gfn_to_memslot(kvm, start_gfn); |
| if (!kvm_slot_can_be_private(slot)) |
| return -EINVAL; |
| |
| file = kvm_gmem_get_file(slot); |
| if (!file) |
| return -EFAULT; |
| |
| filemap_invalidate_lock(file->f_mapping); |
| |
| npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages); |
| for (i = 0; i < npages; i += (1 << max_order)) { |
| struct folio *folio; |
| gfn_t gfn = start_gfn + i; |
| bool is_prepared = false; |
| kvm_pfn_t pfn; |
| |
| if (signal_pending(current)) { |
| ret = -EINTR; |
| break; |
| } |
| |
| folio = __kvm_gmem_get_pfn(file, slot, gfn, &pfn, &is_prepared, &max_order); |
| if (IS_ERR(folio)) { |
| ret = PTR_ERR(folio); |
| break; |
| } |
| |
| if (is_prepared) { |
| folio_unlock(folio); |
| folio_put(folio); |
| ret = -EEXIST; |
| break; |
| } |
| |
| folio_unlock(folio); |
| WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) || |
| (npages - i) < (1 << max_order)); |
| |
| ret = -EINVAL; |
| while (!kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order), |
| KVM_MEMORY_ATTRIBUTE_PRIVATE, |
| KVM_MEMORY_ATTRIBUTE_PRIVATE)) { |
| if (!max_order) |
| goto put_folio_and_exit; |
| max_order--; |
| } |
| |
| p = src ? src + i * PAGE_SIZE : NULL; |
| ret = post_populate(kvm, gfn, pfn, p, max_order, opaque); |
| if (!ret) |
| kvm_gmem_mark_prepared(folio); |
| |
| put_folio_and_exit: |
| folio_put(folio); |
| if (ret) |
| break; |
| } |
| |
| filemap_invalidate_unlock(file->f_mapping); |
| |
| fput(file); |
| return ret && !i ? ret : i; |
| } |
| EXPORT_SYMBOL_GPL(kvm_gmem_populate); |
| #endif |