| // SPDX-License-Identifier: GPL-2.0 |
| #include <linux/fs.h> |
| #include <linux/backing-dev.h> |
| #include <linux/falloc.h> |
| #include <linux/kvm_host.h> |
| #include <linux/pseudo_fs.h> |
| #include <linux/pagemap.h> |
| #include <linux/anon_inodes.h> |
| |
| #include "kvm_mm.h" |
| |
| static struct vfsmount *kvm_gmem_mnt; |
| |
| struct kvm_gmem { |
| struct kvm *kvm; |
| struct xarray bindings; |
| struct list_head entry; |
| }; |
| |
| struct kvm_gmem_inode_private { |
| #ifdef CONFIG_KVM_GMEM_SHARED_MEM |
| struct xarray shared_offsets; |
| rwlock_t offsets_lock; |
| #endif |
| }; |
| |
| static struct kvm_gmem_inode_private *kvm_gmem_private(struct inode *inode) |
| { |
| return inode->i_mapping->i_private_data; |
| } |
| |
| /** |
| * folio_file_pfn - like folio_file_page, but return a pfn. |
| * @folio: The folio which contains this index. |
| * @index: The index we want to look up. |
| * |
| * Return: The pfn for this index. |
| */ |
| static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index) |
| { |
| return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1)); |
| } |
| |
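| /* |
| * Invoke the arch hook that prepares the backing memory for the guest, e.g. |
| * by updating architecture-specific tracking for the pfn range that backs |
| * @index in @folio. A no-op unless CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE is set. |
| */ |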
| static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, |
| pgoff_t index, struct folio *folio) |
| { |
| #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE |
| kvm_pfn_t pfn = folio_file_pfn(folio, index); |
| gfn_t gfn = slot->base_gfn + index - slot->gmem.pgoff; |
| int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio)); |
| if (rc) { |
| pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n", |
| index, gfn, pfn, rc); |
| return rc; |
| } |
| #endif |
| |
| return 0; |
| } |
| |
| static inline void kvm_gmem_mark_prepared(struct folio *folio) |
| { |
| folio_mark_uptodate(folio); |
| } |
| |
| /* |
| * Process @folio, which contains @gfn, so that the guest can use it. |
| * The folio must be locked and the gfn must be contained in @slot. |
| * On successful return the guest sees a zero page so as to avoid |
| * leaking host data and the up-to-date flag is set. |
| */ |
| static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, |
| gfn_t gfn, struct folio *folio) |
| { |
| unsigned long nr_pages, i; |
| pgoff_t index; |
| int r; |
| |
| nr_pages = folio_nr_pages(folio); |
| for (i = 0; i < nr_pages; i++) |
| clear_highpage(folio_page(folio, i)); |
| |
| /* |
| * Preparing huge folios should always be safe, since it should |
| * be possible to split them later if needed. |
| * |
| * Right now the folio order is always going to be zero, but the |
| * code is ready for huge folios. The only assumption is that |
| * the base pgoff of memslots is naturally aligned with the |
| * requested page order, ensuring that huge folios can also use |
| * huge page table entries for GPA->HPA mapping. |
| * |
| * The order will be passed when creating the guest_memfd, and |
| * checked when creating memslots. |
| */ |
| WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, 1 << folio_order(folio))); |
| index = gfn - slot->base_gfn + slot->gmem.pgoff; |
| index = ALIGN_DOWN(index, 1 << folio_order(folio)); |
| r = __kvm_gmem_prepare_folio(kvm, slot, index, folio); |
| if (!r) |
| kvm_gmem_mark_prepared(folio); |
| |
| return r; |
| } |
| |
| /* |
| * Returns a locked folio on success. The caller is responsible for |
| * setting the up-to-date flag before the memory is mapped into the guest. |
| * There is no backing storage for the memory, so the folio will remain |
| * up-to-date until it's removed. |
| * |
| * Ignore accessed, referenced, and dirty flags. The memory is |
| * unevictable and there is no storage to write back to. |
| */ |
| static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) |
| { |
| /* TODO: Support huge pages. */ |
| return filemap_grab_folio(inode->i_mapping, index); |
| } |
| |
| #ifdef CONFIG_KVM_GMEM_SHARED_MEM |
| static void kvm_gmem_offset_range_invalidate_shared(struct inode *inode, |
| pgoff_t start, pgoff_t end); |
| #else |
| static inline void kvm_gmem_offset_range_invalidate_shared(struct inode *inode, |
| pgoff_t start, pgoff_t end) |
| { |
| } |
| |
| /* Stub so the IS_ENABLED() call site in __kvm_gmem_create() compiles. */ |
| static inline int kvm_gmem_offset_range_set_shared(struct inode *inode, |
| pgoff_t start, pgoff_t end) |
| { |
| return 0; |
| } |
| #endif |
| |
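| /* |
| * Begin invalidating [start, end) of this guest_memfd instance: zap the SPTEs |
| * of every memslot bound to the range, flush remote TLBs if anything was |
| * unmapped, and drop the shared-state tracking for the affected file offsets. |
| * Pairs with kvm_gmem_invalidate_end(). |
| */ |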
| static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start, |
| pgoff_t end) |
| { |
| bool flush = false, found_memslot = false; |
| struct kvm_memory_slot *slot; |
| struct kvm *kvm = gmem->kvm; |
| unsigned long index; |
| |
| xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) { |
| struct file *file = READ_ONCE(slot->gmem.file); |
| pgoff_t pgoff = slot->gmem.pgoff; |
| |
| struct kvm_gfn_range gfn_range = { |
| .start = slot->base_gfn + max(pgoff, start) - pgoff, |
| .end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff, |
| .slot = slot, |
| .may_block = true, |
| }; |
| |
| if (!found_memslot) { |
| found_memslot = true; |
| |
| KVM_MMU_LOCK(kvm); |
| kvm_mmu_invalidate_begin(kvm); |
| } |
| |
| flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range); |
| |
| /* |
| * If this is called after kvm_gmem_unbind(), slot->gmem.file has |
| * already been cleared: all in-flight operations are gone and the |
| * file has been closed, so there is no shared state left to |
| * invalidate. |
| */ |
| if (file) { |
| kvm_gmem_offset_range_invalidate_shared(file_inode(file), |
| max(pgoff, start), |
| min(pgoff + slot->npages, end)); |
| } |
| } |
| |
| if (flush) |
| kvm_flush_remote_tlbs(kvm); |
| |
| if (found_memslot) |
| KVM_MMU_UNLOCK(kvm); |
| } |
| |
| static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start, |
| pgoff_t end) |
| { |
| struct kvm *kvm = gmem->kvm; |
| |
| if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) { |
| KVM_MMU_LOCK(kvm); |
| kvm_mmu_invalidate_end(kvm); |
| KVM_MMU_UNLOCK(kvm); |
| } |
| } |
| |
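| /* |
| * Punch a hole in [offset, offset + len): begin invalidation for every gmem |
| * instance attached to the inode, truncate the backing folios from the page |
| * cache, then end the invalidation. |
| */ |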
| static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len) |
| { |
| struct list_head *gmem_list = &inode->i_mapping->i_private_list; |
| pgoff_t start = offset >> PAGE_SHIFT; |
| pgoff_t end = (offset + len) >> PAGE_SHIFT; |
| struct kvm_gmem *gmem; |
| |
| /* |
| * Bindings must be stable across invalidation to ensure the start+end |
| * are balanced. |
| */ |
| filemap_invalidate_lock(inode->i_mapping); |
| |
| list_for_each_entry(gmem, gmem_list, entry) |
| kvm_gmem_invalidate_begin(gmem, start, end); |
| |
| truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1); |
| |
| list_for_each_entry(gmem, gmem_list, entry) |
| kvm_gmem_invalidate_end(gmem, start, end); |
| |
| filemap_invalidate_unlock(inode->i_mapping); |
| |
| return 0; |
| } |
| |
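| /* |
| * Preallocate folios for [offset, offset + len). The range must not extend |
| * beyond the inode size, since a guest_memfd cannot be resized after creation. |
| */ |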
| static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len) |
| { |
| struct address_space *mapping = inode->i_mapping; |
| pgoff_t start, index, end; |
| int r; |
| |
| /* Dedicated guest is immutable by default. */ |
| if (offset + len > i_size_read(inode)) |
| return -EINVAL; |
| |
| filemap_invalidate_lock_shared(mapping); |
| |
| start = offset >> PAGE_SHIFT; |
| end = (offset + len) >> PAGE_SHIFT; |
| |
| r = 0; |
| for (index = start; index < end; ) { |
| struct folio *folio; |
| |
| if (signal_pending(current)) { |
| r = -EINTR; |
| break; |
| } |
| |
| folio = kvm_gmem_get_folio(inode, index); |
| if (IS_ERR(folio)) { |
| r = PTR_ERR(folio); |
| break; |
| } |
| |
| index = folio_next_index(folio); |
| |
| folio_unlock(folio); |
| folio_put(folio); |
| |
| /* 64-bit only, wrapping the index should be impossible. */ |
| if (WARN_ON_ONCE(!index)) |
| break; |
| |
| cond_resched(); |
| } |
| |
| filemap_invalidate_unlock_shared(mapping); |
| |
| return r; |
| } |
| |
| static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset, |
| loff_t len) |
| { |
| int ret; |
| |
| if (!(mode & FALLOC_FL_KEEP_SIZE)) |
| return -EOPNOTSUPP; |
| |
| if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) |
| return -EOPNOTSUPP; |
| |
| if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len)) |
| return -EINVAL; |
| |
| if (mode & FALLOC_FL_PUNCH_HOLE) |
| ret = kvm_gmem_punch_hole(file_inode(file), offset, len); |
| else |
| ret = kvm_gmem_allocate(file_inode(file), offset, len); |
| |
| if (!ret) |
| file_modified(file); |
| return ret; |
| } |
| |
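| /* |
| * Called when the last reference to the file is dropped: sever all memslot |
| * bindings, zap the SPTEs pointing at this file, and put the VM reference. |
| * The backing memory itself lives on until the inode is evicted. |
| */ |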
| static int kvm_gmem_release(struct inode *inode, struct file *file) |
| { |
| struct kvm_gmem *gmem = file->private_data; |
| struct kvm_memory_slot *slot; |
| struct kvm *kvm = gmem->kvm; |
| unsigned long index; |
| |
| /* |
| * Prevent concurrent attempts to *unbind* a memslot. This is the last |
| * reference to the file and thus no new bindings can be created, but |
| * dereferencing the slot for existing bindings needs to be protected |
| * against memslot updates, specifically so that unbind doesn't race |
| * and free the memslot (kvm_gmem_get_file() will return NULL). |
| */ |
| mutex_lock(&kvm->slots_lock); |
| |
| filemap_invalidate_lock(inode->i_mapping); |
| |
| xa_for_each(&gmem->bindings, index, slot) |
| rcu_assign_pointer(slot->gmem.file, NULL); |
| |
| synchronize_rcu(); |
| |
| /* |
| * All in-flight operations are gone and new bindings can be created. |
| * Zap all SPTEs pointed at by this file. Do not free the backing |
| * memory, as its lifetime is associated with the inode, not the file. |
| */ |
| kvm_gmem_invalidate_begin(gmem, 0, -1ul); |
| kvm_gmem_invalidate_end(gmem, 0, -1ul); |
| |
| list_del(&gmem->entry); |
| |
| filemap_invalidate_unlock(inode->i_mapping); |
| |
| mutex_unlock(&kvm->slots_lock); |
| |
| xa_destroy(&gmem->bindings); |
| kfree(gmem); |
| |
| kvm_put_kvm(kvm); |
| |
| return 0; |
| } |
| |
| static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot) |
| { |
| /* |
| * Do not return slot->gmem.file if it has already been closed; |
| * there might be some time between the last fput() and when |
| * kvm_gmem_release() clears slot->gmem.file, and you do not |
| * want to spin in the meanwhile. |
| */ |
| return get_file_active(&slot->gmem.file); |
| } |
| |
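| /* Convert a gfn within @slot to its page offset within the guest_memfd file. */ |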
| static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn) |
| { |
| return gfn - slot->base_gfn + slot->gmem.pgoff; |
| } |
| |
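| /* |
| * Final inode teardown: release the shared-offsets tracking (if it was set |
| * up), truncate any remaining folios, and free the private data. |
| */ |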
| static void kvm_gmem_evict_inode(struct inode *inode) |
| { |
| struct kvm_gmem_inode_private *private = kvm_gmem_private(inode); |
| |
| #ifdef CONFIG_KVM_GMEM_SHARED_MEM |
| /* |
| * .evict_inode can be called before private data is set up if there are |
| * issues during inode creation. |
| */ |
| if (private) |
| xa_destroy(&private->shared_offsets); |
| #endif |
| |
| truncate_inode_pages_final(inode->i_mapping); |
| |
| kfree(private); |
| clear_inode(inode); |
| } |
| |
| static const struct super_operations kvm_gmem_super_operations = { |
| .statfs = simple_statfs, |
| .evict_inode = kvm_gmem_evict_inode, |
| }; |
| |
| static int kvm_gmem_init_fs_context(struct fs_context *fc) |
| { |
| struct pseudo_fs_context *ctx; |
| |
| if (!init_pseudo(fc, GUEST_MEMORY_MAGIC)) |
| return -ENOMEM; |
| |
| ctx = fc->fs_private; |
| ctx->ops = &kvm_gmem_super_operations; |
| |
| return 0; |
| } |
| |
| static struct file_system_type kvm_gmem_fs = { |
| .name = "kvm_guest_memory", |
| .init_fs_context = kvm_gmem_init_fs_context, |
| .kill_sb = kill_anon_super, |
| }; |
| |
| static void kvm_gmem_init_mount(void) |
| { |
| kvm_gmem_mnt = kern_mount(&kvm_gmem_fs); |
| BUG_ON(IS_ERR(kvm_gmem_mnt)); |
| |
| /* For giggles. Userspace can never map this anyways. */ |
| kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC; |
| } |
| |
| #ifdef CONFIG_KVM_GMEM_SHARED_MEM |
| /* |
| * An enum of the valid folio sharing states: |
| * Bit 0: set if not shared with the guest (guest cannot fault it in) |
| * Bit 1: set if not shared with the host (host cannot fault it in) |
| */ |
| enum folio_shareability { |
| KVM_GMEM_ALL_SHARED = 0b00, /* Shared with host and guest. */ |
| KVM_GMEM_GUEST_SHARED = 0b10, /* Shared only with guest. */ |
| KVM_GMEM_NONE_SHARED = 0b11, /* Not shared, transient state. */ |
| }; |
| |
| /* |
| * Unregisters the __folio_put() callback from the folio. |
| * |
| * Restores the folio's refcount after all pending references have been released, |
| * and removes the folio type, thereby removing the callback. The folio can then |
| * be freed normally once all actual references have been dropped. |
| * |
| * Must be called with the folio locked and the offsets_lock write lock held. |
| */ |
| static void kvm_gmem_restore_pending_folio(struct folio *folio, struct inode *inode) |
| { |
| rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock; |
| |
| lockdep_assert_held_write(offsets_lock); |
| WARN_ON_ONCE(!folio_test_locked(folio)); |
| |
| if (WARN_ON_ONCE(!folio_test_guestmem(folio))) |
| return; |
| |
| __folio_clear_guestmem(folio); |
| folio_ref_add(folio, folio_nr_pages(folio)); |
| } |
| |
| static int kvm_gmem_offset_set_shared(struct inode *inode, pgoff_t index) |
| { |
| struct xarray *shared_offsets = &kvm_gmem_private(inode)->shared_offsets; |
| rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock; |
| void *xval = xa_mk_value(KVM_GMEM_ALL_SHARED); |
| |
| lockdep_assert_held_write(offsets_lock); |
| |
| /* |
| * If the folio is NONE_SHARED, it indicates that it is transitioning to |
| * private (GUEST_SHARED). Transition it to shared (ALL_SHARED) |
| * immediately, and remove the callback. |
| */ |
| if (xa_to_value(xa_load(shared_offsets, index)) == KVM_GMEM_NONE_SHARED) { |
| struct folio *folio = filemap_lock_folio(inode->i_mapping, index); |
| |
| if (WARN_ON_ONCE(IS_ERR(folio))) |
| return PTR_ERR(folio); |
| |
| if (folio_test_guestmem(folio)) |
| kvm_gmem_restore_pending_folio(folio, inode); |
| |
| folio_unlock(folio); |
| folio_put(folio); |
| } |
| |
| return xa_err(xa_store(shared_offsets, index, xval, GFP_KERNEL)); |
| } |
| |
| /* |
| * Marks the range [start, end) as shared with both the host and the guest. |
| * Called when guest shares memory with the host. |
| */ |
| static int kvm_gmem_offset_range_set_shared(struct inode *inode, |
| pgoff_t start, pgoff_t end) |
| { |
| rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock; |
| pgoff_t i; |
| int r = 0; |
| |
| write_lock(offsets_lock); |
| for (i = start; i < end; i++) { |
| r = kvm_gmem_offset_set_shared(inode, i); |
| if (WARN_ON_ONCE(r)) |
| break; |
| } |
| write_unlock(offsets_lock); |
| |
| return r; |
| } |
| |
| static int kvm_gmem_offset_clear_shared(struct inode *inode, pgoff_t index) |
| { |
| struct xarray *shared_offsets = &kvm_gmem_private(inode)->shared_offsets; |
| rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock; |
| void *xval_guest = xa_mk_value(KVM_GMEM_GUEST_SHARED); |
| void *xval_none = xa_mk_value(KVM_GMEM_NONE_SHARED); |
| struct folio *folio; |
| int refcount; |
| int r; |
| |
| lockdep_assert_held_write(offsets_lock); |
| |
| folio = filemap_lock_folio(inode->i_mapping, index); |
| if (!IS_ERR(folio)) { |
| /* +1 for the reference taken by filemap_lock_folio(). */ |
| refcount = folio_nr_pages(folio) + 1; |
| } else { |
| r = PTR_ERR(folio); |
| if (WARN_ON_ONCE(r != -ENOENT)) |
| return r; |
| |
| folio = NULL; |
| } |
| |
| if (!folio || folio_ref_freeze(folio, refcount)) { |
| /* |
| * No outstanding references: transition to guest shared. |
| */ |
| r = xa_err(xa_store(shared_offsets, index, xval_guest, GFP_KERNEL)); |
| |
| if (folio) |
| folio_ref_unfreeze(folio, refcount); |
| } else { |
| /* |
| * Outstanding references: the folio cannot be faulted in by |
| * anyone until they're dropped. |
| */ |
| r = xa_err(xa_store(shared_offsets, index, xval_none, GFP_KERNEL)); |
| } |
| |
| if (folio) { |
| folio_unlock(folio); |
| folio_put(folio); |
| } |
| |
| return r; |
| } |
| |
| /* |
| * Callback when invalidating memory that is potentially shared. |
| * |
| * Must be called with the offsets_lock write lock held. |
| */ |
| static void kvm_gmem_offset_range_invalidate_shared(struct inode *inode, |
| pgoff_t start, pgoff_t end) |
| { |
| struct xarray *shared_offsets = &kvm_gmem_private(inode)->shared_offsets; |
| rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock; |
| pgoff_t i; |
| |
| lockdep_assert_held_write(offsets_lock); |
| |
| for (i = start; i < end; i++) { |
| /* |
| * If the folio is NONE_SHARED, it indicates that it's |
| * transitioning to private (GUEST_SHARED). Transition it to |
| * shared (ALL_SHARED) and remove the callback. |
| */ |
| if (xa_to_value(xa_load(shared_offsets, i)) == KVM_GMEM_NONE_SHARED) { |
| struct folio *folio = filemap_lock_folio(inode->i_mapping, i); |
| |
| if (!WARN_ON_ONCE(IS_ERR(folio))) { |
| if (folio_test_guestmem(folio)) |
| kvm_gmem_restore_pending_folio(folio, inode); |
| |
| folio_unlock(folio); |
| folio_put(folio); |
| } |
| } |
| |
| xa_erase(shared_offsets, i); |
| } |
| } |
| |
| /* |
| * Marks the range [start, end) as not shared with the host. If the host doesn't |
| * have any references to a particular folio, then that folio is marked as |
| * shared with the guest. |
| * |
| * However, if the host still has references to the folio, it is marked as not |
| * shared with anyone (NONE_SHARED). This lets the remaining host references |
| * drain, and ensures that the hypervisor does not transition the folio to |
| * private while the host might still access it. |
| * |
| * Called when guest unshares memory with the host. |
| */ |
| static int kvm_gmem_offset_range_clear_shared(struct inode *inode, |
| pgoff_t start, pgoff_t end) |
| { |
| rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock; |
| pgoff_t i; |
| int r = 0; |
| |
| write_lock(offsets_lock); |
| for (i = start; i < end; i++) { |
| r = kvm_gmem_offset_clear_shared(inode, i); |
| if (WARN_ON_ONCE(r)) |
| break; |
| } |
| write_unlock(offsets_lock); |
| |
| return r; |
| } |
| |
| /* |
| * Registers a callback to __folio_put(), so that gmem knows that the host does |
| * not have any references to the folio. The callback itself is registered by |
| * setting the folio type to guestmem. |
| * |
| * Returns 0 if a callback was registered or had already been registered, or |
| * -EAGAIN if the host still holds references, in which case no callback is |
| * registered. |
| * |
| * Must be called with the folio locked and the offsets_lock write lock held. |
| */ |
| static int kvm_gmem_register_callback(struct folio *folio, struct inode *inode, pgoff_t index) |
| { |
| struct xarray *shared_offsets = &kvm_gmem_private(inode)->shared_offsets; |
| rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock; |
| void *xval_guest = xa_mk_value(KVM_GMEM_GUEST_SHARED); |
| int refcount; |
| int r = 0; |
| |
| lockdep_assert_held_write(offsets_lock); |
| WARN_ON_ONCE(!folio_test_locked(folio)); |
| |
| if (folio_test_guestmem(folio)) |
| return 0; |
| |
| if (folio_mapped(folio)) |
| return -EAGAIN; |
| |
| refcount = folio_ref_count(folio); |
| if (!folio_ref_freeze(folio, refcount)) |
| return -EAGAIN; |
| |
| /* |
| * Register callback by setting the folio type and subtracting gmem's |
| * references for it to trigger once outstanding references are dropped. |
| */ |
| if (refcount > 1) { |
| __folio_set_guestmem(folio); |
| refcount -= folio_nr_pages(folio); |
| } else { |
| /* No outstanding references, transition it to guest shared. */ |
| r = WARN_ON_ONCE(xa_err(xa_store(shared_offsets, index, xval_guest, GFP_KERNEL))); |
| } |
| |
| folio_ref_unfreeze(folio, refcount); |
| |
| return r; |
| } |
| |
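| /* |
| * Register a __folio_put() callback for the folio backing @gfn in @slot, so |
| * that the transition to guest-shared can complete once the host drops its |
| * remaining references. |
| */ |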
| int kvm_gmem_slot_register_callback(struct kvm_memory_slot *slot, gfn_t gfn) |
| { |
| unsigned long pgoff = slot->gmem.pgoff + gfn - slot->base_gfn; |
| struct inode *inode = file_inode(READ_ONCE(slot->gmem.file)); |
| rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock; |
| struct folio *folio; |
| int r; |
| |
| write_lock(offsets_lock); |
| |
| folio = filemap_lock_folio(inode->i_mapping, pgoff); |
| if (WARN_ON_ONCE(IS_ERR(folio))) { |
| write_unlock(offsets_lock); |
| return PTR_ERR(folio); |
| } |
| |
| r = kvm_gmem_register_callback(folio, inode, pgoff); |
| |
| folio_unlock(folio); |
| folio_put(folio); |
| write_unlock(offsets_lock); |
| |
| return r; |
| } |
| EXPORT_SYMBOL_GPL(kvm_gmem_slot_register_callback); |
| |
| /* |
| * Callback function for __folio_put(), i.e., called once all references by the |
| * host to the folio have been dropped. This allows gmem to transition the state |
| * of the folio to shared with the guest, and allows the hypervisor to continue |
| * transitioning its state to private, since the host cannot attempt to access |
| * it anymore. |
| */ |
| void kvm_gmem_handle_folio_put(struct folio *folio) |
| { |
| struct address_space *mapping; |
| struct xarray *shared_offsets; |
| rwlock_t *offsets_lock; |
| struct inode *inode; |
| pgoff_t index; |
| void *xval; |
| |
| mapping = folio->mapping; |
| if (WARN_ON_ONCE(!mapping)) |
| return; |
| |
| inode = mapping->host; |
| index = folio->index; |
| shared_offsets = &kvm_gmem_private(inode)->shared_offsets; |
| offsets_lock = &kvm_gmem_private(inode)->offsets_lock; |
| xval = xa_mk_value(KVM_GMEM_GUEST_SHARED); |
| |
| write_lock(offsets_lock); |
| folio_lock(folio); |
| kvm_gmem_restore_pending_folio(folio, inode); |
| folio_unlock(folio); |
| WARN_ON_ONCE(xa_err(xa_store(shared_offsets, index, xval, GFP_KERNEL))); |
| write_unlock(offsets_lock); |
| } |
| EXPORT_SYMBOL_GPL(kvm_gmem_handle_folio_put); |
| |
| /* |
| * Returns true if the folio is shared with the host and the guest. |
| * |
| * Must be called with offsets_lock held. |
| */ |
| static bool kvm_gmem_offset_is_shared(struct inode *inode, pgoff_t index) |
| { |
| struct xarray *shared_offsets = &kvm_gmem_private(inode)->shared_offsets; |
| rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock; |
| unsigned long r; |
| |
| lockdep_assert_held(offsets_lock); |
| |
| r = xa_to_value(xa_load(shared_offsets, index)); |
| |
| return r == KVM_GMEM_ALL_SHARED; |
| } |
| |
| /* |
| * Returns true if the folio is shared with the guest (not transitioning). |
| * |
| * Must be called with offsets_lock held. |
| */ |
| static bool kvm_gmem_offset_is_guest_shared(struct inode *inode, pgoff_t index) |
| { |
| struct xarray *shared_offsets = &kvm_gmem_private(inode)->shared_offsets; |
| rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock; |
| unsigned long r; |
| |
| lockdep_assert_held(offsets_lock); |
| |
| r = xa_to_value(xa_load(shared_offsets, index)); |
| |
| return (r == KVM_GMEM_ALL_SHARED || r == KVM_GMEM_GUEST_SHARED); |
| } |
| |
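| /* |
| * Slot-based wrappers: translate a gfn range within @slot into file offsets |
| * before updating the shared-state tracking. |
| */ |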
| int kvm_gmem_slot_set_shared(struct kvm_memory_slot *slot, gfn_t start, gfn_t end) |
| { |
| struct inode *inode = file_inode(READ_ONCE(slot->gmem.file)); |
| pgoff_t start_off = slot->gmem.pgoff + start - slot->base_gfn; |
| pgoff_t end_off = start_off + end - start; |
| |
| return kvm_gmem_offset_range_set_shared(inode, start_off, end_off); |
| } |
| |
| int kvm_gmem_slot_clear_shared(struct kvm_memory_slot *slot, gfn_t start, gfn_t end) |
| { |
| struct inode *inode = file_inode(READ_ONCE(slot->gmem.file)); |
| pgoff_t start_off = slot->gmem.pgoff + start - slot->base_gfn; |
| pgoff_t end_off = start_off + end - start; |
| |
| return kvm_gmem_offset_range_clear_shared(inode, start_off, end_off); |
| } |
| |
| bool kvm_gmem_slot_is_guest_shared(struct kvm_memory_slot *slot, gfn_t gfn) |
| { |
| struct inode *inode = file_inode(READ_ONCE(slot->gmem.file)); |
| rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock; |
| unsigned long pgoff = slot->gmem.pgoff + gfn - slot->base_gfn; |
| bool r; |
| |
| read_lock(offsets_lock); |
| r = kvm_gmem_offset_is_guest_shared(inode, pgoff); |
| read_unlock(offsets_lock); |
| |
| return r; |
| } |
| |
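| /* |
| * Fault handler for host (userspace) mappings of guest_memfd. Only offsets |
| * currently shared with the host (KVM_GMEM_ALL_SHARED) may be faulted in; |
| * anything else gets SIGBUS. |
| */ |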
| static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf) |
| { |
| struct inode *inode = file_inode(vmf->vma->vm_file); |
| rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock; |
| struct folio *folio; |
| vm_fault_t ret = VM_FAULT_LOCKED; |
| |
| filemap_invalidate_lock_shared(inode->i_mapping); |
| read_lock(offsets_lock); |
| |
| folio = kvm_gmem_get_folio(inode, vmf->pgoff); |
| if (IS_ERR(folio)) { |
| int err = PTR_ERR(folio); |
| |
| if (err == -EAGAIN) |
| ret = VM_FAULT_RETRY; |
| else |
| ret = vmf_error(err); |
| |
| goto out_filemap; |
| } |
| |
| if (folio_test_hwpoison(folio)) { |
| ret = VM_FAULT_HWPOISON; |
| goto out_folio; |
| } |
| |
| if (!kvm_gmem_offset_is_shared(inode, vmf->pgoff)) { |
| ret = VM_FAULT_SIGBUS; |
| goto out_folio; |
| } |
| |
| /* |
| * Only folios that are shared with the host are expected at this point, |
| * and shared folios are never marked as guestmem. |
| */ |
| if (WARN_ON_ONCE(folio_test_guestmem(folio))) { |
| ret = VM_FAULT_SIGBUS; |
| goto out_folio; |
| } |
| |
| /* No support for huge pages. */ |
| if (WARN_ON_ONCE(folio_test_large(folio))) { |
| ret = VM_FAULT_SIGBUS; |
| goto out_folio; |
| } |
| |
| if (!folio_test_uptodate(folio)) { |
| clear_highpage(folio_page(folio, 0)); |
| kvm_gmem_mark_prepared(folio); |
| } |
| |
| vmf->page = folio_file_page(folio, vmf->pgoff); |
| |
| out_folio: |
| if (ret != VM_FAULT_LOCKED) { |
| folio_unlock(folio); |
| folio_put(folio); |
| } |
| |
| out_filemap: |
| read_unlock(offsets_lock); |
| filemap_invalidate_unlock_shared(inode->i_mapping); |
| |
| return ret; |
| } |
| |
| static const struct vm_operations_struct kvm_gmem_vm_ops = { |
| .fault = kvm_gmem_fault, |
| }; |
| |
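| /* |
| * mmap() is allowed only if the arch supports shared guest_memfd memory, and |
| * only for shared mappings (VM_SHARED | VM_MAYSHARE). |
| */ |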
| static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma) |
| { |
| struct kvm_gmem *gmem = file->private_data; |
| |
| if (!kvm_arch_gmem_supports_shared_mem(gmem->kvm)) |
| return -ENODEV; |
| |
| if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) != |
| (VM_SHARED | VM_MAYSHARE)) { |
| return -EINVAL; |
| } |
| |
| file_accessed(file); |
| vm_flags_set(vma, VM_DONTDUMP); |
| vma->vm_ops = &kvm_gmem_vm_ops; |
| |
| return 0; |
| } |
| #else |
| #define kvm_gmem_mmap NULL |
| #endif /* CONFIG_KVM_GMEM_SHARED_MEM */ |
| |
| static struct file_operations kvm_gmem_fops = { |
| .mmap = kvm_gmem_mmap, |
| .open = generic_file_open, |
| .release = kvm_gmem_release, |
| .fallocate = kvm_gmem_fallocate, |
| }; |
| |
| void kvm_gmem_init(struct module *module) |
| { |
| kvm_gmem_fops.owner = module; |
| |
| kvm_gmem_init_mount(); |
| } |
| |
| static int kvm_gmem_migrate_folio(struct address_space *mapping, |
| struct folio *dst, struct folio *src, |
| enum migrate_mode mode) |
| { |
| WARN_ON_ONCE(1); |
| return -EINVAL; |
| } |
| |
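| /* |
| * Memory-failure handler: zap the SPTEs covering the poisoned folio, but keep |
| * the folio in place so that userspace decides how to react to the error. |
| */ |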
| static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio) |
| { |
| struct list_head *gmem_list = &mapping->i_private_list; |
| struct kvm_gmem *gmem; |
| pgoff_t start, end; |
| |
| filemap_invalidate_lock_shared(mapping); |
| |
| start = folio->index; |
| end = start + folio_nr_pages(folio); |
| |
| list_for_each_entry(gmem, gmem_list, entry) |
| kvm_gmem_invalidate_begin(gmem, start, end); |
| |
| /* |
| * Do not truncate the range, what action is taken in response to the |
| * error is userspace's decision (assuming the architecture supports |
| * gracefully handling memory errors). If/when the guest attempts to |
| * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON, |
| * at which point KVM can either terminate the VM or propagate the |
| * error to userspace. |
| */ |
| |
| list_for_each_entry(gmem, gmem_list, entry) |
| kvm_gmem_invalidate_end(gmem, start, end); |
| |
| filemap_invalidate_unlock_shared(mapping); |
| |
| return MF_DELAYED; |
| } |
| |
| #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE |
| static void kvm_gmem_free_folio(struct folio *folio) |
| { |
| struct page *page = folio_page(folio, 0); |
| kvm_pfn_t pfn = page_to_pfn(page); |
| int order = folio_order(folio); |
| |
| kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order)); |
| } |
| #endif |
| |
| static const struct address_space_operations kvm_gmem_aops = { |
| .dirty_folio = noop_dirty_folio, |
| .migrate_folio = kvm_gmem_migrate_folio, |
| .error_remove_folio = kvm_gmem_error_folio, |
| #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE |
| .free_folio = kvm_gmem_free_folio, |
| #endif |
| }; |
| |
| static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path, |
| struct kstat *stat, u32 request_mask, |
| unsigned int query_flags) |
| { |
| struct inode *inode = path->dentry->d_inode; |
| |
| generic_fillattr(idmap, request_mask, inode, stat); |
| return 0; |
| } |
| |
| static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry, |
| struct iattr *attr) |
| { |
| return -EINVAL; |
| } |
| |
| static const struct inode_operations kvm_gmem_iops = { |
| .getattr = kvm_gmem_getattr, |
| .setattr = kvm_gmem_setattr, |
| }; |
| |
| static struct inode *kvm_gmem_inode_make_secure_inode(const char *name, |
| loff_t size, u64 flags) |
| { |
| const struct qstr qname = QSTR_INIT(name, strlen(name)); |
| struct kvm_gmem_inode_private *private; |
| struct inode *inode; |
| int err; |
| |
| inode = alloc_anon_inode(kvm_gmem_mnt->mnt_sb); |
| if (IS_ERR(inode)) |
| return inode; |
| |
| err = security_inode_init_security_anon(inode, &qname, NULL); |
| if (err) |
| goto out; |
| |
| err = -ENOMEM; |
| private = kzalloc(sizeof(*private), GFP_KERNEL); |
| if (!private) |
| goto out; |
| |
| #ifdef CONFIG_KVM_GMEM_SHARED_MEM |
| xa_init(&private->shared_offsets); |
| rwlock_init(&private->offsets_lock); |
| #endif |
| |
| inode->i_mapping->i_private_data = private; |
| |
| inode->i_private = (void *)(unsigned long)flags; |
| inode->i_op = &kvm_gmem_iops; |
| inode->i_mapping->a_ops = &kvm_gmem_aops; |
| inode->i_mode |= S_IFREG; |
| inode->i_size = size; |
| mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); |
| mapping_set_inaccessible(inode->i_mapping); |
| /* Unmovable mappings are supposed to be marked unevictable as well. */ |
| WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping)); |
| |
| return inode; |
| |
| out: |
| iput(inode); |
| |
| return ERR_PTR(err); |
| } |
| |
| static struct file *kvm_gmem_inode_create_getfile(void *priv, loff_t size, |
| u64 flags) |
| { |
| static const char *name = "[kvm-gmem]"; |
| struct inode *inode; |
| struct file *file; |
| int err; |
| |
| err = -ENOENT; |
| if (!try_module_get(kvm_gmem_fops.owner)) |
| goto err; |
| |
| inode = kvm_gmem_inode_make_secure_inode(name, size, flags); |
| if (IS_ERR(inode)) { |
| err = PTR_ERR(inode); |
| goto err_put_module; |
| } |
| |
| file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, |
| &kvm_gmem_fops); |
| if (IS_ERR(file)) { |
| err = PTR_ERR(file); |
| goto err_put_inode; |
| } |
| |
| file->f_flags |= O_LARGEFILE; |
| file->private_data = priv; |
| |
| out: |
| return file; |
| |
| err_put_inode: |
| iput(inode); |
| err_put_module: |
| module_put(kvm_gmem_fops.owner); |
| err: |
| file = ERR_PTR(err); |
| goto out; |
| } |
| |
| static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) |
| { |
| struct kvm_gmem *gmem; |
| struct file *file; |
| int fd, err; |
| |
| fd = get_unused_fd_flags(0); |
| if (fd < 0) |
| return fd; |
| |
| gmem = kzalloc(sizeof(*gmem), GFP_KERNEL); |
| if (!gmem) { |
| err = -ENOMEM; |
| goto err_fd; |
| } |
| |
| file = kvm_gmem_inode_create_getfile(gmem, size, flags); |
| if (IS_ERR(file)) { |
| err = PTR_ERR(file); |
| goto err_gmem; |
| } |
| |
| if (IS_ENABLED(CONFIG_KVM_GMEM_SHARED_MEM) && |
| (flags & GUEST_MEMFD_FLAG_INIT_SHARED)) { |
| err = kvm_gmem_offset_range_set_shared(file_inode(file), 0, size >> PAGE_SHIFT); |
| if (err) { |
| fput(file); |
| goto err_gmem; |
| } |
| } |
| |
| kvm_get_kvm(kvm); |
| gmem->kvm = kvm; |
| xa_init(&gmem->bindings); |
| list_add(&gmem->entry, &file_inode(file)->i_mapping->i_private_list); |
| |
| fd_install(fd, file); |
| return fd; |
| |
| err_gmem: |
| kfree(gmem); |
| err_fd: |
| put_unused_fd(fd); |
| return err; |
| } |
| |
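| /* Handle KVM_CREATE_GUEST_MEMFD: validate the flags and size, then create the fd. */ |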
| int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args) |
| { |
| loff_t size = args->size; |
| u64 flags = args->flags; |
| u64 valid_flags = 0; |
| |
| if (IS_ENABLED(CONFIG_KVM_GMEM_SHARED_MEM)) |
| valid_flags |= GUEST_MEMFD_FLAG_INIT_SHARED; |
| |
| if (flags & ~valid_flags) |
| return -EINVAL; |
| |
| if (size <= 0 || !PAGE_ALIGNED(size)) |
| return -EINVAL; |
| |
| return __kvm_gmem_create(kvm, size, flags); |
| } |
| |
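| /* |
| * Bind @slot to the guest_memfd given by @fd at file offset @offset. A given |
| * range of the file can be bound to at most one memslot at a time. |
| */ |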
| int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, |
| unsigned int fd, loff_t offset) |
| { |
| loff_t size = slot->npages << PAGE_SHIFT; |
| unsigned long start, end; |
| struct kvm_gmem *gmem; |
| struct inode *inode; |
| struct file *file; |
| int r = -EINVAL; |
| |
| BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff)); |
| |
| file = fget(fd); |
| if (!file) |
| return -EBADF; |
| |
| if (file->f_op != &kvm_gmem_fops) |
| goto err; |
| |
| gmem = file->private_data; |
| if (gmem->kvm != kvm) |
| goto err; |
| |
| inode = file_inode(file); |
| |
| if (offset < 0 || !PAGE_ALIGNED(offset) || |
| offset + size > i_size_read(inode)) |
| goto err; |
| |
| filemap_invalidate_lock(inode->i_mapping); |
| |
| start = offset >> PAGE_SHIFT; |
| end = start + slot->npages; |
| |
| if (!xa_empty(&gmem->bindings) && |
| xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) { |
| filemap_invalidate_unlock(inode->i_mapping); |
| goto err; |
| } |
| |
| /* |
| * No synchronize_rcu() needed, any in-flight readers are guaranteed to |
| * see either a NULL file or this new file, no need for them to go |
| * away. |
| */ |
| rcu_assign_pointer(slot->gmem.file, file); |
| slot->gmem.pgoff = start; |
| |
| xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL); |
| filemap_invalidate_unlock(inode->i_mapping); |
| |
| /* |
| * Drop the reference to the file, even on success. The file pins KVM, |
| * not the other way 'round. Active bindings are invalidated if the |
| * file is closed before memslots are destroyed. |
| */ |
| r = 0; |
| err: |
| fput(file); |
| return r; |
| } |
| |
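| /* |
| * Remove @slot's binding and clear slot->gmem.file, ensuring in-flight readers |
| * see NULL before the memslot is freed. |
| */ |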
| void kvm_gmem_unbind(struct kvm_memory_slot *slot) |
| { |
| unsigned long start = slot->gmem.pgoff; |
| unsigned long end = start + slot->npages; |
| struct kvm_gmem *gmem; |
| struct file *file; |
| |
| /* |
| * Nothing to do if the underlying file was already closed (or is being |
| * closed right now), kvm_gmem_release() invalidates all bindings. |
| */ |
| file = kvm_gmem_get_file(slot); |
| if (!file) |
| return; |
| |
| gmem = file->private_data; |
| |
| filemap_invalidate_lock(file->f_mapping); |
| xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL); |
| rcu_assign_pointer(slot->gmem.file, NULL); |
| synchronize_rcu(); |
| filemap_invalidate_unlock(file->f_mapping); |
| |
| fput(file); |
| } |
| |
| /* Returns a locked folio on success. */ |
| static struct folio *__kvm_gmem_get_pfn(struct file *file, |
| struct kvm_memory_slot *slot, |
| pgoff_t index, kvm_pfn_t *pfn, |
| bool *is_prepared, int *max_order) |
| { |
| struct kvm_gmem *gmem = file->private_data; |
| struct folio *folio; |
| |
| if (file != slot->gmem.file) { |
| WARN_ON_ONCE(slot->gmem.file); |
| return ERR_PTR(-EFAULT); |
| } |
| |
| if (xa_load(&gmem->bindings, index) != slot) { |
| WARN_ON_ONCE(xa_load(&gmem->bindings, index)); |
| return ERR_PTR(-EIO); |
| } |
| |
| folio = kvm_gmem_get_folio(file_inode(file), index); |
| if (IS_ERR(folio)) |
| return folio; |
| |
| if (folio_test_hwpoison(folio)) { |
| folio_unlock(folio); |
| folio_put(folio); |
| return ERR_PTR(-EHWPOISON); |
| } |
| |
| *pfn = folio_file_pfn(folio, index); |
| if (max_order) |
| *max_order = 0; |
| |
| *is_prepared = folio_test_uptodate(folio); |
| return folio; |
| } |
| |
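| /* |
| * Resolve @gfn to a pfn/page backed by guest_memfd, zeroing and preparing the |
| * folio on first use. On success the page is returned locked with a reference |
| * held; the caller is responsible for unlocking and releasing it. |
| */ |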
| int kvm_gmem_get_pfn_locked(struct kvm *kvm, struct kvm_memory_slot *slot, |
| gfn_t gfn, kvm_pfn_t *pfn, struct page **page, |
| int *max_order) |
| { |
| pgoff_t index = kvm_gmem_get_index(slot, gfn); |
| struct file *file = kvm_gmem_get_file(slot); |
| struct folio *folio; |
| bool is_prepared = false; |
| int r = 0; |
| |
| if (!file) |
| return -EFAULT; |
| |
| folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order); |
| if (IS_ERR(folio)) { |
| r = PTR_ERR(folio); |
| goto out; |
| } |
| |
| if (!is_prepared) |
| r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio); |
| |
| if (!r) { |
| *page = folio_file_page(folio, index); |
| } else { |
| folio_unlock(folio); |
| folio_put(folio); |
| } |
| |
| out: |
| fput(file); |
| return r; |
| } |
| EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn_locked); |
| |
| int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, |
| gfn_t gfn, kvm_pfn_t *pfn, struct page **page, |
| int *max_order) |
| { |
| int r = kvm_gmem_get_pfn_locked(kvm, slot, gfn, pfn, page, max_order); |
| |
| if (!r) |
| unlock_page(*page); |
| |
| return r; |
| } |
| EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn); |
| |
| #ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM |
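| /* |
| * Populate and prepare up to @npages pages starting at @start_gfn, optionally |
| * copying data from @src and invoking @post_populate on each chunk. Returns |
| * the number of pages processed, or a negative error if none were processed. |
| */ |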
| long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages, |
| kvm_gmem_populate_cb post_populate, void *opaque) |
| { |
| struct file *file; |
| struct kvm_memory_slot *slot; |
| void __user *p; |
| |
| int ret = 0, max_order; |
| long i; |
| |
| lockdep_assert_held(&kvm->slots_lock); |
| if (npages < 0) |
| return -EINVAL; |
| |
| slot = gfn_to_memslot(kvm, start_gfn); |
| if (!kvm_slot_can_be_private(slot)) |
| return -EINVAL; |
| |
| file = kvm_gmem_get_file(slot); |
| if (!file) |
| return -EFAULT; |
| |
| filemap_invalidate_lock(file->f_mapping); |
| |
| npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages); |
| for (i = 0; i < npages; i += (1 << max_order)) { |
| struct folio *folio; |
| gfn_t gfn = start_gfn + i; |
| pgoff_t index = kvm_gmem_get_index(slot, gfn); |
| bool is_prepared = false; |
| kvm_pfn_t pfn; |
| |
| if (signal_pending(current)) { |
| ret = -EINTR; |
| break; |
| } |
| |
| folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order); |
| if (IS_ERR(folio)) { |
| ret = PTR_ERR(folio); |
| break; |
| } |
| |
| if (is_prepared) { |
| folio_unlock(folio); |
| folio_put(folio); |
| ret = -EEXIST; |
| break; |
| } |
| |
| folio_unlock(folio); |
| WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) || |
| (npages - i) < (1 << max_order)); |
| |
| ret = -EINVAL; |
| while (!kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order), |
| KVM_MEMORY_ATTRIBUTE_PRIVATE, |
| KVM_MEMORY_ATTRIBUTE_PRIVATE)) { |
| if (!max_order) |
| goto put_folio_and_exit; |
| max_order--; |
| } |
| |
| p = src ? src + i * PAGE_SIZE : NULL; |
| ret = post_populate(kvm, gfn, pfn, p, max_order, opaque); |
| if (!ret) |
| kvm_gmem_mark_prepared(folio); |
| |
| put_folio_and_exit: |
| folio_put(folio); |
| if (ret) |
| break; |
| } |
| |
| filemap_invalidate_unlock(file->f_mapping); |
| |
| fput(file); |
| return ret && !i ? ret : i; |
| } |
| EXPORT_SYMBOL_GPL(kvm_gmem_populate); |
| #endif |