// SPDX-License-Identifier: GPL-2.0
#include <linux/fs.h>
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/kvm_host.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/pagemap.h>
#include <linux/anon_inodes.h>
#include "kvm_mm.h"
static struct vfsmount *kvm_gmem_mnt;
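/*
 * Per-file state, tying a guest_memfd file to its owning VM; @bindings tracks
 * which memslot, if any, is bound to each page offset of the file.
 */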
struct kvm_gmem {
struct kvm *kvm;
struct xarray bindings;
struct list_head entry;
};
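/*
 * Per-inode state; @shared_offsets tracks the shareability of each page
 * offset (see enum folio_shareability) and is protected by @offsets_lock.
 */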
struct kvm_gmem_inode_private {
#ifdef CONFIG_KVM_GMEM_SHARED_MEM
struct xarray shared_offsets;
rwlock_t offsets_lock;
#endif
};
static struct kvm_gmem_inode_private *kvm_gmem_private(struct inode *inode)
{
return inode->i_mapping->i_private_data;
}
/**
 * folio_file_pfn - like folio_file_page(), but returns a pfn.
* @folio: The folio which contains this index.
* @index: The index we want to look up.
*
* Return: The pfn for this index.
*/
static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
{
return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
}
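/*
 * Gives the architecture a chance to prepare the backing page before it is
 * mapped into the guest; a no-op unless CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE.
 */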
static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
pgoff_t index, struct folio *folio)
{
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
kvm_pfn_t pfn = folio_file_pfn(folio, index);
gfn_t gfn = slot->base_gfn + index - slot->gmem.pgoff;
int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio));
if (rc) {
pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n",
index, gfn, pfn, rc);
return rc;
}
#endif
return 0;
}
static inline void kvm_gmem_mark_prepared(struct folio *folio)
{
folio_mark_uptodate(folio);
}
/*
* Process @folio, which contains @gfn, so that the guest can use it.
* The folio must be locked and the gfn must be contained in @slot.
 * On successful return the guest sees a zeroed page (so as to avoid
 * leaking host data) and the up-to-date flag is set.
*/
static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
gfn_t gfn, struct folio *folio)
{
unsigned long nr_pages, i;
pgoff_t index;
int r;
nr_pages = folio_nr_pages(folio);
for (i = 0; i < nr_pages; i++)
clear_highpage(folio_page(folio, i));
/*
* Preparing huge folios should always be safe, since it should
* be possible to split them later if needed.
*
* Right now the folio order is always going to be zero, but the
* code is ready for huge folios. The only assumption is that
* the base pgoff of memslots is naturally aligned with the
* requested page order, ensuring that huge folios can also use
* huge page table entries for GPA->HPA mapping.
*
* The order will be passed when creating the guest_memfd, and
* checked when creating memslots.
*/
WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, 1 << folio_order(folio)));
index = gfn - slot->base_gfn + slot->gmem.pgoff;
index = ALIGN_DOWN(index, 1 << folio_order(folio));
r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
if (!r)
kvm_gmem_mark_prepared(folio);
return r;
}
/*
* Returns a locked folio on success. The caller is responsible for
* setting the up-to-date flag before the memory is mapped into the guest.
* There is no backing storage for the memory, so the folio will remain
* up-to-date until it's removed.
*
* Ignore accessed, referenced, and dirty flags. The memory is
* unevictable and there is no storage to write back to.
*/
static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
{
/* TODO: Support huge pages. */
return filemap_grab_folio(inode->i_mapping, index);
}
#ifdef CONFIG_KVM_GMEM_SHARED_MEM
static void kvm_gmem_offset_range_invalidate_shared(struct inode *inode,
pgoff_t start, pgoff_t end);
#else
static inline void kvm_gmem_offset_range_invalidate_shared(struct inode *inode,
pgoff_t start, pgoff_t end)
{
}
#endif
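/*
 * Begins invalidation of the file range [start, end): zaps the SPTEs of every
 * memslot bound to the range and, if the file is still live, invalidates the
 * shared-offsets state for the affected offsets.
 */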
static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
pgoff_t end)
{
bool flush = false, found_memslot = false;
struct kvm_memory_slot *slot;
struct kvm *kvm = gmem->kvm;
unsigned long index;
xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
struct file *file = READ_ONCE(slot->gmem.file);
pgoff_t pgoff = slot->gmem.pgoff;
struct kvm_gfn_range gfn_range = {
.start = slot->base_gfn + max(pgoff, start) - pgoff,
.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
.slot = slot,
.may_block = true,
};
if (!found_memslot) {
found_memslot = true;
KVM_MMU_LOCK(kvm);
kvm_mmu_invalidate_begin(kvm);
}
flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
		/*
		 * If this gets called after kvm_gmem_unbind(), the file is
		 * gone along with any in-flight operations on it, and there
		 * is no shared-offsets state left to invalidate.
		 *
		 * The shared-offsets state is tracked by file offset, not by
		 * GFN, so pass the offset form of the range.
		 */
		if (file) {
			kvm_gmem_offset_range_invalidate_shared(file_inode(file),
						max(pgoff, start),
						min(pgoff + slot->npages, end));
		}
}
if (flush)
kvm_flush_remote_tlbs(kvm);
if (found_memslot)
KVM_MMU_UNLOCK(kvm);
}
static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
pgoff_t end)
{
struct kvm *kvm = gmem->kvm;
if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
KVM_MMU_LOCK(kvm);
kvm_mmu_invalidate_end(kvm);
KVM_MMU_UNLOCK(kvm);
}
}
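/*
 * Handles FALLOC_FL_PUNCH_HOLE: unmaps the range from the guest and truncates
 * the backing folios, bracketed by begin/end invalidation.
 */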
static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
struct list_head *gmem_list = &inode->i_mapping->i_private_list;
pgoff_t start = offset >> PAGE_SHIFT;
pgoff_t end = (offset + len) >> PAGE_SHIFT;
struct kvm_gmem *gmem;
/*
* Bindings must be stable across invalidation to ensure the start+end
* are balanced.
*/
filemap_invalidate_lock(inode->i_mapping);
list_for_each_entry(gmem, gmem_list, entry)
kvm_gmem_invalidate_begin(gmem, start, end);
truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
list_for_each_entry(gmem, gmem_list, entry)
kvm_gmem_invalidate_end(gmem, start, end);
filemap_invalidate_unlock(inode->i_mapping);
return 0;
}
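/*
 * Handles plain fallocate(): preallocates folios for the range. The file size
 * is fixed at creation time, so allocating beyond i_size fails with -EINVAL.
 */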
static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
{
struct address_space *mapping = inode->i_mapping;
pgoff_t start, index, end;
int r;
/* Dedicated guest is immutable by default. */
if (offset + len > i_size_read(inode))
return -EINVAL;
filemap_invalidate_lock_shared(mapping);
start = offset >> PAGE_SHIFT;
end = (offset + len) >> PAGE_SHIFT;
r = 0;
for (index = start; index < end; ) {
struct folio *folio;
if (signal_pending(current)) {
r = -EINTR;
break;
}
folio = kvm_gmem_get_folio(inode, index);
if (IS_ERR(folio)) {
r = PTR_ERR(folio);
break;
}
index = folio_next_index(folio);
folio_unlock(folio);
folio_put(folio);
/* 64-bit only, wrapping the index should be impossible. */
if (WARN_ON_ONCE(!index))
break;
cond_resched();
}
filemap_invalidate_unlock_shared(mapping);
return r;
}
static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
loff_t len)
{
int ret;
if (!(mode & FALLOC_FL_KEEP_SIZE))
return -EOPNOTSUPP;
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
return -EINVAL;
if (mode & FALLOC_FL_PUNCH_HOLE)
ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
else
ret = kvm_gmem_allocate(file_inode(file), offset, len);
if (!ret)
file_modified(file);
return ret;
}
static int kvm_gmem_release(struct inode *inode, struct file *file)
{
struct kvm_gmem *gmem = file->private_data;
struct kvm_memory_slot *slot;
struct kvm *kvm = gmem->kvm;
unsigned long index;
/*
* Prevent concurrent attempts to *unbind* a memslot. This is the last
* reference to the file and thus no new bindings can be created, but
* dereferencing the slot for existing bindings needs to be protected
* against memslot updates, specifically so that unbind doesn't race
* and free the memslot (kvm_gmem_get_file() will return NULL).
*/
mutex_lock(&kvm->slots_lock);
filemap_invalidate_lock(inode->i_mapping);
xa_for_each(&gmem->bindings, index, slot)
rcu_assign_pointer(slot->gmem.file, NULL);
synchronize_rcu();
/*
* All in-flight operations are gone and new bindings can be created.
* Zap all SPTEs pointed at by this file. Do not free the backing
* memory, as its lifetime is associated with the inode, not the file.
*/
kvm_gmem_invalidate_begin(gmem, 0, -1ul);
kvm_gmem_invalidate_end(gmem, 0, -1ul);
list_del(&gmem->entry);
filemap_invalidate_unlock(inode->i_mapping);
mutex_unlock(&kvm->slots_lock);
xa_destroy(&gmem->bindings);
kfree(gmem);
kvm_put_kvm(kvm);
return 0;
}
static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
{
/*
* Do not return slot->gmem.file if it has already been closed;
* there might be some time between the last fput() and when
* kvm_gmem_release() clears slot->gmem.file, and you do not
* want to spin in the meanwhile.
*/
return get_file_active(&slot->gmem.file);
}
static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
{
return gfn - slot->base_gfn + slot->gmem.pgoff;
}
static void kvm_gmem_evict_inode(struct inode *inode)
{
struct kvm_gmem_inode_private *private = kvm_gmem_private(inode);
#ifdef CONFIG_KVM_GMEM_SHARED_MEM
/*
* .evict_inode can be called before private data is set up if there are
* issues during inode creation.
*/
if (private)
xa_destroy(&private->shared_offsets);
#endif
truncate_inode_pages_final(inode->i_mapping);
kfree(private);
clear_inode(inode);
}
static const struct super_operations kvm_gmem_super_operations = {
.statfs = simple_statfs,
.evict_inode = kvm_gmem_evict_inode,
};
static int kvm_gmem_init_fs_context(struct fs_context *fc)
{
struct pseudo_fs_context *ctx;
if (!init_pseudo(fc, GUEST_MEMORY_MAGIC))
return -ENOMEM;
ctx = fc->fs_private;
ctx->ops = &kvm_gmem_super_operations;
return 0;
}
static struct file_system_type kvm_gmem_fs = {
.name = "kvm_guest_memory",
.init_fs_context = kvm_gmem_init_fs_context,
.kill_sb = kill_anon_super,
};
static void kvm_gmem_init_mount(void)
{
kvm_gmem_mnt = kern_mount(&kvm_gmem_fs);
BUG_ON(IS_ERR(kvm_gmem_mnt));
	/* For giggles. Userspace can never map this anyway. */
kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC;
}
#ifdef CONFIG_KVM_GMEM_SHARED_MEM
/*
* An enum of the valid folio sharing states:
* Bit 0: set if not shared with the guest (guest cannot fault it in)
* Bit 1: set if not shared with the host (host cannot fault it in)
*/
enum folio_shareability {
KVM_GMEM_ALL_SHARED = 0b00, /* Shared with host and guest. */
KVM_GMEM_GUEST_SHARED = 0b10, /* Shared only with guest. */
KVM_GMEM_NONE_SHARED = 0b11, /* Not shared, transient state. */
};
/*
* Unregisters the __folio_put() callback from the folio.
*
* Restores a folio's refcount after all pending references have been released,
* and removes the folio type, thereby removing the callback. Now the folio can
 * be freed normally once all actual references have been dropped.
*
* Must be called with the folio locked and the offsets_lock write lock held.
*/
static void kvm_gmem_restore_pending_folio(struct folio *folio, struct inode *inode)
{
rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
lockdep_assert_held_write(offsets_lock);
WARN_ON_ONCE(!folio_test_locked(folio));
if (WARN_ON_ONCE(!folio_test_guestmem(folio)))
return;
__folio_clear_guestmem(folio);
folio_ref_add(folio, folio_nr_pages(folio));
}
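/*
 * Marks the page at @index as shared with both the host and the guest
 * (ALL_SHARED). Must be called with the offsets_lock write lock held.
 */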
static int kvm_gmem_offset_set_shared(struct inode *inode, pgoff_t index)
{
struct xarray *shared_offsets = &kvm_gmem_private(inode)->shared_offsets;
rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
void *xval = xa_mk_value(KVM_GMEM_ALL_SHARED);
lockdep_assert_held_write(offsets_lock);
/*
* If the folio is NONE_SHARED, it indicates that it is transitioning to
* private (GUEST_SHARED). Transition it to shared (ALL_SHARED)
* immediately, and remove the callback.
*/
if (xa_to_value(xa_load(shared_offsets, index)) == KVM_GMEM_NONE_SHARED) {
struct folio *folio = filemap_lock_folio(inode->i_mapping, index);
if (WARN_ON_ONCE(IS_ERR(folio)))
return PTR_ERR(folio);
if (folio_test_guestmem(folio))
kvm_gmem_restore_pending_folio(folio, inode);
folio_unlock(folio);
folio_put(folio);
}
return xa_err(xa_store(shared_offsets, index, xval, GFP_KERNEL));
}
/*
* Marks the range [start, end) as shared with both the host and the guest.
* Called when guest shares memory with the host.
*/
static int kvm_gmem_offset_range_set_shared(struct inode *inode,
pgoff_t start, pgoff_t end)
{
rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
pgoff_t i;
int r = 0;
write_lock(offsets_lock);
for (i = start; i < end; i++) {
r = kvm_gmem_offset_set_shared(inode, i);
if (WARN_ON_ONCE(r))
break;
}
write_unlock(offsets_lock);
return r;
}
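/*
 * Marks the page at @index as not shared with the host. If the host holds no
 * references, the offset transitions straight to GUEST_SHARED; otherwise it
 * is parked in the transient NONE_SHARED state until the remaining host
 * references are dropped. Must be called with the offsets_lock write lock
 * held.
 */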
static int kvm_gmem_offset_clear_shared(struct inode *inode, pgoff_t index)
{
struct xarray *shared_offsets = &kvm_gmem_private(inode)->shared_offsets;
rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
void *xval_guest = xa_mk_value(KVM_GMEM_GUEST_SHARED);
void *xval_none = xa_mk_value(KVM_GMEM_NONE_SHARED);
struct folio *folio;
int refcount;
int r;
lockdep_assert_held_write(offsets_lock);
folio = filemap_lock_folio(inode->i_mapping, index);
if (!IS_ERR(folio)) {
		/* The +1 accounts for the reference taken by filemap_lock_folio(). */
refcount = folio_nr_pages(folio) + 1;
} else {
r = PTR_ERR(folio);
if (WARN_ON_ONCE(r != -ENOENT))
return r;
folio = NULL;
}
if (!folio || folio_ref_freeze(folio, refcount)) {
/*
* No outstanding references: transition to guest shared.
*/
r = xa_err(xa_store(shared_offsets, index, xval_guest, GFP_KERNEL));
if (folio)
folio_ref_unfreeze(folio, refcount);
} else {
/*
* Outstanding references: the folio cannot be faulted in by
* anyone until they're dropped.
*/
r = xa_err(xa_store(shared_offsets, index, xval_none, GFP_KERNEL));
}
if (folio) {
folio_unlock(folio);
folio_put(folio);
}
return r;
}
/*
* Callback when invalidating memory that is potentially shared.
*
* Must be called with the offsets_lock write lock held.
*/
static void kvm_gmem_offset_range_invalidate_shared(struct inode *inode,
pgoff_t start, pgoff_t end)
{
struct xarray *shared_offsets = &kvm_gmem_private(inode)->shared_offsets;
rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
pgoff_t i;
lockdep_assert_held_write(offsets_lock);
for (i = start; i < end; i++) {
/*
* If the folio is NONE_SHARED, it indicates that it's
* transitioning to private (GUEST_SHARED). Transition it to
* shared (ALL_SHARED) and remove the callback.
*/
if (xa_to_value(xa_load(shared_offsets, i)) == KVM_GMEM_NONE_SHARED) {
struct folio *folio = filemap_lock_folio(inode->i_mapping, i);
if (!WARN_ON_ONCE(IS_ERR(folio))) {
if (folio_test_guestmem(folio))
kvm_gmem_restore_pending_folio(folio, inode);
folio_unlock(folio);
folio_put(folio);
}
}
xa_erase(shared_offsets, i);
}
}
/*
* Marks the range [start, end) as not shared with the host. If the host doesn't
* have any references to a particular folio, then that folio is marked as
* shared with the guest.
*
 * However, if the host still has references to the folio, then the folio is
 * marked as not shared with anyone (NONE_SHARED). Marking it as not shared
 * allows all remaining host references to be drained, and ensures that the
 * hypervisor does not transition the folio to private while the host might
 * still access it.
*
* Called when guest unshares memory with the host.
*/
static int kvm_gmem_offset_range_clear_shared(struct inode *inode,
pgoff_t start, pgoff_t end)
{
rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
pgoff_t i;
int r = 0;
write_lock(offsets_lock);
for (i = start; i < end; i++) {
r = kvm_gmem_offset_clear_shared(inode, i);
if (WARN_ON_ONCE(r))
break;
}
write_unlock(offsets_lock);
return r;
}
/*
* Registers a callback to __folio_put(), so that gmem knows that the host does
* not have any references to the folio. The callback itself is registered by
* setting the folio type to guestmem.
*
 * Returns 0 if the callback was registered (or had already been registered),
 * or -EAGAIN if the host still holds references, in which case the callback
 * was not registered.
*
* Must be called with the folio locked and the offsets_lock write lock held.
*/
static int kvm_gmem_register_callback(struct folio *folio, struct inode *inode, pgoff_t index)
{
struct xarray *shared_offsets = &kvm_gmem_private(inode)->shared_offsets;
rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
void *xval_guest = xa_mk_value(KVM_GMEM_GUEST_SHARED);
int refcount;
int r = 0;
lockdep_assert_held_write(offsets_lock);
WARN_ON_ONCE(!folio_test_locked(folio));
	if (folio_test_guestmem(folio))
		return 0;

	if (folio_mapped(folio))
		return -EAGAIN;

	refcount = folio_ref_count(folio);
	if (!folio_ref_freeze(folio, refcount))
		return -EAGAIN;
/*
* Register callback by setting the folio type and subtracting gmem's
* references for it to trigger once outstanding references are dropped.
*/
if (refcount > 1) {
__folio_set_guestmem(folio);
refcount -= folio_nr_pages(folio);
} else {
/* No outstanding references, transition it to guest shared. */
r = WARN_ON_ONCE(xa_err(xa_store(shared_offsets, index, xval_guest, GFP_KERNEL)));
}
folio_ref_unfreeze(folio, refcount);
return r;
}
int kvm_gmem_slot_register_callback(struct kvm_memory_slot *slot, gfn_t gfn)
{
unsigned long pgoff = slot->gmem.pgoff + gfn - slot->base_gfn;
struct inode *inode = file_inode(READ_ONCE(slot->gmem.file));
rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
struct folio *folio;
int r;
write_lock(offsets_lock);
folio = filemap_lock_folio(inode->i_mapping, pgoff);
if (WARN_ON_ONCE(IS_ERR(folio))) {
write_unlock(offsets_lock);
return PTR_ERR(folio);
}
r = kvm_gmem_register_callback(folio, inode, pgoff);
folio_unlock(folio);
folio_put(folio);
write_unlock(offsets_lock);
return r;
}
EXPORT_SYMBOL_GPL(kvm_gmem_slot_register_callback);
/*
* Callback function for __folio_put(), i.e., called once all references by the
* host to the folio have been dropped. This allows gmem to transition the state
* of the folio to shared with the guest, and allows the hypervisor to continue
* transitioning its state to private, since the host cannot attempt to access
* it anymore.
*/
void kvm_gmem_handle_folio_put(struct folio *folio)
{
struct address_space *mapping;
struct xarray *shared_offsets;
rwlock_t *offsets_lock;
struct inode *inode;
pgoff_t index;
void *xval;
mapping = folio->mapping;
if (WARN_ON_ONCE(!mapping))
return;
inode = mapping->host;
index = folio->index;
shared_offsets = &kvm_gmem_private(inode)->shared_offsets;
offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
xval = xa_mk_value(KVM_GMEM_GUEST_SHARED);
write_lock(offsets_lock);
folio_lock(folio);
kvm_gmem_restore_pending_folio(folio, inode);
folio_unlock(folio);
WARN_ON_ONCE(xa_err(xa_store(shared_offsets, index, xval, GFP_KERNEL)));
write_unlock(offsets_lock);
}
EXPORT_SYMBOL_GPL(kvm_gmem_handle_folio_put);
/*
* Returns true if the folio is shared with the host and the guest.
*
 * Must be called with offsets_lock held.
*/
static bool kvm_gmem_offset_is_shared(struct inode *inode, pgoff_t index)
{
struct xarray *shared_offsets = &kvm_gmem_private(inode)->shared_offsets;
rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
unsigned long r;
lockdep_assert_held(offsets_lock);
r = xa_to_value(xa_load(shared_offsets, index));
return r == KVM_GMEM_ALL_SHARED;
}
/*
* Returns true if the folio is shared with the guest (not transitioning).
*
 * Must be called with offsets_lock held.
*/
static bool kvm_gmem_offset_is_guest_shared(struct inode *inode, pgoff_t index)
{
struct xarray *shared_offsets = &kvm_gmem_private(inode)->shared_offsets;
rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
unsigned long r;
lockdep_assert_held(offsets_lock);
r = xa_to_value(xa_load(shared_offsets, index));
return (r == KVM_GMEM_ALL_SHARED || r == KVM_GMEM_GUEST_SHARED);
}
int kvm_gmem_slot_set_shared(struct kvm_memory_slot *slot, gfn_t start, gfn_t end)
{
struct inode *inode = file_inode(READ_ONCE(slot->gmem.file));
pgoff_t start_off = slot->gmem.pgoff + start - slot->base_gfn;
pgoff_t end_off = start_off + end - start;
return kvm_gmem_offset_range_set_shared(inode, start_off, end_off);
}
int kvm_gmem_slot_clear_shared(struct kvm_memory_slot *slot, gfn_t start, gfn_t end)
{
struct inode *inode = file_inode(READ_ONCE(slot->gmem.file));
pgoff_t start_off = slot->gmem.pgoff + start - slot->base_gfn;
pgoff_t end_off = start_off + end - start;
return kvm_gmem_offset_range_clear_shared(inode, start_off, end_off);
}
bool kvm_gmem_slot_is_guest_shared(struct kvm_memory_slot *slot, gfn_t gfn)
{
struct inode *inode = file_inode(READ_ONCE(slot->gmem.file));
rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
unsigned long pgoff = slot->gmem.pgoff + gfn - slot->base_gfn;
bool r;
read_lock(offsets_lock);
r = kvm_gmem_offset_is_guest_shared(inode, pgoff);
read_unlock(offsets_lock);
return r;
}
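/*
 * mmap() fault handler: only folios that are shared with the host may be
 * faulted in; anything else gets SIGBUS. Since there is no backing storage,
 * folios are cleared and marked up-to-date on first fault.
 */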
static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
struct folio *folio;
vm_fault_t ret = VM_FAULT_LOCKED;
filemap_invalidate_lock_shared(inode->i_mapping);
read_lock(offsets_lock);
folio = kvm_gmem_get_folio(inode, vmf->pgoff);
if (IS_ERR(folio)) {
int err = PTR_ERR(folio);
if (err == -EAGAIN)
ret = VM_FAULT_RETRY;
else
ret = vmf_error(err);
goto out_filemap;
}
if (folio_test_hwpoison(folio)) {
ret = VM_FAULT_HWPOISON;
goto out_folio;
}
if (!kvm_gmem_offset_is_shared(inode, vmf->pgoff)) {
ret = VM_FAULT_SIGBUS;
goto out_folio;
}
	/*
	 * Only folios shared with the host are expected at this point, and
	 * shared folios are never marked as "guestmem" (the folio type is
	 * only set while host references are being drained).
	 */
if (WARN_ON_ONCE(folio_test_guestmem(folio))) {
ret = VM_FAULT_SIGBUS;
goto out_folio;
}
/* No support for huge pages. */
if (WARN_ON_ONCE(folio_test_large(folio))) {
ret = VM_FAULT_SIGBUS;
goto out_folio;
}
if (!folio_test_uptodate(folio)) {
clear_highpage(folio_page(folio, 0));
kvm_gmem_mark_prepared(folio);
}
vmf->page = folio_file_page(folio, vmf->pgoff);
out_folio:
if (ret != VM_FAULT_LOCKED) {
folio_unlock(folio);
folio_put(folio);
}
out_filemap:
read_unlock(offsets_lock);
filemap_invalidate_unlock_shared(inode->i_mapping);
return ret;
}
static const struct vm_operations_struct kvm_gmem_vm_ops = {
.fault = kvm_gmem_fault,
};
static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
{
struct kvm_gmem *gmem = file->private_data;
if (!kvm_arch_gmem_supports_shared_mem(gmem->kvm))
return -ENODEV;
if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
(VM_SHARED | VM_MAYSHARE)) {
return -EINVAL;
}
file_accessed(file);
vm_flags_set(vma, VM_DONTDUMP);
vma->vm_ops = &kvm_gmem_vm_ops;
return 0;
}
#else
#define kvm_gmem_mmap NULL
#endif /* CONFIG_KVM_GMEM_SHARED_MEM */
static struct file_operations kvm_gmem_fops = {
.mmap = kvm_gmem_mmap,
.open = generic_file_open,
.release = kvm_gmem_release,
.fallocate = kvm_gmem_fallocate,
};
void kvm_gmem_init(struct module *module)
{
kvm_gmem_fops.owner = module;
kvm_gmem_init_mount();
}
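/* gmem folios are unmovable, so migration should never be attempted. */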
static int kvm_gmem_migrate_folio(struct address_space *mapping,
struct folio *dst, struct folio *src,
enum migrate_mode mode)
{
WARN_ON_ONCE(1);
return -EINVAL;
}
static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
{
struct list_head *gmem_list = &mapping->i_private_list;
struct kvm_gmem *gmem;
pgoff_t start, end;
filemap_invalidate_lock_shared(mapping);
start = folio->index;
end = start + folio_nr_pages(folio);
list_for_each_entry(gmem, gmem_list, entry)
kvm_gmem_invalidate_begin(gmem, start, end);
/*
* Do not truncate the range, what action is taken in response to the
* error is userspace's decision (assuming the architecture supports
* gracefully handling memory errors). If/when the guest attempts to
* access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
* at which point KVM can either terminate the VM or propagate the
* error to userspace.
*/
list_for_each_entry(gmem, gmem_list, entry)
kvm_gmem_invalidate_end(gmem, start, end);
filemap_invalidate_unlock_shared(mapping);
return MF_DELAYED;
}
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
static void kvm_gmem_free_folio(struct folio *folio)
{
struct page *page = folio_page(folio, 0);
kvm_pfn_t pfn = page_to_pfn(page);
int order = folio_order(folio);
kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
}
#endif
static const struct address_space_operations kvm_gmem_aops = {
.dirty_folio = noop_dirty_folio,
.migrate_folio = kvm_gmem_migrate_folio,
.error_remove_folio = kvm_gmem_error_folio,
#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
.free_folio = kvm_gmem_free_folio,
#endif
};
static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask,
unsigned int query_flags)
{
struct inode *inode = path->dentry->d_inode;
generic_fillattr(idmap, request_mask, inode, stat);
return 0;
}
static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct iattr *attr)
{
return -EINVAL;
}
static const struct inode_operations kvm_gmem_iops = {
.getattr = kvm_gmem_getattr,
.setattr = kvm_gmem_setattr,
};
static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
loff_t size, u64 flags)
{
const struct qstr qname = QSTR_INIT(name, strlen(name));
struct kvm_gmem_inode_private *private;
struct inode *inode;
int err;
inode = alloc_anon_inode(kvm_gmem_mnt->mnt_sb);
if (IS_ERR(inode))
return inode;
err = security_inode_init_security_anon(inode, &qname, NULL);
if (err)
goto out;
err = -ENOMEM;
private = kzalloc(sizeof(*private), GFP_KERNEL);
if (!private)
goto out;
#ifdef CONFIG_KVM_GMEM_SHARED_MEM
xa_init(&private->shared_offsets);
rwlock_init(&private->offsets_lock);
#endif
inode->i_mapping->i_private_data = private;
inode->i_private = (void *)(unsigned long)flags;
inode->i_op = &kvm_gmem_iops;
inode->i_mapping->a_ops = &kvm_gmem_aops;
inode->i_mode |= S_IFREG;
inode->i_size = size;
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
mapping_set_inaccessible(inode->i_mapping);
/* Unmovable mappings are supposed to be marked unevictable as well. */
WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
return inode;
out:
iput(inode);
return ERR_PTR(err);
}
static struct file *kvm_gmem_inode_create_getfile(void *priv, loff_t size,
u64 flags)
{
static const char *name = "[kvm-gmem]";
struct inode *inode;
struct file *file;
int err;
err = -ENOENT;
if (!try_module_get(kvm_gmem_fops.owner))
goto err;
inode = kvm_gmem_inode_make_secure_inode(name, size, flags);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto err_put_module;
}
file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR,
&kvm_gmem_fops);
if (IS_ERR(file)) {
err = PTR_ERR(file);
goto err_put_inode;
}
file->f_flags |= O_LARGEFILE;
file->private_data = priv;
out:
return file;
err_put_inode:
iput(inode);
err_put_module:
module_put(kvm_gmem_fops.owner);
err:
file = ERR_PTR(err);
goto out;
}
static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
struct kvm_gmem *gmem;
struct file *file;
int fd, err;
fd = get_unused_fd_flags(0);
if (fd < 0)
return fd;
gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
if (!gmem) {
err = -ENOMEM;
goto err_fd;
}
file = kvm_gmem_inode_create_getfile(gmem, size, flags);
if (IS_ERR(file)) {
err = PTR_ERR(file);
goto err_gmem;
}
	kvm_get_kvm(kvm);
	gmem->kvm = kvm;
	xa_init(&gmem->bindings);
	list_add(&gmem->entry, &file_inode(file)->i_mapping->i_private_list);

	if (IS_ENABLED(CONFIG_KVM_GMEM_SHARED_MEM) &&
	    (flags & GUEST_MEMFD_FLAG_INIT_SHARED)) {
		err = kvm_gmem_offset_range_set_shared(file_inode(file), 0,
						       size >> PAGE_SHIFT);
		if (err) {
			/*
			 * The file now owns gmem and holds a KVM reference;
			 * kvm_gmem_release() will free both on the final
			 * fput(). Don't jump to err_gmem, that would
			 * double-free gmem.
			 */
			put_unused_fd(fd);
			fput(file);
			return err;
		}
	}

	fd_install(fd, file);
	return fd;
err_gmem:
kfree(gmem);
err_fd:
put_unused_fd(fd);
return err;
}
int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
{
loff_t size = args->size;
u64 flags = args->flags;
u64 valid_flags = 0;
if (IS_ENABLED(CONFIG_KVM_GMEM_SHARED_MEM))
valid_flags |= GUEST_MEMFD_FLAG_INIT_SHARED;
if (flags & ~valid_flags)
return -EINVAL;
if (size <= 0 || !PAGE_ALIGNED(size))
return -EINVAL;
return __kvm_gmem_create(kvm, size, flags);
}
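/*
 * Binds @slot to the range of the guest_memfd @fd starting at @offset. Fails
 * with -EINVAL if the range is out of bounds or if any part of it is already
 * bound.
 */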
int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
unsigned int fd, loff_t offset)
{
loff_t size = slot->npages << PAGE_SHIFT;
unsigned long start, end;
struct kvm_gmem *gmem;
struct inode *inode;
struct file *file;
int r = -EINVAL;
BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));
file = fget(fd);
if (!file)
return -EBADF;
if (file->f_op != &kvm_gmem_fops)
goto err;
gmem = file->private_data;
if (gmem->kvm != kvm)
goto err;
inode = file_inode(file);
if (offset < 0 || !PAGE_ALIGNED(offset) ||
offset + size > i_size_read(inode))
goto err;
filemap_invalidate_lock(inode->i_mapping);
start = offset >> PAGE_SHIFT;
end = start + slot->npages;
if (!xa_empty(&gmem->bindings) &&
xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
filemap_invalidate_unlock(inode->i_mapping);
goto err;
}
/*
* No synchronize_rcu() needed, any in-flight readers are guaranteed to
 * see either a NULL file or this new file, no need for them to go
* away.
*/
rcu_assign_pointer(slot->gmem.file, file);
slot->gmem.pgoff = start;
xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
filemap_invalidate_unlock(inode->i_mapping);
/*
* Drop the reference to the file, even on success. The file pins KVM,
* not the other way 'round. Active bindings are invalidated if the
* file is closed before memslots are destroyed.
*/
r = 0;
err:
fput(file);
return r;
}
void kvm_gmem_unbind(struct kvm_memory_slot *slot)
{
unsigned long start = slot->gmem.pgoff;
unsigned long end = start + slot->npages;
struct kvm_gmem *gmem;
struct file *file;
/*
* Nothing to do if the underlying file was already closed (or is being
* closed right now), kvm_gmem_release() invalidates all bindings.
*/
file = kvm_gmem_get_file(slot);
if (!file)
return;
gmem = file->private_data;
filemap_invalidate_lock(file->f_mapping);
xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL);
rcu_assign_pointer(slot->gmem.file, NULL);
synchronize_rcu();
filemap_invalidate_unlock(file->f_mapping);
fput(file);
}
/* Returns a locked folio on success. */
static struct folio *__kvm_gmem_get_pfn(struct file *file,
struct kvm_memory_slot *slot,
pgoff_t index, kvm_pfn_t *pfn,
bool *is_prepared, int *max_order)
{
struct kvm_gmem *gmem = file->private_data;
struct folio *folio;
if (file != slot->gmem.file) {
WARN_ON_ONCE(slot->gmem.file);
return ERR_PTR(-EFAULT);
}
if (xa_load(&gmem->bindings, index) != slot) {
WARN_ON_ONCE(xa_load(&gmem->bindings, index));
return ERR_PTR(-EIO);
}
folio = kvm_gmem_get_folio(file_inode(file), index);
if (IS_ERR(folio))
return folio;
if (folio_test_hwpoison(folio)) {
folio_unlock(folio);
folio_put(folio);
return ERR_PTR(-EHWPOISON);
}
*pfn = folio_file_pfn(folio, index);
if (max_order)
*max_order = 0;
*is_prepared = folio_test_uptodate(folio);
return folio;
}
int kvm_gmem_get_pfn_locked(struct kvm *kvm, struct kvm_memory_slot *slot,
gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
int *max_order)
{
pgoff_t index = kvm_gmem_get_index(slot, gfn);
struct file *file = kvm_gmem_get_file(slot);
struct folio *folio;
bool is_prepared = false;
int r = 0;
if (!file)
return -EFAULT;
folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
if (IS_ERR(folio)) {
r = PTR_ERR(folio);
goto out;
}
if (!is_prepared)
r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
if (!r) {
*page = folio_file_page(folio, index);
} else {
folio_unlock(folio);
folio_put(folio);
}
out:
fput(file);
return r;
}
EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn_locked);
int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
int *max_order)
{
int r = kvm_gmem_get_pfn_locked(kvm, slot, gfn, pfn, page, max_order);
if (!r)
unlock_page(*page);
return r;
}
EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);
#ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM
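/*
 * Populates the GFN range [start_gfn, start_gfn + npages) from the optional
 * userspace buffer @src, invoking @post_populate on each page so the caller
 * can prepare (e.g. encrypt) it. The pages must be private and not yet
 * prepared, and kvm->slots_lock must be held.
 *
 * Returns the number of pages populated, or a negative error code if no page
 * was populated.
 */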
long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
kvm_gmem_populate_cb post_populate, void *opaque)
{
struct file *file;
struct kvm_memory_slot *slot;
void __user *p;
int ret = 0, max_order;
long i;
lockdep_assert_held(&kvm->slots_lock);
if (npages < 0)
return -EINVAL;
slot = gfn_to_memslot(kvm, start_gfn);
if (!kvm_slot_can_be_private(slot))
return -EINVAL;
file = kvm_gmem_get_file(slot);
if (!file)
return -EFAULT;
filemap_invalidate_lock(file->f_mapping);
npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
for (i = 0; i < npages; i += (1 << max_order)) {
struct folio *folio;
gfn_t gfn = start_gfn + i;
pgoff_t index = kvm_gmem_get_index(slot, gfn);
bool is_prepared = false;
kvm_pfn_t pfn;
if (signal_pending(current)) {
ret = -EINTR;
break;
}
folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order);
if (IS_ERR(folio)) {
ret = PTR_ERR(folio);
break;
}
if (is_prepared) {
folio_unlock(folio);
folio_put(folio);
ret = -EEXIST;
break;
}
folio_unlock(folio);
WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
(npages - i) < (1 << max_order));
ret = -EINVAL;
while (!kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order),
KVM_MEMORY_ATTRIBUTE_PRIVATE,
KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
if (!max_order)
goto put_folio_and_exit;
max_order--;
}
p = src ? src + i * PAGE_SIZE : NULL;
ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
if (!ret)
kvm_gmem_mark_prepared(folio);
put_folio_and_exit:
folio_put(folio);
if (ret)
break;
}
filemap_invalidate_unlock(file->f_mapping);
fput(file);
return ret && !i ? ret : i;
}
EXPORT_SYMBOL_GPL(kvm_gmem_populate);
#endif