// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2020 - Google LLC
* Author: Quentin Perret <qperret@google.com>
*/
#include "linux/delay.h"
#include <linux/init.h>
#include <linux/io.h>
#include <linux/kmemleak.h>
#include <linux/kvm_host.h>
#include <linux/memblock.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/of_fdt.h>
#include <linux/of_reserved_mem.h>
#include <linux/sort.h>
#include <linux/debugfs.h>
#include <asm/kvm_host.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_pkvm.h>
#include "hyp_constants.h"
DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized);
static struct reserved_mem *pkvm_firmware_mem;
static phys_addr_t *pvmfw_base = &kvm_nvhe_sym(pvmfw_base);
static phys_addr_t *pvmfw_size = &kvm_nvhe_sym(pvmfw_size);
static struct memblock_region *hyp_memory = kvm_nvhe_sym(hyp_memory);
static unsigned int *hyp_memblock_nr_ptr = &kvm_nvhe_sym(hyp_memblock_nr);
phys_addr_t hyp_mem_base;
phys_addr_t hyp_mem_size;
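
/*
 * sort() comparator: order hyp memblock regions by ascending base address.
 */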
static int cmp_hyp_memblock(const void *p1, const void *p2)
{
const struct memblock_region *r1 = p1;
const struct memblock_region *r2 = p2;
return r1->base < r2->base ? -1 : (r1->base > r2->base);
}
static void __init sort_memblock_regions(void)
{
sort(hyp_memory,
*hyp_memblock_nr_ptr,
sizeof(struct memblock_region),
cmp_hyp_memblock,
NULL);
}
static int __init register_memblock_regions(void)
{
struct memblock_region *reg;
for_each_mem_region(reg) {
if (*hyp_memblock_nr_ptr >= HYP_MEMBLOCK_REGIONS)
return -ENOMEM;
hyp_memory[*hyp_memblock_nr_ptr] = *reg;
(*hyp_memblock_nr_ptr)++;
}
sort_memblock_regions();
return 0;
}
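
/*
 * Reserve the physically contiguous memory the hypervisor will need at EL2:
 * snapshot the host memblocks, size the EL2 data structures (stage-1 and
 * host stage-2 page-tables, VM table, vmemmap, FF-A proxy buffers, per-cpu
 * FP state) and carve the total out of memblock.
 */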
void __init kvm_hyp_reserve(void)
{
u64 hyp_mem_pages = 0;
int ret;
if (!is_hyp_mode_available() || is_kernel_in_hyp_mode())
return;
if (kvm_get_mode() != KVM_MODE_PROTECTED)
return;
ret = register_memblock_regions();
if (ret) {
*hyp_memblock_nr_ptr = 0;
kvm_err("Failed to register hyp memblocks: %d\n", ret);
return;
}
hyp_mem_pages += hyp_s1_pgtable_pages();
hyp_mem_pages += host_s2_pgtable_pages();
hyp_mem_pages += hyp_vm_table_pages();
hyp_mem_pages += hyp_vmemmap_pages(STRUCT_HYP_PAGE_SIZE);
hyp_mem_pages += hyp_ffa_proxy_pages();
hyp_mem_pages += hyp_host_fp_pages(num_possible_cpus());
/*
* Try to allocate a PMD-aligned region to reduce TLB pressure once
	 * this is unmapped from the host stage-2, and fall back to PAGE_SIZE.
*/
hyp_mem_size = hyp_mem_pages << PAGE_SHIFT;
hyp_mem_base = memblock_phys_alloc(ALIGN(hyp_mem_size, PMD_SIZE),
PMD_SIZE);
if (!hyp_mem_base)
hyp_mem_base = memblock_phys_alloc(hyp_mem_size, PAGE_SIZE);
else
hyp_mem_size = ALIGN(hyp_mem_size, PMD_SIZE);
if (!hyp_mem_base) {
kvm_err("Failed to reserve hyp memory\n");
return;
}
kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20,
hyp_mem_base);
}
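
/* Donate @nr_pages host pages to refill the EL2 allocator. */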
static int __pkvm_topup_hyp_alloc(int nr_pages)
{
struct kvm_hyp_memcache mc = {
.head = 0,
.nr_pages = 0,
};
int ret;
ret = topup_hyp_memcache(&mc, 1);
if (ret)
return ret;
return kvm_call_hyp_nvhe(__pkvm_hyp_alloc_refill, mc.head, mc.nr_pages);
}
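
/*
 * Evaluate @__func and, whenever it fails with -ENOMEM, top the EL2
 * allocator up with @__nr_pages pages and retry.
 */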
#define refill_hyp_alloc(__func, __nr_pages) \
({ \
int __ret; \
do { \
__ret = __func; \
if (__ret != -ENOMEM) \
break; \
__ret = __pkvm_topup_hyp_alloc(__nr_pages); \
} while (!__ret); \
__ret; \
})
static int __pkvm_create_hyp_vcpu(struct kvm *host_kvm, struct kvm_vcpu *host_vcpu,
				  unsigned long idx)
{
	pkvm_handle_t handle = host_kvm->arch.pkvm.handle;
	void *hyp_reqs;
	int ret;

	/* Indexing of the vcpus must be sequential, starting at 0. */
	if (WARN_ON(host_vcpu->vcpu_idx != idx))
		return -EINVAL;

	/* One page, shared with EL2, for the hypervisor to post requests to. */
	hyp_reqs = alloc_pages_exact(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
	if (!hyp_reqs)
		return -ENOMEM;

	ret = kvm_share_hyp(hyp_reqs, hyp_reqs + 1);
	if (ret)
		goto free_reqs;

	host_vcpu->arch.hyp_reqs = hyp_reqs;
	ret = refill_hyp_alloc(kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, host_vcpu), 1);
	if (!ret)
		return 0;

	/* Don't leave a dangling pointer to the shared page behind on failure. */
	kvm_unshare_hyp(hyp_reqs, hyp_reqs + 1);
	host_vcpu->arch.hyp_reqs = NULL;
free_reqs:
	free_pages_exact(hyp_reqs, PAGE_SIZE);
	return ret;
}
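
/*
 * Once the vcpu exists at EL2, the hypervisor manages its state; for
 * protected VMs, drop the host's sve_state pointer, which the host must
 * not use from this point on.
 */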
static void __pkvm_vcpu_hyp_created(struct kvm_vcpu *vcpu)
{
if (kvm_vm_is_protected(vcpu->kvm))
vcpu->arch.sve_state = NULL;
}
/*
* Allocates and donates memory for hypervisor VM structs at EL2.
*
* Allocates space for the VM state, which includes the hyp vm as well as
* the hyp vcpus.
*
 * Stores an opaque handle in the kvm struct for future reference.
*
* Return 0 on success, negative error code on failure.
*/
static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
{
struct kvm_vcpu *host_vcpu;
pkvm_handle_t handle;
unsigned long idx;
size_t pgd_sz;
void *pgd;
int ret;
if (host_kvm->created_vcpus < 1)
return -EINVAL;
pgd_sz = kvm_pgtable_stage2_pgd_size(host_kvm->arch.vtcr);
/*
* The PGD pages will be reclaimed using a hyp_memcache which implies
* page granularity. So, use alloc_pages_exact() to get individual
* refcounts.
*/
pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL_ACCOUNT);
if (!pgd)
return -ENOMEM;
/* Donate the VM memory to hyp and let hyp initialize it. */
ret = refill_hyp_alloc(kvm_call_hyp_nvhe(__pkvm_init_vm,
host_kvm, pgd), 4);
if (ret < 0)
goto free_pgd;
handle = ret;
host_kvm->arch.pkvm.handle = handle;
/* Donate memory for the vcpus at hyp and initialize it. */
kvm_for_each_vcpu(idx, host_vcpu, host_kvm) {
ret = __pkvm_create_hyp_vcpu(host_kvm, host_vcpu, idx);
if (ret)
goto destroy_vm;
__pkvm_vcpu_hyp_created(host_vcpu);
}
return 0;
destroy_vm:
pkvm_destroy_hyp_vm(host_kvm);
return ret;
free_pgd:
free_pages_exact(pgd, pgd_sz);
return ret;
}
int pkvm_create_hyp_vm(struct kvm *host_kvm)
{
int ret = 0;
mutex_lock(&host_kvm->lock);
if (!host_kvm->arch.pkvm.handle)
ret = __pkvm_create_hyp_vm(host_kvm);
mutex_unlock(&host_kvm->lock);
return ret;
}
void pkvm_destroy_hyp_vm(struct kvm *host_kvm)
{
struct kvm_pinned_page *ppage;
struct mm_struct *mm = current->mm;
struct rb_node *node;
struct kvm_hyp_req *hyp_reqs;
struct kvm_vcpu *host_vcpu;
unsigned long idx;
int ret;
if (!host_kvm->arch.pkvm.handle)
goto out_free;
	/* Retry until the hypervisor accepts the teardown request. */
	while (true) {
		ret = kvm_call_hyp_nvhe(__pkvm_start_teardown_vm,
					host_kvm->arch.pkvm.handle);
		if (!ret)
			break;

		pr_warn("start teardown returned: %d\n", ret);
		cond_resched();
		msleep(100);
	}
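
	/*
	 * Reclaim every page pinned on behalf of the dying guest, unpin it
	 * and drop the locked-vm accounting.
	 */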
node = rb_first(&host_kvm->arch.pkvm.pinned_pages);
while (node) {
ppage = rb_entry(node, struct kvm_pinned_page, node);
WARN_ON(kvm_call_hyp_nvhe(__pkvm_reclaim_dying_guest_page,
host_kvm->arch.pkvm.handle,
page_to_pfn(ppage->page),
ppage->ipa));
cond_resched();
account_locked_vm(mm, 1, false);
unpin_user_pages_dirty_lock(&ppage->page, 1, true);
node = rb_next(node);
rb_erase(&ppage->node, &host_kvm->arch.pkvm.pinned_pages);
kfree(ppage);
}
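
	/* Unshare and free each vcpu's hyp request page. */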
kvm_for_each_vcpu(idx, host_vcpu, host_kvm) {
hyp_reqs = host_vcpu->arch.hyp_reqs;
if (!hyp_reqs)
continue;
kvm_unshare_hyp(hyp_reqs, hyp_reqs + 1);
free_pages_exact(hyp_reqs, PAGE_SIZE);
}
	WARN_ON(kvm_call_hyp_nvhe(__pkvm_finalize_teardown_vm,
				  host_kvm->arch.pkvm.handle));
out_free:
host_kvm->arch.pkvm.handle = 0;
free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc);
}
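
/*
 * Illustrative only: userspace opts in to a protected VM by passing
 * KVM_VM_TYPE_ARM_PROTECTED in the machine type when creating the VM,
 * e.g. ioctl(kvm_fd, KVM_CREATE_VM, KVM_VM_TYPE_ARM_PROTECTED).
 */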
int pkvm_init_host_vm(struct kvm *host_kvm, unsigned long type)
{
mutex_init(&host_kvm->lock);
if (!(type & KVM_VM_TYPE_ARM_PROTECTED)) {
kvm_info("Initialized non-protected VM.");
return 0;
}
if (!is_protected_kvm_enabled())
return -EINVAL;
host_kvm->arch.pkvm.pvmfw_load_addr = PVMFW_INVALID_LOAD_ADDR;
kvm_info("Initialized protected VM.");
host_kvm->arch.pkvm.enabled = true;
return 0;
}
static void __init _kvm_host_prot_finalize(void *arg)
{
int *err = arg;
if (WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize)))
WRITE_ONCE(*err, -EINVAL);
}
static int __init pkvm_drop_host_privileges(void)
{
int ret = 0;
/*
* Flip the static key upfront as that may no longer be possible
* once the host stage 2 is installed.
*/
static_branch_enable(&kvm_protected_mode_initialized);
on_each_cpu(_kvm_host_prot_finalize, &ret, 1);
return ret;
}
static int __init finalize_pkvm(void)
{
int ret;
if (!is_protected_kvm_enabled())
return 0;
/*
* Exclude HYP sections from kmemleak so that they don't get peeked
* at, which would end badly once inaccessible.
*/
kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start);
kmemleak_free_part_phys(hyp_mem_base, hyp_mem_size);
ret = pkvm_drop_host_privileges();
if (ret)
pr_err("Failed to finalize Hyp protection: %d\n", ret);
return ret;
}
device_initcall_sync(finalize_pkvm);
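
/* rb_find() comparator: look a pinned page up by its guest IPA. */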
static int rb_ppage_cmp(const void *key, const struct rb_node *node)
{
struct kvm_pinned_page *p = container_of(node, struct kvm_pinned_page, node);
phys_addr_t ipa = (phys_addr_t)key;
return (ipa < p->ipa) ? -1 : (ipa > p->ipa);
}
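
/*
 * Drop the host-side tracking for the pinned page at @ipa: remove it from
 * the tree, unpin it and release the locked-vm accounting.
 */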
void pkvm_host_reclaim_page(struct kvm *host_kvm, phys_addr_t ipa)
{
struct kvm_pinned_page *ppage;
struct mm_struct *mm = current->mm;
struct rb_node *node;
write_lock(&host_kvm->mmu_lock);
node = rb_find((void *)ipa, &host_kvm->arch.pkvm.pinned_pages,
rb_ppage_cmp);
if (node)
rb_erase(node, &host_kvm->arch.pkvm.pinned_pages);
write_unlock(&host_kvm->mmu_lock);
	if (WARN_ON(!node))
		return;
ppage = container_of(node, struct kvm_pinned_page, node);
account_locked_vm(mm, 1, false);
unpin_user_pages_dirty_lock(&ppage->page, 1, true);
kfree(ppage);
}
static int __init pkvm_firmware_rmem_err(struct reserved_mem *rmem,
const char *reason)
{
phys_addr_t end = rmem->base + rmem->size;
kvm_err("Ignoring pkvm guest firmware memory reservation [%pa - %pa]: %s\n",
&rmem->base, &end, reason);
return -EINVAL;
}
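
/*
 * A minimal sketch of the devicetree reservation this parser expects
 * (label, unit address and size are arbitrary placeholders):
 *
 *	reserved-memory {
 *		#address-cells = <2>;
 *		#size-cells = <2>;
 *
 *		pkvm_firmware: pkvm-guest-firmware@80000000 {
 *			compatible = "linux,pkvm-guest-firmware-memory";
 *			reg = <0x0 0x80000000 0x0 0x10000>;
 *			no-map;
 *		};
 *	};
 */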
static int __init pkvm_firmware_rmem_init(struct reserved_mem *rmem)
{
unsigned long node = rmem->fdt_node;
if (pkvm_firmware_mem)
return pkvm_firmware_rmem_err(rmem, "duplicate reservation");
if (!of_get_flat_dt_prop(node, "no-map", NULL))
return pkvm_firmware_rmem_err(rmem, "missing \"no-map\" property");
if (of_get_flat_dt_prop(node, "reusable", NULL))
return pkvm_firmware_rmem_err(rmem, "\"reusable\" property unsupported");
if (!PAGE_ALIGNED(rmem->base))
return pkvm_firmware_rmem_err(rmem, "base is not page-aligned");
if (!PAGE_ALIGNED(rmem->size))
return pkvm_firmware_rmem_err(rmem, "size is not page-aligned");
*pvmfw_size = rmem->size;
*pvmfw_base = rmem->base;
pkvm_firmware_mem = rmem;
return 0;
}
RESERVEDMEM_OF_DECLARE(pkvm_firmware, "linux,pkvm-guest-firmware-memory",
pkvm_firmware_rmem_init);
static int __init pkvm_firmware_rmem_clear(void)
{
void *addr;
phys_addr_t size;
if (likely(!pkvm_firmware_mem) || is_protected_kvm_enabled())
return 0;
kvm_info("Clearing unused pKVM firmware memory\n");
size = pkvm_firmware_mem->size;
addr = memremap(pkvm_firmware_mem->base, size, MEMREMAP_WB);
if (!addr)
return -EINVAL;
memset(addr, 0, size);
dcache_clean_poc((unsigned long)addr, (unsigned long)addr + size);
memunmap(addr);
return 0;
}
device_initcall_sync(pkvm_firmware_rmem_clear);
static int pkvm_vm_ioctl_set_fw_ipa(struct kvm *kvm, u64 ipa)
{
int ret = 0;
if (!pkvm_firmware_mem)
return -EINVAL;
mutex_lock(&kvm->lock);
if (kvm->arch.pkvm.handle) {
ret = -EBUSY;
goto out_unlock;
}
kvm->arch.pkvm.pvmfw_load_addr = ipa;
out_unlock:
mutex_unlock(&kvm->lock);
return ret;
}
static int pkvm_vm_ioctl_info(struct kvm *kvm,
struct kvm_protected_vm_info __user *info)
{
	struct kvm_protected_vm_info kinfo = {
		.firmware_size = pkvm_firmware_mem ? pkvm_firmware_mem->size : 0,
	};
return copy_to_user(info, &kinfo, sizeof(kinfo)) ? -EFAULT : 0;
}
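
/*
 * Dispatcher for the protected-VM KVM_ENABLE_CAP sub-commands. A minimal
 * sketch of a userspace call, assuming the capability number exposed by the
 * uapi headers (fw_ipa is a placeholder):
 *
 *	struct kvm_enable_cap cap = {
 *		.cap = KVM_CAP_ARM_PROTECTED_VM,
 *		.flags = KVM_CAP_ARM_PROTECTED_VM_FLAGS_SET_FW_IPA,
 *		.args = { fw_ipa },
 *	};
 *	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
 */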
int pkvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
{
if (!kvm_vm_is_protected(kvm))
return -EINVAL;
if (cap->args[1] || cap->args[2] || cap->args[3])
return -EINVAL;
switch (cap->flags) {
case KVM_CAP_ARM_PROTECTED_VM_FLAGS_SET_FW_IPA:
return pkvm_vm_ioctl_set_fw_ipa(kvm, cap->args[0]);
case KVM_CAP_ARM_PROTECTED_VM_FLAGS_INFO:
return pkvm_vm_ioctl_info(kvm, (void __force __user *)cap->args[0]);
default:
return -EINVAL;
}
}
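
/* Writing anything to this debugfs file fires the EL2 selftest event. */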
static ssize_t event_write(struct file *f, const char __user *buf,
size_t size, loff_t *pos)
{
kvm_call_hyp_nvhe(__pkvm_selftest_event);
return size;
}
static const struct file_operations event_fops = {
.write = event_write,
.llseek = default_llseek,
};
static int __init pkvm_selftest_init(void)
{
debugfs_create_file("pkvm_selftest_event", 0200, NULL, NULL,
&event_fops);
return 0;
}
device_initcall(pkvm_selftest_init);