// SPDX-License-Identifier: GPL-2.0
/*
* IOMMU operations for pKVM
*
* Copyright (C) 2022 Linaro Ltd.
*/
#include <asm/kvm_hyp.h>
#include <kvm/iommu.h>
#include <kvm/pl011.h>
#include <nvhe/iommu.h>
#include <nvhe/mem_protect.h>
#include <nvhe/mm.h>
/* The hypervisor is non-preemptible, so cur_context can be per-CPU. */
DEFINE_PER_CPU(struct pkvm_hyp_vcpu *, __cur_context);
#define cur_context (*this_cpu_ptr(&__cur_context))
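/*
 * cur_context is set by operations issued on behalf of a guest vCPU, so that
 * kvm_iommu_donate_page() allocates from (and accounts to) that guest's
 * allocator. It is NULL for host-initiated operations.
 */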
struct kvm_hyp_iommu_memcache __ro_after_init *kvm_hyp_iommu_memcaches;
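/*
 * Build a transient io_pgtable for @_domain, with a kvm_iommu_tlb_cookie
 * identifying the IOMMU, domain ID and domain.
 */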
#define domain_to_iopt(_iommu, _domain, _domain_id) \
(struct io_pgtable) { \
.ops = &(_domain)->pgtable->ops, \
.pgd = (_domain)->pgd, \
.cookie = &(struct kvm_iommu_tlb_cookie) { \
.iommu = (_iommu), \
.domain_id = (_domain_id), \
.domain = (_domain), \
}, \
}
void *kvm_iommu_donate_page(void)
{
void *p;
int cpu = hyp_smp_processor_id();
struct kvm_hyp_memcache tmp = kvm_hyp_iommu_memcaches[cpu].pages;
	/* For vCPUs, only use the guest's allocator, as its pages are accounted. */
if (cur_context) {
p = guest_alloc_contig_pages(cur_context, 1);
return p;
}
if (!tmp.nr_pages) {
kvm_hyp_iommu_memcaches[cpu].needs_page = true;
return NULL;
}
p = pkvm_admit_host_page(&tmp);
if (!p)
return NULL;
kvm_hyp_iommu_memcaches[cpu].pages = tmp;
memset(p, 0, PAGE_SIZE);
return p;
}
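/* Return a page obtained from kvm_iommu_donate_page() to the per-CPU memcache. */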
void kvm_iommu_reclaim_page(void *p)
{
int cpu = hyp_smp_processor_id();
pkvm_teardown_donated_memory(&kvm_hyp_iommu_memcaches[cpu].pages, p,
PAGE_SIZE);
}
static int access_allowed(struct pkvm_hyp_vcpu *ctxt,
struct kvm_hyp_iommu_domain *domain)
{
if (!ctxt && domain->vm)
return -EPERM;
if (ctxt && (domain->vm != pkvm_hyp_vcpu_to_hyp_vm(ctxt)))
return -EPERM;
return 0;
}
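/*
 * Return the domain entry for @domain_id, allocating the leaf page of the
 * two-level domains table if needed: the upper bits of @domain_id index the
 * root table (iommu->domains) and the low KVM_IOMMU_DOMAIN_ID_SPLIT bits index
 * into a leaf page. For illustration, assuming an 8-bit split, domain_id
 * 0x1234 would resolve to domains[0x12][0x34]. Guests may only use the top
 * half of the ID space, the host only the bottom half.
 */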
static struct kvm_hyp_iommu_domain *
handle_to_domain(pkvm_handle_t iommu_id, pkvm_handle_t domain_id,
struct kvm_hyp_iommu **out_iommu, struct pkvm_hyp_vcpu *ctxt)
{
int idx;
struct kvm_hyp_iommu *iommu;
struct kvm_hyp_iommu_domain *domains;
iommu = kvm_iommu_ops->get_iommu_by_id(iommu_id);
if (!iommu)
return NULL;
	/* Guests only have access to the top half of the domain IDs. */
	if (ctxt && (domain_id < (iommu->nr_domains >> 1)))
		return NULL;
	/* Host only has access to the bottom half of the domain IDs. */
	if (!ctxt && (domain_id >= (iommu->nr_domains >> 1)))
		return NULL;
domain_id = array_index_nospec(domain_id, iommu->nr_domains);
idx = domain_id >> KVM_IOMMU_DOMAIN_ID_SPLIT;
domains = iommu->domains[idx];
if (!domains) {
		/*
		 * Even with a guest context, don't allocate new domain pages
		 * from the guest memcache: one domain page can hold the
		 * domains of several VMs, so allocate it from the host
		 * memcache to keep the data valid after a single guest
		 * teardown.
		 */
domains = kvm_iommu_donate_page();
if (!domains)
return NULL;
iommu->domains[idx] = domains;
}
*out_iommu = iommu;
return &domains[domain_id & KVM_IOMMU_DOMAIN_ID_LEAF_MASK];
}
int kvm_iommu_alloc_domain_nolock(pkvm_handle_t iommu_id, pkvm_handle_t domain_id,
unsigned long pgd_hva, int type, struct pkvm_hyp_vcpu *ctxt)
{
int ret = -EINVAL;
struct io_pgtable iopt;
struct kvm_hyp_iommu *iommu;
struct kvm_hyp_iommu_domain *domain;
size_t pgd_size;
domain = handle_to_domain(iommu_id, domain_id, &iommu, ctxt);
if (!domain)
return ret;
if (domain->refs)
return ret;
ret = kvm_iommu_ops->alloc_domain(iommu_id, &domain->pgtable, type);
if (ret)
return ret;
iopt = domain_to_iopt(iommu, domain, domain_id);
pgd_size = kvm_iommu_ops->iopt_size(&iopt);
if (ctxt) {
pgd_hva = (unsigned long)guest_alloc_contig_pages(ctxt, pgd_size >> PAGE_SHIFT);
if (!pgd_hva)
return -ENOMEM;
domain->vm = pkvm_hyp_vcpu_to_hyp_vm(ctxt);
	} else {
pgd_hva = (unsigned long)pkvm_map_donated_memory(pgd_hva, pgd_size);
}
ret = kvm_iommu_ops->alloc_iopt(&iopt, pgd_hva);
if (ret)
return ret;
domain->refs = 1;
domain->pgd = iopt.pgd;
return 0;
}
int kvm_iommu_alloc_domain(pkvm_handle_t iommu_id, pkvm_handle_t domain_id,
unsigned long pgd_hva, int type, struct pkvm_hyp_vcpu *ctxt)
{
int ret;
struct kvm_hyp_iommu *iommu = kvm_iommu_ops->get_iommu_by_id(iommu_id);
hyp_spin_lock(&iommu->iommu_lock);
ret = kvm_iommu_alloc_domain_nolock(iommu_id, domain_id, pgd_hva, type, ctxt);
hyp_spin_unlock(&iommu->iommu_lock);
return ret;
}
int kvm_iommu_free_domain(pkvm_handle_t iommu_id, pkvm_handle_t domain_id,
struct pkvm_hyp_vcpu *ctxt)
{
int ret = -EINVAL;
struct io_pgtable iopt;
struct kvm_hyp_iommu *iommu = kvm_iommu_ops->get_iommu_by_id(iommu_id);
struct kvm_hyp_iommu_domain *domain;
hyp_spin_lock(&iommu->iommu_lock);
domain = handle_to_domain(iommu_id, domain_id, &iommu, ctxt);
if (!domain)
goto out_unlock;
ret = access_allowed(ctxt, domain);
if (ret)
		goto out_unlock;
if (domain->refs != 1)
goto out_unlock;
iopt = domain_to_iopt(iommu, domain, domain_id);
ret = kvm_iommu_ops->free_iopt(&iopt);
memset(domain, 0, sizeof(*domain));
out_unlock:
hyp_spin_unlock(&iommu->iommu_lock);
return ret;
}
int kvm_iommu_attach_dev(pkvm_handle_t iommu_id, pkvm_handle_t domain_id,
u32 endpoint_id, u32 pasid,
u32 pasid_bits, struct pkvm_hyp_vcpu *ctxt)
{
int ret = -EINVAL;
struct kvm_hyp_iommu *iommu = kvm_iommu_ops->get_iommu_by_id(iommu_id);
struct kvm_hyp_iommu_domain *domain;
hyp_spin_lock(&iommu->iommu_lock);
cur_context = ctxt;
domain = handle_to_domain(iommu_id, domain_id, &iommu, ctxt);
if (!domain || !domain->refs || domain->refs == UINT_MAX)
goto out_unlock;
ret = access_allowed(ctxt, domain);
if (ret)
		goto out_unlock;
ret = kvm_iommu_ops->attach_dev(iommu, domain_id, domain, endpoint_id, pasid,
pasid_bits);
if (ret)
goto out_unlock;
domain->refs++;
out_unlock:
cur_context = NULL;
hyp_spin_unlock(&iommu->iommu_lock);
return ret;
}
int kvm_iommu_detach_dev(pkvm_handle_t iommu_id, pkvm_handle_t domain_id,
u32 endpoint_id, u32 pasid, struct pkvm_hyp_vcpu *ctxt)
{
int ret = -EINVAL;
struct kvm_hyp_iommu *iommu = kvm_iommu_ops->get_iommu_by_id(iommu_id);
struct kvm_hyp_iommu_domain *domain;
hyp_spin_lock(&iommu->iommu_lock);
domain = handle_to_domain(iommu_id, domain_id, &iommu, ctxt);
if (!domain || domain->refs <= 1)
goto out_unlock;
ret = access_allowed(ctxt, domain);
if (ret)
		goto out_unlock;
ret = kvm_iommu_ops->detach_dev(iommu, domain_id, domain, endpoint_id, pasid);
if (ret)
goto out_unlock;
domain->refs--;
out_unlock:
hyp_spin_unlock(&iommu->iommu_lock);
return ret;
}
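/*
 * Detach @endpoint_id/@pasid without going through a domain: no domain is
 * passed to the driver and no refcount is adjusted.
 */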
int kvm_iommu_force_detach_dev(pkvm_handle_t iommu_id, u32 endpoint_id, u32 pasid)
{
int ret;
struct kvm_hyp_iommu *iommu = kvm_iommu_ops->get_iommu_by_id(iommu_id);
hyp_spin_lock(&iommu->iommu_lock);
	/*
	 * The driver is not expected to use the domain in a detach operation,
	 * so pass NULL.
	 */
ret = kvm_iommu_ops->detach_dev(iommu, 0, NULL, endpoint_id, pasid);
hyp_spin_unlock(&iommu->iommu_lock);
return ret;
}
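/*
 * Unmap [iova, iova + pgsize * pgcount) one page or block at a time,
 * optionally unsharing each physical page that backed the mapping.
 */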
static int __kvm_iommu_unmap_pages(struct io_pgtable *iopt, unsigned long iova,
size_t pgsize, size_t pgcount, bool unshare,
struct pkvm_hyp_vcpu *ctxt)
{
int ret;
size_t unmapped;
phys_addr_t paddr;
size_t total_unmapped = 0;
size_t size = pgsize * pgcount;
while (total_unmapped < size) {
paddr = iopt_iova_to_phys(iopt, iova);
if (paddr == 0)
return -EINVAL;
/*
* One page/block at a time, because the range provided may not
* be physically contiguous, and we need to unshare all physical
* pages.
*/
unmapped = iopt_unmap_pages(iopt, iova, pgsize, 1, NULL);
if (!unmapped)
return -EINVAL;
if (unshare) {
ret = __pkvm_unshare_dma(paddr, pgsize, ctxt);
if (ret)
return ret;
}
iova += unmapped;
pgcount -= unmapped / pgsize;
total_unmapped += unmapped;
}
return 0;
}
#define IOMMU_PROT_MASK (IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE |\
IOMMU_NOEXEC | IOMMU_MMIO)
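/*
 * Map @pgcount pages of size @pgsize at @iova in the domain's page table,
 * after sharing the physical range with the device via __pkvm_share_dma().
 * On failure, pages that were already mapped are unmapped and unshared again.
 */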
int kvm_iommu_map_pages(pkvm_handle_t iommu_id, pkvm_handle_t domain_id,
unsigned long iova, phys_addr_t paddr, size_t pgsize,
size_t pgcount, int prot, struct pkvm_hyp_vcpu *ctxt)
{
size_t size;
size_t granule;
int ret = -EINVAL;
size_t mapped = 0;
struct io_pgtable iopt;
struct kvm_hyp_iommu *iommu = kvm_iommu_ops->get_iommu_by_id(iommu_id);
size_t pgcount_orig = pgcount;
unsigned long iova_orig = iova;
phys_addr_t orig_paddr = paddr;
struct kvm_hyp_iommu_domain *domain;
cur_context = ctxt;
if (prot & ~IOMMU_PROT_MASK)
return -EINVAL;
if (__builtin_mul_overflow(pgsize, pgcount, &size) ||
iova + size < iova || paddr + size < paddr)
return -EOVERFLOW;
hyp_spin_lock(&iommu->iommu_lock);
domain = handle_to_domain(iommu_id, domain_id, &iommu, ctxt);
if (!domain)
goto err_unlock;
ret = access_allowed(ctxt, domain);
if (ret)
		goto err_unlock;
granule = 1 << __ffs(domain->pgtable->cfg.pgsize_bitmap);
if (!IS_ALIGNED(iova | paddr | pgsize, granule))
goto err_unlock;
ret = __pkvm_share_dma(paddr, size, ctxt);
if (ret)
goto err_unlock;
iopt = domain_to_iopt(iommu, domain, domain_id);
while (pgcount) {
ret = iopt_map_pages(&iopt, iova, paddr, pgsize, pgcount, prot,
0, &mapped);
WARN_ON(!IS_ALIGNED(mapped, pgsize));
pgcount -= mapped / pgsize;
if (ret)
goto err_unmap;
iova += mapped;
paddr += mapped;
}
cur_context = NULL;
hyp_spin_unlock(&iommu->iommu_lock);
return 0;
err_unmap:
__pkvm_unshare_dma(orig_paddr, size, ctxt);
__kvm_iommu_unmap_pages(&iopt, iova_orig, pgsize, pgcount_orig - pgcount, false, ctxt);
err_unlock:
cur_context = NULL;
hyp_spin_unlock(&iommu->iommu_lock);
return ret;
}
int kvm_iommu_unmap_pages(pkvm_handle_t iommu_id, pkvm_handle_t domain_id,
unsigned long iova, size_t pgsize, size_t pgcount,
struct pkvm_hyp_vcpu *ctxt)
{
size_t size;
size_t granule;
int ret = -EINVAL;
struct io_pgtable iopt;
struct kvm_hyp_iommu *iommu = kvm_iommu_ops->get_iommu_by_id(iommu_id);
struct kvm_hyp_iommu_domain *domain;
if (__builtin_mul_overflow(pgsize, pgcount, &size) ||
iova + size < iova)
return -EOVERFLOW;
hyp_spin_lock(&iommu->iommu_lock);
domain = handle_to_domain(iommu_id, domain_id, &iommu, ctxt);
if (!domain)
goto out_unlock;
ret = access_allowed(ctxt, domain);
if (ret)
		goto out_unlock;
granule = 1 << __ffs(domain->pgtable->cfg.pgsize_bitmap);
if (!IS_ALIGNED(iova | pgsize, granule))
goto out_unlock;
iopt = domain_to_iopt(iommu, domain, domain_id);
ret = __kvm_iommu_unmap_pages(&iopt, iova, pgsize, pgcount, true, ctxt);
out_unlock:
hyp_spin_unlock(&iommu->iommu_lock);
return ret;
}
phys_addr_t kvm_iommu_iova_to_phys(pkvm_handle_t iommu_id,
pkvm_handle_t domain_id, unsigned long iova,
struct pkvm_hyp_vcpu *ctxt)
{
phys_addr_t phys = 0;
int ret;
struct io_pgtable iopt;
struct kvm_hyp_iommu *iommu = kvm_iommu_ops->get_iommu_by_id(iommu_id);
struct kvm_hyp_iommu_domain *domain;
hyp_spin_lock(&iommu->iommu_lock);
	domain = handle_to_domain(iommu_id, domain_id, &iommu, ctxt);
	if (domain) {
		ret = access_allowed(ctxt, domain);
		if (!ret) {
			iopt = domain_to_iopt(iommu, domain, domain_id);
			phys = iopt_iova_to_phys(&iopt, iova);
		}
	}
hyp_spin_unlock(&iommu->iommu_lock);
return phys;
}
static int iommu_power_on(struct kvm_power_domain *pd)
{
struct kvm_hyp_iommu *iommu = container_of(pd, struct kvm_hyp_iommu,
power_domain);
pkvm_debug("%s\n", __func__);
/*
* We currently assume that the device retains its architectural state
* across power off, hence no save/restore.
*/
hyp_spin_lock(&iommu->iommu_lock);
iommu->power_is_off = false;
hyp_spin_unlock(&iommu->iommu_lock);
return 0;
}
static int iommu_power_off(struct kvm_power_domain *pd)
{
struct kvm_hyp_iommu *iommu = container_of(pd, struct kvm_hyp_iommu,
power_domain);
pkvm_debug("%s\n", __func__);
hyp_spin_lock(&iommu->iommu_lock);
iommu->power_is_off = true;
hyp_spin_unlock(&iommu->iommu_lock);
return 0;
}
static const struct kvm_power_domain_ops iommu_power_ops = {
.power_on = iommu_power_on,
.power_off = iommu_power_off,
};
/*
 * Guests need to pass a domain ID when dealing with IOMMUs, and we must ensure
 * that guest IDs don't collide with each other. We could reserve a fixed range
 * per VM and keep track of it, but it is more flexible to allocate one domain
 * ID at a time: a VM then has no fixed maximum number of domains and we don't
 * need to track per-guest ranges for every IOMMU.
 */
int kvm_iommu_alloc_guest_domain(pkvm_handle_t iommu_id, struct pkvm_hyp_vcpu *ctxt,
pkvm_handle_t *ret_domain)
{
struct kvm_hyp_iommu *iommu = kvm_iommu_ops->get_iommu_by_id(iommu_id);
pkvm_handle_t domain_id = (iommu->nr_domains >> 1);
struct kvm_hyp_iommu_domain *domain;
int ret;
unsigned long pgd_hva = 0;
if (!ret_domain)
return -EINVAL;
hyp_spin_lock(&iommu->iommu_lock);
cur_context = ctxt;
	/*
	 * Not optimal, but good enough for guests: this operation is rare and
	 * guests don't allocate many domains.
	 */
for ( ; domain_id < iommu->nr_domains; ++domain_id) {
domain = handle_to_domain(iommu_id, domain_id, &iommu, ctxt);
if (!domain) {
ret = -ENOMEM;
goto out_unlock;
}
/* A free domain we can use. */
if (domain->refs == 0)
break;
}
	/* Out of domains to allocate. */
	if (domain_id == iommu->nr_domains) {
ret = -EBUSY;
goto out_unlock;
}
ret = kvm_iommu_alloc_domain_nolock(iommu_id, domain_id, pgd_hva, 2, ctxt);
*ret_domain = domain_id;
out_unlock:
	cur_context = NULL;
hyp_spin_unlock(&iommu->iommu_lock);
return ret;
}
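/*
 * Per-IOMMU initialization: set up the lock and power domain, convert the
 * domains root table pointer to a hyp VA and map it into the hypervisor.
 */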
int kvm_iommu_init_device(struct kvm_hyp_iommu *iommu)
{
int ret;
void *domains;
hyp_spin_lock_init(&iommu->iommu_lock);
ret = pkvm_init_power_domain(&iommu->power_domain, &iommu_power_ops);
if (ret)
return ret;
domains = iommu->domains;
iommu->domains = kern_hyp_va(domains);
return pkvm_create_mappings(iommu->domains, iommu->domains +
KVM_IOMMU_DOMAINS_ROOT_ENTRIES, PAGE_HYP);
}
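/*
 * Global initialization: check that the driver implements all mandatory ops
 * and map the host-shared per-CPU memcaches into the hypervisor.
 */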
int kvm_iommu_init(void)
{
enum kvm_pgtable_prot prot;
if (WARN_ON(!kvm_iommu_ops->get_iommu_by_id ||
!kvm_iommu_ops->alloc_iopt ||
!kvm_iommu_ops->free_iopt ||
!kvm_iommu_ops->iopt_size ||
!kvm_iommu_ops->attach_dev ||
!kvm_iommu_ops->alloc_domain ||
!kvm_iommu_ops->detach_dev))
return -ENODEV;
/* The memcache is shared with the host */
prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_OWNED);
return pkvm_create_mappings(kvm_hyp_iommu_memcaches,
kvm_hyp_iommu_memcaches + NR_CPUS, prot);
}