// SPDX-License-Identifier: GPL-2.0
/*
* IOMMU operations for pKVM
*
* Copyright (C) 2022 Linaro Ltd.
*/
#include <asm/kvm_hyp.h>
#include <hyp/adjust_pc.h>
#include <kvm/iommu.h>
#include <nvhe/alloc_mgt.h>
#include <nvhe/iommu.h>
#include <nvhe/mem_protect.h>
#include <nvhe/mm.h>
#include <nvhe/pviommu-host.h>
enum {
IOMMU_DRIVER_NOT_READY = 0,
IOMMU_DRIVER_INITIALIZING,
IOMMU_DRIVER_READY,
};
static atomic_t kvm_iommu_initialized;
void **kvm_hyp_iommu_domains;
static struct hyp_pool iommu_idmap_pool;
static struct hyp_pool iommu_host_pool;
static int snapshot_host_stage2(void);
/*
 * This lock protects domain operations that can't be done using the atomic
 * refcount. It is only used for alloc/free of domains, so it shouldn't add
 * much overhead as these are rare operations, while map/unmap are left
 * lockless.
 */
static DEFINE_HYP_SPINLOCK(iommu_domains_lock);
/* The hypervisor is non-preemptible, so cur_context can be per-CPU. */
DEFINE_PER_CPU(struct pkvm_hyp_vcpu *, __cur_context);
#define cur_context (*this_cpu_ptr(&__cur_context))
DECLARE_PER_CPU(struct kvm_hyp_req, host_hyp_reqs);
static void host_lock_component(void)
{
hyp_spin_lock(&host_mmu.lock);
}
static void host_unlock_component(void)
{
hyp_spin_unlock(&host_mmu.lock);
}
struct pkvm_hyp_vcpu *__get_ctxt(void)
{
return this_cpu_ptr(&kvm_host_data)->host_ctxt.__hyp_running_vcpu;
}
void hyp_iommu_lock(struct kvm_hyp_iommu *iommu)
{
hyp_spin_lock(&iommu->iommu_lock);
}
void hyp_iommu_unlock(struct kvm_hyp_iommu *iommu)
{
hyp_spin_unlock(&iommu->iommu_lock);
}
void hyp_assert_iommu_lock_held(struct kvm_hyp_iommu *iommu)
{
hyp_assert_lock_held(&iommu->iommu_lock);
}
void hyp_domains_lock(void)
{
hyp_spin_lock(&iommu_domains_lock);
}
void hyp_domains_unlock(void)
{
hyp_spin_unlock(&iommu_domains_lock);
}
static inline bool kvm_iommu_acquire_init(void)
{
return atomic_cmpxchg_acquire(&kvm_iommu_initialized, IOMMU_DRIVER_NOT_READY,
IOMMU_DRIVER_INITIALIZING) == IOMMU_DRIVER_NOT_READY;
}
static inline void kvm_iommu_release_init(void)
{
atomic_set_release(&kvm_iommu_initialized, IOMMU_DRIVER_READY);
}
static inline bool kvm_iommu_is_ready(void)
{
return atomic_read(&kvm_iommu_initialized) == IOMMU_DRIVER_READY;
}
void *kvm_iommu_donate_pages(u8 order, bool fill_req)
{
void *p;
struct pkvm_hyp_vcpu *ctxt = __get_ctxt();
struct kvm_hyp_req *req = this_cpu_ptr(&host_hyp_reqs);
	/* For vCPUs, only use the guest's own allocator. */
if (ctxt) {
p = guest_alloc_contig_pages(ctxt, order);
if (!p && fill_req) {
req = pkvm_hyp_req_reserve(ctxt, KVM_HYP_REQ_MEM);
goto ret_fill_req;
}
return p;
}
p = hyp_alloc_pages(&iommu_host_pool, order);
if (p)
return p;
ret_fill_req:
if (fill_req) {
req->type = KVM_HYP_REQ_MEM;
req->mem.dest = REQ_MEM_IOMMU;
req->mem.sz_alloc = (1 << order) * PAGE_SIZE;
req->mem.nr_pages = 1;
}
return NULL;
}
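/*
 * A minimal usage sketch (hypothetical caller): on failure, the
 * KVM_HYP_REQ_MEM request filled above tells the requester (host or guest)
 * to top up the IOMMU pool and retry the hypercall, so a caller simply
 * bails out with -ENOMEM:
 *
 *	void *p = kvm_iommu_donate_pages(0, true);
 *
 *	if (!p)
 *		return -ENOMEM;	// requester refills the pool and retries
 */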
void *kvm_iommu_donate_pgtable_pages(struct io_pgtable *iop, u8 order, bool fill_req)
{
if (iop && ((struct kvm_iommu_tlb_cookie *)iop->cookie)->domain_id ==
KVM_IOMMU_IDMAPPED_DOMAIN) {
return hyp_alloc_pages(&iommu_idmap_pool, order);
} else {
return kvm_iommu_donate_pages(order, fill_req);
}
}
void kvm_iommu_reclaim_pages(void *p, u8 order)
{
struct pkvm_hyp_vcpu *ctxt = __get_ctxt();
	/* The guest may not be loaded, but we may be in a teardown context. */
if (!ctxt)
ctxt = cur_context;
if (ctxt)
guest_free_contig_pages(ctxt, p, order);
else
hyp_put_page(&iommu_host_pool, p);
}
int kvm_iommu_refill(struct kvm_hyp_memcache *host_mc)
{
void *p;
unsigned long order;
while (host_mc->nr_pages) {
order = host_mc->head & (PAGE_SIZE - 1);
		p = pkvm_admit_host_page(host_mc, order);
		if (!p)
			return -EINVAL;
		hyp_virt_to_page(p)->order = order;
hyp_set_page_refcounted(hyp_virt_to_page(p));
hyp_put_page(&iommu_host_pool, p);
}
return 0;
}
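/*
 * A sketch of the memcache encoding relied on above (illustrative only, the
 * authoritative definition lives with the memcache helpers): each entry is
 * assumed to carry the page order in the low bits of the page-aligned
 * physical address, i.e.
 *
 *	order = head & (PAGE_SIZE - 1);
 *	phys  = head & PAGE_MASK;
 *
 * kvm_iommu_reclaim() below pushes order-0 pages back with
 * push_hyp_memcache(host_mc, p, hyp_virt_to_phys, 0), matching this layout.
 */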
void kvm_iommu_reclaim(struct kvm_hyp_memcache *host_mc, int target)
{
void *p;
while (target--) {
p = hyp_alloc_pages(&iommu_idmap_pool, 0);
if (!p)
return;
push_hyp_memcache(host_mc, p, hyp_virt_to_phys, 0);
WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(p), 1));
}
}
int kvm_iommu_reclaimable(void)
{
return hyp_pool_free_pages(&iommu_host_pool);
}
struct hyp_mgt_allocator_ops kvm_iommu_allocator_ops = {
.refill = kvm_iommu_refill,
.reclaim = kvm_iommu_reclaim,
.reclaimable = kvm_iommu_reclaimable,
};
static struct kvm_hyp_iommu_domain *
handle_to_domain(pkvm_handle_t domain_id)
{
int idx;
struct kvm_hyp_iommu_domain *domains;
if (domain_id >= KVM_IOMMU_MAX_DOMAINS)
return NULL;
domain_id = array_index_nospec(domain_id, KVM_IOMMU_MAX_DOMAINS);
idx = domain_id >> KVM_IOMMU_DOMAIN_ID_SPLIT;
domains = (struct kvm_hyp_iommu_domain *)READ_ONCE(kvm_hyp_iommu_domains[idx]);
if (!domains) {
domains = kvm_iommu_donate_pages(0, true);
if (!domains)
return NULL;
/*
* handle_to_domain() does not have to be called under a lock,
* but even though we allocate a leaf in all cases, it's only
* really a valid thing to do under alloc_domain(), which uses a
* lock. Races are therefore a host bug and we don't need to be
* delicate about it.
*/
if (WARN_ON(cmpxchg64_relaxed(&kvm_hyp_iommu_domains[idx], 0,
(void *)domains) != 0))
return NULL;
}
return &domains[domain_id & KVM_IOMMU_DOMAIN_ID_LEAF_MASK];
}
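/*
 * Worked example of the two-level lookup above (constants are illustrative,
 * the real values come from <kvm/iommu.h>): with KVM_IOMMU_DOMAIN_ID_SPLIT
 * == 9, each leaf holds 512 domains, so domain_id 1027 resolves to
 *
 *	idx  = 1027 >> 9;				// root slot 2
 *	leaf = 1027 & KVM_IOMMU_DOMAIN_ID_LEAF_MASK;	// entry 3 of that leaf
 *
 * Leaves are allocated lazily and published with cmpxchg64_relaxed(), so a
 * concurrent reader sees either NULL or a fully usable leaf.
 */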
static int domain_get(struct kvm_hyp_iommu_domain *domain)
{
int old = atomic_fetch_inc_acquire(&domain->refs);
if (WARN_ON(!old))
return -EINVAL;
else if (old < 0 || old + 1 < 0)
return -EOVERFLOW;
return 0;
}
static void domain_put(struct kvm_hyp_iommu_domain *domain)
{
BUG_ON(!atomic_dec_return_release(&domain->refs));
}
static int access_allowed(struct kvm_hyp_iommu_domain *domain)
{
struct pkvm_hyp_vcpu *ctxt = __get_ctxt();
	/* The guest may not be loaded, but we may be in a teardown context. */
if (!ctxt)
ctxt = cur_context;
if (!ctxt && domain->vm)
return -EPERM;
if (ctxt && (domain->vm != pkvm_hyp_vcpu_to_hyp_vm(ctxt)))
return -EPERM;
return 0;
}
static int kvm_iommu_alloc_domain_nolock(pkvm_handle_t domain_id, unsigned long pgd_hva,
unsigned long pgd_size, u32 type)
{
struct kvm_hyp_iommu_domain *domain;
int ret = -EINVAL;
struct pkvm_hyp_vcpu *ctxt = __get_ctxt();
struct pkvm_hyp_vm *vm;
if (!kvm_iommu_ops)
return -ENODEV;
domain = handle_to_domain(domain_id);
if (!domain)
return -EINVAL;
if (atomic_read(&domain->refs))
return -EINVAL;
ret = kvm_iommu_ops->alloc_domain(domain, domain_id, pgd_hva, pgd_size, type);
if (ret)
return ret;
atomic_set_release(&domain->refs, 1);
if (ctxt) {
vm = pkvm_hyp_vcpu_to_hyp_vm(ctxt);
domain->vm = vm;
}
return ret;
}
int kvm_iommu_alloc_domain(pkvm_handle_t domain_id, u32 type)
{
int ret;
unsigned long pgd_hva, pgd_size;
/* Host only has access to the lower half of the domain IDs. */
if (domain_id >= (KVM_IOMMU_MAX_DOMAINS >> 1))
return -EINVAL;
pgd_size = kvm_iommu_ops->pgd_size(type);
	/*
	 * Guest memory is already donated as it comes from the memcache,
	 * while host memory passed from the HVC needs to be donated.
	 */
pgd_hva = (unsigned long)kvm_iommu_donate_pages(get_order(pgd_size), true);
if (!pgd_hva)
return -ENOMEM;
hyp_spin_lock(&iommu_domains_lock);
ret = kvm_iommu_alloc_domain_nolock(domain_id, pgd_hva, pgd_size, type);
if (ret)
pkvm_unmap_donated_memory((void *)pgd_hva, pgd_size);
hyp_spin_unlock(&iommu_domains_lock);
return ret;
}
/*
* The domain ID space is shared between guests (second half), so this is a
* (dummy) allocator for guest domain IDs.
*/
int kvm_iommu_alloc_guest_domain(pkvm_handle_t *ret_domain)
{
pkvm_handle_t domain_id = KVM_IOMMU_MAX_DOMAINS >> 1;
struct kvm_hyp_iommu_domain *domain;
int ret = -EINVAL;
unsigned long pgd_hva, pgd_size;
pgd_size = kvm_iommu_ops->pgd_size(DOMAIN_ANY_TYPE);
if (!ret_domain)
return -EINVAL;
hyp_spin_lock(&iommu_domains_lock);
	/*
	 * Not optimal, but good enough for guests: this operation is rare
	 * since guests don't allocate many domains.
	 */
for ( ; domain_id < KVM_IOMMU_MAX_DOMAINS; ++domain_id) {
domain = handle_to_domain(domain_id);
if (!domain) {
ret = -ENOMEM;
goto out_unlock;
}
/* A free domain we can use... hopefully */
if (atomic_read(&domain->refs) == 0)
break;
}
if (domain_id == KVM_IOMMU_MAX_DOMAINS) {
ret = -EBUSY;
goto out_unlock;
}
pgd_hva = (u64)kvm_iommu_donate_pages(get_order(pgd_size), true);
if (!pgd_hva) {
ret = -ENOMEM;
goto out_unlock;
}
ret = kvm_iommu_alloc_domain_nolock(domain_id, pgd_hva, pgd_size, DOMAIN_ANY_TYPE);
if (ret)
kvm_iommu_reclaim_pages((void *)pgd_hva, get_order(pgd_size));
*ret_domain = domain_id;
out_unlock:
hyp_spin_unlock(&iommu_domains_lock);
return ret;
}
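/*
 * Illustration of the ID-space split (numbers are hypothetical, only the
 * halving is implied by the code): with KVM_IOMMU_MAX_DOMAINS == 1 << 16,
 * the host allocates IDs 0..32767 through kvm_iommu_alloc_domain(), while
 * guest domains are linearly scanned out of IDs 32768..65535 above.
 */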
int kvm_iommu_free_domain(pkvm_handle_t domain_id)
{
struct kvm_hyp_iommu_domain *domain;
int ret = -EINVAL;
if (!kvm_iommu_ops)
return -ENODEV;
if (domain_id == KVM_IOMMU_IDMAPPED_DOMAIN)
return -EINVAL;
domain = handle_to_domain(domain_id);
if (!domain)
return -EINVAL;
hyp_spin_lock(&iommu_domains_lock);
ret = access_allowed(domain);
if (ret)
goto out_unlock;
if (WARN_ON(atomic_cmpxchg_release(&domain->refs, 1, 0) != 1))
goto out_unlock;
ret = kvm_iommu_ops->free_domain(domain, domain_id);
memset(domain, 0, sizeof(*domain));
out_unlock:
hyp_spin_unlock(&iommu_domains_lock);
return ret;
}
/*
* A guest is dying before freeing its domains, free them for it.
*/
int kvm_iommu_free_guest_domains(struct pkvm_hyp_vm *hyp_vm)
{
pkvm_handle_t domain_id = (KVM_IOMMU_MAX_DOMAINS >> 1);
struct kvm_hyp_iommu_domain *domain;
/* Doesn't matter which vcpu. */
cur_context = hyp_vm->vcpus[0];
for ( ; (domain_id < KVM_IOMMU_MAX_DOMAINS) ; ++domain_id) {
domain = handle_to_domain(domain_id);
if (domain && domain->vm == hyp_vm) {
			/*
			 * The guest is dying and can't perform any operations on these
			 * domains, so it is safe to modify them without a lock.
			 * A guest can also die while attaching devices to a domain, so
			 * we don't care about the refcount as the pvIOMMU will block
			 * the device anyway.
			 */
atomic_set_release(&domain->refs, 1);
kvm_iommu_free_domain(domain_id);
}
}
cur_context = NULL;
return 0;
}
int kvm_iommu_attach_dev(pkvm_handle_t iommu_id, pkvm_handle_t domain_id,
u32 endpoint_id, u32 pasid, u32 pasid_bits, u64 flags)
{
int ret = -EINVAL;
struct kvm_hyp_iommu *iommu;
struct kvm_hyp_iommu_domain *domain;
struct pkvm_hyp_vcpu *ctxt = __get_ctxt();
	/*
	 * ctxt is set when this is called on behalf of a running guest.
	 * A guest is prevented from attaching to a host domain by the pvIOMMU
	 * handler, as it won't find a translation for the requested device.
	 */
if (!ctxt) {
ret = pkvm_pviommu_host_allowed(iommu_id, endpoint_id);
if (ret)
return ret;
}
if (!kvm_iommu_ops)
return -ENODEV;
iommu = kvm_iommu_ops->get_iommu_by_id(iommu_id);
if (!iommu)
return -ENOENT;
hyp_spin_lock(&iommu->iommu_lock);
domain = handle_to_domain(domain_id);
if (!domain || domain_get(domain))
goto out_unlock;
ret = access_allowed(domain);
if (ret)
goto out_unlock;
ret = kvm_iommu_ops->attach_dev(iommu, domain_id, domain, endpoint_id,
pasid, pasid_bits, flags);
if (ret)
goto err_put_domain;
if ((domain_id == KVM_IOMMU_IDMAPPED_DOMAIN) && kvm_iommu_acquire_init()) {
host_lock_component();
snapshot_host_stage2();
host_unlock_component();
kvm_iommu_release_init();
}
out_unlock:
hyp_spin_unlock(&iommu->iommu_lock);
return ret;
err_put_domain:
domain_put(domain);
hyp_spin_unlock(&iommu->iommu_lock);
return ret;
}
int kvm_iommu_detach_dev(pkvm_handle_t iommu_id, pkvm_handle_t domain_id,
u32 endpoint_id, u32 pasid)
{
int ret = -EINVAL;
struct kvm_hyp_iommu *iommu;
struct kvm_hyp_iommu_domain *domain;
struct pkvm_hyp_vcpu *ctxt = __get_ctxt();
	/* As with attach: the host may only touch endpoints it is allowed to control. */
if (!ctxt) {
ret = pkvm_pviommu_host_allowed(iommu_id, endpoint_id);
if (ret)
return ret;
}
if (!kvm_iommu_ops)
return -ENODEV;
iommu = kvm_iommu_ops->get_iommu_by_id(iommu_id);
if (!iommu)
return -ENOENT;
hyp_spin_lock(&iommu->iommu_lock);
domain = handle_to_domain(domain_id);
if (!domain || atomic_read(&domain->refs) <= 1)
goto out_unlock;
ret = access_allowed(domain);
if (ret)
goto out_unlock;
ret = kvm_iommu_ops->detach_dev(iommu, domain_id, domain, endpoint_id, pasid);
if (ret)
goto out_unlock;
domain_put(domain);
out_unlock:
hyp_spin_unlock(&iommu->iommu_lock);
return ret;
}
#define IOMMU_PROT_MASK (IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE |\
IOMMU_NOEXEC | IOMMU_MMIO | IOMMU_PRIV)
int kvm_iommu_map_pages_ret(pkvm_handle_t domain_id,
unsigned long iova, phys_addr_t paddr, size_t pgsize,
size_t pgcount, int prot, size_t *total_mapped)
{
size_t size;
size_t mapped;
size_t granule;
int ret;
struct io_pgtable iopt;
struct kvm_hyp_iommu_domain *domain;
struct pkvm_hyp_vcpu *ctxt = __get_ctxt();
*total_mapped = 0;
if (!kvm_iommu_ops)
return -ENODEV;
if (domain_id == KVM_IOMMU_IDMAPPED_DOMAIN)
return -EINVAL;
if (prot & ~IOMMU_PROT_MASK)
return -EINVAL;
if (__builtin_mul_overflow(pgsize, pgcount, &size) ||
iova + size < iova || paddr + size < paddr)
return -EINVAL;
/*
* TODO: check whether it is safe here to call io-pgtable without a
* lock. Does the driver make assumptions that don't hold for the
* hypervisor, for example that device drivers don't call map/unmap
* concurrently on the same page?
*
* Command queue and iommu->power_is_off are also protected by the
* iommu_lock, taken by the TLB invalidation callbacks.
*/
domain = handle_to_domain(domain_id);
if (!domain || domain_get(domain))
return -EINVAL;
ret = access_allowed(domain);
if (ret)
goto err_domain_put;
granule = 1 << __ffs(domain->pgtable->cfg.pgsize_bitmap);
if (!IS_ALIGNED(iova | paddr | pgsize, granule)) {
ret = -EINVAL;
goto err_domain_put;
}
ret = __pkvm_share_dma(paddr, size, ctxt);
if (ret)
goto err_domain_put;
iopt = domain_to_iopt(domain, domain_id);
while (pgcount && !ret) {
mapped = 0;
ret = iopt_map_pages(&iopt, iova, paddr, pgsize, pgcount, prot,
0, &mapped);
WARN_ON(!IS_ALIGNED(mapped, pgsize));
WARN_ON(mapped > pgcount * pgsize);
pgcount -= mapped / pgsize;
*total_mapped += mapped;
iova += mapped;
paddr += mapped;
}
/*
* Unshare the bits that haven't been mapped yet. The host calls back
* either to continue mapping, or to unmap and unshare what's been done
* so far.
*/
if (pgcount)
__pkvm_unshare_dma(paddr, pgcount * pgsize);
err_domain_put:
domain_put(domain);
return ret;
}
size_t kvm_iommu_map_pages(pkvm_handle_t domain_id,
unsigned long iova, phys_addr_t paddr, size_t pgsize,
size_t pgcount, int prot)
{
size_t mapped = 0;
kvm_iommu_map_pages_ret(domain_id, iova, paddr, pgsize,
pgcount, prot, &mapped);
return mapped;
}
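/*
 * Sketch of the host-side retry loop this return convention implies
 * (hypothetical, the actual caller lives in the host kernel's IOMMU driver):
 * the host advances by the number of bytes mapped and calls again, or unmaps
 * and unshares what was done so far when no progress is made.
 *
 *	while (pgcount) {
 *		mapped = hyp_map_pages(domain_id, iova, paddr, pgsize,
 *				       pgcount, prot);	// the HVC wrapper
 *		if (!mapped)
 *			break;	// roll back: unmap/unshare what succeeded
 *		iova += mapped;
 *		paddr += mapped;
 *		pgcount -= mapped / pgsize;
 *	}
 */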
size_t kvm_iommu_unmap_pages(pkvm_handle_t domain_id,
unsigned long iova, size_t pgsize, size_t pgcount)
{
int ret;
size_t size;
size_t granule;
size_t unmapped;
phys_addr_t paddr = 0;
struct io_pgtable iopt;
size_t total_unmapped = 0;
struct kvm_hyp_iommu_domain *domain;
if (domain_id == KVM_IOMMU_IDMAPPED_DOMAIN)
return -EINVAL;
if (!kvm_iommu_ops)
return -ENODEV;
if (!pgsize || !pgcount)
return 0;
if (__builtin_mul_overflow(pgsize, pgcount, &size) ||
iova + size < iova)
return 0;
domain = handle_to_domain(domain_id);
if (!domain || domain_get(domain))
return 0;
	ret = access_allowed(domain);
	if (ret)
		goto out_put_domain;
granule = 1 << __ffs(domain->pgtable->cfg.pgsize_bitmap);
if (!IS_ALIGNED(iova | pgsize, granule))
goto out_put_domain;
iopt = domain_to_iopt(domain, domain_id);
while (total_unmapped < size) {
/*
* One page/block at a time so that we can unshare each page.
* The IOVA range provided may not be physically contiguous, and
* @pgsize may be larger than the one used when mapping.
*/
unmapped = iopt_unmap_leaf(&iopt, iova, pgsize, &paddr);
if (!unmapped || !paddr)
goto out_put_domain;
ret = __pkvm_unshare_dma(paddr, unmapped);
if (WARN_ON(ret))
goto out_put_domain;
iova += unmapped;
total_unmapped += unmapped;
}
out_put_domain:
domain_put(domain);
return total_unmapped;
}
phys_addr_t kvm_iommu_iova_to_phys(pkvm_handle_t domain_id, unsigned long iova)
{
phys_addr_t phys = 0;
struct io_pgtable iopt;
struct kvm_hyp_iommu_domain *domain;
if (!kvm_iommu_ops)
return -ENODEV;
if (domain_id == KVM_IOMMU_IDMAPPED_DOMAIN)
return iova;
domain = handle_to_domain(domain_id);
if (!domain || domain_get(domain))
return 0;
iopt = domain_to_iopt(domain, domain_id);
phys = iopt_iova_to_phys(&iopt, iova);
domain_put(domain);
return phys;
}
int kvm_iommu_block_dev(pkvm_handle_t iommu_id, u32 endpoint_id, struct pkvm_hyp_vm *hyp_vm)
{
int ret = -ENOENT;
struct kvm_hyp_iommu *iommu;
	iommu = kvm_iommu_ops->get_iommu_by_id(iommu_id);
	if (!iommu)
		return -ENOENT;
	if (hyp_vm)
		cur_context = hyp_vm->vcpus[0];
	hyp_spin_lock(&iommu->iommu_lock);
/* This is optional as it is used for guests only */
if (kvm_iommu_ops->block_dev)
ret = kvm_iommu_ops->block_dev(iommu, endpoint_id);
hyp_spin_unlock(&iommu->iommu_lock);
cur_context = NULL;
return ret;
}
bool kvm_iommu_host_dabt_handler(struct kvm_cpu_context *host_ctxt, u64 esr, u64 addr)
{
bool ret = false;
if (kvm_iommu_ops && kvm_iommu_ops->dabt_handler)
ret = kvm_iommu_ops->dabt_handler(host_ctxt, esr, addr);
if (ret)
kvm_skip_host_instr();
return ret;
}
static int iommu_power_on(struct kvm_power_domain *pd)
{
struct kvm_hyp_iommu *iommu = container_of(pd, struct kvm_hyp_iommu,
power_domain);
bool prev;
int ret;
/*
* We currently assume that the device retains its architectural state
* across power off, hence no save/restore.
*/
hyp_spin_lock(&iommu->iommu_lock);
prev = iommu->power_is_off;
iommu->power_is_off = false;
ret = kvm_iommu_ops->resume ? kvm_iommu_ops->resume(iommu) : 0;
if (ret)
iommu->power_is_off = prev;
hyp_spin_unlock(&iommu->iommu_lock);
return ret;
}
static int iommu_power_off(struct kvm_power_domain *pd)
{
struct kvm_hyp_iommu *iommu = container_of(pd, struct kvm_hyp_iommu,
power_domain);
bool prev;
int ret;
hyp_spin_lock(&iommu->iommu_lock);
prev = iommu->power_is_off;
iommu->power_is_off = true;
ret = kvm_iommu_ops->suspend ? kvm_iommu_ops->suspend(iommu) : 0;
if (ret)
iommu->power_is_off = prev;
hyp_spin_unlock(&iommu->iommu_lock);
return ret;
}
static const struct kvm_power_domain_ops iommu_power_ops = {
.power_on = iommu_power_on,
.power_off = iommu_power_off,
};
int kvm_iommu_init_device(struct kvm_hyp_iommu *iommu)
{
hyp_spin_lock_init(&iommu->iommu_lock);
return pkvm_init_power_domain(&iommu->power_domain, &iommu_power_ops);
}
void __kvm_iommu_host_stage2_idmap(phys_addr_t start, phys_addr_t end,
enum kvm_pgtable_prot prot)
{
int pgcount = (end - start) >> PAGE_SHIFT;
size_t mapped, unmapped;
	int ret = 0;
struct io_pgtable iopt;
struct kvm_hyp_iommu_domain *domain;
domain = handle_to_domain(KVM_IOMMU_IDMAPPED_DOMAIN);
iopt = domain_to_iopt(domain, KVM_IOMMU_IDMAPPED_DOMAIN);
if (prot) {
while (pgcount) {
mapped = 0;
ret = iopt_map_pages(&iopt, start, start, PAGE_SIZE, pgcount, prot,
0, &mapped);
pgcount -= mapped / PAGE_SIZE;
start += mapped;
if (!mapped || ret)
return;
}
} else {
while (pgcount) {
unmapped = iopt_unmap_pages(&iopt, start, PAGE_SIZE, pgcount, NULL);
pgcount -= unmapped / PAGE_SIZE;
start += unmapped;
if (!unmapped || ret)
return;
}
}
}
void kvm_iommu_host_stage2_idmap(phys_addr_t start, phys_addr_t end,
enum kvm_pgtable_prot prot)
{
if (!kvm_iommu_is_ready())
return;
__kvm_iommu_host_stage2_idmap(start, end, prot);
}
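/*
 * Example of the calling convention (a sketch; the real call sites are in the
 * host stage-2 code): a non-zero prot identity-maps [start, end) in the
 * IDMAPPED domain, prot == 0 removes it, e.g. when the host loses and later
 * regains ownership of a page:
 *
 *	kvm_iommu_host_stage2_idmap(phys, phys + PAGE_SIZE, 0);		// revoke DMA
 *	kvm_iommu_host_stage2_idmap(phys, phys + PAGE_SIZE,
 *				    default_host_prot(true));		// restore it
 */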
static int __snapshot_host_stage2(u64 start, u64 pa_max, u32 level,
kvm_pte_t *ptep,
enum kvm_pgtable_walk_flags flags,
void * const arg)
{
u64 end = start + kvm_granule_size(level);
kvm_pte_t pte = *ptep;
enum kvm_pgtable_prot prot;
	/*
	 * We only snapshot memory for now, as the MMIO regions are unknown to
	 * the hypervisor; they will be mapped once touched by the CPU.
	 * This is not ideal, but works for now...
	 */
if ((!pte || kvm_pte_valid(pte)) && addr_is_memory(start)) {
prot = default_host_prot(addr_is_memory(start));
__kvm_iommu_host_stage2_idmap(start, end, prot);
}
return 0;
}
static int snapshot_host_stage2(void)
{
struct kvm_pgtable_walker walker = {
.cb = __snapshot_host_stage2,
.flags = KVM_PGTABLE_WALK_LEAF,
};
struct kvm_pgtable *pgt = &host_mmu.pgt;
return kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker);
}
int kvm_iommu_init(struct kvm_iommu_ops *ops, struct kvm_hyp_memcache *mc,
unsigned long init_arg)
{
int ret;
void *idmap_pgd;
size_t idmap_pgd_sz;
void *p;
BUILD_BUG_ON(sizeof(hyp_spinlock_t) != HYP_SPINLOCK_SIZE);
if (WARN_ON(!ops->get_iommu_by_id ||
!ops->free_domain ||
!ops->alloc_domain ||
!ops->attach_dev ||
!ops->detach_dev ||
!ops->pgd_size ||
!ops->get_iommu_token_by_id))
return -ENODEV;
ret = ops->init ? ops->init(init_arg) : 0;
if (ret)
return ret;
ret = pkvm_create_mappings(kvm_hyp_iommu_domains, kvm_hyp_iommu_domains +
KVM_IOMMU_DOMAINS_ROOT_ENTRIES, PAGE_HYP);
if (ret)
return ret;
kvm_iommu_ops = ops;
	ret = hyp_pool_init(&iommu_host_pool, 0, 16 /* order = 4 */, 0, true);
	if (ret)
		return ret;
/* Init IDMAPPED page tables. */
if (mc->head) {
u8 order;
ret = hyp_pool_init(&iommu_idmap_pool, 0,
16 /* order = 4*/, 0, true);
if (ret)
return ret;
while (mc->nr_pages) {
order = mc->head & (PAGE_SIZE - 1);
			p = pkvm_admit_host_page(mc, order);
			if (!p)
				return -EINVAL;
hyp_set_page_refcounted(hyp_virt_to_page(p));
hyp_virt_to_page(p)->order = order;
hyp_put_page(&iommu_idmap_pool, p);
}
idmap_pgd_sz = kvm_iommu_ops->pgd_size(DOMAIN_IDMAPPED_TYPE);
idmap_pgd = hyp_alloc_pages(&iommu_idmap_pool, get_order(idmap_pgd_sz));
if (!idmap_pgd)
return -ENOMEM;
		/* A somewhat hacky way to populate the first domain so it can be used immediately. */
		kvm_hyp_iommu_domains[0] = hyp_alloc_pages(&iommu_idmap_pool, 0);
		if (!kvm_hyp_iommu_domains[0])
			return -ENOMEM;
ret = kvm_iommu_alloc_domain_nolock(KVM_IOMMU_IDMAPPED_DOMAIN, (u64)idmap_pgd,
idmap_pgd_sz, DOMAIN_IDMAPPED_TYPE);
}
return ret;
}
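/*
 * A minimal sketch of what an IOMMU driver registers (callback names are
 * hypothetical; only the mandatory ops checked above are shown):
 *
 *	static struct kvm_iommu_ops my_ops = {
 *		.get_iommu_by_id	= my_get_iommu_by_id,
 *		.get_iommu_token_by_id	= my_get_iommu_token_by_id,
 *		.alloc_domain		= my_alloc_domain,
 *		.free_domain		= my_free_domain,
 *		.attach_dev		= my_attach_dev,
 *		.detach_dev		= my_detach_dev,
 *		.pgd_size		= my_pgd_size,
 *	};
 *
 *	ret = kvm_iommu_init(&my_ops, mc, init_arg);
 */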
/* Request to hypervisor. */
int kvm_iommu_request(struct kvm_hyp_req *req)
{
struct kvm_hyp_req *cur_req;
struct pkvm_hyp_vcpu *ctxt = __get_ctxt();
if (ctxt)
cur_req = pkvm_hyp_req_reserve(ctxt, KVM_HYP_REQ_EMP);
else
cur_req = this_cpu_ptr(&host_hyp_reqs);
if (cur_req->type != KVM_HYP_REQ_EMP)
return -EBUSY;
memcpy(cur_req, req, sizeof(struct kvm_hyp_req));
return 0;
}
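/*
 * Example (sketch): a driver that cannot allocate in its current context can
 * queue a memory request for the requester to service before retrying. The
 * field values mirror those filled in kvm_iommu_donate_pages():
 *
 *	struct kvm_hyp_req req = {
 *		.type		= KVM_HYP_REQ_MEM,
 *		.mem.dest	= REQ_MEM_IOMMU,
 *		.mem.sz_alloc	= PAGE_SIZE,
 *		.mem.nr_pages	= 1,
 *	};
 *
 *	kvm_iommu_request(&req);
 */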
u64 kvm_iommu_id_to_token(pkvm_handle_t id)
{
return kvm_iommu_ops->get_iommu_token_by_id(id);
}