KVM: arm64: Huge page support for pKVM guest memory reclaim
When a pKVM guest dies, all its memory must be returned to the host,
which can then unpin it. In preparation for handling THP in the guest
stage-2, extend the reclaim interface to take pages of order > 0.

The host keeps track of donations via kvm_pinned_pages. In the case of
huge pages, however, the hypervisor is free to replace a block mapping
with a page table and to unmap a portion of that newly introduced page
table. Those changes aren't reported to the host, as it would be fiddly
to keep the page tables and pinned_pages synchronized. The reclaim path
must therefore handle the case where a huge page can't be reclaimed as a
whole (-E2BIG) or where a fragment of a broken-down huge page is already
unmapped.
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
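For illustration, here is a minimal user-space sketch of the retry loop the
host is expected to follow, assuming 4K pages and a PMD-sized (order-9) huge
page. reclaim_call() is a hypothetical stand-in for the
__pkvm_reclaim_dying_guest_page hypercall, hard-coded to behave as if the
hypervisor had split the block and already unmapped the second 4K fragment:

#include <errno.h>
#include <stdio.h>
#include <stddef.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

/*
 * Hypothetical stand-in for the hypercall: the block mapping is no longer
 * intact, and the second 4K fragment has already been unmapped.
 */
static int reclaim_call(unsigned long pfn, unsigned long ipa, unsigned int order)
{
	if (order)
		return -E2BIG;		/* mapping granule smaller than requested */
	if (ipa == PAGE_SIZE)
		return -EINVAL;		/* fragment already unmapped */
	return 0;
}

int main(void)
{
	unsigned long ipa = 0, pfn = 0x80000, reclaimed = 0, skipped = 0;
	unsigned int order = 9;			/* PMD-sized block with 4K pages */
	size_t size = PAGE_SIZE << order;

	while (size) {
		int err = reclaim_call(pfn, ipa, order);

		if (err == -E2BIG && order) {
			order = 0;		/* block was split: go page by page */
			continue;
		}
		if (err && err != -EINVAL)
			return 1;		/* unexpected failure */

		if (err)
			skipped++;		/* -EINVAL: already gone */
		else
			reclaimed++;

		ipa += PAGE_SIZE << order;
		pfn += 1UL << order;
		size -= PAGE_SIZE << order;
	}

	printf("reclaimed %lu pages, skipped %lu\n", reclaimed, skipped);
	return 0;
}

A real caller additionally stops once every pin on the page has been
dropped, as pkvm_reclaim_dying_guest_page() below does with ppage->pins.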
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
index eafa464..c409129 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
@@ -67,7 +67,7 @@ extern unsigned long hyp_nr_cpus;
int __pkvm_prot_finalize(void);
int __pkvm_host_share_hyp(u64 pfn);
int __pkvm_host_unshare_hyp(u64 pfn);
-int __pkvm_host_reclaim_page(struct pkvm_hyp_vm *vm, u64 pfn, u64 ipa);
+int __pkvm_host_reclaim_page(struct pkvm_hyp_vm *vm, u64 pfn, u64 ipa, u8 order);
int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages);
int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages);
int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages);
diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
index 6ceb2c2..28e6e8e 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
@@ -110,7 +110,7 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long pgd_hva);
int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu);
int __pkvm_start_teardown_vm(pkvm_handle_t handle);
int __pkvm_finalize_teardown_vm(pkvm_handle_t handle);
-int __pkvm_reclaim_dying_guest_page(pkvm_handle_t handle, u64 pfn, u64 ipa);
+int __pkvm_reclaim_dying_guest_page(pkvm_handle_t handle, u64 pfn, u64 ipa, u8 order);
struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle,
unsigned int vcpu_idx);
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 8fc9e30..6dab2e2 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -1040,8 +1040,9 @@ static void handle___pkvm_reclaim_dying_guest_page(struct kvm_cpu_context *host_
DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
DECLARE_REG(u64, pfn, host_ctxt, 2);
DECLARE_REG(u64, ipa, host_ctxt, 3);
+ DECLARE_REG(u8, order, host_ctxt, 4);
- cpu_reg(host_ctxt, 1) = __pkvm_reclaim_dying_guest_page(handle, pfn, ipa);
+ cpu_reg(host_ctxt, 1) = __pkvm_reclaim_dying_guest_page(handle, pfn, ipa, order);
}
static void handle___pkvm_create_private_mapping(struct kvm_cpu_context *host_ctxt)
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 6ff48a8..4b004c6 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -1990,21 +1990,24 @@ void drain_hyp_pool(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc)
}
}
-int __pkvm_host_reclaim_page(struct pkvm_hyp_vm *vm, u64 pfn, u64 ipa)
+int __pkvm_host_reclaim_page(struct pkvm_hyp_vm *vm, u64 pfn, u64 ipa, u8 order)
{
struct kvm_hyp_memcache mc = { .nr_pages = 0 };
phys_addr_t phys = hyp_pfn_to_phys(pfn);
+ size_t page_size = PAGE_SIZE << order;
kvm_pte_t pte;
- int ret;
+ int ret = 0;
+ u32 level;
host_lock_component();
guest_lock_component(vm);
- ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, NULL);
- if (ret)
- goto unlock;
+ WARN_ON(kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level));
- if (!kvm_pte_valid(pte)) {
+ if (kvm_granule_size(level) != page_size) {
+ ret = -E2BIG;
+ goto unlock;
+ } else if (!kvm_pte_valid(pte)) {
ret = -EINVAL;
goto unlock;
} else if (phys != kvm_pte_to_phys(pte)) {
@@ -2013,25 +2016,25 @@ int __pkvm_host_reclaim_page(struct pkvm_hyp_vm *vm, u64 pfn, u64 ipa)
}
/* We could avoid TLB inval, it is done per VMID on the finalize path */
- WARN_ON(__kvm_pgtable_stage2_unmap(&vm->pgt, ipa, PAGE_SIZE, &mc));
+ WARN_ON(__kvm_pgtable_stage2_unmap(&vm->pgt, ipa, page_size, &mc));
switch(guest_get_page_state(pte, ipa)) {
case PKVM_PAGE_OWNED:
- WARN_ON(__host_check_page_state_range(phys, PAGE_SIZE, PKVM_NOPAGE));
+ WARN_ON(__host_check_page_state_range(phys, page_size, PKVM_NOPAGE));
hyp_poison_page(phys);
- psci_mem_protect_dec(PAGE_SIZE);
+ psci_mem_protect_dec(page_size);
break;
case PKVM_PAGE_SHARED_BORROWED:
- WARN_ON(__host_check_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_SHARED_OWNED));
+ WARN_ON(__host_check_page_state_range(phys, page_size, PKVM_PAGE_SHARED_OWNED));
break;
case PKVM_PAGE_SHARED_OWNED:
- WARN_ON(__host_check_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_SHARED_BORROWED));
+ WARN_ON(__host_check_page_state_range(phys, page_size, PKVM_PAGE_SHARED_BORROWED));
break;
default:
BUG_ON(1);
}
- WARN_ON(host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HOST));
+ WARN_ON(host_stage2_set_owner_locked(phys, page_size, PKVM_ID_HOST));
unlock:
guest_unlock_component(vm);
diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c
index 3de2d85..bad7fe2 100644
--- a/arch/arm64/kvm/hyp/nvhe/pkvm.c
+++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c
@@ -353,7 +353,7 @@ static struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle)
return vm_table[idx];
}
-int __pkvm_reclaim_dying_guest_page(pkvm_handle_t handle, u64 pfn, u64 ipa)
+int __pkvm_reclaim_dying_guest_page(pkvm_handle_t handle, u64 pfn, u64 ipa, u8 order)
{
struct pkvm_hyp_vm *hyp_vm;
int ret = -EINVAL;
@@ -363,7 +363,7 @@ int __pkvm_reclaim_dying_guest_page(pkvm_handle_t handle, u64 pfn, u64 ipa)
if (!hyp_vm || !READ_ONCE(hyp_vm->is_dying))
goto unlock;
- ret = __pkvm_host_reclaim_page(hyp_vm, pfn, ipa);
+ ret = __pkvm_host_reclaim_page(hyp_vm, pfn, ipa, order);
if (ret)
goto unlock;
diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c
index a479d6b..527cc08 100644
--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -214,6 +214,69 @@ int pkvm_create_hyp_vm(struct kvm *host_kvm)
return ret;
}
+/*
+ * Handle huge pages that the hypervisor has broken down without
+ * reporting it in kvm_pinned_pages.
+ */
+static int pkvm_reclaim_dying_guest_page(struct kvm *host_kvm,
+ struct kvm_pinned_page *ppage)
+{
+ size_t page_size, size = PAGE_SIZE << ppage->order;
+ u64 pfn = page_to_pfn(ppage->page);
+ u8 order = ppage->order;
+ u64 ipa = ppage->ipa;
+
+ /* We already know this huge page has been broken down in the stage-2 */
+ if (ppage->pins < (1 << order))
+ order = 0;
+
+ while (size) {
+ int err = kvm_call_hyp_nvhe(__pkvm_reclaim_dying_guest_page,
+ host_kvm->arch.pkvm.handle,
+ pfn, ipa, order);
+ switch (err) {
+ /* The stage-2 huge page has been broken down */
+ case -E2BIG:
+ if (order)
+ order = 0;
+ else
+ /* Something is really wrong ... */
+ return -EINVAL;
+ break;
+ /* This has been unmapped already */
+ case -EINVAL:
+ /*
+ * We are not supposed to lose track of a PAGE_SIZE pinned
+ * page.
+ */
+ if (!ppage->order)
+ return -EINVAL;
+
+ fallthrough;
+ case 0:
+ page_size = PAGE_SIZE << order;
+ ipa += page_size;
+ pfn += 1 << order;
+
+ if (!err)
+ ppage->pins -= 1 << order;
+
+ if (!ppage->pins)
+ return 0;
+
+ if (page_size > size)
+ return -EINVAL;
+
+ size -= page_size;
+ break;
+ default:
+ return err;
+ }
+ }
+
+ return 0;
+}
+
void pkvm_destroy_hyp_vm(struct kvm *host_kvm)
{
struct mm_struct *mm = current->mm;
@@ -227,10 +290,8 @@ void pkvm_destroy_hyp_vm(struct kvm *host_kvm)
WARN_ON(kvm_call_hyp_nvhe(__pkvm_start_teardown_vm, host_kvm->arch.pkvm.handle));
mt_for_each(&host_kvm->arch.pkvm.pinned_pages, ppage, ipa, ULONG_MAX) {
- WARN_ON(kvm_call_hyp_nvhe(__pkvm_reclaim_dying_guest_page,
- host_kvm->arch.pkvm.handle,
- page_to_pfn(ppage->page),
- ppage->ipa));
+ WARN_ON(pkvm_reclaim_dying_guest_page(host_kvm, ppage));
+
cond_resched();
account_locked_vm(mm, 1, false);