ANDROID: KVM: arm64: Coalesce host stage2 entries on ownership reclaim

This optimization allows us to re-create higher-order block mappings in
the host stage2 pagetables after we tear down a guest VM.
When the host reclaims ownership during guest teardown, the page table
walker drops the refcount of the counted entries and clears out
unreferenced entries (refcount == 1). Clearing out an entry installs a
zero PTE. When the host stage2 then takes a data abort because no
mapping is associated with the address, it tries to create the largest
possible block mapping from the leaf entry found by the walk.
With this patch, we increase the chances of finding a leaf entry at
level < 3 when the requested region covers memory reclaimed from a
torn-down VM. This has the advantage of reducing TLB pressure at host
stage2.

To increase the coalescing chances, we change which host stage2 entries
are refcounted. An entry is now counted if it is any of the following
(see the sketch after this list):
- a non-zero invalid PTE
- a PTE with any of the reserved high bits (58-55) set
- a valid mapping with non-default attributes
- a page table descriptor
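
As a stand-alone sketch of the rule above (the real predicate is
host_stage2_pte_is_counted() in the hunk below; the masks and default
values here are simplified placeholders, not the actual arm64
descriptor layout):

  #include <stdbool.h>
  #include <stdint.h>

  #define PTE_VALID          (1ULL << 0)
  #define PTE_TYPE_TABLE     (1ULL << 1)  /* meaningful for level < 3   */
  #define PTE_ATTR_MASK      0x3fcULL     /* placeholder attribute bits */
  #define DEFAULT_MEM_ATTRS  0x3ccULL     /* placeholder memory default */
  #define DEFAULT_MMIO_ATTRS 0x384ULL     /* placeholder MMIO default   */

  static bool pte_is_counted(uint64_t pte, int level, bool is_memory)
  {
          uint64_t def = is_memory ? DEFAULT_MEM_ATTRS : DEFAULT_MMIO_ATTRS;

          if (!(pte & PTE_VALID))
                  return pte != 0;              /* non-zero invalid PTE  */
          if (level < 3 && (pte & PTE_TYPE_TABLE))
                  return true;                  /* page table descriptor */
          /* Valid leaf: counted only when its attributes differ from
           * the defaults used for host stage2 idmap mappings.           */
          return (pte & PTE_ATTR_MASK) != def;
  }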

Bug: 222044487
Test: dump the host stage2 pagetables and view the mapping
Signed-off-by: Sebastian Ene <sebastianene@google.com>
Change-Id: I90ff4ec2185e9a76d7ad17e77ef9bdd8ce3e8698
Signed-off-by: Quentin Perret <qperret@google.com>
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 4f85740..d150b1d 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -208,6 +208,24 @@
 #define PKVM_HOST_MEM_PROT	KVM_PGTABLE_PROT_RWX
 #define PKVM_HOST_MMIO_PROT	KVM_PGTABLE_PROT_RW
 
+#define KVM_HOST_S2_DEFAULT_ATTR   (KVM_PTE_LEAF_ATTR_HI |	\
+				    KVM_PTE_LEAF_ATTR_LO)
+
+#define KVM_HOST_S2_DEFAULT_MEM_PTE		\
+	(KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR |	\
+	KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R |	\
+	KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W |	\
+	KVM_PTE_LEAF_ATTR_LO_S2_SH	|	\
+	KVM_PTE_LEAF_ATTR_LO_S2_AF)
+
+#define KVM_HOST_S2_DEFAULT_MMIO_PTE		\
+	(KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR |	\
+	KVM_PTE_LEAF_ATTR_HI_S2_XN |		\
+	KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R |	\
+	KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W |	\
+	KVM_PTE_LEAF_ATTR_LO_S2_SH |		\
+	KVM_PTE_LEAF_ATTR_LO_S2_AF)
+
 #define PAGE_HYP		KVM_PGTABLE_PROT_RW
 #define PAGE_HYP_EXEC		(KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X)
 #define PAGE_HYP_RO		(KVM_PGTABLE_PROT_R)
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index c201da97eda..d3a1a8d 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -178,7 +178,7 @@
 
 static bool guest_stage2_pte_is_counted(kvm_pte_t pte, u32 level)
 {
-	return host_stage2_pte_is_counted(pte, level);
+	return !!pte;
 }
 
 static void *guest_s2_zalloc_pages_exact(size_t size)
@@ -660,12 +660,26 @@
 
 static bool host_stage2_pte_is_counted(kvm_pte_t pte, u32 level)
 {
+	u64 phys;
+
 	/*
 	 * The refcount tracks valid entries as well as invalid entries if they
 	 * encode ownership of a page to another entity than the page-table
 	 * owner, whose id is 0.
 	 */
-	return !!pte;
+	if (!kvm_pte_valid(pte))
+		return !!pte;
+
+	if (kvm_pte_table(pte, level))
+		return true;
+
+	phys = kvm_pte_to_phys(pte);
+	if (!addr_is_memory(phys))
+		return (pte & KVM_HOST_S2_DEFAULT_ATTR) !=
+			KVM_HOST_S2_DEFAULT_MMIO_PTE;
+	else
+		return (pte & KVM_HOST_S2_DEFAULT_ATTR) !=
+			KVM_HOST_S2_DEFAULT_MEM_PTE;
 }
 
 static int host_stage2_idmap(u64 addr)
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 82b55bd..7808591 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -767,6 +767,13 @@
 	 */
 	if (pte_ops->pte_is_counted_cb(pte, level))
 		stage2_put_pte(ptep, data->mmu, addr, level, mm_ops);
+	else {
+		/*
+		 * Non-refcounted PTEs are simply cleared out without
+		 * dropping the refcount.
+		 */
+		stage2_clear_pte(ptep, data->mmu, addr, level);
+	}
 
 	kvm_set_table_pte(ptep, childp, mm_ops);
 	mm_ops->get_page(ptep);
@@ -774,6 +781,35 @@
 	return 0;
 }
 
+static void stage2_coalesce_walk_table_post(u64 addr, u64 end, u32 level,
+					    kvm_pte_t *ptep,
+					    struct stage2_map_data *data)
+{
+	struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops;
+	kvm_pte_t *childp = kvm_pte_follow(*ptep, mm_ops);
+
+	/*
+	 * Decrement the refcount only on the set ownership path to avoid a
+	 * loop situation when the following happens:
+	 *  1. We take a host stage2 fault and we create a small mapping which
+	 *  has default attributes (is not refcounted).
+	 *  2. On the way back we execute the post handler and we zap the
+	 *  table that holds our mapping.
+	 */
+	if (kvm_phys_is_valid(data->phys) ||
+	    !kvm_level_supports_block_mapping(level))
+		return;
+
+	/*
+	 * Free a page that is not referenced anymore and drop the reference
+	 * of the page table page.
+	 */
+	if (mm_ops->page_count(childp) == 1) {
+		stage2_put_pte(ptep, data->mmu, addr, level, mm_ops);
+		mm_ops->put_page(childp);
+	}
+}
+
 static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level,
 				      kvm_pte_t *ptep,
 				      struct stage2_map_data *data)
@@ -782,8 +818,11 @@
 	kvm_pte_t *childp;
 	int ret = 0;
 
-	if (!data->anchor)
+	if (!data->anchor) {
+		stage2_coalesce_walk_table_post(addr, end, level, ptep,
+						data);
 		return 0;
+	}
 
 	if (data->anchor == ptep) {
 		childp = data->childp;