KVM: arm64: Coalesce host stage2 entries on ownership reclaim

This optimization allows us to re-create higher-order block mappings in
the host stage2 page tables after we tear down a guest VM. The coalescing
code is triggered on the host_stage2_set_owner_locked() path when we
annotate the entries in the host stage2 page tables with an invalid entry
whose owner is set to PKVM_ID_HOST. It can also be triggered from
page_relinquish() when we insert pages in the ballooning code.
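
For reference, the ownership annotation boils down to encoding the owner
id in an otherwise invalid PTE; this is the existing helper in pgtable.c
(and since PKVM_ID_HOST is 0, reclaiming for the host effectively
installs a zero PTE):

  /* Encode the owner id in bits [9:2] of an invalid PTE. */
  static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id)
  {
	return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id);
  }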

This patch reintroduces the _post callback walker so that we can zap
page table entries on ownership re-assignment.

When the host reclaims ownership during guest teardown, the page table
walker drops the refcount of the counted entries and clears out
unreferenced entries (refcount == 1). Clearing out an entry installs a
zero PTE. When the host stage2 then takes a data abort because no mapping
is associated with the address, it tries to create the largest possible
block mapping from the found leaf entry.
With this patch, we increase the chances of finding a leaf entry with
level < 3 when the requested region comes from the reclaimed memory of a
torn-down VM. This has the advantage of reducing the TLB pressure at
host stage2.
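
For context, the fault side lives in mem_protect.c; a condensed sketch
of the existing host_stage2_idmap() path (simplified, with error
handling trimmed):

  static int host_stage2_idmap(u64 addr)
  {
	struct kvm_mem_range range;
	bool is_memory = !!find_mem_range(addr, &range);
	enum kvm_pgtable_prot prot;
	int ret;

	prot = is_memory ? PKVM_HOST_MEM_PROT : PKVM_HOST_MMIO_PROT;

	host_lock_component();
	/* Bound the mapping size by the level of the leaf entry found. */
	ret = host_stage2_adjust_range(addr, &range);
	if (ret)
		goto unlock;

	ret = host_stage2_idmap_locked(range.start, range.end - range.start,
				       prot);
  unlock:
	host_unlock_component();
	return ret;
  }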

To be able to do coalescing, we modify the way we do refcounting so that
only the following descriptor types are counted at host stage2 (default
attribute mappings are no longer counted; see the illustration after
this list):
- non-zero invalid PTEs
- any descriptor that has at least one of the reserved high bits
  ([58:55]) set
- non-default attribute mappings
- page table descriptors
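
For illustration only, the expected behaviour of the new
host_stage2_pte_is_counted() on a level 3 leaf over normal memory
(hypothetical asserts; a real leaf would also carry the page type bit,
which the default mask ignores):

  kvm_pte_t def = kvm_phys_to_pte(phys) | KVM_HOST_S2_DEFAULT_MEM_PTE |
		  KVM_PTE_VALID;

  /* A default RWX memory mapping is not counted... */
  WARN_ON(host_stage2_pte_is_counted(def, 3));
  /* ...while setting a software bit in [58:55] makes it counted. */
  WARN_ON(!host_stage2_pte_is_counted(def | BIT(55), 3));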

The algorithm works as presented below:

  Is refcount(child(pte_table)) == 1 ?

  Yes -> (only default mappings are left, and those are not counted)
	Zap the table by writing 0 in the pte_table entry
	and put the page that holds the level 3 entries
	back into the memcache

  level 2
+---------+
|         |
|  ...    |
| pte_table---+      level 3      -> we can now re-create a 2MiB mapping
|  ...    |   +---> +---------+
|         |         |         |
|         |         |         |
|         |         |def entry|
+---------+         |         |
                    |def entry|
                    |         |
                    |  ...    |
                    +---------+
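
After the zap, the next host data abort on this range finds the level 2
leaf, and the range adjustment (condensed below from the existing
host_stage2_adjust_range()) expands the new mapping to that granule:

  /*
   * Starting from the level of the leaf returned by
   * kvm_pgtable_get_leaf(), pick the largest granule around addr that
   * supports a block mapping and fits in the allowed memory range.
   */
  do {
	u64 granule = kvm_granule_size(level);

	cur.start = ALIGN_DOWN(addr, granule);
	cur.end = cur.start + granule;
	level++;
  } while ((level < KVM_PGTABLE_MAX_LEVELS) &&
	   !(kvm_level_supports_block_mapping(level) &&
	     range_included(&cur, range)));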

Change-Id: I983372a0fac876336ee6799be1f3c4a50015880c
Signed-off-by: Sebastian Ene <sebastianene@google.com>
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index a34004f..b4faf2f 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -262,12 +262,32 @@
 	KVM_PGTABLE_PROT_SW3			= BIT(58),
 };
 
+/*
+ * Used to indicate a pte for which a 'break-before-make' sequence is in
+ * progress.
+ */
+#define KVM_INVALID_PTE_LOCKED		BIT(10)
+
 #define KVM_PGTABLE_PROT_RW	(KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W)
 #define KVM_PGTABLE_PROT_RWX	(KVM_PGTABLE_PROT_RW | KVM_PGTABLE_PROT_X)
 
 #define PKVM_HOST_MEM_PROT	KVM_PGTABLE_PROT_RWX
 #define PKVM_HOST_MMIO_PROT	KVM_PGTABLE_PROT_RW
 
+#define KVM_HOST_S2_DEFAULT_MASK   (KVM_PTE_LEAF_ATTR_HI |	\
+				    KVM_PTE_LEAF_ATTR_LO)
+
+#define KVM_HOST_S2_DEFAULT_MEM_PTE		\
+	(PTE_S2_MEMATTR(MT_S2_NORMAL) |		\
+	KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R |	\
+	KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W |	\
+	KVM_PTE_LEAF_ATTR_LO_S2_AF |		\
+	FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, KVM_PTE_LEAF_ATTR_LO_S2_SH_IS))
+
+#define KVM_HOST_S2_DEFAULT_MMIO_PTE		\
+	(KVM_HOST_S2_DEFAULT_MEM_PTE |		\
+	KVM_PTE_LEAF_ATTR_HI_S2_XN)
+
 #define PAGE_HYP		KVM_PGTABLE_PROT_RW
 #define PAGE_HYP_EXEC		(KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X)
 #define PAGE_HYP_RO		(KVM_PGTABLE_PROT_R)
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 11526e5..e38a4cc 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -183,7 +183,12 @@
 
 static bool guest_stage2_pte_is_counted(kvm_pte_t pte, u32 level)
 {
-	return host_stage2_pte_is_counted(pte, level);
+	/*
+	 * The refcount tracks valid entries as well as invalid entries if they
+	 * encode ownership of a page to another entity than the page-table
+	 * owner, whose id is 0.
+	 */
+	return !!pte;
 }
 
 static void *guest_s2_zalloc_pages_exact(size_t size)
@@ -561,12 +566,20 @@
 
 static bool host_stage2_pte_is_counted(kvm_pte_t pte, u32 level)
 {
-	/*
-	 * The refcount tracks valid entries as well as invalid entries if they
-	 * encode ownership of a page to another entity than the page-table
-	 * owner, whose id is 0.
-	 */
-	return !!pte;
+	u64 phys;
+
+	if (!kvm_pte_valid(pte))
+		return !!pte;
+
+	if (kvm_pte_table(pte, level))
+		return true;
+
+	phys = kvm_pte_to_phys(pte);
+	if (addr_is_memory(phys))
+		return (pte & KVM_HOST_S2_DEFAULT_MASK) !=
+			KVM_HOST_S2_DEFAULT_MEM_PTE;
+
+	return (pte & KVM_HOST_S2_DEFAULT_MASK) != KVM_HOST_S2_DEFAULT_MMIO_PTE;
 }
 
 static int host_stage2_idmap(u64 addr)
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 30b80aa..576e522 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -19,12 +19,6 @@
 #define KVM_INVALID_PTE_OWNER_MASK	GENMASK(9, 2)
 #define KVM_MAX_OWNER_ID		FIELD_MAX(KVM_INVALID_PTE_OWNER_MASK)
 
-/*
- * Used to indicate a pte for which a 'break-before-make' sequence is in
- * progress.
- */
-#define KVM_INVALID_PTE_LOCKED		BIT(10)
-
 struct kvm_pgtable_walk_data {
 	struct kvm_pgtable_walker	*walker;
 
@@ -813,7 +807,8 @@
 	return true;
 }
 
-static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
+static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx,
+			    kvm_pte_t new)
 {
 	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
 	struct kvm_pgtable_pte_ops *pte_ops = ctx->pte_ops;
@@ -1023,6 +1018,40 @@
 	 */
 	new = kvm_init_table_pte(childp, mm_ops);
 	stage2_make_pte(ctx, new);
+	return 0;
+}
+
+static int stage2_coalesce_walk_table_post(const struct kvm_pgtable_visit_ctx *ctx,
+					   struct stage2_map_data *data)
+{
+	struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
+	kvm_pte_t *childp = kvm_pte_follow(*ctx->ptep, mm_ops);
+
+	/*
+	 * Decrement the refcount only on the set ownership path to avoid a
+	 * loop situation when the following happens:
+	 *  1. We take a host stage2 fault and we create a small mapping which
+	 *  has default attributes (is not refcounted).
+	 *  2. On the way back we execute the post handler and we zap the
+	 *  table that holds our mapping.
+	 */
+	if (kvm_phys_is_valid(data->phys) ||
+	    !kvm_level_supports_block_mapping(ctx->level))
+		return 0;
+
+	/*
+	 * Free a page that is not referenced anymore and drop the reference
+	 * of the page table page.
+	 */
+	if (mm_ops->page_count(childp) == 1) {
+		u64 size = kvm_granule_size(ctx->level);
+		u64 addr = ALIGN_DOWN(ctx->addr, size);
+
+		kvm_clear_pte(ctx->ptep);
+		kvm_tlb_flush_vmid_range(data->mmu, addr, size);
+		mm_ops->put_page(ctx->ptep);
+		mm_ops->put_page(childp);
+	}
 
 	return 0;
 }
@@ -1046,6 +1075,8 @@
 		return stage2_map_walk_table_pre(ctx, data);
 	case KVM_PGTABLE_WALK_LEAF:
 		return stage2_map_walk_leaf(ctx, data);
+	case KVM_PGTABLE_WALK_TABLE_POST:
+		return stage2_coalesce_walk_table_post(ctx, data);
 	default:
 		return -EINVAL;
 	}
@@ -1101,7 +1132,8 @@
 	struct kvm_pgtable_walker walker = {
 		.cb		= stage2_map_walker,
 		.flags		= KVM_PGTABLE_WALK_TABLE_PRE |
-				  KVM_PGTABLE_WALK_LEAF,
+				  KVM_PGTABLE_WALK_LEAF |
+				  KVM_PGTABLE_WALK_TABLE_POST,
 		.arg		= &map_data,
 	};
 
@@ -1144,7 +1176,10 @@
 	 * block entry and rely on the remaining portions being faulted
 	 * back lazily.
 	 */
-	stage2_unmap_put_pte(ctx, mmu, mm_ops);
+	if (pte_ops->pte_is_counted_cb(ctx->old, ctx->level))
+		stage2_unmap_put_pte(ctx, mmu, mm_ops);
+	else
+		stage2_unmap_clear_pte(ctx, mmu);
 
 	if (need_flush && mm_ops->dcache_clean_inval_poc)
 		mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),