KVM: arm64: Introduce KVM_CAP_ARM_PROTECTED_VM

Introduce a new VM capability, KVM_CAP_ARM_PROTECTED_VM, which can be
used to isolate guest memory from the host.

Note that this does not yet include an EL2 component.

Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Will Deacon <willdeacon@google.com>
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 8881f00..e7dadc5 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6101,6 +6101,77 @@
 When enabled the VMM may make use of the ``KVM_ARM_MTE_COPY_TAGS`` ioctl to
 perform a bulk copy of tags to/from the guest.
 
+7.24 KVM_CAP_ARM_PROTECTED_VM
+-----------------------------
+
+:Architectures: arm64
+:Target: VM
+:Parameters: flags is a single KVM_CAP_ARM_PROTECTED_VM_FLAGS_* value
+
+The presence of this capability indicates that KVM supports running in a
+configuration where the host Linux kernel does not have access to guest memory.
+On such a system, a small hypervisor layer at EL2 can configure the stage-2
+page tables for both the CPU and any DMA-capable devices to protect guest
+memory pages so that they are inaccessible to the host unless access is granted
+explicitly by the guest.
+
+The 'flags' parameter is defined as follows:
+
+7.24.1 KVM_CAP_ARM_PROTECTED_VM_FLAGS_ENABLE
+--------------------------------------------
+
+:Capability: 'flag' parameter to KVM_CAP_ARM_PROTECTED_VM
+:Architectures: arm64
+:Target: VM
+:Parameters: args[0] contains memory slot ID to hold guest firmware
+:Returns: 0 on success; negative error code on failure
+
+Enabling this capability causes all memory slots of the specified VM to be
+unmapped from the host system and put into a state where they are no longer
+configurable. The memory slot corresponding to the ID passed in args[0] is
+populated with the guest firmware image provided by the host firmware.
+
+The first vCPU to enter the guest is defined to be the primary vCPU. All other
+vCPUs belonging to the VM are secondary vCPUs.
+
+All vCPUs belonging to a VM with this capability enabled are initialised to a
+pre-determined reset state irrespective of any prior configuration according
+to the following:
+
+.. include:: arm/pkvm-reset-state.rst
+
+All other registers are initialised to zero, with the following exceptions
+for the primary vCPU:
+
+	===========	===========
+	Register(s)	Reset value
+	===========	===========
+	X0-X14:		Preserved (see KVM_SET_ONE_REG)
+	X15:		Boot protocol version (0)
+	X16-X30:	Reserved (0)
+	PC:		IPA base of firmware memory slot
+	SP:		IPA end of firmware memory slot
+	===========	===========
+
+Secondary vCPUs belonging to a VM with this capability enabled will return
+-EPERM in response to a KVM_RUN ioctl() if the vCPU was not initialised with
+the KVM_ARM_VCPU_POWER_OFF feature.
+
+It is an error to enable this capability on a VM after issuing a KVM_RUN
+ioctl() on one of its vCPUs.
+
+7.24.2 KVM_CAP_ARM_PROTECTED_VM_FLAGS_INFO
+------------------------------------------
+
+:Capability: 'flag' parameter to KVM_CAP_ARM_PROTECTED_VM
+:Architectures: arm64
+:Target: VM
+:Parameters: args[0] contains pointer to 'struct kvm_protected_vm_info'
+:Returns: 0 on success; negative error code on failure
+
+Populates the 'struct kvm_protected_vm_info' pointed to by args[0] with
+information about the protected environment for the VM.
+
 8. Other capabilities.
 ======================
 
diff --git a/Documentation/virt/kvm/arm/pkvm-reset-state.rst b/Documentation/virt/kvm/arm/pkvm-reset-state.rst
new file mode 100644
index 0000000..6141c98
--- /dev/null
+++ b/Documentation/virt/kvm/arm/pkvm-reset-state.rst
@@ -0,0 +1,3 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+TODO: describe reset state for vCPUs. All registers not mentioned here are zeroed.
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index b45ae34..5cea500 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -102,6 +102,11 @@
 struct kvm_arch_memory_slot {
 };
 
+struct kvm_protected_vm {
+	bool enabled;
+	struct kvm_memory_slot *firmware_slot;
+};
+
 struct kvm_arch {
 	struct kvm_s2_mmu mmu;
 
@@ -137,6 +142,8 @@
 
 	/* Memory Tagging Extension enabled for the guest */
 	bool mte_enabled;
+
+	struct kvm_protected_vm pkvm;
 };
 
 struct kvm_vcpu_fault_info {
@@ -782,10 +789,8 @@
 
 int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type);
 
-static inline bool kvm_vm_is_protected(struct kvm *kvm)
-{
-	return false;
-}
+int kvm_arm_vm_ioctl_pkvm(struct kvm *kvm, struct kvm_enable_cap *cap);
+#define kvm_vm_is_protected(kvm) ((kvm)->arch.pkvm.enabled)
 
 int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature);
 bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index b3edde68..0b04942 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -413,6 +413,15 @@
 #define KVM_PSCI_RET_INVAL		PSCI_RET_INVALID_PARAMS
 #define KVM_PSCI_RET_DENIED		PSCI_RET_DENIED
 
+/* Protected KVM */
+#define KVM_CAP_ARM_PROTECTED_VM_FLAGS_ENABLE	0
+#define KVM_CAP_ARM_PROTECTED_VM_FLAGS_INFO	1
+
+struct kvm_protected_vm_info {
+	__u64 firmware_size;
+	__u64 __reserved[7];
+};
+
 #endif
 
 #endif /* __ARM_KVM_H__ */
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 06a1cf7..14f648f 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -80,26 +80,26 @@
 int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
 			    struct kvm_enable_cap *cap)
 {
-	int r;
-
-	if (cap->flags)
-		return -EINVAL;
+	int r = 0;
 
 	switch (cap->cap) {
 	case KVM_CAP_ARM_NISV_TO_USER:
-		r = 0;
-		kvm->arch.return_nisv_io_abort_to_user = true;
+		if (cap->flags)
+			r = -EINVAL;
+		else
+			kvm->arch.return_nisv_io_abort_to_user = true;
 		break;
 	case KVM_CAP_ARM_MTE:
 		mutex_lock(&kvm->lock);
-		if (!system_supports_mte() || kvm->created_vcpus) {
+		if (cap->flags || !system_supports_mte() || kvm->created_vcpus)
 			r = -EINVAL;
-		} else {
-			r = 0;
+		else
 			kvm->arch.mte_enabled = true;
-		}
 		mutex_unlock(&kvm->lock);
 		break;
+	case KVM_CAP_ARM_PROTECTED_VM:
+		r = kvm_arm_vm_ioctl_pkvm(kvm, cap);
+		break;
 	default:
 		r = -EINVAL;
 		break;
@@ -273,6 +273,9 @@
 	case KVM_CAP_ARM_PTRAUTH_GENERIC:
 		r = system_has_full_ptr_auth();
 		break;
+	case KVM_CAP_ARM_PROTECTED_VM:
+		r = is_protected_kvm_enabled();
+		break;
 	default:
 		r = 0;
 	}
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 98c6c38..4f38e8d 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1551,6 +1551,9 @@
 	hva_t reg_end = hva + mem->memory_size;
 	int ret = 0;
 
+	if (kvm_vm_is_protected(kvm))
+		return -EPERM;
+
 	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
 			change != KVM_MR_FLAGS_ONLY)
 		return 0;
diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c
index 7af5d03..c6fac22 100644
--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -50,3 +50,118 @@
 }
 RESERVEDMEM_OF_DECLARE(pkvm_firmware, "linux,pkvm-guest-firmware-memory",
 		       pkvm_firmware_rmem_init);
+
+static int pkvm_init_el2_context(struct kvm *kvm)
+{
+	/*
+	 * TODO:
+	 * Eventually, this will involve a call to EL2 to:
+	 * - Register this VM as a protected VM
+	 * - Provide pages for the firmware
+	 * - Unmap memslots from the host
+	 * - Force reset state and lock down access
+	 * - Prevent attempts to run unknown vCPUs
+	 * - Ensure that no vCPUs have previously entered the VM
+	 * - ...
+	 */
+	kvm_pr_unimpl("Stage-2 protection is not yet implemented; ignoring\n");
+	return 0;
+}
+
+static int pkvm_init_firmware_slot(struct kvm *kvm, u64 slotid)
+{
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *slot;
+
+	if (slotid >= KVM_MEM_SLOTS_NUM || !pkvm_firmware_mem)
+		return -EINVAL;
+
+	slots = kvm_memslots(kvm);
+	if (!slots)
+		return -ENOENT;
+
+	slot = id_to_memslot(slots, slotid);
+	if (!slot)
+		return -ENOENT;
+
+	if (slot->flags)
+		return -EINVAL;
+
+	if ((slot->npages << PAGE_SHIFT) < pkvm_firmware_mem->size)
+		return -ENOMEM;
+
+	kvm->arch.pkvm.firmware_slot = slot;
+	return 0;
+}
+
+static void pkvm_teardown_firmware_slot(struct kvm *kvm)
+{
+	kvm->arch.pkvm.firmware_slot = NULL;
+}
+
+static int pkvm_enable(struct kvm *kvm, u64 slotid)
+{
+	int ret;
+
+	ret = pkvm_init_firmware_slot(kvm, slotid);
+	if (ret)
+		return ret;
+
+	ret = pkvm_init_el2_context(kvm);
+	if (ret)
+		pkvm_teardown_firmware_slot(kvm);
+
+	return ret;
+}
+
+static int pkvm_vm_ioctl_enable(struct kvm *kvm, u64 slotid)
+{
+	int ret = 0;
+
+	mutex_lock(&kvm->lock);
+	if (kvm_vm_is_protected(kvm)) {
+		ret = -EPERM;
+		goto out_kvm_unlock;
+	}
+
+	mutex_lock(&kvm->slots_lock);
+	ret = pkvm_enable(kvm, slotid);
+	if (ret)
+		goto out_slots_unlock;
+
+	kvm->arch.pkvm.enabled = true;
+out_slots_unlock:
+	mutex_unlock(&kvm->slots_lock);
+out_kvm_unlock:
+	mutex_unlock(&kvm->lock);
+	return ret;
+}
+
+static int pkvm_vm_ioctl_info(struct kvm *kvm,
+			      struct kvm_protected_vm_info __user *info)
+{
+	struct kvm_protected_vm_info kinfo = {
+		.firmware_size = pkvm_firmware_mem ?
+				 pkvm_firmware_mem->size :
+				 0,
+	};
+
+	return copy_to_user(info, &kinfo, sizeof(kinfo)) ? -EFAULT : 0;
+}
+
+int kvm_arm_vm_ioctl_pkvm(struct kvm *kvm, struct kvm_enable_cap *cap)
+{
+	if (cap->args[1] || cap->args[2] || cap->args[3])
+		return -EINVAL;
+
+	switch (cap->flags) {
+	case KVM_CAP_ARM_PROTECTED_VM_FLAGS_ENABLE:
+		return pkvm_vm_ioctl_enable(kvm, cap->args[0]);
+	case KVM_CAP_ARM_PROTECTED_VM_FLAGS_INFO:
+		return pkvm_vm_ioctl_info(kvm, (void __user *)cap->args[0]);
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 3c77972..bd48d91 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1073,6 +1073,8 @@
 */
 #define KVM_CAP_ARM_MTE 205
 
+#define KVM_CAP_ARM_PROTECTED_VM 0xffbadab1 /* To be allocated */
+
 #ifdef KVM_CAP_IRQ_ROUTING
 
 struct kvm_irq_routing_irqchip {