KVM: arm64: pKVM shmem CoW cleanups

Clean up the copy-on-write handling for pkvm shmem pages in
pkvm_mem_abort():

 - Drop the PagePkvmReadonly() stub and the unused kvm_cow_page()
   helper.
 - Identify pkvm shmem mappings properly: install vm_ops on the shmem
   VMA at mmap time and test for them with a new vma_is_pkvm_shmem()
   helper, instead of treating every non-swap-backed page as read-only.
 - Rename is_cow/is_cow_write to is_ro_shmem/is_cow_break and look up
   the pinned_pages rb-tree with the mmu_lock held for reading.
 - Keep rejecting unpinnable page-cache pages with -EIO when the VMA is
   not a pkvm shmem mapping.
 - Reject KVM_MEM_READONLY memslots alongside dirty-logging ones in
   kvm_arch_prepare_memory_region().
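
As a quick reference for reviewers, below is a small, self-contained C
model of the fault-handling decision this patch ends up with in
pkvm_mem_abort(). The decide() helper, the enum and the parameter names
are illustrative only and are not part of the patch; already_mapped_ro
stands in for the pinned_pages rb-tree lookup, and the comments name the
mapping call each outcome corresponds to in the real code.

  /* Illustrative model only; not kernel code. */
  #include <stdbool.h>
  #include <stdio.h>

  enum map_action {
  	MAP_RW,		/* pkvm_host_map_guest() */
  	MAP_RO,		/* pkvm_host_map_guest_ro(..., false) */
  	MAP_COW_BREAK,	/* pkvm_host_map_guest_ro(..., true) on a fresh page */
  	MAP_REJECT,	/* -EIO: unpinnable page-cache page */
  };

  static enum map_action decide(bool swap_backed, bool pkvm_shmem_vma,
  			      bool write_fault, bool already_mapped_ro)
  {
  	if (swap_backed)
  		return MAP_RW;		/* anon/memfd pages: map read-write as before */
  	if (!pkvm_shmem_vma)
  		return MAP_REJECT;	/* arbitrary page-cache pages stay unsupported */
  	if (write_fault && already_mapped_ro)
  		return MAP_COW_BREAK;	/* break CoW into a newly allocated page */
  	return MAP_RO;			/* fault in the R/O mapping first */
  }

  int main(void)
  {
  	/* Write fault on an existing R/O pkvm-shmem mapping -> CoW break. */
  	printf("%d\n", decide(false, true, true, true) == MAP_COW_BREAK);
  	/* First fault on a pkvm-shmem page -> establish the R/O mapping. */
  	printf("%d\n", decide(false, true, false, false) == MAP_RO);
  	return 0;
  }
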
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 43b5414..3088382 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -1252,4 +1252,6 @@ static inline void kvm_hyp_reserve(void) { }
void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu);
bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu);
+bool vma_is_pkvm_shmem(struct vm_area_struct *vma);
+
#endif /* __ARM64_KVM_HOST_H__ */
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 1062a31..34864da 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1482,30 +1482,6 @@ static int update_ppage(struct kvm *kvm, struct kvm_pinned_page *ppage)
return 0;
}
-#define PagePkvmReadonly(page) (1)
-
-static struct page *kvm_cow_page(struct page *page, unsigned long hva,
- unsigned long flags)
-{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- struct page *newpage = NULL;
- int ret = -EPERM;
-
- mmap_read_lock(mm);
-
- vma = find_vma(mm, hva);
- if (!vma)
- goto unlock;
-
- zap_page_range_single(vma, hva, PAGE_SIZE, NULL);
- ret = pin_user_pages(hva, 1, flags, &newpage);
-
-unlock:
- mmap_read_unlock(mm);
- return (ret == 1) ? newpage : NULL;
-}
-
static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
struct kvm_memory_slot *memslot, unsigned long hva)
{
@@ -1517,7 +1493,7 @@ static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
int ret, nr_pages;
struct page *page, *oldpage = NULL;
u64 pfn;
- bool is_cow = false, is_cow_write = false;
+ bool is_ro_shmem = false, is_cow_break = false;
nr_pages = hyp_memcache->nr_pages;
ret = topup_hyp_memcache(hyp_memcache, kvm_mmu_cache_min_pages(kvm),
@@ -1550,16 +1526,52 @@ static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
} else if (ret != 1) {
ret = -EFAULT;
goto dec_account;
- } else if (PageSwapBacked(page)) {
- /* Ok */
- } else if (PagePkvmReadonly(page)) {
- /* Ok */
- if (kvm_is_write_fault(vcpu) &&
- !!rb_find((void *)fault_ipa, &kvm->arch.pkvm.pinned_pages,
- rb_ppage_cmp)) /* XXX */ {
- struct page *newpage;
- if (0)newpage = kvm_cow_page(page, hva, flags);
- newpage = alloc_page(GFP_KERNEL); /* XXX account to user */
+ } else if (!PageSwapBacked(page)) {
+
+ /* Special handling if the page is in a pkvm shmem vma. */
+ struct vm_area_struct *vma;
+ mmap_read_lock(mm);
+ vma = find_vma(mm, hva);
+ is_ro_shmem = vma && vma_is_pkvm_shmem(vma);
+ mmap_read_unlock(mm);
+
+ if (!is_ro_shmem) {
+ /*
+ * We really can't deal with page-cache pages returned
+ * by GUP because (a) we may trigger writeback of a
+ * page for which we no longer have access and (b)
+ * page_mkclean() won't find the stage-2 mapping in the
+ * rmap so we can get out-of-whack with the filesystem
+ * when marking the page dirty during unpinning (see
+ * cc5095747edf ("ext4: don't BUG if someone dirty
+ * pages without asking ext4 first")).
+ *
+ * Ideally we'd just restrict ourselves to anonymous
+ * pages, but we also want to allow memfd (i.e. shmem)
+ * pages, so check for pages backed by swap in the
+ * knowledge that the GUP pin will prevent
+ * try_to_unmap() from succeeding.
+ */
+ ret = -EIO;
+ goto unpin;
+ }
+
+ /*
+ * Perform the R/O CoW break if this is a write fault on an *existing*
+ * R/O mapping; otherwise we must fault in the R/O mapping first.
+ */
+ if (kvm_is_write_fault(vcpu)) {
+ read_lock(&kvm->mmu_lock);
+ is_cow_break = !!rb_find(
+ (void *)fault_ipa,
+ &kvm->arch.pkvm.pinned_pages,
+ rb_ppage_cmp);
+ read_unlock(&kvm->mmu_lock);
+ }
+
+ if (is_cow_break) {
+ /* XXX account to user */
+ struct page *newpage = alloc_page(GFP_KERNEL);
if (newpage == NULL) {
ret = -EFAULT;
goto unpin;
@@ -1569,35 +1581,17 @@ static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
printk(" - %s: Got CoW page %lx count=%d\n",
__FUNCTION__,
page_to_pfn(page), page_ref_count(page));
- is_cow_write = true;
}
- is_cow = true;
- } else {
- /*
- * We really can't deal with page-cache pages returned by GUP
- * because (a) we may trigger writeback of a page for which we
- * no longer have access and (b) page_mkclean() won't find the
- * stage-2 mapping in the rmap so we can get out-of-whack with
- * the filesystem when marking the page dirty during unpinning
- * (see cc5095747edf ("ext4: don't BUG if someone dirty pages
- * without asking ext4 first")).
- *
- * Ideally we'd just restrict ourselves to anonymous pages, but
- * we also want to allow memfd (i.e. shmem) pages, so check for
- * pages backed by swap in the knowledge that the GUP pin will
- * prevent try_to_unmap() from succeeding.
- */
- ret = -EIO;
- goto unpin;
+
}
write_lock(&kvm->mmu_lock);
pfn = page_to_pfn(page);
- if (!is_cow) {
+ if (!is_ro_shmem) {
ret = pkvm_host_map_guest(pfn, fault_ipa >> PAGE_SHIFT);
} else {
ret = pkvm_host_map_guest_ro(pfn, fault_ipa >> PAGE_SHIFT,
- is_cow_write);
+ is_cow_break);
}
if (ret) {
if (ret == -EAGAIN)
@@ -2274,7 +2268,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
}
if (new &&
- new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+ new->flags & (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)) {
return -EPERM;
}
}
diff --git a/arch/arm64/kvm/pkvm_shmem.c b/arch/arm64/kvm/pkvm_shmem.c
index cdf351d..8d1505d 100644
--- a/arch/arm64/kvm/pkvm_shmem.c
+++ b/arch/arm64/kvm/pkvm_shmem.c
@@ -102,6 +102,8 @@ static to_page_fn_t get_to_page_fn(struct pkvm_shmem_area *area)
return NULL;
}
+static const struct vm_operations_struct pkvm_shmem_ops;
+
static int pkvm_shmem_area_mmap(struct file *filep,
struct vm_area_struct *vma)
{
@@ -117,6 +119,7 @@ static int pkvm_shmem_area_mmap(struct file *filep,
BUG_ON(!area->kaddr);
vm_flags_set(vma, VM_DONTEXPAND);
+ vma->vm_ops = &pkvm_shmem_ops;
to_page = get_to_page_fn(area);
@@ -183,6 +186,11 @@ static const struct file_operations pkvm_shmem_area_fops = {
.mmap = pkvm_shmem_area_mmap,
};
+bool vma_is_pkvm_shmem(struct vm_area_struct *vma)
+{
+ return vma->vm_ops == &pkvm_shmem_ops;
+}
+
static int pkvm_shmem_open(struct inode *inode, struct file *filep)
{
printk("%s\n", __FUNCTION__);