KVM: MMU: improve write flooding detected
Detecting write-flooding does not work well, when we handle page written, if
the last speculative spte is not accessed, we treat the page is
write-flooding, however, we can speculative spte on many path, such as pte
prefetch, page synced, that means the last speculative spte may be not point
to the written page and the written page can be accessed via other sptes, so
depends on the Accessed bit of the last speculative spte is not enough
Instead of detected page accessed, we can detect whether the spte is accessed
after it is written, if the spte is not accessed but it is written frequently,
we treat is not a page table or it not used for a long time
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3c9ea26..c1f19de 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -239,6 +239,8 @@
int clear_spte_count;
#endif
+ int write_flooding_count;
+
struct rcu_head rcu;
};
@@ -353,10 +355,6 @@
struct kvm_mmu_memory_cache mmu_page_cache;
struct kvm_mmu_memory_cache mmu_page_header_cache;
- gfn_t last_pt_write_gfn;
- int last_pt_write_count;
- u64 *last_pte_updated;
-
struct fpu guest_fpu;
u64 xcr0;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ca6f72a..e9534ce 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1653,6 +1653,18 @@
sp->spt[i] = 0ull;
}
+static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
+{
+ sp->write_flooding_count = 0;
+}
+
+static void clear_sp_write_flooding_count(u64 *spte)
+{
+ struct kvm_mmu_page *sp = page_header(__pa(spte));
+
+ __clear_sp_write_flooding_count(sp);
+}
+
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gfn_t gfn,
gva_t gaddr,
@@ -1696,6 +1708,7 @@
} else if (sp->unsync)
kvm_mmu_mark_parents_unsync(sp);
+ __clear_sp_write_flooding_count(sp);
trace_kvm_mmu_get_page(sp, false);
return sp;
}
@@ -1848,15 +1861,6 @@
mmu_page_remove_parent_pte(sp, parent_pte);
}
-static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
-{
- int i;
- struct kvm_vcpu *vcpu;
-
- kvm_for_each_vcpu(i, vcpu, kvm)
- vcpu->arch.last_pte_updated = NULL;
-}
-
static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
{
u64 *parent_pte;
@@ -1916,7 +1920,6 @@
}
sp->role.invalid = 1;
- kvm_mmu_reset_last_pte_updated(kvm);
return ret;
}
@@ -2361,8 +2364,6 @@
}
}
kvm_release_pfn_clean(pfn);
- if (speculative)
- vcpu->arch.last_pte_updated = sptep;
}
static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
@@ -3523,13 +3524,6 @@
kvm_mmu_flush_tlb(vcpu);
}
-static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
-{
- u64 *spte = vcpu->arch.last_pte_updated;
-
- return !!(spte && (*spte & shadow_accessed_mask));
-}
-
static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
const u8 *new, int *bytes)
{
@@ -3570,22 +3564,16 @@
* If we're seeing too many writes to a page, it may no longer be a page table,
* or we may be forking, in which case it is better to unmap the page.
*/
-static bool detect_write_flooding(struct kvm_vcpu *vcpu, gfn_t gfn)
+static bool detect_write_flooding(struct kvm_mmu_page *sp, u64 *spte)
{
- bool flooded = false;
+ /*
+ * Skip write-flooding detected for the sp whose level is 1, because
+ * it can become unsync, then the guest page is not write-protected.
+ */
+ if (sp->role.level == 1)
+ return false;
- if (gfn == vcpu->arch.last_pt_write_gfn
- && !last_updated_pte_accessed(vcpu)) {
- ++vcpu->arch.last_pt_write_count;
- if (vcpu->arch.last_pt_write_count >= 3)
- flooded = true;
- } else {
- vcpu->arch.last_pt_write_gfn = gfn;
- vcpu->arch.last_pt_write_count = 1;
- vcpu->arch.last_pte_updated = NULL;
- }
-
- return flooded;
+ return ++sp->write_flooding_count >= 3;
}
/*
@@ -3657,7 +3645,7 @@
LIST_HEAD(invalid_list);
u64 entry, gentry, *spte;
int npte;
- bool remote_flush, local_flush, zap_page, flooded, misaligned;
+ bool remote_flush, local_flush, zap_page;
/*
* If we don't have indirect shadow pages, it means no page is
@@ -3683,12 +3671,12 @@
++vcpu->kvm->stat.mmu_pte_write;
trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
- flooded = detect_write_flooding(vcpu, gfn);
mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
- misaligned = detect_write_misaligned(sp, gpa, bytes);
+ spte = get_written_sptes(sp, gpa, &npte);
- if (misaligned || flooded) {
+ if (detect_write_misaligned(sp, gpa, bytes) ||
+ detect_write_flooding(sp, spte)) {
zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
&invalid_list);
++vcpu->kvm->stat.mmu_flooded;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 9efb860..52e9d58 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -497,6 +497,7 @@
shadow_walk_next(&it)) {
gfn_t table_gfn;
+ clear_sp_write_flooding_count(it.sptep);
drop_large_spte(vcpu, it.sptep);
sp = NULL;
@@ -522,6 +523,7 @@
shadow_walk_next(&it)) {
gfn_t direct_gfn;
+ clear_sp_write_flooding_count(it.sptep);
validate_direct_spte(vcpu, it.sptep, direct_access);
drop_large_spte(vcpu, it.sptep);
@@ -536,6 +538,7 @@
link_shadow_page(it.sptep, sp);
}
+ clear_sp_write_flooding_count(it.sptep);
mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
user_fault, write_fault, emulate, it.level,
gw->gfn, pfn, prefault, map_writable);
@@ -599,11 +602,9 @@
*/
if (!r) {
pgprintk("%s: guest page fault\n", __func__);
- if (!prefault) {
+ if (!prefault)
inject_page_fault(vcpu, &walker.fault);
- /* reset fork detector */
- vcpu->arch.last_pt_write_count = 0;
- }
+
return 0;
}
@@ -641,9 +642,6 @@
pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__,
sptep, *sptep, emulate);
- if (!emulate)
- vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
-
++vcpu->stat.pf_fixed;
trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
spin_unlock(&vcpu->kvm->mmu_lock);