| #include "libcflat.h" |
| #include "desc.h" |
| #include "processor.h" |
| #include "asm/page.h" |
| #include "x86/vm.h" |
| #include "access.h" |
| |
| #define true 1 |
| #define false 0 |
| |
| static _Bool verbose = false; |
| |
| typedef unsigned long pt_element_t; |
| static int invalid_mask; |
| |
/* Test code/data is at 32MiB, paging structures at 33MiB. */
#define AT_CODE_DATA_PHYS	  (32 * 1024 * 1024)
#define AT_PAGING_STRUCTURES_PHYS (33 * 1024 * 1024)
| |
| #define PT_BASE_ADDR_MASK ((pt_element_t)((((pt_element_t)1 << 36) - 1) & PAGE_MASK)) |
| #define PT_PSE_BASE_ADDR_MASK (PT_BASE_ADDR_MASK & ~(1ull << 21)) |
| |
| #define CR0_WP_MASK (1UL << 16) |
| #define CR4_SMEP_MASK (1UL << 20) |
| |
| #define PFERR_PRESENT_MASK (1U << 0) |
| #define PFERR_WRITE_MASK (1U << 1) |
| #define PFERR_USER_MASK (1U << 2) |
| #define PFERR_RESERVED_MASK (1U << 3) |
| #define PFERR_FETCH_MASK (1U << 4) |
| #define PFERR_PK_MASK (1U << 5) |
| |
| #define MSR_EFER 0xc0000080 |
| #define EFER_NX_MASK (1ull << 11) |
| |
| #define PT_INDEX(address, level) \ |
| (((address) >> (12 + ((level)-1) * 9)) & 511) |
| |
| /* |
| * Page table access check tests. Each number/bit represent an individual |
| * test case. The main test will bump a counter by 1 to run all permutations |
| * of the below test cases (sans illegal combinations). |
| * |
| * Keep the PRESENT and reserved bits in the higher numbers so that they aren't |
| * toggled on every test, e.g. to keep entries in the TLB. |
| */ |
enum {
	/* Bits toggled in the 4K leaf PTE. */
	AC_PTE_WRITABLE_BIT,
	AC_PTE_USER_BIT,
	AC_PTE_ACCESSED_BIT,
	AC_PTE_DIRTY_BIT,
	AC_PTE_NX_BIT,
	AC_PTE_PRESENT_BIT,
	/* Physical-address bits that are reserved for small MAXPHYADDR. */
	AC_PTE_BIT51_BIT,
	AC_PTE_BIT36_BIT,

	/* Bits toggled in the PDE (level 2 entry). */
	AC_PDE_WRITABLE_BIT,
	AC_PDE_USER_BIT,
	AC_PDE_ACCESSED_BIT,
	AC_PDE_DIRTY_BIT,
	AC_PDE_PSE_BIT,
	AC_PDE_NX_BIT,
	AC_PDE_PRESENT_BIT,
	AC_PDE_BIT51_BIT,
	AC_PDE_BIT36_BIT,
	/* Bit 13 is reserved only in 2M (PSE) PDEs. */
	AC_PDE_BIT13_BIT,

	/*
	 * special test case to DISABLE writable bit on page directory
	 * pointer table entry.
	 */
	AC_PDPTE_NO_WRITABLE_BIT,

	/* Protection-key (PKRU) access-disable/write-disable and key select. */
	AC_PKU_AD_BIT,
	AC_PKU_WD_BIT,
	AC_PKU_PKEY_BIT,

	/* Properties of the access itself. */
	AC_ACCESS_USER_BIT,
	AC_ACCESS_WRITE_BIT,
	AC_ACCESS_FETCH_BIT,
	AC_ACCESS_TWICE_BIT,

	/* CPU control-register/MSR configuration during the access. */
	AC_CPU_EFER_NX_BIT,
	AC_CPU_CR0_WP_BIT,
	AC_CPU_CR4_SMEP_BIT,
	AC_CPU_CR4_PKE_BIT,

	NR_AC_FLAGS
};
| |
| #define AC_PTE_PRESENT_MASK (1 << AC_PTE_PRESENT_BIT) |
| #define AC_PTE_WRITABLE_MASK (1 << AC_PTE_WRITABLE_BIT) |
| #define AC_PTE_USER_MASK (1 << AC_PTE_USER_BIT) |
| #define AC_PTE_ACCESSED_MASK (1 << AC_PTE_ACCESSED_BIT) |
| #define AC_PTE_DIRTY_MASK (1 << AC_PTE_DIRTY_BIT) |
| #define AC_PTE_NX_MASK (1 << AC_PTE_NX_BIT) |
| #define AC_PTE_BIT51_MASK (1 << AC_PTE_BIT51_BIT) |
| #define AC_PTE_BIT36_MASK (1 << AC_PTE_BIT36_BIT) |
| |
| #define AC_PDE_PRESENT_MASK (1 << AC_PDE_PRESENT_BIT) |
| #define AC_PDE_WRITABLE_MASK (1 << AC_PDE_WRITABLE_BIT) |
| #define AC_PDE_USER_MASK (1 << AC_PDE_USER_BIT) |
| #define AC_PDE_ACCESSED_MASK (1 << AC_PDE_ACCESSED_BIT) |
| #define AC_PDE_DIRTY_MASK (1 << AC_PDE_DIRTY_BIT) |
| #define AC_PDE_PSE_MASK (1 << AC_PDE_PSE_BIT) |
| #define AC_PDE_NX_MASK (1 << AC_PDE_NX_BIT) |
| #define AC_PDE_BIT51_MASK (1 << AC_PDE_BIT51_BIT) |
| #define AC_PDE_BIT36_MASK (1 << AC_PDE_BIT36_BIT) |
| #define AC_PDE_BIT13_MASK (1 << AC_PDE_BIT13_BIT) |
| |
| #define AC_PDPTE_NO_WRITABLE_MASK (1 << AC_PDPTE_NO_WRITABLE_BIT) |
| |
| #define AC_PKU_AD_MASK (1 << AC_PKU_AD_BIT) |
| #define AC_PKU_WD_MASK (1 << AC_PKU_WD_BIT) |
| #define AC_PKU_PKEY_MASK (1 << AC_PKU_PKEY_BIT) |
| |
| #define AC_ACCESS_USER_MASK (1 << AC_ACCESS_USER_BIT) |
| #define AC_ACCESS_WRITE_MASK (1 << AC_ACCESS_WRITE_BIT) |
| #define AC_ACCESS_FETCH_MASK (1 << AC_ACCESS_FETCH_BIT) |
| #define AC_ACCESS_TWICE_MASK (1 << AC_ACCESS_TWICE_BIT) |
| |
| #define AC_CPU_EFER_NX_MASK (1 << AC_CPU_EFER_NX_BIT) |
| #define AC_CPU_CR0_WP_MASK (1 << AC_CPU_CR0_WP_BIT) |
| #define AC_CPU_CR4_SMEP_MASK (1 << AC_CPU_CR4_SMEP_BIT) |
| #define AC_CPU_CR4_PKE_MASK (1 << AC_CPU_CR4_PKE_BIT) |
| |
/* Human-readable name for each AC_*_BIT, printed by ac_test_show(). */
const char *ac_names[] = {
	[AC_PTE_PRESENT_BIT] = "pte.p",
	[AC_PTE_ACCESSED_BIT] = "pte.a",
	[AC_PTE_WRITABLE_BIT] = "pte.rw",
	[AC_PTE_USER_BIT] = "pte.user",
	[AC_PTE_DIRTY_BIT] = "pte.d",
	[AC_PTE_NX_BIT] = "pte.nx",
	[AC_PTE_BIT51_BIT] = "pte.51",
	[AC_PTE_BIT36_BIT] = "pte.36",
	[AC_PDE_PRESENT_BIT] = "pde.p",
	[AC_PDE_ACCESSED_BIT] = "pde.a",
	[AC_PDE_WRITABLE_BIT] = "pde.rw",
	[AC_PDE_USER_BIT] = "pde.user",
	[AC_PDE_DIRTY_BIT] = "pde.d",
	[AC_PDE_PSE_BIT] = "pde.pse",
	[AC_PDE_NX_BIT] = "pde.nx",
	[AC_PDE_BIT51_BIT] = "pde.51",
	[AC_PDE_BIT36_BIT] = "pde.36",
	[AC_PDE_BIT13_BIT] = "pde.13",
	[AC_PDPTE_NO_WRITABLE_BIT] = "pdpte.ro",
	[AC_PKU_AD_BIT] = "pkru.ad",
	[AC_PKU_WD_BIT] = "pkru.wd",
	[AC_PKU_PKEY_BIT] = "pkey=1",
	[AC_ACCESS_WRITE_BIT] = "write",
	[AC_ACCESS_USER_BIT] = "user",
	[AC_ACCESS_FETCH_BIT] = "fetch",
	[AC_ACCESS_TWICE_BIT] = "twice",
	[AC_CPU_EFER_NX_BIT] = "efer.nx",
	[AC_CPU_CR0_WP_BIT] = "cr0.wp",
	[AC_CPU_CR4_SMEP_BIT] = "cr4.smep",
	[AC_CPU_CR4_PKE_BIT] = "cr4.pke",
};
| |
| static inline void *va(pt_element_t phys) |
| { |
| return (void *)phys; |
| } |
| |
/* Simple bump allocator state for paging-structure pages. */
typedef struct {
	pt_element_t pt_pool_pa;	/* physical base of the pool */
	unsigned int pt_pool_current;	/* next free page index in the pool */
	int pt_levels;			/* number of paging levels under test */
} ac_pt_env_t;

/* State for one access test: flags, mapping, and expected outcome. */
typedef struct {
	unsigned flags;			/* combination of AC_*_MASK bits */
	void *virt;			/* virtual address being accessed */
	pt_element_t phys;		/* backing physical address */
	pt_element_t *ptep;		/* installed PTE; NULL for PSE mappings */
	pt_element_t expected_pte;	/* PTE value expected after the access */
	pt_element_t *pdep;		/* installed PDE */
	pt_element_t expected_pde;	/* PDE value expected after the access */
	pt_element_t ignore_pde;	/* PDE bits excluded from comparison */
	int expected_fault;		/* non-zero if a #PF is expected */
	unsigned expected_error;	/* expected #PF error code */
	int pt_levels;

	/* 5-level paging, 1-based to avoid math. */
	pt_element_t page_tables[6];
} ac_test_t;

/* Memory-operand layout of a descriptor-table register (limit + base). */
typedef struct {
	unsigned short limit;
	unsigned long linear_addr;
} __attribute__((packed)) descriptor_table_t;
| |
| |
| static void ac_test_show(ac_test_t *at); |
| |
| static unsigned long shadow_cr0; |
| static unsigned long shadow_cr3; |
| static unsigned long shadow_cr4; |
| static unsigned long long shadow_efer; |
| |
| typedef void (*walk_fn)(pt_element_t *ptep, int level, unsigned long virt); |
| |
| /* Returns the size of the range covered by the last processed entry. */ |
/*
 * Walk the page tables for @virt from the root down to @min_level, invoking
 * @callback on entries along the way (all entries, or only the leaf entry
 * when @leaf_only is set).
 *
 * Returns the size of the range covered by the last processed entry.
 */
static unsigned long walk_va(ac_test_t *at, int min_level, unsigned long virt,
			     walk_fn callback, bool leaf_only)
{
	unsigned long parent_pte = shadow_cr3;
	int i;

	for (i = at->pt_levels; i >= min_level; --i) {
		pt_element_t *parent_pt = va(parent_pte & PT_BASE_ADDR_MASK);
		unsigned int index = PT_INDEX(virt, i);
		pt_element_t *ptep = &parent_pt[index];

		/* A leaf-only walk requires a fully present mapping. */
		assert(!leaf_only || (*ptep & PT_PRESENT_MASK));

		if (!leaf_only || i == 1 || (*ptep & PT_PAGE_SIZE_MASK))
			callback(ptep, i, virt);

		/* Level 1 entries and huge-page entries terminate the walk. */
		if (i == 1 || *ptep & PT_PAGE_SIZE_MASK)
			break;

		parent_pte = *ptep;
	}

	return 1ul << PGDIR_BITS(i);
}
| |
| static void walk_ptes(ac_test_t *at, unsigned long virt, unsigned long end, |
| walk_fn callback) |
| { |
| unsigned long page_size; |
| |
| for ( ; virt < end; virt = ALIGN_DOWN(virt + page_size, page_size)) |
| page_size = walk_va(at, 1, virt, callback, true); |
| } |
| |
| static void set_cr0_wp(int wp) |
| { |
| unsigned long cr0 = shadow_cr0; |
| |
| cr0 &= ~CR0_WP_MASK; |
| if (wp) |
| cr0 |= CR0_WP_MASK; |
| if (cr0 != shadow_cr0) { |
| write_cr0(cr0); |
| shadow_cr0 = cr0; |
| } |
| } |
| |
| static void clear_user_mask(pt_element_t *ptep, int level, unsigned long virt) |
| { |
| *ptep &= ~PT_USER_MASK; |
| |
| /* Flush to avoid spurious #PF */ |
| invlpg((void*)virt); |
| } |
| |
| static void set_user_mask(pt_element_t *ptep, int level, unsigned long virt) |
| { |
| *ptep |= PT_USER_MASK; |
| |
| /* Flush to avoid spurious #PF */ |
| invlpg((void*)virt); |
| } |
| |
/*
 * Set CR4.SMEP to @smep.  The test code itself must not be mapped USER while
 * SMEP is on (SMEP would fault supervisor fetches from user pages), so the
 * [stext, etext) range is stripped of the USER bit before enabling SMEP and
 * restored when SMEP is disabled or the CR4 write fails.
 *
 * Returns the exception vector from the CR4 write (0 on success), so callers
 * can also use this to verify that setting SMEP #GPs on unsupported CPUs.
 */
static unsigned set_cr4_smep(ac_test_t *at, int smep)
{
	extern char stext, etext;
	unsigned long code_start = (unsigned long)&stext;
	unsigned long code_end = (unsigned long)&etext;
	unsigned long cr4 = shadow_cr4;
	unsigned r;

	cr4 &= ~CR4_SMEP_MASK;
	if (smep)
		cr4 |= CR4_SMEP_MASK;
	if (cr4 == shadow_cr4)
		return 0;

	/* De-USER the code pages *before* turning SMEP on. */
	if (smep)
		walk_ptes(at, code_start, code_end, clear_user_mask);
	r = write_cr4_safe(cr4);
	/* Re-USER the code pages when disabling SMEP or if the write #GP'd. */
	if (r || !smep)
		walk_ptes(at, code_start, code_end, set_user_mask);
	if (!r)
		shadow_cr4 = cr4;
	return r;
}
| |
| static void set_cr4_pke(int pke) |
| { |
| unsigned long cr4 = shadow_cr4; |
| |
| cr4 &= ~X86_CR4_PKE; |
| if (pke) |
| cr4 |= X86_CR4_PKE; |
| if (cr4 == shadow_cr4) |
| return; |
| |
| /* Check that protection keys do not affect accesses when CR4.PKE=0. */ |
| if ((shadow_cr4 & X86_CR4_PKE) && !pke) |
| write_pkru(0xfffffffc); |
| write_cr4(cr4); |
| shadow_cr4 = cr4; |
| } |
| |
| static void set_efer_nx(int nx) |
| { |
| unsigned long long efer = shadow_efer; |
| |
| efer &= ~EFER_NX_MASK; |
| if (nx) |
| efer |= EFER_NX_MASK; |
| if (efer != shadow_efer) { |
| wrmsr(MSR_EFER, efer); |
| shadow_efer = efer; |
| } |
| } |
| |
| static void ac_env_int(ac_pt_env_t *pt_env, int page_table_levels) |
| { |
| extern char page_fault, kernel_entry; |
| set_idt_entry(14, &page_fault, 0); |
| set_idt_entry(0x20, &kernel_entry, 3); |
| |
| pt_env->pt_pool_pa = AT_PAGING_STRUCTURES_PHYS; |
| pt_env->pt_pool_current = 0; |
| pt_env->pt_levels = page_table_levels; |
| } |
| |
| static pt_element_t ac_test_alloc_pt(ac_pt_env_t *pt_env) |
| { |
| pt_element_t pt; |
| |
| /* |
| * Each test needs at most pt_levels-1 structures per virtual address, |
| * and no existing scenario uses more than four addresses. |
| */ |
| assert(pt_env->pt_pool_current < (4 * (pt_env->pt_levels - 1))); |
| |
| pt = pt_env->pt_pool_pa + (pt_env->pt_pool_current * PAGE_SIZE); |
| pt_env->pt_pool_current++; |
| memset(va(pt), 0, PAGE_SIZE); |
| return pt; |
| } |
| |
/*
 * Initialize test @at to cover virtual address @virt, allocating the paging
 * structures for its walk.  If @buddy is non-NULL, levels whose index matches
 * the buddy's walk reuse the buddy's page tables so the two walks merge.
 */
static void __ac_test_init(ac_test_t *at, unsigned long virt,
			   ac_pt_env_t *pt_env, ac_test_t *buddy)
{
	unsigned long buddy_virt = buddy ? (unsigned long)buddy->virt : 0;
	pt_element_t *root_pt = va(shadow_cr3 & PT_BASE_ADDR_MASK);
	int i;

	/*
	 * The test infrastructure, e.g. this function, must use a different
	 * top-level SPTE than the test, otherwise modifying SPTEs can affect
	 * normal behavior, e.g. crash the test due to marking code SPTEs
	 * USER when CR4.SMEP=1.
	 */
	assert(PT_INDEX(virt, pt_env->pt_levels) !=
	       PT_INDEX((unsigned long)__ac_test_init, pt_env->pt_levels));

	/* Start from a known-sane CPU config; tests adjust these per flags. */
	set_efer_nx(1);
	set_cr0_wp(1);
	at->flags = 0;
	at->virt = (void *)virt;
	at->phys = AT_CODE_DATA_PHYS;
	at->pt_levels = pt_env->pt_levels;

	/* Slots [0] and [1] are unused (level 1 is the data page); poison them. */
	at->page_tables[0] = -1ull;
	at->page_tables[1] = -1ull;

	/*
	 * Zap the existing top-level PTE as it may be reused from a previous
	 * sub-test. This allows runtime PTE modification to assert that two
	 * overlapping walks don't try to install different paging structures.
	 */
	root_pt[PT_INDEX(virt, pt_env->pt_levels)] = 0;

	/* page_tables[i] is the table that a level-i entry points to. */
	for (i = at->pt_levels; i > 1; i--) {
		/*
		 * Buddies can reuse any part of the walk that share the same
		 * index. This is weird, but intentional, as several tests
		 * want different walks to merge at lower levels.
		 */
		if (buddy && PT_INDEX(virt, i) == PT_INDEX(buddy_virt, i))
			at->page_tables[i] = buddy->page_tables[i];
		else
			at->page_tables[i] = ac_test_alloc_pt(pt_env);
	}
}
| |
/* Initialize a test with no buddy, i.e. with a fully independent walk. */
static void ac_test_init(ac_test_t *at, unsigned long virt, ac_pt_env_t *pt_env)
{
	__ac_test_init(at, virt, pt_env, NULL);
}
| |
| static int ac_test_bump_one(ac_test_t *at) |
| { |
| at->flags = ((at->flags | invalid_mask) + 1) & ~invalid_mask; |
| return at->flags < (1 << NR_AC_FLAGS); |
| } |
| |
| #define F(x) ((flags & x##_MASK) != 0) |
| |
| static _Bool ac_test_legal(ac_test_t *at) |
| { |
| int flags = at->flags; |
| unsigned reserved; |
| |
| if (F(AC_CPU_CR4_SMEP)) |
| return false; |
| |
| if (F(AC_ACCESS_FETCH) && F(AC_ACCESS_WRITE)) |
| return false; |
| |
| /* |
| * Since we convert current page to kernel page when cr4.smep=1, |
| * we can't switch to user mode. |
| */ |
| if (F(AC_ACCESS_USER) && F(AC_CPU_CR4_SMEP)) |
| return false; |
| |
| /* |
| * Only test protection key faults if CR4.PKE=1. |
| */ |
| if (!F(AC_CPU_CR4_PKE) && |
| (F(AC_PKU_AD) || F(AC_PKU_WD))) { |
| return false; |
| } |
| |
| /* |
| * pde.bit13 checks handling of reserved bits in largepage PDEs. It is |
| * meaningless if there is a PTE. |
| */ |
| if (!F(AC_PDE_PSE) && F(AC_PDE_BIT13)) |
| return false; |
| |
| /* |
| * Shorten the test by avoiding testing too many reserved bit combinations. |
| * Skip testing multiple reserved bits to shorten the test. Reserved bit |
| * page faults are terminal and multiple reserved bits do not affect the |
| * error code; the odds of a KVM bug are super low, and the odds of actually |
| * being able to detect a bug are even lower. |
| */ |
| reserved = (AC_PDE_BIT51_MASK | AC_PDE_BIT36_MASK | AC_PDE_BIT13_MASK | |
| AC_PTE_BIT51_MASK | AC_PTE_BIT36_MASK); |
| if (!F(AC_CPU_EFER_NX)) |
| reserved |= AC_PDE_NX_MASK | AC_PTE_NX_MASK; |
| |
| /* Only test one reserved bit at a time. */ |
| reserved &= flags; |
| if (reserved & (reserved - 1)) |
| return false; |
| |
| return true; |
| } |
| |
| static int ac_test_bump(ac_test_t *at) |
| { |
| int ret; |
| |
| do { |
| ret = ac_test_bump_one(at); |
| } while (ret && !ac_test_legal(at)); |
| |
| return ret; |
| } |
| |
/*
 * Model the permission check for the final translation, given the effective
 * @writable/@user/@executable permissions accumulated along the walk.  Sets
 * at->expected_fault/expected_error as needed and returns the A/D bits the
 * CPU is expected to set in the leaf entry when the access succeeds.
 */
static pt_element_t ac_test_permissions(ac_test_t *at, unsigned flags,
					bool writable, bool user,
					bool executable)
{
	/* Supervisor writes ignore the W bit when CR0.WP=0. */
	bool kwritable = !F(AC_CPU_CR0_WP) && !F(AC_ACCESS_USER);
	pt_element_t expected = 0;

	if (F(AC_ACCESS_USER) && !user)
		at->expected_fault = 1;

	if (F(AC_ACCESS_WRITE) && !writable && !kwritable)
		at->expected_fault = 1;

	if (F(AC_ACCESS_FETCH) && !executable)
		at->expected_fault = 1;

	/* SMEP forbids supervisor fetches from user-accessible pages. */
	if (F(AC_ACCESS_FETCH) && user && F(AC_CPU_CR4_SMEP))
		at->expected_fault = 1;

	/* Protection keys apply only to data accesses to user-mode pages. */
	if (user && !F(AC_ACCESS_FETCH) && F(AC_PKU_PKEY) && F(AC_CPU_CR4_PKE)) {
		if (F(AC_PKU_AD)) {
			at->expected_fault = 1;
			at->expected_error |= PFERR_PK_MASK;
		} else if (F(AC_ACCESS_WRITE) && F(AC_PKU_WD) && !kwritable) {
			at->expected_fault = 1;
			at->expected_error |= PFERR_PK_MASK;
		}
	}

	/* A successful access sets Accessed, and Dirty if it was a write. */
	if (!at->expected_fault) {
		expected |= PT_ACCESSED_MASK;
		if (F(AC_ACCESS_WRITE))
			expected |= PT_DIRTY_MASK;
	}

	return expected;
}
| |
/*
 * Emulate the page walk for @flags, computing the expected fault/error code
 * and the expected final PDE/PTE values (A/D bit updates included).
 */
static void ac_emulate_access(ac_test_t *at, unsigned flags)
{
	bool pde_valid, pte_valid;
	bool user, writable, executable;

	/* Error-code bits describing the access itself. */
	if (F(AC_ACCESS_USER))
		at->expected_error |= PFERR_USER_MASK;

	if (F(AC_ACCESS_WRITE))
		at->expected_error |= PFERR_WRITE_MASK;

	if (F(AC_ACCESS_FETCH))
		at->expected_error |= PFERR_FETCH_MASK;

	/* If the PDE starts non-Accessed, don't compare its A bit afterwards. */
	if (!F(AC_PDE_ACCESSED))
		at->ignore_pde = PT_ACCESSED_MASK;

	/* A PDE is invalid if not present or if reserved bits are set. */
	pde_valid = F(AC_PDE_PRESENT)
		&& !F(AC_PDE_BIT51) && !F(AC_PDE_BIT36) && !F(AC_PDE_BIT13)
		&& !(F(AC_PDE_NX) && !F(AC_CPU_EFER_NX));

	if (!pde_valid) {
		at->expected_fault = 1;
		if (F(AC_PDE_PRESENT)) {
			/* Present but malformed => reserved-bit fault. */
			at->expected_error |= PFERR_RESERVED_MASK;
		} else {
			at->expected_error &= ~PFERR_PRESENT_MASK;
		}
		goto fault;
	}

	/* Effective permissions after the PDE (and the PDPTE override). */
	writable = !F(AC_PDPTE_NO_WRITABLE) && F(AC_PDE_WRITABLE);
	user = F(AC_PDE_USER);
	executable = !F(AC_PDE_NX);

	/* A PSE PDE is the leaf; check permissions against it directly. */
	if (F(AC_PDE_PSE)) {
		at->expected_pde |= ac_test_permissions(at, flags, writable,
							user, executable);
		goto no_pte;
	}

	/* A non-leaf PDE gets its Accessed bit set by the walk. */
	at->expected_pde |= PT_ACCESSED_MASK;

	pte_valid = F(AC_PTE_PRESENT)
		&& !F(AC_PTE_BIT51) && !F(AC_PTE_BIT36)
		&& !(F(AC_PTE_NX) && !F(AC_CPU_EFER_NX));

	if (!pte_valid) {
		at->expected_fault = 1;
		if (F(AC_PTE_PRESENT)) {
			at->expected_error |= PFERR_RESERVED_MASK;
		} else {
			at->expected_error &= ~PFERR_PRESENT_MASK;
		}
		goto fault;
	}

	/* Permissions AND together across levels. */
	writable &= F(AC_PTE_WRITABLE);
	user &= F(AC_PTE_USER);
	executable &= !F(AC_PTE_NX);

	at->expected_pte |= ac_test_permissions(at, flags, writable, user,
						executable);

no_pte:
fault:
	if (!at->expected_fault)
		at->ignore_pde = 0;
	/* Without NX and SMEP, fetches are not reported in the error code. */
	if (!F(AC_CPU_EFER_NX) && !F(AC_CPU_CR4_SMEP))
		at->expected_error &= ~PFERR_FETCH_MASK;
}
| |
/*
 * Compute the expected outcome of the access described by at->flags,
 * starting from the currently installed PTE/PDE values.
 */
static void ac_set_expected_status(ac_test_t *at)
{
	/* Flush any stale translation before the measured access. */
	invlpg(at->virt);

	if (at->ptep)
		at->expected_pte = *at->ptep;
	at->expected_pde = *at->pdep;
	at->ignore_pde = 0;
	at->expected_fault = 0;
	at->expected_error = PFERR_PRESENT_MASK;

	/*
	 * For "twice" tests, first emulate a plain read so its side effects
	 * (A/D bits) are folded in, then reset the fault expectations and
	 * emulate the real access.
	 */
	if (at->flags & AC_ACCESS_TWICE_MASK) {
		ac_emulate_access(at, at->flags &
				  ~AC_ACCESS_WRITE_MASK &
				  ~AC_ACCESS_FETCH_MASK &
				  ~AC_ACCESS_USER_MASK);
		at->expected_fault = 0;
		at->expected_error = PFERR_PRESENT_MASK;
		at->ignore_pde = 0;
	}

	ac_emulate_access(at, at->flags);
}
| |
/*
 * Build the non-leaf entry for level @i, pointing at this test's page table
 * for that level.  Aborts if a different table is already installed, i.e. if
 * two overlapping walks disagree about the paging structures.
 */
static pt_element_t ac_get_pt(ac_test_t *at, int i, pt_element_t *ptep)
{
	pt_element_t pte;

	pte = *ptep;
	if (pte && !(pte & PT_PAGE_SIZE_MASK) &&
	    (pte & PT_BASE_ADDR_MASK) != at->page_tables[i]) {
		printf("\nPT collision.  VA = 0x%lx, level = %d, index = %ld, found PT = 0x%lx, want PT = 0x%lx\n",
		       (unsigned long)at->virt, i,
		       PT_INDEX((unsigned long)at->virt, i),
		       pte, at->page_tables[i]);
		abort();
	}

	/*
	 * Preserve A/D bits to avoid writing upper level PTEs,
	 * which cannot be unsync'd when KVM uses shadow paging.
	 */
	pte = at->page_tables[i] | (pte & (PT_DIRTY_MASK | PT_ACCESSED_MASK));
	return pte;
}
| |
/*
 * Install the page-table walk for at->virt according to at->flags, then
 * compute the expected outcome.  Levels 5/4 are fully permissive, level 3
 * honors AC_PDPTE_NO_WRITABLE, and levels 2/1 are built bit-by-bit from
 * the AC_PDE_*/AC_PTE_* flags.
 */
static void ac_test_setup_ptes(ac_test_t *at)
{
	unsigned long parent_pte = shadow_cr3;
	int flags = at->flags;
	int i;

	at->ptep = 0;
	/* Stop at level 2 for PSE mappings; there is no PTE in that case. */
	for (i = at->pt_levels; i >= 1 && (i >= 2 || !F(AC_PDE_PSE)); --i) {
		pt_element_t *parent_pt = va(parent_pte & PT_BASE_ADDR_MASK);
		unsigned index = PT_INDEX((unsigned long)at->virt, i);
		pt_element_t *ptep = &parent_pt[index];
		pt_element_t pte;

		switch (i) {
		case 5:
		case 4:
			/* Upper levels: always present/writable/user. */
			pte = ac_get_pt(at, i, ptep);
			pte |= PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
			break;
		case 3:
			/* PDPTE: optionally read-only to restrict the walk. */
			pte = ac_get_pt(at, i, ptep);
			pte |= PT_PRESENT_MASK | PT_USER_MASK;
			if (!F(AC_PDPTE_NO_WRITABLE))
				pte |= PT_WRITABLE_MASK;
			break;
		case 2:
			if (!F(AC_PDE_PSE)) {
				pte = ac_get_pt(at, i, ptep);

				/* The protection key is ignored on non-leaf entries. */
				if (F(AC_PKU_PKEY))
					pte |= 2ull << 59;
			} else {
				/* PSE: the PDE itself maps the 2M page. */
				pte = at->phys & PT_PSE_BASE_ADDR_MASK;
				pte |= PT_PAGE_SIZE_MASK;
				if (F(AC_PKU_PKEY))
					pte |= 1ull << 59;
			}
			if (F(AC_PDE_PRESENT))
				pte |= PT_PRESENT_MASK;
			if (F(AC_PDE_WRITABLE))
				pte |= PT_WRITABLE_MASK;
			if (F(AC_PDE_USER))
				pte |= PT_USER_MASK;
			if (F(AC_PDE_ACCESSED))
				pte |= PT_ACCESSED_MASK;
			if (F(AC_PDE_DIRTY))
				pte |= PT_DIRTY_MASK;
			if (F(AC_PDE_NX))
				pte |= PT64_NX_MASK;
			if (F(AC_PDE_BIT51))
				pte |= 1ull << 51;
			if (F(AC_PDE_BIT36))
				pte |= 1ull << 36;
			if (F(AC_PDE_BIT13))
				pte |= 1ull << 13;
			at->pdep = ptep;
			break;
		case 1:
			/* Leaf PTE mapping the 4K data page. */
			pte = at->phys & PT_BASE_ADDR_MASK;
			if (F(AC_PKU_PKEY))
				pte |= 1ull << 59;
			if (F(AC_PTE_PRESENT))
				pte |= PT_PRESENT_MASK;
			if (F(AC_PTE_WRITABLE))
				pte |= PT_WRITABLE_MASK;
			if (F(AC_PTE_USER))
				pte |= PT_USER_MASK;
			if (F(AC_PTE_ACCESSED))
				pte |= PT_ACCESSED_MASK;
			if (F(AC_PTE_DIRTY))
				pte |= PT_DIRTY_MASK;
			if (F(AC_PTE_NX))
				pte |= PT64_NX_MASK;
			if (F(AC_PTE_BIT51))
				pte |= 1ull << 51;
			if (F(AC_PTE_BIT36))
				pte |= 1ull << 36;
			at->ptep = ptep;
			break;
		default:
			assert(0);
		}

		/* Avoid rewriting identical entries (keeps SPTEs unclobbered). */
		if (pte != *ptep)
			*ptep = pte;

		parent_pte = pte;
	}
	ac_set_expected_status(at);
}
| |
| static void __dump_pte(pt_element_t *ptep, int level, unsigned long virt) |
| { |
| printf("------L%d I%lu: %lx\n", level, PT_INDEX(virt, level), *ptep); |
| } |
| |
| static void dump_mapping(ac_test_t *at) |
| { |
| unsigned long virt = (unsigned long)at->virt; |
| int flags = at->flags; |
| |
| printf("Dump mapping: address: %p\n", at->virt); |
| walk_va(at, F(AC_PDE_PSE) ? 2 : 1, virt, __dump_pte, false); |
| } |
| |
| static void ac_test_check(ac_test_t *at, _Bool *success_ret, _Bool cond, |
| const char *fmt, ...) |
| { |
| va_list ap; |
| char buf[500]; |
| |
| if (!*success_ret) { |
| return; |
| } |
| |
| if (!cond) { |
| return; |
| } |
| |
| *success_ret = false; |
| |
| if (!verbose) { |
| puts("\n"); |
| ac_test_show(at); |
| } |
| |
| va_start(ap, fmt); |
| vsnprintf(buf, sizeof(buf), fmt, ap); |
| va_end(ap); |
| printf("FAIL: %s\n", buf); |
| dump_mapping(at); |
| } |
| |
| static int pt_match(pt_element_t pte1, pt_element_t pte2, pt_element_t ignore) |
| { |
| pte1 &= ~ignore; |
| pte2 &= ~ignore; |
| return pte1 == pte2; |
| } |
| |
| static int ac_test_do_access(ac_test_t *at) |
| { |
| static unsigned unique = 42; |
| int fault = 0; |
| unsigned e; |
| static unsigned char user_stack[4096]; |
| unsigned long rsp; |
| _Bool success = true; |
| int flags = at->flags; |
| |
| ++unique; |
| if (!(unique & 65535)) { |
| puts("."); |
| } |
| |
| *((unsigned char *)at->phys) = 0xc3; /* ret */ |
| |
| unsigned r = unique; |
| set_cr0_wp(F(AC_CPU_CR0_WP)); |
| set_efer_nx(F(AC_CPU_EFER_NX)); |
| set_cr4_pke(F(AC_CPU_CR4_PKE)); |
| if (F(AC_CPU_CR4_PKE)) { |
| /* WD2=AD2=1, WD1=F(AC_PKU_WD), AD1=F(AC_PKU_AD) */ |
| write_pkru(0x30 | (F(AC_PKU_WD) ? 8 : 0) | |
| (F(AC_PKU_AD) ? 4 : 0)); |
| } |
| |
| set_cr4_smep(at, F(AC_CPU_CR4_SMEP)); |
| |
| if (F(AC_ACCESS_TWICE)) { |
| asm volatile ("mov $fixed2, %%rsi \n\t" |
| "mov (%[addr]), %[reg] \n\t" |
| "fixed2:" |
| : [reg]"=r"(r), [fault]"=a"(fault), "=b"(e) |
| : [addr]"r"(at->virt) |
| : "rsi"); |
| fault = 0; |
| } |
| |
| asm volatile ("mov $fixed1, %%rsi \n\t" |
| "mov %%rsp, %[rsp0] \n\t" |
| "cmp $0, %[user] \n\t" |
| "jz do_access \n\t" |
| "push %%rax; mov %[user_ds], %%ax; mov %%ax, %%ds; pop %%rax \n\t" |
| "pushq %[user_ds] \n\t" |
| "pushq %[user_stack_top] \n\t" |
| "pushfq \n\t" |
| "pushq %[user_cs] \n\t" |
| "pushq $do_access \n\t" |
| "iretq \n" |
| "do_access: \n\t" |
| "cmp $0, %[fetch] \n\t" |
| "jnz 2f \n\t" |
| "cmp $0, %[write] \n\t" |
| "jnz 1f \n\t" |
| "mov (%[addr]), %[reg] \n\t" |
| "jmp done \n\t" |
| "1: mov %[reg], (%[addr]) \n\t" |
| "jmp done \n\t" |
| "2: call *%[addr] \n\t" |
| "done: \n" |
| "fixed1: \n" |
| "int %[kernel_entry_vector] \n\t" |
| ".section .text.entry \n\t" |
| "kernel_entry: \n\t" |
| "mov %[rsp0], %%rsp \n\t" |
| "jmp back_to_kernel \n\t" |
| ".section .text \n\t" |
| "back_to_kernel:" |
| : [reg]"+r"(r), "+a"(fault), "=b"(e), "=&d"(rsp), |
| [rsp0]"=m"(tss[0].rsp0) |
| : [addr]"r"(at->virt), |
| [write]"r"(F(AC_ACCESS_WRITE)), |
| [user]"r"(F(AC_ACCESS_USER)), |
| [fetch]"r"(F(AC_ACCESS_FETCH)), |
| [user_ds]"i"(USER_DS), |
| [user_cs]"i"(USER_CS), |
| [user_stack_top]"r"(user_stack + sizeof user_stack), |
| [kernel_entry_vector]"i"(0x20) |
| : "rsi"); |
| |
| asm volatile (".section .text.pf \n\t" |
| "page_fault: \n\t" |
| "pop %rbx \n\t" |
| "mov %rsi, (%rsp) \n\t" |
| "movl $1, %eax \n\t" |
| "iretq \n\t" |
| ".section .text"); |
| |
| ac_test_check(at, &success, fault && !at->expected_fault, |
| "unexpected fault"); |
| ac_test_check(at, &success, !fault && at->expected_fault, |
| "unexpected access"); |
| ac_test_check(at, &success, fault && e != at->expected_error, |
| "error code %x expected %x", e, at->expected_error); |
| if (at->ptep) |
| ac_test_check(at, &success, *at->ptep != at->expected_pte, |
| "pte %x expected %x", *at->ptep, at->expected_pte); |
| ac_test_check(at, &success, |
| !pt_match(*at->pdep, at->expected_pde, at->ignore_pde), |
| "pde %x expected %x", *at->pdep, at->expected_pde); |
| |
| if (success && verbose) { |
| if (at->expected_fault) { |
| printf("PASS (%x)\n", at->expected_error); |
| } else { |
| printf("PASS\n"); |
| } |
| } |
| return success; |
| } |
| |
| static void ac_test_show(ac_test_t *at) |
| { |
| char line[5000]; |
| |
| *line = 0; |
| strcat(line, "test"); |
| for (int i = 0; i < NR_AC_FLAGS; ++i) |
| if (at->flags & (1 << i)) { |
| strcat(line, " "); |
| strcat(line, ac_names[i]); |
| } |
| |
| strcat(line, ": "); |
| printf("%s", line); |
| } |
| |
| /* |
| * This test case is used to trigger the bug which is fixed by |
| * commit e09e90a5 in the kvm tree |
| */ |
/*
 * Two buddied 2M mappings: read through a read-only one, read through a
 * writable one, then write through each in turn.  Returns 1 on success.
 */
static int corrupt_hugepage_trigger(ac_pt_env_t *pt_env)
{
	ac_test_t at1, at2;

	ac_test_init(&at1, 0xffff923400000000ul, pt_env);
	__ac_test_init(&at2, 0xffffe66600000000ul, pt_env, &at1);

	/* Read-only 2M page under CR0.WP=1; a plain read must succeed. */
	at2.flags = AC_CPU_CR0_WP_MASK | AC_PDE_PSE_MASK | AC_PDE_PRESENT_MASK;
	ac_test_setup_ptes(&at2);
	if (!ac_test_do_access(&at2))
		goto err;

	/* Same config but writable; read again. */
	at1.flags = at2.flags | AC_PDE_WRITABLE_MASK;
	ac_test_setup_ptes(&at1);
	if (!ac_test_do_access(&at1))
		goto err;

	/* Write through the writable mapping (PTEs unchanged, only expectations). */
	at1.flags |= AC_ACCESS_WRITE_MASK;
	ac_set_expected_status(&at1);
	if (!ac_test_do_access(&at1))
		goto err;

	/* Write through the read-only mapping; must fault. */
	at2.flags |= AC_ACCESS_WRITE_MASK;
	ac_set_expected_status(&at2);
	if (!ac_test_do_access(&at2))
		goto err;

	return 1;

err:
	printf("corrupt_hugepage_trigger test fail\n");
	return 0;
}
| |
| /* |
| * This test case is used to trigger the bug which is fixed by |
| * commit 3ddf6c06e13e in the kvm tree |
| */ |
/*
 * Two buddied 4K mappings sharing page tables, one plain and one with a
 * reserved NX bit (EFER.NX clear): accessing the first may prefetch the
 * second's PTE; the second access must still report the right error code.
 * Returns 1 on success.
 */
static int check_pfec_on_prefetch_pte(ac_pt_env_t *pt_env)
{
	ac_test_t at1, at2;

	ac_test_init(&at1, 0xffff923406001000ul, pt_env);
	__ac_test_init(&at2, 0xffff923406003000ul, pt_env, &at1);

	at1.flags = AC_PDE_PRESENT_MASK | AC_PTE_PRESENT_MASK;
	ac_test_setup_ptes(&at1);

	at2.flags = at1.flags | AC_PTE_NX_MASK;
	ac_test_setup_ptes(&at2);

	if (!ac_test_do_access(&at1)) {
		printf("%s: prepare fail\n", __FUNCTION__);
		goto err;
	}

	if (!ac_test_do_access(&at2)) {
		printf("%s: check PFEC on prefetch pte path fail\n",
		       __FUNCTION__);
		goto err;
	}

	return 1;

err:
	return 0;
}
| |
| /* |
| * If the write-fault access is from supervisor and CR0.WP is not set on the |
| * vcpu, kvm will fix it by adjusting pte access - it sets the W bit on pte |
| * and clears U bit. This is the chance that kvm can change pte access from |
| * readonly to writable. |
| * |
| * Unfortunately, the pte access is the access of 'direct' shadow page table, |
| * means direct sp.role.access = pte_access, then we will create a writable |
| * spte entry on the readonly shadow page table. It will cause Dirty bit is |
| * not tracked when two guest ptes point to the same large page. Note, it |
| * does not have other impact except Dirty bit since cr0.wp is encoded into |
| * sp.role. |
| * |
| * Note: to trigger this bug, hugepage should be disabled on host. |
| */ |
/*
 * Two buddied read-only 2M mappings of the same page with CR0.WP=0:
 * read via one, write via the other (allowed since CR0.WP=0), then write
 * via the first and verify its Dirty bit is still tracked correctly.
 * Returns 1 on success.
 */
static int check_large_pte_dirty_for_nowp(ac_pt_env_t *pt_env)
{
	ac_test_t at1, at2;

	ac_test_init(&at1, 0xffff923403000000ul, pt_env);
	__ac_test_init(&at2, 0xffffe66606000000ul, pt_env, &at1);

	at2.flags = AC_PDE_PRESENT_MASK | AC_PDE_PSE_MASK;
	ac_test_setup_ptes(&at2);
	if (!ac_test_do_access(&at2)) {
		printf("%s: read on the first mapping fail.\n", __FUNCTION__);
		goto err;
	}

	at1.flags = at2.flags | AC_ACCESS_WRITE_MASK;
	ac_test_setup_ptes(&at1);
	if (!ac_test_do_access(&at1)) {
		printf("%s: write on the second mapping fail.\n", __FUNCTION__);
		goto err;
	}

	at2.flags |= AC_ACCESS_WRITE_MASK;
	ac_set_expected_status(&at2);
	if (!ac_test_do_access(&at2)) {
		printf("%s: write on the first mapping fail.\n", __FUNCTION__);
		goto err;
	}

	return 1;

err:
	return 0;
}
| |
/*
 * With CR4.SMEP=1 and CR0.WP=0: a supervisor write to a read-only user page
 * succeeds (WP=0), then a fetch from the same user page must take a SMEP
 * fault.  Skipped (returns 1) when the CPU lacks SMEP.
 */
static int check_smep_andnot_wp(ac_pt_env_t *pt_env)
{
	ac_test_t at1;
	int err_prepare_andnot_wp, err_smep_andnot_wp;

	if (!this_cpu_has(X86_FEATURE_SMEP)) {
		return 1;
	}

	ac_test_init(&at1, 0xffff923406001000ul, pt_env);

	at1.flags = AC_PDE_PRESENT_MASK | AC_PTE_PRESENT_MASK |
		    AC_PDE_USER_MASK | AC_PTE_USER_MASK |
		    AC_PDE_ACCESSED_MASK | AC_PTE_ACCESSED_MASK |
		    AC_CPU_CR4_SMEP_MASK |
		    AC_CPU_CR0_WP_MASK |
		    AC_ACCESS_WRITE_MASK;
	ac_test_setup_ptes(&at1);

	/*
	 * Here we write the ro user page when
	 * cr0.wp=0, then we execute it and SMEP
	 * fault should happen.
	 */
	err_prepare_andnot_wp = ac_test_do_access(&at1);
	if (!err_prepare_andnot_wp) {
		printf("%s: SMEP prepare fail\n", __FUNCTION__);
		goto clean_up;
	}

	/* Convert the write into a fetch; SMEP must now fault. */
	at1.flags &= ~AC_ACCESS_WRITE_MASK;
	at1.flags |= AC_ACCESS_FETCH_MASK;
	ac_set_expected_status(&at1);
	err_smep_andnot_wp = ac_test_do_access(&at1);

clean_up:
	/* Always restore CR4.SMEP=0 (re-USERs the test code pages). */
	set_cr4_smep(&at1, 0);

	if (!err_prepare_andnot_wp)
		goto err;
	if (!err_smep_andnot_wp) {
		printf("%s: check SMEP without wp fail\n", __FUNCTION__);
		goto err;
	}
	return 1;

err:
	return 0;
}
| |
/*
 * Verify that permissions accumulated above a shared page table are applied
 * per-path: two PUDs (one writable, one read-only) point at the same PMD
 * page, so the same leaf mappings must be writable via one path and
 * read-only via the other.  Returns 1 on success.
 */
static int check_effective_sp_permissions(ac_pt_env_t *pt_env)
{
	unsigned long ptr1 = 0xffff923480000000;
	unsigned long ptr2 = ptr1 + SZ_2M;
	unsigned long ptr3 = ptr1 + SZ_1G;
	unsigned long ptr4 = ptr3 + SZ_2M;
	ac_test_t at1, at2, at3, at4;
	int err_read_at1, err_write_at2;
	int err_read_at3, err_write_at4;

	/*
	 * pgd[]   pud[]        pmd[]            virtual address pointers
	 *                   /->pmd(u--)->pte1(uw-)->page1 <- ptr1 (u--)
	 *      /->pud1(uw-)--->pmd(uw-)->pte2(uw-)->page2 <- ptr2 (uw-)
	 * pgd-|
	 *      \->pud2(u--)--->pmd(u--)->pte1(uw-)->page1 <- ptr3 (u--)
	 *                   \->pmd(uw-)->pte2(uw-)->page2 <- ptr4 (u--)
	 * pud1 and pud2 point to the same pmd page.
	 */

	ac_test_init(&at1, ptr1, pt_env);
	at1.flags = AC_PDE_PRESENT_MASK | AC_PTE_PRESENT_MASK |
		    AC_PDE_USER_MASK | AC_PTE_USER_MASK |
		    AC_PDE_ACCESSED_MASK | AC_PTE_ACCESSED_MASK |
		    AC_PTE_WRITABLE_MASK | AC_ACCESS_USER_MASK;
	ac_test_setup_ptes(&at1);

	/* ptr2 shares at1's walk above the PMD; its own PDE is writable. */
	__ac_test_init(&at2, ptr2, pt_env, &at1);
	at2.flags = at1.flags | AC_PDE_WRITABLE_MASK | AC_PTE_DIRTY_MASK | AC_ACCESS_WRITE_MASK;
	ac_test_setup_ptes(&at2);

	__ac_test_init(&at3, ptr3, pt_env, &at1);
	/* Override the PMD (1-based index) to point at ptr1's PMD. */
	at3.page_tables[3] = at1.page_tables[3];
	at3.flags = AC_PDPTE_NO_WRITABLE_MASK | at1.flags;
	ac_test_setup_ptes(&at3);

	/* Alias ptr2, only the PMD will differ; manually override the PMD. */
	__ac_test_init(&at4, ptr4, pt_env, &at2);
	at4.page_tables[3] = at1.page_tables[3];
	at4.flags = AC_PDPTE_NO_WRITABLE_MASK | at2.flags;
	ac_test_setup_ptes(&at4);

	err_read_at1 = ac_test_do_access(&at1);
	if (!err_read_at1) {
		printf("%s: read access at1 fail\n", __FUNCTION__);
		return 0;
	}

	err_write_at2 = ac_test_do_access(&at2);
	if (!err_write_at2) {
		printf("%s: write access at2 fail\n", __FUNCTION__);
		return 0;
	}

	err_read_at3 = ac_test_do_access(&at3);
	if (!err_read_at3) {
		printf("%s: read access at3 fail\n", __FUNCTION__);
		return 0;
	}

	/* at4 expects the write to fault due to the read-only PUD path. */
	err_write_at4 = ac_test_do_access(&at4);
	if (!err_write_at4) {
		printf("%s: write access at4 should fail\n", __FUNCTION__);
		return 0;
	}

	return 1;
}
| |
| static int ac_test_exec(ac_test_t *at, ac_pt_env_t *pt_env) |
| { |
| int r; |
| |
| if (verbose) { |
| ac_test_show(at); |
| } |
| ac_test_setup_ptes(at); |
| r = ac_test_do_access(at); |
| return r; |
| } |
| |
/* Directed regression tests, run after the combinatorial sweep. */
typedef int (*ac_test_fn)(ac_pt_env_t *pt_env);
const ac_test_fn ac_test_cases[] =
{
	corrupt_hugepage_trigger,
	check_pfec_on_prefetch_pte,
	check_large_pte_dirty_for_nowp,
	check_smep_andnot_wp,
	check_effective_sp_permissions,
};
| |
/*
 * Main entry point: probe CPU features, run the full combinatorial sweep of
 * AC_* flags, then the directed test cases.  Returns non-zero if every test
 * passed.
 */
int ac_test_run(int pt_levels)
{
	ac_test_t at;
	ac_pt_env_t pt_env;
	int i, tests, successes;

	printf("run\n");
	tests = successes = 0;

	/* Snapshot the current CPU state; set_* helpers keep these in sync. */
	shadow_cr0 = read_cr0();
	shadow_cr4 = read_cr4();
	shadow_cr3 = read_cr3();
	shadow_efer = rdmsr(MSR_EFER);

	/*
	 * Bits 51/36 are only reserved when MAXPHYADDR is small enough; on
	 * larger CPUs they cannot trigger reserved-bit faults, so mask those
	 * flag combinations out of the sweep.
	 */
	if (cpuid_maxphyaddr() >= 52) {
		invalid_mask |= AC_PDE_BIT51_MASK;
		invalid_mask |= AC_PTE_BIT51_MASK;
	}
	if (cpuid_maxphyaddr() >= 37) {
		invalid_mask |= AC_PDE_BIT36_MASK;
		invalid_mask |= AC_PTE_BIT36_MASK;
	}

	ac_env_int(&pt_env, pt_levels);
	ac_test_init(&at, 0xffff923400000000ul, &pt_env);

	if (this_cpu_has(X86_FEATURE_PKU)) {
		set_cr4_pke(1);
		set_cr4_pke(0);
		/* Now PKRU = 0xFFFFFFFF.  */
	} else {
		/* No PKU: setting CR4.PKE must #GP; disable PKE tests. */
		tests++;
		if (write_cr4_safe(shadow_cr4 | X86_CR4_PKE) == GP_VECTOR) {
			successes++;
			invalid_mask |= AC_PKU_AD_MASK;
			invalid_mask |= AC_PKU_WD_MASK;
			invalid_mask |= AC_PKU_PKEY_MASK;
			invalid_mask |= AC_CPU_CR4_PKE_MASK;
			printf("CR4.PKE not available, disabling PKE tests\n");
		} else {
			printf("Set PKE in CR4 - expect #GP: FAIL!\n");
			set_cr4_pke(0);
		}
	}

	if (!this_cpu_has(X86_FEATURE_SMEP)) {
		/* No SMEP: setting CR4.SMEP must #GP; disable SMEP tests. */
		tests++;
		if (set_cr4_smep(&at, 1) == GP_VECTOR) {
			successes++;
			invalid_mask |= AC_CPU_CR4_SMEP_MASK;
			printf("CR4.SMEP not available, disabling SMEP tests\n");
		} else {
			printf("Set SMEP in CR4 - expect #GP: FAIL!\n");
			set_cr4_smep(&at, 0);
		}
	}

	/* Toggling LA57 in 64-bit mode (guaranteed for this test) is illegal. */
	if (this_cpu_has(X86_FEATURE_LA57)) {
		tests++;
		if (write_cr4_safe(shadow_cr4 ^ X86_CR4_LA57) == GP_VECTOR)
			successes++;

		/* Force a VM-Exit on KVM, which doesn't intercept LA57 itself. */
		tests++;
		if (write_cr4_safe(shadow_cr4 ^ (X86_CR4_LA57 | X86_CR4_PSE)) == GP_VECTOR)
			successes++;
	}

	/* The full combinatorial sweep over all legal flag combinations. */
	do {
		++tests;
		successes += ac_test_exec(&at, &pt_env);
	} while (ac_test_bump(&at));

	/* Directed regression tests, each with a fresh paging environment. */
	for (i = 0; i < ARRAY_SIZE(ac_test_cases); i++) {
		ac_env_int(&pt_env, pt_levels);

		++tests;
		successes += ac_test_cases[i](&pt_env);
	}

	printf("\n%d tests, %d failures\n", tests, tests - successes);

	return successes == tests;
}