| // SPDX-License-Identifier: GPL-2.0-only |
| /* |
| * Stand-alone page-table allocator for hyp stage-1 and guest stage-2. |
| * No bombay mix was harmed in the writing of this file. |
| * |
| * Copyright (C) 2020 Google LLC |
| * Author: Will Deacon <will@kernel.org> |
| */ |
| |
| #include <linux/bitfield.h> |
| #include <asm/kvm_pgtable.h> |
| #include <asm/stage2_pgtable.h> |
| |
| |
/*
 * Descriptor type field (bit 1). Blocks encode as 0; pages and tables share
 * the encoding 1.
 */
#define KVM_PTE_TYPE			BIT(1)
#define KVM_PTE_TYPE_BLOCK		0
#define KVM_PTE_TYPE_PAGE		1
#define KVM_PTE_TYPE_TABLE		1

/* Lower attribute field of a leaf entry. */
#define KVM_PTE_LEAF_ATTR_LO		GENMASK(11, 2)

/*
 * Stage-1 lower attributes. The AP encodings for read-only and read-write
 * are selected at run time by the ARM64_KVM_HVHE capability.
 */
#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX	GENMASK(4, 2)
#define KVM_PTE_LEAF_ATTR_LO_S1_AP	GENMASK(7, 6)
#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO	\
	(cpus_have_final_cap(ARM64_KVM_HVHE) ? 2 : 3)
#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW	\
	(cpus_have_final_cap(ARM64_KVM_HVHE) ? 0 : 1)
#define KVM_PTE_LEAF_ATTR_LO_S1_SH	GENMASK(9, 8)
#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS	3
#define KVM_PTE_LEAF_ATTR_LO_S1_AF	BIT(10)

/* Stage-2 lower attributes. */
#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR	GENMASK(5, 2)
#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R	BIT(6)
#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W	BIT(7)
#define KVM_PTE_LEAF_ATTR_LO_S2_SH	GENMASK(9, 8)
#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS	3
#define KVM_PTE_LEAF_ATTR_LO_S2_AF	BIT(10)

/* Upper attribute field of a leaf entry, and the sub-fields used here. */
#define KVM_PTE_LEAF_ATTR_HI		GENMASK(63, 50)

#define KVM_PTE_LEAF_ATTR_HI_SW		GENMASK(58, 55)

#define KVM_PTE_LEAF_ATTR_HI_S1_XN	BIT(54)

#define KVM_PTE_LEAF_ATTR_HI_S2_XN	BIT(54)

#define KVM_PTE_LEAF_ATTR_HI_S1_GP	BIT(50)

/* All stage-2 permission bits: read, write and execute-never. */
#define KVM_PTE_LEAF_ATTR_S2_PERMS	(KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \
					 KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
					 KVM_PTE_LEAF_ATTR_HI_S2_XN)

/* Owner id field carried by an invalid pte, and the largest id accepted. */
#define KVM_INVALID_PTE_OWNER_MASK	GENMASK(9, 2)
#define KVM_MAX_OWNER_ID		1

/*
 * Used to indicate a pte for which a 'break-before-make' sequence is in
 * progress.
 */
#define KVM_INVALID_PTE_LOCKED		BIT(10)
| |
| struct kvm_pgtable_walk_data { |
| struct kvm_pgtable_walker *walker; |
| |
| const u64 start; |
| u64 addr; |
| const u64 end; |
| }; |
| |
| static bool kvm_pgtable_walk_skip_bbm_tlbi(const struct kvm_pgtable_visit_ctx *ctx) |
| { |
| return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_BBM_TLBI); |
| } |
| |
| static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx) |
| { |
| return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_CMO); |
| } |
| |
| static bool kvm_phys_is_valid(u64 phys) |
| { |
| u64 parange_max = kvm_get_parange_max(); |
| u8 shift = id_aa64mmfr0_parange_to_phys_shift(parange_max); |
| |
| return phys < BIT(shift); |
| } |
| |
| static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, u64 phys) |
| { |
| u64 granule = kvm_granule_size(ctx->level); |
| |
| if (!kvm_level_supports_block_mapping(ctx->level)) |
| return false; |
| |
| if (granule > (ctx->end - ctx->addr)) |
| return false; |
| |
| if (kvm_phys_is_valid(phys) && !IS_ALIGNED(phys, granule)) |
| return false; |
| |
| return IS_ALIGNED(ctx->addr, granule); |
| } |
| |
| static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, s8 level) |
| { |
| u64 shift = kvm_granule_shift(level); |
| u64 mask = BIT(PAGE_SHIFT - 3) - 1; |
| |
| return (data->addr >> shift) & mask; |
| } |
| |
| static u32 kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr) |
| { |
| u64 shift = kvm_granule_shift(pgt->start_level - 1); /* May underflow */ |
| u64 mask = BIT(pgt->ia_bits) - 1; |
| |
| return (addr & mask) >> shift; |
| } |
| |
| static u32 kvm_pgd_pages(u32 ia_bits, s8 start_level) |
| { |
| struct kvm_pgtable pgt = { |
| .ia_bits = ia_bits, |
| .start_level = start_level, |
| }; |
| |
| return kvm_pgd_page_idx(&pgt, -1ULL) + 1; |
| } |
| |
| static bool kvm_pte_table(kvm_pte_t pte, s8 level) |
| { |
| if (level == KVM_PGTABLE_LAST_LEVEL) |
| return false; |
| |
| if (!kvm_pte_valid(pte)) |
| return false; |
| |
| return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE; |
| } |
| |
| static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops) |
| { |
| return mm_ops->phys_to_virt(kvm_pte_to_phys(pte)); |
| } |
| |
| static void kvm_clear_pte(kvm_pte_t *ptep) |
| { |
| WRITE_ONCE(*ptep, 0); |
| } |
| |
| static kvm_pte_t kvm_init_table_pte(kvm_pte_t *childp, struct kvm_pgtable_mm_ops *mm_ops) |
| { |
| kvm_pte_t pte = kvm_phys_to_pte(mm_ops->virt_to_phys(childp)); |
| |
| pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE); |
| pte |= KVM_PTE_VALID; |
| return pte; |
| } |
| |
| static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, s8 level) |
| { |
| kvm_pte_t pte = kvm_phys_to_pte(pa); |
| u64 type = (level == KVM_PGTABLE_LAST_LEVEL) ? KVM_PTE_TYPE_PAGE : |
| KVM_PTE_TYPE_BLOCK; |
| |
| pte |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI); |
| pte |= FIELD_PREP(KVM_PTE_TYPE, type); |
| pte |= KVM_PTE_VALID; |
| |
| return pte; |
| } |
| |
| static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id) |
| { |
| return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id); |
| } |
| |
| static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data, |
| const struct kvm_pgtable_visit_ctx *ctx, |
| enum kvm_pgtable_walk_flags visit) |
| { |
| struct kvm_pgtable_walker *walker = data->walker; |
| |
| /* Ensure the appropriate lock is held (e.g. RCU lock for stage-2 MMU) */ |
| WARN_ON_ONCE(kvm_pgtable_walk_shared(ctx) && !kvm_pgtable_walk_lock_held()); |
| return walker->cb(ctx, visit); |
| } |
| |
| static bool kvm_pgtable_walk_continue(const struct kvm_pgtable_walker *walker, |
| int r) |
| { |
| /* |
| * Visitor callbacks return EAGAIN when the conditions that led to a |
| * fault are no longer reflected in the page tables due to a race to |
| * update a PTE. In the context of a fault handler this is interpreted |
| * as a signal to retry guest execution. |
| * |
| * Ignore the return code altogether for walkers outside a fault handler |
| * (e.g. write protecting a range of memory) and chug along with the |
| * page table walk. |
| */ |
| if (r == -EAGAIN) |
| return !(walker->flags & KVM_PGTABLE_WALK_HANDLE_FAULT); |
| |
| return !r; |
| } |
| |
| static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, |
| struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, s8 level); |
| |
| static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data, |
| struct kvm_pgtable_mm_ops *mm_ops, |
| kvm_pteref_t pteref, s8 level) |
| { |
| enum kvm_pgtable_walk_flags flags = data->walker->flags; |
| kvm_pte_t *ptep = kvm_dereference_pteref(data->walker, pteref); |
| struct kvm_pgtable_visit_ctx ctx = { |
| .ptep = ptep, |
| .old = READ_ONCE(*ptep), |
| .arg = data->walker->arg, |
| .mm_ops = mm_ops, |
| .start = data->start, |
| .addr = data->addr, |
| .end = data->end, |
| .level = level, |
| .flags = flags, |
| }; |
| int ret = 0; |
| bool reload = false; |
| kvm_pteref_t childp; |
| bool table = kvm_pte_table(ctx.old, level); |
| |
| if (table && (ctx.flags & KVM_PGTABLE_WALK_TABLE_PRE)) { |
| ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_PRE); |
| reload = true; |
| } |
| |
| if (!table && (ctx.flags & KVM_PGTABLE_WALK_LEAF)) { |
| ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_LEAF); |
| reload = true; |
| } |
| |
| /* |
| * Reload the page table after invoking the walker callback for leaf |
| * entries or after pre-order traversal, to allow the walker to descend |
| * into a newly installed or replaced table. |
| */ |
| if (reload) { |
| ctx.old = READ_ONCE(*ptep); |
| table = kvm_pte_table(ctx.old, level); |
| } |
| |
| if (!kvm_pgtable_walk_continue(data->walker, ret)) |
| goto out; |
| |
| if (!table) { |
| data->addr = ALIGN_DOWN(data->addr, kvm_granule_size(level)); |
| data->addr += kvm_granule_size(level); |
| goto out; |
| } |
| |
| childp = (kvm_pteref_t)kvm_pte_follow(ctx.old, mm_ops); |
| ret = __kvm_pgtable_walk(data, mm_ops, childp, level + 1); |
| if (!kvm_pgtable_walk_continue(data->walker, ret)) |
| goto out; |
| |
| if (ctx.flags & KVM_PGTABLE_WALK_TABLE_POST) |
| ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_POST); |
| |
| out: |
| if (kvm_pgtable_walk_continue(data->walker, ret)) |
| return 0; |
| |
| return ret; |
| } |
| |
| static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data, |
| struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, s8 level) |
| { |
| u32 idx; |
| int ret = 0; |
| |
| if (WARN_ON_ONCE(level < KVM_PGTABLE_FIRST_LEVEL || |
| level > KVM_PGTABLE_LAST_LEVEL)) |
| return -EINVAL; |
| |
| for (idx = kvm_pgtable_idx(data, level); idx < PTRS_PER_PTE; ++idx) { |
| kvm_pteref_t pteref = &pgtable[idx]; |
| |
| if (data->addr >= data->end) |
| break; |
| |
| ret = __kvm_pgtable_visit(data, mm_ops, pteref, level); |
| if (ret) |
| break; |
| } |
| |
| return ret; |
| } |
| |
| static int _kvm_pgtable_walk(struct kvm_pgtable *pgt, struct kvm_pgtable_walk_data *data) |
| { |
| u32 idx; |
| int ret = 0; |
| u64 limit = BIT(pgt->ia_bits); |
| |
| if (data->addr > limit || data->end > limit) |
| return -ERANGE; |
| |
| if (!pgt->pgd) |
| return -EINVAL; |
| |
| for (idx = kvm_pgd_page_idx(pgt, data->addr); data->addr < data->end; ++idx) { |
| kvm_pteref_t pteref = &pgt->pgd[idx * PTRS_PER_PTE]; |
| |
| ret = __kvm_pgtable_walk(data, pgt->mm_ops, pteref, pgt->start_level); |
| if (ret) |
| break; |
| } |
| |
| return ret; |
| } |
| |
| int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, |
| struct kvm_pgtable_walker *walker) |
| { |
| struct kvm_pgtable_walk_data walk_data = { |
| .start = ALIGN_DOWN(addr, PAGE_SIZE), |
| .addr = ALIGN_DOWN(addr, PAGE_SIZE), |
| .end = PAGE_ALIGN(walk_data.addr + size), |
| .walker = walker, |
| }; |
| int r; |
| |
| r = kvm_pgtable_walk_begin(walker); |
| if (r) |
| return r; |
| |
| r = _kvm_pgtable_walk(pgt, &walk_data); |
| kvm_pgtable_walk_end(walker); |
| |
| return r; |
| } |
| |
| struct leaf_walk_data { |
| kvm_pte_t pte; |
| s8 level; |
| }; |
| |
| static int leaf_walker(const struct kvm_pgtable_visit_ctx *ctx, |
| enum kvm_pgtable_walk_flags visit) |
| { |
| struct leaf_walk_data *data = ctx->arg; |
| |
| data->pte = ctx->old; |
| data->level = ctx->level; |
| |
| return 0; |
| } |
| |
| int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr, |
| kvm_pte_t *ptep, s8 *level) |
| { |
| struct leaf_walk_data data; |
| struct kvm_pgtable_walker walker = { |
| .cb = leaf_walker, |
| .flags = KVM_PGTABLE_WALK_LEAF, |
| .arg = &data, |
| }; |
| int ret; |
| |
| ret = kvm_pgtable_walk(pgt, ALIGN_DOWN(addr, PAGE_SIZE), |
| PAGE_SIZE, &walker); |
| if (!ret) { |
| if (ptep) |
| *ptep = data.pte; |
| if (level) |
| *level = data.level; |
| } |
| |
| return ret; |
| } |
| |
| struct hyp_map_data { |
| const u64 phys; |
| kvm_pte_t attr; |
| }; |
| |
| static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep) |
| { |
| bool device = prot & KVM_PGTABLE_PROT_DEVICE; |
| u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL; |
| kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype); |
| u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS; |
| u32 ap = (prot & KVM_PGTABLE_PROT_W) ? KVM_PTE_LEAF_ATTR_LO_S1_AP_RW : |
| KVM_PTE_LEAF_ATTR_LO_S1_AP_RO; |
| |
| if (!(prot & KVM_PGTABLE_PROT_R)) |
| return -EINVAL; |
| |
| if (prot & KVM_PGTABLE_PROT_X) { |
| if (prot & KVM_PGTABLE_PROT_W) |
| return -EINVAL; |
| |
| if (device) |
| return -EINVAL; |
| |
| if (system_supports_bti_kernel()) |
| attr |= KVM_PTE_LEAF_ATTR_HI_S1_GP; |
| } else { |
| attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN; |
| } |
| |
| attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap); |
| if (!kvm_lpa2_is_enabled()) |
| attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh); |
| attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF; |
| attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW; |
| *ptep = attr; |
| |
| return 0; |
| } |
| |
| enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte) |
| { |
| enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW; |
| u32 ap; |
| |
| if (!kvm_pte_valid(pte)) |
| return prot; |
| |
| if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_XN)) |
| prot |= KVM_PGTABLE_PROT_X; |
| |
| ap = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_AP, pte); |
| if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RO) |
| prot |= KVM_PGTABLE_PROT_R; |
| else if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RW) |
| prot |= KVM_PGTABLE_PROT_RW; |
| |
| return prot; |
| } |
| |
| static bool hyp_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, |
| struct hyp_map_data *data) |
| { |
| u64 phys = data->phys + (ctx->addr - ctx->start); |
| kvm_pte_t new; |
| |
| if (!kvm_block_mapping_supported(ctx, phys)) |
| return false; |
| |
| new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level); |
| if (ctx->old == new) |
| return true; |
| if (!kvm_pte_valid(ctx->old)) |
| ctx->mm_ops->get_page(ctx->ptep); |
| else if (WARN_ON((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW)) |
| return false; |
| |
| smp_store_release(ctx->ptep, new); |
| return true; |
| } |
| |
| static int hyp_map_walker(const struct kvm_pgtable_visit_ctx *ctx, |
| enum kvm_pgtable_walk_flags visit) |
| { |
| kvm_pte_t *childp, new; |
| struct hyp_map_data *data = ctx->arg; |
| struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; |
| |
| if (hyp_map_walker_try_leaf(ctx, data)) |
| return 0; |
| |
| if (WARN_ON(ctx->level == KVM_PGTABLE_LAST_LEVEL)) |
| return -EINVAL; |
| |
| childp = (kvm_pte_t *)mm_ops->zalloc_page(NULL); |
| if (!childp) |
| return -ENOMEM; |
| |
| new = kvm_init_table_pte(childp, mm_ops); |
| mm_ops->get_page(ctx->ptep); |
| smp_store_release(ctx->ptep, new); |
| |
| return 0; |
| } |
| |
| int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, |
| enum kvm_pgtable_prot prot) |
| { |
| int ret; |
| struct hyp_map_data map_data = { |
| .phys = ALIGN_DOWN(phys, PAGE_SIZE), |
| }; |
| struct kvm_pgtable_walker walker = { |
| .cb = hyp_map_walker, |
| .flags = KVM_PGTABLE_WALK_LEAF, |
| .arg = &map_data, |
| }; |
| |
| ret = hyp_set_prot_attr(prot, &map_data.attr); |
| if (ret) |
| return ret; |
| |
| ret = kvm_pgtable_walk(pgt, addr, size, &walker); |
| dsb(ishst); |
| isb(); |
| return ret; |
| } |
| |
| static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, |
| enum kvm_pgtable_walk_flags visit) |
| { |
| kvm_pte_t *childp = NULL; |
| u64 granule = kvm_granule_size(ctx->level); |
| u64 *unmapped = ctx->arg; |
| struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; |
| |
| if (!kvm_pte_valid(ctx->old)) |
| return -EINVAL; |
| |
| if (kvm_pte_table(ctx->old, ctx->level)) { |
| childp = kvm_pte_follow(ctx->old, mm_ops); |
| |
| if (mm_ops->page_count(childp) != 1) |
| return 0; |
| |
| kvm_clear_pte(ctx->ptep); |
| dsb(ishst); |
| __tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), TLBI_TTL_UNKNOWN); |
| } else { |
| if (ctx->end - ctx->addr < granule) |
| return -EINVAL; |
| |
| kvm_clear_pte(ctx->ptep); |
| dsb(ishst); |
| __tlbi_level(vale2is, __TLBI_VADDR(ctx->addr, 0), ctx->level); |
| *unmapped += granule; |
| } |
| |
| dsb(ish); |
| isb(); |
| mm_ops->put_page(ctx->ptep); |
| |
| if (childp) |
| mm_ops->put_page(childp); |
| |
| return 0; |
| } |
| |
| u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) |
| { |
| u64 unmapped = 0; |
| struct kvm_pgtable_walker walker = { |
| .cb = hyp_unmap_walker, |
| .arg = &unmapped, |
| .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, |
| }; |
| |
| if (!pgt->mm_ops->page_count) |
| return 0; |
| |
| kvm_pgtable_walk(pgt, addr, size, &walker); |
| return unmapped; |
| } |
| |
| int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits, |
| struct kvm_pgtable_mm_ops *mm_ops) |
| { |
| s8 start_level = KVM_PGTABLE_LAST_LEVEL + 1 - |
| ARM64_HW_PGTABLE_LEVELS(va_bits); |
| |
| if (start_level < KVM_PGTABLE_FIRST_LEVEL || |
| start_level > KVM_PGTABLE_LAST_LEVEL) |
| return -EINVAL; |
| |
| pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_page(NULL); |
| if (!pgt->pgd) |
| return -ENOMEM; |
| |
| pgt->ia_bits = va_bits; |
| pgt->start_level = start_level; |
| pgt->mm_ops = mm_ops; |
| pgt->mmu = NULL; |
| pgt->force_pte_cb = NULL; |
| |
| return 0; |
| } |
| |
| static int hyp_free_walker(const struct kvm_pgtable_visit_ctx *ctx, |
| enum kvm_pgtable_walk_flags visit) |
| { |
| struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; |
| |
| if (!kvm_pte_valid(ctx->old)) |
| return 0; |
| |
| mm_ops->put_page(ctx->ptep); |
| |
| if (kvm_pte_table(ctx->old, ctx->level)) |
| mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops)); |
| |
| return 0; |
| } |
| |
| void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt) |
| { |
| struct kvm_pgtable_walker walker = { |
| .cb = hyp_free_walker, |
| .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, |
| }; |
| |
| WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); |
| pgt->mm_ops->put_page(kvm_dereference_pteref(&walker, pgt->pgd)); |
| pgt->pgd = NULL; |
| } |
| |
| struct stage2_map_data { |
| const u64 phys; |
| kvm_pte_t attr; |
| u8 owner_id; |
| |
| kvm_pte_t *anchor; |
| kvm_pte_t *childp; |
| |
| struct kvm_s2_mmu *mmu; |
| void *memcache; |
| |
| /* Force mappings to page granularity */ |
| bool force_pte; |
| }; |
| |
| u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) |
| { |
| u64 vtcr = VTCR_EL2_FLAGS; |
| s8 lvls; |
| |
| vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT; |
| vtcr |= VTCR_EL2_T0SZ(phys_shift); |
| /* |
| * Use a minimum 2 level page table to prevent splitting |
| * host PMD huge pages at stage2. |
| */ |
| lvls = stage2_pgtable_levels(phys_shift); |
| if (lvls < 2) |
| lvls = 2; |
| |
| /* |
| * When LPA2 is enabled, the HW supports an extra level of translation |
| * (for 5 in total) when using 4K pages. It also introduces VTCR_EL2.SL2 |
| * to as an addition to SL0 to enable encoding this extra start level. |
| * However, since we always use concatenated pages for the first level |
| * lookup, we will never need this extra level and therefore do not need |
| * to touch SL2. |
| */ |
| vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls); |
| |
| #ifdef CONFIG_ARM64_HW_AFDBM |
| /* |
| * Enable the Hardware Access Flag management, unconditionally |
| * on all CPUs. In systems that have asymmetric support for the feature |
| * this allows KVM to leverage hardware support on the subset of cores |
| * that implement the feature. |
| * |
| * The architecture requires VTCR_EL2.HA to be RES0 (thus ignored by |
| * hardware) on implementations that do not advertise support for the |
| * feature. As such, setting HA unconditionally is safe, unless you |
| * happen to be running on a design that has unadvertised support for |
| * HAFDBS. Here be dragons. |
| */ |
| if (!cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38)) |
| vtcr |= VTCR_EL2_HA; |
| #endif /* CONFIG_ARM64_HW_AFDBM */ |
| |
| if (kvm_lpa2_is_enabled()) |
| vtcr |= VTCR_EL2_DS; |
| |
| /* Set the vmid bits */ |
| vtcr |= (get_vmid_bits(mmfr1) == 16) ? |
| VTCR_EL2_VS_16BIT : |
| VTCR_EL2_VS_8BIT; |
| |
| return vtcr; |
| } |
| |
| static bool stage2_has_fwb(struct kvm_pgtable *pgt) |
| { |
| if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) |
| return false; |
| |
| return !(pgt->flags & KVM_PGTABLE_S2_NOFWB); |
| } |
| |
| void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, |
| phys_addr_t addr, size_t size) |
| { |
| unsigned long pages, inval_pages; |
| |
| if (!system_supports_tlb_range()) { |
| kvm_call_hyp(__kvm_tlb_flush_vmid, mmu); |
| return; |
| } |
| |
| pages = size >> PAGE_SHIFT; |
| while (pages > 0) { |
| inval_pages = min(pages, MAX_TLBI_RANGE_PAGES); |
| kvm_call_hyp(__kvm_tlb_flush_vmid_range, mmu, addr, inval_pages); |
| |
| addr += inval_pages << PAGE_SHIFT; |
| pages -= inval_pages; |
| } |
| } |
| |
| #define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt)) |
| |
| static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot, |
| kvm_pte_t *ptep) |
| { |
| kvm_pte_t attr; |
| u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS; |
| |
| switch (prot & (KVM_PGTABLE_PROT_DEVICE | |
| KVM_PGTABLE_PROT_NORMAL_NC)) { |
| case KVM_PGTABLE_PROT_DEVICE | KVM_PGTABLE_PROT_NORMAL_NC: |
| return -EINVAL; |
| case KVM_PGTABLE_PROT_DEVICE: |
| if (prot & KVM_PGTABLE_PROT_X) |
| return -EINVAL; |
| attr = KVM_S2_MEMATTR(pgt, DEVICE_nGnRE); |
| break; |
| case KVM_PGTABLE_PROT_NORMAL_NC: |
| if (prot & KVM_PGTABLE_PROT_X) |
| return -EINVAL; |
| attr = KVM_S2_MEMATTR(pgt, NORMAL_NC); |
| break; |
| default: |
| attr = KVM_S2_MEMATTR(pgt, NORMAL); |
| } |
| |
| if (!(prot & KVM_PGTABLE_PROT_X)) |
| attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; |
| |
| if (prot & KVM_PGTABLE_PROT_R) |
| attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; |
| |
| if (prot & KVM_PGTABLE_PROT_W) |
| attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; |
| |
| if (!kvm_lpa2_is_enabled()) |
| attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh); |
| |
| attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF; |
| attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW; |
| *ptep = attr; |
| |
| return 0; |
| } |
| |
| enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte) |
| { |
| enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW; |
| |
| if (!kvm_pte_valid(pte)) |
| return prot; |
| |
| if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R) |
| prot |= KVM_PGTABLE_PROT_R; |
| if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W) |
| prot |= KVM_PGTABLE_PROT_W; |
| if (!(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN)) |
| prot |= KVM_PGTABLE_PROT_X; |
| |
| return prot; |
| } |
| |
| static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new) |
| { |
| if (!kvm_pte_valid(old) || !kvm_pte_valid(new)) |
| return true; |
| |
| return ((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS)); |
| } |
| |
| static bool stage2_pte_is_counted(kvm_pte_t pte) |
| { |
| /* |
| * The refcount tracks valid entries as well as invalid entries if they |
| * encode ownership of a page to another entity than the page-table |
| * owner, whose id is 0. |
| */ |
| return !!pte; |
| } |
| |
| static bool stage2_pte_is_locked(kvm_pte_t pte) |
| { |
| return !kvm_pte_valid(pte) && (pte & KVM_INVALID_PTE_LOCKED); |
| } |
| |
| static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new) |
| { |
| if (!kvm_pgtable_walk_shared(ctx)) { |
| WRITE_ONCE(*ctx->ptep, new); |
| return true; |
| } |
| |
| return cmpxchg(ctx->ptep, ctx->old, new) == ctx->old; |
| } |
| |
| /** |
| * stage2_try_break_pte() - Invalidates a pte according to the |
| * 'break-before-make' requirements of the |
| * architecture. |
| * |
| * @ctx: context of the visited pte. |
| * @mmu: stage-2 mmu |
| * |
| * Returns: true if the pte was successfully broken. |
| * |
| * If the removed pte was valid, performs the necessary serialization and TLB |
| * invalidation for the old value. For counted ptes, drops the reference count |
| * on the containing table page. |
| */ |
| static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx, |
| struct kvm_s2_mmu *mmu) |
| { |
| struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; |
| |
| if (stage2_pte_is_locked(ctx->old)) { |
| /* |
| * Should never occur if this walker has exclusive access to the |
| * page tables. |
| */ |
| WARN_ON(!kvm_pgtable_walk_shared(ctx)); |
| return false; |
| } |
| |
| if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED)) |
| return false; |
| |
| if (!kvm_pgtable_walk_skip_bbm_tlbi(ctx)) { |
| /* |
| * Perform the appropriate TLB invalidation based on the |
| * evicted pte value (if any). |
| */ |
| if (kvm_pte_table(ctx->old, ctx->level)) { |
| u64 size = kvm_granule_size(ctx->level); |
| u64 addr = ALIGN_DOWN(ctx->addr, size); |
| |
| kvm_tlb_flush_vmid_range(mmu, addr, size); |
| } else if (kvm_pte_valid(ctx->old)) { |
| kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, |
| ctx->addr, ctx->level); |
| } |
| } |
| |
| if (stage2_pte_is_counted(ctx->old)) |
| mm_ops->put_page(ctx->ptep); |
| |
| return true; |
| } |
| |
| static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new) |
| { |
| struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; |
| |
| WARN_ON(!stage2_pte_is_locked(*ctx->ptep)); |
| |
| if (stage2_pte_is_counted(new)) |
| mm_ops->get_page(ctx->ptep); |
| |
| smp_store_release(ctx->ptep, new); |
| } |
| |
| static bool stage2_unmap_defer_tlb_flush(struct kvm_pgtable *pgt) |
| { |
| /* |
| * If FEAT_TLBIRANGE is implemented, defer the individual |
| * TLB invalidations until the entire walk is finished, and |
| * then use the range-based TLBI instructions to do the |
| * invalidations. Condition deferred TLB invalidation on the |
| * system supporting FWB as the optimization is entirely |
| * pointless when the unmap walker needs to perform CMOs. |
| */ |
| return system_supports_tlb_range() && stage2_has_fwb(pgt); |
| } |
| |
| static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx, |
| struct kvm_s2_mmu *mmu, |
| struct kvm_pgtable_mm_ops *mm_ops) |
| { |
| struct kvm_pgtable *pgt = ctx->arg; |
| |
| /* |
| * Clear the existing PTE, and perform break-before-make if it was |
| * valid. Depending on the system support, defer the TLB maintenance |
| * for the same until the entire unmap walk is completed. |
| */ |
| if (kvm_pte_valid(ctx->old)) { |
| kvm_clear_pte(ctx->ptep); |
| |
| if (kvm_pte_table(ctx->old, ctx->level)) { |
| kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, |
| TLBI_TTL_UNKNOWN); |
| } else if (!stage2_unmap_defer_tlb_flush(pgt)) { |
| kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, |
| ctx->level); |
| } |
| } |
| |
| mm_ops->put_page(ctx->ptep); |
| } |
| |
| static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte) |
| { |
| u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR; |
| return memattr == KVM_S2_MEMATTR(pgt, NORMAL); |
| } |
| |
| static bool stage2_pte_executable(kvm_pte_t pte) |
| { |
| return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN); |
| } |
| |
| static u64 stage2_map_walker_phys_addr(const struct kvm_pgtable_visit_ctx *ctx, |
| const struct stage2_map_data *data) |
| { |
| u64 phys = data->phys; |
| |
| /* |
| * Stage-2 walks to update ownership data are communicated to the map |
| * walker using an invalid PA. Avoid offsetting an already invalid PA, |
| * which could overflow and make the address valid again. |
| */ |
| if (!kvm_phys_is_valid(phys)) |
| return phys; |
| |
| /* |
| * Otherwise, work out the correct PA based on how far the walk has |
| * gotten. |
| */ |
| return phys + (ctx->addr - ctx->start); |
| } |
| |
| static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx, |
| struct stage2_map_data *data) |
| { |
| u64 phys = stage2_map_walker_phys_addr(ctx, data); |
| |
| if (data->force_pte && ctx->level < KVM_PGTABLE_LAST_LEVEL) |
| return false; |
| |
| return kvm_block_mapping_supported(ctx, phys); |
| } |
| |
| static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, |
| struct stage2_map_data *data) |
| { |
| kvm_pte_t new; |
| u64 phys = stage2_map_walker_phys_addr(ctx, data); |
| u64 granule = kvm_granule_size(ctx->level); |
| struct kvm_pgtable *pgt = data->mmu->pgt; |
| struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; |
| |
| if (!stage2_leaf_mapping_allowed(ctx, data)) |
| return -E2BIG; |
| |
| if (kvm_phys_is_valid(phys)) |
| new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level); |
| else |
| new = kvm_init_invalid_leaf_owner(data->owner_id); |
| |
| /* |
| * Skip updating the PTE if we are trying to recreate the exact |
| * same mapping or only change the access permissions. Instead, |
| * the vCPU will exit one more time from guest if still needed |
| * and then go through the path of relaxing permissions. |
| */ |
| if (!stage2_pte_needs_update(ctx->old, new)) |
| return -EAGAIN; |
| |
| if (!stage2_try_break_pte(ctx, data->mmu)) |
| return -EAGAIN; |
| |
| /* Perform CMOs before installation of the guest stage-2 PTE */ |
| if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->dcache_clean_inval_poc && |
| stage2_pte_cacheable(pgt, new)) |
| mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops), |
| granule); |
| |
| if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->icache_inval_pou && |
| stage2_pte_executable(new)) |
| mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule); |
| |
| stage2_make_pte(ctx, new); |
| |
| return 0; |
| } |
| |
| static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx, |
| struct stage2_map_data *data) |
| { |
| struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; |
| kvm_pte_t *childp = kvm_pte_follow(ctx->old, mm_ops); |
| int ret; |
| |
| if (!stage2_leaf_mapping_allowed(ctx, data)) |
| return 0; |
| |
| ret = stage2_map_walker_try_leaf(ctx, data); |
| if (ret) |
| return ret; |
| |
| mm_ops->free_unlinked_table(childp, ctx->level); |
| return 0; |
| } |
| |
| static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx, |
| struct stage2_map_data *data) |
| { |
| struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; |
| kvm_pte_t *childp, new; |
| int ret; |
| |
| ret = stage2_map_walker_try_leaf(ctx, data); |
| if (ret != -E2BIG) |
| return ret; |
| |
| if (WARN_ON(ctx->level == KVM_PGTABLE_LAST_LEVEL)) |
| return -EINVAL; |
| |
| if (!data->memcache) |
| return -ENOMEM; |
| |
| childp = mm_ops->zalloc_page(data->memcache); |
| if (!childp) |
| return -ENOMEM; |
| |
| if (!stage2_try_break_pte(ctx, data->mmu)) { |
| mm_ops->put_page(childp); |
| return -EAGAIN; |
| } |
| |
| /* |
| * If we've run into an existing block mapping then replace it with |
| * a table. Accesses beyond 'end' that fall within the new table |
| * will be mapped lazily. |
| */ |
| new = kvm_init_table_pte(childp, mm_ops); |
| stage2_make_pte(ctx, new); |
| |
| return 0; |
| } |
| |
| /* |
| * The TABLE_PRE callback runs for table entries on the way down, looking |
| * for table entries which we could conceivably replace with a block entry |
| * for this mapping. If it finds one it replaces the entry and calls |
| * kvm_pgtable_mm_ops::free_unlinked_table() to tear down the detached table. |
| * |
| * Otherwise, the LEAF callback performs the mapping at the existing leaves |
| * instead. |
| */ |
| static int stage2_map_walker(const struct kvm_pgtable_visit_ctx *ctx, |
| enum kvm_pgtable_walk_flags visit) |
| { |
| struct stage2_map_data *data = ctx->arg; |
| |
| switch (visit) { |
| case KVM_PGTABLE_WALK_TABLE_PRE: |
| return stage2_map_walk_table_pre(ctx, data); |
| case KVM_PGTABLE_WALK_LEAF: |
| return stage2_map_walk_leaf(ctx, data); |
| default: |
| return -EINVAL; |
| } |
| } |
| |
| int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, |
| u64 phys, enum kvm_pgtable_prot prot, |
| void *mc, enum kvm_pgtable_walk_flags flags) |
| { |
| int ret; |
| struct stage2_map_data map_data = { |
| .phys = ALIGN_DOWN(phys, PAGE_SIZE), |
| .mmu = pgt->mmu, |
| .memcache = mc, |
| .force_pte = pgt->force_pte_cb && pgt->force_pte_cb(addr, addr + size, prot), |
| }; |
| struct kvm_pgtable_walker walker = { |
| .cb = stage2_map_walker, |
| .flags = flags | |
| KVM_PGTABLE_WALK_TABLE_PRE | |
| KVM_PGTABLE_WALK_LEAF, |
| .arg = &map_data, |
| }; |
| |
| if (WARN_ON((pgt->flags & KVM_PGTABLE_S2_IDMAP) && (addr != phys))) |
| return -EINVAL; |
| |
| ret = stage2_set_prot_attr(pgt, prot, &map_data.attr); |
| if (ret) |
| return ret; |
| |
| ret = kvm_pgtable_walk(pgt, addr, size, &walker); |
| dsb(ishst); |
| return ret; |
| } |
| |
| int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, |
| void *mc, u8 owner_id) |
| { |
| int ret; |
| struct stage2_map_data map_data = { |
| .phys = KVM_PHYS_INVALID, |
| .mmu = pgt->mmu, |
| .memcache = mc, |
| .owner_id = owner_id, |
| .force_pte = true, |
| }; |
| struct kvm_pgtable_walker walker = { |
| .cb = stage2_map_walker, |
| .flags = KVM_PGTABLE_WALK_TABLE_PRE | |
| KVM_PGTABLE_WALK_LEAF, |
| .arg = &map_data, |
| }; |
| |
| if (owner_id > KVM_MAX_OWNER_ID) |
| return -EINVAL; |
| |
| ret = kvm_pgtable_walk(pgt, addr, size, &walker); |
| return ret; |
| } |
| |
| static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx, |
| enum kvm_pgtable_walk_flags visit) |
| { |
| struct kvm_pgtable *pgt = ctx->arg; |
| struct kvm_s2_mmu *mmu = pgt->mmu; |
| struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; |
| kvm_pte_t *childp = NULL; |
| bool need_flush = false; |
| |
| if (!kvm_pte_valid(ctx->old)) { |
| if (stage2_pte_is_counted(ctx->old)) { |
| kvm_clear_pte(ctx->ptep); |
| mm_ops->put_page(ctx->ptep); |
| } |
| return 0; |
| } |
| |
| if (kvm_pte_table(ctx->old, ctx->level)) { |
| childp = kvm_pte_follow(ctx->old, mm_ops); |
| |
| if (mm_ops->page_count(childp) != 1) |
| return 0; |
| } else if (stage2_pte_cacheable(pgt, ctx->old)) { |
| need_flush = !stage2_has_fwb(pgt); |
| } |
| |
| /* |
| * This is similar to the map() path in that we unmap the entire |
| * block entry and rely on the remaining portions being faulted |
| * back lazily. |
| */ |
| stage2_unmap_put_pte(ctx, mmu, mm_ops); |
| |
| if (need_flush && mm_ops->dcache_clean_inval_poc) |
| mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops), |
| kvm_granule_size(ctx->level)); |
| |
| if (childp) |
| mm_ops->put_page(childp); |
| |
| return 0; |
| } |
| |
| int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) |
| { |
| int ret; |
| struct kvm_pgtable_walker walker = { |
| .cb = stage2_unmap_walker, |
| .arg = pgt, |
| .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, |
| }; |
| |
| ret = kvm_pgtable_walk(pgt, addr, size, &walker); |
| if (stage2_unmap_defer_tlb_flush(pgt)) |
| /* Perform the deferred TLB invalidations */ |
| kvm_tlb_flush_vmid_range(pgt->mmu, addr, size); |
| |
| return ret; |
| } |
| |
| struct stage2_attr_data { |
| kvm_pte_t attr_set; |
| kvm_pte_t attr_clr; |
| kvm_pte_t pte; |
| s8 level; |
| }; |
| |
| static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx, |
| enum kvm_pgtable_walk_flags visit) |
| { |
| kvm_pte_t pte = ctx->old; |
| struct stage2_attr_data *data = ctx->arg; |
| struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; |
| |
| if (!kvm_pte_valid(ctx->old)) |
| return -EAGAIN; |
| |
| data->level = ctx->level; |
| data->pte = pte; |
| pte &= ~data->attr_clr; |
| pte |= data->attr_set; |
| |
| /* |
| * We may race with the CPU trying to set the access flag here, |
| * but worst-case the access flag update gets lost and will be |
| * set on the next access instead. |
| */ |
| if (data->pte != pte) { |
| /* |
| * Invalidate instruction cache before updating the guest |
| * stage-2 PTE if we are going to add executable permission. |
| */ |
| if (mm_ops->icache_inval_pou && |
| stage2_pte_executable(pte) && !stage2_pte_executable(ctx->old)) |
| mm_ops->icache_inval_pou(kvm_pte_follow(pte, mm_ops), |
| kvm_granule_size(ctx->level)); |
| |
| if (!stage2_try_set_pte(ctx, pte)) |
| return -EAGAIN; |
| } |
| |
| return 0; |
| } |
| |
| static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr, |
| u64 size, kvm_pte_t attr_set, |
| kvm_pte_t attr_clr, kvm_pte_t *orig_pte, |
| s8 *level, enum kvm_pgtable_walk_flags flags) |
| { |
| int ret; |
| kvm_pte_t attr_mask = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI; |
| struct stage2_attr_data data = { |
| .attr_set = attr_set & attr_mask, |
| .attr_clr = attr_clr & attr_mask, |
| }; |
| struct kvm_pgtable_walker walker = { |
| .cb = stage2_attr_walker, |
| .arg = &data, |
| .flags = flags | KVM_PGTABLE_WALK_LEAF, |
| }; |
| |
| ret = kvm_pgtable_walk(pgt, addr, size, &walker); |
| if (ret) |
| return ret; |
| |
| if (orig_pte) |
| *orig_pte = data.pte; |
| |
| if (level) |
| *level = data.level; |
| return 0; |
| } |
| |
| int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) |
| { |
| return stage2_update_leaf_attrs(pgt, addr, size, 0, |
| KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W, |
| NULL, NULL, 0); |
| } |
| |
| kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr) |
| { |
| kvm_pte_t pte = 0; |
| int ret; |
| |
| ret = stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0, |
| &pte, NULL, |
| KVM_PGTABLE_WALK_HANDLE_FAULT | |
| KVM_PGTABLE_WALK_SHARED); |
| if (!ret) |
| dsb(ishst); |
| |
| return pte; |
| } |
| |
| struct stage2_age_data { |
| bool mkold; |
| bool young; |
| }; |
| |
| static int stage2_age_walker(const struct kvm_pgtable_visit_ctx *ctx, |
| enum kvm_pgtable_walk_flags visit) |
| { |
| kvm_pte_t new = ctx->old & ~KVM_PTE_LEAF_ATTR_LO_S2_AF; |
| struct stage2_age_data *data = ctx->arg; |
| |
| if (!kvm_pte_valid(ctx->old) || new == ctx->old) |
| return 0; |
| |
| data->young = true; |
| |
| /* |
| * stage2_age_walker() is always called while holding the MMU lock for |
| * write, so this will always succeed. Nonetheless, this deliberately |
| * follows the race detection pattern of the other stage-2 walkers in |
| * case the locking mechanics of the MMU notifiers is ever changed. |
| */ |
| if (data->mkold && !stage2_try_set_pte(ctx, new)) |
| return -EAGAIN; |
| |
| /* |
| * "But where's the TLBI?!", you scream. |
| * "Over in the core code", I sigh. |
| * |
| * See the '->clear_flush_young()' callback on the KVM mmu notifier. |
| */ |
| return 0; |
| } |
| |
| bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, |
| u64 size, bool mkold) |
| { |
| struct stage2_age_data data = { |
| .mkold = mkold, |
| }; |
| struct kvm_pgtable_walker walker = { |
| .cb = stage2_age_walker, |
| .arg = &data, |
| .flags = KVM_PGTABLE_WALK_LEAF, |
| }; |
| |
| WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker)); |
| return data.young; |
| } |
| |
| int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, |
| enum kvm_pgtable_prot prot) |
| { |
| int ret; |
| s8 level; |
| kvm_pte_t set = 0, clr = 0; |
| |
| if (prot & KVM_PTE_LEAF_ATTR_HI_SW) |
| return -EINVAL; |
| |
| if (prot & KVM_PGTABLE_PROT_R) |
| set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; |
| |
| if (prot & KVM_PGTABLE_PROT_W) |
| set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W; |
| |
| if (prot & KVM_PGTABLE_PROT_X) |
| clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; |
| |
| ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, |
| KVM_PGTABLE_WALK_HANDLE_FAULT | |
| KVM_PGTABLE_WALK_SHARED); |
| if (!ret || ret == -EAGAIN) |
| kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level); |
| return ret; |
| } |
| |
| static int stage2_flush_walker(const struct kvm_pgtable_visit_ctx *ctx, |
| enum kvm_pgtable_walk_flags visit) |
| { |
| struct kvm_pgtable *pgt = ctx->arg; |
| struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; |
| |
| if (!kvm_pte_valid(ctx->old) || !stage2_pte_cacheable(pgt, ctx->old)) |
| return 0; |
| |
| if (mm_ops->dcache_clean_inval_poc) |
| mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops), |
| kvm_granule_size(ctx->level)); |
| return 0; |
| } |
| |
| int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) |
| { |
| struct kvm_pgtable_walker walker = { |
| .cb = stage2_flush_walker, |
| .flags = KVM_PGTABLE_WALK_LEAF, |
| .arg = pgt, |
| }; |
| |
| if (stage2_has_fwb(pgt)) |
| return 0; |
| |
| return kvm_pgtable_walk(pgt, addr, size, &walker); |
| } |
| |
| kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, |
| u64 phys, s8 level, |
| enum kvm_pgtable_prot prot, |
| void *mc, bool force_pte) |
| { |
| struct stage2_map_data map_data = { |
| .phys = phys, |
| .mmu = pgt->mmu, |
| .memcache = mc, |
| .force_pte = force_pte, |
| }; |
| struct kvm_pgtable_walker walker = { |
| .cb = stage2_map_walker, |
| .flags = KVM_PGTABLE_WALK_LEAF | |
| KVM_PGTABLE_WALK_SKIP_BBM_TLBI | |
| KVM_PGTABLE_WALK_SKIP_CMO, |
| .arg = &map_data, |
| }; |
| /* |
| * The input address (.addr) is irrelevant for walking an |
| * unlinked table. Construct an ambiguous IA range to map |
| * kvm_granule_size(level) worth of memory. |
| */ |
| struct kvm_pgtable_walk_data data = { |
| .walker = &walker, |
| .addr = 0, |
| .end = kvm_granule_size(level), |
| }; |
| struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; |
| kvm_pte_t *pgtable; |
| int ret; |
| |
| if (!IS_ALIGNED(phys, kvm_granule_size(level))) |
| return ERR_PTR(-EINVAL); |
| |
| ret = stage2_set_prot_attr(pgt, prot, &map_data.attr); |
| if (ret) |
| return ERR_PTR(ret); |
| |
| pgtable = mm_ops->zalloc_page(mc); |
| if (!pgtable) |
| return ERR_PTR(-ENOMEM); |
| |
| ret = __kvm_pgtable_walk(&data, mm_ops, (kvm_pteref_t)pgtable, |
| level + 1); |
| if (ret) { |
| kvm_pgtable_stage2_free_unlinked(mm_ops, pgtable, level); |
| return ERR_PTR(ret); |
| } |
| |
| return pgtable; |
| } |
| |
| /* |
| * Get the number of page-tables needed to replace a block with a |
| * fully populated tree up to the PTE entries. Note that @level is |
| * interpreted as in "level @level entry". |
| */ |
| static int stage2_block_get_nr_page_tables(s8 level) |
| { |
| switch (level) { |
| case 1: |
| return PTRS_PER_PTE + 1; |
| case 2: |
| return 1; |
| case 3: |
| return 0; |
| default: |
| WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL || |
| level > KVM_PGTABLE_LAST_LEVEL); |
| return -EINVAL; |
| }; |
| } |
| |
| static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx, |
| enum kvm_pgtable_walk_flags visit) |
| { |
| struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; |
| struct kvm_mmu_memory_cache *mc = ctx->arg; |
| struct kvm_s2_mmu *mmu; |
| kvm_pte_t pte = ctx->old, new, *childp; |
| enum kvm_pgtable_prot prot; |
| s8 level = ctx->level; |
| bool force_pte; |
| int nr_pages; |
| u64 phys; |
| |
| /* No huge-pages exist at the last level */ |
| if (level == KVM_PGTABLE_LAST_LEVEL) |
| return 0; |
| |
| /* We only split valid block mappings */ |
| if (!kvm_pte_valid(pte)) |
| return 0; |
| |
| nr_pages = stage2_block_get_nr_page_tables(level); |
| if (nr_pages < 0) |
| return nr_pages; |
| |
| if (mc->nobjs >= nr_pages) { |
| /* Build a tree mapped down to the PTE granularity. */ |
| force_pte = true; |
| } else { |
| /* |
| * Don't force PTEs, so create_unlinked() below does |
| * not populate the tree up to the PTE level. The |
| * consequence is that the call will require a single |
| * page of level 2 entries at level 1, or a single |
| * page of PTEs at level 2. If we are at level 1, the |
| * PTEs will be created recursively. |
| */ |
| force_pte = false; |
| nr_pages = 1; |
| } |
| |
| if (mc->nobjs < nr_pages) |
| return -ENOMEM; |
| |
| mmu = container_of(mc, struct kvm_s2_mmu, split_page_cache); |
| phys = kvm_pte_to_phys(pte); |
| prot = kvm_pgtable_stage2_pte_prot(pte); |
| |
| childp = kvm_pgtable_stage2_create_unlinked(mmu->pgt, phys, |
| level, prot, mc, force_pte); |
| if (IS_ERR(childp)) |
| return PTR_ERR(childp); |
| |
| if (!stage2_try_break_pte(ctx, mmu)) { |
| kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level); |
| return -EAGAIN; |
| } |
| |
| /* |
| * Note, the contents of the page table are guaranteed to be made |
| * visible before the new PTE is assigned because stage2_make_pte() |
| * writes the PTE using smp_store_release(). |
| */ |
| new = kvm_init_table_pte(childp, mm_ops); |
| stage2_make_pte(ctx, new); |
| dsb(ishst); |
| return 0; |
| } |
| |
| int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, |
| struct kvm_mmu_memory_cache *mc) |
| { |
| struct kvm_pgtable_walker walker = { |
| .cb = stage2_split_walker, |
| .flags = KVM_PGTABLE_WALK_LEAF, |
| .arg = mc, |
| }; |
| |
| return kvm_pgtable_walk(pgt, addr, size, &walker); |
| } |
| |
| int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, |
| struct kvm_pgtable_mm_ops *mm_ops, |
| enum kvm_pgtable_stage2_flags flags, |
| kvm_pgtable_force_pte_cb_t force_pte_cb) |
| { |
| size_t pgd_sz; |
| u64 vtcr = mmu->vtcr; |
| u32 ia_bits = VTCR_EL2_IPA(vtcr); |
| u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); |
| s8 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; |
| |
| pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; |
| pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_pages_exact(pgd_sz); |
| if (!pgt->pgd) |
| return -ENOMEM; |
| |
| pgt->ia_bits = ia_bits; |
| pgt->start_level = start_level; |
| pgt->mm_ops = mm_ops; |
| pgt->mmu = mmu; |
| pgt->flags = flags; |
| pgt->force_pte_cb = force_pte_cb; |
| |
| /* Ensure zeroed PGD pages are visible to the hardware walker */ |
| dsb(ishst); |
| return 0; |
| } |
| |
| size_t kvm_pgtable_stage2_pgd_size(u64 vtcr) |
| { |
| u32 ia_bits = VTCR_EL2_IPA(vtcr); |
| u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); |
| s8 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; |
| |
| return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; |
| } |
| |
| static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx, |
| enum kvm_pgtable_walk_flags visit) |
| { |
| struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; |
| |
| if (!stage2_pte_is_counted(ctx->old)) |
| return 0; |
| |
| mm_ops->put_page(ctx->ptep); |
| |
| if (kvm_pte_table(ctx->old, ctx->level)) |
| mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops)); |
| |
| return 0; |
| } |
| |
| void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) |
| { |
| size_t pgd_sz; |
| struct kvm_pgtable_walker walker = { |
| .cb = stage2_free_walker, |
| .flags = KVM_PGTABLE_WALK_LEAF | |
| KVM_PGTABLE_WALK_TABLE_POST, |
| }; |
| |
| WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); |
| pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE; |
| pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd), pgd_sz); |
| pgt->pgd = NULL; |
| } |
| |
| void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level) |
| { |
| kvm_pteref_t ptep = (kvm_pteref_t)pgtable; |
| struct kvm_pgtable_walker walker = { |
| .cb = stage2_free_walker, |
| .flags = KVM_PGTABLE_WALK_LEAF | |
| KVM_PGTABLE_WALK_TABLE_POST, |
| }; |
| struct kvm_pgtable_walk_data data = { |
| .walker = &walker, |
| |
| /* |
| * At this point the IPA really doesn't matter, as the page |
| * table being traversed has already been removed from the stage |
| * 2. Set an appropriate range to cover the entire page table. |
| */ |
| .addr = 0, |
| .end = kvm_granule_size(level), |
| }; |
| |
| WARN_ON(__kvm_pgtable_walk(&data, mm_ops, ptep, level + 1)); |
| |
| WARN_ON(mm_ops->page_count(pgtable) != 1); |
| mm_ops->put_page(pgtable); |
| } |