| // SPDX-License-Identifier: GPL-2.0 |
| /* |
| * DAMON Primitives for Virtual Address Spaces |
| * |
| * Author: SeongJae Park <sjpark@amazon.de> |
| */ |
| |
| #define pr_fmt(fmt) "damon-va: " fmt |
| |
| #include <asm-generic/mman-common.h> |
| #include <linux/highmem.h> |
| #include <linux/hugetlb.h> |
| #include <linux/mmu_notifier.h> |
| #include <linux/page_idle.h> |
| #include <linux/pagewalk.h> |
| #include <linux/sched/mm.h> |
| |
| #include "ops-common.h" |
| |
| #ifdef CONFIG_DAMON_VADDR_KUNIT_TEST |
| #undef DAMON_MIN_REGION |
| #define DAMON_MIN_REGION 1 |
| #endif |
| |
| /* |
| * 't->pid' should be the pointer to the relevant 'struct pid' having reference |
| * count. Caller must put the returned task, unless it is NULL. |
| */ |
| static inline struct task_struct *damon_get_task_struct(struct damon_target *t) |
| { |
| return get_pid_task(t->pid, PIDTYPE_PID); |
| } |
| |
| /* |
| * Get the mm_struct of the given target |
| * |
| * Caller _must_ put the mm_struct after use, unless it is NULL. |
| * |
| * Returns the mm_struct of the target on success, NULL on failure |
| */ |
| static struct mm_struct *damon_get_mm(struct damon_target *t) |
| { |
| struct task_struct *task; |
| struct mm_struct *mm; |
| |
| task = damon_get_task_struct(t); |
| if (!task) |
| return NULL; |
| |
| mm = get_task_mm(task); |
| put_task_struct(task); |
| return mm; |
| } |
| |
| /* |
| * Functions for the initial monitoring target regions construction |
| */ |
| |
| /* |
| * Size-evenly split a region into 'nr_pieces' small regions |
| * |
| * Returns 0 on success, or negative error code otherwise. |
| */ |
| static int damon_va_evenly_split_region(struct damon_target *t, |
| struct damon_region *r, unsigned int nr_pieces) |
| { |
| unsigned long sz_orig, sz_piece, orig_end; |
| struct damon_region *n = NULL, *next; |
| unsigned long start; |
| |
| if (!r || !nr_pieces) |
| return -EINVAL; |
| |
| orig_end = r->ar.end; |
| sz_orig = damon_sz_region(r); |
| sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION); |
| |
| if (!sz_piece) |
| return -EINVAL; |
| |
| r->ar.end = r->ar.start + sz_piece; |
| next = damon_next_region(r); |
| for (start = r->ar.end; start + sz_piece <= orig_end; |
| start += sz_piece) { |
| n = damon_new_region(start, start + sz_piece); |
| if (!n) |
| return -ENOMEM; |
| damon_insert_region(n, r, next, t); |
| r = n; |
| } |
| /* complement last region for possible rounding error */ |
| if (n) |
| n->ar.end = orig_end; |
| |
| return 0; |
| } |
| |
| static unsigned long sz_range(struct damon_addr_range *r) |
| { |
| return r->end - r->start; |
| } |
| |
| /* |
| * Find three regions separated by two biggest unmapped regions |
| * |
| * vma the head vma of the target address space |
| * regions an array of three address ranges that results will be saved |
| * |
| * This function receives an address space and finds three regions in it which |
| * separated by the two biggest unmapped regions in the space. Please refer to |
| * below comments of '__damon_va_init_regions()' function to know why this is |
| * necessary. |
| * |
| * Returns 0 if success, or negative error code otherwise. |
| */ |
| static int __damon_va_three_regions(struct mm_struct *mm, |
| struct damon_addr_range regions[3]) |
| { |
| struct damon_addr_range first_gap = {0}, second_gap = {0}; |
| VMA_ITERATOR(vmi, mm, 0); |
| struct vm_area_struct *vma, *prev = NULL; |
| unsigned long start; |
| |
| /* |
| * Find the two biggest gaps so that first_gap > second_gap > others. |
| * If this is too slow, it can be optimised to examine the maple |
| * tree gaps. |
| */ |
| for_each_vma(vmi, vma) { |
| unsigned long gap; |
| |
| if (!prev) { |
| start = vma->vm_start; |
| goto next; |
| } |
| gap = vma->vm_start - prev->vm_end; |
| |
| if (gap > sz_range(&first_gap)) { |
| second_gap = first_gap; |
| first_gap.start = prev->vm_end; |
| first_gap.end = vma->vm_start; |
| } else if (gap > sz_range(&second_gap)) { |
| second_gap.start = prev->vm_end; |
| second_gap.end = vma->vm_start; |
| } |
| next: |
| prev = vma; |
| } |
| |
| if (!sz_range(&second_gap) || !sz_range(&first_gap)) |
| return -EINVAL; |
| |
| /* Sort the two biggest gaps by address */ |
| if (first_gap.start > second_gap.start) |
| swap(first_gap, second_gap); |
| |
| /* Store the result */ |
| regions[0].start = ALIGN(start, DAMON_MIN_REGION); |
| regions[0].end = ALIGN(first_gap.start, DAMON_MIN_REGION); |
| regions[1].start = ALIGN(first_gap.end, DAMON_MIN_REGION); |
| regions[1].end = ALIGN(second_gap.start, DAMON_MIN_REGION); |
| regions[2].start = ALIGN(second_gap.end, DAMON_MIN_REGION); |
| regions[2].end = ALIGN(prev->vm_end, DAMON_MIN_REGION); |
| |
| return 0; |
| } |
| |
| /* |
| * Get the three regions in the given target (task) |
| * |
| * Returns 0 on success, negative error code otherwise. |
| */ |
| static int damon_va_three_regions(struct damon_target *t, |
| struct damon_addr_range regions[3]) |
| { |
| struct mm_struct *mm; |
| int rc; |
| |
| mm = damon_get_mm(t); |
| if (!mm) |
| return -EINVAL; |
| |
| mmap_read_lock(mm); |
| rc = __damon_va_three_regions(mm, regions); |
| mmap_read_unlock(mm); |
| |
| mmput(mm); |
| return rc; |
| } |
| |
| /* |
| * Initialize the monitoring target regions for the given target (task) |
| * |
| * t the given target |
| * |
| * Because only a number of small portions of the entire address space |
| * is actually mapped to the memory and accessed, monitoring the unmapped |
| * regions is wasteful. That said, because we can deal with small noises, |
| * tracking every mapping is not strictly required but could even incur a high |
| * overhead if the mapping frequently changes or the number of mappings is |
| * high. The adaptive regions adjustment mechanism will further help to deal |
| * with the noise by simply identifying the unmapped areas as a region that |
| * has no access. Moreover, applying the real mappings that would have many |
| * unmapped areas inside will make the adaptive mechanism quite complex. That |
| * said, too huge unmapped areas inside the monitoring target should be removed |
| * to not take the time for the adaptive mechanism. |
| * |
| * For the reason, we convert the complex mappings to three distinct regions |
| * that cover every mapped area of the address space. Also the two gaps |
| * between the three regions are the two biggest unmapped areas in the given |
| * address space. In detail, this function first identifies the start and the |
| * end of the mappings and the two biggest unmapped areas of the address space. |
| * Then, it constructs the three regions as below: |
| * |
| * [mappings[0]->start, big_two_unmapped_areas[0]->start) |
| * [big_two_unmapped_areas[0]->end, big_two_unmapped_areas[1]->start) |
| * [big_two_unmapped_areas[1]->end, mappings[nr_mappings - 1]->end) |
| * |
| * As usual memory map of processes is as below, the gap between the heap and |
| * the uppermost mmap()-ed region, and the gap between the lowermost mmap()-ed |
| * region and the stack will be two biggest unmapped regions. Because these |
| * gaps are exceptionally huge areas in usual address space, excluding these |
| * two biggest unmapped regions will be sufficient to make a trade-off. |
| * |
| * <heap> |
| * <BIG UNMAPPED REGION 1> |
| * <uppermost mmap()-ed region> |
| * (other mmap()-ed regions and small unmapped regions) |
| * <lowermost mmap()-ed region> |
| * <BIG UNMAPPED REGION 2> |
| * <stack> |
| */ |
| static void __damon_va_init_regions(struct damon_ctx *ctx, |
| struct damon_target *t) |
| { |
| struct damon_target *ti; |
| struct damon_region *r; |
| struct damon_addr_range regions[3]; |
| unsigned long sz = 0, nr_pieces; |
| int i, tidx = 0; |
| |
| if (damon_va_three_regions(t, regions)) { |
| damon_for_each_target(ti, ctx) { |
| if (ti == t) |
| break; |
| tidx++; |
| } |
| pr_debug("Failed to get three regions of %dth target\n", tidx); |
| return; |
| } |
| |
| for (i = 0; i < 3; i++) |
| sz += regions[i].end - regions[i].start; |
| if (ctx->attrs.min_nr_regions) |
| sz /= ctx->attrs.min_nr_regions; |
| if (sz < DAMON_MIN_REGION) |
| sz = DAMON_MIN_REGION; |
| |
| /* Set the initial three regions of the target */ |
| for (i = 0; i < 3; i++) { |
| r = damon_new_region(regions[i].start, regions[i].end); |
| if (!r) { |
| pr_err("%d'th init region creation failed\n", i); |
| return; |
| } |
| damon_add_region(r, t); |
| |
| nr_pieces = (regions[i].end - regions[i].start) / sz; |
| damon_va_evenly_split_region(t, r, nr_pieces); |
| } |
| } |
| |
| /* Initialize '->regions_list' of every target (task) */ |
| static void damon_va_init(struct damon_ctx *ctx) |
| { |
| struct damon_target *t; |
| |
| damon_for_each_target(t, ctx) { |
| /* the user may set the target regions as they want */ |
| if (!damon_nr_regions(t)) |
| __damon_va_init_regions(ctx, t); |
| } |
| } |
| |
| /* |
| * Update regions for current memory mappings |
| */ |
| static void damon_va_update(struct damon_ctx *ctx) |
| { |
| struct damon_addr_range three_regions[3]; |
| struct damon_target *t; |
| |
| damon_for_each_target(t, ctx) { |
| if (damon_va_three_regions(t, three_regions)) |
| continue; |
| damon_set_regions(t, three_regions, 3); |
| } |
| } |
| |
| static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, |
| unsigned long next, struct mm_walk *walk) |
| { |
| pte_t *pte; |
| spinlock_t *ptl; |
| |
| if (pmd_trans_huge(*pmd)) { |
| ptl = pmd_lock(walk->mm, pmd); |
| if (!pmd_present(*pmd)) { |
| spin_unlock(ptl); |
| return 0; |
| } |
| |
| if (pmd_trans_huge(*pmd)) { |
| damon_pmdp_mkold(pmd, walk->mm, addr); |
| spin_unlock(ptl); |
| return 0; |
| } |
| spin_unlock(ptl); |
| } |
| |
| if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) |
| return 0; |
| pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); |
| if (!pte_present(*pte)) |
| goto out; |
| damon_ptep_mkold(pte, walk->mm, addr); |
| out: |
| pte_unmap_unlock(pte, ptl); |
| return 0; |
| } |
| |
| #ifdef CONFIG_HUGETLB_PAGE |
| static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm, |
| struct vm_area_struct *vma, unsigned long addr) |
| { |
| bool referenced = false; |
| pte_t entry = huge_ptep_get(pte); |
| struct folio *folio = pfn_folio(pte_pfn(entry)); |
| |
| folio_get(folio); |
| |
| if (pte_young(entry)) { |
| referenced = true; |
| entry = pte_mkold(entry); |
| set_huge_pte_at(mm, addr, pte, entry); |
| } |
| |
| #ifdef CONFIG_MMU_NOTIFIER |
| if (mmu_notifier_clear_young(mm, addr, |
| addr + huge_page_size(hstate_vma(vma)))) |
| referenced = true; |
| #endif /* CONFIG_MMU_NOTIFIER */ |
| |
| if (referenced) |
| folio_set_young(folio); |
| |
| folio_set_idle(folio); |
| folio_put(folio); |
| } |
| |
| static int damon_mkold_hugetlb_entry(pte_t *pte, unsigned long hmask, |
| unsigned long addr, unsigned long end, |
| struct mm_walk *walk) |
| { |
| struct hstate *h = hstate_vma(walk->vma); |
| spinlock_t *ptl; |
| pte_t entry; |
| |
| ptl = huge_pte_lock(h, walk->mm, pte); |
| entry = huge_ptep_get(pte); |
| if (!pte_present(entry)) |
| goto out; |
| |
| damon_hugetlb_mkold(pte, walk->mm, walk->vma, addr); |
| |
| out: |
| spin_unlock(ptl); |
| return 0; |
| } |
| #else |
| #define damon_mkold_hugetlb_entry NULL |
| #endif /* CONFIG_HUGETLB_PAGE */ |
| |
| static const struct mm_walk_ops damon_mkold_ops = { |
| .pmd_entry = damon_mkold_pmd_entry, |
| .hugetlb_entry = damon_mkold_hugetlb_entry, |
| }; |
| |
| static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) |
| { |
| mmap_read_lock(mm); |
| walk_page_range(mm, addr, addr + 1, &damon_mkold_ops, NULL); |
| mmap_read_unlock(mm); |
| } |
| |
| /* |
| * Functions for the access checking of the regions |
| */ |
| |
| static void __damon_va_prepare_access_check(struct mm_struct *mm, |
| struct damon_region *r) |
| { |
| r->sampling_addr = damon_rand(r->ar.start, r->ar.end); |
| |
| damon_va_mkold(mm, r->sampling_addr); |
| } |
| |
| static void damon_va_prepare_access_checks(struct damon_ctx *ctx) |
| { |
| struct damon_target *t; |
| struct mm_struct *mm; |
| struct damon_region *r; |
| |
| damon_for_each_target(t, ctx) { |
| mm = damon_get_mm(t); |
| if (!mm) |
| continue; |
| damon_for_each_region(r, t) |
| __damon_va_prepare_access_check(mm, r); |
| mmput(mm); |
| } |
| } |
| |
| struct damon_young_walk_private { |
| /* size of the folio for the access checked virtual memory address */ |
| unsigned long *folio_sz; |
| bool young; |
| }; |
| |
| static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, |
| unsigned long next, struct mm_walk *walk) |
| { |
| pte_t *pte; |
| spinlock_t *ptl; |
| struct folio *folio; |
| struct damon_young_walk_private *priv = walk->private; |
| |
| #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
| if (pmd_trans_huge(*pmd)) { |
| ptl = pmd_lock(walk->mm, pmd); |
| if (!pmd_present(*pmd)) { |
| spin_unlock(ptl); |
| return 0; |
| } |
| |
| if (!pmd_trans_huge(*pmd)) { |
| spin_unlock(ptl); |
| goto regular_page; |
| } |
| folio = damon_get_folio(pmd_pfn(*pmd)); |
| if (!folio) |
| goto huge_out; |
| if (pmd_young(*pmd) || !folio_test_idle(folio) || |
| mmu_notifier_test_young(walk->mm, |
| addr)) |
| priv->young = true; |
| *priv->folio_sz = HPAGE_PMD_SIZE; |
| folio_put(folio); |
| huge_out: |
| spin_unlock(ptl); |
| return 0; |
| } |
| |
| regular_page: |
| #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
| |
| if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) |
| return -EINVAL; |
| pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); |
| if (!pte_present(*pte)) |
| goto out; |
| folio = damon_get_folio(pte_pfn(*pte)); |
| if (!folio) |
| goto out; |
| if (pte_young(*pte) || !folio_test_idle(folio) || |
| mmu_notifier_test_young(walk->mm, addr)) |
| priv->young = true; |
| *priv->folio_sz = folio_size(folio); |
| folio_put(folio); |
| out: |
| pte_unmap_unlock(pte, ptl); |
| return 0; |
| } |
| |
| #ifdef CONFIG_HUGETLB_PAGE |
| static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask, |
| unsigned long addr, unsigned long end, |
| struct mm_walk *walk) |
| { |
| struct damon_young_walk_private *priv = walk->private; |
| struct hstate *h = hstate_vma(walk->vma); |
| struct folio *folio; |
| spinlock_t *ptl; |
| pte_t entry; |
| |
| ptl = huge_pte_lock(h, walk->mm, pte); |
| entry = huge_ptep_get(pte); |
| if (!pte_present(entry)) |
| goto out; |
| |
| folio = pfn_folio(pte_pfn(entry)); |
| folio_get(folio); |
| |
| if (pte_young(entry) || !folio_test_idle(folio) || |
| mmu_notifier_test_young(walk->mm, addr)) |
| priv->young = true; |
| *priv->folio_sz = huge_page_size(h); |
| |
| folio_put(folio); |
| |
| out: |
| spin_unlock(ptl); |
| return 0; |
| } |
| #else |
| #define damon_young_hugetlb_entry NULL |
| #endif /* CONFIG_HUGETLB_PAGE */ |
| |
| static const struct mm_walk_ops damon_young_ops = { |
| .pmd_entry = damon_young_pmd_entry, |
| .hugetlb_entry = damon_young_hugetlb_entry, |
| }; |
| |
| static bool damon_va_young(struct mm_struct *mm, unsigned long addr, |
| unsigned long *folio_sz) |
| { |
| struct damon_young_walk_private arg = { |
| .folio_sz = folio_sz, |
| .young = false, |
| }; |
| |
| mmap_read_lock(mm); |
| walk_page_range(mm, addr, addr + 1, &damon_young_ops, &arg); |
| mmap_read_unlock(mm); |
| return arg.young; |
| } |
| |
| /* |
| * Check whether the region was accessed after the last preparation |
| * |
| * mm 'mm_struct' for the given virtual address space |
| * r the region to be checked |
| */ |
| static void __damon_va_check_access(struct mm_struct *mm, |
| struct damon_region *r, bool same_target) |
| { |
| static unsigned long last_addr; |
| static unsigned long last_folio_sz = PAGE_SIZE; |
| static bool last_accessed; |
| |
| /* If the region is in the last checked page, reuse the result */ |
| if (same_target && (ALIGN_DOWN(last_addr, last_folio_sz) == |
| ALIGN_DOWN(r->sampling_addr, last_folio_sz))) { |
| if (last_accessed) |
| r->nr_accesses++; |
| return; |
| } |
| |
| last_accessed = damon_va_young(mm, r->sampling_addr, &last_folio_sz); |
| if (last_accessed) |
| r->nr_accesses++; |
| |
| last_addr = r->sampling_addr; |
| } |
| |
| static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) |
| { |
| struct damon_target *t; |
| struct mm_struct *mm; |
| struct damon_region *r; |
| unsigned int max_nr_accesses = 0; |
| bool same_target; |
| |
| damon_for_each_target(t, ctx) { |
| mm = damon_get_mm(t); |
| if (!mm) |
| continue; |
| same_target = false; |
| damon_for_each_region(r, t) { |
| __damon_va_check_access(mm, r, same_target); |
| max_nr_accesses = max(r->nr_accesses, max_nr_accesses); |
| same_target = true; |
| } |
| mmput(mm); |
| } |
| |
| return max_nr_accesses; |
| } |
| |
| /* |
| * Functions for the target validity check and cleanup |
| */ |
| |
| static bool damon_va_target_valid(struct damon_target *t) |
| { |
| struct task_struct *task; |
| |
| task = damon_get_task_struct(t); |
| if (task) { |
| put_task_struct(task); |
| return true; |
| } |
| |
| return false; |
| } |
| |
| #ifndef CONFIG_ADVISE_SYSCALLS |
| static unsigned long damos_madvise(struct damon_target *target, |
| struct damon_region *r, int behavior) |
| { |
| return 0; |
| } |
| #else |
| static unsigned long damos_madvise(struct damon_target *target, |
| struct damon_region *r, int behavior) |
| { |
| struct mm_struct *mm; |
| unsigned long start = PAGE_ALIGN(r->ar.start); |
| unsigned long len = PAGE_ALIGN(damon_sz_region(r)); |
| unsigned long applied; |
| |
| mm = damon_get_mm(target); |
| if (!mm) |
| return 0; |
| |
| applied = do_madvise(mm, start, len, behavior) ? 0 : len; |
| mmput(mm); |
| |
| return applied; |
| } |
| #endif /* CONFIG_ADVISE_SYSCALLS */ |
| |
| static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, |
| struct damon_target *t, struct damon_region *r, |
| struct damos *scheme) |
| { |
| int madv_action; |
| |
| switch (scheme->action) { |
| case DAMOS_WILLNEED: |
| madv_action = MADV_WILLNEED; |
| break; |
| case DAMOS_COLD: |
| madv_action = MADV_COLD; |
| break; |
| case DAMOS_PAGEOUT: |
| madv_action = MADV_PAGEOUT; |
| break; |
| case DAMOS_HUGEPAGE: |
| madv_action = MADV_HUGEPAGE; |
| break; |
| case DAMOS_NOHUGEPAGE: |
| madv_action = MADV_NOHUGEPAGE; |
| break; |
| case DAMOS_STAT: |
| return 0; |
| default: |
| /* |
| * DAMOS actions that are not yet supported by 'vaddr'. |
| */ |
| return 0; |
| } |
| |
| return damos_madvise(t, r, madv_action); |
| } |
| |
| static int damon_va_scheme_score(struct damon_ctx *context, |
| struct damon_target *t, struct damon_region *r, |
| struct damos *scheme) |
| { |
| |
| switch (scheme->action) { |
| case DAMOS_PAGEOUT: |
| return damon_cold_score(context, r, scheme); |
| default: |
| break; |
| } |
| |
| return DAMOS_MAX_SCORE; |
| } |
| |
| static int __init damon_va_initcall(void) |
| { |
| struct damon_operations ops = { |
| .id = DAMON_OPS_VADDR, |
| .init = damon_va_init, |
| .update = damon_va_update, |
| .prepare_access_checks = damon_va_prepare_access_checks, |
| .check_accesses = damon_va_check_accesses, |
| .reset_aggregated = NULL, |
| .target_valid = damon_va_target_valid, |
| .cleanup = NULL, |
| .apply_scheme = damon_va_apply_scheme, |
| .get_scheme_score = damon_va_scheme_score, |
| }; |
| /* ops for fixed virtual address ranges */ |
| struct damon_operations ops_fvaddr = ops; |
| int err; |
| |
| /* Don't set the monitoring target regions for the entire mapping */ |
| ops_fvaddr.id = DAMON_OPS_FVADDR; |
| ops_fvaddr.init = NULL; |
| ops_fvaddr.update = NULL; |
| |
| err = damon_register_ops(&ops); |
| if (err) |
| return err; |
| return damon_register_ops(&ops_fvaddr); |
| }; |
| |
| subsys_initcall(damon_va_initcall); |
| |
| #include "vaddr-test.h" |