// SPDX-License-Identifier: GPL-2.0
/*
 * HugeTLB Vmemmap Optimization (HVO)
 *
 * Copyright (c) 2020, ByteDance. All rights reserved.
 *
 * Author: Muchun Song <songmuchun@bytedance.com>
 *
 * See Documentation/mm/vmemmap_dedup.rst
 */
#define pr_fmt(fmt)	"HugeTLB: " fmt

#include <linux/pgtable.h>
#include <linux/moduleparam.h>
#include <linux/bootmem_info.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"

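
/*
 * A rough sizing example (assuming 4 KiB base pages and 64-byte struct
 * pages): a 2 MiB HugeTLB page is described by 512 struct pages, i.e.
 * 512 * 64 bytes = 8 pages of vmemmap. HVO keeps the first vmemmap page
 * and remaps the remaining 7 to it read-only, saving 7 pages (28 KiB) per
 * 2 MiB HugeTLB page; for a 1 GiB HugeTLB page, 4095 of the 4096 vmemmap
 * pages (almost 16 MiB) are freed.
 */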
/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte:		called for each lowest-level entry (PTE).
 * @nr_walked:		the number of PTEs walked.
 * @reuse_page:		the page which is reused for the tail vmemmap pages.
 * @reuse_addr:		the virtual address of the @reuse_page page.
 * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
 *			or are mapped from.
 */
struct vmemmap_remap_walk {
	void (*remap_pte)(pte_t *pte, unsigned long addr,
			  struct vmemmap_remap_walk *walk);
	unsigned long nr_walked;
	struct page *reuse_page;
	unsigned long reuse_addr;
	struct list_head *vmemmap_pages;
};
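
/*
 * The same walk drives both directions of the optimization: with
 * @remap_pte == vmemmap_remap_pte the tail vmemmap pages are queued on
 * @vmemmap_pages and remapped (read-only) to @reuse_page, while with
 * @remap_pte == vmemmap_restore_pte pages are taken from @vmemmap_pages
 * to re-populate the range.
 */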

static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
{
	pmd_t __pmd;
	int i;
	unsigned long addr = start;
	struct page *page = pmd_page(*pmd);
	pte_t *pgtable = pte_alloc_one_kernel(&init_mm);

	if (!pgtable)
		return -ENOMEM;

	pmd_populate_kernel(&init_mm, &__pmd, pgtable);

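	/*
	 * __pmd is a stack-local PMD entry temporarily pointing at the new
	 * page table so that pte_offset_kernel() below can address its
	 * slots; the live *pmd is left untouched until every PTE has been
	 * populated.
	 */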
	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
		pte_t entry, *pte;
		pgprot_t pgprot = PAGE_KERNEL;

		entry = mk_pte(page + i, pgprot);
		pte = pte_offset_kernel(&__pmd, addr);
		set_pte_at(&init_mm, addr, pte, entry);
	}

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_leaf(*pmd))) {
		/*
		 * Higher-order allocations from the buddy allocator must be
		 * able to be treated as independent small pages (as they can
		 * be freed individually).
		 */
		if (!PageReserved(page))
			split_page(page, get_order(PMD_SIZE));

		/* Make pte visible before pmd. See comment in pmd_install(). */
		smp_wmb();
		pmd_populate_kernel(&init_mm, pmd, pgtable);
		flush_tlb_kernel_range(start, start + PMD_SIZE);
	} else {
		pte_free_kernel(&init_mm, pgtable);
	}
	spin_unlock(&init_mm.page_table_lock);

	return 0;
}

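/*
 * The unlocked pmd_leaf() check here only avoids allocating a page table
 * when the PMD has already been split; __split_vmemmap_huge_pmd() re-checks
 * pmd_leaf() under init_mm.page_table_lock and frees the unused page table
 * if it lost a race with a concurrent split.
 */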
static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
{
	int leaf;

	spin_lock(&init_mm.page_table_lock);
	leaf = pmd_leaf(*pmd);
	spin_unlock(&init_mm.page_table_lock);

	if (!leaf)
		return 0;

	return __split_vmemmap_huge_pmd(pmd, start);
}

static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
			      unsigned long end,
			      struct vmemmap_remap_walk *walk)
{
	pte_t *pte = pte_offset_kernel(pmd, addr);

	/*
	 * The reuse_page is found 'first' in the table walk, before we start
	 * remapping (i.e. before calling @walk->remap_pte).
	 */
	if (!walk->reuse_page) {
		walk->reuse_page = pte_page(ptep_get(pte));
		/*
		 * Because the reuse address is part of the range that we are
		 * walking, skip the reuse address range.
		 */
		addr += PAGE_SIZE;
		pte++;
		walk->nr_walked++;
	}

	for (; addr != end; addr += PAGE_SIZE, pte++) {
		walk->remap_pte(pte, addr, walk);
		walk->nr_walked++;
	}
}

static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
			     unsigned long end,
			     struct vmemmap_remap_walk *walk)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		int ret;

		ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
		if (ret)
			return ret;

		next = pmd_addr_end(addr, end);
		vmemmap_pte_range(pmd, addr, next, walk);
	} while (pmd++, addr = next, addr != end);

	return 0;
}

static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
			     unsigned long end,
			     struct vmemmap_remap_walk *walk)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(p4d, addr);
	do {
		int ret;

		next = pud_addr_end(addr, end);
		ret = vmemmap_pmd_range(pud, addr, next, walk);
		if (ret)
			return ret;
	} while (pud++, addr = next, addr != end);

	return 0;
}

static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
			     unsigned long end,
			     struct vmemmap_remap_walk *walk)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_offset(pgd, addr);
	do {
		int ret;

		next = p4d_addr_end(addr, end);
		ret = vmemmap_pud_range(p4d, addr, next, walk);
		if (ret)
			return ret;
	} while (p4d++, addr = next, addr != end);

	return 0;
}

static int vmemmap_remap_range(unsigned long start, unsigned long end,
			       struct vmemmap_remap_walk *walk)
{
	unsigned long addr = start;
	unsigned long next;
	pgd_t *pgd;

	VM_BUG_ON(!PAGE_ALIGNED(start));
	VM_BUG_ON(!PAGE_ALIGNED(end));

	pgd = pgd_offset_k(addr);
	do {
		int ret;

		next = pgd_addr_end(addr, end);
		ret = vmemmap_p4d_range(pgd, addr, next, walk);
		if (ret)
			return ret;
	} while (pgd++, addr = next, addr != end);

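	/*
	 * A single flush at the end covers the entire remapped range instead
	 * of flushing once per modified PTE.
	 */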
	flush_tlb_kernel_range(start, end);

	return 0;
}

/*
 * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 * allocator or the buddy allocator. If the PG_reserved flag is set, the page
 * was allocated from the memblock allocator and must be freed via
 * free_bootmem_page(); otherwise, use __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
	if (PageReserved(page))
		free_bootmem_page(page);
	else
		__free_page(page);
}

/* Free a list of the vmemmap pages */
static void free_vmemmap_page_list(struct list_head *list)
{
	struct page *page, *next;

	list_for_each_entry_safe(page, next, list, lru)
		free_vmemmap_page(page);
}

static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
			      struct vmemmap_remap_walk *walk)
{
	/*
	 * Remap the tail pages as read-only to catch illegal write operations
	 * to the tail pages.
	 */
	pgprot_t pgprot = PAGE_KERNEL_RO;
	struct page *page = pte_page(ptep_get(pte));
	pte_t entry;

	/* Remapping the head page requires r/w */
	if (unlikely(addr == walk->reuse_addr)) {
		pgprot = PAGE_KERNEL;
		list_del(&walk->reuse_page->lru);

		/*
		 * Makes sure that preceding stores to the page contents from
		 * vmemmap_remap_free() become visible before the set_pte_at()
		 * write.
		 */
		smp_wmb();
	}

	entry = mk_pte(walk->reuse_page, pgprot);
	list_add_tail(&page->lru, walk->vmemmap_pages);
	set_pte_at(&init_mm, addr, pte, entry);
}

/*
 * How many struct page structs need to be reset. When we reuse the head
 * struct page, the special metadata (e.g. page->flags or page->mapping)
 * cannot be copied to the tail struct page structs. The invalid value will
 * be checked in free_tail_page_prepare(). In order to avoid the message
 * "corrupted mapping in tail page", we need to reset at least 3 struct
 * pages (one head struct page and two tail struct pages).
 */
#define NR_RESET_STRUCT_PAGE	3

static inline void reset_struct_pages(struct page *start)
{
	struct page *from = start + NR_RESET_STRUCT_PAGE;

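	/*
	 * The copy source [@from, @from + NR_RESET_STRUCT_PAGE) must lie
	 * within the same (freshly repopulated) vmemmap page as @start so
	 * that it reads valid tail struct pages; the BUILD_BUG_ON() below
	 * guarantees this at compile time.
	 */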
	BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
	memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
}

static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
				struct vmemmap_remap_walk *walk)
{
	pgprot_t pgprot = PAGE_KERNEL;
	struct page *page;
	void *to;

	BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);

	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
	list_del(&page->lru);
	to = page_to_virt(page);
	copy_page(to, (void *)walk->reuse_addr);
	reset_struct_pages(to);

	/*
	 * Makes sure that preceding stores to the page contents become visible
	 * before the set_pte_at() write.
	 */
	smp_wmb();
	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}

/**
 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 *			to the page which @reuse is mapped to, then free the
 *			vmemmap pages which the range was mapped to.
 * @start: start address of the vmemmap virtual address range that we want
 *	   to remap.
 * @end: end address of the vmemmap virtual address range that we want to
 *	 remap.
 * @reuse: reuse address.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_free(unsigned long start, unsigned long end,
			      unsigned long reuse)
{
	int ret;
	LIST_HEAD(vmemmap_pages);
	struct vmemmap_remap_walk walk = {
		.remap_pte = vmemmap_remap_pte,
		.reuse_addr = reuse,
		.vmemmap_pages = &vmemmap_pages,
	};
	int nid = page_to_nid((struct page *)start);
	gfp_t gfp_mask = GFP_KERNEL | __GFP_THISNODE | __GFP_NORETRY |
			 __GFP_NOWARN;
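
	/*
	 * The gfp mask keeps this allocation opportunistic: stay on the node
	 * the struct pages live on (__GFP_THISNODE), give up early under
	 * memory pressure (__GFP_NORETRY) and skip the allocation-failure
	 * warning (__GFP_NOWARN), since failing here is harmless.
	 */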

	/*
	 * Allocate a new head vmemmap page to avoid breaking a contiguous
	 * block of struct page memory when freeing it back to the page
	 * allocator in free_vmemmap_page_list(). This will allow the likely
	 * contiguous struct page backing memory to be kept contiguous,
	 * allowing for more allocations of hugepages. Fall back to the
	 * currently mapped head page should the allocation fail.
	 */
	walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
	if (walk.reuse_page) {
		copy_page(page_to_virt(walk.reuse_page),
			  (void *)walk.reuse_addr);
		list_add(&walk.reuse_page->lru, &vmemmap_pages);
	}

	/*
	 * In order to make the remapping routine most efficient for the huge
	 * pages, the routine of vmemmap page table walking has the following
	 * rules (see more details from the vmemmap_pte_range()):
	 *
	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
	 *   should be contiguous.
	 * - The @reuse address is part of the range [@reuse, @end) that we are
	 *   walking which is passed to vmemmap_remap_range().
	 * - The @reuse address is the first in the complete range.
	 *
	 * So we need to make sure that @start and @reuse meet the above rules.
	 */
	BUG_ON(start - reuse != PAGE_SIZE);

	mmap_read_lock(&init_mm);
	ret = vmemmap_remap_range(reuse, end, &walk);
	if (ret && walk.nr_walked) {
		end = reuse + walk.nr_walked * PAGE_SIZE;
		/*
		 * vmemmap_pages contains pages from the previous
		 * vmemmap_remap_range call which failed. These
		 * are pages which were removed from the vmemmap.
		 * They will be restored in the following call.
		 */
		walk = (struct vmemmap_remap_walk) {
			.remap_pte = vmemmap_restore_pte,
			.reuse_addr = reuse,
			.vmemmap_pages = &vmemmap_pages,
		};

		vmemmap_remap_range(reuse, end, &walk);
	}
	mmap_read_unlock(&init_mm);

	free_vmemmap_page_list(&vmemmap_pages);

	return ret;
}
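
/*
 * A minimal usage sketch, mirroring hugetlb_vmemmap_optimize() below: keep
 * the first vmemmap page as the reuse page and free the rest of the range.
 *
 *	unsigned long start = (unsigned long)head;
 *	unsigned long end   = start + hugetlb_vmemmap_size(h);
 *	unsigned long reuse = start;
 *
 *	start += HUGETLB_VMEMMAP_RESERVE_SIZE;	// keep vmemmap page 0
 *	vmemmap_remap_free(start, end, reuse);
 */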

static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
				   struct list_head *list)
{
	gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_THISNODE;
	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
	int nid = page_to_nid((struct page *)start);
	struct page *page, *next;

	while (nr_pages--) {
		page = alloc_pages_node(nid, gfp_mask, 0);
		if (!page)
			goto out;
		list_add_tail(&page->lru, list);
	}

	return 0;
out:
	list_for_each_entry_safe(page, next, list, lru)
		__free_page(page);
	return -ENOMEM;
}

/**
 * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, @end)
 *			 to freshly allocated pages, one for each vmemmap page
 *			 in the range.
 * @start: start address of the vmemmap virtual address range that we want
 *	   to remap.
 * @end: end address of the vmemmap virtual address range that we want to
 *	 remap.
 * @reuse: reuse address.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
			       unsigned long reuse)
{
	LIST_HEAD(vmemmap_pages);
	struct vmemmap_remap_walk walk = {
		.remap_pte = vmemmap_restore_pte,
		.reuse_addr = reuse,
		.vmemmap_pages = &vmemmap_pages,
	};

	/* See the comment in the vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
		return -ENOMEM;

	mmap_read_lock(&init_mm);
	vmemmap_remap_range(reuse, end, &walk);
	mmap_read_unlock(&init_mm);

	return 0;
}

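/*
 * The static key effectively counts HugeTLB pages whose vmemmap is currently
 * optimized; page_fixed_fake_head() uses it to skip the fake-head check
 * entirely while no page in the system is optimized.
 */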
DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);

static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);
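
/*
 * The default can be overridden on the kernel command line, e.g.
 * "hugetlb_free_vmemmap=on" (or "=off"), and flipped at runtime via the
 * vm.hugetlb_optimize_vmemmap sysctl registered below.
 */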

/**
 * hugetlb_vmemmap_restore - restore previously optimized (by
 *			     hugetlb_vmemmap_optimize()) vmemmap pages which
 *			     will be reallocated and remapped.
 * @h: struct hstate.
 * @head: the head page whose vmemmap pages will be restored.
 *
 * Return: %0 if @head's vmemmap pages have been reallocated and remapped,
 * negative error code otherwise.
 */
int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
{
	int ret;
	unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
	unsigned long vmemmap_reuse;

	if (!HPageVmemmapOptimized(head))
		return 0;

	vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse = vmemmap_start;
	vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * The pages which the vmemmap virtual address range [@vmemmap_start,
	 * @vmemmap_end) are mapped to are freed to the buddy allocator, and
	 * the range is mapped to the page which @vmemmap_reuse is mapped to.
	 * When a HugeTLB page is freed to the buddy allocator, previously
	 * discarded vmemmap pages must be reallocated and remapped.
	 */
	ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse);
	if (!ret) {
		ClearHPageVmemmapOptimized(head);
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
	}

	return ret;
}

/* Return true if a HugeTLB page's vmemmap should and can be optimized. */
static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
{
	if (!READ_ONCE(vmemmap_optimize_enabled))
		return false;

	if (!hugetlb_vmemmap_optimizable(h))
		return false;

	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
		pmd_t *pmdp, pmd;
		struct page *vmemmap_page;
		unsigned long vaddr = (unsigned long)head;

		/*
		 * Only the vmemmap page's vmemmap page can be self-hosted.
		 * Walk the page tables to find the backing page of the
		 * vmemmap page.
		 */
		pmdp = pmd_off_k(vaddr);
		/*
		 * The READ_ONCE() is used to stabilize *pmdp in a register or
		 * on the stack so that it will stop changing under the code.
		 * The only concurrent operation where it can be changed is
		 * split_vmemmap_huge_pmd() (*pmdp will be stable after this
		 * operation).
		 */
		pmd = READ_ONCE(*pmdp);
		if (pmd_leaf(pmd))
			vmemmap_page = pmd_page(pmd) + pte_index(vaddr);
		else
			vmemmap_page = pte_page(*pte_offset_kernel(pmdp, vaddr));
		/*
		 * Due to HugeTLB alignment requirements, and since the vmemmap
		 * pages are placed at the start of the hotplugged memory
		 * region in the memory_hotplug.memmap_on_memory case, checking
		 * whether the vmemmap page is marked VmemmapSelfHosted is
		 * sufficient.
		 *
		 * [                  hotplugged memory                  ]
		 * [        section        ][...][        section        ]
		 * [ vmemmap ][              usable memory               ]
		 *   ^   |     |                                        |
		 *   +---+     |                                        |
		 *     ^       |                                        |
		 *     +-------+                                        |
		 *          ^                                           |
		 *          +-------------------------------------------+
		 */
		if (PageVmemmapSelfHosted(vmemmap_page))
			return false;
	}

	return true;
}

/**
 * hugetlb_vmemmap_optimize - optimize @head page's vmemmap pages.
 * @h: struct hstate.
 * @head: the head page whose vmemmap pages will be optimized.
 *
 * This function only tries to optimize @head's vmemmap pages and does not
 * guarantee that the optimization will succeed after it returns. The caller
 * can use HPageVmemmapOptimized(@head) to detect if @head's vmemmap pages
 * have been optimized.
 */
void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head)
{
	unsigned long vmemmap_start = (unsigned long)head, vmemmap_end;
	unsigned long vmemmap_reuse;

	if (!vmemmap_should_optimize(h, head))
		return;

	static_branch_inc(&hugetlb_optimize_vmemmap_key);

	vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
	vmemmap_reuse = vmemmap_start;
	vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;

	/*
	 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
	 * to the page which @vmemmap_reuse is mapped to, then free the pages
	 * which the range [@vmemmap_start, @vmemmap_end) is mapped to.
	 */
	if (vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse))
		static_branch_dec(&hugetlb_optimize_vmemmap_key);
	else
		SetHPageVmemmapOptimized(head);
}

static struct ctl_table hugetlb_vmemmap_sysctls[] = {
	{
		.procname	= "hugetlb_optimize_vmemmap",
		.data		= &vmemmap_optimize_enabled,
		.maxlen		= sizeof(vmemmap_optimize_enabled),
		.mode		= 0644,
		.proc_handler	= proc_dobool,
	},
	{ }
};
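
/*
 * Runtime toggle from a shell (available once the sysctl is registered,
 * i.e. when some hstate is optimizable):
 *
 *	sysctl vm.hugetlb_optimize_vmemmap=1
 *	sysctl vm.hugetlb_optimize_vmemmap=0
 *
 * The setting only affects HugeTLB pages allocated afterwards; pages that
 * are already optimized stay optimized until they are freed or restored.
 */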

static int __init hugetlb_vmemmap_init(void)
{
	const struct hstate *h;

	/* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
	BUILD_BUG_ON(__NR_USED_SUBPAGE * sizeof(struct page) > HUGETLB_VMEMMAP_RESERVE_SIZE);

	for_each_hstate(h) {
		if (hugetlb_vmemmap_optimizable(h)) {
			register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
			break;
		}
	}
	return 0;
}
late_initcall(hugetlb_vmemmap_init);