// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2009 Red Hat, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/swapops.h>
#include <linux/backing-dev.h>
#include <linux/dax.h>
#include <linux/mm_types.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/pfn_t.h>
#include <linux/mman.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/page_owner.h>
#include <linux/sched/sysctl.h>
#include <linux/memory-tiers.h>
#include <linux/compat.h>
#include <linux/pgalloc_tag.h>
#include <linux/pagewalk.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
#include "swap.h"

#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>

/*
 * By default, transparent hugepage support is disabled in order to avoid
 * risking an increased memory footprint for applications that are not
 * guaranteed to benefit from it. When transparent hugepage support is
 * enabled, it is for all mappings, and khugepaged scans all mappings.
 * Defrag is invoked by khugepaged hugepage allocations and by page faults
 * for all hugepage allocations.
 */
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);

static struct shrinker *deferred_split_shrinker;
static unsigned long deferred_split_count(struct shrinker *shrink,
					  struct shrink_control *sc);
static unsigned long deferred_split_scan(struct shrinker *shrink,
					 struct shrink_control *sc);
static bool split_underused_thp = true;

static atomic_t huge_zero_refcount;
struct folio *huge_zero_folio __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;
unsigned long huge_anon_orders_always __read_mostly;
unsigned long huge_anon_orders_madvise __read_mostly;
unsigned long huge_anon_orders_inherit __read_mostly;
static bool anon_orders_configured __initdata;

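/*
 * __thp_vma_allowable_orders - which of the requested THP orders @vma may use
 * @vma: the vm area to check
 * @vm_flags: use these vm_flags instead of vma->vm_flags
 * @tva_flags: which TVA_* checks to apply (smaps, page fault, sysfs policy)
 * @orders: bitfield of all orders being considered
 *
 * Filters @orders down to the orders that are actually permitted for this
 * VMA, based on the VMA type (anon/file/special), madvise/prctl overrides,
 * hardware support, alignment and size constraints, and the sysfs policy.
 * Returns 0 if none of the requested orders is allowed.
 */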
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
					 unsigned long vm_flags,
					 unsigned long tva_flags,
					 unsigned long orders)
{
	bool smaps = tva_flags & TVA_SMAPS;
	bool in_pf = tva_flags & TVA_IN_PF;
	bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
	unsigned long supported_orders;

	/* Check the intersection of requested and supported orders. */
	if (vma_is_anonymous(vma))
		supported_orders = THP_ORDERS_ALL_ANON;
	else if (vma_is_special_huge(vma))
		supported_orders = THP_ORDERS_ALL_SPECIAL;
	else
		supported_orders = THP_ORDERS_ALL_FILE_DEFAULT;

	orders &= supported_orders;
	if (!orders)
		return 0;

	if (!vma->vm_mm)		/* vdso */
		return 0;

	/*
	 * Explicitly disabled through madvise or prctl, or some
	 * architectures may disable THP for some mappings, for
	 * example, s390 kvm.
	 */
	if ((vm_flags & VM_NOHUGEPAGE) ||
	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
		return 0;
	/*
	 * If the hardware/firmware marked hugepage support disabled.
	 */
	if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
		return 0;

	/* khugepaged doesn't collapse DAX vma, but page fault is fine. */
	if (vma_is_dax(vma))
		return in_pf ? orders : 0;

	/*
	 * khugepaged special VMA and hugetlb VMA.
	 * Must be checked after dax since some dax mappings may have
	 * VM_MIXEDMAP set.
	 */
	if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED))
		return 0;

	/*
	 * Check alignment for file vma and size for both file and anon vma by
	 * filtering out the unsuitable orders.
	 *
	 * Skip the check for page fault. Huge fault does the check in fault
	 * handlers.
	 */
	if (!in_pf) {
		int order = highest_order(orders);
		unsigned long addr;

		while (orders) {
			addr = vma->vm_end - (PAGE_SIZE << order);
			if (thp_vma_suitable_order(vma, addr, order))
				break;
			order = next_order(&orders, order);
		}

		if (!orders)
			return 0;
	}

	/*
	 * Enabled via shmem mount options or sysfs settings.
	 * Must be done before hugepage flags check since shmem has its
	 * own flags.
	 */
	if (!in_pf && shmem_file(vma->vm_file))
		return shmem_allowable_huge_orders(file_inode(vma->vm_file),
						   vma, vma->vm_pgoff, 0,
						   !enforce_sysfs);

	if (!vma_is_anonymous(vma)) {
		/*
		 * Enforce sysfs THP requirements as necessary. Anonymous vmas
		 * were already handled in thp_vma_allowable_orders().
		 */
		if (enforce_sysfs &&
		    (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
						    !hugepage_global_always())))
			return 0;

		/*
		 * Trust that ->huge_fault() handlers know what they are doing
		 * in fault path.
		 */
		if (((in_pf || smaps)) && vma->vm_ops->huge_fault)
			return orders;
		/* Only regular file is valid in collapse path */
		if (((!in_pf || smaps)) && file_thp_enabled(vma))
			return orders;
		return 0;
	}

	if (vma_is_temporary_stack(vma))
		return 0;

	/*
	 * THPeligible bit of smaps should show 1 for proper VMAs even
	 * though anon_vma is not initialized yet.
	 *
	 * Allow page fault since anon_vma may be not initialized until
	 * the first page fault.
	 */
	if (!vma->anon_vma)
		return (smaps || in_pf) ? orders : 0;

	return orders;
}

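/*
 * The huge zero folio is allocated lazily and lives as long as anyone uses
 * it: each mm with MMF_HUGE_ZERO_PAGE set holds one reference, and one extra
 * reference is taken at allocation time that only the shrinker drops. The
 * shrinker frees the folio once that extra reference is the last one left.
 */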
static bool get_huge_zero_page(void)
{
	struct folio *zero_folio;
retry:
	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
		return true;

	zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
			HPAGE_PMD_ORDER);
	if (!zero_folio) {
		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
		return false;
	}
	/* Ensure zero folio won't have large_rmappable flag set. */
	folio_clear_large_rmappable(zero_folio);
	preempt_disable();
	if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) {
		preempt_enable();
		folio_put(zero_folio);
		goto retry;
	}
	WRITE_ONCE(huge_zero_pfn, folio_pfn(zero_folio));

	/* We take additional reference here. It will be put back by shrinker */
	atomic_set(&huge_zero_refcount, 2);
	preempt_enable();
	count_vm_event(THP_ZERO_PAGE_ALLOC);
	return true;
}

static void put_huge_zero_page(void)
{
	/*
	 * Counter should never go to zero here. Only shrinker can put
	 * last reference.
	 */
	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}

struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		return READ_ONCE(huge_zero_folio);

	if (!get_huge_zero_page())
		return NULL;

	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();

	return READ_ONCE(huge_zero_folio);
}

void mm_put_huge_zero_folio(struct mm_struct *mm)
{
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		put_huge_zero_page();
}

static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
					struct shrink_control *sc)
{
	/* we can free zero page only if last reference remains */
	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
}

static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
				       struct shrink_control *sc)
{
	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
		struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
		BUG_ON(zero_folio == NULL);
		WRITE_ONCE(huge_zero_pfn, ~0UL);
		folio_put(zero_folio);
		return HPAGE_PMD_NR;
	}

	return 0;
}

static struct shrinker *huge_zero_page_shrinker;

#ifdef CONFIG_SYSFS
static ssize_t enabled_show(struct kobject *kobj,
			    struct kobj_attribute *attr, char *buf)
{
	const char *output;

	if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
		output = "[always] madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always [madvise] never";
	else
		output = "always madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}

static ssize_t enabled_store(struct kobject *kobj,
			     struct kobj_attribute *attr,
			     const char *buf, size_t count)
{
	ssize_t ret = count;

	if (sysfs_streq(buf, "always")) {
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else
		ret = -EINVAL;

	if (ret > 0) {
		int err = start_stop_khugepaged();
		if (err)
			ret = err;
	}
	return ret;
}

static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);

ssize_t single_hugepage_flag_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf,
				  enum transparent_hugepage_flag flag)
{
	return sysfs_emit(buf, "%d\n",
			  !!test_bit(flag, &transparent_hugepage_flags));
}

ssize_t single_hugepage_flag_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count,
				   enum transparent_hugepage_flag flag)
{
	unsigned long value;
	int ret;

	ret = kstrtoul(buf, 10, &value);
	if (ret < 0)
		return ret;
	if (value > 1)
		return -EINVAL;

	if (value)
		set_bit(flag, &transparent_hugepage_flags);
	else
		clear_bit(flag, &transparent_hugepage_flags);

	return count;
}

static ssize_t defrag_show(struct kobject *kobj,
			   struct kobj_attribute *attr, char *buf)
{
	const char *output;

	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
		     &transparent_hugepage_flags))
		output = "[always] defer defer+madvise madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
			  &transparent_hugepage_flags))
		output = "always [defer] defer+madvise madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always defer [defer+madvise] madvise never";
	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
			  &transparent_hugepage_flags))
		output = "always defer defer+madvise [madvise] never";
	else
		output = "always defer defer+madvise madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}

static ssize_t defrag_store(struct kobject *kobj,
			    struct kobj_attribute *attr,
			    const char *buf, size_t count)
{
	if (sysfs_streq(buf, "always")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "defer+madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "defer")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else if (sysfs_streq(buf, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
	} else
		return -EINVAL;

	return count;
}
static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);

static ssize_t use_zero_page_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return single_hugepage_flag_show(kobj, attr, buf,
					 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static ssize_t use_zero_page_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	return single_hugepage_flag_store(kobj, attr, buf, count,
					  TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);

static ssize_t hpage_pmd_size_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE);
}
static struct kobj_attribute hpage_pmd_size_attr =
	__ATTR_RO(hpage_pmd_size);

static ssize_t split_underused_thp_show(struct kobject *kobj,
					struct kobj_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%d\n", split_underused_thp);
}

static ssize_t split_underused_thp_store(struct kobject *kobj,
					 struct kobj_attribute *attr,
					 const char *buf, size_t count)
{
	int err = kstrtobool(buf, &split_underused_thp);

	if (err < 0)
		return err;

	return count;
}

static struct kobj_attribute split_underused_thp_attr = __ATTR(
	shrink_underused, 0644, split_underused_thp_show, split_underused_thp_store);

static struct attribute *hugepage_attr[] = {
	&enabled_attr.attr,
	&defrag_attr.attr,
	&use_zero_page_attr.attr,
	&hpage_pmd_size_attr.attr,
#ifdef CONFIG_SHMEM
	&shmem_enabled_attr.attr,
#endif
	&split_underused_thp_attr.attr,
	NULL,
};

static const struct attribute_group hugepage_attr_group = {
	.attrs = hugepage_attr,
};

static void hugepage_exit_sysfs(struct kobject *hugepage_kobj);
static void thpsize_release(struct kobject *kobj);
static DEFINE_SPINLOCK(huge_anon_orders_lock);
static LIST_HEAD(thpsize_list);

static ssize_t anon_enabled_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	int order = to_thpsize(kobj)->order;
	const char *output;

	if (test_bit(order, &huge_anon_orders_always))
		output = "[always] inherit madvise never";
	else if (test_bit(order, &huge_anon_orders_inherit))
		output = "always [inherit] madvise never";
	else if (test_bit(order, &huge_anon_orders_madvise))
		output = "always inherit [madvise] never";
	else
		output = "always inherit madvise [never]";

	return sysfs_emit(buf, "%s\n", output);
}

static ssize_t anon_enabled_store(struct kobject *kobj,
				  struct kobj_attribute *attr,
				  const char *buf, size_t count)
{
	int order = to_thpsize(kobj)->order;
	ssize_t ret = count;

	if (sysfs_streq(buf, "always")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_inherit);
		clear_bit(order, &huge_anon_orders_madvise);
		set_bit(order, &huge_anon_orders_always);
		spin_unlock(&huge_anon_orders_lock);
	} else if (sysfs_streq(buf, "inherit")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_always);
		clear_bit(order, &huge_anon_orders_madvise);
		set_bit(order, &huge_anon_orders_inherit);
		spin_unlock(&huge_anon_orders_lock);
	} else if (sysfs_streq(buf, "madvise")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_always);
		clear_bit(order, &huge_anon_orders_inherit);
		set_bit(order, &huge_anon_orders_madvise);
		spin_unlock(&huge_anon_orders_lock);
	} else if (sysfs_streq(buf, "never")) {
		spin_lock(&huge_anon_orders_lock);
		clear_bit(order, &huge_anon_orders_always);
		clear_bit(order, &huge_anon_orders_inherit);
		clear_bit(order, &huge_anon_orders_madvise);
		spin_unlock(&huge_anon_orders_lock);
	} else
		ret = -EINVAL;

	if (ret > 0) {
		int err;

		err = start_stop_khugepaged();
		if (err)
			ret = err;
	}
	return ret;
}

static struct kobj_attribute anon_enabled_attr =
	__ATTR(enabled, 0644, anon_enabled_show, anon_enabled_store);

static struct attribute *anon_ctrl_attrs[] = {
	&anon_enabled_attr.attr,
	NULL,
};

static const struct attribute_group anon_ctrl_attr_grp = {
	.attrs = anon_ctrl_attrs,
};

static struct attribute *file_ctrl_attrs[] = {
#ifdef CONFIG_SHMEM
	&thpsize_shmem_enabled_attr.attr,
#endif
	NULL,
};

static const struct attribute_group file_ctrl_attr_grp = {
	.attrs = file_ctrl_attrs,
};

static struct attribute *any_ctrl_attrs[] = {
	NULL,
};

static const struct attribute_group any_ctrl_attr_grp = {
	.attrs = any_ctrl_attrs,
};

static const struct kobj_type thpsize_ktype = {
	.release = &thpsize_release,
	.sysfs_ops = &kobj_sysfs_ops,
};

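/*
 * Per-CPU, per-order counters backing the per-size "stats" directories in
 * sysfs. Updates go to the local CPU's slot; readers sum across all possible
 * CPUs in sum_mthp_stat().
 */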
DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}};

static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
{
	unsigned long sum = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct mthp_stat *this = &per_cpu(mthp_stats, cpu);

		sum += this->stats[order][item];
	}

	return sum;
}

#define DEFINE_MTHP_STAT_ATTR(_name, _index)				\
static ssize_t _name##_show(struct kobject *kobj,			\
			struct kobj_attribute *attr, char *buf)		\
{									\
	int order = to_thpsize(kobj)->order;				\
									\
	return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index));	\
}									\
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)

DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT);
DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK);
#ifdef CONFIG_SHMEM
DEFINE_MTHP_STAT_ATTR(shmem_alloc, MTHP_STAT_SHMEM_ALLOC);
DEFINE_MTHP_STAT_ATTR(shmem_fallback, MTHP_STAT_SHMEM_FALLBACK);
DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE);
#endif
DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);

static struct attribute *anon_stats_attrs[] = {
	&anon_fault_alloc_attr.attr,
	&anon_fault_fallback_attr.attr,
	&anon_fault_fallback_charge_attr.attr,
#ifndef CONFIG_SHMEM
	&swpout_attr.attr,
	&swpout_fallback_attr.attr,
#endif
	&split_deferred_attr.attr,
	&nr_anon_attr.attr,
	&nr_anon_partially_mapped_attr.attr,
	NULL,
};

static struct attribute_group anon_stats_attr_grp = {
	.name = "stats",
	.attrs = anon_stats_attrs,
};

static struct attribute *file_stats_attrs[] = {
#ifdef CONFIG_SHMEM
	&shmem_alloc_attr.attr,
	&shmem_fallback_attr.attr,
	&shmem_fallback_charge_attr.attr,
#endif
	NULL,
};

static struct attribute_group file_stats_attr_grp = {
	.name = "stats",
	.attrs = file_stats_attrs,
};

static struct attribute *any_stats_attrs[] = {
#ifdef CONFIG_SHMEM
	&swpout_attr.attr,
	&swpout_fallback_attr.attr,
#endif
	&split_attr.attr,
	&split_failed_attr.attr,
	NULL,
};

static struct attribute_group any_stats_attr_grp = {
	.name = "stats",
	.attrs = any_stats_attrs,
};

static int sysfs_add_group(struct kobject *kobj,
			   const struct attribute_group *grp)
{
	int ret = -ENOENT;

	/*
	 * If the group is named, try to merge first, assuming the subdirectory
	 * was already created. This avoids the warning emitted by
	 * sysfs_create_group() if the directory already exists.
	 */
	if (grp->name)
		ret = sysfs_merge_group(kobj, grp);
	if (ret)
		ret = sysfs_create_group(kobj, grp);

	return ret;
}

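/*
 * Create the /sys/kernel/mm/transparent_hugepage/hugepages-<size>kB kobject
 * for one THP order and populate it with the control and stats attribute
 * groups that apply to that order (anon and/or file backed).
 */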
static struct thpsize *thpsize_create(int order, struct kobject *parent)
{
	unsigned long size = (PAGE_SIZE << order) / SZ_1K;
	struct thpsize *thpsize;
	int ret = -ENOMEM;

	thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL);
	if (!thpsize)
		goto err;

	thpsize->order = order;

	ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent,
				   "hugepages-%lukB", size);
	if (ret) {
		kfree(thpsize);
		goto err;
	}

	ret = sysfs_add_group(&thpsize->kobj, &any_ctrl_attr_grp);
	if (ret)
		goto err_put;

	ret = sysfs_add_group(&thpsize->kobj, &any_stats_attr_grp);
	if (ret)
		goto err_put;

	if (BIT(order) & THP_ORDERS_ALL_ANON) {
		ret = sysfs_add_group(&thpsize->kobj, &anon_ctrl_attr_grp);
		if (ret)
			goto err_put;

		ret = sysfs_add_group(&thpsize->kobj, &anon_stats_attr_grp);
		if (ret)
			goto err_put;
	}

	if (BIT(order) & THP_ORDERS_ALL_FILE_DEFAULT) {
		ret = sysfs_add_group(&thpsize->kobj, &file_ctrl_attr_grp);
		if (ret)
			goto err_put;

		ret = sysfs_add_group(&thpsize->kobj, &file_stats_attr_grp);
		if (ret)
			goto err_put;
	}

	return thpsize;
err_put:
	kobject_put(&thpsize->kobj);
err:
	return ERR_PTR(ret);
}

static void thpsize_release(struct kobject *kobj)
{
	kfree(to_thpsize(kobj));
}

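/*
 * Set up /sys/kernel/mm/transparent_hugepage: the global and khugepaged
 * attribute groups plus one hugepages-<size>kB directory per supported order.
 */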
static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	int err;
	struct thpsize *thpsize;
	unsigned long orders;
	int order;

	/*
	 * Default to setting PMD-sized THP to inherit the global setting and
	 * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time
	 * constant so we have to do this here.
	 */
	if (!anon_orders_configured)
		huge_anon_orders_inherit = BIT(PMD_ORDER);

	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
	if (unlikely(!*hugepage_kobj)) {
		pr_err("failed to create transparent hugepage kobject\n");
		return -ENOMEM;
	}

	err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto delete_obj;
	}

	err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
	if (err) {
		pr_err("failed to register transparent hugepage group\n");
		goto remove_hp_group;
	}

	orders = THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DEFAULT;
	order = highest_order(orders);
	while (orders) {
		thpsize = thpsize_create(order, *hugepage_kobj);
		if (IS_ERR(thpsize)) {
			pr_err("failed to create thpsize for order %d\n", order);
			err = PTR_ERR(thpsize);
			goto remove_all;
		}
		list_add(&thpsize->node, &thpsize_list);
		order = next_order(&orders, order);
	}

	return 0;

remove_all:
	hugepage_exit_sysfs(*hugepage_kobj);
	return err;
remove_hp_group:
	sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
delete_obj:
	kobject_put(*hugepage_kobj);
	return err;
}

static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
	struct thpsize *thpsize, *tmp;

	list_for_each_entry_safe(thpsize, tmp, &thpsize_list, node) {
		list_del(&thpsize->node);
		kobject_put(&thpsize->kobj);
	}

	sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
	sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
	kobject_put(hugepage_kobj);
}
#else
static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
	return 0;
}

static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
}
#endif /* CONFIG_SYSFS */

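/*
 * Register the two THP shrinkers: one that frees the huge zero folio when it
 * is no longer referenced, and the memcg-aware deferred split shrinker whose
 * count/scan callbacks are declared at the top of this file.
 */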
static int __init thp_shrinker_init(void)
{
	huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
	if (!huge_zero_page_shrinker)
		return -ENOMEM;

	deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
						 SHRINKER_MEMCG_AWARE |
						 SHRINKER_NONSLAB,
						 "thp-deferred_split");
	if (!deferred_split_shrinker) {
		shrinker_free(huge_zero_page_shrinker);
		return -ENOMEM;
	}

	huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
	huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
	shrinker_register(huge_zero_page_shrinker);

	deferred_split_shrinker->count_objects = deferred_split_count;
	deferred_split_shrinker->scan_objects = deferred_split_scan;
	shrinker_register(deferred_split_shrinker);

	return 0;
}

static void __init thp_shrinker_exit(void)
{
	shrinker_free(huge_zero_page_shrinker);
	shrinker_free(deferred_split_shrinker);
}

static int __init hugepage_init(void)
{
	int err;
	struct kobject *hugepage_kobj;

	if (!has_transparent_hugepage()) {
		transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED;
		return -EINVAL;
	}

	/*
	 * hugepages can't be allocated by the buddy allocator
	 */
	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER);

	err = hugepage_init_sysfs(&hugepage_kobj);
	if (err)
		goto err_sysfs;

	err = khugepaged_init();
	if (err)
		goto err_slab;

	err = thp_shrinker_init();
	if (err)
		goto err_shrinker;

	/*
	 * By default disable transparent hugepages on smaller systems,
	 * where the extra memory used could hurt more than TLB overhead
	 * is likely to save. The admin can still enable it through /sys.
	 */
	if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
		transparent_hugepage_flags = 0;
		return 0;
	}

	err = start_stop_khugepaged();
	if (err)
		goto err_khugepaged;

	return 0;
err_khugepaged:
	thp_shrinker_exit();
err_shrinker:
	khugepaged_destroy();
err_slab:
	hugepage_exit_sysfs(hugepage_kobj);
err_sysfs:
	return err;
}
subsys_initcall(hugepage_init);

static int __init setup_transparent_hugepage(char *str)
{
	int ret = 0;
	if (!str)
		goto out;
	if (!strcmp(str, "always")) {
		set_bit(TRANSPARENT_HUGEPAGE_FLAG,
			&transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "madvise")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			&transparent_hugepage_flags);
		ret = 1;
	} else if (!strcmp(str, "never")) {
		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
			  &transparent_hugepage_flags);
		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
			  &transparent_hugepage_flags);
		ret = 1;
	}
out:
	if (!ret)
		pr_warn("transparent_hugepage= cannot parse, ignored\n");
	return ret;
}
__setup("transparent_hugepage=", setup_transparent_hugepage);

static inline int get_order_from_str(const char *size_str)
{
	unsigned long size;
	char *endptr;
	int order;

	size = memparse(size_str, &endptr);

	if (!is_power_of_2(size))
		goto err;
	order = get_order(size);
	if (BIT(order) & ~THP_ORDERS_ALL_ANON)
		goto err;

	return order;
err:
	pr_err("invalid size %s in thp_anon boot parameter\n", size_str);
	return -EINVAL;
}

static char str_dup[PAGE_SIZE] __initdata;
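/*
 * Parse "thp_anon=<size>[KMG][-<size>[KMG]]:<policy>;..." from the kernel
 * command line. Each semicolon-separated entry is a comma-separated list of
 * sizes or size ranges followed by a policy of always, madvise, inherit or
 * never; sizes are parsed with memparse() and must correspond to an allowed
 * anonymous THP order.
 */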
static int __init setup_thp_anon(char *str)
{
	char *token, *range, *policy, *subtoken;
	unsigned long always, inherit, madvise;
	char *start_size, *end_size;
	int start, end, nr;
	char *p;

	if (!str || strlen(str) + 1 > PAGE_SIZE)
		goto err;
	strcpy(str_dup, str);

	always = huge_anon_orders_always;
	madvise = huge_anon_orders_madvise;
	inherit = huge_anon_orders_inherit;
	p = str_dup;
	while ((token = strsep(&p, ";")) != NULL) {
		range = strsep(&token, ":");
		policy = token;

		if (!policy)
			goto err;

		while ((subtoken = strsep(&range, ",")) != NULL) {
			if (strchr(subtoken, '-')) {
				start_size = strsep(&subtoken, "-");
				end_size = subtoken;

				start = get_order_from_str(start_size);
				end = get_order_from_str(end_size);
			} else {
				start = end = get_order_from_str(subtoken);
			}

			if (start < 0 || end < 0 || start > end)
				goto err;

			nr = end - start + 1;
			if (!strcmp(policy, "always")) {
				bitmap_set(&always, start, nr);
				bitmap_clear(&inherit, start, nr);
				bitmap_clear(&madvise, start, nr);
			} else if (!strcmp(policy, "madvise")) {
				bitmap_set(&madvise, start, nr);
				bitmap_clear(&inherit, start, nr);
				bitmap_clear(&always, start, nr);
			} else if (!strcmp(policy, "inherit")) {
				bitmap_set(&inherit, start, nr);
				bitmap_clear(&madvise, start, nr);
				bitmap_clear(&always, start, nr);
			} else if (!strcmp(policy, "never")) {
				bitmap_clear(&inherit, start, nr);
				bitmap_clear(&madvise, start, nr);
				bitmap_clear(&always, start, nr);
			} else {
				pr_err("invalid policy %s in thp_anon boot parameter\n", policy);
				goto err;
			}
		}
	}

	huge_anon_orders_always = always;
	huge_anon_orders_madvise = madvise;
	huge_anon_orders_inherit = inherit;
	anon_orders_configured = true;
	return 1;

err:
	pr_warn("thp_anon=%s: error parsing string, ignoring setting\n", str);
	return 0;
}
__setup("thp_anon=", setup_thp_anon);

pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pmd = pmd_mkwrite(pmd, vma);
	return pmd;
}

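/*
 * Partially mapped THPs are queued for splitting on a deferred split queue:
 * the owning memcg's queue when CONFIG_MEMCG is enabled (falling back to the
 * node queue for uncharged folios), otherwise the folio's node queue.
 */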
#ifdef CONFIG_MEMCG
static inline
struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
	struct mem_cgroup *memcg = folio_memcg(folio);
	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));

	if (memcg)
		return &memcg->deferred_split_queue;
	else
		return &pgdat->deferred_split_queue;
}
#else
static inline
struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
	struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));

	return &pgdat->deferred_split_queue;
}
#endif

static inline bool is_transparent_hugepage(const struct folio *folio)
{
	if (!folio_test_large(folio))
		return false;

	return is_huge_zero_folio(folio) ||
	       folio_test_large_rmappable(folio);
}

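/*
 * Try to find a mapping address that allows THP use: ask for the requested
 * length padded by @size so that a suitably aligned window is guaranteed to
 * exist in the returned range, then shift the result so the address is
 * congruent to the file offset modulo @size.
 */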
static unsigned long __thp_get_unmapped_area(struct file *filp,
		unsigned long addr, unsigned long len,
		loff_t off, unsigned long flags, unsigned long size,
		vm_flags_t vm_flags)
{
	loff_t off_end = off + len;
	loff_t off_align = round_up(off, size);
	unsigned long len_pad, ret, off_sub;

	if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall())
		return 0;

	if (off_end <= off_align || (off_end - off_align) < size)
		return 0;

	len_pad = len + size;
	if (len_pad < len || (off + len_pad) < off)
		return 0;

	ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad,
					   off >> PAGE_SHIFT, flags, vm_flags);

	/*
	 * The failure might be due to length padding. The caller will retry
	 * without the padding.
	 */
	if (IS_ERR_VALUE(ret))
		return 0;

	/*
	 * Do not try to align to THP boundary if allocation at the address
	 * hint succeeds.
	 */
	if (ret == addr)
		return addr;

	off_sub = (off - ret) & (size - 1);

	if (test_bit(MMF_TOPDOWN, &current->mm->flags) && !off_sub)
		return ret + size;

	ret += off_sub;
	return ret;
}

unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags,
		vm_flags_t vm_flags)
{
	unsigned long ret;
	loff_t off = (loff_t)pgoff << PAGE_SHIFT;

	ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE, vm_flags);
	if (ret)
		return ret;

	return mm_get_unmapped_area_vmflags(current->mm, filp, addr, len, pgoff, flags,
					    vm_flags);
}

unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	return thp_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);

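/*
 * Finish an anonymous huge page fault with a freshly allocated huge page:
 * charge it to the memcg, zero it, and install it as a PMD mapping under the
 * page table lock; deliver the fault to userspace instead if the VMA is
 * registered with userfaultfd for missing pages.
 */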
static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
			struct page *page, gfp_t gfp)
{
	struct vm_area_struct *vma = vmf->vma;
	struct folio *folio = page_folio(page);
	pgtable_t pgtable;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	vm_fault_t ret = 0;

	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);

	if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
		folio_put(folio);
		count_vm_event(THP_FAULT_FALLBACK);
		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
		return VM_FAULT_FALLBACK;
	}
	folio_throttle_swaprate(folio, gfp);

	pgtable = pte_alloc_one(vma->vm_mm);
	if (unlikely(!pgtable)) {
		ret = VM_FAULT_OOM;
		goto release;
	}

	folio_zero_user(folio, vmf->address);
	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * folio_zero_user writes become visible before the set_pmd_at()
	 * write.
	 */
	__folio_mark_uptodate(folio);

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_none(*vmf->pmd))) {
		goto unlock_release;
	} else {
		pmd_t entry;

		ret = check_stable_address_space(vma->vm_mm);
		if (ret)
			goto unlock_release;

		/* Deliver the page fault to userland */
		if (userfaultfd_missing(vma)) {
			spin_unlock(vmf->ptl);
			folio_put(folio);
			pte_free(vma->vm_mm, pgtable);
			ret = handle_userfault(vmf, VM_UFFD_MISSING);
			VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			return ret;
		}

		entry = mk_huge_pmd(page, vma->vm_page_prot);
		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
		folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
		folio_add_lru_vma(folio, vma);
		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
		mm_inc_nr_ptes(vma->vm_mm);
		deferred_split_folio(folio, false);
		spin_unlock(vmf->ptl);
		count_vm_event(THP_FAULT_ALLOC);
		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
		count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
	}

	return 0;
unlock_release:
	spin_unlock(vmf->ptl);
release:
	if (pgtable)
		pte_free(vma->vm_mm, pgtable);
	folio_put(folio);
	return ret;
}

/*
 * always: directly stall for all thp allocations
 * defer: wake kswapd and fail if not immediately available
 * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
 *		  fail if not immediately available
 * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
 *	    available
 * never: never stall for any thp allocation
 */
gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
{
	const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);

	/* Always do synchronous compaction */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);

	/* Kick kcompactd and fail quickly */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;

	/* Synchronous compaction if madvised, otherwise kick kcompactd */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
			(vma_madvised ? __GFP_DIRECT_RECLAIM :
					__GFP_KSWAPD_RECLAIM);

	/* Only do synchronous compaction if madvised */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
		return GFP_TRANSHUGE_LIGHT |
1263 (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
David Rientjesac79f782019-09-04 12:54:18 -07001264
David Rientjes19deb762019-09-04 12:54:20 -07001265 return GFP_TRANSHUGE_LIGHT;
Mel Gorman444eb2a42016-03-17 14:19:23 -07001266}
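
/*
 * For illustration: the policy above is selected through the sysfs knob
 * /sys/kernel/mm/transparent_hugepage/defrag, e.g.:
 *
 *	echo defer+madvise > /sys/kernel/mm/transparent_hugepage/defrag
 *
 * The mask returned here is handed to vma_alloc_folio() by the anonymous
 * fault path below; it only tunes how hard that allocation tries, not
 * whether THP is enabled at all.
 */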
1267
Kirill A. Shutemovc4088eb2013-11-14 14:31:04 -08001268/* Caller must hold page table lock. */
Matthew Wilcox (Oracle)e28833b2024-03-26 20:28:26 +00001269static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
Kirill A. Shutemov97ae1742012-12-12 13:51:06 -08001270 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
Matthew Wilcox (Oracle)e28833b2024-03-26 20:28:26 +00001271 struct folio *zero_folio)
Kirill A. Shutemovfc9fe822012-12-12 13:50:51 -08001272{
1273 pmd_t entry;
Andrew Morton7c414162015-09-08 14:58:43 -07001274 if (!pmd_none(*pmd))
Miaohe Lin2efeb8d2021-02-24 12:07:29 -08001275 return;
Matthew Wilcox (Oracle)e28833b2024-03-26 20:28:26 +00001276 entry = mk_pmd(&zero_folio->page, vma->vm_page_prot);
Kirill A. Shutemovfc9fe822012-12-12 13:50:51 -08001277 entry = pmd_mkhuge(entry);
Qi Zhengc8bb4162022-08-18 16:27:48 +08001278 pgtable_trans_huge_deposit(mm, pmd, pgtable);
Kirill A. Shutemovfc9fe822012-12-12 13:50:51 -08001279 set_pmd_at(mm, haddr, pmd, entry);
Kirill A. Shutemovc4812902017-11-15 17:35:37 -08001280 mm_inc_nr_ptes(mm);
Kirill A. Shutemovfc9fe822012-12-12 13:50:51 -08001281}
1282
Souptick Joarder2b740302018-08-23 17:01:36 -07001283vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
Andrea Arcangeli71e3aac2011-01-13 15:46:52 -08001284{
Jan Kara82b0f8c2016-12-14 15:06:58 -08001285 struct vm_area_struct *vma = vmf->vma;
Aneesh Kumar K.V077fcf12015-02-11 15:27:12 -08001286 gfp_t gfp;
Matthew Wilcox (Oracle)cb196ee2022-05-12 20:23:01 -07001287 struct folio *folio;
Jan Kara82b0f8c2016-12-14 15:06:58 -08001288 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
Matthew Wilcox (Oracle)a373bae2024-04-26 15:45:01 +01001289 vm_fault_t ret;
Andrea Arcangeli71e3aac2011-01-13 15:46:52 -08001290
Ryan Roberts3485b882023-12-07 16:12:04 +00001291 if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
Kirill A. Shutemovc0292552013-09-12 15:14:05 -07001292 return VM_FAULT_FALLBACK;
Matthew Wilcox (Oracle)a373bae2024-04-26 15:45:01 +01001293 ret = vmf_anon_prepare(vmf);
1294 if (ret)
1295 return ret;
Yang Shi4fa68932022-06-16 10:48:35 -07001296 khugepaged_enter_vma(vma, vma->vm_flags);
Yang Shid2081b22022-05-19 14:08:49 -07001297
Jan Kara82b0f8c2016-12-14 15:06:58 -08001298 if (!(vmf->flags & FAULT_FLAG_WRITE) &&
Kirill A. Shutemovbae473a2016-07-26 15:25:20 -07001299 !mm_forbids_zeropage(vma->vm_mm) &&
Kirill A. Shutemov128ec032013-09-12 15:14:03 -07001300 transparent_hugepage_use_zero_page()) {
1301 pgtable_t pgtable;
Matthew Wilcox (Oracle)e28833b2024-03-26 20:28:26 +00001302 struct folio *zero_folio;
Souptick Joarder2b740302018-08-23 17:01:36 -07001303 vm_fault_t ret;
Matthew Wilcox (Oracle)e28833b2024-03-26 20:28:26 +00001304
Joel Fernandes (Google)4cf58922019-01-03 15:28:34 -08001305 pgtable = pte_alloc_one(vma->vm_mm);
Kirill A. Shutemov128ec032013-09-12 15:14:03 -07001306 if (unlikely(!pgtable))
Andrea Arcangeli71e3aac2011-01-13 15:46:52 -08001307 return VM_FAULT_OOM;
Matthew Wilcox (Oracle)e28833b2024-03-26 20:28:26 +00001308 zero_folio = mm_get_huge_zero_folio(vma->vm_mm);
1309 if (unlikely(!zero_folio)) {
Kirill A. Shutemovbae473a2016-07-26 15:25:20 -07001310 pte_free(vma->vm_mm, pgtable);
Andi Kleen81ab4202011-04-14 15:22:06 -07001311 count_vm_event(THP_FAULT_FALLBACK);
Kirill A. Shutemovc0292552013-09-12 15:14:05 -07001312 return VM_FAULT_FALLBACK;
Andi Kleen81ab4202011-04-14 15:22:06 -07001313 }
Jan Kara82b0f8c2016-12-14 15:06:58 -08001314 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
Andrea Arcangeli6b251fc2015-09-04 15:46:20 -07001315 ret = 0;
Jan Kara82b0f8c2016-12-14 15:06:58 -08001316 if (pmd_none(*vmf->pmd)) {
Michal Hocko6b31d592017-08-18 15:16:15 -07001317 ret = check_stable_address_space(vma->vm_mm);
1318 if (ret) {
1319 spin_unlock(vmf->ptl);
Gerald Schaeferbfe8cc12020-11-21 22:17:15 -08001320 pte_free(vma->vm_mm, pgtable);
Michal Hocko6b31d592017-08-18 15:16:15 -07001321 } else if (userfaultfd_missing(vma)) {
Jan Kara82b0f8c2016-12-14 15:06:58 -08001322 spin_unlock(vmf->ptl);
Gerald Schaeferbfe8cc12020-11-21 22:17:15 -08001323 pte_free(vma->vm_mm, pgtable);
Jan Kara82b0f8c2016-12-14 15:06:58 -08001324 ret = handle_userfault(vmf, VM_UFFD_MISSING);
Andrea Arcangeli6b251fc2015-09-04 15:46:20 -07001325 VM_BUG_ON(ret & VM_FAULT_FALLBACK);
1326 } else {
Matthew Wilcox (Oracle)e28833b2024-03-26 20:28:26 +00001327 set_huge_zero_folio(pgtable, vma->vm_mm, vma,
1328 haddr, vmf->pmd, zero_folio);
Bibo Maofca40572021-02-24 12:06:42 -08001329 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
Jan Kara82b0f8c2016-12-14 15:06:58 -08001330 spin_unlock(vmf->ptl);
Andrea Arcangeli6b251fc2015-09-04 15:46:20 -07001331 }
Gerald Schaeferbfe8cc12020-11-21 22:17:15 -08001332 } else {
Jan Kara82b0f8c2016-12-14 15:06:58 -08001333 spin_unlock(vmf->ptl);
Kirill A. Shutemovbae473a2016-07-26 15:25:20 -07001334 pte_free(vma->vm_mm, pgtable);
Gerald Schaeferbfe8cc12020-11-21 22:17:15 -08001335 }
Andrea Arcangeli6b251fc2015-09-04 15:46:20 -07001336 return ret;
Andrea Arcangeli71e3aac2011-01-13 15:46:52 -08001337 }
Rik van Riel164cc4f2021-02-25 17:16:18 -08001338 gfp = vma_thp_gfp_mask(vma);
Matthew Wilcox (Oracle)cb196ee2022-05-12 20:23:01 -07001339 folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
1340 if (unlikely(!folio)) {
Kirill A. Shutemov128ec032013-09-12 15:14:03 -07001341 count_vm_event(THP_FAULT_FALLBACK);
Barry Songec336872024-04-12 23:48:55 +12001342 count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
Kirill A. Shutemovc0292552013-09-12 15:14:05 -07001343 return VM_FAULT_FALLBACK;
Kirill A. Shutemov128ec032013-09-12 15:14:03 -07001344 }
Matthew Wilcox (Oracle)cb196ee2022-05-12 20:23:01 -07001345 return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
Andrea Arcangeli71e3aac2011-01-13 15:46:52 -08001346}
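
/*
 * Rough decision flow of the anonymous PMD fault above (sketch):
 *
 *	unsuitable VMA/order		-> VM_FAULT_FALLBACK (PTE mapping)
 *	read fault, zero page allowed	-> install the shared huge zero folio
 *	otherwise			-> allocate a PMD-order folio with the
 *					   gfp mask from vma_thp_gfp_mask() and
 *					   map it in __do_huge_pmd_anonymous_page()
 */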
1347
Matthew Wilcoxae18d6d2015-09-08 14:59:14 -07001348static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
Oliver O'Halloran3b6521f2017-05-08 15:59:43 -07001349 pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
1350 pgtable_t pgtable)
Matthew Wilcox5cad4652015-09-08 14:58:54 -07001351{
1352 struct mm_struct *mm = vma->vm_mm;
1353 pmd_t entry;
1354 spinlock_t *ptl;
1355
1356 ptl = pmd_lock(mm, pmd);
Aneesh Kumar K.Vc6f3c5e2019-04-05 18:39:10 -07001357 if (!pmd_none(*pmd)) {
1358 if (write) {
1359 if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
1360 WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
1361 goto out_unlock;
1362 }
1363 entry = pmd_mkyoung(*pmd);
1364 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1365 if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
1366 update_mmu_cache_pmd(vma, addr, pmd);
1367 }
1368
1369 goto out_unlock;
1370 }
1371
Dan Williamsf25748e32016-01-15 16:56:43 -08001372 entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
1373 if (pfn_t_devmap(pfn))
1374 entry = pmd_mkdevmap(entry);
Peter Xu3c8e44c2024-08-26 16:43:37 -04001375 else
1376 entry = pmd_mkspecial(entry);
Ross Zwisler01871e52016-01-15 16:56:02 -08001377 if (write) {
Linus Torvaldsf55e1012017-11-29 09:01:01 -08001378 entry = pmd_mkyoung(pmd_mkdirty(entry));
1379 entry = maybe_pmd_mkwrite(entry, vma);
Matthew Wilcox5cad4652015-09-08 14:58:54 -07001380 }
Oliver O'Halloran3b6521f2017-05-08 15:59:43 -07001381
1382 if (pgtable) {
1383 pgtable_trans_huge_deposit(mm, pmd, pgtable);
Kirill A. Shutemovc4812902017-11-15 17:35:37 -08001384 mm_inc_nr_ptes(mm);
Aneesh Kumar K.Vc6f3c5e2019-04-05 18:39:10 -07001385 pgtable = NULL;
Oliver O'Halloran3b6521f2017-05-08 15:59:43 -07001386 }
1387
Ross Zwisler01871e52016-01-15 16:56:02 -08001388 set_pmd_at(mm, addr, pmd, entry);
1389 update_mmu_cache_pmd(vma, addr, pmd);
Aneesh Kumar K.Vc6f3c5e2019-04-05 18:39:10 -07001390
1391out_unlock:
Matthew Wilcox5cad4652015-09-08 14:58:54 -07001392 spin_unlock(ptl);
Aneesh Kumar K.Vc6f3c5e2019-04-05 18:39:10 -07001393 if (pgtable)
1394 pte_free(mm, pgtable);
Matthew Wilcox5cad4652015-09-08 14:58:54 -07001395}
1396
Thomas Hellstrom (VMware)9a9731b2020-03-24 18:48:09 +01001397/**
Lorenzo Stoakes7b806d22023-03-12 23:40:14 +00001398 * vmf_insert_pfn_pmd - insert a pmd size pfn
Thomas Hellstrom (VMware)9a9731b2020-03-24 18:48:09 +01001399 * @vmf: Structure describing the fault
1400 * @pfn: pfn to insert
Thomas Hellstrom (VMware)9a9731b2020-03-24 18:48:09 +01001401 * @write: whether it's a write fault
1402 *
Lorenzo Stoakes7b806d22023-03-12 23:40:14 +00001403 * Insert a pmd size pfn. See vmf_insert_pfn() for additional info.
Thomas Hellstrom (VMware)9a9731b2020-03-24 18:48:09 +01001404 *
1405 * Return: vm_fault_t value.
1406 */
Lorenzo Stoakes7b806d22023-03-12 23:40:14 +00001407vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
Matthew Wilcox5cad4652015-09-08 14:58:54 -07001408{
Dan Williamsfce86ff2019-05-13 17:15:33 -07001409 unsigned long addr = vmf->address & PMD_MASK;
1410 struct vm_area_struct *vma = vmf->vma;
Lorenzo Stoakes7b806d22023-03-12 23:40:14 +00001411 pgprot_t pgprot = vma->vm_page_prot;
Oliver O'Halloran3b6521f2017-05-08 15:59:43 -07001412 pgtable_t pgtable = NULL;
Dan Williamsfce86ff2019-05-13 17:15:33 -07001413
Matthew Wilcox5cad4652015-09-08 14:58:54 -07001414 /*
1415 * If we had pmd_special, we could avoid all these restrictions,
1416 * but we need to be consistent with PTEs and architectures that
1417 * can't support a 'special' bit.
1418 */
Dave Jiange1fb4a02018-08-17 15:43:40 -07001419 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
1420 !pfn_t_devmap(pfn));
Matthew Wilcox5cad4652015-09-08 14:58:54 -07001421 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1422 (VM_PFNMAP|VM_MIXEDMAP));
1423 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
Matthew Wilcox5cad4652015-09-08 14:58:54 -07001424
1425 if (addr < vma->vm_start || addr >= vma->vm_end)
1426 return VM_FAULT_SIGBUS;
Borislav Petkov308a0472016-10-26 19:43:43 +02001427
Oliver O'Halloran3b6521f2017-05-08 15:59:43 -07001428 if (arch_needs_pgtable_deposit()) {
Joel Fernandes (Google)4cf58922019-01-03 15:28:34 -08001429 pgtable = pte_alloc_one(vma->vm_mm);
Oliver O'Halloran3b6521f2017-05-08 15:59:43 -07001430 if (!pgtable)
1431 return VM_FAULT_OOM;
1432 }
1433
Borislav Petkov308a0472016-10-26 19:43:43 +02001434 track_pfn_insert(vma, &pgprot, pfn);
1435
Dan Williamsfce86ff2019-05-13 17:15:33 -07001436 insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
Matthew Wilcoxae18d6d2015-09-08 14:59:14 -07001437 return VM_FAULT_NOPAGE;
Matthew Wilcox5cad4652015-09-08 14:58:54 -07001438}
Lorenzo Stoakes7b806d22023-03-12 23:40:14 +00001439EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
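
/*
 * Usage sketch (hypothetical driver, names are illustrative): a driver
 * that mmap()s a PMD-aligned MMIO or pmem region with VM_PFNMAP could
 * service its ->huge_fault() handler roughly like this, with "bar_phys"
 * being the physical base of the region:
 *
 *	static vm_fault_t mydrv_huge_fault(struct vm_fault *vmf,
 *					   unsigned int order)
 *	{
 *		unsigned long off = (vmf->address & PMD_MASK) -
 *				    vmf->vma->vm_start;
 *		pfn_t pfn = phys_to_pfn_t(bar_phys + off, PFN_DEV);
 *
 *		if (order != PMD_ORDER)
 *			return VM_FAULT_FALLBACK;
 *		return vmf_insert_pfn_pmd(vmf, pfn,
 *					  vmf->flags & FAULT_FLAG_WRITE);
 *	}
 */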
Matthew Wilcox5cad4652015-09-08 14:58:54 -07001440
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08001441#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
Linus Torvaldsf55e1012017-11-29 09:01:01 -08001442static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08001443{
Linus Torvaldsf55e1012017-11-29 09:01:01 -08001444 if (likely(vma->vm_flags & VM_WRITE))
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08001445 pud = pud_mkwrite(pud);
1446 return pud;
1447}
1448
1449static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
Lorenzo Stoakes7b806d22023-03-12 23:40:14 +00001450 pud_t *pud, pfn_t pfn, bool write)
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08001451{
1452 struct mm_struct *mm = vma->vm_mm;
Lorenzo Stoakes7b806d22023-03-12 23:40:14 +00001453 pgprot_t prot = vma->vm_page_prot;
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08001454 pud_t entry;
1455 spinlock_t *ptl;
1456
1457 ptl = pud_lock(mm, pud);
Aneesh Kumar K.Vc6f3c5e2019-04-05 18:39:10 -07001458 if (!pud_none(*pud)) {
1459 if (write) {
Peter Xuef713ec2024-08-26 16:43:36 -04001460 if (WARN_ON_ONCE(pud_pfn(*pud) != pfn_t_to_pfn(pfn)))
Aneesh Kumar K.Vc6f3c5e2019-04-05 18:39:10 -07001461 goto out_unlock;
Aneesh Kumar K.Vc6f3c5e2019-04-05 18:39:10 -07001462 entry = pud_mkyoung(*pud);
1463 entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
1464 if (pudp_set_access_flags(vma, addr, pud, entry, 1))
1465 update_mmu_cache_pud(vma, addr, pud);
1466 }
1467 goto out_unlock;
1468 }
1469
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08001470 entry = pud_mkhuge(pfn_t_pud(pfn, prot));
1471 if (pfn_t_devmap(pfn))
1472 entry = pud_mkdevmap(entry);
Peter Xu3c8e44c2024-08-26 16:43:37 -04001473 else
1474 entry = pud_mkspecial(entry);
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08001475 if (write) {
Linus Torvaldsf55e1012017-11-29 09:01:01 -08001476 entry = pud_mkyoung(pud_mkdirty(entry));
1477 entry = maybe_pud_mkwrite(entry, vma);
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08001478 }
1479 set_pud_at(mm, addr, pud, entry);
1480 update_mmu_cache_pud(vma, addr, pud);
Aneesh Kumar K.Vc6f3c5e2019-04-05 18:39:10 -07001481
1482out_unlock:
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08001483 spin_unlock(ptl);
1484}
1485
Thomas Hellstrom (VMware)9a9731b2020-03-24 18:48:09 +01001486/**
Lorenzo Stoakes7b806d22023-03-12 23:40:14 +00001487 * vmf_insert_pfn_pud - insert a pud size pfn
Thomas Hellstrom (VMware)9a9731b2020-03-24 18:48:09 +01001488 * @vmf: Structure describing the fault
1489 * @pfn: pfn to insert
Thomas Hellstrom (VMware)9a9731b2020-03-24 18:48:09 +01001490 * @write: whether it's a write fault
1491 *
Lorenzo Stoakes7b806d22023-03-12 23:40:14 +00001492 * Insert a pud size pfn. See vmf_insert_pfn() for additional info.
Thomas Hellstrom (VMware)9a9731b2020-03-24 18:48:09 +01001493 *
1494 * Return: vm_fault_t value.
1495 */
Lorenzo Stoakes7b806d22023-03-12 23:40:14 +00001496vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08001497{
Dan Williamsfce86ff2019-05-13 17:15:33 -07001498 unsigned long addr = vmf->address & PUD_MASK;
1499 struct vm_area_struct *vma = vmf->vma;
Lorenzo Stoakes7b806d22023-03-12 23:40:14 +00001500 pgprot_t pgprot = vma->vm_page_prot;
Dan Williamsfce86ff2019-05-13 17:15:33 -07001501
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08001502 /*
1503 * If we had pud_special, we could avoid all these restrictions,
1504 * but we need to be consistent with PTEs and architectures that
1505 * can't support a 'special' bit.
1506 */
Dave Jiang62ec0d82018-09-04 15:46:16 -07001507 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
1508 !pfn_t_devmap(pfn));
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08001509 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1510 (VM_PFNMAP|VM_MIXEDMAP));
1511 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08001512
1513 if (addr < vma->vm_start || addr >= vma->vm_end)
1514 return VM_FAULT_SIGBUS;
1515
1516 track_pfn_insert(vma, &pgprot, pfn);
1517
Lorenzo Stoakes7b806d22023-03-12 23:40:14 +00001518 insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08001519 return VM_FAULT_NOPAGE;
1520}
Lorenzo Stoakes7b806d22023-03-12 23:40:14 +00001521EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08001522#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
1523
Peter Xu4418c522024-03-27 11:23:30 -04001524void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
1525 pmd_t *pmd, bool write)
Dan Williams3565fce2016-01-15 16:56:55 -08001526{
1527 pmd_t _pmd;
1528
Kirill A. Shutemova8f97362017-11-27 06:21:25 +03001529 _pmd = pmd_mkyoung(*pmd);
Miaohe Lina69e4712022-07-04 21:21:50 +08001530 if (write)
Kirill A. Shutemova8f97362017-11-27 06:21:25 +03001531 _pmd = pmd_mkdirty(_pmd);
Dan Williams3565fce2016-01-15 16:56:55 -08001532 if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
Miaohe Lina69e4712022-07-04 21:21:50 +08001533 pmd, _pmd, write))
Dan Williams3565fce2016-01-15 16:56:55 -08001534 update_mmu_cache_pmd(vma, addr, pmd);
1535}
1536
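/*
 * Sketch: follow_devmap_pmd() serves the GUP slow path for pmd_devmap()
 * mappings (fsdax/device-dax).  It hands back the exact subpage for
 * @addr, takes the dev_pagemap reference for the caller via
 * get_dev_pagemap(), and only succeeds when the caller manages page
 * references itself (FOLL_GET or FOLL_PIN).
 */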
1537struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
Keith Buschdf06b372018-10-26 15:10:28 -07001538 pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
Dan Williams3565fce2016-01-15 16:56:55 -08001539{
1540 unsigned long pfn = pmd_pfn(*pmd);
1541 struct mm_struct *mm = vma->vm_mm;
Dan Williams3565fce2016-01-15 16:56:55 -08001542 struct page *page;
Logan Gunthorpe0f089232022-10-21 11:41:08 -06001543 int ret;
Dan Williams3565fce2016-01-15 16:56:55 -08001544
1545 assert_spin_locked(pmd_lockptr(mm, pmd));
1546
Linus Torvaldsf6f37322017-12-15 18:53:22 -08001547 if (flags & FOLL_WRITE && !pmd_write(*pmd))
Dan Williams3565fce2016-01-15 16:56:55 -08001548 return NULL;
1549
1550 if (pmd_present(*pmd) && pmd_devmap(*pmd))
1551 /* pass */;
1552 else
1553 return NULL;
1554
1555 if (flags & FOLL_TOUCH)
Miaohe Lina69e4712022-07-04 21:21:50 +08001556 touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
Dan Williams3565fce2016-01-15 16:56:55 -08001557
1558 /*
1559 * device mapped pages can only be returned if the
1560 * caller will manage the page reference count.
1561 */
John Hubbard3faa52c2020-04-01 21:05:29 -07001562 if (!(flags & (FOLL_GET | FOLL_PIN)))
Dan Williams3565fce2016-01-15 16:56:55 -08001563 return ERR_PTR(-EEXIST);
1564
1565 pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
Keith Buschdf06b372018-10-26 15:10:28 -07001566 *pgmap = get_dev_pagemap(pfn, *pgmap);
1567 if (!*pgmap)
Dan Williams3565fce2016-01-15 16:56:55 -08001568 return ERR_PTR(-EFAULT);
1569 page = pfn_to_page(pfn);
Yang Shif442fa62024-06-28 12:14:58 -07001570 ret = try_grab_folio(page_folio(page), 1, flags);
Logan Gunthorpe0f089232022-10-21 11:41:08 -06001571 if (ret)
1572 page = ERR_PTR(ret);
Dan Williams3565fce2016-01-15 16:56:55 -08001573
1574 return page;
1575}
1576
Andrea Arcangeli71e3aac2011-01-13 15:46:52 -08001577int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1578 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
Peter Xu8f34f1e2021-06-30 18:49:02 -07001579 struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
Andrea Arcangeli71e3aac2011-01-13 15:46:52 -08001580{
Kirill A. Shutemovc4088eb2013-11-14 14:31:04 -08001581 spinlock_t *dst_ptl, *src_ptl;
Andrea Arcangeli71e3aac2011-01-13 15:46:52 -08001582 struct page *src_page;
David Hildenbrand96c772c2023-12-20 23:44:59 +01001583 struct folio *src_folio;
Andrea Arcangeli71e3aac2011-01-13 15:46:52 -08001584 pmd_t pmd;
Matthew Wilcox12c9d702016-02-02 16:57:57 -08001585 pgtable_t pgtable = NULL;
Kirill A. Shutemov628d47c2016-07-26 15:25:42 -07001586 int ret = -ENOMEM;
Andrea Arcangeli71e3aac2011-01-13 15:46:52 -08001587
Peter Xubc02afb2024-08-26 16:43:41 -04001588 pmd = pmdp_get_lockless(src_pmd);
1589 if (unlikely(pmd_special(pmd))) {
1590 dst_ptl = pmd_lock(dst_mm, dst_pmd);
1591 src_ptl = pmd_lockptr(src_mm, src_pmd);
1592 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1593 /*
 1594		 * No need to recheck the pmd; it can't change with the write
 1595		 * mmap lock held here.
 1596		 *
 1597		 * Meanwhile, make sure it's not a CoW VMA with a writable
 1598		 * mapping; otherwise either an anon page wrongly had the
 1599		 * special bit applied, or we made the PRIVATE mapping able
 1600		 * to wrongly write to the backing MMIO.
1601 */
1602 VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd));
1603 goto set_pmd;
1604 }
1605
Kirill A. Shutemov628d47c2016-07-26 15:25:42 -07001606 /* Skip if can be re-fill on fault */
Peter Xu8f34f1e2021-06-30 18:49:02 -07001607 if (!vma_is_anonymous(dst_vma))
Kirill A. Shutemov628d47c2016-07-26 15:25:42 -07001608 return 0;
1609
Joel Fernandes (Google)4cf58922019-01-03 15:28:34 -08001610 pgtable = pte_alloc_one(dst_mm);
Kirill A. Shutemov628d47c2016-07-26 15:25:42 -07001611 if (unlikely(!pgtable))
1612 goto out;
Andrea Arcangeli71e3aac2011-01-13 15:46:52 -08001613
Kirill A. Shutemovc4088eb2013-11-14 14:31:04 -08001614 dst_ptl = pmd_lock(dst_mm, dst_pmd);
1615 src_ptl = pmd_lockptr(src_mm, src_pmd);
1616 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
Andrea Arcangeli71e3aac2011-01-13 15:46:52 -08001617
1618 ret = -EAGAIN;
1619 pmd = *src_pmd;
Zi Yan84c3fc42017-09-08 16:11:01 -07001620
1621#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
1622 if (unlikely(is_swap_pmd(pmd))) {
1623 swp_entry_t entry = pmd_to_swp_entry(pmd);
1624
1625 VM_BUG_ON(!is_pmd_migration_entry(pmd));
David Hildenbrand6c287602022-05-09 18:20:44 -07001626 if (!is_readable_migration_entry(entry)) {
Alistair Popple4dd845b2021-06-30 18:54:09 -07001627 entry = make_readable_migration_entry(
1628 swp_offset(entry));
Zi Yan84c3fc42017-09-08 16:11:01 -07001629 pmd = swp_entry_to_pmd(entry);
Naoya Horiguchiab6e3d02017-09-08 16:11:04 -07001630 if (pmd_swp_soft_dirty(*src_pmd))
1631 pmd = pmd_swp_mksoft_dirty(pmd);
Peter Xu8f34f1e2021-06-30 18:49:02 -07001632 if (pmd_swp_uffd_wp(*src_pmd))
1633 pmd = pmd_swp_mkuffd_wp(pmd);
Zi Yan84c3fc42017-09-08 16:11:01 -07001634 set_pmd_at(src_mm, addr, src_pmd, pmd);
1635 }
Zi Yandd8a67f2017-11-02 15:59:47 -07001636 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
Kirill A. Shutemovaf5b0f62017-11-15 17:35:40 -08001637 mm_inc_nr_ptes(dst_mm);
Zi Yandd8a67f2017-11-02 15:59:47 -07001638 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
Peter Xu8f34f1e2021-06-30 18:49:02 -07001639 if (!userfaultfd_wp(dst_vma))
1640 pmd = pmd_swp_clear_uffd_wp(pmd);
Zi Yan84c3fc42017-09-08 16:11:01 -07001641 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
1642 ret = 0;
1643 goto out_unlock;
1644 }
1645#endif
1646
Kirill A. Shutemov628d47c2016-07-26 15:25:42 -07001647 if (unlikely(!pmd_trans_huge(pmd))) {
Andrea Arcangeli71e3aac2011-01-13 15:46:52 -08001648 pte_free(dst_mm, pgtable);
1649 goto out_unlock;
1650 }
Kirill A. Shutemovfc9fe822012-12-12 13:50:51 -08001651 /*
Kirill A. Shutemovc4088eb2013-11-14 14:31:04 -08001652	 * When the page table lock is held, the huge zero pmd cannot be
Kirill A. Shutemovfc9fe822012-12-12 13:50:51 -08001653	 * under splitting, since we don't split the page itself, only the
 1654	 * pmd into a page table.
1655 */
1656 if (is_huge_zero_pmd(pmd)) {
Kirill A. Shutemov97ae1742012-12-12 13:51:06 -08001657 /*
Matthew Wilcox (Oracle)e28833b2024-03-26 20:28:26 +00001658 * mm_get_huge_zero_folio() will never allocate a new
1659 * folio here, since we already have a zero page to
1660 * copy. It just takes a reference.
Kirill A. Shutemov97ae1742012-12-12 13:51:06 -08001661 */
Matthew Wilcox (Oracle)56917532024-03-26 20:28:25 +00001662 mm_get_huge_zero_folio(dst_mm);
Peter Xu5fc7a5f2021-06-30 18:48:59 -07001663 goto out_zero_page;
Kirill A. Shutemovfc9fe822012-12-12 13:50:51 -08001664 }
Mel Gormande466bd2013-12-18 17:08:42 -08001665
Kirill A. Shutemov628d47c2016-07-26 15:25:42 -07001666 src_page = pmd_page(pmd);
1667 VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
David Hildenbrand96c772c2023-12-20 23:44:59 +01001668 src_folio = page_folio(src_page);
Peter Xud0420352020-09-25 18:26:00 -04001669
David Hildenbrand96c772c2023-12-20 23:44:59 +01001670 folio_get(src_folio);
1671 if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, src_vma))) {
David Hildenbrandfb3d8242022-05-09 18:20:43 -07001672		/* Page may be pinned: split and retry the fault on PTEs. */
David Hildenbrand96c772c2023-12-20 23:44:59 +01001673 folio_put(src_folio);
Peter Xud0420352020-09-25 18:26:00 -04001674 pte_free(dst_mm, pgtable);
1675 spin_unlock(src_ptl);
1676 spin_unlock(dst_ptl);
Peter Xu8f34f1e2021-06-30 18:49:02 -07001677 __split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
Peter Xud0420352020-09-25 18:26:00 -04001678 return -EAGAIN;
1679 }
Kirill A. Shutemov628d47c2016-07-26 15:25:42 -07001680 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
Peter Xu5fc7a5f2021-06-30 18:48:59 -07001681out_zero_page:
Kirill A. Shutemovc4812902017-11-15 17:35:37 -08001682 mm_inc_nr_ptes(dst_mm);
Kirill A. Shutemov628d47c2016-07-26 15:25:42 -07001683 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
Andrea Arcangeli71e3aac2011-01-13 15:46:52 -08001684 pmdp_set_wrprotect(src_mm, addr, src_pmd);
Peter Xu8f34f1e2021-06-30 18:49:02 -07001685 if (!userfaultfd_wp(dst_vma))
1686 pmd = pmd_clear_uffd_wp(pmd);
Peter Xubc02afb2024-08-26 16:43:41 -04001687 pmd = pmd_wrprotect(pmd);
1688set_pmd:
1689 pmd = pmd_mkold(pmd);
Andrea Arcangeli71e3aac2011-01-13 15:46:52 -08001690 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
Andrea Arcangeli71e3aac2011-01-13 15:46:52 -08001691
1692 ret = 0;
1693out_unlock:
Kirill A. Shutemovc4088eb2013-11-14 14:31:04 -08001694 spin_unlock(src_ptl);
1695 spin_unlock(dst_ptl);
Andrea Arcangeli71e3aac2011-01-13 15:46:52 -08001696out:
1697 return ret;
1698}
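
/*
 * In short (sketch of the copy above): fork() does not copy an anonymous
 * THP.  On success both parent and child end up mapping the same folio
 * read-only (pmdp_set_wrprotect() on the source, pmd_wrprotect() on the
 * copy), and the first later write goes through do_huge_pmd_wp_page().
 * If the source folio may be DMA-pinned, folio_try_dup_anon_rmap_pmd()
 * fails, the source PMD is split and the copy is retried at PTE
 * granularity (-EAGAIN).
 */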
1699
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08001700#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
Peter Xu1b167612024-03-27 11:23:29 -04001701void touch_pud(struct vm_area_struct *vma, unsigned long addr,
1702 pud_t *pud, bool write)
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08001703{
1704 pud_t _pud;
1705
Kirill A. Shutemova8f97362017-11-27 06:21:25 +03001706 _pud = pud_mkyoung(*pud);
Miaohe Lin5fe653e2022-07-04 21:21:49 +08001707 if (write)
Kirill A. Shutemova8f97362017-11-27 06:21:25 +03001708 _pud = pud_mkdirty(_pud);
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08001709 if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
Miaohe Lin5fe653e2022-07-04 21:21:49 +08001710 pud, _pud, write))
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08001711 update_mmu_cache_pud(vma, addr, pud);
1712}
1713
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08001714int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1715 pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
1716 struct vm_area_struct *vma)
1717{
1718 spinlock_t *dst_ptl, *src_ptl;
1719 pud_t pud;
1720 int ret;
1721
1722 dst_ptl = pud_lock(dst_mm, dst_pud);
1723 src_ptl = pud_lockptr(src_mm, src_pud);
1724 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1725
1726 ret = -EAGAIN;
1727 pud = *src_pud;
1728 if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
1729 goto out_unlock;
1730
1731 /*
David Hildenbrand96c772c2023-12-20 23:44:59 +01001732 * TODO: once we support anonymous pages, use
1733 * folio_try_dup_anon_rmap_*() and split if duplicating fails.
David Hildenbrandfb3d8242022-05-09 18:20:43 -07001734 */
Peter Xubc02afb2024-08-26 16:43:41 -04001735 if (is_cow_mapping(vma->vm_flags) && pud_write(pud)) {
1736 pudp_set_wrprotect(src_mm, addr, src_pud);
1737 pud = pud_wrprotect(pud);
1738 }
1739 pud = pud_mkold(pud);
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08001740 set_pud_at(dst_mm, addr, dst_pud, pud);
1741
1742 ret = 0;
1743out_unlock:
1744 spin_unlock(src_ptl);
1745 spin_unlock(dst_ptl);
1746 return ret;
1747}
1748
1749void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
1750{
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08001751 bool write = vmf->flags & FAULT_FLAG_WRITE;
1752
1753 vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
1754 if (unlikely(!pud_same(*vmf->pud, orig_pud)))
1755 goto unlock;
1756
Miaohe Lin5fe653e2022-07-04 21:21:49 +08001757 touch_pud(vmf->vma, vmf->address, vmf->pud, write);
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08001758unlock:
1759 spin_unlock(vmf->ptl);
1760}
1761#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
1762
Yang Shi5db4f152021-06-30 18:51:35 -07001763void huge_pmd_set_accessed(struct vm_fault *vmf)
Will Deacona1dd4502012-12-11 16:01:27 -08001764{
Minchan Kim20f664a2017-01-10 16:57:51 -08001765 bool write = vmf->flags & FAULT_FLAG_WRITE;
Will Deacona1dd4502012-12-11 16:01:27 -08001766
Jan Kara82b0f8c2016-12-14 15:06:58 -08001767 vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
Miaohe Lina69e4712022-07-04 21:21:50 +08001768 if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
Will Deacona1dd4502012-12-11 16:01:27 -08001769 goto unlock;
1770
Miaohe Lina69e4712022-07-04 21:21:50 +08001771 touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);
Will Deacona1dd4502012-12-11 16:01:27 -08001772
1773unlock:
Jan Kara82b0f8c2016-12-14 15:06:58 -08001774 spin_unlock(vmf->ptl);
Will Deacona1dd4502012-12-11 16:01:27 -08001775}
1776
Yang Shi5db4f152021-06-30 18:51:35 -07001777vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
Andrea Arcangeli71e3aac2011-01-13 15:46:52 -08001778{
David Hildenbrandc89357e2022-05-09 18:20:45 -07001779 const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
Jan Kara82b0f8c2016-12-14 15:06:58 -08001780 struct vm_area_struct *vma = vmf->vma;
Matthew Wilcox (Oracle)2fad3d12022-09-02 20:46:38 +01001781 struct folio *folio;
Kirill A. Shutemov3917c802020-06-03 16:00:27 -07001782 struct page *page;
Jan Kara82b0f8c2016-12-14 15:06:58 -08001783 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
Yang Shi5db4f152021-06-30 18:51:35 -07001784 pmd_t orig_pmd = vmf->orig_pmd;
Andrea Arcangeli71e3aac2011-01-13 15:46:52 -08001785
Jan Kara82b0f8c2016-12-14 15:06:58 -08001786 vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
Sasha Levin81d1b092014-10-09 15:28:10 -07001787 VM_BUG_ON_VMA(!vma->anon_vma, vma);
Kirill A. Shutemov3917c802020-06-03 16:00:27 -07001788
Kirill A. Shutemov93b47962012-12-12 13:50:54 -08001789 if (is_huge_zero_pmd(orig_pmd))
Kirill A. Shutemov3917c802020-06-03 16:00:27 -07001790 goto fallback;
1791
Jan Kara82b0f8c2016-12-14 15:06:58 -08001792 spin_lock(vmf->ptl);
Kirill A. Shutemov3917c802020-06-03 16:00:27 -07001793
1794 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
1795 spin_unlock(vmf->ptl);
1796 return 0;
1797 }
Andrea Arcangeli71e3aac2011-01-13 15:46:52 -08001798
1799 page = pmd_page(orig_pmd);
Matthew Wilcox (Oracle)2fad3d12022-09-02 20:46:38 +01001800 folio = page_folio(page);
Miaohe Linf6004e72021-05-04 18:34:02 -07001801 VM_BUG_ON_PAGE(!PageHead(page), page);
Kirill A. Shutemov3917c802020-06-03 16:00:27 -07001802
David Hildenbrand6c287602022-05-09 18:20:44 -07001803 /* Early check when only holding the PT lock. */
1804 if (PageAnonExclusive(page))
1805 goto reuse;
1806
Matthew Wilcox (Oracle)2fad3d12022-09-02 20:46:38 +01001807 if (!folio_trylock(folio)) {
1808 folio_get(folio);
Huang Yingba3c4ce2017-09-06 16:22:19 -07001809 spin_unlock(vmf->ptl);
Matthew Wilcox (Oracle)2fad3d12022-09-02 20:46:38 +01001810 folio_lock(folio);
Huang Yingba3c4ce2017-09-06 16:22:19 -07001811 spin_lock(vmf->ptl);
1812 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
Kirill A. Shutemov3917c802020-06-03 16:00:27 -07001813 spin_unlock(vmf->ptl);
Matthew Wilcox (Oracle)2fad3d12022-09-02 20:46:38 +01001814 folio_unlock(folio);
1815 folio_put(folio);
Kirill A. Shutemov3917c802020-06-03 16:00:27 -07001816 return 0;
Huang Yingba3c4ce2017-09-06 16:22:19 -07001817 }
Matthew Wilcox (Oracle)2fad3d12022-09-02 20:46:38 +01001818 folio_put(folio);
Huang Yingba3c4ce2017-09-06 16:22:19 -07001819 }
Kirill A. Shutemov3917c802020-06-03 16:00:27 -07001820
David Hildenbrand6c287602022-05-09 18:20:44 -07001821 /* Recheck after temporarily dropping the PT lock. */
1822 if (PageAnonExclusive(page)) {
Matthew Wilcox (Oracle)2fad3d12022-09-02 20:46:38 +01001823 folio_unlock(folio);
David Hildenbrand6c287602022-05-09 18:20:44 -07001824 goto reuse;
1825 }
1826
Kirill A. Shutemov3917c802020-06-03 16:00:27 -07001827 /*
Matthew Wilcox (Oracle)2fad3d12022-09-02 20:46:38 +01001828 * See do_wp_page(): we can only reuse the folio exclusively if
1829 * there are no additional references. Note that we always drain
Matthew Wilcox (Oracle)1fec68902023-06-21 17:45:56 +01001830 * the LRU cache immediately after adding a THP.
Kirill A. Shutemov3917c802020-06-03 16:00:27 -07001831 */
Matthew Wilcox (Oracle)2fad3d12022-09-02 20:46:38 +01001832 if (folio_ref_count(folio) >
1833 1 + folio_test_swapcache(folio) * folio_nr_pages(folio))
David Hildenbrand3bff7e32022-03-24 18:13:43 -07001834 goto unlock_fallback;
Matthew Wilcox (Oracle)2fad3d12022-09-02 20:46:38 +01001835 if (folio_test_swapcache(folio))
1836 folio_free_swap(folio);
1837 if (folio_ref_count(folio) == 1) {
Andrea Arcangeli71e3aac2011-01-13 15:46:52 -08001838 pmd_t entry;
David Hildenbrand6c54dc62022-05-09 18:20:43 -07001839
David Hildenbrand06968622023-10-02 16:29:48 +02001840 folio_move_anon_rmap(folio, vma);
David Hildenbrand5ca43282023-10-02 16:29:47 +02001841 SetPageAnonExclusive(page);
Matthew Wilcox (Oracle)2fad3d12022-09-02 20:46:38 +01001842 folio_unlock(folio);
David Hildenbrand6c287602022-05-09 18:20:44 -07001843reuse:
David Hildenbrandc89357e2022-05-09 18:20:45 -07001844 if (unlikely(unshare)) {
1845 spin_unlock(vmf->ptl);
1846 return 0;
1847 }
Andrea Arcangeli71e3aac2011-01-13 15:46:52 -08001848 entry = pmd_mkyoung(orig_pmd);
Linus Torvaldsf55e1012017-11-29 09:01:01 -08001849 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
Kirill A. Shutemov3917c802020-06-03 16:00:27 -07001850 if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
Jan Kara82b0f8c2016-12-14 15:06:58 -08001851 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
Jan Kara82b0f8c2016-12-14 15:06:58 -08001852 spin_unlock(vmf->ptl);
David Hildenbrandcb8d8632022-10-21 12:11:35 +02001853 return 0;
Andrea Arcangeli71e3aac2011-01-13 15:46:52 -08001854 }
Kirill A. Shutemov3917c802020-06-03 16:00:27 -07001855
David Hildenbrand3bff7e32022-03-24 18:13:43 -07001856unlock_fallback:
Matthew Wilcox (Oracle)2fad3d12022-09-02 20:46:38 +01001857 folio_unlock(folio);
Jan Kara82b0f8c2016-12-14 15:06:58 -08001858 spin_unlock(vmf->ptl);
Kirill A. Shutemov3917c802020-06-03 16:00:27 -07001859fallback:
1860 __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
1861 return VM_FAULT_FALLBACK;
Andrea Arcangeli71e3aac2011-01-13 15:46:52 -08001862}
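
/*
 * Reuse vs. copy decision of the write/unshare fault above, roughly:
 *
 *	huge zero PMD			-> split, fall back to PTEs
 *	page already PageAnonExclusive	-> reuse: just make the PMD writable
 *	we hold the only reference	-> take exclusive ownership, then reuse
 *	otherwise (shared or pinned)	-> split and fall back, so the retried
 *					   fault copies at PTE granularity
 */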
1863
David Hildenbrandc27f4792022-11-08 18:46:48 +01001864static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
1865 unsigned long addr, pmd_t pmd)
1866{
1867 struct page *page;
1868
1869 if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
1870 return false;
1871
1872 /* Don't touch entries that are not even readable (NUMA hinting). */
1873 if (pmd_protnone(pmd))
1874 return false;
1875
1876 /* Do we need write faults for softdirty tracking? */
Barry Songf38ee282024-06-08 09:13:57 +12001877 if (pmd_needs_soft_dirty_wp(vma, pmd))
David Hildenbrandc27f4792022-11-08 18:46:48 +01001878 return false;
1879
1880 /* Do we need write faults for uffd-wp tracking? */
1881 if (userfaultfd_huge_pmd_wp(vma, pmd))
1882 return false;
1883
1884 if (!(vma->vm_flags & VM_SHARED)) {
1885 /* See can_change_pte_writable(). */
1886 page = vm_normal_page_pmd(vma, addr, pmd);
1887 return page && PageAnon(page) && PageAnonExclusive(page);
1888 }
1889
1890 /* See can_change_pte_writable(). */
1891 return pmd_dirty(pmd);
1892}
1893
Mel Gormand10e63f2012-10-25 14:16:31 +02001894/* NUMA hinting page fault entry point for trans huge pmds */
Yang Shi5db4f152021-06-30 18:51:35 -07001895vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
Mel Gormand10e63f2012-10-25 14:16:31 +02001896{
Jan Kara82b0f8c2016-12-14 15:06:58 -08001897 struct vm_area_struct *vma = vmf->vma;
Kefeng Wang667ffc32023-09-21 15:44:13 +08001898 struct folio *folio;
Jan Kara82b0f8c2016-12-14 15:06:58 -08001899 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
Kefeng Wang667ffc32023-09-21 15:44:13 +08001900 int nid = NUMA_NO_NODE;
Zi Yan727d50a72024-08-09 10:59:06 -04001901 int target_nid, last_cpupid;
1902 pmd_t pmd, old_pmd;
David Hildenbrand4b88c232024-06-20 23:29:34 +02001903 bool writable = false;
Peter Zijlstra6688cc02013-10-07 11:29:24 +01001904 int flags = 0;
Mel Gormand10e63f2012-10-25 14:16:31 +02001905
Jan Kara82b0f8c2016-12-14 15:06:58 -08001906 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
Zi Yan727d50a72024-08-09 10:59:06 -04001907 old_pmd = pmdp_get(vmf->pmd);
1908
1909 if (unlikely(!pmd_same(old_pmd, vmf->orig_pmd))) {
Jan Kara82b0f8c2016-12-14 15:06:58 -08001910 spin_unlock(vmf->ptl);
Zi Yanfd8c35a2024-08-09 10:59:05 -04001911 return 0;
Mel Gormande466bd2013-12-18 17:08:42 -08001912 }
1913
Zi Yan727d50a72024-08-09 10:59:06 -04001914 pmd = pmd_modify(old_pmd, vma->vm_page_prot);
David Hildenbrand6a56ccb2022-11-08 18:46:50 +01001915
1916 /*
1917 * Detect now whether the PMD could be writable; this information
1918 * is only valid while holding the PT lock.
1919 */
1920 writable = pmd_write(pmd);
1921 if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
1922 can_change_pmd_writable(vma, vmf->address, pmd))
1923 writable = true;
1924
Kefeng Wang667ffc32023-09-21 15:44:13 +08001925 folio = vm_normal_folio_pmd(vma, haddr, pmd);
1926 if (!folio)
Yang Shic5b5a3d2021-06-30 18:51:42 -07001927 goto out_map;
1928
Kefeng Wang667ffc32023-09-21 15:44:13 +08001929 nid = folio_nid(folio);
Zi Yan727d50a72024-08-09 10:59:06 -04001930
1931 target_nid = numa_migrate_check(folio, vmf, haddr, &flags, writable,
1932 &last_cpupid);
David Hildenbrandee868142024-06-20 23:29:35 +02001933 if (target_nid == NUMA_NO_NODE)
1934 goto out_map;
1935 if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
1936 flags |= TNF_MIGRATE_FAIL;
Yang Shic5b5a3d2021-06-30 18:51:42 -07001937 goto out_map;
1938 }
David Hildenbrandee868142024-06-20 23:29:35 +02001939 /* The folio is isolated and isolation code holds a folio reference. */
Jan Kara82b0f8c2016-12-14 15:06:58 -08001940 spin_unlock(vmf->ptl);
David Hildenbrand6a56ccb2022-11-08 18:46:50 +01001941 writable = false;
Peter Zijlstra8b1b4362017-06-07 18:05:07 +02001942
David Hildenbrand4b88c232024-06-20 23:29:34 +02001943 if (!migrate_misplaced_folio(folio, vma, target_nid)) {
Peter Zijlstra6688cc02013-10-07 11:29:24 +01001944 flags |= TNF_MIGRATED;
Kefeng Wang667ffc32023-09-21 15:44:13 +08001945 nid = target_nid;
Zi Yanfd8c35a2024-08-09 10:59:05 -04001946 task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
1947 return 0;
Yang Shic5b5a3d2021-06-30 18:51:42 -07001948 }
Mel Gormanb8916632013-10-07 11:28:44 +01001949
Zi Yanfd8c35a2024-08-09 10:59:05 -04001950 flags |= TNF_MIGRATE_FAIL;
1951 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
Zi Yan727d50a72024-08-09 10:59:06 -04001952 if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd))) {
Zi Yanfd8c35a2024-08-09 10:59:05 -04001953 spin_unlock(vmf->ptl);
1954 return 0;
1955 }
Yang Shic5b5a3d2021-06-30 18:51:42 -07001956out_map:
1957 /* Restore the PMD */
Zi Yan727d50a72024-08-09 10:59:06 -04001958 pmd = pmd_modify(pmdp_get(vmf->pmd), vma->vm_page_prot);
Yang Shic5b5a3d2021-06-30 18:51:42 -07001959 pmd = pmd_mkyoung(pmd);
David Hildenbrand6a56ccb2022-11-08 18:46:50 +01001960 if (writable)
Rick Edgecombe161e3932023-06-12 17:10:29 -07001961 pmd = pmd_mkwrite(pmd, vma);
Yang Shic5b5a3d2021-06-30 18:51:42 -07001962 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
1963 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1964 spin_unlock(vmf->ptl);
Zi Yanfd8c35a2024-08-09 10:59:05 -04001965
1966 if (nid != NUMA_NO_NODE)
1967 task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
1968 return 0;
Mel Gormand10e63f2012-10-25 14:16:31 +02001969}
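
/*
 * Sketch of the NUMA hinting fault above: the PMD was made PROT_NONE by
 * the NUMA balancing scanner (task_numa_work()).  On access we look up
 * the folio, ask numa_migrate_check() for a better node, and either
 * migrate the whole THP there or simply re-establish the PMD (keeping
 * the writability detected under the PT lock) and record the fault via
 * task_numa_fault().
 */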
1970
Huang Ying319904a2016-07-28 15:48:03 -07001971/*
1972 * Return true if we do MADV_FREE successfully on entire pmd page.
1973 * Otherwise, return false.
1974 */
1975bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
Minchan Kimb8d3c4c2016-01-15 16:55:42 -08001976 pmd_t *pmd, unsigned long addr, unsigned long next)
Minchan Kimb8d3c4c2016-01-15 16:55:42 -08001977{
1978 spinlock_t *ptl;
1979 pmd_t orig_pmd;
Kefeng Wangfc986a32022-12-07 10:34:30 +08001980 struct folio *folio;
Minchan Kimb8d3c4c2016-01-15 16:55:42 -08001981 struct mm_struct *mm = tlb->mm;
Huang Ying319904a2016-07-28 15:48:03 -07001982 bool ret = false;
Minchan Kimb8d3c4c2016-01-15 16:55:42 -08001983
Peter Zijlstraed6a7932018-08-31 14:46:08 +02001984 tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
Aneesh Kumar K.V07e32662016-12-12 16:42:40 -08001985
Kirill A. Shutemovb6ec57f2016-01-21 16:40:25 -08001986 ptl = pmd_trans_huge_lock(pmd, vma);
1987 if (!ptl)
Linus Torvalds25eedab2016-01-17 18:33:15 -08001988 goto out_unlocked;
Minchan Kimb8d3c4c2016-01-15 16:55:42 -08001989
1990 orig_pmd = *pmd;
Huang Ying319904a2016-07-28 15:48:03 -07001991 if (is_huge_zero_pmd(orig_pmd))
Minchan Kimb8d3c4c2016-01-15 16:55:42 -08001992 goto out;
Minchan Kimb8d3c4c2016-01-15 16:55:42 -08001993
Zi Yan84c3fc42017-09-08 16:11:01 -07001994 if (unlikely(!pmd_present(orig_pmd))) {
1995 VM_BUG_ON(thp_migration_supported() &&
1996 !is_pmd_migration_entry(orig_pmd));
1997 goto out;
1998 }
1999
Matthew Wilcox (Oracle)e06d03d2024-03-26 20:28:23 +00002000 folio = pmd_folio(orig_pmd);
Minchan Kimb8d3c4c2016-01-15 16:55:42 -08002001 /*
Kefeng Wangfc986a32022-12-07 10:34:30 +08002002	 * If other processes are mapping this folio, we cannot discard
 2003	 * it unless they all do MADV_FREE, so let's skip the folio.
Minchan Kimb8d3c4c2016-01-15 16:55:42 -08002004 */
David Hildenbrandebb34f72024-02-27 21:15:48 +01002005 if (folio_likely_mapped_shared(folio))
Minchan Kimb8d3c4c2016-01-15 16:55:42 -08002006 goto out;
2007
Kefeng Wangfc986a32022-12-07 10:34:30 +08002008 if (!folio_trylock(folio))
Minchan Kimb8d3c4c2016-01-15 16:55:42 -08002009 goto out;
2010
2011 /*
 2012	 * If the user wants to discard only part of the THP, split it so
 2013	 * MADV_FREE will deactivate just those pages.
2014 */
2015 if (next - addr != HPAGE_PMD_SIZE) {
Kefeng Wangfc986a32022-12-07 10:34:30 +08002016 folio_get(folio);
Minchan Kimb8d3c4c2016-01-15 16:55:42 -08002017 spin_unlock(ptl);
Kefeng Wangfc986a32022-12-07 10:34:30 +08002018 split_folio(folio);
2019 folio_unlock(folio);
2020 folio_put(folio);
Minchan Kimb8d3c4c2016-01-15 16:55:42 -08002021 goto out_unlocked;
2022 }
2023
Kefeng Wangfc986a32022-12-07 10:34:30 +08002024 if (folio_test_dirty(folio))
2025 folio_clear_dirty(folio);
2026 folio_unlock(folio);
Minchan Kimb8d3c4c2016-01-15 16:55:42 -08002027
Minchan Kimb8d3c4c2016-01-15 16:55:42 -08002028 if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
Kirill A. Shutemov58ceeb62017-04-13 14:56:26 -07002029 pmdp_invalidate(vma, addr, pmd);
Minchan Kimb8d3c4c2016-01-15 16:55:42 -08002030 orig_pmd = pmd_mkold(orig_pmd);
2031 orig_pmd = pmd_mkclean(orig_pmd);
2032
2033 set_pmd_at(mm, addr, pmd, orig_pmd);
2034 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
2035 }
Shaohua Li802a3a92017-05-03 14:52:32 -07002036
Kefeng Wang6a6fe9e2022-12-09 10:06:18 +08002037 folio_mark_lazyfree(folio);
Huang Ying319904a2016-07-28 15:48:03 -07002038 ret = true;
Minchan Kimb8d3c4c2016-01-15 16:55:42 -08002039out:
2040 spin_unlock(ptl);
2041out_unlocked:
2042 return ret;
2043}
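
/*
 * Userspace view, for illustration: MADV_FREE over a PMD-aligned,
 * PMD-sized anonymous range ends up here and lazily frees the whole THP.
 * A minimal sketch, with addr/len assumed HPAGE_PMD_SIZE aligned:
 *
 *	#include <sys/mman.h>
 *	...
 *	madvise(addr, len, MADV_FREE);
 *
 * The folio is marked lazyfree and clean; it is only reclaimed under
 * memory pressure, and a later write to the range keeps the data.  A
 * partial range instead splits the THP so only the covered pages are
 * affected.
 */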
2044
Aneesh Kumar K.V953c66c2016-12-12 16:44:32 -08002045static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
2046{
2047 pgtable_t pgtable;
2048
2049 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2050 pte_free(mm, pgtable);
Kirill A. Shutemovc4812902017-11-15 17:35:37 -08002051 mm_dec_nr_ptes(mm);
Aneesh Kumar K.V953c66c2016-12-12 16:44:32 -08002052}
2053
Andrea Arcangeli71e3aac2011-01-13 15:46:52 -08002054int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
Shaohua Lif21760b2012-01-12 17:19:16 -08002055 pmd_t *pmd, unsigned long addr)
Andrea Arcangeli71e3aac2011-01-13 15:46:52 -08002056{
Kirill A. Shutemovda146762015-09-08 14:59:31 -07002057 pmd_t orig_pmd;
Kirill A. Shutemovbf929152013-11-14 14:30:54 -08002058 spinlock_t *ptl;
Andrea Arcangeli71e3aac2011-01-13 15:46:52 -08002059
Peter Zijlstraed6a7932018-08-31 14:46:08 +02002060 tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
Aneesh Kumar K.V07e32662016-12-12 16:42:40 -08002061
Kirill A. Shutemovb6ec57f2016-01-21 16:40:25 -08002062 ptl = __pmd_trans_huge_lock(pmd, vma);
2063 if (!ptl)
Kirill A. Shutemovda146762015-09-08 14:59:31 -07002064 return 0;
2065 /*
 2066	 * For architectures like ppc64, we look at the deposited pgtable
 2067	 * when calling pmdp_huge_get_and_clear, so do the
 2068	 * pgtable_trans_huge_withdraw only after finishing the
 2069	 * pmdp-related operations.
2070 */
Aneesh Kumar K.V93a98692020-05-05 12:47:28 +05302071 orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
2072 tlb->fullmm);
Rick Edgecombee5136e82023-06-12 17:10:43 -07002073 arch_check_zapped_pmd(vma, orig_pmd);
Kirill A. Shutemovda146762015-09-08 14:59:31 -07002074 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
Thomas Hellstrom (VMware)2484ca92020-03-24 18:47:17 +01002075 if (vma_is_special_huge(vma)) {
Oliver O'Halloran3b6521f2017-05-08 15:59:43 -07002076 if (arch_needs_pgtable_deposit())
2077 zap_deposited_table(tlb->mm, pmd);
Kirill A. Shutemovda146762015-09-08 14:59:31 -07002078 spin_unlock(ptl);
Kirill A. Shutemovda146762015-09-08 14:59:31 -07002079 } else if (is_huge_zero_pmd(orig_pmd)) {
Oliver O'Halloranc14a6eb2017-05-08 15:59:40 -07002080 zap_deposited_table(tlb->mm, pmd);
Kirill A. Shutemovda146762015-09-08 14:59:31 -07002081 spin_unlock(ptl);
Kirill A. Shutemovda146762015-09-08 14:59:31 -07002082 } else {
Kefeng Wang0103b272024-01-11 15:24:25 +00002083 struct folio *folio = NULL;
Zi Yan616b8372017-09-08 16:10:57 -07002084 int flush_needed = 1;
2085
2086 if (pmd_present(orig_pmd)) {
Kefeng Wang0103b272024-01-11 15:24:25 +00002087 struct page *page = pmd_page(orig_pmd);
2088
2089 folio = page_folio(page);
2090 folio_remove_rmap_pmd(folio, page, vma);
David Hildenbrand0a7bda42024-04-09 21:22:51 +02002091 WARN_ON_ONCE(folio_mapcount(folio) < 0);
Zi Yan616b8372017-09-08 16:10:57 -07002092 VM_BUG_ON_PAGE(!PageHead(page), page);
2093 } else if (thp_migration_supported()) {
2094 swp_entry_t entry;
2095
2096 VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
2097 entry = pmd_to_swp_entry(orig_pmd);
Kefeng Wang0103b272024-01-11 15:24:25 +00002098 folio = pfn_swap_entry_folio(entry);
Zi Yan616b8372017-09-08 16:10:57 -07002099 flush_needed = 0;
2100 } else
2101 WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
2102
Kefeng Wang0103b272024-01-11 15:24:25 +00002103 if (folio_test_anon(folio)) {
Oliver O'Halloranc14a6eb2017-05-08 15:59:40 -07002104 zap_deposited_table(tlb->mm, pmd);
Kirill A. Shutemovb5072382016-07-26 15:25:34 -07002105 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
2106 } else {
Aneesh Kumar K.V953c66c2016-12-12 16:44:32 -08002107 if (arch_needs_pgtable_deposit())
2108 zap_deposited_table(tlb->mm, pmd);
Kefeng Wang6b27cc6c2024-01-11 15:24:29 +00002109 add_mm_counter(tlb->mm, mm_counter_file(folio),
Kefeng Wang0103b272024-01-11 15:24:25 +00002110 -HPAGE_PMD_NR);
Kirill A. Shutemovb5072382016-07-26 15:25:34 -07002111 }
Zi Yan616b8372017-09-08 16:10:57 -07002112
Kirill A. Shutemovda146762015-09-08 14:59:31 -07002113 spin_unlock(ptl);
Zi Yan616b8372017-09-08 16:10:57 -07002114 if (flush_needed)
Kefeng Wang0103b272024-01-11 15:24:25 +00002115 tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
Naoya Horiguchi025c5b22012-03-21 16:33:57 -07002116 }
Kirill A. Shutemovda146762015-09-08 14:59:31 -07002117 return 1;
Andrea Arcangeli71e3aac2011-01-13 15:46:52 -08002118}
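
/*
 * Sketch of the teardown above: zap_huge_pmd() is reached from
 * zap_pmd_range() (munmap, exit_mmap, MADV_DONTNEED, truncation) and
 * clears the whole PMD at once: special/zero-page mappings only drop the
 * deposited page table, present folios additionally lose their rmap and
 * RSS accounting and are flushed, and migration entries need just the
 * bookkeeping side.
 */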
2119
Aneesh Kumar K.V1dd38b62016-12-12 16:44:29 -08002120#ifndef pmd_move_must_withdraw
2121static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
2122 spinlock_t *old_pmd_ptl,
2123 struct vm_area_struct *vma)
2124{
2125 /*
 2126	 * With split pmd locks we also need to move the preallocated
 2127	 * PTE page table if new_pmd is on a different PMD page table.
2128 *
2129 * We also don't deposit and withdraw tables for file pages.
2130 */
2131 return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
2132}
2133#endif
2134
Naoya Horiguchiab6e3d02017-09-08 16:11:04 -07002135static pmd_t move_soft_dirty_pmd(pmd_t pmd)
2136{
2137#ifdef CONFIG_MEM_SOFT_DIRTY
2138 if (unlikely(is_pmd_migration_entry(pmd)))
2139 pmd = pmd_swp_mksoft_dirty(pmd);
2140 else if (pmd_present(pmd))
2141 pmd = pmd_mksoft_dirty(pmd);
2142#endif
2143 return pmd;
2144}
2145
Hugh Dickinsbf8616d2016-05-19 17:12:54 -07002146bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
Wei Yangb8aa9d92020-08-06 23:23:40 -07002147 unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
Andrea Arcangeli37a1c492011-10-31 17:08:30 -07002148{
Kirill A. Shutemovbf929152013-11-14 14:30:54 -08002149 spinlock_t *old_ptl, *new_ptl;
Andrea Arcangeli37a1c492011-10-31 17:08:30 -07002150 pmd_t pmd;
Andrea Arcangeli37a1c492011-10-31 17:08:30 -07002151 struct mm_struct *mm = vma->vm_mm;
Aaron Lu5d190422016-11-10 17:16:33 +08002152 bool force_flush = false;
Andrea Arcangeli37a1c492011-10-31 17:08:30 -07002153
Andrea Arcangeli37a1c492011-10-31 17:08:30 -07002154 /*
2155 * The destination pmd shouldn't be established, free_pgtables()
Hugh Dickinsa5be6212023-06-08 18:32:47 -07002156 * should have released it; but move_page_tables() might have already
2157 * inserted a page table, if racing against shmem/file collapse.
Andrea Arcangeli37a1c492011-10-31 17:08:30 -07002158 */
Hugh Dickinsa5be6212023-06-08 18:32:47 -07002159 if (!pmd_none(*new_pmd)) {
Andrea Arcangeli37a1c492011-10-31 17:08:30 -07002160 VM_BUG_ON(pmd_trans_huge(*new_pmd));
Kirill A. Shutemov4b471e82016-01-15 16:53:39 -08002161 return false;
Andrea Arcangeli37a1c492011-10-31 17:08:30 -07002162 }
2163
Kirill A. Shutemovbf929152013-11-14 14:30:54 -08002164 /*
2165 * We don't have to worry about the ordering of src and dst
Michel Lespinassec1e8d7c2020-06-08 21:33:54 -07002166 * ptlocks because exclusive mmap_lock prevents deadlock.
Kirill A. Shutemovbf929152013-11-14 14:30:54 -08002167 */
Kirill A. Shutemovb6ec57f2016-01-21 16:40:25 -08002168 old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
2169 if (old_ptl) {
Kirill A. Shutemovbf929152013-11-14 14:30:54 -08002170 new_ptl = pmd_lockptr(mm, new_pmd);
2171 if (new_ptl != old_ptl)
2172 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
Aneesh Kumar K.V8809aa22015-06-24 16:57:44 -07002173 pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
Linus Torvaldseb66ae02018-10-12 15:22:59 -07002174 if (pmd_present(pmd))
Aaron Lua2ce2662016-11-29 13:27:31 +08002175 force_flush = true;
Naoya Horiguchi025c5b22012-03-21 16:33:57 -07002176 VM_BUG_ON(!pmd_none(*new_pmd));
Kirill A. Shutemov35928062013-12-12 17:12:33 -08002177
Aneesh Kumar K.V1dd38b62016-12-12 16:44:29 -08002178 if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
Aneesh Kumar K.Vb3084f42014-01-13 11:34:24 +05302179 pgtable_t pgtable;
Kirill A. Shutemov35928062013-12-12 17:12:33 -08002180 pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
2181 pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
Kirill A. Shutemov35928062013-12-12 17:12:33 -08002182 }
Naoya Horiguchiab6e3d02017-09-08 16:11:04 -07002183 pmd = move_soft_dirty_pmd(pmd);
2184 set_pmd_at(mm, new_addr, new_pmd, pmd);
Aaron Lu5d190422016-11-10 17:16:33 +08002185 if (force_flush)
Miaohe Lin7c38f182022-07-04 21:21:46 +08002186 flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
Linus Torvaldseb66ae02018-10-12 15:22:59 -07002187 if (new_ptl != old_ptl)
2188 spin_unlock(new_ptl);
Kirill A. Shutemovbf929152013-11-14 14:30:54 -08002189 spin_unlock(old_ptl);
Kirill A. Shutemov4b471e82016-01-15 16:53:39 -08002190 return true;
Andrea Arcangeli37a1c492011-10-31 17:08:30 -07002191 }
Kirill A. Shutemov4b471e82016-01-15 16:53:39 -08002192 return false;
Andrea Arcangeli37a1c492011-10-31 17:08:30 -07002193}
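
/*
 * For illustration: this lets mremap() relocate a THP-backed range by
 * moving a single PMD entry instead of 512 individual PTEs, when source
 * and destination are both PMD-aligned.  A rough userspace sketch:
 *
 *	#include <sys/mman.h>
 *	...
 *	new = mremap(old, len, len, MREMAP_MAYMOVE);
 *
 * When the alignment does not work out, move_page_tables() falls back to
 * moving individual page table entries (splitting the PMD mapping if
 * needed).
 */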
2194
Mel Gormanf123d742013-10-07 11:28:49 +01002195/*
2196 * Returns
2197 * - 0 if PMD could not be locked
Ingo Molnarf0953a12021-05-06 18:06:47 -07002198 * - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
Yang Shie346e662021-06-30 18:51:55 -07002199 * or if prot_numa but THP migration is not supported
Ingo Molnarf0953a12021-05-06 18:06:47 -07002200 * - HPAGE_PMD_NR if protections changed and TLB flush necessary
Mel Gormanf123d742013-10-07 11:28:49 +01002201 */
Nadav Amit4a184192022-05-09 18:20:50 -07002202int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
2203 pmd_t *pmd, unsigned long addr, pgprot_t newprot,
2204 unsigned long cp_flags)
Johannes Weinercd7548a2011-01-13 15:47:04 -08002205{
2206 struct mm_struct *mm = vma->vm_mm;
Kirill A. Shutemovbf929152013-11-14 14:30:54 -08002207 spinlock_t *ptl;
Nadav Amitc9fe6652022-05-09 18:20:50 -07002208 pmd_t oldpmd, entry;
Peter Xu58705442020-04-06 20:05:45 -07002209 bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
Peter Xu292924b2020-04-06 20:05:49 -07002210 bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
2211 bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
David Hildenbrand6a56ccb2022-11-08 18:46:50 +01002212 int ret = 1;
Johannes Weinercd7548a2011-01-13 15:47:04 -08002213
Nadav Amit4a184192022-05-09 18:20:50 -07002214 tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
2215
Yang Shie346e662021-06-30 18:51:55 -07002216 if (prot_numa && !thp_migration_supported())
2217 return 1;
2218
Kirill A. Shutemovb6ec57f2016-01-21 16:40:25 -08002219 ptl = __pmd_trans_huge_lock(pmd, vma);
Kirill A. Shutemov0a85e51d2017-04-13 14:56:17 -07002220 if (!ptl)
2221 return 0;
Mel Gormane944fd62015-02-12 14:58:35 -08002222
Zi Yan84c3fc42017-09-08 16:11:01 -07002223#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
2224 if (is_swap_pmd(*pmd)) {
2225 swp_entry_t entry = pmd_to_swp_entry(*pmd);
Matthew Wilcox (Oracle)56624002024-01-11 15:24:20 +00002226 struct folio *folio = pfn_swap_entry_folio(entry);
David Hildenbrand24bf08c2023-04-05 18:02:35 +02002227 pmd_t newpmd;
Zi Yan84c3fc42017-09-08 16:11:01 -07002228
2229 VM_BUG_ON(!is_pmd_migration_entry(*pmd));
Alistair Popple4dd845b2021-06-30 18:54:09 -07002230 if (is_writable_migration_entry(entry)) {
Zi Yan84c3fc42017-09-08 16:11:01 -07002231 /*
 2232	 * A protection check is difficult, so
 2233	 * just be safe and disable write access.
2234 */
Kefeng Wangd986ba22023-10-18 22:07:57 +08002235 if (folio_test_anon(folio))
David Hildenbrand6c287602022-05-09 18:20:44 -07002236 entry = make_readable_exclusive_migration_entry(swp_offset(entry));
2237 else
2238 entry = make_readable_migration_entry(swp_offset(entry));
Zi Yan84c3fc42017-09-08 16:11:01 -07002239 newpmd = swp_entry_to_pmd(entry);
Naoya Horiguchiab6e3d02017-09-08 16:11:04 -07002240 if (pmd_swp_soft_dirty(*pmd))
2241 newpmd = pmd_swp_mksoft_dirty(newpmd);
David Hildenbrand24bf08c2023-04-05 18:02:35 +02002242 } else {
2243 newpmd = *pmd;
Zi Yan84c3fc42017-09-08 16:11:01 -07002244 }
David Hildenbrand24bf08c2023-04-05 18:02:35 +02002245
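		/*
		 * Apply or clear the uffd-wp marker on the non-present
		 * (migration) entry as requested, so it is preserved when the
		 * entry is later restored.
		 */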
2246 if (uffd_wp)
2247 newpmd = pmd_swp_mkuffd_wp(newpmd);
2248 else if (uffd_wp_resolve)
2249 newpmd = pmd_swp_clear_uffd_wp(newpmd);
2250 if (!pmd_same(*pmd, newpmd))
2251 set_pmd_at(mm, addr, pmd, newpmd);
Zi Yan84c3fc42017-09-08 16:11:01 -07002252 goto unlock;
2253 }
2254#endif
2255
Huang Yinga1a3a2f2022-03-22 14:46:27 -07002256 if (prot_numa) {
Kefeng Wangd986ba22023-10-18 22:07:57 +08002257 struct folio *folio;
Huang Ying33024532022-07-13 16:39:51 +08002258 bool toptier;
Huang Yinga1a3a2f2022-03-22 14:46:27 -07002259 /*
2260 * Avoid trapping faults against the zero page. The read-only
2261 * data is likely to be read-cached on the local CPU and
2262 * local/remote hits to the zero page are not interesting.
2263 */
2264 if (is_huge_zero_pmd(*pmd))
2265 goto unlock;
Johannes Weinercd7548a2011-01-13 15:47:04 -08002266
Huang Yinga1a3a2f2022-03-22 14:46:27 -07002267 if (pmd_protnone(*pmd))
2268 goto unlock;
Kirill A. Shutemov0a85e51d2017-04-13 14:56:17 -07002269
Matthew Wilcox (Oracle)e06d03d2024-03-26 20:28:23 +00002270 folio = pmd_folio(*pmd);
Kefeng Wangd986ba22023-10-18 22:07:57 +08002271 toptier = node_is_toptier(folio_nid(folio));
Huang Yinga1a3a2f2022-03-22 14:46:27 -07002272 /*
2273		 * Skip scanning a top-tier node if normal NUMA
2274		 * balancing is disabled.
2275 */
2276 if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
Huang Ying33024532022-07-13 16:39:51 +08002277 toptier)
Huang Yinga1a3a2f2022-03-22 14:46:27 -07002278 goto unlock;
Huang Ying33024532022-07-13 16:39:51 +08002279
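		/*
		 * Record the scan time on the folio; memory tiering uses it
		 * later to estimate how recently the folio was accessed.
		 */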
Zi Yan2a287132024-07-24 09:01:14 -04002280 if (folio_use_access_time(folio))
Kefeng Wangd986ba22023-10-18 22:07:57 +08002281 folio_xchg_access_time(folio,
2282 jiffies_to_msecs(jiffies));
Huang Yinga1a3a2f2022-03-22 14:46:27 -07002283 }
Kirill A. Shutemovced10802017-04-13 14:56:20 -07002284 /*
Michel Lespinasse3e4e28c2020-06-08 21:33:51 -07002285	 * In the prot_numa case we are under mmap_read_lock(mm). It's critical
Kirill A. Shutemovced10802017-04-13 14:56:20 -07002286	 * not to clear the pmd intermittently, to avoid racing with MADV_DONTNEED,
Michel Lespinasse3e4e28c2020-06-08 21:33:51 -07002287	 * which also runs under mmap_read_lock(mm):
Kirill A. Shutemovced10802017-04-13 14:56:20 -07002288 *
2289 * CPU0: CPU1:
2290 * change_huge_pmd(prot_numa=1)
2291 * pmdp_huge_get_and_clear_notify()
2292 * madvise_dontneed()
2293 * zap_pmd_range()
2294 * pmd_trans_huge(*pmd) == 0 (without ptl)
2295 * // skip the pmd
2296 * set_pmd_at();
2297 * // pmd is re-established
2298 *
2299	 * The race makes MADV_DONTNEED miss the huge pmd and not clear it,
2300	 * which may break userspace.
2301 *
Nadav Amit4f831452022-05-09 18:20:50 -07002302 * pmdp_invalidate_ad() is required to make sure we don't miss
Kirill A. Shutemovced10802017-04-13 14:56:20 -07002303 * dirty/young flags set by hardware.
2304 */
Nadav Amit4f831452022-05-09 18:20:50 -07002305 oldpmd = pmdp_invalidate_ad(vma, addr, pmd);
Kirill A. Shutemovced10802017-04-13 14:56:20 -07002306
Nadav Amitc9fe6652022-05-09 18:20:50 -07002307 entry = pmd_modify(oldpmd, newprot);
Peter Xuf1eb1ba2022-12-14 15:15:33 -05002308 if (uffd_wp)
Peter Xu292924b2020-04-06 20:05:49 -07002309 entry = pmd_mkuffd_wp(entry);
Peter Xuf1eb1ba2022-12-14 15:15:33 -05002310 else if (uffd_wp_resolve)
Peter Xu292924b2020-04-06 20:05:49 -07002311 /*
2312 * Leave the write bit to be handled by PF interrupt
2313 * handler, then things like COW could be properly
2314 * handled.
2315 */
2316 entry = pmd_clear_uffd_wp(entry);
David Hildenbrandc27f4792022-11-08 18:46:48 +01002317
2318 /* See change_pte_range(). */
2319 if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
2320 can_change_pmd_writable(vma, addr, entry))
Rick Edgecombe161e3932023-06-12 17:10:29 -07002321 entry = pmd_mkwrite(entry, vma);
David Hildenbrandc27f4792022-11-08 18:46:48 +01002322
Kirill A. Shutemov0a85e51d2017-04-13 14:56:17 -07002323 ret = HPAGE_PMD_NR;
2324 set_pmd_at(mm, addr, pmd, entry);
Nadav Amit4a184192022-05-09 18:20:50 -07002325
Nadav Amitc9fe6652022-05-09 18:20:50 -07002326 if (huge_pmd_needs_flush(oldpmd, entry))
2327 tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
Kirill A. Shutemov0a85e51d2017-04-13 14:56:17 -07002328unlock:
2329 spin_unlock(ptl);
Johannes Weinercd7548a2011-01-13 15:47:04 -08002330 return ret;
2331}
2332
Peter Xucb0f01b2024-08-12 14:12:25 -04002333/*
2334 * Returns:
2335 *
2336 * - 0: if pud leaf changed from under us
2337 * - 1: if pud can be skipped
2338 * - HPAGE_PUD_NR: if pud was successfully processed
2339 */
2340#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
2341int change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
2342 pud_t *pudp, unsigned long addr, pgprot_t newprot,
2343 unsigned long cp_flags)
2344{
2345 struct mm_struct *mm = vma->vm_mm;
2346 pud_t oldpud, entry;
2347 spinlock_t *ptl;
2348
2349 tlb_change_page_size(tlb, HPAGE_PUD_SIZE);
2350
2351 /* NUMA balancing doesn't apply to dax */
2352 if (cp_flags & MM_CP_PROT_NUMA)
2353 return 1;
2354
2355 /*
2356	 * Huge entries under userfault-wp only work with anonymous memory, and
2357	 * we don't have anonymous PUDs yet.
2358 */
2359 if (WARN_ON_ONCE(cp_flags & MM_CP_UFFD_WP_ALL))
2360 return 1;
2361
2362 ptl = __pud_trans_huge_lock(pudp, vma);
2363 if (!ptl)
2364 return 0;
2365
2366 /*
2367 * Can't clear PUD or it can race with concurrent zapping. See
2368 * change_huge_pmd().
2369 */
2370 oldpud = pudp_invalidate(vma, addr, pudp);
2371 entry = pud_modify(oldpud, newprot);
2372 set_pud_at(mm, addr, pudp, entry);
2373 tlb_flush_pud_range(tlb, addr, HPAGE_PUD_SIZE);
2374
2375 spin_unlock(ptl);
2376 return HPAGE_PUD_NR;
2377}
2378#endif
2379
Andrea Arcangeliadef4402023-12-06 02:36:56 -08002380#ifdef CONFIG_USERFAULTFD
2381/*
Lokesh Gidra867a43a2024-02-15 10:27:56 -08002382 * The PT lock for src_pmd and the dst_vma/src_vma locks (for reading) are
Andrea Arcangeliadef4402023-12-06 02:36:56 -08002383 * taken by the caller, but this function must release the page table lock
2384 * before returning. Just move the page from src_pmd to dst_pmd if possible.
2385 * Returns zero if the page was moved, -EAGAIN if the operation needs to be
2386 * repeated by the caller, or another error code on failure.
2387 */
2388int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
2389 struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
2390 unsigned long dst_addr, unsigned long src_addr)
2391{
2392 pmd_t _dst_pmd, src_pmdval;
2393 struct page *src_page;
2394 struct folio *src_folio;
2395 struct anon_vma *src_anon_vma;
2396 spinlock_t *src_ptl, *dst_ptl;
2397 pgtable_t src_pgtable;
2398 struct mmu_notifier_range range;
2399 int err = 0;
2400
2401 src_pmdval = *src_pmd;
2402 src_ptl = pmd_lockptr(mm, src_pmd);
2403
2404 lockdep_assert_held(src_ptl);
Lokesh Gidra867a43a2024-02-15 10:27:56 -08002405 vma_assert_locked(src_vma);
2406 vma_assert_locked(dst_vma);
Andrea Arcangeliadef4402023-12-06 02:36:56 -08002407
2408 /* Sanity checks before the operation */
2409 if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) ||
2410 WARN_ON_ONCE(dst_addr & ~HPAGE_PMD_MASK)) {
2411 spin_unlock(src_ptl);
2412 return -EINVAL;
2413 }
2414
2415 if (!pmd_trans_huge(src_pmdval)) {
2416 spin_unlock(src_ptl);
2417 if (is_pmd_migration_entry(src_pmdval)) {
2418 pmd_migration_entry_wait(mm, &src_pmdval);
2419 return -EAGAIN;
2420 }
2421 return -ENOENT;
2422 }
2423
2424 src_page = pmd_page(src_pmdval);
Andrea Arcangeliadef4402023-12-06 02:36:56 -08002425
Suren Baghdasaryaneb1521d2024-01-31 09:56:18 -08002426 if (!is_huge_zero_pmd(src_pmdval)) {
2427 if (unlikely(!PageAnonExclusive(src_page))) {
2428 spin_unlock(src_ptl);
2429 return -EBUSY;
2430 }
2431
2432 src_folio = page_folio(src_page);
2433 folio_get(src_folio);
2434 } else
2435 src_folio = NULL;
2436
Andrea Arcangeliadef4402023-12-06 02:36:56 -08002437 spin_unlock(src_ptl);
2438
2439 flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE);
2440 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, src_addr,
2441 src_addr + HPAGE_PMD_SIZE);
2442 mmu_notifier_invalidate_range_start(&range);
2443
Suren Baghdasaryaneb1521d2024-01-31 09:56:18 -08002444 if (src_folio) {
2445 folio_lock(src_folio);
Andrea Arcangeliadef4402023-12-06 02:36:56 -08002446
Suren Baghdasaryaneb1521d2024-01-31 09:56:18 -08002447 /*
2448 * split_huge_page walks the anon_vma chain without the page
2449		 * lock. Serialize against it with the anon_vma lock; the page
2450		 * lock is not enough.
2451 */
2452 src_anon_vma = folio_get_anon_vma(src_folio);
2453 if (!src_anon_vma) {
2454 err = -EAGAIN;
2455 goto unlock_folio;
2456 }
2457 anon_vma_lock_write(src_anon_vma);
2458 } else
2459 src_anon_vma = NULL;
Andrea Arcangeliadef4402023-12-06 02:36:56 -08002460
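	/*
	 * Take both PT locks and recheck that neither pmd changed while the
	 * locks were not held; if either did, the caller must retry.
	 */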
2461 dst_ptl = pmd_lockptr(mm, dst_pmd);
2462 double_pt_lock(src_ptl, dst_ptl);
2463 if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
2464 !pmd_same(*dst_pmd, dst_pmdval))) {
2465 err = -EAGAIN;
2466 goto unlock_ptls;
2467 }
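	/*
	 * Only an exclusively mapped, unpinned anonymous folio can be
	 * moved; otherwise fail with -EBUSY.
	 */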
Suren Baghdasaryaneb1521d2024-01-31 09:56:18 -08002468 if (src_folio) {
2469 if (folio_maybe_dma_pinned(src_folio) ||
2470 !PageAnonExclusive(&src_folio->page)) {
2471 err = -EBUSY;
2472 goto unlock_ptls;
2473 }
2474
2475 if (WARN_ON_ONCE(!folio_test_head(src_folio)) ||
2476 WARN_ON_ONCE(!folio_test_anon(src_folio))) {
2477 err = -EBUSY;
2478 goto unlock_ptls;
2479 }
2480
Suren Baghdasaryaneb1521d2024-01-31 09:56:18 -08002481 src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
2482 /* Folio got pinned from under us. Put it back and fail the move. */
2483 if (folio_maybe_dma_pinned(src_folio)) {
2484 set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
2485 err = -EBUSY;
2486 goto unlock_ptls;
2487 }
2488
Lokesh Gidrac0205ea2024-04-04 10:17:26 -07002489 folio_move_anon_rmap(src_folio, dst_vma);
Suren Baghdasaryanb5ba3a62024-04-14 19:08:21 -07002490 src_folio->index = linear_page_index(dst_vma, dst_addr);
Lokesh Gidrac0205ea2024-04-04 10:17:26 -07002491
Suren Baghdasaryaneb1521d2024-01-31 09:56:18 -08002492 _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
2493 /* Follow mremap() behavior and treat the entry dirty after the move */
2494 _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
2495 } else {
2496 src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
2497 _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot);
Andrea Arcangeliadef4402023-12-06 02:36:56 -08002498 }
Andrea Arcangeliadef4402023-12-06 02:36:56 -08002499 set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);
2500
2501 src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd);
2502 pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable);
2503unlock_ptls:
2504 double_pt_unlock(src_ptl, dst_ptl);
Suren Baghdasaryaneb1521d2024-01-31 09:56:18 -08002505 if (src_anon_vma) {
2506 anon_vma_unlock_write(src_anon_vma);
2507 put_anon_vma(src_anon_vma);
2508 }
Andrea Arcangeliadef4402023-12-06 02:36:56 -08002509unlock_folio:
2510 /* unblock rmap walks */
Suren Baghdasaryaneb1521d2024-01-31 09:56:18 -08002511 if (src_folio)
2512 folio_unlock(src_folio);
Andrea Arcangeliadef4402023-12-06 02:36:56 -08002513 mmu_notifier_invalidate_range_end(&range);
Suren Baghdasaryaneb1521d2024-01-31 09:56:18 -08002514 if (src_folio)
2515 folio_put(src_folio);
Andrea Arcangeliadef4402023-12-06 02:36:56 -08002516 return err;
2517}
2518#endif /* CONFIG_USERFAULTFD */
2519
Naoya Horiguchi025c5b22012-03-21 16:33:57 -07002520/*
Huang Ying8f19b0c2016-07-26 15:27:04 -07002521 * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
Naoya Horiguchi025c5b22012-03-21 16:33:57 -07002522 *
Huang Ying8f19b0c2016-07-26 15:27:04 -07002523 * Note that if it returns the page table lock pointer, this routine returns
2524 * without unlocking it, so callers must unlock it themselves.
Naoya Horiguchi025c5b22012-03-21 16:33:57 -07002525 */
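/*
 * Illustrative call pattern (sketch only, not lifted from a specific caller):
 *
 *	ptl = __pmd_trans_huge_lock(pmd, vma);
 *	if (ptl) {
 *		... operate on the huge (or swap) pmd ...
 *		spin_unlock(ptl);
 *	}
 */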
Kirill A. Shutemovb6ec57f2016-01-21 16:40:25 -08002526spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
Naoya Horiguchi025c5b22012-03-21 16:33:57 -07002527{
Kirill A. Shutemovb6ec57f2016-01-21 16:40:25 -08002528 spinlock_t *ptl;
2529 ptl = pmd_lock(vma->vm_mm, pmd);
Zi Yan84c3fc42017-09-08 16:11:01 -07002530 if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
2531 pmd_devmap(*pmd)))
Kirill A. Shutemovb6ec57f2016-01-21 16:40:25 -08002532 return ptl;
2533 spin_unlock(ptl);
2534 return NULL;
Naoya Horiguchi025c5b22012-03-21 16:33:57 -07002535}
2536
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08002537/*
Miaohe Lind965e392022-07-04 21:21:48 +08002538 * Returns page table lock pointer if a given pud maps a thp, NULL otherwise.
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08002539 *
Miaohe Lind965e392022-07-04 21:21:48 +08002540 * Note that if it returns the page table lock pointer, this routine returns
2541 * without unlocking it, so callers must unlock it themselves.
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08002542 */
2543spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
2544{
2545 spinlock_t *ptl;
2546
2547 ptl = pud_lock(vma->vm_mm, pud);
2548 if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
2549 return ptl;
2550 spin_unlock(ptl);
2551 return NULL;
2552}
2553
2554#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
2555int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
2556 pud_t *pud, unsigned long addr)
2557{
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08002558 spinlock_t *ptl;
Peter Xu1c399e72024-08-12 14:12:23 -04002559 pud_t orig_pud;
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08002560
2561 ptl = __pud_trans_huge_lock(pud, vma);
2562 if (!ptl)
2563 return 0;
Miaohe Lin749290792022-07-04 21:21:54 +08002564
Peter Xu1c399e72024-08-12 14:12:23 -04002565 orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
2566 arch_check_zapped_pud(vma, orig_pud);
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08002567 tlb_remove_pud_tlb_entry(tlb, pud, addr);
Thomas Hellstrom (VMware)2484ca92020-03-24 18:47:17 +01002568 if (vma_is_special_huge(vma)) {
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08002569 spin_unlock(ptl);
2570 /* No zero page support yet */
2571 } else {
2572 /* No support for anonymous PUD pages yet */
2573 BUG();
2574 }
2575 return 1;
2576}
2577
2578static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
2579 unsigned long haddr)
2580{
2581 VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
2582 VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2583 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
2584 VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
2585
Yisheng Xiece9311c2017-03-09 16:17:00 -08002586 count_vm_event(THP_SPLIT_PUD);
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08002587
Alistair Poppleec8832d2023-07-25 23:42:06 +10002588 pudp_huge_clear_flush(vma, haddr, pud);
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08002589}
2590
2591void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
2592 unsigned long address)
2593{
2594 spinlock_t *ptl;
Jérôme Glisseac46d4f2018-12-28 00:38:09 -08002595 struct mmu_notifier_range range;
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08002596
Alistair Popple7d4a8be2023-01-10 13:57:22 +11002597 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
Jérôme Glisse6f4f13e2019-05-13 17:20:49 -07002598 address & HPAGE_PUD_MASK,
Jérôme Glisseac46d4f2018-12-28 00:38:09 -08002599 (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
2600 mmu_notifier_invalidate_range_start(&range);
2601 ptl = pud_lock(vma->vm_mm, pud);
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08002602 if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
2603 goto out;
Jérôme Glisseac46d4f2018-12-28 00:38:09 -08002604 __split_huge_pud_locked(vma, pud, range.start);
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08002605
2606out:
2607 spin_unlock(ptl);
Alistair Poppleec8832d2023-07-25 23:42:06 +10002608 mmu_notifier_invalidate_range_end(&range);
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08002609}
Peter Xucb0f01b2024-08-12 14:12:25 -04002610#else
2611void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
2612 unsigned long address)
2613{
2614}
Matthew Wilcoxa00cc7d2017-02-24 14:57:02 -08002615#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
2616
Kirill A. Shutemoveef1b3b2016-01-15 16:53:53 -08002617static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2618 unsigned long haddr, pmd_t *pmd)
2619{
2620 struct mm_struct *mm = vma->vm_mm;
2621 pgtable_t pgtable;
David Hildenbrand42b2af22023-03-02 18:54:23 +01002622 pmd_t _pmd, old_pmd;
Hugh Dickinsc9c1ee22023-06-08 18:41:31 -07002623 unsigned long addr;
2624 pte_t *pte;
Kirill A. Shutemoveef1b3b2016-01-15 16:53:53 -08002625 int i;
2626
Jérôme Glisse0f108512017-11-15 17:34:07 -08002627 /*
2628	 * Leave the pmd empty until the ptes are filled. Note that it is fine to
2629	 * delay notification until mmu_notifier_invalidate_range_end(), as we are
2630	 * replacing a write-protected zero page mapped by a pmd with write-
2631	 * protected zero pages mapped by ptes.
2632 *
Mike Rapoportee657282022-06-27 09:00:26 +03002633 * See Documentation/mm/mmu_notifier.rst
Jérôme Glisse0f108512017-11-15 17:34:07 -08002634 */
David Hildenbrand42b2af22023-03-02 18:54:23 +01002635 old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
Kirill A. Shutemoveef1b3b2016-01-15 16:53:53 -08002636
2637 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2638 pmd_populate(mm, &_pmd, pgtable);
2639
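	/*
	 * Fill in one read-only, special zero-page pte for each subpage of
	 * the range the huge zero page used to cover.
	 */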
Hugh Dickinsc9c1ee22023-06-08 18:41:31 -07002640 pte = pte_offset_map(&_pmd, haddr);
2641 VM_BUG_ON(!pte);
2642 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
2643 pte_t entry;
2644
2645 entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot);
Kirill A. Shutemoveef1b3b2016-01-15 16:53:53 -08002646 entry = pte_mkspecial(entry);
David Hildenbrand42b2af22023-03-02 18:54:23 +01002647 if (pmd_uffd_wp(old_pmd))
2648 entry = pte_mkuffd_wp(entry);
Ryan Robertsc33c7942023-06-12 16:15:45 +01002649 VM_BUG_ON(!pte_none(ptep_get(pte)));
Hugh Dickinsc9c1ee22023-06-08 18:41:31 -07002650 set_pte_at(mm, addr, pte, entry);
2651 pte++;
Kirill A. Shutemoveef1b3b2016-01-15 16:53:53 -08002652 }
Hugh Dickinsc9c1ee22023-06-08 18:41:31 -07002653 pte_unmap(pte - 1);
Kirill A. Shutemoveef1b3b2016-01-15 16:53:53 -08002654 smp_wmb(); /* make pte visible before pmd */
2655 pmd_populate(mm, pmd, pgtable);
Kirill A. Shutemoveef1b3b2016-01-15 16:53:53 -08002656}
2657
2658static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
Kirill A. Shutemovba988282016-01-15 16:53:56 -08002659 unsigned long haddr, bool freeze)
Kirill A. Shutemoveef1b3b2016-01-15 16:53:53 -08002660{
2661 struct mm_struct *mm = vma->vm_mm;
David Hildenbrand91b29782023-12-20 23:44:39 +01002662 struct folio *folio;
Kirill A. Shutemoveef1b3b2016-01-15 16:53:53 -08002663 struct page *page;
2664 pgtable_t pgtable;
Aneesh Kumar K.V423ac9a2018-01-31 16:18:24 -08002665 pmd_t old_pmd, _pmd;
Peter Xu292924b2020-04-06 20:05:49 -07002666 bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
Peter Xu0ccf7f12022-08-11 12:13:28 -04002667 bool anon_exclusive = false, dirty = false;
Kirill A. Shutemov2ac015e2016-02-24 18:58:03 +03002668 unsigned long addr;
Hugh Dickinsc9c1ee22023-06-08 18:41:31 -07002669 pte_t *pte;
Kirill A. Shutemoveef1b3b2016-01-15 16:53:53 -08002670 int i;
2671
2672 VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
2673 VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2674 VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
Zi Yan84c3fc42017-09-08 16:11:01 -07002675 VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
2676 && !pmd_devmap(*pmd));
Kirill A. Shutemoveef1b3b2016-01-15 16:53:53 -08002677
2678 count_vm_event(THP_SPLIT_PMD);
2679
Kirill A. Shutemovd21b9e52016-07-26 15:25:37 -07002680 if (!vma_is_anonymous(vma)) {
Alistair Poppleec8832d2023-07-25 23:42:06 +10002681 old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
Aneesh Kumar K.V953c66c2016-12-12 16:44:32 -08002682 /*
2683		 * We are going to unmap this huge page, so
2684		 * just go ahead and zap it.
2685 */
2686 if (arch_needs_pgtable_deposit())
2687 zap_deposited_table(mm, pmd);
Thomas Hellstrom (VMware)2484ca92020-03-24 18:47:17 +01002688 if (vma_is_special_huge(vma))
Kirill A. Shutemovd21b9e52016-07-26 15:25:37 -07002689 return;
Hugh Dickins99fa8a42021-06-15 18:23:45 -07002690 if (unlikely(is_pmd_migration_entry(old_pmd))) {
2691 swp_entry_t entry;
2692
2693 entry = pmd_to_swp_entry(old_pmd);
Kefeng Wang439992f2024-01-11 15:24:24 +00002694 folio = pfn_swap_entry_folio(entry);
Hugh Dickins99fa8a42021-06-15 18:23:45 -07002695 } else {
2696 page = pmd_page(old_pmd);
David Hildenbranda8e61d52023-12-20 23:44:49 +01002697 folio = page_folio(page);
2698 if (!folio_test_dirty(folio) && pmd_dirty(old_pmd))
David Hildenbranddb44c652024-01-22 18:54:07 +01002699 folio_mark_dirty(folio);
David Hildenbranda8e61d52023-12-20 23:44:49 +01002700 if (!folio_test_referenced(folio) && pmd_young(old_pmd))
2701 folio_set_referenced(folio);
2702 folio_remove_rmap_pmd(folio, page, vma);
2703 folio_put(folio);
Hugh Dickins99fa8a42021-06-15 18:23:45 -07002704 }
Kefeng Wang6b27cc6c2024-01-11 15:24:29 +00002705 add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
Kirill A. Shutemoveef1b3b2016-01-15 16:53:53 -08002706 return;
Hugh Dickins99fa8a42021-06-15 18:23:45 -07002707 }
2708
Hugh Dickins3b77e8c2021-06-15 18:23:49 -07002709 if (is_huge_zero_pmd(*pmd)) {
Jérôme Glisse4645b9f2017-11-15 17:34:11 -08002710 /*
2711		 * FIXME: Do we want to invalidate the secondary mmu by calling
Alistair Popple1af5a812023-07-25 23:42:07 +10002712		 * mmu_notifier_arch_invalidate_secondary_tlbs()? See the comments
2713		 * below inside __split_huge_pmd().
Jérôme Glisse4645b9f2017-11-15 17:34:11 -08002714		 *
2715		 * We are going from a write-protected huge zero page to write-
2716		 * protected small zero pages, so it does not seem useful to
2717		 * invalidate the secondary mmu at this time.
2718 */
Kirill A. Shutemoveef1b3b2016-01-15 16:53:53 -08002719 return __split_huge_zero_page_pmd(vma, haddr, pmd);
2720 }
2721
Ryan Roberts3a5a8d32024-05-01 15:33:10 +01002722 pmd_migration = is_pmd_migration_entry(*pmd);
Peter Xu2e83ee12018-12-21 14:30:50 -08002723 if (unlikely(pmd_migration)) {
Zi Yan84c3fc42017-09-08 16:11:01 -07002724 swp_entry_t entry;
2725
Ryan Roberts3a5a8d32024-05-01 15:33:10 +01002726 old_pmd = *pmd;
Aneesh Kumar K.V423ac9a2018-01-31 16:18:24 -08002727 entry = pmd_to_swp_entry(old_pmd);
Alistair Poppleaf5cdaf2021-06-30 18:54:06 -07002728 page = pfn_swap_entry_to_page(entry);
Alistair Popple4dd845b2021-06-30 18:54:09 -07002729 write = is_writable_migration_entry(entry);
David Hildenbrand6c287602022-05-09 18:20:44 -07002730 if (PageAnon(page))
2731 anon_exclusive = is_readable_exclusive_migration_entry(entry);
Peter Xu2e346872022-08-11 12:13:29 -04002732 young = is_migration_entry_young(entry);
2733 dirty = is_migration_entry_dirty(entry);
Peter Xu2e83ee12018-12-21 14:30:50 -08002734 soft_dirty = pmd_swp_soft_dirty(old_pmd);
Peter Xuf45ec5f2020-04-06 20:06:01 -07002735 uffd_wp = pmd_swp_uffd_wp(old_pmd);
Peter Xu2e83ee12018-12-21 14:30:50 -08002736 } else {
Ryan Roberts3a5a8d32024-05-01 15:33:10 +01002737 /*
2738 * Up to this point the pmd is present and huge and userland has
2739 * the whole access to the hugepage during the split (which
2740 * happens in place). If we overwrite the pmd with the not-huge
2741 * version pointing to the pte here (which of course we could if
2742 * all CPUs were bug free), userland could trigger a small page
2743 * size TLB miss on the small sized TLB while the hugepage TLB
2744 * entry is still established in the huge TLB. Some CPU doesn't
2745		 * entry is still established in the huge TLB. Some CPUs don't
2746		 * like that. See
2747		 * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
2748		 * 383 on page 105. Intel should be safe but also warns that
2749		 * it's only safe if the permission and cache attributes of the
2750		 * two entries loaded in the two TLBs are identical (which should
2751 * small and huge TLB entries for the same virtual address to be
2752 * loaded simultaneously. So instead of doing "pmd_populate();
2753 * flush_pmd_tlb_range();" we first mark the current pmd
2754 * notpresent (atomically because here the pmd_trans_huge must
2755 * remain set at all times on the pmd until the split is
2756 * complete for this pmd), then we flush the SMP TLB and finally
2757 * we write the non-huge version of the pmd entry with
2758 * pmd_populate.
2759 */
2760 old_pmd = pmdp_invalidate(vma, haddr, pmd);
Aneesh Kumar K.V423ac9a2018-01-31 16:18:24 -08002761 page = pmd_page(old_pmd);
David Hildenbrand91b29782023-12-20 23:44:39 +01002762 folio = page_folio(page);
Peter Xu0ccf7f12022-08-11 12:13:28 -04002763 if (pmd_dirty(old_pmd)) {
2764 dirty = true;
David Hildenbrand91b29782023-12-20 23:44:39 +01002765 folio_set_dirty(folio);
Peter Xu0ccf7f12022-08-11 12:13:28 -04002766 }
Peter Xu2e83ee12018-12-21 14:30:50 -08002767 write = pmd_write(old_pmd);
2768 young = pmd_young(old_pmd);
2769 soft_dirty = pmd_soft_dirty(old_pmd);
Peter Xu292924b2020-04-06 20:05:49 -07002770 uffd_wp = pmd_uffd_wp(old_pmd);
David Hildenbrand6c287602022-05-09 18:20:44 -07002771
David Hildenbrand91b29782023-12-20 23:44:39 +01002772 VM_WARN_ON_FOLIO(!folio_ref_count(folio), folio);
2773 VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
David Hildenbrand6c287602022-05-09 18:20:44 -07002774
2775 /*
2776 * Without "freeze", we'll simply split the PMD, propagating the
2777 * PageAnonExclusive() flag for each PTE by setting it for
2778 * each subpage -- no need to (temporarily) clear.
2779 *
2780 * With "freeze" we want to replace mapped pages by
2781 * migration entries right away. This is only possible if we
2782 * managed to clear PageAnonExclusive() -- see
2783 * set_pmd_migration_entry().
2784 *
2785 * In case we cannot clear PageAnonExclusive(), split the PMD
2786 * only and let try_to_migrate_one() fail later.
David Hildenbrand088b8aa2022-09-01 10:35:59 +02002787 *
David Hildenbrande3b4b132023-12-20 23:45:02 +01002788 * See folio_try_share_anon_rmap_pmd(): invalidate PMD first.
David Hildenbrand6c287602022-05-09 18:20:44 -07002789 */
David Hildenbrand91b29782023-12-20 23:44:39 +01002790 anon_exclusive = PageAnonExclusive(page);
David Hildenbrande3b4b132023-12-20 23:45:02 +01002791 if (freeze && anon_exclusive &&
2792 folio_try_share_anon_rmap_pmd(folio, page))
David Hildenbrand6c287602022-05-09 18:20:44 -07002793 freeze = false;
David Hildenbrand91b29782023-12-20 23:44:39 +01002794 if (!freeze) {
2795 rmap_t rmap_flags = RMAP_NONE;
2796
2797 folio_ref_add(folio, HPAGE_PMD_NR - 1);
2798 if (anon_exclusive)
2799 rmap_flags |= RMAP_EXCLUSIVE;
2800 folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
2801 vma, haddr, rmap_flags);
2802 }
Peter Xu2e83ee12018-12-21 14:30:50 -08002803 }
Kirill A. Shutemoveef1b3b2016-01-15 16:53:53 -08002804
Aneesh Kumar K.V423ac9a2018-01-31 16:18:24 -08002805 /*
2806 * Withdraw the table only after we mark the pmd entry invalid.
2807	 * This is critical for some architectures (e.g. Power).
2808 */
Kirill A. Shutemoveef1b3b2016-01-15 16:53:53 -08002809 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2810 pmd_populate(mm, &_pmd, pgtable);
2811
Hugh Dickinsc9c1ee22023-06-08 18:41:31 -07002812 pte = pte_offset_map(&_pmd, haddr);
2813 VM_BUG_ON(!pte);
Ryan Roberts2bdba982024-02-15 10:31:49 +00002814
2815 /*
2816 * Note that NUMA hinting access restrictions are not transferred to
2817 * avoid any possibility of altering permissions across VMAs.
2818 */
2819 if (freeze || pmd_migration) {
2820 for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
2821 pte_t entry;
Kirill A. Shutemovba988282016-01-15 16:53:56 -08002822 swp_entry_t swp_entry;
Ryan Roberts2bdba982024-02-15 10:31:49 +00002823
Alistair Popple4dd845b2021-06-30 18:54:09 -07002824 if (write)
2825 swp_entry = make_writable_migration_entry(
2826 page_to_pfn(page + i));
David Hildenbrand6c287602022-05-09 18:20:44 -07002827 else if (anon_exclusive)
2828 swp_entry = make_readable_exclusive_migration_entry(
2829 page_to_pfn(page + i));
Alistair Popple4dd845b2021-06-30 18:54:09 -07002830 else
2831 swp_entry = make_readable_migration_entry(
2832 page_to_pfn(page + i));
Peter Xu2e346872022-08-11 12:13:29 -04002833 if (young)
2834 swp_entry = make_migration_entry_young(swp_entry);
2835 if (dirty)
2836 swp_entry = make_migration_entry_dirty(swp_entry);
Kirill A. Shutemovba988282016-01-15 16:53:56 -08002837 entry = swp_entry_to_pte(swp_entry);
Andrea Arcangeli804dd152016-08-25 15:16:57 -07002838 if (soft_dirty)
2839 entry = pte_swp_mksoft_dirty(entry);
Peter Xuf45ec5f2020-04-06 20:06:01 -07002840 if (uffd_wp)
2841 entry = pte_swp_mkuffd_wp(entry);
Ryan Roberts2bdba982024-02-15 10:31:49 +00002842
2843 VM_WARN_ON(!pte_none(ptep_get(pte + i)));
2844 set_pte_at(mm, addr, pte + i, entry);
Kirill A. Shutemovba988282016-01-15 16:53:56 -08002845 }
Ryan Roberts2bdba982024-02-15 10:31:49 +00002846 } else {
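		/*
		 * Not freezing: build one present pte per subpage, carrying
		 * over the write/young/dirty/soft-dirty/uffd-wp state of the
		 * old pmd.
		 */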
2847 pte_t entry;
2848
2849 entry = mk_pte(page, READ_ONCE(vma->vm_page_prot));
2850 if (write)
2851 entry = pte_mkwrite(entry, vma);
2852 if (!young)
2853 entry = pte_mkold(entry);
2854 /* NOTE: this may set soft-dirty too on some archs */
2855 if (dirty)
2856 entry = pte_mkdirty(entry);
2857 if (soft_dirty)
2858 entry = pte_mksoft_dirty(entry);
2859 if (uffd_wp)
2860 entry = pte_mkuffd_wp(entry);
2861
2862 for (i = 0; i < HPAGE_PMD_NR; i++)
2863 VM_WARN_ON(!pte_none(ptep_get(pte + i)));
2864
2865 set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR);
Kirill A. Shutemoveef1b3b2016-01-15 16:53:53 -08002866 }
Ryan Roberts2bdba982024-02-15 10:31:49 +00002867 pte_unmap(pte);
Kirill A. Shutemoveef1b3b2016-01-15 16:53:53 -08002868
Hugh Dickinscb67f422022-11-02 18:51:38 -07002869 if (!pmd_migration)
David Hildenbranda8e61d52023-12-20 23:44:49 +01002870 folio_remove_rmap_pmd(folio, page, vma);
Hugh Dickins96d82de2022-11-22 01:51:50 -08002871 if (freeze)
2872 put_page(page);
Kirill A. Shutemoveef1b3b2016-01-15 16:53:53 -08002873
2874 smp_wmb(); /* make pte visible before pmd */
2875 pmd_populate(mm, pmd, pgtable);
2876}
2877
Lance Yang29e847d2024-06-14 09:51:37 +08002878void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
2879 pmd_t *pmd, bool freeze, struct folio *folio)
2880{
2881 VM_WARN_ON_ONCE(folio && !folio_test_pmd_mappable(folio));
2882 VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
2883 VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));
2884 VM_BUG_ON(freeze && !folio);
2885
2886 /*
2887 * When the caller requests to set up a migration entry, we
2888 * require a folio to check the PMD against. Otherwise, there
2889 * is a risk of replacing the wrong folio.
2890 */
2891 if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
2892 is_pmd_migration_entry(*pmd)) {
2893 if (folio && folio != pmd_folio(*pmd))
2894 return;
2895 __split_huge_pmd_locked(vma, pmd, address, freeze);
2896 }
2897}
2898
Kirill A. Shutemoveef1b3b2016-01-15 16:53:53 -08002899void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
Matthew Wilcox (Oracle)af28a982022-01-21 10:44:52 -05002900 unsigned long address, bool freeze, struct folio *folio)
Kirill A. Shutemoveef1b3b2016-01-15 16:53:53 -08002901{
2902 spinlock_t *ptl;
Jérôme Glisseac46d4f2018-12-28 00:38:09 -08002903 struct mmu_notifier_range range;
Kirill A. Shutemoveef1b3b2016-01-15 16:53:53 -08002904
Alistair Popple7d4a8be2023-01-10 13:57:22 +11002905 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
Jérôme Glisse6f4f13e2019-05-13 17:20:49 -07002906 address & HPAGE_PMD_MASK,
Jérôme Glisseac46d4f2018-12-28 00:38:09 -08002907 (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
2908 mmu_notifier_invalidate_range_start(&range);
2909 ptl = pmd_lock(vma->vm_mm, pmd);
Lance Yang29e847d2024-06-14 09:51:37 +08002910 split_huge_pmd_locked(vma, range.start, pmd, freeze, folio);
Kirill A. Shutemoveef1b3b2016-01-15 16:53:53 -08002911 spin_unlock(ptl);
Alistair Poppleec8832d2023-07-25 23:42:06 +10002912 mmu_notifier_invalidate_range_end(&range);
Kirill A. Shutemoveef1b3b2016-01-15 16:53:53 -08002913}
2914
Kirill A. Shutemovfec89c12016-03-17 14:20:10 -07002915void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
Matthew Wilcox (Oracle)af28a982022-01-21 10:44:52 -05002916 bool freeze, struct folio *folio)
Andrea Arcangeli94fcc582011-01-13 15:47:08 -08002917{
Zach O'Keefe50722802022-07-06 16:59:26 -07002918 pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);
Andrea Arcangeli94fcc582011-01-13 15:47:08 -08002919
Zach O'Keefe50722802022-07-06 16:59:26 -07002920 if (!pmd)
Hugh Dickinsf72e7dc2014-06-23 13:22:05 -07002921 return;
2922
Matthew Wilcox (Oracle)af28a982022-01-21 10:44:52 -05002923 __split_huge_pmd(vma, pmd, address, freeze, folio);
Andrea Arcangeli94fcc582011-01-13 15:47:08 -08002924}
2925
Miaohe Lin71f9e582021-05-04 18:33:52 -07002926static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
2927{
2928 /*
2929 * If the new address isn't hpage aligned and it could previously
2930	 * contain a hugepage: check if we need to split a huge pmd.
2931 */
2932 if (!IS_ALIGNED(address, HPAGE_PMD_SIZE) &&
2933 range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
2934 ALIGN(address, HPAGE_PMD_SIZE)))
2935 split_huge_pmd_address(vma, address, false, NULL);
2936}
2937
Kirill A. Shutemove1b99962015-09-08 14:58:37 -07002938void vma_adjust_trans_huge(struct vm_area_struct *vma,
Andrea Arcangeli94fcc582011-01-13 15:47:08 -08002939 unsigned long start,
2940 unsigned long end,
2941 long adjust_next)
2942{
Miaohe Lin71f9e582021-05-04 18:33:52 -07002943 /* Check if we need to split start first. */
2944 split_huge_pmd_if_needed(vma, start);
2945
2946 /* Check if we need to split end next. */
2947 split_huge_pmd_if_needed(vma, end);
Andrea Arcangeli94fcc582011-01-13 15:47:08 -08002948
2949 /*
Matthew Wilcox (Oracle)68540502022-09-06 19:49:00 +00002950 * If we're also updating the next vma vm_start,
Miaohe Lin71f9e582021-05-04 18:33:52 -07002951 * check if we need to split it.
Andrea Arcangeli94fcc582011-01-13 15:47:08 -08002952 */
2953 if (adjust_next > 0) {
Matthew Wilcox (Oracle)68540502022-09-06 19:49:00 +00002954 struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end);
Andrea Arcangeli94fcc582011-01-13 15:47:08 -08002955 unsigned long nstart = next->vm_start;
Wei Yangf9d86a62020-10-13 16:53:57 -07002956 nstart += adjust_next;
Miaohe Lin71f9e582021-05-04 18:33:52 -07002957 split_huge_pmd_if_needed(next, nstart);
Andrea Arcangeli94fcc582011-01-13 15:47:08 -08002958 }
2959}
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08002960
Matthew Wilcox (Oracle)684555a2022-09-02 20:46:49 +01002961static void unmap_folio(struct folio *folio)
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08002962{
Zi Yan319a6242024-02-26 15:55:27 -05002963 enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SYNC |
2964 TTU_BATCH_FLUSH;
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08002965
Matthew Wilcox (Oracle)684555a2022-09-02 20:46:49 +01002966 VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08002967
Zi Yan319a6242024-02-26 15:55:27 -05002968 if (folio_test_pmd_mappable(folio))
2969 ttu_flags |= TTU_SPLIT_HUGE_PMD;
2970
Alistair Popplea98a2f02021-06-30 18:54:16 -07002971 /*
2972 * Anon pages need migration entries to preserve them, but file
2973 * pages can simply be left unmapped, then faulted back on demand.
2974 * If that is ever changed (perhaps for mlock), update remap_page().
2975 */
Matthew Wilcox (Oracle)4b8554c2022-01-28 14:29:43 -05002976 if (folio_test_anon(folio))
2977 try_to_migrate(folio, ttu_flags);
Alistair Popplea98a2f02021-06-30 18:54:16 -07002978 else
Matthew Wilcox (Oracle)869f7ee2022-02-15 09:28:49 -05002979 try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
Baolin Wang3027c6f2023-10-30 09:11:47 +08002980
2981 try_to_unmap_flush();
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08002982}
2983
Lance Yang735ecdf2024-06-14 09:51:38 +08002984static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
2985 unsigned long addr, pmd_t *pmdp,
2986 struct folio *folio)
2987{
2988 struct mm_struct *mm = vma->vm_mm;
2989 int ref_count, map_count;
2990 pmd_t orig_pmd = *pmdp;
Lance Yang735ecdf2024-06-14 09:51:38 +08002991
2992 if (folio_test_dirty(folio) || pmd_dirty(orig_pmd))
2993 return false;
2994
2995 orig_pmd = pmdp_huge_clear_flush(vma, addr, pmdp);
2996
2997 /*
2998 * Syncing against concurrent GUP-fast:
2999 * - clear PMD; barrier; read refcount
3000 * - inc refcount; barrier; read PMD
3001 */
3002 smp_mb();
3003
3004 ref_count = folio_ref_count(folio);
3005 map_count = folio_mapcount(folio);
3006
3007 /*
3008 * Order reads for folio refcount and dirty flag
3009 * (see comments in __remove_mapping()).
3010 */
3011 smp_rmb();
3012
3013 /*
3014 * If the folio or its PMD is redirtied at this point, or if there
3015	 * are unexpected references, we give up on discarding this folio
3016 * and remap it.
3017 *
3018 * The only folio refs must be one from isolation plus the rmap(s).
3019 */
3020 if (folio_test_dirty(folio) || pmd_dirty(orig_pmd) ||
3021 ref_count != map_count + 1) {
3022 set_pmd_at(mm, addr, pmdp, orig_pmd);
3023 return false;
3024 }
3025
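	/*
	 * All checks passed: tear down the pmd mapping, free the deposited
	 * page table, fix up the counters and drop the mapping's reference
	 * to the folio.
	 */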
Andrew Mortond40f74a2024-06-25 14:51:36 -07003026 folio_remove_rmap_pmd(folio, pmd_page(orig_pmd), vma);
Lance Yang735ecdf2024-06-14 09:51:38 +08003027 zap_deposited_table(mm, pmdp);
3028 add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR);
3029 if (vma->vm_flags & VM_LOCKED)
3030 mlock_drain_local();
3031 folio_put(folio);
3032
3033 return true;
3034}
3035
3036bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
3037 pmd_t *pmdp, struct folio *folio)
3038{
3039 VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
3040 VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
3041 VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE));
3042
3043 if (folio_test_anon(folio) && !folio_test_swapbacked(folio))
3044 return __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);
3045
3046 return false;
3047}
3048
Yu Zhaob1f20202024-08-30 11:03:36 +01003049static void remap_page(struct folio *folio, unsigned long nr, int flags)
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003050{
Matthew Wilcox (Oracle)4eecb8b2022-01-28 23:32:59 -05003051 int i = 0;
Hugh Dickinsab02c252021-06-30 18:52:04 -07003052
Matthew Wilcox (Oracle)684555a2022-09-02 20:46:49 +01003053 /* If unmap_folio() uses try_to_migrate() on file, remove this check */
Matthew Wilcox (Oracle)4eecb8b2022-01-28 23:32:59 -05003054 if (!folio_test_anon(folio))
Hugh Dickinsab02c252021-06-30 18:52:04 -07003055 return;
Matthew Wilcox (Oracle)4eecb8b2022-01-28 23:32:59 -05003056 for (;;) {
Yu Zhaob1f20202024-08-30 11:03:36 +01003057 remove_migration_ptes(folio, folio, RMP_LOCKED | flags);
Matthew Wilcox (Oracle)4eecb8b2022-01-28 23:32:59 -05003058 i += folio_nr_pages(folio);
3059 if (i >= nr)
3060 break;
3061 folio = folio_next(folio);
Kirill A. Shutemovace71a12017-02-24 14:57:45 -08003062 }
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003063}
3064
Matthew Wilcox (Oracle)cb29e792024-08-21 20:34:38 +01003065static void lru_add_page_tail(struct folio *folio, struct page *tail,
Alex Shi88dcb9a2020-12-15 12:33:20 -08003066 struct lruvec *lruvec, struct list_head *list)
3067{
Matthew Wilcox (Oracle)cb29e792024-08-21 20:34:38 +01003068 VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
3069 VM_BUG_ON_FOLIO(PageLRU(tail), folio);
Alex Shi6168d0d2020-12-15 12:34:29 -08003070 lockdep_assert_held(&lruvec->lru_lock);
Alex Shi88dcb9a2020-12-15 12:33:20 -08003071
Alex Shi6dbb5742020-12-15 12:33:29 -08003072 if (list) {
Alex Shi88dcb9a2020-12-15 12:33:20 -08003073 /* page reclaim is reclaiming a huge page */
Matthew Wilcox (Oracle)cb29e792024-08-21 20:34:38 +01003074 VM_WARN_ON(folio_test_lru(folio));
Alex Shi94866632020-12-15 12:33:24 -08003075 get_page(tail);
3076 list_add_tail(&tail->lru, list);
Alex Shi88dcb9a2020-12-15 12:33:20 -08003077 } else {
Alex Shi6dbb5742020-12-15 12:33:29 -08003078 /* head is still on lru (and we have it frozen) */
Matthew Wilcox (Oracle)cb29e792024-08-21 20:34:38 +01003079 VM_WARN_ON(!folio_test_lru(folio));
3080 if (folio_test_unevictable(folio))
Hugh Dickins07ca7602022-02-14 18:29:54 -08003081 tail->mlock_count = 0;
3082 else
Matthew Wilcox (Oracle)cb29e792024-08-21 20:34:38 +01003083 list_add_tail(&tail->lru, &folio->lru);
Alex Shi6dbb5742020-12-15 12:33:29 -08003084 SetPageLRU(tail);
Alex Shi88dcb9a2020-12-15 12:33:20 -08003085 }
3086}
3087
David Hildenbrand07e09c42023-08-21 18:08:49 +02003088static void __split_huge_page_tail(struct folio *folio, int tail,
Zi Yanc010d472024-02-26 15:55:33 -05003089 struct lruvec *lruvec, struct list_head *list,
3090 unsigned int new_order)
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003091{
David Hildenbrand07e09c42023-08-21 18:08:49 +02003092 struct page *head = &folio->page;
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003093 struct page *page_tail = head + tail;
David Hildenbrand07e09c42023-08-21 18:08:49 +02003094 /*
3095	 * Careful: new_folio is not a "real" folio until we have cleared PageTail.
3096 * Don't pass it around before clear_compound_head().
3097 */
3098 struct folio *new_folio = (struct folio *)page_tail;
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003099
Kirill A. Shutemov8df651c2016-03-15 14:57:30 -07003100 VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003101
3102 /*
Konstantin Khlebnikov605ca5e2018-04-05 16:23:28 -07003103 * Clone page flags before unfreezing refcount.
3104 *
3105	 * After a successful get_page_unless_zero(), flag changes might follow,
Haitao Shi8958b242020-12-15 20:47:26 -08003106	 * for example lock_page(), which sets PG_waiters.
David Hildenbrand6c287602022-05-09 18:20:44 -07003107 *
3108 * Note that for mapped sub-pages of an anonymous THP,
Matthew Wilcox (Oracle)684555a2022-09-02 20:46:49 +01003109 * PG_anon_exclusive has been cleared in unmap_folio() and is stored in
David Hildenbrand6c287602022-05-09 18:20:44 -07003110 * the migration entry instead from where remap_page() will restore it.
3111 * We can still have PG_anon_exclusive set on effectively unmapped and
3112 * unreferenced sub-pages of an anonymous THP: we can simply drop
3113 * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003114 */
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003115 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
3116 page_tail->flags |= (head->flags &
3117 ((1L << PG_referenced) |
3118 (1L << PG_swapbacked) |
Huang Ying38d8b4e2017-07-06 15:37:18 -07003119 (1L << PG_swapcache) |
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003120 (1L << PG_mlocked) |
3121 (1L << PG_uptodate) |
3122 (1L << PG_active) |
Johannes Weiner1899ad12018-10-26 15:06:04 -07003123 (1L << PG_workingset) |
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003124 (1L << PG_locked) |
Minchan Kimb8d3c4c2016-01-15 16:55:42 -08003125 (1L << PG_unevictable) |
Matthew Wilcox (Oracle)7a872252024-08-21 20:34:43 +01003126#ifdef CONFIG_ARCH_USES_PG_ARCH_2
Catalin Marinas72e6afa2020-07-02 10:19:30 +01003127 (1L << PG_arch_2) |
Matthew Wilcox (Oracle)7a872252024-08-21 20:34:43 +01003128#endif
3129#ifdef CONFIG_ARCH_USES_PG_ARCH_3
Peter Collingbourneef6458b2022-11-03 18:10:37 -07003130 (1L << PG_arch_3) |
Catalin Marinas72e6afa2020-07-02 10:19:30 +01003131#endif
Yu Zhaoec1c86b2022-09-18 02:00:02 -06003132 (1L << PG_dirty) |
3133 LRU_GEN_MASK | LRU_REFS_MASK));
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003134
Hugh Dickinscb67f422022-11-02 18:51:38 -07003135 /* ->mapping in first and second tail page is replaced by other uses */
Hugh Dickins173d9d92018-11-30 14:10:16 -08003136 VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
3137 page_tail);
3138 page_tail->mapping = head->mapping;
3139 page_tail->index = head->index + tail;
Mel Gorman71e2d662022-10-19 14:41:56 +01003140
3141 /*
David Hildenbrandcfeed8f2023-08-21 18:08:46 +02003142 * page->private should not be set in tail pages. Fix up and warn once
3143 * if private is unexpectedly set.
Mel Gorman71e2d662022-10-19 14:41:56 +01003144 */
David Hildenbrandcfeed8f2023-08-21 18:08:46 +02003145 if (unlikely(page_tail->private)) {
3146 VM_WARN_ON_ONCE_PAGE(true, page_tail);
Mel Gorman71e2d662022-10-19 14:41:56 +01003147 page_tail->private = 0;
3148 }
David Hildenbrand07e09c42023-08-21 18:08:49 +02003149 if (folio_test_swapcache(folio))
3150 new_folio->swap.val = folio->swap.val + tail;
Hugh Dickins173d9d92018-11-30 14:10:16 -08003151
Konstantin Khlebnikov605ca5e2018-04-05 16:23:28 -07003152 /* Page flags must be visible before we make the page non-compound. */
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003153 smp_wmb();
3154
Konstantin Khlebnikov605ca5e2018-04-05 16:23:28 -07003155 /*
3156 * Clear PageTail before unfreezing page refcount.
3157 *
3158	 * After a successful get_page_unless_zero(), a put_page() might follow,
3159	 * which needs a correct compound_head().
3160 */
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003161 clear_compound_head(page_tail);
Zi Yanc010d472024-02-26 15:55:33 -05003162 if (new_order) {
3163 prep_compound_page(page_tail, new_order);
Matthew Wilcox (Oracle)85edc152024-03-21 14:24:41 +00003164 folio_set_large_rmappable(new_folio);
Zi Yanc010d472024-02-26 15:55:33 -05003165 }
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003166
Konstantin Khlebnikov605ca5e2018-04-05 16:23:28 -07003167 /* Finally unfreeze refcount. Additional reference from page cache. */
Zi Yanc010d472024-02-26 15:55:33 -05003168 page_ref_unfreeze(page_tail,
3169 1 + ((!folio_test_anon(folio) || folio_test_swapcache(folio)) ?
3170 folio_nr_pages(new_folio) : 0));
Konstantin Khlebnikov605ca5e2018-04-05 16:23:28 -07003171
Kefeng Wangb7542762023-11-10 11:33:21 +08003172 if (folio_test_young(folio))
3173 folio_set_young(new_folio);
3174 if (folio_test_idle(folio))
3175 folio_set_idle(new_folio);
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003176
Kefeng Wangc8253012023-10-18 22:08:02 +08003177 folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio));
Michal Hocko94723aa2018-04-10 16:30:07 -07003178
3179 /*
3180	 * Always add to the tail because some iterators expect new
3181	 * pages to show up after the currently processed elements, e.g.
3182	 * migrate_pages().
3183 */
Matthew Wilcox (Oracle)cb29e792024-08-21 20:34:38 +01003184 lru_add_page_tail(folio, page_tail, lruvec, list);
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003185}
3186
Kirill A. Shutemovbaa355f2016-07-26 15:25:51 -07003187static void __split_huge_page(struct page *page, struct list_head *list,
Zi Yanc010d472024-02-26 15:55:33 -05003188 pgoff_t end, unsigned int new_order)
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003189{
Matthew Wilcox (Oracle)e809c3f2021-06-28 21:59:47 -04003190 struct folio *folio = page_folio(page);
3191 struct page *head = &folio->page;
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003192 struct lruvec *lruvec;
Matthew Wilcox (Oracle)41011962019-09-23 15:34:52 -07003193 struct address_space *swap_cache = NULL;
3194 unsigned long offset = 0;
Hugh Dickins509f0062023-07-25 16:45:10 +02003195 int i, nr_dropped = 0;
Zi Yanc010d472024-02-26 15:55:33 -05003196 unsigned int new_nr = 1 << new_order;
Zi Yan502003b2024-02-26 15:55:29 -05003197 int order = folio_order(folio);
3198 unsigned int nr = 1 << order;
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003199
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003200 /* complete memcg works before add pages to LRU */
Zi Yanc010d472024-02-26 15:55:33 -05003201 split_page_memcg(head, order, new_order);
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003202
David Hildenbrand07e09c42023-08-21 18:08:49 +02003203 if (folio_test_anon(folio) && folio_test_swapcache(folio)) {
Kairui Song7aad25b2024-05-22 01:58:53 +08003204 offset = swap_cache_index(folio->swap);
David Hildenbrand07e09c42023-08-21 18:08:49 +02003205 swap_cache = swap_address_space(folio->swap);
Matthew Wilcox (Oracle)41011962019-09-23 15:34:52 -07003206 xa_lock(&swap_cache->i_pages);
3207 }
3208
Ingo Molnarf0953a12021-05-06 18:06:47 -07003209 /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
Matthew Wilcox (Oracle)e809c3f2021-06-28 21:59:47 -04003210 lruvec = folio_lruvec_lock(folio);
Alex Shib6769832020-12-15 12:33:33 -08003211
Yang Shieac96c32021-10-28 14:36:11 -07003212 ClearPageHasHWPoisoned(head);
3213
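	/*
	 * Turn each group of new_nr tail pages into a folio of new_order and
	 * fix up the page cache or swap cache slot it occupies.
	 */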
Zi Yanc010d472024-02-26 15:55:33 -05003214 for (i = nr - new_nr; i >= new_nr; i -= new_nr) {
3215 __split_huge_page_tail(folio, i, lruvec, list, new_order);
Hugh Dickinsd144bf62021-09-02 14:54:21 -07003216 /* Some pages can be beyond EOF: drop them from page cache */
Kirill A. Shutemovbaa355f2016-07-26 15:25:51 -07003217 if (head[i].index >= end) {
Matthew Wilcox (Oracle)fb5c2022022-06-28 20:15:29 -04003218 struct folio *tail = page_folio(head + i);
3219
Matthew Wilcox (Oracle)435a7552024-02-28 16:42:36 +00003220 if (shmem_mapping(folio->mapping))
Hugh Dickins509f0062023-07-25 16:45:10 +02003221 nr_dropped++;
Matthew Wilcox (Oracle)fb5c2022022-06-28 20:15:29 -04003222 else if (folio_test_clear_dirty(tail))
3223 folio_account_cleaned(tail,
3224 inode_to_wb(folio->mapping->host));
3225 __filemap_remove_folio(tail, NULL);
3226 folio_put(tail);
Matthew Wilcox (Oracle)41011962019-09-23 15:34:52 -07003227 } else if (!PageAnon(page)) {
Matthew Wilcox (Oracle)435a7552024-02-28 16:42:36 +00003228 __xa_store(&folio->mapping->i_pages, head[i].index,
Matthew Wilcox (Oracle)41011962019-09-23 15:34:52 -07003229 head + i, 0);
3230 } else if (swap_cache) {
3231 __xa_store(&swap_cache->i_pages, offset + i,
3232 head + i, 0);
Kirill A. Shutemovbaa355f2016-07-26 15:25:51 -07003233 }
3234 }
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003235
Zi Yanc010d472024-02-26 15:55:33 -05003236 if (!new_order)
3237 ClearPageCompound(head);
3238 else {
3239 struct folio *new_folio = (struct folio *)head;
3240
3241 folio_set_order(new_folio, new_order);
3242 }
Alex Shi6168d0d2020-12-15 12:34:29 -08003243 unlock_page_lruvec(lruvec);
Alex Shib6769832020-12-15 12:33:33 -08003244 /* Caller disabled irqs, so they are still disabled here */
Vlastimil Babkaf7da6772019-08-24 17:54:59 -07003245
Zi Yanc010d472024-02-26 15:55:33 -05003246 split_page_owner(head, order, new_order);
Yu Zhao95599ef2024-09-05 22:21:07 -06003247 pgalloc_tag_split(folio, order, new_order);
Vlastimil Babkaf7da6772019-08-24 17:54:59 -07003248
Kirill A. Shutemovbaa355f2016-07-26 15:25:51 -07003249 /* See comment in __split_huge_page_tail() */
Matthew Wilcox (Oracle)435a7552024-02-28 16:42:36 +00003250 if (folio_test_anon(folio)) {
Matthew Wilcoxaa5dc072017-12-04 10:16:10 -05003251 /* Additional pin to swap cache */
Matthew Wilcox (Oracle)435a7552024-02-28 16:42:36 +00003252 if (folio_test_swapcache(folio)) {
3253 folio_ref_add(folio, 1 + new_nr);
Matthew Wilcox (Oracle)41011962019-09-23 15:34:52 -07003254 xa_unlock(&swap_cache->i_pages);
3255 } else {
Matthew Wilcox (Oracle)435a7552024-02-28 16:42:36 +00003256 folio_ref_inc(folio);
Matthew Wilcox (Oracle)41011962019-09-23 15:34:52 -07003257 }
Kirill A. Shutemovbaa355f2016-07-26 15:25:51 -07003258 } else {
Matthew Wilcoxaa5dc072017-12-04 10:16:10 -05003259 /* Additional pin to page cache */
Matthew Wilcox (Oracle)435a7552024-02-28 16:42:36 +00003260 folio_ref_add(folio, 1 + new_nr);
3261 xa_unlock(&folio->mapping->i_pages);
Kirill A. Shutemovbaa355f2016-07-26 15:25:51 -07003262 }
Alex Shib6769832020-12-15 12:33:33 -08003263 local_irq_enable();
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003264
Hugh Dickins509f0062023-07-25 16:45:10 +02003265 if (nr_dropped)
Matthew Wilcox (Oracle)435a7552024-02-28 16:42:36 +00003266 shmem_uncharge(folio->mapping->host, nr_dropped);
Yu Zhaob1f20202024-08-30 11:03:36 +01003267 remap_page(folio, nr, PageAnon(head) ? RMP_USE_SHARED_ZEROPAGE : 0);
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003268
Zi Yanc010d472024-02-26 15:55:33 -05003269 /*
3270	 * Set page to its compound_head when splitting to non-order-0 pages, so
3271	 * that we can skip unlocking it below, since PG_locked is transferred to
3272	 * the compound_head of the page and the caller will unlock it.
3273 */
3274 if (new_order)
3275 page = compound_head(page);
3276
3277 for (i = 0; i < nr; i += new_nr) {
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003278 struct page *subpage = head + i;
Matthew Wilcox (Oracle)435a7552024-02-28 16:42:36 +00003279 struct folio *new_folio = page_folio(subpage);
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003280 if (subpage == page)
3281 continue;
Matthew Wilcox (Oracle)435a7552024-02-28 16:42:36 +00003282 folio_unlock(new_folio);
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003283
3284 /*
3285		 * Subpages may be freed if there wasn't any mapping left,
3286		 * e.g. if add_to_swap() is running on an lru page that
3287		 * had its mapping zapped. Freeing these pages requires
3288		 * taking the lru_lock, so we do the put_page of the tail
3289		 * pages after the split is complete.
3290 */
Miaohe Lin0b175462022-07-04 21:21:56 +08003291 free_page_and_swap_cache(subpage);
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003292 }
3293}
3294
Huang Yingb8f593c2017-07-06 15:37:28 -07003295/* Racy check whether the huge page can be split */
David Hildenbrand8710f6e2024-08-02 17:55:20 +02003296bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
Huang Yingb8f593c2017-07-06 15:37:28 -07003297{
3298 int extra_pins;
3299
Matthew Wilcoxaa5dc072017-12-04 10:16:10 -05003300 /* Additional pins from page cache */
Matthew Wilcox (Oracle)d4b40842022-02-04 14:13:31 -05003301 if (folio_test_anon(folio))
3302 extra_pins = folio_test_swapcache(folio) ?
3303 folio_nr_pages(folio) : 0;
Huang Yingb8f593c2017-07-06 15:37:28 -07003304 else
Matthew Wilcox (Oracle)d4b40842022-02-04 14:13:31 -05003305 extra_pins = folio_nr_pages(folio);
Huang Yingb8f593c2017-07-06 15:37:28 -07003306 if (pextra_pins)
3307 *pextra_pins = extra_pins;
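	/*
	 * Splittable only if the remaining references are the mappings, the
	 * page/swap cache pins and the caller's own pins.
	 */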
David Hildenbrand8710f6e2024-08-02 17:55:20 +02003308 return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
3309 caller_pins;
Huang Yingb8f593c2017-07-06 15:37:28 -07003310}
3311
Andrea Arcangeli6d0a07e2016-05-12 15:42:25 -07003312/*
John Hubbarda8353dc2024-03-24 21:44:52 -07003313 * This function splits a large folio into smaller folios of order @new_order.
3314 * @page can point to any page of the large folio to split. The split operation
3315 * does not change the position of @page.
Zi Yanc010d472024-02-26 15:55:33 -05003316 *
John Hubbarda8353dc2024-03-24 21:44:52 -07003317 * Prerequisites:
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003318 *
John Hubbarda8353dc2024-03-24 21:44:52 -07003319 * 1) The caller must hold a reference on the @page's owning folio, also known
3320 * as the large folio.
3321 *
3322 * 2) The large folio must be locked.
3323 *
3324 * 3) The folio must not be pinned. Any unexpected folio references, including
3325 * GUP pins, will result in the folio not getting split; instead, the caller
David Hildenbrandd21f9962024-04-18 17:18:34 +02003326 * will receive an -EAGAIN.
John Hubbarda8353dc2024-03-24 21:44:52 -07003327 *
3328 * 4) @new_order > 1, usually. Splitting to order-1 is not supported for
3329 * anonymous (non-file-backed) folios, because folio->_deferred_list, which
3330 * is used by partially mapped folios, is stored in subpage 2, but an order-1
3331 * folio only has subpages 0 and 1. File-backed order-1 folios are supported,
3332 * since they do not use _deferred_list.
3333 *
3334 * After splitting, the caller's folio reference will be transferred to @page,
3335 * resulting in a raised refcount of @page after this call. The other pages may
3336 * be freed if they are not mapped.
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003337 *
3338 * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
3339 *
John Hubbarda8353dc2024-03-24 21:44:52 -07003340 * Pages in @new_order will inherit the mapping, flags, and so on from the
3341 * huge page.
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003342 *
John Hubbarda8353dc2024-03-24 21:44:52 -07003343 * Returns 0 if the huge page was split successfully.
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003344 *
David Hildenbrandd21f9962024-04-18 17:18:34 +02003345 * Returns -EAGAIN if the folio has unexpected reference (e.g., GUP) or if
3346 * the folio was concurrently removed from the page cache.
3347 *
3348 * Returns -EBUSY when trying to split the huge zeropage, if the folio is
3349 * under writeback, if fs-specific folio metadata cannot currently be
3350 * released, or if some unexpected race happened (e.g., anon VMA disappeared,
3351 * truncation).
3352 *
Luis Chamberlaine2209172024-08-22 15:50:12 +02003353 * For non-anonymous folios, callers should ensure that @new_order is not
3354 * below the address space mapping's min-order, if one is set.
3355 *
David Hildenbrandd21f9962024-04-18 17:18:34 +02003356 * Returns -EINVAL when trying to split to an order that is incompatible
3357 * with the folio. Splitting to order 0 is compatible with all folios.
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003358 */
Zi Yanc010d472024-02-26 15:55:33 -05003359int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
3360 unsigned int new_order)
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003361{
Matthew Wilcox (Oracle)4eecb8b2022-01-28 23:32:59 -05003362 struct folio *folio = page_folio(page);
Matthew Wilcox (Oracle)f8baa6b2023-01-11 14:29:12 +00003363 struct deferred_split *ds_queue = get_deferred_split_queue(folio);
Zi Yanc010d472024-02-26 15:55:33 -05003364 /* reset xarray order to new order after split */
3365 XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
Barry Song5d65c8d2024-08-24 13:04:40 +12003366 bool is_anon = folio_test_anon(folio);
Kirill A. Shutemovbaa355f2016-07-26 15:25:51 -07003367 struct address_space *mapping = NULL;
Barry Song5d65c8d2024-08-24 13:04:40 +12003368 struct anon_vma *anon_vma = NULL;
Lance Yangf216c842024-06-28 21:07:49 +08003369 int order = folio_order(folio);
Yang Shi504e0702021-06-15 18:24:07 -07003370 int extra_pins, ret;
Hugh Dickins006d3ff2018-11-30 14:10:21 -08003371 pgoff_t end;
Xu Yu478d1342022-04-28 23:14:43 -07003372 bool is_hzp;
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003373
Matthew Wilcox (Oracle)3e9a13d2022-09-02 20:46:48 +01003374 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
3375 VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003376
Zi Yan1412ecb2024-03-07 13:18:53 -05003377 if (new_order >= folio_order(folio))
3378 return -EINVAL;
3379
Barry Song5d65c8d2024-08-24 13:04:40 +12003380 if (is_anon) {
Ran Xiaokai6a50c9b2024-06-07 17:40:48 +08003381 /* order-1 is not supported for anonymous THP. */
3382 if (new_order == 1) {
3383 VM_WARN_ONCE(1, "Cannot split to order-1 folio");
Zi Yanc010d472024-02-26 15:55:33 -05003384 return -EINVAL;
Ran Xiaokai6a50c9b2024-06-07 17:40:48 +08003385 }
3386 } else if (new_order) {
Zi Yanc010d472024-02-26 15:55:33 -05003387 /* Split shmem folio to non-zero order not supported */
3388 if (shmem_mapping(folio->mapping)) {
3389 VM_WARN_ONCE(1,
3390 "Cannot split shmem folio to non-0 order");
3391 return -EINVAL;
3392 }
Ran Xiaokai6a50c9b2024-06-07 17:40:48 +08003393 /*
3394		 * No split if the file system does not support large folios.
3395 * Note that we might still have THPs in such mappings due to
3396 * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping
3397 * does not actually support large folios properly.
3398 */
3399 if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
3400 !mapping_large_folio_support(folio->mapping)) {
Zi Yanc010d472024-02-26 15:55:33 -05003401 VM_WARN_ONCE(1,
3402 "Cannot split file folio to non-0 order");
3403 return -EINVAL;
3404 }
3405 }
3406
Ran Xiaokai6a50c9b2024-06-07 17:40:48 +08003407 /* Only swapping a whole PMD-mapped folio is supported */
3408 if (folio_test_swapcache(folio) && new_order)
3409 return -EINVAL;
Zi Yanc010d472024-02-26 15:55:33 -05003410
Matthew Wilcox (Oracle)5beaee52024-03-26 20:28:22 +00003411 is_hzp = is_huge_zero_folio(folio);
Naoya Horiguchi4737edb2023-04-06 17:20:04 +09003412 if (is_hzp) {
3413 pr_warn_ratelimited("Called split_huge_page for huge zero page\n");
Xu Yu478d1342022-04-28 23:14:43 -07003414 return -EBUSY;
Naoya Horiguchi4737edb2023-04-06 17:20:04 +09003415 }
Xu Yu478d1342022-04-28 23:14:43 -07003416
Matthew Wilcox (Oracle)3e9a13d2022-09-02 20:46:48 +01003417 if (folio_test_writeback(folio))
Huang Ying59807682017-09-06 16:22:34 -07003418 return -EBUSY;
3419
Barry Song5d65c8d2024-08-24 13:04:40 +12003420 if (is_anon) {
Kirill A. Shutemovbaa355f2016-07-26 15:25:51 -07003421 /*
Michel Lespinassec1e8d7c2020-06-08 21:33:54 -07003422 * The caller does not necessarily hold an mmap_lock that would
Kirill A. Shutemovbaa355f2016-07-26 15:25:51 -07003423		 * prevent the anon_vma from disappearing, so we first take a
3424 * reference to it and then lock the anon_vma for write. This
Matthew Wilcox (Oracle)2f031c62022-01-29 16:06:53 -05003425 * is similar to folio_lock_anon_vma_read except the write lock
Kirill A. Shutemovbaa355f2016-07-26 15:25:51 -07003426 * is taken to serialise against parallel split or collapse
3427 * operations.
3428 */
Matthew Wilcox (Oracle)29eea9b2022-09-02 20:46:50 +01003429 anon_vma = folio_get_anon_vma(folio);
Kirill A. Shutemovbaa355f2016-07-26 15:25:51 -07003430 if (!anon_vma) {
3431 ret = -EBUSY;
3432 goto out;
3433 }
Hugh Dickins006d3ff2018-11-30 14:10:21 -08003434 end = -1;
Kirill A. Shutemovbaa355f2016-07-26 15:25:51 -07003435 mapping = NULL;
3436 anon_vma_lock_write(anon_vma);
3437 } else {
Luis Chamberlaine2209172024-08-22 15:50:12 +02003438 unsigned int min_order;
Yin Fengwei6a3edd292022-08-10 14:49:07 +08003439 gfp_t gfp;
3440
Matthew Wilcox (Oracle)3e9a13d2022-09-02 20:46:48 +01003441 mapping = folio->mapping;
Kirill A. Shutemovbaa355f2016-07-26 15:25:51 -07003442
3443 /* Truncated ? */
3444 if (!mapping) {
3445 ret = -EBUSY;
3446 goto out;
3447 }
3448
Luis Chamberlaine2209172024-08-22 15:50:12 +02003449 min_order = mapping_min_folio_order(folio->mapping);
3450 if (new_order < min_order) {
3451 VM_WARN_ONCE(1, "Cannot split mapped folio below min-order: %u",
3452 min_order);
3453 ret = -EINVAL;
3454 goto out;
3455 }
3456
Yin Fengwei6a3edd292022-08-10 14:49:07 +08003457 gfp = current_gfp_context(mapping_gfp_mask(mapping) &
3458 GFP_RECLAIM_MASK);
3459
David Howells0201ebf2023-06-28 11:48:51 +01003460 if (!filemap_release_folio(folio, gfp)) {
Yin Fengwei6a3edd292022-08-10 14:49:07 +08003461 ret = -EBUSY;
3462 goto out;
3463 }
3464
Matthew Wilcox (Oracle)3e9a13d2022-09-02 20:46:48 +01003465 xas_split_alloc(&xas, folio, folio_order(folio), gfp);
Matthew Wilcox (Oracle)6b24ca42020-06-27 22:19:08 -04003466 if (xas_error(&xas)) {
3467 ret = xas_error(&xas);
3468 goto out;
3469 }
3470
Kirill A. Shutemovbaa355f2016-07-26 15:25:51 -07003471 anon_vma = NULL;
3472 i_mmap_lock_read(mapping);
Hugh Dickins006d3ff2018-11-30 14:10:21 -08003473
3474 /*
3475		 * __split_huge_page() may need to trim off pages beyond EOF:
3476 * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
3477 * which cannot be nested inside the page tree lock. So note
3478 * end now: i_size itself may be changed at any moment, but
Matthew Wilcox (Oracle)3e9a13d2022-09-02 20:46:48 +01003479 * folio lock is good enough to serialize the trimming.
Hugh Dickins006d3ff2018-11-30 14:10:21 -08003480 */
3481 end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
Hugh Dickinsd144bf62021-09-02 14:54:21 -07003482 if (shmem_mapping(mapping))
3483 end = shmem_fallocend(mapping->host, end);
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003484 }
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003485
3486 /*
Matthew Wilcox (Oracle)684555a2022-09-02 20:46:49 +01003487	 * Racy check whether we can split the folio, before unmap_folio()
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003488	 * splits the PMDs.
3489 */
David Hildenbrand8710f6e2024-08-02 17:55:20 +02003490 if (!can_split_folio(folio, 1, &extra_pins)) {
Baolin Wangfd4a7ac2022-10-24 16:34:22 +08003491 ret = -EAGAIN;
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003492 goto out_unlock;
3493 }
3494
Matthew Wilcox (Oracle)684555a2022-09-02 20:46:49 +01003495 unmap_folio(folio);
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003496
Alex Shib6769832020-12-15 12:33:33 -08003497 /* block interrupt reentry in xa_lock and spinlock */
3498 local_irq_disable();
Kirill A. Shutemovbaa355f2016-07-26 15:25:51 -07003499 if (mapping) {
Kirill A. Shutemovbaa355f2016-07-26 15:25:51 -07003500 /*
Matthew Wilcox (Oracle)3e9a13d2022-09-02 20:46:48 +01003501 * Check if the folio is present in page cache.
3502		 * We assume all tail pages are present too, if the folio is there.
Kirill A. Shutemovbaa355f2016-07-26 15:25:51 -07003503 */
Matthew Wilcox (Oracle)6b24ca42020-06-27 22:19:08 -04003504 xas_lock(&xas);
3505 xas_reset(&xas);
Matthew Wilcox (Oracle)3e9a13d2022-09-02 20:46:48 +01003506 if (xas_load(&xas) != folio)
Kirill A. Shutemovbaa355f2016-07-26 15:25:51 -07003507 goto fail;
3508 }
3509
Joonsoo Kim0139aa72016-05-19 17:10:49 -07003510 /* Prevent deferred_split_scan() touching ->_refcount */
Yang Shi364c1ee2019-09-23 15:38:06 -07003511 spin_lock(&ds_queue->split_queue_lock);
Matthew Wilcox (Oracle)3e9a13d2022-09-02 20:46:48 +01003512 if (folio_ref_freeze(folio, 1 + extra_pins)) {
Matthew Wilcox (Oracle)88972772024-02-26 15:55:28 -05003513 if (folio_order(folio) > 1 &&
3514 !list_empty(&folio->_deferred_list)) {
Yang Shi364c1ee2019-09-23 15:38:06 -07003515 ds_queue->split_queue_len--;
Usama Arif8422acd2024-08-30 11:03:38 +01003516 if (folio_test_partially_mapped(folio)) {
3517 __folio_clear_partially_mapped(folio);
3518 mod_mthp_stat(folio_order(folio),
3519 MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
3520 }
Zi Yanc010d472024-02-26 15:55:33 -05003521 /*
3522 * Reinitialize page_deferred_list after removing the
3523 * page from the split_queue, otherwise a subsequent
3524 * split will see list corruption when checking the
3525 * page_deferred_list.
3526 */
3527 list_del_init(&folio->_deferred_list);
Kirill A. Shutemov9a982252016-01-15 16:54:17 -08003528 }
Wei Yangafb97172020-01-30 22:14:35 -08003529 spin_unlock(&ds_queue->split_queue_lock);
Kirill A. Shutemov06d3eff2019-10-18 20:20:30 -07003530 if (mapping) {
Matthew Wilcox (Oracle)3e9a13d2022-09-02 20:46:48 +01003531 int nr = folio_nr_pages(folio);
Muchun Songbf9ecea2021-02-24 12:03:27 -08003532
Matthew Wilcox (Oracle)3e9a13d2022-09-02 20:46:48 +01003533 xas_split(&xas, folio, folio_order(folio));
Zi Yanc010d472024-02-26 15:55:33 -05003534 if (folio_test_pmd_mappable(folio) &&
3535 new_order < HPAGE_PMD_ORDER) {
Stefan Roescha48d5bd2023-11-06 10:19:18 -08003536 if (folio_test_swapbacked(folio)) {
3537 __lruvec_stat_mod_folio(folio,
3538 NR_SHMEM_THPS, -nr);
3539 } else {
3540 __lruvec_stat_mod_folio(folio,
3541 NR_FILE_THPS, -nr);
3542 filemap_nr_thps_dec(mapping);
3543 }
Marek Szyprowski1ca75542021-10-18 15:16:19 -07003544 }
Kirill A. Shutemov06d3eff2019-10-18 20:20:30 -07003545 }
3546
Barry Song5d65c8d2024-08-24 13:04:40 +12003547 if (is_anon) {
3548 mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
3549 mod_mthp_stat(new_order, MTHP_STAT_NR_ANON, 1 << (order - new_order));
3550 }
Zi Yanc010d472024-02-26 15:55:33 -05003551 __split_huge_page(page, list, end, new_order);
Huang Yingc4f9c702020-10-15 20:06:07 -07003552 ret = 0;
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003553 } else {
Yang Shi364c1ee2019-09-23 15:38:06 -07003554 spin_unlock(&ds_queue->split_queue_lock);
Yang Shi504e0702021-06-15 18:24:07 -07003555fail:
3556 if (mapping)
Matthew Wilcox (Oracle)6b24ca42020-06-27 22:19:08 -04003557 xas_unlock(&xas);
Alex Shib6769832020-12-15 12:33:33 -08003558 local_irq_enable();
Yu Zhaob1f20202024-08-30 11:03:36 +01003559 remap_page(folio, folio_nr_pages(folio), 0);
Baolin Wangfd4a7ac2022-10-24 16:34:22 +08003560 ret = -EAGAIN;
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003561 }
3562
3563out_unlock:
Kirill A. Shutemovbaa355f2016-07-26 15:25:51 -07003564 if (anon_vma) {
3565 anon_vma_unlock_write(anon_vma);
3566 put_anon_vma(anon_vma);
3567 }
3568 if (mapping)
3569 i_mmap_unlock_read(mapping);
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003570out:
Matthew Wilcox (Oracle)69a37a82022-06-08 15:18:34 -04003571 xas_destroy(&xas);
Lance Yangf216c842024-06-28 21:07:49 +08003572 if (order == HPAGE_PMD_ORDER)
Baolin Wang835c3a22024-03-29 14:59:33 +08003573 count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
Lance Yangf216c842024-06-28 21:07:49 +08003574 count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED);
Kirill A. Shutemove9b61f12016-01-15 16:54:10 -08003575 return ret;
3576}
Kirill A. Shutemov9a982252016-01-15 16:54:17 -08003577
Luis Chamberlaine2209172024-08-22 15:50:12 +02003578int min_order_for_split(struct folio *folio)
3579{
3580 if (folio_test_anon(folio))
3581 return 0;
3582
3583 if (!folio->mapping) {
3584 if (folio_test_pmd_mappable(folio))
3585 count_vm_event(THP_SPLIT_PAGE_FAILED);
3586 return -EBUSY;
3587 }
3588
3589 return mapping_min_folio_order(folio->mapping);
3590}
3591
3592int split_folio_to_list(struct folio *folio, struct list_head *list)
3593{
3594 int ret = min_order_for_split(folio);
3595
3596 if (ret < 0)
3597 return ret;
3598
3599 return split_huge_page_to_list_to_order(&folio->page, list, ret);
3600}
3601
Kefeng Wang593a10d2024-05-21 21:03:15 +08003602void __folio_undo_large_rmappable(struct folio *folio)
Kirill A. Shutemov9a982252016-01-15 16:54:17 -08003603{
Matthew Wilcox (Oracle)8dc4a8f2023-08-16 16:11:52 +01003604 struct deferred_split *ds_queue;
Kirill A. Shutemov9a982252016-01-15 16:54:17 -08003605 unsigned long flags;
3606
Matthew Wilcox (Oracle)8dc4a8f2023-08-16 16:11:52 +01003607 ds_queue = get_deferred_split_queue(folio);
3608 spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
3609 if (!list_empty(&folio->_deferred_list)) {
3610 ds_queue->split_queue_len--;
Usama Arif8422acd2024-08-30 11:03:38 +01003611 if (folio_test_partially_mapped(folio)) {
3612 __folio_clear_partially_mapped(folio);
3613 mod_mthp_stat(folio_order(folio),
3614 MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
3615 }
Baolin Wang9bcef592023-12-20 14:51:40 +08003616 list_del_init(&folio->_deferred_list);
Kirill A. Shutemov9a982252016-01-15 16:54:17 -08003617 }
Matthew Wilcox (Oracle)8dc4a8f2023-08-16 16:11:52 +01003618 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
Kirill A. Shutemov9a982252016-01-15 16:54:17 -08003619}
3620
Usama Arif8422acd2024-08-30 11:03:38 +01003621/* partially_mapped=false won't clear PG_partially_mapped folio flag */
3622void deferred_split_folio(struct folio *folio, bool partially_mapped)
Kirill A. Shutemov9a982252016-01-15 16:54:17 -08003623{
Matthew Wilcox (Oracle)f8baa6b2023-01-11 14:29:12 +00003624 struct deferred_split *ds_queue = get_deferred_split_queue(folio);
Yang Shi87eaceb2019-09-23 15:38:15 -07003625#ifdef CONFIG_MEMCG
Matthew Wilcox (Oracle)8991de92023-01-11 14:29:11 +00003626 struct mem_cgroup *memcg = folio_memcg(folio);
Yang Shi87eaceb2019-09-23 15:38:15 -07003627#endif
Kirill A. Shutemov9a982252016-01-15 16:54:17 -08003628 unsigned long flags;
3629
Matthew Wilcox (Oracle)88972772024-02-26 15:55:28 -05003630 /*
3631 * Order 1 folios have no space for a deferred list, but we also
3632 * won't waste much memory by not adding them to the deferred list.
3633 */
3634 if (folio_order(folio) <= 1)
3635 return;
Kirill A. Shutemov9a982252016-01-15 16:54:17 -08003636
Usama Arif81d3ff32024-08-30 11:03:40 +01003637 if (!partially_mapped && !split_underused_thp)
3638 return;
3639
Yang Shi87eaceb2019-09-23 15:38:15 -07003640 /*
3641	 * The try_to_unmap() in the page reclaim path might reach here too;
3642	 * this may race with us and corrupt the deferred split queue.
Matthew Wilcox (Oracle)8991de92023-01-11 14:29:11 +00003643 * And, if page reclaim is already handling the same folio, it is
Yang Shi87eaceb2019-09-23 15:38:15 -07003644 * unnecessary to handle it again in shrinker.
3645 *
Matthew Wilcox (Oracle)8991de92023-01-11 14:29:11 +00003646 * Check the swapcache flag to determine if the folio is being
3647 * handled by page reclaim since THP swap would add the folio into
Yang Shi87eaceb2019-09-23 15:38:15 -07003648 * swap cache before calling try_to_unmap().
3649 */
Matthew Wilcox (Oracle)8991de92023-01-11 14:29:11 +00003650 if (folio_test_swapcache(folio))
Yang Shi87eaceb2019-09-23 15:38:15 -07003651 return;
3652
Yang Shi364c1ee2019-09-23 15:38:06 -07003653 spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
Usama Arif8422acd2024-08-30 11:03:38 +01003654 if (partially_mapped) {
3655 if (!folio_test_partially_mapped(folio)) {
3656 __folio_set_partially_mapped(folio);
3657 if (folio_test_pmd_mappable(folio))
3658 count_vm_event(THP_DEFERRED_SPLIT_PAGE);
3659 count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
3660 mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, 1);
3661
3662 }
3663 } else {
3664 /* partially mapped folios cannot become non-partially mapped */
3665 VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
3666 }
Matthew Wilcox (Oracle)8991de92023-01-11 14:29:11 +00003667 if (list_empty(&folio->_deferred_list)) {
Matthew Wilcox (Oracle)8991de92023-01-11 14:29:11 +00003668 list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
Yang Shi364c1ee2019-09-23 15:38:06 -07003669 ds_queue->split_queue_len++;
Yang Shi87eaceb2019-09-23 15:38:15 -07003670#ifdef CONFIG_MEMCG
3671 if (memcg)
Matthew Wilcox (Oracle)8991de92023-01-11 14:29:11 +00003672 set_shrinker_bit(memcg, folio_nid(folio),
Qi Zheng54d91722023-09-11 17:44:16 +08003673 deferred_split_shrinker->id);
Yang Shi87eaceb2019-09-23 15:38:15 -07003674#endif
Kirill A. Shutemov9a982252016-01-15 16:54:17 -08003675 }
Yang Shi364c1ee2019-09-23 15:38:06 -07003676 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
Kirill A. Shutemov9a982252016-01-15 16:54:17 -08003677}
3678
3679static unsigned long deferred_split_count(struct shrinker *shrink,
3680 struct shrink_control *sc)
3681{
Kirill A. Shutemova3d0a9182016-02-02 16:57:08 -08003682 struct pglist_data *pgdata = NODE_DATA(sc->nid);
Yang Shi364c1ee2019-09-23 15:38:06 -07003683 struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
Yang Shi87eaceb2019-09-23 15:38:15 -07003684
3685#ifdef CONFIG_MEMCG
3686 if (sc->memcg)
3687 ds_queue = &sc->memcg->deferred_split_queue;
3688#endif
Yang Shi364c1ee2019-09-23 15:38:06 -07003689 return READ_ONCE(ds_queue->split_queue_len);
Kirill A. Shutemov9a982252016-01-15 16:54:17 -08003690}
3691
Usama Arifdafff3f2024-08-30 11:03:39 +01003692static bool thp_underused(struct folio *folio)
3693{
3694 int num_zero_pages = 0, num_filled_pages = 0;
3695 void *kaddr;
3696 int i;
3697
3698 if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
3699 return false;
3700
3701 for (i = 0; i < folio_nr_pages(folio); i++) {
3702 kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
3703 if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
3704 num_zero_pages++;
3705 if (num_zero_pages > khugepaged_max_ptes_none) {
3706 kunmap_local(kaddr);
3707 return true;
3708 }
3709 } else {
3710 /*
3711 * Another path for early exit once the number
3712			 * of non-zero filled pages exceeds the threshold.
3713 */
3714 num_filled_pages++;
3715 if (num_filled_pages >= HPAGE_PMD_NR - khugepaged_max_ptes_none) {
3716 kunmap_local(kaddr);
3717 return false;
3718 }
3719 }
3720 kunmap_local(kaddr);
3721 }
3722 return false;
3723}
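
/*
 * Note on the thresholds above (numbers are illustrative, assuming the
 * common HPAGE_PMD_NR of 512): with khugepaged_max_ptes_none at its default
 * of HPAGE_PMD_NR - 1 (511 here), the function bails out immediately, so
 * underused splitting is effectively disabled. If max_ptes_none were tuned
 * to, say, 64, a PMD-sized folio would be reported as underused once more
 * than 64 of its pages are found zero-filled, and the scan stops early once
 * 448 (512 - 64) non-zero pages have been seen.
 */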
3724
Kirill A. Shutemov9a982252016-01-15 16:54:17 -08003725static unsigned long deferred_split_scan(struct shrinker *shrink,
3726 struct shrink_control *sc)
3727{
Kirill A. Shutemova3d0a9182016-02-02 16:57:08 -08003728 struct pglist_data *pgdata = NODE_DATA(sc->nid);
Yang Shi364c1ee2019-09-23 15:38:06 -07003729 struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
Kirill A. Shutemov9a982252016-01-15 16:54:17 -08003730 unsigned long flags;
Matthew Wilcox (Oracle)4375a552023-01-11 14:29:10 +00003731 LIST_HEAD(list);
3732 struct folio *folio, *next;
Kirill A. Shutemov9a982252016-01-15 16:54:17 -08003733 int split = 0;
3734
Yang Shi87eaceb2019-09-23 15:38:15 -07003735#ifdef CONFIG_MEMCG
3736 if (sc->memcg)
3737 ds_queue = &sc->memcg->deferred_split_queue;
3738#endif
3739
Yang Shi364c1ee2019-09-23 15:38:06 -07003740 spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
Kirill A. Shutemov9a982252016-01-15 16:54:17 -08003741 /* Take pin on all head pages to avoid freeing them under us */
Matthew Wilcox (Oracle)4375a552023-01-11 14:29:10 +00003742 list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
3743 _deferred_list) {
3744 if (folio_try_get(folio)) {
3745 list_move(&folio->_deferred_list, &list);
Kirill A. Shutemove3ae1952016-02-02 16:57:15 -08003746 } else {
Matthew Wilcox (Oracle)4375a552023-01-11 14:29:10 +00003747 /* We lost race with folio_put() */
Usama Arif8422acd2024-08-30 11:03:38 +01003748 if (folio_test_partially_mapped(folio)) {
3749 __folio_clear_partially_mapped(folio);
3750 mod_mthp_stat(folio_order(folio),
3751 MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
3752 }
Matthew Wilcox (Oracle)4375a552023-01-11 14:29:10 +00003753 list_del_init(&folio->_deferred_list);
Yang Shi364c1ee2019-09-23 15:38:06 -07003754 ds_queue->split_queue_len--;
Kirill A. Shutemov9a982252016-01-15 16:54:17 -08003755 }
Kirill A. Shutemove3ae1952016-02-02 16:57:15 -08003756 if (!--sc->nr_to_scan)
3757 break;
Kirill A. Shutemov9a982252016-01-15 16:54:17 -08003758 }
Yang Shi364c1ee2019-09-23 15:38:06 -07003759 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
Kirill A. Shutemov9a982252016-01-15 16:54:17 -08003760
Matthew Wilcox (Oracle)4375a552023-01-11 14:29:10 +00003761 list_for_each_entry_safe(folio, next, &list, _deferred_list) {
Usama Arifdafff3f2024-08-30 11:03:39 +01003762 bool did_split = false;
3763 bool underused = false;
3764
3765 if (!folio_test_partially_mapped(folio)) {
3766 underused = thp_underused(folio);
3767 if (!underused)
3768 goto next;
3769 }
Matthew Wilcox (Oracle)4375a552023-01-11 14:29:10 +00003770 if (!folio_trylock(folio))
Kirill A. Shutemovfa41b902018-03-22 16:17:31 -07003771 goto next;
Usama Arifdafff3f2024-08-30 11:03:39 +01003772 if (!split_folio(folio)) {
3773 did_split = true;
3774 if (underused)
3775 count_vm_event(THP_UNDERUSED_SPLIT_PAGE);
Kirill A. Shutemov9a982252016-01-15 16:54:17 -08003776 split++;
Usama Arifdafff3f2024-08-30 11:03:39 +01003777 }
Matthew Wilcox (Oracle)4375a552023-01-11 14:29:10 +00003778 folio_unlock(folio);
Kirill A. Shutemovfa41b902018-03-22 16:17:31 -07003779next:
Usama Arifdafff3f2024-08-30 11:03:39 +01003780 /*
3781		 * split_folio() removes the folio from the list on success.
3782		 * Only add it back to the queue if it is still partially mapped.
3783		 * If thp_underused() returned false, or if split_folio() failed
3784		 * on an underused folio, consider the folio used and do not add
3785		 * it back to the split_queue.
3786 */
3787 if (!did_split && !folio_test_partially_mapped(folio)) {
3788 list_del_init(&folio->_deferred_list);
3789 ds_queue->split_queue_len--;
3790 }
Matthew Wilcox (Oracle)4375a552023-01-11 14:29:10 +00003791 folio_put(folio);
Kirill A. Shutemov9a982252016-01-15 16:54:17 -08003792 }
3793
Yang Shi364c1ee2019-09-23 15:38:06 -07003794 spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
3795 list_splice_tail(&list, &ds_queue->split_queue);
3796 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
Kirill A. Shutemov9a982252016-01-15 16:54:17 -08003797
Kirill A. Shutemovcb8d68e2016-02-02 16:57:12 -08003798 /*
3799	 * Stop the shrinker if we didn't split any page and the queue is empty.
3800 * This can happen if pages were freed under us.
3801 */
Yang Shi364c1ee2019-09-23 15:38:06 -07003802 if (!split && list_empty(&ds_queue->split_queue))
Kirill A. Shutemovcb8d68e2016-02-02 16:57:12 -08003803 return SHRINK_STOP;
3804 return split;
Kirill A. Shutemov9a982252016-01-15 16:54:17 -08003805}
3806
Kirill A. Shutemov49071d42016-01-15 16:54:40 -08003807#ifdef CONFIG_DEBUG_FS
Zi Yanfa6c0232021-05-04 18:34:23 -07003808static void split_huge_pages_all(void)
Kirill A. Shutemov49071d42016-01-15 16:54:40 -08003809{
3810 struct zone *zone;
3811 struct page *page;
Kefeng Wang630e7c52022-12-29 20:25:03 +08003812 struct folio *folio;
Kirill A. Shutemov49071d42016-01-15 16:54:40 -08003813 unsigned long pfn, max_zone_pfn;
3814 unsigned long total = 0, split = 0;
3815
Zi Yanfa6c0232021-05-04 18:34:23 -07003816 pr_debug("Split all THPs\n");
Miaohe Lina17206d2022-07-04 21:21:57 +08003817 for_each_zone(zone) {
3818 if (!managed_zone(zone))
3819 continue;
Kirill A. Shutemov49071d42016-01-15 16:54:40 -08003820 max_zone_pfn = zone_end_pfn(zone);
3821 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
Miaohe Lina17206d2022-07-04 21:21:57 +08003822 int nr_pages;
Kirill A. Shutemov49071d42016-01-15 16:54:40 -08003823
Naoya Horiguchi2b7aa912022-09-08 13:11:50 +09003824 page = pfn_to_online_page(pfn);
Kefeng Wang630e7c52022-12-29 20:25:03 +08003825 if (!page || PageTail(page))
3826 continue;
3827 folio = page_folio(page);
3828 if (!folio_try_get(folio))
Kirill A. Shutemov49071d42016-01-15 16:54:40 -08003829 continue;
3830
Kefeng Wang630e7c52022-12-29 20:25:03 +08003831 if (unlikely(page_folio(page) != folio))
Kirill A. Shutemov49071d42016-01-15 16:54:40 -08003832 goto next;
3833
Kefeng Wang630e7c52022-12-29 20:25:03 +08003834 if (zone != folio_zone(folio))
3835 goto next;
3836
3837 if (!folio_test_large(folio)
3838 || folio_test_hugetlb(folio)
3839 || !folio_test_lru(folio))
Kirill A. Shutemov49071d42016-01-15 16:54:40 -08003840 goto next;
3841
3842 total++;
Kefeng Wang630e7c52022-12-29 20:25:03 +08003843 folio_lock(folio);
3844 nr_pages = folio_nr_pages(folio);
3845 if (!split_folio(folio))
Kirill A. Shutemov49071d42016-01-15 16:54:40 -08003846 split++;
Miaohe Lina17206d2022-07-04 21:21:57 +08003847 pfn += nr_pages - 1;
Kefeng Wang630e7c52022-12-29 20:25:03 +08003848 folio_unlock(folio);
Kirill A. Shutemov49071d42016-01-15 16:54:40 -08003849next:
Kefeng Wang630e7c52022-12-29 20:25:03 +08003850 folio_put(folio);
Zi Yanfa6c0232021-05-04 18:34:23 -07003851 cond_resched();
Kirill A. Shutemov49071d42016-01-15 16:54:40 -08003852 }
3853 }
3854
Zi Yanfa6c0232021-05-04 18:34:23 -07003855 pr_debug("%lu of %lu THP split\n", split, total);
Kirill A. Shutemov49071d42016-01-15 16:54:40 -08003856}
Zi Yanfa6c0232021-05-04 18:34:23 -07003857
3858static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
3859{
3860 return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) ||
3861 is_vm_hugetlb_page(vma);
3862}
3863
3864static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
Zi Yanfc4d1822024-02-26 15:55:34 -05003865 unsigned long vaddr_end, unsigned int new_order)
Zi Yanfa6c0232021-05-04 18:34:23 -07003866{
3867 int ret = 0;
3868 struct task_struct *task;
3869 struct mm_struct *mm;
3870 unsigned long total = 0, split = 0;
3871 unsigned long addr;
3872
3873 vaddr_start &= PAGE_MASK;
3874 vaddr_end &= PAGE_MASK;
3875
Nanyong Sune4bfc672024-09-05 23:30:28 +08003876 task = find_get_task_by_vpid(pid);
Zi Yanfa6c0232021-05-04 18:34:23 -07003877 if (!task) {
Zi Yanfa6c0232021-05-04 18:34:23 -07003878 ret = -ESRCH;
3879 goto out;
3880 }
Zi Yanfa6c0232021-05-04 18:34:23 -07003881
3882 /* Find the mm_struct */
3883 mm = get_task_mm(task);
3884 put_task_struct(task);
3885
3886 if (!mm) {
3887 ret = -EINVAL;
3888 goto out;
3889 }
3890
3891 pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
3892 pid, vaddr_start, vaddr_end);
3893
3894 mmap_read_lock(mm);
3895 /*
3896 * always increase addr by PAGE_SIZE, since we could have a PTE page
3897 * table filled with PTE-mapped THPs, each of which is distinct.
3898 */
3899 for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
Miaohe Lin74ba2b32022-07-04 21:21:52 +08003900 struct vm_area_struct *vma = vma_lookup(mm, addr);
David Hildenbrand8710f6e2024-08-02 17:55:20 +02003901 struct folio_walk fw;
Matthew Wilcox (Oracle)a644b0a2023-08-16 16:12:01 +01003902 struct folio *folio;
Luis Chamberlaine2209172024-08-22 15:50:12 +02003903 struct address_space *mapping;
3904 unsigned int target_order = new_order;
Zi Yanfa6c0232021-05-04 18:34:23 -07003905
Miaohe Lin74ba2b32022-07-04 21:21:52 +08003906 if (!vma)
Zi Yanfa6c0232021-05-04 18:34:23 -07003907 break;
3908
3909 /* skip special VMA and hugetlb VMA */
3910 if (vma_not_suitable_for_thp_split(vma)) {
3911 addr = vma->vm_end;
3912 continue;
3913 }
3914
David Hildenbrand8710f6e2024-08-02 17:55:20 +02003915 folio = folio_walk_start(&fw, vma, addr, 0);
3916 if (!folio)
Zi Yanfa6c0232021-05-04 18:34:23 -07003917 continue;
3918
Matthew Wilcox (Oracle)a644b0a2023-08-16 16:12:01 +01003919 if (!is_transparent_hugepage(folio))
Zi Yanfa6c0232021-05-04 18:34:23 -07003920 goto next;
3921
Luis Chamberlaine2209172024-08-22 15:50:12 +02003922 if (!folio_test_anon(folio)) {
3923 mapping = folio->mapping;
3924 target_order = max(new_order,
3925 mapping_min_folio_order(mapping));
3926 }
3927
3928 if (target_order >= folio_order(folio))
Zi Yan2394aef2024-03-07 13:18:54 -05003929 goto next;
3930
Zi Yanfa6c0232021-05-04 18:34:23 -07003931 total++;
Zi Yanfc4d1822024-02-26 15:55:34 -05003932 /*
3933		 * For folios with private data, split_huge_page_to_list_to_order()
3934		 * will try to drop it before the split and then check whether the
3935		 * folio can be split. So skip the check here.
3936 */
3937 if (!folio_test_private(folio) &&
David Hildenbrand8710f6e2024-08-02 17:55:20 +02003938 !can_split_folio(folio, 0, NULL))
Zi Yanfa6c0232021-05-04 18:34:23 -07003939 goto next;
3940
Matthew Wilcox (Oracle)a644b0a2023-08-16 16:12:01 +01003941 if (!folio_trylock(folio))
Zi Yanfa6c0232021-05-04 18:34:23 -07003942 goto next;
David Hildenbrand8710f6e2024-08-02 17:55:20 +02003943 folio_get(folio);
3944 folio_walk_end(&fw, vma);
Zi Yanfa6c0232021-05-04 18:34:23 -07003945
Luis Chamberlaine2209172024-08-22 15:50:12 +02003946 if (!folio_test_anon(folio) && folio->mapping != mapping)
3947 goto unlock;
3948
3949 if (!split_folio_to_order(folio, target_order))
Zi Yanfa6c0232021-05-04 18:34:23 -07003950 split++;
3951
Luis Chamberlaine2209172024-08-22 15:50:12 +02003952unlock:
3953
Matthew Wilcox (Oracle)a644b0a2023-08-16 16:12:01 +01003954 folio_unlock(folio);
Matthew Wilcox (Oracle)a644b0a2023-08-16 16:12:01 +01003955 folio_put(folio);
David Hildenbrand8710f6e2024-08-02 17:55:20 +02003956
3957 cond_resched();
3958 continue;
3959next:
3960 folio_walk_end(&fw, vma);
Zi Yanfa6c0232021-05-04 18:34:23 -07003961 cond_resched();
3962 }
3963 mmap_read_unlock(mm);
3964 mmput(mm);
3965
3966 pr_debug("%lu of %lu THP split\n", split, total);
3967
3968out:
3969 return ret;
3970}
3971
Zi Yanfbe37502021-05-04 18:34:26 -07003972static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
Zi Yanfc4d1822024-02-26 15:55:34 -05003973 pgoff_t off_end, unsigned int new_order)
Zi Yanfbe37502021-05-04 18:34:26 -07003974{
3975 struct filename *file;
3976 struct file *candidate;
3977 struct address_space *mapping;
3978 int ret = -EINVAL;
3979 pgoff_t index;
3980 int nr_pages = 1;
3981 unsigned long total = 0, split = 0;
Luis Chamberlaine2209172024-08-22 15:50:12 +02003982 unsigned int min_order;
3983 unsigned int target_order;
Zi Yanfbe37502021-05-04 18:34:26 -07003984
3985 file = getname_kernel(file_path);
3986 if (IS_ERR(file))
3987 return ret;
3988
3989 candidate = file_open_name(file, O_RDONLY, 0);
3990 if (IS_ERR(candidate))
3991 goto out;
3992
3993 pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
3994 file_path, off_start, off_end);
3995
3996 mapping = candidate->f_mapping;
Luis Chamberlaine2209172024-08-22 15:50:12 +02003997 min_order = mapping_min_folio_order(mapping);
3998 target_order = max(new_order, min_order);
Zi Yanfbe37502021-05-04 18:34:26 -07003999
4000 for (index = off_start; index < off_end; index += nr_pages) {
Christoph Hellwig1fb130b2023-03-07 15:34:04 +01004001 struct folio *folio = filemap_get_folio(mapping, index);
Zi Yanfbe37502021-05-04 18:34:26 -07004002
4003 nr_pages = 1;
Christoph Hellwig66dabbb2023-03-07 15:34:10 +01004004 if (IS_ERR(folio))
Zi Yanfbe37502021-05-04 18:34:26 -07004005 continue;
4006
Matthew Wilcox (Oracle)9ee2c082022-10-19 19:33:29 +01004007 if (!folio_test_large(folio))
Zi Yanfbe37502021-05-04 18:34:26 -07004008 goto next;
4009
4010 total++;
Matthew Wilcox (Oracle)9ee2c082022-10-19 19:33:29 +01004011 nr_pages = folio_nr_pages(folio);
Zi Yanfbe37502021-05-04 18:34:26 -07004012
Luis Chamberlaine2209172024-08-22 15:50:12 +02004013 if (target_order >= folio_order(folio))
Zi Yan2394aef2024-03-07 13:18:54 -05004014 goto next;
4015
Matthew Wilcox (Oracle)9ee2c082022-10-19 19:33:29 +01004016 if (!folio_trylock(folio))
Zi Yanfbe37502021-05-04 18:34:26 -07004017 goto next;
4018
Luis Chamberlaine2209172024-08-22 15:50:12 +02004019 if (folio->mapping != mapping)
4020 goto unlock;
4021
4022 if (!split_folio_to_order(folio, target_order))
Zi Yanfbe37502021-05-04 18:34:26 -07004023 split++;
4024
Luis Chamberlaine2209172024-08-22 15:50:12 +02004025unlock:
Matthew Wilcox (Oracle)9ee2c082022-10-19 19:33:29 +01004026 folio_unlock(folio);
Zi Yanfbe37502021-05-04 18:34:26 -07004027next:
Matthew Wilcox (Oracle)9ee2c082022-10-19 19:33:29 +01004028 folio_put(folio);
Zi Yanfbe37502021-05-04 18:34:26 -07004029 cond_resched();
4030 }
4031
4032 filp_close(candidate, NULL);
4033 ret = 0;
4034
4035 pr_debug("%lu of %lu file-backed THP split\n", split, total);
4036out:
4037 putname(file);
4038 return ret;
4039}
4040
Zi Yanfa6c0232021-05-04 18:34:23 -07004041#define MAX_INPUT_BUF_SZ 255
4042
4043static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
4044 size_t count, loff_t *ppops)
4045{
4046 static DEFINE_MUTEX(split_debug_mutex);
4047 ssize_t ret;
Zi Yanfc4d1822024-02-26 15:55:34 -05004048 /*
4049 * hold pid, start_vaddr, end_vaddr, new_order or
4050 * file_path, off_start, off_end, new_order
4051 */
Zi Yanfbe37502021-05-04 18:34:26 -07004052 char input_buf[MAX_INPUT_BUF_SZ];
Zi Yanfa6c0232021-05-04 18:34:23 -07004053 int pid;
4054 unsigned long vaddr_start, vaddr_end;
Zi Yanfc4d1822024-02-26 15:55:34 -05004055 unsigned int new_order = 0;
Zi Yanfa6c0232021-05-04 18:34:23 -07004056
4057 ret = mutex_lock_interruptible(&split_debug_mutex);
4058 if (ret)
4059 return ret;
4060
4061 ret = -EFAULT;
4062
4063 memset(input_buf, 0, MAX_INPUT_BUF_SZ);
4064 if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
4065 goto out;
4066
4067 input_buf[MAX_INPUT_BUF_SZ - 1] = '\0';
Zi Yanfbe37502021-05-04 18:34:26 -07004068
4069 if (input_buf[0] == '/') {
4070 char *tok;
4071 char *buf = input_buf;
4072 char file_path[MAX_INPUT_BUF_SZ];
4073 pgoff_t off_start = 0, off_end = 0;
4074 size_t input_len = strlen(input_buf);
4075
4076 tok = strsep(&buf, ",");
4077 if (tok) {
Matthew Wilcox (Oracle)1212e002021-06-30 18:52:11 -07004078 strcpy(file_path, tok);
Zi Yanfbe37502021-05-04 18:34:26 -07004079 } else {
4080 ret = -EINVAL;
4081 goto out;
4082 }
4083
Zi Yanfc4d1822024-02-26 15:55:34 -05004084 ret = sscanf(buf, "0x%lx,0x%lx,%d", &off_start, &off_end, &new_order);
4085 if (ret != 2 && ret != 3) {
Zi Yanfbe37502021-05-04 18:34:26 -07004086 ret = -EINVAL;
4087 goto out;
4088 }
Zi Yanfc4d1822024-02-26 15:55:34 -05004089 ret = split_huge_pages_in_file(file_path, off_start, off_end, new_order);
Zi Yanfbe37502021-05-04 18:34:26 -07004090 if (!ret)
4091 ret = input_len;
4092
4093 goto out;
4094 }
4095
Zi Yanfc4d1822024-02-26 15:55:34 -05004096 ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d", &pid, &vaddr_start, &vaddr_end, &new_order);
Zi Yanfa6c0232021-05-04 18:34:23 -07004097 if (ret == 1 && pid == 1) {
4098 split_huge_pages_all();
4099 ret = strlen(input_buf);
4100 goto out;
Zi Yanfc4d1822024-02-26 15:55:34 -05004101 } else if (ret != 3 && ret != 4) {
Zi Yanfa6c0232021-05-04 18:34:23 -07004102 ret = -EINVAL;
4103 goto out;
4104 }
4105
Zi Yanfc4d1822024-02-26 15:55:34 -05004106 ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order);
Zi Yanfa6c0232021-05-04 18:34:23 -07004107 if (!ret)
4108 ret = strlen(input_buf);
4109out:
4110 mutex_unlock(&split_debug_mutex);
4111 return ret;
4112
4113}
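
/*
 * Illustrative userspace sketch (not kernel code): one way a test program
 * might drive the debugfs interface implemented above. The mount point
 * /sys/kernel/debug and the helper name split_thps_in_range() are
 * assumptions for the example; the command format mirrors the sscanf()
 * calls in split_huge_pages_write(). Guarded by #if 0 so it is never built.
 */
#if 0
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int split_thps_in_range(int pid, unsigned long start, unsigned long end,
			       unsigned int new_order)
{
	char cmd[256];
	int fd;
	ssize_t ret;

	/* "<pid>,0x<start>,0x<end>,<new_order>"; writing just "1" splits all THPs */
	snprintf(cmd, sizeof(cmd), "%d,0x%lx,0x%lx,%u", pid, start, end, new_order);

	fd = open("/sys/kernel/debug/split_huge_pages", O_WRONLY);
	if (fd < 0)
		return -1;
	ret = write(fd, cmd, strlen(cmd));
	close(fd);
	return ret < 0 ? -1 : 0;
}
#endif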
4114
4115static const struct file_operations split_huge_pages_fops = {
4116 .owner = THIS_MODULE,
4117 .write = split_huge_pages_write,
Zi Yanfa6c0232021-05-04 18:34:23 -07004118};
Kirill A. Shutemov49071d42016-01-15 16:54:40 -08004119
4120static int __init split_huge_pages_debugfs(void)
4121{
Greg Kroah-Hartmand9f79792019-03-05 15:46:09 -08004122 debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
4123 &split_huge_pages_fops);
Kirill A. Shutemov49071d42016-01-15 16:54:40 -08004124 return 0;
4125}
4126late_initcall(split_huge_pages_debugfs);
4127#endif
Zi Yan616b8372017-09-08 16:10:57 -07004128
4129#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
David Hildenbrand7f5abe62022-05-09 18:20:44 -07004130int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
Zi Yan616b8372017-09-08 16:10:57 -07004131 struct page *page)
4132{
David Hildenbranda8e61d52023-12-20 23:44:49 +01004133 struct folio *folio = page_folio(page);
Zi Yan616b8372017-09-08 16:10:57 -07004134 struct vm_area_struct *vma = pvmw->vma;
4135 struct mm_struct *mm = vma->vm_mm;
4136 unsigned long address = pvmw->address;
David Hildenbrand6c287602022-05-09 18:20:44 -07004137 bool anon_exclusive;
Zi Yan616b8372017-09-08 16:10:57 -07004138 pmd_t pmdval;
4139 swp_entry_t entry;
Naoya Horiguchiab6e3d02017-09-08 16:11:04 -07004140 pmd_t pmdswp;
Zi Yan616b8372017-09-08 16:10:57 -07004141
4142 if (!(pvmw->pmd && !pvmw->pte))
David Hildenbrand7f5abe62022-05-09 18:20:44 -07004143 return 0;
Zi Yan616b8372017-09-08 16:10:57 -07004144
Zi Yan616b8372017-09-08 16:10:57 -07004145 flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
Huang Ying8a8683a2020-03-05 22:28:29 -08004146 pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
David Hildenbrand6c287602022-05-09 18:20:44 -07004147
David Hildenbrande3b4b132023-12-20 23:45:02 +01004148 /* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */
David Hildenbranda8e61d52023-12-20 23:44:49 +01004149 anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page);
David Hildenbrande3b4b132023-12-20 23:45:02 +01004150 if (anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) {
David Hildenbrand6c287602022-05-09 18:20:44 -07004151 set_pmd_at(mm, address, pvmw->pmd, pmdval);
David Hildenbrand7f5abe62022-05-09 18:20:44 -07004152 return -EBUSY;
David Hildenbrand6c287602022-05-09 18:20:44 -07004153 }
4154
Zi Yan616b8372017-09-08 16:10:57 -07004155 if (pmd_dirty(pmdval))
David Hildenbranddb44c652024-01-22 18:54:07 +01004156 folio_mark_dirty(folio);
Alistair Popple4dd845b2021-06-30 18:54:09 -07004157 if (pmd_write(pmdval))
4158 entry = make_writable_migration_entry(page_to_pfn(page));
David Hildenbrand6c287602022-05-09 18:20:44 -07004159 else if (anon_exclusive)
4160 entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
Alistair Popple4dd845b2021-06-30 18:54:09 -07004161 else
4162 entry = make_readable_migration_entry(page_to_pfn(page));
Peter Xu2e346872022-08-11 12:13:29 -04004163 if (pmd_young(pmdval))
4164 entry = make_migration_entry_young(entry);
4165 if (pmd_dirty(pmdval))
4166 entry = make_migration_entry_dirty(entry);
Naoya Horiguchiab6e3d02017-09-08 16:11:04 -07004167 pmdswp = swp_entry_to_pmd(entry);
4168 if (pmd_soft_dirty(pmdval))
4169 pmdswp = pmd_swp_mksoft_dirty(pmdswp);
David Hildenbrand24bf08c2023-04-05 18:02:35 +02004170 if (pmd_uffd_wp(pmdval))
4171 pmdswp = pmd_swp_mkuffd_wp(pmdswp);
Naoya Horiguchiab6e3d02017-09-08 16:11:04 -07004172 set_pmd_at(mm, address, pvmw->pmd, pmdswp);
David Hildenbranda8e61d52023-12-20 23:44:49 +01004173 folio_remove_rmap_pmd(folio, page, vma);
4174 folio_put(folio);
Anshuman Khandual283fd6f2022-03-24 18:09:58 -07004175 trace_set_migration_pmd(address, pmd_val(pmdswp));
David Hildenbrand7f5abe62022-05-09 18:20:44 -07004176
4177 return 0;
Zi Yan616b8372017-09-08 16:10:57 -07004178}
4179
4180void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
4181{
David Hildenbrand14d85a62023-12-20 23:44:33 +01004182 struct folio *folio = page_folio(new);
Zi Yan616b8372017-09-08 16:10:57 -07004183 struct vm_area_struct *vma = pvmw->vma;
4184 struct mm_struct *mm = vma->vm_mm;
4185 unsigned long address = pvmw->address;
Miaohe Lin4fba8f22022-07-04 21:21:51 +08004186 unsigned long haddr = address & HPAGE_PMD_MASK;
Zi Yan616b8372017-09-08 16:10:57 -07004187 pmd_t pmde;
4188 swp_entry_t entry;
4189
4190 if (!(pvmw->pmd && !pvmw->pte))
4191 return;
4192
4193 entry = pmd_to_swp_entry(*pvmw->pmd);
David Hildenbrand14d85a62023-12-20 23:44:33 +01004194 folio_get(folio);
Peter Xu2e346872022-08-11 12:13:29 -04004195 pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
Naoya Horiguchiab6e3d02017-09-08 16:11:04 -07004196 if (pmd_swp_soft_dirty(*pvmw->pmd))
4197 pmde = pmd_mksoft_dirty(pmde);
David Hildenbrand3c811f72023-04-11 16:25:10 +02004198 if (is_writable_migration_entry(entry))
Rick Edgecombe161e3932023-06-12 17:10:29 -07004199 pmde = pmd_mkwrite(pmde, vma);
Peter Xu8f34f1e2021-06-30 18:49:02 -07004200 if (pmd_swp_uffd_wp(*pvmw->pmd))
Peter Xuf1eb1ba2022-12-14 15:15:33 -05004201 pmde = pmd_mkuffd_wp(pmde);
Peter Xu2e346872022-08-11 12:13:29 -04004202 if (!is_migration_entry_young(entry))
4203 pmde = pmd_mkold(pmde);
4204 /* NOTE: this may contain setting soft-dirty on some archs */
David Hildenbrand14d85a62023-12-20 23:44:33 +01004205 if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
Peter Xu2e346872022-08-11 12:13:29 -04004206 pmde = pmd_mkdirty(pmde);
Zi Yan616b8372017-09-08 16:10:57 -07004207
David Hildenbrand14d85a62023-12-20 23:44:33 +01004208 if (folio_test_anon(folio)) {
David Hildenbrand395db7b2023-12-20 23:44:40 +01004209 rmap_t rmap_flags = RMAP_NONE;
David Hildenbrand6c287602022-05-09 18:20:44 -07004210
4211 if (!is_readable_migration_entry(entry))
4212 rmap_flags |= RMAP_EXCLUSIVE;
4213
David Hildenbrand395db7b2023-12-20 23:44:40 +01004214 folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags);
David Hildenbrand6c287602022-05-09 18:20:44 -07004215 } else {
David Hildenbrand14d85a62023-12-20 23:44:33 +01004216 folio_add_file_rmap_pmd(folio, new, vma);
David Hildenbrand6c287602022-05-09 18:20:44 -07004217 }
David Hildenbrand14d85a62023-12-20 23:44:33 +01004218 VM_BUG_ON(pmd_write(pmde) && folio_test_anon(folio) && !PageAnonExclusive(new));
Miaohe Lin4fba8f22022-07-04 21:21:51 +08004219 set_pmd_at(mm, haddr, pvmw->pmd, pmde);
Muchun Song5cbcf222022-03-22 14:41:53 -07004220
4221 /* No need to invalidate - it was non-present before */
Zi Yan616b8372017-09-08 16:10:57 -07004222 update_mmu_cache_pmd(vma, address, pvmw->pmd);
Anshuman Khandual283fd6f2022-03-24 18:09:58 -07004223 trace_remove_migration_pmd(address, pmd_val(pmde));
Zi Yan616b8372017-09-08 16:10:57 -07004224}
4225#endif