Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton:
"155 patches.
Subsystems affected by this patch series: mm (dax, debug, thp,
readahead, page-poison, util, memory-hotplug, zram, cleanups), misc,
core-kernel, get_maintainer, MAINTAINERS, lib, bitops, checkpatch,
binfmt, ramfs, autofs, nilfs, rapidio, panic, relay, kgdb, ubsan,
romfs, and fault-injection"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (155 commits)
lib, uaccess: add failure injection to usercopy functions
lib, include/linux: add usercopy failure capability
ROMFS: support inode blocks calculation
ubsan: introduce CONFIG_UBSAN_LOCAL_BOUNDS for Clang
sched.h: drop in_ubsan field when UBSAN is in trap mode
scripts/gdb/tasks: add headers and improve spacing format
scripts/gdb/proc: add struct mount & struct super_block addr in lx-mounts command
kernel/relay.c: drop unneeded initialization
panic: dump registers on panic_on_warn
rapidio: fix the missed put_device() for rio_mport_add_riodev
rapidio: fix error handling path
nilfs2: fix some kernel-doc warnings for nilfs2
autofs: harden ioctl table
ramfs: fix nommu mmap with gaps in the page cache
mm: remove the now-unnecessary mmget_still_valid() hack
mm/gup: take mmap_lock in get_dump_page()
binfmt_elf, binfmt_elf_fdpic: use a VMA list snapshot
coredump: rework elf/elf_fdpic vma_dump_size() into common helper
coredump: refactor page range dumping into common helper
coredump: let dump_emit() bail out on short writes
...
diff --git a/.mailmap b/.mailmap
index e4ccac4e..1e14566 100644
--- a/.mailmap
+++ b/.mailmap
@@ -133,6 +133,7 @@
Jan Glauber <jan.glauber@gmail.com> <jang@de.ibm.com>
Jan Glauber <jan.glauber@gmail.com> <jang@linux.vnet.ibm.com>
Jan Glauber <jan.glauber@gmail.com> <jglauber@cavium.com>
+Jarkko Sakkinen <jarkko@kernel.org> <jarkko.sakkinen@linux.intel.com>
Jason Gunthorpe <jgg@ziepe.ca> <jgg@mellanox.com>
Jason Gunthorpe <jgg@ziepe.ca> <jgg@nvidia.com>
Jason Gunthorpe <jgg@ziepe.ca> <jgunthorpe@obsidianresearch.com>
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 0f1fb7e..d246ad4 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1343,6 +1343,7 @@
current integrity status.
failslab=
+ fail_usercopy=
fail_page_alloc=
fail_make_request=[KNL]
General fault injection mechanism.
diff --git a/Documentation/core-api/xarray.rst b/Documentation/core-api/xarray.rst
index 640934b..a137a0e 100644
--- a/Documentation/core-api/xarray.rst
+++ b/Documentation/core-api/xarray.rst
@@ -475,13 +475,15 @@
Each entry will only be returned once, no matter how many indices it
occupies.
-Using xas_next() or xas_prev() with a multi-index xa_state
-is not supported. Using either of these functions on a multi-index entry
-will reveal sibling entries; these should be skipped over by the caller.
+Using xas_next() or xas_prev() with a multi-index xa_state is not
+supported. Using either of these functions on a multi-index entry will
+reveal sibling entries; these should be skipped over by the caller.
-Storing ``NULL`` into any index of a multi-index entry will set the entry
-at every index to ``NULL`` and dissolve the tie. Splitting a multi-index
-entry into entries occupying smaller ranges is not yet supported.
+Storing ``NULL`` into any index of a multi-index entry will set the
+entry at every index to ``NULL`` and dissolve the tie. A multi-index
+entry can be split into entries occupying smaller ranges by calling
+xas_split_alloc() without the xa_lock held, followed by taking the lock
+and calling xas_split().
Functions and structures
========================
diff --git a/Documentation/fault-injection/fault-injection.rst b/Documentation/fault-injection/fault-injection.rst
index f850ad0..31ecfe4 100644
--- a/Documentation/fault-injection/fault-injection.rst
+++ b/Documentation/fault-injection/fault-injection.rst
@@ -16,6 +16,10 @@
injects page allocation failures. (alloc_pages(), get_free_pages(), ...)
+- fail_usercopy
+
+ injects failures in user memory access functions. (copy_from_user(), get_user(), ...)
+
- fail_futex
injects futex deadlock and uaddr fault errors.
@@ -177,6 +181,7 @@
failslab=
fail_page_alloc=
+ fail_usercopy=
fail_make_request=
fail_futex=
mmc_core.fail_request=<interval>,<probability>,<space>,<times>
@@ -222,7 +227,7 @@
- debugfs entries
- failslab, fail_page_alloc, and fail_make_request use this way.
+ failslab, fail_page_alloc, fail_usercopy, and fail_make_request use this way.
Helper functions:
fault_create_debugfs_attr(name, parent, attr);
diff --git a/MAINTAINERS b/MAINTAINERS
index 0f59b04..2c31900 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9715,7 +9715,7 @@
KEYS-TRUSTED
M: James Bottomley <jejb@linux.ibm.com>
-M: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
+M: Jarkko Sakkinen <jarkko@kernel.org>
M: Mimi Zohar <zohar@linux.ibm.com>
L: linux-integrity@vger.kernel.org
L: keyrings@vger.kernel.org
@@ -9727,7 +9727,7 @@
KEYS/KEYRINGS
M: David Howells <dhowells@redhat.com>
-M: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
+M: Jarkko Sakkinen <jarkko@kernel.org>
L: keyrings@vger.kernel.org
S: Maintained
F: Documentation/security/keys/core.rst
@@ -17717,7 +17717,7 @@
TPM DEVICE DRIVER
M: Peter Huewe <peterhuewe@gmx.de>
-M: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
+M: Jarkko Sakkinen <jarkko@kernel.org>
R: Jason Gunthorpe <jgg@ziepe.ca>
L: linux-integrity@vger.kernel.org
S: Maintained
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index d8686bf..ef12e09 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -537,7 +537,7 @@ virtual_memmap_init(u64 start, u64 end, void *arg)
if (map_start < map_end)
memmap_init_zone((unsigned long)(map_end - map_start),
args->nid, args->zone, page_to_pfn(map_start),
- MEMINIT_EARLY, NULL);
+ MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
return 0;
}
@@ -547,7 +547,7 @@ memmap_init (unsigned long size, int nid, unsigned long zone,
{
if (!vmem_map) {
memmap_init_zone(size, nid, zone, start_pfn,
- MEMINIT_EARLY, NULL);
+ MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
} else {
struct page *start;
struct memmap_init_callback_data args;
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 495fc0c..f6d2c44 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -615,7 +615,7 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot)
VM_BUG_ON(pfn >> (64 - PAGE_SHIFT));
VM_BUG_ON((pfn << PAGE_SHIFT) & ~PTE_RPN_MASK);
- return __pte(((pte_basic_t)pfn << PAGE_SHIFT) | pgprot_val(pgprot));
+ return __pte(((pte_basic_t)pfn << PAGE_SHIFT) | pgprot_val(pgprot) | _PAGE_PTE);
}
static inline unsigned long pte_pfn(pte_t pte)
@@ -651,11 +651,6 @@ static inline pte_t pte_mkexec(pte_t pte)
return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_EXEC));
}
-static inline pte_t pte_mkpte(pte_t pte)
-{
- return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PTE));
-}
-
static inline pte_t pte_mkwrite(pte_t pte)
{
/*
@@ -819,6 +814,14 @@ static inline int pte_none(pte_t pte)
static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte, int percpu)
{
+
+ VM_WARN_ON(!(pte_raw(pte) & cpu_to_be64(_PAGE_PTE)));
+ /*
+ * Keep the _PAGE_PTE added till we are sure we handle _PAGE_PTE
+ * in all the callers.
+ */
+ pte = __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PTE));
+
if (radix_enabled())
return radix__set_pte_at(mm, addr, ptep, pte, percpu);
return hash__set_pte_at(mm, addr, ptep, pte, percpu);
@@ -866,6 +869,13 @@ static inline bool pte_ci(pte_t pte)
static inline void pmd_clear(pmd_t *pmdp)
{
+ if (IS_ENABLED(CONFIG_DEBUG_VM) && !radix_enabled()) {
+ /*
+ * Don't use this if we can possibly have a hash page table
+ * entry mapping this.
+ */
+ WARN_ON((pmd_val(*pmdp) & (H_PAGE_HASHPTE | _PAGE_PTE)) == (H_PAGE_HASHPTE | _PAGE_PTE));
+ }
*pmdp = __pmd(0);
}
@@ -914,6 +924,13 @@ static inline int pmd_bad(pmd_t pmd)
static inline void pud_clear(pud_t *pudp)
{
+ if (IS_ENABLED(CONFIG_DEBUG_VM) && !radix_enabled()) {
+ /*
+ * Don't use this if we can possibly have a hash page table
+ * entry mapping this.
+ */
+ WARN_ON((pud_val(*pudp) & (H_PAGE_HASHPTE | _PAGE_PTE)) == (H_PAGE_HASHPTE | _PAGE_PTE));
+ }
*pudp = __pud(0);
}
diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h
index 4b7c347..6277e75 100644
--- a/arch/powerpc/include/asm/nohash/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/pgtable.h
@@ -140,11 +140,6 @@ static inline pte_t pte_mkold(pte_t pte)
return __pte(pte_val(pte) & ~_PAGE_ACCESSED);
}
-static inline pte_t pte_mkpte(pte_t pte)
-{
- return pte;
-}
-
static inline pte_t pte_mkspecial(pte_t pte)
{
return __pte(pte_val(pte) | _PAGE_SPECIAL);
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 9c0547d..ab57b07 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -184,9 +184,6 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
*/
VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
- /* Add the pte bit when trying to set a pte */
- pte = pte_mkpte(pte);
-
/* Note: mm->context.id might not yet have been assigned as
* this context might not have been activated yet when this
* is called.
@@ -275,8 +272,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_
*/
VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
- pte = pte_mkpte(pte);
-
pte = set_pte_filter(pte);
val = pte_val(pte);
diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
index 13b369d..6828108 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -224,7 +224,7 @@ static int memtrace_online(void)
ent->mem = 0;
}
- if (add_memory(ent->nid, ent->start, ent->size)) {
+ if (add_memory(ent->nid, ent->start, ent->size, MHP_NONE)) {
pr_err("Failed to add trace memory to node %d\n",
ent->nid);
ret += 1;
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 5d545b7..d8bbf0c 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -606,7 +606,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
block_sz = memory_block_size_bytes();
/* Add the memory */
- rc = __add_memory(lmb->nid, lmb->base_addr, block_sz);
+ rc = __add_memory(lmb->nid, lmb->base_addr, block_sz, MHP_NONE);
if (rc) {
invalidate_lmb_associativity_index(lmb);
return rc;
diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index ad6e90fb..b02fd51 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -194,7 +194,8 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
if (node < 0)
node = memory_add_physaddr_to_nid(info->start_addr);
- result = __add_memory(node, info->start_addr, info->length);
+ result = __add_memory(node, info->start_addr, info->length,
+ MHP_NONE);
/*
* If the memory block has been used by the kernel, add_memory()
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index adf828d..eef4ffb 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -432,7 +432,8 @@ static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
nid = memory_add_physaddr_to_nid(phys_addr);
ret = __add_memory(nid, phys_addr,
- MIN_MEMORY_BLOCK_SIZE * sections_per_block);
+ MIN_MEMORY_BLOCK_SIZE * sections_per_block,
+ MHP_NONE);
if (ret)
goto out;
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 43d21f9..6ffa470 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -772,8 +772,8 @@ static int __ref get_nid_for_pfn(unsigned long pfn)
return pfn_to_nid(pfn);
}
-static int do_register_memory_block_under_node(int nid,
- struct memory_block *mem_blk)
+static void do_register_memory_block_under_node(int nid,
+ struct memory_block *mem_blk)
{
int ret;
@@ -786,12 +786,19 @@ static int do_register_memory_block_under_node(int nid,
ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj,
&mem_blk->dev.kobj,
kobject_name(&mem_blk->dev.kobj));
- if (ret)
- return ret;
+ if (ret && ret != -EEXIST)
+ dev_err_ratelimited(&node_devices[nid]->dev,
+ "can't create link to %s in sysfs (%d)\n",
+ kobject_name(&mem_blk->dev.kobj), ret);
- return sysfs_create_link_nowarn(&mem_blk->dev.kobj,
+ ret = sysfs_create_link_nowarn(&mem_blk->dev.kobj,
&node_devices[nid]->dev.kobj,
kobject_name(&node_devices[nid]->dev.kobj));
+ if (ret && ret != -EEXIST)
+ dev_err_ratelimited(&mem_blk->dev,
+ "can't create link to %s in sysfs (%d)\n",
+ kobject_name(&node_devices[nid]->dev.kobj),
+ ret);
}
/* register memory section under specified node if it spans that node */
@@ -827,7 +834,8 @@ static int register_mem_block_under_node_early(struct memory_block *mem_blk,
if (page_nid != nid)
continue;
- return do_register_memory_block_under_node(nid, mem_blk);
+ do_register_memory_block_under_node(nid, mem_blk);
+ return 0;
}
/* mem section does not span the specified node */
return 0;
@@ -842,7 +850,8 @@ static int register_mem_block_under_node_hotplug(struct memory_block *mem_blk,
{
int nid = *(int *)arg;
- return do_register_memory_block_under_node(nid, mem_blk);
+ do_register_memory_block_under_node(nid, mem_blk);
+ return 0;
}
/*
@@ -860,8 +869,8 @@ void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
kobject_name(&node_devices[mem_blk->nid]->dev.kobj));
}
-int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn,
- enum meminit_context context)
+void link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn,
+ enum meminit_context context)
{
walk_memory_blocks_func_t func;
@@ -870,9 +879,9 @@ int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn,
else
func = register_mem_block_under_node_early;
- return walk_memory_blocks(PFN_PHYS(start_pfn),
- PFN_PHYS(end_pfn - start_pfn), (void *)&nid,
- func);
+ walk_memory_blocks(PFN_PHYS(start_pfn), PFN_PHYS(end_pfn - start_pfn),
+ (void *)&nid, func);
+ return;
}
#ifdef CONFIG_HUGETLBFS
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index bff3d40..029403c 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1270,7 +1270,7 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
zram_slot_unlock(zram, index);
/* Should NEVER happen. Return bio error if it does. */
- if (unlikely(ret))
+ if (WARN_ON(ret))
pr_err("Decompression failed! err=%d, page=%u\n", ret, index);
return ret;
diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index 6c933f2..b4368c5 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -35,11 +35,17 @@ static int dax_kmem_range(struct dev_dax *dev_dax, int i, struct range *r)
return 0;
}
+struct dax_kmem_data {
+ const char *res_name;
+ struct resource *res[];
+};
+
static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
{
struct device *dev = &dev_dax->dev;
+ struct dax_kmem_data *data;
+ int rc = -ENOMEM;
int i, mapped = 0;
- char *res_name;
int numa_node;
/*
@@ -55,14 +61,17 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
return -EINVAL;
}
- res_name = kstrdup(dev_name(dev), GFP_KERNEL);
- if (!res_name)
+ data = kzalloc(sizeof(*data) + sizeof(struct resource *) * dev_dax->nr_range, GFP_KERNEL);
+ if (!data)
return -ENOMEM;
+ data->res_name = kstrdup(dev_name(dev), GFP_KERNEL);
+ if (!data->res_name)
+ goto err_res_name;
+
for (i = 0; i < dev_dax->nr_range; i++) {
struct resource *res;
struct range range;
- int rc;
rc = dax_kmem_range(dev_dax, i, &range);
if (rc) {
@@ -72,7 +81,7 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
}
/* Region is permanently reserved if hotremove fails. */
- res = request_mem_region(range.start, range_len(&range), res_name);
+ res = request_mem_region(range.start, range_len(&range), data->res_name);
if (!res) {
dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve region\n",
i, range.start, range.end);
@@ -82,9 +91,10 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
*/
if (mapped)
continue;
- kfree(res_name);
- return -EBUSY;
+ rc = -EBUSY;
+ goto err_request_mem;
}
+ data->res[i] = res;
/*
* Set flags appropriate for System RAM. Leave ..._BUSY clear
@@ -99,23 +109,30 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
* this as RAM automatically.
*/
rc = add_memory_driver_managed(numa_node, range.start,
- range_len(&range), kmem_name);
+ range_len(&range), kmem_name, MHP_NONE);
if (rc) {
dev_warn(dev, "mapping%d: %#llx-%#llx memory add failed\n",
i, range.start, range.end);
- release_mem_region(range.start, range_len(&range));
+ release_resource(res);
+ kfree(res);
+ data->res[i] = NULL;
if (mapped)
continue;
- kfree(res_name);
- return rc;
+ goto err_request_mem;
}
mapped++;
}
- dev_set_drvdata(dev, res_name);
+ dev_set_drvdata(dev, data);
return 0;
+
+err_request_mem:
+ kfree(data->res_name);
+err_res_name:
+ kfree(data);
+ return rc;
}
#ifdef CONFIG_MEMORY_HOTREMOVE
@@ -123,7 +140,7 @@ static int dev_dax_kmem_remove(struct dev_dax *dev_dax)
{
int i, success = 0;
struct device *dev = &dev_dax->dev;
- const char *res_name = dev_get_drvdata(dev);
+ struct dax_kmem_data *data = dev_get_drvdata(dev);
/*
* We have one shot for removing memory, if some memory blocks were not
@@ -142,7 +159,9 @@ static int dev_dax_kmem_remove(struct dev_dax *dev_dax)
rc = remove_memory(dev_dax->target_node, range.start,
range_len(&range));
if (rc == 0) {
- release_mem_region(range.start, range_len(&range));
+ release_resource(data->res[i]);
+ kfree(data->res[i]);
+ data->res[i] = NULL;
success++;
continue;
}
@@ -153,7 +172,8 @@ static int dev_dax_kmem_remove(struct dev_dax *dev_dax)
}
if (success >= dev_dax->nr_range) {
- kfree(res_name);
+ kfree(data->res_name);
+ kfree(data);
dev_set_drvdata(dev, NULL);
}
diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index 32e3bc0..b64d2ef 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -726,7 +726,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size,
nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
ret = add_memory(nid, PFN_PHYS((start_pfn)),
- (HA_CHUNK << PAGE_SHIFT));
+ (HA_CHUNK << PAGE_SHIFT), MEMHP_MERGE_RESOURCE);
if (ret) {
pr_err("hot_add memory failed error is %d\n", ret);
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 37794d8..a4ba0b8 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -845,8 +845,6 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
* will only be one mm, so no big deal.
*/
mmap_read_lock(mm);
- if (!mmget_still_valid(mm))
- goto skip_mm;
mutex_lock(&ufile->umap_lock);
list_for_each_entry_safe (priv, next_priv, &ufile->umaps,
list) {
@@ -865,7 +863,6 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
}
}
mutex_unlock(&ufile->umap_lock);
- skip_mm:
mmap_read_unlock(mm);
mmput(mm);
}
diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c
index a303429..94331d9 100644
--- a/drivers/rapidio/devices/rio_mport_cdev.c
+++ b/drivers/rapidio/devices/rio_mport_cdev.c
@@ -871,15 +871,16 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode,
rmcd_error("pin_user_pages_fast err=%ld",
pinned);
nr_pages = 0;
- } else
+ } else {
rmcd_error("pinned %ld out of %ld pages",
pinned, nr_pages);
+ /*
+ * Set nr_pages up to mean "how many pages to unpin, in
+ * the error handler:
+ */
+ nr_pages = pinned;
+ }
ret = -EFAULT;
- /*
- * Set nr_pages up to mean "how many pages to unpin, in
- * the error handler:
- */
- nr_pages = pinned;
goto err_pg;
}
@@ -1679,6 +1680,7 @@ static int rio_mport_add_riodev(struct mport_cdev_priv *priv,
struct rio_dev *rdev;
struct rio_switch *rswitch = NULL;
struct rio_mport *mport;
+ struct device *dev;
size_t size;
u32 rval;
u32 swpinfo = 0;
@@ -1693,8 +1695,10 @@ static int rio_mport_add_riodev(struct mport_cdev_priv *priv,
rmcd_debug(RDEV, "name:%s ct:0x%x did:0x%x hc:0x%x", dev_info.name,
dev_info.comptag, dev_info.destid, dev_info.hopcount);
- if (bus_find_device_by_name(&rio_bus_type, NULL, dev_info.name)) {
+ dev = bus_find_device_by_name(&rio_bus_type, NULL, dev_info.name);
+ if (dev) {
rmcd_debug(RDEV, "device %s already exists", dev_info.name);
+ put_device(dev);
return -EEXIST;
}
diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c
index a864b21..f6e97f0 100644
--- a/drivers/s390/char/sclp_cmd.c
+++ b/drivers/s390/char/sclp_cmd.c
@@ -406,7 +406,7 @@ static void __init add_memory_merged(u16 rn)
if (!size)
goto skip_add;
for (addr = start; addr < start + size; addr += block_size)
- add_memory(0, addr, block_size);
+ add_memory(0, addr, block_size, MHP_NONE);
skip_add:
first_rn = rn;
num = 1;
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 1ab1f5c..b0f4b92 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -1480,31 +1480,29 @@ static int vfio_pci_zap_and_vma_lock(struct vfio_pci_device *vdev, bool try)
} else {
mmap_read_lock(mm);
}
- if (mmget_still_valid(mm)) {
- if (try) {
- if (!mutex_trylock(&vdev->vma_lock)) {
- mmap_read_unlock(mm);
- mmput(mm);
- return 0;
- }
- } else {
- mutex_lock(&vdev->vma_lock);
+ if (try) {
+ if (!mutex_trylock(&vdev->vma_lock)) {
+ mmap_read_unlock(mm);
+ mmput(mm);
+ return 0;
}
- list_for_each_entry_safe(mmap_vma, tmp,
- &vdev->vma_list, vma_next) {
- struct vm_area_struct *vma = mmap_vma->vma;
-
- if (vma->vm_mm != mm)
- continue;
-
- list_del(&mmap_vma->vma_next);
- kfree(mmap_vma);
-
- zap_vma_ptes(vma, vma->vm_start,
- vma->vm_end - vma->vm_start);
- }
- mutex_unlock(&vdev->vma_lock);
+ } else {
+ mutex_lock(&vdev->vma_lock);
}
+ list_for_each_entry_safe(mmap_vma, tmp,
+ &vdev->vma_list, vma_next) {
+ struct vm_area_struct *vma = mmap_vma->vma;
+
+ if (vma->vm_mm != mm)
+ continue;
+
+ list_del(&mmap_vma->vma_next);
+ kfree(mmap_vma);
+
+ zap_vma_ptes(vma, vma->vm_start,
+ vma->vm_end - vma->vm_start);
+ }
+ mutex_unlock(&vdev->vma_lock);
mmap_read_unlock(mm);
mmput(mm);
}
diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index 834b7c1..ba4de59 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -424,7 +424,8 @@ static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id)
dev_dbg(&vm->vdev->dev, "adding memory block: %lu\n", mb_id);
return add_memory_driver_managed(nid, addr, memory_block_size_bytes(),
- vm->resource_name);
+ vm->resource_name,
+ MEMHP_MERGE_RESOURCE);
}
/*
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index 51427c7..b57b206 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -331,7 +331,7 @@ static enum bp_state reserve_additional_memory(void)
mutex_unlock(&balloon_mutex);
/* add_memory_resource() requires the device_hotplug lock */
lock_device_hotplug();
- rc = add_memory_resource(nid, resource);
+ rc = add_memory_resource(nid, resource, MEMHP_MERGE_RESOURCE);
unlock_device_hotplug();
mutex_lock(&balloon_mutex);
diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
index 75105f45..322b7df 100644
--- a/fs/autofs/dev-ioctl.c
+++ b/fs/autofs/dev-ioctl.c
@@ -8,6 +8,7 @@
#include <linux/compat.h>
#include <linux/syscalls.h>
#include <linux/magic.h>
+#include <linux/nospec.h>
#include "autofs_i.h"
@@ -563,7 +564,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
static ioctl_fn lookup_dev_ioctl(unsigned int cmd)
{
- static ioctl_fn _ioctls[] = {
+ static const ioctl_fn _ioctls[] = {
autofs_dev_ioctl_version,
autofs_dev_ioctl_protover,
autofs_dev_ioctl_protosubver,
@@ -581,7 +582,10 @@ static ioctl_fn lookup_dev_ioctl(unsigned int cmd)
};
unsigned int idx = cmd_idx(cmd);
- return (idx >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[idx];
+ if (idx >= ARRAY_SIZE(_ioctls))
+ return NULL;
+ idx = array_index_nospec(idx, ARRAY_SIZE(_ioctls));
+ return _ioctls[idx];
}
/* ioctl dispatcher */
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 13d0539..e7e9d0c 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -13,6 +13,7 @@
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/fs.h>
+#include <linux/log2.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/errno.h>
@@ -421,6 +422,26 @@ static int elf_read(struct file *file, void *buf, size_t len, loff_t pos)
return 0;
}
+static unsigned long maximum_alignment(struct elf_phdr *cmds, int nr)
+{
+ unsigned long alignment = 0;
+ int i;
+
+ for (i = 0; i < nr; i++) {
+ if (cmds[i].p_type == PT_LOAD) {
+ unsigned long p_align = cmds[i].p_align;
+
+ /* skip non-power of two alignments as invalid */
+ if (!is_power_of_2(p_align))
+ continue;
+ alignment = max(alignment, p_align);
+ }
+ }
+
+ /* ensure we align to at least one page */
+ return ELF_PAGEALIGN(alignment);
+}
+
/**
* load_elf_phdrs() - load ELF program headers
* @elf_ex: ELF header of the binary whose program headers should be loaded
@@ -1008,6 +1029,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
int elf_prot, elf_flags;
unsigned long k, vaddr;
unsigned long total_size = 0;
+ unsigned long alignment;
if (elf_ppnt->p_type != PT_LOAD)
continue;
@@ -1086,6 +1108,9 @@ static int load_elf_binary(struct linux_binprm *bprm)
load_bias = ELF_ET_DYN_BASE;
if (current->flags & PF_RANDOMIZE)
load_bias += arch_mmap_rnd();
+ alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum);
+ if (alignment)
+ load_bias &= ~(alignment - 1);
elf_flags |= MAP_FIXED;
} else
load_bias = 0;
@@ -1389,126 +1414,6 @@ static int load_elf_library(struct file *file)
* Jeremy Fitzhardinge <jeremy@sw.oz.au>
*/
-/*
- * The purpose of always_dump_vma() is to make sure that special kernel mappings
- * that are useful for post-mortem analysis are included in every core dump.
- * In that way we ensure that the core dump is fully interpretable later
- * without matching up the same kernel and hardware config to see what PC values
- * meant. These special mappings include - vDSO, vsyscall, and other
- * architecture specific mappings
- */
-static bool always_dump_vma(struct vm_area_struct *vma)
-{
- /* Any vsyscall mappings? */
- if (vma == get_gate_vma(vma->vm_mm))
- return true;
-
- /*
- * Assume that all vmas with a .name op should always be dumped.
- * If this changes, a new vm_ops field can easily be added.
- */
- if (vma->vm_ops && vma->vm_ops->name && vma->vm_ops->name(vma))
- return true;
-
- /*
- * arch_vma_name() returns non-NULL for special architecture mappings,
- * such as vDSO sections.
- */
- if (arch_vma_name(vma))
- return true;
-
- return false;
-}
-
-/*
- * Decide what to dump of a segment, part, all or none.
- */
-static unsigned long vma_dump_size(struct vm_area_struct *vma,
- unsigned long mm_flags)
-{
-#define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type))
-
- /* always dump the vdso and vsyscall sections */
- if (always_dump_vma(vma))
- goto whole;
-
- if (vma->vm_flags & VM_DONTDUMP)
- return 0;
-
- /* support for DAX */
- if (vma_is_dax(vma)) {
- if ((vma->vm_flags & VM_SHARED) && FILTER(DAX_SHARED))
- goto whole;
- if (!(vma->vm_flags & VM_SHARED) && FILTER(DAX_PRIVATE))
- goto whole;
- return 0;
- }
-
- /* Hugetlb memory check */
- if (is_vm_hugetlb_page(vma)) {
- if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
- goto whole;
- if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE))
- goto whole;
- return 0;
- }
-
- /* Do not dump I/O mapped devices or special mappings */
- if (vma->vm_flags & VM_IO)
- return 0;
-
- /* By default, dump shared memory if mapped from an anonymous file. */
- if (vma->vm_flags & VM_SHARED) {
- if (file_inode(vma->vm_file)->i_nlink == 0 ?
- FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED))
- goto whole;
- return 0;
- }
-
- /* Dump segments that have been written to. */
- if (vma->anon_vma && FILTER(ANON_PRIVATE))
- goto whole;
- if (vma->vm_file == NULL)
- return 0;
-
- if (FILTER(MAPPED_PRIVATE))
- goto whole;
-
- /*
- * If this looks like the beginning of a DSO or executable mapping,
- * check for an ELF header. If we find one, dump the first page to
- * aid in determining what was mapped here.
- */
- if (FILTER(ELF_HEADERS) &&
- vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) {
- u32 __user *header = (u32 __user *) vma->vm_start;
- u32 word;
- /*
- * Doing it this way gets the constant folded by GCC.
- */
- union {
- u32 cmp;
- char elfmag[SELFMAG];
- } magic;
- BUILD_BUG_ON(SELFMAG != sizeof word);
- magic.elfmag[EI_MAG0] = ELFMAG0;
- magic.elfmag[EI_MAG1] = ELFMAG1;
- magic.elfmag[EI_MAG2] = ELFMAG2;
- magic.elfmag[EI_MAG3] = ELFMAG3;
- if (unlikely(get_user(word, header)))
- word = 0;
- if (word == magic.cmp)
- return PAGE_SIZE;
- }
-
-#undef FILTER
-
- return 0;
-
-whole:
- return vma->vm_end - vma->vm_start;
-}
-
/* An ELF note in memory */
struct memelfnote
{
@@ -2220,32 +2125,6 @@ static void free_note_info(struct elf_note_info *info)
#endif
-static struct vm_area_struct *first_vma(struct task_struct *tsk,
- struct vm_area_struct *gate_vma)
-{
- struct vm_area_struct *ret = tsk->mm->mmap;
-
- if (ret)
- return ret;
- return gate_vma;
-}
-/*
- * Helper function for iterating across a vma list. It ensures that the caller
- * will visit `gate_vma' prior to terminating the search.
- */
-static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
- struct vm_area_struct *gate_vma)
-{
- struct vm_area_struct *ret;
-
- ret = this_vma->vm_next;
- if (ret)
- return ret;
- if (this_vma == gate_vma)
- return NULL;
- return gate_vma;
-}
-
static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
elf_addr_t e_shoff, int segs)
{
@@ -2272,9 +2151,8 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
static int elf_core_dump(struct coredump_params *cprm)
{
int has_dumped = 0;
- int segs, i;
- size_t vma_data_size = 0;
- struct vm_area_struct *vma, *gate_vma;
+ int vma_count, segs, i;
+ size_t vma_data_size;
struct elfhdr elf;
loff_t offset = 0, dataoff;
struct elf_note_info info = { };
@@ -2282,30 +2160,16 @@ static int elf_core_dump(struct coredump_params *cprm)
struct elf_shdr *shdr4extnum = NULL;
Elf_Half e_phnum;
elf_addr_t e_shoff;
- elf_addr_t *vma_filesz = NULL;
+ struct core_vma_metadata *vma_meta;
- /*
- * We no longer stop all VM operations.
- *
- * This is because those proceses that could possibly change map_count
- * or the mmap / vma pages are now blocked in do_exit on current
- * finishing this core dump.
- *
- * Only ptrace can touch these memory addresses, but it doesn't change
- * the map_count or the pages allocated. So no possibility of crashing
- * exists while dumping the mm->vm_next areas to the core file.
- */
-
+ if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size))
+ return 0;
+
/*
* The number of segs are recored into ELF header as 16bit value.
* Please check DEFAULT_MAX_MAP_COUNT definition when you modify here.
*/
- segs = current->mm->map_count;
- segs += elf_core_extra_phdrs();
-
- gate_vma = get_gate_vma(current->mm);
- if (gate_vma != NULL)
- segs++;
+ segs = vma_count + elf_core_extra_phdrs();
/* for notes section */
segs++;
@@ -2343,24 +2207,6 @@ static int elf_core_dump(struct coredump_params *cprm)
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
- /*
- * Zero vma process will get ZERO_SIZE_PTR here.
- * Let coredump continue for register state at least.
- */
- vma_filesz = kvmalloc(array_size(sizeof(*vma_filesz), (segs - 1)),
- GFP_KERNEL);
- if (!vma_filesz)
- goto end_coredump;
-
- for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
- vma = next_vma(vma, gate_vma)) {
- unsigned long dump_size;
-
- dump_size = vma_dump_size(vma, cprm->mm_flags);
- vma_filesz[i++] = dump_size;
- vma_data_size += dump_size;
- }
-
offset += vma_data_size;
offset += elf_core_extra_data_size();
e_shoff = offset;
@@ -2381,21 +2227,23 @@ static int elf_core_dump(struct coredump_params *cprm)
goto end_coredump;
/* Write program headers for segments dump */
- for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
- vma = next_vma(vma, gate_vma)) {
+ for (i = 0; i < vma_count; i++) {
+ struct core_vma_metadata *meta = vma_meta + i;
struct elf_phdr phdr;
phdr.p_type = PT_LOAD;
phdr.p_offset = offset;
- phdr.p_vaddr = vma->vm_start;
+ phdr.p_vaddr = meta->start;
phdr.p_paddr = 0;
- phdr.p_filesz = vma_filesz[i++];
- phdr.p_memsz = vma->vm_end - vma->vm_start;
+ phdr.p_filesz = meta->dump_size;
+ phdr.p_memsz = meta->end - meta->start;
offset += phdr.p_filesz;
- phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0;
- if (vma->vm_flags & VM_WRITE)
+ phdr.p_flags = 0;
+ if (meta->flags & VM_READ)
+ phdr.p_flags |= PF_R;
+ if (meta->flags & VM_WRITE)
phdr.p_flags |= PF_W;
- if (vma->vm_flags & VM_EXEC)
+ if (meta->flags & VM_EXEC)
phdr.p_flags |= PF_X;
phdr.p_align = ELF_EXEC_PAGESIZE;
@@ -2417,28 +2265,11 @@ static int elf_core_dump(struct coredump_params *cprm)
if (!dump_skip(cprm, dataoff - cprm->pos))
goto end_coredump;
- for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
- vma = next_vma(vma, gate_vma)) {
- unsigned long addr;
- unsigned long end;
+ for (i = 0; i < vma_count; i++) {
+ struct core_vma_metadata *meta = vma_meta + i;
- end = vma->vm_start + vma_filesz[i++];
-
- for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) {
- struct page *page;
- int stop;
-
- page = get_dump_page(addr);
- if (page) {
- void *kaddr = kmap(page);
- stop = !dump_emit(cprm, kaddr, PAGE_SIZE);
- kunmap(page);
- put_page(page);
- } else
- stop = !dump_skip(cprm, PAGE_SIZE);
- if (stop)
- goto end_coredump;
- }
+ if (!dump_user_range(cprm, meta->start, meta->dump_size))
+ goto end_coredump;
}
dump_truncate(cprm);
@@ -2453,7 +2284,7 @@ static int elf_core_dump(struct coredump_params *cprm)
end_coredump:
free_note_info(&info);
kfree(shdr4extnum);
- kvfree(vma_filesz);
+ kvfree(vma_meta);
kfree(phdr4note);
return has_dumped;
}
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 50f8457..be4062b 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1215,76 +1215,6 @@ struct elf_prstatus_fdpic
int pr_fpvalid; /* True if math co-processor being used. */
};
-/*
- * Decide whether a segment is worth dumping; default is yes to be
- * sure (missing info is worse than too much; etc).
- * Personally I'd include everything, and use the coredump limit...
- *
- * I think we should skip something. But I am not sure how. H.J.
- */
-static int maydump(struct vm_area_struct *vma, unsigned long mm_flags)
-{
- int dump_ok;
-
- /* Do not dump I/O mapped devices or special mappings */
- if (vma->vm_flags & VM_IO) {
- kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags);
- return 0;
- }
-
- /* If we may not read the contents, don't allow us to dump
- * them either. "dump_write()" can't handle it anyway.
- */
- if (!(vma->vm_flags & VM_READ)) {
- kdcore("%08lx: %08lx: no (!read)", vma->vm_start, vma->vm_flags);
- return 0;
- }
-
- /* support for DAX */
- if (vma_is_dax(vma)) {
- if (vma->vm_flags & VM_SHARED) {
- dump_ok = test_bit(MMF_DUMP_DAX_SHARED, &mm_flags);
- kdcore("%08lx: %08lx: %s (DAX shared)", vma->vm_start,
- vma->vm_flags, dump_ok ? "yes" : "no");
- } else {
- dump_ok = test_bit(MMF_DUMP_DAX_PRIVATE, &mm_flags);
- kdcore("%08lx: %08lx: %s (DAX private)", vma->vm_start,
- vma->vm_flags, dump_ok ? "yes" : "no");
- }
- return dump_ok;
- }
-
- /* By default, dump shared memory if mapped from an anonymous file. */
- if (vma->vm_flags & VM_SHARED) {
- if (file_inode(vma->vm_file)->i_nlink == 0) {
- dump_ok = test_bit(MMF_DUMP_ANON_SHARED, &mm_flags);
- kdcore("%08lx: %08lx: %s (share)", vma->vm_start,
- vma->vm_flags, dump_ok ? "yes" : "no");
- return dump_ok;
- }
-
- dump_ok = test_bit(MMF_DUMP_MAPPED_SHARED, &mm_flags);
- kdcore("%08lx: %08lx: %s (share)", vma->vm_start,
- vma->vm_flags, dump_ok ? "yes" : "no");
- return dump_ok;
- }
-
-#ifdef CONFIG_MMU
- /* By default, if it hasn't been written to, don't write it out */
- if (!vma->anon_vma) {
- dump_ok = test_bit(MMF_DUMP_MAPPED_PRIVATE, &mm_flags);
- kdcore("%08lx: %08lx: %s (!anon)", vma->vm_start,
- vma->vm_flags, dump_ok ? "yes" : "no");
- return dump_ok;
- }
-#endif
-
- dump_ok = test_bit(MMF_DUMP_ANON_PRIVATE, &mm_flags);
- kdcore("%08lx: %08lx: %s", vma->vm_start, vma->vm_flags,
- dump_ok ? "yes" : "no");
- return dump_ok;
-}
-
/* An ELF note in memory */
struct memelfnote
{
@@ -1524,54 +1454,21 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
/*
* dump the segments for an MMU process
*/
-static bool elf_fdpic_dump_segments(struct coredump_params *cprm)
+static bool elf_fdpic_dump_segments(struct coredump_params *cprm,
+ struct core_vma_metadata *vma_meta,
+ int vma_count)
{
- struct vm_area_struct *vma;
+ int i;
- for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
-#ifdef CONFIG_MMU
- unsigned long addr;
-#endif
+ for (i = 0; i < vma_count; i++) {
+ struct core_vma_metadata *meta = vma_meta + i;
- if (!maydump(vma, cprm->mm_flags))
- continue;
-
-#ifdef CONFIG_MMU
- for (addr = vma->vm_start; addr < vma->vm_end;
- addr += PAGE_SIZE) {
- bool res;
- struct page *page = get_dump_page(addr);
- if (page) {
- void *kaddr = kmap(page);
- res = dump_emit(cprm, kaddr, PAGE_SIZE);
- kunmap(page);
- put_page(page);
- } else {
- res = dump_skip(cprm, PAGE_SIZE);
- }
- if (!res)
- return false;
- }
-#else
- if (!dump_emit(cprm, (void *) vma->vm_start,
- vma->vm_end - vma->vm_start))
+ if (!dump_user_range(cprm, meta->start, meta->dump_size))
return false;
-#endif
}
return true;
}
-static size_t elf_core_vma_data_size(unsigned long mm_flags)
-{
- struct vm_area_struct *vma;
- size_t size = 0;
-
- for (vma = current->mm->mmap; vma; vma = vma->vm_next)
- if (maydump(vma, mm_flags))
- size += vma->vm_end - vma->vm_start;
- return size;
-}
-
/*
* Actual dumper
*
@@ -1582,9 +1479,8 @@ static size_t elf_core_vma_data_size(unsigned long mm_flags)
static int elf_fdpic_core_dump(struct coredump_params *cprm)
{
int has_dumped = 0;
- int segs;
+ int vma_count, segs;
int i;
- struct vm_area_struct *vma;
struct elfhdr *elf = NULL;
loff_t offset = 0, dataoff;
struct memelfnote psinfo_note, auxv_note;
@@ -1598,18 +1494,8 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
elf_addr_t e_shoff;
struct core_thread *ct;
struct elf_thread_status *tmp;
-
- /*
- * We no longer stop all VM operations.
- *
- * This is because those proceses that could possibly change map_count
- * or the mmap / vma pages are now blocked in do_exit on current
- * finishing this core dump.
- *
- * Only ptrace can touch these memory addresses, but it doesn't change
- * the map_count or the pages allocated. So no possibility of crashing
- * exists while dumping the mm->vm_next areas to the core file.
- */
+ struct core_vma_metadata *vma_meta = NULL;
+ size_t vma_data_size;
/* alloc memory for large data structures: too large to be on stack */
elf = kmalloc(sizeof(*elf), GFP_KERNEL);
@@ -1619,6 +1505,9 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
if (!psinfo)
goto end_coredump;
+ if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size))
+ goto end_coredump;
+
for (ct = current->mm->core_state->dumper.next;
ct; ct = ct->next) {
tmp = elf_dump_thread_status(cprm->siginfo->si_signo,
@@ -1638,8 +1527,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
tmp->next = thread_list;
thread_list = tmp;
- segs = current->mm->map_count;
- segs += elf_core_extra_phdrs();
+ segs = vma_count + elf_core_extra_phdrs();
/* for notes section */
segs++;
@@ -1684,7 +1572,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
/* Page-align dumped data */
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
- offset += elf_core_vma_data_size(cprm->mm_flags);
+ offset += vma_data_size;
offset += elf_core_extra_data_size();
e_shoff = offset;
@@ -1704,23 +1592,26 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
goto end_coredump;
/* write program headers for segments dump */
- for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
+ for (i = 0; i < vma_count; i++) {
+ struct core_vma_metadata *meta = vma_meta + i;
struct elf_phdr phdr;
size_t sz;
- sz = vma->vm_end - vma->vm_start;
+ sz = meta->end - meta->start;
phdr.p_type = PT_LOAD;
phdr.p_offset = offset;
- phdr.p_vaddr = vma->vm_start;
+ phdr.p_vaddr = meta->start;
phdr.p_paddr = 0;
- phdr.p_filesz = maydump(vma, cprm->mm_flags) ? sz : 0;
+ phdr.p_filesz = meta->dump_size;
phdr.p_memsz = sz;
offset += phdr.p_filesz;
- phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0;
- if (vma->vm_flags & VM_WRITE)
+ phdr.p_flags = 0;
+ if (meta->flags & VM_READ)
+ phdr.p_flags |= PF_R;
+ if (meta->flags & VM_WRITE)
phdr.p_flags |= PF_W;
- if (vma->vm_flags & VM_EXEC)
+ if (meta->flags & VM_EXEC)
phdr.p_flags |= PF_X;
phdr.p_align = ELF_EXEC_PAGESIZE;
@@ -1752,7 +1643,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
if (!dump_skip(cprm, dataoff - cprm->pos))
goto end_coredump;
- if (!elf_fdpic_dump_segments(cprm))
+ if (!elf_fdpic_dump_segments(cprm, vma_meta, vma_count))
goto end_coredump;
if (!elf_core_write_extra_data(cprm))
@@ -1776,6 +1667,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
thread_list = thread_list->next;
kfree(tmp);
}
+ kvfree(vma_meta);
kfree(phdr4note);
kfree(elf);
kfree(psinfo);
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index ca22737..b0983e2 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1168,7 +1168,7 @@ EXPORT_SYMBOL(configfs_depend_item);
/*
* Release the dependent linkage. This is much simpler than
- * configfs_depend_item() because we know that that the client driver is
+ * configfs_depend_item() because we know that the client driver is
* pinned, thus the subsystem is pinned, and therefore configfs is pinned.
*/
void configfs_undepend_item(struct config_item *target)
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index fb65b70..1f02702 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -267,7 +267,7 @@ flush_write_buffer(struct file *file, struct configfs_buffer *buffer, size_t cou
* There is no easy way for us to know if userspace is only doing a partial
* write, so we don't support them. We expect the entire buffer to come
* on the first write.
- * Hint: if you're writing a value, first read the file, modify only the
+ * Hint: if you're writing a value, first read the file, modify only
* the value you're changing, then write entire buffer back.
*/
diff --git a/fs/coredump.c b/fs/coredump.c
index 76e7c10..0cd9056 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -840,17 +840,17 @@ int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
ssize_t n;
if (cprm->written + nr > cprm->limit)
return 0;
- while (nr) {
- if (dump_interrupted())
- return 0;
- n = __kernel_write(file, addr, nr, &pos);
- if (n <= 0)
- return 0;
- file->f_pos = pos;
- cprm->written += n;
- cprm->pos += n;
- nr -= n;
- }
+
+
+ if (dump_interrupted())
+ return 0;
+ n = __kernel_write(file, addr, nr, &pos);
+ if (n != nr)
+ return 0;
+ file->f_pos = pos;
+ cprm->written += n;
+ cprm->pos += n;
+
return 1;
}
EXPORT_SYMBOL(dump_emit);
@@ -876,6 +876,40 @@ int dump_skip(struct coredump_params *cprm, size_t nr)
}
EXPORT_SYMBOL(dump_skip);
+#ifdef CONFIG_ELF_CORE
+int dump_user_range(struct coredump_params *cprm, unsigned long start,
+ unsigned long len)
+{
+ unsigned long addr;
+
+ for (addr = start; addr < start + len; addr += PAGE_SIZE) {
+ struct page *page;
+ int stop;
+
+ /*
+ * To avoid having to allocate page tables for virtual address
+ * ranges that have never been used yet, and also to make it
+ * easy to generate sparse core files, use a helper that returns
+ * NULL when encountering an empty page table entry that would
+ * otherwise have been filled with the zero page.
+ */
+ page = get_dump_page(addr);
+ if (page) {
+ void *kaddr = kmap(page);
+
+ stop = !dump_emit(cprm, kaddr, PAGE_SIZE);
+ kunmap(page);
+ put_page(page);
+ } else {
+ stop = !dump_skip(cprm, PAGE_SIZE);
+ }
+ if (stop)
+ return 0;
+ }
+ return 1;
+}
+#endif
+
int dump_align(struct coredump_params *cprm, int align)
{
unsigned mod = cprm->pos & (align - 1);
@@ -902,3 +936,183 @@ void dump_truncate(struct coredump_params *cprm)
}
}
EXPORT_SYMBOL(dump_truncate);
+
+/*
+ * The purpose of always_dump_vma() is to make sure that special kernel mappings
+ * that are useful for post-mortem analysis are included in every core dump.
+ * In that way we ensure that the core dump is fully interpretable later
+ * without matching up the same kernel and hardware config to see what PC values
+ * meant. These special mappings include - vDSO, vsyscall, and other
+ * architecture specific mappings
+ */
+static bool always_dump_vma(struct vm_area_struct *vma)
+{
+ /* Any vsyscall mappings? */
+ if (vma == get_gate_vma(vma->vm_mm))
+ return true;
+
+ /*
+ * Assume that all vmas with a .name op should always be dumped.
+ * If this changes, a new vm_ops field can easily be added.
+ */
+ if (vma->vm_ops && vma->vm_ops->name && vma->vm_ops->name(vma))
+ return true;
+
+ /*
+ * arch_vma_name() returns non-NULL for special architecture mappings,
+ * such as vDSO sections.
+ */
+ if (arch_vma_name(vma))
+ return true;
+
+ return false;
+}
+
+/*
+ * Decide how much of @vma's contents should be included in a core dump.
+ */
+static unsigned long vma_dump_size(struct vm_area_struct *vma,
+ unsigned long mm_flags)
+{
+#define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type))
+
+ /* always dump the vdso and vsyscall sections */
+ if (always_dump_vma(vma))
+ goto whole;
+
+ if (vma->vm_flags & VM_DONTDUMP)
+ return 0;
+
+ /* support for DAX */
+ if (vma_is_dax(vma)) {
+ if ((vma->vm_flags & VM_SHARED) && FILTER(DAX_SHARED))
+ goto whole;
+ if (!(vma->vm_flags & VM_SHARED) && FILTER(DAX_PRIVATE))
+ goto whole;
+ return 0;
+ }
+
+ /* Hugetlb memory check */
+ if (is_vm_hugetlb_page(vma)) {
+ if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
+ goto whole;
+ if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE))
+ goto whole;
+ return 0;
+ }
+
+ /* Do not dump I/O mapped devices or special mappings */
+ if (vma->vm_flags & VM_IO)
+ return 0;
+
+ /* By default, dump shared memory if mapped from an anonymous file. */
+ if (vma->vm_flags & VM_SHARED) {
+ if (file_inode(vma->vm_file)->i_nlink == 0 ?
+ FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED))
+ goto whole;
+ return 0;
+ }
+
+ /* Dump segments that have been written to. */
+ if ((!IS_ENABLED(CONFIG_MMU) || vma->anon_vma) && FILTER(ANON_PRIVATE))
+ goto whole;
+ if (vma->vm_file == NULL)
+ return 0;
+
+ if (FILTER(MAPPED_PRIVATE))
+ goto whole;
+
+ /*
+ * If this is the beginning of an executable file mapping,
+ * dump the first page to aid in determining what was mapped here.
+ */
+ if (FILTER(ELF_HEADERS) &&
+ vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ) &&
+ (READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0)
+ return PAGE_SIZE;
+
+#undef FILTER
+
+ return 0;
+
+whole:
+ return vma->vm_end - vma->vm_start;
+}
+
+static struct vm_area_struct *first_vma(struct task_struct *tsk,
+ struct vm_area_struct *gate_vma)
+{
+ struct vm_area_struct *ret = tsk->mm->mmap;
+
+ if (ret)
+ return ret;
+ return gate_vma;
+}
+
+/*
+ * Helper function for iterating across a vma list. It ensures that the caller
+ * will visit `gate_vma' prior to terminating the search.
+ */
+static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
+ struct vm_area_struct *gate_vma)
+{
+ struct vm_area_struct *ret;
+
+ ret = this_vma->vm_next;
+ if (ret)
+ return ret;
+ if (this_vma == gate_vma)
+ return NULL;
+ return gate_vma;
+}
+
+/*
+ * Under the mmap_lock, take a snapshot of relevant information about the task's
+ * VMAs.
+ */
+int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count,
+ struct core_vma_metadata **vma_meta,
+ size_t *vma_data_size_ptr)
+{
+ struct vm_area_struct *vma, *gate_vma;
+ struct mm_struct *mm = current->mm;
+ int i;
+ size_t vma_data_size = 0;
+
+ /*
+ * Once the stack expansion code is fixed to not change VMA bounds
+ * under mmap_lock in read mode, this can be changed to take the
+ * mmap_lock in read mode.
+ */
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+
+ gate_vma = get_gate_vma(mm);
+ *vma_count = mm->map_count + (gate_vma ? 1 : 0);
+
+ *vma_meta = kvmalloc_array(*vma_count, sizeof(**vma_meta), GFP_KERNEL);
+ if (!*vma_meta) {
+ mmap_write_unlock(mm);
+ return -ENOMEM;
+ }
+
+ for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
+ vma = next_vma(vma, gate_vma), i++) {
+ struct core_vma_metadata *m = (*vma_meta) + i;
+
+ m->start = vma->vm_start;
+ m->end = vma->vm_end;
+ m->flags = vma->vm_flags;
+ m->dump_size = vma_dump_size(vma, cprm->mm_flags);
+
+ vma_data_size += m->dump_size;
+ }
+
+ mmap_write_unlock(mm);
+
+ if (WARN_ON(i != *vma_count))
+ return -EFAULT;
+
+ *vma_data_size_ptr = vma_data_size;
+ return 0;
+}
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index bbd5e7e..5b7ba8f 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -349,6 +349,7 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode,
pgoff_t index,
unsigned long num_ra_pages)
{
+ DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, index);
struct page *page;
index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT;
@@ -358,8 +359,7 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode,
if (page)
put_page(page);
else if (num_ra_pages > 1)
- page_cache_readahead_unbounded(inode->i_mapping, NULL,
- index, num_ra_pages, 0);
+ page_cache_ra_unbounded(&ractl, num_ra_pages, 0);
page = read_mapping_page(inode->i_mapping, index, NULL);
}
return page;
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index 9eb0dba..054ec85 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -228,6 +228,7 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode,
pgoff_t index,
unsigned long num_ra_pages)
{
+ DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, index);
struct page *page;
index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT;
@@ -237,8 +238,7 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode,
if (page)
put_page(page);
else if (num_ra_pages > 1)
- page_cache_readahead_unbounded(inode->i_mapping, NULL,
- index, num_ra_pages, 0);
+ page_cache_ra_unbounded(&ractl, num_ra_pages, 0);
page = read_mapping_page(inode->i_mapping, index, NULL);
}
return page;
diff --git a/fs/inode.c b/fs/inode.c
index 72c4c34..9d78c37 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -181,6 +181,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
mapping->a_ops = &empty_aops;
mapping->host = inode;
mapping->flags = 0;
+ if (sb->s_type->fs_flags & FS_THP_SUPPORT)
+ __set_bit(AS_THP_SUPPORT, &mapping->flags);
mapping->wb_err = 0;
atomic_set(&mapping->i_mmap_writable, 0);
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index e516ae3..5900879 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -355,7 +355,7 @@ void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *bmap,
/**
* nilfs_bmap_assign - assign a new block number to a block
* @bmap: bmap
- * @bhp: pointer to buffer head
+ * @bh: pointer to buffer head
* @blocknr: block number
* @binfo: block information
*
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 86d4d85..025fb08 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -889,7 +889,7 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
* nilfs_cpfile_change_cpmode - change checkpoint mode
* @cpfile: inode of checkpoint file
* @cno: checkpoint number
- * @status: mode of checkpoint
+ * @mode: mode of checkpoint
*
* Description: nilfs_change_cpmode() changes the mode of the checkpoint
* specified by @cno. The mode @mode is NILFS_CHECKPOINT or NILFS_SNAPSHOT.
@@ -930,12 +930,12 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
/**
* nilfs_cpfile_get_stat - get checkpoint statistics
* @cpfile: inode of checkpoint file
- * @stat: pointer to a structure of checkpoint statistics
+ * @cpstat: pointer to a structure of checkpoint statistics
*
* Description: nilfs_cpfile_get_stat() returns information about checkpoints.
*
* Return Value: On success, 0 is returned, and checkpoints information is
- * stored in the place pointed by @stat. On error, one of the following
+ * stored in the place pointed by @cpstat. On error, one of the following
* negative error codes is returned.
*
* %-EIO - I/O error.
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index b175f13..171fb5c 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -69,7 +69,6 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
/**
* nilfs_forget_buffer - discard dirty state
- * @inode: owner inode of the buffer
* @bh: buffer head of the buffer to be discarded
*/
void nilfs_forget_buffer(struct buffer_head *bh)
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 42ff67c..6372247 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -546,13 +546,13 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
/**
* nilfs_sufile_get_stat - get segment usage statistics
* @sufile: inode of segment usage file
- * @stat: pointer to a structure of segment usage statistics
+ * @sustat: pointer to a structure of segment usage statistics
*
* Description: nilfs_sufile_get_stat() returns information about segment
* usage.
*
* Return Value: On success, 0 is returned, and segment usage information is
- * stored in the place pointed by @stat. On error, one of the following
+ * stored in the place pointed by @sustat. On error, one of the following
* negative error codes is returned.
*
* %-EIO - I/O error.
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 846d43d..217aa27 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1244,24 +1244,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
count = -EINTR;
goto out_mm;
}
- /*
- * Avoid to modify vma->vm_flags
- * without locked ops while the
- * coredump reads the vm_flags.
- */
- if (!mmget_still_valid(mm)) {
- /*
- * Silently return "count"
- * like if get_task_mm()
- * failed. FIXME: should this
- * function have returned
- * -ESRCH if get_task_mm()
- * failed like if
- * get_proc_task() fails?
- */
- mmap_write_unlock(mm);
- goto out_mm;
- }
for (vma = mm->mmap; vma; vma = vma->vm_next) {
vma->vm_flags &= ~VM_SOFTDIRTY;
vma_set_page_prot(vma);
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 4146954..355523f 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -224,7 +224,7 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
if (!pages)
goto out_free;
- nr = find_get_pages(inode->i_mapping, &pgoff, lpages, pages);
+ nr = find_get_pages_contig(inode->i_mapping, pgoff, lpages, pages);
if (nr != lpages)
goto out_free_pages; /* leave if some pages were missing */
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index e582d00..b1b7d3f 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -356,6 +356,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
}
i->i_mode = mode;
+ i->i_blocks = (i->i_size + 511) >> 9;
unlock_new_inode(i);
return i;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 0e4a383..000b457 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -601,8 +601,6 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
/* the various vma->vm_userfaultfd_ctx still points to it */
mmap_write_lock(mm);
- /* no task can run (and in turn coredump) yet */
- VM_WARN_ON(!mmget_still_valid(mm));
for (vma = mm->mmap; vma; vma = vma->vm_next)
if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
@@ -842,7 +840,6 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
/* len == 0 means wake all */
struct userfaultfd_wake_range range = { .len = 0, };
unsigned long new_flags;
- bool still_valid;
WRITE_ONCE(ctx->released, true);
@@ -858,7 +855,6 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
* taking the mmap_lock for writing.
*/
mmap_write_lock(mm);
- still_valid = mmget_still_valid(mm);
prev = NULL;
for (vma = mm->mmap; vma; vma = vma->vm_next) {
cond_resched();
@@ -869,17 +865,15 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
continue;
}
new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
- if (still_valid) {
- prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
- new_flags, vma->anon_vma,
- vma->vm_file, vma->vm_pgoff,
- vma_policy(vma),
- NULL_VM_UFFD_CTX);
- if (prev)
- vma = prev;
- else
- prev = vma;
- }
+ prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
+ new_flags, vma->anon_vma,
+ vma->vm_file, vma->vm_pgoff,
+ vma_policy(vma),
+ NULL_VM_UFFD_CTX);
+ if (prev)
+ vma = prev;
+ else
+ prev = vma;
vma->vm_flags = new_flags;
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
}
@@ -1309,8 +1303,6 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
goto out;
mmap_write_lock(mm);
- if (!mmget_still_valid(mm))
- goto out_unlock;
vma = find_vma_prev(mm, start, &prev);
if (!vma)
goto out_unlock;
@@ -1511,8 +1503,6 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
goto out;
mmap_write_lock(mm);
- if (!mmget_still_valid(mm))
- goto out_unlock;
vma = find_vma_prev(mm, start, &prev);
if (!vma)
goto out_unlock;
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 99f2ac3..5b74bdf 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -188,12 +188,10 @@ static inline unsigned fls_long(unsigned long l)
static inline int get_count_order(unsigned int count)
{
- int order;
+ if (count == 0)
+ return -1;
- order = fls(count) - 1;
- if (count & (count - 1))
- order++;
- return order;
+ return fls(--count);
}
/**
@@ -206,10 +204,7 @@ static inline int get_count_order_long(unsigned long l)
{
if (l == 0UL)
return -1;
- else if (l & (l - 1UL))
- return (int)fls_long(l);
- else
- return (int)fls_long(l) - 1;
+ return (int)fls_long(--l);
}
/**
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c09375e..639cae2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -8,6 +8,7 @@
#include <linux/genhd.h>
#include <linux/list.h>
#include <linux/llist.h>
+#include <linux/minmax.h>
#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/pagemap.h>
diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index dd74503..2efec10 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -7,10 +7,14 @@
#ifndef __LINUX_BVEC_ITER_H
#define __LINUX_BVEC_ITER_H
-#include <linux/kernel.h>
#include <linux/bug.h>
#include <linux/errno.h>
+#include <linux/limits.h>
+#include <linux/minmax.h>
#include <linux/mm.h>
+#include <linux/types.h>
+
+struct page;
/**
* struct bio_vec - a contiguous range of physical memory addresses
diff --git a/include/linux/coredump.h b/include/linux/coredump.h
index 7a899e8..e58e8c2 100644
--- a/include/linux/coredump.h
+++ b/include/linux/coredump.h
@@ -7,6 +7,12 @@
#include <linux/fs.h>
#include <asm/siginfo.h>
+struct core_vma_metadata {
+ unsigned long start, end;
+ unsigned long flags;
+ unsigned long dump_size;
+};
+
/*
* These are the only things you should do on a core-file: use only these
* functions to write out all the necessary info.
@@ -16,6 +22,11 @@ extern int dump_skip(struct coredump_params *cprm, size_t nr);
extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr);
extern int dump_align(struct coredump_params *cprm, int align);
extern void dump_truncate(struct coredump_params *cprm);
+int dump_user_range(struct coredump_params *cprm, unsigned long start,
+ unsigned long len);
+int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count,
+ struct core_vma_metadata **vma_meta,
+ size_t *vma_data_size_ptr);
#ifdef CONFIG_COREDUMP
extern void do_coredump(const kernel_siginfo_t *siginfo);
#else
diff --git a/include/linux/fault-inject-usercopy.h b/include/linux/fault-inject-usercopy.h
new file mode 100644
index 0000000..56c3a69
--- /dev/null
+++ b/include/linux/fault-inject-usercopy.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_FAULT_INJECT_USERCOPY_H__
+#define __LINUX_FAULT_INJECT_USERCOPY_H__
+
+/*
+ * This header provides a wrapper for injecting failures to user space memory
+ * access functions.
+ */
+
+#include <linux/types.h>
+
+#ifdef CONFIG_FAULT_INJECTION_USERCOPY
+
+bool should_fail_usercopy(void);
+
+#else
+
+static inline bool should_fail_usercopy(void) { return false; }
+
+#endif /* CONFIG_FAULT_INJECTION_USERCOPY */
+
+#endif /* __LINUX_FAULT_INJECT_USERCOPY_H__ */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ae97d87..d1d166b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2209,6 +2209,7 @@ struct file_system_type {
#define FS_HAS_SUBTYPE 4
#define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */
#define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */
+#define FS_THP_SUPPORT 8192 /* Remove once all fs converted */
#define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */
int (*init_fs_context)(struct fs_context *);
const struct fs_parameter_spec *parameters;
@@ -2696,33 +2697,6 @@ static inline errseq_t file_sample_sb_err(struct file *file)
return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err);
}
-static inline int filemap_nr_thps(struct address_space *mapping)
-{
-#ifdef CONFIG_READ_ONLY_THP_FOR_FS
- return atomic_read(&mapping->nr_thps);
-#else
- return 0;
-#endif
-}
-
-static inline void filemap_nr_thps_inc(struct address_space *mapping)
-{
-#ifdef CONFIG_READ_ONLY_THP_FOR_FS
- atomic_inc(&mapping->nr_thps);
-#else
- WARN_ON_ONCE(1);
-#endif
-}
-
-static inline void filemap_nr_thps_dec(struct address_space *mapping)
-{
-#ifdef CONFIG_READ_ONLY_THP_FOR_FS
- atomic_dec(&mapping->nr_thps);
-#else
- WARN_ON_ONCE(1);
-#endif
-}
-
extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
int datasync);
extern int vfs_fsync(struct file *file, int datasync);
diff --git a/include/linux/idr.h b/include/linux/idr.h
index 3ade03e..a0dce14 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -263,7 +263,8 @@ void ida_destroy(struct ida *ida);
*
* Allocate an ID between 0 and %INT_MAX, inclusive.
*
- * Context: Any context.
+ * Context: Any context. It is safe to call this function without
+ * locking in your code.
* Return: The allocated ID, or %-ENOMEM if memory could not be allocated,
* or %-ENOSPC if there are no free IDs.
*/
@@ -280,7 +281,8 @@ static inline int ida_alloc(struct ida *ida, gfp_t gfp)
*
* Allocate an ID between @min and %INT_MAX, inclusive.
*
- * Context: Any context.
+ * Context: Any context. It is safe to call this function without
+ * locking in your code.
* Return: The allocated ID, or %-ENOMEM if memory could not be allocated,
* or %-ENOSPC if there are no free IDs.
*/
@@ -297,7 +299,8 @@ static inline int ida_alloc_min(struct ida *ida, unsigned int min, gfp_t gfp)
*
* Allocate an ID between 0 and @max, inclusive.
*
- * Context: Any context.
+ * Context: Any context. It is safe to call this function without
+ * locking in your code.
* Return: The allocated ID, or %-ENOMEM if memory could not be allocated,
* or %-ENOSPC if there are no free IDs.
*/
@@ -311,6 +314,10 @@ static inline void ida_init(struct ida *ida)
xa_init_flags(&ida->xa, IDA_INIT_FLAGS);
}
+/*
+ * ida_simple_get() and ida_simple_remove() are deprecated. Use
+ * ida_alloc() and ida_free() instead respectively.
+ */
#define ida_simple_get(ida, start, end, gfp) \
ida_alloc_range(ida, start, (end) - 1, gfp)
#define ida_simple_remove(ida, id) ida_free(ida, id)
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 6c2b06f..5135d4b 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -58,6 +58,10 @@ struct resource {
#define IORESOURCE_EXT_TYPE_BITS 0x01000000 /* Resource extended types */
#define IORESOURCE_SYSRAM 0x01000000 /* System RAM (modifier) */
+/* IORESOURCE_SYSRAM specific bits. */
+#define IORESOURCE_SYSRAM_DRIVER_MANAGED 0x02000000 /* Always detected via a driver. */
+#define IORESOURCE_SYSRAM_MERGEABLE 0x04000000 /* Resource can be merged. */
+
#define IORESOURCE_EXCLUSIVE 0x08000000 /* Userland may not map this resource */
#define IORESOURCE_DISABLED 0x10000000
@@ -103,7 +107,6 @@ struct resource {
#define IORESOURCE_MEM_32BIT (3<<3)
#define IORESOURCE_MEM_SHADOWABLE (1<<5) /* dup: IORESOURCE_SHADOWABLE */
#define IORESOURCE_MEM_EXPANSIONROM (1<<6)
-#define IORESOURCE_MEM_DRIVER_MANAGED (1<<7)
/* PnP I/O specific bits (IORESOURCE_BITS) */
#define IORESOURCE_IO_16BIT_ADDR (1<<0)
@@ -248,8 +251,10 @@ extern struct resource * __request_region(struct resource *,
extern void __release_region(struct resource *, resource_size_t,
resource_size_t);
#ifdef CONFIG_MEMORY_HOTREMOVE
-extern int release_mem_region_adjustable(struct resource *, resource_size_t,
- resource_size_t);
+extern void release_mem_region_adjustable(resource_size_t, resource_size_t);
+#endif
+#ifdef CONFIG_MEMORY_HOTPLUG
+extern void merge_system_ram_resource(struct resource *res);
#endif
/* Wrappers for managed devices */
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index fed6ba9..5e13f80 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -3,8 +3,9 @@
#define _LINUX_JIFFIES_H
#include <linux/cache.h>
+#include <linux/limits.h>
#include <linux/math64.h>
-#include <linux/kernel.h>
+#include <linux/minmax.h>
#include <linux/types.h>
#include <linux/time.h>
#include <linux/timex.h>
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index e4aa29b..c629215 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -11,6 +11,7 @@
#include <linux/compiler.h>
#include <linux/bitops.h>
#include <linux/log2.h>
+#include <linux/minmax.h>
#include <linux/typecheck.h>
#include <linux/printk.h>
#include <linux/build_bug.h>
@@ -833,155 +834,6 @@ ftrace_vprintk(const char *fmt, va_list ap)
static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
#endif /* CONFIG_TRACING */
-/*
- * min()/max()/clamp() macros must accomplish three things:
- *
- * - avoid multiple evaluations of the arguments (so side-effects like
- * "x++" happen only once) when non-constant.
- * - perform strict type-checking (to generate warnings instead of
- * nasty runtime surprises). See the "unnecessary" pointer comparison
- * in __typecheck().
- * - retain result as a constant expressions when called with only
- * constant expressions (to avoid tripping VLA warnings in stack
- * allocation usage).
- */
-#define __typecheck(x, y) \
- (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1)))
-
-/*
- * This returns a constant expression while determining if an argument is
- * a constant expression, most importantly without evaluating the argument.
- * Glory to Martin Uecker <Martin.Uecker@med.uni-goettingen.de>
- */
-#define __is_constexpr(x) \
- (sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8)))
-
-#define __no_side_effects(x, y) \
- (__is_constexpr(x) && __is_constexpr(y))
-
-#define __safe_cmp(x, y) \
- (__typecheck(x, y) && __no_side_effects(x, y))
-
-#define __cmp(x, y, op) ((x) op (y) ? (x) : (y))
-
-#define __cmp_once(x, y, unique_x, unique_y, op) ({ \
- typeof(x) unique_x = (x); \
- typeof(y) unique_y = (y); \
- __cmp(unique_x, unique_y, op); })
-
-#define __careful_cmp(x, y, op) \
- __builtin_choose_expr(__safe_cmp(x, y), \
- __cmp(x, y, op), \
- __cmp_once(x, y, __UNIQUE_ID(__x), __UNIQUE_ID(__y), op))
-
-/**
- * min - return minimum of two values of the same or compatible types
- * @x: first value
- * @y: second value
- */
-#define min(x, y) __careful_cmp(x, y, <)
-
-/**
- * max - return maximum of two values of the same or compatible types
- * @x: first value
- * @y: second value
- */
-#define max(x, y) __careful_cmp(x, y, >)
-
-/**
- * min3 - return minimum of three values
- * @x: first value
- * @y: second value
- * @z: third value
- */
-#define min3(x, y, z) min((typeof(x))min(x, y), z)
-
-/**
- * max3 - return maximum of three values
- * @x: first value
- * @y: second value
- * @z: third value
- */
-#define max3(x, y, z) max((typeof(x))max(x, y), z)
-
-/**
- * min_not_zero - return the minimum that is _not_ zero, unless both are zero
- * @x: value1
- * @y: value2
- */
-#define min_not_zero(x, y) ({ \
- typeof(x) __x = (x); \
- typeof(y) __y = (y); \
- __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })
-
-/**
- * clamp - return a value clamped to a given range with strict typechecking
- * @val: current value
- * @lo: lowest allowable value
- * @hi: highest allowable value
- *
- * This macro does strict typechecking of @lo/@hi to make sure they are of the
- * same type as @val. See the unnecessary pointer comparisons.
- */
-#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi)
-
-/*
- * ..and if you can't take the strict
- * types, you can specify one yourself.
- *
- * Or not use min/max/clamp at all, of course.
- */
-
-/**
- * min_t - return minimum of two values, using the specified type
- * @type: data type to use
- * @x: first value
- * @y: second value
- */
-#define min_t(type, x, y) __careful_cmp((type)(x), (type)(y), <)
-
-/**
- * max_t - return maximum of two values, using the specified type
- * @type: data type to use
- * @x: first value
- * @y: second value
- */
-#define max_t(type, x, y) __careful_cmp((type)(x), (type)(y), >)
-
-/**
- * clamp_t - return a value clamped to a given range using a given type
- * @type: the type of variable to use
- * @val: current value
- * @lo: minimum allowable value
- * @hi: maximum allowable value
- *
- * This macro does no typechecking and uses temporary variables of type
- * @type to make all the comparisons.
- */
-#define clamp_t(type, val, lo, hi) min_t(type, max_t(type, val, lo), hi)
-
-/**
- * clamp_val - return a value clamped to a given range using val's type
- * @val: current value
- * @lo: minimum allowable value
- * @hi: maximum allowable value
- *
- * This macro does no typechecking and uses temporary variables of whatever
- * type the input argument @val is. This is useful when @val is an unsigned
- * type and @lo and @hi are literals that will otherwise be assigned a signed
- * integer type.
- */
-#define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi)
-
-
-/**
- * swap - swap values of @a and @b
- * @a: first value
- * @b: second value
- */
-#define swap(a, b) \
- do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
-
/* This counts to 12. Any more, it will return 13th argument. */
#define __COUNT_ARGS(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _n, X...) _n
#define COUNT_ARGS(X...) __COUNT_ARGS(, ##X, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
diff --git a/include/linux/list.h b/include/linux/list.h
index 0d0d17a..a18c87b 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -610,6 +610,15 @@ static inline void list_splice_tail_init(struct list_head *list,
pos = n, n = pos->prev)
/**
+ * list_entry_is_head - test if the entry points to the head of the list
+ * @pos: the type * to cursor
+ * @head: the head for your list.
+ * @member: the name of the list_head within the struct.
+ */
+#define list_entry_is_head(pos, head, member) \
+ (&pos->member == (head))
+
+/**
* list_for_each_entry - iterate over list of given type
* @pos: the type * to use as a loop cursor.
* @head: the head for your list.
@@ -617,7 +626,7 @@ static inline void list_splice_tail_init(struct list_head *list,
*/
#define list_for_each_entry(pos, head, member) \
for (pos = list_first_entry(head, typeof(*pos), member); \
- &pos->member != (head); \
+ !list_entry_is_head(pos, head, member); \
pos = list_next_entry(pos, member))
/**
@@ -628,7 +637,7 @@ static inline void list_splice_tail_init(struct list_head *list,
*/
#define list_for_each_entry_reverse(pos, head, member) \
for (pos = list_last_entry(head, typeof(*pos), member); \
- &pos->member != (head); \
+ !list_entry_is_head(pos, head, member); \
pos = list_prev_entry(pos, member))
/**
@@ -653,7 +662,7 @@ static inline void list_splice_tail_init(struct list_head *list,
*/
#define list_for_each_entry_continue(pos, head, member) \
for (pos = list_next_entry(pos, member); \
- &pos->member != (head); \
+ !list_entry_is_head(pos, head, member); \
pos = list_next_entry(pos, member))
/**
@@ -667,7 +676,7 @@ static inline void list_splice_tail_init(struct list_head *list,
*/
#define list_for_each_entry_continue_reverse(pos, head, member) \
for (pos = list_prev_entry(pos, member); \
- &pos->member != (head); \
+ !list_entry_is_head(pos, head, member); \
pos = list_prev_entry(pos, member))
/**
@@ -679,7 +688,7 @@ static inline void list_splice_tail_init(struct list_head *list,
* Iterate over list of given type, continuing from current position.
*/
#define list_for_each_entry_from(pos, head, member) \
- for (; &pos->member != (head); \
+ for (; !list_entry_is_head(pos, head, member); \
pos = list_next_entry(pos, member))
/**
@@ -692,7 +701,7 @@ static inline void list_splice_tail_init(struct list_head *list,
* Iterate backwards over list of given type, continuing from current position.
*/
#define list_for_each_entry_from_reverse(pos, head, member) \
- for (; &pos->member != (head); \
+ for (; !list_entry_is_head(pos, head, member); \
pos = list_prev_entry(pos, member))
/**
@@ -705,7 +714,7 @@ static inline void list_splice_tail_init(struct list_head *list,
#define list_for_each_entry_safe(pos, n, head, member) \
for (pos = list_first_entry(head, typeof(*pos), member), \
n = list_next_entry(pos, member); \
- &pos->member != (head); \
+ !list_entry_is_head(pos, head, member); \
pos = n, n = list_next_entry(n, member))
/**
@@ -721,7 +730,7 @@ static inline void list_splice_tail_init(struct list_head *list,
#define list_for_each_entry_safe_continue(pos, n, head, member) \
for (pos = list_next_entry(pos, member), \
n = list_next_entry(pos, member); \
- &pos->member != (head); \
+ !list_entry_is_head(pos, head, member); \
pos = n, n = list_next_entry(n, member))
/**
@@ -736,7 +745,7 @@ static inline void list_splice_tail_init(struct list_head *list,
*/
#define list_for_each_entry_safe_from(pos, n, head, member) \
for (n = list_next_entry(pos, member); \
- &pos->member != (head); \
+ !list_entry_is_head(pos, head, member); \
pos = n, n = list_next_entry(n, member))
/**
@@ -752,7 +761,7 @@ static inline void list_splice_tail_init(struct list_head *list,
#define list_for_each_entry_safe_reverse(pos, n, head, member) \
for (pos = list_last_entry(head, typeof(*pos), member), \
n = list_prev_entry(pos, member); \
- &pos->member != (head); \
+ !list_entry_is_head(pos, head, member); \
pos = n, n = list_prev_entry(n, member))
/**
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index c0faa7a..d65c6fd 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -57,6 +57,19 @@ enum {
MMOP_ONLINE_MOVABLE,
};
+/* Flags for add_memory() and friends to specify memory hotplug details. */
+typedef int __bitwise mhp_t;
+
+/* No special request */
+#define MHP_NONE ((__force mhp_t)0)
+/*
+ * Allow merging of the added System RAM resource with adjacent,
+ * mergeable resources. After a successful call to add_memory_resource()
+ * with this flag set, the resource pointer must no longer be used as it
+ * might be stale, or the resource might have changed.
+ */
+#define MEMHP_MERGE_RESOURCE ((__force mhp_t)BIT(0))
+
/*
* Extended parameters for memory hotplug:
* altmap: alternative allocator for memmap array (optional)
@@ -103,8 +116,8 @@ extern int online_pages(unsigned long pfn, unsigned long nr_pages,
int online_type, int nid);
extern struct zone *test_pages_in_a_zone(unsigned long start_pfn,
unsigned long end_pfn);
-extern unsigned long __offline_isolated_pages(unsigned long start_pfn,
- unsigned long end_pfn);
+extern void __offline_isolated_pages(unsigned long start_pfn,
+ unsigned long end_pfn);
typedef void (*online_page_callback_t)(struct page *page, unsigned int order);
@@ -247,13 +260,6 @@ static inline void zone_span_writelock(struct zone *zone) {}
static inline void zone_span_writeunlock(struct zone *zone) {}
static inline void zone_seqlock_init(struct zone *zone) {}
-static inline int mhp_notimplemented(const char *func)
-{
- printk(KERN_WARNING "%s() called, with CONFIG_MEMORY_HOTPLUG disabled\n", func);
- dump_stack();
- return -ENOSYS;
-}
-
static inline void register_page_bootmem_info_node(struct pglist_data *pgdat)
{
}
@@ -344,14 +350,18 @@ static inline void __remove_memory(int nid, u64 start, u64 size) {}
extern void set_zone_contiguous(struct zone *zone);
extern void clear_zone_contiguous(struct zone *zone);
+#ifdef CONFIG_MEMORY_HOTPLUG
extern void __ref free_area_init_core_hotplug(int nid);
-extern int __add_memory(int nid, u64 start, u64 size);
-extern int add_memory(int nid, u64 start, u64 size);
-extern int add_memory_resource(int nid, struct resource *resource);
+extern int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags);
+extern int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags);
+extern int add_memory_resource(int nid, struct resource *resource,
+ mhp_t mhp_flags);
extern int add_memory_driver_managed(int nid, u64 start, u64 size,
- const char *resource_name);
+ const char *resource_name,
+ mhp_t mhp_flags);
extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
- unsigned long nr_pages, struct vmem_altmap *altmap);
+ unsigned long nr_pages,
+ struct vmem_altmap *altmap, int migratetype);
extern void remove_pfn_range_from_zone(struct zone *zone,
unsigned long start_pfn,
unsigned long nr_pages);
@@ -363,8 +373,8 @@ extern void sparse_remove_section(struct mem_section *ms,
unsigned long map_offset, struct vmem_altmap *altmap);
extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
unsigned long pnum);
-extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages,
- int online_type);
extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
unsigned long nr_pages);
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
#endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/include/linux/minmax.h b/include/linux/minmax.h
new file mode 100644
index 0000000..c0f57b0
--- /dev/null
+++ b/include/linux/minmax.h
@@ -0,0 +1,153 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_MINMAX_H
+#define _LINUX_MINMAX_H
+
+/*
+ * min()/max()/clamp() macros must accomplish three things:
+ *
+ * - avoid multiple evaluations of the arguments (so side-effects like
+ * "x++" happen only once) when non-constant.
+ * - perform strict type-checking (to generate warnings instead of
+ * nasty runtime surprises). See the "unnecessary" pointer comparison
+ * in __typecheck().
+ * - retain result as a constant expressions when called with only
+ * constant expressions (to avoid tripping VLA warnings in stack
+ * allocation usage).
+ */
+#define __typecheck(x, y) \
+ (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1)))
+
+/*
+ * This returns a constant expression while determining if an argument is
+ * a constant expression, most importantly without evaluating the argument.
+ * Glory to Martin Uecker <Martin.Uecker@med.uni-goettingen.de>
+ */
+#define __is_constexpr(x) \
+ (sizeof(int) == sizeof(*(8 ? ((void *)((long)(x) * 0l)) : (int *)8)))
+
+#define __no_side_effects(x, y) \
+ (__is_constexpr(x) && __is_constexpr(y))
+
+#define __safe_cmp(x, y) \
+ (__typecheck(x, y) && __no_side_effects(x, y))
+
+#define __cmp(x, y, op) ((x) op (y) ? (x) : (y))
+
+#define __cmp_once(x, y, unique_x, unique_y, op) ({ \
+ typeof(x) unique_x = (x); \
+ typeof(y) unique_y = (y); \
+ __cmp(unique_x, unique_y, op); })
+
+#define __careful_cmp(x, y, op) \
+ __builtin_choose_expr(__safe_cmp(x, y), \
+ __cmp(x, y, op), \
+ __cmp_once(x, y, __UNIQUE_ID(__x), __UNIQUE_ID(__y), op))
+
+/**
+ * min - return minimum of two values of the same or compatible types
+ * @x: first value
+ * @y: second value
+ */
+#define min(x, y) __careful_cmp(x, y, <)
+
+/**
+ * max - return maximum of two values of the same or compatible types
+ * @x: first value
+ * @y: second value
+ */
+#define max(x, y) __careful_cmp(x, y, >)
+
+/**
+ * min3 - return minimum of three values
+ * @x: first value
+ * @y: second value
+ * @z: third value
+ */
+#define min3(x, y, z) min((typeof(x))min(x, y), z)
+
+/**
+ * max3 - return maximum of three values
+ * @x: first value
+ * @y: second value
+ * @z: third value
+ */
+#define max3(x, y, z) max((typeof(x))max(x, y), z)
+
+/**
+ * min_not_zero - return the minimum that is _not_ zero, unless both are zero
+ * @x: value1
+ * @y: value2
+ */
+#define min_not_zero(x, y) ({ \
+ typeof(x) __x = (x); \
+ typeof(y) __y = (y); \
+ __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })
+
+/**
+ * clamp - return a value clamped to a given range with strict typechecking
+ * @val: current value
+ * @lo: lowest allowable value
+ * @hi: highest allowable value
+ *
+ * This macro does strict typechecking of @lo/@hi to make sure they are of the
+ * same type as @val. See the unnecessary pointer comparisons.
+ */
+#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi)
+
+/*
+ * ..and if you can't take the strict
+ * types, you can specify one yourself.
+ *
+ * Or not use min/max/clamp at all, of course.
+ */
+
+/**
+ * min_t - return minimum of two values, using the specified type
+ * @type: data type to use
+ * @x: first value
+ * @y: second value
+ */
+#define min_t(type, x, y) __careful_cmp((type)(x), (type)(y), <)
+
+/**
+ * max_t - return maximum of two values, using the specified type
+ * @type: data type to use
+ * @x: first value
+ * @y: second value
+ */
+#define max_t(type, x, y) __careful_cmp((type)(x), (type)(y), >)
+
+/**
+ * clamp_t - return a value clamped to a given range using a given type
+ * @type: the type of variable to use
+ * @val: current value
+ * @lo: minimum allowable value
+ * @hi: maximum allowable value
+ *
+ * This macro does no typechecking and uses temporary variables of type
+ * @type to make all the comparisons.
+ */
+#define clamp_t(type, val, lo, hi) min_t(type, max_t(type, val, lo), hi)
+
+/**
+ * clamp_val - return a value clamped to a given range using val's type
+ * @val: current value
+ * @lo: minimum allowable value
+ * @hi: maximum allowable value
+ *
+ * This macro does no typechecking and uses temporary variables of whatever
+ * type the input argument @val is. This is useful when @val is an unsigned
+ * type and @lo and @hi are literals that will otherwise be assigned a signed
+ * integer type.
+ */
+#define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi)
+
+/**
+ * swap - swap values of @a and @b
+ * @a: first value
+ * @b: second value
+ */
+#define swap(a, b) \
+ do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
+
+#endif /* _LINUX_MINMAX_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 620961e..61a2633 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2440,7 +2440,7 @@ extern int __meminit __early_pfn_to_nid(unsigned long pfn,
extern void set_dma_reserve(unsigned long new_dma_reserve);
extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long,
- enum meminit_context, struct vmem_altmap *);
+ enum meminit_context, struct vmem_altmap *, int migratetype);
extern void setup_per_zone_wmarks(void);
extern int __meminit init_per_zone_wmark_min(void);
extern void mem_init(void);
@@ -3025,8 +3025,6 @@ extern int memory_failure(unsigned long pfn, int flags);
extern void memory_failure_queue(unsigned long pfn, int flags);
extern void memory_failure_queue_kick(int cpu);
extern int unpoison_memory(unsigned long pfn);
-extern int get_hwpoison_page(struct page *page);
-#define put_hwpoison_page(page) put_page(page)
extern int sysctl_memory_failure_early_kill;
extern int sysctl_memory_failure_recovery;
extern void shake_page(struct page *p, int access);
@@ -3066,6 +3064,7 @@ enum mf_action_page_type {
MF_MSG_BUDDY,
MF_MSG_BUDDY_2ND,
MF_MSG_DAX,
+ MF_MSG_UNSPLIT_THP,
MF_MSG_UNKNOWN,
};
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c27fb1f..fb3bf69 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -266,6 +266,8 @@ static inline bool is_active_lru(enum lru_list lru)
return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE);
}
+#define ANON_AND_FILE 2
+
enum lruvec_flags {
LRUVEC_CONGESTED, /* lruvec has many dirty pages
* backed by a congested BDI
@@ -283,8 +285,8 @@ struct lruvec {
unsigned long file_cost;
/* Non-resident age, driven by LRU movement */
atomic_long_t nonresident_age;
- /* Refaults at the time of last reclaim cycle, anon=0, file=1 */
- unsigned long refaults[2];
+ /* Refaults at the time of last reclaim cycle */
+ unsigned long refaults[ANON_AND_FILE];
/* Various lruvec state flags (enum lruvec_flags) */
unsigned long flags;
#ifdef CONFIG_MEMCG
@@ -441,6 +443,8 @@ enum zone_type {
#ifndef __GENERATING_BOUNDS_H
+#define ASYNC_AND_SYNC 2
+
struct zone {
/* Read-mostly fields */
@@ -560,8 +564,8 @@ struct zone {
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/* pfn where compaction free scanner should start */
unsigned long compact_cached_free_pfn;
- /* pfn where async and sync compaction migration scanner should start */
- unsigned long compact_cached_migrate_pfn[2];
+ /* pfn where compaction migration scanner should start */
+ unsigned long compact_cached_migrate_pfn[ASYNC_AND_SYNC];
unsigned long compact_init_migrate_pfn;
unsigned long compact_init_free_pfn;
#endif
@@ -1416,7 +1420,6 @@ static inline unsigned long next_present_section_nr(unsigned long section_nr)
#define pfn_to_nid(pfn) (0)
#endif
-#define early_pfn_valid(pfn) pfn_valid(pfn)
void sparse_init(void);
#else
#define sparse_init() do {} while (0)
@@ -1436,10 +1439,6 @@ struct mminit_pfnnid_cache {
int last_nid;
};
-#ifndef early_pfn_valid
-#define early_pfn_valid(pfn) (1)
-#endif
-
/*
* If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we
* need to check pfn validity within that MAX_ORDER_NR_PAGES block.
diff --git a/include/linux/node.h b/include/linux/node.h
index 014ba3a..8e5a298 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -99,15 +99,14 @@ extern struct node *node_devices[];
typedef void (*node_registration_func_t)(struct node *);
#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_NUMA)
-int link_mem_sections(int nid, unsigned long start_pfn,
- unsigned long end_pfn,
- enum meminit_context context);
+void link_mem_sections(int nid, unsigned long start_pfn,
+ unsigned long end_pfn,
+ enum meminit_context context);
#else
-static inline int link_mem_sections(int nid, unsigned long start_pfn,
- unsigned long end_pfn,
- enum meminit_context context)
+static inline void link_mem_sections(int nid, unsigned long start_pfn,
+ unsigned long end_pfn,
+ enum meminit_context context)
{
- return 0;
}
#endif
@@ -130,8 +129,7 @@ static inline int register_one_node(int nid)
if (error)
return error;
/* link memory sections under this node */
- error = link_mem_sections(nid, start_pfn, end_pfn,
- MEMINIT_EARLY);
+ link_mem_sections(nid, start_pfn, end_pfn, MEMINIT_EARLY);
}
return error;
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 3334ce0..ac398e1 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -90,9 +90,9 @@
* for such situations. See below and CPUMASK_ALLOC also.
*/
-#include <linux/kernel.h>
#include <linux/threads.h>
#include <linux/bitmap.h>
+#include <linux/minmax.h>
#include <linux/numa.h>
typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 38ded40..4f6ba93 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -431,13 +431,9 @@ PAGEFLAG_FALSE(Uncached)
PAGEFLAG(HWPoison, hwpoison, PF_ANY)
TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
#define __PG_HWPOISON (1UL << PG_hwpoison)
-extern bool set_hwpoison_free_buddy_page(struct page *page);
+extern bool take_page_off_buddy(struct page *page);
#else
PAGEFLAG_FALSE(HWPoison)
-static inline bool set_hwpoison_free_buddy_page(struct page *page)
-{
- return 0;
-}
#define __PG_HWPOISON 0
#endif
diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h
index 8679ccd..3468794 100644
--- a/include/linux/page_owner.h
+++ b/include/linux/page_owner.h
@@ -11,7 +11,7 @@ extern struct page_ext_operations page_owner_ops;
extern void __reset_page_owner(struct page *page, unsigned int order);
extern void __set_page_owner(struct page *page,
unsigned int order, gfp_t gfp_mask);
-extern void __split_page_owner(struct page *page, unsigned int order);
+extern void __split_page_owner(struct page *page, unsigned int nr);
extern void __copy_page_owner(struct page *oldpage, struct page *newpage);
extern void __set_page_owner_migrate_reason(struct page *page, int reason);
extern void __dump_page_owner(struct page *page);
@@ -31,10 +31,10 @@ static inline void set_page_owner(struct page *page,
__set_page_owner(page, order, gfp_mask);
}
-static inline void split_page_owner(struct page *page, unsigned int order)
+static inline void split_page_owner(struct page *page, unsigned int nr)
{
if (static_branch_unlikely(&page_owner_inited))
- __split_page_owner(page, order);
+ __split_page_owner(page, nr);
}
static inline void copy_page_owner(struct page *oldpage, struct page *newpage)
{
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index c3afd32..c77b7c31 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -29,6 +29,7 @@ enum mapping_flags {
AS_EXITING = 4, /* final truncate in progress */
/* writeback related tags are not used */
AS_NO_WRITEBACK_TAGS = 5,
+ AS_THP_SUPPORT = 6, /* THPs supported */
};
/**
@@ -120,6 +121,40 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
m->gfp_mask = mask;
}
+static inline bool mapping_thp_support(struct address_space *mapping)
+{
+ return test_bit(AS_THP_SUPPORT, &mapping->flags);
+}
+
+static inline int filemap_nr_thps(struct address_space *mapping)
+{
+#ifdef CONFIG_READ_ONLY_THP_FOR_FS
+ return atomic_read(&mapping->nr_thps);
+#else
+ return 0;
+#endif
+}
+
+static inline void filemap_nr_thps_inc(struct address_space *mapping)
+{
+#ifdef CONFIG_READ_ONLY_THP_FOR_FS
+ if (!mapping_thp_support(mapping))
+ atomic_inc(&mapping->nr_thps);
+#else
+ WARN_ON_ONCE(1);
+#endif
+}
+
+static inline void filemap_nr_thps_dec(struct address_space *mapping)
+{
+#ifdef CONFIG_READ_ONLY_THP_FOR_FS
+ if (!mapping_thp_support(mapping))
+ atomic_dec(&mapping->nr_thps);
+#else
+ WARN_ON_ONCE(1);
+#endif
+}
+
void release_pages(struct page **pages, int nr);
/*
@@ -726,17 +761,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
void delete_from_page_cache_batch(struct address_space *mapping,
struct pagevec *pvec);
-#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE)
-
-void page_cache_sync_readahead(struct address_space *, struct file_ra_state *,
- struct file *, pgoff_t index, unsigned long req_count);
-void page_cache_async_readahead(struct address_space *, struct file_ra_state *,
- struct file *, struct page *, pgoff_t index,
- unsigned long req_count);
-void page_cache_readahead_unbounded(struct address_space *, struct file *,
- pgoff_t index, unsigned long nr_to_read,
- unsigned long lookahead_count);
-
/*
* Like add_to_page_cache_locked, but used to add newly allocated pages:
* the page is new, so we can just run __SetPageLocked() against it.
@@ -777,6 +801,67 @@ struct readahead_control {
unsigned int _batch_count;
};
+#define DEFINE_READAHEAD(rac, f, m, i) \
+ struct readahead_control rac = { \
+ .file = f, \
+ .mapping = m, \
+ ._index = i, \
+ }
+
+#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE)
+
+void page_cache_ra_unbounded(struct readahead_control *,
+ unsigned long nr_to_read, unsigned long lookahead_count);
+void page_cache_sync_ra(struct readahead_control *, struct file_ra_state *,
+ unsigned long req_count);
+void page_cache_async_ra(struct readahead_control *, struct file_ra_state *,
+ struct page *, unsigned long req_count);
+
+/**
+ * page_cache_sync_readahead - generic file readahead
+ * @mapping: address_space which holds the pagecache and I/O vectors
+ * @ra: file_ra_state which holds the readahead state
+ * @file: Used by the filesystem for authentication.
+ * @index: Index of first page to be read.
+ * @req_count: Total number of pages being read by the caller.
+ *
+ * page_cache_sync_readahead() should be called when a cache miss happened:
+ * it will submit the read. The readahead logic may decide to piggyback more
+ * pages onto the read request if access patterns suggest it will improve
+ * performance.
+ */
+static inline
+void page_cache_sync_readahead(struct address_space *mapping,
+ struct file_ra_state *ra, struct file *file, pgoff_t index,
+ unsigned long req_count)
+{
+ DEFINE_READAHEAD(ractl, file, mapping, index);
+ page_cache_sync_ra(&ractl, ra, req_count);
+}
+
+/**
+ * page_cache_async_readahead - file readahead for marked pages
+ * @mapping: address_space which holds the pagecache and I/O vectors
+ * @ra: file_ra_state which holds the readahead state
+ * @file: Used by the filesystem for authentication.
+ * @page: The page at @index which triggered the readahead call.
+ * @index: Index of first page to be read.
+ * @req_count: Total number of pages being read by the caller.
+ *
+ * page_cache_async_readahead() should be called when a page is used which
+ * is marked as PageReadahead; this is a marker to suggest that the application
+ * has used up enough of the readahead window that we should start pulling in
+ * more pages.
+ */
+static inline
+void page_cache_async_readahead(struct address_space *mapping,
+ struct file_ra_state *ra, struct file *file,
+ struct page *page, pgoff_t index, unsigned long req_count)
+{
+ DEFINE_READAHEAD(ractl, file, mapping, index);
+ page_cache_async_ra(&ractl, ra, page, req_count);
+}
+
/**
* readahead_page - Get the next page to read.
* @rac: The current readahead request.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9030f3a..063cd12 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1013,7 +1013,7 @@ struct task_struct {
struct held_lock held_locks[MAX_LOCK_DEPTH];
#endif
-#ifdef CONFIG_UBSAN
+#if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP)
unsigned int in_ubsan;
#endif
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 15bfb06..981e34c 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -49,31 +49,6 @@ static inline void mmdrop(struct mm_struct *mm)
__mmdrop(mm);
}
-/*
- * This has to be called after a get_task_mm()/mmget_not_zero()
- * followed by taking the mmap_lock for writing before modifying the
- * vmas or anything the coredump pretends not to change from under it.
- *
- * It also has to be called when mmgrab() is used in the context of
- * the process, but then the mm_count refcount is transferred outside
- * the context of the process to run down_write() on that pinned mm.
- *
- * NOTE: find_extend_vma() called from GUP context is the only place
- * that can modify the "mm" (notably the vm_start/end) under mmap_lock
- * for reading and outside the context of the process, so it is also
- * the only case that holds the mmap_lock for reading that must call
- * this function. Generally if the mmap_lock is hold for reading
- * there's no need of this check after get_task_mm()/mmget_not_zero().
- *
- * This function can be obsoleted and the check can be removed, after
- * the coredump code will hold the mmap_lock for writing before
- * invoking the ->core_dump methods.
- */
-static inline bool mmget_still_valid(struct mm_struct *mm)
-{
- return likely(!mm->core_state);
-}
-
/**
* mmget() - Pin the address space associated with a &struct mm_struct.
* @mm: The address space to pin.
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 1ae36bc..1b8c9d6 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -2,7 +2,9 @@
#ifndef __LINUX_UACCESS_H__
#define __LINUX_UACCESS_H__
+#include <linux/fault-inject-usercopy.h>
#include <linux/instrumented.h>
+#include <linux/minmax.h>
#include <linux/sched.h>
#include <linux/thread_info.h>
@@ -83,6 +85,8 @@ static __always_inline __must_check unsigned long
__copy_from_user(void *to, const void __user *from, unsigned long n)
{
might_fault();
+ if (should_fail_usercopy())
+ return n;
instrument_copy_from_user(to, from, n);
check_object_size(to, n, false);
return raw_copy_from_user(to, from, n);
@@ -104,6 +108,8 @@ __copy_from_user(void *to, const void __user *from, unsigned long n)
static __always_inline __must_check unsigned long
__copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
{
+ if (should_fail_usercopy())
+ return n;
instrument_copy_to_user(to, from, n);
check_object_size(from, n, true);
return raw_copy_to_user(to, from, n);
@@ -113,6 +119,8 @@ static __always_inline __must_check unsigned long
__copy_to_user(void __user *to, const void *from, unsigned long n)
{
might_fault();
+ if (should_fail_usercopy())
+ return n;
instrument_copy_to_user(to, from, n);
check_object_size(from, n, true);
return raw_copy_to_user(to, from, n);
@@ -124,7 +132,7 @@ _copy_from_user(void *to, const void __user *from, unsigned long n)
{
unsigned long res = n;
might_fault();
- if (likely(access_ok(from, n))) {
+ if (!should_fail_usercopy() && likely(access_ok(from, n))) {
instrument_copy_from_user(to, from, n);
res = raw_copy_from_user(to, from, n);
}
@@ -142,6 +150,8 @@ static inline __must_check unsigned long
_copy_to_user(void __user *to, const void *from, unsigned long n)
{
might_fault();
+ if (should_fail_usercopy())
+ return n;
if (access_ok(to, n)) {
instrument_copy_to_user(to, from, n);
n = raw_copy_to_user(to, from, n);
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 7557c10..322dcbf 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -28,7 +28,7 @@ struct reclaim_stat {
unsigned nr_writeback;
unsigned nr_immediate;
unsigned nr_pageout;
- unsigned nr_activate[2];
+ unsigned nr_activate[ANON_AND_FILE];
unsigned nr_ref_keep;
unsigned nr_unmap_fail;
unsigned nr_lazyfree_fail;
diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index b4d70e7..5cdf441 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -1505,6 +1505,28 @@ void xas_pause(struct xa_state *);
void xas_create_range(struct xa_state *);
+#ifdef CONFIG_XARRAY_MULTI
+int xa_get_order(struct xarray *, unsigned long index);
+void xas_split(struct xa_state *, void *entry, unsigned int order);
+void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t);
+#else
+static inline int xa_get_order(struct xarray *xa, unsigned long index)
+{
+ return 0;
+}
+
+static inline void xas_split(struct xa_state *xas, void *entry,
+ unsigned int order)
+{
+ xas_store(xas, entry);
+}
+
+static inline void xas_split_alloc(struct xa_state *xas, void *entry,
+ unsigned int order, gfp_t gfp)
+{
+}
+#endif
+
/**
* xas_reload() - Refetch an entry from the xarray.
* @xas: XArray operation state.
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index 36c5c5e..0bdbc0d 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -361,6 +361,7 @@ TRACE_EVENT(aer_event,
EM ( MF_MSG_POISONED_HUGE, "huge page already hardware poisoned" ) \
EM ( MF_MSG_HUGE, "huge page" ) \
EM ( MF_MSG_FREE_HUGE, "free huge page" ) \
+ EM ( MF_MSG_NON_PMD_HUGE, "non-pmd-sized huge page" ) \
EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" ) \
EM ( MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" ) \
EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" ) \
@@ -373,6 +374,8 @@ TRACE_EVENT(aer_event,
EM ( MF_MSG_TRUNCATED_LRU, "already truncated LRU page" ) \
EM ( MF_MSG_BUDDY, "free buddy page" ) \
EM ( MF_MSG_BUDDY_2ND, "free buddy page (2nd try)" ) \
+ EM ( MF_MSG_DAX, "dax page" ) \
+ EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" ) \
EMe ( MF_MSG_UNKNOWN, "unknown page" )
/*
diff --git a/kernel/acct.c b/kernel/acct.c
index b0c5b3a..f175df8 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -25,7 +25,7 @@
* Now we silently close acct_file on attempt to reopen. Cleaned sys_acct().
* XTerms and EMACS are manifestations of pure evil. 21/10/98, AV.
*
- * Fixed a nasty interaction with with sys_umount(). If the accointing
+ * Fixed a nasty interaction with sys_umount(). If the accounting
* was suspeneded we failed to stop it on umount(). Messy.
* Another one: remount to readonly didn't stop accounting.
* Question: what should we do if we have CAP_SYS_ADMIN but not
@@ -263,12 +263,12 @@ static DEFINE_MUTEX(acct_on_mutex);
* sys_acct - enable/disable process accounting
* @name: file name for accounting records or NULL to shutdown accounting
*
- * Returns 0 for success or negative errno values for failure.
- *
* sys_acct() is the only system call needed to implement process
* accounting. It takes the name of the file where accounting records
* should be written. If the filename is NULL, accounting will be
* shutdown.
+ *
+ * Returns: 0 for success or negative errno values for failure.
*/
SYSCALL_DEFINE1(acct, const char __user *, name)
{
@@ -586,9 +586,7 @@ static void slow_acct_process(struct pid_namespace *ns)
}
/**
- * acct_process
- *
- * handles process accounting for an exiting task
+ * acct_process - handles process accounting for an exiting task
*/
void acct_process(void)
{
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 642415b..57b5b5d 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -390,7 +390,7 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
* The top cpuset doesn't have any online cpu as a
* consequence of a race between cpuset_hotplug_work
* and cpu hotplug notifier. But we know the top
- * cpuset's effective_cpus is on its way to to be
+ * cpuset's effective_cpus is on its way to be
* identical to cpu_online_mask.
*/
cpumask_copy(pmask, cpu_online_mask);
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index b92d08e..06c1115 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -16,7 +16,7 @@
#include "direct.h"
/*
- * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use it
+ * Most architectures use ZONE_DMA for the first 16 Megabytes, but some use
* it for entirely different regions. In that case the arch code needs to
* override the variable below for dma-direct to work properly.
*/
diff --git a/kernel/fork.c b/kernel/fork.c
index 3ca8f1f..32083db 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -556,7 +556,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
get_file(file);
if (tmp->vm_flags & VM_DENYWRITE)
- atomic_dec(&inode->i_writecount);
+ put_write_access(inode);
i_mmap_lock_write(mapping);
if (tmp->vm_flags & VM_SHARED)
mapping_allow_writable(mapping);
@@ -2189,7 +2189,7 @@ static __latent_entropy struct task_struct *copy_process(
/*
* Ensure that the cgroup subsystem policies allow the new process to be
- * forked. It should be noted the the new process's css_set can be changed
+ * forked. It should be noted that the new process's css_set can be changed
* between here and cgroup_post_fork() if an organisation operation is in
* progress.
*/
diff --git a/kernel/futex.c b/kernel/futex.c
index a587669..680854d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -916,7 +916,7 @@ static inline void exit_pi_state_list(struct task_struct *curr) { }
* [10] Found | Found | task | !=taskTID | 0/1 | Invalid
*
* [1] Indicates that the kernel can acquire the futex atomically. We
- * came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
+ * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
*
* [2] Valid, if TID does not belong to a kernel thread. If no matching
* thread is found then it indicates that the owner TID has died.
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index e960d7c..773b610 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -604,7 +604,7 @@ int irq_timings_alloc(int irq)
/*
* Some platforms can have the same private interrupt per cpu,
- * so this function may be be called several times with the
+ * so this function may be called several times with the
* same interrupt number. Just bail out in case the per cpu
* stat structure is already allocated.
*/
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index e661c61..015ef90 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -19,7 +19,7 @@
#include <linux/cpu.h>
#include <asm/sections.h>
-/* mutex to protect coming/going of the the jump_label table */
+/* mutex to protect coming/going of the jump_label table */
static DEFINE_MUTEX(jump_label_mutex);
void jump_label_lock(void)
diff --git a/kernel/kcsan/encoding.h b/kernel/kcsan/encoding.h
index f03562a..1a6db2f 100644
--- a/kernel/kcsan/encoding.h
+++ b/kernel/kcsan/encoding.h
@@ -32,7 +32,7 @@
* 1. different addresses but with the same encoded address race;
* 2. and both map onto the same watchpoint slots;
*
- * Both these are assumed to be very unlikely. However, in case it still happens
+ * Both these are assumed to be very unlikely. However, in case it still
* happens, the report logic will filter out the false positive (see report.c).
*/
#define WATCHPOINT_ADDR_BITS (BITS_PER_LONG-1 - WATCHPOINT_SIZE_BITS)
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index c5e5e5a..8798a81 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -109,7 +109,7 @@ EXPORT_SYMBOL_GPL(kexec_crash_loaded);
* defined more restrictively in <asm/kexec.h>.
*
* The code for the transition from the current kernel to the
- * the new kernel is placed in the control_code_buffer, whose size
+ * new kernel is placed in the control_code_buffer, whose size
* is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
* page of memory is necessary, but some architectures require more.
* Because this memory must be identity mapped in the transition from
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 84f7316..e21f6b9 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -521,7 +521,7 @@ static int locate_mem_hole_callback(struct resource *res, void *arg)
/* Returning 0 will take to next memory range */
/* Don't use memory that will be detected and handled by a driver. */
- if (res->flags & IORESOURCE_MEM_DRIVER_MANAGED)
+ if (res->flags & IORESOURCE_SYSRAM_DRIVER_MANAGED)
return 0;
if (sz < kbuf->memsz)
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3edaa38..e29773c 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -775,7 +775,7 @@ EXPORT_SYMBOL(kthread_create_worker);
/**
* kthread_create_worker_on_cpu - create a kthread worker and bind it
- * it to a given CPU and the associated NUMA node.
+ * to a given CPU and the associated NUMA node.
* @cpu: CPU number
* @flags: flags modifying the default behavior of the worker
* @namefmt: printf-style name for the kthread worker (task).
diff --git a/kernel/livepatch/state.c b/kernel/livepatch/state.c
index 7ee1947..2565d03 100644
--- a/kernel/livepatch/state.c
+++ b/kernel/livepatch/state.c
@@ -55,7 +55,7 @@ EXPORT_SYMBOL_GPL(klp_get_state);
*
* The function can be called only during transition when a new
* livepatch is being enabled or when such a transition is reverted.
- * It is typically called only from from pre/post (un)patch
+ * It is typically called only from pre/post (un)patch
* callbacks.
*
* Return: pointer to the latest struct klp_state from already
diff --git a/kernel/panic.c b/kernel/panic.c
index aef8872ba..396142e 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -589,6 +589,11 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
if (args)
vprintk(args->fmt, args->args);
+ print_modules();
+
+ if (regs)
+ show_regs(regs);
+
if (panic_on_warn) {
/*
* This thread may hit another WARN() in the panic path.
@@ -600,12 +605,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
panic("panic_on_warn set ...\n");
}
- print_modules();
-
- if (regs)
- show_regs(regs);
- else
- dump_stack();
+ dump_stack();
print_irqtrace_events(current);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index ac135bd..9de2180 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -233,7 +233,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
* to pid_ns->child_reaper. Thus pidns->child_reaper needs to
* stay valid until they all go away.
*
- * The code relies on the the pid_ns->child_reaper ignoring
+ * The code relies on the pid_ns->child_reaper ignoring
* SIGCHILD to cause those EXIT_ZOMBIE processes to be
* autoreaped if reparented.
*
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index d25749b..46b1804 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -735,7 +735,7 @@ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
*/
/*
- * If the zone we wish to scan is the the current zone and the
+ * If the zone we wish to scan is the current zone and the
* pfn falls into the current node then we do not need to walk
* the tree.
*/
diff --git a/kernel/range.c b/kernel/range.c
index d84de67..56435f9 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -2,8 +2,9 @@
/*
* Range add and subtract
*/
-#include <linux/kernel.h>
#include <linux/init.h>
+#include <linux/minmax.h>
+#include <linux/printk.h>
#include <linux/sort.h>
#include <linux/string.h>
#include <linux/range.h>
diff --git a/kernel/relay.c b/kernel/relay.c
index fb4e0c5..b08d936 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1002,7 +1002,7 @@ static int relay_file_read_avail(struct rchan_buf *buf)
size_t subbuf_size = buf->chan->subbuf_size;
size_t n_subbufs = buf->chan->n_subbufs;
size_t produced = buf->subbufs_produced;
- size_t consumed = buf->subbufs_consumed;
+ size_t consumed;
relay_file_read_consume(buf, 0, 0);
diff --git a/kernel/resource.c b/kernel/resource.c
index f1175ce..3ae2f56 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1240,7 +1240,6 @@ EXPORT_SYMBOL(__release_region);
#ifdef CONFIG_MEMORY_HOTREMOVE
/**
* release_mem_region_adjustable - release a previously reserved memory region
- * @parent: parent resource descriptor
* @start: resource start address
* @size: resource region size
*
@@ -1258,21 +1257,28 @@ EXPORT_SYMBOL(__release_region);
* assumes that all children remain in the lower address entry for
* simplicity. Enhance this logic when necessary.
*/
-int release_mem_region_adjustable(struct resource *parent,
- resource_size_t start, resource_size_t size)
+void release_mem_region_adjustable(resource_size_t start, resource_size_t size)
{
+ struct resource *parent = &iomem_resource;
+ struct resource *new_res = NULL;
+ bool alloc_nofail = false;
struct resource **p;
struct resource *res;
- struct resource *new_res;
resource_size_t end;
- int ret = -EINVAL;
end = start + size - 1;
- if ((start < parent->start) || (end > parent->end))
- return ret;
+ if (WARN_ON_ONCE((start < parent->start) || (end > parent->end)))
+ return;
- /* The alloc_resource() result gets checked later */
- new_res = alloc_resource(GFP_KERNEL);
+ /*
+ * We free up quite a lot of memory on memory hotunplug (esp., memap),
+ * just before releasing the region. This is highly unlikely to
+ * fail - let's play save and make it never fail as the caller cannot
+ * perform any error handling (e.g., trying to re-add memory will fail
+ * similarly).
+ */
+retry:
+ new_res = alloc_resource(GFP_KERNEL | (alloc_nofail ? __GFP_NOFAIL : 0));
p = &parent->child;
write_lock(&resource_lock);
@@ -1298,7 +1304,6 @@ int release_mem_region_adjustable(struct resource *parent,
* so if we are dealing with them, let us just back off here.
*/
if (!(res->flags & IORESOURCE_SYSRAM)) {
- ret = 0;
break;
}
@@ -1315,20 +1320,23 @@ int release_mem_region_adjustable(struct resource *parent,
/* free the whole entry */
*p = res->sibling;
free_resource(res);
- ret = 0;
} else if (res->start == start && res->end != end) {
/* adjust the start */
- ret = __adjust_resource(res, end + 1,
- res->end - end);
+ WARN_ON_ONCE(__adjust_resource(res, end + 1,
+ res->end - end));
} else if (res->start != start && res->end == end) {
/* adjust the end */
- ret = __adjust_resource(res, res->start,
- start - res->start);
+ WARN_ON_ONCE(__adjust_resource(res, res->start,
+ start - res->start));
} else {
- /* split into two entries */
+ /* split into two entries - we need a new resource */
if (!new_res) {
- ret = -ENOMEM;
- break;
+ new_res = alloc_resource(GFP_ATOMIC);
+ if (!new_res) {
+ alloc_nofail = true;
+ write_unlock(&resource_lock);
+ goto retry;
+ }
}
new_res->name = res->name;
new_res->start = end + 1;
@@ -1339,9 +1347,8 @@ int release_mem_region_adjustable(struct resource *parent,
new_res->sibling = res->sibling;
new_res->child = NULL;
- ret = __adjust_resource(res, res->start,
- start - res->start);
- if (ret)
+ if (WARN_ON_ONCE(__adjust_resource(res, res->start,
+ start - res->start)))
break;
res->sibling = new_res;
new_res = NULL;
@@ -1352,10 +1359,69 @@ int release_mem_region_adjustable(struct resource *parent,
write_unlock(&resource_lock);
free_resource(new_res);
- return ret;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
+#ifdef CONFIG_MEMORY_HOTPLUG
+static bool system_ram_resources_mergeable(struct resource *r1,
+ struct resource *r2)
+{
+ /* We assume either r1 or r2 is IORESOURCE_SYSRAM_MERGEABLE. */
+ return r1->flags == r2->flags && r1->end + 1 == r2->start &&
+ r1->name == r2->name && r1->desc == r2->desc &&
+ !r1->child && !r2->child;
+}
+
+/*
+ * merge_system_ram_resource - mark the System RAM resource mergeable and try to
+ * merge it with adjacent, mergeable resources
+ * @res: resource descriptor
+ *
+ * This interface is intended for memory hotplug, whereby lots of contiguous
+ * system ram resources are added (e.g., via add_memory*()) by a driver, and
+ * the actual resource boundaries are not of interest (e.g., it might be
+ * relevant for DIMMs). Only resources that are marked mergeable, that have the
+ * same parent, and that don't have any children are considered. All mergeable
+ * resources must be immutable during the request.
+ *
+ * Note:
+ * - The caller has to make sure that no pointers to resources that are
+ * marked mergeable are used anymore after this call - the resource might
+ * be freed and the pointer might be stale!
+ * - release_mem_region_adjustable() will split on demand on memory hotunplug
+ */
+void merge_system_ram_resource(struct resource *res)
+{
+ const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+ struct resource *cur;
+
+ if (WARN_ON_ONCE((res->flags & flags) != flags))
+ return;
+
+ write_lock(&resource_lock);
+ res->flags |= IORESOURCE_SYSRAM_MERGEABLE;
+
+ /* Try to merge with next item in the list. */
+ cur = res->sibling;
+ if (cur && system_ram_resources_mergeable(res, cur)) {
+ res->end = cur->end;
+ res->sibling = cur->sibling;
+ free_resource(cur);
+ }
+
+ /* Try to merge with previous item in the list. */
+ cur = res->parent->child;
+ while (cur && cur->sibling != res)
+ cur = cur->sibling;
+ if (cur && system_ram_resources_mergeable(cur, res)) {
+ cur->end = res->end;
+ cur->sibling = res->sibling;
+ free_resource(res);
+ }
+ write_unlock(&resource_lock);
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
/*
* Managed region resource
*/
diff --git a/kernel/smp.c b/kernel/smp.c
index d0ae8eb..d9832a1 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -741,7 +741,7 @@ EXPORT_SYMBOL(on_each_cpu_mask);
* for all the required CPUs to finish. This may include the local
* processor.
* @cond_func: A callback function that is passed a cpu id and
- * the the info parameter. The function is called
+ * the info parameter. The function is called
* with preemption disabled. The function should
* return a blooean value indicating whether to IPI
* the specified CPU.
diff --git a/kernel/sys.c b/kernel/sys.c
index ab6c409..6401880 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2034,7 +2034,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
* VMAs already unmapped and kernel uses these members for statistics
* output in procfs mostly, except
*
- * - @start_brk/@brk which are used in do_brk but kernel lookups
+ * - @start_brk/@brk which are used in do_brk_flags but kernel lookups
* for VMAs when updating these memvers so anything wrong written
* here cause kernel to swear at userspace program but won't lead
* to any problem in kernel itself
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 87804e0..e703d5d 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -515,7 +515,7 @@ EXPORT_SYMBOL(from_kgid_munged);
*
* When there is no mapping defined for the user-namespace projid
* pair INVALID_PROJID is returned. Callers are expected to test
- * for and handle handle INVALID_PROJID being returned. INVALID_PROJID
+ * for and handle INVALID_PROJID being returned. INVALID_PROJID
* may be tested for using projid_valid().
*/
kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid)
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 491789a..ebe5ab1 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1768,6 +1768,13 @@
help
Provide fault-injection capability for alloc_pages().
+config FAULT_INJECTION_USERCOPY
+ bool "Fault injection capability for usercopy functions"
+ depends on FAULT_INJECTION
+ help
+ Provides fault-injection capability to inject failures
+ in usercopy functions (copy_from_user(), get_user(), ...).
+
config FAIL_MAKE_REQUEST
bool "Fault-injection capability for disk IO"
depends on FAULT_INJECTION && BLOCK
diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan
index 774315d..58f8d03 100644
--- a/lib/Kconfig.ubsan
+++ b/lib/Kconfig.ubsan
@@ -47,6 +47,20 @@
to the {str,mem}*cpy() family of functions (that is addressed
by CONFIG_FORTIFY_SOURCE).
+config UBSAN_LOCAL_BOUNDS
+ bool "Perform array local bounds checking"
+ depends on UBSAN_TRAP
+ depends on CC_IS_CLANG
+ depends on !UBSAN_KCOV_BROKEN
+ help
+ This option enables -fsanitize=local-bounds which traps when an
+ exception/error is detected. Therefore, it should be enabled only
+ if trapping is expected.
+ Enabling this option detects errors due to accesses through a
+ pointer that is derived from an object of a statically-known size,
+ where an added offset (which may not be known statically) is
+ out-of-bounds.
+
config UBSAN_MISC
bool "Enable all other Undefined Behavior sanity checks"
default UBSAN
diff --git a/lib/Makefile b/lib/Makefile
index 49a2a9e..1c7577b2 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -210,6 +210,7 @@
obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o
obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o
+obj-$(CONFIG_FAULT_INJECTION_USERCOPY) += fault-inject-usercopy.o
obj-$(CONFIG_NOTIFIER_ERROR_INJECTION) += notifier-error-inject.o
obj-$(CONFIG_PM_NOTIFIER_ERROR_INJECT) += pm-notifier-error-inject.o
obj-$(CONFIG_NETDEV_NOTIFIER_ERROR_INJECT) += netdev-notifier-error-inject.o
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 6139489..75006c4 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -23,7 +23,7 @@
/**
* DOC: bitmap introduction
*
- * bitmaps provide an array of bits, implemented using an an
+ * bitmaps provide an array of bits, implemented using an
* array of unsigned longs. The number of valid bits in a
* given bitmap does _not_ need to be an exact multiple of
* BITS_PER_LONG.
diff --git a/lib/crc32.c b/lib/crc32.c
index 35a03d0..2a68dfd 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -331,7 +331,7 @@ static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p,
return crc;
}
-#if CRC_LE_BITS == 1
+#if CRC_BE_BITS == 1
u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len)
{
return crc32_be_generic(crc, p, len, NULL, CRC32_POLY_BE);
diff --git a/lib/decompress_bunzip2.c b/lib/decompress_bunzip2.c
index f9628f39..c72c865 100644
--- a/lib/decompress_bunzip2.c
+++ b/lib/decompress_bunzip2.c
@@ -390,7 +390,7 @@ static int INIT get_next_block(struct bunzip_data *bd)
j = (bd->inbufBits >> bd->inbufBitCount)&
((1 << hufGroup->maxLen)-1);
got_huff_bits:
- /* Figure how how many bits are in next symbol and
+ /* Figure how many bits are in next symbol and
* unget extras */
i = hufGroup->minLen;
while (j > limit[i])
diff --git a/lib/dynamic_queue_limits.c b/lib/dynamic_queue_limits.c
index e659a02..fde0aa2 100644
--- a/lib/dynamic_queue_limits.c
+++ b/lib/dynamic_queue_limits.c
@@ -60,8 +60,8 @@ void dql_completed(struct dql *dql, unsigned int count)
* A decrease is only considered if the queue has been busy in
* the whole interval (the check above).
*
- * If there is slack, the amount of execess data queued above
- * the the amount needed to prevent starvation, the queue limit
+ * If there is slack, the amount of excess data queued above
+ * the amount needed to prevent starvation, the queue limit
* can be decreased. To avoid hysteresis we consider the
* minimum amount of slack found over several iterations of the
* completion routine.
diff --git a/lib/earlycpio.c b/lib/earlycpio.c
index c001e08..e836288 100644
--- a/lib/earlycpio.c
+++ b/lib/earlycpio.c
@@ -42,7 +42,7 @@ enum cpio_fields {
/**
* cpio_data find_cpio_data - Search for files in an uncompressed cpio
* @path: The directory to search for, including a slash at the end
- * @data: Pointer to the the cpio archive or a header inside
+ * @data: Pointer to the cpio archive or a header inside
* @len: Remaining length of the cpio based on data pointer
* @nextoff: When a matching file is found, this is the offset from the
* beginning of the cpio to the beginning of the next file, not the
diff --git a/lib/fault-inject-usercopy.c b/lib/fault-inject-usercopy.c
new file mode 100644
index 0000000..77558b6
--- /dev/null
+++ b/lib/fault-inject-usercopy.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/fault-inject.h>
+#include <linux/fault-inject-usercopy.h>
+
+static struct {
+ struct fault_attr attr;
+} fail_usercopy = {
+ .attr = FAULT_ATTR_INITIALIZER,
+};
+
+static int __init setup_fail_usercopy(char *str)
+{
+ return setup_fault_attr(&fail_usercopy.attr, str);
+}
+__setup("fail_usercopy=", setup_fail_usercopy);
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init fail_usercopy_debugfs(void)
+{
+ struct dentry *dir;
+
+ dir = fault_create_debugfs_attr("fail_usercopy", NULL,
+ &fail_usercopy.attr);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
+
+ return 0;
+}
+
+late_initcall(fail_usercopy_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+bool should_fail_usercopy(void)
+{
+ return should_fail(&fail_usercopy.attr, 1);
+}
+EXPORT_SYMBOL_GPL(should_fail_usercopy);
diff --git a/lib/find_bit.c b/lib/find_bit.c
index 49f875f..4a87510 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -16,6 +16,7 @@
#include <linux/bitmap.h>
#include <linux/export.h>
#include <linux/kernel.h>
+#include <linux/minmax.h>
#if !defined(find_next_bit) || !defined(find_next_zero_bit) || \
!defined(find_next_bit_le) || !defined(find_next_zero_bit_le) || \
diff --git a/lib/hexdump.c b/lib/hexdump.c
index 147133f..9301578 100644
--- a/lib/hexdump.c
+++ b/lib/hexdump.c
@@ -7,6 +7,7 @@
#include <linux/ctype.h>
#include <linux/errno.h>
#include <linux/kernel.h>
+#include <linux/minmax.h>
#include <linux/export.h>
#include <asm/unaligned.h>
diff --git a/lib/idr.c b/lib/idr.c
index c2cf2c5..3fa8be4 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -372,7 +372,8 @@ EXPORT_SYMBOL(idr_replace);
* Allocate an ID between @min and @max, inclusive. The allocated ID will
* not exceed %INT_MAX, even if @max is larger.
*
- * Context: Any context.
+ * Context: Any context. It is safe to call this function without
+ * locking in your code.
* Return: The allocated ID, or %-ENOMEM if memory could not be allocated,
* or %-ENOSPC if there are no free IDs.
*/
@@ -479,7 +480,8 @@ EXPORT_SYMBOL(ida_alloc_range);
* @ida: IDA handle.
* @id: Previously allocated ID.
*
- * Context: Any context.
+ * Context: Any context. It is safe to call this function without
+ * locking in your code.
*/
void ida_free(struct ida *ida, unsigned int id)
{
@@ -531,7 +533,8 @@ EXPORT_SYMBOL(ida_free);
* or freed. If the IDA is already empty, there is no need to call this
* function.
*
- * Context: Any context.
+ * Context: Any context. It is safe to call this function without
+ * locking in your code.
*/
void ida_destroy(struct ida *ida)
{
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 14cae258..1635111 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -2,6 +2,7 @@
#include <crypto/hash.h>
#include <linux/export.h>
#include <linux/bvec.h>
+#include <linux/fault-inject-usercopy.h>
#include <linux/uio.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
@@ -140,6 +141,8 @@
static int copyout(void __user *to, const void *from, size_t n)
{
+ if (should_fail_usercopy())
+ return n;
if (access_ok(to, n)) {
instrument_copy_to_user(to, from, n);
n = raw_copy_to_user(to, from, n);
@@ -149,6 +152,8 @@ static int copyout(void __user *to, const void *from, size_t n)
static int copyin(void *to, const void __user *from, size_t n)
{
+ if (should_fail_usercopy())
+ return n;
if (access_ok(from, n)) {
instrument_copy_from_user(to, from, n);
n = raw_copy_from_user(to, from, n);
diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c
index 77ab839..5ca0d815 100644
--- a/lib/libcrc32c.c
+++ b/lib/libcrc32c.c
@@ -12,7 +12,7 @@
* pages = {},
* month = {June},
*}
- * Used by the iSCSI driver, possibly others, and derived from the
+ * Used by the iSCSI driver, possibly others, and derived from
* the iscsi-crc.c module of the linux-iscsi driver at
* http://linux-iscsi.sourceforge.net.
*
diff --git a/lib/math/rational.c b/lib/math/rational.c
index df75c88..9781d52 100644
--- a/lib/math/rational.c
+++ b/lib/math/rational.c
@@ -11,7 +11,7 @@
#include <linux/rational.h>
#include <linux/compiler.h>
#include <linux/export.h>
-#include <linux/kernel.h>
+#include <linux/minmax.h>
/*
* calculate best rational approximation for a given fraction
diff --git a/lib/math/reciprocal_div.c b/lib/math/reciprocal_div.c
index bf04325..32436dd4 100644
--- a/lib/math/reciprocal_div.c
+++ b/lib/math/reciprocal_div.c
@@ -4,6 +4,7 @@
#include <asm/div64.h>
#include <linux/reciprocal_div.h>
#include <linux/export.h>
+#include <linux/minmax.h>
/*
* For a description of the algorithm please have a look at
diff --git a/lib/mpi/mpi-bit.c b/lib/mpi/mpi-bit.c
index a5119a2..142b680 100644
--- a/lib/mpi/mpi-bit.c
+++ b/lib/mpi/mpi-bit.c
@@ -1,4 +1,4 @@
-/* mpi-bit.c - MPI bit level fucntions
+/* mpi-bit.c - MPI bit level functions
* Copyright (C) 1998, 1999 Free Software Foundation, Inc.
*
* This file is part of GnuPG.
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index f61689a..00f666d 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -85,7 +85,7 @@ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
preempt_disable();
count = __this_cpu_read(*fbc->counters) + amount;
- if (count >= batch || count <= -batch) {
+ if (abs(count) >= batch) {
unsigned long flags;
raw_spin_lock_irqsave(&fbc->lock, flags);
fbc->count += count;
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 8e4a3a4..005ced9 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -325,7 +325,7 @@ static __must_check int __radix_tree_preload(gfp_t gfp_mask, unsigned nr)
int ret = -ENOMEM;
/*
- * Nodes preloaded by one cgroup can be be used by another cgroup, so
+ * Nodes preloaded by one cgroup can be used by another cgroup, so
* they should never be accounted to any particular memory cgroup.
*/
gfp_mask &= ~__GFP_ACCOUNT;
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index 5d63a88..d94628f 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -504,7 +504,7 @@ struct scatterlist *sgl_alloc_order(unsigned long long length,
nalloc++;
}
sgl = kmalloc_array(nalloc, sizeof(struct scatterlist),
- (gfp & ~GFP_DMA) | __GFP_ZERO);
+ gfp & ~GFP_DMA);
if (!sgl)
return NULL;
diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c
index 34696a3..e6d5fcc 100644
--- a/lib/strncpy_from_user.c
+++ b/lib/strncpy_from_user.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/compiler.h>
#include <linux/export.h>
+#include <linux/fault-inject-usercopy.h>
#include <linux/kasan-checks.h>
#include <linux/thread_info.h>
#include <linux/uaccess.h>
@@ -99,6 +100,8 @@ long strncpy_from_user(char *dst, const char __user *src, long count)
unsigned long max_addr, src_addr;
might_fault();
+ if (should_fail_usercopy())
+ return -EFAULT;
if (unlikely(count <= 0))
return 0;
diff --git a/lib/syscall.c b/lib/syscall.c
index fb328e7..8533d2f 100644
--- a/lib/syscall.c
+++ b/lib/syscall.c
@@ -44,7 +44,7 @@ static int collect_syscall(struct task_struct *target, struct syscall_info *info
* .data.instruction_pointer - filled with user PC
*
* If @target is blocked in a system call, returns zero with @info.data.nr
- * set to the the call's number and @info.data.args filled in with its
+ * set to the call's number and @info.data.args filled in with its
* arguments. Registers not used for system call arguments may not be available
* and it is not kosher to use &struct user_regset calls while the system
* call is still in progress. Note we may get this result if @target
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index e151a7f..80a7887 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -461,7 +461,7 @@ static bool dmirror_allocate_chunk(struct dmirror_device *mdevice,
devmem = kzalloc(sizeof(*devmem), GFP_KERNEL);
if (!devmem)
- return -ENOMEM;
+ return false;
res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE,
"hmm_dmirror");
diff --git a/lib/test_sysctl.c b/lib/test_sysctl.c
index 98bc92a..3750323 100644
--- a/lib/test_sysctl.c
+++ b/lib/test_sysctl.c
@@ -16,7 +16,7 @@
*/
/*
- * This module provides an interface to the the proc sysctl interfaces. This
+ * This module provides an interface to the proc sysctl interfaces. This
* driver requires CONFIG_PROC_SYSCTL. It will not normally be loaded by the
* system unless explicitly requested by name. You can also build this driver
* into your kernel.
diff --git a/lib/test_xarray.c b/lib/test_xarray.c
index d4f9792..8262c3f 100644
--- a/lib/test_xarray.c
+++ b/lib/test_xarray.c
@@ -1503,6 +1503,49 @@ static noinline void check_store_range(struct xarray *xa)
}
}
+#ifdef CONFIG_XARRAY_MULTI
+static void check_split_1(struct xarray *xa, unsigned long index,
+ unsigned int order)
+{
+ XA_STATE(xas, xa, index);
+ void *entry;
+ unsigned int i = 0;
+
+ xa_store_order(xa, index, order, xa, GFP_KERNEL);
+
+ xas_split_alloc(&xas, xa, order, GFP_KERNEL);
+ xas_lock(&xas);
+ xas_split(&xas, xa, order);
+ xas_unlock(&xas);
+
+ xa_for_each(xa, index, entry) {
+ XA_BUG_ON(xa, entry != xa);
+ i++;
+ }
+ XA_BUG_ON(xa, i != 1 << order);
+
+ xa_set_mark(xa, index, XA_MARK_0);
+ XA_BUG_ON(xa, !xa_get_mark(xa, index, XA_MARK_0));
+
+ xa_destroy(xa);
+}
+
+static noinline void check_split(struct xarray *xa)
+{
+ unsigned int order;
+
+ XA_BUG_ON(xa, !xa_empty(xa));
+
+ for (order = 1; order < 2 * XA_CHUNK_SHIFT; order++) {
+ check_split_1(xa, 0, order);
+ check_split_1(xa, 1UL << order, order);
+ check_split_1(xa, 3UL << order, order);
+ }
+}
+#else
+static void check_split(struct xarray *xa) { }
+#endif
+
static void check_align_1(struct xarray *xa, char *name)
{
int i;
@@ -1649,6 +1692,26 @@ static noinline void check_account(struct xarray *xa)
#endif
}
+static noinline void check_get_order(struct xarray *xa)
+{
+ unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 20 : 1;
+ unsigned int order;
+ unsigned long i, j;
+
+ for (i = 0; i < 3; i++)
+ XA_BUG_ON(xa, xa_get_order(xa, i) != 0);
+
+ for (order = 0; order < max_order; order++) {
+ for (i = 0; i < 10; i++) {
+ xa_store_order(xa, i << order, order,
+ xa_mk_index(i << order), GFP_KERNEL);
+ for (j = i << order; j < (i + 1) << order; j++)
+ XA_BUG_ON(xa, xa_get_order(xa, j) != order);
+ xa_erase(xa, i << order);
+ }
+ }
+}
+
static noinline void check_destroy(struct xarray *xa)
{
unsigned long index;
@@ -1697,6 +1760,7 @@ static int xarray_checks(void)
check_reserve(&array);
check_reserve(&xa0);
check_multi_store(&array);
+ check_get_order(&array);
check_xa_alloc();
check_find(&array);
check_find_entry(&array);
@@ -1708,6 +1772,7 @@ static int xarray_checks(void)
check_store_range(&array);
check_store_iter(&array);
check_align(&xa0);
+ check_split(&array);
check_workingset(&array, 0);
check_workingset(&array, 64);
diff --git a/lib/usercopy.c b/lib/usercopy.c
index b26509f..7413dd3 100644
--- a/lib/usercopy.c
+++ b/lib/usercopy.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/bitops.h>
+#include <linux/fault-inject-usercopy.h>
#include <linux/instrumented.h>
#include <linux/uaccess.h>
@@ -10,7 +11,7 @@ unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n
{
unsigned long res = n;
might_fault();
- if (likely(access_ok(from, n))) {
+ if (!should_fail_usercopy() && likely(access_ok(from, n))) {
instrument_copy_from_user(to, from, n);
res = raw_copy_from_user(to, from, n);
}
@@ -25,6 +26,8 @@ EXPORT_SYMBOL(_copy_from_user);
unsigned long _copy_to_user(void __user *to, const void *from, unsigned long n)
{
might_fault();
+ if (should_fail_usercopy())
+ return n;
if (likely(access_ok(to, n))) {
instrument_copy_to_user(to, from, n);
n = raw_copy_to_user(to, from, n);
diff --git a/lib/xarray.c b/lib/xarray.c
index e9e641d..b76eea7 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -266,13 +266,14 @@ static void xa_node_free(struct xa_node *node)
*/
static void xas_destroy(struct xa_state *xas)
{
- struct xa_node *node = xas->xa_alloc;
+ struct xa_node *next, *node = xas->xa_alloc;
- if (!node)
- return;
- XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
- kmem_cache_free(radix_tree_node_cachep, node);
- xas->xa_alloc = NULL;
+ while (node) {
+ XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
+ next = rcu_dereference_raw(node->parent);
+ radix_tree_node_rcu_free(&node->rcu_head);
+ xas->xa_alloc = node = next;
+ }
}
/**
@@ -304,6 +305,7 @@ bool xas_nomem(struct xa_state *xas, gfp_t gfp)
xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp);
if (!xas->xa_alloc)
return false;
+ xas->xa_alloc->parent = NULL;
XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list));
xas->xa_node = XAS_RESTART;
return true;
@@ -339,6 +341,7 @@ static bool __xas_nomem(struct xa_state *xas, gfp_t gfp)
}
if (!xas->xa_alloc)
return false;
+ xas->xa_alloc->parent = NULL;
XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list));
xas->xa_node = XAS_RESTART;
return true;
@@ -403,7 +406,7 @@ static unsigned long xas_size(const struct xa_state *xas)
/*
* Use this to calculate the maximum index that will need to be created
* in order to add the entry described by @xas. Because we cannot store a
- * multiple-index entry at index 0, the calculation is a little more complex
+ * multi-index entry at index 0, the calculation is a little more complex
* than you might expect.
*/
static unsigned long xas_max(struct xa_state *xas)
@@ -946,6 +949,153 @@ void xas_init_marks(const struct xa_state *xas)
}
EXPORT_SYMBOL_GPL(xas_init_marks);
+#ifdef CONFIG_XARRAY_MULTI
+static unsigned int node_get_marks(struct xa_node *node, unsigned int offset)
+{
+ unsigned int marks = 0;
+ xa_mark_t mark = XA_MARK_0;
+
+ for (;;) {
+ if (node_get_mark(node, offset, mark))
+ marks |= 1 << (__force unsigned int)mark;
+ if (mark == XA_MARK_MAX)
+ break;
+ mark_inc(mark);
+ }
+
+ return marks;
+}
+
+static void node_set_marks(struct xa_node *node, unsigned int offset,
+ struct xa_node *child, unsigned int marks)
+{
+ xa_mark_t mark = XA_MARK_0;
+
+ for (;;) {
+ if (marks & (1 << (__force unsigned int)mark)) {
+ node_set_mark(node, offset, mark);
+ if (child)
+ node_mark_all(child, mark);
+ }
+ if (mark == XA_MARK_MAX)
+ break;
+ mark_inc(mark);
+ }
+}
+
+/**
+ * xas_split_alloc() - Allocate memory for splitting an entry.
+ * @xas: XArray operation state.
+ * @entry: New entry which will be stored in the array.
+ * @order: New entry order.
+ * @gfp: Memory allocation flags.
+ *
+ * This function should be called before calling xas_split().
+ * If necessary, it will allocate new nodes (and fill them with @entry)
+ * to prepare for the upcoming split of an entry of @order size into
+ * entries of the order stored in the @xas.
+ *
+ * Context: May sleep if @gfp flags permit.
+ */
+void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order,
+ gfp_t gfp)
+{
+ unsigned int sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1;
+ unsigned int mask = xas->xa_sibs;
+
+ /* XXX: no support for splitting really large entries yet */
+ if (WARN_ON(xas->xa_shift + 2 * XA_CHUNK_SHIFT < order))
+ goto nomem;
+ if (xas->xa_shift + XA_CHUNK_SHIFT > order)
+ return;
+
+ do {
+ unsigned int i;
+ void *sibling;
+ struct xa_node *node;
+
+ node = kmem_cache_alloc(radix_tree_node_cachep, gfp);
+ if (!node)
+ goto nomem;
+ node->array = xas->xa;
+ for (i = 0; i < XA_CHUNK_SIZE; i++) {
+ if ((i & mask) == 0) {
+ RCU_INIT_POINTER(node->slots[i], entry);
+ sibling = xa_mk_sibling(0);
+ } else {
+ RCU_INIT_POINTER(node->slots[i], sibling);
+ }
+ }
+ RCU_INIT_POINTER(node->parent, xas->xa_alloc);
+ xas->xa_alloc = node;
+ } while (sibs-- > 0);
+
+ return;
+nomem:
+ xas_destroy(xas);
+ xas_set_err(xas, -ENOMEM);
+}
+EXPORT_SYMBOL_GPL(xas_split_alloc);
+
+/**
+ * xas_split() - Split a multi-index entry into smaller entries.
+ * @xas: XArray operation state.
+ * @entry: New entry to store in the array.
+ * @order: New entry order.
+ *
+ * The value in the entry is copied to all the replacement entries.
+ *
+ * Context: Any context. The caller should hold the xa_lock.
+ */
+void xas_split(struct xa_state *xas, void *entry, unsigned int order)
+{
+ unsigned int sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1;
+ unsigned int offset, marks;
+ struct xa_node *node;
+ void *curr = xas_load(xas);
+ int values = 0;
+
+ node = xas->xa_node;
+ if (xas_top(node))
+ return;
+
+ marks = node_get_marks(node, xas->xa_offset);
+
+ offset = xas->xa_offset + sibs;
+ do {
+ if (xas->xa_shift < node->shift) {
+ struct xa_node *child = xas->xa_alloc;
+
+ xas->xa_alloc = rcu_dereference_raw(child->parent);
+ child->shift = node->shift - XA_CHUNK_SHIFT;
+ child->offset = offset;
+ child->count = XA_CHUNK_SIZE;
+ child->nr_values = xa_is_value(entry) ?
+ XA_CHUNK_SIZE : 0;
+ RCU_INIT_POINTER(child->parent, node);
+ node_set_marks(node, offset, child, marks);
+ rcu_assign_pointer(node->slots[offset],
+ xa_mk_node(child));
+ if (xa_is_value(curr))
+ values--;
+ } else {
+ unsigned int canon = offset - xas->xa_sibs;
+
+ node_set_marks(node, canon, NULL, marks);
+ rcu_assign_pointer(node->slots[canon], entry);
+ while (offset > canon)
+ rcu_assign_pointer(node->slots[offset--],
+ xa_mk_sibling(canon));
+ values += (xa_is_value(entry) - xa_is_value(curr)) *
+ (xas->xa_sibs + 1);
+ }
+ } while (offset-- > xas->xa_offset);
+
+ node->nr_values += values;
+}
+EXPORT_SYMBOL_GPL(xas_split);
+#endif
+
/**
* xas_pause() - Pause a walk to drop a lock.
* @xas: XArray operation state.
@@ -1407,7 +1557,7 @@ EXPORT_SYMBOL(__xa_store);
* @gfp: Memory allocation flags.
*
* After this function returns, loads from this index will return @entry.
- * Storing into an existing multislot entry updates the entry of every index.
+ * Storing into an existing multi-index entry updates the entry of every index.
* The marks associated with @index are unaffected unless @entry is %NULL.
*
* Context: Any context. Takes and releases the xa_lock.
@@ -1549,7 +1699,7 @@ static void xas_set_range(struct xa_state *xas, unsigned long first,
*
* After this function returns, loads from any index between @first and @last,
* inclusive will return @entry.
- * Storing into an existing multislot entry updates the entry of every index.
+ * Storing into an existing multi-index entry updates the entry of every index.
* The marks associated with @index are unaffected unless @entry is %NULL.
*
* Context: Process context. Takes and releases the xa_lock. May sleep
@@ -1592,6 +1742,46 @@ void *xa_store_range(struct xarray *xa, unsigned long first,
return xas_result(&xas, NULL);
}
EXPORT_SYMBOL(xa_store_range);
+
+/**
+ * xa_get_order() - Get the order of an entry.
+ * @xa: XArray.
+ * @index: Index of the entry.
+ *
+ * Return: A number between 0 and 63 indicating the order of the entry.
+ */
+int xa_get_order(struct xarray *xa, unsigned long index)
+{
+ XA_STATE(xas, xa, index);
+ void *entry;
+ int order = 0;
+
+ rcu_read_lock();
+ entry = xas_load(&xas);
+
+ if (!entry)
+ goto unlock;
+
+ if (!xas.xa_node)
+ goto unlock;
+
+ for (;;) {
+ unsigned int slot = xas.xa_offset + (1 << order);
+
+ if (slot >= XA_CHUNK_SIZE)
+ break;
+ if (!xa_is_sibling(xas.xa_node->slots[slot]))
+ break;
+ order++;
+ }
+
+ order += xas.xa_node->shift;
+unlock:
+ rcu_read_unlock();
+
+ return order;
+}
+EXPORT_SYMBOL(xa_get_order);
#endif /* CONFIG_XARRAY_MULTI */
/**
diff --git a/mm/Kconfig b/mm/Kconfig
index e72e61c..c7f30f8 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -152,6 +152,7 @@
# eventually, we can have this option just 'select SPARSEMEM'
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
+ select MEMORY_ISOLATION
depends on SPARSEMEM || X86_64_ACPI_NUMA
depends on ARCH_ENABLE_MEMORY_HOTPLUG
depends on 64BIT || BROKEN
@@ -178,7 +179,6 @@
config MEMORY_HOTREMOVE
bool "Allow for memory hot remove"
- select MEMORY_ISOLATION
select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64)
depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
depends on MIGRATION
diff --git a/mm/compaction.c b/mm/compaction.c
index 6c63844..6e0ee56 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -625,7 +625,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
}
/* Found a free page, will break it into order-0 pages */
- order = page_order(page);
+ order = buddy_order(page);
isolated = __isolate_free_page(page, order);
if (!isolated)
break;
@@ -898,7 +898,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* potential isolation targets.
*/
if (PageBuddy(page)) {
- unsigned long freepage_order = page_order_unsafe(page);
+ unsigned long freepage_order = buddy_order_unsafe(page);
/*
* Without lock, we cannot be sure that what we got is
@@ -1172,7 +1172,7 @@ static bool suitable_migration_target(struct compact_control *cc,
* the only small danger is that we skip a potentially suitable
* pageblock, so it's not worth to check order for valid range.
*/
- if (page_order_unsafe(page) >= pageblock_order)
+ if (buddy_order_unsafe(page) >= pageblock_order)
return false;
}
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 086309f..c05d9dc 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -28,6 +28,7 @@
#include <linux/swapops.h>
#include <linux/start_kernel.h>
#include <linux/sched/mm.h>
+#include <linux/io.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
@@ -44,10 +45,17 @@
* entry type. But these bits might affect the ability to clear entries with
* pxx_clear() because of how dynamic page table folding works on s390. So
* while loading up the entries do not change the lower 4 bits. It does not
- * have affect any other platform.
+ * have affect any other platform. Also avoid the 62nd bit on ppc64 that is
+ * used to mark a pte entry.
*/
-#define S390_MASK_BITS 4
-#define RANDOM_ORVALUE GENMASK(BITS_PER_LONG - 1, S390_MASK_BITS)
+#define S390_SKIP_MASK GENMASK(3, 0)
+#if __BITS_PER_LONG == 64
+#define PPC64_SKIP_MASK GENMASK(62, 62)
+#else
+#define PPC64_SKIP_MASK 0x0
+#endif
+#define ARCH_SKIP_MASK (S390_SKIP_MASK | PPC64_SKIP_MASK)
+#define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK)
#define RANDOM_NZVALUE GENMASK(7, 0)
static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)
@@ -71,15 +79,18 @@ static void __init pte_advanced_tests(struct mm_struct *mm,
{
pte_t pte = pfn_pte(pfn, prot);
+ /*
+ * Architectures optimize set_pte_at by avoiding TLB flush.
+ * This requires set_pte_at to be not used to update an
+ * existing pte entry. Clear pte before we do set_pte_at
+ */
+
pr_debug("Validating PTE advanced\n");
pte = pfn_pte(pfn, prot);
set_pte_at(mm, vaddr, ptep, pte);
ptep_set_wrprotect(mm, vaddr, ptep);
pte = ptep_get(ptep);
WARN_ON(pte_write(pte));
-
- pte = pfn_pte(pfn, prot);
- set_pte_at(mm, vaddr, ptep, pte);
ptep_get_and_clear(mm, vaddr, ptep);
pte = ptep_get(ptep);
WARN_ON(!pte_none(pte));
@@ -93,13 +104,11 @@ static void __init pte_advanced_tests(struct mm_struct *mm,
ptep_set_access_flags(vma, vaddr, ptep, pte, 1);
pte = ptep_get(ptep);
WARN_ON(!(pte_write(pte) && pte_dirty(pte)));
-
- pte = pfn_pte(pfn, prot);
- set_pte_at(mm, vaddr, ptep, pte);
ptep_get_and_clear_full(mm, vaddr, ptep, 1);
pte = ptep_get(ptep);
WARN_ON(!pte_none(pte));
+ pte = pfn_pte(pfn, prot);
pte = pte_mkyoung(pte);
set_pte_at(mm, vaddr, ptep, pte);
ptep_test_and_clear_young(vma, vaddr, ptep);
@@ -111,10 +120,14 @@ static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot)
{
pte_t pte = pfn_pte(pfn, prot);
+ if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
+ return;
+
pr_debug("Validating PTE saved write\n");
WARN_ON(!pte_savedwrite(pte_mk_savedwrite(pte_clear_savedwrite(pte))));
WARN_ON(pte_savedwrite(pte_clear_savedwrite(pte_mk_savedwrite(pte))));
}
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
{
@@ -141,7 +154,7 @@ static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
static void __init pmd_advanced_tests(struct mm_struct *mm,
struct vm_area_struct *vma, pmd_t *pmdp,
unsigned long pfn, unsigned long vaddr,
- pgprot_t prot)
+ pgprot_t prot, pgtable_t pgtable)
{
pmd_t pmd = pfn_pmd(pfn, prot);
@@ -152,14 +165,13 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
/* Align the address wrt HPAGE_PMD_SIZE */
vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE;
+ pgtable_trans_huge_deposit(mm, pmdp, pgtable);
+
pmd = pfn_pmd(pfn, prot);
set_pmd_at(mm, vaddr, pmdp, pmd);
pmdp_set_wrprotect(mm, vaddr, pmdp);
pmd = READ_ONCE(*pmdp);
WARN_ON(pmd_write(pmd));
-
- pmd = pfn_pmd(pfn, prot);
- set_pmd_at(mm, vaddr, pmdp, pmd);
pmdp_huge_get_and_clear(mm, vaddr, pmdp);
pmd = READ_ONCE(*pmdp);
WARN_ON(!pmd_none(pmd));
@@ -173,18 +185,20 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
pmdp_set_access_flags(vma, vaddr, pmdp, pmd, 1);
pmd = READ_ONCE(*pmdp);
WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd)));
-
- pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
- set_pmd_at(mm, vaddr, pmdp, pmd);
pmdp_huge_get_and_clear_full(vma, vaddr, pmdp, 1);
pmd = READ_ONCE(*pmdp);
WARN_ON(!pmd_none(pmd));
+ pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
pmd = pmd_mkyoung(pmd);
set_pmd_at(mm, vaddr, pmdp, pmd);
pmdp_test_and_clear_young(vma, vaddr, pmdp);
pmd = READ_ONCE(*pmdp);
WARN_ON(pmd_young(pmd));
+
+ /* Clear the pte entries */
+ pmdp_huge_get_and_clear(mm, vaddr, pmdp);
+ pgtable = pgtable_trans_huge_withdraw(mm, pmdp);
}
static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
@@ -199,11 +213,12 @@ static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!pmd_leaf(pmd));
}
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
{
pmd_t pmd;
- if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+ if (!arch_ioremap_pmd_supported())
return;
pr_debug("Validating PMD huge\n");
@@ -217,11 +232,17 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
pmd = READ_ONCE(*pmdp);
WARN_ON(!pmd_none(pmd));
}
+#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
{
pmd_t pmd = pfn_pmd(pfn, prot);
+ if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
+ return;
+
pr_debug("Validating PMD saved write\n");
WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd))));
WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd))));
@@ -272,17 +293,9 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
WARN_ON(pud_write(pud));
#ifndef __PAGETABLE_PMD_FOLDED
- pud = pfn_pud(pfn, prot);
- set_pud_at(mm, vaddr, pudp, pud);
pudp_huge_get_and_clear(mm, vaddr, pudp);
pud = READ_ONCE(*pudp);
WARN_ON(!pud_none(pud));
-
- pud = pfn_pud(pfn, prot);
- set_pud_at(mm, vaddr, pudp, pud);
- pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1);
- pud = READ_ONCE(*pudp);
- WARN_ON(!pud_none(pud));
#endif /* __PAGETABLE_PMD_FOLDED */
pud = pfn_pud(pfn, prot);
pud = pud_wrprotect(pud);
@@ -294,11 +307,20 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
pud = READ_ONCE(*pudp);
WARN_ON(!(pud_write(pud) && pud_dirty(pud)));
+#ifndef __PAGETABLE_PMD_FOLDED
+ pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1);
+ pud = READ_ONCE(*pudp);
+ WARN_ON(!pud_none(pud));
+#endif /* __PAGETABLE_PMD_FOLDED */
+
+ pud = pfn_pud(pfn, prot);
pud = pud_mkyoung(pud);
set_pud_at(mm, vaddr, pudp, pud);
pudp_test_and_clear_young(vma, vaddr, pudp);
pud = READ_ONCE(*pudp);
WARN_ON(pud_young(pud));
+
+ pudp_huge_get_and_clear(mm, vaddr, pudp);
}
static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
@@ -313,11 +335,12 @@ static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!pud_leaf(pud));
}
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
{
pud_t pud;
- if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+ if (!arch_ioremap_pud_supported())
return;
pr_debug("Validating PUD huge\n");
@@ -331,6 +354,10 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
pud = READ_ONCE(*pudp);
WARN_ON(!pud_none(pud));
}
+#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
+static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { }
+#endif /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
+
#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
static void __init pud_advanced_tests(struct mm_struct *mm,
@@ -350,7 +377,7 @@ static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
static void __init pmd_advanced_tests(struct mm_struct *mm,
struct vm_area_struct *vma, pmd_t *pmdp,
unsigned long pfn, unsigned long vaddr,
- pgprot_t prot)
+ pgprot_t prot, pgtable_t pgtable)
{
}
static void __init pud_advanced_tests(struct mm_struct *mm,
@@ -417,8 +444,6 @@ static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp,
* This entry points to next level page table page.
* Hence this must not qualify as pud_bad().
*/
- pmd_clear(pmdp);
- pud_clear(pudp);
pud_populate(mm, pudp, pmdp);
pud = READ_ONCE(*pudp);
WARN_ON(pud_bad(pud));
@@ -515,12 +540,15 @@ static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
#endif /* PAGETABLE_P4D_FOLDED */
static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep,
- unsigned long vaddr)
+ unsigned long pfn, unsigned long vaddr,
+ pgprot_t prot)
{
- pte_t pte = ptep_get(ptep);
+ pte_t pte = pfn_pte(pfn, prot);
pr_debug("Validating PTE clear\n");
+#ifndef CONFIG_RISCV
pte = __pte(pte_val(pte) | RANDOM_ORVALUE);
+#endif
set_pte_at(mm, vaddr, ptep, pte);
barrier();
pte_clear(mm, vaddr, ptep);
@@ -550,7 +578,6 @@ static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp,
* This entry points to next level page table page.
* Hence this must not qualify as pmd_bad().
*/
- pmd_clear(pmdp);
pmd_populate(mm, pmdp, pgtable);
pmd = READ_ONCE(*pmdp);
WARN_ON(pmd_bad(pmd));
@@ -784,57 +811,8 @@ static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!pte_huge(pte_mkhuge(pte)));
#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
}
-
-static void __init hugetlb_advanced_tests(struct mm_struct *mm,
- struct vm_area_struct *vma,
- pte_t *ptep, unsigned long pfn,
- unsigned long vaddr, pgprot_t prot)
-{
- struct page *page = pfn_to_page(pfn);
- pte_t pte = ptep_get(ptep);
- unsigned long paddr = __pfn_to_phys(pfn) & PMD_MASK;
-
- pr_debug("Validating HugeTLB advanced\n");
- pte = pte_mkhuge(mk_pte(pfn_to_page(PHYS_PFN(paddr)), prot));
- set_huge_pte_at(mm, vaddr, ptep, pte);
- barrier();
- WARN_ON(!pte_same(pte, huge_ptep_get(ptep)));
- huge_pte_clear(mm, vaddr, ptep, PMD_SIZE);
- pte = huge_ptep_get(ptep);
- WARN_ON(!huge_pte_none(pte));
-
- pte = mk_huge_pte(page, prot);
- set_huge_pte_at(mm, vaddr, ptep, pte);
- barrier();
- huge_ptep_set_wrprotect(mm, vaddr, ptep);
- pte = huge_ptep_get(ptep);
- WARN_ON(huge_pte_write(pte));
-
- pte = mk_huge_pte(page, prot);
- set_huge_pte_at(mm, vaddr, ptep, pte);
- barrier();
- huge_ptep_get_and_clear(mm, vaddr, ptep);
- pte = huge_ptep_get(ptep);
- WARN_ON(!huge_pte_none(pte));
-
- pte = mk_huge_pte(page, prot);
- pte = huge_pte_wrprotect(pte);
- set_huge_pte_at(mm, vaddr, ptep, pte);
- barrier();
- pte = huge_pte_mkwrite(pte);
- pte = huge_pte_mkdirty(pte);
- huge_ptep_set_access_flags(vma, vaddr, ptep, pte, 1);
- pte = huge_ptep_get(ptep);
- WARN_ON(!(huge_pte_write(pte) && huge_pte_dirty(pte)));
-}
#else /* !CONFIG_HUGETLB_PAGE */
static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init hugetlb_advanced_tests(struct mm_struct *mm,
- struct vm_area_struct *vma,
- pte_t *ptep, unsigned long pfn,
- unsigned long vaddr, pgprot_t prot)
-{
-}
#endif /* CONFIG_HUGETLB_PAGE */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -965,7 +943,13 @@ static int __init debug_vm_pgtable(void)
p4dp = p4d_alloc(mm, pgdp, vaddr);
pudp = pud_alloc(mm, p4dp, vaddr);
pmdp = pmd_alloc(mm, pudp, vaddr);
- ptep = pte_alloc_map_lock(mm, pmdp, vaddr, &ptl);
+ /*
+ * Allocate pgtable_t
+ */
+ if (pte_alloc(mm, pmdp)) {
+ pr_err("pgtable allocation failed\n");
+ return 1;
+ }
/*
* Save all the page table page addresses as the page table
@@ -985,32 +969,11 @@ static int __init debug_vm_pgtable(void)
p4d_basic_tests(p4d_aligned, prot);
pgd_basic_tests(pgd_aligned, prot);
- pte_clear_tests(mm, ptep, vaddr);
- pmd_clear_tests(mm, pmdp);
- pud_clear_tests(mm, pudp);
- p4d_clear_tests(mm, p4dp);
- pgd_clear_tests(mm, pgdp);
-
- pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
- pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot);
- pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
- hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
-
pmd_leaf_tests(pmd_aligned, prot);
pud_leaf_tests(pud_aligned, prot);
- pmd_huge_tests(pmdp, pmd_aligned, prot);
- pud_huge_tests(pudp, pud_aligned, prot);
-
- pte_savedwrite_tests(pte_aligned, prot);
- pmd_savedwrite_tests(pmd_aligned, prot);
-
- pte_unmap_unlock(ptep, ptl);
-
- pmd_populate_tests(mm, pmdp, saved_ptep);
- pud_populate_tests(mm, pudp, saved_pmdp);
- p4d_populate_tests(mm, p4dp, saved_pudp);
- pgd_populate_tests(mm, pgdp, saved_p4dp);
+ pte_savedwrite_tests(pte_aligned, protnone);
+ pmd_savedwrite_tests(pmd_aligned, protnone);
pte_special_tests(pte_aligned, prot);
pte_protnone_tests(pte_aligned, protnone);
@@ -1029,11 +992,43 @@ static int __init debug_vm_pgtable(void)
pmd_swap_tests(pmd_aligned, prot);
swap_migration_tests();
- hugetlb_basic_tests(pte_aligned, prot);
pmd_thp_tests(pmd_aligned, prot);
pud_thp_tests(pud_aligned, prot);
+ hugetlb_basic_tests(pte_aligned, prot);
+
+ /*
+ * Page table modifying tests. They need to hold
+ * proper page table lock.
+ */
+
+ ptep = pte_offset_map_lock(mm, pmdp, vaddr, &ptl);
+ pte_clear_tests(mm, ptep, pte_aligned, vaddr, prot);
+ pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
+ pte_unmap_unlock(ptep, ptl);
+
+ ptl = pmd_lock(mm, pmdp);
+ pmd_clear_tests(mm, pmdp);
+ pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep);
+ pmd_huge_tests(pmdp, pmd_aligned, prot);
+ pmd_populate_tests(mm, pmdp, saved_ptep);
+ spin_unlock(ptl);
+
+ ptl = pud_lock(mm, pudp);
+ pud_clear_tests(mm, pudp);
+ pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
+ pud_huge_tests(pudp, pud_aligned, prot);
+ pud_populate_tests(mm, pudp, saved_pmdp);
+ spin_unlock(ptl);
+
+ spin_lock(&mm->page_table_lock);
+ p4d_clear_tests(mm, p4dp);
+ pgd_clear_tests(mm, pgdp);
+ p4d_populate_tests(mm, p4dp, saved_pudp);
+ pgd_populate_tests(mm, pgdp, saved_p4dp);
+ spin_unlock(&mm->page_table_lock);
+
p4d_free(mm, saved_p4dp);
pud_free(mm, saved_pudp);
pmd_free(mm, saved_pmdp);
diff --git a/mm/filemap.c b/mm/filemap.c
index e3b8987..1a6beaf 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -249,7 +249,7 @@ static void page_cache_free_page(struct address_space *mapping,
freepage(page);
if (PageTransHuge(page) && !PageHuge(page)) {
- page_ref_sub(page, HPAGE_PMD_NR);
+ page_ref_sub(page, thp_nr_pages(page));
VM_BUG_ON_PAGE(page_count(page) <= 0, page);
} else {
put_page(page);
@@ -829,13 +829,12 @@ EXPORT_SYMBOL_GPL(replace_page_cache_page);
noinline int __add_to_page_cache_locked(struct page *page,
struct address_space *mapping,
- pgoff_t offset, gfp_t gfp_mask,
+ pgoff_t offset, gfp_t gfp,
void **shadowp)
{
XA_STATE(xas, &mapping->i_pages, offset);
int huge = PageHuge(page);
int error;
- void *old;
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapBacked(page), page);
@@ -846,25 +845,46 @@ noinline int __add_to_page_cache_locked(struct page *page,
page->index = offset;
if (!huge) {
- error = mem_cgroup_charge(page, current->mm, gfp_mask);
+ error = mem_cgroup_charge(page, current->mm, gfp);
if (error)
goto error;
}
+ gfp &= GFP_RECLAIM_MASK;
+
do {
+ unsigned int order = xa_get_order(xas.xa, xas.xa_index);
+ void *entry, *old = NULL;
+
+ if (order > thp_order(page))
+ xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
+ order, gfp);
xas_lock_irq(&xas);
- old = xas_load(&xas);
- if (old && !xa_is_value(old))
- xas_set_err(&xas, -EEXIST);
+ xas_for_each_conflict(&xas, entry) {
+ old = entry;
+ if (!xa_is_value(entry)) {
+ xas_set_err(&xas, -EEXIST);
+ goto unlock;
+ }
+ }
+
+ if (old) {
+ if (shadowp)
+ *shadowp = old;
+ /* entry may have been split before we acquired lock */
+ order = xa_get_order(xas.xa, xas.xa_index);
+ if (order > thp_order(page)) {
+ xas_split(&xas, old, order);
+ xas_reset(&xas);
+ }
+ }
+
xas_store(&xas, page);
if (xas_error(&xas))
goto unlock;
- if (xa_is_value(old)) {
+ if (old)
mapping->nrexceptional--;
- if (shadowp)
- *shadowp = old;
- }
mapping->nrpages++;
/* hugetlb pages do not participate in page cache accounting */
@@ -872,7 +892,7 @@ noinline int __add_to_page_cache_locked(struct page *page,
__inc_lruvec_page_state(page, NR_FILE_PAGES);
unlock:
xas_unlock_irq(&xas);
- } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK));
+ } while (xas_nomem(&xas, gfp));
if (xas_error(&xas)) {
error = xas_error(&xas);
@@ -1425,7 +1445,7 @@ static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem
* unlock_page - unlock a locked page
* @page: the page
*
- * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
+ * Unlocks the page and wakes up sleepers in wait_on_page_locked().
* Also wakes sleepers in wait_on_page_writeback() because the wakeup
* mechanism between PageLocked pages and PageWriteback pages is shared.
* But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
@@ -2568,8 +2588,8 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
struct file *file = vmf->vma->vm_file;
struct file_ra_state *ra = &file->f_ra;
struct address_space *mapping = file->f_mapping;
+ DEFINE_READAHEAD(ractl, file, mapping, vmf->pgoff);
struct file *fpin = NULL;
- pgoff_t offset = vmf->pgoff;
unsigned int mmap_miss;
/* If we don't want any read-ahead, don't bother */
@@ -2580,8 +2600,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
if (vmf->vma->vm_flags & VM_SEQ_READ) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- page_cache_sync_readahead(mapping, ra, file, offset,
- ra->ra_pages);
+ page_cache_sync_ra(&ractl, ra, ra->ra_pages);
return fpin;
}
@@ -2601,10 +2620,11 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
* mmap read-around
*/
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
+ ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
ra->size = ra->ra_pages;
ra->async_size = ra->ra_pages / 4;
- ra_submit(ra, mapping, file);
+ ractl._index = ra->start;
+ do_page_cache_ra(&ractl, ra->size, ra->async_size);
return fpin;
}
@@ -2984,7 +3004,7 @@ static struct page *do_read_cache_page(struct address_space *mapping,
goto out;
/*
- * Page is not up to date and may be locked due one of the following
+ * Page is not up to date and may be locked due to one of the following
* case a: Page is being filled and the page lock is held
* case b: Read/write error clearing the page uptodate status
* case c: Truncation in progress (page locked)
diff --git a/mm/gup.c b/mm/gup.c
index ad617e7f..102877e 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1490,35 +1490,6 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
mmap_read_unlock(mm);
return ret; /* 0 or negative error code */
}
-
-/**
- * get_dump_page() - pin user page in memory while writing it to core dump
- * @addr: user address
- *
- * Returns struct page pointer of user page pinned for dump,
- * to be freed afterwards by put_page().
- *
- * Returns NULL on any kind of failure - a hole must then be inserted into
- * the corefile, to preserve alignment with its headers; and also returns
- * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
- * allowing a hole to be left in the corefile to save diskspace.
- *
- * Called without mmap_lock, but after all other threads have been killed.
- */
-#ifdef CONFIG_ELF_CORE
-struct page *get_dump_page(unsigned long addr)
-{
- struct vm_area_struct *vma;
- struct page *page;
-
- if (__get_user_pages(current->mm, addr, 1,
- FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
- NULL) < 1)
- return NULL;
- flush_cache_page(vma, addr, page_to_pfn(page));
- return page;
-}
-#endif /* CONFIG_ELF_CORE */
#else /* CONFIG_MMU */
static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
unsigned long nr_pages, struct page **pages,
@@ -1564,6 +1535,38 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
}
#endif /* !CONFIG_MMU */
+/**
+ * get_dump_page() - pin user page in memory while writing it to core dump
+ * @addr: user address
+ *
+ * Returns struct page pointer of user page pinned for dump,
+ * to be freed afterwards by put_page().
+ *
+ * Returns NULL on any kind of failure - a hole must then be inserted into
+ * the corefile, to preserve alignment with its headers; and also returns
+ * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
+ * allowing a hole to be left in the corefile to save diskspace.
+ *
+ * Called without mmap_lock (takes and releases the mmap_lock by itself).
+ */
+#ifdef CONFIG_ELF_CORE
+struct page *get_dump_page(unsigned long addr)
+{
+ struct mm_struct *mm = current->mm;
+ struct page *page;
+ int locked = 1;
+ int ret;
+
+ if (mmap_read_lock_killable(mm))
+ return NULL;
+ ret = __get_user_pages_locked(mm, addr, 1, &page, NULL, &locked,
+ FOLL_FORCE | FOLL_DUMP | FOLL_GET);
+ if (locked)
+ mmap_read_unlock(mm);
+ return (ret == 1) ? page : NULL;
+}
+#endif /* CONFIG_ELF_CORE */
+
#if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA)
static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
{
diff --git a/mm/highmem.c b/mm/highmem.c
index 64d8dea..1352a27 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -369,7 +369,7 @@ void kunmap_high(struct page *page)
}
EXPORT_SYMBOL(kunmap_high);
-#endif
+#endif /* CONFIG_HIGHMEM */
#if defined(HASHED_PAGE_VIRTUAL)
@@ -481,4 +481,4 @@ void __init page_address_init(void)
}
}
-#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
+#endif /* defined(HASHED_PAGE_VIRTUAL) */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 65c289c..9474dbc 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2335,13 +2335,13 @@ static void unmap_page(struct page *page)
VM_BUG_ON_PAGE(!unmap_success, page);
}
-static void remap_page(struct page *page)
+static void remap_page(struct page *page, unsigned int nr)
{
int i;
if (PageTransHuge(page)) {
remove_migration_ptes(page, page, true);
} else {
- for (i = 0; i < HPAGE_PMD_NR; i++)
+ for (i = 0; i < nr; i++)
remove_migration_ptes(page + i, page + i, true);
}
}
@@ -2419,6 +2419,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
struct lruvec *lruvec;
struct address_space *swap_cache = NULL;
unsigned long offset = 0;
+ unsigned int nr = thp_nr_pages(head);
int i;
lruvec = mem_cgroup_page_lruvec(head, pgdat);
@@ -2434,7 +2435,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
xa_lock(&swap_cache->i_pages);
}
- for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
+ for (i = nr - 1; i >= 1; i--) {
__split_huge_page_tail(head, i, lruvec, list);
/* Some pages can be beyond i_size: drop them from page cache */
if (head[i].index >= end) {
@@ -2454,7 +2455,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
ClearPageCompound(head);
- split_page_owner(head, HPAGE_PMD_ORDER);
+ split_page_owner(head, nr);
/* See comment in __split_huge_page_tail() */
if (PageAnon(head)) {
@@ -2473,9 +2474,15 @@ static void __split_huge_page(struct page *page, struct list_head *list,
spin_unlock_irqrestore(&pgdat->lru_lock, flags);
- remap_page(head);
+ remap_page(head, nr);
- for (i = 0; i < HPAGE_PMD_NR; i++) {
+ if (PageSwapCache(head)) {
+ swp_entry_t entry = { .val = page_private(head) };
+
+ split_swap_cluster(entry);
+ }
+
+ for (i = 0; i < nr; i++) {
struct page *subpage = head + i;
if (subpage == page)
continue;
@@ -2494,7 +2501,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
int total_mapcount(struct page *page)
{
- int i, compound, ret;
+ int i, compound, nr, ret;
VM_BUG_ON_PAGE(PageTail(page), page);
@@ -2502,16 +2509,17 @@ int total_mapcount(struct page *page)
return atomic_read(&page->_mapcount) + 1;
compound = compound_mapcount(page);
+ nr = compound_nr(page);
if (PageHuge(page))
return compound;
ret = compound;
- for (i = 0; i < HPAGE_PMD_NR; i++)
+ for (i = 0; i < nr; i++)
ret += atomic_read(&page[i]._mapcount) + 1;
/* File pages has compound_mapcount included in _mapcount */
if (!PageAnon(page))
- return ret - compound * HPAGE_PMD_NR;
+ return ret - compound * nr;
if (PageDoubleMap(page))
- ret -= HPAGE_PMD_NR;
+ ret -= nr;
return ret;
}
@@ -2556,14 +2564,14 @@ int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
page = compound_head(page);
_total_mapcount = ret = 0;
- for (i = 0; i < HPAGE_PMD_NR; i++) {
+ for (i = 0; i < thp_nr_pages(page); i++) {
mapcount = atomic_read(&page[i]._mapcount) + 1;
ret = max(ret, mapcount);
_total_mapcount += mapcount;
}
if (PageDoubleMap(page)) {
ret -= 1;
- _total_mapcount -= HPAGE_PMD_NR;
+ _total_mapcount -= thp_nr_pages(page);
}
mapcount = compound_mapcount(page);
ret += mapcount;
@@ -2580,9 +2588,9 @@ bool can_split_huge_page(struct page *page, int *pextra_pins)
/* Additional pins from page cache */
if (PageAnon(page))
- extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0;
+ extra_pins = PageSwapCache(page) ? thp_nr_pages(page) : 0;
else
- extra_pins = HPAGE_PMD_NR;
+ extra_pins = thp_nr_pages(page);
if (pextra_pins)
*pextra_pins = extra_pins;
return total_mapcount(page) == page_count(page) - extra_pins - 1;
@@ -2709,12 +2717,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
}
__split_huge_page(page, list, end, flags);
- if (PageSwapCache(head)) {
- swp_entry_t entry = { .val = page_private(head) };
-
- ret = split_swap_cluster(entry);
- } else
- ret = 0;
+ ret = 0;
} else {
if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
pr_alert("total_mapcount: %u, page_count(): %u\n",
@@ -2728,7 +2731,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
fail: if (mapping)
xa_unlock(&mapping->i_pages);
spin_unlock_irqrestore(&pgdata->lru_lock, flags);
- remap_page(head);
+ remap_page(head, thp_nr_pages(head));
ret = -EBUSY;
}
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index e488876..1ae1ebc 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -26,11 +26,6 @@ static int hwpoison_inject(void *data, u64 val)
p = pfn_to_page(pfn);
hpage = compound_head(p);
- /*
- * This implies unable to support free buddy pages.
- */
- if (!get_hwpoison_page(p))
- return 0;
if (!hwpoison_filter_enable)
goto inject;
@@ -40,23 +35,20 @@ static int hwpoison_inject(void *data, u64 val)
* This implies unable to support non-LRU pages.
*/
if (!PageLRU(hpage) && !PageHuge(p))
- goto put_out;
+ return 0;
/*
- * do a racy check with elevated page count, to make sure PG_hwpoison
- * will only be set for the targeted owner (or on a free page).
+ * do a racy check to make sure PG_hwpoison will only be set for
+ * the targeted owner (or on a free page).
* memory_failure() will redo the check reliably inside page lock.
*/
err = hwpoison_filter(hpage);
if (err)
- goto put_out;
+ return 0;
inject:
pr_info("Injecting memory failure at pfn %#lx\n", pfn);
- return memory_failure(pfn, MF_COUNT_INCREASED);
-put_out:
- put_hwpoison_page(p);
- return 0;
+ return memory_failure(pfn, 0);
}
static int hwpoison_unpoison(void *data, u64 val)
diff --git a/mm/internal.h b/mm/internal.h
index a801a4d..c43ccdd 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -49,20 +49,15 @@ void unmap_page_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
struct zap_details *details);
-void force_page_cache_readahead(struct address_space *, struct file *,
- pgoff_t index, unsigned long nr_to_read);
-void __do_page_cache_readahead(struct address_space *, struct file *,
- pgoff_t index, unsigned long nr_to_read,
+void do_page_cache_ra(struct readahead_control *, unsigned long nr_to_read,
unsigned long lookahead_size);
-
-/*
- * Submit IO for the read-ahead request in file_ra_state.
- */
-static inline void ra_submit(struct file_ra_state *ra,
- struct address_space *mapping, struct file *filp)
+void force_page_cache_ra(struct readahead_control *, struct file_ra_state *,
+ unsigned long nr);
+static inline void force_page_cache_readahead(struct address_space *mapping,
+ struct file *file, pgoff_t index, unsigned long nr_to_read)
{
- __do_page_cache_readahead(mapping, filp,
- ra->start, ra->size, ra->async_size);
+ DEFINE_READAHEAD(ractl, file, mapping, index);
+ force_page_cache_ra(&ractl, &file->f_ra, nr_to_read);
}
struct page *find_get_entry(struct address_space *mapping, pgoff_t index);
@@ -275,16 +270,16 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
* page from being allocated in parallel and returning garbage as the order.
* If a caller does not hold page_zone(page)->lock, it must guarantee that the
* page cannot be allocated or merged in parallel. Alternatively, it must
- * handle invalid values gracefully, and use page_order_unsafe() below.
+ * handle invalid values gracefully, and use buddy_order_unsafe() below.
*/
-static inline unsigned int page_order(struct page *page)
+static inline unsigned int buddy_order(struct page *page)
{
/* PageBuddy() must be checked by the caller */
return page_private(page);
}
/*
- * Like page_order(), but for callers who cannot afford to hold the zone lock.
+ * Like buddy_order(), but for callers who cannot afford to hold the zone lock.
* PageBuddy() should be checked first by the caller to minimize race window,
* and invalid values must be handled gracefully.
*
@@ -294,7 +289,7 @@ static inline unsigned int page_order(struct page *page)
* times, potentially observing different values in the tests and the actual
* use of the result.
*/
-#define page_order_unsafe(page) READ_ONCE(page_private(page))
+#define buddy_order_unsafe(page) READ_ONCE(page_private(page))
static inline bool is_cow_mapping(vm_flags_t flags)
{
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 58b0d9c..4e3dff1 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -434,7 +434,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
static inline int khugepaged_test_exit(struct mm_struct *mm)
{
- return atomic_read(&mm->mm_users) == 0 || !mmget_still_valid(mm);
+ return atomic_read(&mm->mm_users) == 0;
}
static bool hugepage_vma_check(struct vm_area_struct *vma,
diff --git a/mm/madvise.c b/mm/madvise.c
index 9b065d4..fd1f448b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -872,7 +872,6 @@ static long madvise_remove(struct vm_area_struct *vma,
static int madvise_inject_error(int behavior,
unsigned long start, unsigned long end)
{
- struct page *page;
struct zone *zone;
unsigned long size;
@@ -882,6 +881,7 @@ static int madvise_inject_error(int behavior,
for (; start < end; start += size) {
unsigned long pfn;
+ struct page *page;
int ret;
ret = get_user_pages_fast(start, 1, 0, &page);
@@ -896,32 +896,23 @@ static int madvise_inject_error(int behavior,
*/
size = page_size(compound_head(page));
- if (PageHWPoison(page)) {
- put_page(page);
- continue;
- }
-
if (behavior == MADV_SOFT_OFFLINE) {
pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
- pfn, start);
-
+ pfn, start);
ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
- if (ret)
- return ret;
- continue;
+ } else {
+ pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
+ pfn, start);
+ /*
+ * Drop the page reference taken by get_user_pages_fast(). In
+ * the absence of MF_COUNT_INCREASED the memory_failure()
+ * routine is responsible for pinning the page to prevent it
+ * from being released back to the page allocator.
+ */
+ put_page(page);
+ ret = memory_failure(pfn, 0);
}
- pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
- pfn, start);
-
- /*
- * Drop the page reference taken by get_user_pages_fast(). In
- * the absence of MF_COUNT_INCREASED the memory_failure()
- * routine is responsible for pinning the page to prevent it
- * from being released back to the page allocator.
- */
- put_page(page);
- ret = memory_failure(pfn, 0);
if (ret)
return ret;
}
@@ -1094,23 +1085,6 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
if (write) {
if (mmap_write_lock_killable(current->mm))
return -EINTR;
-
- /*
- * We may have stolen the mm from another process
- * that is undergoing core dumping.
- *
- * Right now that's io_ring, in the future it may
- * be remote process management and not "current"
- * at all.
- *
- * We need to fix core dumping to not do this,
- * but for now we have the mmget_still_valid()
- * model.
- */
- if (!mmget_still_valid(current->mm)) {
- mmap_write_unlock(current->mm);
- return -EINTR;
- }
} else {
mmap_read_lock(current->mm);
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 990e3b2e..a2184b7 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -65,6 +65,33 @@ int sysctl_memory_failure_recovery __read_mostly = 1;
atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
+static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
+{
+ if (hugepage_or_freepage) {
+ /*
+ * Doing this check for free pages is also fine since dissolve_free_huge_page
+ * returns 0 for non-hugetlb pages as well.
+ */
+ if (dissolve_free_huge_page(page) || !take_page_off_buddy(page))
+ /*
+ * We could fail to take off the target page from buddy
+ * for example due to racy page allocaiton, but that's
+ * acceptable because soft-offlined page is not broken
+ * and if someone really want to use it, they should
+ * take it.
+ */
+ return false;
+ }
+
+ SetPageHWPoison(page);
+ if (release)
+ put_page(page);
+ page_ref_inc(page);
+ num_poisoned_pages_inc();
+
+ return true;
+}
+
#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
u32 hwpoison_filter_enable = 0;
@@ -555,6 +582,7 @@ static const char * const action_page_types[] = {
[MF_MSG_BUDDY] = "free buddy page",
[MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)",
[MF_MSG_DAX] = "dax page",
+ [MF_MSG_UNSPLIT_THP] = "unsplit thp",
[MF_MSG_UNKNOWN] = "unknown page",
};
@@ -924,7 +952,7 @@ static int page_action(struct page_state *ps, struct page *p,
* Return: return 0 if failed to grab the refcount, otherwise true (some
* non-zero value.)
*/
-int get_hwpoison_page(struct page *page)
+static int get_hwpoison_page(struct page *page)
{
struct page *head = compound_head(page);
@@ -953,7 +981,6 @@ int get_hwpoison_page(struct page *page)
return 0;
}
-EXPORT_SYMBOL_GPL(get_hwpoison_page);
/*
* Do all that is necessary to remove user space mappings. Unmap
@@ -1103,6 +1130,25 @@ static int identify_page_state(unsigned long pfn, struct page *p,
return page_action(ps, p, pfn);
}
+static int try_to_split_thp_page(struct page *page, const char *msg)
+{
+ lock_page(page);
+ if (!PageAnon(page) || unlikely(split_huge_page(page))) {
+ unsigned long pfn = page_to_pfn(page);
+
+ unlock_page(page);
+ if (!PageAnon(page))
+ pr_info("%s: %#lx: non anonymous thp\n", msg, pfn);
+ else
+ pr_info("%s: %#lx: thp split failed\n", msg, pfn);
+ put_page(page);
+ return -EBUSY;
+ }
+ unlock_page(page);
+
+ return 0;
+}
+
static int memory_failure_hugetlb(unsigned long pfn, int flags)
{
struct page *p = pfn_to_page(pfn);
@@ -1144,7 +1190,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
num_poisoned_pages_dec();
unlock_page(head);
- put_hwpoison_page(head);
+ put_page(head);
return 0;
}
@@ -1325,23 +1371,11 @@ int memory_failure(unsigned long pfn, int flags)
}
if (PageTransHuge(hpage)) {
- lock_page(p);
- if (!PageAnon(p) || unlikely(split_huge_page(p))) {
- unlock_page(p);
- if (!PageAnon(p))
- pr_err("Memory failure: %#lx: non anonymous thp\n",
- pfn);
- else
- pr_err("Memory failure: %#lx: thp split failed\n",
- pfn);
- if (TestClearPageHWPoison(p))
- num_poisoned_pages_dec();
- put_hwpoison_page(p);
+ if (try_to_split_thp_page(p, "Memory Failure") < 0) {
+ action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
return -EBUSY;
}
- unlock_page(p);
VM_BUG_ON_PAGE(!page_count(p), p);
- hpage = compound_head(p);
}
/*
@@ -1381,10 +1415,7 @@ int memory_failure(unsigned long pfn, int flags)
* page_remove_rmap() in try_to_unmap_one(). So to determine page status
* correctly, we save a copy of the page flags at this time.
*/
- if (PageHuge(p))
- page_flags = hpage->flags;
- else
- page_flags = p->flags;
+ page_flags = p->flags;
/*
* unpoison always clear PG_hwpoison inside page lock
@@ -1393,14 +1424,14 @@ int memory_failure(unsigned long pfn, int flags)
pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
num_poisoned_pages_dec();
unlock_page(p);
- put_hwpoison_page(p);
+ put_page(p);
return 0;
}
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
num_poisoned_pages_dec();
unlock_page(p);
- put_hwpoison_page(p);
+ put_page(p);
return 0;
}
@@ -1416,11 +1447,8 @@ int memory_failure(unsigned long pfn, int flags)
/*
* Now take care of user space mappings.
* Abort on fail: __delete_from_page_cache() assumes unmapped page.
- *
- * When the raw error page is thp tail page, hpage points to the raw
- * page after thp split.
*/
- if (!hwpoison_user_mappings(p, pfn, flags, &hpage)) {
+ if (!hwpoison_user_mappings(p, pfn, flags, &p)) {
action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
res = -EBUSY;
goto out;
@@ -1637,9 +1665,9 @@ int unpoison_memory(unsigned long pfn)
}
unlock_page(page);
- put_hwpoison_page(page);
+ put_page(page);
if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
- put_hwpoison_page(page);
+ put_page(page);
return 0;
}
@@ -1679,6 +1707,9 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
} else if (is_free_buddy_page(p)) {
pr_info("%s: %#lx free buddy page\n", __func__, pfn);
ret = 0;
+ } else if (page_count(p)) {
+ /* raced with allocation */
+ ret = -EBUSY;
} else {
pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
__func__, pfn, p->flags);
@@ -1695,12 +1726,15 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
{
int ret = __get_any_page(page, pfn, flags);
+ if (ret == -EBUSY)
+ ret = __get_any_page(page, pfn, flags);
+
if (ret == 1 && !PageHuge(page) &&
!PageLRU(page) && !__PageMovable(page)) {
/*
* Try to free it.
*/
- put_hwpoison_page(page);
+ put_page(page);
shake_page(page, 1);
/*
@@ -1709,7 +1743,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
ret = __get_any_page(page, pfn, 0);
if (ret == 1 && !PageLRU(page)) {
/* Drop page reference which is from __get_any_page() */
- put_hwpoison_page(page);
+ put_page(page);
pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n",
pfn, page->flags, &page->flags);
return -EIO;
@@ -1718,69 +1752,51 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
return ret;
}
-static int soft_offline_huge_page(struct page *page, int flags)
+static bool isolate_page(struct page *page, struct list_head *pagelist)
{
- int ret;
- unsigned long pfn = page_to_pfn(page);
- struct page *hpage = compound_head(page);
- LIST_HEAD(pagelist);
+ bool isolated = false;
+ bool lru = PageLRU(page);
- /*
- * This double-check of PageHWPoison is to avoid the race with
- * memory_failure(). See also comment in __soft_offline_page().
- */
- lock_page(hpage);
- if (PageHWPoison(hpage)) {
- unlock_page(hpage);
- put_hwpoison_page(hpage);
- pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
- return -EBUSY;
- }
- unlock_page(hpage);
-
- ret = isolate_huge_page(hpage, &pagelist);
- /*
- * get_any_page() and isolate_huge_page() takes a refcount each,
- * so need to drop one here.
- */
- put_hwpoison_page(hpage);
- if (!ret) {
- pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
- return -EBUSY;
- }
-
- ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
- MIGRATE_SYNC, MR_MEMORY_FAILURE);
- if (ret) {
- pr_info("soft offline: %#lx: hugepage migration failed %d, type %lx (%pGp)\n",
- pfn, ret, page->flags, &page->flags);
- if (!list_empty(&pagelist))
- putback_movable_pages(&pagelist);
- if (ret > 0)
- ret = -EIO;
+ if (PageHuge(page)) {
+ isolated = isolate_huge_page(page, pagelist);
} else {
- /*
- * We set PG_hwpoison only when the migration source hugepage
- * was successfully dissolved, because otherwise hwpoisoned
- * hugepage remains on free hugepage list, then userspace will
- * find it as SIGBUS by allocation failure. That's not expected
- * in soft-offlining.
- */
- ret = dissolve_free_huge_page(page);
- if (!ret) {
- if (set_hwpoison_free_buddy_page(page))
- num_poisoned_pages_inc();
- else
- ret = -EBUSY;
- }
+ if (lru)
+ isolated = !isolate_lru_page(page);
+ else
+ isolated = !isolate_movable_page(page, ISOLATE_UNEVICTABLE);
+
+ if (isolated)
+ list_add(&page->lru, pagelist);
}
- return ret;
+
+ if (isolated && lru)
+ inc_node_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_lru(page));
+
+ /*
+ * If we succeed to isolate the page, we grabbed another refcount on
+ * the page, so we can safely drop the one we got from get_any_pages().
+ * If we failed to isolate the page, it means that we cannot go further
+ * and we will return an error, so drop the reference we got from
+ * get_any_pages() as well.
+ */
+ put_page(page);
+ return isolated;
}
-static int __soft_offline_page(struct page *page, int flags)
+/*
+ * __soft_offline_page handles hugetlb-pages and non-hugetlb pages.
+ * If the page is a non-dirty unmapped page-cache page, it simply invalidates.
+ * If the page is mapped, it migrates the contents over.
+ */
+static int __soft_offline_page(struct page *page)
{
- int ret;
+ int ret = 0;
unsigned long pfn = page_to_pfn(page);
+ struct page *hpage = compound_head(page);
+ char const *msg_page[] = {"page", "hugepage"};
+ bool huge = PageHuge(page);
+ LIST_HEAD(pagelist);
/*
* Check PageHWPoison again inside page lock because PageHWPoison
@@ -1789,121 +1805,75 @@ static int __soft_offline_page(struct page *page, int flags)
* so there's no race between soft_offline_page() and memory_failure().
*/
lock_page(page);
- wait_on_page_writeback(page);
+ if (!PageHuge(page))
+ wait_on_page_writeback(page);
if (PageHWPoison(page)) {
unlock_page(page);
- put_hwpoison_page(page);
+ put_page(page);
pr_info("soft offline: %#lx page already poisoned\n", pfn);
- return -EBUSY;
+ return 0;
}
- /*
- * Try to invalidate first. This should work for
- * non dirty unmapped page cache pages.
- */
- ret = invalidate_inode_page(page);
+
+ if (!PageHuge(page))
+ /*
+ * Try to invalidate first. This should work for
+ * non dirty unmapped page cache pages.
+ */
+ ret = invalidate_inode_page(page);
unlock_page(page);
+
/*
* RED-PEN would be better to keep it isolated here, but we
* would need to fix isolation locking first.
*/
- if (ret == 1) {
- put_hwpoison_page(page);
+ if (ret) {
pr_info("soft_offline: %#lx: invalidated\n", pfn);
- SetPageHWPoison(page);
- num_poisoned_pages_inc();
+ page_handle_poison(page, false, true);
return 0;
}
- /*
- * Simple invalidation didn't work.
- * Try to migrate to a new page instead. migrate.c
- * handles a large number of cases for us.
- */
- if (PageLRU(page))
- ret = isolate_lru_page(page);
- else
- ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
- /*
- * Drop page reference which is came from get_any_page()
- * successful isolate_lru_page() already took another one.
- */
- put_hwpoison_page(page);
- if (!ret) {
- LIST_HEAD(pagelist);
- /*
- * After isolated lru page, the PageLRU will be cleared,
- * so use !__PageMovable instead for LRU page's mapping
- * cannot have PAGE_MAPPING_MOVABLE.
- */
- if (!__PageMovable(page))
- inc_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_lru(page));
- list_add(&page->lru, &pagelist);
+ if (isolate_page(hpage, &pagelist)) {
ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
MIGRATE_SYNC, MR_MEMORY_FAILURE);
- if (ret) {
+ if (!ret) {
+ bool release = !huge;
+
+ if (!page_handle_poison(page, huge, release))
+ ret = -EBUSY;
+ } else {
if (!list_empty(&pagelist))
putback_movable_pages(&pagelist);
- pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n",
- pfn, ret, page->flags, &page->flags);
+ pr_info("soft offline: %#lx: %s migration failed %d, type %lx (%pGp)\n",
+ pfn, msg_page[huge], ret, page->flags, &page->flags);
if (ret > 0)
ret = -EIO;
}
} else {
- pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx (%pGp)\n",
- pfn, ret, page_count(page), page->flags, &page->flags);
+ pr_info("soft offline: %#lx: %s isolation failed: %d, page count %d, type %lx (%pGp)\n",
+ pfn, msg_page[huge], ret, page_count(page), page->flags, &page->flags);
+ ret = -EBUSY;
}
return ret;
}
-static int soft_offline_in_use_page(struct page *page, int flags)
+static int soft_offline_in_use_page(struct page *page)
{
- int ret;
- int mt;
struct page *hpage = compound_head(page);
- if (!PageHuge(page) && PageTransHuge(hpage)) {
- lock_page(page);
- if (!PageAnon(page) || unlikely(split_huge_page(page))) {
- unlock_page(page);
- if (!PageAnon(page))
- pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
- else
- pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
- put_hwpoison_page(page);
+ if (!PageHuge(page) && PageTransHuge(hpage))
+ if (try_to_split_thp_page(page, "soft offline") < 0)
return -EBUSY;
- }
- unlock_page(page);
- }
-
- /*
- * Setting MIGRATE_ISOLATE here ensures that the page will be linked
- * to free list immediately (not via pcplist) when released after
- * successful page migration. Otherwise we can't guarantee that the
- * page is really free after put_page() returns, so
- * set_hwpoison_free_buddy_page() highly likely fails.
- */
- mt = get_pageblock_migratetype(page);
- set_pageblock_migratetype(page, MIGRATE_ISOLATE);
- if (PageHuge(page))
- ret = soft_offline_huge_page(page, flags);
- else
- ret = __soft_offline_page(page, flags);
- set_pageblock_migratetype(page, mt);
- return ret;
+ return __soft_offline_page(page);
}
static int soft_offline_free_page(struct page *page)
{
- int rc = dissolve_free_huge_page(page);
+ int rc = 0;
- if (!rc) {
- if (set_hwpoison_free_buddy_page(page))
- num_poisoned_pages_inc();
- else
- rc = -EBUSY;
- }
+ if (!page_handle_poison(page, true, false))
+ rc = -EBUSY;
+
return rc;
}
@@ -1933,6 +1903,7 @@ int soft_offline_page(unsigned long pfn, int flags)
{
int ret;
struct page *page;
+ bool try_again = true;
if (!pfn_valid(pfn))
return -ENXIO;
@@ -1944,18 +1915,22 @@ int soft_offline_page(unsigned long pfn, int flags)
if (PageHWPoison(page)) {
pr_info("soft offline: %#lx page already poisoned\n", pfn);
if (flags & MF_COUNT_INCREASED)
- put_hwpoison_page(page);
- return -EBUSY;
+ put_page(page);
+ return 0;
}
+retry:
get_online_mems();
ret = get_any_page(page, pfn, flags);
put_online_mems();
if (ret > 0)
- ret = soft_offline_in_use_page(page, flags);
+ ret = soft_offline_in_use_page(page);
else if (ret == 0)
- ret = soft_offline_free_page(page);
+ if (soft_offline_free_page(page) && try_again) {
+ try_again = false;
+ goto retry;
+ }
return ret;
}
diff --git a/mm/memory.c b/mm/memory.c
index 2afb01e..589afe4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3709,13 +3709,14 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
pmd_t entry;
int i;
- vm_fault_t ret;
+ vm_fault_t ret = VM_FAULT_FALLBACK;
if (!transhuge_vma_suitable(vma, haddr))
- return VM_FAULT_FALLBACK;
+ return ret;
- ret = VM_FAULT_FALLBACK;
page = compound_head(page);
+ if (compound_order(page) != HPAGE_PMD_ORDER)
+ return ret;
/*
* Archs like ppc64 need additonal space to store information
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 8e9e2d4..6f20357 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -105,7 +105,7 @@ static struct resource *register_memory_resource(u64 start, u64 size,
unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
if (strcmp(resource_name, "System RAM"))
- flags |= IORESOURCE_MEM_DRIVER_MANAGED;
+ flags |= IORESOURCE_SYSRAM_DRIVER_MANAGED;
/*
* Make sure value parsed from 'mem=' only restricts memory adding
@@ -625,31 +625,22 @@ void generic_online_page(struct page *page, unsigned int order)
}
EXPORT_SYMBOL_GPL(generic_online_page);
-static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
- void *arg)
+static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
{
const unsigned long end_pfn = start_pfn + nr_pages;
unsigned long pfn;
- int order;
/*
- * Online the pages. The callback might decide to keep some pages
- * PG_reserved (to add them to the buddy later), but we still account
- * them as being online/belonging to this zone ("present").
+ * Online the pages in MAX_ORDER - 1 aligned chunks. The callback might
+ * decide to not expose all pages to the buddy (e.g., expose them
+ * later). We account all pages as being online and belonging to this
+ * zone ("present").
*/
- for (pfn = start_pfn; pfn < end_pfn; pfn += 1ul << order) {
- order = min(MAX_ORDER - 1, get_order(PFN_PHYS(end_pfn - pfn)));
- /* __free_pages_core() wants pfns to be aligned to the order */
- if (WARN_ON_ONCE(!IS_ALIGNED(pfn, 1ul << order)))
- order = 0;
- (*online_page_callback)(pfn_to_page(pfn), order);
- }
+ for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES)
+ (*online_page_callback)(pfn_to_page(pfn), MAX_ORDER - 1);
/* mark all involved sections as online */
online_mem_sections(start_pfn, end_pfn);
-
- *(unsigned long *)arg += nr_pages;
- return 0;
}
/* check which state of node_states will be changed when online memory */
@@ -710,9 +701,14 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon
* Associate the pfn range with the given zone, initializing the memmaps
* and resizing the pgdat/zone data to span the added pages. After this
* call, all affected pages are PG_reserved.
+ *
+ * All aligned pageblocks are initialized to the specified migratetype
+ * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
+ * zone stats (e.g., nr_isolate_pageblock) are touched.
*/
void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
- unsigned long nr_pages, struct vmem_altmap *altmap)
+ unsigned long nr_pages,
+ struct vmem_altmap *altmap, int migratetype)
{
struct pglist_data *pgdat = zone->zone_pgdat;
int nid = pgdat->node_id;
@@ -737,7 +733,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
* are reserved so nobody should be touching them so we should be safe
*/
memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn,
- MEMINIT_HOTPLUG, altmap);
+ MEMINIT_HOTPLUG, altmap, migratetype);
set_zone_contiguous(zone);
}
@@ -803,17 +799,21 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
int online_type, int nid)
{
unsigned long flags;
- unsigned long onlined_pages = 0;
struct zone *zone;
int need_zonelists_rebuild = 0;
int ret;
struct memory_notify arg;
+ /* We can only online full sections (e.g., SECTION_IS_ONLINE) */
+ if (WARN_ON_ONCE(!nr_pages ||
+ !IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION)))
+ return -EINVAL;
+
mem_hotplug_begin();
/* associate pfn range with the zone */
zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
- move_pfn_range_to_zone(zone, pfn, nr_pages, NULL);
+ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
arg.start_pfn = pfn;
arg.nr_pages = nr_pages;
@@ -825,6 +825,14 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
goto failed_addition;
/*
+ * Fixup the number of isolated pageblocks before marking the sections
+ * onlining, such that undo_isolate_page_range() works correctly.
+ */
+ spin_lock_irqsave(&zone->lock, flags);
+ zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages;
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ /*
* If this zone is not populated, then it is not in zonelist.
* This means the page allocator ignores this zone.
* So, zonelist must be updated after online.
@@ -834,36 +842,29 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
setup_zone_pageset(zone);
}
- ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
- online_pages_range);
- if (ret) {
- /* not a single memory resource was applicable */
- if (need_zonelists_rebuild)
- zone_pcp_reset(zone);
- goto failed_addition;
- }
-
- zone->present_pages += onlined_pages;
+ online_pages_range(pfn, nr_pages);
+ zone->present_pages += nr_pages;
pgdat_resize_lock(zone->zone_pgdat, &flags);
- zone->zone_pgdat->node_present_pages += onlined_pages;
+ zone->zone_pgdat->node_present_pages += nr_pages;
pgdat_resize_unlock(zone->zone_pgdat, &flags);
- /*
- * When exposing larger, physically contiguous memory areas to the
- * buddy, shuffling in the buddy (when freeing onlined pages, putting
- * them either to the head or the tail of the freelist) is only helpful
- * for maintaining the shuffle, but not for creating the initial
- * shuffle. Shuffle the whole zone to make sure the just onlined pages
- * are properly distributed across the whole freelist.
- */
- shuffle_zone(zone);
-
node_states_set_node(nid, &arg);
if (need_zonelists_rebuild)
build_all_zonelists(NULL);
zone_pcp_update(zone);
+ /* Basic onlining is complete, allow allocation of onlined pages. */
+ undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
+
+ /*
+ * Freshly onlined pages aren't shuffled (e.g., all pages are placed to
+ * the tail of the freelist when undoing isolation). Shuffle the whole
+ * zone to make sure the just onlined pages are properly distributed
+ * across the whole freelist - to create an initial shuffle.
+ */
+ shuffle_zone(zone);
+
init_per_zone_wmark_min();
kswapd_run(nid);
@@ -1035,7 +1036,7 @@ static int online_memory_block(struct memory_block *mem, void *arg)
*
* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
*/
-int __ref add_memory_resource(int nid, struct resource *res)
+int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
{
struct mhp_params params = { .pgprot = PAGE_KERNEL };
u64 start, size;
@@ -1088,9 +1089,8 @@ int __ref add_memory_resource(int nid, struct resource *res)
}
/* link memory sections under this node.*/
- ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
- MEMINIT_HOTPLUG);
- BUG_ON(ret);
+ link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
+ MEMINIT_HOTPLUG);
/* create new memmap entry */
if (!strcmp(res->name, "System RAM"))
@@ -1099,6 +1099,13 @@ int __ref add_memory_resource(int nid, struct resource *res)
/* device_online() will take the lock when calling online_pages() */
mem_hotplug_done();
+ /*
+ * In case we're allowed to merge the resource, flag it and trigger
+ * merging now that adding succeeded.
+ */
+ if (mhp_flags & MEMHP_MERGE_RESOURCE)
+ merge_system_ram_resource(res);
+
/* online pages if requested */
if (memhp_default_online_type != MMOP_OFFLINE)
walk_memory_blocks(start, size, NULL, online_memory_block);
@@ -1115,7 +1122,7 @@ int __ref add_memory_resource(int nid, struct resource *res)
}
/* requires device_hotplug_lock, see add_memory_resource() */
-int __ref __add_memory(int nid, u64 start, u64 size)
+int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
{
struct resource *res;
int ret;
@@ -1124,18 +1131,18 @@ int __ref __add_memory(int nid, u64 start, u64 size)
if (IS_ERR(res))
return PTR_ERR(res);
- ret = add_memory_resource(nid, res);
+ ret = add_memory_resource(nid, res, mhp_flags);
if (ret < 0)
release_memory_resource(res);
return ret;
}
-int add_memory(int nid, u64 start, u64 size)
+int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
{
int rc;
lock_device_hotplug();
- rc = __add_memory(nid, start, size);
+ rc = __add_memory(nid, start, size, mhp_flags);
unlock_device_hotplug();
return rc;
@@ -1157,14 +1164,14 @@ EXPORT_SYMBOL_GPL(add_memory);
*
* For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided
* memory map") are created. Also, the created memory resource is flagged
- * with IORESOURCE_MEM_DRIVER_MANAGED, so in-kernel users can special-case
+ * with IORESOURCE_SYSRAM_DRIVER_MANAGED, so in-kernel users can special-case
* this memory as well (esp., not place kexec images onto it).
*
* The resource_name (visible via /proc/iomem) has to have the format
* "System RAM ($DRIVER)".
*/
int add_memory_driver_managed(int nid, u64 start, u64 size,
- const char *resource_name)
+ const char *resource_name, mhp_t mhp_flags)
{
struct resource *res;
int rc;
@@ -1182,7 +1189,7 @@ int add_memory_driver_managed(int nid, u64 start, u64 size,
goto out_unlock;
}
- rc = add_memory_resource(nid, res);
+ rc = add_memory_resource(nid, res, mhp_flags);
if (rc < 0)
release_memory_resource(res);
@@ -1379,28 +1386,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
return ret;
}
-/* Mark all sections offline and remove all free pages from the buddy. */
-static int
-offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
- void *data)
-{
- unsigned long *offlined_pages = (unsigned long *)data;
-
- *offlined_pages += __offline_isolated_pages(start, start + nr_pages);
- return 0;
-}
-
-/*
- * Check all pages in range, recorded as memory resource, are isolated.
- */
-static int
-check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
- void *data)
-{
- return test_pages_isolated(start_pfn, start_pfn + nr_pages,
- MEMORY_OFFLINE);
-}
-
static int __init cmdline_parse_movable_node(char *p)
{
movable_node_enabled = true;
@@ -1484,17 +1469,21 @@ static int count_system_ram_pages_cb(unsigned long start_pfn,
return 0;
}
-static int __ref __offline_pages(unsigned long start_pfn,
- unsigned long end_pfn)
+int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
- unsigned long pfn, nr_pages = 0;
- unsigned long offlined_pages = 0;
- int ret, node, nr_isolate_pageblock;
+ const unsigned long end_pfn = start_pfn + nr_pages;
+ unsigned long pfn, system_ram_pages = 0;
unsigned long flags;
struct zone *zone;
struct memory_notify arg;
+ int ret, node;
char *reason;
+ /* We can only offline full sections (e.g., SECTION_IS_ONLINE) */
+ if (WARN_ON_ONCE(!nr_pages ||
+ !IS_ALIGNED(start_pfn | nr_pages, PAGES_PER_SECTION)))
+ return -EINVAL;
+
mem_hotplug_begin();
/*
@@ -1505,9 +1494,9 @@ static int __ref __offline_pages(unsigned long start_pfn,
* memory holes PG_reserved, don't need pfn_valid() checks, and can
* avoid using walk_system_ram_range() later.
*/
- walk_system_ram_range(start_pfn, end_pfn - start_pfn, &nr_pages,
+ walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages,
count_system_ram_pages_cb);
- if (nr_pages != end_pfn - start_pfn) {
+ if (system_ram_pages != nr_pages) {
ret = -EINVAL;
reason = "memory holes";
goto failed_removal;
@@ -1527,11 +1516,10 @@ static int __ref __offline_pages(unsigned long start_pfn,
ret = start_isolate_page_range(start_pfn, end_pfn,
MIGRATE_MOVABLE,
MEMORY_OFFLINE | REPORT_FAILURE);
- if (ret < 0) {
+ if (ret) {
reason = "failure to isolate range";
goto failed_removal;
}
- nr_isolate_pageblock = ret;
arg.start_pfn = start_pfn;
arg.nr_pages = nr_pages;
@@ -1581,9 +1569,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
reason = "failure to dissolve huge pages";
goto failed_removal_isolated;
}
- /* check again */
- ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
- NULL, check_pages_isolated_cb);
+
/*
* per-cpu pages are drained in start_isolate_page_range, but if
* there are still pages that are not free, make sure that we
@@ -1596,30 +1582,30 @@ static int __ref __offline_pages(unsigned long start_pfn,
* because has_unmovable_pages explicitly checks for
* PageBuddy on freed pages on other zones.
*/
+ ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE);
if (ret)
drain_all_pages(zone);
} while (ret);
- /* Ok, all of our target is isolated.
- We cannot do rollback at this point. */
- walk_system_ram_range(start_pfn, end_pfn - start_pfn,
- &offlined_pages, offline_isolated_pages_cb);
- pr_info("Offlined Pages %ld\n", offlined_pages);
+ /* Mark all sections offline and remove free pages from the buddy. */
+ __offline_isolated_pages(start_pfn, end_pfn);
+ pr_info("Offlined Pages %ld\n", nr_pages);
+
/*
- * Onlining will reset pagetype flags and makes migrate type
- * MOVABLE, so just need to decrease the number of isolated
- * pageblocks zone counter here.
+ * The memory sections are marked offline, and the pageblock flags
+ * effectively stale; nobody should be touching them. Fixup the number
+ * of isolated pageblocks, memory onlining will properly revert this.
*/
spin_lock_irqsave(&zone->lock, flags);
- zone->nr_isolate_pageblock -= nr_isolate_pageblock;
+ zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
spin_unlock_irqrestore(&zone->lock, flags);
/* removal success */
- adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
- zone->present_pages -= offlined_pages;
+ adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
+ zone->present_pages -= nr_pages;
pgdat_resize_lock(zone->zone_pgdat, &flags);
- zone->zone_pgdat->node_present_pages -= offlined_pages;
+ zone->zone_pgdat->node_present_pages -= nr_pages;
pgdat_resize_unlock(zone->zone_pgdat, &flags);
init_per_zone_wmark_min();
@@ -1656,11 +1642,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
return ret;
}
-int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
-{
- return __offline_pages(start_pfn, start_pfn + nr_pages);
-}
-
static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
int ret = !is_memblock_offlined(mem);
@@ -1749,26 +1730,6 @@ void try_offline_node(int nid)
}
EXPORT_SYMBOL(try_offline_node);
-static void __release_memory_resource(resource_size_t start,
- resource_size_t size)
-{
- int ret;
-
- /*
- * When removing memory in the same granularity as it was added,
- * this function never fails. It might only fail if resources
- * have to be adjusted or split. We'll ignore the error, as
- * removing of memory cannot fail.
- */
- ret = release_mem_region_adjustable(&iomem_resource, start, size);
- if (ret) {
- resource_size_t endres = start + size - 1;
-
- pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
- &start, &endres, ret);
- }
-}
-
static int __ref try_remove_memory(int nid, u64 start, u64 size)
{
int rc = 0;
@@ -1802,7 +1763,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
memblock_remove(start, size);
}
- __release_memory_resource(start, size);
+ release_mem_region_adjustable(start, size);
try_offline_node(nid);
diff --git a/mm/memremap.c b/mm/memremap.c
index 19808345..73a206d 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -266,7 +266,8 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
move_pfn_range_to_zone(zone, PHYS_PFN(range->start),
- PHYS_PFN(range_len(range)), params->altmap);
+ PHYS_PFN(range_len(range)), params->altmap,
+ MIGRATE_MOVABLE);
}
mem_hotplug_done();
diff --git a/mm/migrate.c b/mm/migrate.c
index f94d7c7..4cf1af8 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1223,16 +1223,11 @@ static int unmap_and_move(new_page_t get_new_page,
* we want to retry.
*/
if (rc == MIGRATEPAGE_SUCCESS) {
- put_page(page);
- if (reason == MR_MEMORY_FAILURE) {
+ if (reason != MR_MEMORY_FAILURE)
/*
- * Set PG_HWPoison on just freed page
- * intentionally. Although it's rather weird,
- * it's how HWPoison flag works at the moment.
+ * We release the page in page_handle_poison.
*/
- if (set_hwpoison_free_buddy_page(page))
- num_poisoned_pages_inc();
- }
+ put_page(page);
} else {
if (rc != -EAGAIN) {
if (likely(!__PageMovable(page))) {
diff --git a/mm/mmap.c b/mm/mmap.c
index 67d11ad..ebb92f5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -619,7 +619,7 @@ static void __vma_link_file(struct vm_area_struct *vma)
struct address_space *mapping = file->f_mapping;
if (vma->vm_flags & VM_DENYWRITE)
- atomic_dec(&file_inode(file)->i_writecount);
+ put_write_access(file_inode(file));
if (vma->vm_flags & VM_SHARED)
mapping_allow_writable(mapping);
@@ -2562,7 +2562,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
if (vma && (vma->vm_start <= addr))
return vma;
/* don't alter vm_end if the coredump is running */
- if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr))
+ if (!prev || expand_stack(prev, addr))
return NULL;
if (prev->vm_flags & VM_LOCKED)
populate_vma_page_range(prev, addr, prev->vm_end, NULL);
@@ -2588,9 +2588,6 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
return vma;
if (!(vma->vm_flags & VM_GROWSDOWN))
return NULL;
- /* don't alter vm_start if the coredump is running */
- if (!mmget_still_valid(mm))
- return NULL;
start = vma->vm_start;
if (expand_stack(vma, addr))
return NULL;
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 4fc9181..5654dd1 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -913,7 +913,7 @@ static int __mmu_interval_notifier_insert(
return -EOVERFLOW;
/* Must call with a mmget() held */
- if (WARN_ON(atomic_read(&mm->mm_count) <= 0))
+ if (WARN_ON(atomic_read(&mm->mm_users) <= 0))
return -EINVAL;
/* pairs with mmdrop in mmu_interval_notifier_remove() */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 358d6f2..7709f0e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2849,6 +2849,7 @@ EXPORT_SYMBOL_GPL(wait_on_page_writeback);
*/
void wait_for_stable_page(struct page *page)
{
+ page = thp_head(page);
if (page->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES)
wait_on_page_writeback(page);
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e0ff3a8..23f5066 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -78,6 +78,34 @@
#include "shuffle.h"
#include "page_reporting.h"
+/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
+typedef int __bitwise fpi_t;
+
+/* No special request */
+#define FPI_NONE ((__force fpi_t)0)
+
+/*
+ * Skip free page reporting notification for the (possibly merged) page.
+ * This does not hinder free page reporting from grabbing the page,
+ * reporting it and marking it "reported" - it only skips notifying
+ * the free page reporting infrastructure about a newly freed page. For
+ * example, used when temporarily pulling a page from a freelist and
+ * putting it back unmodified.
+ */
+#define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0))
+
+/*
+ * Place the (possibly merged) page to the tail of the freelist. Will ignore
+ * page shuffling (relevant code - e.g., memory onlining - is expected to
+ * shuffle the whole zone).
+ *
+ * Note: No code should rely on this flag for correctness - it's purely
+ * to allow for optimizations when handing back either fresh pages
+ * (memory onlining) or untouched pages (page isolation, free page
+ * reporting).
+ */
+#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
+
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_FRACTION (8)
@@ -247,7 +275,8 @@ bool pm_suspended_storage(void)
unsigned int pageblock_order __read_mostly;
#endif
-static void __free_pages_ok(struct page *page, unsigned int order);
+static void __free_pages_ok(struct page *page, unsigned int order,
+ fpi_t fpi_flags);
/*
* results with 256, 32 in the lowmem_reserve sysctl:
@@ -659,7 +688,7 @@ static void bad_page(struct page *page, const char *reason)
void free_compound_page(struct page *page)
{
mem_cgroup_uncharge(page);
- __free_pages_ok(page, compound_order(page));
+ __free_pages_ok(page, compound_order(page), FPI_NONE);
}
void prep_compound_page(struct page *page, unsigned int order)
@@ -763,7 +792,7 @@ static inline void clear_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype) {}
#endif
-static inline void set_page_order(struct page *page, unsigned int order)
+static inline void set_buddy_order(struct page *page, unsigned int order)
{
set_page_private(page, order);
__SetPageBuddy(page);
@@ -788,7 +817,7 @@ static inline bool page_is_buddy(struct page *page, struct page *buddy,
if (!page_is_guard(buddy) && !PageBuddy(buddy))
return false;
- if (page_order(buddy) != order)
+ if (buddy_order(buddy) != order)
return false;
/*
@@ -873,13 +902,17 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
area->nr_free++;
}
-/* Used for pages which are on another list */
+/*
+ * Used for pages which are on another list. Move the pages to the tail
+ * of the list - so the moved pages won't immediately be considered for
+ * allocation again (e.g., optimization for memory onlining).
+ */
static inline void move_to_free_list(struct page *page, struct zone *zone,
unsigned int order, int migratetype)
{
struct free_area *area = &zone->free_area[order];
- list_move(&page->lru, &area->free_list[migratetype]);
+ list_move_tail(&page->lru, &area->free_list[migratetype]);
}
static inline void del_page_from_free_list(struct page *page, struct zone *zone,
@@ -952,7 +985,7 @@ buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
static inline void __free_one_page(struct page *page,
unsigned long pfn,
struct zone *zone, unsigned int order,
- int migratetype, bool report)
+ int migratetype, fpi_t fpi_flags)
{
struct capture_control *capc = task_capc(zone);
unsigned long buddy_pfn;
@@ -1026,9 +1059,11 @@ static inline void __free_one_page(struct page *page,
}
done_merging:
- set_page_order(page, order);
+ set_buddy_order(page, order);
- if (is_shuffle_order(order))
+ if (fpi_flags & FPI_TO_TAIL)
+ to_tail = true;
+ else if (is_shuffle_order(order))
to_tail = shuffle_pick_tail();
else
to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
@@ -1039,7 +1074,7 @@ static inline void __free_one_page(struct page *page,
add_to_free_list(page, zone, order, migratetype);
/* Notify page reporting subsystem of freed page */
- if (report)
+ if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
page_reporting_notify_free(order);
}
@@ -1174,6 +1209,17 @@ static __always_inline bool free_pages_prepare(struct page *page,
trace_mm_page_free(page, order);
+ if (unlikely(PageHWPoison(page)) && !order) {
+ /*
+ * Do not let hwpoison pages hit pcplists/buddy
+ * Untie memcg state and reset page's owner
+ */
+ if (memcg_kmem_enabled() && PageKmemcg(page))
+ __memcg_kmem_uncharge_page(page, order);
+ reset_page_owner(page, order);
+ return false;
+ }
+
/*
* Check tail pages before head page information is cleared to
* avoid checking PageCompound for order-0 pages.
@@ -1369,7 +1415,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
if (unlikely(isolated_pageblocks))
mt = get_pageblock_migratetype(page);
- __free_one_page(page, page_to_pfn(page), zone, 0, mt, true);
+ __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE);
trace_mm_page_pcpu_drain(page, 0, mt);
}
spin_unlock(&zone->lock);
@@ -1378,14 +1424,14 @@ static void free_pcppages_bulk(struct zone *zone, int count,
static void free_one_page(struct zone *zone,
struct page *page, unsigned long pfn,
unsigned int order,
- int migratetype)
+ int migratetype, fpi_t fpi_flags)
{
spin_lock(&zone->lock);
if (unlikely(has_isolate_pageblock(zone) ||
is_migrate_isolate(migratetype))) {
migratetype = get_pfnblock_migratetype(page, pfn);
}
- __free_one_page(page, pfn, zone, order, migratetype, true);
+ __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
spin_unlock(&zone->lock);
}
@@ -1463,7 +1509,8 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
}
}
-static void __free_pages_ok(struct page *page, unsigned int order)
+static void __free_pages_ok(struct page *page, unsigned int order,
+ fpi_t fpi_flags)
{
unsigned long flags;
int migratetype;
@@ -1475,7 +1522,8 @@ static void __free_pages_ok(struct page *page, unsigned int order)
migratetype = get_pfnblock_migratetype(page, pfn);
local_irq_save(flags);
__count_vm_events(PGFREE, 1 << order);
- free_one_page(page_zone(page), page, pfn, order, migratetype);
+ free_one_page(page_zone(page), page, pfn, order, migratetype,
+ fpi_flags);
local_irq_restore(flags);
}
@@ -1485,6 +1533,11 @@ void __free_pages_core(struct page *page, unsigned int order)
struct page *p = page;
unsigned int loop;
+ /*
+ * When initializing the memmap, __init_single_page() sets the refcount
+ * of all pages to 1 ("allocated"/"not free"). We have to set the
+ * refcount of all involved pages to 0.
+ */
prefetchw(p);
for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
prefetchw(p + 1);
@@ -1495,8 +1548,12 @@ void __free_pages_core(struct page *page, unsigned int order)
set_page_count(p, 0);
atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
- set_page_refcounted(page);
- __free_pages(page, order);
+
+ /*
+ * Bypass PCP and place fresh pages right to the tail, primarily
+ * relevant for memory onlining.
+ */
+ __free_pages_ok(page, order, FPI_TO_TAIL);
}
#ifdef CONFIG_NEED_MULTIPLE_NODES
@@ -2121,7 +2178,7 @@ static inline void expand(struct zone *zone, struct page *page,
continue;
add_to_free_list(&page[size], zone, high, migratetype);
- set_page_order(&page[size], high);
+ set_buddy_order(&page[size], high);
}
}
@@ -2299,7 +2356,7 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
#endif
/*
- * Move the free pages in a range to the free lists of the requested type.
+ * Move the free pages in a range to the freelist tail of the requested type.
* Note that start_page and end_pages are not aligned on a pageblock
* boundary. If alignment is required, use move_freepages_block()
*/
@@ -2335,7 +2392,7 @@ static int move_freepages(struct zone *zone,
VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
VM_BUG_ON_PAGE(page_zone(page) != zone, page);
- order = page_order(page);
+ order = buddy_order(page);
move_to_free_list(page, zone, order, migratetype);
page += 1 << order;
pages_moved += 1 << order;
@@ -2459,7 +2516,7 @@ static inline void boost_watermark(struct zone *zone)
static void steal_suitable_fallback(struct zone *zone, struct page *page,
unsigned int alloc_flags, int start_type, bool whole_block)
{
- unsigned int current_order = page_order(page);
+ unsigned int current_order = buddy_order(page);
int free_pages, movable_pages, alike_pages;
int old_block_type;
@@ -3123,7 +3180,8 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn)
*/
if (migratetype >= MIGRATE_PCPTYPES) {
if (unlikely(is_migrate_isolate(migratetype))) {
- free_one_page(zone, page, pfn, 0, migratetype);
+ free_one_page(zone, page, pfn, 0, migratetype,
+ FPI_NONE);
return;
}
migratetype = MIGRATE_MOVABLE;
@@ -3209,7 +3267,7 @@ void split_page(struct page *page, unsigned int order)
for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
- split_page_owner(page, order);
+ split_page_owner(page, 1 << order);
}
EXPORT_SYMBOL_GPL(split_page);
@@ -3278,7 +3336,8 @@ void __putback_isolated_page(struct page *page, unsigned int order, int mt)
lockdep_assert_held(&zone->lock);
/* Return isolated page to tail of freelist. */
- __free_one_page(page, page_to_pfn(page), zone, order, mt, false);
+ __free_one_page(page, page_to_pfn(page), zone, order, mt,
+ FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL);
}
/*
@@ -4945,7 +5004,7 @@ static inline void free_the_page(struct page *page, unsigned int order)
if (order == 0) /* Via pcp? */
free_unref_page(page);
else
- __free_pages_ok(page, order);
+ __free_pages_ok(page, order, FPI_NONE);
}
void __free_pages(struct page *page, unsigned int order)
@@ -5979,10 +6038,15 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
* Initially all pages are reserved - free ones are freed
* up by memblock_free_all() once the early boot process is
* done. Non-atomic initialization, single-pass.
+ *
+ * All aligned pageblocks are initialized to the specified migratetype
+ * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
+ * zone stats (e.g., nr_isolate_pageblock) are touched.
*/
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
- unsigned long start_pfn, enum meminit_context context,
- struct vmem_altmap *altmap)
+ unsigned long start_pfn,
+ enum meminit_context context,
+ struct vmem_altmap *altmap, int migratetype)
{
unsigned long pfn, end_pfn = start_pfn + size;
struct page *page;
@@ -6026,19 +6090,12 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
__SetPageReserved(page);
/*
- * Mark the block movable so that blocks are reserved for
- * movable at startup. This will force kernel allocations
- * to reserve their blocks rather than leaking throughout
- * the address space during boot when many long-lived
- * kernel allocations are made.
- *
- * bitmap is created for zone's valid pfn range. but memmap
- * can be created for invalid pages (for alignment)
- * check here not to call set_pageblock_migratetype() against
- * pfn out of zone.
+ * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
+ * such that unmovable allocations won't be scattered all
+ * over the place during system boot.
*/
- if (!(pfn & (pageblock_nr_pages - 1))) {
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
+ set_pageblock_migratetype(page, migratetype);
cond_resched();
}
pfn++;
@@ -6100,15 +6157,10 @@ void __ref memmap_init_zone_device(struct zone *zone,
* the address space during boot when many long-lived
* kernel allocations are made.
*
- * bitmap is created for zone's valid pfn range. but memmap
- * can be created for invalid pages (for alignment)
- * check here not to call set_pageblock_migratetype() against
- * pfn out of zone.
- *
* Please note that MEMINIT_HOTPLUG path doesn't clear memmap
* because this is done early in section_activate()
*/
- if (!(pfn & (pageblock_nr_pages - 1))) {
+ if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
cond_resched();
}
@@ -6143,7 +6195,7 @@ void __meminit __weak memmap_init(unsigned long size, int nid,
if (end_pfn > start_pfn) {
size = end_pfn - start_pfn;
memmap_init_zone(size, nid, zone, start_pfn,
- MEMINIT_EARLY, NULL);
+ MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
}
}
}
@@ -8292,7 +8344,7 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page,
*/
if (!page_ref_count(page)) {
if (PageBuddy(page))
- iter += (1 << page_order(page)) - 1;
+ iter += (1 << buddy_order(page)) - 1;
continue;
}
@@ -8457,7 +8509,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
ret = start_isolate_page_range(pfn_max_align_down(start),
pfn_max_align_up(end), migratetype, 0);
- if (ret < 0)
+ if (ret)
return ret;
/*
@@ -8505,7 +8557,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
}
if (outer_start != start) {
- order = page_order(pfn_to_page(outer_start));
+ order = buddy_order(pfn_to_page(outer_start));
/*
* outer_start page could be small order buddy page and
@@ -8693,35 +8745,21 @@ void zone_pcp_reset(struct zone *zone)
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
- * All pages in the range must be in a single zone and isolated
- * before calling this.
+ * All pages in the range must be in a single zone, must not contain holes,
+ * must span full sections, and must be isolated before calling this function.
*/
-unsigned long
-__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
+void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
+ unsigned long pfn = start_pfn;
struct page *page;
struct zone *zone;
unsigned int order;
- unsigned long pfn;
unsigned long flags;
- unsigned long offlined_pages = 0;
-
- /* find the first valid pfn */
- for (pfn = start_pfn; pfn < end_pfn; pfn++)
- if (pfn_valid(pfn))
- break;
- if (pfn == end_pfn)
- return offlined_pages;
offline_mem_sections(pfn, end_pfn);
zone = page_zone(pfn_to_page(pfn));
spin_lock_irqsave(&zone->lock, flags);
- pfn = start_pfn;
while (pfn < end_pfn) {
- if (!pfn_valid(pfn)) {
- pfn++;
- continue;
- }
page = pfn_to_page(pfn);
/*
* The HWPoisoned page may be not in buddy system, and
@@ -8729,7 +8767,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
*/
if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
pfn++;
- offlined_pages++;
continue;
}
/*
@@ -8740,20 +8777,16 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
BUG_ON(page_count(page));
BUG_ON(PageBuddy(page));
pfn++;
- offlined_pages++;
continue;
}
BUG_ON(page_count(page));
BUG_ON(!PageBuddy(page));
- order = page_order(page);
- offlined_pages += 1 << order;
+ order = buddy_order(page);
del_page_from_free_list(page, zone, order);
pfn += (1 << order);
}
spin_unlock_irqrestore(&zone->lock, flags);
-
- return offlined_pages;
}
#endif
@@ -8768,7 +8801,7 @@ bool is_free_buddy_page(struct page *page)
for (order = 0; order < MAX_ORDER; order++) {
struct page *page_head = page - (pfn & ((1 << order) - 1));
- if (PageBuddy(page_head) && page_order(page_head) >= order)
+ if (PageBuddy(page_head) && buddy_order(page_head) >= order)
break;
}
spin_unlock_irqrestore(&zone->lock, flags);
@@ -8778,30 +8811,70 @@ bool is_free_buddy_page(struct page *page)
#ifdef CONFIG_MEMORY_FAILURE
/*
- * Set PG_hwpoison flag if a given page is confirmed to be a free page. This
- * test is performed under the zone lock to prevent a race against page
- * allocation.
+ * Break down a higher-order page in sub-pages, and keep our target out of
+ * buddy allocator.
*/
-bool set_hwpoison_free_buddy_page(struct page *page)
+static void break_down_buddy_pages(struct zone *zone, struct page *page,
+ struct page *target, int low, int high,
+ int migratetype)
+{
+ unsigned long size = 1 << high;
+ struct page *current_buddy, *next_page;
+
+ while (high > low) {
+ high--;
+ size >>= 1;
+
+ if (target >= &page[size]) {
+ next_page = page + size;
+ current_buddy = page;
+ } else {
+ next_page = page;
+ current_buddy = page + size;
+ }
+
+ if (set_page_guard(zone, current_buddy, high, migratetype))
+ continue;
+
+ if (current_buddy != target) {
+ add_to_free_list(current_buddy, zone, high, migratetype);
+ set_buddy_order(current_buddy, high);
+ page = next_page;
+ }
+ }
+}
+
+/*
+ * Take a page that will be marked as poisoned off the buddy allocator.
+ */
+bool take_page_off_buddy(struct page *page)
{
struct zone *zone = page_zone(page);
unsigned long pfn = page_to_pfn(page);
unsigned long flags;
unsigned int order;
- bool hwpoisoned = false;
+ bool ret = false;
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
struct page *page_head = page - (pfn & ((1 << order) - 1));
+ int page_order = buddy_order(page_head);
- if (PageBuddy(page_head) && page_order(page_head) >= order) {
- if (!TestSetPageHWPoison(page))
- hwpoisoned = true;
+ if (PageBuddy(page_head) && page_order >= order) {
+ unsigned long pfn_head = page_to_pfn(page_head);
+ int migratetype = get_pfnblock_migratetype(page_head,
+ pfn_head);
+
+ del_page_from_free_list(page_head, zone, page_order);
+ break_down_buddy_pages(zone, page_head, page, 0,
+ page_order, migratetype);
+ ret = true;
break;
}
+ if (page_count(page_head) > 0)
+ break;
}
spin_unlock_irqrestore(&zone->lock, flags);
-
- return hwpoisoned;
+ return ret;
}
#endif
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index aa94afb..abbf422 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -88,7 +88,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
* these pages to be merged.
*/
if (PageBuddy(page)) {
- order = page_order(page);
+ order = buddy_order(page);
if (order >= pageblock_order) {
pfn = page_to_pfn(page);
buddy_pfn = __find_buddy_pfn(pfn, order);
@@ -106,6 +106,11 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
* If we isolate freepage with more than pageblock_order, there
* should be no freepage in the range, so we could avoid costly
* pageblock scanning for freepage moving.
+ *
+ * We didn't actually touch any of the isolated pages, so place them
+ * to the tail of the freelist. This is an optimization for memory
+ * onlining - just onlined memory won't immediately be considered for
+ * allocation.
*/
if (!isolated_page) {
nr_pages = move_freepages_block(zone, page, migratetype, NULL);
@@ -173,8 +178,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* (e.g. __offline_pages will need to call it after check for isolated range for
* a next retry).
*
- * Return: the number of isolated pageblocks on success and -EBUSY if any part
- * of range cannot be isolated.
+ * Return: 0 on success and -EBUSY if any part of range cannot be isolated.
*/
int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
unsigned migratetype, int flags)
@@ -182,7 +186,6 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
unsigned long pfn;
unsigned long undo_pfn;
struct page *page;
- int nr_isolate_pageblock = 0;
BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
@@ -196,10 +199,9 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
undo_pfn = pfn;
goto undo;
}
- nr_isolate_pageblock++;
}
}
- return nr_isolate_pageblock;
+ return 0;
undo:
for (pfn = start_pfn;
pfn < undo_pfn;
@@ -259,7 +261,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
* the correct MIGRATE_ISOLATE freelist. There is no
* simple way to verify that as VM_BUG_ON(), though.
*/
- pfn += 1 << page_order(page);
+ pfn += 1 << buddy_order(page);
else if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
/* A HWPoisoned page cannot be also PageBuddy */
pfn++;
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 36046150..b735a8e 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -204,7 +204,7 @@ void __set_page_owner_migrate_reason(struct page *page, int reason)
page_owner->last_migrate_reason = reason;
}
-void __split_page_owner(struct page *page, unsigned int order)
+void __split_page_owner(struct page *page, unsigned int nr)
{
int i;
struct page_ext *page_ext = lookup_page_ext(page);
@@ -213,7 +213,7 @@ void __split_page_owner(struct page *page, unsigned int order)
if (unlikely(!page_ext))
return;
- for (i = 0; i < (1 << order); i++) {
+ for (i = 0; i < nr; i++) {
page_owner = get_page_owner(page_ext);
page_owner->order = 0;
page_ext = page_ext_next(page_ext);
@@ -295,7 +295,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
if (PageBuddy(page)) {
unsigned long freepage_order;
- freepage_order = page_order_unsafe(page);
+ freepage_order = buddy_order_unsafe(page);
if (freepage_order < MAX_ORDER)
pfn += (1UL << freepage_order) - 1;
continue;
@@ -490,7 +490,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
page = pfn_to_page(pfn);
if (PageBuddy(page)) {
- unsigned long freepage_order = page_order_unsafe(page);
+ unsigned long freepage_order = buddy_order_unsafe(page);
if (freepage_order < MAX_ORDER)
pfn += (1UL << freepage_order) - 1;
@@ -584,7 +584,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
* heavy lock contention.
*/
if (PageBuddy(page)) {
- unsigned long order = page_order_unsafe(page);
+ unsigned long order = buddy_order_unsafe(page);
if (order > 0 && order < MAX_ORDER)
pfn += (1UL << order) - 1;
diff --git a/mm/page_poison.c b/mm/page_poison.c
index 34b9181..ae0482c 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -8,13 +8,23 @@
#include <linux/ratelimit.h>
#include <linux/kasan.h>
-static bool want_page_poisoning __read_mostly;
+static DEFINE_STATIC_KEY_FALSE_RO(want_page_poisoning);
static int __init early_page_poison_param(char *buf)
{
- if (!buf)
- return -EINVAL;
- return strtobool(buf, &want_page_poisoning);
+ int ret;
+ bool tmp;
+
+ ret = strtobool(buf, &tmp);
+ if (ret)
+ return ret;
+
+ if (tmp)
+ static_branch_enable(&want_page_poisoning);
+ else
+ static_branch_disable(&want_page_poisoning);
+
+ return 0;
}
early_param("page_poison", early_page_poison_param);
@@ -31,7 +41,7 @@ bool page_poisoning_enabled(void)
* Page poisoning is debug page alloc for some arches. If
* either of those options are enabled, enable poisoning.
*/
- return (want_page_poisoning ||
+ return (static_branch_unlikely(&want_page_poisoning) ||
(!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
debug_pagealloc_enabled()));
}
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index 3bbd471..cd8e13d 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -92,7 +92,7 @@ page_reporting_drain(struct page_reporting_dev_info *prdev,
* report on the new larger page when we make our way
* up to that higher order.
*/
- if (PageBuddy(page) && page_order(page) == order)
+ if (PageBuddy(page) && buddy_order(page) == order)
__SetPageReported(page);
} while ((sg = sg_next(sg)));
@@ -178,7 +178,7 @@ page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
* the new head of the free list before we release the
* zone lock.
*/
- if (&page->lru != list && !list_is_first(&page->lru, list))
+ if (!list_is_first(&page->lru, list))
list_rotate_to_front(&page->lru, list);
/* release lock before waiting on report processing */
diff --git a/mm/readahead.c b/mm/readahead.c
index 3c9a8dd..c6ffb76 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -158,10 +158,8 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages,
}
/**
- * page_cache_readahead_unbounded - Start unchecked readahead.
- * @mapping: File address space.
- * @file: This instance of the open file; used for authentication.
- * @index: First page index to read.
+ * page_cache_ra_unbounded - Start unchecked readahead.
+ * @ractl: Readahead control.
* @nr_to_read: The number of pages to read.
* @lookahead_size: Where to start the next readahead.
*
@@ -173,17 +171,13 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages,
* Context: File is referenced by caller. Mutexes may be held by caller.
* May sleep, but will not reenter filesystem to reclaim memory.
*/
-void page_cache_readahead_unbounded(struct address_space *mapping,
- struct file *file, pgoff_t index, unsigned long nr_to_read,
- unsigned long lookahead_size)
+void page_cache_ra_unbounded(struct readahead_control *ractl,
+ unsigned long nr_to_read, unsigned long lookahead_size)
{
+ struct address_space *mapping = ractl->mapping;
+ unsigned long index = readahead_index(ractl);
LIST_HEAD(page_pool);
gfp_t gfp_mask = readahead_gfp_mask(mapping);
- struct readahead_control rac = {
- .mapping = mapping,
- .file = file,
- ._index = index,
- };
unsigned long i;
/*
@@ -204,7 +198,7 @@ void page_cache_readahead_unbounded(struct address_space *mapping,
for (i = 0; i < nr_to_read; i++) {
struct page *page = xa_load(&mapping->i_pages, index + i);
- BUG_ON(index + i != rac._index + rac._nr_pages);
+ BUG_ON(index + i != ractl->_index + ractl->_nr_pages);
if (page && !xa_is_value(page)) {
/*
@@ -215,7 +209,7 @@ void page_cache_readahead_unbounded(struct address_space *mapping,
* have a stable reference to this page, and it's
* not worth getting one just for that.
*/
- read_pages(&rac, &page_pool, true);
+ read_pages(ractl, &page_pool, true);
continue;
}
@@ -228,12 +222,12 @@ void page_cache_readahead_unbounded(struct address_space *mapping,
} else if (add_to_page_cache_lru(page, mapping, index + i,
gfp_mask) < 0) {
put_page(page);
- read_pages(&rac, &page_pool, true);
+ read_pages(ractl, &page_pool, true);
continue;
}
if (i == nr_to_read - lookahead_size)
SetPageReadahead(page);
- rac._nr_pages++;
+ ractl->_nr_pages++;
}
/*
@@ -241,22 +235,22 @@ void page_cache_readahead_unbounded(struct address_space *mapping,
* uptodate then the caller will launch readpage again, and
* will then handle the error.
*/
- read_pages(&rac, &page_pool, false);
+ read_pages(ractl, &page_pool, false);
memalloc_nofs_restore(nofs);
}
-EXPORT_SYMBOL_GPL(page_cache_readahead_unbounded);
+EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);
/*
- * __do_page_cache_readahead() actually reads a chunk of disk. It allocates
+ * do_page_cache_ra() actually reads a chunk of disk. It allocates
* the pages first, then submits them for I/O. This avoids the very bad
* behaviour which would occur if page allocations are causing VM writeback.
* We really don't want to intermingle reads and writes like that.
*/
-void __do_page_cache_readahead(struct address_space *mapping,
- struct file *file, pgoff_t index, unsigned long nr_to_read,
- unsigned long lookahead_size)
+void do_page_cache_ra(struct readahead_control *ractl,
+ unsigned long nr_to_read, unsigned long lookahead_size)
{
- struct inode *inode = mapping->host;
+ struct inode *inode = ractl->mapping->host;
+ unsigned long index = readahead_index(ractl);
loff_t isize = i_size_read(inode);
pgoff_t end_index; /* The last page we want to read */
@@ -270,20 +264,19 @@ void __do_page_cache_readahead(struct address_space *mapping,
if (nr_to_read > end_index - index)
nr_to_read = end_index - index + 1;
- page_cache_readahead_unbounded(mapping, file, index, nr_to_read,
- lookahead_size);
+ page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
}
/*
* Chunk the readahead into 2 megabyte units, so that we don't pin too much
* memory at once.
*/
-void force_page_cache_readahead(struct address_space *mapping,
- struct file *filp, pgoff_t index, unsigned long nr_to_read)
+void force_page_cache_ra(struct readahead_control *ractl,
+ struct file_ra_state *ra, unsigned long nr_to_read)
{
+ struct address_space *mapping = ractl->mapping;
struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
- struct file_ra_state *ra = &filp->f_ra;
- unsigned long max_pages;
+ unsigned long max_pages, index;
if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages &&
!mapping->a_ops->readahead))
@@ -293,14 +286,16 @@ void force_page_cache_readahead(struct address_space *mapping,
* If the request exceeds the readahead window, allow the read to
* be up to the optimal hardware IO size
*/
+ index = readahead_index(ractl);
max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
- nr_to_read = min(nr_to_read, max_pages);
+ nr_to_read = min_t(unsigned long, nr_to_read, max_pages);
while (nr_to_read) {
unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;
if (this_chunk > nr_to_read)
this_chunk = nr_to_read;
- __do_page_cache_readahead(mapping, filp, index, this_chunk, 0);
+ ractl->_index = index;
+ do_page_cache_ra(ractl, this_chunk, 0);
index += this_chunk;
nr_to_read -= this_chunk;
@@ -437,14 +432,14 @@ static int try_context_readahead(struct address_space *mapping,
/*
* A minimal readahead algorithm for trivial sequential/random reads.
*/
-static void ondemand_readahead(struct address_space *mapping,
- struct file_ra_state *ra, struct file *filp,
- bool hit_readahead_marker, pgoff_t index,
+static void ondemand_readahead(struct readahead_control *ractl,
+ struct file_ra_state *ra, bool hit_readahead_marker,
unsigned long req_size)
{
- struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+ struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
unsigned long max_pages = ra->ra_pages;
unsigned long add_pages;
+ unsigned long index = readahead_index(ractl);
pgoff_t prev_index;
/*
@@ -482,7 +477,8 @@ static void ondemand_readahead(struct address_space *mapping,
pgoff_t start;
rcu_read_lock();
- start = page_cache_next_miss(mapping, index + 1, max_pages);
+ start = page_cache_next_miss(ractl->mapping, index + 1,
+ max_pages);
rcu_read_unlock();
if (!start || start - index > max_pages)
@@ -515,14 +511,15 @@ static void ondemand_readahead(struct address_space *mapping,
* Query the page cache and look for the traces(cached history pages)
* that a sequential stream would leave behind.
*/
- if (try_context_readahead(mapping, ra, index, req_size, max_pages))
+ if (try_context_readahead(ractl->mapping, ra, index, req_size,
+ max_pages))
goto readit;
/*
* standalone, small random read
* Read as is, and do not pollute the readahead state.
*/
- __do_page_cache_readahead(mapping, filp, index, req_size, 0);
+ do_page_cache_ra(ractl, req_size, 0);
return;
initial_readahead:
@@ -548,25 +545,12 @@ static void ondemand_readahead(struct address_space *mapping,
}
}
- ra_submit(ra, mapping, filp);
+ ractl->_index = ra->start;
+ do_page_cache_ra(ractl, ra->size, ra->async_size);
}
-/**
- * page_cache_sync_readahead - generic file readahead
- * @mapping: address_space which holds the pagecache and I/O vectors
- * @ra: file_ra_state which holds the readahead state
- * @filp: passed on to ->readpage() and ->readpages()
- * @index: Index of first page to be read.
- * @req_count: Total number of pages being read by the caller.
- *
- * page_cache_sync_readahead() should be called when a cache miss happened:
- * it will submit the read. The readahead logic may decide to piggyback more
- * pages onto the read request if access patterns suggest it will improve
- * performance.
- */
-void page_cache_sync_readahead(struct address_space *mapping,
- struct file_ra_state *ra, struct file *filp,
- pgoff_t index, unsigned long req_count)
+void page_cache_sync_ra(struct readahead_control *ractl,
+ struct file_ra_state *ra, unsigned long req_count)
{
/* no read-ahead */
if (!ra->ra_pages)
@@ -576,35 +560,19 @@ void page_cache_sync_readahead(struct address_space *mapping,
return;
/* be dumb */
- if (filp && (filp->f_mode & FMODE_RANDOM)) {
- force_page_cache_readahead(mapping, filp, index, req_count);
+ if (ractl->file && (ractl->file->f_mode & FMODE_RANDOM)) {
+ force_page_cache_ra(ractl, ra, req_count);
return;
}
/* do read-ahead */
- ondemand_readahead(mapping, ra, filp, false, index, req_count);
+ ondemand_readahead(ractl, ra, false, req_count);
}
-EXPORT_SYMBOL_GPL(page_cache_sync_readahead);
+EXPORT_SYMBOL_GPL(page_cache_sync_ra);
-/**
- * page_cache_async_readahead - file readahead for marked pages
- * @mapping: address_space which holds the pagecache and I/O vectors
- * @ra: file_ra_state which holds the readahead state
- * @filp: passed on to ->readpage() and ->readpages()
- * @page: The page at @index which triggered the readahead call.
- * @index: Index of first page to be read.
- * @req_count: Total number of pages being read by the caller.
- *
- * page_cache_async_readahead() should be called when a page is used which
- * is marked as PageReadahead; this is a marker to suggest that the application
- * has used up enough of the readahead window that we should start pulling in
- * more pages.
- */
-void
-page_cache_async_readahead(struct address_space *mapping,
- struct file_ra_state *ra, struct file *filp,
- struct page *page, pgoff_t index,
- unsigned long req_count)
+void page_cache_async_ra(struct readahead_control *ractl,
+ struct file_ra_state *ra, struct page *page,
+ unsigned long req_count)
{
/* no read-ahead */
if (!ra->ra_pages)
@@ -621,16 +589,16 @@ page_cache_async_readahead(struct address_space *mapping,
/*
* Defer asynchronous read-ahead on IO congestion.
*/
- if (inode_read_congested(mapping->host))
+ if (inode_read_congested(ractl->mapping->host))
return;
if (blk_cgroup_congested())
return;
/* do read-ahead */
- ondemand_readahead(mapping, ra, filp, true, index, req_count);
+ ondemand_readahead(ractl, ra, true, req_count);
}
-EXPORT_SYMBOL_GPL(page_cache_async_readahead);
+EXPORT_SYMBOL_GPL(page_cache_async_ra);
ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
diff --git a/mm/rmap.c b/mm/rmap.c
index 9425260..1b84945 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1205,7 +1205,7 @@ void page_add_file_rmap(struct page *page, bool compound)
VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
lock_page_memcg(page);
if (compound && PageTransHuge(page)) {
- for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
+ for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
if (atomic_inc_and_test(&page[i]._mapcount))
nr++;
}
@@ -1246,7 +1246,7 @@ static void page_remove_file_rmap(struct page *page, bool compound)
/* page still mapped by someone else? */
if (compound && PageTransHuge(page)) {
- for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
+ for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
if (atomic_add_negative(-1, &page[i]._mapcount))
nr++;
}
@@ -1293,7 +1293,7 @@ static void page_remove_anon_compound_rmap(struct page *page)
* Subpages can be mapped with PTEs too. Check how many of
* them are still mapped.
*/
- for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
+ for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
if (atomic_add_negative(-1, &page[i]._mapcount))
nr++;
}
@@ -1303,10 +1303,10 @@ static void page_remove_anon_compound_rmap(struct page *page)
* page of the compound page is unmapped, but at least one
* small page is still mapped.
*/
- if (nr && nr < HPAGE_PMD_NR)
+ if (nr && nr < thp_nr_pages(page))
deferred_split_huge_page(page);
} else {
- nr = HPAGE_PMD_NR;
+ nr = thp_nr_pages(page);
}
if (unlikely(PageMlocked(page)))
diff --git a/mm/shmem.c b/mm/shmem.c
index 6d4ddef..537c137 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3984,7 +3984,7 @@ static struct file_system_type shmem_fs_type = {
.parameters = shmem_fs_parameters,
#endif
.kill_sb = kill_litter_super,
- .fs_flags = FS_USERNS_MOUNT,
+ .fs_flags = FS_USERNS_MOUNT | FS_THP_SUPPORT,
};
int __init shmem_init(void)
diff --git a/mm/shuffle.c b/mm/shuffle.c
index 9b5cd4b..9c2e145 100644
--- a/mm/shuffle.c
+++ b/mm/shuffle.c
@@ -60,7 +60,7 @@ static struct page * __meminit shuffle_valid_page(struct zone *zone,
* ...is the page on the same list as the page we will
* shuffle it with?
*/
- if (page_order(page) != order)
+ if (buddy_order(page) != order)
return NULL;
return page;
diff --git a/mm/slab.c b/mm/slab.c
index 399a9d1..b111356 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1062,7 +1062,7 @@ int slab_prepare_cpu(unsigned int cpu)
* Even if all the cpus of a node are down, we don't free the
* kmem_cache_node of any cache. This to avoid a race between cpu_down, and
* a kmalloc allocation from another cpu for memory from the node of
- * the cpu going down. The list3 structure is usually allocated from
+ * the cpu going down. The kmem_cache_node structure is usually allocated from
* kmem_cache_create() and gets destroyed at kmem_cache_destroy().
*/
int slab_dead_cpu(unsigned int cpu)
diff --git a/mm/slab.h b/mm/slab.h
index 6dd4b70..06c6587 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -46,7 +46,6 @@ struct kmem_cache {
#include <linux/kmemleak.h>
#include <linux/random.h>
#include <linux/sched/mm.h>
-#include <linux/kmemleak.h>
/*
* State of the slab allocator.
diff --git a/mm/slub.c b/mm/slub.c
index 61d0d29..b30be23 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1956,7 +1956,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
/*
* Racy check. If we mistakenly see no partial slabs then we
* just allocate an empty slab. If we mistakenly try to get a
- * partial slab and there is none available then get_partials()
+ * partial slab and there is none available then get_partial()
* will return NULL.
*/
if (!n || !n->nr_partial)
diff --git a/mm/sparse.c b/mm/sparse.c
index b25ad8e..7bd23f9 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -312,6 +312,7 @@ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long p
return coded_mem_map;
}
+#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Decode mem_map from the coded memmap
*/
@@ -321,6 +322,7 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn
coded_mem_map &= SECTION_MAP_MASK;
return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
}
+#endif /* CONFIG_MEMORY_HOTPLUG */
static void __meminit sparse_init_one_section(struct mem_section *ms,
unsigned long pnum, struct page *mem_map,
diff --git a/mm/swap_state.c b/mm/swap_state.c
index aa40e70..ee46582 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -246,7 +246,7 @@ int add_to_swap(struct page *page)
goto fail;
/*
* Normally the page will be dirtied in unmap because its pte should be
- * dirty. A special case is MADV_FREE page. The page'e pte could have
+ * dirty. A special case is MADV_FREE page. The page's pte could have
* dirty bit cleared but the page's SwapBacked bit is still set because
* clearing the dirty bit and SwapBacked bit has no lock protected. For
* such page, unmap will not set dirty bit for it, so page reclaim will
diff --git a/mm/truncate.c b/mm/truncate.c
index 6bbe0f0..18cec39 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -168,7 +168,7 @@ void do_invalidatepage(struct page *page, unsigned int offset,
* becomes orphaned. It will be left on the LRU and may even be mapped into
* user pagetables if we're racing with filemap_fault().
*
- * We need to bale out if page->mapping is no longer equal to the original
+ * We need to bail out if page->mapping is no longer equal to the original
* mapping. This happens a) when the VM reclaimed the page while we waited on
* its lock, b) when a concurrent invalidate_mapping_pages got there first and
* c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
@@ -177,12 +177,12 @@ static void
truncate_cleanup_page(struct address_space *mapping, struct page *page)
{
if (page_mapped(page)) {
- pgoff_t nr = PageTransHuge(page) ? HPAGE_PMD_NR : 1;
+ unsigned int nr = thp_nr_pages(page);
unmap_mapping_pages(mapping, page->index, nr, false);
}
if (page_has_private(page))
- do_invalidatepage(page, 0, PAGE_SIZE);
+ do_invalidatepage(page, 0, thp_size(page));
/*
* Some filesystems seem to re-dirty the page even after
diff --git a/mm/util.c b/mm/util.c
index 4e21fe7..4ddb6e1 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -69,7 +69,8 @@ EXPORT_SYMBOL(kstrdup);
* @s: the string to duplicate
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
*
- * Note: Strings allocated by kstrdup_const should be freed by kfree_const.
+ * Note: Strings allocated by kstrdup_const should be freed by kfree_const and
+ * must not be passed to krealloc().
*
* Return: source string if it is in .rodata section otherwise
* fallback to kstrdup.
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 879fb57..1b8f0e0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -725,8 +725,7 @@ static inline int is_page_cache_freeable(struct page *page)
* that isolated the page, the page cache and optional buffer
* heads at page->private.
*/
- int page_cache_pins = PageTransHuge(page) && PageSwapCache(page) ?
- HPAGE_PMD_NR : 1;
+ int page_cache_pins = thp_nr_pages(page);
return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
}
@@ -2240,7 +2239,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
unsigned long anon_cost, file_cost, total_cost;
int swappiness = mem_cgroup_swappiness(memcg);
- u64 fraction[2];
+ u64 fraction[ANON_AND_FILE];
u64 denominator = 0; /* gcc */
enum scan_balance scan_balance;
unsigned long ap, fp;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4f7b4ee..698bc0b 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -325,7 +325,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
t = __this_cpu_read(pcp->stat_threshold);
- if (unlikely(x > t || x < -t)) {
+ if (unlikely(abs(x) > t)) {
zone_page_state_add(x, zone, item);
x = 0;
}
@@ -350,7 +350,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
t = __this_cpu_read(pcp->stat_threshold);
- if (unlikely(x > t || x < -t)) {
+ if (unlikely(abs(x) > t)) {
node_page_state_add(x, pgdat, item);
x = 0;
}
@@ -511,7 +511,7 @@ static inline void mod_zone_state(struct zone *zone,
o = this_cpu_read(*p);
n = delta + o;
- if (n > t || n < -t) {
+ if (abs(n) > t) {
int os = overstep_mode * (t >> 1) ;
/* Overflow must be added to zone counters */
@@ -573,7 +573,7 @@ static inline void mod_node_state(struct pglist_data *pgdat,
o = this_cpu_read(*p);
n = delta + o;
- if (n > t || n < -t) {
+ if (abs(n) > t) {
int os = overstep_mode * (t >> 1) ;
/* Overflow must be added to node counters */
diff --git a/mm/workingset.c b/mm/workingset.c
index 92e6611..8ed8e62 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -216,7 +216,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
/**
* workingset_age_nonresident - age non-resident entries as LRU ages
- * @memcg: the lruvec that was aged
+ * @lruvec: the lruvec that was aged
* @nr_pages: the number of pages to count
*
* As in-memory pages are aged, non-resident pages need to be aged as
diff --git a/scripts/Makefile.ubsan b/scripts/Makefile.ubsan
index 2734802..4e3fff0 100644
--- a/scripts/Makefile.ubsan
+++ b/scripts/Makefile.ubsan
@@ -4,7 +4,15 @@
endif
ifdef CONFIG_UBSAN_BOUNDS
- CFLAGS_UBSAN += $(call cc-option, -fsanitize=bounds)
+ ifdef CONFIG_CC_IS_CLANG
+ CFLAGS_UBSAN += -fsanitize=array-bounds
+ else
+ CFLAGS_UBSAN += $(call cc-option, -fsanitize=bounds)
+ endif
+endif
+
+ifdef CONFIG_UBSAN_LOCAL_BOUNDS
+ CFLAGS_UBSAN += -fsanitize=local-bounds
endif
ifdef CONFIG_UBSAN_MISC
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 504d2e4..4223a9ac 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -65,6 +65,7 @@
# git output parsing needs US English output, so first set backtick child process LANGUAGE
my $git_command ='export LANGUAGE=en_US.UTF-8; git';
my $tabsize = 8;
+my ${CONFIG_} = "CONFIG_";
sub help {
my ($exitcode) = @_;
@@ -127,6 +128,8 @@
--typedefsfile Read additional types from this file
--color[=WHEN] Use colors 'always', 'never', or only when output
is a terminal ('auto'). Default is 'auto'.
+ --kconfig-prefix=WORD use WORD as a prefix for Kconfig symbols (default
+ ${CONFIG_})
-h, --help, --version display this help and exit
When FILE is - read standard input.
@@ -235,6 +238,7 @@
'color=s' => \$color,
'no-color' => \$color, #keep old behaviors of -nocolor
'nocolor' => \$color, #keep old behaviors of -nocolor
+ 'kconfig-prefix=s' => \${CONFIG_},
'h|help' => \$help,
'version' => \$help
) or help(1);
@@ -970,6 +974,16 @@
}
}
+sub git_is_single_file {
+ my ($filename) = @_;
+
+ return 0 if ((which("git") eq "") || !(-e "$gitroot"));
+
+ my $output = `${git_command} ls-files -- $filename 2>/dev/null`;
+ my $count = $output =~ tr/\n//;
+ return $count eq 1 && $output =~ m{^${filename}$};
+}
+
sub git_commit_info {
my ($commit, $id, $desc) = @_;
@@ -1043,6 +1057,9 @@
$allow_c99_comments = !defined $ignore_type{"C99_COMMENT_TOLERANCE"};
for my $filename (@ARGV) {
my $FILE;
+ my $is_git_file = git_is_single_file($filename);
+ my $oldfile = $file;
+ $file = 1 if ($is_git_file);
if ($git) {
open($FILE, '-|', "git format-patch -M --stdout -1 $filename") ||
die "$P: $filename: git format-patch failed - $!\n";
@@ -1087,6 +1104,7 @@
@modifierListFile = ();
@typeListFile = ();
build_types();
+ $file = $oldfile if ($is_git_file);
}
if (!$quiet) {
@@ -1163,10 +1181,10 @@
}
}
+ $comment = trim($comment);
$name = trim($name);
$name =~ s/^\"|\"$//g;
- $name =~ s/(\s*\([^\)]+\))\s*//;
- if (defined($1)) {
+ if ($name =~ s/(\s*\([^\)]+\))\s*//) {
$name_comment = trim($1);
}
$address = trim($address);
@@ -1181,10 +1199,12 @@
}
sub format_email {
- my ($name, $address) = @_;
+ my ($name, $name_comment, $address, $comment) = @_;
my $formatted_email;
+ $name_comment = trim($name_comment);
+ $comment = trim($comment);
$name = trim($name);
$name =~ s/^\"|\"$//g;
$address = trim($address);
@@ -1197,9 +1217,9 @@
if ("$name" eq "") {
$formatted_email = "$address";
} else {
- $formatted_email = "$name <$address>";
+ $formatted_email = "$name$name_comment <$address>";
}
-
+ $formatted_email .= "$comment";
return $formatted_email;
}
@@ -1207,17 +1227,23 @@
my ($email) = @_;
my ($email_name, $name_comment, $email_address, $comment) = parse_email($email);
- return format_email($email_name, $email_address);
+ return format_email($email_name, $name_comment, $email_address, $comment);
}
sub same_email_addresses {
- my ($email1, $email2) = @_;
+ my ($email1, $email2, $match_comment) = @_;
my ($email1_name, $name1_comment, $email1_address, $comment1) = parse_email($email1);
my ($email2_name, $name2_comment, $email2_address, $comment2) = parse_email($email2);
+ if ($match_comment != 1) {
+ return $email1_name eq $email2_name &&
+ $email1_address eq $email2_address;
+ }
return $email1_name eq $email2_name &&
- $email1_address eq $email2_address;
+ $email1_address eq $email2_address &&
+ $name1_comment eq $name2_comment &&
+ $comment1 eq $comment2;
}
sub which {
@@ -2347,6 +2373,7 @@
my $signoff = 0;
my $author = '';
my $authorsignoff = 0;
+ my $author_sob = '';
my $is_patch = 0;
my $is_binding_patch = -1;
my $in_header_lines = $file ? 0 : 1;
@@ -2661,6 +2688,10 @@
# Check the patch for a From:
if (decode("MIME-Header", $line) =~ /^From:\s*(.*)/) {
$author = $1;
+ my $curline = $linenr;
+ while(defined($rawlines[$curline]) && ($rawlines[$curline++] =~ /^[ \t]\s*(.*)/)) {
+ $author .= $1;
+ }
$author = encode("utf8", $author) if ($line =~ /=\?utf-8\?/i);
$author =~ s/"//g;
$author = reformat_email($author);
@@ -2670,9 +2701,37 @@
if ($line =~ /^\s*signed-off-by:\s*(.*)/i) {
$signoff++;
$in_commit_log = 0;
- if ($author ne '') {
- if (same_email_addresses($1, $author)) {
+ if ($author ne '' && $authorsignoff != 1) {
+ if (same_email_addresses($1, $author, 1)) {
$authorsignoff = 1;
+ } else {
+ my $ctx = $1;
+ my ($email_name, $email_comment, $email_address, $comment1) = parse_email($ctx);
+ my ($author_name, $author_comment, $author_address, $comment2) = parse_email($author);
+
+ if ($email_address eq $author_address && $email_name eq $author_name) {
+ $author_sob = $ctx;
+ $authorsignoff = 2;
+ } elsif ($email_address eq $author_address) {
+ $author_sob = $ctx;
+ $authorsignoff = 3;
+ } elsif ($email_name eq $author_name) {
+ $author_sob = $ctx;
+ $authorsignoff = 4;
+
+ my $address1 = $email_address;
+ my $address2 = $author_address;
+
+ if ($address1 =~ /(\S+)\+\S+(\@.*)/) {
+ $address1 = "$1$2";
+ }
+ if ($address2 =~ /(\S+)\+\S+(\@.*)/) {
+ $address2 = "$1$2";
+ }
+ if ($address1 eq $address2) {
+ $authorsignoff = 5;
+ }
+ }
}
}
}
@@ -2729,7 +2788,7 @@
}
my ($email_name, $name_comment, $email_address, $comment) = parse_email($email);
- my $suggested_email = format_email(($email_name, $email_address));
+ my $suggested_email = format_email(($email_name, $name_comment, $email_address, $comment));
if ($suggested_email eq "") {
ERROR("BAD_SIGN_OFF",
"Unrecognized email address: '$email'\n" . $herecurr);
@@ -2739,9 +2798,9 @@
$dequoted =~ s/" </ </;
# Don't force email to have quotes
# Allow just an angle bracketed address
- if (!same_email_addresses($email, $suggested_email)) {
+ if (!same_email_addresses($email, $suggested_email, 0)) {
WARN("BAD_SIGN_OFF",
- "email address '$email' might be better as '$suggested_email$comment'\n" . $herecurr);
+ "email address '$email' might be better as '$suggested_email'\n" . $herecurr);
}
}
@@ -2987,6 +3046,42 @@
}
}
+# check for repeated words separated by a single space
+ if ($rawline =~ /^\+/ || $in_commit_log) {
+ while ($rawline =~ /\b($word_pattern) (?=($word_pattern))/g) {
+
+ my $first = $1;
+ my $second = $2;
+
+ if ($first =~ /(?:struct|union|enum)/) {
+ pos($rawline) += length($first) + length($second) + 1;
+ next;
+ }
+
+ next if ($first ne $second);
+ next if ($first eq 'long');
+
+ if (WARN("REPEATED_WORD",
+ "Possible repeated word: '$first'\n" . $herecurr) &&
+ $fix) {
+ $fixed[$fixlinenr] =~ s/\b$first $second\b/$first/;
+ }
+ }
+
+ # if it's a repeated word on consecutive lines in a comment block
+ if ($prevline =~ /$;+\s*$/ &&
+ $prevrawline =~ /($word_pattern)\s*$/) {
+ my $last_word = $1;
+ if ($rawline =~ /^\+\s*\*\s*$last_word /) {
+ if (WARN("REPEATED_WORD",
+ "Possible repeated word: '$last_word'\n" . $hereprev) &&
+ $fix) {
+ $fixed[$fixlinenr] =~ s/(\+\s*\*\s*)$last_word /$1/;
+ }
+ }
+ }
+ }
+
# ignore non-hunk lines and lines being removed
next if (!$hunk_line || $line =~ /^-/);
@@ -3213,6 +3308,12 @@
}
}
+# check for embedded filenames
+ if ($rawline =~ /^\+.*\Q$realfile\E/) {
+ WARN("EMBEDDED_FILENAME",
+ "It's generally not useful to have the filename in the file\n" . $herecurr);
+ }
+
# check we are in a valid source file if not then ignore this hunk
next if ($realfile !~ /\.(h|c|s|S|sh|dtsi|dts)$/);
@@ -3310,42 +3411,6 @@
}
}
-# check for repeated words separated by a single space
- if ($rawline =~ /^\+/) {
- while ($rawline =~ /\b($word_pattern) (?=($word_pattern))/g) {
-
- my $first = $1;
- my $second = $2;
-
- if ($first =~ /(?:struct|union|enum)/) {
- pos($rawline) += length($first) + length($second) + 1;
- next;
- }
-
- next if ($first ne $second);
- next if ($first eq 'long');
-
- if (WARN("REPEATED_WORD",
- "Possible repeated word: '$first'\n" . $herecurr) &&
- $fix) {
- $fixed[$fixlinenr] =~ s/\b$first $second\b/$first/;
- }
- }
-
- # if it's a repeated word on consecutive lines in a comment block
- if ($prevline =~ /$;+\s*$/ &&
- $prevrawline =~ /($word_pattern)\s*$/) {
- my $last_word = $1;
- if ($rawline =~ /^\+\s*\*\s*$last_word /) {
- if (WARN("REPEATED_WORD",
- "Possible repeated word: '$last_word'\n" . $hereprev) &&
- $fix) {
- $fixed[$fixlinenr] =~ s/(\+\s*\*\s*)$last_word /$1/;
- }
- }
- }
- }
-
# check for space before tabs.
if ($rawline =~ /^\+/ && $rawline =~ / \t/) {
my $herevet = "$here\n" . cat_vet($rawline) . "\n";
@@ -3436,7 +3501,7 @@
if ($realfile =~ m@^(drivers/net/|net/)@ &&
$prevrawline =~ /^\+[ \t]*\/\*[ \t]*$/ &&
$rawline =~ /^\+[ \t]*\*/ &&
- $realline > 2) {
+ $realline > 3) { # Do not warn about the initial copyright comment block after SPDX-License-Identifier
WARN("NETWORKING_BLOCK_COMMENT_STYLE",
"networking block comments don't use an empty /* line, use /* Comment...\n" . $hereprev);
}
@@ -3895,6 +3960,17 @@
#ignore lines not being added
next if ($line =~ /^[^\+]/);
+# check for self assignments used to avoid compiler warnings
+# e.g.: int foo = foo, *bar = NULL;
+# struct foo bar = *(&(bar));
+ if ($line =~ /^\+\s*(?:$Declare)?([A-Za-z_][A-Za-z\d_]*)\s*=/) {
+ my $var = $1;
+ if ($line =~ /^\+\s*(?:$Declare)?$var\s*=\s*(?:$var|\*\s*\(?\s*&\s*\(?\s*$var\s*\)?\s*\)?)\s*[;,]/) {
+ WARN("SELF_ASSIGNMENT",
+ "Do not use self-assignments to avoid compiler warnings\n" . $herecurr);
+ }
+ }
+
# check for dereferences that span multiple lines
if ($prevline =~ /^\+.*$Lval\s*(?:\.|->)\s*$/ &&
$line =~ /^\+\s*(?!\#\s*(?!define\s+|if))\s*$Lval/) {
@@ -4270,6 +4346,12 @@
"Prefer dev_$level(... to dev_printk(KERN_$orig, ...\n" . $herecurr);
}
+# trace_printk should not be used in production code.
+ if ($line =~ /\b(trace_printk|trace_puts|ftrace_vprintk)\s*\(/) {
+ WARN("TRACE_PRINTK",
+ "Do not use $1() in production code (this can be ignored if built only with a debug config option)\n" . $herecurr);
+ }
+
# ENOSYS means "bad syscall nr" and nothing else. This will have a small
# number of false positives, but assembly files are not checked, so at
# least the arch entry code will not trigger this warning.
@@ -4936,6 +5018,17 @@
}
}
+# check if a statement with a comma should be two statements like:
+# foo = bar(), /* comma should be semicolon */
+# bar = baz();
+ if (defined($stat) &&
+ $stat =~ /^\+\s*(?:$Lval\s*$Assignment\s*)?$FuncArg\s*,\s*(?:$Lval\s*$Assignment\s*)?$FuncArg\s*;\s*$/) {
+ my $cnt = statement_rawlines($stat);
+ my $herectx = get_stat_here($linenr, $cnt, $here);
+ WARN("SUSPECT_COMMA_SEMICOLON",
+ "Possible comma where semicolon could be used\n" . $herectx);
+ }
+
# return is not a function
if (defined($stat) && $stat =~ /^.\s*return(\s*)\(/s) {
my $spacing = $1;
@@ -5295,9 +5388,9 @@
$dstat =~ s/\s*$//s;
# Flatten any parentheses and braces
- while ($dstat =~ s/\([^\(\)]*\)/1/ ||
- $dstat =~ s/\{[^\{\}]*\}/1/ ||
- $dstat =~ s/.\[[^\[\]]*\]/1/)
+ while ($dstat =~ s/\([^\(\)]*\)/1u/ ||
+ $dstat =~ s/\{[^\{\}]*\}/1u/ ||
+ $dstat =~ s/.\[[^\[\]]*\]/1u/)
{
}
@@ -5338,6 +5431,7 @@
$dstat !~ /^\.$Ident\s*=/ && # .foo =
$dstat !~ /^(?:\#\s*$Ident|\#\s*$Constant)\s*$/ && # stringification #foo
$dstat !~ /^do\s*$Constant\s*while\s*$Constant;?$/ && # do {...} while (...); // do {...} while (...)
+ $dstat !~ /^while\s*$Constant\s*$Constant\s*$/ && # while (...) {...}
$dstat !~ /^for\s*$Constant$/ && # for (...)
$dstat !~ /^for\s*$Constant\s+(?:$Ident|-?$Constant)$/ && # for (...) bar()
$dstat !~ /^do\s*{/ && # do {...
@@ -6524,16 +6618,16 @@
}
# check for IS_ENABLED() without CONFIG_<FOO> ($rawline for comments too)
- if ($rawline =~ /\bIS_ENABLED\s*\(\s*(\w+)\s*\)/ && $1 !~ /^CONFIG_/) {
+ if ($rawline =~ /\bIS_ENABLED\s*\(\s*(\w+)\s*\)/ && $1 !~ /^${CONFIG_}/) {
WARN("IS_ENABLED_CONFIG",
- "IS_ENABLED($1) is normally used as IS_ENABLED(CONFIG_$1)\n" . $herecurr);
+ "IS_ENABLED($1) is normally used as IS_ENABLED(${CONFIG_}$1)\n" . $herecurr);
}
# check for #if defined CONFIG_<FOO> || defined CONFIG_<FOO>_MODULE
- if ($line =~ /^\+\s*#\s*if\s+defined(?:\s*\(?\s*|\s+)(CONFIG_[A-Z_]+)\s*\)?\s*\|\|\s*defined(?:\s*\(?\s*|\s+)\1_MODULE\s*\)?\s*$/) {
+ if ($line =~ /^\+\s*#\s*if\s+defined(?:\s*\(?\s*|\s+)(${CONFIG_}[A-Z_]+)\s*\)?\s*\|\|\s*defined(?:\s*\(?\s*|\s+)\1_MODULE\s*\)?\s*$/) {
my $config = $1;
if (WARN("PREFER_IS_ENABLED",
- "Prefer IS_ENABLED(<FOO>) to CONFIG_<FOO> || CONFIG_<FOO>_MODULE\n" . $herecurr) &&
+ "Prefer IS_ENABLED(<FOO>) to ${CONFIG_}<FOO> || ${CONFIG_}<FOO>_MODULE\n" . $herecurr) &&
$fix) {
$fixed[$fixlinenr] = "\+#if IS_ENABLED($config)";
}
@@ -6886,9 +6980,33 @@
if ($signoff == 0) {
ERROR("MISSING_SIGN_OFF",
"Missing Signed-off-by: line(s)\n");
- } elsif (!$authorsignoff) {
- WARN("NO_AUTHOR_SIGN_OFF",
- "Missing Signed-off-by: line by nominal patch author '$author'\n");
+ } elsif ($authorsignoff != 1) {
+ # authorsignoff values:
+ # 0 -> missing sign off
+ # 1 -> sign off identical
+ # 2 -> names and addresses match, comments mismatch
+ # 3 -> addresses match, names different
+ # 4 -> names match, addresses different
+ # 5 -> names match, addresses excluding subaddress details (refer RFC 5233) match
+
+ my $sob_msg = "'From: $author' != 'Signed-off-by: $author_sob'";
+
+ if ($authorsignoff == 0) {
+ ERROR("NO_AUTHOR_SIGN_OFF",
+ "Missing Signed-off-by: line by nominal patch author '$author'\n");
+ } elsif ($authorsignoff == 2) {
+ CHK("FROM_SIGN_OFF_MISMATCH",
+ "From:/Signed-off-by: email comments mismatch: $sob_msg\n");
+ } elsif ($authorsignoff == 3) {
+ WARN("FROM_SIGN_OFF_MISMATCH",
+ "From:/Signed-off-by: email name mismatch: $sob_msg\n");
+ } elsif ($authorsignoff == 4) {
+ WARN("FROM_SIGN_OFF_MISMATCH",
+ "From:/Signed-off-by: email address mismatch: $sob_msg\n");
+ } elsif ($authorsignoff == 5) {
+ WARN("FROM_SIGN_OFF_MISMATCH",
+ "From:/Signed-off-by: email subaddress mismatch: $sob_msg\n");
+ }
}
}
diff --git a/scripts/const_structs.checkpatch b/scripts/const_structs.checkpatch
index e9df9cc..1aae4f4 100644
--- a/scripts/const_structs.checkpatch
+++ b/scripts/const_structs.checkpatch
@@ -39,6 +39,9 @@
nvkm_device_chip
of_device_id
pci_raw_ops
+phy_ops
+pinctrl_ops
+pinmux_ops
pipe_buf_operations
platform_hibernation_ops
platform_suspend_ops
diff --git a/scripts/gdb/linux/proc.py b/scripts/gdb/linux/proc.py
index 6a56bba..09cd871 100644
--- a/scripts/gdb/linux/proc.py
+++ b/scripts/gdb/linux/proc.py
@@ -167,6 +167,9 @@
if not namespace:
raise gdb.GdbError("No namespace for current process")
+ gdb.write("{:^18} {:^15} {:>9} {} {} options\n".format(
+ "mount", "super_block", "devname", "pathname", "fstype"))
+
for vfs in lists.list_for_each_entry(namespace['list'],
mount_ptr_type, "mnt_list"):
devname = vfs['mnt_devname'].string()
@@ -190,14 +193,10 @@
m_flags = int(vfs['mnt']['mnt_flags'])
rd = "ro" if (s_flags & constants.LX_SB_RDONLY) else "rw"
- gdb.write(
- "{} {} {} {}{}{} 0 0\n"
- .format(devname,
- pathname,
- fstype,
- rd,
- info_opts(FS_INFO, s_flags),
- info_opts(MNT_INFO, m_flags)))
+ gdb.write("{} {} {} {} {} {}{}{} 0 0\n".format(
+ vfs.format_string(), superblock.format_string(), devname,
+ pathname, fstype, rd, info_opts(FS_INFO, s_flags),
+ info_opts(MNT_INFO, m_flags)))
LxMounts()
diff --git a/scripts/gdb/linux/tasks.py b/scripts/gdb/linux/tasks.py
index 0301dc1..17ec19e 100644
--- a/scripts/gdb/linux/tasks.py
+++ b/scripts/gdb/linux/tasks.py
@@ -73,11 +73,12 @@
super(LxPs, self).__init__("lx-ps", gdb.COMMAND_DATA)
def invoke(self, arg, from_tty):
+ gdb.write("{:>10} {:>12} {:>7}\n".format("TASK", "PID", "COMM"))
for task in task_lists():
- gdb.write("{address} {pid} {comm}\n".format(
- address=task,
- pid=task["pid"],
- comm=task["comm"].string()))
+ gdb.write("{} {:^5} {}\n".format(
+ task.format_string().split()[0],
+ task["pid"].format_string(),
+ task["comm"].string()))
LxPs()
diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl
index 484d2fb..2075db0 100755
--- a/scripts/get_maintainer.pl
+++ b/scripts/get_maintainer.pl
@@ -541,6 +541,9 @@
die "$P: file '${file}' not found\n";
}
}
+ if ($from_filename && (vcs_exists() && !vcs_file_exists($file))) {
+ warn "$P: file '$file' not found in version control $!\n";
+ }
if ($from_filename || ($file ne "&STDIN" && vcs_file_exists($file))) {
$file =~ s/^\Q${cur_path}\E//; #strip any absolute path
$file =~ s/^\Q${lk_path}\E//; #or the path to the lk tree
@@ -954,8 +957,10 @@
foreach my $file (@files) {
if ($email &&
- ($email_git || ($email_git_fallback &&
- !$exact_pattern_match_hash{$file}))) {
+ ($email_git ||
+ ($email_git_fallback &&
+ $file !~ /MAINTAINERS$/ &&
+ !$exact_pattern_match_hash{$file}))) {
vcs_file_signoffs($file);
}
if ($email && $email_git_blame) {
diff --git a/tools/testing/selftests/exec/.gitignore b/tools/testing/selftests/exec/.gitignore
index 344a99c..9e2f003 100644
--- a/tools/testing/selftests/exec/.gitignore
+++ b/tools/testing/selftests/exec/.gitignore
@@ -7,6 +7,7 @@
execveat.path.ephemeral
execveat.ephemeral
execveat.denatured
+/load_address_*
/recursion-depth
xxxxxxxx*
pipe
diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile
index 0a13b11..cf69b2f 100644
--- a/tools/testing/selftests/exec/Makefile
+++ b/tools/testing/selftests/exec/Makefile
@@ -4,7 +4,7 @@
CFLAGS += -D_GNU_SOURCE
TEST_PROGS := binfmt_script non-regular
-TEST_GEN_PROGS := execveat
+TEST_GEN_PROGS := execveat load_address_4096 load_address_2097152 load_address_16777216
TEST_GEN_FILES := execveat.symlink execveat.denatured script subdir pipe
# Makefile is a run-time dependency, since it's accessed by the execveat test
TEST_FILES := Makefile
@@ -27,4 +27,9 @@
$(OUTPUT)/execveat.denatured: $(OUTPUT)/execveat
cp $< $@
chmod -x $@
-
+$(OUTPUT)/load_address_4096: load_address.c
+ $(CC) $(CFLAGS) $(LDFLAGS) -Wl,-z,max-page-size=0x1000 -pie $< -o $@
+$(OUTPUT)/load_address_2097152: load_address.c
+ $(CC) $(CFLAGS) $(LDFLAGS) -Wl,-z,max-page-size=0x200000 -pie $< -o $@
+$(OUTPUT)/load_address_16777216: load_address.c
+ $(CC) $(CFLAGS) $(LDFLAGS) -Wl,-z,max-page-size=0x1000000 -pie $< -o $@
diff --git a/tools/testing/selftests/exec/load_address.c b/tools/testing/selftests/exec/load_address.c
new file mode 100644
index 0000000..d487c2f
--- /dev/null
+++ b/tools/testing/selftests/exec/load_address.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <link.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+struct Statistics {
+ unsigned long long load_address;
+ unsigned long long alignment;
+};
+
+int ExtractStatistics(struct dl_phdr_info *info, size_t size, void *data)
+{
+ struct Statistics *stats = (struct Statistics *) data;
+ int i;
+
+ if (info->dlpi_name != NULL && info->dlpi_name[0] != '\0') {
+ // Ignore headers from other than the executable.
+ return 2;
+ }
+
+ stats->load_address = (unsigned long long) info->dlpi_addr;
+ stats->alignment = 0;
+
+ for (i = 0; i < info->dlpi_phnum; i++) {
+ if (info->dlpi_phdr[i].p_type != PT_LOAD)
+ continue;
+
+ if (info->dlpi_phdr[i].p_align > stats->alignment)
+ stats->alignment = info->dlpi_phdr[i].p_align;
+ }
+
+ return 1; // Terminate dl_iterate_phdr.
+}
+
+int main(int argc, char **argv)
+{
+ struct Statistics extracted;
+ unsigned long long misalign;
+ int ret;
+
+ ret = dl_iterate_phdr(ExtractStatistics, &extracted);
+ if (ret != 1) {
+ fprintf(stderr, "FAILED\n");
+ return 1;
+ }
+
+ if (extracted.alignment == 0) {
+ fprintf(stderr, "No alignment found\n");
+ return 1;
+ } else if (extracted.alignment & (extracted.alignment - 1)) {
+ fprintf(stderr, "Alignment is not a power of 2\n");
+ return 1;
+ }
+
+ misalign = extracted.load_address & (extracted.alignment - 1);
+ if (misalign) {
+ printf("alignment = %llu, load_address = %llu\n",
+ extracted.alignment, extracted.load_address);
+ fprintf(stderr, "FAILED\n");
+ return 1;
+ }
+
+ fprintf(stderr, "PASS\n");
+ return 0;
+}