Merge branch 'akpm' (patches from Andrew)
Merge second patch-bomb from Andrew Morton:
"Almost all of the rest of MM. There was an unusually large amount of
MM material this time"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (141 commits)
zpool: remove no-op module init/exit
mm: zbud: constify the zbud_ops
mm: zpool: constify the zpool_ops
mm: swap: zswap: maybe_preload & refactoring
zram: unify error reporting
zsmalloc: remove null check from destroy_handle_cache()
zsmalloc: do not take class lock in zs_shrinker_count()
zsmalloc: use class->pages_per_zspage
zsmalloc: consider ZS_ALMOST_FULL as migrate source
zsmalloc: partial page ordering within a fullness_list
zsmalloc: use shrinker to trigger auto-compaction
zsmalloc: account the number of compacted pages
zsmalloc/zram: introduce zs_pool_stats api
zsmalloc: cosmetic compaction code adjustments
zsmalloc: introduce zs_can_compact() function
zsmalloc: always keep per-class stats
zsmalloc: drop unused variable `nr_to_migrate'
mm/memblock.c: fix comment in __next_mem_range()
mm/page_alloc.c: fix type information of memoryless node
memory-hotplug: fix comments in zone_spanned_pages_in_node() and zone_spanned_pages_in_node()
...
diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index 7eba542..edccacd 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -104,6 +104,13 @@
from this pool must not cross 4KByte boundaries.
+ void *dma_pool_zalloc(struct dma_pool *pool, gfp_t mem_flags,
+ dma_addr_t *handle)
+
+Wraps dma_pool_alloc() and also zeroes the returned memory if the
+allocation attempt succeeded.
+
+
void *dma_pool_alloc(struct dma_pool *pool, gfp_t gfp_flags,
dma_addr_t *dma_handle);
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index c4de576..62435bb2 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -144,7 +144,8 @@
store compressed data
mem_limit RW the maximum amount of memory ZRAM can use to store
the compressed data
-num_migrated RO the number of objects migrated migrated by compaction
+pages_compacted RO the number of pages freed during compaction
+ (available only via zram<id>/mm_stat node)
compact WO trigger memory compaction
WARNING
diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt
index 7af2851..7bde640 100644
--- a/Documentation/filesystems/dax.txt
+++ b/Documentation/filesystems/dax.txt
@@ -60,9 +60,10 @@
- implementing the direct_IO address space operation, and calling
dax_do_io() instead of blockdev_direct_IO() if S_DAX is set
- implementing an mmap file operation for DAX files which sets the
- VM_MIXEDMAP flag on the VMA, and setting the vm_ops to include handlers
- for fault and page_mkwrite (which should probably call dax_fault() and
- dax_mkwrite(), passing the appropriate get_block() callback)
+ VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to
+ include handlers for fault, pmd_fault and page_mkwrite (which should
+ probably call dax_fault(), dax_pmd_fault() and dax_mkwrite(), passing the
+ appropriate get_block() callback)
- calling dax_truncate_page() instead of block_truncate_page() for DAX files
- calling dax_zero_page_range() instead of zero_user() for DAX files
- ensuring that there is sufficient locking between reads, writes,
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 6f7fafd..d411ca6 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -424,6 +424,7 @@
Referenced: 892 kB
Anonymous: 0 kB
Swap: 0 kB
+SwapPss: 0 kB
KernelPageSize: 4 kB
MMUPageSize: 4 kB
Locked: 374 kB
@@ -433,16 +434,23 @@
mapping in /proc/PID/maps. The remaining lines show the size of the mapping
(size), the amount of the mapping that is currently resident in RAM (RSS), the
process' proportional share of this mapping (PSS), the number of clean and
-dirty private pages in the mapping. Note that even a page which is part of a
-MAP_SHARED mapping, but has only a single pte mapped, i.e. is currently used
-by only one process, is accounted as private and not as shared. "Referenced"
-indicates the amount of memory currently marked as referenced or accessed.
+dirty private pages in the mapping.
+
+The "proportional set size" (PSS) of a process is the count of pages it has
+in memory, where each page is divided by the number of processes sharing it.
+So if a process has 1000 pages all to itself, and 1000 shared with one other
+process, its PSS will be 1500.
+Note that even a page which is part of a MAP_SHARED mapping, but has only
+a single pte mapped, i.e. is currently used by only one process, is accounted
+as private and not as shared.
+"Referenced" indicates the amount of memory currently marked as referenced or
+accessed.
"Anonymous" shows the amount of memory that does not belong to any file. Even
a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE
and a page is modified, the file page is replaced by a private anonymous copy.
"Swap" shows how much would-be-anonymous memory is also used, but out on
swap.
-
+"SwapPss" shows proportional swap share of this mapping.
"VmFlags" field deserves a separate description. This member represents the kernel
flags associated with the particular virtual memory area in two letter encoded
manner. The codes are the following:
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 9c3f2f8..a4482fc 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -349,7 +349,7 @@
(i < j):
zone[i]->protection[j]
- = (total sums of present_pages from zone[i+1] to zone[j] on the node)
+ = (total sums of managed_pages from zone[i+1] to zone[j] on the node)
/ lowmem_reserve_ratio[i];
(i = j):
(should not be protected. = 0;
@@ -360,7 +360,7 @@
256 (if zone[i] means DMA or DMA32 zone)
32 (others).
As above expression, they are reciprocal number of ratio.
-256 means 1/256. # of protection pages becomes about "0.39%" of total present
+256 means 1/256. # of protection pages becomes about "0.39%" of total managed
pages of higher zones on the node.
If you would like to protect more pages, smaller values are effective.
diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
index 267f393..13f5619 100644
--- a/Documentation/sysrq.txt
+++ b/Documentation/sysrq.txt
@@ -75,7 +75,8 @@
'e' - Send a SIGTERM to all processes, except for init.
-'f' - Will call oom_kill to kill a memory hog process.
+'f' - Will call the oom killer to kill a memory hog process, but do not
+ panic if nothing can be killed.
'g' - Used by kgdb (kernel debugger)
diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt
index 030977f..54dd9b9 100644
--- a/Documentation/vm/hugetlbpage.txt
+++ b/Documentation/vm/hugetlbpage.txt
@@ -329,7 +329,14 @@
3) hugepage-mmap: see tools/testing/selftests/vm/hugepage-mmap.c
-4) The libhugetlbfs (http://libhugetlbfs.sourceforge.net) library provides a
- wide range of userspace tools to help with huge page usability, environment
- setup, and control. Furthermore it provides useful test cases that should be
- used when modifying code to ensure no regressions are introduced.
+4) The libhugetlbfs (https://github.com/libhugetlbfs/libhugetlbfs) library
+ provides a wide range of userspace tools to help with huge page usability,
+ environment setup, and control.
+
+Kernel development regression testing
+=====================================
+
+The most complete set of hugetlb tests are in the libhugetlbfs repository.
+If you modify any hugetlb related code, use the libhugetlbfs test suite
+to check for regressions. In addition, if you add any new hugetlb
+functionality, please add appropriate tests to libhugetlbfs.
diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt
index 6bfbc17..3cd38438 100644
--- a/Documentation/vm/pagemap.txt
+++ b/Documentation/vm/pagemap.txt
@@ -16,11 +16,17 @@
* Bits 0-4 swap type if swapped
* Bits 5-54 swap offset if swapped
* Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
- * Bits 56-60 zero
- * Bit 61 page is file-page or shared-anon
+ * Bit 56 page exclusively mapped (since 4.2)
+ * Bits 57-60 zero
+ * Bit 61 page is file-page or shared-anon (since 3.5)
* Bit 62 page swapped
* Bit 63 page present
+ Since Linux 4.0 only users with the CAP_SYS_ADMIN capability can get PFNs.
+ In 4.0 and 4.1 opens by unprivileged fail with -EPERM. Starting from
+ 4.2 the PFN field is zeroed if the user does not have CAP_SYS_ADMIN.
+ Reason: information about PFNs helps in exploiting Rowhammer vulnerability.
+
If the page is not present but in swap, then the PFN contains an
encoding of the swap file number and the page's offset into the
swap. Unmapped pages return a null PFN. This allows determining
@@ -159,3 +165,8 @@
Reading from any of the files will return -EINVAL if you are not starting
the read on an 8-byte boundary (e.g., if you sought an odd number of bytes
into the file), or if the size of the read is not a multiple of 8 bytes.
+
+Before Linux 3.11 pagemap bits 55-60 were used for "page-shift" (which is
+always 12 at most architectures). Since Linux 3.11 their meaning changes
+after first clear of soft-dirty bits. Since Linux 4.2 they are used for
+flags unconditionally.
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 8884788..6bab21f 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -339,6 +339,67 @@
}
}
+#ifdef CONFIG_BLK_DEV_INITRD
+/*
+ * Relocate initrd if it is not completely within the linear mapping.
+ * This would be the case if mem= cuts out all or part of it.
+ */
+static void __init relocate_initrd(void)
+{
+ phys_addr_t orig_start = __virt_to_phys(initrd_start);
+ phys_addr_t orig_end = __virt_to_phys(initrd_end);
+ phys_addr_t ram_end = memblock_end_of_DRAM();
+ phys_addr_t new_start;
+ unsigned long size, to_free = 0;
+ void *dest;
+
+ if (orig_end <= ram_end)
+ return;
+
+ /*
+ * Any of the original initrd which overlaps the linear map should
+ * be freed after relocating.
+ */
+ if (orig_start < ram_end)
+ to_free = ram_end - orig_start;
+
+ size = orig_end - orig_start;
+
+ /* initrd needs to be relocated completely inside linear mapping */
+ new_start = memblock_find_in_range(0, PFN_PHYS(max_pfn),
+ size, PAGE_SIZE);
+ if (!new_start)
+ panic("Cannot relocate initrd of size %ld\n", size);
+ memblock_reserve(new_start, size);
+
+ initrd_start = __phys_to_virt(new_start);
+ initrd_end = initrd_start + size;
+
+ pr_info("Moving initrd from [%llx-%llx] to [%llx-%llx]\n",
+ orig_start, orig_start + size - 1,
+ new_start, new_start + size - 1);
+
+ dest = (void *)initrd_start;
+
+ if (to_free) {
+ memcpy(dest, (void *)__phys_to_virt(orig_start), to_free);
+ dest += to_free;
+ }
+
+ copy_from_early_mem(dest, orig_start + to_free, size - to_free);
+
+ if (to_free) {
+ pr_info("Freeing original RAMDISK from [%llx-%llx]\n",
+ orig_start, orig_start + to_free - 1);
+ memblock_free(orig_start, to_free);
+ }
+}
+#else
+static inline void __init relocate_initrd(void)
+{
+}
+#endif
+
u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID };
void __init setup_arch(char **cmdline_p)
@@ -372,6 +433,7 @@
acpi_boot_table_init();
paging_init();
+ relocate_initrd();
request_standard_resources();
early_ioremap_reset();
diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
index 344387a..a6d6190 100644
--- a/arch/ia64/hp/common/sba_iommu.c
+++ b/arch/ia64/hp/common/sba_iommu.c
@@ -1140,13 +1140,9 @@
#ifdef CONFIG_NUMA
{
- int node = ioc->node;
struct page *page;
- if (node == NUMA_NO_NODE)
- node = numa_node_id();
-
- page = alloc_pages_exact_node(node, flags, get_order(size));
+ page = alloc_pages_node(ioc->node, flags, get_order(size));
if (unlikely(!page))
return NULL;
diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c
index 20e8a9b..f3976da 100644
--- a/arch/ia64/kernel/uncached.c
+++ b/arch/ia64/kernel/uncached.c
@@ -97,7 +97,7 @@
/* attempt to allocate a granule's worth of cached memory pages */
- page = alloc_pages_exact_node(nid,
+ page = __alloc_pages_node(nid,
GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
IA64_GRANULE_SHIFT-PAGE_SHIFT);
if (!page) {
diff --git a/arch/ia64/sn/pci/pci_dma.c b/arch/ia64/sn/pci/pci_dma.c
index d0853e8..8f59907 100644
--- a/arch/ia64/sn/pci/pci_dma.c
+++ b/arch/ia64/sn/pci/pci_dma.c
@@ -92,7 +92,7 @@
*/
node = pcibus_to_node(pdev->bus);
if (likely(node >=0)) {
- struct page *p = alloc_pages_exact_node(node,
+ struct page *p = __alloc_pages_node(node,
flags, get_order(size));
if (likely(p))
diff --git a/arch/powerpc/platforms/cell/ras.c b/arch/powerpc/platforms/cell/ras.c
index e865d74..2d4f60c 100644
--- a/arch/powerpc/platforms/cell/ras.c
+++ b/arch/powerpc/platforms/cell/ras.c
@@ -123,7 +123,7 @@
area->nid = nid;
area->order = order;
- area->pages = alloc_pages_exact_node(area->nid,
+ area->pages = __alloc_pages_node(area->nid,
GFP_KERNEL|__GFP_THISNODE,
area->order);
diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h
index f06b36a..91b963a 100644
--- a/arch/sparc/include/asm/pgtable_32.h
+++ b/arch/sparc/include/asm/pgtable_32.h
@@ -14,7 +14,7 @@
#include <asm-generic/4level-fixup.h>
#include <linux/spinlock.h>
-#include <linux/swap.h>
+#include <linux/mm_types.h>
#include <asm/types.h>
#include <asm/pgtsrmmu.h>
#include <asm/vaddrs.h>
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b143c2d..baadbf9 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -317,15 +317,12 @@
return ramdisk_size;
}
-#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
static void __init relocate_initrd(void)
{
/* Assume only end is not page aligned */
u64 ramdisk_image = get_ramdisk_image();
u64 ramdisk_size = get_ramdisk_size();
u64 area_size = PAGE_ALIGN(ramdisk_size);
- unsigned long slop, clen, mapaddr;
- char *p, *q;
/* We need to move the initrd down into directly mapped mem */
relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
@@ -343,25 +340,8 @@
printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
- q = (char *)initrd_start;
+ copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size);
- /* Copy the initrd */
- while (ramdisk_size) {
- slop = ramdisk_image & ~PAGE_MASK;
- clen = ramdisk_size;
- if (clen > MAX_MAP_CHUNK-slop)
- clen = MAX_MAP_CHUNK-slop;
- mapaddr = ramdisk_image & PAGE_MASK;
- p = early_memremap(mapaddr, clen+slop);
- memcpy(q, p+slop, clen);
- early_memunmap(p, clen+slop);
- q += clen;
- ramdisk_image += clen;
- ramdisk_size -= clen;
- }
-
- ramdisk_image = get_ramdisk_image();
- ramdisk_size = get_ramdisk_size();
printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
" [mem %#010llx-%#010llx]\n",
ramdisk_image, ramdisk_image + ramdisk_size - 1,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4a4eec30..148ea20 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3150,7 +3150,7 @@
struct page *pages;
struct vmcs *vmcs;
- pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
+ pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
if (!pages)
return NULL;
vmcs = page_address(pages);
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 4053bb5..c3b3f65 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -246,8 +246,10 @@
bi->start = max(bi->start, low);
bi->end = min(bi->end, high);
- /* and there's no empty block */
- if (bi->start >= bi->end)
+ /* and there's no empty or non-exist block */
+ if (bi->start >= bi->end ||
+ !memblock_overlaps_region(&memblock.memory,
+ bi->start, bi->end - bi->start))
numa_remove_memblk_from(i--, mi);
}
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 9c01f5b..9fa15bb 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -388,7 +388,6 @@
static ssize_t compact_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t len)
{
- unsigned long nr_migrated;
struct zram *zram = dev_to_zram(dev);
struct zram_meta *meta;
@@ -399,8 +398,7 @@
}
meta = zram->meta;
- nr_migrated = zs_compact(meta->mem_pool);
- atomic64_add(nr_migrated, &zram->stats.num_migrated);
+ zs_compact(meta->mem_pool);
up_read(&zram->init_lock);
return len;
@@ -428,26 +426,31 @@
struct device_attribute *attr, char *buf)
{
struct zram *zram = dev_to_zram(dev);
+ struct zs_pool_stats pool_stats;
u64 orig_size, mem_used = 0;
long max_used;
ssize_t ret;
+ memset(&pool_stats, 0x00, sizeof(struct zs_pool_stats));
+
down_read(&zram->init_lock);
- if (init_done(zram))
+ if (init_done(zram)) {
mem_used = zs_get_total_pages(zram->meta->mem_pool);
+ zs_pool_stats(zram->meta->mem_pool, &pool_stats);
+ }
orig_size = atomic64_read(&zram->stats.pages_stored);
max_used = atomic_long_read(&zram->stats.max_used_pages);
ret = scnprintf(buf, PAGE_SIZE,
- "%8llu %8llu %8llu %8lu %8ld %8llu %8llu\n",
+ "%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n",
orig_size << PAGE_SHIFT,
(u64)atomic64_read(&zram->stats.compr_data_size),
mem_used << PAGE_SHIFT,
zram->limit_pages << PAGE_SHIFT,
max_used << PAGE_SHIFT,
(u64)atomic64_read(&zram->stats.zero_pages),
- (u64)atomic64_read(&zram->stats.num_migrated));
+ pool_stats.pages_compacted);
up_read(&zram->init_lock);
return ret;
@@ -619,7 +622,7 @@
uncmem = user_mem;
if (!uncmem) {
- pr_info("Unable to allocate temp memory\n");
+ pr_err("Unable to allocate temp memory\n");
ret = -ENOMEM;
goto out_cleanup;
}
@@ -716,7 +719,7 @@
handle = zs_malloc(meta->mem_pool, clen);
if (!handle) {
- pr_info("Error allocating memory for compressed page: %u, size=%zu\n",
+ pr_err("Error allocating memory for compressed page: %u, size=%zu\n",
index, clen);
ret = -ENOMEM;
goto out;
@@ -1036,7 +1039,7 @@
comp = zcomp_create(zram->compressor, zram->max_comp_streams);
if (IS_ERR(comp)) {
- pr_info("Cannot initialise %s compressing backend\n",
+ pr_err("Cannot initialise %s compressing backend\n",
zram->compressor);
err = PTR_ERR(comp);
goto out_free_meta;
@@ -1214,7 +1217,7 @@
/* gendisk structure */
zram->disk = alloc_disk(1);
if (!zram->disk) {
- pr_warn("Error allocating disk structure for device %d\n",
+ pr_err("Error allocating disk structure for device %d\n",
device_id);
ret = -ENOMEM;
goto out_free_queue;
@@ -1263,7 +1266,8 @@
ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj,
&zram_disk_attr_group);
if (ret < 0) {
- pr_warn("Error creating sysfs group");
+ pr_err("Error creating sysfs group for device %d\n",
+ device_id);
goto out_free_disk;
}
strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor));
@@ -1403,13 +1407,13 @@
ret = class_register(&zram_control_class);
if (ret) {
- pr_warn("Unable to register zram-control class\n");
+ pr_err("Unable to register zram-control class\n");
return ret;
}
zram_major = register_blkdev(0, "zram");
if (zram_major <= 0) {
- pr_warn("Unable to get major number\n");
+ pr_err("Unable to get major number\n");
class_unregister(&zram_control_class);
return -EBUSY;
}
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 6dbe2df..8e92339 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -78,7 +78,6 @@
atomic64_t compr_data_size; /* compressed size of pages stored */
atomic64_t num_reads; /* failed + successful */
atomic64_t num_writes; /* --do-- */
- atomic64_t num_migrated; /* no. of migrated object */
atomic64_t failed_reads; /* can happen when memory is too low */
atomic64_t failed_writes; /* can happen when memory is too low */
atomic64_t invalid_io; /* non-page-aligned I/O requests */
diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c
index 95c8944..340b44d 100644
--- a/drivers/misc/sgi-xp/xpc_uv.c
+++ b/drivers/misc/sgi-xp/xpc_uv.c
@@ -239,7 +239,7 @@
mq->mmr_blade = uv_cpu_to_blade_id(cpu);
nid = cpu_to_node(cpu);
- page = alloc_pages_exact_node(nid,
+ page = __alloc_pages_node(nid,
GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
pg_order);
if (page == NULL) {
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index b5b4278..95b330a 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -353,9 +353,16 @@
static void moom_callback(struct work_struct *ignored)
{
+ const gfp_t gfp_mask = GFP_KERNEL;
+ struct oom_control oc = {
+ .zonelist = node_zonelist(first_memory_node, gfp_mask),
+ .nodemask = NULL,
+ .gfp_mask = gfp_mask,
+ .order = -1,
+ };
+
mutex_lock(&oom_lock);
- if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL),
- GFP_KERNEL, 0, NULL, true))
+ if (!out_of_memory(&oc))
pr_info("OOM request ignored because killer is disabled\n");
mutex_unlock(&oom_lock);
}
diff --git a/fs/block_dev.c b/fs/block_dev.c
index f77da0e..22ea424 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -28,6 +28,7 @@
#include <linux/namei.h>
#include <linux/log2.h>
#include <linux/cleancache.h>
+#include <linux/dax.h>
#include <asm/uaccess.h>
#include "internal.h"
diff --git a/fs/dax.c b/fs/dax.c
index 57bb70b..e43389c 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -283,7 +283,6 @@
static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
- struct address_space *mapping = inode->i_mapping;
sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
unsigned long vaddr = (unsigned long)vmf->virtual_address;
void __pmem *addr;
@@ -291,8 +290,6 @@
pgoff_t size;
int error;
- i_mmap_lock_read(mapping);
-
/*
* Check truncate didn't happen while we were allocating a block.
* If it did, this block may or may not be still allocated to the
@@ -322,8 +319,6 @@
error = vm_insert_mixed(vma, vaddr, pfn);
out:
- i_mmap_unlock_read(mapping);
-
return error;
}
@@ -385,15 +380,17 @@
* from a read fault and we've raced with a truncate
*/
error = -EIO;
- goto unlock_page;
+ goto unlock;
}
+ } else {
+ i_mmap_lock_write(mapping);
}
error = get_block(inode, block, &bh, 0);
if (!error && (bh.b_size < PAGE_SIZE))
error = -EIO; /* fs corruption? */
if (error)
- goto unlock_page;
+ goto unlock;
if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
if (vmf->flags & FAULT_FLAG_WRITE) {
@@ -404,8 +401,9 @@
if (!error && (bh.b_size < PAGE_SIZE))
error = -EIO;
if (error)
- goto unlock_page;
+ goto unlock;
} else {
+ i_mmap_unlock_write(mapping);
return dax_load_hole(mapping, page, vmf);
}
}
@@ -417,17 +415,15 @@
else
clear_user_highpage(new_page, vaddr);
if (error)
- goto unlock_page;
+ goto unlock;
vmf->page = page;
if (!page) {
- i_mmap_lock_read(mapping);
/* Check we didn't race with truncate */
size = (i_size_read(inode) + PAGE_SIZE - 1) >>
PAGE_SHIFT;
if (vmf->pgoff >= size) {
- i_mmap_unlock_read(mapping);
error = -EIO;
- goto out;
+ goto unlock;
}
}
return VM_FAULT_LOCKED;
@@ -463,6 +459,8 @@
WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
}
+ if (!page)
+ i_mmap_unlock_write(mapping);
out:
if (error == -ENOMEM)
return VM_FAULT_OOM | major;
@@ -471,11 +469,14 @@
return VM_FAULT_SIGBUS | major;
return VM_FAULT_NOPAGE | major;
- unlock_page:
+ unlock:
if (page) {
unlock_page(page);
page_cache_release(page);
+ } else {
+ i_mmap_unlock_write(mapping);
}
+
goto out;
}
EXPORT_SYMBOL(__dax_fault);
@@ -507,6 +508,176 @@
}
EXPORT_SYMBOL_GPL(dax_fault);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * The 'colour' (ie low bits) within a PMD of a page offset. This comes up
+ * more often than one might expect in the below function.
+ */
+#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
+
+int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, unsigned int flags, get_block_t get_block,
+ dax_iodone_t complete_unwritten)
+{
+ struct file *file = vma->vm_file;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ struct buffer_head bh;
+ unsigned blkbits = inode->i_blkbits;
+ unsigned long pmd_addr = address & PMD_MASK;
+ bool write = flags & FAULT_FLAG_WRITE;
+ long length;
+ void *kaddr;
+ pgoff_t size, pgoff;
+ sector_t block, sector;
+ unsigned long pfn;
+ int result = 0;
+
+ /* Fall back to PTEs if we're going to COW */
+ if (write && !(vma->vm_flags & VM_SHARED))
+ return VM_FAULT_FALLBACK;
+ /* If the PMD would extend outside the VMA */
+ if (pmd_addr < vma->vm_start)
+ return VM_FAULT_FALLBACK;
+ if ((pmd_addr + PMD_SIZE) > vma->vm_end)
+ return VM_FAULT_FALLBACK;
+
+ pgoff = linear_page_index(vma, pmd_addr);
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (pgoff >= size)
+ return VM_FAULT_SIGBUS;
+ /* If the PMD would cover blocks out of the file */
+ if ((pgoff | PG_PMD_COLOUR) >= size)
+ return VM_FAULT_FALLBACK;
+
+ memset(&bh, 0, sizeof(bh));
+ block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
+
+ bh.b_size = PMD_SIZE;
+ i_mmap_lock_write(mapping);
+ length = get_block(inode, block, &bh, write);
+ if (length)
+ return VM_FAULT_SIGBUS;
+
+ /*
+ * If the filesystem isn't willing to tell us the length of a hole,
+ * just fall back to PTEs. Calling get_block 512 times in a loop
+ * would be silly.
+ */
+ if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE)
+ goto fallback;
+
+ if (buffer_unwritten(&bh) || buffer_new(&bh)) {
+ int i;
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ clear_page(kaddr + i * PAGE_SIZE);
+ count_vm_event(PGMAJFAULT);
+ mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+ result |= VM_FAULT_MAJOR;
+ }
+
+ /*
+ * If we allocated new storage, make sure no process has any
+ * zero pages covering this hole
+ */
+ if (buffer_new(&bh)) {
+ i_mmap_unlock_write(mapping);
+ unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
+ i_mmap_lock_write(mapping);
+ }
+
+ /*
+ * If a truncate happened while we were allocating blocks, we may
+ * leave blocks allocated to the file that are beyond EOF. We can't
+ * take i_mutex here, so just leave them hanging; they'll be freed
+ * when the file is deleted.
+ */
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (pgoff >= size) {
+ result = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ if ((pgoff | PG_PMD_COLOUR) >= size)
+ goto fallback;
+
+ if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
+ spinlock_t *ptl;
+ pmd_t entry;
+ struct page *zero_page = get_huge_zero_page();
+
+ if (unlikely(!zero_page))
+ goto fallback;
+
+ ptl = pmd_lock(vma->vm_mm, pmd);
+ if (!pmd_none(*pmd)) {
+ spin_unlock(ptl);
+ goto fallback;
+ }
+
+ entry = mk_pmd(zero_page, vma->vm_page_prot);
+ entry = pmd_mkhuge(entry);
+ set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
+ result = VM_FAULT_NOPAGE;
+ spin_unlock(ptl);
+ } else {
+ sector = bh.b_blocknr << (blkbits - 9);
+ length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
+ bh.b_size);
+ if (length < 0) {
+ result = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
+ goto fallback;
+
+ result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write);
+ }
+
+ out:
+ if (buffer_unwritten(&bh))
+ complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
+
+ i_mmap_unlock_write(mapping);
+
+ return result;
+
+ fallback:
+ count_vm_event(THP_FAULT_FALLBACK);
+ result = VM_FAULT_FALLBACK;
+ goto out;
+}
+EXPORT_SYMBOL_GPL(__dax_pmd_fault);
+
+/**
+ * dax_pmd_fault - handle a PMD fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * When a page fault occurs, filesystems may call this helper in their
+ * pmd_fault handler for DAX files.
+ */
+int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmd, unsigned int flags, get_block_t get_block,
+ dax_iodone_t complete_unwritten)
+{
+ int result;
+ struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+
+ if (flags & FAULT_FLAG_WRITE) {
+ sb_start_pagefault(sb);
+ file_update_time(vma->vm_file);
+ }
+ result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
+ complete_unwritten);
+ if (flags & FAULT_FLAG_WRITE)
+ sb_end_pagefault(sb);
+
+ return result;
+}
+EXPORT_SYMBOL_GPL(dax_pmd_fault);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
/**
* dax_pfn_mkwrite - handle first write to DAX page
* @vma: The virtual memory area where the fault occurred
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 3b57c9f8..1982c3f 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -20,6 +20,7 @@
#include <linux/time.h>
#include <linux/pagemap.h>
+#include <linux/dax.h>
#include <linux/quotaops.h>
#include "ext2.h"
#include "xattr.h"
@@ -31,6 +32,12 @@
return dax_fault(vma, vmf, ext2_get_block, NULL);
}
+static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned int flags)
+{
+ return dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL);
+}
+
static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
@@ -38,6 +45,7 @@
static const struct vm_operations_struct ext2_dax_vm_ops = {
.fault = ext2_dax_fault,
+ .pmd_fault = ext2_dax_pmd_fault,
.page_mkwrite = ext2_dax_mkwrite,
.pfn_mkwrite = dax_pfn_mkwrite,
};
@@ -49,7 +57,7 @@
file_accessed(file);
vma->vm_ops = &ext2_dax_vm_ops;
- vma->vm_flags |= VM_MIXEDMAP;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
return 0;
}
#else
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index a3a404c..c60a248 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -25,6 +25,7 @@
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
+#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 32071f5..fd1f28b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2272,6 +2272,8 @@
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
int ext4_get_block_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
+int ext4_get_block_dax(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
int ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index bc313ac..113837e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -22,6 +22,7 @@
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/path.h>
+#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include <linux/uio.h>
@@ -195,7 +196,7 @@
static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
{
struct inode *inode = bh->b_assoc_map->host;
- /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
+ /* XXX: breaks on 32-bit > 16TB. Is that even supported? */
loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
int err;
if (!uptodate)
@@ -206,17 +207,74 @@
static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
- /* Is this the right get_block? */
+ int result;
+ handle_t *handle = NULL;
+ struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+ bool write = vmf->flags & FAULT_FLAG_WRITE;
+
+ if (write) {
+ sb_start_pagefault(sb);
+ file_update_time(vma->vm_file);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
+ EXT4_DATA_TRANS_BLOCKS(sb));
+ }
+
+ if (IS_ERR(handle))
+ result = VM_FAULT_SIGBUS;
+ else
+ result = __dax_fault(vma, vmf, ext4_get_block_dax,
+ ext4_end_io_unwritten);
+
+ if (write) {
+ if (!IS_ERR(handle))
+ ext4_journal_stop(handle);
+ sb_end_pagefault(sb);
+ }
+
+ return result;
+}
+
+static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned int flags)
+{
+ int result;
+ handle_t *handle = NULL;
+ struct inode *inode = file_inode(vma->vm_file);
+ struct super_block *sb = inode->i_sb;
+ bool write = flags & FAULT_FLAG_WRITE;
+
+ if (write) {
+ sb_start_pagefault(sb);
+ file_update_time(vma->vm_file);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
+ ext4_chunk_trans_blocks(inode,
+ PMD_SIZE / PAGE_SIZE));
+ }
+
+ if (IS_ERR(handle))
+ result = VM_FAULT_SIGBUS;
+ else
+ result = __dax_pmd_fault(vma, addr, pmd, flags,
+ ext4_get_block_dax, ext4_end_io_unwritten);
+
+ if (write) {
+ if (!IS_ERR(handle))
+ ext4_journal_stop(handle);
+ sb_end_pagefault(sb);
+ }
+
+ return result;
}
static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
+ return dax_mkwrite(vma, vmf, ext4_get_block_dax,
+ ext4_end_io_unwritten);
}
static const struct vm_operations_struct ext4_dax_vm_ops = {
.fault = ext4_dax_fault,
+ .pmd_fault = ext4_dax_pmd_fault,
.page_mkwrite = ext4_dax_mkwrite,
.pfn_mkwrite = dax_pfn_mkwrite,
};
@@ -244,7 +302,7 @@
file_accessed(file);
if (IS_DAX(file_inode(file))) {
vma->vm_ops = &ext4_dax_vm_ops;
- vma->vm_flags |= VM_MIXEDMAP;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
} else {
vma->vm_ops = &ext4_file_vm_ops;
}
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 4f6ac49..2468261 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -22,6 +22,7 @@
#include "ext4_jbd2.h"
#include "truncate.h"
+#include <linux/dax.h>
#include <linux/uio.h>
#include <trace/events/ext4.h>
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 29f1af7..612fbcf 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -22,6 +22,7 @@
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
+#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
@@ -3020,6 +3021,17 @@
EXT4_GET_BLOCKS_NO_LOCK);
}
+int ext4_get_block_dax(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int flags = EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_UNWRIT_EXT;
+ if (create)
+ flags |= EXT4_GET_BLOCKS_CREATE;
+ ext4_debug("ext4_get_block_dax: inode %lu, create flag %d\n",
+ inode->i_ino, create);
+ return _ext4_get_block(inode, iblock, bh_result, flags);
+}
+
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
ssize_t size, void *private)
{
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 973c24c..316adb9 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -12,6 +12,7 @@
#include <linux/thread_info.h>
#include <asm/current.h>
#include <linux/sched.h> /* remove ASAP */
+#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/file.h>
@@ -84,6 +85,29 @@
{Opt_err, NULL},
};
+#ifdef CONFIG_NUMA
+static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
+ struct inode *inode, pgoff_t index)
+{
+ vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
+ index);
+}
+
+static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
+{
+ mpol_cond_put(vma->vm_policy);
+}
+#else
+static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
+ struct inode *inode, pgoff_t index)
+{
+}
+
+static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
+{
+}
+#endif
+
static void huge_pagevec_release(struct pagevec *pvec)
{
int i;
@@ -293,26 +317,61 @@
return -EINVAL;
}
-static void truncate_huge_page(struct page *page)
+static void remove_huge_page(struct page *page)
{
ClearPageDirty(page);
ClearPageUptodate(page);
delete_from_page_cache(page);
}
-static void truncate_hugepages(struct inode *inode, loff_t lstart)
+
+/*
+ * remove_inode_hugepages handles two distinct cases: truncation and hole
+ * punch. There are subtle differences in operation for each case.
+
+ * truncation is indicated by end of range being LLONG_MAX
+ * In this case, we first scan the range and release found pages.
+ * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
+ * maps and global counts.
+ * hole punch is indicated if end is not LLONG_MAX
+ * In the hole punch case we scan the range and release found pages.
+ * Only when releasing a page is the associated region/reserv map
+ * deleted. The region/reserv map for ranges without associated
+ * pages are not modified.
+ * Note: If the passed end of range value is beyond the end of file, but
+ * not LLONG_MAX this routine still performs a hole punch operation.
+ */
+static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
+ loff_t lend)
{
struct hstate *h = hstate_inode(inode);
struct address_space *mapping = &inode->i_data;
const pgoff_t start = lstart >> huge_page_shift(h);
+ const pgoff_t end = lend >> huge_page_shift(h);
+ struct vm_area_struct pseudo_vma;
struct pagevec pvec;
pgoff_t next;
int i, freed = 0;
+ long lookup_nr = PAGEVEC_SIZE;
+ bool truncate_op = (lend == LLONG_MAX);
+ memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
+ pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
pagevec_init(&pvec, 0);
next = start;
- while (1) {
- if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+ while (next < end) {
+ /*
+ * Make sure to never grab more pages that we
+ * might possibly need.
+ */
+ if (end - next < lookup_nr)
+ lookup_nr = end - next;
+
+ /*
+ * This pagevec_lookup() may return pages past 'end',
+ * so we must check for page->index > end.
+ */
+ if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) {
if (next == start)
break;
next = start;
@@ -321,26 +380,69 @@
for (i = 0; i < pagevec_count(&pvec); ++i) {
struct page *page = pvec.pages[i];
+ u32 hash;
+
+ hash = hugetlb_fault_mutex_hash(h, current->mm,
+ &pseudo_vma,
+ mapping, next, 0);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
lock_page(page);
+ if (page->index >= end) {
+ unlock_page(page);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ next = end; /* we are done */
+ break;
+ }
+
+ /*
+ * If page is mapped, it was faulted in after being
+ * unmapped. Do nothing in this race case. In the
+ * normal case page is not mapped.
+ */
+ if (!page_mapped(page)) {
+ bool rsv_on_error = !PagePrivate(page);
+ /*
+ * We must free the huge page and remove
+ * from page cache (remove_huge_page) BEFORE
+ * removing the region/reserve map
+ * (hugetlb_unreserve_pages). In rare out
+ * of memory conditions, removal of the
+ * region/reserve map could fail. Before
+ * free'ing the page, note PagePrivate which
+ * is used in case of error.
+ */
+ remove_huge_page(page);
+ freed++;
+ if (!truncate_op) {
+ if (unlikely(hugetlb_unreserve_pages(
+ inode, next,
+ next + 1, 1)))
+ hugetlb_fix_reserve_counts(
+ inode, rsv_on_error);
+ }
+ }
+
if (page->index > next)
next = page->index;
+
++next;
- truncate_huge_page(page);
unlock_page(page);
- freed++;
+
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}
huge_pagevec_release(&pvec);
}
- BUG_ON(!lstart && mapping->nrpages);
- hugetlb_unreserve_pages(inode, start, freed);
+
+ if (truncate_op)
+ (void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
}
static void hugetlbfs_evict_inode(struct inode *inode)
{
struct resv_map *resv_map;
- truncate_hugepages(inode, 0);
+ remove_inode_hugepages(inode, 0, LLONG_MAX);
resv_map = (struct resv_map *)inode->i_mapping->private_data;
/* root inode doesn't have the resv_map, so we should check it */
if (resv_map)
@@ -349,11 +451,15 @@
}
static inline void
-hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff)
+hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
{
struct vm_area_struct *vma;
- vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) {
+ /*
+ * end == 0 indicates that the entire range after
+ * start should be unmapped.
+ */
+ vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
unsigned long v_offset;
/*
@@ -362,13 +468,20 @@
* which overlap the truncated area starting at pgoff,
* and no vma on a 32-bit arch can span beyond the 4GB.
*/
- if (vma->vm_pgoff < pgoff)
- v_offset = (pgoff - vma->vm_pgoff) << PAGE_SHIFT;
+ if (vma->vm_pgoff < start)
+ v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
else
v_offset = 0;
- unmap_hugepage_range(vma, vma->vm_start + v_offset,
- vma->vm_end, NULL);
+ if (end) {
+ end = ((end - start) << PAGE_SHIFT) +
+ vma->vm_start + v_offset;
+ if (end > vma->vm_end)
+ end = vma->vm_end;
+ } else
+ end = vma->vm_end;
+
+ unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
}
}
@@ -384,12 +497,164 @@
i_size_write(inode, offset);
i_mmap_lock_write(mapping);
if (!RB_EMPTY_ROOT(&mapping->i_mmap))
- hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
+ hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
i_mmap_unlock_write(mapping);
- truncate_hugepages(inode, offset);
+ remove_inode_hugepages(inode, offset, LLONG_MAX);
return 0;
}
+static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+{
+ struct hstate *h = hstate_inode(inode);
+ loff_t hpage_size = huge_page_size(h);
+ loff_t hole_start, hole_end;
+
+ /*
+ * For hole punch round up the beginning offset of the hole and
+ * round down the end.
+ */
+ hole_start = round_up(offset, hpage_size);
+ hole_end = round_down(offset + len, hpage_size);
+
+ if (hole_end > hole_start) {
+ struct address_space *mapping = inode->i_mapping;
+
+ mutex_lock(&inode->i_mutex);
+ i_mmap_lock_write(mapping);
+ if (!RB_EMPTY_ROOT(&mapping->i_mmap))
+ hugetlb_vmdelete_list(&mapping->i_mmap,
+ hole_start >> PAGE_SHIFT,
+ hole_end >> PAGE_SHIFT);
+ i_mmap_unlock_write(mapping);
+ remove_inode_hugepages(inode, hole_start, hole_end);
+ mutex_unlock(&inode->i_mutex);
+ }
+
+ return 0;
+}
+
+static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+ struct address_space *mapping = inode->i_mapping;
+ struct hstate *h = hstate_inode(inode);
+ struct vm_area_struct pseudo_vma;
+ struct mm_struct *mm = current->mm;
+ loff_t hpage_size = huge_page_size(h);
+ unsigned long hpage_shift = huge_page_shift(h);
+ pgoff_t start, index, end;
+ int error;
+ u32 hash;
+
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
+
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ return hugetlbfs_punch_hole(inode, offset, len);
+
+ /*
+ * Default preallocate case.
+ * For this range, start is rounded down and end is rounded up
+ * as well as being converted to page offsets.
+ */
+ start = offset >> hpage_shift;
+ end = (offset + len + hpage_size - 1) >> hpage_shift;
+
+ mutex_lock(&inode->i_mutex);
+
+ /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
+ error = inode_newsize_ok(inode, offset + len);
+ if (error)
+ goto out;
+
+ /*
+ * Initialize a pseudo vma as this is required by the huge page
+ * allocation routines. If NUMA is configured, use page index
+ * as input to create an allocation policy.
+ */
+ memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
+ pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
+ pseudo_vma.vm_file = file;
+
+ for (index = start; index < end; index++) {
+ /*
+ * This is supposed to be the vaddr where the page is being
+ * faulted in, but we have no vaddr here.
+ */
+ struct page *page;
+ unsigned long addr;
+ int avoid_reserve = 0;
+
+ cond_resched();
+
+ /*
+ * fallocate(2) manpage permits EINTR; we may have been
+ * interrupted because we are using up too much memory.
+ */
+ if (signal_pending(current)) {
+ error = -EINTR;
+ break;
+ }
+
+ /* Set numa allocation policy based on index */
+ hugetlb_set_vma_policy(&pseudo_vma, inode, index);
+
+ /* addr is the offset within the file (zero based) */
+ addr = index * hpage_size;
+
+ /* mutex taken here, fault path and hole punch */
+ hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
+ index, addr);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+ /* See if already present in mapping to avoid alloc/free */
+ page = find_get_page(mapping, index);
+ if (page) {
+ put_page(page);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ hugetlb_drop_vma_policy(&pseudo_vma);
+ continue;
+ }
+
+ /* Allocate page and add to page cache */
+ page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve);
+ hugetlb_drop_vma_policy(&pseudo_vma);
+ if (IS_ERR(page)) {
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ error = PTR_ERR(page);
+ goto out;
+ }
+ clear_huge_page(page, addr, pages_per_huge_page(h));
+ __SetPageUptodate(page);
+ error = huge_add_to_page_cache(page, mapping, index);
+ if (unlikely(error)) {
+ put_page(page);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ goto out;
+ }
+
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
+ /*
+ * page_put due to reference from alloc_huge_page()
+ * unlock_page because locked by add_to_page_cache()
+ */
+ put_page(page);
+ unlock_page(page);
+ }
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
+ i_size_write(inode, offset + len);
+ inode->i_ctime = CURRENT_TIME;
+ spin_lock(&inode->i_lock);
+ inode->i_private = NULL;
+ spin_unlock(&inode->i_lock);
+out:
+ mutex_unlock(&inode->i_mutex);
+ return error;
+}
+
static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
@@ -701,7 +966,8 @@
.mmap = hugetlbfs_file_mmap,
.fsync = noop_fsync,
.get_unmapped_area = hugetlb_get_unmapped_area,
- .llseek = default_llseek,
+ .llseek = default_llseek,
+ .fallocate = hugetlbfs_fallocate,
};
static const struct inode_operations hugetlbfs_dir_inode_operations = {
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3b4d825..41f1a50 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -446,6 +446,7 @@
unsigned long anonymous_thp;
unsigned long swap;
u64 pss;
+ u64 swap_pss;
};
static void smaps_account(struct mem_size_stats *mss, struct page *page,
@@ -492,9 +493,20 @@
} else if (is_swap_pte(*pte)) {
swp_entry_t swpent = pte_to_swp_entry(*pte);
- if (!non_swap_entry(swpent))
+ if (!non_swap_entry(swpent)) {
+ int mapcount;
+
mss->swap += PAGE_SIZE;
- else if (is_migration_entry(swpent))
+ mapcount = swp_swapcount(swpent);
+ if (mapcount >= 2) {
+ u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;
+
+ do_div(pss_delta, mapcount);
+ mss->swap_pss += pss_delta;
+ } else {
+ mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
+ }
+ } else if (is_migration_entry(swpent))
page = migration_entry_to_page(swpent);
}
@@ -640,6 +652,7 @@
"Anonymous: %8lu kB\n"
"AnonHugePages: %8lu kB\n"
"Swap: %8lu kB\n"
+ "SwapPss: %8lu kB\n"
"KernelPageSize: %8lu kB\n"
"MMUPageSize: %8lu kB\n"
"Locked: %8lu kB\n",
@@ -654,6 +667,7 @@
mss.anonymous >> 10,
mss.anonymous_thp >> 10,
mss.swap >> 10,
+ (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
vma_kernel_pagesize(vma) >> 10,
vma_mmu_pagesize(vma) >> 10,
(vma->vm_flags & VM_LOCKED) ?
@@ -712,23 +726,6 @@
.release = proc_map_release,
};
-/*
- * We do not want to have constant page-shift bits sitting in
- * pagemap entries and are about to reuse them some time soon.
- *
- * Here's the "migration strategy":
- * 1. when the system boots these bits remain what they are,
- * but a warning about future change is printed in log;
- * 2. once anyone clears soft-dirty bits via clear_refs file,
- * these flag is set to denote, that user is aware of the
- * new API and those page-shift bits change their meaning.
- * The respective warning is printed in dmesg;
- * 3. In a couple of releases we will remove all the mentions
- * of page-shift in pagemap entries.
- */
-
-static bool soft_dirty_cleared __read_mostly;
-
enum clear_refs_types {
CLEAR_REFS_ALL = 1,
CLEAR_REFS_ANON,
@@ -889,13 +886,6 @@
if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
return -EINVAL;
- if (type == CLEAR_REFS_SOFT_DIRTY) {
- soft_dirty_cleared = true;
- pr_warn_once("The pagemap bits 55-60 has changed their meaning!"
- " See the linux/Documentation/vm/pagemap.txt for "
- "details.\n");
- }
-
task = get_proc_task(file_inode(file));
if (!task)
return -ESRCH;
@@ -963,36 +953,26 @@
struct pagemapread {
int pos, len; /* units: PM_ENTRY_BYTES, not bytes */
pagemap_entry_t *buffer;
- bool v2;
+ bool show_pfn;
};
#define PAGEMAP_WALK_SIZE (PMD_SIZE)
#define PAGEMAP_WALK_MASK (PMD_MASK)
-#define PM_ENTRY_BYTES sizeof(pagemap_entry_t)
-#define PM_STATUS_BITS 3
-#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS)
-#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
-#define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
-#define PM_PSHIFT_BITS 6
-#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
-#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
-#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
-#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
-#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
-/* in "new" pagemap pshift bits are occupied with more status bits */
-#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))
+#define PM_ENTRY_BYTES sizeof(pagemap_entry_t)
+#define PM_PFRAME_BITS 55
+#define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
+#define PM_SOFT_DIRTY BIT_ULL(55)
+#define PM_MMAP_EXCLUSIVE BIT_ULL(56)
+#define PM_FILE BIT_ULL(61)
+#define PM_SWAP BIT_ULL(62)
+#define PM_PRESENT BIT_ULL(63)
-#define __PM_SOFT_DIRTY (1LL)
-#define PM_PRESENT PM_STATUS(4LL)
-#define PM_SWAP PM_STATUS(2LL)
-#define PM_FILE PM_STATUS(1LL)
-#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0)
#define PM_END_OF_BUFFER 1
-static inline pagemap_entry_t make_pme(u64 val)
+static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
{
- return (pagemap_entry_t) { .pme = val };
+ return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
}
static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
@@ -1013,7 +993,7 @@
while (addr < end) {
struct vm_area_struct *vma = find_vma(walk->mm, addr);
- pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
+ pagemap_entry_t pme = make_pme(0, 0);
/* End of address space hole, which we mark as non-present. */
unsigned long hole_end;
@@ -1033,7 +1013,7 @@
/* Addresses in the VMA. */
if (vma->vm_flags & VM_SOFTDIRTY)
- pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY);
+ pme = make_pme(0, PM_SOFT_DIRTY);
for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
err = add_to_pagemap(addr, &pme, pm);
if (err)
@@ -1044,67 +1024,42 @@
return err;
}
-static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
+static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
struct vm_area_struct *vma, unsigned long addr, pte_t pte)
{
- u64 frame, flags;
+ u64 frame = 0, flags = 0;
struct page *page = NULL;
- int flags2 = 0;
if (pte_present(pte)) {
- frame = pte_pfn(pte);
- flags = PM_PRESENT;
+ if (pm->show_pfn)
+ frame = pte_pfn(pte);
+ flags |= PM_PRESENT;
page = vm_normal_page(vma, addr, pte);
if (pte_soft_dirty(pte))
- flags2 |= __PM_SOFT_DIRTY;
+ flags |= PM_SOFT_DIRTY;
} else if (is_swap_pte(pte)) {
swp_entry_t entry;
if (pte_swp_soft_dirty(pte))
- flags2 |= __PM_SOFT_DIRTY;
+ flags |= PM_SOFT_DIRTY;
entry = pte_to_swp_entry(pte);
frame = swp_type(entry) |
(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
- flags = PM_SWAP;
+ flags |= PM_SWAP;
if (is_migration_entry(entry))
page = migration_entry_to_page(entry);
- } else {
- if (vma->vm_flags & VM_SOFTDIRTY)
- flags2 |= __PM_SOFT_DIRTY;
- *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
- return;
}
if (page && !PageAnon(page))
flags |= PM_FILE;
- if ((vma->vm_flags & VM_SOFTDIRTY))
- flags2 |= __PM_SOFT_DIRTY;
+ if (page && page_mapcount(page) == 1)
+ flags |= PM_MMAP_EXCLUSIVE;
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ flags |= PM_SOFT_DIRTY;
- *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
+ return make_pme(frame, flags);
}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
- pmd_t pmd, int offset, int pmd_flags2)
-{
- /*
- * Currently pmd for thp is always present because thp can not be
- * swapped-out, migrated, or HWPOISONed (split in such cases instead.)
- * This if-check is just to prepare for future implementation.
- */
- if (pmd_present(pmd))
- *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
- | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
- else
- *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2));
-}
-#else
-static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
- pmd_t pmd, int offset, int pmd_flags2)
-{
-}
-#endif
-
-static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
@@ -1113,41 +1068,58 @@
pte_t *pte, *orig_pte;
int err = 0;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
- int pmd_flags2;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) {
+ u64 flags = 0, frame = 0;
+ pmd_t pmd = *pmdp;
- if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
- pmd_flags2 = __PM_SOFT_DIRTY;
- else
- pmd_flags2 = 0;
+ if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd))
+ flags |= PM_SOFT_DIRTY;
+
+ /*
+ * Currently pmd for thp is always present because thp
+ * can not be swapped-out, migrated, or HWPOISONed
+ * (split in such cases instead.)
+ * This if-check is just to prepare for future implementation.
+ */
+ if (pmd_present(pmd)) {
+ struct page *page = pmd_page(pmd);
+
+ if (page_mapcount(page) == 1)
+ flags |= PM_MMAP_EXCLUSIVE;
+
+ flags |= PM_PRESENT;
+ if (pm->show_pfn)
+ frame = pmd_pfn(pmd) +
+ ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ }
for (; addr != end; addr += PAGE_SIZE) {
- unsigned long offset;
- pagemap_entry_t pme;
+ pagemap_entry_t pme = make_pme(frame, flags);
- offset = (addr & ~PAGEMAP_WALK_MASK) >>
- PAGE_SHIFT;
- thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2);
err = add_to_pagemap(addr, &pme, pm);
if (err)
break;
+ if (pm->show_pfn && (flags & PM_PRESENT))
+ frame++;
}
spin_unlock(ptl);
return err;
}
- if (pmd_trans_unstable(pmd))
+ if (pmd_trans_unstable(pmdp))
return 0;
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
* We can assume that @vma always points to a valid one and @end never
* goes beyond vma->vm_end.
*/
- orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
for (; addr < end; pte++, addr += PAGE_SIZE) {
pagemap_entry_t pme;
- pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
+ pme = pte_to_pagemap_entry(pm, vma, addr, *pte);
err = add_to_pagemap(addr, &pme, pm);
if (err)
break;
@@ -1160,40 +1132,44 @@
}
#ifdef CONFIG_HUGETLB_PAGE
-static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
- pte_t pte, int offset, int flags2)
-{
- if (pte_present(pte))
- *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) |
- PM_STATUS2(pm->v2, flags2) |
- PM_PRESENT);
- else
- *pme = make_pme(PM_NOT_PRESENT(pm->v2) |
- PM_STATUS2(pm->v2, flags2));
-}
-
/* This function walks within one hugetlb entry in the single call */
-static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
+static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct pagemapread *pm = walk->private;
struct vm_area_struct *vma = walk->vma;
+ u64 flags = 0, frame = 0;
int err = 0;
- int flags2;
- pagemap_entry_t pme;
+ pte_t pte;
if (vma->vm_flags & VM_SOFTDIRTY)
- flags2 = __PM_SOFT_DIRTY;
- else
- flags2 = 0;
+ flags |= PM_SOFT_DIRTY;
+
+ pte = huge_ptep_get(ptep);
+ if (pte_present(pte)) {
+ struct page *page = pte_page(pte);
+
+ if (!PageAnon(page))
+ flags |= PM_FILE;
+
+ if (page_mapcount(page) == 1)
+ flags |= PM_MMAP_EXCLUSIVE;
+
+ flags |= PM_PRESENT;
+ if (pm->show_pfn)
+ frame = pte_pfn(pte) +
+ ((addr & ~hmask) >> PAGE_SHIFT);
+ }
for (; addr != end; addr += PAGE_SIZE) {
- int offset = (addr & ~hmask) >> PAGE_SHIFT;
- huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2);
+ pagemap_entry_t pme = make_pme(frame, flags);
+
err = add_to_pagemap(addr, &pme, pm);
if (err)
return err;
+ if (pm->show_pfn && (flags & PM_PRESENT))
+ frame++;
}
cond_resched();
@@ -1211,7 +1187,9 @@
* Bits 0-54 page frame number (PFN) if present
* Bits 0-4 swap type if swapped
* Bits 5-54 swap offset if swapped
- * Bits 55-60 page shift (page size = 1<<page shift)
+ * Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
+ * Bit 56 page exclusively mapped
+ * Bits 57-60 zero
* Bit 61 page is file-page or shared-anon
* Bit 62 page swapped
* Bit 63 page present
@@ -1229,42 +1207,37 @@
static ssize_t pagemap_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
- struct task_struct *task = get_proc_task(file_inode(file));
- struct mm_struct *mm;
+ struct mm_struct *mm = file->private_data;
struct pagemapread pm;
- int ret = -ESRCH;
struct mm_walk pagemap_walk = {};
unsigned long src;
unsigned long svpfn;
unsigned long start_vaddr;
unsigned long end_vaddr;
- int copied = 0;
+ int ret = 0, copied = 0;
- if (!task)
+ if (!mm || !atomic_inc_not_zero(&mm->mm_users))
goto out;
ret = -EINVAL;
/* file position must be aligned */
if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
- goto out_task;
+ goto out_mm;
ret = 0;
if (!count)
- goto out_task;
+ goto out_mm;
- pm.v2 = soft_dirty_cleared;
+ /* do not disclose physical addresses: attack vector */
+ pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);
+
pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY);
ret = -ENOMEM;
if (!pm.buffer)
- goto out_task;
+ goto out_mm;
- mm = mm_access(task, PTRACE_MODE_READ);
- ret = PTR_ERR(mm);
- if (!mm || IS_ERR(mm))
- goto out_free;
-
- pagemap_walk.pmd_entry = pagemap_pte_range;
+ pagemap_walk.pmd_entry = pagemap_pmd_range;
pagemap_walk.pte_hole = pagemap_pte_hole;
#ifdef CONFIG_HUGETLB_PAGE
pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
@@ -1275,10 +1248,10 @@
src = *ppos;
svpfn = src / PM_ENTRY_BYTES;
start_vaddr = svpfn << PAGE_SHIFT;
- end_vaddr = TASK_SIZE_OF(task);
+ end_vaddr = mm->task_size;
/* watch out for wraparound */
- if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT)
+ if (svpfn > mm->task_size >> PAGE_SHIFT)
start_vaddr = end_vaddr;
/*
@@ -1305,7 +1278,7 @@
len = min(count, PM_ENTRY_BYTES * pm.pos);
if (copy_to_user(buf, pm.buffer, len)) {
ret = -EFAULT;
- goto out_mm;
+ goto out_free;
}
copied += len;
buf += len;
@@ -1315,24 +1288,31 @@
if (!ret || ret == PM_END_OF_BUFFER)
ret = copied;
-out_mm:
- mmput(mm);
out_free:
kfree(pm.buffer);
-out_task:
- put_task_struct(task);
+out_mm:
+ mmput(mm);
out:
return ret;
}
static int pagemap_open(struct inode *inode, struct file *file)
{
- /* do not disclose physical addresses: attack vector */
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
- "to stop being page-shift some time soon. See the "
- "linux/Documentation/vm/pagemap.txt for details.\n");
+ struct mm_struct *mm;
+
+ mm = proc_mem_open(inode, PTRACE_MODE_READ);
+ if (IS_ERR(mm))
+ return PTR_ERR(mm);
+ file->private_data = mm;
+ return 0;
+}
+
+static int pagemap_release(struct inode *inode, struct file *file)
+{
+ struct mm_struct *mm = file->private_data;
+
+ if (mm)
+ mmdrop(mm);
return 0;
}
@@ -1340,6 +1320,7 @@
.llseek = mem_lseek, /* borrow this */
.read = pagemap_read,
.open = pagemap_open,
+ .release = pagemap_release,
};
#endif /* CONFIG_PROC_PAGE_MONITOR */
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 331c1cc..c79b717 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -23,6 +23,7 @@
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/fs.h>
+#include <linux/dax.h>
#include <linux/buffer_head.h>
#include <linux/uio.h>
#include <linux/list_lru.h>
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index de2c237..e78feb4 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1546,8 +1546,36 @@
return ret;
}
+STATIC int
+xfs_filemap_pmd_fault(
+ struct vm_area_struct *vma,
+ unsigned long addr,
+ pmd_t *pmd,
+ unsigned int flags)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ struct xfs_inode *ip = XFS_I(inode);
+ int ret;
+
+ if (!IS_DAX(inode))
+ return VM_FAULT_FALLBACK;
+
+ trace_xfs_filemap_pmd_fault(ip);
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_direct,
+ xfs_end_io_dax_write);
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ sb_end_pagefault(inode->i_sb);
+
+ return ret;
+}
+
static const struct vm_operations_struct xfs_file_vm_ops = {
.fault = xfs_filemap_fault,
+ .pmd_fault = xfs_filemap_pmd_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = xfs_filemap_page_mkwrite,
};
@@ -1560,7 +1588,7 @@
file_accessed(filp);
vma->vm_ops = &xfs_file_vm_ops;
if (IS_DAX(file_inode(filp)))
- vma->vm_flags |= VM_MIXEDMAP;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
return 0;
}
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 9aeeb21..5ed36b1 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -687,6 +687,7 @@
DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
DEFINE_INODE_EVENT(xfs_filemap_fault);
+DEFINE_INODE_EVENT(xfs_filemap_pmd_fault);
DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
DECLARE_EVENT_CLASS(xfs_iref_class,
diff --git a/include/asm-generic/early_ioremap.h b/include/asm-generic/early_ioremap.h
index 316bd04..734ad4d 100644
--- a/include/asm-generic/early_ioremap.h
+++ b/include/asm-generic/early_ioremap.h
@@ -35,6 +35,12 @@
*/
extern void early_ioremap_reset(void);
+/*
+ * Early copy from unmapped memory to kernel mapped memory.
+ */
+extern void copy_from_early_mem(void *dest, phys_addr_t src,
+ unsigned long size);
+
#else
static inline void early_ioremap_init(void) { }
static inline void early_ioremap_setup(void) { }
diff --git a/include/linux/dax.h b/include/linux/dax.h
new file mode 100644
index 0000000..b415e52
--- /dev/null
+++ b/include/linux/dax.h
@@ -0,0 +1,39 @@
+#ifndef _LINUX_DAX_H
+#define _LINUX_DAX_H
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <asm/pgtable.h>
+
+ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
+ get_block_t, dio_iodone_t, int flags);
+int dax_clear_blocks(struct inode *, sector_t block, long size);
+int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
+int dax_truncate_page(struct inode *, loff_t from, get_block_t);
+int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
+ dax_iodone_t);
+int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
+ dax_iodone_t);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
+ unsigned int flags, get_block_t, dax_iodone_t);
+int __dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
+ unsigned int flags, get_block_t, dax_iodone_t);
+#else
+static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned int flags, get_block_t gb,
+ dax_iodone_t di)
+{
+ return VM_FAULT_FALLBACK;
+}
+#define __dax_pmd_fault dax_pmd_fault
+#endif
+int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
+#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod)
+#define __dax_mkwrite(vma, vmf, gb, iod) __dax_fault(vma, vmf, gb, iod)
+
+static inline bool vma_is_dax(struct vm_area_struct *vma)
+{
+ return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host);
+}
+#endif
diff --git a/include/linux/dmapool.h b/include/linux/dmapool.h
index e1043f7..53ba737 100644
--- a/include/linux/dmapool.h
+++ b/include/linux/dmapool.h
@@ -24,6 +24,12 @@
void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
dma_addr_t *handle);
+static inline void *dma_pool_zalloc(struct dma_pool *pool, gfp_t mem_flags,
+ dma_addr_t *handle)
+{
+ return dma_pool_alloc(pool, mem_flags | __GFP_ZERO, handle);
+}
+
void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t addr);
/*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b2f9b9c..72d8a84 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -52,7 +52,6 @@
struct seq_file;
struct workqueue_struct;
struct iov_iter;
-struct vm_fault;
extern void __init inode_init(void);
extern void __init inode_init_early(void);
@@ -2678,19 +2677,6 @@
extern int generic_file_open(struct inode * inode, struct file * filp);
extern int nonseekable_open(struct inode * inode, struct file * filp);
-ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
- get_block_t, dio_iodone_t, int flags);
-int dax_clear_blocks(struct inode *, sector_t block, long size);
-int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
-int dax_truncate_page(struct inode *, loff_t from, get_block_t);
-int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
- dax_iodone_t);
-int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
- dax_iodone_t);
-int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
-#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod)
-#define __dax_mkwrite(vma, vmf, gb, iod) __dax_fault(vma, vmf, gb, iod)
-
#ifdef CONFIG_BLOCK
typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
loff_t file_offset);
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index ad35f30..f92cbd2 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -63,7 +63,10 @@
* but it is definitely preferable to use the flag rather than opencode endless
* loop around allocator.
*
- * __GFP_NORETRY: The VM implementation must not retry indefinitely.
+ * __GFP_NORETRY: The VM implementation must not retry indefinitely and will
+ * return NULL when direct reclaim and memory compaction have failed to allow
+ * the allocation to succeed. The OOM killer is not called with the current
+ * implementation.
*
* __GFP_MOVABLE: Flag that this page will be movable by the page migration
* mechanism or reclaimed
@@ -300,22 +303,31 @@
return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
}
-static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
- unsigned int order)
+/*
+ * Allocate pages, preferring the node given as nid. The node must be valid and
+ * online. For more general interface, see alloc_pages_node().
+ */
+static inline struct page *
+__alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
{
- /* Unknown node is current node */
- if (nid < 0)
- nid = numa_node_id();
+ VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
+ VM_WARN_ON(!node_online(nid));
return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
}
-static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask,
+/*
+ * Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE,
+ * prefer the current CPU's closest node. Otherwise node must be valid and
+ * online.
+ */
+static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
unsigned int order)
{
- VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES || !node_online(nid));
+ if (nid == NUMA_NO_NODE)
+ nid = numa_mem_id();
- return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
+ return __alloc_pages_node(nid, gfp_mask, order);
}
#ifdef CONFIG_NUMA
@@ -354,7 +366,6 @@
void *alloc_pages_exact(size_t size, gfp_t gfp_mask);
void free_pages_exact(void *virt, size_t size);
-/* This is different from alloc_pages_exact_node !!! */
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
#define __get_free_page(gfp_mask) \
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index f10b20f..ecb080d6 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -33,6 +33,8 @@
extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, pgprot_t newprot,
int prot_numa);
+int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *,
+ unsigned long pfn, bool write);
enum transparent_hugepage_flag {
TRANSPARENT_HUGEPAGE_FLAG,
@@ -122,7 +124,7 @@
#endif
extern int hugepage_madvise(struct vm_area_struct *vma,
unsigned long *vm_flags, int advice);
-extern void __vma_adjust_trans_huge(struct vm_area_struct *vma,
+extern void vma_adjust_trans_huge(struct vm_area_struct *vma,
unsigned long start,
unsigned long end,
long adjust_next);
@@ -138,15 +140,6 @@
else
return 0;
}
-static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
- unsigned long start,
- unsigned long end,
- long adjust_next)
-{
- if (!vma->anon_vma || vma->vm_ops)
- return;
- __vma_adjust_trans_huge(vma, start, end, adjust_next);
-}
static inline int hpage_nr_pages(struct page *page)
{
if (unlikely(PageTransHuge(page)))
@@ -164,6 +157,13 @@
return ACCESS_ONCE(huge_zero_page) == page;
}
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+ return is_huge_zero_page(pmd_page(pmd));
+}
+
+struct page *get_huge_zero_page(void);
+
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d891f94..5e35379 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -35,6 +35,9 @@
struct kref refs;
spinlock_t lock;
struct list_head regions;
+ long adds_in_progress;
+ struct list_head region_cache;
+ long region_cache_count;
};
extern struct resv_map *resv_map_alloc(void);
void resv_map_release(struct kref *ref);
@@ -80,11 +83,18 @@
int hugetlb_reserve_pages(struct inode *inode, long from, long to,
struct vm_area_struct *vma,
vm_flags_t vm_flags);
-void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
+long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
+ long freed);
int dequeue_hwpoisoned_huge_page(struct page *page);
bool isolate_huge_page(struct page *page, struct list_head *list);
void putback_active_hugepage(struct page *page);
void free_huge_page(struct page *page);
+void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve);
+extern struct mutex *hugetlb_fault_mutex_table;
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ struct address_space *mapping,
+ pgoff_t idx, unsigned long address);
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
@@ -320,9 +330,13 @@
#endif
};
+struct page *alloc_huge_page(struct vm_area_struct *vma,
+ unsigned long addr, int avoid_reserve);
struct page *alloc_huge_page_node(struct hstate *h, int nid);
struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve);
+int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
+ pgoff_t idx);
/* arch callback */
int __init alloc_bootmem_huge_page(struct hstate *h);
@@ -471,6 +485,7 @@
#else /* CONFIG_HUGETLB_PAGE */
struct hstate {};
+#define alloc_huge_page(v, a, r) NULL
#define alloc_huge_page_node(h, nid) NULL
#define alloc_huge_page_noerr(v, a, r) NULL
#define alloc_bootmem_huge_page(h) NULL
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index cc4b019..c518eb5 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -77,6 +77,8 @@
int memblock_free(phys_addr_t base, phys_addr_t size);
int memblock_reserve(phys_addr_t base, phys_addr_t size);
void memblock_trim_memory(phys_addr_t align);
+bool memblock_overlaps_region(struct memblock_type *type,
+ phys_addr_t base, phys_addr_t size);
int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size);
int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
@@ -323,7 +325,7 @@
int memblock_is_memory(phys_addr_t addr);
int memblock_is_region_memory(phys_addr_t base, phys_addr_t size);
int memblock_is_reserved(phys_addr_t addr);
-int memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
+bool memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);
extern void __memblock_dump_all(void);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 73b02b0..d92b80b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -23,6 +23,11 @@
#include <linux/vm_event_item.h>
#include <linux/hardirq.h>
#include <linux/jump_label.h>
+#include <linux/page_counter.h>
+#include <linux/vmpressure.h>
+#include <linux/eventfd.h>
+#include <linux/mmzone.h>
+#include <linux/writeback.h>
struct mem_cgroup;
struct page;
@@ -67,12 +72,221 @@
MEMCG_NR_EVENTS,
};
+/*
+ * Per memcg event counter is incremented at every pagein/pageout. With THP,
+ * it will be incremated by the number of pages. This counter is used for
+ * for trigger some periodic events. This is straightforward and better
+ * than using jiffies etc. to handle periodic memcg event.
+ */
+enum mem_cgroup_events_target {
+ MEM_CGROUP_TARGET_THRESH,
+ MEM_CGROUP_TARGET_SOFTLIMIT,
+ MEM_CGROUP_TARGET_NUMAINFO,
+ MEM_CGROUP_NTARGETS,
+};
+
+/*
+ * Bits in struct cg_proto.flags
+ */
+enum cg_proto_flags {
+ /* Currently active and new sockets should be assigned to cgroups */
+ MEMCG_SOCK_ACTIVE,
+ /* It was ever activated; we must disarm static keys on destruction */
+ MEMCG_SOCK_ACTIVATED,
+};
+
+struct cg_proto {
+ struct page_counter memory_allocated; /* Current allocated memory. */
+ struct percpu_counter sockets_allocated; /* Current number of sockets. */
+ int memory_pressure;
+ long sysctl_mem[3];
+ unsigned long flags;
+ /*
+ * memcg field is used to find which memcg we belong directly
+ * Each memcg struct can hold more than one cg_proto, so container_of
+ * won't really cut.
+ *
+ * The elegant solution would be having an inverse function to
+ * proto_cgroup in struct proto, but that means polluting the structure
+ * for everybody, instead of just for memcg users.
+ */
+ struct mem_cgroup *memcg;
+};
+
#ifdef CONFIG_MEMCG
+struct mem_cgroup_stat_cpu {
+ long count[MEM_CGROUP_STAT_NSTATS];
+ unsigned long events[MEMCG_NR_EVENTS];
+ unsigned long nr_page_events;
+ unsigned long targets[MEM_CGROUP_NTARGETS];
+};
+
+struct mem_cgroup_reclaim_iter {
+ struct mem_cgroup *position;
+ /* scan generation, increased every round-trip */
+ unsigned int generation;
+};
+
+/*
+ * per-zone information in memory controller.
+ */
+struct mem_cgroup_per_zone {
+ struct lruvec lruvec;
+ unsigned long lru_size[NR_LRU_LISTS];
+
+ struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1];
+
+ struct rb_node tree_node; /* RB tree node */
+ unsigned long usage_in_excess;/* Set to the value by which */
+ /* the soft limit is exceeded*/
+ bool on_tree;
+ struct mem_cgroup *memcg; /* Back pointer, we cannot */
+ /* use container_of */
+};
+
+struct mem_cgroup_per_node {
+ struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_threshold {
+ struct eventfd_ctx *eventfd;
+ unsigned long threshold;
+};
+
+/* For threshold */
+struct mem_cgroup_threshold_ary {
+ /* An array index points to threshold just below or equal to usage. */
+ int current_threshold;
+ /* Size of entries[] */
+ unsigned int size;
+ /* Array of thresholds */
+ struct mem_cgroup_threshold entries[0];
+};
+
+struct mem_cgroup_thresholds {
+ /* Primary thresholds array */
+ struct mem_cgroup_threshold_ary *primary;
+ /*
+ * Spare threshold array.
+ * This is needed to make mem_cgroup_unregister_event() "never fail".
+ * It must be able to store at least primary->size - 1 entries.
+ */
+ struct mem_cgroup_threshold_ary *spare;
+};
+
+/*
+ * The memory controller data structure. The memory controller controls both
+ * page cache and RSS per cgroup. We would eventually like to provide
+ * statistics based on the statistics developed by Rik Van Riel for clock-pro,
+ * to help the administrator determine what knobs to tune.
+ */
+struct mem_cgroup {
+ struct cgroup_subsys_state css;
+
+ /* Accounted resources */
+ struct page_counter memory;
+ struct page_counter memsw;
+ struct page_counter kmem;
+
+ /* Normal memory consumption range */
+ unsigned long low;
+ unsigned long high;
+
+ unsigned long soft_limit;
+
+ /* vmpressure notifications */
+ struct vmpressure vmpressure;
+
+ /* css_online() has been completed */
+ int initialized;
+
+ /*
+ * Should the accounting and control be hierarchical, per subtree?
+ */
+ bool use_hierarchy;
+
+ /* protected by memcg_oom_lock */
+ bool oom_lock;
+ int under_oom;
+
+ int swappiness;
+ /* OOM-Killer disable */
+ int oom_kill_disable;
+
+ /* protect arrays of thresholds */
+ struct mutex thresholds_lock;
+
+ /* thresholds for memory usage. RCU-protected */
+ struct mem_cgroup_thresholds thresholds;
+
+ /* thresholds for mem+swap usage. RCU-protected */
+ struct mem_cgroup_thresholds memsw_thresholds;
+
+ /* For oom notifier event fd */
+ struct list_head oom_notify;
+
+ /*
+ * Should we move charges of a task when a task is moved into this
+ * mem_cgroup ? And what type of charges should we move ?
+ */
+ unsigned long move_charge_at_immigrate;
+ /*
+ * set > 0 if pages under this cgroup are moving to other cgroup.
+ */
+ atomic_t moving_account;
+ /* taken only while moving_account > 0 */
+ spinlock_t move_lock;
+ struct task_struct *move_lock_task;
+ unsigned long move_lock_flags;
+ /*
+ * percpu counter.
+ */
+ struct mem_cgroup_stat_cpu __percpu *stat;
+ spinlock_t pcp_counter_lock;
+
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
+ struct cg_proto tcp_mem;
+#endif
+#if defined(CONFIG_MEMCG_KMEM)
+ /* Index in the kmem_cache->memcg_params.memcg_caches array */
+ int kmemcg_id;
+ bool kmem_acct_activated;
+ bool kmem_acct_active;
+#endif
+
+ int last_scanned_node;
+#if MAX_NUMNODES > 1
+ nodemask_t scan_nodes;
+ atomic_t numainfo_events;
+ atomic_t numainfo_updating;
+#endif
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+ struct list_head cgwb_list;
+ struct wb_domain cgwb_domain;
+#endif
+
+ /* List of events which userspace want to receive */
+ struct list_head event_list;
+ spinlock_t event_list_lock;
+
+ struct mem_cgroup_per_node *nodeinfo[0];
+ /* WARNING: nodeinfo must be the last member here */
+};
extern struct cgroup_subsys_state *mem_cgroup_root_css;
-void mem_cgroup_events(struct mem_cgroup *memcg,
+/**
+ * mem_cgroup_events - count memory events against a cgroup
+ * @memcg: the memory cgroup
+ * @idx: the event index
+ * @nr: the number of events to account for
+ */
+static inline void mem_cgroup_events(struct mem_cgroup *memcg,
enum mem_cgroup_events_index idx,
- unsigned int nr);
+ unsigned int nr)
+{
+ this_cpu_add(memcg->stat->events[idx], nr);
+}
bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
@@ -90,15 +304,31 @@
struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
-bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
- struct mem_cgroup *root);
bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
-extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
-extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
+struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
+struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
-extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
-extern struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css);
+struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
+static inline
+struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
+ return css ? container_of(css, struct mem_cgroup, css) : NULL;
+}
+
+struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
+ struct mem_cgroup *,
+ struct mem_cgroup_reclaim_cookie *);
+void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
+
+static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
+ struct mem_cgroup *root)
+{
+ if (root == memcg)
+ return true;
+ if (!root->use_hierarchy)
+ return false;
+ return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
+}
static inline bool mm_match_cgroup(struct mm_struct *mm,
struct mem_cgroup *memcg)
@@ -114,24 +344,67 @@
return match;
}
-extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
-extern struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
+struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
-struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
- struct mem_cgroup *,
- struct mem_cgroup_reclaim_cookie *);
-void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
+static inline bool mem_cgroup_disabled(void)
+{
+ if (memory_cgrp_subsys.disabled)
+ return true;
+ return false;
+}
/*
* For memory reclaim.
*/
-int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec);
-bool mem_cgroup_lruvec_online(struct lruvec *lruvec);
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
-unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
-void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
-extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
- struct task_struct *p);
+
+void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
+ int nr_pages);
+
+static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
+{
+ struct mem_cgroup_per_zone *mz;
+ struct mem_cgroup *memcg;
+
+ if (mem_cgroup_disabled())
+ return true;
+
+ mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+ memcg = mz->memcg;
+
+ return !!(memcg->css.flags & CSS_ONLINE);
+}
+
+static inline
+unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+{
+ struct mem_cgroup_per_zone *mz;
+
+ mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+ return mz->lru_size[lru];
+}
+
+static inline int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
+{
+ unsigned long inactive_ratio;
+ unsigned long inactive;
+ unsigned long active;
+ unsigned long gb;
+
+ inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
+ active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
+
+ gb = (inactive + active) >> (30 - PAGE_SHIFT);
+ if (gb)
+ inactive_ratio = int_sqrt(10 * gb);
+ else
+ inactive_ratio = 1;
+
+ return inactive * inactive_ratio < active;
+}
+
+void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
+ struct task_struct *p);
static inline void mem_cgroup_oom_enable(void)
{
@@ -156,18 +429,26 @@
extern int do_swap_account;
#endif
-static inline bool mem_cgroup_disabled(void)
-{
- if (memory_cgrp_subsys.disabled)
- return true;
- return false;
-}
-
struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page);
-void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
- enum mem_cgroup_stat_index idx, int val);
void mem_cgroup_end_page_stat(struct mem_cgroup *memcg);
+/**
+ * mem_cgroup_update_page_stat - update page state statistics
+ * @memcg: memcg to account against
+ * @idx: page state item to account
+ * @val: number of pages (positive or negative)
+ *
+ * See mem_cgroup_begin_page_stat() for locking requirements.
+ */
+static inline void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
+ enum mem_cgroup_stat_index idx, int val)
+{
+ VM_BUG_ON(!rcu_read_lock_held());
+
+ if (memcg)
+ this_cpu_add(memcg->stat->count[idx], val);
+}
+
static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg,
enum mem_cgroup_stat_index idx)
{
@@ -184,13 +465,31 @@
gfp_t gfp_mask,
unsigned long *total_scanned);
-void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
enum vm_event_item idx)
{
+ struct mem_cgroup *memcg;
+
if (mem_cgroup_disabled())
return;
- __mem_cgroup_count_vm_event(mm, idx);
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ if (unlikely(!memcg))
+ goto out;
+
+ switch (idx) {
+ case PGFAULT:
+ this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
+ break;
+ case PGMAJFAULT:
+ this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
+ break;
+ default:
+ BUG();
+ }
+out:
+ rcu_read_unlock();
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void mem_cgroup_split_huge_fixup(struct page *head);
@@ -199,8 +498,6 @@
#else /* CONFIG_MEMCG */
struct mem_cgroup;
-#define mem_cgroup_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
-
static inline void mem_cgroup_events(struct mem_cgroup *memcg,
enum mem_cgroup_events_index idx,
unsigned int nr)
@@ -275,12 +572,6 @@
return true;
}
-static inline struct cgroup_subsys_state
- *mem_cgroup_css(struct mem_cgroup *memcg)
-{
- return NULL;
-}
-
static inline struct mem_cgroup *
mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
@@ -428,8 +719,8 @@
extern struct static_key memcg_kmem_enabled_key;
extern int memcg_nr_cache_ids;
-extern void memcg_get_cache_ids(void);
-extern void memcg_put_cache_ids(void);
+void memcg_get_cache_ids(void);
+void memcg_put_cache_ids(void);
/*
* Helper macro to loop through all memcg-specific caches. Callers must still
@@ -444,7 +735,10 @@
return static_key_false(&memcg_kmem_enabled_key);
}
-bool memcg_kmem_is_active(struct mem_cgroup *memcg);
+static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg)
+{
+ return memcg->kmem_acct_active;
+}
/*
* In general, we'll do everything in our power to not incur in any overhead
@@ -463,7 +757,15 @@
struct mem_cgroup *memcg, int order);
void __memcg_kmem_uncharge_pages(struct page *page, int order);
-int memcg_cache_id(struct mem_cgroup *memcg);
+/*
+ * helper for acessing a memcg's index. It will be used as an index in the
+ * child cache array in kmem_cache, and also to derive its name. This function
+ * will return -1 when this is not a kmem-limited memcg.
+ */
+static inline int memcg_cache_id(struct mem_cgroup *memcg)
+{
+ return memcg ? memcg->kmemcg_id : -1;
+}
struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep);
void __memcg_kmem_put_cache(struct kmem_cache *cachep);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1171a29..f25a957 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -249,6 +249,8 @@
void (*close)(struct vm_area_struct * area);
int (*mremap)(struct vm_area_struct * area);
int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
+ int (*pmd_fault)(struct vm_area_struct *, unsigned long address,
+ pmd_t *, unsigned int flags);
void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf);
/* notification that a previously read-only page is about to become
@@ -307,18 +309,6 @@
#define page_private(page) ((page)->private)
#define set_page_private(page, v) ((page)->private = (v))
-/* It's valid only if the page is free path or free_list */
-static inline void set_freepage_migratetype(struct page *page, int migratetype)
-{
- page->index = migratetype;
-}
-
-/* It's valid only if the page is free path or free_list */
-static inline int get_freepage_migratetype(struct page *page)
-{
- return page->index;
-}
-
/*
* FIXME: take this include out, include page-flags.h in
* files which need it (119 of them)
@@ -359,18 +349,6 @@
return atomic_inc_not_zero(&page->_count);
}
-/*
- * Try to drop a ref unless the page has a refcount of one, return false if
- * that is the case.
- * This is to make sure that the refcount won't become zero after this drop.
- * This can be called when MMU is off so it must not access
- * any of the virtual mappings.
- */
-static inline int put_page_unless_one(struct page *page)
-{
- return atomic_add_unless(&page->_count, -1, 1);
-}
-
extern int page_is_ram(unsigned long pfn);
enum {
@@ -1267,6 +1245,11 @@
return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN);
}
+static inline bool vma_is_anonymous(struct vm_area_struct *vma)
+{
+ return !vma->vm_ops;
+}
+
static inline int stack_guard_page_start(struct vm_area_struct *vma,
unsigned long addr)
{
@@ -2193,6 +2176,7 @@
extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
extern int unpoison_memory(unsigned long pfn);
extern int get_hwpoison_page(struct page *page);
+extern void put_hwpoison_page(struct page *page);
extern int sysctl_memory_failure_early_kill;
extern int sysctl_memory_failure_recovery;
extern void shake_page(struct page *p, int access);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index c8d0a73..3d6baa7 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -235,7 +235,7 @@
bool pfmemalloc;
};
-typedef unsigned long __nocast vm_flags_t;
+typedef unsigned long vm_flags_t;
/*
* A region containing a mapping of a non-memory backed file under NOMMU
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 7deecb7b..03e6257 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -13,6 +13,27 @@
struct task_struct;
/*
+ * Details of the page allocation that triggered the oom killer that are used to
+ * determine what should be killed.
+ */
+struct oom_control {
+ /* Used to determine cpuset */
+ struct zonelist *zonelist;
+
+ /* Used to determine mempolicy */
+ nodemask_t *nodemask;
+
+ /* Used to determine cpuset and node locality requirement */
+ const gfp_t gfp_mask;
+
+ /*
+ * order == -1 means the oom kill is required by sysrq, otherwise only
+ * for display purposes.
+ */
+ const int order;
+};
+
+/*
* Types of limitations to the nodes from which allocations may occur
*/
enum oom_constraint {
@@ -57,21 +78,18 @@
extern int oom_kills_count(void);
extern void note_oom_kill(void);
-extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+extern void oom_kill_process(struct oom_control *oc, struct task_struct *p,
unsigned int points, unsigned long totalpages,
- struct mem_cgroup *memcg, nodemask_t *nodemask,
- const char *message);
+ struct mem_cgroup *memcg, const char *message);
-extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
- int order, const nodemask_t *nodemask,
+extern void check_panic_on_oom(struct oom_control *oc,
+ enum oom_constraint constraint,
struct mem_cgroup *memcg);
-extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
- unsigned long totalpages, const nodemask_t *nodemask,
- bool force_kill);
+extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
+ struct task_struct *task, unsigned long totalpages);
-extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
- int order, nodemask_t *mask, bool force_kill);
+extern bool out_of_memory(struct oom_control *oc);
extern void exit_oom_victim(void);
diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 2dc1e16..047d647 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -65,11 +65,6 @@
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
bool skip_hwpoisoned_pages);
-/*
- * Internal functions. Changes pageblock's migrate type.
- */
-int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages);
-void unset_migratetype_isolate(struct page *page, unsigned migratetype);
struct page *alloc_migrate_target(struct page *page, unsigned long private,
int **resultp);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 1a64733..e90eb22 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1227,6 +1227,8 @@
dma_pool_create(name, &pdev->dev, size, align, allocation)
#define pci_pool_destroy(pool) dma_pool_destroy(pool)
#define pci_pool_alloc(pool, flags, handle) dma_pool_alloc(pool, flags, handle)
+#define pci_pool_zalloc(pool, flags, handle) \
+ dma_pool_zalloc(pool, flags, handle)
#define pci_pool_free(pool, vaddr, addr) dma_pool_free(pool, vaddr, addr)
struct msix_entry {
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 31496d2..7ba7dcca 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -351,7 +351,15 @@
extern int kswapd_run(int nid);
extern void kswapd_stop(int nid);
#ifdef CONFIG_MEMCG
-extern int mem_cgroup_swappiness(struct mem_cgroup *mem);
+static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
+{
+ /* root ? */
+ if (mem_cgroup_disabled() || !memcg->css.parent)
+ return vm_swappiness;
+
+ return memcg->swappiness;
+}
+
#else
static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
{
@@ -398,6 +406,9 @@
extern struct page *lookup_swap_cache(swp_entry_t);
extern struct page *read_swap_cache_async(swp_entry_t, gfp_t,
struct vm_area_struct *vma, unsigned long addr);
+extern struct page *__read_swap_cache_async(swp_entry_t, gfp_t,
+ struct vm_area_struct *vma, unsigned long addr,
+ bool *new_page_allocated);
extern struct page *swapin_readahead(swp_entry_t, gfp_t,
struct vm_area_struct *vma, unsigned long addr);
@@ -431,6 +442,7 @@
extern sector_t map_swap_page(struct page *, struct block_device **);
extern sector_t swapdev_block(int, pgoff_t);
extern int page_swapcount(struct page *);
+extern int swp_swapcount(swp_entry_t entry);
extern struct swap_info_struct *page_swap_info(struct page *);
extern int reuse_swap_page(struct page *);
extern int try_to_free_swap(struct page *);
@@ -522,6 +534,11 @@
return 0;
}
+static inline int swp_swapcount(swp_entry_t entry)
+{
+ return 0;
+}
+
#define reuse_swap_page(page) (page_mapcount(page) == 1)
static inline int try_to_free_swap(struct page *page)
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index cedf3d3..5c3a5f3e 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -164,6 +164,9 @@
#endif
#ifdef CONFIG_MEMORY_FAILURE
+
+extern atomic_long_t num_poisoned_pages __read_mostly;
+
/*
* Support for hardware poisoned pages
*/
@@ -177,6 +180,31 @@
{
return swp_type(entry) == SWP_HWPOISON;
}
+
+static inline bool test_set_page_hwpoison(struct page *page)
+{
+ return TestSetPageHWPoison(page);
+}
+
+static inline void num_poisoned_pages_inc(void)
+{
+ atomic_long_inc(&num_poisoned_pages);
+}
+
+static inline void num_poisoned_pages_dec(void)
+{
+ atomic_long_dec(&num_poisoned_pages);
+}
+
+static inline void num_poisoned_pages_add(long num)
+{
+ atomic_long_add(num, &num_poisoned_pages);
+}
+
+static inline void num_poisoned_pages_sub(long num)
+{
+ atomic_long_sub(num, &num_poisoned_pages);
+}
#else
static inline swp_entry_t make_hwpoison_entry(struct page *page)
@@ -188,6 +216,15 @@
{
return 0;
}
+
+static inline bool test_set_page_hwpoison(struct page *page)
+{
+ return false;
+}
+
+static inline void num_poisoned_pages_inc(void)
+{
+}
#endif
#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION)
diff --git a/include/linux/zbud.h b/include/linux/zbud.h
index f9d41a6..e183a0a 100644
--- a/include/linux/zbud.h
+++ b/include/linux/zbud.h
@@ -9,7 +9,7 @@
int (*evict)(struct zbud_pool *pool, unsigned long handle);
};
-struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops);
+struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops);
void zbud_destroy_pool(struct zbud_pool *pool);
int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
unsigned long *handle);
diff --git a/include/linux/zpool.h b/include/linux/zpool.h
index d30eff3..c924a28 100644
--- a/include/linux/zpool.h
+++ b/include/linux/zpool.h
@@ -37,7 +37,7 @@
};
struct zpool *zpool_create_pool(char *type, char *name,
- gfp_t gfp, struct zpool_ops *ops);
+ gfp_t gfp, const struct zpool_ops *ops);
char *zpool_get_type(struct zpool *pool);
@@ -81,7 +81,7 @@
atomic_t refcount;
struct list_head list;
- void *(*create)(char *name, gfp_t gfp, struct zpool_ops *ops,
+ void *(*create)(char *name, gfp_t gfp, const struct zpool_ops *ops,
struct zpool *zpool);
void (*destroy)(void *pool);
diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index 1338190..6398dfa 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -34,6 +34,11 @@
*/
};
+struct zs_pool_stats {
+ /* How many pages were migrated (freed) */
+ unsigned long pages_compacted;
+};
+
struct zs_pool;
struct zs_pool *zs_create_pool(char *name, gfp_t flags);
@@ -49,4 +54,5 @@
unsigned long zs_get_total_pages(struct zs_pool *pool);
unsigned long zs_compact(struct zs_pool *pool);
+void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats);
#endif
diff --git a/include/net/sock.h b/include/net/sock.h
index 43c6abc..7aa7844 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1042,42 +1042,9 @@
#endif
};
-/*
- * Bits in struct cg_proto.flags
- */
-enum cg_proto_flags {
- /* Currently active and new sockets should be assigned to cgroups */
- MEMCG_SOCK_ACTIVE,
- /* It was ever activated; we must disarm static keys on destruction */
- MEMCG_SOCK_ACTIVATED,
-};
-
-struct cg_proto {
- struct page_counter memory_allocated; /* Current allocated memory. */
- struct percpu_counter sockets_allocated; /* Current number of sockets. */
- int memory_pressure;
- long sysctl_mem[3];
- unsigned long flags;
- /*
- * memcg field is used to find which memcg we belong directly
- * Each memcg struct can hold more than one cg_proto, so container_of
- * won't really cut.
- *
- * The elegant solution would be having an inverse function to
- * proto_cgroup in struct proto, but that means polluting the structure
- * for everybody, instead of just for memcg users.
- */
- struct mem_cgroup *memcg;
-};
-
int proto_register(struct proto *prot, int alloc_slab);
void proto_unregister(struct proto *prot);
-static inline bool memcg_proto_active(struct cg_proto *cg_proto)
-{
- return test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
-}
-
#ifdef SOCK_REFCNT_DEBUG
static inline void sk_refcnt_debug_inc(struct sock *sk)
{
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a8538e4..2cf0f79 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1342,7 +1342,7 @@
if (root != &cgrp_dfl_root)
for_each_subsys(ss, ssid)
if (root->subsys_mask & (1 << ssid))
- seq_show_option(seq, ss->name, NULL);
+ seq_show_option(seq, ss->legacy_name, NULL);
if (root->flags & CGRP_ROOT_NOPREFIX)
seq_puts(seq, ",noprefix");
if (root->flags & CGRP_ROOT_XATTR)
diff --git a/kernel/profile.c b/kernel/profile.c
index a7bcd28..99513e11 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -339,7 +339,7 @@
node = cpu_to_mem(cpu);
per_cpu(cpu_profile_flip, cpu) = 0;
if (!per_cpu(cpu_profile_hits, cpu)[1]) {
- page = alloc_pages_exact_node(node,
+ page = __alloc_pages_node(node,
GFP_KERNEL | __GFP_ZERO,
0);
if (!page)
@@ -347,7 +347,7 @@
per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
}
if (!per_cpu(cpu_profile_hits, cpu)[0]) {
- page = alloc_pages_exact_node(node,
+ page = __alloc_pages_node(node,
GFP_KERNEL | __GFP_ZERO,
0);
if (!page)
@@ -543,14 +543,14 @@
int node = cpu_to_mem(cpu);
struct page *page;
- page = alloc_pages_exact_node(node,
+ page = __alloc_pages_node(node,
GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
0);
if (!page)
goto out_cleanup;
per_cpu(cpu_profile_hits, cpu)[1]
= (struct profile_hit *)page_address(page);
- page = alloc_pages_exact_node(node,
+ page = __alloc_pages_node(node,
GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
0);
if (!page)
diff --git a/lib/show_mem.c b/lib/show_mem.c
index adc98e18..1feed6a 100644
--- a/lib/show_mem.c
+++ b/lib/show_mem.c
@@ -38,11 +38,9 @@
printk("%lu pages RAM\n", total);
printk("%lu pages HighMem/MovableOnly\n", highmem);
-#ifdef CONFIG_CMA
- printk("%lu pages reserved\n", (reserved - totalcma_pages));
- printk("%lu pages cma reserved\n", totalcma_pages);
-#else
printk("%lu pages reserved\n", reserved);
+#ifdef CONFIG_CMA
+ printk("%lu pages cma reserved\n", totalcma_pages);
#endif
#ifdef CONFIG_QUICKLIST
printk("%lu pages in pagetable cache\n",
diff --git a/mm/bootmem.c b/mm/bootmem.c
index a23dd19..3b63807 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -236,6 +236,7 @@
count += pages;
while (pages--)
__free_pages_bootmem(page++, cur++, 0);
+ bdata->node_bootmem_map = NULL;
bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
@@ -294,6 +295,9 @@
sidx + bdata->node_min_pfn,
eidx + bdata->node_min_pfn);
+ if (WARN_ON(bdata->node_bootmem_map == NULL))
+ return;
+
if (bdata->hint_idx > sidx)
bdata->hint_idx = sidx;
@@ -314,6 +318,9 @@
eidx + bdata->node_min_pfn,
flags);
+ if (WARN_ON(bdata->node_bootmem_map == NULL))
+ return 0;
+
for (idx = sidx; idx < eidx; idx++)
if (test_and_set_bit(idx, bdata->node_bootmem_map)) {
if (exclusive) {
diff --git a/mm/compaction.c b/mm/compaction.c
index 018f08d..c5c627a 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -207,6 +207,13 @@
return !get_pageblock_skip(page);
}
+static void reset_cached_positions(struct zone *zone)
+{
+ zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
+ zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
+ zone->compact_cached_free_pfn = zone_end_pfn(zone);
+}
+
/*
* This function is called to clear all cached information on pageblocks that
* should be skipped for page isolation when the migrate and free page scanner
@@ -218,9 +225,6 @@
unsigned long end_pfn = zone_end_pfn(zone);
unsigned long pfn;
- zone->compact_cached_migrate_pfn[0] = start_pfn;
- zone->compact_cached_migrate_pfn[1] = start_pfn;
- zone->compact_cached_free_pfn = end_pfn;
zone->compact_blockskip_flush = false;
/* Walk the zone and mark every pageblock as suitable for isolation */
@@ -238,6 +242,8 @@
clear_pageblock_skip(page);
}
+
+ reset_cached_positions(zone);
}
void reset_isolation_suitable(pg_data_t *pgdat)
@@ -431,6 +437,24 @@
if (!valid_page)
valid_page = page;
+
+ /*
+ * For compound pages such as THP and hugetlbfs, we can save
+ * potentially a lot of iterations if we skip them at once.
+ * The check is racy, but we can consider only valid values
+ * and the only danger is skipping too much.
+ */
+ if (PageCompound(page)) {
+ unsigned int comp_order = compound_order(page);
+
+ if (likely(comp_order < MAX_ORDER)) {
+ blockpfn += (1UL << comp_order) - 1;
+ cursor += (1UL << comp_order) - 1;
+ }
+
+ goto isolate_fail;
+ }
+
if (!PageBuddy(page))
goto isolate_fail;
@@ -490,6 +514,13 @@
}
+ /*
+ * There is a tiny chance that we have read bogus compound_order(),
+ * so be careful to not go outside of the pageblock.
+ */
+ if (unlikely(blockpfn > end_pfn))
+ blockpfn = end_pfn;
+
trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
nr_scanned, total_isolated);
@@ -674,6 +705,8 @@
/* Time to isolate some pages for migration */
for (; low_pfn < end_pfn; low_pfn++) {
+ bool is_lru;
+
/*
* Periodically drop the lock (if held) regardless of its
* contention, to give chance to IRQs. Abort async compaction
@@ -717,36 +750,35 @@
* It's possible to migrate LRU pages and balloon pages
* Skip any other type of page
*/
- if (!PageLRU(page)) {
+ is_lru = PageLRU(page);
+ if (!is_lru) {
if (unlikely(balloon_page_movable(page))) {
if (balloon_page_isolate(page)) {
/* Successfully isolated */
goto isolate_success;
}
}
- continue;
}
/*
- * PageLRU is set. lru_lock normally excludes isolation
- * splitting and collapsing (collapsing has already happened
- * if PageLRU is set) but the lock is not necessarily taken
- * here and it is wasteful to take it just to check transhuge.
- * Check TransHuge without lock and skip the whole pageblock if
- * it's either a transhuge or hugetlbfs page, as calling
- * compound_order() without preventing THP from splitting the
- * page underneath us may return surprising results.
+ * Regardless of being on LRU, compound pages such as THP and
+ * hugetlbfs are not to be compacted. We can potentially save
+ * a lot of iterations if we skip them at once. The check is
+ * racy, but we can consider only valid values and the only
+ * danger is skipping too much.
*/
- if (PageTransHuge(page)) {
- if (!locked)
- low_pfn = ALIGN(low_pfn + 1,
- pageblock_nr_pages) - 1;
- else
- low_pfn += (1 << compound_order(page)) - 1;
+ if (PageCompound(page)) {
+ unsigned int comp_order = compound_order(page);
+
+ if (likely(comp_order < MAX_ORDER))
+ low_pfn += (1UL << comp_order) - 1;
continue;
}
+ if (!is_lru)
+ continue;
+
/*
* Migration will fail if an anonymous page is pinned in memory,
* so avoid taking lru_lock and isolating it unnecessarily in an
@@ -763,11 +795,17 @@
if (!locked)
break;
- /* Recheck PageLRU and PageTransHuge under lock */
+ /* Recheck PageLRU and PageCompound under lock */
if (!PageLRU(page))
continue;
- if (PageTransHuge(page)) {
- low_pfn += (1 << compound_order(page)) - 1;
+
+ /*
+ * Page become compound since the non-locked check,
+ * and it's on LRU. It can only be a THP so the order
+ * is safe to read and it's 0 for tail pages.
+ */
+ if (unlikely(PageCompound(page))) {
+ low_pfn += (1UL << compound_order(page)) - 1;
continue;
}
}
@@ -778,7 +816,7 @@
if (__isolate_lru_page(page, isolate_mode) != 0)
continue;
- VM_BUG_ON_PAGE(PageTransCompound(page), page);
+ VM_BUG_ON_PAGE(PageCompound(page), page);
/* Successfully isolated */
del_page_from_lru_list(page, lruvec, page_lru(page));
@@ -898,6 +936,16 @@
}
/*
+ * Test whether the free scanner has reached the same or lower pageblock than
+ * the migration scanner, and compaction should thus terminate.
+ */
+static inline bool compact_scanners_met(struct compact_control *cc)
+{
+ return (cc->free_pfn >> pageblock_order)
+ <= (cc->migrate_pfn >> pageblock_order);
+}
+
+/*
* Based on information in the current compact_control, find blocks
* suitable for isolating free pages from and then isolate them.
*/
@@ -933,8 +981,7 @@
* pages on cc->migratepages. We stop searching if the migrate
* and free page scanners meet or enough free pages are isolated.
*/
- for (; block_start_pfn >= low_pfn &&
- cc->nr_migratepages > cc->nr_freepages;
+ for (; block_start_pfn >= low_pfn;
block_end_pfn = block_start_pfn,
block_start_pfn -= pageblock_nr_pages,
isolate_start_pfn = block_start_pfn) {
@@ -966,6 +1013,8 @@
block_end_pfn, freelist, false);
/*
+ * If we isolated enough freepages, or aborted due to async
+ * compaction being contended, terminate the loop.
* Remember where the free scanner should restart next time,
* which is where isolate_freepages_block() left off.
* But if it scanned the whole pageblock, isolate_start_pfn
@@ -974,27 +1023,31 @@
* In that case we will however want to restart at the start
* of the previous pageblock.
*/
- cc->free_pfn = (isolate_start_pfn < block_end_pfn) ?
- isolate_start_pfn :
- block_start_pfn - pageblock_nr_pages;
-
- /*
- * isolate_freepages_block() might have aborted due to async
- * compaction being contended
- */
- if (cc->contended)
+ if ((cc->nr_freepages >= cc->nr_migratepages)
+ || cc->contended) {
+ if (isolate_start_pfn >= block_end_pfn)
+ isolate_start_pfn =
+ block_start_pfn - pageblock_nr_pages;
break;
+ } else {
+ /*
+ * isolate_freepages_block() should not terminate
+ * prematurely unless contended, or isolated enough
+ */
+ VM_BUG_ON(isolate_start_pfn < block_end_pfn);
+ }
}
/* split_free_page does not map the pages */
map_pages(freelist);
/*
- * If we crossed the migrate scanner, we want to keep it that way
- * so that compact_finished() may detect this
+ * Record where the free scanner will restart next time. Either we
+ * broke from the loop and set isolate_start_pfn based on the last
+ * call to isolate_freepages_block(), or we met the migration scanner
+ * and the loop terminated due to isolate_start_pfn < low_pfn
*/
- if (block_start_pfn < low_pfn)
- cc->free_pfn = cc->migrate_pfn;
+ cc->free_pfn = isolate_start_pfn;
}
/*
@@ -1062,6 +1115,7 @@
struct compact_control *cc)
{
unsigned long low_pfn, end_pfn;
+ unsigned long isolate_start_pfn;
struct page *page;
const isolate_mode_t isolate_mode =
(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
@@ -1110,6 +1164,7 @@
continue;
/* Perform the isolation */
+ isolate_start_pfn = low_pfn;
low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
isolate_mode);
@@ -1119,6 +1174,15 @@
}
/*
+ * Record where we could have freed pages by migration and not
+ * yet flushed them to buddy allocator.
+ * - this is the lowest page that could have been isolated and
+ * then freed by migration.
+ */
+ if (cc->nr_migratepages && !cc->last_migrated_pfn)
+ cc->last_migrated_pfn = isolate_start_pfn;
+
+ /*
* Either we isolated something and proceed with migration. Or
* we failed and compact_zone should decide if we should
* continue or not.
@@ -1127,12 +1191,8 @@
}
acct_isolated(zone, cc);
- /*
- * Record where migration scanner will be restarted. If we end up in
- * the same pageblock as the free scanner, make the scanners fully
- * meet so that compact_finished() terminates compaction.
- */
- cc->migrate_pfn = (end_pfn <= cc->free_pfn) ? low_pfn : cc->free_pfn;
+ /* Record where migration scanner will be restarted. */
+ cc->migrate_pfn = low_pfn;
return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}
@@ -1147,11 +1207,9 @@
return COMPACT_PARTIAL;
/* Compaction run completes if the migrate and free scanner meet */
- if (cc->free_pfn <= cc->migrate_pfn) {
+ if (compact_scanners_met(cc)) {
/* Let the next compaction start anew. */
- zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
- zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
- zone->compact_cached_free_pfn = zone_end_pfn(zone);
+ reset_cached_positions(zone);
/*
* Mark that the PG_migrate_skip information should be cleared
@@ -1295,7 +1353,6 @@
unsigned long end_pfn = zone_end_pfn(zone);
const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
const bool sync = cc->mode != MIGRATE_ASYNC;
- unsigned long last_migrated_pfn = 0;
ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
cc->classzone_idx);
@@ -1333,6 +1390,7 @@
zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
}
+ cc->last_migrated_pfn = 0;
trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
cc->free_pfn, end_pfn, sync);
@@ -1342,7 +1400,6 @@
while ((ret = compact_finished(zone, cc, migratetype)) ==
COMPACT_CONTINUE) {
int err;
- unsigned long isolate_start_pfn = cc->migrate_pfn;
switch (isolate_migratepages(zone, cc)) {
case ISOLATE_ABORT:
@@ -1376,22 +1433,12 @@
* migrate_pages() may return -ENOMEM when scanners meet
* and we want compact_finished() to detect it
*/
- if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
+ if (err == -ENOMEM && !compact_scanners_met(cc)) {
ret = COMPACT_PARTIAL;
goto out;
}
}
- /*
- * Record where we could have freed pages by migration and not
- * yet flushed them to buddy allocator. We use the pfn that
- * isolate_migratepages() started from in this loop iteration
- * - this is the lowest page that could have been isolated and
- * then freed by migration.
- */
- if (!last_migrated_pfn)
- last_migrated_pfn = isolate_start_pfn;
-
check_drain:
/*
* Has the migration scanner moved away from the previous
@@ -1400,18 +1447,18 @@
* compact_finished() can detect immediately if allocation
* would succeed.
*/
- if (cc->order > 0 && last_migrated_pfn) {
+ if (cc->order > 0 && cc->last_migrated_pfn) {
int cpu;
unsigned long current_block_start =
cc->migrate_pfn & ~((1UL << cc->order) - 1);
- if (last_migrated_pfn < current_block_start) {
+ if (cc->last_migrated_pfn < current_block_start) {
cpu = get_cpu();
lru_add_drain_cpu(cpu);
drain_local_pages(zone);
put_cpu();
/* No more flushing until we migrate again */
- last_migrated_pfn = 0;
+ cc->last_migrated_pfn = 0;
}
}
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 59d10d1..71a8998 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -271,6 +271,9 @@
{
bool empty = false;
+ if (unlikely(!pool))
+ return;
+
mutex_lock(&pools_reg_lock);
mutex_lock(&pools_lock);
list_del(&pool->pools);
@@ -334,7 +337,7 @@
/* pool_alloc_page() might sleep, so temporarily drop &pool->lock */
spin_unlock_irqrestore(&pool->lock, flags);
- page = pool_alloc_page(pool, mem_flags);
+ page = pool_alloc_page(pool, mem_flags & (~__GFP_ZERO));
if (!page)
return NULL;
@@ -372,9 +375,14 @@
break;
}
}
- memset(retval, POOL_POISON_ALLOCATED, pool->size);
+ if (!(mem_flags & __GFP_ZERO))
+ memset(retval, POOL_POISON_ALLOCATED, pool->size);
#endif
spin_unlock_irqrestore(&pool->lock, flags);
+
+ if (mem_flags & __GFP_ZERO)
+ memset(retval, 0, pool->size);
+
return retval;
}
EXPORT_SYMBOL(dma_pool_alloc);
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
index 0cfadaf..23f744d 100644
--- a/mm/early_ioremap.c
+++ b/mm/early_ioremap.c
@@ -224,6 +224,28 @@
return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO);
}
#endif
+
+#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
+
+void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size)
+{
+ unsigned long slop, clen;
+ char *p;
+
+ while (size) {
+ slop = src & ~PAGE_MASK;
+ clen = size;
+ if (clen > MAX_MAP_CHUNK - slop)
+ clen = MAX_MAP_CHUNK - slop;
+ p = early_memremap(src & PAGE_MASK, clen + slop);
+ memcpy(dest, p + slop, clen);
+ early_memunmap(p, clen + slop);
+ dest += clen;
+ src += clen;
+ size -= clen;
+ }
+}
+
#else /* CONFIG_MMU */
void __init __iomem *
diff --git a/mm/filemap.c b/mm/filemap.c
index 1283fc8..72940fb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -674,7 +674,7 @@
do {
cpuset_mems_cookie = read_mems_allowed_begin();
n = cpuset_mem_spread_node();
- page = alloc_pages_exact_node(n, gfp, 0);
+ page = __alloc_pages_node(n, gfp, 0);
} while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
return page;
@@ -2473,21 +2473,6 @@
iov_iter_count(i));
again:
- /*
- * Bring in the user page that we will copy from _first_.
- * Otherwise there's a nasty deadlock on copying from the
- * same page as we're writing to, without it being marked
- * up-to-date.
- *
- * Not only is this an optimisation, but it is also required
- * to check that the address is actually valid, when atomic
- * usercopies are used, below.
- */
- if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
- status = -EFAULT;
- break;
- }
-
status = a_ops->write_begin(file, mapping, pos, bytes, flags,
&page, &fsdata);
if (unlikely(status < 0))
@@ -2495,8 +2480,17 @@
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
-
+ /*
+ * 'page' is now locked. If we are trying to copy from a
+ * mapping of 'page' in userspace, the copy might fault and
+ * would need PageUptodate() to complete. But, page can not be
+ * made Uptodate without acquiring the page lock, which we hold.
+ * Deadlock. Avoid with pagefault_disable(). Fix up below with
+ * iov_iter_fault_in_readable().
+ */
+ pagefault_disable();
copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+ pagefault_enable();
flush_dcache_page(page);
status = a_ops->write_end(file, mapping, pos, bytes, copied,
@@ -2519,6 +2513,14 @@
*/
bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
iov_iter_single_seg_count(i));
+ /*
+ * This is the fallback to recover if the copy from
+ * userspace above faults.
+ */
+ if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+ status = -EFAULT;
+ break;
+ }
goto again;
}
pos += copied;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 279a818..b16279c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -16,6 +16,7 @@
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
+#include <linux/dax.h>
#include <linux/kthread.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
@@ -105,7 +106,7 @@
};
-static int set_recommended_min_free_kbytes(void)
+static void set_recommended_min_free_kbytes(void)
{
struct zone *zone;
int nr_zones = 0;
@@ -140,7 +141,6 @@
min_free_kbytes = recommended_min;
}
setup_per_zone_wmarks();
- return 0;
}
static int start_stop_khugepaged(void)
@@ -172,12 +172,7 @@
static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
-static inline bool is_huge_zero_pmd(pmd_t pmd)
-{
- return is_huge_zero_page(pmd_page(pmd));
-}
-
-static struct page *get_huge_zero_page(void)
+struct page *get_huge_zero_page(void)
{
struct page *zero_page;
retry:
@@ -794,16 +789,19 @@
}
/* Caller must hold page table lock. */
-static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
struct page *zero_page)
{
pmd_t entry;
+ if (!pmd_none(*pmd))
+ return false;
entry = mk_pmd(zero_page, vma->vm_page_prot);
entry = pmd_mkhuge(entry);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
atomic_long_inc(&mm->nr_ptes);
+ return true;
}
int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -870,6 +868,49 @@
flags);
}
+static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned long pfn, pgprot_t prot, bool write)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pmd_t entry;
+ spinlock_t *ptl;
+
+ ptl = pmd_lock(mm, pmd);
+ if (pmd_none(*pmd)) {
+ entry = pmd_mkhuge(pfn_pmd(pfn, prot));
+ if (write) {
+ entry = pmd_mkyoung(pmd_mkdirty(entry));
+ entry = maybe_pmd_mkwrite(entry, vma);
+ }
+ set_pmd_at(mm, addr, pmd, entry);
+ update_mmu_cache_pmd(vma, addr, pmd);
+ }
+ spin_unlock(ptl);
+}
+
+int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, unsigned long pfn, bool write)
+{
+ pgprot_t pgprot = vma->vm_page_prot;
+ /*
+ * If we had pmd_special, we could avoid all these restrictions,
+ * but we need to be consistent with PTEs and architectures that
+ * can't support a 'special' bit.
+ */
+ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
+ BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
+ (VM_PFNMAP|VM_MIXEDMAP));
+ BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
+ BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
+
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return VM_FAULT_SIGBUS;
+ if (track_pfn_insert(vma, &pgprot, pfn))
+ return VM_FAULT_SIGBUS;
+ insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write);
+ return VM_FAULT_NOPAGE;
+}
+
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
struct vm_area_struct *vma)
@@ -1414,41 +1455,41 @@
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
{
+ pmd_t orig_pmd;
spinlock_t *ptl;
- int ret = 0;
- if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
- struct page *page;
- pgtable_t pgtable;
- pmd_t orig_pmd;
- /*
- * For architectures like ppc64 we look at deposited pgtable
- * when calling pmdp_huge_get_and_clear. So do the
- * pgtable_trans_huge_withdraw after finishing pmdp related
- * operations.
- */
- orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
- tlb->fullmm);
- tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
- pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
- if (is_huge_zero_pmd(orig_pmd)) {
- atomic_long_dec(&tlb->mm->nr_ptes);
- spin_unlock(ptl);
+ if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1)
+ return 0;
+ /*
+ * For architectures like ppc64 we look at deposited pgtable
+ * when calling pmdp_huge_get_and_clear. So do the
+ * pgtable_trans_huge_withdraw after finishing pmdp related
+ * operations.
+ */
+ orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
+ tlb->fullmm);
+ tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+ if (vma_is_dax(vma)) {
+ spin_unlock(ptl);
+ if (is_huge_zero_pmd(orig_pmd))
put_huge_zero_page();
- } else {
- page = pmd_page(orig_pmd);
- page_remove_rmap(page);
- VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
- add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
- VM_BUG_ON_PAGE(!PageHead(page), page);
- atomic_long_dec(&tlb->mm->nr_ptes);
- spin_unlock(ptl);
- tlb_remove_page(tlb, page);
- }
- pte_free(tlb->mm, pgtable);
- ret = 1;
+ } else if (is_huge_zero_pmd(orig_pmd)) {
+ pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
+ atomic_long_dec(&tlb->mm->nr_ptes);
+ spin_unlock(ptl);
+ put_huge_zero_page();
+ } else {
+ struct page *page = pmd_page(orig_pmd);
+ page_remove_rmap(page);
+ VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
+ add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+ VM_BUG_ON_PAGE(!PageHead(page), page);
+ pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
+ atomic_long_dec(&tlb->mm->nr_ptes);
+ spin_unlock(ptl);
+ tlb_remove_page(tlb, page);
}
- return ret;
+ return 1;
}
int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
@@ -2285,8 +2326,12 @@
static void khugepaged_alloc_sleep(void)
{
- wait_event_freezable_timeout(khugepaged_wait, false,
- msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+ DEFINE_WAIT(wait);
+
+ add_wait_queue(&khugepaged_wait, &wait);
+ freezable_schedule_timeout_interruptible(
+ msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+ remove_wait_queue(&khugepaged_wait, &wait);
}
static int khugepaged_node_load[MAX_NUMNODES];
@@ -2373,7 +2418,7 @@
*/
up_read(&mm->mmap_sem);
- *hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER);
+ *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
if (unlikely(!*hpage)) {
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
*hpage = ERR_PTR(-ENOMEM);
@@ -2911,7 +2956,7 @@
pmd_t *pmd)
{
spinlock_t *ptl;
- struct page *page;
+ struct page *page = NULL;
struct mm_struct *mm = vma->vm_mm;
unsigned long haddr = address & HPAGE_PMD_MASK;
unsigned long mmun_start; /* For mmu_notifiers */
@@ -2924,25 +2969,27 @@
again:
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
ptl = pmd_lock(mm, pmd);
- if (unlikely(!pmd_trans_huge(*pmd))) {
- spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
- return;
- }
- if (is_huge_zero_pmd(*pmd)) {
+ if (unlikely(!pmd_trans_huge(*pmd)))
+ goto unlock;
+ if (vma_is_dax(vma)) {
+ pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+ if (is_huge_zero_pmd(_pmd))
+ put_huge_zero_page();
+ } else if (is_huge_zero_pmd(*pmd)) {
__split_huge_zero_page_pmd(vma, haddr, pmd);
- spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
- return;
+ } else {
+ page = pmd_page(*pmd);
+ VM_BUG_ON_PAGE(!page_count(page), page);
+ get_page(page);
}
- page = pmd_page(*pmd);
- VM_BUG_ON_PAGE(!page_count(page), page);
- get_page(page);
+ unlock:
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
- split_huge_page(page);
+ if (!page)
+ return;
+ split_huge_page(page);
put_page(page);
/*
@@ -2991,7 +3038,7 @@
split_huge_page_pmd_mm(mm, address, pmd);
}
-void __vma_adjust_trans_huge(struct vm_area_struct *vma,
+void vma_adjust_trans_huge(struct vm_area_struct *vma,
unsigned long start,
unsigned long end,
long adjust_next)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 51ae41d..999fb0a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -64,7 +64,7 @@
* prevent spurious OOMs when the hugepage pool is fully utilized.
*/
static int num_fault_mutexes;
-static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
+struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);
@@ -240,11 +240,14 @@
/*
* Add the huge page range represented by [f, t) to the reserve
- * map. Existing regions will be expanded to accommodate the
- * specified range. We know only existing regions need to be
- * expanded, because region_add is only called after region_chg
- * with the same range. If a new file_region structure must
- * be allocated, it is done in region_chg.
+ * map. In the normal case, existing regions will be expanded
+ * to accommodate the specified range. Sufficient regions should
+ * exist for expansion due to the previous call to region_chg
+ * with the same range. However, it is possible that region_del
+ * could have been called after region_chg and modifed the map
+ * in such a way that no region exists to be expanded. In this
+ * case, pull a region descriptor from the cache associated with
+ * the map and use that for the new range.
*
* Return the number of new huge pages added to the map. This
* number is greater than or equal to zero.
@@ -261,6 +264,28 @@
if (f <= rg->to)
break;
+ /*
+ * If no region exists which can be expanded to include the
+ * specified range, the list must have been modified by an
+ * interleving call to region_del(). Pull a region descriptor
+ * from the cache and use it for this range.
+ */
+ if (&rg->link == head || t < rg->from) {
+ VM_BUG_ON(resv->region_cache_count <= 0);
+
+ resv->region_cache_count--;
+ nrg = list_first_entry(&resv->region_cache, struct file_region,
+ link);
+ list_del(&nrg->link);
+
+ nrg->from = f;
+ nrg->to = t;
+ list_add(&nrg->link, rg->link.prev);
+
+ add += t - f;
+ goto out_locked;
+ }
+
/* Round our left edge to the current segment if it encloses us. */
if (f > rg->from)
f = rg->from;
@@ -294,6 +319,8 @@
add += t - nrg->to; /* Added to end of region */
nrg->to = t;
+out_locked:
+ resv->adds_in_progress--;
spin_unlock(&resv->lock);
VM_BUG_ON(add < 0);
return add;
@@ -312,11 +339,14 @@
* so that the subsequent region_add call will have all the
* regions it needs and will not fail.
*
- * Returns the number of huge pages that need to be added
- * to the existing reservation map for the range [f, t).
- * This number is greater or equal to zero. -ENOMEM is
- * returned if a new file_region structure is needed and can
- * not be allocated.
+ * Upon entry, region_chg will also examine the cache of region descriptors
+ * associated with the map. If there are not enough descriptors cached, one
+ * will be allocated for the in progress add operation.
+ *
+ * Returns the number of huge pages that need to be added to the existing
+ * reservation map for the range [f, t). This number is greater or equal to
+ * zero. -ENOMEM is returned if a new file_region structure or cache entry
+ * is needed and can not be allocated.
*/
static long region_chg(struct resv_map *resv, long f, long t)
{
@@ -326,6 +356,31 @@
retry:
spin_lock(&resv->lock);
+retry_locked:
+ resv->adds_in_progress++;
+
+ /*
+ * Check for sufficient descriptors in the cache to accommodate
+ * the number of in progress add operations.
+ */
+ if (resv->adds_in_progress > resv->region_cache_count) {
+ struct file_region *trg;
+
+ VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1);
+ /* Must drop lock to allocate a new descriptor. */
+ resv->adds_in_progress--;
+ spin_unlock(&resv->lock);
+
+ trg = kmalloc(sizeof(*trg), GFP_KERNEL);
+ if (!trg)
+ return -ENOMEM;
+
+ spin_lock(&resv->lock);
+ list_add(&trg->link, &resv->region_cache);
+ resv->region_cache_count++;
+ goto retry_locked;
+ }
+
/* Locate the region we are before or in. */
list_for_each_entry(rg, head, link)
if (f <= rg->to)
@@ -336,6 +391,7 @@
* size such that we can guarantee to record the reservation. */
if (&rg->link == head || t < rg->from) {
if (!nrg) {
+ resv->adds_in_progress--;
spin_unlock(&resv->lock);
nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
if (!nrg)
@@ -385,43 +441,131 @@
}
/*
- * Truncate the reserve map at index 'end'. Modify/truncate any
- * region which contains end. Delete any regions past end.
- * Return the number of huge pages removed from the map.
+ * Abort the in progress add operation. The adds_in_progress field
+ * of the resv_map keeps track of the operations in progress between
+ * calls to region_chg and region_add. Operations are sometimes
+ * aborted after the call to region_chg. In such cases, region_abort
+ * is called to decrement the adds_in_progress counter.
+ *
+ * NOTE: The range arguments [f, t) are not needed or used in this
+ * routine. They are kept to make reading the calling code easier as
+ * arguments will match the associated region_chg call.
*/
-static long region_truncate(struct resv_map *resv, long end)
+static void region_abort(struct resv_map *resv, long f, long t)
+{
+ spin_lock(&resv->lock);
+ VM_BUG_ON(!resv->region_cache_count);
+ resv->adds_in_progress--;
+ spin_unlock(&resv->lock);
+}
+
+/*
+ * Delete the specified range [f, t) from the reserve map. If the
+ * t parameter is LONG_MAX, this indicates that ALL regions after f
+ * should be deleted. Locate the regions which intersect [f, t)
+ * and either trim, delete or split the existing regions.
+ *
+ * Returns the number of huge pages deleted from the reserve map.
+ * In the normal case, the return value is zero or more. In the
+ * case where a region must be split, a new region descriptor must
+ * be allocated. If the allocation fails, -ENOMEM will be returned.
+ * NOTE: If the parameter t == LONG_MAX, then we will never split
+ * a region and possibly return -ENOMEM. Callers specifying
+ * t == LONG_MAX do not need to check for -ENOMEM error.
+ */
+static long region_del(struct resv_map *resv, long f, long t)
{
struct list_head *head = &resv->regions;
struct file_region *rg, *trg;
- long chg = 0;
+ struct file_region *nrg = NULL;
+ long del = 0;
+retry:
spin_lock(&resv->lock);
- /* Locate the region we are either in or before. */
- list_for_each_entry(rg, head, link)
- if (end <= rg->to)
+ list_for_each_entry_safe(rg, trg, head, link) {
+ if (rg->to <= f)
+ continue;
+ if (rg->from >= t)
break;
- if (&rg->link == head)
- goto out;
- /* If we are in the middle of a region then adjust it. */
- if (end > rg->from) {
- chg = rg->to - end;
- rg->to = end;
- rg = list_entry(rg->link.next, typeof(*rg), link);
+ if (f > rg->from && t < rg->to) { /* Must split region */
+ /*
+ * Check for an entry in the cache before dropping
+ * lock and attempting allocation.
+ */
+ if (!nrg &&
+ resv->region_cache_count > resv->adds_in_progress) {
+ nrg = list_first_entry(&resv->region_cache,
+ struct file_region,
+ link);
+ list_del(&nrg->link);
+ resv->region_cache_count--;
+ }
+
+ if (!nrg) {
+ spin_unlock(&resv->lock);
+ nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
+ if (!nrg)
+ return -ENOMEM;
+ goto retry;
+ }
+
+ del += t - f;
+
+ /* New entry for end of split region */
+ nrg->from = t;
+ nrg->to = rg->to;
+ INIT_LIST_HEAD(&nrg->link);
+
+ /* Original entry is trimmed */
+ rg->to = f;
+
+ list_add(&nrg->link, &rg->link);
+ nrg = NULL;
+ break;
+ }
+
+ if (f <= rg->from && t >= rg->to) { /* Remove entire region */
+ del += rg->to - rg->from;
+ list_del(&rg->link);
+ kfree(rg);
+ continue;
+ }
+
+ if (f <= rg->from) { /* Trim beginning of region */
+ del += t - rg->from;
+ rg->from = t;
+ } else { /* Trim end of region */
+ del += rg->to - f;
+ rg->to = f;
+ }
}
- /* Drop any remaining regions. */
- list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
- if (&rg->link == head)
- break;
- chg += rg->to - rg->from;
- list_del(&rg->link);
- kfree(rg);
- }
-
-out:
spin_unlock(&resv->lock);
- return chg;
+ kfree(nrg);
+ return del;
+}
+
+/*
+ * A rare out of memory error was encountered which prevented removal of
+ * the reserve map region for a page. The huge page itself was free'ed
+ * and removed from the page cache. This routine will adjust the subpool
+ * usage count, and the global reserve count if needed. By incrementing
+ * these counts, the reserve map entry which could not be deleted will
+ * appear as a "reserved" entry instead of simply dangling with incorrect
+ * counts.
+ */
+void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve)
+{
+ struct hugepage_subpool *spool = subpool_inode(inode);
+ long rsv_adjust;
+
+ rsv_adjust = hugepage_subpool_get_pages(spool, 1);
+ if (restore_reserve && rsv_adjust) {
+ struct hstate *h = hstate_inode(inode);
+
+ hugetlb_acct_memory(h, 1);
+ }
}
/*
@@ -544,22 +688,44 @@
struct resv_map *resv_map_alloc(void)
{
struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
- if (!resv_map)
+ struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
+
+ if (!resv_map || !rg) {
+ kfree(resv_map);
+ kfree(rg);
return NULL;
+ }
kref_init(&resv_map->refs);
spin_lock_init(&resv_map->lock);
INIT_LIST_HEAD(&resv_map->regions);
+ resv_map->adds_in_progress = 0;
+
+ INIT_LIST_HEAD(&resv_map->region_cache);
+ list_add(&rg->link, &resv_map->region_cache);
+ resv_map->region_cache_count = 1;
+
return resv_map;
}
void resv_map_release(struct kref *ref)
{
struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
+ struct list_head *head = &resv_map->region_cache;
+ struct file_region *rg, *trg;
/* Clear out any active regions before we release the map. */
- region_truncate(resv_map, 0);
+ region_del(resv_map, 0, LONG_MAX);
+
+ /* ... and any entries left in the cache */
+ list_for_each_entry_safe(rg, trg, head, link) {
+ list_del(&rg->link);
+ kfree(rg);
+ }
+
+ VM_BUG_ON(resv_map->adds_in_progress);
+
kfree(resv_map);
}
@@ -635,8 +801,19 @@
}
/* Shared mappings always use reserves */
- if (vma->vm_flags & VM_MAYSHARE)
- return true;
+ if (vma->vm_flags & VM_MAYSHARE) {
+ /*
+ * We know VM_NORESERVE is not set. Therefore, there SHOULD
+ * be a region map for all pages. The only situation where
+ * there is no region map is if a hole was punched via
+ * fallocate. In this case, there really are no reverves to
+ * use. This situation is indicated if chg != 0.
+ */
+ if (chg)
+ return false;
+ else
+ return true;
+ }
/*
* Only the process that called mmap() has reserves for
@@ -1154,7 +1331,7 @@
{
struct page *page;
- page = alloc_pages_exact_node(nid,
+ page = __alloc_pages_node(nid,
htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
__GFP_REPEAT|__GFP_NOWARN,
huge_page_order(h));
@@ -1306,7 +1483,7 @@
__GFP_REPEAT|__GFP_NOWARN,
huge_page_order(h));
else
- page = alloc_pages_exact_node(nid,
+ page = __alloc_pages_node(nid,
htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
__GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
@@ -1473,16 +1650,19 @@
}
}
+
/*
- * vma_needs_reservation and vma_commit_reservation are used by the huge
- * page allocation routines to manage reservations.
+ * vma_needs_reservation, vma_commit_reservation and vma_end_reservation
+ * are used by the huge page allocation routines to manage reservations.
*
* vma_needs_reservation is called to determine if the huge page at addr
* within the vma has an associated reservation. If a reservation is
* needed, the value 1 is returned. The caller is then responsible for
* managing the global reservation and subpool usage counts. After
* the huge page has been allocated, vma_commit_reservation is called
- * to add the page to the reservation map.
+ * to add the page to the reservation map. If the page allocation fails,
+ * the reservation must be ended instead of committed. vma_end_reservation
+ * is called in such cases.
*
* In the normal case, vma_commit_reservation returns the same value
* as the preceding vma_needs_reservation call. The only time this
@@ -1490,9 +1670,14 @@
* is the responsibility of the caller to notice the difference and
* take appropriate action.
*/
+enum vma_resv_mode {
+ VMA_NEEDS_RESV,
+ VMA_COMMIT_RESV,
+ VMA_END_RESV,
+};
static long __vma_reservation_common(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr,
- bool commit)
+ enum vma_resv_mode mode)
{
struct resv_map *resv;
pgoff_t idx;
@@ -1503,10 +1688,20 @@
return 1;
idx = vma_hugecache_offset(h, vma, addr);
- if (commit)
- ret = region_add(resv, idx, idx + 1);
- else
+ switch (mode) {
+ case VMA_NEEDS_RESV:
ret = region_chg(resv, idx, idx + 1);
+ break;
+ case VMA_COMMIT_RESV:
+ ret = region_add(resv, idx, idx + 1);
+ break;
+ case VMA_END_RESV:
+ region_abort(resv, idx, idx + 1);
+ ret = 0;
+ break;
+ default:
+ BUG();
+ }
if (vma->vm_flags & VM_MAYSHARE)
return ret;
@@ -1517,47 +1712,79 @@
static long vma_needs_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
- return __vma_reservation_common(h, vma, addr, false);
+ return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
}
static long vma_commit_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
- return __vma_reservation_common(h, vma, addr, true);
+ return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
}
-static struct page *alloc_huge_page(struct vm_area_struct *vma,
+static void vma_end_reservation(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
+}
+
+struct page *alloc_huge_page(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve)
{
struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
struct page *page;
- long chg, commit;
+ long map_chg, map_commit;
+ long gbl_chg;
int ret, idx;
struct hugetlb_cgroup *h_cg;
idx = hstate_index(h);
/*
- * Processes that did not create the mapping will have no
- * reserves and will not have accounted against subpool
- * limit. Check that the subpool limit can be made before
- * satisfying the allocation MAP_NORESERVE mappings may also
- * need pages and subpool limit allocated allocated if no reserve
- * mapping overlaps.
+ * Examine the region/reserve map to determine if the process
+ * has a reservation for the page to be allocated. A return
+ * code of zero indicates a reservation exists (no change).
*/
- chg = vma_needs_reservation(h, vma, addr);
- if (chg < 0)
+ map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
+ if (map_chg < 0)
return ERR_PTR(-ENOMEM);
- if (chg || avoid_reserve)
- if (hugepage_subpool_get_pages(spool, 1) < 0)
+
+ /*
+ * Processes that did not create the mapping will have no
+ * reserves as indicated by the region/reserve map. Check
+ * that the allocation will not exceed the subpool limit.
+ * Allocations for MAP_NORESERVE mappings also need to be
+ * checked against any subpool limit.
+ */
+ if (map_chg || avoid_reserve) {
+ gbl_chg = hugepage_subpool_get_pages(spool, 1);
+ if (gbl_chg < 0) {
+ vma_end_reservation(h, vma, addr);
return ERR_PTR(-ENOSPC);
+ }
+
+ /*
+ * Even though there was no reservation in the region/reserve
+ * map, there could be reservations associated with the
+ * subpool that can be used. This would be indicated if the
+ * return value of hugepage_subpool_get_pages() is zero.
+ * However, if avoid_reserve is specified we still avoid even
+ * the subpool reservations.
+ */
+ if (avoid_reserve)
+ gbl_chg = 1;
+ }
ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
if (ret)
goto out_subpool_put;
spin_lock(&hugetlb_lock);
- page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);
+ /*
+ * glb_chg is passed to indicate whether or not a page must be taken
+ * from the global free pool (global change). gbl_chg == 0 indicates
+ * a reservation exists for the allocation.
+ */
+ page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
if (!page) {
spin_unlock(&hugetlb_lock);
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
@@ -1573,8 +1800,8 @@
set_page_private(page, (unsigned long)spool);
- commit = vma_commit_reservation(h, vma, addr);
- if (unlikely(chg > commit)) {
+ map_commit = vma_commit_reservation(h, vma, addr);
+ if (unlikely(map_chg > map_commit)) {
/*
* The page was added to the reservation map between
* vma_needs_reservation and vma_commit_reservation.
@@ -1594,8 +1821,9 @@
out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
out_subpool_put:
- if (chg || avoid_reserve)
+ if (map_chg || avoid_reserve)
hugepage_subpool_put_pages(spool, 1);
+ vma_end_reservation(h, vma, addr);
return ERR_PTR(-ENOSPC);
}
@@ -2311,7 +2539,7 @@
}
kobject_put(hugepages_kobj);
- kfree(htlb_fault_mutex_table);
+ kfree(hugetlb_fault_mutex_table);
}
module_exit(hugetlb_exit);
@@ -2344,12 +2572,12 @@
#else
num_fault_mutexes = 1;
#endif
- htlb_fault_mutex_table =
+ hugetlb_fault_mutex_table =
kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL);
- BUG_ON(!htlb_fault_mutex_table);
+ BUG_ON(!hugetlb_fault_mutex_table);
for (i = 0; i < num_fault_mutexes; i++)
- mutex_init(&htlb_fault_mutex_table[i]);
+ mutex_init(&hugetlb_fault_mutex_table[i]);
return 0;
}
module_init(hugetlb_init);
@@ -3147,6 +3375,23 @@
return page != NULL;
}
+int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
+ pgoff_t idx)
+{
+ struct inode *inode = mapping->host;
+ struct hstate *h = hstate_inode(inode);
+ int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+
+ if (err)
+ return err;
+ ClearPagePrivate(page);
+
+ spin_lock(&inode->i_lock);
+ inode->i_blocks += blocks_per_huge_page(h);
+ spin_unlock(&inode->i_lock);
+ return 0;
+}
+
static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct address_space *mapping, pgoff_t idx,
unsigned long address, pte_t *ptep, unsigned int flags)
@@ -3194,21 +3439,13 @@
set_page_huge_active(page);
if (vma->vm_flags & VM_MAYSHARE) {
- int err;
- struct inode *inode = mapping->host;
-
- err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+ int err = huge_add_to_page_cache(page, mapping, idx);
if (err) {
put_page(page);
if (err == -EEXIST)
goto retry;
goto out;
}
- ClearPagePrivate(page);
-
- spin_lock(&inode->i_lock);
- inode->i_blocks += blocks_per_huge_page(h);
- spin_unlock(&inode->i_lock);
} else {
lock_page(page);
if (unlikely(anon_vma_prepare(vma))) {
@@ -3236,11 +3473,14 @@
* any allocations necessary to record that reservation occur outside
* the spinlock.
*/
- if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
+ if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
if (vma_needs_reservation(h, vma, address) < 0) {
ret = VM_FAULT_OOM;
goto backout_unlocked;
}
+ /* Just decrements count, does not deallocate */
+ vma_end_reservation(h, vma, address);
+ }
ptl = huge_pte_lockptr(h, mm, ptep);
spin_lock(ptl);
@@ -3280,7 +3520,7 @@
}
#ifdef CONFIG_SMP
-static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
struct vm_area_struct *vma,
struct address_space *mapping,
pgoff_t idx, unsigned long address)
@@ -3305,7 +3545,7 @@
* For uniprocesor systems we always use a single mutex, so just
* return 0 and avoid the hashing overhead.
*/
-static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
struct vm_area_struct *vma,
struct address_space *mapping,
pgoff_t idx, unsigned long address)
@@ -3353,8 +3593,8 @@
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
- hash = fault_mutex_hash(h, mm, vma, mapping, idx, address);
- mutex_lock(&htlb_fault_mutex_table[hash]);
+ hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
entry = huge_ptep_get(ptep);
if (huge_pte_none(entry)) {
@@ -3387,6 +3627,8 @@
ret = VM_FAULT_OOM;
goto out_mutex;
}
+ /* Just decrements count, does not deallocate */
+ vma_end_reservation(h, vma, address);
if (!(vma->vm_flags & VM_MAYSHARE))
pagecache_page = hugetlbfs_pagecache_page(h,
@@ -3437,7 +3679,7 @@
put_page(pagecache_page);
}
out_mutex:
- mutex_unlock(&htlb_fault_mutex_table[hash]);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
/*
* Generally it's safe to hold refcount during waiting page lock. But
* here we just wait to defer the next page fault to avoid busy loop and
@@ -3726,12 +3968,15 @@
}
return 0;
out_err:
+ if (!vma || vma->vm_flags & VM_MAYSHARE)
+ region_abort(resv_map, from, to);
if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
kref_put(&resv_map->refs, resv_map_release);
return ret;
}
-void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
+long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
+ long freed)
{
struct hstate *h = hstate_inode(inode);
struct resv_map *resv_map = inode_resv_map(inode);
@@ -3739,8 +3984,17 @@
struct hugepage_subpool *spool = subpool_inode(inode);
long gbl_reserve;
- if (resv_map)
- chg = region_truncate(resv_map, offset);
+ if (resv_map) {
+ chg = region_del(resv_map, start, end);
+ /*
+ * region_del() can fail in the rare case where a region
+ * must be split and another region descriptor can not be
+ * allocated. If end == LONG_MAX, it will not fail.
+ */
+ if (chg < 0)
+ return chg;
+ }
+
spin_lock(&inode->i_lock);
inode->i_blocks -= (blocks_per_huge_page(h) * freed);
spin_unlock(&inode->i_lock);
@@ -3751,6 +4005,8 @@
*/
gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
hugetlb_acct_memory(h, -gbl_reserve);
+
+ return 0;
}
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index bf73ac1..aeba0ed 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -58,7 +58,7 @@
pr_info("Injecting memory failure at pfn %#lx\n", pfn);
return memory_failure(pfn, 18, MF_COUNT_INCREASED);
put_out:
- put_page(p);
+ put_hwpoison_page(p);
return 0;
}
diff --git a/mm/internal.h b/mm/internal.h
index 1195dd2..bc0fa9a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -182,6 +182,7 @@
unsigned long nr_migratepages; /* Number of pages to migrate */
unsigned long free_pfn; /* isolate_freepages search base */
unsigned long migrate_pfn; /* isolate_migratepages search base */
+ unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
enum migrate_mode mode; /* Async or sync migration mode */
bool ignore_skip_hint; /* Scan blocks even if marked skip */
int order; /* order a direct compactor needs */
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index cf79f11..f532f6a 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -838,6 +838,7 @@
}
if (crt_early_log >= ARRAY_SIZE(early_log)) {
+ crt_early_log++;
kmemleak_disable();
return;
}
@@ -1882,7 +1883,7 @@
object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
- if (crt_early_log >= ARRAY_SIZE(early_log))
+ if (crt_early_log > ARRAY_SIZE(early_log))
pr_warning("Early log buffer exceeded (%d), please increase "
"DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", crt_early_log);
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 909eca2..e1da19f 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -99,8 +99,8 @@
struct list_lru_one *l;
spin_lock(&nlru->lock);
- l = list_lru_from_kmem(nlru, item);
if (list_empty(item)) {
+ l = list_lru_from_kmem(nlru, item);
list_add_tail(item, &l->list);
l->nr_items++;
spin_unlock(&nlru->lock);
@@ -118,8 +118,8 @@
struct list_lru_one *l;
spin_lock(&nlru->lock);
- l = list_lru_from_kmem(nlru, item);
if (!list_empty(item)) {
+ l = list_lru_from_kmem(nlru, item);
list_del_init(item);
l->nr_items--;
spin_unlock(&nlru->lock);
diff --git a/mm/madvise.c b/mm/madvise.c
index ce3a422..c889fcb 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -301,7 +301,7 @@
*prev = NULL; /* tell sys_madvise we drop mmap_sem */
- if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB))
+ if (vma->vm_flags & VM_LOCKED)
return -EINVAL;
f = vma->vm_file;
diff --git a/mm/memblock.c b/mm/memblock.c
index 95ce68c..1c7b647 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -91,7 +91,7 @@
return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
}
-static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
+bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
phys_addr_t base, phys_addr_t size)
{
unsigned long i;
@@ -103,7 +103,7 @@
break;
}
- return (i < type->cnt) ? i : -1;
+ return i < type->cnt;
}
/*
@@ -569,6 +569,7 @@
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
WARN_ON(nid != memblock_get_region_node(rgn));
#endif
+ WARN_ON(flags != rgn->flags);
nr_new++;
if (insert)
memblock_insert_region(type, i++, base,
@@ -614,14 +615,14 @@
int nid,
unsigned long flags)
{
- struct memblock_type *_rgn = &memblock.memory;
+ struct memblock_type *type = &memblock.memory;
memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n",
(unsigned long long)base,
(unsigned long long)base + size - 1,
flags, (void *)_RET_IP_);
- return memblock_add_range(_rgn, base, size, nid, flags);
+ return memblock_add_range(type, base, size, nid, flags);
}
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
@@ -761,7 +762,7 @@
*
* This function isolates region [@base, @base + @size), and sets/clears flag
*
- * Return 0 on succees, -errno on failure.
+ * Return 0 on success, -errno on failure.
*/
static int __init_memblock memblock_setclr_flag(phys_addr_t base,
phys_addr_t size, int set, int flag)
@@ -788,7 +789,7 @@
* @base: the base phys addr of the region
* @size: the size of the region
*
- * Return 0 on succees, -errno on failure.
+ * Return 0 on success, -errno on failure.
*/
int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
{
@@ -800,7 +801,7 @@
* @base: the base phys addr of the region
* @size: the size of the region
*
- * Return 0 on succees, -errno on failure.
+ * Return 0 on success, -errno on failure.
*/
int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
{
@@ -812,7 +813,7 @@
* @base: the base phys addr of the region
* @size: the size of the region
*
- * Return 0 on succees, -errno on failure.
+ * Return 0 on success, -errno on failure.
*/
int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
{
@@ -834,10 +835,10 @@
phys_addr_t *out_start,
phys_addr_t *out_end)
{
- struct memblock_type *rsv = &memblock.reserved;
+ struct memblock_type *type = &memblock.reserved;
- if (*idx >= 0 && *idx < rsv->cnt) {
- struct memblock_region *r = &rsv->regions[*idx];
+ if (*idx >= 0 && *idx < type->cnt) {
+ struct memblock_region *r = &type->regions[*idx];
phys_addr_t base = r->base;
phys_addr_t size = r->size;
@@ -975,7 +976,7 @@
* in type_b.
*
* @idx: pointer to u64 loop variable
- * @nid: nid: node selector, %NUMA_NO_NODE for all nodes
+ * @nid: node selector, %NUMA_NO_NODE for all nodes
* @flags: pick from blocks based on memory attributes
* @type_a: pointer to memblock_type from where the range is taken
* @type_b: pointer to memblock_type which excludes memory from being taken
@@ -1565,12 +1566,12 @@
* Check if the region [@base, @base+@size) intersects a reserved memory block.
*
* RETURNS:
- * 0 if false, non-zero if true
+ * True if they intersect, false if not.
*/
-int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
+bool __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
{
memblock_cap_size(base, &size);
- return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
+ return memblock_overlaps_region(&memblock.reserved, base, size);
}
void __init_memblock memblock_trim_memory(phys_addr_t align)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1af0575..1742a2d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -111,56 +111,10 @@
"unevictable",
};
-/*
- * Per memcg event counter is incremented at every pagein/pageout. With THP,
- * it will be incremated by the number of pages. This counter is used for
- * for trigger some periodic events. This is straightforward and better
- * than using jiffies etc. to handle periodic memcg event.
- */
-enum mem_cgroup_events_target {
- MEM_CGROUP_TARGET_THRESH,
- MEM_CGROUP_TARGET_SOFTLIMIT,
- MEM_CGROUP_TARGET_NUMAINFO,
- MEM_CGROUP_NTARGETS,
-};
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET 1024
-struct mem_cgroup_stat_cpu {
- long count[MEM_CGROUP_STAT_NSTATS];
- unsigned long events[MEMCG_NR_EVENTS];
- unsigned long nr_page_events;
- unsigned long targets[MEM_CGROUP_NTARGETS];
-};
-
-struct reclaim_iter {
- struct mem_cgroup *position;
- /* scan generation, increased every round-trip */
- unsigned int generation;
-};
-
-/*
- * per-zone information in memory controller.
- */
-struct mem_cgroup_per_zone {
- struct lruvec lruvec;
- unsigned long lru_size[NR_LRU_LISTS];
-
- struct reclaim_iter iter[DEF_PRIORITY + 1];
-
- struct rb_node tree_node; /* RB tree node */
- unsigned long usage_in_excess;/* Set to the value by which */
- /* the soft limit is exceeded*/
- bool on_tree;
- struct mem_cgroup *memcg; /* Back pointer, we cannot */
- /* use container_of */
-};
-
-struct mem_cgroup_per_node {
- struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
-};
-
/*
* Cgroups above their limits are maintained in a RB-Tree, independent of
* their hierarchy representation
@@ -181,32 +135,6 @@
static struct mem_cgroup_tree soft_limit_tree __read_mostly;
-struct mem_cgroup_threshold {
- struct eventfd_ctx *eventfd;
- unsigned long threshold;
-};
-
-/* For threshold */
-struct mem_cgroup_threshold_ary {
- /* An array index points to threshold just below or equal to usage. */
- int current_threshold;
- /* Size of entries[] */
- unsigned int size;
- /* Array of thresholds */
- struct mem_cgroup_threshold entries[0];
-};
-
-struct mem_cgroup_thresholds {
- /* Primary thresholds array */
- struct mem_cgroup_threshold_ary *primary;
- /*
- * Spare threshold array.
- * This is needed to make mem_cgroup_unregister_event() "never fail".
- * It must be able to store at least primary->size - 1 entries.
- */
- struct mem_cgroup_threshold_ary *spare;
-};
-
/* for OOM */
struct mem_cgroup_eventfd_list {
struct list_head list;
@@ -256,113 +184,6 @@
static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
-/*
- * The memory controller data structure. The memory controller controls both
- * page cache and RSS per cgroup. We would eventually like to provide
- * statistics based on the statistics developed by Rik Van Riel for clock-pro,
- * to help the administrator determine what knobs to tune.
- */
-struct mem_cgroup {
- struct cgroup_subsys_state css;
-
- /* Accounted resources */
- struct page_counter memory;
- struct page_counter memsw;
- struct page_counter kmem;
-
- /* Normal memory consumption range */
- unsigned long low;
- unsigned long high;
-
- unsigned long soft_limit;
-
- /* vmpressure notifications */
- struct vmpressure vmpressure;
-
- /* css_online() has been completed */
- int initialized;
-
- /*
- * Should the accounting and control be hierarchical, per subtree?
- */
- bool use_hierarchy;
-
- /* protected by memcg_oom_lock */
- bool oom_lock;
- int under_oom;
-
- int swappiness;
- /* OOM-Killer disable */
- int oom_kill_disable;
-
- /* protect arrays of thresholds */
- struct mutex thresholds_lock;
-
- /* thresholds for memory usage. RCU-protected */
- struct mem_cgroup_thresholds thresholds;
-
- /* thresholds for mem+swap usage. RCU-protected */
- struct mem_cgroup_thresholds memsw_thresholds;
-
- /* For oom notifier event fd */
- struct list_head oom_notify;
-
- /*
- * Should we move charges of a task when a task is moved into this
- * mem_cgroup ? And what type of charges should we move ?
- */
- unsigned long move_charge_at_immigrate;
- /*
- * set > 0 if pages under this cgroup are moving to other cgroup.
- */
- atomic_t moving_account;
- /* taken only while moving_account > 0 */
- spinlock_t move_lock;
- struct task_struct *move_lock_task;
- unsigned long move_lock_flags;
- /*
- * percpu counter.
- */
- struct mem_cgroup_stat_cpu __percpu *stat;
- spinlock_t pcp_counter_lock;
-
-#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
- struct cg_proto tcp_mem;
-#endif
-#if defined(CONFIG_MEMCG_KMEM)
- /* Index in the kmem_cache->memcg_params.memcg_caches array */
- int kmemcg_id;
- bool kmem_acct_activated;
- bool kmem_acct_active;
-#endif
-
- int last_scanned_node;
-#if MAX_NUMNODES > 1
- nodemask_t scan_nodes;
- atomic_t numainfo_events;
- atomic_t numainfo_updating;
-#endif
-
-#ifdef CONFIG_CGROUP_WRITEBACK
- struct list_head cgwb_list;
- struct wb_domain cgwb_domain;
-#endif
-
- /* List of events which userspace want to receive */
- struct list_head event_list;
- spinlock_t event_list_lock;
-
- struct mem_cgroup_per_node *nodeinfo[0];
- /* WARNING: nodeinfo must be the last member here */
-};
-
-#ifdef CONFIG_MEMCG_KMEM
-bool memcg_kmem_is_active(struct mem_cgroup *memcg)
-{
- return memcg->kmem_acct_active;
-}
-#endif
-
/* Stuffs for move charges at task migration. */
/*
* Types of charges to be moved.
@@ -423,11 +244,6 @@
*/
static DEFINE_MUTEX(memcg_create_mutex);
-struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
-{
- return s ? container_of(s, struct mem_cgroup, css) : NULL;
-}
-
/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
@@ -499,8 +315,7 @@
rcu_read_lock();
memcg = mem_cgroup_from_task(current);
cg_proto = sk->sk_prot->proto_cgroup(memcg);
- if (!mem_cgroup_is_root(memcg) &&
- memcg_proto_active(cg_proto) &&
+ if (cg_proto && test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags) &&
css_tryget_online(&memcg->css)) {
sk->sk_cgrp = cg_proto;
}
@@ -593,11 +408,6 @@
return &memcg->nodeinfo[nid]->zoneinfo[zid];
}
-struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
-{
- return &memcg->css;
-}
-
/**
* mem_cgroup_css_from_page - css of the memcg associated with a page
* @page: page of interest
@@ -876,14 +686,6 @@
__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
}
-unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
-{
- struct mem_cgroup_per_zone *mz;
-
- mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
- return mz->lru_size[lru];
-}
-
static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
int nid,
unsigned int lru_mask)
@@ -986,6 +788,7 @@
return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
+EXPORT_SYMBOL(mem_cgroup_from_task);
static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
@@ -1031,7 +834,7 @@
struct mem_cgroup *prev,
struct mem_cgroup_reclaim_cookie *reclaim)
{
- struct reclaim_iter *uninitialized_var(iter);
+ struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
struct cgroup_subsys_state *css = NULL;
struct mem_cgroup *memcg = NULL;
struct mem_cgroup *pos = NULL;
@@ -1173,30 +976,6 @@
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
-void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
-{
- struct mem_cgroup *memcg;
-
- rcu_read_lock();
- memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
- if (unlikely(!memcg))
- goto out;
-
- switch (idx) {
- case PGFAULT:
- this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
- break;
- case PGMAJFAULT:
- this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
- break;
- default:
- BUG();
- }
-out:
- rcu_read_unlock();
-}
-EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
-
/**
* mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
* @zone: zone of the wanted lruvec
@@ -1295,15 +1074,6 @@
VM_BUG_ON((long)(*lru_size) < 0);
}
-bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root)
-{
- if (root == memcg)
- return true;
- if (!root->use_hierarchy)
- return false;
- return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
-}
-
bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
{
struct mem_cgroup *task_memcg;
@@ -1330,39 +1100,6 @@
return ret;
}
-int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
-{
- unsigned long inactive_ratio;
- unsigned long inactive;
- unsigned long active;
- unsigned long gb;
-
- inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
- active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
-
- gb = (inactive + active) >> (30 - PAGE_SHIFT);
- if (gb)
- inactive_ratio = int_sqrt(10 * gb);
- else
- inactive_ratio = 1;
-
- return inactive * inactive_ratio < active;
-}
-
-bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
-{
- struct mem_cgroup_per_zone *mz;
- struct mem_cgroup *memcg;
-
- if (mem_cgroup_disabled())
- return true;
-
- mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
- memcg = mz->memcg;
-
- return !!(memcg->css.flags & CSS_ONLINE);
-}
-
#define mem_cgroup_from_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
@@ -1394,15 +1131,6 @@
return margin;
}
-int mem_cgroup_swappiness(struct mem_cgroup *memcg)
-{
- /* root ? */
- if (mem_cgroup_disabled() || !memcg->css.parent)
- return vm_swappiness;
-
- return memcg->swappiness;
-}
-
/*
* A routine for checking "mem" is under move_account() or not.
*
@@ -1545,6 +1273,12 @@
static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
int order)
{
+ struct oom_control oc = {
+ .zonelist = NULL,
+ .nodemask = NULL,
+ .gfp_mask = gfp_mask,
+ .order = order,
+ };
struct mem_cgroup *iter;
unsigned long chosen_points = 0;
unsigned long totalpages;
@@ -1563,7 +1297,7 @@
goto unlock;
}
- check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
+ check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg);
totalpages = mem_cgroup_get_limit(memcg) ? : 1;
for_each_mem_cgroup_tree(iter, memcg) {
struct css_task_iter it;
@@ -1571,8 +1305,7 @@
css_task_iter_start(&iter->css, &it);
while ((task = css_task_iter_next(&it))) {
- switch (oom_scan_process_thread(task, totalpages, NULL,
- false)) {
+ switch (oom_scan_process_thread(&oc, task, totalpages)) {
case OOM_SCAN_SELECT:
if (chosen)
put_task_struct(chosen);
@@ -1610,8 +1343,8 @@
if (chosen) {
points = chosen_points * 1000 / totalpages;
- oom_kill_process(chosen, gfp_mask, order, points, totalpages,
- memcg, NULL, "Memory cgroup out of memory");
+ oom_kill_process(&oc, chosen, points, totalpages, memcg,
+ "Memory cgroup out of memory");
}
unlock:
mutex_unlock(&oom_lock);
@@ -2062,23 +1795,6 @@
}
EXPORT_SYMBOL(mem_cgroup_end_page_stat);
-/**
- * mem_cgroup_update_page_stat - update page state statistics
- * @memcg: memcg to account against
- * @idx: page state item to account
- * @val: number of pages (positive or negative)
- *
- * See mem_cgroup_begin_page_stat() for locking requirements.
- */
-void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
- enum mem_cgroup_stat_index idx, int val)
-{
- VM_BUG_ON(!rcu_read_lock_held());
-
- if (memcg)
- this_cpu_add(memcg->stat->count[idx], val);
-}
-
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
* TODO: maybe necessary to use big numbers in big irons.
@@ -2504,16 +2220,6 @@
css_put_many(&memcg->css, nr_pages);
}
-/*
- * helper for acessing a memcg's index. It will be used as an index in the
- * child cache array in kmem_cache, and also to derive its name. This function
- * will return -1 when this is not a kmem-limited memcg.
- */
-int memcg_cache_id(struct mem_cgroup *memcg)
-{
- return memcg ? memcg->kmemcg_id : -1;
-}
-
static int memcg_alloc_cache_id(void)
{
int id, size;
@@ -5127,10 +4833,12 @@
static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
- struct task_struct *p = cgroup_taskset_first(tset);
- int ret = 0;
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *from;
+ struct task_struct *p;
+ struct mm_struct *mm;
unsigned long move_flags;
+ int ret = 0;
/*
* We are now commited to this value whatever it is. Changes in this
@@ -5138,36 +4846,37 @@
* So we need to save it, and keep it going.
*/
move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
- if (move_flags) {
- struct mm_struct *mm;
- struct mem_cgroup *from = mem_cgroup_from_task(p);
+ if (!move_flags)
+ return 0;
- VM_BUG_ON(from == memcg);
+ p = cgroup_taskset_first(tset);
+ from = mem_cgroup_from_task(p);
- mm = get_task_mm(p);
- if (!mm)
- return 0;
- /* We move charges only when we move a owner of the mm */
- if (mm->owner == p) {
- VM_BUG_ON(mc.from);
- VM_BUG_ON(mc.to);
- VM_BUG_ON(mc.precharge);
- VM_BUG_ON(mc.moved_charge);
- VM_BUG_ON(mc.moved_swap);
+ VM_BUG_ON(from == memcg);
- spin_lock(&mc.lock);
- mc.from = from;
- mc.to = memcg;
- mc.flags = move_flags;
- spin_unlock(&mc.lock);
- /* We set mc.moving_task later */
+ mm = get_task_mm(p);
+ if (!mm)
+ return 0;
+ /* We move charges only when we move a owner of the mm */
+ if (mm->owner == p) {
+ VM_BUG_ON(mc.from);
+ VM_BUG_ON(mc.to);
+ VM_BUG_ON(mc.precharge);
+ VM_BUG_ON(mc.moved_charge);
+ VM_BUG_ON(mc.moved_swap);
- ret = mem_cgroup_precharge_mc(mm);
- if (ret)
- mem_cgroup_clear_mc();
- }
- mmput(mm);
+ spin_lock(&mc.lock);
+ mc.from = from;
+ mc.to = memcg;
+ mc.flags = move_flags;
+ spin_unlock(&mc.lock);
+ /* We set mc.moving_task later */
+
+ ret = mem_cgroup_precharge_mc(mm);
+ if (ret)
+ mem_cgroup_clear_mc();
}
+ mmput(mm);
return ret;
}
@@ -5521,19 +5230,6 @@
};
/**
- * mem_cgroup_events - count memory events against a cgroup
- * @memcg: the memory cgroup
- * @idx: the event index
- * @nr: the number of events to account for
- */
-void mem_cgroup_events(struct mem_cgroup *memcg,
- enum mem_cgroup_events_index idx,
- unsigned int nr)
-{
- this_cpu_add(memcg->stat->events[idx], nr);
-}
-
-/**
* mem_cgroup_low - check if memory consumption is below the normal range
* @root: the highest ancestor to consider
* @memcg: the memory cgroup to check
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 1f4446a9..eeda648 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -146,7 +146,7 @@
if (!mem)
return -EINVAL;
- css = mem_cgroup_css(mem);
+ css = &mem->css;
ino = cgroup_ino(css->cgroup);
css_put(css);
@@ -934,6 +934,27 @@
}
EXPORT_SYMBOL_GPL(get_hwpoison_page);
+/**
+ * put_hwpoison_page() - Put refcount for memory error handling:
+ * @page: raw error page (hit by memory error)
+ */
+void put_hwpoison_page(struct page *page)
+{
+ struct page *head = compound_head(page);
+
+ if (PageHuge(head)) {
+ put_page(head);
+ return;
+ }
+
+ if (PageTransHuge(head))
+ if (page != head)
+ put_page(head);
+
+ put_page(page);
+}
+EXPORT_SYMBOL_GPL(put_hwpoison_page);
+
/*
* Do all that is necessary to remove user space mappings. Unmap
* the pages and send SIGBUS to the processes if the data was dirty.
@@ -1100,7 +1121,7 @@
nr_pages = 1 << compound_order(hpage);
else /* normal page or thp */
nr_pages = 1;
- atomic_long_add(nr_pages, &num_poisoned_pages);
+ num_poisoned_pages_add(nr_pages);
/*
* We need/can do nothing about count=0 pages.
@@ -1128,7 +1149,7 @@
if (PageHWPoison(hpage)) {
if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
|| (p != hpage && TestSetPageHWPoison(hpage))) {
- atomic_long_sub(nr_pages, &num_poisoned_pages);
+ num_poisoned_pages_sub(nr_pages);
unlock_page(hpage);
return 0;
}
@@ -1152,10 +1173,8 @@
else
pr_err("MCE: %#lx: thp split failed\n", pfn);
if (TestClearPageHWPoison(p))
- atomic_long_sub(nr_pages, &num_poisoned_pages);
- put_page(p);
- if (p != hpage)
- put_page(hpage);
+ num_poisoned_pages_sub(nr_pages);
+ put_hwpoison_page(p);
return -EBUSY;
}
VM_BUG_ON_PAGE(!page_count(p), p);
@@ -1214,16 +1233,16 @@
*/
if (!PageHWPoison(p)) {
printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
- atomic_long_sub(nr_pages, &num_poisoned_pages);
+ num_poisoned_pages_sub(nr_pages);
unlock_page(hpage);
- put_page(hpage);
+ put_hwpoison_page(hpage);
return 0;
}
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
- atomic_long_sub(nr_pages, &num_poisoned_pages);
+ num_poisoned_pages_sub(nr_pages);
unlock_page(hpage);
- put_page(hpage);
+ put_hwpoison_page(hpage);
return 0;
}
@@ -1237,7 +1256,7 @@
if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED);
unlock_page(hpage);
- put_page(hpage);
+ put_hwpoison_page(hpage);
return 0;
}
/*
@@ -1426,6 +1445,22 @@
return 0;
}
+ if (page_count(page) > 1) {
+ pr_info("MCE: Someone grabs the hwpoison page %#lx\n", pfn);
+ return 0;
+ }
+
+ if (page_mapped(page)) {
+ pr_info("MCE: Someone maps the hwpoison page %#lx\n", pfn);
+ return 0;
+ }
+
+ if (page_mapping(page)) {
+ pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n",
+ pfn);
+ return 0;
+ }
+
/*
* unpoison_memory() can encounter thp only when the thp is being
* worked by memory_failure() and the page lock is not held yet.
@@ -1450,7 +1485,7 @@
return 0;
}
if (TestClearPageHWPoison(p))
- atomic_long_dec(&num_poisoned_pages);
+ num_poisoned_pages_dec();
pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
return 0;
}
@@ -1464,16 +1499,16 @@
*/
if (TestClearPageHWPoison(page)) {
pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
- atomic_long_sub(nr_pages, &num_poisoned_pages);
+ num_poisoned_pages_sub(nr_pages);
freeit = 1;
if (PageHuge(page))
clear_page_hwpoison_huge_page(page);
}
unlock_page(page);
- put_page(page);
+ put_hwpoison_page(page);
if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
- put_page(page);
+ put_hwpoison_page(page);
return 0;
}
@@ -1486,7 +1521,7 @@
return alloc_huge_page_node(page_hstate(compound_head(p)),
nid);
else
- return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
+ return __alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0);
}
/*
@@ -1533,7 +1568,7 @@
/*
* Try to free it.
*/
- put_page(page);
+ put_hwpoison_page(page);
shake_page(page, 1);
/*
@@ -1542,7 +1577,7 @@
ret = __get_any_page(page, pfn, 0);
if (!PageLRU(page)) {
/* Drop page reference which is from __get_any_page() */
- put_page(page);
+ put_hwpoison_page(page);
pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
pfn, page->flags);
return -EIO;
@@ -1565,7 +1600,7 @@
lock_page(hpage);
if (PageHWPoison(hpage)) {
unlock_page(hpage);
- put_page(hpage);
+ put_hwpoison_page(hpage);
pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
return -EBUSY;
}
@@ -1576,7 +1611,7 @@
* get_any_page() and isolate_huge_page() takes a refcount each,
* so need to drop one here.
*/
- put_page(hpage);
+ put_hwpoison_page(hpage);
if (!ret) {
pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
return -EBUSY;
@@ -1600,11 +1635,10 @@
if (PageHuge(page)) {
set_page_hwpoison_huge_page(hpage);
dequeue_hwpoisoned_huge_page(hpage);
- atomic_long_add(1 << compound_order(hpage),
- &num_poisoned_pages);
+ num_poisoned_pages_add(1 << compound_order(hpage));
} else {
SetPageHWPoison(page);
- atomic_long_inc(&num_poisoned_pages);
+ num_poisoned_pages_inc();
}
}
return ret;
@@ -1625,7 +1659,7 @@
wait_on_page_writeback(page);
if (PageHWPoison(page)) {
unlock_page(page);
- put_page(page);
+ put_hwpoison_page(page);
pr_info("soft offline: %#lx page already poisoned\n", pfn);
return -EBUSY;
}
@@ -1640,10 +1674,10 @@
* would need to fix isolation locking first.
*/
if (ret == 1) {
- put_page(page);
+ put_hwpoison_page(page);
pr_info("soft_offline: %#lx: invalidated\n", pfn);
SetPageHWPoison(page);
- atomic_long_inc(&num_poisoned_pages);
+ num_poisoned_pages_inc();
return 0;
}
@@ -1657,14 +1691,12 @@
* Drop page reference which is came from get_any_page()
* successful isolate_lru_page() already took another one.
*/
- put_page(page);
+ put_hwpoison_page(page);
if (!ret) {
LIST_HEAD(pagelist);
inc_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
list_add(&page->lru, &pagelist);
- if (!TestSetPageHWPoison(page))
- atomic_long_inc(&num_poisoned_pages);
ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
MIGRATE_SYNC, MR_MEMORY_FAILURE);
if (ret) {
@@ -1679,8 +1711,6 @@
pfn, ret, page->flags);
if (ret > 0)
ret = -EIO;
- if (TestClearPageHWPoison(page))
- atomic_long_dec(&num_poisoned_pages);
}
} else {
pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
@@ -1719,12 +1749,16 @@
if (PageHWPoison(page)) {
pr_info("soft offline: %#lx page already poisoned\n", pfn);
+ if (flags & MF_COUNT_INCREASED)
+ put_hwpoison_page(page);
return -EBUSY;
}
if (!PageHuge(page) && PageTransHuge(hpage)) {
if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
pr_info("soft offline: %#lx: failed to split THP\n",
pfn);
+ if (flags & MF_COUNT_INCREASED)
+ put_hwpoison_page(page);
return -EBUSY;
}
}
@@ -1742,11 +1776,10 @@
if (PageHuge(page)) {
set_page_hwpoison_huge_page(hpage);
if (!dequeue_hwpoisoned_huge_page(hpage))
- atomic_long_add(1 << compound_order(hpage),
- &num_poisoned_pages);
+ num_poisoned_pages_add(1 << compound_order(hpage));
} else {
if (!TestSetPageHWPoison(page))
- atomic_long_inc(&num_poisoned_pages);
+ num_poisoned_pages_inc();
}
}
return ret;
diff --git a/mm/memory.c b/mm/memory.c
index bb04d8f..6cd0b21 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2426,8 +2426,6 @@
if (details.last_index < details.first_index)
details.last_index = ULONG_MAX;
-
- /* DAX uses i_mmap_lock to serialise file truncate vs page fault */
i_mmap_lock_write(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
unmap_mapping_range_tree(&mapping->i_mmap, &details);
@@ -3015,9 +3013,9 @@
} else {
/*
* The fault handler has no page to lock, so it holds
- * i_mmap_lock for read to protect against truncate.
+ * i_mmap_lock for write to protect against truncate.
*/
- i_mmap_unlock_read(vma->vm_file->f_mapping);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
}
goto uncharge_out;
}
@@ -3031,9 +3029,9 @@
} else {
/*
* The fault handler has no page to lock, so it holds
- * i_mmap_lock for read to protect against truncate.
+ * i_mmap_lock for write to protect against truncate.
*/
- i_mmap_unlock_read(vma->vm_file->f_mapping);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
}
return ret;
uncharge_out:
@@ -3232,6 +3230,27 @@
return 0;
}
+static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd, unsigned int flags)
+{
+ if (!vma->vm_ops)
+ return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags);
+ if (vma->vm_ops->pmd_fault)
+ return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
+ return VM_FAULT_FALLBACK;
+}
+
+static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd, pmd_t orig_pmd,
+ unsigned int flags)
+{
+ if (!vma->vm_ops)
+ return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd);
+ if (vma->vm_ops->pmd_fault)
+ return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
+ return VM_FAULT_FALLBACK;
+}
+
/*
* These routines also need to handle stuff like marking pages dirty
* and/or accessed for architectures that don't do it in hardware (most
@@ -3267,12 +3286,12 @@
barrier();
if (!pte_present(entry)) {
if (pte_none(entry)) {
- if (vma->vm_ops)
+ if (vma_is_anonymous(vma))
+ return do_anonymous_page(mm, vma, address,
+ pte, pmd, flags);
+ else
return do_fault(mm, vma, address, pte, pmd,
flags, entry);
-
- return do_anonymous_page(mm, vma, address, pte, pmd,
- flags);
}
return do_swap_page(mm, vma, address,
pte, pmd, flags, entry);
@@ -3334,10 +3353,7 @@
if (!pmd)
return VM_FAULT_OOM;
if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
- int ret = VM_FAULT_FALLBACK;
- if (!vma->vm_ops)
- ret = do_huge_pmd_anonymous_page(mm, vma, address,
- pmd, flags);
+ int ret = create_huge_pmd(mm, vma, address, pmd, flags);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
@@ -3361,8 +3377,8 @@
orig_pmd, pmd);
if (dirty && !pmd_write(orig_pmd)) {
- ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
- orig_pmd);
+ ret = wp_huge_pmd(mm, vma, address, pmd,
+ orig_pmd, flags);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a7f1e0d..87a1779 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -608,9 +608,6 @@
qp->prev = vma;
- if (vma->vm_flags & VM_PFNMAP)
- return 1;
-
if (flags & MPOL_MF_LAZY) {
/* Similar to task_numa_work, skip inaccessible VMAs */
if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
@@ -945,7 +942,7 @@
return alloc_huge_page_node(page_hstate(compound_head(page)),
node);
else
- return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE |
+ return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
__GFP_THISNODE, 0);
}
@@ -2001,7 +1998,7 @@
nmask = policy_nodemask(gfp, pol);
if (!nmask || node_isset(hpage_node, *nmask)) {
mpol_cond_put(pol);
- page = alloc_pages_exact_node(hpage_node,
+ page = __alloc_pages_node(hpage_node,
gfp | __GFP_THISNODE, order);
goto out;
}
diff --git a/mm/mempool.c b/mm/mempool.c
index 2cc08de..4c533bc 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -150,6 +150,9 @@
*/
void mempool_destroy(mempool_t *pool)
{
+ if (unlikely(!pool))
+ return;
+
while (pool->curr_nr) {
void *element = remove_element(pool);
pool->free(element, pool->pool_data);
diff --git a/mm/memtest.c b/mm/memtest.c
index 0a1cc13..8eaa4c3 100644
--- a/mm/memtest.c
+++ b/mm/memtest.c
@@ -1,11 +1,6 @@
#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
#include <linux/init.h>
-#include <linux/pfn.h>
#include <linux/memblock.h>
static u64 patterns[] __initdata = {
@@ -31,10 +26,8 @@
static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad)
{
- printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n",
- (unsigned long long) pattern,
- (unsigned long long) start_bad,
- (unsigned long long) end_bad);
+ pr_info(" %016llx bad mem addr %pa - %pa reserved\n",
+ cpu_to_be64(pattern), &start_bad, &end_bad);
memblock_reserve(start_bad, end_bad - start_bad);
}
@@ -79,26 +72,26 @@
this_start = clamp(this_start, start, end);
this_end = clamp(this_end, start, end);
if (this_start < this_end) {
- printk(KERN_INFO " %010llx - %010llx pattern %016llx\n",
- (unsigned long long)this_start,
- (unsigned long long)this_end,
- (unsigned long long)cpu_to_be64(pattern));
+ pr_info(" %pa - %pa pattern %016llx\n",
+ &this_start, &this_end, cpu_to_be64(pattern));
memtest(pattern, this_start, this_end - this_start);
}
}
}
/* default is disabled */
-static int memtest_pattern __initdata;
+static unsigned int memtest_pattern __initdata;
static int __init parse_memtest(char *arg)
{
+ int ret = 0;
+
if (arg)
- memtest_pattern = simple_strtoul(arg, NULL, 0);
+ ret = kstrtouint(arg, 0, &memtest_pattern);
else
memtest_pattern = ARRAY_SIZE(patterns);
- return 0;
+ return ret;
}
early_param("memtest", parse_memtest);
@@ -111,7 +104,7 @@
if (!memtest_pattern)
return;
- printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern);
+ pr_info("early_memtest: # of tests: %u\n", memtest_pattern);
for (i = memtest_pattern-1; i < UINT_MAX; --i) {
idx = i % ARRAY_SIZE(patterns);
do_one_pass(patterns[idx], start, end);
diff --git a/mm/migrate.c b/mm/migrate.c
index 5c08cab..02ce25d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -880,8 +880,7 @@
/* Establish migration ptes or remove ptes */
if (page_mapped(page)) {
try_to_unmap(page,
- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS|
- TTU_IGNORE_HWPOISON);
+ TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
page_was_mapped = 1;
}
@@ -952,9 +951,11 @@
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
/* Soft-offlined page shouldn't go through lru cache list */
- if (reason == MR_MEMORY_FAILURE)
+ if (reason == MR_MEMORY_FAILURE) {
put_page(page);
- else
+ if (!test_set_page_hwpoison(page))
+ num_poisoned_pages_inc();
+ } else
putback_lru_page(page);
}
@@ -1194,7 +1195,7 @@
return alloc_huge_page_node(page_hstate(compound_head(p)),
pm->node);
else
- return alloc_pages_exact_node(pm->node,
+ return __alloc_pages_node(pm->node,
GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
}
@@ -1554,7 +1555,7 @@
int nid = (int) data;
struct page *newpage;
- newpage = alloc_pages_exact_node(nid,
+ newpage = __alloc_pages_node(nid,
(GFP_HIGHUSER_MOVABLE |
__GFP_THISNODE | __GFP_NOMEMALLOC |
__GFP_NORETRY | __GFP_NOWARN) &
diff --git a/mm/mmap.c b/mm/mmap.c
index 82db4fc..b6be324 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2455,7 +2455,7 @@
unsigned long addr, int new_below)
{
struct vm_area_struct *new;
- int err = -ENOMEM;
+ int err;
if (is_vm_hugetlb_page(vma) && (addr &
~(huge_page_mask(hstate_vma(vma)))))
@@ -2463,7 +2463,7 @@
new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (!new)
- goto out_err;
+ return -ENOMEM;
/* most fields are the same, copy all, and then fixup */
*new = *vma;
@@ -2511,7 +2511,6 @@
mpol_put(vma_policy(new));
out_free_vma:
kmem_cache_free(vm_area_cachep, new);
- out_err:
return err;
}
@@ -2872,6 +2871,13 @@
struct vm_area_struct *prev;
struct rb_node **rb_link, *rb_parent;
+ if (find_vma_links(mm, vma->vm_start, vma->vm_end,
+ &prev, &rb_link, &rb_parent))
+ return -ENOMEM;
+ if ((vma->vm_flags & VM_ACCOUNT) &&
+ security_vm_enough_memory_mm(mm, vma_pages(vma)))
+ return -ENOMEM;
+
/*
* The vm_pgoff of a purely anonymous vma should be irrelevant
* until its first write fault, when page's anon_vma and index
@@ -2884,16 +2890,10 @@
* using the existing file pgoff checks and manipulations.
* Similarly in do_mmap_pgoff and in do_brk.
*/
- if (!vma->vm_file) {
+ if (vma_is_anonymous(vma)) {
BUG_ON(vma->anon_vma);
vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
}
- if (find_vma_links(mm, vma->vm_start, vma->vm_end,
- &prev, &rb_link, &rb_parent))
- return -ENOMEM;
- if ((vma->vm_flags & VM_ACCOUNT) &&
- security_vm_enough_memory_mm(mm, vma_pages(vma)))
- return -ENOMEM;
vma_link(mm, vma, prev, rb_link, rb_parent);
return 0;
@@ -2918,7 +2918,7 @@
* If anonymous vma has not yet been faulted, update new pgoff
* to match new location, to increase its chance of merging.
*/
- if (unlikely(!vma->vm_file && !vma->anon_vma)) {
+ if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
pgoff = addr >> PAGE_SHIFT;
faulted_in_anon_vma = false;
}
@@ -2952,30 +2952,31 @@
*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
} else {
new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
- if (new_vma) {
- *new_vma = *vma;
- new_vma->vm_start = addr;
- new_vma->vm_end = addr + len;
- new_vma->vm_pgoff = pgoff;
- if (vma_dup_policy(vma, new_vma))
- goto out_free_vma;
- INIT_LIST_HEAD(&new_vma->anon_vma_chain);
- if (anon_vma_clone(new_vma, vma))
- goto out_free_mempol;
- if (new_vma->vm_file)
- get_file(new_vma->vm_file);
- if (new_vma->vm_ops && new_vma->vm_ops->open)
- new_vma->vm_ops->open(new_vma);
- vma_link(mm, new_vma, prev, rb_link, rb_parent);
- *need_rmap_locks = false;
- }
+ if (!new_vma)
+ goto out;
+ *new_vma = *vma;
+ new_vma->vm_start = addr;
+ new_vma->vm_end = addr + len;
+ new_vma->vm_pgoff = pgoff;
+ if (vma_dup_policy(vma, new_vma))
+ goto out_free_vma;
+ INIT_LIST_HEAD(&new_vma->anon_vma_chain);
+ if (anon_vma_clone(new_vma, vma))
+ goto out_free_mempol;
+ if (new_vma->vm_file)
+ get_file(new_vma->vm_file);
+ if (new_vma->vm_ops && new_vma->vm_ops->open)
+ new_vma->vm_ops->open(new_vma);
+ vma_link(mm, new_vma, prev, rb_link, rb_parent);
+ *need_rmap_locks = false;
}
return new_vma;
- out_free_mempol:
+out_free_mempol:
mpol_put(vma_policy(new_vma));
- out_free_vma:
+out_free_vma:
kmem_cache_free(vm_area_cachep, new_vma);
+out:
return NULL;
}
@@ -3027,21 +3028,13 @@
pgoff_t pgoff;
struct page **pages;
- /*
- * special mappings have no vm_file, and in that case, the mm
- * uses vm_pgoff internally. So we have to subtract it from here.
- * We are allowed to do this because we are the mm; do not copy
- * this code into drivers!
- */
- pgoff = vmf->pgoff - vma->vm_pgoff;
-
if (vma->vm_ops == &legacy_special_mapping_vmops)
pages = vma->vm_private_data;
else
pages = ((struct vm_special_mapping *)vma->vm_private_data)->
pages;
- for (; pgoff && *pages; ++pages)
+ for (pgoff = vmf->pgoff; pgoff && *pages; ++pages)
pgoff--;
if (*pages) {
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dff991e..1ecc0bc 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -196,27 +196,26 @@
* Determine the type of allocation constraint.
*/
#ifdef CONFIG_NUMA
-static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
- gfp_t gfp_mask, nodemask_t *nodemask,
- unsigned long *totalpages)
+static enum oom_constraint constrained_alloc(struct oom_control *oc,
+ unsigned long *totalpages)
{
struct zone *zone;
struct zoneref *z;
- enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+ enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask);
bool cpuset_limited = false;
int nid;
/* Default to all available memory */
*totalpages = totalram_pages + total_swap_pages;
- if (!zonelist)
+ if (!oc->zonelist)
return CONSTRAINT_NONE;
/*
* Reach here only when __GFP_NOFAIL is used. So, we should avoid
* to kill current.We have to random task kill in this case.
* Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
*/
- if (gfp_mask & __GFP_THISNODE)
+ if (oc->gfp_mask & __GFP_THISNODE)
return CONSTRAINT_NONE;
/*
@@ -224,17 +223,18 @@
* the page allocator means a mempolicy is in effect. Cpuset policy
* is enforced in get_page_from_freelist().
*/
- if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) {
+ if (oc->nodemask &&
+ !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
*totalpages = total_swap_pages;
- for_each_node_mask(nid, *nodemask)
+ for_each_node_mask(nid, *oc->nodemask)
*totalpages += node_spanned_pages(nid);
return CONSTRAINT_MEMORY_POLICY;
}
/* Check this allocation failure is caused by cpuset's wall function */
- for_each_zone_zonelist_nodemask(zone, z, zonelist,
- high_zoneidx, nodemask)
- if (!cpuset_zone_allowed(zone, gfp_mask))
+ for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
+ high_zoneidx, oc->nodemask)
+ if (!cpuset_zone_allowed(zone, oc->gfp_mask))
cpuset_limited = true;
if (cpuset_limited) {
@@ -246,20 +246,18 @@
return CONSTRAINT_NONE;
}
#else
-static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
- gfp_t gfp_mask, nodemask_t *nodemask,
- unsigned long *totalpages)
+static enum oom_constraint constrained_alloc(struct oom_control *oc,
+ unsigned long *totalpages)
{
*totalpages = totalram_pages + total_swap_pages;
return CONSTRAINT_NONE;
}
#endif
-enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
- unsigned long totalpages, const nodemask_t *nodemask,
- bool force_kill)
+enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
+ struct task_struct *task, unsigned long totalpages)
{
- if (oom_unkillable_task(task, NULL, nodemask))
+ if (oom_unkillable_task(task, NULL, oc->nodemask))
return OOM_SCAN_CONTINUE;
/*
@@ -267,7 +265,7 @@
* Don't allow any other task to have access to the reserves.
*/
if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
- if (!force_kill)
+ if (oc->order != -1)
return OOM_SCAN_ABORT;
}
if (!task->mm)
@@ -280,7 +278,7 @@
if (oom_task_origin(task))
return OOM_SCAN_SELECT;
- if (task_will_free_mem(task) && !force_kill)
+ if (task_will_free_mem(task) && oc->order != -1)
return OOM_SCAN_ABORT;
return OOM_SCAN_OK;
@@ -289,12 +287,9 @@
/*
* Simple selection loop. We chose the process with the highest
* number of 'points'. Returns -1 on scan abort.
- *
- * (not docbooked, we don't want this one cluttering up the manual)
*/
-static struct task_struct *select_bad_process(unsigned int *ppoints,
- unsigned long totalpages, const nodemask_t *nodemask,
- bool force_kill)
+static struct task_struct *select_bad_process(struct oom_control *oc,
+ unsigned int *ppoints, unsigned long totalpages)
{
struct task_struct *g, *p;
struct task_struct *chosen = NULL;
@@ -304,8 +299,7 @@
for_each_process_thread(g, p) {
unsigned int points;
- switch (oom_scan_process_thread(p, totalpages, nodemask,
- force_kill)) {
+ switch (oom_scan_process_thread(oc, p, totalpages)) {
case OOM_SCAN_SELECT:
chosen = p;
chosen_points = ULONG_MAX;
@@ -318,7 +312,7 @@
case OOM_SCAN_OK:
break;
};
- points = oom_badness(p, NULL, nodemask, totalpages);
+ points = oom_badness(p, NULL, oc->nodemask, totalpages);
if (!points || points < chosen_points)
continue;
/* Prefer thread group leaders for display purposes */
@@ -380,13 +374,13 @@
rcu_read_unlock();
}
-static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
- struct mem_cgroup *memcg, const nodemask_t *nodemask)
+static void dump_header(struct oom_control *oc, struct task_struct *p,
+ struct mem_cgroup *memcg)
{
task_lock(current);
pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
"oom_score_adj=%hd\n",
- current->comm, gfp_mask, order,
+ current->comm, oc->gfp_mask, oc->order,
current->signal->oom_score_adj);
cpuset_print_task_mems_allowed(current);
task_unlock(current);
@@ -396,7 +390,7 @@
else
show_mem(SHOW_MEM_FILTER_NODES);
if (sysctl_oom_dump_tasks)
- dump_tasks(memcg, nodemask);
+ dump_tasks(memcg, oc->nodemask);
}
/*
@@ -487,10 +481,9 @@
* Must be called while holding a reference to p, which will be released upon
* returning.
*/
-void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+void oom_kill_process(struct oom_control *oc, struct task_struct *p,
unsigned int points, unsigned long totalpages,
- struct mem_cgroup *memcg, nodemask_t *nodemask,
- const char *message)
+ struct mem_cgroup *memcg, const char *message)
{
struct task_struct *victim = p;
struct task_struct *child;
@@ -514,7 +507,7 @@
task_unlock(p);
if (__ratelimit(&oom_rs))
- dump_header(p, gfp_mask, order, memcg, nodemask);
+ dump_header(oc, p, memcg);
task_lock(p);
pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
@@ -537,7 +530,7 @@
/*
* oom_badness() returns 0 if the thread is unkillable
*/
- child_points = oom_badness(child, memcg, nodemask,
+ child_points = oom_badness(child, memcg, oc->nodemask,
totalpages);
if (child_points > victim_points) {
put_task_struct(victim);
@@ -600,8 +593,7 @@
/*
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
*/
-void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
- int order, const nodemask_t *nodemask,
+void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
struct mem_cgroup *memcg)
{
if (likely(!sysctl_panic_on_oom))
@@ -615,7 +607,10 @@
if (constraint != CONSTRAINT_NONE)
return;
}
- dump_header(NULL, gfp_mask, order, memcg, nodemask);
+ /* Do not panic for oom kills triggered by sysrq */
+ if (oc->order == -1)
+ return;
+ dump_header(oc, NULL, memcg);
panic("Out of memory: %s panic_on_oom is enabled\n",
sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
}
@@ -635,28 +630,21 @@
EXPORT_SYMBOL_GPL(unregister_oom_notifier);
/**
- * __out_of_memory - kill the "best" process when we run out of memory
- * @zonelist: zonelist pointer
- * @gfp_mask: memory allocation flags
- * @order: amount of memory being requested as a power of 2
- * @nodemask: nodemask passed to page allocator
- * @force_kill: true if a task must be killed, even if others are exiting
+ * out_of_memory - kill the "best" process when we run out of memory
+ * @oc: pointer to struct oom_control
*
* If we run out of memory, we have the choice between either
* killing a random task (bad), letting the system crash (worse)
* OR try to be smart about which process to kill. Note that we
* don't have to be perfect here, we just have to be good.
*/
-bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
- int order, nodemask_t *nodemask, bool force_kill)
+bool out_of_memory(struct oom_control *oc)
{
- const nodemask_t *mpol_mask;
struct task_struct *p;
unsigned long totalpages;
unsigned long freed = 0;
unsigned int uninitialized_var(points);
enum oom_constraint constraint = CONSTRAINT_NONE;
- int killed = 0;
if (oom_killer_disabled)
return false;
@@ -664,7 +652,7 @@
blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
if (freed > 0)
/* Got some memory back in the last second. */
- goto out;
+ return true;
/*
* If current has a pending SIGKILL or is exiting, then automatically
@@ -677,47 +665,42 @@
if (current->mm &&
(fatal_signal_pending(current) || task_will_free_mem(current))) {
mark_oom_victim(current);
- goto out;
+ return true;
}
/*
* Check if there were limitations on the allocation (only relevant for
* NUMA) that may require different handling.
*/
- constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
- &totalpages);
- mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
- check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL);
+ constraint = constrained_alloc(oc, &totalpages);
+ if (constraint != CONSTRAINT_MEMORY_POLICY)
+ oc->nodemask = NULL;
+ check_panic_on_oom(oc, constraint, NULL);
if (sysctl_oom_kill_allocating_task && current->mm &&
- !oom_unkillable_task(current, NULL, nodemask) &&
+ !oom_unkillable_task(current, NULL, oc->nodemask) &&
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
get_task_struct(current);
- oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
- nodemask,
+ oom_kill_process(oc, current, 0, totalpages, NULL,
"Out of memory (oom_kill_allocating_task)");
- goto out;
+ return true;
}
- p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
+ p = select_bad_process(oc, &points, totalpages);
/* Found nothing?!?! Either we hang forever, or we panic. */
- if (!p) {
- dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
+ if (!p && oc->order != -1) {
+ dump_header(oc, NULL, NULL);
panic("Out of memory and no killable processes...\n");
}
- if (p != (void *)-1UL) {
- oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
- nodemask, "Out of memory");
- killed = 1;
- }
-out:
- /*
- * Give the killed threads a good chance of exiting before trying to
- * allocate memory again.
- */
- if (killed)
+ if (p && p != (void *)-1UL) {
+ oom_kill_process(oc, p, points, totalpages, NULL,
+ "Out of memory");
+ /*
+ * Give the killed process a good chance to exit before trying
+ * to allocate memory again.
+ */
schedule_timeout_killable(1);
-
+ }
return true;
}
@@ -728,13 +711,20 @@
*/
void pagefault_out_of_memory(void)
{
+ struct oom_control oc = {
+ .zonelist = NULL,
+ .nodemask = NULL,
+ .gfp_mask = 0,
+ .order = 0,
+ };
+
if (mem_cgroup_oom_synchronize(true))
return;
if (!mutex_trylock(&oom_lock))
return;
- if (!out_of_memory(NULL, 0, 0, NULL, false)) {
+ if (!out_of_memory(&oc)) {
/*
* There shouldn't be any user tasks runnable while the
* OOM killer is disabled, so the current task has to
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b401d40..48aaf7b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -125,6 +125,24 @@
int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
+/*
+ * A cached value of the page's pageblock's migratetype, used when the page is
+ * put on a pcplist. Used to avoid the pageblock migratetype lookup when
+ * freeing from pcplists in most cases, at the cost of possibly becoming stale.
+ * Also the migratetype set in the page does not necessarily match the pcplist
+ * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
+ * other index - this ensures that it will be put on the correct CMA freelist.
+ */
+static inline int get_pcppage_migratetype(struct page *page)
+{
+ return page->index;
+}
+
+static inline void set_pcppage_migratetype(struct page *page, int migratetype)
+{
+ page->index = migratetype;
+}
+
#ifdef CONFIG_PM_SLEEP
/*
* The following functions are used by the suspend/hibernate code to temporarily
@@ -791,7 +809,11 @@
page = list_entry(list->prev, struct page, lru);
/* must delete as __free_one_page list manipulates */
list_del(&page->lru);
- mt = get_freepage_migratetype(page);
+
+ mt = get_pcppage_migratetype(page);
+ /* MIGRATE_ISOLATE page should not go to pcplists */
+ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
+ /* Pageblock could have been isolated meanwhile */
if (unlikely(has_isolate_pageblock(zone)))
mt = get_pageblock_migratetype(page);
@@ -955,7 +977,6 @@
migratetype = get_pfnblock_migratetype(page, pfn);
local_irq_save(flags);
__count_vm_events(PGFREE, 1 << order);
- set_freepage_migratetype(page, migratetype);
free_one_page(page_zone(page), page, pfn, order, migratetype);
local_irq_restore(flags);
}
@@ -1383,7 +1404,7 @@
rmv_page_order(page);
area->nr_free--;
expand(zone, page, order, current_order, area, migratetype);
- set_freepage_migratetype(page, migratetype);
+ set_pcppage_migratetype(page, migratetype);
return page;
}
@@ -1460,7 +1481,6 @@
order = page_order(page);
list_move(&page->lru,
&zone->free_area[order].free_list[migratetype]);
- set_freepage_migratetype(page, migratetype);
page += 1 << order;
pages_moved += 1 << order;
}
@@ -1630,14 +1650,13 @@
expand(zone, page, order, current_order, area,
start_migratetype);
/*
- * The freepage_migratetype may differ from pageblock's
+ * The pcppage_migratetype may differ from pageblock's
* migratetype depending on the decisions in
- * try_to_steal_freepages(). This is OK as long as it
- * does not differ for MIGRATE_CMA pageblocks. For CMA
- * we need to make sure unallocated pages flushed from
- * pcp lists are returned to the correct freelist.
+ * find_suitable_fallback(). This is OK as long as it does not
+ * differ for MIGRATE_CMA pageblocks. Those can be used as
+ * fallback only via special __rmqueue_cma_fallback() function
*/
- set_freepage_migratetype(page, start_migratetype);
+ set_pcppage_migratetype(page, start_migratetype);
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, fallback_mt);
@@ -1713,7 +1732,7 @@
else
list_add_tail(&page->lru, list);
list = &page->lru;
- if (is_migrate_cma(get_freepage_migratetype(page)))
+ if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
}
@@ -1910,7 +1929,7 @@
return;
migratetype = get_pfnblock_migratetype(page, pfn);
- set_freepage_migratetype(page, migratetype);
+ set_pcppage_migratetype(page, migratetype);
local_irq_save(flags);
__count_vm_event(PGFREE);
@@ -2115,7 +2134,7 @@
if (!page)
goto failed;
__mod_zone_freepage_state(zone, -(1 << order),
- get_freepage_migratetype(page));
+ get_pcppage_migratetype(page));
}
__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
@@ -2696,6 +2715,12 @@
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
const struct alloc_context *ac, unsigned long *did_some_progress)
{
+ struct oom_control oc = {
+ .zonelist = ac->zonelist,
+ .nodemask = ac->nodemask,
+ .gfp_mask = gfp_mask,
+ .order = order,
+ };
struct page *page;
*did_some_progress = 0;
@@ -2747,8 +2772,7 @@
goto out;
}
/* Exhausted what can be done so it's blamo time */
- if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)
- || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
+ if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
*did_some_progress = 1;
out:
mutex_unlock(&oom_lock);
@@ -3490,8 +3514,6 @@
*
* Like alloc_pages_exact(), but try to allocate on node nid first before falling
* back.
- * Note this is not alloc_pages_exact_node() which allocates on a specific node,
- * but is not exact.
*/
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
{
@@ -5066,7 +5088,7 @@
{
unsigned long zone_start_pfn, zone_end_pfn;
- /* When hotadd a new node, the node should be empty */
+ /* When hotadd a new node from cpu_up(), the node should be empty */
if (!node_start_pfn && !node_end_pfn)
return 0;
@@ -5133,7 +5155,7 @@
unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
unsigned long zone_start_pfn, zone_end_pfn;
- /* When hotadd a new node, the node should be empty */
+ /* When hotadd a new node from cpu_up(), the node should be empty */
if (!node_start_pfn && !node_end_pfn)
return 0;
@@ -5306,8 +5328,7 @@
*
* NOTE: pgdat should get zeroed by caller.
*/
-static void __paginginit free_area_init_core(struct pglist_data *pgdat,
- unsigned long node_start_pfn, unsigned long node_end_pfn)
+static void __paginginit free_area_init_core(struct pglist_data *pgdat)
{
enum zone_type j;
int nid = pgdat->node_id;
@@ -5458,7 +5479,8 @@
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
- (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1);
+ (u64)start_pfn << PAGE_SHIFT,
+ end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
#endif
calculate_node_totalpages(pgdat, start_pfn, end_pfn,
zones_size, zholes_size);
@@ -5470,7 +5492,7 @@
(unsigned long)pgdat->node_mem_map);
#endif
- free_area_init_core(pgdat, start_pfn, end_pfn);
+ free_area_init_core(pgdat);
}
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -5481,11 +5503,9 @@
*/
void __init setup_nr_node_ids(void)
{
- unsigned int node;
- unsigned int highest = 0;
+ unsigned int highest;
- for_each_node_mask(node, node_possible_map)
- highest = node;
+ highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
nr_node_ids = highest + 1;
}
#endif
@@ -6006,7 +6026,7 @@
* set_dma_reserve - set the specified number of pages reserved in the first zone
* @new_dma_reserve: The number of pages to mark reserved
*
- * The per-cpu batchsize and zone watermarks are determined by present_pages.
+ * The per-cpu batchsize and zone watermarks are determined by managed_pages.
* In the DMA zone, a significant percentage may be consumed by kernel image
* and other unfreeable allocations which can skew the watermarks badly. This
* function may optionally be used to account for unfreeable pages in the
@@ -6059,7 +6079,7 @@
}
/*
- * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
+ * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
* or min_free_kbytes changes.
*/
static void calculate_totalreserve_pages(void)
@@ -6103,7 +6123,7 @@
/*
* setup_per_zone_lowmem_reserve - called whenever
- * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone
+ * sysctl_lowmem_reserve_ratio changes. Ensures that each zone
* has a correct pages reserved value, so an adequate number of
* pages are left in the zone after a successful __alloc_pages().
*/
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 303c908..4568fd5 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -9,7 +9,8 @@
#include <linux/hugetlb.h>
#include "internal.h"
-int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages)
+static int set_migratetype_isolate(struct page *page,
+ bool skip_hwpoisoned_pages)
{
struct zone *zone;
unsigned long flags, pfn;
@@ -72,7 +73,7 @@
return ret;
}
-void unset_migratetype_isolate(struct page *page, unsigned migratetype)
+static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
{
struct zone *zone;
unsigned long flags, nr_pages;
@@ -223,34 +224,16 @@
continue;
}
page = pfn_to_page(pfn);
- if (PageBuddy(page)) {
+ if (PageBuddy(page))
/*
- * If race between isolatation and allocation happens,
- * some free pages could be in MIGRATE_MOVABLE list
- * although pageblock's migratation type of the page
- * is MIGRATE_ISOLATE. Catch it and move the page into
- * MIGRATE_ISOLATE list.
+ * If the page is on a free list, it has to be on
+ * the correct MIGRATE_ISOLATE freelist. There is no
+ * simple way to verify that as VM_BUG_ON(), though.
*/
- if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) {
- struct page *end_page;
-
- end_page = page + (1 << page_order(page)) - 1;
- move_freepages(page_zone(page), page, end_page,
- MIGRATE_ISOLATE);
- }
pfn += 1 << page_order(page);
- }
- else if (page_count(page) == 0 &&
- get_freepage_migratetype(page) == MIGRATE_ISOLATE)
- pfn += 1;
- else if (skip_hwpoisoned_pages && PageHWPoison(page)) {
- /*
- * The HWPoisoned page may be not in buddy
- * system, and page_count() is not 0.
- */
+ else if (skip_hwpoisoned_pages && PageHWPoison(page))
+ /* A HWPoisoned page cannot be also PageBuddy */
pfn++;
- continue;
- }
else
break;
}
diff --git a/mm/shmem.c b/mm/shmem.c
index dbe0c1e..48ce829 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -542,6 +542,21 @@
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);
+static int shmem_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct inode *inode = dentry->d_inode;
+ struct shmem_inode_info *info = SHMEM_I(inode);
+
+ spin_lock(&info->lock);
+ shmem_recalc_inode(inode);
+ spin_unlock(&info->lock);
+
+ generic_fillattr(inode, stat);
+
+ return 0;
+}
+
static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
@@ -3122,6 +3137,7 @@
};
static const struct inode_operations shmem_inode_operations = {
+ .getattr = shmem_getattr,
.setattr = shmem_setattr,
#ifdef CONFIG_TMPFS_XATTR
.setxattr = shmem_setxattr,
diff --git a/mm/slab.c b/mm/slab.c
index 60c9369..c77ebe6 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1595,7 +1595,7 @@
if (memcg_charge_slab(cachep, flags, cachep->gfporder))
return NULL;
- page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
+ page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
if (!page) {
memcg_uncharge_slab(cachep, cachep->gfporder);
slab_out_of_memory(cachep, flags, nodeid);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index c26829f..5ce4fae 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -500,7 +500,7 @@
struct kmem_cache *root_cache)
{
static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
- struct cgroup_subsys_state *css = mem_cgroup_css(memcg);
+ struct cgroup_subsys_state *css = &memcg->css;
struct memcg_cache_array *arr;
struct kmem_cache *s = NULL;
char *cache_name;
@@ -640,6 +640,9 @@
bool need_rcu_barrier = false;
bool busy = false;
+ if (unlikely(!s))
+ return;
+
BUG_ON(!is_root_cache(s));
get_online_cpus();
diff --git a/mm/slob.c b/mm/slob.c
index 165bbd3..0d7e5df 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -45,7 +45,7 @@
* NUMA support in SLOB is fairly simplistic, pushing most of the real
* logic down to the page allocator, and simply doing the node accounting
* on the upper levels. In the event that a node id is explicitly
- * provided, alloc_pages_exact_node() with the specified node id is used
+ * provided, __alloc_pages_node() with the specified node id is used
* instead. The common case (or when the node id isn't explicitly provided)
* will default to the current node, as per numa_node_id().
*
@@ -193,7 +193,7 @@
#ifdef CONFIG_NUMA
if (node != NUMA_NO_NODE)
- page = alloc_pages_exact_node(node, gfp, order);
+ page = __alloc_pages_node(node, gfp, order);
else
#endif
page = alloc_pages(gfp, order);
diff --git a/mm/slub.c b/mm/slub.c
index 084184e..f614b5d 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1334,7 +1334,7 @@
if (node == NUMA_NO_NODE)
page = alloc_pages(flags, order);
else
- page = alloc_pages_exact_node(node, flags, order);
+ page = __alloc_pages_node(node, flags, order);
if (!page)
memcg_uncharge_slab(s, order);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 8bc8e66..d504adb 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -288,17 +288,14 @@
return page;
}
-/*
- * Locate a page of swap in physical memory, reserving swap cache space
- * and reading the disk if it is not already cached.
- * A failure return means that either the page allocation failed or that
- * the swap entry is no longer in use.
- */
-struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
- struct vm_area_struct *vma, unsigned long addr)
+struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
+ struct vm_area_struct *vma, unsigned long addr,
+ bool *new_page_allocated)
{
struct page *found_page, *new_page = NULL;
+ struct address_space *swapper_space = swap_address_space(entry);
int err;
+ *new_page_allocated = false;
do {
/*
@@ -306,8 +303,7 @@
* called after lookup_swap_cache() failed, re-calling
* that would confuse statistics.
*/
- found_page = find_get_page(swap_address_space(entry),
- entry.val);
+ found_page = find_get_page(swapper_space, entry.val);
if (found_page)
break;
@@ -366,7 +362,7 @@
* Initiate read into locked page and return.
*/
lru_cache_add_anon(new_page);
- swap_readpage(new_page);
+ *new_page_allocated = true;
return new_page;
}
radix_tree_preload_end();
@@ -384,6 +380,25 @@
return found_page;
}
+/*
+ * Locate a page of swap in physical memory, reserving swap cache space
+ * and reading the disk if it is not already cached.
+ * A failure return means that either the page allocation failed or that
+ * the swap entry is no longer in use.
+ */
+struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ bool page_was_allocated;
+ struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
+ vma, addr, &page_was_allocated);
+
+ if (page_was_allocated)
+ swap_readpage(retpage);
+
+ return retpage;
+}
+
static unsigned long swapin_nr_pages(unsigned long offset)
{
static unsigned long prev_offset;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index aebc2dd6e..5887731 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -875,6 +875,48 @@
}
/*
+ * How many references to @entry are currently swapped out?
+ * This considers COUNT_CONTINUED so it returns exact answer.
+ */
+int swp_swapcount(swp_entry_t entry)
+{
+ int count, tmp_count, n;
+ struct swap_info_struct *p;
+ struct page *page;
+ pgoff_t offset;
+ unsigned char *map;
+
+ p = swap_info_get(entry);
+ if (!p)
+ return 0;
+
+ count = swap_count(p->swap_map[swp_offset(entry)]);
+ if (!(count & COUNT_CONTINUED))
+ goto out;
+
+ count &= ~COUNT_CONTINUED;
+ n = SWAP_MAP_MAX + 1;
+
+ offset = swp_offset(entry);
+ page = vmalloc_to_page(p->swap_map + offset);
+ offset &= ~PAGE_MASK;
+ VM_BUG_ON(page_private(page) != SWP_CONTINUED);
+
+ do {
+ page = list_entry(page->lru.next, struct page, lru);
+ map = kmap_atomic(page);
+ tmp_count = map[offset];
+ kunmap_atomic(map);
+
+ count += (tmp_count & ~COUNT_CONTINUED) * n;
+ n *= (SWAP_CONT_MAX + 1);
+ } while (tmp_count & COUNT_CONTINUED);
+out:
+ spin_unlock(&p->lock);
+ return count;
+}
+
+/*
* We can write to an anon page without COW if there are no other references
* to it. And as a side-effect, free up its swap: because the old content
* on disk will never be read, and seeking back there to write new content
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b113903..2d978b2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -175,7 +175,7 @@
if (!memcg)
return true;
#ifdef CONFIG_CGROUP_WRITEBACK
- if (cgroup_on_dfl(mem_cgroup_css(memcg)->cgroup))
+ if (memcg->css.cgroup)
return true;
#endif
return false;
@@ -985,7 +985,7 @@
* __GFP_IO|__GFP_FS for this reason); but more thought
* would probably show more reasons.
*
- * 3) Legacy memcg encounters a page that is not already marked
+ * 3) Legacy memcg encounters a page that is already marked
* PageReclaim. memcg does not have any dirty pages
* throttling so we could easily OOM just because too many
* pages are in writeback and there is nothing else to
@@ -1015,12 +1015,15 @@
*/
SetPageReclaim(page);
nr_writeback++;
-
goto keep_locked;
/* Case 3 above */
} else {
+ unlock_page(page);
wait_on_page_writeback(page);
+ /* then go back and try same page again */
+ list_add_tail(&page->lru, page_list);
+ continue;
}
}
@@ -1196,7 +1199,7 @@
if (PageSwapCache(page))
try_to_free_swap(page);
unlock_page(page);
- putback_lru_page(page);
+ list_add(&page->lru, &ret_pages);
continue;
activate_locked:
@@ -1359,7 +1362,8 @@
unsigned long nr_taken = 0;
unsigned long scan;
- for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
+ for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
+ !list_empty(src); scan++) {
struct page *page;
int nr_pages;
diff --git a/mm/zbud.c b/mm/zbud.c
index f3bf6f7..fa48bcdf 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -96,10 +96,10 @@
struct list_head buddied;
struct list_head lru;
u64 pages_nr;
- struct zbud_ops *ops;
+ const struct zbud_ops *ops;
#ifdef CONFIG_ZPOOL
struct zpool *zpool;
- struct zpool_ops *zpool_ops;
+ const struct zpool_ops *zpool_ops;
#endif
};
@@ -133,12 +133,12 @@
return -ENOENT;
}
-static struct zbud_ops zbud_zpool_ops = {
+static const struct zbud_ops zbud_zpool_ops = {
.evict = zbud_zpool_evict
};
static void *zbud_zpool_create(char *name, gfp_t gfp,
- struct zpool_ops *zpool_ops,
+ const struct zpool_ops *zpool_ops,
struct zpool *zpool)
{
struct zbud_pool *pool;
@@ -302,7 +302,7 @@
* Return: pointer to the new zbud pool or NULL if the metadata allocation
* failed.
*/
-struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops)
+struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops)
{
struct zbud_pool *pool;
int i;
diff --git a/mm/zpool.c b/mm/zpool.c
index 722a4f6..68d2dd8 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -22,7 +22,7 @@
struct zpool_driver *driver;
void *pool;
- struct zpool_ops *ops;
+ const struct zpool_ops *ops;
struct list_head list;
};
@@ -115,7 +115,7 @@
* Returns: New zpool on success, NULL on failure.
*/
struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
- struct zpool_ops *ops)
+ const struct zpool_ops *ops)
{
struct zpool_driver *driver;
struct zpool *zpool;
@@ -320,20 +320,6 @@
return zpool->driver->total_size(zpool->pool);
}
-static int __init init_zpool(void)
-{
- pr_info("loaded\n");
- return 0;
-}
-
-static void __exit exit_zpool(void)
-{
- pr_info("unloaded\n");
-}
-
-module_init(init_zpool);
-module_exit(exit_zpool);
-
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
MODULE_DESCRIPTION("Common API for compressed memory storage");
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 0a7f81a..f135b1b 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -169,14 +169,12 @@
NR_ZS_STAT_TYPE,
};
-#ifdef CONFIG_ZSMALLOC_STAT
-
-static struct dentry *zs_stat_root;
-
struct zs_size_stat {
unsigned long objs[NR_ZS_STAT_TYPE];
};
+#ifdef CONFIG_ZSMALLOC_STAT
+static struct dentry *zs_stat_root;
#endif
/*
@@ -201,6 +199,8 @@
static const int fullness_threshold_frac = 4;
struct size_class {
+ spinlock_t lock;
+ struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
/*
* Size of objects stored in this class. Must be multiple
* of ZS_ALIGN.
@@ -210,16 +210,10 @@
/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
int pages_per_zspage;
+ struct zs_size_stat stats;
+
/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
bool huge;
-
-#ifdef CONFIG_ZSMALLOC_STAT
- struct zs_size_stat stats;
-#endif
-
- spinlock_t lock;
-
- struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
};
/*
@@ -251,6 +245,15 @@
gfp_t flags; /* allocation flags used when growing pool */
atomic_long_t pages_allocated;
+ struct zs_pool_stats stats;
+
+ /* Compact classes */
+ struct shrinker shrinker;
+ /*
+ * To signify that register_shrinker() was successful
+ * and unregister_shrinker() will not Oops.
+ */
+ bool shrinker_enabled;
#ifdef CONFIG_ZSMALLOC_STAT
struct dentry *stat_dentry;
#endif
@@ -285,8 +288,7 @@
static void destroy_handle_cache(struct zs_pool *pool)
{
- if (pool->handle_cachep)
- kmem_cache_destroy(pool->handle_cachep);
+ kmem_cache_destroy(pool->handle_cachep);
}
static unsigned long alloc_handle(struct zs_pool *pool)
@@ -309,7 +311,8 @@
#ifdef CONFIG_ZPOOL
-static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops,
+static void *zs_zpool_create(char *name, gfp_t gfp,
+ const struct zpool_ops *zpool_ops,
struct zpool *zpool)
{
return zs_create_pool(name, gfp);
@@ -441,8 +444,6 @@
return min(zs_size_classes - 1, idx);
}
-#ifdef CONFIG_ZSMALLOC_STAT
-
static inline void zs_stat_inc(struct size_class *class,
enum zs_stat_type type, unsigned long cnt)
{
@@ -461,6 +462,8 @@
return class->stats.objs[type];
}
+#ifdef CONFIG_ZSMALLOC_STAT
+
static int __init zs_stat_init(void)
{
if (!debugfs_initialized())
@@ -576,23 +579,6 @@
}
#else /* CONFIG_ZSMALLOC_STAT */
-
-static inline void zs_stat_inc(struct size_class *class,
- enum zs_stat_type type, unsigned long cnt)
-{
-}
-
-static inline void zs_stat_dec(struct size_class *class,
- enum zs_stat_type type, unsigned long cnt)
-{
-}
-
-static inline unsigned long zs_stat_get(struct size_class *class,
- enum zs_stat_type type)
-{
- return 0;
-}
-
static int __init zs_stat_init(void)
{
return 0;
@@ -610,7 +596,6 @@
static inline void zs_pool_stat_destroy(struct zs_pool *pool)
{
}
-
#endif
@@ -658,13 +643,22 @@
if (fullness >= _ZS_NR_FULLNESS_GROUPS)
return;
- head = &class->fullness_list[fullness];
- if (*head)
- list_add_tail(&page->lru, &(*head)->lru);
-
- *head = page;
zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
+
+ head = &class->fullness_list[fullness];
+ if (!*head) {
+ *head = page;
+ return;
+ }
+
+ /*
+ * We want to see more ZS_FULL pages and less almost
+ * empty/full. Put pages with higher ->inuse first.
+ */
+ list_add_tail(&page->lru, &(*head)->lru);
+ if (page->inuse >= (*head)->inuse)
+ *head = page;
}
/*
@@ -1495,7 +1489,7 @@
}
EXPORT_SYMBOL_GPL(zs_free);
-static void zs_object_copy(unsigned long src, unsigned long dst,
+static void zs_object_copy(unsigned long dst, unsigned long src,
struct size_class *class)
{
struct page *s_page, *d_page;
@@ -1602,8 +1596,6 @@
/* Starting object index within @s_page which used for live object
* in the subpage. */
int index;
- /* how many of objects are migrated */
- int nr_migrated;
};
static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
@@ -1614,7 +1606,6 @@
struct page *s_page = cc->s_page;
struct page *d_page = cc->d_page;
unsigned long index = cc->index;
- int nr_migrated = 0;
int ret = 0;
while (1) {
@@ -1636,23 +1627,21 @@
used_obj = handle_to_obj(handle);
free_obj = obj_malloc(d_page, class, handle);
- zs_object_copy(used_obj, free_obj, class);
+ zs_object_copy(free_obj, used_obj, class);
index++;
record_obj(handle, free_obj);
unpin_tag(handle);
obj_free(pool, class, used_obj);
- nr_migrated++;
}
/* Remember last position in this iteration */
cc->s_page = s_page;
cc->index = index;
- cc->nr_migrated = nr_migrated;
return ret;
}
-static struct page *alloc_target_page(struct size_class *class)
+static struct page *isolate_target_page(struct size_class *class)
{
int i;
struct page *page;
@@ -1668,8 +1657,17 @@
return page;
}
-static void putback_zspage(struct zs_pool *pool, struct size_class *class,
- struct page *first_page)
+/*
+ * putback_zspage - add @first_page into right class's fullness list
+ * @pool: target pool
+ * @class: destination class
+ * @first_page: target page
+ *
+ * Return @fist_page's fullness_group
+ */
+static enum fullness_group putback_zspage(struct zs_pool *pool,
+ struct size_class *class,
+ struct page *first_page)
{
enum fullness_group fullness;
@@ -1687,50 +1685,72 @@
free_zspage(first_page);
}
+
+ return fullness;
}
static struct page *isolate_source_page(struct size_class *class)
{
- struct page *page;
+ int i;
+ struct page *page = NULL;
- page = class->fullness_list[ZS_ALMOST_EMPTY];
- if (page)
- remove_zspage(page, class, ZS_ALMOST_EMPTY);
+ for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) {
+ page = class->fullness_list[i];
+ if (!page)
+ continue;
+
+ remove_zspage(page, class, i);
+ break;
+ }
return page;
}
-static unsigned long __zs_compact(struct zs_pool *pool,
- struct size_class *class)
+/*
+ *
+ * Based on the number of unused allocated objects calculate
+ * and return the number of pages that we can free.
+ */
+static unsigned long zs_can_compact(struct size_class *class)
{
- int nr_to_migrate;
+ unsigned long obj_wasted;
+
+ obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) -
+ zs_stat_get(class, OBJ_USED);
+
+ obj_wasted /= get_maxobj_per_zspage(class->size,
+ class->pages_per_zspage);
+
+ return obj_wasted * class->pages_per_zspage;
+}
+
+static void __zs_compact(struct zs_pool *pool, struct size_class *class)
+{
struct zs_compact_control cc;
struct page *src_page;
struct page *dst_page = NULL;
- unsigned long nr_total_migrated = 0;
spin_lock(&class->lock);
while ((src_page = isolate_source_page(class))) {
BUG_ON(!is_first_page(src_page));
- /* The goal is to migrate all live objects in source page */
- nr_to_migrate = src_page->inuse;
+ if (!zs_can_compact(class))
+ break;
+
cc.index = 0;
cc.s_page = src_page;
- while ((dst_page = alloc_target_page(class))) {
+ while ((dst_page = isolate_target_page(class))) {
cc.d_page = dst_page;
/*
- * If there is no more space in dst_page, try to
- * allocate another zspage.
+ * If there is no more space in dst_page, resched
+ * and see if anyone had allocated another zspage.
*/
if (!migrate_zspage(pool, class, &cc))
break;
putback_zspage(pool, class, dst_page);
- nr_total_migrated += cc.nr_migrated;
- nr_to_migrate -= cc.nr_migrated;
}
/* Stop if we couldn't find slot */
@@ -1738,9 +1758,9 @@
break;
putback_zspage(pool, class, dst_page);
- putback_zspage(pool, class, src_page);
+ if (putback_zspage(pool, class, src_page) == ZS_EMPTY)
+ pool->stats.pages_compacted += class->pages_per_zspage;
spin_unlock(&class->lock);
- nr_total_migrated += cc.nr_migrated;
cond_resched();
spin_lock(&class->lock);
}
@@ -1749,14 +1769,11 @@
putback_zspage(pool, class, src_page);
spin_unlock(&class->lock);
-
- return nr_total_migrated;
}
unsigned long zs_compact(struct zs_pool *pool)
{
int i;
- unsigned long nr_migrated = 0;
struct size_class *class;
for (i = zs_size_classes - 1; i >= 0; i--) {
@@ -1765,13 +1782,80 @@
continue;
if (class->index != i)
continue;
- nr_migrated += __zs_compact(pool, class);
+ __zs_compact(pool, class);
}
- return nr_migrated;
+ return pool->stats.pages_compacted;
}
EXPORT_SYMBOL_GPL(zs_compact);
+void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats)
+{
+ memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats));
+}
+EXPORT_SYMBOL_GPL(zs_pool_stats);
+
+static unsigned long zs_shrinker_scan(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ unsigned long pages_freed;
+ struct zs_pool *pool = container_of(shrinker, struct zs_pool,
+ shrinker);
+
+ pages_freed = pool->stats.pages_compacted;
+ /*
+ * Compact classes and calculate compaction delta.
+ * Can run concurrently with a manually triggered
+ * (by user) compaction.
+ */
+ pages_freed = zs_compact(pool) - pages_freed;
+
+ return pages_freed ? pages_freed : SHRINK_STOP;
+}
+
+static unsigned long zs_shrinker_count(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ int i;
+ struct size_class *class;
+ unsigned long pages_to_free = 0;
+ struct zs_pool *pool = container_of(shrinker, struct zs_pool,
+ shrinker);
+
+ if (!pool->shrinker_enabled)
+ return 0;
+
+ for (i = zs_size_classes - 1; i >= 0; i--) {
+ class = pool->size_class[i];
+ if (!class)
+ continue;
+ if (class->index != i)
+ continue;
+
+ pages_to_free += zs_can_compact(class);
+ }
+
+ return pages_to_free;
+}
+
+static void zs_unregister_shrinker(struct zs_pool *pool)
+{
+ if (pool->shrinker_enabled) {
+ unregister_shrinker(&pool->shrinker);
+ pool->shrinker_enabled = false;
+ }
+}
+
+static int zs_register_shrinker(struct zs_pool *pool)
+{
+ pool->shrinker.scan_objects = zs_shrinker_scan;
+ pool->shrinker.count_objects = zs_shrinker_count;
+ pool->shrinker.batch = 0;
+ pool->shrinker.seeks = DEFAULT_SEEKS;
+
+ return register_shrinker(&pool->shrinker);
+}
+
/**
* zs_create_pool - Creates an allocation pool to work from.
* @flags: allocation flags used to allocate pool metadata
@@ -1857,6 +1941,12 @@
if (zs_pool_stat_create(name, pool))
goto err;
+ /*
+ * Not critical, we still can use the pool
+ * and user can trigger compaction manually.
+ */
+ if (zs_register_shrinker(pool) == 0)
+ pool->shrinker_enabled = true;
return pool;
err:
@@ -1869,6 +1959,7 @@
{
int i;
+ zs_unregister_shrinker(pool);
zs_pool_stat_destroy(pool);
for (i = 0; i < zs_size_classes; i++) {
diff --git a/mm/zswap.c b/mm/zswap.c
index 2d5727b..48a1d08 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -446,75 +446,14 @@
static int zswap_get_swap_cache_page(swp_entry_t entry,
struct page **retpage)
{
- struct page *found_page, *new_page = NULL;
- struct address_space *swapper_space = swap_address_space(entry);
- int err;
+ bool page_was_allocated;
- *retpage = NULL;
- do {
- /*
- * First check the swap cache. Since this is normally
- * called after lookup_swap_cache() failed, re-calling
- * that would confuse statistics.
- */
- found_page = find_get_page(swapper_space, entry.val);
- if (found_page)
- break;
-
- /*
- * Get a new page to read into from swap.
- */
- if (!new_page) {
- new_page = alloc_page(GFP_KERNEL);
- if (!new_page)
- break; /* Out of memory */
- }
-
- /*
- * call radix_tree_preload() while we can wait.
- */
- err = radix_tree_preload(GFP_KERNEL);
- if (err)
- break;
-
- /*
- * Swap entry may have been freed since our caller observed it.
- */
- err = swapcache_prepare(entry);
- if (err == -EEXIST) { /* seems racy */
- radix_tree_preload_end();
- continue;
- }
- if (err) { /* swp entry is obsolete ? */
- radix_tree_preload_end();
- break;
- }
-
- /* May fail (-ENOMEM) if radix-tree node allocation failed. */
- __set_page_locked(new_page);
- SetPageSwapBacked(new_page);
- err = __add_to_swap_cache(new_page, entry);
- if (likely(!err)) {
- radix_tree_preload_end();
- lru_cache_add_anon(new_page);
- *retpage = new_page;
- return ZSWAP_SWAPCACHE_NEW;
- }
- radix_tree_preload_end();
- ClearPageSwapBacked(new_page);
- __clear_page_locked(new_page);
- /*
- * add_to_swap_cache() doesn't return -EEXIST, so we can safely
- * clear SWAP_HAS_CACHE flag.
- */
- swapcache_free(entry);
- } while (err != -ENOMEM);
-
- if (new_page)
- page_cache_release(new_page);
- if (!found_page)
+ *retpage = __read_swap_cache_async(entry, GFP_KERNEL,
+ NULL, 0, &page_was_allocated);
+ if (page_was_allocated)
+ return ZSWAP_SWAPCACHE_NEW;
+ if (!*retpage)
return ZSWAP_SWAPCACHE_FAIL;
- *retpage = found_page;
return ZSWAP_SWAPCACHE_EXIST;
}
@@ -816,7 +755,7 @@
zswap_trees[type] = NULL;
}
-static struct zpool_ops zswap_zpool_ops = {
+static const struct zpool_ops zswap_zpool_ops = {
.evict = zswap_writeback_entry
};
diff --git a/scripts/coccinelle/api/alloc/pool_zalloc-simple.cocci b/scripts/coccinelle/api/alloc/pool_zalloc-simple.cocci
new file mode 100644
index 0000000..9b7eb32
--- /dev/null
+++ b/scripts/coccinelle/api/alloc/pool_zalloc-simple.cocci
@@ -0,0 +1,84 @@
+///
+/// Use *_pool_zalloc rather than *_pool_alloc followed by memset with 0
+///
+// Copyright: (C) 2015 Intel Corp. GPLv2.
+// Options: --no-includes --include-headers
+//
+// Keywords: dma_pool_zalloc, pci_pool_zalloc
+//
+
+virtual context
+virtual patch
+virtual org
+virtual report
+
+//----------------------------------------------------------
+// For context mode
+//----------------------------------------------------------
+
+@depends on context@
+expression x;
+statement S;
+@@
+
+* x = \(dma_pool_alloc\|pci_pool_alloc\)(...);
+ if ((x==NULL) || ...) S
+* memset(x,0, ...);
+
+//----------------------------------------------------------
+// For patch mode
+//----------------------------------------------------------
+
+@depends on patch@
+expression x;
+expression a,b,c;
+statement S;
+@@
+
+- x = dma_pool_alloc(a,b,c);
++ x = dma_pool_zalloc(a,b,c);
+ if ((x==NULL) || ...) S
+- memset(x,0,...);
+
+@depends on patch@
+expression x;
+expression a,b,c;
+statement S;
+@@
+
+- x = pci_pool_alloc(a,b,c);
++ x = pci_pool_zalloc(a,b,c);
+ if ((x==NULL) || ...) S
+- memset(x,0,...);
+
+//----------------------------------------------------------
+// For org and report mode
+//----------------------------------------------------------
+
+@r depends on org || report@
+expression x;
+expression a,b,c;
+statement S;
+position p;
+@@
+
+ x = @p\(dma_pool_alloc\|pci_pool_alloc\)(a,b,c);
+ if ((x==NULL) || ...) S
+ memset(x,0, ...);
+
+@script:python depends on org@
+p << r.p;
+x << r.x;
+@@
+
+msg="%s" % (x)
+msg_safe=msg.replace("[","@(").replace("]",")")
+coccilib.org.print_todo(p[0], msg_safe)
+
+@script:python depends on report@
+p << r.p;
+x << r.x;
+@@
+
+msg="WARNING: *_pool_zalloc should be used for %s, instead of *_pool_alloc/memset" % (x)
+coccilib.report.print_report(p[0], msg)
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 0d68547..d36fab7 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -4,7 +4,6 @@
BINARIES = compaction_test
BINARIES += hugepage-mmap
BINARIES += hugepage-shm
-BINARIES += hugetlbfstest
BINARIES += map_hugetlb
BINARIES += thuge-gen
BINARIES += transhuge-stress
diff --git a/tools/testing/selftests/vm/hugetlbfstest.c b/tools/testing/selftests/vm/hugetlbfstest.c
deleted file mode 100644
index 02e1072..0000000
--- a/tools/testing/selftests/vm/hugetlbfstest.c
+++ /dev/null
@@ -1,86 +0,0 @@
-#define _GNU_SOURCE
-#include <assert.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-typedef unsigned long long u64;
-
-static size_t length = 1 << 24;
-
-static u64 read_rss(void)
-{
- char buf[4096], *s = buf;
- int i, fd;
- u64 rss;
-
- fd = open("/proc/self/statm", O_RDONLY);
- assert(fd > 2);
- memset(buf, 0, sizeof(buf));
- read(fd, buf, sizeof(buf) - 1);
- for (i = 0; i < 1; i++)
- s = strchr(s, ' ') + 1;
- rss = strtoull(s, NULL, 10);
- return rss << 12; /* assumes 4k pagesize */
-}
-
-static void do_mmap(int fd, int extra_flags, int unmap)
-{
- int *p;
- int flags = MAP_PRIVATE | MAP_POPULATE | extra_flags;
- u64 before, after;
- int ret;
-
- before = read_rss();
- p = mmap(NULL, length, PROT_READ | PROT_WRITE, flags, fd, 0);
- assert(p != MAP_FAILED ||
- !"mmap returned an unexpected error");
- after = read_rss();
- assert(llabs(after - before - length) < 0x40000 ||
- !"rss didn't grow as expected");
- if (!unmap)
- return;
- ret = munmap(p, length);
- assert(!ret || !"munmap returned an unexpected error");
- after = read_rss();
- assert(llabs(after - before) < 0x40000 ||
- !"rss didn't shrink as expected");
-}
-
-static int open_file(const char *path)
-{
- int fd, err;
-
- unlink(path);
- fd = open(path, O_CREAT | O_RDWR | O_TRUNC | O_EXCL
- | O_LARGEFILE | O_CLOEXEC, 0600);
- assert(fd > 2);
- unlink(path);
- err = ftruncate(fd, length);
- assert(!err);
- return fd;
-}
-
-int main(void)
-{
- int hugefd, fd;
-
- fd = open_file("/dev/shm/hugetlbhog");
- hugefd = open_file("/hugepages/hugetlbhog");
-
- system("echo 100 > /proc/sys/vm/nr_hugepages");
- do_mmap(-1, MAP_ANONYMOUS, 1);
- do_mmap(fd, 0, 1);
- do_mmap(-1, MAP_ANONYMOUS | MAP_HUGETLB, 1);
- do_mmap(hugefd, 0, 1);
- do_mmap(hugefd, MAP_HUGETLB, 1);
- /* Leak the last one to test do_exit() */
- do_mmap(-1, MAP_ANONYMOUS | MAP_HUGETLB, 0);
- printf("oll korrekt.\n");
- return 0;
-}
diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests
index 831adeb..9179ce8 100755
--- a/tools/testing/selftests/vm/run_vmtests
+++ b/tools/testing/selftests/vm/run_vmtests
@@ -75,16 +75,9 @@
echo "[PASS]"
fi
-echo "--------------------"
-echo "running hugetlbfstest"
-echo "--------------------"
-./hugetlbfstest
-if [ $? -ne 0 ]; then
- echo "[FAIL]"
- exitcode=1
-else
- echo "[PASS]"
-fi
+echo "NOTE: The above hugetlb tests provide minimal coverage. Use"
+echo " https://github.com/libhugetlbfs/libhugetlbfs.git for"
+echo " hugetlb regression testing."
echo "--------------------"
echo "running userfaultfd"
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index 76071b1..2c7cca6 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -147,7 +147,8 @@
if (sizeof(page_nr) > sizeof(rand_nr)) {
if (random_r(&rand, &rand_nr))
fprintf(stderr, "random_r 2 error\n"), exit(1);
- page_nr |= ((unsigned long) rand_nr) << 32;
+ page_nr |= (((unsigned long) rand_nr) << 16) <<
+ 16;
}
} else
page_nr += 1;
@@ -290,7 +291,8 @@
msg.event), exit(1);
if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
fprintf(stderr, "unexpected write fault\n"), exit(1);
- offset = (char *)msg.arg.pagefault.address - area_dst;
+ offset = (char *)(unsigned long)msg.arg.pagefault.address -
+ area_dst;
offset &= ~(page_size-1);
if (copy_page(offset))
userfaults++;
@@ -327,7 +329,8 @@
if (bounces & BOUNCE_VERIFY &&
msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
fprintf(stderr, "unexpected write fault\n"), exit(1);
- offset = (char *)msg.arg.pagefault.address - area_dst;
+ offset = (char *)(unsigned long)msg.arg.pagefault.address -
+ area_dst;
offset &= ~(page_size-1);
if (copy_page(offset))
(*this_cpu_userfaults)++;
diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c
index 8bdf16b..7f73fa3 100644
--- a/tools/vm/page-types.c
+++ b/tools/vm/page-types.c
@@ -57,23 +57,15 @@
* pagemap kernel ABI bits
*/
-#define PM_ENTRY_BYTES sizeof(uint64_t)
-#define PM_STATUS_BITS 3
-#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS)
-#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
-#define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
-#define PM_PSHIFT_BITS 6
-#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
-#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
-#define __PM_PSHIFT(x) (((uint64_t) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
-#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
-#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
-
-#define __PM_SOFT_DIRTY (1LL)
-#define PM_PRESENT PM_STATUS(4LL)
-#define PM_SWAP PM_STATUS(2LL)
-#define PM_SOFT_DIRTY __PM_PSHIFT(__PM_SOFT_DIRTY)
-
+#define PM_ENTRY_BYTES 8
+#define PM_PFRAME_BITS 55
+#define PM_PFRAME_MASK ((1LL << PM_PFRAME_BITS) - 1)
+#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
+#define PM_SOFT_DIRTY (1ULL << 55)
+#define PM_MMAP_EXCLUSIVE (1ULL << 56)
+#define PM_FILE (1ULL << 61)
+#define PM_SWAP (1ULL << 62)
+#define PM_PRESENT (1ULL << 63)
/*
* kernel page flags
@@ -100,6 +92,8 @@
#define KPF_SLOB_FREE 49
#define KPF_SLUB_FROZEN 50
#define KPF_SLUB_DEBUG 51
+#define KPF_FILE 62
+#define KPF_MMAP_EXCLUSIVE 63
#define KPF_ALL_BITS ((uint64_t)~0ULL)
#define KPF_HACKERS_BITS (0xffffULL << 32)
@@ -149,6 +143,9 @@
[KPF_SLOB_FREE] = "P:slob_free",
[KPF_SLUB_FROZEN] = "A:slub_frozen",
[KPF_SLUB_DEBUG] = "E:slub_debug",
+
+ [KPF_FILE] = "F:file",
+ [KPF_MMAP_EXCLUSIVE] = "1:mmap_exclusive",
};
@@ -452,6 +449,10 @@
if (pme & PM_SOFT_DIRTY)
flags |= BIT(SOFTDIRTY);
+ if (pme & PM_FILE)
+ flags |= BIT(FILE);
+ if (pme & PM_MMAP_EXCLUSIVE)
+ flags |= BIT(MMAP_EXCLUSIVE);
return flags;
}