mm/memblock: add extra "flags" to memblock to allow selection of memory based on attribute
Some high end Intel Xeon systems report uncorrectable memory errors as a
recoverable machine check. Linux has included code for some time to
process these and just signal the affected processes (or even recover
completely if the error was in a read only page that can be replaced by
reading from disk).
But we have no recovery path for errors encountered during kernel code
execution. Except for some very specific cases were are unlikely to ever
be able to recover.
Enter memory mirroring. Actually 3rd generation of memory mirroing.
Gen1: All memory is mirrored
Pro: No s/w enabling - h/w just gets good data from other side of the
mirror
Con: Halves effective memory capacity available to OS/applications
Gen2: Partial memory mirror - just mirror memory begind some memory controllers
Pro: Keep more of the capacity
Con: Nightmare to enable. Have to choose between allocating from
mirrored memory for safety vs. NUMA local memory for performance
Gen3: Address range partial memory mirror - some mirror on each memory
controller
Pro: Can tune the amount of mirror and keep NUMA performance
Con: I have to write memory management code to implement
The current plan is just to use mirrored memory for kernel allocations.
This has been broken into two phases:
1) This patch series - find the mirrored memory, use it for boot time
allocations
2) Wade into mm/page_alloc.c and define a ZONE_MIRROR to pick up the
unused mirrored memory from mm/memblock.c and only give it out to
select kernel allocations (this is still being scoped because
page_alloc.c is scary).
This patch (of 3):
Add extra "flags" to memblock to allow selection of memory based on
attribute. No functional changes
Signed-off-by: Tony Luck <tony.luck@intel.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Xiexiuqi <xiexiuqi@huawei.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/mm/cma.c b/mm/cma.c
index 6612780..e7d1db5 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -316,13 +316,15 @@
*/
if (base < highmem_start && limit > highmem_start) {
addr = memblock_alloc_range(size, alignment,
- highmem_start, limit);
+ highmem_start, limit,
+ MEMBLOCK_NONE);
limit = highmem_start;
}
if (!addr) {
addr = memblock_alloc_range(size, alignment, base,
- limit);
+ limit,
+ MEMBLOCK_NONE);
if (!addr) {
ret = -ENOMEM;
goto err;
diff --git a/mm/memblock.c b/mm/memblock.c
index 9318b56..b9ff2f4 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -107,6 +107,7 @@
* @size: size of free area to find
* @align: alignment of free area to find
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @flags: pick from blocks based on memory attributes
*
* Utility called from memblock_find_in_range_node(), find free area bottom-up.
*
@@ -115,12 +116,13 @@
*/
static phys_addr_t __init_memblock
__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
- phys_addr_t size, phys_addr_t align, int nid)
+ phys_addr_t size, phys_addr_t align, int nid,
+ ulong flags)
{
phys_addr_t this_start, this_end, cand;
u64 i;
- for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) {
+ for_each_free_mem_range(i, nid, flags, &this_start, &this_end, NULL) {
this_start = clamp(this_start, start, end);
this_end = clamp(this_end, start, end);
@@ -139,6 +141,7 @@
* @size: size of free area to find
* @align: alignment of free area to find
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @flags: pick from blocks based on memory attributes
*
* Utility called from memblock_find_in_range_node(), find free area top-down.
*
@@ -147,12 +150,14 @@
*/
static phys_addr_t __init_memblock
__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
- phys_addr_t size, phys_addr_t align, int nid)
+ phys_addr_t size, phys_addr_t align, int nid,
+ ulong flags)
{
phys_addr_t this_start, this_end, cand;
u64 i;
- for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
+ for_each_free_mem_range_reverse(i, nid, flags, &this_start, &this_end,
+ NULL) {
this_start = clamp(this_start, start, end);
this_end = clamp(this_end, start, end);
@@ -174,6 +179,7 @@
* @start: start of candidate range
* @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @flags: pick from blocks based on memory attributes
*
* Find @size free area aligned to @align in the specified range and node.
*
@@ -190,7 +196,7 @@
*/
phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
- phys_addr_t end, int nid)
+ phys_addr_t end, int nid, ulong flags)
{
phys_addr_t kernel_end, ret;
@@ -215,7 +221,7 @@
/* ok, try bottom-up allocation first */
ret = __memblock_find_range_bottom_up(bottom_up_start, end,
- size, align, nid);
+ size, align, nid, flags);
if (ret)
return ret;
@@ -233,7 +239,8 @@
"memory hotunplug may be affected\n");
}
- return __memblock_find_range_top_down(start, end, size, align, nid);
+ return __memblock_find_range_top_down(start, end, size, align, nid,
+ flags);
}
/**
@@ -253,7 +260,7 @@
phys_addr_t align)
{
return memblock_find_in_range_node(size, align, start, end,
- NUMA_NO_NODE);
+ NUMA_NO_NODE, MEMBLOCK_NONE);
}
static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
@@ -782,6 +789,7 @@
* __next__mem_range - next function for for_each_free_mem_range() etc.
* @idx: pointer to u64 loop variable
* @nid: node selector, %NUMA_NO_NODE for all nodes
+ * @flags: pick from blocks based on memory attributes
* @type_a: pointer to memblock_type from where the range is taken
* @type_b: pointer to memblock_type which excludes memory from being taken
* @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
@@ -803,7 +811,7 @@
* As both region arrays are sorted, the function advances the two indices
* in lockstep and returns each intersection.
*/
-void __init_memblock __next_mem_range(u64 *idx, int nid,
+void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
struct memblock_type *type_a,
struct memblock_type *type_b,
phys_addr_t *out_start,
@@ -895,6 +903,7 @@
*
* @idx: pointer to u64 loop variable
* @nid: nid: node selector, %NUMA_NO_NODE for all nodes
+ * @flags: pick from blocks based on memory attributes
* @type_a: pointer to memblock_type from where the range is taken
* @type_b: pointer to memblock_type which excludes memory from being taken
* @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
@@ -903,7 +912,7 @@
*
* Reverse of __next_mem_range().
*/
-void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
+void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags,
struct memblock_type *type_a,
struct memblock_type *type_b,
phys_addr_t *out_start,
@@ -1050,14 +1059,15 @@
static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
- phys_addr_t end, int nid)
+ phys_addr_t end, int nid, ulong flags)
{
phys_addr_t found;
if (!align)
align = SMP_CACHE_BYTES;
- found = memblock_find_in_range_node(size, align, start, end, nid);
+ found = memblock_find_in_range_node(size, align, start, end, nid,
+ flags);
if (found && !memblock_reserve(found, size)) {
/*
* The min_count is set to 0 so that memblock allocations are
@@ -1070,26 +1080,30 @@
}
phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
- phys_addr_t start, phys_addr_t end)
+ phys_addr_t start, phys_addr_t end,
+ ulong flags)
{
- return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE);
+ return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE,
+ flags);
}
static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
phys_addr_t align, phys_addr_t max_addr,
- int nid)
+ int nid, ulong flags)
{
- return memblock_alloc_range_nid(size, align, 0, max_addr, nid);
+ return memblock_alloc_range_nid(size, align, 0, max_addr, nid, flags);
}
phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
{
- return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE,
+ nid, MEMBLOCK_NONE);
}
phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
{
- return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE);
+ return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE,
+ MEMBLOCK_NONE);
}
phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
@@ -1173,13 +1187,14 @@
again:
alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
- nid);
+ nid, MEMBLOCK_NONE);
if (alloc)
goto done;
if (nid != NUMA_NO_NODE) {
alloc = memblock_find_in_range_node(size, align, min_addr,
- max_addr, NUMA_NO_NODE);
+ max_addr, NUMA_NO_NODE,
+ MEMBLOCK_NONE);
if (alloc)
goto done;
}
diff --git a/mm/memtest.c b/mm/memtest.c
index 1997d93..0a1cc13 100644
--- a/mm/memtest.c
+++ b/mm/memtest.c
@@ -74,7 +74,8 @@
u64 i;
phys_addr_t this_start, this_end;
- for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) {
+ for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &this_start,
+ &this_end, NULL) {
this_start = clamp(this_start, start, end);
this_end = clamp(this_end, start, end);
if (this_start < this_end) {
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 90b5046..ad3641d 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -41,7 +41,8 @@
if (limit > memblock.current_limit)
limit = memblock.current_limit;
- addr = memblock_find_in_range_node(size, align, goal, limit, nid);
+ addr = memblock_find_in_range_node(size, align, goal, limit, nid,
+ MEMBLOCK_NONE);
if (!addr)
return NULL;
@@ -121,7 +122,8 @@
memblock_clear_hotplug(0, -1);
- for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL)
+ for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
+ NULL)
count += __free_memory_core(start, end);
#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK