[IA64] memory-less-nodes repost

I reworked how nodes with only CPUs are treated.  The patch below seems
simpler to me and has eliminated the complicated routine
reassign_cpu_only_nodes.  There isn't any longer the requirement
to modify ACPI NUMA information which was in large part the
complexity introduced in reassign_cpu_only_nodes. 

This patch will produce a different number of nodes. For example,
reassign_cpu_only_nodes would reduce two CPUonly nodes and one memory node
configuration to one memory+CPUs node configuration.  This patch
doesn't change the number of nodes which means the user will see three.  Two
nodes without memory and one node with all the memory.

While doing this patch, I noticed that early_nr_phys_cpus_node isn't serving
any useful purpose.  It is called once in find_pernode_space but the value
isn't used to computer pernode space.  

Signed-off-by: bob.picco <bob.picco@hp.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index f3fd528..54136fd 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -44,150 +44,7 @@
 };
 
 static struct early_node_data mem_data[MAX_NUMNODES] __initdata;
-
-/**
- * reassign_cpu_only_nodes - called from find_memory to move CPU-only nodes to a memory node
- *
- * This function will move nodes with only CPUs (no memory)
- * to a node with memory which is at the minimum numa_slit distance.
- * Any reassigments will result in the compression of the nodes
- * and renumbering the nid values where appropriate.
- * The static declarations below are to avoid large stack size which
- * makes the code not re-entrant.
- */
-static void __init reassign_cpu_only_nodes(void)
-{
-	struct node_memblk_s *p;
-	int i, j, k, nnode, nid, cpu, cpunid, pxm;
-	u8 cslit, slit;
-	static DECLARE_BITMAP(nodes_with_mem, MAX_NUMNODES) __initdata;
-	static u8 numa_slit_fix[MAX_NUMNODES * MAX_NUMNODES] __initdata;
-	static int node_flip[MAX_NUMNODES] __initdata;
-	static int old_nid_map[NR_CPUS] __initdata;
-
-	for (nnode = 0, p = &node_memblk[0]; p < &node_memblk[num_node_memblks]; p++)
-		if (!test_bit(p->nid, (void *) nodes_with_mem)) {
-			set_bit(p->nid, (void *) nodes_with_mem);
-			nnode++;
-		}
-
-	/*
-	 * All nids with memory.
-	 */
-	if (nnode == num_online_nodes())
-		return;
-
-	/*
-	 * Change nids and attempt to migrate CPU-only nodes
-	 * to the best numa_slit (closest neighbor) possible.
-	 * For reassigned CPU nodes a nid can't be arrived at
-	 * until after this loop because the target nid's new
-	 * identity might not have been established yet. So
-	 * new nid values are fabricated above num_online_nodes() and
-	 * mapped back later to their true value.
-	 */
-	/* MCD - This code is a bit complicated, but may be unnecessary now.
-	 * We can now handle much more interesting node-numbering.
-	 * The old requirement that 0 <= nid <= numnodes <= MAX_NUMNODES
-	 * and that there be no holes in the numbering 0..numnodes
-	 * has become simply 0 <= nid <= MAX_NUMNODES.
-	 */
-	nid = 0;
-	for_each_online_node(i)  {
-		if (test_bit(i, (void *) nodes_with_mem)) {
-			/*
-			 * Save original nid value for numa_slit
-			 * fixup and node_cpuid reassignments.
-			 */
-			node_flip[nid] = i;
-
-			if (i == nid) {
-				nid++;
-				continue;
-			}
-
-			for (p = &node_memblk[0]; p < &node_memblk[num_node_memblks]; p++)
-				if (p->nid == i)
-					p->nid = nid;
-
-			cpunid = nid;
-			nid++;
-		} else
-			cpunid = MAX_NUMNODES;
-
-		for (cpu = 0; cpu < NR_CPUS; cpu++)
-			if (node_cpuid[cpu].nid == i) {
-				/*
-				 * For nodes not being reassigned just
-				 * fix the cpu's nid and reverse pxm map
-				 */
-				if (cpunid < MAX_NUMNODES) {
-					pxm = nid_to_pxm_map[i];
-					pxm_to_nid_map[pxm] =
-					          node_cpuid[cpu].nid = cpunid;
-					continue;
-				}
-
-				/*
-				 * For nodes being reassigned, find best node by
-				 * numa_slit information and then make a temporary
-				 * nid value based on current nid and num_online_nodes().
-				 */
-				slit = 0xff;
-				k = 2*num_online_nodes();
-				for_each_online_node(j) {
-					if (i == j)
-						continue;
-					else if (test_bit(j, (void *) nodes_with_mem)) {
-						cslit = numa_slit[i * num_online_nodes() + j];
-						if (cslit < slit) {
-							k = num_online_nodes() + j;
-							slit = cslit;
-						}
-					}
-				}
-
-				/* save old nid map so we can update the pxm */
-				old_nid_map[cpu] = node_cpuid[cpu].nid;
-				node_cpuid[cpu].nid = k;
-			}
-	}
-
-	/*
-	 * Fixup temporary nid values for CPU-only nodes.
-	 */
-	for (cpu = 0; cpu < NR_CPUS; cpu++)
-		if (node_cpuid[cpu].nid == (2*num_online_nodes())) {
-			pxm = nid_to_pxm_map[old_nid_map[cpu]];
-			pxm_to_nid_map[pxm] = node_cpuid[cpu].nid = nnode - 1;
-		} else {
-			for (i = 0; i < nnode; i++) {
-				if (node_flip[i] != (node_cpuid[cpu].nid - num_online_nodes()))
-					continue;
-
-				pxm = nid_to_pxm_map[old_nid_map[cpu]];
-				pxm_to_nid_map[pxm] = node_cpuid[cpu].nid = i;
-				break;
-			}
-		}
-
-	/*
-	 * Fix numa_slit by compressing from larger
-	 * nid array to reduced nid array.
-	 */
-	for (i = 0; i < nnode; i++)
-		for (j = 0; j < nnode; j++)
-			numa_slit_fix[i * nnode + j] =
-				numa_slit[node_flip[i] * num_online_nodes() + node_flip[j]];
-
-	memcpy(numa_slit, numa_slit_fix, sizeof (numa_slit));
-
-	nodes_clear(node_online_map);
-	for (i = 0; i < nnode; i++)
-		node_set_online(i);
-
-	return;
-}
+static nodemask_t memory_less_mask __initdata;
 
 /*
  * To prevent cache aliasing effects, align per-node structures so that they
@@ -233,28 +90,6 @@
 }
 
 /**
- * early_nr_phys_cpus_node - return number of physical cpus on a given node
- * @node: node to check
- *
- * Count the number of physical cpus on @node.  These are cpus that actually
- * exist.  We can't use nr_cpus_node() yet because
- * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been
- * called yet.
- */
-static int early_nr_phys_cpus_node(int node)
-{
-	int cpu, n = 0;
-
-	for (cpu = 0; cpu < NR_CPUS; cpu++)
-		if (node == node_cpuid[cpu].nid)
-			if ((cpu == 0) || node_cpuid[cpu].phys_id)
-				n++;
-
-	return n;
-}
-
-
-/**
  * early_nr_cpus_node - return number of cpus on a given node
  * @node: node to check
  *
@@ -262,7 +97,7 @@
  * acpi_boot_init() (which builds the node_to_cpu_mask array) hasn't been
  * called yet.  Note that node 0 will also count all non-existent cpus.
  */
-static int early_nr_cpus_node(int node)
+static int __init early_nr_cpus_node(int node)
 {
 	int cpu, n = 0;
 
@@ -274,6 +109,70 @@
 }
 
 /**
+ * compute_pernodesize - compute size of pernode data
+ * @node: the node id.
+ */
+static unsigned long __init compute_pernodesize(int node)
+{
+	unsigned long pernodesize = 0, cpus;
+
+	cpus = early_nr_cpus_node(node);
+	pernodesize += PERCPU_PAGE_SIZE * cpus;
+	pernodesize += node * L1_CACHE_BYTES;
+	pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t));
+	pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
+	pernodesize = PAGE_ALIGN(pernodesize);
+	return pernodesize;
+}
+
+/**
+ * fill_pernode - initialize pernode data.
+ * @node: the node id.
+ * @pernode: physical address of pernode data
+ * @pernodesize: size of the pernode data
+ */
+static void __init fill_pernode(int node, unsigned long pernode,
+	unsigned long pernodesize)
+{
+	void *cpu_data;
+	int cpus = early_nr_cpus_node(node), cpu;
+	struct bootmem_data *bdp = &mem_data[node].bootmem_data;
+
+	mem_data[node].pernode_addr = pernode;
+	mem_data[node].pernode_size = pernodesize;
+	memset(__va(pernode), 0, pernodesize);
+
+	cpu_data = (void *)pernode;
+	pernode += PERCPU_PAGE_SIZE * cpus;
+	pernode += node * L1_CACHE_BYTES;
+
+	mem_data[node].pgdat = __va(pernode);
+	pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
+
+	mem_data[node].node_data = __va(pernode);
+	pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
+
+	mem_data[node].pgdat->bdata = bdp;
+	pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
+
+	/*
+	 * Copy the static per-cpu data into the region we
+	 * just set aside and then setup __per_cpu_offset
+	 * for each CPU on this node.
+	 */
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		if (node == node_cpuid[cpu].nid) {
+			memcpy(__va(cpu_data), __phys_per_cpu_start,
+			       __per_cpu_end - __per_cpu_start);
+			__per_cpu_offset[cpu] = (char*)__va(cpu_data) -
+				__per_cpu_start;
+			cpu_data += PERCPU_PAGE_SIZE;
+		}
+	}
+
+	return;
+}
+/**
  * find_pernode_space - allocate memory for memory map and per-node structures
  * @start: physical start of range
  * @len: length of range
@@ -304,9 +203,8 @@
 static int __init find_pernode_space(unsigned long start, unsigned long len,
 				     int node)
 {
-	unsigned long epfn, cpu, cpus, phys_cpus;
+	unsigned long epfn;
 	unsigned long pernodesize = 0, pernode, pages, mapsize;
-	void *cpu_data;
 	struct bootmem_data *bdp = &mem_data[node].bootmem_data;
 
 	epfn = (start + len) >> PAGE_SHIFT;
@@ -329,49 +227,12 @@
 	 * Calculate total size needed, incl. what's necessary
 	 * for good alignment and alias prevention.
 	 */
-	cpus = early_nr_cpus_node(node);
-	phys_cpus = early_nr_phys_cpus_node(node);
-	pernodesize += PERCPU_PAGE_SIZE * cpus;
-	pernodesize += node * L1_CACHE_BYTES;
-	pernodesize += L1_CACHE_ALIGN(sizeof(pg_data_t));
-	pernodesize += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
-	pernodesize = PAGE_ALIGN(pernodesize);
+	pernodesize = compute_pernodesize(node);
 	pernode = NODEDATA_ALIGN(start, node);
 
 	/* Is this range big enough for what we want to store here? */
-	if (start + len > (pernode + pernodesize + mapsize)) {
-		mem_data[node].pernode_addr = pernode;
-		mem_data[node].pernode_size = pernodesize;
-		memset(__va(pernode), 0, pernodesize);
-
-		cpu_data = (void *)pernode;
-		pernode += PERCPU_PAGE_SIZE * cpus;
-		pernode += node * L1_CACHE_BYTES;
-
-		mem_data[node].pgdat = __va(pernode);
-		pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
-
-		mem_data[node].node_data = __va(pernode);
-		pernode += L1_CACHE_ALIGN(sizeof(struct ia64_node_data));
-
-		mem_data[node].pgdat->bdata = bdp;
-		pernode += L1_CACHE_ALIGN(sizeof(pg_data_t));
-
-		/*
-		 * Copy the static per-cpu data into the region we
-		 * just set aside and then setup __per_cpu_offset
-		 * for each CPU on this node.
-		 */
-		for (cpu = 0; cpu < NR_CPUS; cpu++) {
-			if (node == node_cpuid[cpu].nid) {
-				memcpy(__va(cpu_data), __phys_per_cpu_start,
-				       __per_cpu_end - __per_cpu_start);
-				__per_cpu_offset[cpu] = (char*)__va(cpu_data) -
-					__per_cpu_start;
-				cpu_data += PERCPU_PAGE_SIZE;
-			}
-		}
-	}
+	if (start + len > (pernode + pernodesize + mapsize))
+		fill_pernode(node, pernode, pernodesize);
 
 	return 0;
 }
@@ -411,6 +272,9 @@
 	for_each_online_node(node) {
 		pg_data_t *pdp = mem_data[node].pgdat;
 
+		if (node_isset(node, memory_less_mask))
+			continue;
+
 		bdp = pdp->bdata;
 
 		/* First the bootmem_map itself */
@@ -456,6 +320,83 @@
 }
 
 /**
+ * memory_less_node_alloc - * attempt to allocate memory on the best NUMA slit
+ * 	node but fall back to any other node when __alloc_bootmem_node fails
+ *	for best.
+ * @nid: node id
+ * @pernodesize: size of this node's pernode data
+ * @align: alignment to use for this node's pernode data
+ */
+static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize,
+	unsigned long align)
+{
+	void *ptr = NULL;
+	u8 best = 0xff;
+	int bestnode = -1, node;
+
+	for_each_online_node(node) {
+		if (node_isset(node, memory_less_mask))
+			continue;
+		else if (node_distance(nid, node) < best) {
+			best = node_distance(nid, node);
+			bestnode = node;
+		}
+	}
+
+	ptr = __alloc_bootmem_node(mem_data[bestnode].pgdat,
+		pernodesize, align, __pa(MAX_DMA_ADDRESS));
+
+	if (!ptr)
+		panic("NO memory for memory less node\n");
+	return ptr;
+}
+
+/**
+ * pgdat_insert - insert the pgdat into global pgdat_list
+ * @pgdat: the pgdat for a node.
+ */
+static void __init pgdat_insert(pg_data_t *pgdat)
+{
+	pg_data_t *prev = NULL, *next;
+
+	for_each_pgdat(next)
+		if (pgdat->node_id < next->node_id)
+			break;
+		else
+			prev = next;
+
+	if (prev) {
+		prev->pgdat_next = pgdat;
+		pgdat->pgdat_next = next;
+	} else {
+		pgdat->pgdat_next = pgdat_list;
+		pgdat_list = pgdat;
+	}
+
+	return;
+}
+
+/**
+ * memory_less_nodes - allocate and initialize CPU only nodes pernode
+ *	information.
+ */
+static void __init memory_less_nodes(void)
+{
+	unsigned long pernodesize;
+	void *pernode;
+	int node;
+
+	for_each_node_mask(node, memory_less_mask) {
+		pernodesize = compute_pernodesize(node);
+		pernode = memory_less_node_alloc(node, pernodesize,
+			(node) ? (node * PERCPU_PAGE_SIZE) : (1024*1024));
+		fill_pernode(node, __pa(pernode), pernodesize);
+	}
+
+	return;
+}
+
+/**
  * find_memory - walk the EFI memory map and setup the bootmem allocator
  *
  * Called early in boot to setup the bootmem allocator, and to
@@ -472,16 +413,19 @@
 		node_set_online(0);
 	}
 
+	nodes_or(memory_less_mask, memory_less_mask, node_online_map);
 	min_low_pfn = -1;
 	max_low_pfn = 0;
 
-	if (num_online_nodes() > 1)
-		reassign_cpu_only_nodes();
-
 	/* These actually end up getting called by call_pernode_memory() */
 	efi_memmap_walk(filter_rsvd_memory, build_node_maps);
 	efi_memmap_walk(filter_rsvd_memory, find_pernode_space);
 
+	for_each_online_node(node)
+		if (mem_data[node].bootmem_data.node_low_pfn) {
+			node_clear(node, memory_less_mask);
+			mem_data[node].min_pfn = ~0UL;
+		}
 	/*
 	 * Initialize the boot memory maps in reverse order since that's
 	 * what the bootmem allocator expects
@@ -492,17 +436,14 @@
 
 		if (!node_online(node))
 			continue;
+		else if (node_isset(node, memory_less_mask))
+			continue;
 
 		bdp = &mem_data[node].bootmem_data;
 		pernode = mem_data[node].pernode_addr;
 		pernodesize = mem_data[node].pernode_size;
 		map = pernode + pernodesize;
 
-		/* Sanity check... */
-		if (!pernode)
-			panic("pernode space for node %d "
-			      "could not be allocated!", node);
-
 		init_bootmem_node(mem_data[node].pgdat,
 				  map>>PAGE_SHIFT,
 				  bdp->node_boot_start>>PAGE_SHIFT,
@@ -512,6 +453,7 @@
 	efi_memmap_walk(filter_rsvd_memory, free_node_bootmem);
 
 	reserve_pernode_space();
+	memory_less_nodes();
 	initialize_pernode_data();
 
 	max_pfn = max_low_pfn;
@@ -680,12 +622,13 @@
 
 	max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT;
 
-	/* so min() will work in count_node_pages */
-	for_each_online_node(node)
-		mem_data[node].min_pfn = ~0UL;
-
 	efi_memmap_walk(filter_rsvd_memory, count_node_pages);
 
+	vmalloc_end -= PAGE_ALIGN(max_low_pfn * sizeof(struct page));
+	vmem_map = (struct page *) vmalloc_end;
+	efi_memmap_walk(create_mem_map_page_table, NULL);
+	printk("Virtual mem_map starts at 0x%p\n", vmem_map);
+
 	for_each_online_node(node) {
 		memset(zones_size, 0, sizeof(zones_size));
 		memset(zholes_size, 0, sizeof(zholes_size));
@@ -719,15 +662,6 @@
 				 mem_data[node].num_dma_physpages);
 		}
 
-		if (node == 0) {
-			vmalloc_end -=
-				PAGE_ALIGN(max_low_pfn * sizeof(struct page));
-			vmem_map = (struct page *) vmalloc_end;
-
-			efi_memmap_walk(create_mem_map_page_table, NULL);
-			printk("Virtual mem_map starts at 0x%p\n", vmem_map);
-		}
-
 		pfn_offset = mem_data[node].min_pfn;
 
 		NODE_DATA(node)->node_mem_map = vmem_map + pfn_offset;
@@ -735,5 +669,11 @@
 				    pfn_offset, zholes_size);
 	}
 
+	/*
+	 * Make memory less nodes become a member of the known nodes.
+	 */
+	for_each_node_mask(node, memory_less_mask)
+		pgdat_insert(mem_data[node].pgdat);
+
 	zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
 }
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 4eb2f52..65f9958 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -597,7 +597,8 @@
 	kclist_add(&kcore_kernel, _stext, _end - _stext);
 
 	for_each_pgdat(pgdat)
-		totalram_pages += free_all_bootmem_node(pgdat);
+		if (pgdat->bdata->node_bootmem_map)
+			totalram_pages += free_all_bootmem_node(pgdat);
 
 	reserved_pages = 0;
 	efi_memmap_walk(count_reserved_pages, &reserved_pages);