Blame - mm/mm_init.c - linux

blob: 077bfe393b5e29e73e98d4071385f7cc3c5c4cc6 [file] [log] [blame]

Thomas Gleixner	457c899	2019-05-19 13:08:55 +0100	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0-only
Mel Gorman	6b74ab9	2008-07-23 21:26:49 -0700	[diff] [blame]	2	/*
				3	* mm_init.c - Memory initialisation verification and debugging
				4	*
				5	* Copyright 2008 IBM Corporation, 2008
				6	* Author Mel Gorman <mel@csn.ul.ie>
				7	*
				8	*/
				9	#include <linux/kernel.h>
				10	#include <linux/init.h>
Nishanth Aravamudan	ff7ea79	2008-07-23 21:27:39 -0700	[diff] [blame]	11	#include <linux/kobject.h>
Paul Gortmaker	b95f1b31	2011-10-16 02:01:52 -0400	[diff] [blame]	12	#include <linux/export.h>
Tim Chen	917d929	2013-07-03 15:02:44 -0700	[diff] [blame]	13	#include <linux/memory.h>
				14	#include <linux/notifier.h>
Mel Gorman	7e18adb	2015-06-30 14:57:05 -0700	[diff] [blame]	15	#include <linux/sched.h>
Feng Tang	56f3547	2020-08-06 23:23:15 -0700	[diff] [blame]	16	#include <linux/mman.h>
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	17	#include <linux/memblock.h>
				18	#include <linux/page-isolation.h>
				19	#include <linux/padata.h>
				20	#include <linux/nmi.h>
				21	#include <linux/buffer_head.h>
				22	#include <linux/kmemleak.h>
Mike Rapoport (IBM)	b7ec1bf	2023-03-21 19:05:06 +0200	[diff] [blame]	23	#include <linux/kfence.h>
				24	#include <linux/page_ext.h>
				25	#include <linux/pti.h>
				26	#include <linux/pgtable.h>
Mike Rapoport (IBM)	eb8589b	2023-03-21 19:05:10 +0200	[diff] [blame]	27	#include <linux/swap.h>
				28	#include <linux/cma.h>
Mel Gorman	708614e	2008-07-23 21:26:51 -0700	[diff] [blame]	29	#include "internal.h"
Mike Rapoport (IBM)	d5d2c02	2023-03-21 19:05:11 +0200	[diff] [blame]	30	#include "slab.h"
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	31	#include "shuffle.h"
Mel Gorman	6b74ab9	2008-07-23 21:26:49 -0700	[diff] [blame]	32
Mike Rapoport (IBM)	b7ec1bf	2023-03-21 19:05:06 +0200	[diff] [blame]	33	#include <asm/setup.h>
				34
Nishanth Aravamudan	5e9426ab	2008-07-23 21:27:39 -0700	[diff] [blame]	35	#ifdef CONFIG_DEBUG_MEMORY_INIT
Rasmus Villemoes	194e815	2015-02-12 15:00:12 -0800	[diff] [blame]	36	int __meminitdata mminit_loglevel;
Mel Gorman	6b74ab9	2008-07-23 21:26:49 -0700	[diff] [blame]	37
Mel Gorman	68ad8df	2008-07-23 21:26:52 -0700	[diff] [blame]	38	/* The zonelists are simply reported, validation is manual. */
Rasmus Villemoes	0e2342c	2015-02-12 15:00:09 -0800	[diff] [blame]	39	void __init mminit_verify_zonelist(void)
Mel Gorman	68ad8df	2008-07-23 21:26:52 -0700	[diff] [blame]	40	{
				41	int nid;
				42
				43	if (mminit_loglevel < MMINIT_VERIFY)
				44	return;
				45
				46	for_each_online_node(nid) {
				47	pg_data_t *pgdat = NODE_DATA(nid);
				48	struct zone *zone;
				49	struct zoneref *z;
				50	struct zonelist *zonelist;
				51	int i, listid, zoneid;
				52
Mateusz Nosek	e46b893	2020-04-06 20:08:36 -0700	[diff] [blame]	53	BUILD_BUG_ON(MAX_ZONELISTS > 2);
Mel Gorman	68ad8df	2008-07-23 21:26:52 -0700	[diff] [blame]	54	for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {
				55
				56	/* Identify the zone and nodelist */
				57	zoneid = i % MAX_NR_ZONES;
				58	listid = i / MAX_NR_ZONES;
				59	zonelist = &pgdat->node_zonelists[listid];
				60	zone = &pgdat->node_zones[zoneid];
				61	if (!populated_zone(zone))
				62	continue;
				63
				64	/* Print information about the zonelist */
				65	printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ",
				66	listid > 0 ? "thisnode" : "general", nid,
				67	zone->name);
				68
				69	/* Iterate the zonelist */
Pavel Tatashin	c1093b7	2018-08-21 21:53:32 -0700	[diff] [blame]	70	for_each_zone_zonelist(zone, z, zonelist, zoneid)
				71	pr_cont("%d:%s ", zone_to_nid(zone), zone->name);
Joe Perches	1170532	2016-03-17 14:19:50 -0700	[diff] [blame]	72	pr_cont("\n");
Mel Gorman	68ad8df	2008-07-23 21:26:52 -0700	[diff] [blame]	73	}
				74	}
				75	}
				76
Mel Gorman	708614e	2008-07-23 21:26:51 -0700	[diff] [blame]	77	void __init mminit_verify_pageflags_layout(void)
				78	{
				79	int shift, width;
				80	unsigned long or_mask, add_mask;
				81
Miaohe Lin	daee07b	2023-08-07 10:35:28 +0800	[diff] [blame]	82	shift = BITS_PER_LONG;
Jing Xia	86fea8b	2020-06-01 21:52:49 -0700	[diff] [blame]	83	width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
Yu Zhao	ec1c86b	2022-09-18 02:00:02 -0600	[diff] [blame]	84	- LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
Mel Gorman	708614e	2008-07-23 21:26:51 -0700	[diff] [blame]	85	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
Yu Zhao	ec1c86b	2022-09-18 02:00:02 -0600	[diff] [blame]	86	"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
Mel Gorman	708614e	2008-07-23 21:26:51 -0700	[diff] [blame]	87	SECTIONS_WIDTH,
				88	NODES_WIDTH,
				89	ZONES_WIDTH,
Peter Zijlstra	9057289	2013-10-07 11:29:20 +0100	[diff] [blame]	90	LAST_CPUPID_WIDTH,
Jing Xia	86fea8b	2020-06-01 21:52:49 -0700	[diff] [blame]	91	KASAN_TAG_WIDTH,
Yu Zhao	ec1c86b	2022-09-18 02:00:02 -0600	[diff] [blame]	92	LRU_GEN_WIDTH,
				93	LRU_REFS_WIDTH,
Mel Gorman	708614e	2008-07-23 21:26:51 -0700	[diff] [blame]	94	NR_PAGEFLAGS);
				95	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
Jing Xia	86fea8b	2020-06-01 21:52:49 -0700	[diff] [blame]	96	"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
Mel Gorman	708614e	2008-07-23 21:26:51 -0700	[diff] [blame]	97	SECTIONS_SHIFT,
Mel Gorman	708614e	2008-07-23 21:26:51 -0700	[diff] [blame]	98	NODES_SHIFT,
Mel Gorman	a4e1b4c	2013-02-22 16:34:47 -0800	[diff] [blame]	99	ZONES_SHIFT,
Jing Xia	86fea8b	2020-06-01 21:52:49 -0700	[diff] [blame]	100	LAST_CPUPID_SHIFT,
				101	KASAN_TAG_WIDTH);
Mel Gorman	a4e1b4c	2013-02-22 16:34:47 -0800	[diff] [blame]	102	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
Jing Xia	86fea8b	2020-06-01 21:52:49 -0700	[diff] [blame]	103	"Section %lu Node %lu Zone %lu Lastcpupid %lu Kasantag %lu\n",
Mel Gorman	708614e	2008-07-23 21:26:51 -0700	[diff] [blame]	104	(unsigned long)SECTIONS_PGSHIFT,
				105	(unsigned long)NODES_PGSHIFT,
Mel Gorman	a4e1b4c	2013-02-22 16:34:47 -0800	[diff] [blame]	106	(unsigned long)ZONES_PGSHIFT,
Jing Xia	86fea8b	2020-06-01 21:52:49 -0700	[diff] [blame]	107	(unsigned long)LAST_CPUPID_PGSHIFT,
				108	(unsigned long)KASAN_TAG_PGSHIFT);
Mel Gorman	a4e1b4c	2013-02-22 16:34:47 -0800	[diff] [blame]	109	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
				110	"Node/Zone ID: %lu -> %lu\n",
				111	(unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
				112	(unsigned long)ZONEID_PGOFF);
Mel Gorman	708614e	2008-07-23 21:26:51 -0700	[diff] [blame]	113	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",
Mel Gorman	a4e1b4c	2013-02-22 16:34:47 -0800	[diff] [blame]	114	"location: %d -> %d layout %d -> %d unused %d -> %d page-flags\n",
Mel Gorman	708614e	2008-07-23 21:26:51 -0700	[diff] [blame]	115	shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0);
				116	#ifdef NODE_NOT_IN_PAGE_FLAGS
				117	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
				118	"Node not in page flags");
				119	#endif
Peter Zijlstra	9057289	2013-10-07 11:29:20 +0100	[diff] [blame]	120	#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
Mel Gorman	a4e1b4c	2013-02-22 16:34:47 -0800	[diff] [blame]	121	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
Peter Zijlstra	9057289	2013-10-07 11:29:20 +0100	[diff] [blame]	122	"Last cpupid not in page flags");
Mel Gorman	a4e1b4c	2013-02-22 16:34:47 -0800	[diff] [blame]	123	#endif
Mel Gorman	708614e	2008-07-23 21:26:51 -0700	[diff] [blame]	124
				125	if (SECTIONS_WIDTH) {
				126	shift -= SECTIONS_WIDTH;
				127	BUG_ON(shift != SECTIONS_PGSHIFT);
				128	}
				129	if (NODES_WIDTH) {
				130	shift -= NODES_WIDTH;
				131	BUG_ON(shift != NODES_PGSHIFT);
				132	}
				133	if (ZONES_WIDTH) {
				134	shift -= ZONES_WIDTH;
				135	BUG_ON(shift != ZONES_PGSHIFT);
				136	}
				137
				138	/* Check for bitmask overlaps */
				139	or_mask = (ZONES_MASK << ZONES_PGSHIFT) \|
				140	(NODES_MASK << NODES_PGSHIFT) \|
				141	(SECTIONS_MASK << SECTIONS_PGSHIFT);
				142	add_mask = (ZONES_MASK << ZONES_PGSHIFT) +
				143	(NODES_MASK << NODES_PGSHIFT) +
				144	(SECTIONS_MASK << SECTIONS_PGSHIFT);
				145	BUG_ON(or_mask != add_mask);
				146	}
				147
Mel Gorman	6b74ab9	2008-07-23 21:26:49 -0700	[diff] [blame]	148	static __init int set_mminit_loglevel(char *str)
				149	{
				150	get_option(&str, &mminit_loglevel);
				151	return 0;
				152	}
				153	early_param("mminit_loglevel", set_mminit_loglevel);
Nishanth Aravamudan	5e9426ab	2008-07-23 21:27:39 -0700	[diff] [blame]	154	#endif /* CONFIG_DEBUG_MEMORY_INIT */
Nishanth Aravamudan	ff7ea79	2008-07-23 21:27:39 -0700	[diff] [blame]	155
				156	struct kobject *mm_kobj;
Nishanth Aravamudan	ff7ea79	2008-07-23 21:27:39 -0700	[diff] [blame]	157
Tim Chen	917d929	2013-07-03 15:02:44 -0700	[diff] [blame]	158	#ifdef CONFIG_SMP
				159	s32 vm_committed_as_batch = 32;
				160
Feng Tang	56f3547	2020-08-06 23:23:15 -0700	[diff] [blame]	161	void mm_compute_batch(int overcommit_policy)
Tim Chen	917d929	2013-07-03 15:02:44 -0700	[diff] [blame]	162	{
				163	u64 memsized_batch;
				164	s32 nr = num_present_cpus();
				165	s32 batch = max_t(s32, nr*2, 32);
Feng Tang	56f3547	2020-08-06 23:23:15 -0700	[diff] [blame]	166	unsigned long ram_pages = totalram_pages();
Tim Chen	917d929	2013-07-03 15:02:44 -0700	[diff] [blame]	167
Feng Tang	56f3547	2020-08-06 23:23:15 -0700	[diff] [blame]	168	/*
				169	* For policy OVERCOMMIT_NEVER, set batch size to 0.4% of
				170	* (total memory/#cpus), and lift it to 25% for other policies
				171	* to easy the possible lock contention for percpu_counter
				172	* vm_committed_as, while the max limit is INT_MAX
				173	*/
				174	if (overcommit_policy == OVERCOMMIT_NEVER)
				175	memsized_batch = min_t(u64, ram_pages/nr/256, INT_MAX);
				176	else
				177	memsized_batch = min_t(u64, ram_pages/nr/4, INT_MAX);
Tim Chen	917d929	2013-07-03 15:02:44 -0700	[diff] [blame]	178
				179	vm_committed_as_batch = max_t(s32, memsized_batch, batch);
				180	}
				181
				182	static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
				183	unsigned long action, void *arg)
				184	{
				185	switch (action) {
				186	case MEM_ONLINE:
				187	case MEM_OFFLINE:
Feng Tang	56f3547	2020-08-06 23:23:15 -0700	[diff] [blame]	188	mm_compute_batch(sysctl_overcommit_memory);
Gustavo A. R. Silva	01359eb	2020-12-14 19:15:00 -0800	[diff] [blame]	189	break;
Tim Chen	917d929	2013-07-03 15:02:44 -0700	[diff] [blame]	190	default:
				191	break;
				192	}
				193	return NOTIFY_OK;
				194	}
				195
Tim Chen	917d929	2013-07-03 15:02:44 -0700	[diff] [blame]	196	static int __init mm_compute_batch_init(void)
				197	{
Feng Tang	56f3547	2020-08-06 23:23:15 -0700	[diff] [blame]	198	mm_compute_batch(sysctl_overcommit_memory);
Liu Shixin	1eeaa4f	2022-09-23 11:33:47 +0800	[diff] [blame]	199	hotplug_memory_notifier(mm_compute_batch_notifier, MM_COMPUTE_BATCH_PRI);
Tim Chen	917d929	2013-07-03 15:02:44 -0700	[diff] [blame]	200	return 0;
				201	}
				202
				203	__initcall(mm_compute_batch_init);
				204
				205	#endif
				206
Nishanth Aravamudan	ff7ea79	2008-07-23 21:27:39 -0700	[diff] [blame]	207	static int __init mm_sysfs_init(void)
				208	{
				209	mm_kobj = kobject_create_and_add("mm", kernel_kobj);
				210	if (!mm_kobj)
				211	return -ENOMEM;
				212
				213	return 0;
				214	}
Hugh Dickins	e82cb95	2014-01-27 17:06:55 -0800	[diff] [blame]	215	postcore_initcall(mm_sysfs_init);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	216
				217	static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
				218	static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
				219	static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
				220
				221	static unsigned long required_kernelcore __initdata;
				222	static unsigned long required_kernelcore_percent __initdata;
				223	static unsigned long required_movablecore __initdata;
				224	static unsigned long required_movablecore_percent __initdata;
				225
				226	static unsigned long nr_kernel_pages __initdata;
				227	static unsigned long nr_all_pages __initdata;
				228	static unsigned long dma_reserve __initdata;
				229
Mike Rapoport (IBM)	de57807	2023-03-21 19:05:09 +0200	[diff] [blame]	230	static bool deferred_struct_pages __meminitdata;
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	231
				232	static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
				233
				234	static int __init cmdline_parse_core(char p, unsigned long core,
				235	unsigned long *percent)
				236	{
				237	unsigned long long coremem;
				238	char *endptr;
				239
				240	if (!p)
				241	return -EINVAL;
				242
				243	/* Value may be a percentage of total memory, otherwise bytes */
				244	coremem = simple_strtoull(p, &endptr, 0);
				245	if (*endptr == '%') {
				246	/* Paranoid check for percent values greater than 100 */
				247	WARN_ON(coremem > 100);
				248
				249	*percent = coremem;
				250	} else {
				251	coremem = memparse(p, &p);
				252	/* Paranoid check that UL is enough for the coremem value */
				253	WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
				254
				255	*core = coremem >> PAGE_SHIFT;
				256	*percent = 0UL;
				257	}
				258	return 0;
				259	}
				260
Kefeng Wang	072ba38	2023-05-16 14:38:09 +0800	[diff] [blame]	261	bool mirrored_kernelcore __initdata_memblock;
				262
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	263	/*
				264	* kernelcore=size sets the amount of memory for use for allocations that
				265	* cannot be reclaimed or migrated.
				266	*/
				267	static int __init cmdline_parse_kernelcore(char *p)
				268	{
				269	/* parse kernelcore=mirror */
				270	if (parse_option_str(p, "mirror")) {
				271	mirrored_kernelcore = true;
				272	return 0;
				273	}
				274
				275	return cmdline_parse_core(p, &required_kernelcore,
				276	&required_kernelcore_percent);
				277	}
				278	early_param("kernelcore", cmdline_parse_kernelcore);
				279
				280	/*
				281	* movablecore=size sets the amount of memory for use for allocations that
				282	* can be reclaimed or migrated.
				283	*/
				284	static int __init cmdline_parse_movablecore(char *p)
				285	{
				286	return cmdline_parse_core(p, &required_movablecore,
				287	&required_movablecore_percent);
				288	}
				289	early_param("movablecore", cmdline_parse_movablecore);
				290
				291	/*
				292	* early_calculate_totalpages()
				293	* Sum pages in active regions for movable zone.
				294	* Populate N_MEMORY for calculating usable_nodes.
				295	*/
				296	static unsigned long __init early_calculate_totalpages(void)
				297	{
				298	unsigned long totalpages = 0;
				299	unsigned long start_pfn, end_pfn;
				300	int i, nid;
				301
				302	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
				303	unsigned long pages = end_pfn - start_pfn;
				304
				305	totalpages += pages;
				306	if (pages)
				307	node_set_state(nid, N_MEMORY);
				308	}
				309	return totalpages;
				310	}
				311
				312	/*
				313	* This finds a zone that can be used for ZONE_MOVABLE pages. The
				314	* assumption is made that zones within a node are ordered in monotonic
				315	* increasing memory addresses so that the "highest" populated zone is used
				316	*/
				317	static void __init find_usable_zone_for_movable(void)
				318	{
				319	int zone_index;
				320	for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
				321	if (zone_index == ZONE_MOVABLE)
				322	continue;
				323
				324	if (arch_zone_highest_possible_pfn[zone_index] >
				325	arch_zone_lowest_possible_pfn[zone_index])
				326	break;
				327	}
				328
				329	VM_BUG_ON(zone_index == -1);
				330	movable_zone = zone_index;
				331	}
				332
				333	/*
				334	* Find the PFN the Movable zone begins in each node. Kernel memory
				335	* is spread evenly between nodes as long as the nodes have enough
				336	* memory. When they don't, some nodes will have more kernelcore than
				337	* others
				338	*/
				339	static void __init find_zone_movable_pfns_for_nodes(void)
				340	{
				341	int i, nid;
				342	unsigned long usable_startpfn;
				343	unsigned long kernelcore_node, kernelcore_remaining;
				344	/* save the state before borrow the nodemask */
				345	nodemask_t saved_node_state = node_states[N_MEMORY];
				346	unsigned long totalpages = early_calculate_totalpages();
				347	int usable_nodes = nodes_weight(node_states[N_MEMORY]);
				348	struct memblock_region *r;
				349
				350	/* Need to find movable_zone earlier when movable_node is specified. */
				351	find_usable_zone_for_movable();
				352
				353	/*
				354	* If movable_node is specified, ignore kernelcore and movablecore
				355	* options.
				356	*/
				357	if (movable_node_is_enabled()) {
				358	for_each_mem_region(r) {
				359	if (!memblock_is_hotpluggable(r))
				360	continue;
				361
				362	nid = memblock_get_region_node(r);
				363
				364	usable_startpfn = PFN_DOWN(r->base);
				365	zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
				366	min(usable_startpfn, zone_movable_pfn[nid]) :
				367	usable_startpfn;
				368	}
				369
				370	goto out2;
				371	}
				372
				373	/*
				374	* If kernelcore=mirror is specified, ignore movablecore option
				375	*/
				376	if (mirrored_kernelcore) {
				377	bool mem_below_4gb_not_mirrored = false;
				378
Ma Wupeng	0db31d6	2023-08-02 15:23:28 +0800	[diff] [blame]	379	if (!memblock_has_mirror()) {
				380	pr_warn("The system has no mirror memory, ignore kernelcore=mirror.\n");
				381	goto out;
				382	}
				383
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	384	for_each_mem_region(r) {
				385	if (memblock_is_mirror(r))
				386	continue;
				387
				388	nid = memblock_get_region_node(r);
				389
				390	usable_startpfn = memblock_region_memory_base_pfn(r);
				391
				392	if (usable_startpfn < PHYS_PFN(SZ_4G)) {
				393	mem_below_4gb_not_mirrored = true;
				394	continue;
				395	}
				396
				397	zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
				398	min(usable_startpfn, zone_movable_pfn[nid]) :
				399	usable_startpfn;
				400	}
				401
				402	if (mem_below_4gb_not_mirrored)
				403	pr_warn("This configuration results in unmirrored kernel memory.\n");
				404
				405	goto out2;
				406	}
				407
				408	/*
				409	* If kernelcore=nn% or movablecore=nn% was specified, calculate the
				410	* amount of necessary memory.
				411	*/
				412	if (required_kernelcore_percent)
				413	required_kernelcore = (totalpages * 100 * required_kernelcore_percent) /
				414	10000UL;
				415	if (required_movablecore_percent)
				416	required_movablecore = (totalpages * 100 * required_movablecore_percent) /
				417	10000UL;
				418
				419	/*
				420	* If movablecore= was specified, calculate what size of
				421	* kernelcore that corresponds so that memory usable for
				422	* any allocation type is evenly spread. If both kernelcore
				423	* and movablecore are specified, then the value of kernelcore
				424	* will be used for required_kernelcore if it's greater than
				425	* what movablecore would have allowed.
				426	*/
				427	if (required_movablecore) {
				428	unsigned long corepages;
				429
				430	/*
				431	* Round-up so that ZONE_MOVABLE is at least as large as what
				432	* was requested by the user
				433	*/
				434	required_movablecore =
				435	roundup(required_movablecore, MAX_ORDER_NR_PAGES);
				436	required_movablecore = min(totalpages, required_movablecore);
				437	corepages = totalpages - required_movablecore;
				438
				439	required_kernelcore = max(required_kernelcore, corepages);
				440	}
				441
				442	/*
				443	* If kernelcore was not specified or kernelcore size is larger
				444	* than totalpages, there is no ZONE_MOVABLE.
				445	*/
				446	if (!required_kernelcore \|\| required_kernelcore >= totalpages)
				447	goto out;
				448
				449	/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
				450	usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
				451
				452	restart:
				453	/* Spread kernelcore memory as evenly as possible throughout nodes */
				454	kernelcore_node = required_kernelcore / usable_nodes;
				455	for_each_node_state(nid, N_MEMORY) {
				456	unsigned long start_pfn, end_pfn;
				457
				458	/*
				459	* Recalculate kernelcore_node if the division per node
				460	* now exceeds what is necessary to satisfy the requested
				461	* amount of memory for the kernel
				462	*/
				463	if (required_kernelcore < kernelcore_node)
				464	kernelcore_node = required_kernelcore / usable_nodes;
				465
				466	/*
				467	* As the map is walked, we track how much memory is usable
				468	* by the kernel using kernelcore_remaining. When it is
				469	* 0, the rest of the node is usable by ZONE_MOVABLE
				470	*/
				471	kernelcore_remaining = kernelcore_node;
				472
				473	/* Go through each range of PFNs within this node */
				474	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
				475	unsigned long size_pages;
				476
				477	start_pfn = max(start_pfn, zone_movable_pfn[nid]);
				478	if (start_pfn >= end_pfn)
				479	continue;
				480
				481	/* Account for what is only usable for kernelcore */
				482	if (start_pfn < usable_startpfn) {
				483	unsigned long kernel_pages;
				484	kernel_pages = min(end_pfn, usable_startpfn)
				485	- start_pfn;
				486
				487	kernelcore_remaining -= min(kernel_pages,
				488	kernelcore_remaining);
				489	required_kernelcore -= min(kernel_pages,
				490	required_kernelcore);
				491
				492	/* Continue if range is now fully accounted */
				493	if (end_pfn <= usable_startpfn) {
				494
				495	/*
				496	* Push zone_movable_pfn to the end so
				497	* that if we have to rebalance
				498	* kernelcore across nodes, we will
				499	* not double account here
				500	*/
				501	zone_movable_pfn[nid] = end_pfn;
				502	continue;
				503	}
				504	start_pfn = usable_startpfn;
				505	}
				506
				507	/*
				508	* The usable PFN range for ZONE_MOVABLE is from
				509	* start_pfn->end_pfn. Calculate size_pages as the
				510	* number of pages used as kernelcore
				511	*/
				512	size_pages = end_pfn - start_pfn;
				513	if (size_pages > kernelcore_remaining)
				514	size_pages = kernelcore_remaining;
				515	zone_movable_pfn[nid] = start_pfn + size_pages;
				516
				517	/*
				518	* Some kernelcore has been met, update counts and
				519	* break if the kernelcore for this node has been
				520	* satisfied
				521	*/
				522	required_kernelcore -= min(required_kernelcore,
				523	size_pages);
				524	kernelcore_remaining -= size_pages;
				525	if (!kernelcore_remaining)
				526	break;
				527	}
				528	}
				529
				530	/*
				531	* If there is still required_kernelcore, we do another pass with one
				532	* less node in the count. This will push zone_movable_pfn[nid] further
				533	* along on the nodes that still have memory until kernelcore is
				534	* satisfied
				535	*/
				536	usable_nodes--;
				537	if (usable_nodes && required_kernelcore > usable_nodes)
				538	goto restart;
				539
				540	out2:
				541	/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
				542	for (nid = 0; nid < MAX_NUMNODES; nid++) {
				543	unsigned long start_pfn, end_pfn;
				544
				545	zone_movable_pfn[nid] =
				546	roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
				547
				548	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
				549	if (zone_movable_pfn[nid] >= end_pfn)
				550	zone_movable_pfn[nid] = 0;
				551	}
				552
				553	out:
				554	/* restore the node_state */
				555	node_states[N_MEMORY] = saved_node_state;
				556	}
				557
Usama Arif	fde1c4e	2023-09-13 11:54:01 +0100	[diff] [blame]	558	void __meminit __init_single_page(struct page *page, unsigned long pfn,
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	559	unsigned long zone, int nid)
				560	{
				561	mm_zero_struct_page(page);
				562	set_page_links(page, zone, nid, pfn);
				563	init_page_count(page);
				564	page_mapcount_reset(page);
				565	page_cpupid_reset_last(page);
				566	page_kasan_tag_reset(page);
				567
				568	INIT_LIST_HEAD(&page->lru);
				569	#ifdef WANT_PAGE_VIRTUAL
				570	/* The shift won't overflow because ZONE_NORMAL is below 4G. */
				571	if (!is_highmem_idx(zone))
				572	set_page_address(page, __va(pfn << PAGE_SHIFT));
				573	#endif
				574	}
				575
				576	#ifdef CONFIG_NUMA
				577	/*
				578	* During memory init memblocks map pfns to nids. The search is expensive and
				579	* this caches recent lookups. The implementation of __early_pfn_to_nid
				580	* treats start/end as pfns.
				581	*/
				582	struct mminit_pfnnid_cache {
				583	unsigned long last_start;
				584	unsigned long last_end;
				585	int last_nid;
				586	};
				587
				588	static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
				589
				590	/*
				591	* Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
				592	*/
				593	static int __meminit __early_pfn_to_nid(unsigned long pfn,
				594	struct mminit_pfnnid_cache *state)
				595	{
				596	unsigned long start_pfn, end_pfn;
				597	int nid;
				598
				599	if (state->last_start <= pfn && pfn < state->last_end)
				600	return state->last_nid;
				601
				602	nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
				603	if (nid != NUMA_NO_NODE) {
				604	state->last_start = start_pfn;
				605	state->last_end = end_pfn;
				606	state->last_nid = nid;
				607	}
				608
				609	return nid;
				610	}
				611
				612	int __meminit early_pfn_to_nid(unsigned long pfn)
				613	{
				614	static DEFINE_SPINLOCK(early_pfn_lock);
				615	int nid;
				616
				617	spin_lock(&early_pfn_lock);
				618	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
				619	if (nid < 0)
				620	nid = first_online_node;
				621	spin_unlock(&early_pfn_lock);
				622
				623	return nid;
				624	}
Mike Rapoport (IBM)	534ef4e	2023-03-21 19:05:03 +0200	[diff] [blame]	625
				626	int hashdist = HASHDIST_DEFAULT;
				627
				628	static int __init set_hashdist(char *str)
				629	{
				630	if (!str)
				631	return 0;
				632	hashdist = simple_strtoul(str, &str, 0);
				633	return 1;
				634	}
				635	__setup("hashdist=", set_hashdist);
				636
				637	static inline void fixup_hashdist(void)
				638	{
				639	if (num_node_state(N_MEMORY) == 1)
				640	hashdist = 0;
				641	}
				642	#else
				643	static inline void fixup_hashdist(void) {}
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	644	#endif /* CONFIG_NUMA */
				645
				646	#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
				647	static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
				648	{
				649	pgdat->first_deferred_pfn = ULONG_MAX;
				650	}
				651
				652	/* Returns true if the struct page for the pfn is initialised */
Yajun Deng	61167ad	2023-06-19 10:34:06 +0800	[diff] [blame]	653	static inline bool __meminit early_page_initialised(unsigned long pfn, int nid)
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	654	{
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	655	if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
				656	return false;
				657
				658	return true;
				659	}
				660
				661	/*
				662	* Returns true when the remaining initialisation should be deferred until
				663	* later in the boot cycle when it can be parallelised.
				664	*/
				665	static bool __meminit
				666	defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
				667	{
				668	static unsigned long prev_end_pfn, nr_initialised;
				669
				670	if (early_page_ext_enabled())
				671	return false;
				672	/*
				673	* prev_end_pfn static that contains the end of previous zone
				674	* No need to protect because called very early in boot before smp_init.
				675	*/
				676	if (prev_end_pfn != end_pfn) {
				677	prev_end_pfn = end_pfn;
				678	nr_initialised = 0;
				679	}
				680
				681	/* Always populate low zones for address-constrained allocations */
				682	if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
				683	return false;
				684
				685	if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
				686	return true;
				687	/*
				688	* We start only with one section of pages, more pages are added as
				689	* needed until the rest of deferred pages are initialized.
				690	*/
				691	nr_initialised++;
				692	if ((nr_initialised > PAGES_PER_SECTION) &&
				693	(pfn & (PAGES_PER_SECTION - 1)) == 0) {
				694	NODE_DATA(nid)->first_deferred_pfn = pfn;
				695	return true;
				696	}
				697	return false;
				698	}
				699
Yajun Deng	61167ad	2023-06-19 10:34:06 +0800	[diff] [blame]	700	static void __meminit init_reserved_page(unsigned long pfn, int nid)
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	701	{
				702	pg_data_t *pgdat;
Yajun Deng	61167ad	2023-06-19 10:34:06 +0800	[diff] [blame]	703	int zid;
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	704
Yajun Deng	61167ad	2023-06-19 10:34:06 +0800	[diff] [blame]	705	if (early_page_initialised(pfn, nid))
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	706	return;
				707
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	708	pgdat = NODE_DATA(nid);
				709
				710	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
				711	struct zone *zone = &pgdat->node_zones[zid];
				712
				713	if (zone_spans_pfn(zone, pfn))
				714	break;
				715	}
				716	__init_single_page(pfn_to_page(pfn), pfn, zid, nid);
				717	}
				718	#else
				719	static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
				720
Yajun Deng	61167ad	2023-06-19 10:34:06 +0800	[diff] [blame]	721	static inline bool early_page_initialised(unsigned long pfn, int nid)
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	722	{
				723	return true;
				724	}
				725
				726	static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
				727	{
				728	return false;
				729	}
				730
/* !CONFIG_DEFERRED_STRUCT_PAGE_INIT: nothing to do for reserved pages. */
static inline void init_reserved_page(unsigned long pfn, int nid)
{
	/* intentionally empty */
}
				734	#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
				735
				736	/*
				737	* Initialised pages do not have PageReserved set. This function is
				738	* called for each range allocated by the bootmem allocator and
				739	* marks the pages PageReserved. The remaining valid pages are later
				740	* sent to the buddy page allocator.
				741	*/
Yajun Deng	61167ad	2023-06-19 10:34:06 +0800	[diff] [blame]	742	void __meminit reserve_bootmem_region(phys_addr_t start,
				743	phys_addr_t end, int nid)
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	744	{
				745	unsigned long start_pfn = PFN_DOWN(start);
				746	unsigned long end_pfn = PFN_UP(end);
				747
				748	for (; start_pfn < end_pfn; start_pfn++) {
				749	if (pfn_valid(start_pfn)) {
				750	struct page *page = pfn_to_page(start_pfn);
				751
Yajun Deng	61167ad	2023-06-19 10:34:06 +0800	[diff] [blame]	752	init_reserved_page(start_pfn, nid);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	753
				754	/* Avoid false-positive PageTail() */
				755	INIT_LIST_HEAD(&page->lru);
				756
				757	/*
				758	* no need for atomic set_bit because the struct
				759	* page is not visible yet so nobody should
				760	* access it yet.
				761	*/
				762	__SetPageReserved(page);
				763	}
				764	}
				765	}
				766
				767	/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
				768	static bool __meminit
				769	overlap_memmap_init(unsigned long zone, unsigned long *pfn)
				770	{
				771	static struct memblock_region *r;
				772
				773	if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
				774	if (!r \|\| *pfn >= memblock_region_memory_end_pfn(r)) {
				775	for_each_mem_region(r) {
				776	if (*pfn < memblock_region_memory_end_pfn(r))
				777	break;
				778	}
				779	}
				780	if (*pfn >= memblock_region_memory_base_pfn(r) &&
				781	memblock_is_mirror(r)) {
				782	*pfn = memblock_region_memory_end_pfn(r);
				783	return true;
				784	}
				785	}
				786	return false;
				787	}
				788
				789	/*
				790	* Only struct pages that correspond to ranges defined by memblock.memory
				791	* are zeroed and initialized by going through __init_single_page() during
				792	* memmap_init_zone_range().
				793	*
				794	* But, there could be struct pages that correspond to holes in
				795	* memblock.memory. This can happen because of the following reasons:
				796	* - physical memory bank size is not necessarily the exact multiple of the
				797	* arbitrary section size
				798	* - early reserved memory may not be listed in memblock.memory
				799	* - memory layouts defined with memmap= kernel parameter may not align
				800	* nicely with memmap sections
				801	*
				802	* Explicitly initialize those struct pages so that:
				803	* - PG_Reserved is set
				804	* - zone and node links point to zone and node that span the page if the
				805	* hole is in the middle of a zone
				806	* - zone and node links point to adjacent zone/node if the hole falls on
				807	* the zone boundary; the pages in such holes will be prepended to the
				808	* zone/node above the hole except for the trailing pages in the last
				809	* section that will be appended to the zone/node below.
				810	*/
				811	static void __init init_unavailable_range(unsigned long spfn,
				812	unsigned long epfn,
				813	int zone, int node)
				814	{
				815	unsigned long pfn;
				816	u64 pgcnt = 0;
				817
				818	for (pfn = spfn; pfn < epfn; pfn++) {
				819	if (!pfn_valid(pageblock_start_pfn(pfn))) {
				820	pfn = pageblock_end_pfn(pfn) - 1;
				821	continue;
				822	}
				823	__init_single_page(pfn_to_page(pfn), pfn, zone, node);
				824	__SetPageReserved(pfn_to_page(pfn));
				825	pgcnt++;
				826	}
				827
				828	if (pgcnt)
				829	pr_info("On node %d, zone %s: %lld pages in unavailable ranges",
				830	node, zone_names[zone], pgcnt);
				831	}
				832
				833	/*
				834	* Initially all pages are reserved - free ones are freed
				835	* up by memblock_free_all() once the early boot process is
				836	* done. Non-atomic initialization, single-pass.
				837	*
				838	* All aligned pageblocks are initialized to the specified migratetype
				839	* (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
				840	* zone stats (e.g., nr_isolate_pageblock) are touched.
				841	*/
				842	void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone,
				843	unsigned long start_pfn, unsigned long zone_end_pfn,
				844	enum meminit_context context,
				845	struct vmem_altmap *altmap, int migratetype)
				846	{
				847	unsigned long pfn, end_pfn = start_pfn + size;
				848	struct page *page;
				849
				850	if (highest_memmap_pfn < end_pfn - 1)
				851	highest_memmap_pfn = end_pfn - 1;
				852
				853	#ifdef CONFIG_ZONE_DEVICE
				854	/*
				855	* Honor reservation requested by the driver for this ZONE_DEVICE
				856	* memory. We limit the total number of pages to initialize to just
				857	* those that might contain the memory mapping. We will defer the
				858	* ZONE_DEVICE page initialization until after we have released
				859	* the hotplug lock.
				860	*/
				861	if (zone == ZONE_DEVICE) {
				862	if (!altmap)
				863	return;
				864
				865	if (start_pfn == altmap->base_pfn)
				866	start_pfn += altmap->reserve;
				867	end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
				868	}
				869	#endif
				870
				871	for (pfn = start_pfn; pfn < end_pfn; ) {
				872	/*
				873	* There can be holes in boot-time mem_map[]s handed to this
				874	* function. They do not exist on hotplugged memory.
				875	*/
				876	if (context == MEMINIT_EARLY) {
				877	if (overlap_memmap_init(zone, &pfn))
				878	continue;
				879	if (defer_init(nid, pfn, zone_end_pfn)) {
				880	deferred_struct_pages = true;
				881	break;
				882	}
				883	}
				884
				885	page = pfn_to_page(pfn);
				886	__init_single_page(page, pfn, zone, nid);
				887	if (context == MEMINIT_HOTPLUG)
				888	__SetPageReserved(page);
				889
				890	/*
				891	* Usually, we want to mark the pageblock MIGRATE_MOVABLE,
				892	* such that unmovable allocations won't be scattered all
				893	* over the place during system boot.
				894	*/
				895	if (pageblock_aligned(pfn)) {
				896	set_pageblock_migratetype(page, migratetype);
				897	cond_resched();
				898	}
				899	pfn++;
				900	}
				901	}
				902
				903	static void __init memmap_init_zone_range(struct zone *zone,
				904	unsigned long start_pfn,
				905	unsigned long end_pfn,
				906	unsigned long *hole_pfn)
				907	{
				908	unsigned long zone_start_pfn = zone->zone_start_pfn;
				909	unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
				910	int nid = zone_to_nid(zone), zone_id = zone_idx(zone);
				911
				912	start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
				913	end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
				914
				915	if (start_pfn >= end_pfn)
				916	return;
				917
				918	memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn,
				919	zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
				920
				921	if (*hole_pfn < start_pfn)
				922	init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid);
				923
				924	*hole_pfn = end_pfn;
				925	}
				926
				927	static void __init memmap_init(void)
				928	{
				929	unsigned long start_pfn, end_pfn;
				930	unsigned long hole_pfn = 0;
				931	int i, j, zone_id = 0, nid;
				932
				933	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
				934	struct pglist_data *node = NODE_DATA(nid);
				935
				936	for (j = 0; j < MAX_NR_ZONES; j++) {
				937	struct zone *zone = node->node_zones + j;
				938
				939	if (!populated_zone(zone))
				940	continue;
				941
				942	memmap_init_zone_range(zone, start_pfn, end_pfn,
				943	&hole_pfn);
				944	zone_id = j;
				945	}
				946	}
				947
				948	#ifdef CONFIG_SPARSEMEM
				949	/*
				950	* Initialize the memory map for hole in the range [memory_end,
				951	* section_end].
				952	* Append the pages in this hole to the highest zone in the last
				953	* node.
				954	* The call to init_unavailable_range() is outside the ifdef to
				955	* silence the compiler warining about zone_id set but not used;
				956	* for FLATMEM it is a nop anyway
				957	*/
				958	end_pfn = round_up(end_pfn, PAGES_PER_SECTION);
				959	if (hole_pfn < end_pfn)
				960	#endif
				961	init_unavailable_range(hole_pfn, end_pfn, zone_id, nid);
				962	}
				963
				964	#ifdef CONFIG_ZONE_DEVICE
				965	static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
				966	unsigned long zone_idx, int nid,
				967	struct dev_pagemap *pgmap)
				968	{
				969
				970	__init_single_page(page, pfn, zone_idx, nid);
				971
				972	/*
				973	* Mark page reserved as it will need to wait for onlining
				974	* phase for it to be fully associated with a zone.
				975	*
				976	* We can use the non-atomic __set_bit operation for setting
				977	* the flag as we are still initializing the pages.
				978	*/
				979	__SetPageReserved(page);
				980
				981	/*
				982	* ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
				983	* and zone_device_data. It is a bug if a ZONE_DEVICE page is
				984	* ever freed or placed on a driver-private list.
				985	*/
				986	page->pgmap = pgmap;
				987	page->zone_device_data = NULL;
				988
				989	/*
				990	* Mark the block movable so that blocks are reserved for
				991	* movable at startup. This will force kernel allocations
				992	* to reserve their blocks rather than leaking throughout
				993	* the address space during boot when many long-lived
				994	* kernel allocations are made.
				995	*
				996	* Please note that MEMINIT_HOTPLUG path doesn't clear memmap
				997	* because this is done early in section_activate()
				998	*/
				999	if (pageblock_aligned(pfn)) {
				1000	set_pageblock_migratetype(page, MIGRATE_MOVABLE);
				1001	cond_resched();
				1002	}
				1003
				1004	/*
				1005	* ZONE_DEVICE pages are released directly to the driver page allocator
				1006	* which will set the page count to 1 when allocating the page.
				1007	*/
				1008	if (pgmap->type == MEMORY_DEVICE_PRIVATE \|\|
				1009	pgmap->type == MEMORY_DEVICE_COHERENT)
				1010	set_page_count(page, 0);
				1011	}
				1012
				1013	/*
				1014	* With compound page geometry and when struct pages are stored in ram most
				1015	* tail pages are reused. Consequently, the amount of unique struct pages to
				1016	* initialize is a lot smaller that the total amount of struct pages being
				1017	* mapped. This is a paired / mild layering violation with explicit knowledge
				1018	* of how the sparse_vmemmap internals handle compound pages in the lack
				1019	* of an altmap. See vmemmap_populate_compound_pages().
				1020	*/
				1021	static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
Aneesh Kumar K.V	87a7ae7	2023-04-11 19:52:13 +0530	[diff] [blame]	1022	struct dev_pagemap *pgmap)
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1023	{
Aneesh Kumar K.V	87a7ae7	2023-04-11 19:52:13 +0530	[diff] [blame]	1024	if (!vmemmap_can_optimize(altmap, pgmap))
				1025	return pgmap_vmemmap_nr(pgmap);
				1026
Aneesh Kumar K.V	c1a6c53	2023-07-25 00:37:49 +0530	[diff] [blame]	1027	return VMEMMAP_RESERVE_NR * (PAGE_SIZE / sizeof(struct page));
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1028	}
				1029
				1030	static void __ref memmap_init_compound(struct page *head,
				1031	unsigned long head_pfn,
				1032	unsigned long zone_idx, int nid,
				1033	struct dev_pagemap *pgmap,
				1034	unsigned long nr_pages)
				1035	{
				1036	unsigned long pfn, end_pfn = head_pfn + nr_pages;
				1037	unsigned int order = pgmap->vmemmap_shift;
				1038
				1039	__SetPageHead(head);
				1040	for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) {
				1041	struct page *page = pfn_to_page(pfn);
				1042
				1043	__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
				1044	prep_compound_tail(head, pfn - head_pfn);
				1045	set_page_count(page, 0);
				1046
				1047	/*
				1048	* The first tail page stores important compound page info.
				1049	* Call prep_compound_head() after the first tail page has
				1050	* been initialized, to not have the data overwritten.
				1051	*/
				1052	if (pfn == head_pfn + 1)
				1053	prep_compound_head(head, order);
				1054	}
				1055	}
				1056
				1057	void __ref memmap_init_zone_device(struct zone *zone,
				1058	unsigned long start_pfn,
				1059	unsigned long nr_pages,
				1060	struct dev_pagemap *pgmap)
				1061	{
				1062	unsigned long pfn, end_pfn = start_pfn + nr_pages;
				1063	struct pglist_data *pgdat = zone->zone_pgdat;
				1064	struct vmem_altmap *altmap = pgmap_altmap(pgmap);
				1065	unsigned int pfns_per_compound = pgmap_vmemmap_nr(pgmap);
				1066	unsigned long zone_idx = zone_idx(zone);
				1067	unsigned long start = jiffies;
				1068	int nid = pgdat->node_id;
				1069
				1070	if (WARN_ON_ONCE(!pgmap \|\| zone_idx != ZONE_DEVICE))
				1071	return;
				1072
				1073	/*
				1074	* The call to memmap_init should have already taken care
				1075	* of the pages reserved for the memmap, so we can just jump to
				1076	* the end of that region and start processing the device pages.
				1077	*/
				1078	if (altmap) {
				1079	start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
				1080	nr_pages = end_pfn - start_pfn;
				1081	}
				1082
				1083	for (pfn = start_pfn; pfn < end_pfn; pfn += pfns_per_compound) {
				1084	struct page *page = pfn_to_page(pfn);
				1085
				1086	__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
				1087
				1088	if (pfns_per_compound == 1)
				1089	continue;
				1090
				1091	memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
Aneesh Kumar K.V	87a7ae7	2023-04-11 19:52:13 +0530	[diff] [blame]	1092	compound_nr_pages(altmap, pgmap));
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1093	}
				1094
Tomas Krcka	dd31bad	2023-03-23 17:43:49 +0000	[diff] [blame]	1095	pr_debug("%s initialised %lu pages in %ums\n", __func__,
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1096	nr_pages, jiffies_to_msecs(jiffies - start));
				1097	}
				1098	#endif
				1099
				1100	/*
				1101	* The zone ranges provided by the architecture do not include ZONE_MOVABLE
				1102	* because it is sized independent of architecture. Unlike the other zones,
				1103	* the starting point for ZONE_MOVABLE is not fixed. It may be different
				1104	* in each node depending on the size of each node and how evenly kernelcore
				1105	* is distributed. This helper function adjusts the zone ranges
				1106	* provided by the architecture for a given node by using the end of the
				1107	* highest usable zone for ZONE_MOVABLE. This preserves the assumption that
				1108	* zones within a node are in order of monotonic increases memory addresses
				1109	*/
				1110	static void __init adjust_zone_range_for_zone_movable(int nid,
				1111	unsigned long zone_type,
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1112	unsigned long node_end_pfn,
				1113	unsigned long *zone_start_pfn,
				1114	unsigned long *zone_end_pfn)
				1115	{
				1116	/* Only adjust if ZONE_MOVABLE is on this node */
				1117	if (zone_movable_pfn[nid]) {
				1118	/* Size ZONE_MOVABLE */
				1119	if (zone_type == ZONE_MOVABLE) {
				1120	*zone_start_pfn = zone_movable_pfn[nid];
				1121	*zone_end_pfn = min(node_end_pfn,
				1122	arch_zone_highest_possible_pfn[movable_zone]);
				1123
				1124	/* Adjust for ZONE_MOVABLE starting within this range */
				1125	} else if (!mirrored_kernelcore &&
				1126	*zone_start_pfn < zone_movable_pfn[nid] &&
				1127	*zone_end_pfn > zone_movable_pfn[nid]) {
				1128	*zone_end_pfn = zone_movable_pfn[nid];
				1129
				1130	/* Check if this whole range is within ZONE_MOVABLE */
				1131	} else if (*zone_start_pfn >= zone_movable_pfn[nid])
				1132	zone_start_pfn = zone_end_pfn;
				1133	}
				1134	}
				1135
				1136	/*
				1137	* Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
				1138	* then all holes in the requested range will be accounted for.
				1139	*/
				1140	unsigned long __init __absent_pages_in_range(int nid,
				1141	unsigned long range_start_pfn,
				1142	unsigned long range_end_pfn)
				1143	{
				1144	unsigned long nr_absent = range_end_pfn - range_start_pfn;
				1145	unsigned long start_pfn, end_pfn;
				1146	int i;
				1147
				1148	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
				1149	start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
				1150	end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
				1151	nr_absent -= end_pfn - start_pfn;
				1152	}
				1153	return nr_absent;
				1154	}
				1155
				1156	/**
				1157	* absent_pages_in_range - Return number of page frames in holes within a range
				1158	* @start_pfn: The start PFN to start searching for holes
				1159	* @end_pfn: The end PFN to stop searching for holes
				1160	*
				1161	* Return: the number of pages frames in memory holes within a range.
				1162	*/
				1163	unsigned long __init absent_pages_in_range(unsigned long start_pfn,
				1164	unsigned long end_pfn)
				1165	{
				1166	return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
				1167	}
				1168
				1169	/* Return the number of page frames in holes in a zone on a node */
				1170	static unsigned long __init zone_absent_pages_in_node(int nid,
				1171	unsigned long zone_type,
Haifeng Xu	1c2d252	2023-05-26 08:52:51 +0000	[diff] [blame]	1172	unsigned long zone_start_pfn,
				1173	unsigned long zone_end_pfn)
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1174	{
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1175	unsigned long nr_absent;
				1176
Haifeng Xu	1c2d252	2023-05-26 08:52:51 +0000	[diff] [blame]	1177	/* zone is empty, we don't have any absent pages */
				1178	if (zone_start_pfn == zone_end_pfn)
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1179	return 0;
				1180
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1181	nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
				1182
				1183	/*
				1184	* ZONE_MOVABLE handling.
				1185	* Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
				1186	* and vice versa.
				1187	*/
				1188	if (mirrored_kernelcore && zone_movable_pfn[nid]) {
				1189	unsigned long start_pfn, end_pfn;
				1190	struct memblock_region *r;
				1191
				1192	for_each_mem_region(r) {
				1193	start_pfn = clamp(memblock_region_memory_base_pfn(r),
				1194	zone_start_pfn, zone_end_pfn);
				1195	end_pfn = clamp(memblock_region_memory_end_pfn(r),
				1196	zone_start_pfn, zone_end_pfn);
				1197
				1198	if (zone_type == ZONE_MOVABLE &&
				1199	memblock_is_mirror(r))
				1200	nr_absent += end_pfn - start_pfn;
				1201
				1202	if (zone_type == ZONE_NORMAL &&
				1203	!memblock_is_mirror(r))
				1204	nr_absent += end_pfn - start_pfn;
				1205	}
				1206	}
				1207
				1208	return nr_absent;
				1209	}
				1210
				1211	/*
				1212	* Return the number of pages a zone spans in a node, including holes
				1213	* present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
				1214	*/
				1215	static unsigned long __init zone_spanned_pages_in_node(int nid,
				1216	unsigned long zone_type,
				1217	unsigned long node_start_pfn,
				1218	unsigned long node_end_pfn,
				1219	unsigned long *zone_start_pfn,
				1220	unsigned long *zone_end_pfn)
				1221	{
				1222	unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
				1223	unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1224
				1225	/* Get the start and end of the zone */
				1226	*zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
				1227	*zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
Haifeng Xu	0792e47d	2023-07-17 06:58:11 +0000	[diff] [blame]	1228	adjust_zone_range_for_zone_movable(nid, zone_type, node_end_pfn,
				1229	zone_start_pfn, zone_end_pfn);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1230
				1231	/* Check that this node has pages within the zone's required range */
				1232	if (zone_end_pfn < node_start_pfn \|\| zone_start_pfn > node_end_pfn)
				1233	return 0;
				1234
				1235	/* Move the zone boundaries inside the node if necessary */
				1236	zone_end_pfn = min(zone_end_pfn, node_end_pfn);
				1237	zone_start_pfn = max(zone_start_pfn, node_start_pfn);
				1238
				1239	/* Return the spanned pages */
				1240	return zone_end_pfn - zone_start_pfn;
				1241	}
				1242
Haifeng Xu	ba1b67c	2023-05-26 08:52:50 +0000	[diff] [blame]	1243	static void __init reset_memoryless_node_totalpages(struct pglist_data *pgdat)
				1244	{
				1245	struct zone *z;
				1246
				1247	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) {
				1248	z->zone_start_pfn = 0;
				1249	z->spanned_pages = 0;
				1250	z->present_pages = 0;
				1251	#if defined(CONFIG_MEMORY_HOTPLUG)
				1252	z->present_early_pages = 0;
				1253	#endif
				1254	}
				1255
				1256	pgdat->node_spanned_pages = 0;
				1257	pgdat->node_present_pages = 0;
				1258	pr_debug("On node %d totalpages: 0\n", pgdat->node_id);
				1259	}
				1260
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1261	static void __init calculate_node_totalpages(struct pglist_data *pgdat,
				1262	unsigned long node_start_pfn,
				1263	unsigned long node_end_pfn)
				1264	{
				1265	unsigned long realtotalpages = 0, totalpages = 0;
				1266	enum zone_type i;
				1267
				1268	for (i = 0; i < MAX_NR_ZONES; i++) {
				1269	struct zone *zone = pgdat->node_zones + i;
				1270	unsigned long zone_start_pfn, zone_end_pfn;
				1271	unsigned long spanned, absent;
Haifeng Xu	1c2d252	2023-05-26 08:52:51 +0000	[diff] [blame]	1272	unsigned long real_size;
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1273
				1274	spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
				1275	node_start_pfn,
				1276	node_end_pfn,
				1277	&zone_start_pfn,
				1278	&zone_end_pfn);
				1279	absent = zone_absent_pages_in_node(pgdat->node_id, i,
Haifeng Xu	1c2d252	2023-05-26 08:52:51 +0000	[diff] [blame]	1280	zone_start_pfn,
				1281	zone_end_pfn);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1282
Haifeng Xu	1c2d252	2023-05-26 08:52:51 +0000	[diff] [blame]	1283	real_size = spanned - absent;
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1284
Haifeng Xu	1c2d252	2023-05-26 08:52:51 +0000	[diff] [blame]	1285	if (spanned)
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1286	zone->zone_start_pfn = zone_start_pfn;
				1287	else
				1288	zone->zone_start_pfn = 0;
Haifeng Xu	1c2d252	2023-05-26 08:52:51 +0000	[diff] [blame]	1289	zone->spanned_pages = spanned;
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1290	zone->present_pages = real_size;
				1291	#if defined(CONFIG_MEMORY_HOTPLUG)
				1292	zone->present_early_pages = real_size;
				1293	#endif
				1294
Haifeng Xu	1c2d252	2023-05-26 08:52:51 +0000	[diff] [blame]	1295	totalpages += spanned;
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1296	realtotalpages += real_size;
				1297	}
				1298
				1299	pgdat->node_spanned_pages = totalpages;
				1300	pgdat->node_present_pages = realtotalpages;
				1301	pr_debug("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
				1302	}
				1303
				1304	static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
				1305	unsigned long present_pages)
				1306	{
				1307	unsigned long pages = spanned_pages;
				1308
				1309	/*
				1310	* Provide a more accurate estimation if there are holes within
				1311	* the zone and SPARSEMEM is in use. If there are holes within the
				1312	* zone, each populated memory region may cost us one or two extra
				1313	* memmap pages due to alignment because memmap pages for each
				1314	* populated regions may not be naturally aligned on page boundary.
				1315	* So the (present_pages >> 4) heuristic is a tradeoff for that.
				1316	*/
				1317	if (spanned_pages > present_pages + (present_pages >> 4) &&
				1318	IS_ENABLED(CONFIG_SPARSEMEM))
				1319	pages = present_pages;
				1320
				1321	return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
				1322	}
				1323
				1324	#ifdef CONFIG_TRANSPARENT_HUGEPAGE
				1325	static void pgdat_init_split_queue(struct pglist_data *pgdat)
				1326	{
				1327	struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
				1328
				1329	spin_lock_init(&ds_queue->split_queue_lock);
				1330	INIT_LIST_HEAD(&ds_queue->split_queue);
				1331	ds_queue->split_queue_len = 0;
				1332	}
				1333	#else
				1334	static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
				1335	#endif
				1336
				1337	#ifdef CONFIG_COMPACTION
				1338	static void pgdat_init_kcompactd(struct pglist_data *pgdat)
				1339	{
				1340	init_waitqueue_head(&pgdat->kcompactd_wait);
				1341	}
				1342	#else
				1343	static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
				1344	#endif
				1345
				1346	static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
				1347	{
				1348	int i;
				1349
				1350	pgdat_resize_init(pgdat);
				1351	pgdat_kswapd_lock_init(pgdat);
				1352
				1353	pgdat_init_split_queue(pgdat);
				1354	pgdat_init_kcompactd(pgdat);
				1355
				1356	init_waitqueue_head(&pgdat->kswapd_wait);
				1357	init_waitqueue_head(&pgdat->pfmemalloc_wait);
				1358
				1359	for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
				1360	init_waitqueue_head(&pgdat->reclaim_wait[i]);
				1361
				1362	pgdat_page_ext_init(pgdat);
				1363	lruvec_init(&pgdat->__lruvec);
				1364	}
				1365
				1366	static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
				1367	unsigned long remaining_pages)
				1368	{
				1369	atomic_long_set(&zone->managed_pages, remaining_pages);
				1370	zone_set_nid(zone, nid);
				1371	zone->name = zone_names[idx];
				1372	zone->zone_pgdat = NODE_DATA(nid);
				1373	spin_lock_init(&zone->lock);
				1374	zone_seqlock_init(zone);
				1375	zone_pcp_init(zone);
				1376	}
				1377
				1378	static void __meminit zone_init_free_lists(struct zone *zone)
				1379	{
				1380	unsigned int order, t;
				1381	for_each_migratetype_order(order, t) {
				1382	INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
				1383	zone->free_area[order].nr_free = 0;
				1384	}
Kirill A. Shutemov	dcdfdd4	2023-06-06 17:26:29 +0300	[diff] [blame]	1385
				1386	#ifdef CONFIG_UNACCEPTED_MEMORY
				1387	INIT_LIST_HEAD(&zone->unaccepted_pages);
				1388	#endif
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1389	}
				1390
				1391	void __meminit init_currently_empty_zone(struct zone *zone,
				1392	unsigned long zone_start_pfn,
				1393	unsigned long size)
				1394	{
				1395	struct pglist_data *pgdat = zone->zone_pgdat;
				1396	int zone_idx = zone_idx(zone) + 1;
				1397
				1398	if (zone_idx > pgdat->nr_zones)
				1399	pgdat->nr_zones = zone_idx;
				1400
				1401	zone->zone_start_pfn = zone_start_pfn;
				1402
				1403	mminit_dprintk(MMINIT_TRACE, "memmap_init",
				1404	"Initialising map node %d zone %lu pfns %lu -> %lu\n",
				1405	pgdat->node_id,
				1406	(unsigned long)zone_idx(zone),
				1407	zone_start_pfn, (zone_start_pfn + size));
				1408
				1409	zone_init_free_lists(zone);
				1410	zone->initialized = 1;
				1411	}
				1412
				1413	#ifndef CONFIG_SPARSEMEM
				1414	/*
				1415	* Calculate the size of the zone->blockflags rounded to an unsigned long
				1416	* Start by making sure zonesize is a multiple of pageblock_order by rounding
				1417	* up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
				1418	* round what is now in bits to nearest long in bits, then return it in
				1419	* bytes.
				1420	*/
				1421	static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
				1422	{
				1423	unsigned long usemapsize;
				1424
				1425	zonesize += zone_start_pfn & (pageblock_nr_pages-1);
				1426	usemapsize = roundup(zonesize, pageblock_nr_pages);
				1427	usemapsize = usemapsize >> pageblock_order;
				1428	usemapsize *= NR_PAGEBLOCK_BITS;
Miaohe Lin	daee07b	2023-08-07 10:35:28 +0800	[diff] [blame]	1429	usemapsize = roundup(usemapsize, BITS_PER_LONG);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1430
Miaohe Lin	daee07b	2023-08-07 10:35:28 +0800	[diff] [blame]	1431	return usemapsize / BITS_PER_BYTE;
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1432	}
				1433
				1434	static void __ref setup_usemap(struct zone *zone)
				1435	{
				1436	unsigned long usemapsize = usemap_size(zone->zone_start_pfn,
				1437	zone->spanned_pages);
				1438	zone->pageblock_flags = NULL;
				1439	if (usemapsize) {
				1440	zone->pageblock_flags =
				1441	memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
				1442	zone_to_nid(zone));
				1443	if (!zone->pageblock_flags)
				1444	panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
				1445	usemapsize, zone->name, zone_to_nid(zone));
				1446	}
				1447	}
				1448	#else
				1449	static inline void setup_usemap(struct zone *zone) {}
				1450	#endif /* CONFIG_SPARSEMEM */
				1451
				1452	#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
				1453
				1454	/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
				1455	void __init set_pageblock_order(void)
				1456	{
				1457	unsigned int order = MAX_ORDER;
				1458
				1459	/* Check that pageblock_nr_pages has not already been setup */
				1460	if (pageblock_order)
				1461	return;
				1462
				1463	/* Don't let pageblocks exceed the maximum allocation granularity. */
				1464	if (HPAGE_SHIFT > PAGE_SHIFT && HUGETLB_PAGE_ORDER < order)
				1465	order = HUGETLB_PAGE_ORDER;
				1466
				1467	/*
				1468	* Assume the largest contiguous order of interest is a huge page.
				1469	* This value may be variable depending on boot parameters on IA64 and
				1470	* powerpc.
				1471	*/
				1472	pageblock_order = order;
				1473	}
				1474	#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
				1475
				1476	/*
				1477	* When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
				1478	* is unused as pageblock_order is set at compile-time. See
				1479	* include/linux/pageblock-flags.h for the values of pageblock_order based on
				1480	* the kernel config
				1481	*/
				1482	void __init set_pageblock_order(void)
				1483	{
				1484	}
				1485
				1486	#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
				1487
				1488	/*
				1489	* Set up the zone data structures
				1490	* - init pgdat internals
				1491	* - init all zones belonging to this node
				1492	*
				1493	* NOTE: this function is only called during memory hotplug
				1494	*/
				1495	#ifdef CONFIG_MEMORY_HOTPLUG
				1496	void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
				1497	{
				1498	int nid = pgdat->node_id;
				1499	enum zone_type z;
				1500	int cpu;
				1501
				1502	pgdat_init_internals(pgdat);
				1503
				1504	if (pgdat->per_cpu_nodestats == &boot_nodestats)
				1505	pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
				1506
				1507	/*
				1508	* Reset the nr_zones, order and highest_zoneidx before reuse.
				1509	* Note that kswapd will init kswapd_highest_zoneidx properly
				1510	* when it starts in the near future.
				1511	*/
				1512	pgdat->nr_zones = 0;
				1513	pgdat->kswapd_order = 0;
				1514	pgdat->kswapd_highest_zoneidx = 0;
				1515	pgdat->node_start_pfn = 0;
Haifeng Xu	32b6a4a	2023-06-07 02:50:56 +0000	[diff] [blame]	1516	pgdat->node_present_pages = 0;
				1517
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1518	for_each_online_cpu(cpu) {
				1519	struct per_cpu_nodestat *p;
				1520
				1521	p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
				1522	memset(p, 0, sizeof(*p));
				1523	}
				1524
Haifeng Xu	32b6a4a	2023-06-07 02:50:56 +0000	[diff] [blame]	1525	/*
				1526	* When memory is hot-added, all the memory is in offline state. So
				1527	* clear all zones' present_pages and managed_pages because they will
				1528	* be updated in online_pages() and offline_pages().
				1529	*/
				1530	for (z = 0; z < MAX_NR_ZONES; z++) {
				1531	struct zone *zone = pgdat->node_zones + z;
				1532
				1533	zone->present_pages = 0;
				1534	zone_init_internals(zone, z, nid, 0);
				1535	}
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1536	}
				1537	#endif
				1538
				1539	/*
				1540	* Set up the zone data structures:
				1541	* - mark all pages reserved
				1542	* - mark all memory queues empty
				1543	* - clear the memory bitmaps
				1544	*
				1545	* NOTE: pgdat should get zeroed by caller.
				1546	* NOTE: this function is only called during early init.
				1547	*/
				1548	static void __init free_area_init_core(struct pglist_data *pgdat)
				1549	{
				1550	enum zone_type j;
				1551	int nid = pgdat->node_id;
				1552
				1553	pgdat_init_internals(pgdat);
				1554	pgdat->per_cpu_nodestats = &boot_nodestats;
				1555
				1556	for (j = 0; j < MAX_NR_ZONES; j++) {
				1557	struct zone *zone = pgdat->node_zones + j;
				1558	unsigned long size, freesize, memmap_pages;
				1559
				1560	size = zone->spanned_pages;
				1561	freesize = zone->present_pages;
				1562
				1563	/*
				1564	* Adjust freesize so that it accounts for how much memory
				1565	* is used by this zone for memmap. This affects the watermark
				1566	* and per-cpu initialisations
				1567	*/
				1568	memmap_pages = calc_memmap_size(size, freesize);
				1569	if (!is_highmem_idx(j)) {
				1570	if (freesize >= memmap_pages) {
				1571	freesize -= memmap_pages;
				1572	if (memmap_pages)
				1573	pr_debug(" %s zone: %lu pages used for memmap\n",
				1574	zone_names[j], memmap_pages);
				1575	} else
				1576	pr_warn(" %s zone: %lu memmap pages exceeds freesize %lu\n",
				1577	zone_names[j], memmap_pages, freesize);
				1578	}
				1579
				1580	/* Account for reserved pages */
				1581	if (j == 0 && freesize > dma_reserve) {
				1582	freesize -= dma_reserve;
				1583	pr_debug(" %s zone: %lu pages reserved\n", zone_names[0], dma_reserve);
				1584	}
				1585
				1586	if (!is_highmem_idx(j))
				1587	nr_kernel_pages += freesize;
				1588	/* Charge for highmem memmap if there are enough kernel pages */
				1589	else if (nr_kernel_pages > memmap_pages * 2)
				1590	nr_kernel_pages -= memmap_pages;
				1591	nr_all_pages += freesize;
				1592
				1593	/*
				1594	* Set an approximate value for lowmem here, it will be adjusted
				1595	* when the bootmem allocator frees pages into the buddy system.
				1596	* And all highmem pages will be managed by the buddy system.
				1597	*/
				1598	zone_init_internals(zone, j, nid, freesize);
				1599
				1600	if (!size)
				1601	continue;
				1602
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1603	setup_usemap(zone);
				1604	init_currently_empty_zone(zone, zone->zone_start_pfn, size);
				1605	}
				1606	}
				1607
				1608	void __init *memmap_alloc(phys_addr_t size, phys_addr_t align,
				1609	phys_addr_t min_addr, int nid, bool exact_nid)
				1610	{
				1611	void *ptr;
				1612
				1613	if (exact_nid)
				1614	ptr = memblock_alloc_exact_nid_raw(size, align, min_addr,
				1615	MEMBLOCK_ALLOC_ACCESSIBLE,
				1616	nid);
				1617	else
				1618	ptr = memblock_alloc_try_nid_raw(size, align, min_addr,
				1619	MEMBLOCK_ALLOC_ACCESSIBLE,
				1620	nid);
				1621
				1622	if (ptr && size > 0)
				1623	page_init_poison(ptr, size);
				1624
				1625	return ptr;
				1626	}
				1627
				1628	#ifdef CONFIG_FLATMEM
				1629	static void __init alloc_node_mem_map(struct pglist_data *pgdat)
				1630	{
				1631	unsigned long __maybe_unused start = 0;
				1632	unsigned long __maybe_unused offset = 0;
				1633
				1634	/* Skip empty nodes */
				1635	if (!pgdat->node_spanned_pages)
				1636	return;
				1637
				1638	start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
				1639	offset = pgdat->node_start_pfn - start;
				1640	/* ia64 gets its own node_mem_map, before this, without bootmem */
				1641	if (!pgdat->node_mem_map) {
				1642	unsigned long size, end;
				1643	struct page *map;
				1644
				1645	/*
				1646	* The zone's endpoints aren't required to be MAX_ORDER
				1647	* aligned but the node_mem_map endpoints must be in order
				1648	* for the buddy allocator to function correctly.
				1649	*/
				1650	end = pgdat_end_pfn(pgdat);
				1651	end = ALIGN(end, MAX_ORDER_NR_PAGES);
				1652	size = (end - start) * sizeof(struct page);
				1653	map = memmap_alloc(size, SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT,
				1654	pgdat->node_id, false);
				1655	if (!map)
				1656	panic("Failed to allocate %ld bytes for node %d memory map\n",
				1657	size, pgdat->node_id);
				1658	pgdat->node_mem_map = map + offset;
				1659	}
				1660	pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
				1661	__func__, pgdat->node_id, (unsigned long)pgdat,
				1662	(unsigned long)pgdat->node_mem_map);
				1663	#ifndef CONFIG_NUMA
				1664	/*
				1665	* With no DISCONTIG, the global mem_map is just set as node 0's
				1666	*/
				1667	if (pgdat == NODE_DATA(0)) {
				1668	mem_map = NODE_DATA(0)->node_mem_map;
				1669	if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
				1670	mem_map -= offset;
				1671	}
				1672	#endif
				1673	}
				1674	#else
				1675	static inline void alloc_node_mem_map(struct pglist_data *pgdat) { }
				1676	#endif /* CONFIG_FLATMEM */
				1677
				1678	/**
				1679	* get_pfn_range_for_nid - Return the start and end page frames for a node
				1680	* @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
				1681	* @start_pfn: Passed by reference. On return, it will have the node start_pfn.
				1682	* @end_pfn: Passed by reference. On return, it will have the node end_pfn.
				1683	*
				1684	* It returns the start and end page frame of a node based on information
				1685	* provided by memblock_set_node(). If called for a node
Miaohe Lin	3a29280	2023-06-25 11:33:40 +0800	[diff] [blame]	1686	* with no available memory, the start and end PFNs will be 0.
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1687	*/
				1688	void __init get_pfn_range_for_nid(unsigned int nid,
				1689	unsigned long start_pfn, unsigned long end_pfn)
				1690	{
				1691	unsigned long this_start_pfn, this_end_pfn;
				1692	int i;
				1693
				1694	*start_pfn = -1UL;
				1695	*end_pfn = 0;
				1696
				1697	for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
				1698	start_pfn = min(start_pfn, this_start_pfn);
				1699	end_pfn = max(end_pfn, this_end_pfn);
				1700	}
				1701
				1702	if (*start_pfn == -1UL)
				1703	*start_pfn = 0;
				1704	}
				1705
				1706	static void __init free_area_init_node(int nid)
				1707	{
				1708	pg_data_t *pgdat = NODE_DATA(nid);
				1709	unsigned long start_pfn = 0;
				1710	unsigned long end_pfn = 0;
				1711
				1712	/* pg_data_t should be reset to zero when it's allocated */
				1713	WARN_ON(pgdat->nr_zones \|\| pgdat->kswapd_highest_zoneidx);
				1714
				1715	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
				1716
				1717	pgdat->node_id = nid;
				1718	pgdat->node_start_pfn = start_pfn;
				1719	pgdat->per_cpu_nodestats = NULL;
				1720
				1721	if (start_pfn != end_pfn) {
				1722	pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
				1723	(u64)start_pfn << PAGE_SHIFT,
				1724	end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
Haifeng Xu	ba1b67c	2023-05-26 08:52:50 +0000	[diff] [blame]	1725
				1726	calculate_node_totalpages(pgdat, start_pfn, end_pfn);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1727	} else {
				1728	pr_info("Initmem setup node %d as memoryless\n", nid);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1729
Haifeng Xu	ba1b67c	2023-05-26 08:52:50 +0000	[diff] [blame]	1730	reset_memoryless_node_totalpages(pgdat);
				1731	}
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1732
				1733	alloc_node_mem_map(pgdat);
				1734	pgdat_set_deferred_range(pgdat);
				1735
				1736	free_area_init_core(pgdat);
				1737	lru_gen_init_pgdat(pgdat);
				1738	}
				1739
				1740	/* Any regular or high memory on that node ? */
Haifeng Xu	b894da0	2023-07-10 09:37:50 +0000	[diff] [blame]	1741	static void __init check_for_memory(pg_data_t *pgdat)
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1742	{
				1743	enum zone_type zone_type;
				1744
				1745	for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
				1746	struct zone *zone = &pgdat->node_zones[zone_type];
				1747	if (populated_zone(zone)) {
				1748	if (IS_ENABLED(CONFIG_HIGHMEM))
Haifeng Xu	91ff4d7	2023-06-07 03:24:02 +0000	[diff] [blame]	1749	node_set_state(pgdat->node_id, N_HIGH_MEMORY);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1750	if (zone_type <= ZONE_NORMAL)
Haifeng Xu	91ff4d7	2023-06-07 03:24:02 +0000	[diff] [blame]	1751	node_set_state(pgdat->node_id, N_NORMAL_MEMORY);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1752	break;
				1753	}
				1754	}
				1755	}
				1756
				1757	#if MAX_NUMNODES > 1
				1758	/*
				1759	* Figure out the number of possible node ids.
				1760	*/
				1761	void __init setup_nr_node_ids(void)
				1762	{
				1763	unsigned int highest;
				1764
				1765	highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
				1766	nr_node_ids = highest + 1;
				1767	}
				1768	#endif
				1769
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1770	/*
				1771	* Some architectures, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
				1772	* such cases we allow max_zone_pfn sorted in the descending order
				1773	*/
Arnd Bergmann	5f300fd	2023-04-14 10:03:53 +0200	[diff] [blame]	1774	static bool arch_has_descending_max_zone_pfns(void)
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1775	{
Arnd Bergmann	5f300fd	2023-04-14 10:03:53 +0200	[diff] [blame]	1776	return IS_ENABLED(CONFIG_ARC) && !IS_ENABLED(CONFIG_ARC_HAS_PAE40);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1777	}
				1778
				1779	/**
				1780	* free_area_init - Initialise all pg_data_t and zone data
				1781	* @max_zone_pfn: an array of max PFNs for each zone
				1782	*
				1783	* This will call free_area_init_node() for each active node in the system.
				1784	* Using the page ranges provided by memblock_set_node(), the size of each
				1785	* zone in each node and their holes is calculated. If the maximum PFN
				1786	* between two adjacent zones match, it is assumed that the zone is empty.
				1787	* For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
				1788	* that arch_max_dma32_pfn has no pages. It is also assumed that a zone
				1789	* starts where the previous one ended. For example, ZONE_DMA32 starts
				1790	* at arch_max_dma_pfn.
				1791	*/
				1792	void __init free_area_init(unsigned long *max_zone_pfn)
				1793	{
				1794	unsigned long start_pfn, end_pfn;
				1795	int i, nid, zone;
				1796	bool descending;
				1797
				1798	/* Record where the zone boundaries are */
				1799	memset(arch_zone_lowest_possible_pfn, 0,
				1800	sizeof(arch_zone_lowest_possible_pfn));
				1801	memset(arch_zone_highest_possible_pfn, 0,
				1802	sizeof(arch_zone_highest_possible_pfn));
				1803
				1804	start_pfn = PHYS_PFN(memblock_start_of_DRAM());
				1805	descending = arch_has_descending_max_zone_pfns();
				1806
				1807	for (i = 0; i < MAX_NR_ZONES; i++) {
				1808	if (descending)
				1809	zone = MAX_NR_ZONES - i - 1;
				1810	else
				1811	zone = i;
				1812
				1813	if (zone == ZONE_MOVABLE)
				1814	continue;
				1815
				1816	end_pfn = max(max_zone_pfn[zone], start_pfn);
				1817	arch_zone_lowest_possible_pfn[zone] = start_pfn;
				1818	arch_zone_highest_possible_pfn[zone] = end_pfn;
				1819
				1820	start_pfn = end_pfn;
				1821	}
				1822
				1823	/* Find the PFNs that ZONE_MOVABLE begins at in each node */
				1824	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
				1825	find_zone_movable_pfns_for_nodes();
				1826
				1827	/* Print out the zone ranges */
				1828	pr_info("Zone ranges:\n");
				1829	for (i = 0; i < MAX_NR_ZONES; i++) {
				1830	if (i == ZONE_MOVABLE)
				1831	continue;
				1832	pr_info(" %-8s ", zone_names[i]);
				1833	if (arch_zone_lowest_possible_pfn[i] ==
				1834	arch_zone_highest_possible_pfn[i])
				1835	pr_cont("empty\n");
				1836	else
				1837	pr_cont("[mem %#018Lx-%#018Lx]\n",
				1838	(u64)arch_zone_lowest_possible_pfn[i]
				1839	<< PAGE_SHIFT,
				1840	((u64)arch_zone_highest_possible_pfn[i]
				1841	<< PAGE_SHIFT) - 1);
				1842	}
				1843
				1844	/* Print out the PFNs ZONE_MOVABLE begins at in each node */
				1845	pr_info("Movable zone start for each node\n");
				1846	for (i = 0; i < MAX_NUMNODES; i++) {
				1847	if (zone_movable_pfn[i])
				1848	pr_info(" Node %d: %#018Lx\n", i,
				1849	(u64)zone_movable_pfn[i] << PAGE_SHIFT);
				1850	}
				1851
				1852	/*
				1853	* Print out the early node map, and initialize the
				1854	* subsection-map relative to active online memory ranges to
				1855	* enable future "sub-section" extensions of the memory map.
				1856	*/
				1857	pr_info("Early memory node ranges\n");
				1858	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
				1859	pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
				1860	(u64)start_pfn << PAGE_SHIFT,
				1861	((u64)end_pfn << PAGE_SHIFT) - 1);
				1862	subsection_map_init(start_pfn, end_pfn - start_pfn);
				1863	}
				1864
				1865	/* Initialise every node */
				1866	mminit_verify_pageflags_layout();
				1867	setup_nr_node_ids();
Haifeng Xu	e3d9b45	2023-06-01 06:35:35 +0000	[diff] [blame]	1868	set_pageblock_order();
				1869
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1870	for_each_node(nid) {
				1871	pg_data_t *pgdat;
				1872
				1873	if (!node_online(nid)) {
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1874	/* Allocator not initialized yet */
				1875	pgdat = arch_alloc_nodedata(nid);
				1876	if (!pgdat)
				1877	panic("Cannot allocate %zuB for node %d.\n",
				1878	sizeof(*pgdat), nid);
				1879	arch_refresh_nodedata(nid, pgdat);
Haifeng Xu	837c2ba	2023-05-28 04:57:20 +0000	[diff] [blame]	1880	free_area_init_node(nid);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1881
				1882	/*
				1883	* We do not want to confuse userspace by sysfs
				1884	* files/directories for node without any memory
				1885	* attached to it, so this node is not marked as
				1886	* N_MEMORY and not marked online so that no sysfs
				1887	* hierarchy will be created via register_one_node for
				1888	* it. The pgdat will get fully initialized by
				1889	* hotadd_init_pgdat() when memory is hotplugged into
				1890	* this node.
				1891	*/
				1892	continue;
				1893	}
				1894
				1895	pgdat = NODE_DATA(nid);
				1896	free_area_init_node(nid);
				1897
				1898	/* Any memory on that node */
				1899	if (pgdat->node_present_pages)
				1900	node_set_state(nid, N_MEMORY);
Haifeng Xu	91ff4d7	2023-06-07 03:24:02 +0000	[diff] [blame]	1901	check_for_memory(pgdat);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1902	}
				1903
				1904	memmap_init();
Mike Rapoport (IBM)	534ef4e	2023-03-21 19:05:03 +0200	[diff] [blame]	1905
				1906	/* disable hash distribution for systems with a single node */
				1907	fixup_hashdist();
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1908	}
				1909
				1910	/**
				1911	* node_map_pfn_alignment - determine the maximum internode alignment
				1912	*
				1913	* This function should be called after node map is populated and sorted.
				1914	* It calculates the maximum power of two alignment which can distinguish
				1915	* all the nodes.
				1916	*
				1917	* For example, if all nodes are 1GiB and aligned to 1GiB, the return value
				1918	* would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
				1919	* nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
				1920	* shifted, 1GiB is enough and this function will indicate so.
				1921	*
				1922	* This is used to test whether pfn -> nid mapping of the chosen memory
				1923	* model has fine enough granularity to avoid incorrect mapping for the
				1924	* populated node map.
				1925	*
				1926	* Return: the determined alignment in pfn's. 0 if there is no alignment
				1927	* requirement (single node).
				1928	*/
				1929	unsigned long __init node_map_pfn_alignment(void)
				1930	{
				1931	unsigned long accl_mask = 0, last_end = 0;
				1932	unsigned long start, end, mask;
				1933	int last_nid = NUMA_NO_NODE;
				1934	int i, nid;
				1935
				1936	for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
				1937	if (!start \|\| last_nid < 0 \|\| last_nid == nid) {
				1938	last_nid = nid;
				1939	last_end = end;
				1940	continue;
				1941	}
				1942
				1943	/*
				1944	* Start with a mask granular enough to pin-point to the
				1945	* start pfn and tick off bits one-by-one until it becomes
				1946	* too coarse to separate the current node from the last.
				1947	*/
				1948	mask = ~((1 << __ffs(start)) - 1);
				1949	while (mask && last_end <= (start & (mask << 1)))
				1950	mask <<= 1;
				1951
				1952	/* accumulate all internode masks */
				1953	accl_mask \|= mask;
				1954	}
				1955
				1956	/* convert mask to number of pages */
				1957	return ~accl_mask + 1;
				1958	}
				1959
				1960	#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
				1961	static void __init deferred_free_range(unsigned long pfn,
				1962	unsigned long nr_pages)
				1963	{
				1964	struct page *page;
				1965	unsigned long i;
				1966
				1967	if (!nr_pages)
				1968	return;
				1969
				1970	page = pfn_to_page(pfn);
				1971
				1972	/* Free a large naturally-aligned chunk if possible */
Kirill A. Shutemov	3f6dac0	2023-03-21 03:24:15 +0300	[diff] [blame]	1973	if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) {
				1974	for (i = 0; i < nr_pages; i += pageblock_nr_pages)
				1975	set_pageblock_migratetype(page + i, MIGRATE_MOVABLE);
				1976	__free_pages_core(page, MAX_ORDER);
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1977	return;
				1978	}
				1979
Kirill A. Shutemov	dcdfdd4	2023-06-06 17:26:29 +0300	[diff] [blame]	1980	/* Accept chunks smaller than MAX_ORDER upfront */
				1981	accept_memory(PFN_PHYS(pfn), PFN_PHYS(pfn + nr_pages));
				1982
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	1983	for (i = 0; i < nr_pages; i++, page++, pfn++) {
				1984	if (pageblock_aligned(pfn))
				1985	set_pageblock_migratetype(page, MIGRATE_MOVABLE);
				1986	__free_pages_core(page, 0);
				1987	}
				1988	}
				1989
				1990	/* Completion tracking for deferred_init_memmap() threads */
				1991	static atomic_t pgdat_init_n_undone __initdata;
				1992	static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
				1993
				1994	static inline void __init pgdat_init_report_one_done(void)
				1995	{
				1996	if (atomic_dec_and_test(&pgdat_init_n_undone))
				1997	complete(&pgdat_init_all_done_comp);
				1998	}
				1999
				2000	/*
				2001	* Returns true if page needs to be initialized or freed to buddy allocator.
				2002	*
Kirill A. Shutemov	3f6dac0	2023-03-21 03:24:15 +0300	[diff] [blame]	2003	* We check if a current MAX_ORDER block is valid by only checking the validity
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2004	* of the head pfn.
				2005	*/
				2006	static inline bool __init deferred_pfn_valid(unsigned long pfn)
				2007	{
Kirill A. Shutemov	3f6dac0	2023-03-21 03:24:15 +0300	[diff] [blame]	2008	if (IS_MAX_ORDER_ALIGNED(pfn) && !pfn_valid(pfn))
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2009	return false;
				2010	return true;
				2011	}
				2012
				2013	/*
				2014	* Free pages to buddy allocator. Try to free aligned pages in
Kirill A. Shutemov	3f6dac0	2023-03-21 03:24:15 +0300	[diff] [blame]	2015	* MAX_ORDER_NR_PAGES sizes.
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2016	*/
				2017	static void __init deferred_free_pages(unsigned long pfn,
				2018	unsigned long end_pfn)
				2019	{
				2020	unsigned long nr_free = 0;
				2021
				2022	for (; pfn < end_pfn; pfn++) {
				2023	if (!deferred_pfn_valid(pfn)) {
				2024	deferred_free_range(pfn - nr_free, nr_free);
				2025	nr_free = 0;
Kirill A. Shutemov	3f6dac0	2023-03-21 03:24:15 +0300	[diff] [blame]	2026	} else if (IS_MAX_ORDER_ALIGNED(pfn)) {
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2027	deferred_free_range(pfn - nr_free, nr_free);
				2028	nr_free = 1;
				2029	} else {
				2030	nr_free++;
				2031	}
				2032	}
				2033	/* Free the last block of pages to allocator */
				2034	deferred_free_range(pfn - nr_free, nr_free);
				2035	}
				2036
				2037	/*
				2038	* Initialize struct pages. We minimize pfn page lookups and scheduler checks
Kirill A. Shutemov	3f6dac0	2023-03-21 03:24:15 +0300	[diff] [blame]	2039	* by performing it only once every MAX_ORDER_NR_PAGES.
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2040	* Return number of pages initialized.
				2041	*/
				2042	static unsigned long __init deferred_init_pages(struct zone *zone,
				2043	unsigned long pfn,
				2044	unsigned long end_pfn)
				2045	{
				2046	int nid = zone_to_nid(zone);
				2047	unsigned long nr_pages = 0;
				2048	int zid = zone_idx(zone);
				2049	struct page *page = NULL;
				2050
				2051	for (; pfn < end_pfn; pfn++) {
				2052	if (!deferred_pfn_valid(pfn)) {
				2053	page = NULL;
				2054	continue;
Kirill A. Shutemov	3f6dac0	2023-03-21 03:24:15 +0300	[diff] [blame]	2055	} else if (!page \|\| IS_MAX_ORDER_ALIGNED(pfn)) {
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2056	page = pfn_to_page(pfn);
				2057	} else {
				2058	page++;
				2059	}
				2060	__init_single_page(page, pfn, zid, nid);
				2061	nr_pages++;
				2062	}
				2063	return (nr_pages);
				2064	}
				2065
				2066	/*
				2067	* This function is meant to pre-load the iterator for the zone init.
				2068	* Specifically it walks through the ranges until we are caught up to the
				2069	* first_init_pfn value and exits there. If we never encounter the value we
				2070	* return false indicating there are no valid ranges left.
				2071	*/
				2072	static bool __init
				2073	deferred_init_mem_pfn_range_in_zone(u64 i, struct zone zone,
				2074	unsigned long spfn, unsigned long epfn,
				2075	unsigned long first_init_pfn)
				2076	{
				2077	u64 j;
				2078
				2079	/*
				2080	* Start out by walking through the ranges in this zone that have
				2081	* already been initialized. We don't need to do anything with them
				2082	* so we just need to flush them out of the system.
				2083	*/
				2084	for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
				2085	if (*epfn <= first_init_pfn)
				2086	continue;
				2087	if (*spfn < first_init_pfn)
				2088	*spfn = first_init_pfn;
				2089	*i = j;
				2090	return true;
				2091	}
				2092
				2093	return false;
				2094	}
				2095
				2096	/*
				2097	* Initialize and free pages. We do it in two loops: first we initialize
				2098	* struct page, then free to buddy allocator, because while we are
				2099	* freeing pages we can access pages that are ahead (computing buddy
				2100	* page in __free_one_page()).
				2101	*
				2102	* In order to try and keep some memory in the cache we have the loop
				2103	* broken along max page order boundaries. This way we will not cause
				2104	* any issues with the buddy page computation.
				2105	*/
				2106	static unsigned long __init
				2107	deferred_init_maxorder(u64 i, struct zone zone, unsigned long *start_pfn,
				2108	unsigned long *end_pfn)
				2109	{
				2110	unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
				2111	unsigned long spfn = start_pfn, epfn = end_pfn;
				2112	unsigned long nr_pages = 0;
				2113	u64 j = *i;
				2114
				2115	/* First we loop through and initialize the page values */
				2116	for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
				2117	unsigned long t;
				2118
				2119	if (mo_pfn <= *start_pfn)
				2120	break;
				2121
				2122	t = min(mo_pfn, *end_pfn);
				2123	nr_pages += deferred_init_pages(zone, *start_pfn, t);
				2124
				2125	if (mo_pfn < *end_pfn) {
				2126	*start_pfn = mo_pfn;
				2127	break;
				2128	}
				2129	}
				2130
				2131	/* Reset values and now loop through freeing pages as needed */
				2132	swap(j, *i);
				2133
				2134	for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
				2135	unsigned long t;
				2136
				2137	if (mo_pfn <= spfn)
				2138	break;
				2139
				2140	t = min(mo_pfn, epfn);
				2141	deferred_free_pages(spfn, t);
				2142
				2143	if (mo_pfn <= epfn)
				2144	break;
				2145	}
				2146
				2147	return nr_pages;
				2148	}
				2149
				2150	static void __init
				2151	deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
				2152	void *arg)
				2153	{
				2154	unsigned long spfn, epfn;
				2155	struct zone *zone = arg;
				2156	u64 i;
				2157
				2158	deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
				2159
				2160	/*
				2161	* Initialize and free pages in MAX_ORDER sized increments so that we
				2162	* can avoid introducing any issues with the buddy allocator.
				2163	*/
				2164	while (spfn < end_pfn) {
				2165	deferred_init_maxorder(&i, zone, &spfn, &epfn);
				2166	cond_resched();
				2167	}
				2168	}
				2169
				2170	/* An arch may override for more concurrency. */
				2171	__weak int __init
				2172	deferred_page_init_max_threads(const struct cpumask *node_cpumask)
				2173	{
				2174	return 1;
				2175	}
				2176
				2177	/* Initialise remaining memory on a node */
				2178	static int __init deferred_init_memmap(void *data)
				2179	{
				2180	pg_data_t *pgdat = data;
				2181	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
				2182	unsigned long spfn = 0, epfn = 0;
				2183	unsigned long first_init_pfn, flags;
				2184	unsigned long start = jiffies;
				2185	struct zone *zone;
				2186	int zid, max_threads;
				2187	u64 i;
				2188
				2189	/* Bind memory initialisation thread to a local node if possible */
				2190	if (!cpumask_empty(cpumask))
				2191	set_cpus_allowed_ptr(current, cpumask);
				2192
				2193	pgdat_resize_lock(pgdat, &flags);
				2194	first_init_pfn = pgdat->first_deferred_pfn;
				2195	if (first_init_pfn == ULONG_MAX) {
				2196	pgdat_resize_unlock(pgdat, &flags);
				2197	pgdat_init_report_one_done();
				2198	return 0;
				2199	}
				2200
				2201	/* Sanity check boundaries */
				2202	BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
				2203	BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
				2204	pgdat->first_deferred_pfn = ULONG_MAX;
				2205
				2206	/*
				2207	* Once we unlock here, the zone cannot be grown anymore, thus if an
				2208	* interrupt thread must allocate this early in boot, zone must be
				2209	* pre-grown prior to start of deferred page initialization.
				2210	*/
				2211	pgdat_resize_unlock(pgdat, &flags);
				2212
				2213	/* Only the highest zone is deferred so find it */
				2214	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
				2215	zone = pgdat->node_zones + zid;
				2216	if (first_init_pfn < zone_end_pfn(zone))
				2217	break;
				2218	}
				2219
				2220	/* If the zone is empty somebody else may have cleared out the zone */
				2221	if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
				2222	first_init_pfn))
				2223	goto zone_empty;
				2224
				2225	max_threads = deferred_page_init_max_threads(cpumask);
				2226
				2227	while (spfn < epfn) {
				2228	unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
				2229	struct padata_mt_job job = {
				2230	.thread_fn = deferred_init_memmap_chunk,
				2231	.fn_arg = zone,
				2232	.start = spfn,
				2233	.size = epfn_align - spfn,
				2234	.align = PAGES_PER_SECTION,
				2235	.min_chunk = PAGES_PER_SECTION,
				2236	.max_threads = max_threads,
				2237	};
				2238
				2239	padata_do_multithreaded(&job);
				2240	deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
				2241	epfn_align);
				2242	}
				2243	zone_empty:
				2244	/* Sanity check that the next zone really is unpopulated */
				2245	WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
				2246
				2247	pr_info("node %d deferred pages initialised in %ums\n",
				2248	pgdat->node_id, jiffies_to_msecs(jiffies - start));
				2249
				2250	pgdat_init_report_one_done();
				2251	return 0;
				2252	}
				2253
				2254	/*
				2255	* If this zone has deferred pages, try to grow it by initializing enough
				2256	* deferred pages to satisfy the allocation specified by order, rounded up to
				2257	* the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
				2258	* of SECTION_SIZE bytes by initializing struct pages in increments of
				2259	* PAGES_PER_SECTION * sizeof(struct page) bytes.
				2260	*
				2261	* Return true when zone was grown, otherwise return false. We return true even
				2262	* when we grow less than requested, to let the caller decide if there are
				2263	* enough pages to satisfy the allocation.
				2264	*
				2265	* Note: We use noinline because this function is needed only during boot, and
				2266	* it is called from a __ref function _deferred_grow_zone. This way we are
				2267	* making sure that it is not inlined into permanent text section.
				2268	*/
				2269	bool __init deferred_grow_zone(struct zone *zone, unsigned int order)
				2270	{
				2271	unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
				2272	pg_data_t *pgdat = zone->zone_pgdat;
				2273	unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
				2274	unsigned long spfn, epfn, flags;
				2275	unsigned long nr_pages = 0;
				2276	u64 i;
				2277
				2278	/* Only the last zone may have deferred pages */
				2279	if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
				2280	return false;
				2281
				2282	pgdat_resize_lock(pgdat, &flags);
				2283
				2284	/*
				2285	* If someone grew this zone while we were waiting for spinlock, return
				2286	* true, as there might be enough pages already.
				2287	*/
				2288	if (first_deferred_pfn != pgdat->first_deferred_pfn) {
				2289	pgdat_resize_unlock(pgdat, &flags);
				2290	return true;
				2291	}
				2292
				2293	/* If the zone is empty somebody else may have cleared out the zone */
				2294	if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
				2295	first_deferred_pfn)) {
				2296	pgdat->first_deferred_pfn = ULONG_MAX;
				2297	pgdat_resize_unlock(pgdat, &flags);
				2298	/* Retry only once. */
				2299	return first_deferred_pfn != ULONG_MAX;
				2300	}
				2301
				2302	/*
				2303	* Initialize and free pages in MAX_ORDER sized increments so
				2304	* that we can avoid introducing any issues with the buddy
				2305	* allocator.
				2306	*/
				2307	while (spfn < epfn) {
				2308	/* update our first deferred PFN for this section */
				2309	first_deferred_pfn = spfn;
				2310
				2311	nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
				2312	touch_nmi_watchdog();
				2313
				2314	/* We should only stop along section boundaries */
				2315	if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
				2316	continue;
				2317
				2318	/* If our quota has been met we can stop here */
				2319	if (nr_pages >= nr_pages_needed)
				2320	break;
				2321	}
				2322
				2323	pgdat->first_deferred_pfn = spfn;
				2324	pgdat_resize_unlock(pgdat, &flags);
				2325
				2326	return nr_pages > 0;
				2327	}
				2328
				2329	#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
				2330
				2331	#ifdef CONFIG_CMA
				2332	void __init init_cma_reserved_pageblock(struct page *page)
				2333	{
				2334	unsigned i = pageblock_nr_pages;
				2335	struct page *p = page;
				2336
				2337	do {
				2338	__ClearPageReserved(p);
				2339	set_page_count(p, 0);
				2340	} while (++p, --i);
				2341
				2342	set_pageblock_migratetype(page, MIGRATE_CMA);
				2343	set_page_refcounted(page);
				2344	__free_pages(page, pageblock_order);
				2345
				2346	adjust_managed_page_count(page, pageblock_nr_pages);
				2347	page_zone(page)->cma_pages += pageblock_nr_pages;
				2348	}
				2349	#endif
				2350
Kefeng Wang	904d585	2023-05-16 14:38:11 +0800	[diff] [blame]	2351	void set_zone_contiguous(struct zone *zone)
				2352	{
				2353	unsigned long block_start_pfn = zone->zone_start_pfn;
				2354	unsigned long block_end_pfn;
				2355
				2356	block_end_pfn = pageblock_end_pfn(block_start_pfn);
				2357	for (; block_start_pfn < zone_end_pfn(zone);
				2358	block_start_pfn = block_end_pfn,
				2359	block_end_pfn += pageblock_nr_pages) {
				2360
				2361	block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
				2362
				2363	if (!__pageblock_pfn_to_page(block_start_pfn,
				2364	block_end_pfn, zone))
				2365	return;
				2366	cond_resched();
				2367	}
				2368
				2369	/* We confirm that there is no hole */
				2370	zone->contiguous = true;
				2371	}
				2372
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2373	void __init page_alloc_init_late(void)
				2374	{
				2375	struct zone *zone;
				2376	int nid;
				2377
				2378	#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
				2379
				2380	/* There will be num_node_state(N_MEMORY) threads */
				2381	atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
				2382	for_each_node_state(nid, N_MEMORY) {
				2383	kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
				2384	}
				2385
				2386	/* Block until all are initialised */
				2387	wait_for_completion(&pgdat_init_all_done_comp);
				2388
				2389	/*
				2390	* We initialized the rest of the deferred pages. Permanently disable
				2391	* on-demand struct page initialization.
				2392	*/
				2393	static_branch_disable(&deferred_pages);
				2394
				2395	/* Reinit limits that are based on free pages after the kernel is up */
				2396	files_maxfiles_init();
				2397	#endif
				2398
				2399	buffer_init();
				2400
				2401	/* Discard memblock private memory */
				2402	memblock_discard();
				2403
				2404	for_each_node_state(nid, N_MEMORY)
				2405	shuffle_free_memory(NODE_DATA(nid));
				2406
				2407	for_each_populated_zone(zone)
				2408	set_zone_contiguous(zone);
Mike Rapoport (IBM)	de57807	2023-03-21 19:05:09 +0200	[diff] [blame]	2409
				2410	/* Initialize page ext after all struct pages are initialized. */
				2411	if (deferred_struct_pages)
				2412	page_ext_init();
Kefeng Wang	e95d372	2023-05-16 14:38:20 +0800	[diff] [blame]	2413
				2414	page_alloc_sysctl_init();
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2415	}
				2416
				2417	#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
				2418	/*
				2419	* Returns the number of pages that arch has reserved but
				2420	* is not known to alloc_large_system_hash().
				2421	*/
				2422	static unsigned long __init arch_reserved_kernel_pages(void)
				2423	{
				2424	return 0;
				2425	}
				2426	#endif
				2427
				2428	/*
				2429	* Adaptive scale is meant to reduce sizes of hash tables on large memory
				2430	* machines. As memory size is increased the scale is also increased but at
				2431	* slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory
				2432	* quadruples the scale is increased by one, which means the size of hash table
				2433	* only doubles, instead of quadrupling as well.
				2434	* Because 32-bit systems cannot have large physical memory, where this scaling
				2435	* makes sense, it is disabled on such platforms.
				2436	*/
				2437	#if __BITS_PER_LONG > 32
				2438	#define ADAPT_SCALE_BASE (64ul << 30)
				2439	#define ADAPT_SCALE_SHIFT 2
				2440	#define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT)
				2441	#endif
				2442
				2443	/*
				2444	* allocate a large system hash table from bootmem
				2445	* - it is assumed that the hash table must contain an exact power-of-2
				2446	* quantity of entries
				2447	* - limit is the number of hash buckets, not the total allocation size
				2448	*/
				2449	void __init alloc_large_system_hash(const char tablename,
				2450	unsigned long bucketsize,
				2451	unsigned long numentries,
				2452	int scale,
				2453	int flags,
				2454	unsigned int *_hash_shift,
				2455	unsigned int *_hash_mask,
				2456	unsigned long low_limit,
				2457	unsigned long high_limit)
				2458	{
				2459	unsigned long long max = high_limit;
				2460	unsigned long log2qty, size;
				2461	void *table;
				2462	gfp_t gfp_flags;
				2463	bool virt;
				2464	bool huge;
				2465
				2466	/* allow the kernel cmdline to have a say */
				2467	if (!numentries) {
				2468	/* round applicable memory size up to nearest megabyte */
				2469	numentries = nr_kernel_pages;
				2470	numentries -= arch_reserved_kernel_pages();
				2471
				2472	/* It isn't necessary when PAGE_SIZE >= 1MB */
				2473	if (PAGE_SIZE < SZ_1M)
				2474	numentries = round_up(numentries, SZ_1M / PAGE_SIZE);
				2475
				2476	#if __BITS_PER_LONG > 32
				2477	if (!high_limit) {
				2478	unsigned long adapt;
				2479
				2480	for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
				2481	adapt <<= ADAPT_SCALE_SHIFT)
				2482	scale++;
				2483	}
				2484	#endif
				2485
				2486	/* limit to 1 bucket per 2^scale bytes of low memory */
				2487	if (scale > PAGE_SHIFT)
				2488	numentries >>= (scale - PAGE_SHIFT);
				2489	else
				2490	numentries <<= (PAGE_SHIFT - scale);
				2491
Miaohe Lin	3fade62	2023-06-25 10:13:23 +0800	[diff] [blame]	2492	if (unlikely((numentries * bucketsize) < PAGE_SIZE))
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2493	numentries = PAGE_SIZE / bucketsize;
				2494	}
				2495	numentries = roundup_pow_of_two(numentries);
				2496
				2497	/* limit allocation size to 1/16 total memory by default */
				2498	if (max == 0) {
				2499	max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
				2500	do_div(max, bucketsize);
				2501	}
				2502	max = min(max, 0x80000000ULL);
				2503
				2504	if (numentries < low_limit)
				2505	numentries = low_limit;
				2506	if (numentries > max)
				2507	numentries = max;
				2508
				2509	log2qty = ilog2(numentries);
				2510
				2511	gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC \| __GFP_ZERO : GFP_ATOMIC;
				2512	do {
				2513	virt = false;
				2514	size = bucketsize << log2qty;
				2515	if (flags & HASH_EARLY) {
				2516	if (flags & HASH_ZERO)
				2517	table = memblock_alloc(size, SMP_CACHE_BYTES);
				2518	else
				2519	table = memblock_alloc_raw(size,
				2520	SMP_CACHE_BYTES);
				2521	} else if (get_order(size) > MAX_ORDER \|\| hashdist) {
				2522	table = vmalloc_huge(size, gfp_flags);
				2523	virt = true;
				2524	if (table)
				2525	huge = is_vm_area_hugepages(table);
				2526	} else {
				2527	/*
				2528	* If bucketsize is not a power-of-two, we may free
				2529	* some pages at the end of hash table which
				2530	* alloc_pages_exact() automatically does
				2531	*/
				2532	table = alloc_pages_exact(size, gfp_flags);
				2533	kmemleak_alloc(table, size, 1, gfp_flags);
				2534	}
				2535	} while (!table && size > PAGE_SIZE && --log2qty);
				2536
				2537	if (!table)
				2538	panic("Failed to allocate %s hash table\n", tablename);
				2539
				2540	pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
				2541	tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
				2542	virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
				2543
				2544	if (_hash_shift)
				2545	*_hash_shift = log2qty;
				2546	if (_hash_mask)
				2547	*_hash_mask = (1 << log2qty) - 1;
				2548
				2549	return table;
				2550	}
				2551
				2552	/**
				2553	* set_dma_reserve - set the specified number of pages reserved in the first zone
				2554	* @new_dma_reserve: The number of pages to mark reserved
				2555	*
				2556	* The per-cpu batchsize and zone watermarks are determined by managed_pages.
				2557	* In the DMA zone, a significant percentage may be consumed by kernel image
				2558	* and other unfreeable allocations which can skew the watermarks badly. This
				2559	* function may optionally be used to account for unfreeable pages in the
				2560	* first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
				2561	* smaller per-cpu batchsize.
				2562	*/
				2563	void __init set_dma_reserve(unsigned long new_dma_reserve)
				2564	{
				2565	dma_reserve = new_dma_reserve;
				2566	}
				2567
				2568	void __init memblock_free_pages(struct page *page, unsigned long pfn,
				2569	unsigned int order)
				2570	{
Yajun Deng	61167ad	2023-06-19 10:34:06 +0800	[diff] [blame]	2571
				2572	if (IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) {
				2573	int nid = early_pfn_to_nid(pfn);
				2574
				2575	if (!early_page_initialised(pfn, nid))
				2576	return;
				2577	}
				2578
Mike Rapoport (IBM)	9420f89	2023-03-21 19:05:02 +0200	[diff] [blame]	2579	if (!kmsan_memblock_free_pages(page, order)) {
				2580	/* KMSAN will take care of these pages. */
				2581	return;
				2582	}
				2583	__free_pages_core(page, order);
				2584	}
Mike Rapoport (IBM)	b7ec1bf	2023-03-21 19:05:06 +0200	[diff] [blame]	2585
/*
 * Static key for init_on_alloc; its build-time default follows
 * CONFIG_INIT_ON_ALLOC_DEFAULT_ON. Exported via EXPORT_SYMBOL().
 */
DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
EXPORT_SYMBOL(init_on_alloc);

/*
 * Static key for init_on_free; its build-time default follows
 * CONFIG_INIT_ON_FREE_DEFAULT_ON. Exported via EXPORT_SYMBOL().
 */
DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
EXPORT_SYMBOL(init_on_free);
				2591
Mike Rapoport (IBM)	f2fc4b4	2023-03-21 19:05:08 +0200	[diff] [blame]	2592	static bool _init_on_alloc_enabled_early __read_mostly
				2593	= IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
				2594	static int __init early_init_on_alloc(char *buf)
				2595	{
				2596
				2597	return kstrtobool(buf, &_init_on_alloc_enabled_early);
				2598	}
				2599	early_param("init_on_alloc", early_init_on_alloc);
				2600
				2601	static bool _init_on_free_enabled_early __read_mostly
				2602	= IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
				2603	static int __init early_init_on_free(char *buf)
				2604	{
				2605	return kstrtobool(buf, &_init_on_free_enabled_early);
				2606	}
				2607	early_param("init_on_free", early_init_on_free);
				2608
				2609	DEFINE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
				2610
				2611	/*
				2612	* Enable static keys related to various memory debugging and hardening options.
				2613	* Some override others, and depend on early params that are evaluated in the
				2614	* order of appearance. So we need to first gather the full picture of what was
				2615	* enabled, and then make decisions.
				2616	*/
				2617	static void __init mem_debugging_and_hardening_init(void)
				2618	{
				2619	bool page_poisoning_requested = false;
				2620	bool want_check_pages = false;
				2621
				2622	#ifdef CONFIG_PAGE_POISONING
				2623	/*
				2624	* Page poisoning is debug page alloc for some arches. If
				2625	* either of those options are enabled, enable poisoning.
				2626	*/
				2627	if (page_poisoning_enabled() \|\|
				2628	(!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
				2629	debug_pagealloc_enabled())) {
				2630	static_branch_enable(&_page_poisoning_enabled);
				2631	page_poisoning_requested = true;
				2632	want_check_pages = true;
				2633	}
				2634	#endif
				2635
				2636	if ((_init_on_alloc_enabled_early \|\| _init_on_free_enabled_early) &&
				2637	page_poisoning_requested) {
				2638	pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
				2639	"will take precedence over init_on_alloc and init_on_free\n");
				2640	_init_on_alloc_enabled_early = false;
				2641	_init_on_free_enabled_early = false;
				2642	}
				2643
				2644	if (_init_on_alloc_enabled_early) {
				2645	want_check_pages = true;
				2646	static_branch_enable(&init_on_alloc);
				2647	} else {
				2648	static_branch_disable(&init_on_alloc);
				2649	}
				2650
				2651	if (_init_on_free_enabled_early) {
				2652	want_check_pages = true;
				2653	static_branch_enable(&init_on_free);
				2654	} else {
				2655	static_branch_disable(&init_on_free);
				2656	}
				2657
				2658	if (IS_ENABLED(CONFIG_KMSAN) &&
				2659	(_init_on_alloc_enabled_early \|\| _init_on_free_enabled_early))
				2660	pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n");
				2661
				2662	#ifdef CONFIG_DEBUG_PAGEALLOC
				2663	if (debug_pagealloc_enabled()) {
				2664	want_check_pages = true;
				2665	static_branch_enable(&_debug_pagealloc_enabled);
				2666
				2667	if (debug_guardpage_minorder())
				2668	static_branch_enable(&_debug_guardpage_enabled);
				2669	}
				2670	#endif
				2671
				2672	/*
				2673	* Any page debugging or hardening option also enables sanity checking
				2674	* of struct pages being allocated or freed. With CONFIG_DEBUG_VM it's
				2675	* enabled already.
				2676	*/
				2677	if (!IS_ENABLED(CONFIG_DEBUG_VM) && want_check_pages)
				2678	static_branch_enable(&check_pages_enabled);
				2679	}
				2680
Mike Rapoport (IBM)	b7ec1bf	2023-03-21 19:05:06 +0200	[diff] [blame]	2681	/* Report memory auto-initialization states for this boot. */
				2682	static void __init report_meminit(void)
				2683	{
				2684	const char *stack;
				2685
				2686	if (IS_ENABLED(CONFIG_INIT_STACK_ALL_PATTERN))
				2687	stack = "all(pattern)";
				2688	else if (IS_ENABLED(CONFIG_INIT_STACK_ALL_ZERO))
				2689	stack = "all(zero)";
				2690	else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL))
				2691	stack = "byref_all(zero)";
				2692	else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF))
				2693	stack = "byref(zero)";
				2694	else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_USER))
				2695	stack = "__user(zero)";
				2696	else
				2697	stack = "off";
				2698
				2699	pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s\n",
				2700	stack, want_init_on_alloc(GFP_KERNEL) ? "on" : "off",
				2701	want_init_on_free() ? "on" : "off");
				2702	if (want_init_on_free())
				2703	pr_info("mem auto-init: clearing system memory may take some time...\n");
				2704	}
				2705
Mike Rapoport (IBM)	eb8589b	2023-03-21 19:05:10 +0200	[diff] [blame]	2706	static void __init mem_init_print_info(void)
				2707	{
				2708	unsigned long physpages, codesize, datasize, rosize, bss_size;
				2709	unsigned long init_code_size, init_data_size;
				2710
				2711	physpages = get_num_physpages();
				2712	codesize = _etext - _stext;
				2713	datasize = _edata - _sdata;
				2714	rosize = __end_rodata - __start_rodata;
				2715	bss_size = __bss_stop - __bss_start;
				2716	init_data_size = __init_end - __init_begin;
				2717	init_code_size = _einittext - _sinittext;
				2718
				2719	/*
				2720	* Detect special cases and adjust section sizes accordingly:
				2721	* 1) .init.* may be embedded into .data sections
				2722	* 2) .init.text.* may be out of [__init_begin, __init_end],
				2723	* please refer to arch/tile/kernel/vmlinux.lds.S.
				2724	* 3) .rodata.* may be embedded into .text or .data sections.
				2725	*/
				2726	#define adj_init_size(start, end, size, pos, adj) \
				2727	do { \
				2728	if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
				2729	size -= adj; \
				2730	} while (0)
				2731
				2732	adj_init_size(__init_begin, __init_end, init_data_size,
				2733	_sinittext, init_code_size);
				2734	adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
				2735	adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
				2736	adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
				2737	adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
				2738
				2739	#undef adj_init_size
				2740
				2741	pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
				2742	#ifdef CONFIG_HIGHMEM
				2743	", %luK highmem"
				2744	#endif
				2745	")\n",
				2746	K(nr_free_pages()), K(physpages),
				2747	codesize / SZ_1K, datasize / SZ_1K, rosize / SZ_1K,
				2748	(init_data_size + init_code_size) / SZ_1K, bss_size / SZ_1K,
				2749	K(physpages - totalram_pages() - totalcma_pages),
				2750	K(totalcma_pages)
				2751	#ifdef CONFIG_HIGHMEM
				2752	, K(totalhigh_pages())
				2753	#endif
				2754	);
				2755	}
				2756
Mike Rapoport (IBM)	b7ec1bf	2023-03-21 19:05:06 +0200	[diff] [blame]	2757	/*
				2758	* Set up kernel memory allocators
				2759	*/
				2760	void __init mm_core_init(void)
				2761	{
				2762	/* Initializations relying on SMP setup */
				2763	build_all_zonelists(NULL);
				2764	page_alloc_init_cpuhp();
				2765
				2766	/*
				2767	* page_ext requires contiguous pages,
				2768	* bigger than MAX_ORDER unless SPARSEMEM.
				2769	*/
				2770	page_ext_init_flatmem();
Mike Rapoport (IBM)	f2fc4b4	2023-03-21 19:05:08 +0200	[diff] [blame]	2771	mem_debugging_and_hardening_init();
Peng Zhang	cabdf74	2023-07-18 15:30:19 +0800	[diff] [blame]	2772	kfence_alloc_pool_and_metadata();
Mike Rapoport (IBM)	b7ec1bf	2023-03-21 19:05:06 +0200	[diff] [blame]	2773	report_meminit();
				2774	kmsan_init_shadow();
				2775	stack_depot_early_init();
				2776	mem_init();
				2777	mem_init_print_info();
				2778	kmem_cache_init();
				2779	/*
				2780	* page_owner must be initialized after buddy is ready, and also after
				2781	* slab is ready so that stack_depot_init() works properly
				2782	*/
				2783	page_ext_init_flatmem_late();
				2784	kmemleak_init();
Mike Rapoport (IBM)	4cd1e9e	2023-03-21 19:05:07 +0200	[diff] [blame]	2785	ptlock_cache_init();
				2786	pgtable_cache_init();
Mike Rapoport (IBM)	b7ec1bf	2023-03-21 19:05:06 +0200	[diff] [blame]	2787	debug_objects_mem_init();
				2788	vmalloc_init();
				2789	/* If no deferred init page_ext now, as vmap is fully initialized */
				2790	if (!deferred_struct_pages)
				2791	page_ext_init();
				2792	/* Should be run before the first non-init thread is created */
				2793	init_espfix_bsp();
				2794	/* Should be run after espfix64 is set up. */
				2795	pti_init();
				2796	kmsan_init_runtime();
				2797	mm_cache_init();
				2798	}