
Merge tag 'memblock-v6.18-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/rppt/memblock

Pull mm-init update from Mike Rapoport:
 "Simplify deferred initialization of struct pages

  Refactor and simplify deferred initialization of the memory map.

  Besides the negative diffstat, it gives a 3ms (55ms vs 58ms) reduction
  in the initialization of deferred pages on a single-node system with
  64GiB of RAM"

* tag 'memblock-v6.18-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/rppt/memblock:
  memblock: drop for_each_free_mem_pfn_range_in_zone_from()
  mm/mm_init: drop deferred_init_maxorder()
  mm/mm_init: deferred_init_memmap: use a job per zone
  mm/mm_init: use deferred_init_memmap_chunk() in deferred_grow_zone()
Linus Torvalds, 2025-10-04 11:03:10 -07:00
commit b41048485e
4 changed files with 67 additions and 221 deletions
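
For orientation before the per-file diffs: the series drops the zone-specific memblock iterator and deferred_init_maxorder(), and instead runs one padata job per zone whose worker, deferred_init_memmap_chunk(), walks the free memblock ranges and initializes then frees pages in MAX_ORDER-aligned chunks. The fragment below is a minimal userspace model of just that chunk-splitting loop; MAX_ORDER_NR_PAGES, the example PFN range, the align_up() helper and the printf are stand-ins invented for illustration, not the kernel code itself.

/*
 * Userspace model of the chunking in deferred_init_memmap_chunk() (see the
 * mm/mm_init.c hunk below). Constants and the PFN range are illustrative.
 */
#include <stdio.h>

#define MAX_ORDER_NR_PAGES 1024UL       /* assumed value, not the kernel's */

/* Round x up to the next multiple of a (a must be a power of two). */
static unsigned long align_up(unsigned long x, unsigned long a)
{
        return (x + a - 1) & ~(a - 1);
}

int main(void)
{
        unsigned long spfn = 1500, epfn = 5000; /* a free range, already clamped */

        while (spfn < epfn) {
                /* Each chunk ends at the next MAX_ORDER boundary past spfn. */
                unsigned long mo_pfn = align_up(spfn + 1, MAX_ORDER_NR_PAGES);
                unsigned long chunk_end = mo_pfn < epfn ? mo_pfn : epfn;

                /* The kernel initializes the chunk's struct pages here and
                 * immediately frees them to the buddy allocator. */
                printf("init+free pfns [%lu, %lu)\n", spfn, chunk_end);

                spfn = chunk_end;
        }
        return 0;
}

Keeping each chunk bounded by a max page order boundary lets the worker free a chunk's pages to the buddy allocator right after initializing them, while their struct pages are still cache-hot; that is the locality argument made in the rewritten comment in the mm/mm_init.c hunk.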

.clang-format

@@ -294,7 +294,6 @@ ForEachMacros:
   - 'for_each_fib6_node_rt_rcu'
   - 'for_each_fib6_walker_rt'
   - 'for_each_file_lock'
-  - 'for_each_free_mem_pfn_range_in_zone_from'
   - 'for_each_free_mem_range'
   - 'for_each_free_mem_range_reverse'
   - 'for_each_func_rsrc'

include/linux/memblock.h

@@ -324,28 +324,6 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
         for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \
              i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid))
 
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
-                                  unsigned long *out_spfn,
-                                  unsigned long *out_epfn);
-
-/**
- * for_each_free_mem_pfn_range_in_zone_from - iterate through zone specific
- * free memblock areas from a given point
- * @i: u64 used as loop variable
- * @zone: zone in which all of the memory blocks reside
- * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
- * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
- *
- * Walks over free (memory && !reserved) areas of memblock in a specific
- * zone, continuing from current position. Available as soon as memblock is
- * initialized.
- */
-#define for_each_free_mem_pfn_range_in_zone_from(i, zone, p_start, p_end) \
-        for (; i != U64_MAX; \
-             __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end))
-#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
-
 /**
  * for_each_free_mem_range - iterate through free memblock areas
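
With the zone-clamped iterator gone from the header, its only user (deferred memory map init) now walks the generic free-range iterator and clamps each range to the PFN window it was handed, as deferred_init_memmap_chunk() does with for_each_free_mem_range() in the mm/mm_init.c hunk further down. Below is a small, self-contained C sketch of that clamping pattern; the ranges and the plain array loop are made up for illustration and merely stand in for the kernel iterator.

/*
 * Userspace sketch: intersect sorted free ranges with a [start_pfn, end_pfn)
 * window, the way the deferred-init worker now clamps memblock ranges.
 * The ranges below are invented for the example.
 */
#include <stdio.h>

struct range { unsigned long spfn, epfn; };

int main(void)
{
        /* Stand-in for memblock's free (memory && !reserved) ranges. */
        const struct range free_ranges[] = {
                { 100, 900 }, { 1200, 2500 }, { 4000, 6000 },
        };
        unsigned long start_pfn = 1000, end_pfn = 5000; /* the worker's window */

        for (size_t i = 0; i < sizeof(free_ranges) / sizeof(free_ranges[0]); i++) {
                unsigned long spfn = free_ranges[i].spfn;
                unsigned long epfn = free_ranges[i].epfn;

                if (spfn >= end_pfn)
                        break;  /* ranges are sorted; nothing left in the window */

                /* Clamp the free range to the window. */
                if (spfn < start_pfn)
                        spfn = start_pfn;
                if (epfn > end_pfn)
                        epfn = end_pfn;

                if (spfn < epfn)
                        printf("process pfns [%lu, %lu)\n", spfn, epfn);
        }
        return 0;
}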

mm/memblock.c

@@ -1445,70 +1445,6 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
         return 0;
 }
 
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-/**
- * __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone()
- *
- * @idx: pointer to u64 loop variable
- * @zone: zone in which all of the memory blocks reside
- * @out_spfn: ptr to ulong for start pfn of the range, can be %NULL
- * @out_epfn: ptr to ulong for end pfn of the range, can be %NULL
- *
- * This function is meant to be a zone/pfn specific wrapper for the
- * for_each_mem_range type iterators. Specifically they are used in the
- * deferred memory init routines and as such we were duplicating much of
- * this logic throughout the code. So instead of having it in multiple
- * locations it seemed like it would make more sense to centralize this to
- * one new iterator that does everything they need.
- */
-void __init_memblock
-__next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
-                             unsigned long *out_spfn, unsigned long *out_epfn)
-{
-        int zone_nid = zone_to_nid(zone);
-        phys_addr_t spa, epa;
-
-        __next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
-                         &memblock.memory, &memblock.reserved,
-                         &spa, &epa, NULL);
-
-        while (*idx != U64_MAX) {
-                unsigned long epfn = PFN_DOWN(epa);
-                unsigned long spfn = PFN_UP(spa);
-
-                /*
-                 * Verify the end is at least past the start of the zone and
-                 * that we have at least one PFN to initialize.
-                 */
-                if (zone->zone_start_pfn < epfn && spfn < epfn) {
-                        /* if we went too far just stop searching */
-                        if (zone_end_pfn(zone) <= spfn) {
-                                *idx = U64_MAX;
-                                break;
-                        }
-
-                        if (out_spfn)
-                                *out_spfn = max(zone->zone_start_pfn, spfn);
-                        if (out_epfn)
-                                *out_epfn = min(zone_end_pfn(zone), epfn);
-
-                        return;
-                }
-
-                __next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
-                                 &memblock.memory, &memblock.reserved,
-                                 &spa, &epa, NULL);
-        }
-
-        /* signal end of iteration */
-        if (out_spfn)
-                *out_spfn = ULONG_MAX;
-        if (out_epfn)
-                *out_epfn = 0;
-}
-#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
-
 /**
  * memblock_alloc_range_nid - allocate boot memory block
  * @size: size of memory block to be allocated in bytes

mm/mm_init.c

@@ -2045,112 +2045,63 @@ static unsigned long __init deferred_init_pages(struct zone *zone,
 }
 
 /*
- * This function is meant to pre-load the iterator for the zone init from
- * a given point.
- * Specifically it walks through the ranges starting with initial index
- * passed to it until we are caught up to the first_init_pfn value and
- * exits there. If we never encounter the value we return false indicating
- * there are no valid ranges left.
- */
-static bool __init
-deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
-                                    unsigned long *spfn, unsigned long *epfn,
-                                    unsigned long first_init_pfn)
-{
-        u64 j = *i;
-
-        if (j == 0)
-                __next_mem_pfn_range_in_zone(&j, zone, spfn, epfn);
-
-        /*
-         * Start out by walking through the ranges in this zone that have
-         * already been initialized. We don't need to do anything with them
-         * so we just need to flush them out of the system.
-         */
-        for_each_free_mem_pfn_range_in_zone_from(j, zone, spfn, epfn) {
-                if (*epfn <= first_init_pfn)
-                        continue;
-                if (*spfn < first_init_pfn)
-                        *spfn = first_init_pfn;
-                *i = j;
-                return true;
-        }
-
-        return false;
-}
-
-/*
- * Initialize and free pages. We do it in two loops: first we initialize
- * struct page, then free to buddy allocator, because while we are
- * freeing pages we can access pages that are ahead (computing buddy
- * page in __free_one_page()).
+ * Initialize and free pages.
  *
- * In order to try and keep some memory in the cache we have the loop
- * broken along max page order boundaries. This way we will not cause
- * any issues with the buddy page computation.
+ * At this point reserved pages and struct pages that correspond to holes in
+ * memblock.memory are already intialized so every free range has a valid
+ * memory map around it.
+ * This ensures that access of pages that are ahead of the range being
+ * initialized (computing buddy page in __free_one_page()) always reads a valid
+ * struct page.
+ *
+ * In order to try and improve CPU cache locality we have the loop broken along
+ * max page order boundaries.
  */
 static unsigned long __init
-deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
-                       unsigned long *end_pfn)
+deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
+                           struct zone *zone)
 {
-        unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
-        unsigned long spfn = *start_pfn, epfn = *end_pfn;
+        int nid = zone_to_nid(zone);
         unsigned long nr_pages = 0;
-        u64 j = *i;
+        phys_addr_t start, end;
+        u64 i = 0;
 
-        /* First we loop through and initialize the page values */
-        for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
-                unsigned long t;
+        for_each_free_mem_range(i, nid, 0, &start, &end, NULL) {
+                unsigned long spfn = PFN_UP(start);
+                unsigned long epfn = PFN_DOWN(end);
 
-                if (mo_pfn <= *start_pfn)
+                if (spfn >= end_pfn)
                         break;
 
-                t = min(mo_pfn, *end_pfn);
-                nr_pages += deferred_init_pages(zone, *start_pfn, t);
+                spfn = max(spfn, start_pfn);
+                epfn = min(epfn, end_pfn);
 
-                if (mo_pfn < *end_pfn) {
-                        *start_pfn = mo_pfn;
-                        break;
+                while (spfn < epfn) {
+                        unsigned long mo_pfn = ALIGN(spfn + 1, MAX_ORDER_NR_PAGES);
+                        unsigned long chunk_end = min(mo_pfn, epfn);
+
+                        nr_pages += deferred_init_pages(zone, spfn, chunk_end);
+                        deferred_free_pages(spfn, chunk_end - spfn);
+
+                        spfn = chunk_end;
+
+                        if (irqs_disabled())
+                                touch_nmi_watchdog();
+                        else
+                                cond_resched();
                 }
         }
 
-        /* Reset values and now loop through freeing pages as needed */
-        swap(j, *i);
-
-        for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
-                unsigned long t;
-
-                if (mo_pfn <= spfn)
-                        break;
-
-                t = min(mo_pfn, epfn);
-                deferred_free_pages(spfn, t - spfn);
-
-                if (mo_pfn <= epfn)
-                        break;
-        }
-
         return nr_pages;
 }
 
 static void __init
-deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
-                           void *arg)
+deferred_init_memmap_job(unsigned long start_pfn, unsigned long end_pfn,
+                         void *arg)
 {
-        unsigned long spfn, epfn;
         struct zone *zone = arg;
-        u64 i = 0;
 
-        deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
-
-        /*
-         * Initialize and free pages in MAX_PAGE_ORDER sized increments so that
-         * we can avoid introducing any issues with the buddy allocator.
-         */
-        while (spfn < end_pfn) {
-                deferred_init_maxorder(&i, zone, &spfn, &epfn);
-                cond_resched();
-        }
+        deferred_init_memmap_chunk(start_pfn, end_pfn, zone);
 }
 
 static unsigned int __init
@@ -2164,12 +2115,10 @@ static int __init deferred_init_memmap(void *data)
 {
         pg_data_t *pgdat = data;
         const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
-        unsigned long spfn = 0, epfn = 0;
-        unsigned long first_init_pfn, flags;
+        int max_threads = deferred_page_init_max_threads(cpumask);
+        unsigned long first_init_pfn, last_pfn, flags;
         unsigned long start = jiffies;
         struct zone *zone;
-        int max_threads;
-        u64 i = 0;
 
         /* Bind memory initialisation thread to a local node if possible */
         if (!cpumask_empty(cpumask))
@@ -2197,24 +2146,20 @@ static int __init deferred_init_memmap(void *data)
         /* Only the highest zone is deferred */
         zone = pgdat->node_zones + pgdat->nr_zones - 1;
+        last_pfn = SECTION_ALIGN_UP(zone_end_pfn(zone));
 
-        max_threads = deferred_page_init_max_threads(cpumask);
+        struct padata_mt_job job = {
+                .thread_fn   = deferred_init_memmap_job,
+                .fn_arg      = zone,
+                .start       = first_init_pfn,
+                .size        = last_pfn - first_init_pfn,
+                .align       = PAGES_PER_SECTION,
+                .min_chunk   = PAGES_PER_SECTION,
+                .max_threads = max_threads,
+                .numa_aware  = false,
+        };
 
-        while (deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, first_init_pfn)) {
-                first_init_pfn = ALIGN(epfn, PAGES_PER_SECTION);
-                struct padata_mt_job job = {
-                        .thread_fn   = deferred_init_memmap_chunk,
-                        .fn_arg      = zone,
-                        .start       = spfn,
-                        .size        = first_init_pfn - spfn,
-                        .align       = PAGES_PER_SECTION,
-                        .min_chunk   = PAGES_PER_SECTION,
-                        .max_threads = max_threads,
-                        .numa_aware  = false,
-                };
-
-                padata_do_multithreaded(&job);
-        }
+        padata_do_multithreaded(&job);
 
         /* Sanity check that the next zone really is unpopulated */
         WARN_ON(pgdat->nr_zones < MAX_NR_ZONES && populated_zone(++zone));
@@ -2239,12 +2184,11 @@ static int __init deferred_init_memmap(void *data)
  */
 bool __init deferred_grow_zone(struct zone *zone, unsigned int order)
 {
-        unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
+        unsigned long nr_pages_needed = SECTION_ALIGN_UP(1 << order);
         pg_data_t *pgdat = zone->zone_pgdat;
         unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
         unsigned long spfn, epfn, flags;
         unsigned long nr_pages = 0;
-        u64 i = 0;
 
         /* Only the last zone may have deferred pages */
         if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
@@ -2261,37 +2205,26 @@ bool __init deferred_grow_zone(struct zone *zone, unsigned int order)
                 return true;
         }
 
-        /* If the zone is empty somebody else may have cleared out the zone */
-        if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
-                                                 first_deferred_pfn)) {
-                pgdat->first_deferred_pfn = ULONG_MAX;
-                pgdat_resize_unlock(pgdat, &flags);
-                /* Retry only once. */
-                return first_deferred_pfn != ULONG_MAX;
+        /*
+         * Initialize at least nr_pages_needed in section chunks.
+         * If a section has less free memory than nr_pages_needed, the next
+         * section will be also initialized.
+         * Note, that it still does not guarantee that allocation of order can
+         * be satisfied if the sections are fragmented because of memblock
+         * allocations.
+         */
+        for (spfn = first_deferred_pfn, epfn = SECTION_ALIGN_UP(spfn + 1);
+             nr_pages < nr_pages_needed && spfn < zone_end_pfn(zone);
+             spfn = epfn, epfn += PAGES_PER_SECTION) {
+                nr_pages += deferred_init_memmap_chunk(spfn, epfn, zone);
         }
 
         /*
-         * Initialize and free pages in MAX_PAGE_ORDER sized increments so
-         * that we can avoid introducing any issues with the buddy
-         * allocator.
+         * There were no pages to initialize and free which means the zone's
+         * memory map is completely initialized.
          */
-        while (spfn < epfn) {
-                /* update our first deferred PFN for this section */
-                first_deferred_pfn = spfn;
-
-                nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
-                touch_nmi_watchdog();
-
-                /* We should only stop along section boundaries */
-                if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
-                        continue;
-
-                /* If our quota has been met we can stop here */
-                if (nr_pages >= nr_pages_needed)
-                        break;
-        }
-
-        pgdat->first_deferred_pfn = spfn;
+        pgdat->first_deferred_pfn = nr_pages ? spfn : ULONG_MAX;
         pgdat_resize_unlock(pgdat, &flags);
 
         return nr_pages > 0;
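
The rewritten deferred_grow_zone() above walks the zone one section at a time until at least nr_pages_needed struct pages have been initialized or the zone ends. Below is a minimal userspace model of that stepping and termination logic only; PAGES_PER_SECTION, the PFNs, the per-section page count and the helper names are invented for illustration.

/*
 * Userspace model of the section-stepping loop in deferred_grow_zone().
 * All numbers are made up; only the stepping/termination mirrors the patch.
 */
#include <stdio.h>

#define PAGES_PER_SECTION 32768UL       /* assumed: 128MiB sections of 4KiB pages */

/* Round x up to the next section boundary (hypothetical helper). */
static unsigned long section_align_up(unsigned long x)
{
        return (x + PAGES_PER_SECTION - 1) & ~(PAGES_PER_SECTION - 1);
}

/* Stand-in for deferred_init_memmap_chunk(): pretend each section is half free. */
static unsigned long init_chunk(unsigned long spfn, unsigned long epfn)
{
        return (epfn - spfn) / 2;
}

int main(void)
{
        unsigned long first_deferred_pfn = 100000, zone_end = 1000000;
        unsigned long nr_pages_needed = section_align_up(1 << 10);
        unsigned long nr_pages = 0, spfn, epfn;

        for (spfn = first_deferred_pfn, epfn = section_align_up(spfn + 1);
             nr_pages < nr_pages_needed && spfn < zone_end;
             spfn = epfn, epfn += PAGES_PER_SECTION) {
                nr_pages += init_chunk(spfn, epfn);
                printf("section [%lu, %lu): %lu pages so far\n", spfn, epfn, nr_pages);
        }
        return 0;
}

As the new comment in the hunk notes, reaching the requested page count still does not guarantee that an allocation of the given order can be satisfied if the sections are fragmented by memblock allocations.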