From 0f2620ffc41d117cc28bc053efe2dc837cf748dd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Nov 2025 09:39:42 +0100 Subject: [PATCH 01/12] fault-inject: make enum fault_flags available unconditionally This will allow using should_fail_ex from code without having to make it conditional on CONFIG_FAULT_INJECTION. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251113084022.1255121-2-hch@lst.de Signed-off-by: Vlastimil Babka --- include/linux/fault-inject.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h index 8c829d28dcf3..58fd14c82270 100644 --- a/include/linux/fault-inject.h +++ b/include/linux/fault-inject.h @@ -8,6 +8,10 @@ struct dentry; struct kmem_cache; +enum fault_flags { + FAULT_NOWARN = 1 << 0, +}; + #ifdef CONFIG_FAULT_INJECTION #include @@ -36,10 +40,6 @@ struct fault_attr { struct dentry *dname; }; -enum fault_flags { - FAULT_NOWARN = 1 << 0, -}; - #define FAULT_ATTR_INITIALIZER { \ .interval = 1, \ .times = ATOMIC_INIT(1), \ From e9939cebc0be8dabb1798b357e9dadf6398fa859 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Nov 2025 09:39:43 +0100 Subject: [PATCH 02/12] mm: improve kerneldoc comments for __alloc_pages_bulk Describe the semantincs in more detail, as the filling empty slots in an array scheme is not quite obvious. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251113084022.1255121-3-hch@lst.de Signed-off-by: Vlastimil Babka --- mm/page_alloc.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 600d9e981c23..b3d37169a553 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4982,13 +4982,18 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, * @nr_pages: The number of pages desired in the array * @page_array: Array to store the pages * - * This is a batched version of the page allocator that attempts to - * allocate nr_pages quickly. Pages are added to the page_array. + * This is a batched version of the page allocator that attempts to allocate + * @nr_pages quickly. Pages are added to @page_array. * - * Note that only NULL elements are populated with pages and nr_pages - * is the maximum number of pages that will be stored in the array. + * Note that only the elements in @page_array that were cleared to %NULL on + * entry are populated with newly allocated pages. @nr_pages is the maximum + * number of pages that will be stored in the array. * - * Returns the number of pages in the array. + * Returns the number of pages in @page_array, including ones already + * allocated on entry. This can be less than the number requested in @nr_pages, + * but all empty slots are filled from the beginning. I.e., if all slots in + * @page_array were set to %NULL on entry, the slots from 0 to the return value + * - 1 will be filled. */ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, From 5c829783e5f8dbb7ca6fce50c5c4a33f7c75d0d4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Nov 2025 09:39:44 +0100 Subject: [PATCH 03/12] mempool: improve kerneldoc comments Use proper formatting, use full sentences and reduce some verbosity in function parameter descriptions. 
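For reference, the calling convention that the reworked mempool_alloc() comment below spells out can be summarized with a minimal, purely illustrative slab-backed user (hypothetical names, sketch only, not part of this patch):

#include <linux/mempool.h>
#include <linux/slab.h>

/* hypothetical example object, cache and pool */
struct foo { int id; };
static struct kmem_cache *foo_cache;
static mempool_t *foo_pool;

static int foo_init(void)
{
	foo_cache = KMEM_CACHE(foo, 0);
	if (!foo_cache)
		return -ENOMEM;
	/* guarantee forward progress for at least 16 objects */
	foo_pool = mempool_create_slab_pool(16, foo_cache);
	if (!foo_pool) {
		kmem_cache_destroy(foo_cache);
		return -ENOMEM;
	}
	return 0;
}

static struct foo *foo_alloc(void)
{
	/*
	 * GFP_NOIO includes __GFP_DIRECT_RECLAIM, so this cannot fail;
	 * it may sleep until an element is returned to the pool.
	 */
	return mempool_alloc(foo_pool, GFP_NOIO);
}

static void foo_release(struct foo *p)
{
	mempool_free(p, foo_pool);
}

Freeing through mempool_free() is what replenishes the reserve and wakes up sleeping allocators, which is the behaviour the updated comments describe.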
Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251113084022.1255121-4-hch@lst.de Signed-off-by: Vlastimil Babka --- mm/mempool.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/mm/mempool.c b/mm/mempool.c index 1c38e873e546..1f4701713203 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -372,18 +372,20 @@ int mempool_resize(mempool_t *pool, int new_min_nr) EXPORT_SYMBOL(mempool_resize); /** - * mempool_alloc - allocate an element from a specific memory pool - * @pool: pointer to the memory pool which was allocated via - * mempool_create(). - * @gfp_mask: the usual allocation bitmask. + * mempool_alloc - allocate an element from a memory pool + * @pool: pointer to the memory pool + * @gfp_mask: GFP_* flags. %__GFP_ZERO is not supported. * - * this function only sleeps if the alloc_fn() function sleeps or - * returns NULL. Note that due to preallocation, this function - * *never* fails when called from process contexts. (it might - * fail if called from an IRQ context.) - * Note: using __GFP_ZERO is not supported. + * Allocate an element from @pool. This is done by first calling into the + * alloc_fn supplied at pool initialization time, and dipping into the reserved + * pool when alloc_fn fails to allocate an element. * - * Return: pointer to the allocated element or %NULL on error. + * This function only sleeps if the alloc_fn callback sleeps, or when waiting + * for elements to become available in the pool. + * + * Return: pointer to the allocated element or %NULL when failing to allocate + * an element. Allocation failure can only happen when @gfp_mask does not + * include %__GFP_DIRECT_RECLAIM. */ void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) { @@ -456,11 +458,10 @@ EXPORT_SYMBOL(mempool_alloc_noprof); /** * mempool_alloc_preallocated - allocate an element from preallocated elements - * belonging to a specific memory pool - * @pool: pointer to the memory pool which was allocated via - * mempool_create(). + * belonging to a memory pool + * @pool: pointer to the memory pool * - * This function is similar to mempool_alloc, but it only attempts allocating + * This function is similar to mempool_alloc(), but it only attempts allocating * an element from the preallocated elements. It does not sleep and immediately * returns if no preallocated elements are available. * @@ -492,12 +493,14 @@ void *mempool_alloc_preallocated(mempool_t *pool) EXPORT_SYMBOL(mempool_alloc_preallocated); /** - * mempool_free - return an element to the pool. - * @element: pool element pointer. - * @pool: pointer to the memory pool which was allocated via - * mempool_create(). + * mempool_free - return an element to a mempool + * @element: pointer to element + * @pool: pointer to the memory pool * - * this function only sleeps if the free_fn() function sleeps. + * Returns @element to @pool if it needs replenishing, else frees it using + * the free_fn callback in @pool. + * + * This function only sleeps if the free_fn callback sleeps. */ void mempool_free(void *element, mempool_t *pool) { From b77fc08e393b77883bcb71825cfd49e44da44022 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Nov 2025 09:39:45 +0100 Subject: [PATCH 04/12] mempool: add error injection support Add a call to should_fail_ex that forces mempool to actually allocate from the pool to stress the mempool implementation when enabled through debugfs. 
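For example, with debugfs mounted at /sys/kernel/debug, the knob can be turned on for every allocation attempt through the generic fault-injection attributes (the values below are only one possible configuration):

echo 100 > /sys/kernel/debug/fail_mempool_alloc/probability
echo 1 > /sys/kernel/debug/fail_mempool_alloc/interval
echo -1 > /sys/kernel/debug/fail_mempool_alloc/times

With that in place every mempool_alloc() call is forced to dip into the preallocated elements, which is exactly the path this change wants to exercise.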
By default should_fail{,_ex} prints a very verbose stack trace that clutters the kernel log, slows down execution and triggers the kernel bug detection in xfstests. Pass FAULT_NOWARN and print a single-line message notating the caller instead so that full tests can be run with fault injection. Signed-off-by: Christoph Hellwig Acked-by: Vlastimil Babka Link: https://patch.msgid.link/20251113084022.1255121-5-hch@lst.de Signed-off-by: Vlastimil Babka --- mm/mempool.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/mm/mempool.c b/mm/mempool.c index 1f4701713203..5cf59779cc3d 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -9,7 +9,7 @@ * started by Ingo Molnar, Copyright (C) 2001 * debugging by David Rientjes, Copyright (C) 2015 */ - +#include #include #include #include @@ -20,6 +20,15 @@ #include #include "slab.h" +static DECLARE_FAULT_ATTR(fail_mempool_alloc); + +static int __init mempool_faul_inject_init(void) +{ + return PTR_ERR_OR_ZERO(fault_create_debugfs_attr("fail_mempool_alloc", + NULL, &fail_mempool_alloc)); +} +late_initcall(mempool_faul_inject_init); + #ifdef CONFIG_SLUB_DEBUG_ON static void poison_error(mempool_t *pool, void *element, size_t size, size_t byte) @@ -404,9 +413,15 @@ void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO); repeat_alloc: + if (should_fail_ex(&fail_mempool_alloc, 1, FAULT_NOWARN)) { + pr_info("forcing mempool usage for %pS\n", + (void *)_RET_IP_); + element = NULL; + } else { + element = pool->alloc(gfp_temp, pool->pool_data); + } - element = pool->alloc(gfp_temp, pool->pool_data); - if (likely(element != NULL)) + if (likely(element)) return element; spin_lock_irqsave(&pool->lock, flags); From 3d2492401d3cdb8e9e1276c3af5f1cd0c8a2b076 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Nov 2025 09:39:46 +0100 Subject: [PATCH 05/12] mempool: factor out a mempool_adjust_gfp helper Add a helper to better isolate and document the gfp flags adjustments. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251113084022.1255121-6-hch@lst.de Signed-off-by: Vlastimil Babka --- mm/mempool.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/mm/mempool.c b/mm/mempool.c index 5cf59779cc3d..a0718a35c34f 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -380,6 +380,19 @@ int mempool_resize(mempool_t *pool, int new_min_nr) } EXPORT_SYMBOL(mempool_resize); +/* + * Adjust the gfp flags for mempool allocations, as we never want to dip into + * the global emergency reserves or retry in the page allocator. + * + * The first pass also doesn't want to go reclaim, but the next passes do, so + * return a separate subset for that first iteration. 
+ */ +static inline gfp_t mempool_adjust_gfp(gfp_t *gfp_mask) +{ + *gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; + return *gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO); +} + /** * mempool_alloc - allocate an element from a memory pool * @pool: pointer to the memory pool @@ -398,20 +411,14 @@ EXPORT_SYMBOL(mempool_resize); */ void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) { + gfp_t gfp_temp = mempool_adjust_gfp(&gfp_mask); void *element; unsigned long flags; wait_queue_entry_t wait; - gfp_t gfp_temp; VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); might_alloc(gfp_mask); - gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ - gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ - gfp_mask |= __GFP_NOWARN; /* failures are OK */ - - gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO); - repeat_alloc: if (should_fail_ex(&fail_mempool_alloc, 1, FAULT_NOWARN)) { pr_info("forcing mempool usage for %pS\n", From 1742d97df628de55c0df1a0eb6eefb27136ee890 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Nov 2025 09:39:47 +0100 Subject: [PATCH 06/12] mempool: factor out a mempool_alloc_from_pool helper Add a helper for the mempool_alloc slowpath to better separate it from the fast path, and also use it to implement mempool_alloc_preallocated which shares the same logic. [hughd@google.com: fix lack of retrying with __GFP_DIRECT_RECLAIM] [vbabka@suse.cz: really use limited flags for first mempool attempt] Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251113084022.1255121-7-hch@lst.de Signed-off-by: Vlastimil Babka --- mm/mempool.c | 126 +++++++++++++++++++++++++-------------------------- 1 file changed, 62 insertions(+), 64 deletions(-) diff --git a/mm/mempool.c b/mm/mempool.c index a0718a35c34f..6bcc319d547d 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -380,6 +380,50 @@ int mempool_resize(mempool_t *pool, int new_min_nr) } EXPORT_SYMBOL(mempool_resize); +static void *mempool_alloc_from_pool(struct mempool *pool, gfp_t gfp_mask) +{ + unsigned long flags; + void *element; + + spin_lock_irqsave(&pool->lock, flags); + if (unlikely(!pool->curr_nr)) + goto fail; + element = remove_element(pool); + spin_unlock_irqrestore(&pool->lock, flags); + + /* Paired with rmb in mempool_free(), read comment there. */ + smp_wmb(); + + /* + * Update the allocation stack trace as this is more useful for + * debugging. + */ + kmemleak_update_trace(element); + return element; + +fail: + if (gfp_mask & __GFP_DIRECT_RECLAIM) { + DEFINE_WAIT(wait); + + prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock_irqrestore(&pool->lock, flags); + + /* + * Wait for someone else to return an element to @pool. + * + * FIXME: this should be io_schedule(). The timeout is there as + * a workaround for some DM problems in 2.6.18. + */ + io_schedule_timeout(5 * HZ); + finish_wait(&pool->wait, &wait); + } else { + /* We must not sleep if __GFP_DIRECT_RECLAIM is not set. */ + spin_unlock_irqrestore(&pool->lock, flags); + } + + return NULL; +} + /* * Adjust the gfp flags for mempool allocations, as we never want to dip into * the global emergency reserves or retry in the page allocator. 
@@ -413,8 +457,6 @@ void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) { gfp_t gfp_temp = mempool_adjust_gfp(&gfp_mask); void *element; - unsigned long flags; - wait_queue_entry_t wait; VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); might_alloc(gfp_mask); @@ -428,53 +470,27 @@ void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) element = pool->alloc(gfp_temp, pool->pool_data); } - if (likely(element)) - return element; - - spin_lock_irqsave(&pool->lock, flags); - if (likely(pool->curr_nr)) { - element = remove_element(pool); - spin_unlock_irqrestore(&pool->lock, flags); - /* paired with rmb in mempool_free(), read comment there */ - smp_wmb(); + if (unlikely(!element)) { /* - * Update the allocation stack trace as this is more useful - * for debugging. + * Try to allocate an element from the pool. + * + * The first pass won't have __GFP_DIRECT_RECLAIM and won't + * sleep in mempool_alloc_from_pool. Retry the allocation + * with all flags set in that case. */ - kmemleak_update_trace(element); - return element; + element = mempool_alloc_from_pool(pool, gfp_temp); + if (!element) { + if (gfp_temp != gfp_mask) { + gfp_temp = gfp_mask; + goto repeat_alloc; + } + if (gfp_mask & __GFP_DIRECT_RECLAIM) { + goto repeat_alloc; + } + } } - /* - * We use gfp mask w/o direct reclaim or IO for the first round. If - * alloc failed with that and @pool was empty, retry immediately. - */ - if (gfp_temp != gfp_mask) { - spin_unlock_irqrestore(&pool->lock, flags); - gfp_temp = gfp_mask; - goto repeat_alloc; - } - - /* We must not sleep if !__GFP_DIRECT_RECLAIM */ - if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { - spin_unlock_irqrestore(&pool->lock, flags); - return NULL; - } - - /* Let's wait for someone else to return an element to @pool */ - init_wait(&wait); - prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); - - spin_unlock_irqrestore(&pool->lock, flags); - - /* - * FIXME: this should be io_schedule(). The timeout is there as a - * workaround for some DM problems in 2.6.18. - */ - io_schedule_timeout(5*HZ); - - finish_wait(&pool->wait, &wait); - goto repeat_alloc; + return element; } EXPORT_SYMBOL(mempool_alloc_noprof); @@ -492,25 +508,7 @@ EXPORT_SYMBOL(mempool_alloc_noprof); */ void *mempool_alloc_preallocated(mempool_t *pool) { - void *element; - unsigned long flags; - - spin_lock_irqsave(&pool->lock, flags); - if (likely(pool->curr_nr)) { - element = remove_element(pool); - spin_unlock_irqrestore(&pool->lock, flags); - /* paired with rmb in mempool_free(), read comment there */ - smp_wmb(); - /* - * Update the allocation stack trace as this is more useful - * for debugging. - */ - kmemleak_update_trace(element); - return element; - } - spin_unlock_irqrestore(&pool->lock, flags); - - return NULL; + return mempool_alloc_from_pool(pool, GFP_NOWAIT); } EXPORT_SYMBOL(mempool_alloc_preallocated); From ac529d86ad26d632d3c70b7c5b839282a3294d2f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Nov 2025 09:39:48 +0100 Subject: [PATCH 07/12] mempool: add mempool_{alloc,free}_bulk Add a version of the mempool allocator that works for batch allocations of multiple objects. Calling mempool_alloc in a loop is not safe because it could deadlock if multiple threads are performing such an allocation at the same time. As an extra benefit the interface is built so that the same array can be used for alloc_pages_bulk / release_pages so that at least for page backed mempools the fast path can use a nice batch optimization.
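A minimal sketch of the intended calling pattern (hypothetical caller and element count; see the kerneldoc added below for the exact semantics):

#include <linux/mempool.h>

#define NR_ELEMS 8	/* must not exceed the pool's min_nr */

static void process_batch(struct mempool *pool)
{
	void *elems[NR_ELEMS] = { };	/* NULL slots get filled */
	unsigned int freed, i;

	/*
	 * Fill every NULL slot in elems[]; this may sleep but does not
	 * fail, so the return value (always 0) can be ignored.
	 */
	mempool_alloc_bulk(pool, elems, NR_ELEMS, 0);

	/* ... use all NR_ELEMS elements ... */

	/*
	 * Hand back as many elements as the pool needs to refill its
	 * reserve; the rest are released through the pool's free
	 * callback, mirroring what mempool_free() does for a single
	 * element.
	 */
	freed = mempool_free_bulk(pool, elems, NR_ELEMS);
	for (i = freed; i < NR_ELEMS; i++)
		pool->free(elems[i], pool->pool_data);
}

Because all missing elements are claimed from the reserve in one go, two threads running this concurrently cannot interleave partial allocations and deadlock the way looped mempool_alloc() calls could.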
Note that mempool_alloc_bulk does not take a gfp_mask argument as it must always be able to sleep and doesn't support any non-trivial modifiers. NOFS or NOIO constraints must be set through the scoped API. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251113084022.1255121-8-hch@lst.de Signed-off-by: Vlastimil Babka --- include/linux/mempool.h | 6 ++ mm/mempool.c | 177 ++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 141 insertions(+), 42 deletions(-) diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 34941a4b9026..e914fec0e119 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -66,9 +66,15 @@ extern void mempool_destroy(mempool_t *pool); extern void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) __malloc; #define mempool_alloc(...) \ alloc_hooks(mempool_alloc_noprof(__VA_ARGS__)) +int mempool_alloc_bulk_noprof(struct mempool *pool, void **elem, + unsigned int count, unsigned int allocated); +#define mempool_alloc_bulk(...) \ + alloc_hooks(mempool_alloc_bulk_noprof(__VA_ARGS__)) extern void *mempool_alloc_preallocated(mempool_t *pool) __malloc; extern void mempool_free(void *element, mempool_t *pool); +unsigned int mempool_free_bulk(struct mempool *pool, void **elem, + unsigned int count); /* * A mempool_alloc_t and mempool_free_t that get the memory from diff --git a/mm/mempool.c b/mm/mempool.c index 6bcc319d547d..b45bcf415147 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -21,11 +21,21 @@ #include "slab.h" static DECLARE_FAULT_ATTR(fail_mempool_alloc); +static DECLARE_FAULT_ATTR(fail_mempool_alloc_bulk); static int __init mempool_faul_inject_init(void) { - return PTR_ERR_OR_ZERO(fault_create_debugfs_attr("fail_mempool_alloc", + int error; + + error = PTR_ERR_OR_ZERO(fault_create_debugfs_attr("fail_mempool_alloc", NULL, &fail_mempool_alloc)); + if (error) + return error; + + /* booting will fail on error return here, don't bother to cleanup */ + return PTR_ERR_OR_ZERO( + fault_create_debugfs_attr("fail_mempool_alloc_bulk", NULL, + &fail_mempool_alloc_bulk)); } late_initcall(mempool_faul_inject_init); @@ -380,15 +390,22 @@ int mempool_resize(mempool_t *pool, int new_min_nr) } EXPORT_SYMBOL(mempool_resize); -static void *mempool_alloc_from_pool(struct mempool *pool, gfp_t gfp_mask) +static unsigned int mempool_alloc_from_pool(struct mempool *pool, void **elems, + unsigned int count, unsigned int allocated, + gfp_t gfp_mask) { unsigned long flags; - void *element; + unsigned int i; spin_lock_irqsave(&pool->lock, flags); - if (unlikely(!pool->curr_nr)) + if (unlikely(pool->curr_nr < count - allocated)) goto fail; - element = remove_element(pool); + for (i = 0; i < count; i++) { + if (!elems[i]) { + elems[i] = remove_element(pool); + allocated++; + } + } spin_unlock_irqrestore(&pool->lock, flags); /* Paired with rmb in mempool_free(), read comment there. */ smp_wmb(); @@ -398,8 +415,9 @@ static void *mempool_alloc_from_pool(struct mempool *pool, gfp_t gfp_mask) * Update the allocation stack trace as this is more useful for * debugging.
*/ - kmemleak_update_trace(element); - return element; + for (i = 0; i < count; i++) + kmemleak_update_trace(elems[i]); + return allocated; fail: if (gfp_mask & __GFP_DIRECT_RECLAIM) { @@ -421,7 +439,7 @@ static void *mempool_alloc_from_pool(struct mempool *pool, gfp_t gfp_mask) spin_unlock_irqrestore(&pool->lock, flags); } - return NULL; + return allocated; } /* @@ -437,6 +455,65 @@ static inline gfp_t mempool_adjust_gfp(gfp_t *gfp_mask) return *gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO); } +/** + * mempool_alloc_bulk - allocate multiple elements from a memory pool + * @pool: pointer to the memory pool + * @elems: partially or fully populated elements array + * @count: number of entries in @elem that need to be allocated + * @allocated: number of entries in @elem already allocated + * + * Allocate elements for each slot in @elem that is non-%NULL. This is done by + * first calling into the alloc_fn supplied at pool initialization time, and + * dipping into the reserved pool when alloc_fn fails to allocate an element. + * + * On return all @count elements in @elems will be populated. + * + * Return: Always 0. If it wasn't for %$#^$ alloc tags, it would return void. + */ +int mempool_alloc_bulk_noprof(struct mempool *pool, void **elems, + unsigned int count, unsigned int allocated) +{ + gfp_t gfp_mask = GFP_KERNEL; + gfp_t gfp_temp = mempool_adjust_gfp(&gfp_mask); + unsigned int i = 0; + + VM_WARN_ON_ONCE(count > pool->min_nr); + might_alloc(gfp_mask); + + /* + * If an error is injected, fail all elements in a bulk allocation so + * that we stress the multiple elements missing path. + */ + if (should_fail_ex(&fail_mempool_alloc_bulk, 1, FAULT_NOWARN)) { + pr_info("forcing mempool usage for %pS\n", + (void *)_RET_IP_); + goto use_pool; + } + +repeat_alloc: + /* + * Try to allocate the elements using the allocation callback first as + * that might succeed even when the caller's bulk allocation did not. + */ + for (i = 0; i < count; i++) { + if (elems[i]) + continue; + elems[i] = pool->alloc(gfp_temp, pool->pool_data); + if (unlikely(!elems[i])) + goto use_pool; + allocated++; + } + + return 0; + +use_pool: + allocated = mempool_alloc_from_pool(pool, elems, count, allocated, + gfp_temp); + gfp_temp = gfp_mask; + goto repeat_alloc; +} +EXPORT_SYMBOL_GPL(mempool_alloc_bulk_noprof); + /** * mempool_alloc - allocate an element from a memory pool * @pool: pointer to the memory pool @@ -478,8 +555,7 @@ void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) * sleep in mempool_alloc_from_pool. Retry the allocation * with all flags set in that case. */ - element = mempool_alloc_from_pool(pool, gfp_temp); - if (!element) { + if (!mempool_alloc_from_pool(pool, &element, 1, 0, gfp_temp)) { if (gfp_temp != gfp_mask) { gfp_temp = gfp_mask; goto repeat_alloc; @@ -508,26 +584,33 @@ EXPORT_SYMBOL(mempool_alloc_noprof); */ void *mempool_alloc_preallocated(mempool_t *pool) { - return mempool_alloc_from_pool(pool, GFP_NOWAIT); + void *element = NULL; + + mempool_alloc_from_pool(pool, &element, 1, 0, GFP_NOWAIT); + return element; } EXPORT_SYMBOL(mempool_alloc_preallocated); /** - * mempool_free - return an element to a mempool - * @element: pointer to element + * mempool_free_bulk - return elements to a mempool * @pool: pointer to the memory pool + * @elems: elements to return + * @count: number of elements to return * - * Returns @element to @pool if it needs replenishing, else frees it using - * the free_fn callback in @pool. 
+ * Returns a number of elements from the start of @elem to @pool if @pool needs + * replenishing and sets their slots in @elem to NULL. Other elements are left + * in @elem. * - * This function only sleeps if the free_fn callback sleeps. + * Return: number of elements transferred to @pool. Elements are always + * transferred from the beginning of @elem, so the return value can be used as + * an offset into @elem for the freeing the remaining elements in the caller. */ -void mempool_free(void *element, mempool_t *pool) +unsigned int mempool_free_bulk(struct mempool *pool, void **elems, + unsigned int count) { unsigned long flags; - - if (unlikely(element == NULL)) - return; + unsigned int freed = 0; + bool added = false; /* * Paired with the wmb in mempool_alloc(). The preceding read is @@ -561,21 +644,6 @@ void mempool_free(void *element, mempool_t *pool) * Waiters happen iff curr_nr is 0 and the above guarantee also * ensures that there will be frees which return elements to the * pool waking up the waiters. - */ - if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) { - spin_lock_irqsave(&pool->lock, flags); - if (likely(pool->curr_nr < pool->min_nr)) { - add_element(pool, element); - spin_unlock_irqrestore(&pool->lock, flags); - if (wq_has_sleeper(&pool->wait)) - wake_up(&pool->wait); - return; - } - spin_unlock_irqrestore(&pool->lock, flags); - } - - /* - * Handle the min_nr = 0 edge case: * * For zero-minimum pools, curr_nr < min_nr (0 < 0) never succeeds, * so waiters sleeping on pool->wait would never be woken by the @@ -583,20 +651,45 @@ void mempool_free(void *element, mempool_t *pool) * allocation of element when both min_nr and curr_nr are 0, and * any active waiters are properly awakened. */ - if (unlikely(pool->min_nr == 0 && + if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) { + spin_lock_irqsave(&pool->lock, flags); + while (pool->curr_nr < pool->min_nr && freed < count) { + add_element(pool, elems[freed++]); + added = true; + } + spin_unlock_irqrestore(&pool->lock, flags); + } else if (unlikely(pool->min_nr == 0 && READ_ONCE(pool->curr_nr) == 0)) { + /* Handle the min_nr = 0 edge case: */ spin_lock_irqsave(&pool->lock, flags); if (likely(pool->curr_nr == 0)) { - add_element(pool, element); - spin_unlock_irqrestore(&pool->lock, flags); - if (wq_has_sleeper(&pool->wait)) - wake_up(&pool->wait); - return; + add_element(pool, elems[freed++]); + added = true; } spin_unlock_irqrestore(&pool->lock, flags); } - pool->free(element, pool->pool_data); + if (unlikely(added) && wq_has_sleeper(&pool->wait)) + wake_up(&pool->wait); + + return freed; +} +EXPORT_SYMBOL_GPL(mempool_free_bulk); + +/** + * mempool_free - return an element to the pool. + * @element: element to return + * @pool: pointer to the memory pool + * + * Returns @element to @pool if it needs replenishing, else frees it using + * the free_fn callback in @pool. + * + * This function only sleeps if the free_fn callback sleeps. + */ +void mempool_free(void *element, struct mempool *pool) +{ + if (likely(element) && !mempool_free_bulk(pool, &element, 1)) + pool->free(element, pool->pool_data); } EXPORT_SYMBOL(mempool_free); From 9c4391767f31d4114da577ab87437f28c1171d6d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Nov 2025 09:39:49 +0100 Subject: [PATCH 08/12] mempool: legitimize the io_schedule_timeout in mempool_alloc_from_pool The timeout here is and old workaround with a Fixme comment. But thinking about it, it makes sense to keep it, so reword the comment. 
Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251113084022.1255121-9-hch@lst.de Signed-off-by: Vlastimil Babka --- mm/mempool.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/mempool.c b/mm/mempool.c index b45bcf415147..9ec3a04a0130 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -427,10 +427,10 @@ static unsigned int mempool_alloc_from_pool(struct mempool *pool, void **elems, spin_unlock_irqrestore(&pool->lock, flags); /* - * Wait for someone else to return an element to @pool. - * - * FIXME: this should be io_schedule(). The timeout is there as - * a workaround for some DM problems in 2.6.18. + * Wait for someone else to return an element to @pool, but wake + * up occasionally as memory pressure might have reduced even + * and the normal allocation in alloc_fn could succeed even if + * no element was returned. */ io_schedule_timeout(5 * HZ); finish_wait(&pool->wait, &wait); From 8b41fb80a2cc023591f47d63b094e96af9c2c615 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Nov 2025 09:39:50 +0100 Subject: [PATCH 09/12] mempool: remove mempool_{init,create}_kvmalloc_pool This was added for bcachefs and is unused now. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251113084022.1255121-10-hch@lst.de Signed-off-by: Vlastimil Babka --- include/linux/mempool.h | 13 ------------- mm/mempool.c | 13 ------------- 2 files changed, 26 deletions(-) diff --git a/include/linux/mempool.h b/include/linux/mempool.h index e914fec0e119..d9332485e8ca 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -103,19 +103,6 @@ void mempool_kfree(void *element, void *pool_data); mempool_create((_min_nr), mempool_kmalloc, mempool_kfree, \ (void *)(unsigned long)(_size)) -void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data); -void mempool_kvfree(void *element, void *pool_data); - -static inline int mempool_init_kvmalloc_pool(mempool_t *pool, int min_nr, size_t size) -{ - return mempool_init(pool, min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size); -} - -static inline mempool_t *mempool_create_kvmalloc_pool(int min_nr, size_t size) -{ - return mempool_create(min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size); -} - /* * A mempool_alloc_t and mempool_free_t for a simple page allocator that * allocates pages of the order specified by pool_data diff --git a/mm/mempool.c b/mm/mempool.c index 9ec3a04a0130..0e1e015998e7 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -728,19 +728,6 @@ void mempool_kfree(void *element, void *pool_data) } EXPORT_SYMBOL(mempool_kfree); -void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data) -{ - size_t size = (size_t)pool_data; - return kvmalloc(size, gfp_mask); -} -EXPORT_SYMBOL(mempool_kvmalloc); - -void mempool_kvfree(void *element, void *pool_data) -{ - kvfree(element); -} -EXPORT_SYMBOL(mempool_kvfree); - /* * A simple mempool-backed page allocator that allocates pages * of the order specified by pool_data. From 0cab6873b7305abdd0acd95ee8cfa56b983500da Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Nov 2025 09:39:51 +0100 Subject: [PATCH 10/12] mempool: de-typedef Switch all uses of the deprecated mempool_t typedef in the core mempool code to use struct mempool instead. 
Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251113084022.1255121-11-hch@lst.de Signed-off-by: Vlastimil Babka --- include/linux/mempool.h | 39 ++++++++++++++++---------------- mm/mempool.c | 50 +++++++++++++++++++++-------------------- 2 files changed, 45 insertions(+), 44 deletions(-) diff --git a/include/linux/mempool.h b/include/linux/mempool.h index d9332485e8ca..e8e440e04a06 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -27,32 +27,31 @@ typedef struct mempool { wait_queue_head_t wait; } mempool_t; -static inline bool mempool_initialized(mempool_t *pool) +static inline bool mempool_initialized(struct mempool *pool) { return pool->elements != NULL; } -static inline bool mempool_is_saturated(mempool_t *pool) +static inline bool mempool_is_saturated(struct mempool *pool) { return READ_ONCE(pool->curr_nr) >= pool->min_nr; } -void mempool_exit(mempool_t *pool); -int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data, - gfp_t gfp_mask, int node_id); - -int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data); +void mempool_exit(struct mempool *pool); +int mempool_init_node(struct mempool *pool, int min_nr, + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, + void *pool_data, gfp_t gfp_mask, int node_id); +int mempool_init_noprof(struct mempool *pool, int min_nr, + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, + void *pool_data); #define mempool_init(...) \ alloc_hooks(mempool_init_noprof(__VA_ARGS__)) -extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data); - -extern mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data, - gfp_t gfp_mask, int nid); +struct mempool *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data); +struct mempool *mempool_create_node_noprof(int min_nr, + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, + void *pool_data, gfp_t gfp_mask, int nid); #define mempool_create_node(...) \ alloc_hooks(mempool_create_node_noprof(__VA_ARGS__)) @@ -60,10 +59,10 @@ extern mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_ mempool_create_node(_min_nr, _alloc_fn, _free_fn, _pool_data, \ GFP_KERNEL, NUMA_NO_NODE) -extern int mempool_resize(mempool_t *pool, int new_min_nr); -extern void mempool_destroy(mempool_t *pool); +int mempool_resize(struct mempool *pool, int new_min_nr); +void mempool_destroy(struct mempool *pool); -extern void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) __malloc; +void *mempool_alloc_noprof(struct mempool *pool, gfp_t gfp_mask) __malloc; #define mempool_alloc(...) \ alloc_hooks(mempool_alloc_noprof(__VA_ARGS__)) int mempool_alloc_bulk_noprof(struct mempool *pool, void **elem, @@ -71,8 +70,8 @@ int mempool_alloc_bulk_noprof(struct mempool *pool, void **elem, #define mempool_alloc_bulk(...) 
\ alloc_hooks(mempool_alloc_bulk_noprof(__VA_ARGS__)) -extern void *mempool_alloc_preallocated(mempool_t *pool) __malloc; -extern void mempool_free(void *element, mempool_t *pool); +void *mempool_alloc_preallocated(struct mempool *pool) __malloc; +void mempool_free(void *element, struct mempool *pool); unsigned int mempool_free_bulk(struct mempool *pool, void **elem, unsigned int count); diff --git a/mm/mempool.c b/mm/mempool.c index 0e1e015998e7..89ab7bba5c9c 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -40,7 +40,7 @@ static int __init mempool_faul_inject_init(void) late_initcall(mempool_faul_inject_init); #ifdef CONFIG_SLUB_DEBUG_ON -static void poison_error(mempool_t *pool, void *element, size_t size, +static void poison_error(struct mempool *pool, void *element, size_t size, size_t byte) { const int nr = pool->curr_nr; @@ -57,7 +57,7 @@ static void poison_error(mempool_t *pool, void *element, size_t size, dump_stack(); } -static void __check_element(mempool_t *pool, void *element, size_t size) +static void __check_element(struct mempool *pool, void *element, size_t size) { u8 *obj = element; size_t i; @@ -73,7 +73,7 @@ static void __check_element(mempool_t *pool, void *element, size_t size) memset(obj, POISON_INUSE, size); } -static void check_element(mempool_t *pool, void *element) +static void check_element(struct mempool *pool, void *element) { /* Skip checking: KASAN might save its metadata in the element. */ if (kasan_enabled()) @@ -102,7 +102,7 @@ static void __poison_element(void *element, size_t size) obj[size - 1] = POISON_END; } -static void poison_element(mempool_t *pool, void *element) +static void poison_element(struct mempool *pool, void *element) { /* Skip poisoning: KASAN might save its metadata in the element. */ if (kasan_enabled()) @@ -123,15 +123,16 @@ static void poison_element(mempool_t *pool, void *element) } } #else /* CONFIG_SLUB_DEBUG_ON */ -static inline void check_element(mempool_t *pool, void *element) +static inline void check_element(struct mempool *pool, void *element) { } -static inline void poison_element(mempool_t *pool, void *element) +static inline void poison_element(struct mempool *pool, void *element) { } #endif /* CONFIG_SLUB_DEBUG_ON */ -static __always_inline bool kasan_poison_element(mempool_t *pool, void *element) +static __always_inline bool kasan_poison_element(struct mempool *pool, + void *element) { if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) return kasan_mempool_poison_object(element); @@ -141,7 +142,7 @@ static __always_inline bool kasan_poison_element(mempool_t *pool, void *element) return true; } -static void kasan_unpoison_element(mempool_t *pool, void *element) +static void kasan_unpoison_element(struct mempool *pool, void *element) { if (pool->alloc == mempool_kmalloc) kasan_mempool_unpoison_object(element, (size_t)pool->pool_data); @@ -153,7 +154,7 @@ static void kasan_unpoison_element(mempool_t *pool, void *element) (unsigned long)pool->pool_data); } -static __always_inline void add_element(mempool_t *pool, void *element) +static __always_inline void add_element(struct mempool *pool, void *element) { BUG_ON(pool->min_nr != 0 && pool->curr_nr >= pool->min_nr); poison_element(pool, element); @@ -161,7 +162,7 @@ static __always_inline void add_element(mempool_t *pool, void *element) pool->elements[pool->curr_nr++] = element; } -static void *remove_element(mempool_t *pool) +static void *remove_element(struct mempool *pool) { void *element = pool->elements[--pool->curr_nr]; @@ -182,7 +183,7 @@ static 
void *remove_element(mempool_t *pool) * May be called on a zeroed but uninitialized mempool (i.e. allocated with * kzalloc()). */ -void mempool_exit(mempool_t *pool) +void mempool_exit(struct mempool *pool) { while (pool->curr_nr) { void *element = remove_element(pool); @@ -201,7 +202,7 @@ EXPORT_SYMBOL(mempool_exit); * Free all reserved elements in @pool and @pool itself. This function * only sleeps if the free_fn() function sleeps. */ -void mempool_destroy(mempool_t *pool) +void mempool_destroy(struct mempool *pool) { if (unlikely(!pool)) return; @@ -211,9 +212,9 @@ void mempool_destroy(mempool_t *pool) } EXPORT_SYMBOL(mempool_destroy); -int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data, - gfp_t gfp_mask, int node_id) +int mempool_init_node(struct mempool *pool, int min_nr, + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, + void *pool_data, gfp_t gfp_mask, int node_id) { spin_lock_init(&pool->lock); pool->min_nr = min_nr; @@ -263,8 +264,9 @@ EXPORT_SYMBOL(mempool_init_node); * * Return: %0 on success, negative error code otherwise. */ -int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data) +int mempool_init_noprof(struct mempool *pool, int min_nr, + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, + void *pool_data) { return mempool_init_node(pool, min_nr, alloc_fn, free_fn, pool_data, GFP_KERNEL, NUMA_NO_NODE); @@ -290,11 +292,11 @@ EXPORT_SYMBOL(mempool_init_noprof); * * Return: pointer to the created memory pool object or %NULL on error. */ -mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data, - gfp_t gfp_mask, int node_id) +struct mempool *mempool_create_node_noprof(int min_nr, + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, + void *pool_data, gfp_t gfp_mask, int node_id) { - mempool_t *pool; + struct mempool *pool; pool = kmalloc_node_noprof(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id); if (!pool) @@ -328,7 +330,7 @@ EXPORT_SYMBOL(mempool_create_node_noprof); * * Return: %0 on success, negative error code otherwise. */ -int mempool_resize(mempool_t *pool, int new_min_nr) +int mempool_resize(struct mempool *pool, int new_min_nr) { void *element; void **new_elements; @@ -530,7 +532,7 @@ EXPORT_SYMBOL_GPL(mempool_alloc_bulk_noprof); * an element. Allocation failure can only happen when @gfp_mask does not * include %__GFP_DIRECT_RECLAIM. */ -void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) +void *mempool_alloc_noprof(struct mempool *pool, gfp_t gfp_mask) { gfp_t gfp_temp = mempool_adjust_gfp(&gfp_mask); void *element; @@ -582,7 +584,7 @@ EXPORT_SYMBOL(mempool_alloc_noprof); * Return: pointer to the allocated element or %NULL if no elements are * available. */ -void *mempool_alloc_preallocated(mempool_t *pool) +void *mempool_alloc_preallocated(struct mempool *pool) { void *element = NULL; From 07723a41eee9525a90d027f7ca49d33fcd47e775 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Nov 2025 09:39:52 +0100 Subject: [PATCH 11/12] mempool: drop the file name in the top of file comment Mentioning the name of the file is redundant, so drop it. 
Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251113084022.1255121-12-hch@lst.de Signed-off-by: Vlastimil Babka --- mm/mempool.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/mempool.c b/mm/mempool.c index 89ab7bba5c9c..efb383a94a28 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 /* - * linux/mm/mempool.c - * * memory buffer pool support. Such pools are mostly used * for guaranteed, deadlock-free memory allocations during * extreme VM load. From 48233291461b0539d798d00aaacccf1b3b163102 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 14 Oct 2025 14:17:23 +0200 Subject: [PATCH 12/12] mempool: clarify behavior of mempool_alloc_preallocated() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The documentation of that function promises to never sleep. However on PREEMPT_RT a spinlock_t might in fact sleep. Reword the documentation so users can predict its behavior better. mempool could also replace spinlock_t with raw_spinlock_t which doesn't sleep even on PREEMPT_RT but that would take away the improved preemptibility of sleeping locks. Link: https://lkml.kernel.org/r/20251014-mempool-doc-v1-1-bc9ebf169700@linutronix.de Signed-off-by: Thomas Weißschuh Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Roman Gushchin Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Cc: "Vishal Moola (Oracle)" Signed-off-by: Andrew Morton Signed-off-by: Vlastimil Babka --- mm/mempool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/mempool.c b/mm/mempool.c index efb383a94a28..bb596cac57ff 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -576,8 +576,8 @@ EXPORT_SYMBOL(mempool_alloc_noprof); * @pool: pointer to the memory pool * * This function is similar to mempool_alloc(), but it only attempts allocating - * an element from the preallocated elements. It does not sleep and immediately - * returns if no preallocated elements are available. + * an element from the preallocated elements. It only takes a single spinlock_t + * and immediately returns if no preallocated elements are available. * * Return: pointer to the allocated element or %NULL if no elements are * available.
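To illustrate the distinction the reworded comment draws, a hypothetical caller could use the preallocated variant on a fast path and fall back to the full, sleeping path elsewhere (sketch only; the helper name is made up, and on PREEMPT_RT even the spinlock_t mentioned above may sleep):

#include <linux/mempool.h>

static void *get_element(mempool_t *pool)
{
	void *elem;

	/* Only consumes the reserve: no alloc_fn call, no reclaim. */
	elem = mempool_alloc_preallocated(pool);
	if (elem)
		return elem;

	/* Full path: may sleep until an element becomes available. */
	return mempool_alloc(pool, GFP_KERNEL);
}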