slab updates for 6.19

-----BEGIN PGP SIGNATURE-----
 
 iQFPBAABCAA5FiEEe7vIQRWZI0iWSE3xu+CwddJFiJoFAmksibgbFIAAAAAABAAO
 bWFudTIsMi41KzEuMTEsMiwyAAoJELvgsHXSRYiavR8H/jTNKlb8jZtre1Q2xIGJ
 PgU8+fc4PGX8C6XuKRgb4KYL+zn3VSnTyxLUc3ObKIRTrGOJOBw3YT8R0LvrMOJx
 Ibx/6o0o+vjnDxmq6QGcuYdytDdL/rL6Gh8PR1dyWAqPz6jGtraP0nCJu7Y9jRZ0
 JHbyRTfpC8I6fTZv/WHocTsUDUu/+M4jQx3kMAMgSSTc7IAF+El5GqhpwEaWv7u/
 6D0px1lXI3rGimzmHeLy+CEjW041MTkxPH3GNzkiZwi2WUwI+ZEteMcs29KHcCOe
 /sdqmlzn2CPxzqG3TkJ4LbJE3XThYkqxe56LmBVJnhHFe+vCF8urEX9UUTtMn1dh
 3zs=
 =iQ4N
 -----END PGP SIGNATURE-----

Merge tag 'slab-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab

Pull slab updates from Vlastimil Babka:

 - mempool_alloc_bulk() support for upcoming users in the block layer
   that need to allocate multiple objects at once with the mempool's
   guaranteed progress semantics, which is not achievable by
   allocating single objects in a loop. Along with refactoring and
   various improvements (Christoph Hellwig)

 - Preparations for the upcoming separation of struct slab from struct
   page, mostly by removing the struct folio layer, as the purpose of
   struct folio has shifted since it became used in slab code (Matthew
   Wilcox)

 - Modernisation of slab's boot param API usage, which removes some
   unexpected parsing corner cases (Petr Tesarik)

 - Refactoring of freelist_aba_t (now struct freelist_counters) and
   associated functions for double cmpxchg, enabled by -fms-extensions
   (Vlastimil Babka)

 - Cleanups and improvements related to sheaves caching layer, that were
   part of the full conversion to sheaves, which is planned for the next
   release (Vlastimil Babka)

* tag 'slab-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab: (42 commits)
  slab: Remove unnecessary call to compound_head() in alloc_from_pcs()
  mempool: clarify behavior of mempool_alloc_preallocated()
  mempool: drop the file name in the top of file comment
  mempool: de-typedef
  mempool: remove mempool_{init,create}_kvmalloc_pool
  mempool: legitimize the io_schedule_timeout in mempool_alloc_from_pool
  mempool: add mempool_{alloc,free}_bulk
  mempool: factor out a mempool_alloc_from_pool helper
  slab: Remove references to folios from virt_to_slab()
  kasan: Remove references to folio in __kasan_mempool_poison_object()
  memcg: Convert mem_cgroup_from_obj_folio() to mem_cgroup_from_obj_slab()
  mempool: factor out a mempool_adjust_gfp helper
  mempool: add error injection support
  mempool: improve kerneldoc comments
  mm: improve kerneldoc comments for __alloc_pages_bulk
  fault-inject: make enum fault_flags available unconditionally
  usercopy: Remove folio references from check_heap_object()
  slab: Remove folio references from kfree_nolock()
  slab: Remove folio references from kfree_rcu_sheaf()
  slab: Remove folio references from build_detached_freelist()
  ...
This commit is contained in:
Linus Torvalds 2025-12-03 11:53:47 -08:00
commit b687034b1a
13 changed files with 767 additions and 688 deletions

View File

@ -8,6 +8,10 @@
struct dentry;
struct kmem_cache;
enum fault_flags {
FAULT_NOWARN = 1 << 0,
};
#ifdef CONFIG_FAULT_INJECTION
#include <linux/atomic.h>
@ -36,10 +40,6 @@ struct fault_attr {
struct dentry *dname;
};
enum fault_flags {
FAULT_NOWARN = 1 << 0,
};
#define FAULT_ATTR_INITIALIZER { \
.interval = 1, \
.times = ATOMIC_INIT(1), \

View File

@ -55,9 +55,7 @@ enum {
#ifdef CONFIG_LOCKDEP
___GFP_NOLOCKDEP_BIT,
#endif
#ifdef CONFIG_SLAB_OBJ_EXT
___GFP_NO_OBJ_EXT_BIT,
#endif
___GFP_LAST_BIT
};
@ -98,11 +96,7 @@ enum {
#else
#define ___GFP_NOLOCKDEP 0
#endif
#ifdef CONFIG_SLAB_OBJ_EXT
#define ___GFP_NO_OBJ_EXT BIT(___GFP_NO_OBJ_EXT_BIT)
#else
#define ___GFP_NO_OBJ_EXT 0
#endif
/*
* Physical address zone modifiers (see linux/mmzone.h - low four bits)

View File

@ -27,32 +27,31 @@ typedef struct mempool {
wait_queue_head_t wait;
} mempool_t;
static inline bool mempool_initialized(mempool_t *pool)
static inline bool mempool_initialized(struct mempool *pool)
{
return pool->elements != NULL;
}
static inline bool mempool_is_saturated(mempool_t *pool)
static inline bool mempool_is_saturated(struct mempool *pool)
{
return READ_ONCE(pool->curr_nr) >= pool->min_nr;
}
void mempool_exit(mempool_t *pool);
int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data,
gfp_t gfp_mask, int node_id);
int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data);
void mempool_exit(struct mempool *pool);
int mempool_init_node(struct mempool *pool, int min_nr,
mempool_alloc_t *alloc_fn, mempool_free_t *free_fn,
void *pool_data, gfp_t gfp_mask, int node_id);
int mempool_init_noprof(struct mempool *pool, int min_nr,
mempool_alloc_t *alloc_fn, mempool_free_t *free_fn,
void *pool_data);
#define mempool_init(...) \
alloc_hooks(mempool_init_noprof(__VA_ARGS__))
extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data);
extern mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data,
gfp_t gfp_mask, int nid);
struct mempool *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data);
struct mempool *mempool_create_node_noprof(int min_nr,
mempool_alloc_t *alloc_fn, mempool_free_t *free_fn,
void *pool_data, gfp_t gfp_mask, int nid);
#define mempool_create_node(...) \
alloc_hooks(mempool_create_node_noprof(__VA_ARGS__))
@ -60,15 +59,21 @@ extern mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_
mempool_create_node(_min_nr, _alloc_fn, _free_fn, _pool_data, \
GFP_KERNEL, NUMA_NO_NODE)
extern int mempool_resize(mempool_t *pool, int new_min_nr);
extern void mempool_destroy(mempool_t *pool);
int mempool_resize(struct mempool *pool, int new_min_nr);
void mempool_destroy(struct mempool *pool);
extern void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) __malloc;
void *mempool_alloc_noprof(struct mempool *pool, gfp_t gfp_mask) __malloc;
#define mempool_alloc(...) \
alloc_hooks(mempool_alloc_noprof(__VA_ARGS__))
int mempool_alloc_bulk_noprof(struct mempool *pool, void **elem,
unsigned int count, unsigned int allocated);
#define mempool_alloc_bulk(...) \
alloc_hooks(mempool_alloc_bulk_noprof(__VA_ARGS__))
extern void *mempool_alloc_preallocated(mempool_t *pool) __malloc;
extern void mempool_free(void *element, mempool_t *pool);
void *mempool_alloc_preallocated(struct mempool *pool) __malloc;
void mempool_free(void *element, struct mempool *pool);
unsigned int mempool_free_bulk(struct mempool *pool, void **elem,
unsigned int count);
/*
* A mempool_alloc_t and mempool_free_t that get the memory from
@ -97,19 +102,6 @@ void mempool_kfree(void *element, void *pool_data);
mempool_create((_min_nr), mempool_kmalloc, mempool_kfree, \
(void *)(unsigned long)(_size))
void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data);
void mempool_kvfree(void *element, void *pool_data);
static inline int mempool_init_kvmalloc_pool(mempool_t *pool, int min_nr, size_t size)
{
return mempool_init(pool, min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size);
}
static inline mempool_t *mempool_create_kvmalloc_pool(int min_nr, size_t size)
{
return mempool_create(min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size);
}
/*
* A mempool_alloc_t and mempool_free_t for a simple page allocator that
* allocates pages of the order specified by pool_data

View File

@ -1048,19 +1048,7 @@ PAGE_TYPE_OPS(Table, table, pgtable)
*/
PAGE_TYPE_OPS(Guard, guard, guard)
FOLIO_TYPE_OPS(slab, slab)
/**
* PageSlab - Determine if the page belongs to the slab allocator
* @page: The page to test.
*
* Context: Any context.
* Return: True for slab pages, false for any other kind of page.
*/
static inline bool PageSlab(const struct page *page)
{
return folio_test_slab(page_folio(page));
}
PAGE_TYPE_OPS(Slab, slab, slab)
#ifdef CONFIG_HUGETLB_PAGE
FOLIO_TYPE_OPS(hugetlb, hugetlb)
@ -1076,7 +1064,7 @@ PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc)
* Serialized with zone lock.
*/
PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted)
FOLIO_TYPE_OPS(large_kmalloc, large_kmalloc)
PAGE_TYPE_OPS(LargeKmalloc, large_kmalloc, large_kmalloc)
/**
* PageHuge - Determine if the page belongs to hugetlbfs

View File

@ -520,24 +520,20 @@ void __kasan_mempool_unpoison_pages(struct page *page, unsigned int order,
bool __kasan_mempool_poison_object(void *ptr, unsigned long ip)
{
struct folio *folio = virt_to_folio(ptr);
struct page *page = virt_to_page(ptr);
struct slab *slab;
/*
* This function can be called for large kmalloc allocation that get
* their memory from page_alloc. Thus, the folio might not be a slab.
*/
if (unlikely(!folio_test_slab(folio))) {
if (unlikely(PageLargeKmalloc(page))) {
if (check_page_allocation(ptr, ip))
return false;
kasan_poison(ptr, folio_size(folio), KASAN_PAGE_FREE, false);
kasan_poison(ptr, page_size(page), KASAN_PAGE_FREE, false);
return true;
}
if (is_kfence_address(ptr))
return true;
slab = folio_slab(folio);
slab = page_slab(page);
if (check_slab_allocation(slab->slab_cache, ptr, ip))
return false;

View File

@ -612,14 +612,15 @@ static unsigned long kfence_init_pool(void)
* enters __slab_free() slow-path.
*/
for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
struct slab *slab;
struct page *page;
if (!i || (i % 2))
continue;
slab = page_slab(pfn_to_page(start_pfn + i));
__folio_set_slab(slab_folio(slab));
page = pfn_to_page(start_pfn + i);
__SetPageSlab(page);
#ifdef CONFIG_MEMCG
struct slab *slab = page_slab(page);
slab->obj_exts = (unsigned long)&kfence_metadata_init[i / 2 - 1].obj_exts |
MEMCG_DATA_OBJEXTS;
#endif
@ -665,16 +666,17 @@ static unsigned long kfence_init_pool(void)
reset_slab:
for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
struct slab *slab;
struct page *page;
if (!i || (i % 2))
continue;
slab = page_slab(pfn_to_page(start_pfn + i));
page = pfn_to_page(start_pfn + i);
#ifdef CONFIG_MEMCG
struct slab *slab = page_slab(page);
slab->obj_exts = 0;
#endif
__folio_clear_slab(slab_folio(slab));
__ClearPageSlab(page);
}
return addr;

View File

@ -2557,38 +2557,25 @@ static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
}
static __always_inline
struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
struct mem_cgroup *mem_cgroup_from_obj_slab(struct slab *slab, void *p)
{
/*
* Slab objects are accounted individually, not per-page.
* Memcg membership data for each individual object is saved in
* slab->obj_exts.
*/
if (folio_test_slab(folio)) {
struct slabobj_ext *obj_exts;
struct slab *slab;
unsigned int off;
slab = folio_slab(folio);
obj_exts = slab_obj_exts(slab);
if (!obj_exts)
return NULL;
off = obj_to_index(slab->slab_cache, slab, p);
if (obj_exts[off].objcg)
return obj_cgroup_memcg(obj_exts[off].objcg);
struct slabobj_ext *obj_exts;
unsigned int off;
obj_exts = slab_obj_exts(slab);
if (!obj_exts)
return NULL;
}
/*
* folio_memcg_check() is used here, because in theory we can encounter
* a folio where the slab flag has been cleared already, but
* slab->obj_exts has not been freed yet
* folio_memcg_check() will guarantee that a proper memory
* cgroup pointer or NULL will be returned.
*/
return folio_memcg_check(folio);
off = obj_to_index(slab->slab_cache, slab, p);
if (obj_exts[off].objcg)
return obj_cgroup_memcg(obj_exts[off].objcg);
return NULL;
}
/*
@ -2602,10 +2589,15 @@ struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
*/
struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
{
struct slab *slab;
if (mem_cgroup_disabled())
return NULL;
return mem_cgroup_from_obj_folio(virt_to_folio(p), p);
slab = virt_to_slab(p);
if (slab)
return mem_cgroup_from_obj_slab(slab, p);
return folio_memcg_check(virt_to_folio(p));
}
static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)

View File

@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
/*
* linux/mm/mempool.c
*
* memory buffer pool support. Such pools are mostly used
* for guaranteed, deadlock-free memory allocations during
* extreme VM load.
@ -9,7 +7,7 @@
* started by Ingo Molnar, Copyright (C) 2001
* debugging by David Rientjes, Copyright (C) 2015
*/
#include <linux/fault-inject.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/highmem.h>
@ -20,8 +18,27 @@
#include <linux/writeback.h>
#include "slab.h"
static DECLARE_FAULT_ATTR(fail_mempool_alloc);
static DECLARE_FAULT_ATTR(fail_mempool_alloc_bulk);
static int __init mempool_faul_inject_init(void)
{
int error;
error = PTR_ERR_OR_ZERO(fault_create_debugfs_attr("fail_mempool_alloc",
NULL, &fail_mempool_alloc));
if (error)
return error;
/* booting will fail on error return here, don't bother to cleanup */
return PTR_ERR_OR_ZERO(
fault_create_debugfs_attr("fail_mempool_alloc_bulk", NULL,
&fail_mempool_alloc_bulk));
}
late_initcall(mempool_faul_inject_init);
#ifdef CONFIG_SLUB_DEBUG_ON
static void poison_error(mempool_t *pool, void *element, size_t size,
static void poison_error(struct mempool *pool, void *element, size_t size,
size_t byte)
{
const int nr = pool->curr_nr;
@ -38,7 +55,7 @@ static void poison_error(mempool_t *pool, void *element, size_t size,
dump_stack();
}
static void __check_element(mempool_t *pool, void *element, size_t size)
static void __check_element(struct mempool *pool, void *element, size_t size)
{
u8 *obj = element;
size_t i;
@ -54,7 +71,7 @@ static void __check_element(mempool_t *pool, void *element, size_t size)
memset(obj, POISON_INUSE, size);
}
static void check_element(mempool_t *pool, void *element)
static void check_element(struct mempool *pool, void *element)
{
/* Skip checking: KASAN might save its metadata in the element. */
if (kasan_enabled())
@ -93,7 +110,7 @@ static void __poison_element(void *element, size_t size)
obj[size - 1] = POISON_END;
}
static void poison_element(mempool_t *pool, void *element)
static void poison_element(struct mempool *pool, void *element)
{
/* Skip poisoning: KASAN might save its metadata in the element. */
if (kasan_enabled())
@ -124,15 +141,16 @@ static void poison_element(mempool_t *pool, void *element)
}
}
#else /* CONFIG_SLUB_DEBUG_ON */
static inline void check_element(mempool_t *pool, void *element)
static inline void check_element(struct mempool *pool, void *element)
{
}
static inline void poison_element(mempool_t *pool, void *element)
static inline void poison_element(struct mempool *pool, void *element)
{
}
#endif /* CONFIG_SLUB_DEBUG_ON */
static __always_inline bool kasan_poison_element(mempool_t *pool, void *element)
static __always_inline bool kasan_poison_element(struct mempool *pool,
void *element)
{
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
return kasan_mempool_poison_object(element);
@ -142,7 +160,7 @@ static __always_inline bool kasan_poison_element(mempool_t *pool, void *element)
return true;
}
static void kasan_unpoison_element(mempool_t *pool, void *element)
static void kasan_unpoison_element(struct mempool *pool, void *element)
{
if (pool->alloc == mempool_kmalloc)
kasan_mempool_unpoison_object(element, (size_t)pool->pool_data);
@ -154,7 +172,7 @@ static void kasan_unpoison_element(mempool_t *pool, void *element)
(unsigned long)pool->pool_data);
}
static __always_inline void add_element(mempool_t *pool, void *element)
static __always_inline void add_element(struct mempool *pool, void *element)
{
BUG_ON(pool->min_nr != 0 && pool->curr_nr >= pool->min_nr);
poison_element(pool, element);
@ -162,7 +180,7 @@ static __always_inline void add_element(mempool_t *pool, void *element)
pool->elements[pool->curr_nr++] = element;
}
static void *remove_element(mempool_t *pool)
static void *remove_element(struct mempool *pool)
{
void *element = pool->elements[--pool->curr_nr];
@ -183,7 +201,7 @@ static void *remove_element(mempool_t *pool)
* May be called on a zeroed but uninitialized mempool (i.e. allocated with
* kzalloc()).
*/
void mempool_exit(mempool_t *pool)
void mempool_exit(struct mempool *pool)
{
while (pool->curr_nr) {
void *element = remove_element(pool);
@ -202,7 +220,7 @@ EXPORT_SYMBOL(mempool_exit);
* Free all reserved elements in @pool and @pool itself. This function
* only sleeps if the free_fn() function sleeps.
*/
void mempool_destroy(mempool_t *pool)
void mempool_destroy(struct mempool *pool)
{
if (unlikely(!pool))
return;
@ -212,9 +230,9 @@ void mempool_destroy(mempool_t *pool)
}
EXPORT_SYMBOL(mempool_destroy);
int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data,
gfp_t gfp_mask, int node_id)
int mempool_init_node(struct mempool *pool, int min_nr,
mempool_alloc_t *alloc_fn, mempool_free_t *free_fn,
void *pool_data, gfp_t gfp_mask, int node_id)
{
spin_lock_init(&pool->lock);
pool->min_nr = min_nr;
@ -264,8 +282,9 @@ EXPORT_SYMBOL(mempool_init_node);
*
* Return: %0 on success, negative error code otherwise.
*/
int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data)
int mempool_init_noprof(struct mempool *pool, int min_nr,
mempool_alloc_t *alloc_fn, mempool_free_t *free_fn,
void *pool_data)
{
return mempool_init_node(pool, min_nr, alloc_fn, free_fn,
pool_data, GFP_KERNEL, NUMA_NO_NODE);
@ -291,11 +310,11 @@ EXPORT_SYMBOL(mempool_init_noprof);
*
* Return: pointer to the created memory pool object or %NULL on error.
*/
mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data,
gfp_t gfp_mask, int node_id)
struct mempool *mempool_create_node_noprof(int min_nr,
mempool_alloc_t *alloc_fn, mempool_free_t *free_fn,
void *pool_data, gfp_t gfp_mask, int node_id)
{
mempool_t *pool;
struct mempool *pool;
pool = kmalloc_node_noprof(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id);
if (!pool)
@ -329,7 +348,7 @@ EXPORT_SYMBOL(mempool_create_node_noprof);
*
* Return: %0 on success, negative error code otherwise.
*/
int mempool_resize(mempool_t *pool, int new_min_nr)
int mempool_resize(struct mempool *pool, int new_min_nr)
{
void *element;
void **new_elements;
@ -391,140 +410,227 @@ int mempool_resize(mempool_t *pool, int new_min_nr)
}
EXPORT_SYMBOL(mempool_resize);
/**
* mempool_alloc - allocate an element from a specific memory pool
* @pool: pointer to the memory pool which was allocated via
* mempool_create().
* @gfp_mask: the usual allocation bitmask.
*
* this function only sleeps if the alloc_fn() function sleeps or
* returns NULL. Note that due to preallocation, this function
* *never* fails when called from process contexts. (it might
* fail if called from an IRQ context.)
* Note: using __GFP_ZERO is not supported.
*
* Return: pointer to the allocated element or %NULL on error.
*/
void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask)
static unsigned int mempool_alloc_from_pool(struct mempool *pool, void **elems,
unsigned int count, unsigned int allocated,
gfp_t gfp_mask)
{
void *element;
unsigned long flags;
wait_queue_entry_t wait;
gfp_t gfp_temp;
unsigned int i;
spin_lock_irqsave(&pool->lock, flags);
if (unlikely(pool->curr_nr < count - allocated))
goto fail;
for (i = 0; i < count; i++) {
if (!elems[i]) {
elems[i] = remove_element(pool);
allocated++;
}
}
spin_unlock_irqrestore(&pool->lock, flags);
/* Paired with rmb in mempool_free(), read comment there. */
smp_wmb();
/*
* Update the allocation stack trace as this is more useful for
* debugging.
*/
for (i = 0; i < count; i++)
kmemleak_update_trace(elems[i]);
return allocated;
fail:
if (gfp_mask & __GFP_DIRECT_RECLAIM) {
DEFINE_WAIT(wait);
prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
spin_unlock_irqrestore(&pool->lock, flags);
/*
* Wait for someone else to return an element to @pool, but wake
* up occasionally as memory pressure might have reduced even
* and the normal allocation in alloc_fn could succeed even if
* no element was returned.
*/
io_schedule_timeout(5 * HZ);
finish_wait(&pool->wait, &wait);
} else {
/* We must not sleep if __GFP_DIRECT_RECLAIM is not set. */
spin_unlock_irqrestore(&pool->lock, flags);
}
return allocated;
}
/*
* Adjust the gfp flags for mempool allocations, as we never want to dip into
* the global emergency reserves or retry in the page allocator.
*
* The first pass also doesn't want to go reclaim, but the next passes do, so
* return a separate subset for that first iteration.
*/
static inline gfp_t mempool_adjust_gfp(gfp_t *gfp_mask)
{
*gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
return *gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO);
}
/**
* mempool_alloc_bulk - allocate multiple elements from a memory pool
* @pool: pointer to the memory pool
* @elems: partially or fully populated elements array
* @count: number of entries in @elem that need to be allocated
* @allocated: number of entries in @elem already allocated
*
* Allocate elements for each slot in @elem that is non-%NULL. This is done by
* first calling into the alloc_fn supplied at pool initialization time, and
* dipping into the reserved pool when alloc_fn fails to allocate an element.
*
* On return all @count elements in @elems will be populated.
*
* Return: Always 0. If it wasn't for %$#^$ alloc tags, it would return void.
*/
int mempool_alloc_bulk_noprof(struct mempool *pool, void **elems,
unsigned int count, unsigned int allocated)
{
gfp_t gfp_mask = GFP_KERNEL;
gfp_t gfp_temp = mempool_adjust_gfp(&gfp_mask);
unsigned int i = 0;
VM_WARN_ON_ONCE(count > pool->min_nr);
might_alloc(gfp_mask);
/*
* If an error is injected, fail all elements in a bulk allocation so
* that we stress the multiple elements missing path.
*/
if (should_fail_ex(&fail_mempool_alloc_bulk, 1, FAULT_NOWARN)) {
pr_info("forcing mempool usage for %pS\n",
(void *)_RET_IP_);
goto use_pool;
}
repeat_alloc:
/*
* Try to allocate the elements using the allocation callback first as
* that might succeed even when the caller's bulk allocation did not.
*/
for (i = 0; i < count; i++) {
if (elems[i])
continue;
elems[i] = pool->alloc(gfp_temp, pool->pool_data);
if (unlikely(!elems[i]))
goto use_pool;
allocated++;
}
return 0;
use_pool:
allocated = mempool_alloc_from_pool(pool, elems, count, allocated,
gfp_temp);
gfp_temp = gfp_mask;
goto repeat_alloc;
}
EXPORT_SYMBOL_GPL(mempool_alloc_bulk_noprof);
/**
* mempool_alloc - allocate an element from a memory pool
* @pool: pointer to the memory pool
* @gfp_mask: GFP_* flags. %__GFP_ZERO is not supported.
*
* Allocate an element from @pool. This is done by first calling into the
* alloc_fn supplied at pool initialization time, and dipping into the reserved
* pool when alloc_fn fails to allocate an element.
*
* This function only sleeps if the alloc_fn callback sleeps, or when waiting
* for elements to become available in the pool.
*
* Return: pointer to the allocated element or %NULL when failing to allocate
* an element. Allocation failure can only happen when @gfp_mask does not
* include %__GFP_DIRECT_RECLAIM.
*/
void *mempool_alloc_noprof(struct mempool *pool, gfp_t gfp_mask)
{
gfp_t gfp_temp = mempool_adjust_gfp(&gfp_mask);
void *element;
VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
might_alloc(gfp_mask);
gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */
gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */
gfp_mask |= __GFP_NOWARN; /* failures are OK */
gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO);
repeat_alloc:
if (should_fail_ex(&fail_mempool_alloc, 1, FAULT_NOWARN)) {
pr_info("forcing mempool usage for %pS\n",
(void *)_RET_IP_);
element = NULL;
} else {
element = pool->alloc(gfp_temp, pool->pool_data);
}
element = pool->alloc(gfp_temp, pool->pool_data);
if (likely(element != NULL))
return element;
spin_lock_irqsave(&pool->lock, flags);
if (likely(pool->curr_nr)) {
element = remove_element(pool);
spin_unlock_irqrestore(&pool->lock, flags);
/* paired with rmb in mempool_free(), read comment there */
smp_wmb();
if (unlikely(!element)) {
/*
* Update the allocation stack trace as this is more useful
* for debugging.
* Try to allocate an element from the pool.
*
* The first pass won't have __GFP_DIRECT_RECLAIM and won't
* sleep in mempool_alloc_from_pool. Retry the allocation
* with all flags set in that case.
*/
kmemleak_update_trace(element);
return element;
if (!mempool_alloc_from_pool(pool, &element, 1, 0, gfp_temp)) {
if (gfp_temp != gfp_mask) {
gfp_temp = gfp_mask;
goto repeat_alloc;
}
if (gfp_mask & __GFP_DIRECT_RECLAIM) {
goto repeat_alloc;
}
}
}
/*
* We use gfp mask w/o direct reclaim or IO for the first round. If
* alloc failed with that and @pool was empty, retry immediately.
*/
if (gfp_temp != gfp_mask) {
spin_unlock_irqrestore(&pool->lock, flags);
gfp_temp = gfp_mask;
goto repeat_alloc;
}
/* We must not sleep if !__GFP_DIRECT_RECLAIM */
if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
spin_unlock_irqrestore(&pool->lock, flags);
return NULL;
}
/* Let's wait for someone else to return an element to @pool */
init_wait(&wait);
prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
spin_unlock_irqrestore(&pool->lock, flags);
/*
* FIXME: this should be io_schedule(). The timeout is there as a
* workaround for some DM problems in 2.6.18.
*/
io_schedule_timeout(5*HZ);
finish_wait(&pool->wait, &wait);
goto repeat_alloc;
return element;
}
EXPORT_SYMBOL(mempool_alloc_noprof);
/**
* mempool_alloc_preallocated - allocate an element from preallocated elements
* belonging to a specific memory pool
* @pool: pointer to the memory pool which was allocated via
* mempool_create().
* belonging to a memory pool
* @pool: pointer to the memory pool
*
* This function is similar to mempool_alloc, but it only attempts allocating
* an element from the preallocated elements. It does not sleep and immediately
* returns if no preallocated elements are available.
* This function is similar to mempool_alloc(), but it only attempts allocating
* an element from the preallocated elements. It only takes a single spinlock_t
* and immediately returns if no preallocated elements are available.
*
* Return: pointer to the allocated element or %NULL if no elements are
* available.
*/
void *mempool_alloc_preallocated(mempool_t *pool)
void *mempool_alloc_preallocated(struct mempool *pool)
{
void *element;
unsigned long flags;
void *element = NULL;
spin_lock_irqsave(&pool->lock, flags);
if (likely(pool->curr_nr)) {
element = remove_element(pool);
spin_unlock_irqrestore(&pool->lock, flags);
/* paired with rmb in mempool_free(), read comment there */
smp_wmb();
/*
* Update the allocation stack trace as this is more useful
* for debugging.
*/
kmemleak_update_trace(element);
return element;
}
spin_unlock_irqrestore(&pool->lock, flags);
return NULL;
mempool_alloc_from_pool(pool, &element, 1, 0, GFP_NOWAIT);
return element;
}
EXPORT_SYMBOL(mempool_alloc_preallocated);
/**
* mempool_free - return an element to the pool.
* @element: pool element pointer.
* @pool: pointer to the memory pool which was allocated via
* mempool_create().
* mempool_free_bulk - return elements to a mempool
* @pool: pointer to the memory pool
* @elems: elements to return
* @count: number of elements to return
*
* this function only sleeps if the free_fn() function sleeps.
* Returns a number of elements from the start of @elem to @pool if @pool needs
* replenishing and sets their slots in @elem to NULL. Other elements are left
* in @elem.
*
* Return: number of elements transferred to @pool. Elements are always
* transferred from the beginning of @elem, so the return value can be used as
 * an offset into @elem for freeing the remaining elements in the caller.
*/
void mempool_free(void *element, mempool_t *pool)
unsigned int mempool_free_bulk(struct mempool *pool, void **elems,
unsigned int count)
{
unsigned long flags;
if (unlikely(element == NULL))
return;
unsigned int freed = 0;
bool added = false;
/*
* Paired with the wmb in mempool_alloc(). The preceding read is
@ -558,21 +664,6 @@ void mempool_free(void *element, mempool_t *pool)
* Waiters happen iff curr_nr is 0 and the above guarantee also
* ensures that there will be frees which return elements to the
* pool waking up the waiters.
*/
if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) {
spin_lock_irqsave(&pool->lock, flags);
if (likely(pool->curr_nr < pool->min_nr)) {
add_element(pool, element);
spin_unlock_irqrestore(&pool->lock, flags);
if (wq_has_sleeper(&pool->wait))
wake_up(&pool->wait);
return;
}
spin_unlock_irqrestore(&pool->lock, flags);
}
/*
* Handle the min_nr = 0 edge case:
*
* For zero-minimum pools, curr_nr < min_nr (0 < 0) never succeeds,
* so waiters sleeping on pool->wait would never be woken by the
@ -580,20 +671,45 @@ void mempool_free(void *element, mempool_t *pool)
* allocation of element when both min_nr and curr_nr are 0, and
* any active waiters are properly awakened.
*/
if (unlikely(pool->min_nr == 0 &&
if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) {
spin_lock_irqsave(&pool->lock, flags);
while (pool->curr_nr < pool->min_nr && freed < count) {
add_element(pool, elems[freed++]);
added = true;
}
spin_unlock_irqrestore(&pool->lock, flags);
} else if (unlikely(pool->min_nr == 0 &&
READ_ONCE(pool->curr_nr) == 0)) {
/* Handle the min_nr = 0 edge case: */
spin_lock_irqsave(&pool->lock, flags);
if (likely(pool->curr_nr == 0)) {
add_element(pool, element);
spin_unlock_irqrestore(&pool->lock, flags);
if (wq_has_sleeper(&pool->wait))
wake_up(&pool->wait);
return;
add_element(pool, elems[freed++]);
added = true;
}
spin_unlock_irqrestore(&pool->lock, flags);
}
pool->free(element, pool->pool_data);
if (unlikely(added) && wq_has_sleeper(&pool->wait))
wake_up(&pool->wait);
return freed;
}
EXPORT_SYMBOL_GPL(mempool_free_bulk);
/**
* mempool_free - return an element to the pool.
* @element: element to return
* @pool: pointer to the memory pool
*
* Returns @element to @pool if it needs replenishing, else frees it using
* the free_fn callback in @pool.
*
* This function only sleeps if the free_fn callback sleeps.
*/
void mempool_free(void *element, struct mempool *pool)
{
if (likely(element) && !mempool_free_bulk(pool, &element, 1))
pool->free(element, pool->pool_data);
}
EXPORT_SYMBOL(mempool_free);
@ -632,19 +748,6 @@ void mempool_kfree(void *element, void *pool_data)
}
EXPORT_SYMBOL(mempool_kfree);
void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data)
{
size_t size = (size_t)pool_data;
return kvmalloc(size, gfp_mask);
}
EXPORT_SYMBOL(mempool_kvmalloc);
void mempool_kvfree(void *element, void *pool_data)
{
kvfree(element);
}
EXPORT_SYMBOL(mempool_kvfree);
/*
* A simple mempool-backed page allocator that allocates pages
* of the order specified by pool_data.

View File

@ -4977,13 +4977,18 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
* @nr_pages: The number of pages desired in the array
* @page_array: Array to store the pages
*
* This is a batched version of the page allocator that attempts to
* allocate nr_pages quickly. Pages are added to the page_array.
* This is a batched version of the page allocator that attempts to allocate
* @nr_pages quickly. Pages are added to @page_array.
*
* Note that only NULL elements are populated with pages and nr_pages
* is the maximum number of pages that will be stored in the array.
* Note that only the elements in @page_array that were cleared to %NULL on
* entry are populated with newly allocated pages. @nr_pages is the maximum
* number of pages that will be stored in the array.
*
* Returns the number of pages in the array.
* Returns the number of pages in @page_array, including ones already
* allocated on entry. This can be less than the number requested in @nr_pages,
* but all empty slots are filled from the beginning. I.e., if all slots in
* @page_array were set to %NULL on entry, the slots from 0 to the return value
* - 1 will be filled.
*/
unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
nodemask_t *nodemask, int nr_pages,

112
mm/slab.h
View File

@ -40,13 +40,29 @@ typedef u64 freelist_full_t;
* Freelist pointer and counter to cmpxchg together, avoids the typical ABA
* problems with cmpxchg of just a pointer.
*/
typedef union {
struct {
void *freelist;
unsigned long counter;
struct freelist_counters {
union {
struct {
void *freelist;
union {
unsigned long counters;
struct {
unsigned inuse:16;
unsigned objects:15;
/*
* If slab debugging is enabled then the
* frozen bit can be reused to indicate
* that the slab was corrupted
*/
unsigned frozen:1;
};
};
};
#ifdef system_has_freelist_aba
freelist_full_t freelist_counters;
#endif
};
freelist_full_t full;
} freelist_aba_t;
};
/* Reuses the bits in struct page */
struct slab {
@ -69,27 +85,7 @@ struct slab {
#endif
};
/* Double-word boundary */
union {
struct {
void *freelist; /* first free object */
union {
unsigned long counters;
struct {
unsigned inuse:16;
unsigned objects:15;
/*
* If slab debugging is enabled then the
* frozen bit can be reused to indicate
* that the slab was corrupted
*/
unsigned frozen:1;
};
};
};
#ifdef system_has_freelist_aba
freelist_aba_t freelist_counter;
#endif
};
struct freelist_counters;
};
struct rcu_head rcu_head;
};
@ -114,22 +110,9 @@ SLAB_MATCH(_unused_slab_obj_exts, obj_exts);
#undef SLAB_MATCH
static_assert(sizeof(struct slab) <= sizeof(struct page));
#if defined(system_has_freelist_aba)
static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t)));
static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(struct freelist_counters)));
#endif
/**
* folio_slab - Converts from folio to slab.
* @folio: The folio.
*
* Currently struct slab is a different representation of a folio where
* folio_test_slab() is true.
*
* Return: The slab which contains this folio.
*/
#define folio_slab(folio) (_Generic((folio), \
const struct folio *: (const struct slab *)(folio), \
struct folio *: (struct slab *)(folio)))
/**
* slab_folio - The folio allocated for a slab
* @s: The slab.
@ -146,20 +129,24 @@ static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t)
struct slab *: (struct folio *)s))
/**
* page_slab - Converts from first struct page to slab.
* @p: The first (either head of compound or single) page of slab.
* page_slab - Converts from struct page to its slab.
* @page: A page which may or may not belong to a slab.
*
* A temporary wrapper to convert struct page to struct slab in situations where
* we know the page is the compound head, or single order-0 page.
*
* Long-term ideally everything would work with struct slab directly or go
* through folio to struct slab.
*
* Return: The slab which contains this page
* Return: The slab which contains this page or NULL if the page does
* not belong to a slab. This includes pages returned from large kmalloc.
*/
#define page_slab(p) (_Generic((p), \
const struct page *: (const struct slab *)(p), \
struct page *: (struct slab *)(p)))
static inline struct slab *page_slab(const struct page *page)
{
	unsigned long head;

	/*
	 * A tail page stores a tagged pointer to its head page in
	 * compound_head (low bit set); resolve to the head first.
	 */
	head = READ_ONCE(page->compound_head);
	if (head & 1)
		page = (struct page *)(head - 1);
	/*
	 * The page type lives in the top byte of page_type. Anything that
	 * is not PGTY_slab — per the kdoc this includes large kmalloc
	 * pages — yields NULL. data_race(): the read is inherently racy
	 * against type changes and that is accepted here.
	 */
	if (data_race(page->page_type >> 24) != PGTY_slab)
		page = NULL;
	return (struct slab *)page;
}
/**
* slab_page - The first struct page allocated for a slab
@ -188,12 +175,7 @@ static inline pg_data_t *slab_pgdat(const struct slab *slab)
/* Map a kernel virtual address to its containing slab, or NULL if the
 * backing page is not a slab page. */
static inline struct slab *virt_to_slab(const void *addr)
{
	struct folio *folio = virt_to_folio(addr);

	/*
	 * NOTE(review): the body below reads like an unresolved old/new
	 * diff overlay — the folio-based path returns first and the final
	 * page_slab() return is unreachable. Confirm against the upstream
	 * tree which single variant is intended (the new one should be
	 * just `return page_slab(virt_to_page(addr));`).
	 */
	if (!folio_test_slab(folio))
		return NULL;
	return folio_slab(folio);
	return page_slab(virt_to_page(addr));
}
static inline int slab_order(const struct slab *slab)
@ -236,10 +218,8 @@ struct kmem_cache_order_objects {
* Slab cache management.
*/
struct kmem_cache {
#ifndef CONFIG_SLUB_TINY
struct kmem_cache_cpu __percpu *cpu_slab;
struct lock_class_key lock_key;
#endif
struct slub_percpu_sheaves __percpu *cpu_sheaves;
/* Used for retrieving partial slabs, etc. */
slab_flags_t flags;
@ -601,6 +581,16 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
return s->size;
}
/*
 * Page order of a large (page-allocator backed) kmalloc allocation.
 * The order is recorded in the low byte of the second page's flags word
 * — presumably set at allocation time; TODO confirm against the setter
 * in mm/slub.c (diff suppressed in this view).
 */
static inline unsigned int large_kmalloc_order(const struct page *page)
{
	return page[1].flags.f & 0xff;
}

/* Total byte size of a large kmalloc allocation, derived from its order. */
static inline size_t large_kmalloc_size(const struct page *page)
{
	return PAGE_SIZE << large_kmalloc_order(page);
}
#ifdef CONFIG_SLUB_DEBUG
void dump_unreclaimable_slab(void);
#else

View File

@ -997,26 +997,27 @@ void __init create_kmalloc_caches(void)
*/
size_t __ksize(const void *object)
{
struct folio *folio;
const struct page *page;
const struct slab *slab;
if (unlikely(object == ZERO_SIZE_PTR))
return 0;
folio = virt_to_folio(object);
page = virt_to_page(object);
if (unlikely(!folio_test_slab(folio))) {
if (WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE))
return 0;
if (WARN_ON(object != folio_address(folio)))
return 0;
return folio_size(folio);
}
if (unlikely(PageLargeKmalloc(page)))
return large_kmalloc_size(page);
slab = page_slab(page);
/* Delete this after we're sure there are no users */
if (WARN_ON(!slab))
return page_size(page);
#ifdef CONFIG_SLUB_DEBUG
skip_orig_size_check(folio_slab(folio)->slab_cache, object);
skip_orig_size_check(slab->slab_cache, object);
#endif
return slab_ksize(folio_slab(folio)->slab_cache);
return slab_ksize(slab->slab_cache);
}
gfp_t kmalloc_fix_flags(gfp_t flags)
@ -1614,17 +1615,15 @@ static void kfree_rcu_work(struct work_struct *work)
static bool kfree_rcu_sheaf(void *obj)
{
struct kmem_cache *s;
struct folio *folio;
struct slab *slab;
if (is_vmalloc_addr(obj))
return false;
folio = virt_to_folio(obj);
if (unlikely(!folio_test_slab(folio)))
slab = virt_to_slab(obj);
if (unlikely(!slab))
return false;
slab = folio_slab(folio);
s = slab->slab_cache;
if (s->cpu_sheaves) {
if (likely(!IS_ENABLED(CONFIG_NUMA) ||

694
mm/slub.c

File diff suppressed because it is too large Load Diff

View File

@ -164,7 +164,8 @@ static inline void check_heap_object(const void *ptr, unsigned long n,
{
unsigned long addr = (unsigned long)ptr;
unsigned long offset;
struct folio *folio;
struct page *page;
struct slab *slab;
if (is_kmap_addr(ptr)) {
offset = offset_in_page(ptr);
@ -189,16 +190,23 @@ static inline void check_heap_object(const void *ptr, unsigned long n,
if (!virt_addr_valid(ptr))
return;
folio = virt_to_folio(ptr);
if (folio_test_slab(folio)) {
page = virt_to_page(ptr);
slab = page_slab(page);
if (slab) {
/* Check slab allocator for flags and size. */
__check_heap_object(ptr, n, folio_slab(folio), to_user);
} else if (folio_test_large(folio)) {
offset = ptr - folio_address(folio);
if (n > folio_size(folio) - offset)
__check_heap_object(ptr, n, slab, to_user);
} else if (PageCompound(page)) {
page = compound_head(page);
offset = ptr - page_address(page);
if (n > page_size(page) - offset)
usercopy_abort("page alloc", NULL, to_user, offset, n);
}
/*
* We cannot check non-compound pages. They might be part of
* a large allocation, in which case crossing a page boundary
* is fine.
*/
}
DEFINE_STATIC_KEY_MAYBE_RO(CONFIG_HARDENED_USERCOPY_DEFAULT_ON,