diff options
| author | Vlastimil Babka <vbabka@suse.cz> | 2026-02-08 19:17:42 +0100 |
|---|---|---|
| committer | Vlastimil Babka <vbabka@suse.cz> | 2026-02-10 09:10:00 +0100 |
| commit | 815c8e35511d0b9a214e9f644983fe477af9d5cb (patch) | |
| tree | cbbb5c37b065a9595a207eca145ceb257ea5fdc6 | |
| parent | 98e99fc4ad4b30dd28c09ba19686ec583af345b4 (diff) | |
| parent | 40fd0acc45d06709b3b1eea77e50e13f4145dff0 (diff) | |
Merge branch 'slab/for-7.0/sheaves' into slab/for-next
Merge series "slab: replace cpu (partial) slabs with sheaves".
The percpu sheaves caching layer was introduced as opt-in but the goal
was to eventually move all caches to them. This is the next step,
enabling sheaves for all caches (except the two bootstrap ones) and then
removing the per cpu (partial) slabs and lots of associated code.
Besides the lower locking overhead and much more likely fastpath when
freeing, this removes the rather complicated code related to the cpu
slab lockless fastpaths (using this_cpu_try_cmpxchg128/64) and all its
complications for PREEMPT_RT or kmalloc_nolock().
The lockless slab freelist+counters update operation using
try_cmpxchg128/64 remains and is crucial for freeing remote NUMA objects
and to allow flushing objects from sheaves to slabs mostly without the
node list_lock.
Link: https://lore.kernel.org/all/20260123-sheaves-for-all-v4-0-041323d506f7@suse.cz/
| -rw-r--r-- | include/linux/slab.h | 6 | ||||
| -rw-r--r-- | mm/Kconfig | 11 | ||||
| -rw-r--r-- | mm/internal.h | 1 | ||||
| -rw-r--r-- | mm/page_alloc.c | 5 | ||||
| -rw-r--r-- | mm/slab.h | 57 | ||||
| -rw-r--r-- | mm/slab_common.c | 9 | ||||
| -rw-r--r-- | mm/slub.c | 2518 |
7 files changed, 909 insertions, 1698 deletions
diff --git a/include/linux/slab.h b/include/linux/slab.h index 34db237319c1..a0081642606b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -57,9 +57,7 @@ enum _slab_flag_bits { #endif _SLAB_OBJECT_POISON, _SLAB_CMPXCHG_DOUBLE, -#ifdef CONFIG_SLAB_OBJ_EXT _SLAB_NO_OBJ_EXT, -#endif #if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT) _SLAB_OBJ_EXT_IN_OBJ, #endif @@ -241,11 +239,7 @@ enum _slab_flag_bits { #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ /* Slab created using create_boot_cache */ -#ifdef CONFIG_SLAB_OBJ_EXT #define SLAB_NO_OBJ_EXT __SLAB_FLAG_BIT(_SLAB_NO_OBJ_EXT) -#else -#define SLAB_NO_OBJ_EXT __SLAB_FLAG_UNUSED -#endif #if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT) #define SLAB_OBJ_EXT_IN_OBJ __SLAB_FLAG_BIT(_SLAB_OBJ_EXT_IN_OBJ) diff --git a/mm/Kconfig b/mm/Kconfig index bd0ea5454af8..08593674cd20 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -247,17 +247,6 @@ config SLUB_STATS out which slabs are relevant to a particular load. Try running: slabinfo -DA -config SLUB_CPU_PARTIAL - default y - depends on SMP && !SLUB_TINY - bool "Enable per cpu partial caches" - help - Per cpu partial caches accelerate objects allocation and freeing - that is local to a processor at the price of more indeterminism - in the latency of the free. On overflow these caches will be cleared - which requires the taking of locks that may cause latency spikes. - Typically one would choose no for a realtime system. - config RANDOM_KMALLOC_CACHES default n depends on !SLUB_TINY diff --git a/mm/internal.h b/mm/internal.h index e430da900430..1f44ccb4badf 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -846,6 +846,7 @@ static inline struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned int ord struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order); #define alloc_frozen_pages_nolock(...) \ alloc_hooks(alloc_frozen_pages_nolock_noprof(__VA_ARGS__)) +void free_frozen_pages_nolock(struct page *page, unsigned int order); extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c380f063e8b7..0127e9d661ad 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2981,6 +2981,11 @@ void free_frozen_pages(struct page *page, unsigned int order) __free_frozen_pages(page, order, FPI_NONE); } +void free_frozen_pages_nolock(struct page *page, unsigned int order) +{ + __free_frozen_pages(page, order, FPI_TRYLOCK); +} + /* * Free a batch of folios */ diff --git a/mm/slab.h b/mm/slab.h index 3f49666e943c..71c7261bf822 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -21,14 +21,12 @@ # define system_has_freelist_aba() system_has_cmpxchg128() # define try_cmpxchg_freelist try_cmpxchg128 # endif -#define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg128 typedef u128 freelist_full_t; #else /* CONFIG_64BIT */ # ifdef system_has_cmpxchg64 # define system_has_freelist_aba() system_has_cmpxchg64() # define try_cmpxchg_freelist try_cmpxchg64 # endif -#define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg64 typedef u64 freelist_full_t; #endif /* CONFIG_64BIT */ @@ -79,19 +77,7 @@ struct slab { struct kmem_cache *slab_cache; union { struct { - union { - struct list_head slab_list; - struct { /* For deferred deactivate_slab() */ - struct llist_node llnode; - void *flush_freelist; - }; -#ifdef CONFIG_SLUB_CPU_PARTIAL - struct { - struct slab *next; - int slabs; /* Nr of slabs left */ - }; -#endif - }; + struct list_head slab_list; /* Double-word boundary */ struct freelist_counters; }; @@ -196,23 +182,6 @@ static inline size_t slab_size(const struct slab *slab) return PAGE_SIZE << slab_order(slab); } -#ifdef CONFIG_SLUB_CPU_PARTIAL -#define slub_percpu_partial(c) ((c)->partial) - -#define slub_set_percpu_partial(c, p) \ -({ \ - slub_percpu_partial(c) = (p)->next; \ -}) - -#define slub_percpu_partial_read_once(c) READ_ONCE(slub_percpu_partial(c)) -#else -#define slub_percpu_partial(c) NULL - -#define slub_set_percpu_partial(c, p) - -#define slub_percpu_partial_read_once(c) NULL -#endif // CONFIG_SLUB_CPU_PARTIAL - /* * Word size structure that can be atomically updated or read and that * contains both the order and the number of objects that a slab of the @@ -226,8 +195,6 @@ struct kmem_cache_order_objects { * Slab cache management. */ struct kmem_cache { - struct kmem_cache_cpu __percpu *cpu_slab; - struct lock_class_key lock_key; struct slub_percpu_sheaves __percpu *cpu_sheaves; /* Used for retrieving partial slabs, etc. */ slab_flags_t flags; @@ -236,12 +203,6 @@ struct kmem_cache { unsigned int object_size; /* Object size without metadata */ struct reciprocal_value reciprocal_size; unsigned int offset; /* Free pointer offset */ -#ifdef CONFIG_SLUB_CPU_PARTIAL - /* Number of per cpu partial objects to keep around */ - unsigned int cpu_partial; - /* Number of per cpu partial slabs to keep around */ - unsigned int cpu_partial_slabs; -#endif unsigned int sheaf_capacity; struct kmem_cache_order_objects oo; @@ -282,9 +243,25 @@ struct kmem_cache { unsigned int usersize; /* Usercopy region size */ #endif +#ifdef CONFIG_SLUB_STATS + struct kmem_cache_stats __percpu *cpu_stats; +#endif + struct kmem_cache_node *node[MAX_NUMNODES]; }; +/* + * Every cache has !NULL s->cpu_sheaves but they may point to the + * bootstrap_sheaf temporarily during init, or permanently for the boot caches + * and caches with debugging enabled, or all caches with CONFIG_SLUB_TINY. This + * helper distinguishes whether cache has real non-bootstrap sheaves. + */ +static inline bool cache_has_sheaves(struct kmem_cache *s) +{ + /* Test CONFIG_SLUB_TINY for code elimination purposes */ + return !IS_ENABLED(CONFIG_SLUB_TINY) && s->sheaf_capacity; +} + #if defined(CONFIG_SYSFS) && !defined(CONFIG_SLUB_TINY) #define SLAB_SUPPORTS_SYSFS 1 void sysfs_slab_unlink(struct kmem_cache *s); diff --git a/mm/slab_common.c b/mm/slab_common.c index 094afa2792d0..d5a70a831a2a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1604,11 +1604,8 @@ static bool kfree_rcu_sheaf(void *obj) return false; s = slab->slab_cache; - if (s->cpu_sheaves) { - if (likely(!IS_ENABLED(CONFIG_NUMA) || - slab_nid(slab) == numa_mem_id())) - return __kfree_rcu_sheaf(s, obj); - } + if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id())) + return __kfree_rcu_sheaf(s, obj); return false; } @@ -2112,7 +2109,7 @@ EXPORT_SYMBOL_GPL(kvfree_rcu_barrier); */ void kvfree_rcu_barrier_on_cache(struct kmem_cache *s) { - if (s->cpu_sheaves) { + if (cache_has_sheaves(s)) { flush_rcu_sheaves_on_cache(s); rcu_barrier(); } diff --git a/mm/slub.c b/mm/slub.c index 6fac2b123b42..11a99bd06ac7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1,13 +1,15 @@ // SPDX-License-Identifier: GPL-2.0 /* - * SLUB: A slab allocator that limits cache line use instead of queuing - * objects in per cpu and per node lists. + * SLUB: A slab allocator with low overhead percpu array caches and mostly + * lockless freeing of objects to slabs in the slowpath. * - * The allocator synchronizes using per slab locks or atomic operations - * and only uses a centralized lock to manage a pool of partial slabs. + * The allocator synchronizes using spin_trylock for percpu arrays in the + * fastpath, and cmpxchg_double (or bit spinlock) for slowpath freeing. + * Uses a centralized lock to manage a pool of partial slabs. * * (C) 2007 SGI, Christoph Lameter * (C) 2011 Linux Foundation, Christoph Lameter + * (C) 2025 SUSE, Vlastimil Babka */ #include <linux/mm.h> @@ -53,11 +55,13 @@ /* * Lock order: - * 1. slab_mutex (Global Mutex) - * 2. node->list_lock (Spinlock) - * 3. kmem_cache->cpu_slab->lock (Local lock) - * 4. slab_lock(slab) (Only on some arches) - * 5. object_map_lock (Only for debugging) + * 0. cpu_hotplug_lock + * 1. slab_mutex (Global Mutex) + * 2a. kmem_cache->cpu_sheaves->lock (Local trylock) + * 2b. node->barn->lock (Spinlock) + * 2c. node->list_lock (Spinlock) + * 3. slab_lock(slab) (Only on some arches) + * 4. object_map_lock (Only for debugging) * * slab_mutex * @@ -78,31 +82,38 @@ * C. slab->objects -> Number of objects in slab * D. slab->frozen -> frozen state * - * Frozen slabs + * SL_partial slabs + * + * Slabs on node partial list have at least one free object. A limited number + * of slabs on the list can be fully free (slab->inuse == 0), until we start + * discarding them. These slabs are marked with SL_partial, and the flag is + * cleared while removing them, usually to grab their freelist afterwards. + * This clearing also exempts them from list management. Please see + * __slab_free() for more details. * - * If a slab is frozen then it is exempt from list management. It is - * the cpu slab which is actively allocated from by the processor that - * froze it and it is not on any list. The processor that froze the - * slab is the one who can perform list operations on the slab. Other - * processors may put objects onto the freelist but the processor that - * froze the slab is the only one that can retrieve the objects from the - * slab's freelist. + * Full slabs * - * CPU partial slabs + * For caches without debugging enabled, full slabs (slab->inuse == + * slab->objects and slab->freelist == NULL) are not placed on any list. + * The __slab_free() freeing the first object from such a slab will place + * it on the partial list. Caches with debugging enabled place such slab + * on the full list and use different allocation and freeing paths. + * + * Frozen slabs * - * The partially empty slabs cached on the CPU partial list are used - * for performance reasons, which speeds up the allocation process. - * These slabs are not frozen, but are also exempt from list management, - * by clearing the SL_partial flag when moving out of the node - * partial list. Please see __slab_free() for more details. + * If a slab is frozen then it is exempt from list management. It is used to + * indicate a slab that has failed consistency checks and thus cannot be + * allocated from anymore - it is also marked as full. Any previously + * allocated objects will be simply leaked upon freeing instead of attempting + * to modify the potentially corrupted freelist and metadata. * * To sum up, the current scheme is: - * - node partial slab: SL_partial && !frozen - * - cpu partial slab: !SL_partial && !frozen - * - cpu slab: !SL_partial && frozen - * - full slab: !SL_partial && !frozen + * - node partial slab: SL_partial && !full && !frozen + * - taken off partial list: !SL_partial && !full && !frozen + * - full slab, not on any list: !SL_partial && full && !frozen + * - frozen due to inconsistency: !SL_partial && full && frozen * - * list_lock + * node->list_lock (spinlock) * * The list_lock protects the partial and full list on each node and * the partial slab counter. If taken then no new slabs may be added or @@ -112,47 +123,46 @@ * * The list_lock is a centralized lock and thus we avoid taking it as * much as possible. As long as SLUB does not have to handle partial - * slabs, operations can continue without any centralized lock. F.e. - * allocating a long series of objects that fill up slabs does not require - * the list lock. + * slabs, operations can continue without any centralized lock. * * For debug caches, all allocations are forced to go through a list_lock * protected region to serialize against concurrent validation. * - * cpu_slab->lock local lock + * cpu_sheaves->lock (local_trylock) * - * This locks protect slowpath manipulation of all kmem_cache_cpu fields - * except the stat counters. This is a percpu structure manipulated only by - * the local cpu, so the lock protects against being preempted or interrupted - * by an irq. Fast path operations rely on lockless operations instead. + * This lock protects fastpath operations on the percpu sheaves. On !RT it + * only disables preemption and does no atomic operations. As long as the main + * or spare sheaf can handle the allocation or free, there is no other + * overhead. * - * On PREEMPT_RT, the local lock neither disables interrupts nor preemption - * which means the lockless fastpath cannot be used as it might interfere with - * an in-progress slow path operations. In this case the local lock is always - * taken but it still utilizes the freelist for the common operations. + * node->barn->lock (spinlock) * - * lockless fastpaths + * This lock protects the operations on per-NUMA-node barn. It can quickly + * serve an empty or full sheaf if available, and avoid more expensive refill + * or flush operation. * - * The fast path allocation (slab_alloc_node()) and freeing (do_slab_free()) - * are fully lockless when satisfied from the percpu slab (and when - * cmpxchg_double is possible to use, otherwise slab_lock is taken). - * They also don't disable preemption or migration or irqs. They rely on - * the transaction id (tid) field to detect being preempted or moved to - * another cpu. + * Lockless freeing + * + * Objects may have to be freed to their slabs when they are from a remote + * node (where we want to avoid filling local sheaves with remote objects) + * or when there are too many full sheaves. On architectures supporting + * cmpxchg_double this is done by a lockless update of slab's freelist and + * counters, otherwise slab_lock is taken. This only needs to take the + * list_lock if it's a first free to a full slab, or when a slab becomes empty + * after the free. * * irq, preemption, migration considerations * - * Interrupts are disabled as part of list_lock or local_lock operations, or + * Interrupts are disabled as part of list_lock or barn lock operations, or * around the slab_lock operation, in order to make the slab allocator safe * to use in the context of an irq. + * Preemption is disabled as part of local_trylock operations. + * kmalloc_nolock() and kfree_nolock() are safe in NMI context but see + * their limitations. * - * In addition, preemption (or migration on PREEMPT_RT) is disabled in the - * allocation slowpath, bulk allocation, and put_cpu_partial(), so that the - * local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer - * doesn't have to be revalidated in each section protected by the local lock. - * - * SLUB assigns one slab for allocation to each processor. - * Allocations only occur from these slabs called cpu slabs. + * SLUB assigns two object arrays called sheaves for caching allocations and + * frees on each cpu, with a NUMA node shared barn for balancing between cpus. + * Allocations and frees are primarily served from these sheaves. * * Slabs with free elements are kept on a partial list and during regular * operations no list for full slabs is used. If an object in a full slab is @@ -160,25 +170,8 @@ * We track full slabs for debugging purposes though because otherwise we * cannot scan all objects. * - * Slabs are freed when they become empty. Teardown and setup is - * minimal so we rely on the page allocators per cpu caches for - * fast frees and allocs. - * - * slab->frozen The slab is frozen and exempt from list processing. - * This means that the slab is dedicated to a purpose - * such as satisfying allocations for a specific - * processor. Objects may be freed in the slab while - * it is frozen but slab_free will then skip the usual - * list operations. It is up to the processor holding - * the slab to integrate the slab into the slab lists - * when the slab is no longer needed. - * - * One use of this flag is to mark slabs that are - * used for allocations. Then such a slab becomes a cpu - * slab. The cpu slab may be equipped with an additional - * freelist that allows lockless access to - * free objects in addition to the regular freelist - * that requires the slab lock. + * Slabs are freed when they become empty. Teardown and setup is minimal so we + * rely on the page allocators per cpu caches for fast frees and allocs. * * SLAB_DEBUG_FLAGS Slab requires special handling due to debug * options set. This moves slab handling out of @@ -201,28 +194,6 @@ enum slab_flags { SL_pfmemalloc = PG_active, /* Historical reasons for this bit */ }; -/* - * We could simply use migrate_disable()/enable() but as long as it's a - * function call even on !PREEMPT_RT, use inline preempt_disable() there. - */ -#ifndef CONFIG_PREEMPT_RT -#define slub_get_cpu_ptr(var) get_cpu_ptr(var) -#define slub_put_cpu_ptr(var) put_cpu_ptr(var) -#define USE_LOCKLESS_FAST_PATH() (true) -#else -#define slub_get_cpu_ptr(var) \ -({ \ - migrate_disable(); \ - this_cpu_ptr(var); \ -}) -#define slub_put_cpu_ptr(var) \ -do { \ - (void)(var); \ - migrate_enable(); \ -} while (0) -#define USE_LOCKLESS_FAST_PATH() (false) -#endif - #ifndef CONFIG_SLUB_TINY #define __fastpath_inline __always_inline #else @@ -241,11 +212,18 @@ DEFINE_STATIC_KEY_FALSE(slub_debug_enabled); static DEFINE_STATIC_KEY_FALSE(strict_numa); #endif -/* Structure holding parameters for get_partial() call chain */ +/* Structure holding parameters for get_from_partial() call chain */ struct partial_context { gfp_t flags; unsigned int orig_size; - void *object; +}; + +/* Structure holding parameters for get_partial_node_bulk() */ +struct partial_bulk_context { + gfp_t flags; + unsigned int min_objects; + unsigned int max_objects; + struct list_head slabs; }; static inline bool kmem_cache_debug(struct kmem_cache *s) @@ -261,15 +239,6 @@ void *fixup_red_left(struct kmem_cache *s, void *p) return p; } -static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) -{ -#ifdef CONFIG_SLUB_CPU_PARTIAL - return !kmem_cache_debug(s); -#else - return false; -#endif -} - /* * Issues still to be resolved: * @@ -360,37 +329,25 @@ static void debugfs_slab_add(struct kmem_cache *); static inline void debugfs_slab_add(struct kmem_cache *s) { } #endif +enum add_mode { + ADD_TO_HEAD, + ADD_TO_TAIL, +}; + enum stat_item { - ALLOC_PCS, /* Allocation from percpu sheaf */ - ALLOC_FASTPATH, /* Allocation from cpu slab */ - ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ - FREE_PCS, /* Free to percpu sheaf */ + ALLOC_FASTPATH, /* Allocation from percpu sheaves */ + ALLOC_SLOWPATH, /* Allocation from partial or new slab */ FREE_RCU_SHEAF, /* Free to rcu_free sheaf */ FREE_RCU_SHEAF_FAIL, /* Failed to free to a rcu_free sheaf */ - FREE_FASTPATH, /* Free to cpu slab */ - FREE_SLOWPATH, /* Freeing not to cpu slab */ - FREE_FROZEN, /* Freeing to frozen slab */ + FREE_FASTPATH, /* Free to percpu sheaves */ + FREE_SLOWPATH, /* Free to a slab */ FREE_ADD_PARTIAL, /* Freeing moves slab to partial list */ FREE_REMOVE_PARTIAL, /* Freeing removes last object */ - ALLOC_FROM_PARTIAL, /* Cpu slab acquired from node partial list */ - ALLOC_SLAB, /* Cpu slab acquired from page allocator */ - ALLOC_REFILL, /* Refill cpu slab from slab freelist */ - ALLOC_NODE_MISMATCH, /* Switching cpu slab */ + ALLOC_SLAB, /* New slab acquired from page allocator */ + ALLOC_NODE_MISMATCH, /* Requested node different from cpu sheaf */ FREE_SLAB, /* Slab freed to the page allocator */ - CPUSLAB_FLUSH, /* Abandoning of the cpu slab */ - DEACTIVATE_FULL, /* Cpu slab was full when deactivated */ - DEACTIVATE_EMPTY, /* Cpu slab was empty when deactivated */ - DEACTIVATE_TO_HEAD, /* Cpu slab was moved to the head of partials */ - DEACTIVATE_TO_TAIL, /* Cpu slab was moved to the tail of partials */ - DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */ - DEACTIVATE_BYPASS, /* Implicit deactivation */ ORDER_FALLBACK, /* Number of times fallback was necessary */ - CMPXCHG_DOUBLE_CPU_FAIL,/* Failures of this_cpu_cmpxchg_double */ CMPXCHG_DOUBLE_FAIL, /* Failures of slab freelist update */ - CPU_PARTIAL_ALLOC, /* Used cpu partial on alloc */ - CPU_PARTIAL_FREE, /* Refill cpu partial on free */ - CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */ - CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ SHEAF_FLUSH, /* Objects flushed from a sheaf */ SHEAF_REFILL, /* Objects refilled to a sheaf */ SHEAF_ALLOC, /* Allocation of an empty sheaf */ @@ -407,31 +364,11 @@ enum stat_item { NR_SLUB_STAT_ITEMS }; -struct freelist_tid { - union { - struct { - void *freelist; /* Pointer to next available object */ - unsigned long tid; /* Globally unique transaction id */ - }; - freelist_full_t freelist_tid; - }; -}; - -/* - * When changing the layout, make sure freelist and tid are still compatible - * with this_cpu_cmpxchg_double() alignment requirements. - */ -struct kmem_cache_cpu { - struct freelist_tid; - struct slab *slab; /* The slab from which we are allocating */ -#ifdef CONFIG_SLUB_CPU_PARTIAL - struct slab *partial; /* Partially allocated slabs */ -#endif - local_trylock_t lock; /* Protects the fields above */ #ifdef CONFIG_SLUB_STATS +struct kmem_cache_stats { unsigned int stat[NR_SLUB_STAT_ITEMS]; -#endif }; +#endif static inline void stat(const struct kmem_cache *s, enum stat_item si) { @@ -440,7 +377,7 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) * The rmw is racy on a preemptible kernel but this is acceptable, so * avoid this_cpu_add()'s irq-disable overhead. */ - raw_cpu_inc(s->cpu_slab->stat[si]); + raw_cpu_inc(s->cpu_stats->stat[si]); #endif } @@ -448,7 +385,7 @@ static inline void stat_add(const struct kmem_cache *s, enum stat_item si, int v) { #ifdef CONFIG_SLUB_STATS - raw_cpu_add(s->cpu_slab->stat[si], v); + raw_cpu_add(s->cpu_stats->stat[si], v); #endif } @@ -537,7 +474,7 @@ static inline struct node_barn *get_barn(struct kmem_cache *s) static nodemask_t slab_nodes; /* - * Workqueue used for flush_cpu_slab(). + * Workqueue used for flushing cpu and kfree_rcu sheaves. */ static struct workqueue_struct *flushwq; @@ -596,36 +533,6 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object) return freelist_ptr_decode(s, p, ptr_addr); } -static void prefetch_freepointer(const struct kmem_cache *s, void *object) -{ - prefetchw(object + s->offset); -} - -/* - * When running under KMSAN, get_freepointer_safe() may return an uninitialized - * pointer value in the case the current thread loses the race for the next - * memory chunk in the freelist. In that case this_cpu_cmpxchg_double() in - * slab_alloc_node() will fail, so the uninitialized value won't be used, but - * KMSAN will still check all arguments of cmpxchg because of imperfect - * handling of inline assembly. - * To work around this problem, we apply __no_kmsan_checks to ensure that - * get_freepointer_safe() returns initialized memory. - */ -__no_kmsan_checks -static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) -{ - unsigned long freepointer_addr; - freeptr_t p; - - if (!debug_pagealloc_enabled_static()) - return get_freepointer(s, object); - - object = kasan_reset_tag(object); - freepointer_addr = (unsigned long)object + s->offset; - copy_from_kernel_nofault(&p, (freeptr_t *)freepointer_addr, sizeof(p)); - return freelist_ptr_decode(s, p, freepointer_addr); -} - static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) { unsigned long freeptr_addr = (unsigned long)object + s->offset; @@ -689,41 +596,6 @@ static inline unsigned int oo_objects(struct kmem_cache_order_objects x) return x.x & OO_MASK; } -#ifdef CONFIG_SLUB_CPU_PARTIAL -static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) -{ - unsigned int nr_slabs; - - s->cpu_partial = nr_objects; - - /* - * We take the number of objects but actually limit the number of - * slabs on the per cpu partial list, in order to limit excessive - * growth of the list. For simplicity we assume that the slabs will - * be half-full. - */ - nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo)); - s->cpu_partial_slabs = nr_slabs; -} - -static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) -{ - return s->cpu_partial_slabs; -} -#else -#ifdef SLAB_SUPPORTS_SYSFS -static inline void -slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) -{ -} -#endif - -static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) -{ - return 0; -} -#endif /* CONFIG_SLUB_CPU_PARTIAL */ - /* * If network-based swap is enabled, slub must keep track of whether memory * were allocated from pfmemalloc reserves. @@ -779,7 +651,8 @@ __update_freelist_slow(struct slab *slab, struct freelist_counters *old, if (slab->freelist == old->freelist && slab->counters == old->counters) { slab->freelist = new->freelist; - slab->counters = new->counters; + /* prevent tearing for the read in get_partial_node_bulk() */ + WRITE_ONCE(slab->counters, new->counters); ret = true; } slab_unlock(slab); @@ -799,7 +672,7 @@ static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *sla { bool ret; - if (USE_LOCKLESS_FAST_PATH()) + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) lockdep_assert_irqs_disabled(); if (s->flags & __CMPXCHG_DOUBLE) @@ -1178,7 +1051,7 @@ static void set_track_update(struct kmem_cache *s, void *object, p->handle = handle; #endif p->addr = addr; - p->cpu = smp_processor_id(); + p->cpu = raw_smp_processor_id(); p->pid = current->pid; p->when = jiffies; } @@ -1342,20 +1215,6 @@ static void object_err(struct kmem_cache *s, struct slab *slab, WARN_ON(1); } -static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, - void **freelist, void *nextfree) -{ - if ((s->flags & SLAB_CONSISTENCY_CHECKS) && - !check_valid_pointer(s, slab, nextfree) && freelist) { - object_err(s, slab, *freelist, "Freechain corrupt"); - *freelist = NULL; - slab_fix(s, "Isolate corrupted freechain"); - return true; - } - - return false; -} - static void __slab_err(struct slab *slab) { if (slab_in_kunit_test()) @@ -2167,11 +2026,6 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) {} static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} -static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, - void **freelist, void *nextfree) -{ - return false; -} #endif /* CONFIG_SLUB_DEBUG */ /* @@ -2872,7 +2726,8 @@ static void *setup_object(struct kmem_cache *s, void *object) return object; } -static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) +static struct slab_sheaf *__alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp, + unsigned int capacity) { struct slab_sheaf *sheaf; size_t sheaf_size; @@ -2890,7 +2745,7 @@ static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) if (s->flags & SLAB_KMALLOC) gfp |= __GFP_NO_OBJ_EXT; - sheaf_size = struct_size(sheaf, objects, s->sheaf_capacity); + sheaf_size = struct_size(sheaf, objects, capacity); sheaf = kzalloc(sheaf_size, gfp); if (unlikely(!sheaf)) @@ -2903,6 +2758,12 @@ static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) return sheaf; } +static inline struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, + gfp_t gfp) +{ + return __alloc_empty_sheaf(s, gfp, s->sheaf_capacity); +} + static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf) { kfree(sheaf); @@ -2910,9 +2771,9 @@ static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf) stat(s, SHEAF_FREE); } -static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, - size_t size, void **p); - +static unsigned int +refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, + unsigned int max); static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf, gfp_t gfp) @@ -2923,8 +2784,8 @@ static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf, if (!to_fill) return 0; - filled = __kmem_cache_alloc_bulk(s, gfp, to_fill, - &sheaf->objects[sheaf->size]); + filled = refill_objects(s, &sheaf->objects[sheaf->size], gfp, to_fill, + to_fill); sheaf->size += filled; @@ -3125,12 +2986,23 @@ static void pcs_destroy(struct kmem_cache *s) { int cpu; + /* + * We may be unwinding cache creation that failed before or during the + * allocation of this. + */ + if (!s->cpu_sheaves) + return; + + /* pcs->main can only point to the bootstrap sheaf, nothing to free */ + if (!cache_has_sheaves(s)) + goto free_pcs; + for_each_possible_cpu(cpu) { struct slub_percpu_sheaves *pcs; pcs = per_cpu_ptr(s->cpu_sheaves, cpu); - /* can happen when unwinding failed create */ + /* This can happen when unwinding failed cache creation. */ if (!pcs->main) continue; @@ -3152,11 +3024,13 @@ static void pcs_destroy(struct kmem_cache *s) } } +free_pcs: free_percpu(s->cpu_sheaves); s->cpu_sheaves = NULL; } -static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn) +static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn, + bool allow_spin) { struct slab_sheaf *empty = NULL; unsigned long flags; @@ -3164,7 +3038,10 @@ static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn) if (!data_race(barn->nr_empty)) return NULL; - spin_lock_irqsave(&barn->lock, flags); + if (likely(allow_spin)) + spin_lock_irqsave(&barn->lock, flags); + else if (!spin_trylock_irqsave(&barn->lock, flags)) + return NULL; if (likely(barn->nr_empty)) { empty = list_first_entry(&barn->sheaves_empty, @@ -3241,7 +3118,8 @@ static struct slab_sheaf *barn_get_full_or_empty_sheaf(struct node_barn *barn) * change. */ static struct slab_sheaf * -barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty) +barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty, + bool allow_spin) { struct slab_sheaf *full = NULL; unsigned long flags; @@ -3249,7 +3127,10 @@ barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty) if (!data_race(barn->nr_full)) return NULL; - spin_lock_irqsave(&barn->lock, flags); + if (likely(allow_spin)) + spin_lock_irqsave(&barn->lock, flags); + else if (!spin_trylock_irqsave(&barn->lock, flags)) + return NULL; if (likely(barn->nr_full)) { full = list_first_entry(&barn->sheaves_full, struct slab_sheaf, @@ -3270,7 +3151,8 @@ barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty) * barn. But if there are too many full sheaves, reject this with -E2BIG. */ static struct slab_sheaf * -barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full) +barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full, + bool allow_spin) { struct slab_sheaf *empty; unsigned long flags; @@ -3281,7 +3163,10 @@ barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full) if (!data_race(barn->nr_empty)) return ERR_PTR(-ENOMEM); - spin_lock_irqsave(&barn->lock, flags); + if (likely(allow_spin)) + spin_lock_irqsave(&barn->lock, flags); + else if (!spin_trylock_irqsave(&barn->lock, flags)) + return ERR_PTR(-EBUSY); if (likely(barn->nr_empty)) { empty = list_first_entry(&barn->sheaves_empty, struct slab_sheaf, @@ -3585,7 +3470,7 @@ static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node) flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); } -static void __free_slab(struct kmem_cache *s, struct slab *slab) +static void __free_slab(struct kmem_cache *s, struct slab *slab, bool allow_spin) { struct page *page = slab_page(slab); int order = compound_order(page); @@ -3596,14 +3481,26 @@ static void __free_slab(struct kmem_cache *s, struct slab *slab) __ClearPageSlab(page); mm_account_reclaimed_pages(pages); unaccount_slab(slab, order, s); - free_frozen_pages(page, order); + if (allow_spin) + free_frozen_pages(page, order); + else + free_frozen_pages_nolock(page, order); +} + +static void free_new_slab_nolock(struct kmem_cache *s, struct slab *slab) +{ + /* + * Since it was just allocated, we can skip the actions in + * discard_slab() and free_slab(). + */ + __free_slab(s, slab, false); } static void rcu_free_slab(struct rcu_head *h) { struct slab *slab = container_of(h, struct slab, rcu_head); - __free_slab(slab->slab_cache, slab); + __free_slab(slab->slab_cache, slab, true); } static void free_slab(struct kmem_cache *s, struct slab *slab) @@ -3619,7 +3516,7 @@ static void free_slab(struct kmem_cache *s, struct slab *slab) if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) call_rcu(&slab->rcu_head, rcu_free_slab); else - __free_slab(s, slab); + __free_slab(s, slab, true); } static void discard_slab(struct kmem_cache *s, struct slab *slab) @@ -3647,10 +3544,10 @@ static inline void slab_clear_node_partial(struct slab *slab) * Management of partially allocated slabs. */ static inline void -__add_partial(struct kmem_cache_node *n, struct slab *slab, int tail) +__add_partial(struct kmem_cache_node *n, struct slab *slab, enum add_mode mode) { n->nr_partial++; - if (tail == DEACTIVATE_TO_TAIL) + if (mode == ADD_TO_TAIL) list_add_tail(&slab->slab_list, &n->partial); else list_add(&slab->slab_list, &n->partial); @@ -3658,10 +3555,10 @@ __add_partial(struct kmem_cache_node *n, struct slab *slab, int tail) } static inline void add_partial(struct kmem_cache_node *n, - struct slab *slab, int tail) + struct slab *slab, enum add_mode mode) { lockdep_assert_held(&n->list_lock); - __add_partial(n, slab, tail); + __add_partial(n, slab, mode); } static inline void remove_partial(struct kmem_cache_node *n, @@ -3712,8 +3609,6 @@ static void *alloc_single_from_partial(struct kmem_cache *s, return object; } -static void defer_deactivate_slab(struct slab *slab, void *flush_freelist); - /* * Called only for kmem_cache_debug() caches to allocate from a freshly * allocated slab. Allocate a single object instead of whole freelist @@ -3729,8 +3624,8 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab, void *object; if (!allow_spin && !spin_trylock_irqsave(&n->list_lock, flags)) { - /* Unlucky, discard newly allocated slab */ - defer_deactivate_slab(slab, NULL); + /* Unlucky, discard newly allocated slab. */ + free_new_slab_nolock(s, slab); return NULL; } @@ -3756,7 +3651,7 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab, if (slab->inuse == slab->objects) add_full(s, n, slab); else - add_partial(n, slab, DEACTIVATE_TO_HEAD); + add_partial(n, slab, ADD_TO_HEAD); inc_slabs_node(s, nid, slab->objects); spin_unlock_irqrestore(&n->list_lock, flags); @@ -3764,29 +3659,78 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab, return object; } -#ifdef CONFIG_SLUB_CPU_PARTIAL -static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain); -#else -static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab, - int drain) { } -#endif static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags); +static bool get_partial_node_bulk(struct kmem_cache *s, + struct kmem_cache_node *n, + struct partial_bulk_context *pc, + bool allow_spin) +{ + struct slab *slab, *slab2; + unsigned int total_free = 0; + unsigned long flags; + + /* Racy check to avoid taking the lock unnecessarily. */ + if (!n || data_race(!n->nr_partial)) + return false; + + INIT_LIST_HEAD(&pc->slabs); + + if (allow_spin) + spin_lock_irqsave(&n->list_lock, flags); + else if (!spin_trylock_irqsave(&n->list_lock, flags)) + return false; + + list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { + struct freelist_counters flc; + unsigned int slab_free; + + if (!pfmemalloc_match(slab, pc->flags)) + continue; + + /* + * determine the number of free objects in the slab racily + * + * slab_free is a lower bound due to possible subsequent + * concurrent freeing, so the caller may get more objects than + * requested and must handle that + */ + flc.counters = data_race(READ_ONCE(slab->counters)); + slab_free = flc.objects - flc.inuse; + + /* we have already min and this would get us over the max */ + if (total_free >= pc->min_objects + && total_free + slab_free > pc->max_objects) + break; + + remove_partial(n, slab); + + list_add(&slab->slab_list, &pc->slabs); + + total_free += slab_free; + if (total_free >= pc->max_objects) + break; + } + + spin_unlock_irqrestore(&n->list_lock, flags); + return total_free > 0; +} + /* - * Try to allocate a partial slab from a specific node. + * Try to allocate object from a partial slab on a specific node. */ -static struct slab *get_partial_node(struct kmem_cache *s, - struct kmem_cache_node *n, - struct partial_context *pc) +static void *get_from_partial_node(struct kmem_cache *s, + struct kmem_cache_node *n, + struct partial_context *pc) { - struct slab *slab, *slab2, *partial = NULL; + struct slab *slab, *slab2; unsigned long flags; - unsigned int partial_slabs = 0; + void *object = NULL; /* * Racy check. If we mistakenly see no partial slabs then we * just allocate an empty slab. If we mistakenly try to get a - * partial slab and there is none available then get_partial() + * partial slab and there is none available then get_from_partial() * will return NULL. */ if (!n || !n->nr_partial) @@ -3797,54 +3741,55 @@ static struct slab *get_partial_node(struct kmem_cache *s, else if (!spin_trylock_irqsave(&n->list_lock, flags)) return NULL; list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { + + struct freelist_counters old, new; + if (!pfmemalloc_match(slab, pc->flags)) continue; if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { - void *object = alloc_single_from_partial(s, n, slab, + object = alloc_single_from_partial(s, n, slab, pc->orig_size); - if (object) { - partial = slab; - pc->object = object; + if (object) break; - } continue; } - remove_partial(n, slab); + /* + * get a single object from the slab. This might race against + * __slab_free(), which however has to take the list_lock if + * it's about to make the slab fully free. + */ + do { + old.freelist = slab->freelist; + old.counters = slab->counters; - if (!partial) { - partial = slab; - stat(s, ALLOC_FROM_PARTIAL); + new.freelist = get_freepointer(s, old.freelist); + new.counters = old.counters; + new.inuse++; - if ((slub_get_cpu_partial(s) == 0)) { - break; - } - } else { - put_cpu_partial(s, slab, 0); - stat(s, CPU_PARTIAL_NODE); + } while (!__slab_update_freelist(s, slab, &old, &new, "get_from_partial_node")); - if (++partial_slabs > slub_get_cpu_partial(s) / 2) { - break; - } - } + object = old.freelist; + if (!new.freelist) + remove_partial(n, slab); + + break; } spin_unlock_irqrestore(&n->list_lock, flags); - return partial; + return object; } /* - * Get a slab from somewhere. Search in increasing NUMA distances. + * Get an object from somewhere. Search in increasing NUMA distances. */ -static struct slab *get_any_partial(struct kmem_cache *s, - struct partial_context *pc) +static void *get_from_any_partial(struct kmem_cache *s, struct partial_context *pc) { #ifdef CONFIG_NUMA struct zonelist *zonelist; struct zoneref *z; struct zone *zone; enum zone_type highest_zoneidx = gfp_zone(pc->flags); - struct slab *slab; unsigned int cpuset_mems_cookie; /* @@ -3879,8 +3824,10 @@ static struct slab *get_any_partial(struct kmem_cache *s, if (n && cpuset_zone_allowed(zone, pc->flags) && n->nr_partial > s->min_partial) { - slab = get_partial_node(s, n, pc); - if (slab) { + + void *object = get_from_partial_node(s, n, pc); + + if (object) { /* * Don't check read_mems_allowed_retry() * here - if mems_allowed was updated in @@ -3888,7 +3835,7 @@ static struct slab *get_any_partial(struct kmem_cache *s, * between allocation and the cpuset * update */ - return slab; + return object; } } } @@ -3898,424 +3845,29 @@ static struct slab *get_any_partial(struct kmem_cache *s, } /* - * Get a partial slab, lock it and return it. + * Get an object from a partial slab */ -static struct slab *get_partial(struct kmem_cache *s, int node, - struct partial_context *pc) +static void *get_from_partial(struct kmem_cache *s, int node, + struct partial_context *pc) { - struct slab *slab; int searchnode = node; + void *object; if (node == NUMA_NO_NODE) searchnode = numa_mem_id(); - slab = get_partial_node(s, get_node(s, searchnode), pc); - if (slab || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE))) - return slab; - - return get_any_partial(s, pc); -} - -#ifdef CONFIG_PREEMPTION -/* - * Calculate the next globally unique transaction for disambiguation - * during cmpxchg. The transactions start with the cpu number and are then - * incremented by CONFIG_NR_CPUS. - */ -#define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS) -#else -/* - * No preemption supported therefore also no need to check for - * different cpus. - */ -#define TID_STEP 1 -#endif /* CONFIG_PREEMPTION */ - -static inline unsigned long next_tid(unsigned long tid) -{ - return tid + TID_STEP; -} - -#ifdef SLUB_DEBUG_CMPXCHG -static inline unsigned int tid_to_cpu(unsigned long tid) -{ - return tid % TID_STEP; -} - -static inline unsigned long tid_to_event(unsigned long tid) -{ - return tid / TID_STEP; -} -#endif - -static inline unsigned int init_tid(int cpu) -{ - return cpu; -} - -static inline void note_cmpxchg_failure(const char *n, - const struct kmem_cache *s, unsigned long tid) -{ -#ifdef SLUB_DEBUG_CMPXCHG - unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); - - pr_info("%s %s: cmpxchg redo ", n, s->name); - - if (IS_ENABLED(CONFIG_PREEMPTION) && - tid_to_cpu(tid) != tid_to_cpu(actual_tid)) { - pr_warn("due to cpu change %d -> %d\n", - tid_to_cpu(tid), tid_to_cpu(actual_tid)); - } else if (tid_to_event(tid) != tid_to_event(actual_tid)) { - pr_warn("due to cpu running other code. Event %ld->%ld\n", - tid_to_event(tid), tid_to_event(actual_tid)); - } else { - pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n", - actual_tid, tid, next_tid(tid)); - } -#endif - stat(s, CMPXCHG_DOUBLE_CPU_FAIL); -} - -static void init_kmem_cache_cpus(struct kmem_cache *s) -{ -#ifdef CONFIG_PREEMPT_RT - /* - * Register lockdep key for non-boot kmem caches to avoid - * WARN_ON_ONCE(static_obj(key))) in lockdep_register_key() - */ - bool finegrain_lockdep = !init_section_contains(s, 1); -#else - /* - * Don't bother with different lockdep classes for each - * kmem_cache, since we only use local_trylock_irqsave(). - */ - bool finegrain_lockdep = false; -#endif - int cpu; - struct kmem_cache_cpu *c; - - if (finegrain_lockdep) - lockdep_register_key(&s->lock_key); - for_each_possible_cpu(cpu) { - c = per_cpu_ptr(s->cpu_slab, cpu); - local_trylock_init(&c->lock); - if (finegrain_lockdep) - lockdep_set_class(&c->lock, &s->lock_key); - c->tid = init_tid(cpu); - } -} - -/* - * Finishes removing the cpu slab. Merges cpu's freelist with slab's freelist, - * unfreezes the slabs and puts it on the proper list. - * Assumes the slab has been already safely taken away from kmem_cache_cpu - * by the caller. - */ -static void deactivate_slab(struct kmem_cache *s, struct slab *slab, - void *freelist) -{ - struct kmem_cache_node *n = get_node(s, slab_nid(slab)); - int free_delta = 0; - void *nextfree, *freelist_iter, *freelist_tail; - int tail = DEACTIVATE_TO_HEAD; - unsigned long flags = 0; - struct freelist_counters old, new; - - if (READ_ONCE(slab->freelist)) { - stat(s, DEACTIVATE_REMOTE_FREES); - tail = DEACTIVATE_TO_TAIL; - } - - /* - * Stage one: Count the objects on cpu's freelist as free_delta and - * remember the last object in freelist_tail for later splicing. - */ - freelist_tail = NULL; - freelist_iter = freelist; - while (freelist_iter) { - nextfree = get_freepointer(s, freelist_iter); - - /* - * If 'nextfree' is invalid, it is possible that the object at - * 'freelist_iter' is already corrupted. So isolate all objects - * starting at 'freelist_iter' by skipping them. - */ - if (freelist_corrupted(s, slab, &freelist_iter, nextfree)) - break; - - freelist_tail = freelist_iter; - free_delta++; - - freelist_iter = nextfree; - } - - /* - * Stage two: Unfreeze the slab while splicing the per-cpu - * freelist to the head of slab's freelist. - */ - do { - old.freelist = READ_ONCE(slab->freelist); - old.counters = READ_ONCE(slab->counters); - VM_BUG_ON(!old.frozen); - - /* Determine target state of the slab */ - new.counters = old.counters; - new.frozen = 0; - if (freelist_tail) { - new.inuse -= free_delta; - set_freepointer(s, freelist_tail, old.freelist); - new.freelist = freelist; - } else { - new.freelist = old.freelist; - } - } while (!slab_update_freelist(s, slab, &old, &new, "unfreezing slab")); - - /* - * Stage three: Manipulate the slab list based on the updated state. - */ - if (!new.inuse && n->nr_partial >= s->min_partial) { - stat(s, DEACTIVATE_EMPTY); - discard_slab(s, slab); - stat(s, FREE_SLAB); - } else if (new.freelist) { - spin_lock_irqsave(&n->list_lock, flags); - add_partial(n, slab, tail); - spin_unlock_irqrestore(&n->list_lock, flags); - stat(s, tail); - } else { - stat(s, DEACTIVATE_FULL); - } -} - -/* - * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock - * can be acquired without a deadlock before invoking the function. - * - * Without LOCKDEP we trust the code to be correct. kmalloc_nolock() is - * using local_lock_is_locked() properly before calling local_lock_cpu_slab(), - * and kmalloc() is not used in an unsupported context. - * - * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave(). - * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but - * lockdep_assert() will catch a bug in case: - * #1 - * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock() - * or - * #2 - * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock() - * - * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt - * disabled context. The lock will always be acquired and if needed it - * block and sleep until the lock is available. - * #1 is possible in !PREEMPT_RT only. - * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock: - * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) -> - * tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B) - * - * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B - */ -#if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP) -#define local_lock_cpu_slab(s, flags) \ - local_lock_irqsave(&(s)->cpu_slab->lock, flags) -#else -#define local_lock_cpu_slab(s, flags) \ - do { \ - bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \ - lockdep_assert(__l); \ - } while (0) -#endif - -#define local_unlock_cpu_slab(s, flags) \ - local_unlock_irqrestore(&(s)->cpu_slab->lock, flags) - -#ifdef CONFIG_SLUB_CPU_PARTIAL -static void __put_partials(struct kmem_cache *s, struct slab *partial_slab) -{ - struct kmem_cache_node *n = NULL, *n2 = NULL; - struct slab *slab, *slab_to_discard = NULL; - unsigned long flags = 0; - - while (partial_slab) { - slab = partial_slab; - partial_slab = slab->next; - - n2 = get_node(s, slab_nid(slab)); - if (n != n2) { - if (n) - spin_unlock_irqrestore(&n->list_lock, flags); - - n = n2; - spin_lock_irqsave(&n->list_lock, flags); - } - - if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) { - slab->next = slab_to_discard; - slab_to_discard = slab; - } else { - add_partial(n, slab, DEACTIVATE_TO_TAIL); - stat(s, FREE_ADD_PARTIAL); - } - } - - if (n) - spin_unlock_irqrestore(&n->list_lock, flags); - - while (slab_to_discard) { - slab = slab_to_discard; - slab_to_discard = slab_to_discard->next; - - stat(s, DEACTIVATE_EMPTY); - discard_slab(s, slab); - stat(s, FREE_SLAB); - } -} - -/* - * Put all the cpu partial slabs to the node partial list. - */ -static void put_partials(struct kmem_cache *s) -{ - struct slab *partial_slab; - unsigned long flags; - - local_lock_irqsave(&s->cpu_slab->lock, flags); - partial_slab = this_cpu_read(s->cpu_slab->partial); - this_cpu_write(s->cpu_slab->partial, NULL); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); - - if (partial_slab) - __put_partials(s, partial_slab); -} - -static void put_partials_cpu(struct kmem_cache *s, - struct kmem_cache_cpu *c) -{ - struct slab *partial_slab; - - partial_slab = slub_percpu_partial(c); - c->partial = NULL; - - if (partial_slab) - __put_partials(s, partial_slab); -} - -/* - * Put a slab into a partial slab slot if available. - * - * If we did not find a slot then simply move all the partials to the - * per node partial list. - */ -static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) -{ - struct slab *oldslab; - struct slab *slab_to_put = NULL; - unsigned long flags; - int slabs = 0; - - local_lock_cpu_slab(s, flags); - - oldslab = this_cpu_read(s->cpu_slab->partial); - - if (oldslab) { - if (drain && oldslab->slabs >= s->cpu_partial_slabs) { - /* - * Partial array is full. Move the existing set to the - * per node partial list. Postpone the actual unfreezing - * outside of the critical section. - */ - slab_to_put = oldslab; - oldslab = NULL; - } else { - slabs = oldslab->slabs; - } - } - - slabs++; - - slab->slabs = slabs; - slab->next = oldslab; - - this_cpu_write(s->cpu_slab->partial, slab); - - local_unlock_cpu_slab(s, flags); - - if (slab_to_put) { - __put_partials(s, slab_to_put); - stat(s, CPU_PARTIAL_DRAIN); - } -} - -#else /* CONFIG_SLUB_CPU_PARTIAL */ - -static inline void put_partials(struct kmem_cache *s) { } -static inline void put_partials_cpu(struct kmem_cache *s, - struct kmem_cache_cpu *c) { } - -#endif /* CONFIG_SLUB_CPU_PARTIAL */ + object = get_from_partial_node(s, get_node(s, searchnode), pc); + if (object || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE))) + return object; -static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) -{ - unsigned long flags; - struct slab *slab; - void *freelist; - - local_lock_irqsave(&s->cpu_slab->lock, flags); - - slab = c->slab; - freelist = c->freelist; - - c->slab = NULL; - c->freelist = NULL; - c->tid = next_tid(c->tid); - - local_unlock_irqrestore(&s->cpu_slab->lock, flags); - - if (slab) { - deactivate_slab(s, slab, freelist); - stat(s, CPUSLAB_FLUSH); - } -} - -static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) -{ - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - void *freelist = c->freelist; - struct slab *slab = c->slab; - - c->slab = NULL; - c->freelist = NULL; - c->tid = next_tid(c->tid); - - if (slab) { - deactivate_slab(s, slab, freelist); - stat(s, CPUSLAB_FLUSH); - } - - put_partials_cpu(s, c); -} - -static inline void flush_this_cpu_slab(struct kmem_cache *s) -{ - struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); - - if (c->slab) - flush_slab(s, c); - - put_partials(s); -} - -static bool has_cpu_slab(int cpu, struct kmem_cache *s) -{ - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - - return c->slab || slub_percpu_partial(c); + return get_from_any_partial(s, pc); } static bool has_pcs_used(int cpu, struct kmem_cache *s) { struct slub_percpu_sheaves *pcs; - if (!s->cpu_sheaves) + if (!cache_has_sheaves(s)) return false; pcs = per_cpu_ptr(s->cpu_sheaves, cpu); @@ -4324,11 +3876,11 @@ static bool has_pcs_used(int cpu, struct kmem_cache *s) } /* - * Flush cpu slab. + * Flush percpu sheaves * * Called from CPU work handler with migration disabled. */ -static void flush_cpu_slab(struct work_struct *w) +static void flush_cpu_sheaves(struct work_struct *w) { struct kmem_cache *s; struct slub_flush_work *sfw; @@ -4337,10 +3889,8 @@ static void flush_cpu_slab(struct work_struct *w) s = sfw->s; - if (s->cpu_sheaves) + if (cache_has_sheaves(s)) pcs_flush_all(s); - - flush_this_cpu_slab(s); } static void flush_all_cpus_locked(struct kmem_cache *s) @@ -4353,11 +3903,11 @@ static void flush_all_cpus_locked(struct kmem_cache *s) for_each_online_cpu(cpu) { sfw = &per_cpu(slub_flush, cpu); - if (!has_cpu_slab(cpu, s) && !has_pcs_used(cpu, s)) { + if (!has_pcs_used(cpu, s)) { sfw->skip = true; continue; } - INIT_WORK(&sfw->work, flush_cpu_slab); + INIT_WORK(&sfw->work, flush_cpu_sheaves); sfw->skip = false; sfw->s = s; queue_work_on(cpu, flushwq, &sfw->work); @@ -4442,7 +3992,7 @@ void flush_all_rcu_sheaves(void) mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { - if (!s->cpu_sheaves) + if (!cache_has_sheaves(s)) continue; flush_rcu_sheaves_on_cache(s); } @@ -4463,27 +4013,13 @@ static int slub_cpu_dead(unsigned int cpu) mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { - __flush_cpu_slab(s, cpu); - if (s->cpu_sheaves) + if (cache_has_sheaves(s)) __pcs_flush_all_cpu(s, cpu); } mutex_unlock(&slab_mutex); return 0; } -/* - * Check if the objects in a per cpu structure fit numa - * locality expectations. - */ -static inline int node_match(struct slab *slab, int node) -{ -#ifdef CONFIG_NUMA - if (node != NUMA_NO_NODE && slab_nid(slab) != node) - return 0; -#endif - return 1; -} - #ifdef CONFIG_SLUB_DEBUG static int count_free(struct slab *slab) { @@ -4656,235 +4192,116 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags) return true; } -static inline bool -__update_cpu_freelist_fast(struct kmem_cache *s, - void *freelist_old, void *freelist_new, - unsigned long tid) -{ - struct freelist_tid old = { .freelist = freelist_old, .tid = tid }; - struct freelist_tid new = { .freelist = freelist_new, .tid = next_tid(tid) }; - - return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid, - &old.freelist_tid, new.freelist_tid); -} - /* - * Check the slab->freelist and either transfer the freelist to the - * per cpu freelist or deactivate the slab. + * Get the slab's freelist and do not freeze it. * - * The slab is still frozen if the return value is not NULL. + * Assumes the slab is isolated from node partial list and not frozen. * - * If this function returns NULL then the slab has been unfrozen. + * Assumes this is performed only for caches without debugging so we + * don't need to worry about adding the slab to the full list. */ -static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) +static inline void *get_freelist_nofreeze(struct kmem_cache *s, struct slab *slab) { struct freelist_counters old, new; - lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); - do { old.freelist = slab->freelist; old.counters = slab->counters; new.freelist = NULL; new.counters = old.counters; + VM_WARN_ON_ONCE(new.frozen); new.inuse = old.objects; - new.frozen = old.freelist != NULL; - - } while (!__slab_update_freelist(s, slab, &old, &new, "get_freelist")); + } while (!slab_update_freelist(s, slab, &old, &new, "get_freelist_nofreeze")); return old.freelist; } /* - * Freeze the partial slab and return the pointer to the freelist. + * If the object has been wiped upon free, make sure it's fully initialized by + * zeroing out freelist pointer. + * + * Note that we also wipe custom freelist pointers. */ -static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab) +static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, + void *obj) { - struct freelist_counters old, new; - - do { - old.freelist = slab->freelist; - old.counters = slab->counters; - - new.freelist = NULL; - new.counters = old.counters; - VM_BUG_ON(new.frozen); - - new.inuse = old.objects; - new.frozen = 1; - - } while (!slab_update_freelist(s, slab, &old, &new, "freeze_slab")); - - return old.freelist; + if (unlikely(slab_want_init_on_free(s)) && obj && + !freeptr_outside_object(s)) + memset((void *)((char *)kasan_reset_tag(obj) + s->offset), + 0, sizeof(void *)); } -/* - * Slow path. The lockless freelist is empty or we need to perform - * debugging duties. - * - * Processing is still very fast if new objects have been freed to the - * regular freelist. In that case we simply take over the regular freelist - * as the lockless freelist and zap the regular freelist. - * - * If that is not working then we fall back to the partial lists. We take the - * first element of the freelist as the object to allocate now and move the - * rest of the freelist to the lockless freelist. - * - * And if we were unable to get a new slab from the partial slab lists then - * we need to allocate a new slab. This is the slowest path since it involves - * a call to the page allocator and the setup of a new slab. - * - * Version of __slab_alloc to use when we know that preemption is - * already disabled (which is the case for bulk allocation). - */ -static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, - unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) +static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab, + void **p, unsigned int count, bool allow_spin) { - bool allow_spin = gfpflags_allow_spinning(gfpflags); - void *freelist; - struct slab *slab; + unsigned int allocated = 0; + struct kmem_cache_node *n; + bool needs_add_partial; unsigned long flags; - struct partial_context pc; - bool try_thisnode = true; - - stat(s, ALLOC_SLOWPATH); - -reread_slab: - - slab = READ_ONCE(c->slab); - if (!slab) { - /* - * if the node is not online or has no normal memory, just - * ignore the node constraint - */ - if (unlikely(node != NUMA_NO_NODE && - !node_isset(node, slab_nodes))) - node = NUMA_NO_NODE; - goto new_slab; - } - - if (unlikely(!node_match(slab, node))) { - /* - * same as above but node_match() being false already - * implies node != NUMA_NO_NODE. - * - * We don't strictly honor pfmemalloc and NUMA preferences - * when !allow_spin because: - * - * 1. Most kmalloc() users allocate objects on the local node, - * so kmalloc_nolock() tries not to interfere with them by - * deactivating the cpu slab. - * - * 2. Deactivating due to NUMA or pfmemalloc mismatch may cause - * unnecessary slab allocations even when n->partial list - * is not empty. - */ - if (!node_isset(node, slab_nodes) || - !allow_spin) { - node = NUMA_NO_NODE; - } else { - stat(s, ALLOC_NODE_MISMATCH); - goto deactivate_slab; - } - } + void *object; /* - * By rights, we should be searching for a slab page that was - * PFMEMALLOC but right now, we are losing the pfmemalloc - * information when the page leaves the per-cpu allocator + * Are we going to put the slab on the partial list? + * Note slab->inuse is 0 on a new slab. */ - if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) - goto deactivate_slab; + needs_add_partial = (slab->objects > count); - /* must check again c->slab in case we got preempted and it changed */ - local_lock_cpu_slab(s, flags); + if (!allow_spin && needs_add_partial) { - if (unlikely(slab != c->slab)) { - local_unlock_cpu_slab(s, flags); - goto reread_slab; - } - freelist = c->freelist; - if (freelist) - goto load_freelist; + n = get_node(s, slab_nid(slab)); - freelist = get_freelist(s, slab); - - if (!freelist) { - c->slab = NULL; - c->tid = next_tid(c->tid); - local_unlock_cpu_slab(s, flags); - stat(s, DEACTIVATE_BYPASS); - goto new_slab; + if (!spin_trylock_irqsave(&n->list_lock, flags)) { + /* Unlucky, discard newly allocated slab */ + free_new_slab_nolock(s, slab); + return 0; + } } - stat(s, ALLOC_REFILL); - -load_freelist: - - lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); + object = slab->freelist; + while (object && allocated < count) { + p[allocated] = object; + object = get_freepointer(s, object); + maybe_wipe_obj_freeptr(s, p[allocated]); - /* - * freelist is pointing to the list of objects to be used. - * slab is pointing to the slab from which the objects are obtained. - * That slab must be frozen for per cpu allocations to work. - */ - VM_BUG_ON(!c->slab->frozen); - c->freelist = get_freepointer(s, freelist); - c->tid = next_tid(c->tid); - local_unlock_cpu_slab(s, flags); - return freelist; - -deactivate_slab: - - local_lock_cpu_slab(s, flags); - if (slab != c->slab) { - local_unlock_cpu_slab(s, flags); - goto reread_slab; - } - freelist = c->freelist; - c->slab = NULL; - c->freelist = NULL; - c->tid = next_tid(c->tid); - local_unlock_cpu_slab(s, flags); - deactivate_slab(s, slab, freelist); + slab->inuse++; + allocated++; + } + slab->freelist = object; -new_slab: + if (needs_add_partial) { -#ifdef CONFIG_SLUB_CPU_PARTIAL - while (slub_percpu_partial(c)) { - local_lock_cpu_slab(s, flags); - if (unlikely(c->slab)) { - local_unlock_cpu_slab(s, flags); - goto reread_slab; - } - if (unlikely(!slub_percpu_partial(c))) { - local_unlock_cpu_slab(s, flags); - /* we were preempted and partial list got empty */ - goto new_objects; + if (allow_spin) { + n = get_node(s, slab_nid(slab)); + spin_lock_irqsave(&n->list_lock, flags); } + add_partial(n, slab, ADD_TO_HEAD); + spin_unlock_irqrestore(&n->list_lock, flags); + } - slab = slub_percpu_partial(c); - slub_set_percpu_partial(c, slab); - - if (likely(node_match(slab, node) && - pfmemalloc_match(slab, gfpflags)) || - !allow_spin) { - c->slab = slab; - freelist = get_freelist(s, slab); - VM_BUG_ON(!freelist); - stat(s, CPU_PARTIAL_ALLOC); - goto load_freelist; - } + inc_slabs_node(s, slab_nid(slab), slab->objects); + return allocated; +} - local_unlock_cpu_slab(s, flags); +/* + * Slow path. We failed to allocate via percpu sheaves or they are not available + * due to bootstrap or debugging enabled or SLUB_TINY. + * + * We try to allocate from partial slab lists and fall back to allocating a new + * slab. + */ +static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + unsigned long addr, unsigned int orig_size) +{ + bool allow_spin = gfpflags_allow_spinning(gfpflags); + void *object; + struct slab *slab; + struct partial_context pc; + bool try_thisnode = true; - slab->next = NULL; - __put_partials(s, slab); - } -#endif + stat(s, ALLOC_SLOWPATH); new_objects: @@ -4893,12 +4310,12 @@ new_objects: * When a preferred node is indicated but no __GFP_THISNODE * * 1) try to get a partial slab from target node only by having - * __GFP_THISNODE in pc.flags for get_partial() + * __GFP_THISNODE in pc.flags for get_from_partial() * 2) if 1) failed, try to allocate a new slab from target node with * GPF_NOWAIT | __GFP_THISNODE opportunistically * 3) if 2) failed, retry with original gfpflags which will allow - * get_partial() try partial lists of other nodes before potentially - * allocating new page from other nodes + * get_from_partial() try partial lists of other nodes before + * potentially allocating new page from other nodes */ if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) && try_thisnode)) { @@ -4910,33 +4327,11 @@ new_objects: } pc.orig_size = orig_size; - slab = get_partial(s, node, &pc); - if (slab) { - if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { - freelist = pc.object; - /* - * For debug caches here we had to go through - * alloc_single_from_partial() so just store the - * tracking info and return the object. - * - * Due to disabled preemption we need to disallow - * blocking. The flags are further adjusted by - * gfp_nested_mask() in stack_depot itself. - */ - if (s->flags & SLAB_STORE_USER) - set_track(s, freelist, TRACK_ALLOC, addr, - gfpflags & ~(__GFP_DIRECT_RECLAIM)); + object = get_from_partial(s, node, &pc); + if (object) + goto success; - return freelist; - } - - freelist = freeze_slab(s, slab); - goto retry_load_slab; - } - - slub_put_cpu_ptr(s->cpu_slab); slab = new_slab(s, pc.flags, node); - c = slub_get_cpu_ptr(s->cpu_slab); if (unlikely(!slab)) { if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) @@ -4951,165 +4346,36 @@ new_objects: stat(s, ALLOC_SLAB); if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { - freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); - - if (unlikely(!freelist)) { - /* This could cause an endless loop. Fail instead. */ - if (!allow_spin) - return NULL; - goto new_objects; - } - - if (s->flags & SLAB_STORE_USER) - set_track(s, freelist, TRACK_ALLOC, addr, - gfpflags & ~(__GFP_DIRECT_RECLAIM)); - - return freelist; - } - - /* - * No other reference to the slab yet so we can - * muck around with it freely without cmpxchg - */ - freelist = slab->freelist; - slab->freelist = NULL; - slab->inuse = slab->objects; - slab->frozen = 1; + object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); - inc_slabs_node(s, slab_nid(slab), slab->objects); + if (likely(object)) + goto success; + } else { + alloc_from_new_slab(s, slab, &object, 1, allow_spin); - if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) { - /* - * For !pfmemalloc_match() case we don't load freelist so that - * we don't make further mismatched allocations easier. - */ - deactivate_slab(s, slab, get_freepointer(s, freelist)); - return freelist; + /* we don't need to check SLAB_STORE_USER here */ + if (likely(object)) + return object; } -retry_load_slab: - - local_lock_cpu_slab(s, flags); - if (unlikely(c->slab)) { - void *flush_freelist = c->freelist; - struct slab *flush_slab = c->slab; - - c->slab = NULL; - c->freelist = NULL; - c->tid = next_tid(c->tid); - - local_unlock_cpu_slab(s, flags); - - if (unlikely(!allow_spin)) { - /* Reentrant slub cannot take locks, defer */ - defer_deactivate_slab(flush_slab, flush_freelist); - } else { - deactivate_slab(s, flush_slab, flush_freelist); - } - - stat(s, CPUSLAB_FLUSH); - - goto retry_load_slab; - } - c->slab = slab; + if (allow_spin) + goto new_objects; - goto load_freelist; -} -/* - * We disallow kprobes in ___slab_alloc() to prevent reentrance - * - * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of - * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf -> - * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast() - * manipulating c->freelist without lock. - * - * This does not prevent kprobe in functions called from ___slab_alloc() such as - * local_lock_irqsave() itself, and that is fine, we only need to protect the - * c->freelist manipulation in ___slab_alloc() itself. - */ -NOKPROBE_SYMBOL(___slab_alloc); + /* This could cause an endless loop. Fail instead. */ + return NULL; -/* - * A wrapper for ___slab_alloc() for contexts where preemption is not yet - * disabled. Compensates for possible cpu changes by refetching the per cpu area - * pointer. - */ -static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, - unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) -{ - void *p; +success: + if (kmem_cache_debug_flags(s, SLAB_STORE_USER)) + set_track(s, object, TRACK_ALLOC, addr, gfpflags); -#ifdef CONFIG_PREEMPT_COUNT - /* - * We may have been preempted and rescheduled on a different - * cpu before disabling preemption. Need to reload cpu area - * pointer. - */ - c = slub_get_cpu_ptr(s->cpu_slab); -#endif - if (unlikely(!gfpflags_allow_spinning(gfpflags))) { - if (local_lock_is_locked(&s->cpu_slab->lock)) { - /* - * EBUSY is an internal signal to kmalloc_nolock() to - * retry a different bucket. It's not propagated - * to the caller. - */ - p = ERR_PTR(-EBUSY); - goto out; - } - } - p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size); -out: -#ifdef CONFIG_PREEMPT_COUNT - slub_put_cpu_ptr(s->cpu_slab); -#endif - return p; + return object; } static __always_inline void *__slab_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) { - struct kmem_cache_cpu *c; - struct slab *slab; - unsigned long tid; void *object; -redo: - /* - * Must read kmem_cache cpu data via this cpu ptr. Preemption is - * enabled. We may switch back and forth between cpus while - * reading from one cpu area. That does not matter as long - * as we end up on the original cpu again when doing the cmpxchg. - * - * We must guarantee that tid and kmem_cache_cpu are retrieved on the - * same cpu. We read first the kmem_cache_cpu pointer and use it to read - * the tid. If we are preempted and switched to another cpu between the - * two reads, it's OK as the two are still associated with the same cpu - * and cmpxchg later will validate the cpu. - */ - c = raw_cpu_ptr(s->cpu_slab); - tid = READ_ONCE(c->tid); - - /* - * Irqless object alloc/free algorithm used here depends on sequence - * of fetching cpu_slab's data. tid should be fetched before anything - * on c to guarantee that object and slab associated with previous tid - * won't be used with current tid. If we fetch tid first, object and - * slab could be one associated with next tid and our alloc/free - * request will be failed. In this case, we will retry. So, no problem. - */ - barrier(); - - /* - * The transaction ids are globally unique per cpu and per operation on - * a per cpu queue. Thus they can be guarantee that the cmpxchg_double - * occurs on the right processor and that there was no operation on the - * linked list in between. - */ - - object = c->freelist; - slab = c->slab; - #ifdef CONFIG_NUMA if (static_branch_unlikely(&strict_numa) && node == NUMA_NO_NODE) { @@ -5118,66 +4384,24 @@ redo: if (mpol) { /* - * Special BIND rule support. If existing slab + * Special BIND rule support. If the local node * is in permitted set then do not redirect * to a particular node. * Otherwise we apply the memory policy to get * the node we need to allocate on. */ - if (mpol->mode != MPOL_BIND || !slab || - !node_isset(slab_nid(slab), mpol->nodes)) - + if (mpol->mode != MPOL_BIND || + !node_isset(numa_mem_id(), mpol->nodes)) node = mempolicy_slab_node(); } } #endif - if (!USE_LOCKLESS_FAST_PATH() || - unlikely(!object || !slab || !node_match(slab, node))) { - object = __slab_alloc(s, gfpflags, node, addr, c, orig_size); - } else { - void *next_object = get_freepointer_safe(s, object); - - /* - * The cmpxchg will only match if there was no additional - * operation and if we are on the right processor. - * - * The cmpxchg does the following atomically (without lock - * semantics!) - * 1. Relocate first pointer to the current per cpu area. - * 2. Verify that tid and freelist have not been changed - * 3. If they were not changed replace tid and freelist - * - * Since this is without lock semantics the protection is only - * against code executing on this cpu *not* from access by - * other cpus. - */ - if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) { - note_cmpxchg_failure("slab_alloc", s, tid); - goto redo; - } - prefetch_freepointer(s, next_object); - stat(s, ALLOC_FASTPATH); - } + object = ___slab_alloc(s, gfpflags, node, addr, orig_size); return object; } -/* - * If the object has been wiped upon free, make sure it's fully initialized by - * zeroing out freelist pointer. - * - * Note that we also wipe custom freelist pointers. - */ -static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, - void *obj) -{ - if (unlikely(slab_want_init_on_free(s)) && obj && - !freeptr_outside_object(s)) - memset((void *)((char *)kasan_reset_tag(obj) + s->offset), - 0, sizeof(void *)); -} - static __fastpath_inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) { @@ -5264,6 +4488,12 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); + /* Bootstrap or debug cache, back off */ + if (unlikely(!cache_has_sheaves(s))) { + local_unlock(&s->cpu_sheaves->lock); + return NULL; + } + if (pcs->spare && pcs->spare->size > 0) { swap(pcs->main, pcs->spare); return pcs; @@ -5275,7 +4505,8 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, return NULL; } - full = barn_replace_empty_sheaf(barn, pcs->main); + full = barn_replace_empty_sheaf(barn, pcs->main, + gfpflags_allow_spinning(gfp)); if (full) { stat(s, BARN_GET); @@ -5292,7 +4523,7 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, empty = pcs->spare; pcs->spare = NULL; } else { - empty = barn_get_empty_sheaf(barn); + empty = barn_get_empty_sheaf(barn, true); } } @@ -5334,7 +4565,10 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, */ if (pcs->main->size == 0) { - barn_put_empty_sheaf(barn, pcs->main); + if (!pcs->spare) + pcs->spare = pcs->main; + else + barn_put_empty_sheaf(barn, pcs->main); pcs->main = full; return pcs; } @@ -5391,8 +4625,10 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node) * We assume the percpu sheaves contain only local objects although it's * not completely guaranteed, so we verify later. */ - if (unlikely(node_requested && node != numa_mem_id())) + if (unlikely(node_requested && node != numa_mem_id())) { + stat(s, ALLOC_NODE_MISMATCH); return NULL; + } if (!local_trylock(&s->cpu_sheaves->lock)) return NULL; @@ -5415,6 +4651,7 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node) */ if (page_to_nid(virt_to_page(object)) != node) { local_unlock(&s->cpu_sheaves->lock); + stat(s, ALLOC_NODE_MISMATCH); return NULL; } } @@ -5423,13 +4660,14 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node) local_unlock(&s->cpu_sheaves->lock); - stat(s, ALLOC_PCS); + stat(s, ALLOC_FASTPATH); return object; } static __fastpath_inline -unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p) +unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, gfp_t gfp, size_t size, + void **p) { struct slub_percpu_sheaves *pcs; struct slab_sheaf *main; @@ -5447,6 +4685,11 @@ next_batch: struct slab_sheaf *full; struct node_barn *barn; + if (unlikely(!cache_has_sheaves(s))) { + local_unlock(&s->cpu_sheaves->lock); + return allocated; + } + if (pcs->spare && pcs->spare->size > 0) { swap(pcs->main, pcs->spare); goto do_alloc; @@ -5458,7 +4701,8 @@ next_batch: return allocated; } - full = barn_replace_empty_sheaf(barn, pcs->main); + full = barn_replace_empty_sheaf(barn, pcs->main, + gfpflags_allow_spinning(gfp)); if (full) { stat(s, BARN_GET); @@ -5488,7 +4732,7 @@ do_alloc: local_unlock(&s->cpu_sheaves->lock); - stat_add(s, ALLOC_PCS, batch); + stat_add(s, ALLOC_FASTPATH, batch); allocated += batch; @@ -5526,8 +4770,7 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list if (unlikely(object)) goto out; - if (s->cpu_sheaves) - object = alloc_from_pcs(s, gfpflags, node); + object = alloc_from_pcs(s, gfpflags, node); if (!object) object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); @@ -5622,6 +4865,9 @@ static int __prefill_sheaf_pfmemalloc(struct kmem_cache *s, return ret; } +static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, + size_t size, void **p); + /* * returns a sheaf that has at least the requested size * when prefilling is needed, do so with given gfp flags @@ -5635,18 +4881,10 @@ kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size) struct slab_sheaf *sheaf = NULL; struct node_barn *barn; - if (unlikely(size > s->sheaf_capacity)) { + if (unlikely(!size)) + return NULL; - /* - * slab_debug disables cpu sheaves intentionally so all - * prefilled sheaves become "oversize" and we give up on - * performance for the debugging. Same with SLUB_TINY. - * Creating a cache without sheaves and then requesting a - * prefilled sheaf is however not expected, so warn. - */ - WARN_ON_ONCE(s->sheaf_capacity == 0 && - !IS_ENABLED(CONFIG_SLUB_TINY) && - !(s->flags & SLAB_DEBUG_FLAGS)); + if (unlikely(size > s->sheaf_capacity)) { sheaf = kzalloc(struct_size(sheaf, objects, size), gfp); if (!sheaf) @@ -5968,7 +5206,7 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node) gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags; struct kmem_cache *s; bool can_retry = true; - void *ret = ERR_PTR(-EBUSY); + void *ret; VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO | __GFP_NO_OBJ_EXT)); @@ -5976,13 +5214,13 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node) if (unlikely(!size)) return ZERO_SIZE_PTR; - if (IS_ENABLED(CONFIG_PREEMPT_RT) && !preemptible()) - /* - * kmalloc_nolock() in PREEMPT_RT is not supported from - * non-preemptible context because local_lock becomes a - * sleeping lock on RT. - */ + /* + * See the comment for the same check in + * alloc_frozen_pages_nolock_noprof() + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq())) return NULL; + retry: if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) return NULL; @@ -5991,50 +5229,47 @@ retry: if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s)) /* * kmalloc_nolock() is not supported on architectures that - * don't implement cmpxchg16b, but debug caches don't use - * per-cpu slab and per-cpu partial slabs. They rely on - * kmem_cache_node->list_lock, so kmalloc_nolock() can - * attempt to allocate from debug caches by + * don't implement cmpxchg16b and thus need slab_lock() + * which could be preempted by a nmi. + * But debug caches don't use that and only rely on + * kmem_cache_node->list_lock, so kmalloc_nolock() can attempt + * to allocate from debug caches by * spin_trylock_irqsave(&n->list_lock, ...) */ return NULL; + ret = alloc_from_pcs(s, alloc_gfp, node); + if (ret) + goto success; + /* * Do not call slab_alloc_node(), since trylock mode isn't * compatible with slab_pre_alloc_hook/should_failslab and * kfence_alloc. Hence call __slab_alloc_node() (at most twice) * and slab_post_alloc_hook() directly. - * - * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair - * in irq saved region. It assumes that the same cpu will not - * __update_cpu_freelist_fast() into the same (freelist,tid) pair. - * Therefore use in_nmi() to check whether particular bucket is in - * irq protected section. - * - * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that - * this cpu was interrupted somewhere inside ___slab_alloc() after - * it did local_lock_irqsave(&s->cpu_slab->lock, flags). - * In this case fast path with __update_cpu_freelist_fast() is not safe. */ - if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock)) - ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size); + ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size); - if (PTR_ERR(ret) == -EBUSY) { - if (can_retry) { - /* pick the next kmalloc bucket */ - size = s->object_size + 1; - /* - * Another alternative is to - * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT; - * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT; - * to retry from bucket of the same size. - */ - can_retry = false; - goto retry; - } - ret = NULL; + /* + * It's possible we failed due to trylock as we preempted someone with + * the sheaves locked, and the list_lock is also held by another cpu. + * But it should be rare that multiple kmalloc buckets would have + * sheaves locked, so try a larger one. + */ + if (!ret && can_retry) { + /* pick the next kmalloc bucket */ + size = s->object_size + 1; + /* + * Another alternative is to + * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT; + * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT; + * to retry from bucket of the same size. + */ + can_retry = false; + goto retry; } +success: maybe_wipe_obj_freeptr(s, ret); slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret, slab_want_init_on_alloc(alloc_gfp, s), size); @@ -6116,7 +5351,7 @@ static noinline void free_to_partial_list( /* was on full list */ remove_full(s, n, slab); if (!slab_free) { - add_partial(n, slab, DEACTIVATE_TO_TAIL); + add_partial(n, slab, ADD_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } } else if (slab_free) { @@ -6154,26 +5389,17 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, unsigned long addr) { - bool was_frozen, was_full; + bool was_full; struct freelist_counters old, new; struct kmem_cache_node *n = NULL; unsigned long flags; bool on_node_partial; - stat(s, FREE_SLOWPATH); - if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { free_to_partial_list(s, slab, head, tail, cnt, addr); return; } - /* - * It is enough to test IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) below - * instead of kmem_cache_has_cpu_partial(s), because kmem_cache_debug(s) - * is the only other reason it can be false, and it is already handled - * above. - */ - do { if (unlikely(n)) { spin_unlock_irqrestore(&n->list_lock, flags); @@ -6184,7 +5410,6 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, old.counters = slab->counters; was_full = (old.freelist == NULL); - was_frozen = old.frozen; set_freepointer(s, tail, old.freelist); @@ -6197,53 +5422,29 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, * to (due to not being full anymore) the partial list. * Unless it's frozen. */ - if ((!new.inuse || was_full) && !was_frozen) { + if (!new.inuse || was_full) { + + n = get_node(s, slab_nid(slab)); /* - * If slab becomes non-full and we have cpu partial - * lists, we put it there unconditionally to avoid - * taking the list_lock. Otherwise we need it. + * Speculatively acquire the list_lock. + * If the cmpxchg does not succeed then we may + * drop the list_lock without any processing. + * + * Otherwise the list_lock will synchronize with + * other processors updating the list of slabs. */ - if (!(IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full)) { - - n = get_node(s, slab_nid(slab)); - /* - * Speculatively acquire the list_lock. - * If the cmpxchg does not succeed then we may - * drop the list_lock without any processing. - * - * Otherwise the list_lock will synchronize with - * other processors updating the list of slabs. - */ - spin_lock_irqsave(&n->list_lock, flags); + spin_lock_irqsave(&n->list_lock, flags); - on_node_partial = slab_test_node_partial(slab); - } + on_node_partial = slab_test_node_partial(slab); } } while (!slab_update_freelist(s, slab, &old, &new, "__slab_free")); if (likely(!n)) { - - if (likely(was_frozen)) { - /* - * The list lock was not taken therefore no list - * activity can be necessary. - */ - stat(s, FREE_FROZEN); - } else if (IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full) { - /* - * If we started with a full slab then put it onto the - * per cpu partial list. - */ - put_cpu_partial(s, slab, 1); - stat(s, CPU_PARTIAL_FREE); - } - /* - * In other cases we didn't take the list_lock because the slab - * was already on the partial list and will remain there. + * We didn't take the list_lock because the slab was already on + * the partial list and will remain there. */ - return; } @@ -6265,11 +5466,10 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, /* * Objects left in the slab. If it was not on the partial list before - * then add it. This can only happen when cache has no per cpu partial - * list otherwise we would have put it there. + * then add it. */ - if (!IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && unlikely(was_full)) { - add_partial(n, slab, DEACTIVATE_TO_TAIL); + if (unlikely(was_full)) { + add_partial(n, slab, ADD_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } spin_unlock_irqrestore(&n->list_lock, flags); @@ -6355,7 +5555,8 @@ static void __pcs_install_empty_sheaf(struct kmem_cache *s, * unlocked. */ static struct slub_percpu_sheaves * -__pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs) +__pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, + bool allow_spin) { struct slab_sheaf *empty; struct node_barn *barn; @@ -6364,6 +5565,12 @@ __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs) restart: lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); + /* Bootstrap or debug cache, back off */ + if (unlikely(!cache_has_sheaves(s))) { + local_unlock(&s->cpu_sheaves->lock); + return NULL; + } + barn = get_barn(s); if (!barn) { local_unlock(&s->cpu_sheaves->lock); @@ -6373,7 +5580,7 @@ restart: put_fail = false; if (!pcs->spare) { - empty = barn_get_empty_sheaf(barn); + empty = barn_get_empty_sheaf(barn, allow_spin); if (empty) { pcs->spare = pcs->main; pcs->main = empty; @@ -6387,7 +5594,7 @@ restart: return pcs; } - empty = barn_replace_full_sheaf(barn, pcs->main); + empty = barn_replace_full_sheaf(barn, pcs->main, allow_spin); if (!IS_ERR(empty)) { stat(s, BARN_PUT); @@ -6395,7 +5602,8 @@ restart: return pcs; } - if (PTR_ERR(empty) == -E2BIG) { + /* sheaf_flush_unused() doesn't support !allow_spin */ + if (PTR_ERR(empty) == -E2BIG && allow_spin) { /* Since we got here, spare exists and is full */ struct slab_sheaf *to_flush = pcs->spare; @@ -6420,6 +5628,14 @@ restart: alloc_empty: local_unlock(&s->cpu_sheaves->lock); + /* + * alloc_empty_sheaf() doesn't support !allow_spin and it's + * easier to fall back to freeing directly without sheaves + * than add the support (and to sheaf_flush_unused() above) + */ + if (!allow_spin) + return NULL; + empty = alloc_empty_sheaf(s, GFP_NOWAIT); if (empty) goto got_empty; @@ -6462,7 +5678,7 @@ got_empty: * The object is expected to have passed slab_free_hook() already. */ static __fastpath_inline -bool free_to_pcs(struct kmem_cache *s, void *object) +bool free_to_pcs(struct kmem_cache *s, void *object, bool allow_spin) { struct slub_percpu_sheaves *pcs; @@ -6473,7 +5689,7 @@ bool free_to_pcs(struct kmem_cache *s, void *object) if (unlikely(pcs->main->size == s->sheaf_capacity)) { - pcs = __pcs_replace_full_main(s, pcs); + pcs = __pcs_replace_full_main(s, pcs, allow_spin); if (unlikely(!pcs)) return false; } @@ -6482,7 +5698,7 @@ bool free_to_pcs(struct kmem_cache *s, void *object) local_unlock(&s->cpu_sheaves->lock); - stat(s, FREE_PCS); + stat(s, FREE_FASTPATH); return true; } @@ -6580,6 +5796,12 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj) struct slab_sheaf *empty; struct node_barn *barn; + /* Bootstrap or debug cache, fall back */ + if (unlikely(!cache_has_sheaves(s))) { + local_unlock(&s->cpu_sheaves->lock); + goto fail; + } + if (pcs->spare && pcs->spare->size == 0) { pcs->rcu_free = pcs->spare; pcs->spare = NULL; @@ -6592,7 +5814,7 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj) goto fail; } - empty = barn_get_empty_sheaf(barn); + empty = barn_get_empty_sheaf(barn, true); if (empty) { pcs->rcu_free = empty; @@ -6712,7 +5934,7 @@ next_batch: goto no_empty; if (!pcs->spare) { - empty = barn_get_empty_sheaf(barn); + empty = barn_get_empty_sheaf(barn, true); if (!empty) goto no_empty; @@ -6726,7 +5948,7 @@ next_batch: goto do_free; } - empty = barn_replace_full_sheaf(barn, pcs->main); + empty = barn_replace_full_sheaf(barn, pcs->main, true); if (IS_ERR(empty)) { stat(s, BARN_PUT_FAIL); goto no_empty; @@ -6744,7 +5966,7 @@ do_free: local_unlock(&s->cpu_sheaves->lock); - stat_add(s, FREE_PCS, batch); + stat_add(s, FREE_FASTPATH, batch); if (batch < size) { p += batch; @@ -6766,10 +5988,12 @@ no_empty: */ fallback: __kmem_cache_free_bulk(s, size, p); + stat_add(s, FREE_SLOWPATH, size); flush_remote: if (remote_nr) { __kmem_cache_free_bulk(s, remote_nr, &remote_objects[0]); + stat_add(s, FREE_SLOWPATH, remote_nr); if (i < size) { remote_nr = 0; goto next_remote_batch; @@ -6779,7 +6003,6 @@ flush_remote: struct defer_free { struct llist_head objects; - struct llist_head slabs; struct irq_work work; }; @@ -6787,23 +6010,21 @@ static void free_deferred_objects(struct irq_work *work); static DEFINE_PER_CPU(struct defer_free, defer_free_objects) = { .objects = LLIST_HEAD_INIT(objects), - .slabs = LLIST_HEAD_INIT(slabs), .work = IRQ_WORK_INIT(free_deferred_objects), }; /* * In PREEMPT_RT irq_work runs in per-cpu kthread, so it's safe - * to take sleeping spin_locks from __slab_free() and deactivate_slab(). + * to take sleeping spin_locks from __slab_free(). * In !PREEMPT_RT irq_work will run after local_unlock_irqrestore(). */ static void free_deferred_objects(struct irq_work *work) { struct defer_free *df = container_of(work, struct defer_free, work); struct llist_head *objs = &df->objects; - struct llist_head *slabs = &df->slabs; struct llist_node *llnode, *pos, *t; - if (llist_empty(objs) && llist_empty(slabs)) + if (llist_empty(objs)) return; llnode = llist_del_all(objs); @@ -6826,16 +6047,7 @@ static void free_deferred_objects(struct irq_work *work) set_freepointer(s, x, NULL); __slab_free(s, slab, x, x, 1, _THIS_IP_); - } - - llnode = llist_del_all(slabs); - llist_for_each_safe(pos, t, llnode) { - struct slab *slab = container_of(pos, struct slab, llnode); - - if (slab->frozen) - deactivate_slab(slab->slab_cache, slab, slab->flush_freelist); - else - free_slab(slab->slab_cache, slab); + stat(s, FREE_SLOWPATH); } } @@ -6852,19 +6064,6 @@ static void defer_free(struct kmem_cache *s, void *head) irq_work_queue(&df->work); } -static void defer_deactivate_slab(struct slab *slab, void *flush_freelist) -{ - struct defer_free *df; - - slab->flush_freelist = flush_freelist; - - guard(preempt)(); - - df = this_cpu_ptr(&defer_free_objects); - if (llist_add(&slab->llnode, &df->slabs)) - irq_work_queue(&df->work); -} - void defer_free_barrier(void) { int cpu; @@ -6873,99 +6072,6 @@ void defer_free_barrier(void) irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work); } -/* - * Fastpath with forced inlining to produce a kfree and kmem_cache_free that - * can perform fastpath freeing without additional function calls. - * - * The fastpath is only possible if we are freeing to the current cpu slab - * of this processor. This typically the case if we have just allocated - * the item before. - * - * If fastpath is not possible then fall back to __slab_free where we deal - * with all sorts of special processing. - * - * Bulk free of a freelist with several objects (all pointing to the - * same slab) possible by specifying head and tail ptr, plus objects - * count (cnt). Bulk free indicated by tail pointer being set. - */ -static __always_inline void do_slab_free(struct kmem_cache *s, - struct slab *slab, void *head, void *tail, - int cnt, unsigned long addr) -{ - /* cnt == 0 signals that it's called from kfree_nolock() */ - bool allow_spin = cnt; - struct kmem_cache_cpu *c; - unsigned long tid; - void **freelist; - -redo: - /* - * Determine the currently cpus per cpu slab. - * The cpu may change afterward. However that does not matter since - * data is retrieved via this pointer. If we are on the same cpu - * during the cmpxchg then the free will succeed. - */ - c = raw_cpu_ptr(s->cpu_slab); - tid = READ_ONCE(c->tid); - - /* Same with comment on barrier() in __slab_alloc_node() */ - barrier(); - - if (unlikely(slab != c->slab)) { - if (unlikely(!allow_spin)) { - /* - * __slab_free() can locklessly cmpxchg16 into a slab, - * but then it might need to take spin_lock or local_lock - * in put_cpu_partial() for further processing. - * Avoid the complexity and simply add to a deferred list. - */ - defer_free(s, head); - } else { - __slab_free(s, slab, head, tail, cnt, addr); - } - return; - } - - if (unlikely(!allow_spin)) { - if ((in_nmi() || !USE_LOCKLESS_FAST_PATH()) && - local_lock_is_locked(&s->cpu_slab->lock)) { - defer_free(s, head); - return; - } - cnt = 1; /* restore cnt. kfree_nolock() frees one object at a time */ - } - - if (USE_LOCKLESS_FAST_PATH()) { - freelist = READ_ONCE(c->freelist); - - set_freepointer(s, tail, freelist); - - if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) { - note_cmpxchg_failure("slab_free", s, tid); - goto redo; - } - } else { - __maybe_unused unsigned long flags = 0; - - /* Update the free list under the local lock */ - local_lock_cpu_slab(s, flags); - c = this_cpu_ptr(s->cpu_slab); - if (unlikely(slab != c->slab)) { - local_unlock_cpu_slab(s, flags); - goto redo; - } - tid = c->tid; - freelist = c->freelist; - - set_freepointer(s, tail, freelist); - c->freelist = head; - c->tid = next_tid(tid); - - local_unlock_cpu_slab(s, flags); - } - stat_add(s, FREE_FASTPATH, cnt); -} - static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab, void *object, unsigned long addr) @@ -6976,14 +6082,14 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false))) return; - if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) || - slab_nid(slab) == numa_mem_id()) - && likely(!slab_test_pfmemalloc(slab))) { - if (likely(free_to_pcs(s, object))) + if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id()) + && likely(!slab_test_pfmemalloc(slab))) { + if (likely(free_to_pcs(s, object, true))) return; } - do_slab_free(s, slab, object, object, 1, addr); + __slab_free(s, slab, object, object, 1, addr); + stat(s, FREE_SLOWPATH); } #ifdef CONFIG_MEMCG @@ -6992,7 +6098,7 @@ static noinline void memcg_alloc_abort_single(struct kmem_cache *s, void *object) { if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false))) - do_slab_free(s, virt_to_slab(object), object, object, 1, _RET_IP_); + __slab_free(s, virt_to_slab(object), object, object, 1, _RET_IP_); } #endif @@ -7006,8 +6112,10 @@ void slab_free_bulk(struct kmem_cache *s, struct slab *slab, void *head, * With KASAN enabled slab_free_freelist_hook modifies the freelist * to remove objects, whose reuse must be delayed. */ - if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt))) - do_slab_free(s, slab, head, tail, cnt, addr); + if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt))) { + __slab_free(s, slab, head, tail, cnt, addr); + stat_add(s, FREE_SLOWPATH, cnt); + } } #ifdef CONFIG_SLUB_RCU_DEBUG @@ -7032,15 +6140,18 @@ static void slab_free_after_rcu_debug(struct rcu_head *rcu_head) return; /* resume freeing */ - if (slab_free_hook(s, object, slab_want_init_on_free(s), true)) - do_slab_free(s, slab, object, object, 1, _THIS_IP_); + if (slab_free_hook(s, object, slab_want_init_on_free(s), true)) { + __slab_free(s, slab, object, object, 1, _THIS_IP_); + stat(s, FREE_SLOWPATH); + } } #endif /* CONFIG_SLUB_RCU_DEBUG */ #ifdef CONFIG_KASAN_GENERIC void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) { - do_slab_free(cache, virt_to_slab(x), x, x, 1, addr); + __slab_free(cache, virt_to_slab(x), x, x, 1, addr); + stat(cache, FREE_SLOWPATH); } #endif @@ -7340,7 +6451,18 @@ void kfree_nolock(const void *object) * since kasan quarantine takes locks and not supported from NMI. */ kasan_slab_free(s, x, false, false, /* skip quarantine */true); - do_slab_free(s, slab, x, x, 0, _RET_IP_); + + if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id())) { + if (likely(free_to_pcs(s, x, false))) + return; + } + + /* + * __slab_free() can locklessly cmpxchg16 into a slab, but then it might + * need to take spin_lock for further processing. + * Avoid the complexity and simply add to a deferred list. + */ + defer_free(s, x); } EXPORT_SYMBOL_GPL(kfree_nolock); @@ -7766,7 +6888,7 @@ static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) if (kfence_free(df.freelist)) continue; - do_slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt, + __slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt, _RET_IP_); } while (likely(size)); } @@ -7781,7 +6903,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) * freeing to sheaves is so incompatible with the detached freelist so * once we go that way, we have to do everything differently */ - if (s && s->cpu_sheaves) { + if (s && cache_has_sheaves(s)) { free_to_pcs_bulk(s, size, p); return; } @@ -7799,72 +6921,224 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) } EXPORT_SYMBOL(kmem_cache_free_bulk); -static inline -int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - void **p) +static unsigned int +__refill_objects_node(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, + unsigned int max, struct kmem_cache_node *n, + bool allow_spin) { - struct kmem_cache_cpu *c; - unsigned long irqflags; - int i; + struct partial_bulk_context pc; + struct slab *slab, *slab2; + unsigned int refilled = 0; + unsigned long flags; + void *object; + + pc.flags = gfp; + pc.min_objects = min; + pc.max_objects = max; + + if (!get_partial_node_bulk(s, n, &pc, allow_spin)) + return 0; + + list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) { + + list_del(&slab->slab_list); + + object = get_freelist_nofreeze(s, slab); + + while (object && refilled < max) { + p[refilled] = object; + object = get_freepointer(s, object); + maybe_wipe_obj_freeptr(s, p[refilled]); + + refilled++; + } + + /* + * Freelist had more objects than we can accommodate, we need to + * free them back. We can treat it like a detached freelist, just + * need to find the tail object. + */ + if (unlikely(object)) { + void *head = object; + void *tail; + int cnt = 0; + + do { + tail = object; + cnt++; + object = get_freepointer(s, object); + } while (object); + __slab_free(s, slab, head, tail, cnt, _RET_IP_); + } + + if (refilled >= max) + break; + } + + if (unlikely(!list_empty(&pc.slabs))) { + spin_lock_irqsave(&n->list_lock, flags); + + list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) { + + if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) + continue; + + list_del(&slab->slab_list); + add_partial(n, slab, ADD_TO_HEAD); + } + + spin_unlock_irqrestore(&n->list_lock, flags); + + /* any slabs left are completely free and for discard */ + list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) { + + list_del(&slab->slab_list); + discard_slab(s, slab); + } + } + + return refilled; +} + +#ifdef CONFIG_NUMA +static unsigned int +__refill_objects_any(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, + unsigned int max) +{ + struct zonelist *zonelist; + struct zoneref *z; + struct zone *zone; + enum zone_type highest_zoneidx = gfp_zone(gfp); + unsigned int cpuset_mems_cookie; + unsigned int refilled = 0; + + /* see get_from_any_partial() for the defrag ratio description */ + if (!s->remote_node_defrag_ratio || + get_cycles() % 1024 > s->remote_node_defrag_ratio) + return 0; + + do { + cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist = node_zonelist(mempolicy_slab_node(), gfp); + for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { + struct kmem_cache_node *n; + unsigned int r; + + n = get_node(s, zone_to_nid(zone)); + + if (!n || !cpuset_zone_allowed(zone, gfp) || + n->nr_partial <= s->min_partial) + continue; + + r = __refill_objects_node(s, p, gfp, min, max, n, + /* allow_spin = */ false); + refilled += r; + + if (r >= min) { + /* + * Don't check read_mems_allowed_retry() here - + * if mems_allowed was updated in parallel, that + * was a harmless race between allocation and + * the cpuset update + */ + return refilled; + } + p += r; + min -= r; + max -= r; + } + } while (read_mems_allowed_retry(cpuset_mems_cookie)); + + return refilled; +} +#else +static inline unsigned int +__refill_objects_any(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, + unsigned int max) +{ + return 0; +} +#endif + +static unsigned int +refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, + unsigned int max) +{ + int local_node = numa_mem_id(); + unsigned int refilled; + struct slab *slab; + + if (WARN_ON_ONCE(!gfpflags_allow_spinning(gfp))) + return 0; + + refilled = __refill_objects_node(s, p, gfp, min, max, + get_node(s, local_node), + /* allow_spin = */ true); + if (refilled >= min) + return refilled; + + refilled += __refill_objects_any(s, p + refilled, gfp, min - refilled, + max - refilled); + if (refilled >= min) + return refilled; + +new_slab: + + slab = new_slab(s, gfp, local_node); + if (!slab) + goto out; + + stat(s, ALLOC_SLAB); /* - * Drain objects in the per cpu slab, while disabling local - * IRQs, which protects against PREEMPT and interrupts - * handlers invoking normal fastpath. + * TODO: possible optimization - if we know we will consume the whole + * slab we might skip creating the freelist? */ - c = slub_get_cpu_ptr(s->cpu_slab); - local_lock_irqsave(&s->cpu_slab->lock, irqflags); + refilled += alloc_from_new_slab(s, slab, p + refilled, max - refilled, + /* allow_spin = */ true); - for (i = 0; i < size; i++) { - void *object = c->freelist; + if (refilled < min) + goto new_slab; - if (unlikely(!object)) { - /* - * We may have removed an object from c->freelist using - * the fastpath in the previous iteration; in that case, - * c->tid has not been bumped yet. - * Since ___slab_alloc() may reenable interrupts while - * allocating memory, we should bump c->tid now. - */ - c->tid = next_tid(c->tid); +out: + return refilled; +} + +static inline +int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + int i; - local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { + for (i = 0; i < size; i++) { - /* - * Invoking slow path likely have side-effect - * of re-populating per CPU c->freelist - */ - p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, - _RET_IP_, c, s->object_size); + p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_, + s->object_size); if (unlikely(!p[i])) goto error; - c = this_cpu_ptr(s->cpu_slab); maybe_wipe_obj_freeptr(s, p[i]); - - local_lock_irqsave(&s->cpu_slab->lock, irqflags); - - continue; /* goto for-loop */ } - c->freelist = get_freepointer(s, object); - p[i] = object; - maybe_wipe_obj_freeptr(s, p[i]); - stat(s, ALLOC_FASTPATH); + } else { + i = refill_objects(s, p, flags, size, size); + if (i < size) + goto error; + stat_add(s, ALLOC_SLOWPATH, i); } - c->tid = next_tid(c->tid); - local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); - slub_put_cpu_ptr(s->cpu_slab); return i; error: - slub_put_cpu_ptr(s->cpu_slab); __kmem_cache_free_bulk(s, i, p); return 0; } -/* Note that interrupts must be enabled when calling this function. */ +/* + * Note that interrupts must be enabled when calling this function and gfp + * flags must allow spinning. + */ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, void **p) { @@ -7892,8 +7166,7 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, size--; } - if (s->cpu_sheaves) - i = alloc_from_pcs_bulk(s, size, p); + i = alloc_from_pcs_bulk(s, flags, size, p); if (i < size) { /* @@ -8081,29 +7354,25 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct node_barn *barn) barn_init(barn); } -static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) +#ifdef CONFIG_SLUB_STATS +static inline int alloc_kmem_cache_stats(struct kmem_cache *s) { BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < NR_KMALLOC_TYPES * KMALLOC_SHIFT_HIGH * - sizeof(struct kmem_cache_cpu)); + sizeof(struct kmem_cache_stats)); - /* - * Must align to double word boundary for the double cmpxchg - * instructions to work; see __pcpu_double_call_return_bool(). - */ - s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), - 2 * sizeof(void *)); + s->cpu_stats = alloc_percpu(struct kmem_cache_stats); - if (!s->cpu_slab) + if (!s->cpu_stats) return 0; - init_kmem_cache_cpus(s); - return 1; } +#endif static int init_percpu_sheaves(struct kmem_cache *s) { + static struct slab_sheaf bootstrap_sheaf = {}; int cpu; for_each_possible_cpu(cpu) { @@ -8113,7 +7382,28 @@ static int init_percpu_sheaves(struct kmem_cache *s) local_trylock_init(&pcs->lock); - pcs->main = alloc_empty_sheaf(s, GFP_KERNEL); + /* + * Bootstrap sheaf has zero size so fast-path allocation fails. + * It has also size == s->sheaf_capacity, so fast-path free + * fails. In the slow paths we recognize the situation by + * checking s->sheaf_capacity. This allows fast paths to assume + * s->cpu_sheaves and pcs->main always exists and are valid. + * It's also safe to share the single static bootstrap_sheaf + * with zero-sized objects array as it's never modified. + * + * Bootstrap_sheaf also has NULL pointer to kmem_cache so we + * recognize it and not attempt to free it when destroying the + * cache. + * + * We keep bootstrap_sheaf for kmem_cache and kmem_cache_node, + * caches with debug enabled, and all caches with SLUB_TINY. + * For kmalloc caches it's used temporarily during the initial + * bootstrap. + */ + if (!s->sheaf_capacity) + pcs->main = &bootstrap_sheaf; + else + pcs->main = alloc_empty_sheaf(s, GFP_KERNEL); if (!pcs->main) return -ENOMEM; @@ -8164,7 +7454,7 @@ static void early_kmem_cache_node_alloc(int node) * No locks need to be taken here as it has just been * initialized and there is no concurrent access. */ - __add_partial(n, slab, DEACTIVATE_TO_HEAD); + __add_partial(n, slab, ADD_TO_HEAD); } static void free_kmem_cache_nodes(struct kmem_cache *s) @@ -8188,13 +7478,10 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) void __kmem_cache_release(struct kmem_cache *s) { cache_random_seq_destroy(s); - if (s->cpu_sheaves) - pcs_destroy(s); -#ifdef CONFIG_PREEMPT_RT - if (s->cpu_slab) - lockdep_unregister_key(&s->lock_key); + pcs_destroy(s); +#ifdef CONFIG_SLUB_STATS + free_percpu(s->cpu_stats); #endif - free_percpu(s->cpu_slab); free_kmem_cache_nodes(s); } @@ -8211,7 +7498,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) continue; } - if (s->cpu_sheaves) { + if (cache_has_sheaves(s)) { barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node); if (!barn) @@ -8232,37 +7519,51 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) return 1; } -static void set_cpu_partial(struct kmem_cache *s) +static unsigned int calculate_sheaf_capacity(struct kmem_cache *s, + struct kmem_cache_args *args) + { -#ifdef CONFIG_SLUB_CPU_PARTIAL - unsigned int nr_objects; + unsigned int capacity; + size_t size; + + + if (IS_ENABLED(CONFIG_SLUB_TINY) || s->flags & SLAB_DEBUG_FLAGS) + return 0; /* - * cpu_partial determined the maximum number of objects kept in the - * per cpu partial lists of a processor. - * - * Per cpu partial lists mainly contain slabs that just have one - * object freed. If they are used for allocation then they can be - * filled up again with minimal effort. The slab will never hit the - * per node partial lists and therefore no locking will be required. - * - * For backwards compatibility reasons, this is determined as number - * of objects, even though we now limit maximum number of pages, see - * slub_set_cpu_partial() + * Bootstrap caches can't have sheaves for now (SLAB_NO_OBJ_EXT). + * SLAB_NOLEAKTRACE caches (e.g., kmemleak's object_cache) must not + * have sheaves to avoid recursion when sheaf allocation triggers + * kmemleak tracking. + */ + if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE)) + return 0; + + /* + * For now we use roughly similar formula (divided by two as there are + * two percpu sheaves) as what was used for percpu partial slabs, which + * should result in similar lock contention (barn or list_lock) */ - if (!kmem_cache_has_cpu_partial(s)) - nr_objects = 0; - else if (s->size >= PAGE_SIZE) - nr_objects = 6; + if (s->size >= PAGE_SIZE) + capacity = 4; else if (s->size >= 1024) - nr_objects = 24; + capacity = 12; else if (s->size >= 256) - nr_objects = 52; + capacity = 26; else - nr_objects = 120; + capacity = 60; - slub_set_cpu_partial(s, nr_objects); -#endif + /* Increment capacity to make sheaf exactly a kmalloc size bucket */ + size = struct_size_t(struct slab_sheaf, objects, capacity); + size = kmalloc_size_roundup(size); + capacity = (size - struct_size_t(struct slab_sheaf, objects, 0)) / sizeof(void *); + + /* + * Respect an explicit request for capacity that's typically motivated by + * expected maximum size of kmem_cache_prefill_sheaf() to not end up + * using low-performance oversize sheaves + */ + return max(capacity, args->sheaf_capacity); } /* @@ -8410,6 +7711,13 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) s->allocflags |= __GFP_RECLAIMABLE; /* + * For KMALLOC_NORMAL caches we enable sheaves later by + * bootstrap_kmalloc_sheaves() to avoid recursion + */ + if (!is_kmalloc_normal(s)) + s->sheaf_capacity = calculate_sheaf_capacity(s, args); + + /* * Determine the number of objects per slab */ s->oo = oo_make(order, size); @@ -8493,7 +7801,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s) flush_all_cpus_locked(s); /* we might have rcu sheaves in flight */ - if (s->cpu_sheaves) + if (cache_has_sheaves(s)) rcu_barrier(); /* Attempt to free all objects */ @@ -8805,7 +8113,7 @@ static int slab_mem_going_online_callback(int nid) if (get_node(s, nid)) continue; - if (s->cpu_sheaves) { + if (cache_has_sheaves(s)) { barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, nid); if (!barn) { @@ -8880,12 +8188,6 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) memcpy(s, static_cache, kmem_cache->object_size); - /* - * This runs very early, and only the boot processor is supposed to be - * up. Even if it weren't true, IRQs are not up so we couldn't fire - * IPIs around. - */ - __flush_cpu_slab(s, smp_processor_id()); for_each_kmem_cache_node(s, node, n) { struct slab *p; @@ -8901,6 +8203,74 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) return s; } +/* + * Finish the sheaves initialization done normally by init_percpu_sheaves() and + * init_kmem_cache_nodes(). For normal kmalloc caches we have to bootstrap it + * since sheaves and barns are allocated by kmalloc. + */ +static void __init bootstrap_cache_sheaves(struct kmem_cache *s) +{ + struct kmem_cache_args empty_args = {}; + unsigned int capacity; + bool failed = false; + int node, cpu; + + capacity = calculate_sheaf_capacity(s, &empty_args); + + /* capacity can be 0 due to debugging or SLUB_TINY */ + if (!capacity) + return; + + for_each_node_mask(node, slab_nodes) { + struct node_barn *barn; + + barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node); + + if (!barn) { + failed = true; + goto out; + } + + barn_init(barn); + get_node(s, node)->barn = barn; + } + + for_each_possible_cpu(cpu) { + struct slub_percpu_sheaves *pcs; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + pcs->main = __alloc_empty_sheaf(s, GFP_KERNEL, capacity); + + if (!pcs->main) { + failed = true; + break; + } + } + +out: + /* + * It's still early in boot so treat this like same as a failure to + * create the kmalloc cache in the first place + */ + if (failed) + panic("Out of memory when creating kmem_cache %s\n", s->name); + + s->sheaf_capacity = capacity; +} + +static void __init bootstrap_kmalloc_sheaves(void) +{ + enum kmalloc_cache_type type; + + for (type = KMALLOC_NORMAL; type <= KMALLOC_RANDOM_END; type++) { + for (int idx = 0; idx < KMALLOC_SHIFT_HIGH + 1; idx++) { + if (kmalloc_caches[type][idx]) + bootstrap_cache_sheaves(kmalloc_caches[type][idx]); + } + } +} + void __init kmem_cache_init(void) { static __initdata struct kmem_cache boot_kmem_cache, @@ -8944,6 +8314,8 @@ void __init kmem_cache_init(void) setup_kmalloc_cache_index_table(); create_kmalloc_caches(); + bootstrap_kmalloc_sheaves(); + /* Setup random freelists for each cache */ init_freelist_randomization(); @@ -9011,17 +8383,10 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name, s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2); s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial); - set_cpu_partial(s); - - if (args->sheaf_capacity && !IS_ENABLED(CONFIG_SLUB_TINY) - && !(s->flags & SLAB_DEBUG_FLAGS)) { - s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves); - if (!s->cpu_sheaves) { - err = -ENOMEM; - goto out; - } - // TODO: increase capacity to grow slab_sheaf up to next kmalloc size? - s->sheaf_capacity = args->sheaf_capacity; + s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves); + if (!s->cpu_sheaves) { + err = -ENOMEM; + goto out; } #ifdef CONFIG_NUMA @@ -9037,14 +8402,14 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name, if (!init_kmem_cache_nodes(s)) goto out; - if (!alloc_kmem_cache_cpus(s)) +#ifdef CONFIG_SLUB_STATS + if (!alloc_kmem_cache_stats(s)) goto out; +#endif - if (s->cpu_sheaves) { - err = init_percpu_sheaves(s); - if (err) - goto out; - } + err = init_percpu_sheaves(s); + if (err) + goto out; err = 0; @@ -9359,47 +8724,6 @@ static ssize_t show_slab_objects(struct kmem_cache *s, if (!nodes) return -ENOMEM; - if (flags & SO_CPU) { - int cpu; - - for_each_possible_cpu(cpu) { - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, - cpu); - int node; - struct slab *slab; - - slab = READ_ONCE(c->slab); - if (!slab) - continue; - - node = slab_nid(slab); - if (flags & SO_TOTAL) - x = slab->objects; - else if (flags & SO_OBJECTS) - x = slab->inuse; - else - x = 1; - - total += x; - nodes[node] += x; - -#ifdef CONFIG_SLUB_CPU_PARTIAL - slab = slub_percpu_partial_read_once(c); - if (slab) { - node = slab_nid(slab); - if (flags & SO_TOTAL) - WARN_ON_ONCE(1); - else if (flags & SO_OBJECTS) - WARN_ON_ONCE(1); - else - x = data_race(slab->slabs); - total += x; - nodes[node] += x; - } -#endif - } - } - /* * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex" * already held which will conflict with an existing lock order: @@ -9531,12 +8855,7 @@ SLAB_ATTR(min_partial); static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) { - unsigned int nr_partial = 0; -#ifdef CONFIG_SLUB_CPU_PARTIAL - nr_partial = s->cpu_partial; -#endif - - return sysfs_emit(buf, "%u\n", nr_partial); + return sysfs_emit(buf, "0\n"); } static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, @@ -9548,11 +8867,9 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, err = kstrtouint(buf, 10, &objects); if (err) return err; - if (objects && !kmem_cache_has_cpu_partial(s)) + if (objects) return -EINVAL; - slub_set_cpu_partial(s, objects); - flush_all(s); return length; } SLAB_ATTR(cpu_partial); @@ -9591,42 +8908,7 @@ SLAB_ATTR_RO(objects_partial); static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) { - int objects = 0; - int slabs = 0; - int cpu __maybe_unused; - int len = 0; - -#ifdef CONFIG_SLUB_CPU_PARTIAL - for_each_online_cpu(cpu) { - struct slab *slab; - - slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); - - if (slab) - slabs += data_race(slab->slabs); - } -#endif - - /* Approximate half-full slabs, see slub_set_cpu_partial() */ - objects = (slabs * oo_objects(s->oo)) / 2; - len += sysfs_emit_at(buf, len, "%d(%d)", objects, slabs); - -#ifdef CONFIG_SLUB_CPU_PARTIAL - for_each_online_cpu(cpu) { - struct slab *slab; - - slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); - if (slab) { - slabs = data_race(slab->slabs); - objects = (slabs * oo_objects(s->oo)) / 2; - len += sysfs_emit_at(buf, len, " C%d=%d(%d)", - cpu, objects, slabs); - } - } -#endif - len += sysfs_emit_at(buf, len, "\n"); - - return len; + return sysfs_emit(buf, "0(0)\n"); } SLAB_ATTR_RO(slabs_cpu_partial); @@ -9812,7 +9094,7 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) return -ENOMEM; for_each_online_cpu(cpu) { - unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si]; + unsigned int x = per_cpu_ptr(s->cpu_stats, cpu)->stat[si]; data[cpu] = x; sum += x; @@ -9838,7 +9120,7 @@ static void clear_stat(struct kmem_cache *s, enum stat_item si) int cpu; for_each_online_cpu(cpu) - per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0; + per_cpu_ptr(s->cpu_stats, cpu)->stat[si] = 0; } #define STAT_ATTR(si, text) \ @@ -9856,36 +9138,19 @@ static ssize_t text##_store(struct kmem_cache *s, \ } \ SLAB_ATTR(text); \ -STAT_ATTR(ALLOC_PCS, alloc_cpu_sheaf); STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); -STAT_ATTR(FREE_PCS, free_cpu_sheaf); STAT_ATTR(FREE_RCU_SHEAF, free_rcu_sheaf); STAT_ATTR(FREE_RCU_SHEAF_FAIL, free_rcu_sheaf_fail); STAT_ATTR(FREE_FASTPATH, free_fastpath); STAT_ATTR(FREE_SLOWPATH, free_slowpath); -STAT_ATTR(FREE_FROZEN, free_frozen); STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial); STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); -STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); STAT_ATTR(ALLOC_SLAB, alloc_slab); -STAT_ATTR(ALLOC_REFILL, alloc_refill); STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch); STAT_ATTR(FREE_SLAB, free_slab); -STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); -STAT_ATTR(DEACTIVATE_FULL, deactivate_full); -STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); -STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); -STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); -STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); -STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); STAT_ATTR(ORDER_FALLBACK, order_fallback); -STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); -STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); -STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); -STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); -STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); STAT_ATTR(SHEAF_FLUSH, sheaf_flush); STAT_ATTR(SHEAF_REFILL, sheaf_refill); STAT_ATTR(SHEAF_ALLOC, sheaf_alloc); @@ -9961,36 +9226,19 @@ static struct attribute *slab_attrs[] = { &remote_node_defrag_ratio_attr.attr, #endif #ifdef CONFIG_SLUB_STATS - &alloc_cpu_sheaf_attr.attr, &alloc_fastpath_attr.attr, &alloc_slowpath_attr.attr, - &free_cpu_sheaf_attr.attr, &free_rcu_sheaf_attr.attr, &free_rcu_sheaf_fail_attr.attr, &free_fastpath_attr.attr, &free_slowpath_attr.attr, - &free_frozen_attr.attr, &free_add_partial_attr.attr, &free_remove_partial_attr.attr, - &alloc_from_partial_attr.attr, &alloc_slab_attr.attr, - &alloc_refill_attr.attr, &alloc_node_mismatch_attr.attr, &free_slab_attr.attr, - &cpuslab_flush_attr.attr, - &deactivate_full_attr.attr, - &deactivate_empty_attr.attr, - &deactivate_to_head_attr.attr, - &deactivate_to_tail_attr.attr, - &deactivate_remote_frees_attr.attr, - &deactivate_bypass_attr.attr, &order_fallback_attr.attr, &cmpxchg_double_fail_attr.attr, - &cmpxchg_double_cpu_fail_attr.attr, - &cpu_partial_alloc_attr.attr, - &cpu_partial_free_attr.attr, - &cpu_partial_node_attr.attr, - &cpu_partial_drain_attr.attr, &sheaf_flush_attr.attr, &sheaf_refill_attr.attr, &sheaf_alloc_attr.attr, |
