 fs/ext4/super.c      |   19
 include/linux/slab.h |   40
 mm/Kconfig           |   11
 mm/internal.h        |    1
 mm/memcontrol.c      |   31
 mm/page_alloc.c      |    5
 mm/slab.h            |  213
 mm/slab_common.c     |  153
 mm/slub.c            | 3203
 9 files changed, 1685 insertions(+), 1991 deletions(-)
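Two patterns recur throughout this series; both are sketched up front here. The identifiers my_obj, my_obj_ctor, my_cachep and obj_has_objcg below are illustrative only, not part of the patch.

First, cache creation goes through the args-based kmem_cache_create(), and a constructor no longer forces the free pointer out of the object: a cache may now pass both .ctor and .freeptr_offset, provided the constructor does not initialize the field the free pointer overlays (the ext4 conversion immediately below does exactly this with i_flags). A minimal sketch under those assumptions:

  #include <linux/init.h>
  #include <linux/slab.h>
  #include <linux/spinlock.h>

  struct my_obj {
          unsigned long flags;    /* not touched by the ctor, may hold the free pointer */
          char data[64];          /* user-copy region */
          spinlock_t lock;        /* initialized once by the ctor */
  };

  static void my_obj_ctor(void *p)
  {
          struct my_obj *obj = p;

          spin_lock_init(&obj->lock);
  }

  static struct kmem_cache *my_cachep;

  static int __init my_cache_init(void)
  {
          struct kmem_cache_args args = {
                  .useroffset = offsetof(struct my_obj, data),
                  .usersize = sizeof_field(struct my_obj, data),
                  /*
                   * Keep the free pointer inside the object so the object
                   * doesn't grow; .freeptr_offset must not overlay any
                   * ctor-initialized field, and 0 is a valid offset.
                   */
                  .use_freeptr_offset = true,
                  .freeptr_offset = offsetof(struct my_obj, flags),
                  .ctor = my_obj_ctor,
          };

          my_cachep = kmem_cache_create("my_cache", sizeof(struct my_obj),
                                        &args, SLAB_ACCOUNT);
          return my_cachep ? 0 : -ENOMEM;
  }

Second, slab_obj_exts() now returns an unsigned long that must not be dereferenced directly: individual entries are reached through slab_obj_ext(), which applies the per-slab stride, and accesses are bracketed by get_slab_obj_exts()/put_slab_obj_exts(), which only toggle kasan/kmsan metadata-access depth rather than taking references. A sketch of that discipline, mirroring the mm/slab.h and mm/memcontrol.c hunks below, written as a hypothetical mm-internal helper assuming CONFIG_MEMCG:

  /* Does @obj (an object of cache @s living in @slab) have an objcg attached? */
  static bool obj_has_objcg(struct kmem_cache *s, struct slab *slab, void *obj)
  {
          unsigned long obj_exts = slab_obj_exts(slab);
          struct slabobj_ext *obj_ext;
          bool ret;

          if (!obj_exts)
                  return false;

          get_slab_obj_exts(obj_exts);
          obj_ext = slab_obj_ext(slab, obj_exts, obj_to_index(s, slab, obj));
          ret = obj_ext->objcg != NULL;
          put_slab_obj_exts(obj_exts);

          return ret;
  }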
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a8d2460b527a..3c73b982a4f7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1496,12 +1496,19 @@ static void init_once(void *foo) static int __init init_inodecache(void) { - ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache", - sizeof(struct ext4_inode_info), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, - offsetof(struct ext4_inode_info, i_data), - sizeof_field(struct ext4_inode_info, i_data), - init_once); + struct kmem_cache_args args = { + .useroffset = offsetof(struct ext4_inode_info, i_data), + .usersize = sizeof_field(struct ext4_inode_info, i_data), + .use_freeptr_offset = true, + .freeptr_offset = offsetof(struct ext4_inode_info, i_flags), + .ctor = init_once, + }; + + ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", + sizeof(struct ext4_inode_info), + &args, + SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT); + if (ext4_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/include/linux/slab.h b/include/linux/slab.h index 7701b38cedec..c5fde8740281 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -58,8 +58,9 @@ enum _slab_flag_bits { #endif _SLAB_OBJECT_POISON, _SLAB_CMPXCHG_DOUBLE, -#ifdef CONFIG_SLAB_OBJ_EXT _SLAB_NO_OBJ_EXT, +#if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT) + _SLAB_OBJ_EXT_IN_OBJ, #endif _SLAB_FLAGS_LAST_BIT }; @@ -239,10 +240,12 @@ enum _slab_flag_bits { #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ /* Slab created using create_boot_cache */ -#ifdef CONFIG_SLAB_OBJ_EXT #define SLAB_NO_OBJ_EXT __SLAB_FLAG_BIT(_SLAB_NO_OBJ_EXT) + +#if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT) +#define SLAB_OBJ_EXT_IN_OBJ __SLAB_FLAG_BIT(_SLAB_OBJ_EXT_IN_OBJ) #else -#define SLAB_NO_OBJ_EXT __SLAB_FLAG_UNUSED +#define SLAB_OBJ_EXT_IN_OBJ __SLAB_FLAG_UNUSED #endif /* @@ -300,24 +303,26 @@ struct kmem_cache_args { unsigned int usersize; /** * @freeptr_offset: Custom offset for the free pointer - * in &SLAB_TYPESAFE_BY_RCU caches + * in caches with &SLAB_TYPESAFE_BY_RCU or @ctor * - * By default &SLAB_TYPESAFE_BY_RCU caches place the free pointer - * outside of the object. This might cause the object to grow in size. - * Cache creators that have a reason to avoid this can specify a custom - * free pointer offset in their struct where the free pointer will be - * placed. + * By default, &SLAB_TYPESAFE_BY_RCU and @ctor caches place the free + * pointer outside of the object. This might cause the object to grow + * in size. Cache creators that have a reason to avoid this can specify + * a custom free pointer offset in their data structure where the free + * pointer will be placed. * - * Note that placing the free pointer inside the object requires the - * caller to ensure that no fields are invalidated that are required to - * guard against object recycling (See &SLAB_TYPESAFE_BY_RCU for - * details). + * For caches with &SLAB_TYPESAFE_BY_RCU, the caller must ensure that + * the free pointer does not overlay fields required to guard against + * object recycling (See &SLAB_TYPESAFE_BY_RCU for details). * - * Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset - * is specified, %use_freeptr_offset must be set %true. + * For caches with @ctor, the caller must ensure that the free pointer + * does not overlay fields initialized by the constructor. * - * Note that @ctor currently isn't supported with custom free pointers - * as a @ctor requires an external free pointer. 
+ * Currently, only caches with &SLAB_TYPESAFE_BY_RCU or @ctor + * may specify @freeptr_offset. + * + * Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset + * is specified, @use_freeptr_offset must be set %true. */ unsigned int freeptr_offset; /** @@ -508,7 +513,6 @@ void * __must_check krealloc_node_align_noprof(const void *objp, size_t new_size void kfree(const void *objp); void kfree_nolock(const void *objp); void kfree_sensitive(const void *objp); -size_t __ksize(const void *objp); DEFINE_FREE(kfree, void *, if (!IS_ERR_OR_NULL(_T)) kfree(_T)) DEFINE_FREE(kfree_sensitive, void *, if (_T) kfree_sensitive(_T)) diff --git a/mm/Kconfig b/mm/Kconfig index a992f2203eb9..fbac1dfc9943 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -247,17 +247,6 @@ config SLUB_STATS out which slabs are relevant to a particular load. Try running: slabinfo -DA -config SLUB_CPU_PARTIAL - default y - depends on SMP && !SLUB_TINY - bool "Enable per cpu partial caches" - help - Per cpu partial caches accelerate objects allocation and freeing - that is local to a processor at the price of more indeterminism - in the latency of the free. On overflow these caches will be cleared - which requires the taking of locks that may cause latency spikes. - Typically one would choose no for a realtime system. - config RANDOM_KMALLOC_CACHES default n depends on !SLUB_TINY diff --git a/mm/internal.h b/mm/internal.h index f35dbcf99a86..aacda4f79534 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -838,6 +838,7 @@ static inline struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned int ord struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order); #define alloc_frozen_pages_nolock(...) \ alloc_hooks(alloc_frozen_pages_nolock_noprof(__VA_ARGS__)) +void free_frozen_pages_nolock(struct page *page, unsigned int order); extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8a9a1e797c2e..36ab9897b61b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2627,16 +2627,24 @@ struct mem_cgroup *mem_cgroup_from_obj_slab(struct slab *slab, void *p) * Memcg membership data for each individual object is saved in * slab->obj_exts. 
*/ - struct slabobj_ext *obj_exts; + unsigned long obj_exts; + struct slabobj_ext *obj_ext; unsigned int off; obj_exts = slab_obj_exts(slab); if (!obj_exts) return NULL; + get_slab_obj_exts(obj_exts); off = obj_to_index(slab->slab_cache, slab, p); - if (obj_exts[off].objcg) - return obj_cgroup_memcg(obj_exts[off].objcg); + obj_ext = slab_obj_ext(slab, obj_exts, off); + if (obj_ext->objcg) { + struct obj_cgroup *objcg = obj_ext->objcg; + + put_slab_obj_exts(obj_exts); + return obj_cgroup_memcg(objcg); + } + put_slab_obj_exts(obj_exts); return NULL; } @@ -3222,6 +3230,9 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, } for (i = 0; i < size; i++) { + unsigned long obj_exts; + struct slabobj_ext *obj_ext; + slab = virt_to_slab(p[i]); if (!slab_obj_exts(slab) && @@ -3244,29 +3255,35 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, slab_pgdat(slab), cache_vmstat_idx(s))) return false; + obj_exts = slab_obj_exts(slab); + get_slab_obj_exts(obj_exts); off = obj_to_index(s, slab, p[i]); + obj_ext = slab_obj_ext(slab, obj_exts, off); obj_cgroup_get(objcg); - slab_obj_exts(slab)[off].objcg = objcg; + obj_ext->objcg = objcg; + put_slab_obj_exts(obj_exts); } return true; } void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, - void **p, int objects, struct slabobj_ext *obj_exts) + void **p, int objects, unsigned long obj_exts) { size_t obj_size = obj_full_size(s); for (int i = 0; i < objects; i++) { struct obj_cgroup *objcg; + struct slabobj_ext *obj_ext; unsigned int off; off = obj_to_index(s, slab, p[i]); - objcg = obj_exts[off].objcg; + obj_ext = slab_obj_ext(slab, obj_exts, off); + objcg = obj_ext->objcg; if (!objcg) continue; - obj_exts[off].objcg = NULL; + obj_ext->objcg = NULL; refill_obj_stock(objcg, obj_size, true, -obj_size, slab_pgdat(slab), cache_vmstat_idx(s)); obj_cgroup_put(objcg); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cbf758e27aa2..d312ebaa1e77 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3011,6 +3011,11 @@ void free_frozen_pages(struct page *page, unsigned int order) __free_frozen_pages(page, order, FPI_NONE); } +void free_frozen_pages_nolock(struct page *page, unsigned int order) +{ + __free_frozen_pages(page, order, FPI_TRYLOCK); +} + /* * Free a batch of folios */ diff --git a/mm/slab.h b/mm/slab.h index e767aa7e91b0..71c7261bf822 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -21,14 +21,12 @@ # define system_has_freelist_aba() system_has_cmpxchg128() # define try_cmpxchg_freelist try_cmpxchg128 # endif -#define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg128 typedef u128 freelist_full_t; #else /* CONFIG_64BIT */ # ifdef system_has_cmpxchg64 # define system_has_freelist_aba() system_has_cmpxchg64() # define try_cmpxchg_freelist try_cmpxchg64 # endif -#define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg64 typedef u64 freelist_full_t; #endif /* CONFIG_64BIT */ @@ -55,6 +53,14 @@ struct freelist_counters { * that the slab was corrupted */ unsigned frozen:1; +#ifdef CONFIG_64BIT + /* + * Some optimizations use free bits in 'counters' field + * to save memory. In case ->stride field is not available, + * such optimizations are disabled. 
+ */ + unsigned short stride; +#endif }; }; }; @@ -71,19 +77,7 @@ struct slab { struct kmem_cache *slab_cache; union { struct { - union { - struct list_head slab_list; - struct { /* For deferred deactivate_slab() */ - struct llist_node llnode; - void *flush_freelist; - }; -#ifdef CONFIG_SLUB_CPU_PARTIAL - struct { - struct slab *next; - int slabs; /* Nr of slabs left */ - }; -#endif - }; + struct list_head slab_list; /* Double-word boundary */ struct freelist_counters; }; @@ -188,23 +182,6 @@ static inline size_t slab_size(const struct slab *slab) return PAGE_SIZE << slab_order(slab); } -#ifdef CONFIG_SLUB_CPU_PARTIAL -#define slub_percpu_partial(c) ((c)->partial) - -#define slub_set_percpu_partial(c, p) \ -({ \ - slub_percpu_partial(c) = (p)->next; \ -}) - -#define slub_percpu_partial_read_once(c) READ_ONCE(slub_percpu_partial(c)) -#else -#define slub_percpu_partial(c) NULL - -#define slub_set_percpu_partial(c, p) - -#define slub_percpu_partial_read_once(c) NULL -#endif // CONFIG_SLUB_CPU_PARTIAL - /* * Word size structure that can be atomically updated or read and that * contains both the order and the number of objects that a slab of the @@ -218,8 +195,6 @@ struct kmem_cache_order_objects { * Slab cache management. */ struct kmem_cache { - struct kmem_cache_cpu __percpu *cpu_slab; - struct lock_class_key lock_key; struct slub_percpu_sheaves __percpu *cpu_sheaves; /* Used for retrieving partial slabs, etc. */ slab_flags_t flags; @@ -228,12 +203,6 @@ struct kmem_cache { unsigned int object_size; /* Object size without metadata */ struct reciprocal_value reciprocal_size; unsigned int offset; /* Free pointer offset */ -#ifdef CONFIG_SLUB_CPU_PARTIAL - /* Number of per cpu partial objects to keep around */ - unsigned int cpu_partial; - /* Number of per cpu partial slabs to keep around */ - unsigned int cpu_partial_slabs; -#endif unsigned int sheaf_capacity; struct kmem_cache_order_objects oo; @@ -274,16 +243,35 @@ struct kmem_cache { unsigned int usersize; /* Usercopy region size */ #endif +#ifdef CONFIG_SLUB_STATS + struct kmem_cache_stats __percpu *cpu_stats; +#endif + struct kmem_cache_node *node[MAX_NUMNODES]; }; +/* + * Every cache has !NULL s->cpu_sheaves but they may point to the + * bootstrap_sheaf temporarily during init, or permanently for the boot caches + * and caches with debugging enabled, or all caches with CONFIG_SLUB_TINY. This + * helper distinguishes whether cache has real non-bootstrap sheaves. 
+ */ +static inline bool cache_has_sheaves(struct kmem_cache *s) +{ + /* Test CONFIG_SLUB_TINY for code elimination purposes */ + return !IS_ENABLED(CONFIG_SLUB_TINY) && s->sheaf_capacity; +} + #if defined(CONFIG_SYSFS) && !defined(CONFIG_SLUB_TINY) #define SLAB_SUPPORTS_SYSFS 1 void sysfs_slab_unlink(struct kmem_cache *s); void sysfs_slab_release(struct kmem_cache *s); +int sysfs_slab_alias(struct kmem_cache *s, const char *name); #else static inline void sysfs_slab_unlink(struct kmem_cache *s) { } static inline void sysfs_slab_release(struct kmem_cache *s) { } +static inline int sysfs_slab_alias(struct kmem_cache *s, const char *name) + { return 0; } #endif void *fixup_red_left(struct kmem_cache *s, void *p); @@ -400,11 +388,7 @@ extern void create_boot_cache(struct kmem_cache *, const char *name, unsigned int useroffset, unsigned int usersize); int slab_unmergeable(struct kmem_cache *s); -struct kmem_cache *find_mergeable(unsigned size, unsigned align, - slab_flags_t flags, const char *name, void (*ctor)(void *)); -struct kmem_cache * -__kmem_cache_alias(const char *name, unsigned int size, unsigned int align, - slab_flags_t flags, void (*ctor)(void *)); +bool slab_args_unmergeable(struct kmem_cache_args *args, slab_flags_t flags); slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name); @@ -502,6 +486,24 @@ bool slab_in_kunit_test(void); static inline bool slab_in_kunit_test(void) { return false; } #endif +/* + * slub is about to manipulate internal object metadata. This memory lies + * outside the range of the allocated object, so accessing it would normally + * be reported by kasan as a bounds error. metadata_access_enable() is used + * to tell kasan that these accesses are OK. + */ +static inline void metadata_access_enable(void) +{ + kasan_disable_current(); + kmsan_disable_current(); +} + +static inline void metadata_access_disable(void) +{ + kmsan_enable_current(); + kasan_enable_current(); +} + #ifdef CONFIG_SLAB_OBJ_EXT /* @@ -509,10 +511,26 @@ static inline bool slab_in_kunit_test(void) { return false; } * associated with a slab. * @slab: a pointer to the slab struct * - * Returns a pointer to the object extension vector associated with the slab, - * or NULL if no such vector has been associated yet. + * Returns the address of the object extension vector associated with the slab, + * or zero if no such vector has been associated yet. + * Do not dereference the return value directly; use get/put_slab_obj_exts() + * pair and slab_obj_ext() to access individual elements. + * + * Example usage: + * + * obj_exts = slab_obj_exts(slab); + * if (obj_exts) { + * get_slab_obj_exts(obj_exts); + * obj_ext = slab_obj_ext(slab, obj_exts, obj_to_index(s, slab, obj)); + * // do something with obj_ext + * put_slab_obj_exts(obj_exts); + * } + * + * Note that the get/put semantics does not involve reference counting. + * Instead, it updates kasan/kmsan depth so that accesses to slabobj_ext + * won't be reported as access violations. 
*/ -static inline struct slabobj_ext *slab_obj_exts(struct slab *slab) +static inline unsigned long slab_obj_exts(struct slab *slab) { unsigned long obj_exts = READ_ONCE(slab->obj_exts); @@ -525,7 +543,62 @@ static inline struct slabobj_ext *slab_obj_exts(struct slab *slab) obj_exts != OBJEXTS_ALLOC_FAIL, slab_page(slab)); VM_BUG_ON_PAGE(obj_exts & MEMCG_DATA_KMEM, slab_page(slab)); #endif - return (struct slabobj_ext *)(obj_exts & ~OBJEXTS_FLAGS_MASK); + + return obj_exts & ~OBJEXTS_FLAGS_MASK; +} + +static inline void get_slab_obj_exts(unsigned long obj_exts) +{ + VM_WARN_ON_ONCE(!obj_exts); + metadata_access_enable(); +} + +static inline void put_slab_obj_exts(unsigned long obj_exts) +{ + metadata_access_disable(); +} + +#ifdef CONFIG_64BIT +static inline void slab_set_stride(struct slab *slab, unsigned short stride) +{ + slab->stride = stride; +} +static inline unsigned short slab_get_stride(struct slab *slab) +{ + return slab->stride; +} +#else +static inline void slab_set_stride(struct slab *slab, unsigned short stride) +{ + VM_WARN_ON_ONCE(stride != sizeof(struct slabobj_ext)); +} +static inline unsigned short slab_get_stride(struct slab *slab) +{ + return sizeof(struct slabobj_ext); +} +#endif + +/* + * slab_obj_ext - get the pointer to the slab object extension metadata + * associated with an object in a slab. + * @slab: a pointer to the slab struct + * @obj_exts: a pointer to the object extension vector + * @index: an index of the object + * + * Returns a pointer to the object extension associated with the object. + * Must be called within a section covered by get/put_slab_obj_exts(). + */ +static inline struct slabobj_ext *slab_obj_ext(struct slab *slab, + unsigned long obj_exts, + unsigned int index) +{ + struct slabobj_ext *obj_ext; + + VM_WARN_ON_ONCE(obj_exts != slab_obj_exts(slab)); + + obj_ext = (struct slabobj_ext *)(obj_exts + + slab_get_stride(slab) * index); + return kasan_reset_tag(obj_ext); } int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, @@ -533,11 +606,22 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, #else /* CONFIG_SLAB_OBJ_EXT */ -static inline struct slabobj_ext *slab_obj_exts(struct slab *slab) +static inline unsigned long slab_obj_exts(struct slab *slab) +{ + return 0; +} + +static inline struct slabobj_ext *slab_obj_ext(struct slab *slab, + unsigned long obj_exts, + unsigned int index) { return NULL; } +static inline void slab_set_stride(struct slab *slab, unsigned int stride) { } +static inline unsigned int slab_get_stride(struct slab *slab) { return 0; } + + #endif /* CONFIG_SLAB_OBJ_EXT */ static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s) @@ -550,38 +634,11 @@ static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s) bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, gfp_t flags, size_t size, void **p); void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, - void **p, int objects, struct slabobj_ext *obj_exts); + void **p, int objects, unsigned long obj_exts); #endif void kvfree_rcu_cb(struct rcu_head *head); -size_t __ksize(const void *objp); - -static inline size_t slab_ksize(const struct kmem_cache *s) -{ -#ifdef CONFIG_SLUB_DEBUG - /* - * Debugging requires use of the padding between object - * and whatever may come after it. 
- */ - if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) - return s->object_size; -#endif - if (s->flags & SLAB_KASAN) - return s->object_size; - /* - * If we have the need to store the freelist pointer - * back there or track user information then we can - * only use the space before that information. - */ - if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) - return s->inuse; - /* - * Else we can use all the padding etc for the allocation - */ - return s->size; -} - static inline unsigned int large_kmalloc_order(const struct page *page) { return page[1].flags.f & 0xff; diff --git a/mm/slab_common.c b/mm/slab_common.c index eed7ea556cb1..d5a70a831a2a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -43,11 +43,13 @@ DEFINE_MUTEX(slab_mutex); struct kmem_cache *kmem_cache; /* - * Set of flags that will prevent slab merging + * Set of flags that will prevent slab merging. + * Any flag that adds per-object metadata should be included, + * since slab merging can update s->inuse that affects the metadata layout. */ -#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \ - SLAB_FAILSLAB | SLAB_NO_MERGE) +#define SLAB_NEVER_MERGE (SLAB_DEBUG_FLAGS | SLAB_TYPESAFE_BY_RCU | \ + SLAB_NOLEAKTRACE | SLAB_FAILSLAB | SLAB_NO_MERGE | \ + SLAB_OBJ_EXT_IN_OBJ) #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ SLAB_CACHE_DMA32 | SLAB_ACCOUNT) @@ -163,9 +165,6 @@ int slab_unmergeable(struct kmem_cache *s) return 1; #endif - if (s->cpu_sheaves) - return 1; - /* * We may have set a slab to be unmergeable during bootstrap. */ @@ -175,24 +174,35 @@ int slab_unmergeable(struct kmem_cache *s) return 0; } -struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, - slab_flags_t flags, const char *name, void (*ctor)(void *)) +bool slab_args_unmergeable(struct kmem_cache_args *args, slab_flags_t flags) { - struct kmem_cache *s; - if (slab_nomerge) - return NULL; + return true; - if (ctor) - return NULL; + if (args->ctor) + return true; - flags = kmem_cache_flags(flags, name); + if (IS_ENABLED(CONFIG_HARDENED_USERCOPY) && args->usersize) + return true; if (flags & SLAB_NEVER_MERGE) + return true; + + return false; +} + +static struct kmem_cache *find_mergeable(unsigned int size, slab_flags_t flags, + const char *name, struct kmem_cache_args *args) +{ + struct kmem_cache *s; + unsigned int align; + + flags = kmem_cache_flags(flags, name); + if (slab_args_unmergeable(args, flags)) return NULL; size = ALIGN(size, sizeof(void *)); - align = calculate_alignment(flags, align, size); + align = calculate_alignment(flags, args->align, size); size = ALIGN(size, align); list_for_each_entry_reverse(s, &slab_caches, list) { @@ -231,7 +241,7 @@ static struct kmem_cache *create_cache(const char *name, err = -EINVAL; if (args->use_freeptr_offset && (args->freeptr_offset >= object_size || - !(flags & SLAB_TYPESAFE_BY_RCU) || + (!(flags & SLAB_TYPESAFE_BY_RCU) && !args->ctor) || !IS_ALIGNED(args->freeptr_offset, __alignof__(freeptr_t)))) goto out; @@ -253,6 +263,31 @@ out: return ERR_PTR(err); } +static struct kmem_cache * +__kmem_cache_alias(const char *name, unsigned int size, slab_flags_t flags, + struct kmem_cache_args *args) +{ + struct kmem_cache *s; + + s = find_mergeable(size, flags, name, args); + if (s) { + if (sysfs_slab_alias(s, name)) + pr_err("SLUB: Unable to add cache alias %s to sysfs\n", + name); + + s->refcount++; + + /* + * Adjust the object sizes so that we clear + * the complete object on 
kzalloc. + */ + s->object_size = max(s->object_size, size); + s->inuse = max(s->inuse, ALIGN(size, sizeof(void *))); + } + + return s; +} + /** * __kmem_cache_create_args - Create a kmem cache. * @name: A string which is used in /proc/slabinfo to identify this cache. @@ -305,6 +340,13 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, flags &= ~SLAB_DEBUG_FLAGS; #endif + /* + * Caches with specific capacity are special enough. It's simpler to + * make them unmergeable. + */ + if (args->sheaf_capacity) + flags |= SLAB_NO_MERGE; + mutex_lock(&slab_mutex); err = kmem_cache_sanity_check(name, object_size); @@ -324,9 +366,7 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, object_size - args->usersize < args->useroffset)) args->usersize = args->useroffset = 0; - if (!args->usersize && !args->sheaf_capacity) - s = __kmem_cache_alias(name, object_size, args->align, flags, - args->ctor); + s = __kmem_cache_alias(name, object_size, flags, args); if (s) goto out_unlock; @@ -983,43 +1023,6 @@ void __init create_kmalloc_caches(void) 0, SLAB_NO_MERGE, NULL); } -/** - * __ksize -- Report full size of underlying allocation - * @object: pointer to the object - * - * This should only be used internally to query the true size of allocations. - * It is not meant to be a way to discover the usable size of an allocation - * after the fact. Instead, use kmalloc_size_roundup(). Using memory beyond - * the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS, - * and/or FORTIFY_SOURCE. - * - * Return: size of the actual memory used by @object in bytes - */ -size_t __ksize(const void *object) -{ - const struct page *page; - const struct slab *slab; - - if (unlikely(object == ZERO_SIZE_PTR)) - return 0; - - page = virt_to_page(object); - - if (unlikely(PageLargeKmalloc(page))) - return large_kmalloc_size(page); - - slab = page_slab(page); - /* Delete this after we're sure there are no users */ - if (WARN_ON(!slab)) - return page_size(page); - -#ifdef CONFIG_SLUB_DEBUG - skip_orig_size_check(slab->slab_cache, object); -#endif - - return slab_ksize(slab->slab_cache); -} - gfp_t kmalloc_fix_flags(gfp_t flags) { gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK; @@ -1235,30 +1238,6 @@ void kfree_sensitive(const void *p) } EXPORT_SYMBOL(kfree_sensitive); -size_t ksize(const void *objp) -{ - /* - * We need to first check that the pointer to the object is valid. - * The KASAN report printed from ksize() is more useful, then when - * it's printed later when the behaviour could be undefined due to - * a potential use-after-free or double-free. - * - * We use kasan_check_byte(), which is supported for the hardware - * tag-based KASAN mode, unlike kasan_check_read/write(). - * - * If the pointed to memory is invalid, we return 0 to avoid users of - * ksize() writing to and potentially corrupting the memory region. - * - * We want to perform the check before __ksize(), to avoid potentially - * crashing in __ksize() due to accessing invalid metadata. 
- */ - if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp)) - return 0; - - return kfence_ksize(objp) ?: __ksize(objp); -} -EXPORT_SYMBOL(ksize); - #ifdef CONFIG_BPF_SYSCALL #include <linux/btf.h> @@ -1625,11 +1604,8 @@ static bool kfree_rcu_sheaf(void *obj) return false; s = slab->slab_cache; - if (s->cpu_sheaves) { - if (likely(!IS_ENABLED(CONFIG_NUMA) || - slab_nid(slab) == numa_mem_id())) - return __kfree_rcu_sheaf(s, obj); - } + if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id())) + return __kfree_rcu_sheaf(s, obj); return false; } @@ -2133,8 +2109,11 @@ EXPORT_SYMBOL_GPL(kvfree_rcu_barrier); */ void kvfree_rcu_barrier_on_cache(struct kmem_cache *s) { - if (s->cpu_sheaves) + if (cache_has_sheaves(s)) { flush_rcu_sheaves_on_cache(s); + rcu_barrier(); + } + /* * TODO: Introduce a version of __kvfree_rcu_barrier() that works * on a specific slab cache. diff --git a/mm/slub.c b/mm/slub.c index cdc1e652ec52..18899017512c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1,13 +1,15 @@ // SPDX-License-Identifier: GPL-2.0 /* - * SLUB: A slab allocator that limits cache line use instead of queuing - * objects in per cpu and per node lists. + * SLUB: A slab allocator with low overhead percpu array caches and mostly + * lockless freeing of objects to slabs in the slowpath. * - * The allocator synchronizes using per slab locks or atomic operations - * and only uses a centralized lock to manage a pool of partial slabs. + * The allocator synchronizes using spin_trylock for percpu arrays in the + * fastpath, and cmpxchg_double (or bit spinlock) for slowpath freeing. + * Uses a centralized lock to manage a pool of partial slabs. * * (C) 2007 SGI, Christoph Lameter * (C) 2011 Linux Foundation, Christoph Lameter + * (C) 2025 SUSE, Vlastimil Babka */ #include <linux/mm.h> @@ -53,11 +55,13 @@ /* * Lock order: - * 1. slab_mutex (Global Mutex) - * 2. node->list_lock (Spinlock) - * 3. kmem_cache->cpu_slab->lock (Local lock) - * 4. slab_lock(slab) (Only on some arches) - * 5. object_map_lock (Only for debugging) + * 0. cpu_hotplug_lock + * 1. slab_mutex (Global Mutex) + * 2a. kmem_cache->cpu_sheaves->lock (Local trylock) + * 2b. node->barn->lock (Spinlock) + * 2c. node->list_lock (Spinlock) + * 3. slab_lock(slab) (Only on some arches) + * 4. object_map_lock (Only for debugging) * * slab_mutex * @@ -78,31 +82,38 @@ * C. slab->objects -> Number of objects in slab * D. slab->frozen -> frozen state * - * Frozen slabs + * SL_partial slabs + * + * Slabs on node partial list have at least one free object. A limited number + * of slabs on the list can be fully free (slab->inuse == 0), until we start + * discarding them. These slabs are marked with SL_partial, and the flag is + * cleared while removing them, usually to grab their freelist afterwards. + * This clearing also exempts them from list management. Please see + * __slab_free() for more details. * - * If a slab is frozen then it is exempt from list management. It is - * the cpu slab which is actively allocated from by the processor that - * froze it and it is not on any list. The processor that froze the - * slab is the one who can perform list operations on the slab. Other - * processors may put objects onto the freelist but the processor that - * froze the slab is the only one that can retrieve the objects from the - * slab's freelist. + * Full slabs * - * CPU partial slabs + * For caches without debugging enabled, full slabs (slab->inuse == + * slab->objects and slab->freelist == NULL) are not placed on any list. 
+ * The __slab_free() freeing the first object from such a slab will place + * it on the partial list. Caches with debugging enabled place such slab + * on the full list and use different allocation and freeing paths. * - * The partially empty slabs cached on the CPU partial list are used - * for performance reasons, which speeds up the allocation process. - * These slabs are not frozen, but are also exempt from list management, - * by clearing the SL_partial flag when moving out of the node - * partial list. Please see __slab_free() for more details. + * Frozen slabs + * + * If a slab is frozen then it is exempt from list management. It is used to + * indicate a slab that has failed consistency checks and thus cannot be + * allocated from anymore - it is also marked as full. Any previously + * allocated objects will be simply leaked upon freeing instead of attempting + * to modify the potentially corrupted freelist and metadata. * * To sum up, the current scheme is: - * - node partial slab: SL_partial && !frozen - * - cpu partial slab: !SL_partial && !frozen - * - cpu slab: !SL_partial && frozen - * - full slab: !SL_partial && !frozen + * - node partial slab: SL_partial && !full && !frozen + * - taken off partial list: !SL_partial && !full && !frozen + * - full slab, not on any list: !SL_partial && full && !frozen + * - frozen due to inconsistency: !SL_partial && full && frozen * - * list_lock + * node->list_lock (spinlock) * * The list_lock protects the partial and full list on each node and * the partial slab counter. If taken then no new slabs may be added or @@ -112,47 +123,46 @@ * * The list_lock is a centralized lock and thus we avoid taking it as * much as possible. As long as SLUB does not have to handle partial - * slabs, operations can continue without any centralized lock. F.e. - * allocating a long series of objects that fill up slabs does not require - * the list lock. + * slabs, operations can continue without any centralized lock. * * For debug caches, all allocations are forced to go through a list_lock * protected region to serialize against concurrent validation. * - * cpu_slab->lock local lock + * cpu_sheaves->lock (local_trylock) + * + * This lock protects fastpath operations on the percpu sheaves. On !RT it + * only disables preemption and does no atomic operations. As long as the main + * or spare sheaf can handle the allocation or free, there is no other + * overhead. * - * This locks protect slowpath manipulation of all kmem_cache_cpu fields - * except the stat counters. This is a percpu structure manipulated only by - * the local cpu, so the lock protects against being preempted or interrupted - * by an irq. Fast path operations rely on lockless operations instead. + * node->barn->lock (spinlock) * - * On PREEMPT_RT, the local lock neither disables interrupts nor preemption - * which means the lockless fastpath cannot be used as it might interfere with - * an in-progress slow path operations. In this case the local lock is always - * taken but it still utilizes the freelist for the common operations. + * This lock protects the operations on per-NUMA-node barn. It can quickly + * serve an empty or full sheaf if available, and avoid more expensive refill + * or flush operation. * - * lockless fastpaths + * Lockless freeing * - * The fast path allocation (slab_alloc_node()) and freeing (do_slab_free()) - * are fully lockless when satisfied from the percpu slab (and when - * cmpxchg_double is possible to use, otherwise slab_lock is taken). 
- * They also don't disable preemption or migration or irqs. They rely on - * the transaction id (tid) field to detect being preempted or moved to - * another cpu. + * Objects may have to be freed to their slabs when they are from a remote + * node (where we want to avoid filling local sheaves with remote objects) + * or when there are too many full sheaves. On architectures supporting + * cmpxchg_double this is done by a lockless update of slab's freelist and + * counters, otherwise slab_lock is taken. This only needs to take the + * list_lock if it's a first free to a full slab, or when a slab becomes empty + * after the free. * * irq, preemption, migration considerations * - * Interrupts are disabled as part of list_lock or local_lock operations, or + * Interrupts are disabled as part of list_lock or barn lock operations, or * around the slab_lock operation, in order to make the slab allocator safe * to use in the context of an irq. + * Preemption is disabled as part of local_trylock operations. + * kmalloc_nolock() and kfree_nolock() are safe in NMI context but see + * their limitations. * - * In addition, preemption (or migration on PREEMPT_RT) is disabled in the - * allocation slowpath, bulk allocation, and put_cpu_partial(), so that the - * local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer - * doesn't have to be revalidated in each section protected by the local lock. - * - * SLUB assigns one slab for allocation to each processor. - * Allocations only occur from these slabs called cpu slabs. + * SLUB assigns two object arrays called sheaves for caching allocations and + * frees on each cpu, with a NUMA node shared barn for balancing between cpus. + * Allocations and frees are primarily served from these sheaves. * * Slabs with free elements are kept on a partial list and during regular * operations no list for full slabs is used. If an object in a full slab is @@ -160,25 +170,8 @@ * We track full slabs for debugging purposes though because otherwise we * cannot scan all objects. * - * Slabs are freed when they become empty. Teardown and setup is - * minimal so we rely on the page allocators per cpu caches for - * fast frees and allocs. - * - * slab->frozen The slab is frozen and exempt from list processing. - * This means that the slab is dedicated to a purpose - * such as satisfying allocations for a specific - * processor. Objects may be freed in the slab while - * it is frozen but slab_free will then skip the usual - * list operations. It is up to the processor holding - * the slab to integrate the slab into the slab lists - * when the slab is no longer needed. - * - * One use of this flag is to mark slabs that are - * used for allocations. Then such a slab becomes a cpu - * slab. The cpu slab may be equipped with an additional - * freelist that allows lockless access to - * free objects in addition to the regular freelist - * that requires the slab lock. + * Slabs are freed when they become empty. Teardown and setup is minimal so we + * rely on the page allocators per cpu caches for fast frees and allocs. * * SLAB_DEBUG_FLAGS Slab requires special handling due to debug * options set. This moves slab handling out of @@ -201,28 +194,6 @@ enum slab_flags { SL_pfmemalloc = PG_active, /* Historical reasons for this bit */ }; -/* - * We could simply use migrate_disable()/enable() but as long as it's a - * function call even on !PREEMPT_RT, use inline preempt_disable() there. 
- */ -#ifndef CONFIG_PREEMPT_RT -#define slub_get_cpu_ptr(var) get_cpu_ptr(var) -#define slub_put_cpu_ptr(var) put_cpu_ptr(var) -#define USE_LOCKLESS_FAST_PATH() (true) -#else -#define slub_get_cpu_ptr(var) \ -({ \ - migrate_disable(); \ - this_cpu_ptr(var); \ -}) -#define slub_put_cpu_ptr(var) \ -do { \ - (void)(var); \ - migrate_enable(); \ -} while (0) -#define USE_LOCKLESS_FAST_PATH() (false) -#endif - #ifndef CONFIG_SLUB_TINY #define __fastpath_inline __always_inline #else @@ -241,11 +212,18 @@ DEFINE_STATIC_KEY_FALSE(slub_debug_enabled); static DEFINE_STATIC_KEY_FALSE(strict_numa); #endif -/* Structure holding parameters for get_partial() call chain */ +/* Structure holding parameters for get_from_partial() call chain */ struct partial_context { gfp_t flags; unsigned int orig_size; - void *object; +}; + +/* Structure holding parameters for get_partial_node_bulk() */ +struct partial_bulk_context { + gfp_t flags; + unsigned int min_objects; + unsigned int max_objects; + struct list_head slabs; }; static inline bool kmem_cache_debug(struct kmem_cache *s) @@ -261,15 +239,6 @@ void *fixup_red_left(struct kmem_cache *s, void *p) return p; } -static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) -{ -#ifdef CONFIG_SLUB_CPU_PARTIAL - return !kmem_cache_debug(s); -#else - return false; -#endif -} - /* * Issues still to be resolved: * @@ -350,11 +319,8 @@ enum track_item { TRACK_ALLOC, TRACK_FREE }; #ifdef SLAB_SUPPORTS_SYSFS static int sysfs_slab_add(struct kmem_cache *); -static int sysfs_slab_alias(struct kmem_cache *, const char *); #else static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } -static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) - { return 0; } #endif #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG) @@ -363,37 +329,25 @@ static void debugfs_slab_add(struct kmem_cache *); static inline void debugfs_slab_add(struct kmem_cache *s) { } #endif +enum add_mode { + ADD_TO_HEAD, + ADD_TO_TAIL, +}; + enum stat_item { - ALLOC_PCS, /* Allocation from percpu sheaf */ - ALLOC_FASTPATH, /* Allocation from cpu slab */ - ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ - FREE_PCS, /* Free to percpu sheaf */ + ALLOC_FASTPATH, /* Allocation from percpu sheaves */ + ALLOC_SLOWPATH, /* Allocation from partial or new slab */ FREE_RCU_SHEAF, /* Free to rcu_free sheaf */ FREE_RCU_SHEAF_FAIL, /* Failed to free to a rcu_free sheaf */ - FREE_FASTPATH, /* Free to cpu slab */ - FREE_SLOWPATH, /* Freeing not to cpu slab */ - FREE_FROZEN, /* Freeing to frozen slab */ + FREE_FASTPATH, /* Free to percpu sheaves */ + FREE_SLOWPATH, /* Free to a slab */ FREE_ADD_PARTIAL, /* Freeing moves slab to partial list */ FREE_REMOVE_PARTIAL, /* Freeing removes last object */ - ALLOC_FROM_PARTIAL, /* Cpu slab acquired from node partial list */ - ALLOC_SLAB, /* Cpu slab acquired from page allocator */ - ALLOC_REFILL, /* Refill cpu slab from slab freelist */ - ALLOC_NODE_MISMATCH, /* Switching cpu slab */ + ALLOC_SLAB, /* New slab acquired from page allocator */ + ALLOC_NODE_MISMATCH, /* Requested node different from cpu sheaf */ FREE_SLAB, /* Slab freed to the page allocator */ - CPUSLAB_FLUSH, /* Abandoning of the cpu slab */ - DEACTIVATE_FULL, /* Cpu slab was full when deactivated */ - DEACTIVATE_EMPTY, /* Cpu slab was empty when deactivated */ - DEACTIVATE_TO_HEAD, /* Cpu slab was moved to the head of partials */ - DEACTIVATE_TO_TAIL, /* Cpu slab was moved to the tail of partials */ - DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */ 
- DEACTIVATE_BYPASS, /* Implicit deactivation */ ORDER_FALLBACK, /* Number of times fallback was necessary */ - CMPXCHG_DOUBLE_CPU_FAIL,/* Failures of this_cpu_cmpxchg_double */ CMPXCHG_DOUBLE_FAIL, /* Failures of slab freelist update */ - CPU_PARTIAL_ALLOC, /* Used cpu partial on alloc */ - CPU_PARTIAL_FREE, /* Refill cpu partial on free */ - CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */ - CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ SHEAF_FLUSH, /* Objects flushed from a sheaf */ SHEAF_REFILL, /* Objects refilled to a sheaf */ SHEAF_ALLOC, /* Allocation of an empty sheaf */ @@ -410,31 +364,11 @@ enum stat_item { NR_SLUB_STAT_ITEMS }; -struct freelist_tid { - union { - struct { - void *freelist; /* Pointer to next available object */ - unsigned long tid; /* Globally unique transaction id */ - }; - freelist_full_t freelist_tid; - }; -}; - -/* - * When changing the layout, make sure freelist and tid are still compatible - * with this_cpu_cmpxchg_double() alignment requirements. - */ -struct kmem_cache_cpu { - struct freelist_tid; - struct slab *slab; /* The slab from which we are allocating */ -#ifdef CONFIG_SLUB_CPU_PARTIAL - struct slab *partial; /* Partially allocated slabs */ -#endif - local_trylock_t lock; /* Protects the fields above */ #ifdef CONFIG_SLUB_STATS +struct kmem_cache_stats { unsigned int stat[NR_SLUB_STAT_ITEMS]; -#endif }; +#endif static inline void stat(const struct kmem_cache *s, enum stat_item si) { @@ -443,7 +377,7 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) * The rmw is racy on a preemptible kernel but this is acceptable, so * avoid this_cpu_add()'s irq-disable overhead. */ - raw_cpu_inc(s->cpu_slab->stat[si]); + raw_cpu_inc(s->cpu_stats->stat[si]); #endif } @@ -451,7 +385,7 @@ static inline void stat_add(const struct kmem_cache *s, enum stat_item si, int v) { #ifdef CONFIG_SLUB_STATS - raw_cpu_add(s->cpu_slab->stat[si], v); + raw_cpu_add(s->cpu_stats->stat[si], v); #endif } @@ -540,7 +474,7 @@ static inline struct node_barn *get_barn(struct kmem_cache *s) static nodemask_t slab_nodes; /* - * Workqueue used for flush_cpu_slab(). + * Workqueue used for flushing cpu and kfree_rcu sheaves. */ static struct workqueue_struct *flushwq; @@ -599,36 +533,6 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object) return freelist_ptr_decode(s, p, ptr_addr); } -static void prefetch_freepointer(const struct kmem_cache *s, void *object) -{ - prefetchw(object + s->offset); -} - -/* - * When running under KMSAN, get_freepointer_safe() may return an uninitialized - * pointer value in the case the current thread loses the race for the next - * memory chunk in the freelist. In that case this_cpu_cmpxchg_double() in - * slab_alloc_node() will fail, so the uninitialized value won't be used, but - * KMSAN will still check all arguments of cmpxchg because of imperfect - * handling of inline assembly. - * To work around this problem, we apply __no_kmsan_checks to ensure that - * get_freepointer_safe() returns initialized memory. 
- */ -__no_kmsan_checks -static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) -{ - unsigned long freepointer_addr; - freeptr_t p; - - if (!debug_pagealloc_enabled_static()) - return get_freepointer(s, object); - - object = kasan_reset_tag(object); - freepointer_addr = (unsigned long)object + s->offset; - copy_from_kernel_nofault(&p, (freeptr_t *)freepointer_addr, sizeof(p)); - return freelist_ptr_decode(s, p, freepointer_addr); -} - static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) { unsigned long freeptr_addr = (unsigned long)object + s->offset; @@ -692,41 +596,6 @@ static inline unsigned int oo_objects(struct kmem_cache_order_objects x) return x.x & OO_MASK; } -#ifdef CONFIG_SLUB_CPU_PARTIAL -static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) -{ - unsigned int nr_slabs; - - s->cpu_partial = nr_objects; - - /* - * We take the number of objects but actually limit the number of - * slabs on the per cpu partial list, in order to limit excessive - * growth of the list. For simplicity we assume that the slabs will - * be half-full. - */ - nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo)); - s->cpu_partial_slabs = nr_slabs; -} - -static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) -{ - return s->cpu_partial_slabs; -} -#else -#ifdef SLAB_SUPPORTS_SYSFS -static inline void -slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) -{ -} -#endif - -static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) -{ - return 0; -} -#endif /* CONFIG_SLUB_CPU_PARTIAL */ - /* * If network-based swap is enabled, slub must keep track of whether memory * were allocated from pfmemalloc reserves. @@ -782,7 +651,8 @@ __update_freelist_slow(struct slab *slab, struct freelist_counters *old, if (slab->freelist == old->freelist && slab->counters == old->counters) { slab->freelist = new->freelist; - slab->counters = new->counters; + /* prevent tearing for the read in get_partial_node_bulk() */ + WRITE_ONCE(slab->counters, new->counters); ret = true; } slab_unlock(slab); @@ -802,7 +672,7 @@ static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *sla { bool ret; - if (USE_LOCKLESS_FAST_PATH()) + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) lockdep_assert_irqs_disabled(); if (s->flags & __CMPXCHG_DOUBLE) @@ -857,7 +727,7 @@ static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab, * request size in the meta data area, for better debug and sanity check. */ static inline void set_orig_size(struct kmem_cache *s, - void *object, unsigned int orig_size) + void *object, unsigned long orig_size) { void *p = kasan_reset_tag(object); @@ -867,10 +737,10 @@ static inline void set_orig_size(struct kmem_cache *s, p += get_info_end(s); p += sizeof(struct track) * 2; - *(unsigned int *)p = orig_size; + *(unsigned long *)p = orig_size; } -static inline unsigned int get_orig_size(struct kmem_cache *s, void *object) +static inline unsigned long get_orig_size(struct kmem_cache *s, void *object) { void *p = kasan_reset_tag(object); @@ -883,8 +753,142 @@ static inline unsigned int get_orig_size(struct kmem_cache *s, void *object) p += get_info_end(s); p += sizeof(struct track) * 2; - return *(unsigned int *)p; + return *(unsigned long *)p; +} + +#ifdef CONFIG_SLAB_OBJ_EXT + +/* + * Check if memory cgroup or memory allocation profiling is enabled. + * If enabled, SLUB tries to reduce memory overhead of accounting + * slab objects. 
If neither is enabled when this function is called, + * the optimization is simply skipped to avoid affecting caches that do not + * need slabobj_ext metadata. + * + * However, this may disable optimization when memory cgroup or memory + * allocation profiling is used, but slabs are created too early + * even before those subsystems are initialized. + */ +static inline bool need_slab_obj_exts(struct kmem_cache *s) +{ + if (s->flags & SLAB_NO_OBJ_EXT) + return false; + + if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT)) + return true; + + if (mem_alloc_profiling_enabled()) + return true; + + return false; +} + +static inline unsigned int obj_exts_size_in_slab(struct slab *slab) +{ + return sizeof(struct slabobj_ext) * slab->objects; +} + +static inline unsigned long obj_exts_offset_in_slab(struct kmem_cache *s, + struct slab *slab) +{ + unsigned long objext_offset; + + objext_offset = s->size * slab->objects; + objext_offset = ALIGN(objext_offset, sizeof(struct slabobj_ext)); + return objext_offset; +} + +static inline bool obj_exts_fit_within_slab_leftover(struct kmem_cache *s, + struct slab *slab) +{ + unsigned long objext_offset = obj_exts_offset_in_slab(s, slab); + unsigned long objext_size = obj_exts_size_in_slab(slab); + + return objext_offset + objext_size <= slab_size(slab); +} + +static inline bool obj_exts_in_slab(struct kmem_cache *s, struct slab *slab) +{ + unsigned long obj_exts; + unsigned long start; + unsigned long end; + + obj_exts = slab_obj_exts(slab); + if (!obj_exts) + return false; + + start = (unsigned long)slab_address(slab); + end = start + slab_size(slab); + return (obj_exts >= start) && (obj_exts < end); +} +#else +static inline bool need_slab_obj_exts(struct kmem_cache *s) +{ + return false; +} + +static inline unsigned int obj_exts_size_in_slab(struct slab *slab) +{ + return 0; +} + +static inline unsigned long obj_exts_offset_in_slab(struct kmem_cache *s, + struct slab *slab) +{ + return 0; +} + +static inline bool obj_exts_fit_within_slab_leftover(struct kmem_cache *s, + struct slab *slab) +{ + return false; +} + +static inline bool obj_exts_in_slab(struct kmem_cache *s, struct slab *slab) +{ + return false; +} + +#endif + +#if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT) +static bool obj_exts_in_object(struct kmem_cache *s, struct slab *slab) +{ + /* + * Note we cannot rely on the SLAB_OBJ_EXT_IN_OBJ flag here and need to + * check the stride. A cache can have SLAB_OBJ_EXT_IN_OBJ set, but + * allocations within_slab_leftover are preferred. And those may be + * possible or not depending on the particular slab's size. + */ + return obj_exts_in_slab(s, slab) && + (slab_get_stride(slab) == s->size); +} + +static unsigned int obj_exts_offset_in_object(struct kmem_cache *s) +{ + unsigned int offset = get_info_end(s); + + if (kmem_cache_debug_flags(s, SLAB_STORE_USER)) + offset += sizeof(struct track) * 2; + + if (slub_debug_orig_size(s)) + offset += sizeof(unsigned long); + + offset += kasan_metadata_size(s, false); + + return offset; +} +#else +static inline bool obj_exts_in_object(struct kmem_cache *s, struct slab *slab) +{ + return false; +} + +static inline unsigned int obj_exts_offset_in_object(struct kmem_cache *s) +{ + return 0; } +#endif #ifdef CONFIG_SLUB_DEBUG @@ -976,24 +980,6 @@ static const char *slub_debug_string __ro_after_init; static int disable_higher_order_debug; /* - * slub is about to manipulate internal object metadata. 
This memory lies - * outside the range of the allocated object, so accessing it would normally - * be reported by kasan as a bounds error. metadata_access_enable() is used - * to tell kasan that these accesses are OK. - */ -static inline void metadata_access_enable(void) -{ - kasan_disable_current(); - kmsan_disable_current(); -} - -static inline void metadata_access_disable(void) -{ - kmsan_enable_current(); - kasan_enable_current(); -} - -/* * Object debugging */ @@ -1065,7 +1051,7 @@ static void set_track_update(struct kmem_cache *s, void *object, p->handle = handle; #endif p->addr = addr; - p->cpu = smp_processor_id(); + p->cpu = raw_smp_processor_id(); p->pid = current->pid; p->when = jiffies; } @@ -1198,10 +1184,13 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p) off += 2 * sizeof(struct track); if (slub_debug_orig_size(s)) - off += sizeof(unsigned int); + off += sizeof(unsigned long); off += kasan_metadata_size(s, false); + if (obj_exts_in_object(s, slab)) + off += sizeof(struct slabobj_ext); + if (off != size_from_object(s)) /* Beginning of the filler is the free pointer */ print_section(KERN_ERR, "Padding ", p + off, @@ -1226,20 +1215,6 @@ static void object_err(struct kmem_cache *s, struct slab *slab, WARN_ON(1); } -static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, - void **freelist, void *nextfree) -{ - if ((s->flags & SLAB_CONSISTENCY_CHECKS) && - !check_valid_pointer(s, slab, nextfree) && freelist) { - object_err(s, slab, *freelist, "Freechain corrupt"); - *freelist = NULL; - slab_fix(s, "Isolate corrupted freechain"); - return true; - } - - return false; -} - static void __slab_err(struct slab *slab) { if (slab_in_kunit_test()) @@ -1347,44 +1322,63 @@ skip_bug_print: } /* - * Object layout: - * - * object address - * Bytes of the object to be managed. - * If the freepointer may overlay the object then the free - * pointer is at the middle of the object. - * - * Poisoning uses 0x6b (POISON_FREE) and the last byte is - * 0xa5 (POISON_END) + * Object field layout: * - * object + s->object_size - * Padding to reach word boundary. This is also used for Redzoning. - * Padding is extended by another word if Redzoning is enabled and - * object_size == inuse. + * [Left redzone padding] (if SLAB_RED_ZONE) + * - Field size: s->red_left_pad + * - Immediately precedes each object when SLAB_RED_ZONE is set. + * - Filled with 0xbb (SLUB_RED_INACTIVE) for inactive objects and + * 0xcc (SLUB_RED_ACTIVE) for objects in use when SLAB_RED_ZONE. * - * We fill with 0xbb (SLUB_RED_INACTIVE) for inactive objects and with - * 0xcc (SLUB_RED_ACTIVE) for objects in use. + * [Object bytes] (object address starts here) + * - Field size: s->object_size + * - Object payload bytes. + * - If the freepointer may overlap the object, it is stored inside + * the object (typically near the middle). + * - Poisoning uses 0x6b (POISON_FREE) and the last byte is + * 0xa5 (POISON_END) when __OBJECT_POISON is enabled. * - * object + s->inuse - * Meta data starts here. + * [Word-align padding] (right redzone when SLAB_RED_ZONE is set) + * - Field size: s->inuse - s->object_size + * - If redzoning is enabled and ALIGN(size, sizeof(void *)) adds no + * padding, explicitly extend by one word so the right redzone is + * non-empty. + * - Filled with 0xbb (SLUB_RED_INACTIVE) for inactive objects and + * 0xcc (SLUB_RED_ACTIVE) for objects in use when SLAB_RED_ZONE. * - * A. Free pointer (if we cannot overwrite object on free) - * B. Tracking data for SLAB_STORE_USER - * C. 
Original request size for kmalloc object (SLAB_STORE_USER enabled) - * D. Padding to reach required alignment boundary or at minimum - * one word if debugging is on to be able to detect writes - * before the word boundary. + * [Metadata starts at object + s->inuse] + * - A. freelist pointer (if freeptr_outside_object) + * - B. alloc tracking (SLAB_STORE_USER) + * - C. free tracking (SLAB_STORE_USER) + * - D. original request size (SLAB_KMALLOC && SLAB_STORE_USER) + * - E. KASAN metadata (if enabled) * - * Padding is done using 0x5a (POISON_INUSE) + * [Mandatory padding] (if CONFIG_SLUB_DEBUG && SLAB_RED_ZONE) + * - One mandatory debug word to guarantee a minimum poisoned gap + * between metadata and the next object, independent of alignment. + * - Filled with 0x5a (POISON_INUSE) when SLAB_POISON is set. + * [Final alignment padding] + * - Bytes added by ALIGN(size, s->align) to reach s->size. + * - When the padding is large enough, it can be used to store + * struct slabobj_ext for accounting metadata (obj_exts_in_object()). + * - The remaining bytes (if any) are filled with 0x5a (POISON_INUSE) + * when SLAB_POISON is set. * - * object + s->size - * Nothing is used beyond s->size. + * Notes: + * - Redzones are filled by init_object() with SLUB_RED_ACTIVE/INACTIVE. + * - Object contents are poisoned with POISON_FREE/END when __OBJECT_POISON. + * - The trailing padding is pre-filled with POISON_INUSE by + * setup_slab_debug() when SLAB_POISON is set, and is validated by + * check_pad_bytes(). + * - The first object pointer is slab_address(slab) + + * (s->red_left_pad if redzoning); subsequent objects are reached by + * adding s->size each time. * - * If slabcaches are merged then the object_size and inuse boundaries are mostly - * ignored. And therefore no slab options that rely on these boundaries - * may be used with merged slabcaches. + * If a slab cache flag relies on specific metadata to exist at a fixed + * offset, the flag must be included in SLAB_NEVER_MERGE to prevent merging. + * Otherwise, the cache would misbehave as s->object_size and s->inuse are + * adjusted during cache merging (see __kmem_cache_alias()). 
*/ - static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p) { unsigned long off = get_info_end(s); /* The end of info */ @@ -1394,11 +1388,14 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p) off += 2 * sizeof(struct track); if (s->flags & SLAB_KMALLOC) - off += sizeof(unsigned int); + off += sizeof(unsigned long); } off += kasan_metadata_size(s, false); + if (obj_exts_in_object(s, slab)) + off += sizeof(struct slabobj_ext); + if (size_from_object(s) == off) return 1; @@ -1423,7 +1420,15 @@ slab_pad_check(struct kmem_cache *s, struct slab *slab) start = slab_address(slab); length = slab_size(slab); end = start + length; - remainder = length % s->size; + + if (obj_exts_in_slab(s, slab) && !obj_exts_in_object(s, slab)) { + remainder = length; + remainder -= obj_exts_offset_in_slab(s, slab); + remainder -= obj_exts_size_in_slab(slab); + } else { + remainder = length % s->size; + } + if (!remainder) return; @@ -2021,11 +2026,6 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) {} static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} -static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, - void **freelist, void *nextfree) -{ - return false; -} #endif /* CONFIG_SLUB_DEBUG */ /* @@ -2042,21 +2042,27 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) { - struct slabobj_ext *slab_exts; struct slab *obj_exts_slab; + unsigned long slab_exts; obj_exts_slab = virt_to_slab(obj_exts); slab_exts = slab_obj_exts(obj_exts_slab); if (slab_exts) { + get_slab_obj_exts(slab_exts); unsigned int offs = obj_to_index(obj_exts_slab->slab_cache, obj_exts_slab, obj_exts); + struct slabobj_ext *ext = slab_obj_ext(obj_exts_slab, + slab_exts, offs); - if (unlikely(is_codetag_empty(&slab_exts[offs].ref))) + if (unlikely(is_codetag_empty(&ext->ref))) { + put_slab_obj_exts(slab_exts); return; + } /* codetag should be NULL here */ - WARN_ON(slab_exts[offs].ref.ct); - set_codetag_empty(&slab_exts[offs].ref); + WARN_ON(ext->ref.ct); + set_codetag_empty(&ext->ref); + put_slab_obj_exts(slab_exts); } } @@ -2095,6 +2101,49 @@ static inline void init_slab_obj_exts(struct slab *slab) slab->obj_exts = 0; } +/* + * Calculate the allocation size for slabobj_ext array. + * + * When memory allocation profiling is enabled, the obj_exts array + * could be allocated from the same slab cache it's being allocated for. + * This would prevent the slab from ever being freed because it would + * always contain at least one allocated object (its own obj_exts array). + * + * To avoid this, increase the allocation size when we detect the array + * may come from the same cache, forcing it to use a different cache. + */ +static inline size_t obj_exts_alloc_size(struct kmem_cache *s, + struct slab *slab, gfp_t gfp) +{ + size_t sz = sizeof(struct slabobj_ext) * slab->objects; + struct kmem_cache *obj_exts_cache; + + /* + * slabobj_ext array for KMALLOC_CGROUP allocations + * are served from KMALLOC_NORMAL caches. + */ + if (!mem_alloc_profiling_enabled()) + return sz; + + if (sz > KMALLOC_MAX_CACHE_SIZE) + return sz; + + if (!is_kmalloc_normal(s)) + return sz; + + obj_exts_cache = kmalloc_slab(sz, NULL, gfp, 0); + /* + * We can't simply compare s with obj_exts_cache, because random kmalloc + * caches have multiple caches per size, selected by caller address. 
+ * Since caller address may differ between kmalloc_slab() and actual + * allocation, bump size when sizes are equal. + */ + if (s->object_size == obj_exts_cache->object_size) + return obj_exts_cache->object_size + 1; + + return sz; +} + int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, gfp_t gfp, bool new_slab) { @@ -2103,26 +2152,26 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, unsigned long new_exts; unsigned long old_exts; struct slabobj_ext *vec; + size_t sz; gfp &= ~OBJCGS_CLEAR_MASK; /* Prevent recursive extension vector allocation */ gfp |= __GFP_NO_OBJ_EXT; + sz = obj_exts_alloc_size(s, slab, gfp); + /* * Note that allow_spin may be false during early boot and its * restricted GFP_BOOT_MASK. Due to kmalloc_nolock() only supporting * architectures with cmpxchg16b, early obj_exts will be missing for * very early allocations on those. */ - if (unlikely(!allow_spin)) { - size_t sz = objects * sizeof(struct slabobj_ext); - + if (unlikely(!allow_spin)) vec = kmalloc_nolock(sz, __GFP_ZERO | __GFP_NO_OBJ_EXT, slab_nid(slab)); - } else { - vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp, - slab_nid(slab)); - } + else + vec = kmalloc_node(sz, gfp | __GFP_ZERO, slab_nid(slab)); + if (!vec) { /* * Try to mark vectors which failed to allocate. @@ -2136,6 +2185,9 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, return -ENOMEM; } + VM_WARN_ON_ONCE(virt_to_slab(vec) != NULL && + virt_to_slab(vec)->slab_cache == s); + new_exts = (unsigned long)vec; if (unlikely(!allow_spin)) new_exts |= OBJEXTS_NOSPIN_ALLOC; @@ -2145,6 +2197,8 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, retry: old_exts = READ_ONCE(slab->obj_exts); handle_failed_objexts_alloc(old_exts, vec, objects); + slab_set_stride(slab, sizeof(struct slabobj_ext)); + if (new_slab) { /* * If the slab is brand new and nobody can yet access its @@ -2178,7 +2232,7 @@ static inline void free_slab_obj_exts(struct slab *slab) { struct slabobj_ext *obj_exts; - obj_exts = slab_obj_exts(slab); + obj_exts = (struct slabobj_ext *)slab_obj_exts(slab); if (!obj_exts) { /* * If obj_exts allocation failed, slab->obj_exts is set to @@ -2189,6 +2243,11 @@ static inline void free_slab_obj_exts(struct slab *slab) return; } + if (obj_exts_in_slab(slab->slab_cache, slab)) { + slab->obj_exts = 0; + return; + } + /* * obj_exts was created with __GFP_NO_OBJ_EXT flag, therefore its * corresponding extension will be NULL. alloc_tag_sub() will throw a @@ -2204,6 +2263,54 @@ static inline void free_slab_obj_exts(struct slab *slab) slab->obj_exts = 0; } +/* + * Try to allocate slabobj_ext array from unused space. + * This function must be called on a freshly allocated slab to prevent + * concurrency problems. 
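+ *
+ * A sketch of the fit check, assuming no extra alignment constraints
+ * (the authoritative version is obj_exts_fit_within_slab_leftover()):
+ *
+ *	leftover = slab_size(slab) - slab->objects * s->size;
+ *	fits = leftover >= slab->objects * sizeof(struct slabobj_ext);
+ *
+ * When it fits, the vector is carved out of the slab's own unused tail
+ * at obj_exts_offset_in_slab() and needs no separate allocation that
+ * could pin another slab.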
+ */ +static void alloc_slab_obj_exts_early(struct kmem_cache *s, struct slab *slab) +{ + void *addr; + unsigned long obj_exts; + + if (!need_slab_obj_exts(s)) + return; + + if (obj_exts_fit_within_slab_leftover(s, slab)) { + addr = slab_address(slab) + obj_exts_offset_in_slab(s, slab); + addr = kasan_reset_tag(addr); + obj_exts = (unsigned long)addr; + + get_slab_obj_exts(obj_exts); + memset(addr, 0, obj_exts_size_in_slab(slab)); + put_slab_obj_exts(obj_exts); + +#ifdef CONFIG_MEMCG + obj_exts |= MEMCG_DATA_OBJEXTS; +#endif + slab->obj_exts = obj_exts; + slab_set_stride(slab, sizeof(struct slabobj_ext)); + } else if (s->flags & SLAB_OBJ_EXT_IN_OBJ) { + unsigned int offset = obj_exts_offset_in_object(s); + + obj_exts = (unsigned long)slab_address(slab); + obj_exts += s->red_left_pad; + obj_exts += offset; + + get_slab_obj_exts(obj_exts); + for_each_object(addr, s, slab_address(slab), slab->objects) + memset(kasan_reset_tag(addr) + offset, 0, + sizeof(struct slabobj_ext)); + put_slab_obj_exts(obj_exts); + +#ifdef CONFIG_MEMCG + obj_exts |= MEMCG_DATA_OBJEXTS; +#endif + slab->obj_exts = obj_exts; + slab_set_stride(slab, s->size); + } +} + #else /* CONFIG_SLAB_OBJ_EXT */ static inline void init_slab_obj_exts(struct slab *slab) @@ -2220,31 +2327,37 @@ static inline void free_slab_obj_exts(struct slab *slab) { } +static inline void alloc_slab_obj_exts_early(struct kmem_cache *s, + struct slab *slab) +{ +} + #endif /* CONFIG_SLAB_OBJ_EXT */ #ifdef CONFIG_MEM_ALLOC_PROFILING -static inline struct slabobj_ext * -prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p) +static inline unsigned long +prepare_slab_obj_exts_hook(struct kmem_cache *s, struct slab *slab, + gfp_t flags, void *p) { - struct slab *slab; - - slab = virt_to_slab(p); if (!slab_obj_exts(slab) && alloc_slab_obj_exts(slab, s, flags, false)) { pr_warn_once("%s, %s: Failed to create slab extension vector!\n", __func__, s->name); - return NULL; + return 0; } - return slab_obj_exts(slab) + obj_to_index(s, slab, p); + return slab_obj_exts(slab); } + /* Should be called only if mem_alloc_profiling_enabled() */ static noinline void __alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) { - struct slabobj_ext *obj_exts; + unsigned long obj_exts; + struct slabobj_ext *obj_ext; + struct slab *slab; if (!object) return; @@ -2255,16 +2368,23 @@ __alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) if (flags & __GFP_NO_OBJ_EXT) return; - obj_exts = prepare_slab_obj_exts_hook(s, flags, object); + slab = virt_to_slab(object); + obj_exts = prepare_slab_obj_exts_hook(s, slab, flags, object); /* * Currently obj_exts is used only for allocation profiling. * If other users appear then mem_alloc_profiling_enabled() * check should be added before alloc_tag_add(). */ - if (likely(obj_exts)) - alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size); - else + if (obj_exts) { + unsigned int obj_idx = obj_to_index(s, slab, object); + + get_slab_obj_exts(obj_exts); + obj_ext = slab_obj_ext(slab, obj_exts, obj_idx); + alloc_tag_add(&obj_ext->ref, current->alloc_tag, s->size); + put_slab_obj_exts(obj_exts); + } else { alloc_tag_set_inaccurate(current->alloc_tag); + } } static inline void @@ -2279,8 +2399,8 @@ static noinline void __alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects) { - struct slabobj_ext *obj_exts; int i; + unsigned long obj_exts; /* slab->obj_exts might not be NULL if it was created for MEMCG accounting. 
*/ if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE)) @@ -2290,11 +2410,13 @@ __alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p if (!obj_exts) return; + get_slab_obj_exts(obj_exts); for (i = 0; i < objects; i++) { unsigned int off = obj_to_index(s, slab, p[i]); - alloc_tag_sub(&obj_exts[off].ref, s->size); + alloc_tag_sub(&slab_obj_ext(slab, obj_exts, off)->ref, s->size); } + put_slab_obj_exts(obj_exts); } static inline void @@ -2352,7 +2474,7 @@ static __fastpath_inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects) { - struct slabobj_ext *obj_exts; + unsigned long obj_exts; if (!memcg_kmem_online()) return; @@ -2361,13 +2483,16 @@ void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, if (likely(!obj_exts)) return; + get_slab_obj_exts(obj_exts); __memcg_slab_free_hook(s, slab, p, objects, obj_exts); + put_slab_obj_exts(obj_exts); } static __fastpath_inline bool memcg_slab_post_charge(void *p, gfp_t flags) { - struct slabobj_ext *slab_exts; + unsigned long obj_exts; + struct slabobj_ext *obj_ext; struct kmem_cache *s; struct page *page; struct slab *slab; @@ -2408,11 +2533,16 @@ bool memcg_slab_post_charge(void *p, gfp_t flags) return true; /* Ignore already charged objects. */ - slab_exts = slab_obj_exts(slab); - if (slab_exts) { + obj_exts = slab_obj_exts(slab); + if (obj_exts) { + get_slab_obj_exts(obj_exts); off = obj_to_index(s, slab, p); - if (unlikely(slab_exts[off].objcg)) + obj_ext = slab_obj_ext(slab, obj_exts, off); + if (unlikely(obj_ext->objcg)) { + put_slab_obj_exts(obj_exts); return true; + } + put_slab_obj_exts(obj_exts); } return __memcg_slab_post_alloc_hook(s, NULL, flags, 1, &p); @@ -2596,7 +2726,8 @@ static void *setup_object(struct kmem_cache *s, void *object) return object; } -static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) +static struct slab_sheaf *__alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp, + unsigned int capacity) { struct slab_sheaf *sheaf; size_t sheaf_size; @@ -2614,7 +2745,7 @@ static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) if (s->flags & SLAB_KMALLOC) gfp |= __GFP_NO_OBJ_EXT; - sheaf_size = struct_size(sheaf, objects, s->sheaf_capacity); + sheaf_size = struct_size(sheaf, objects, capacity); sheaf = kzalloc(sheaf_size, gfp); if (unlikely(!sheaf)) @@ -2627,6 +2758,12 @@ static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) return sheaf; } +static inline struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, + gfp_t gfp) +{ + return __alloc_empty_sheaf(s, gfp, s->sheaf_capacity); +} + static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf) { kfree(sheaf); @@ -2634,9 +2771,9 @@ static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf) stat(s, SHEAF_FREE); } -static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, - size_t size, void **p); - +static unsigned int +refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, + unsigned int max); static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf, gfp_t gfp) @@ -2647,8 +2784,8 @@ static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf, if (!to_fill) return 0; - filled = __kmem_cache_alloc_bulk(s, gfp, to_fill, - &sheaf->objects[sheaf->size]); + filled = refill_objects(s, &sheaf->objects[sheaf->size], gfp, to_fill, + to_fill); sheaf->size += filled; @@ -2849,12 +2986,23 @@ static void pcs_destroy(struct kmem_cache *s) { int 
cpu;

+	/*
+	 * We may be unwinding cache creation that failed before or during
+	 * the allocation of s->cpu_sheaves itself.
+	 */
+	if (!s->cpu_sheaves)
+		return;
+
+	/* pcs->main can only point to the bootstrap sheaf, nothing to free */
+	if (!cache_has_sheaves(s))
+		goto free_pcs;
+
	for_each_possible_cpu(cpu) {
		struct slub_percpu_sheaves *pcs;

		pcs = per_cpu_ptr(s->cpu_sheaves, cpu);

-		/* can happen when unwinding failed create */
+		/* This can happen when unwinding failed cache creation. */
		if (!pcs->main)
			continue;
@@ -2876,11 +3024,13 @@ static void pcs_destroy(struct kmem_cache *s)
		}
	}

+free_pcs:
	free_percpu(s->cpu_sheaves);
	s->cpu_sheaves = NULL;
}

-static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn)
+static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn,
+					       bool allow_spin)
{
	struct slab_sheaf *empty = NULL;
	unsigned long flags;
@@ -2888,7 +3038,10 @@ static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn)
	if (!data_race(barn->nr_empty))
		return NULL;

-	spin_lock_irqsave(&barn->lock, flags);
+	if (likely(allow_spin))
+		spin_lock_irqsave(&barn->lock, flags);
+	else if (!spin_trylock_irqsave(&barn->lock, flags))
+		return NULL;

	if (likely(barn->nr_empty)) {
		empty = list_first_entry(&barn->sheaves_empty,
@@ -2965,7 +3118,8 @@ static struct slab_sheaf *barn_get_full_or_empty_sheaf(struct node_barn *barn)
 * change.
 */
static struct slab_sheaf *
-barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty)
+barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty,
+			 bool allow_spin)
{
	struct slab_sheaf *full = NULL;
	unsigned long flags;
@@ -2973,7 +3127,10 @@ barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty)
	if (!data_race(barn->nr_full))
		return NULL;

-	spin_lock_irqsave(&barn->lock, flags);
+	if (likely(allow_spin))
+		spin_lock_irqsave(&barn->lock, flags);
+	else if (!spin_trylock_irqsave(&barn->lock, flags))
+		return NULL;

	if (likely(barn->nr_full)) {
		full = list_first_entry(&barn->sheaves_full, struct slab_sheaf,
@@ -2994,7 +3151,8 @@ barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty)
 * barn. But if there are too many full sheaves, reject this with -E2BIG.
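 *
 * Caller-side shape, for illustration only (error values as in the code
 * below: -EBUSY when the nolock variant fails the trylock, -ENOMEM when
 * no empty sheaf is available, -E2BIG when the barn already holds too
 * many full sheaves):
 *
 *	empty = barn_replace_full_sheaf(barn, pcs->main, allow_spin);
 *	if (IS_ERR(empty))
 *		fall back to flushing the full sheaf directly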
*/ static struct slab_sheaf * -barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full) +barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full, + bool allow_spin) { struct slab_sheaf *empty; unsigned long flags; @@ -3005,7 +3163,10 @@ barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full) if (!data_race(barn->nr_empty)) return ERR_PTR(-ENOMEM); - spin_lock_irqsave(&barn->lock, flags); + if (likely(allow_spin)) + spin_lock_irqsave(&barn->lock, flags); + else if (!spin_trylock_irqsave(&barn->lock, flags)) + return ERR_PTR(-EBUSY); if (likely(barn->nr_empty)) { empty = list_first_entry(&barn->sheaves_empty, struct slab_sheaf, @@ -3198,7 +3359,9 @@ static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab) static __always_inline void account_slab(struct slab *slab, int order, struct kmem_cache *s, gfp_t gfp) { - if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT)) + if (memcg_kmem_online() && + (s->flags & SLAB_ACCOUNT) && + !slab_obj_exts(slab)) alloc_slab_obj_exts(slab, s, gfp, true); mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), @@ -3262,9 +3425,6 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) slab->objects = oo_objects(oo); slab->inuse = 0; slab->frozen = 0; - init_slab_obj_exts(slab); - - account_slab(slab, oo_order(oo), s, flags); slab->slab_cache = s; @@ -3273,6 +3433,13 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) start = slab_address(slab); setup_slab_debug(s, slab, start); + init_slab_obj_exts(slab); + /* + * Poison the slab before initializing the slabobj_ext array + * to prevent the array from being overwritten. + */ + alloc_slab_obj_exts_early(s, slab); + account_slab(slab, oo_order(oo), s, flags); shuffle = shuffle_freelist(s, slab); @@ -3303,7 +3470,7 @@ static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node) flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); } -static void __free_slab(struct kmem_cache *s, struct slab *slab) +static void __free_slab(struct kmem_cache *s, struct slab *slab, bool allow_spin) { struct page *page = slab_page(slab); int order = compound_order(page); @@ -3314,14 +3481,26 @@ static void __free_slab(struct kmem_cache *s, struct slab *slab) __ClearPageSlab(page); mm_account_reclaimed_pages(pages); unaccount_slab(slab, order, s); - free_frozen_pages(page, order); + if (allow_spin) + free_frozen_pages(page, order); + else + free_frozen_pages_nolock(page, order); +} + +static void free_new_slab_nolock(struct kmem_cache *s, struct slab *slab) +{ + /* + * Since it was just allocated, we can skip the actions in + * discard_slab() and free_slab(). + */ + __free_slab(s, slab, false); } static void rcu_free_slab(struct rcu_head *h) { struct slab *slab = container_of(h, struct slab, rcu_head); - __free_slab(slab->slab_cache, slab); + __free_slab(slab->slab_cache, slab, true); } static void free_slab(struct kmem_cache *s, struct slab *slab) @@ -3337,7 +3516,7 @@ static void free_slab(struct kmem_cache *s, struct slab *slab) if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) call_rcu(&slab->rcu_head, rcu_free_slab); else - __free_slab(s, slab); + __free_slab(s, slab, true); } static void discard_slab(struct kmem_cache *s, struct slab *slab) @@ -3365,10 +3544,10 @@ static inline void slab_clear_node_partial(struct slab *slab) * Management of partially allocated slabs. 
*/ static inline void -__add_partial(struct kmem_cache_node *n, struct slab *slab, int tail) +__add_partial(struct kmem_cache_node *n, struct slab *slab, enum add_mode mode) { n->nr_partial++; - if (tail == DEACTIVATE_TO_TAIL) + if (mode == ADD_TO_TAIL) list_add_tail(&slab->slab_list, &n->partial); else list_add(&slab->slab_list, &n->partial); @@ -3376,10 +3555,10 @@ __add_partial(struct kmem_cache_node *n, struct slab *slab, int tail) } static inline void add_partial(struct kmem_cache_node *n, - struct slab *slab, int tail) + struct slab *slab, enum add_mode mode) { lockdep_assert_held(&n->list_lock); - __add_partial(n, slab, tail); + __add_partial(n, slab, mode); } static inline void remove_partial(struct kmem_cache_node *n, @@ -3430,8 +3609,6 @@ static void *alloc_single_from_partial(struct kmem_cache *s, return object; } -static void defer_deactivate_slab(struct slab *slab, void *flush_freelist); - /* * Called only for kmem_cache_debug() caches to allocate from a freshly * allocated slab. Allocate a single object instead of whole freelist @@ -3447,8 +3624,8 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab, void *object; if (!allow_spin && !spin_trylock_irqsave(&n->list_lock, flags)) { - /* Unlucky, discard newly allocated slab */ - defer_deactivate_slab(slab, NULL); + /* Unlucky, discard newly allocated slab. */ + free_new_slab_nolock(s, slab); return NULL; } @@ -3474,7 +3651,7 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab, if (slab->inuse == slab->objects) add_full(s, n, slab); else - add_partial(n, slab, DEACTIVATE_TO_HEAD); + add_partial(n, slab, ADD_TO_HEAD); inc_slabs_node(s, nid, slab->objects); spin_unlock_irqrestore(&n->list_lock, flags); @@ -3482,29 +3659,78 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab, return object; } -#ifdef CONFIG_SLUB_CPU_PARTIAL -static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain); -#else -static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab, - int drain) { } -#endif static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags); +static bool get_partial_node_bulk(struct kmem_cache *s, + struct kmem_cache_node *n, + struct partial_bulk_context *pc, + bool allow_spin) +{ + struct slab *slab, *slab2; + unsigned int total_free = 0; + unsigned long flags; + + /* Racy check to avoid taking the lock unnecessarily. 
 */
+	if (!n || data_race(!n->nr_partial))
+		return false;
+
+	INIT_LIST_HEAD(&pc->slabs);
+
+	if (allow_spin)
+		spin_lock_irqsave(&n->list_lock, flags);
+	else if (!spin_trylock_irqsave(&n->list_lock, flags))
+		return false;
+
+	list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
+		struct freelist_counters flc;
+		unsigned int slab_free;
+
+		if (!pfmemalloc_match(slab, pc->flags))
+			continue;
+
+		/*
+		 * Determine the number of free objects in the slab racily.
+		 *
+		 * slab_free is a lower bound due to possible subsequent
+		 * concurrent freeing, so the caller may get more objects than
+		 * requested and must handle that.
+		 */
+		flc.counters = data_race(READ_ONCE(slab->counters));
+		slab_free = flc.objects - flc.inuse;
+
+		/* we already have the minimum and this would get us over the maximum */
+		if (total_free >= pc->min_objects
+		    && total_free + slab_free > pc->max_objects)
+			break;
+
+		remove_partial(n, slab);
+
+		list_add(&slab->slab_list, &pc->slabs);
+
+		total_free += slab_free;
+		if (total_free >= pc->max_objects)
+			break;
+	}
+
+	spin_unlock_irqrestore(&n->list_lock, flags);
+	return total_free > 0;
+}
+
 /*
- * Try to allocate a partial slab from a specific node.
+ * Try to allocate an object from a partial slab on a specific node.
 */
-static struct slab *get_partial_node(struct kmem_cache *s,
-				     struct kmem_cache_node *n,
-				     struct partial_context *pc)
+static void *get_from_partial_node(struct kmem_cache *s,
+				   struct kmem_cache_node *n,
+				   struct partial_context *pc)
{
-	struct slab *slab, *slab2, *partial = NULL;
+	struct slab *slab, *slab2;
	unsigned long flags;
-	unsigned int partial_slabs = 0;
+	void *object = NULL;

	/*
	 * Racy check. If we mistakenly see no partial slabs then we
	 * just allocate an empty slab. If we mistakenly try to get a
-	 * partial slab and there is none available then get_partial()
+	 * partial slab and there is none available then get_from_partial()
	 * will return NULL.
	 */
	if (!n || !n->nr_partial)
@@ -3515,54 +3741,55 @@ static struct slab *get_partial_node(struct kmem_cache *s,
	else if (!spin_trylock_irqsave(&n->list_lock, flags))
		return NULL;

	list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
+
+		struct freelist_counters old, new;
+
		if (!pfmemalloc_match(slab, pc->flags))
			continue;

		if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
-			void *object = alloc_single_from_partial(s, n, slab,
+			object = alloc_single_from_partial(s, n, slab,
							pc->orig_size);
-			if (object) {
-				partial = slab;
-				pc->object = object;
+			if (object)
				break;
-			}
			continue;
		}

-		remove_partial(n, slab);
+		/*
+		 * Get a single object from the slab. This might race against
+		 * __slab_free(), which however has to take the list_lock if
+		 * it's about to make the slab fully free.
+		 */
+		do {
+			old.freelist = slab->freelist;
+			old.counters = slab->counters;

-		if (!partial) {
-			partial = slab;
-			stat(s, ALLOC_FROM_PARTIAL);
+			new.freelist = get_freepointer(s, old.freelist);
+			new.counters = old.counters;
+			new.inuse++;

-			if ((slub_get_cpu_partial(s) == 0)) {
-				break;
-			}
-		} else {
-			put_cpu_partial(s, slab, 0);
-			stat(s, CPU_PARTIAL_NODE);
+		} while (!__slab_update_freelist(s, slab, &old, &new, "get_from_partial_node"));

-			if (++partial_slabs > slub_get_cpu_partial(s) / 2) {
-				break;
-			}
-		}
+		object = old.freelist;
+		if (!new.freelist)
+			remove_partial(n, slab);
+
+		break;
	}
	spin_unlock_irqrestore(&n->list_lock, flags);
-	return partial;
+	return object;
}

 /*
- * Get a slab from somewhere. Search in increasing NUMA distances.
+ * Get an object from somewhere.
Search in increasing NUMA distances. */ -static struct slab *get_any_partial(struct kmem_cache *s, - struct partial_context *pc) +static void *get_from_any_partial(struct kmem_cache *s, struct partial_context *pc) { #ifdef CONFIG_NUMA struct zonelist *zonelist; struct zoneref *z; struct zone *zone; enum zone_type highest_zoneidx = gfp_zone(pc->flags); - struct slab *slab; unsigned int cpuset_mems_cookie; /* @@ -3597,8 +3824,10 @@ static struct slab *get_any_partial(struct kmem_cache *s, if (n && cpuset_zone_allowed(zone, pc->flags) && n->nr_partial > s->min_partial) { - slab = get_partial_node(s, n, pc); - if (slab) { + + void *object = get_from_partial_node(s, n, pc); + + if (object) { /* * Don't check read_mems_allowed_retry() * here - if mems_allowed was updated in @@ -3606,7 +3835,7 @@ static struct slab *get_any_partial(struct kmem_cache *s, * between allocation and the cpuset * update */ - return slab; + return object; } } } @@ -3616,424 +3845,29 @@ static struct slab *get_any_partial(struct kmem_cache *s, } /* - * Get a partial slab, lock it and return it. + * Get an object from a partial slab */ -static struct slab *get_partial(struct kmem_cache *s, int node, - struct partial_context *pc) +static void *get_from_partial(struct kmem_cache *s, int node, + struct partial_context *pc) { - struct slab *slab; int searchnode = node; + void *object; if (node == NUMA_NO_NODE) searchnode = numa_mem_id(); - slab = get_partial_node(s, get_node(s, searchnode), pc); - if (slab || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE))) - return slab; - - return get_any_partial(s, pc); -} - -#ifdef CONFIG_PREEMPTION -/* - * Calculate the next globally unique transaction for disambiguation - * during cmpxchg. The transactions start with the cpu number and are then - * incremented by CONFIG_NR_CPUS. - */ -#define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS) -#else -/* - * No preemption supported therefore also no need to check for - * different cpus. - */ -#define TID_STEP 1 -#endif /* CONFIG_PREEMPTION */ - -static inline unsigned long next_tid(unsigned long tid) -{ - return tid + TID_STEP; -} - -#ifdef SLUB_DEBUG_CMPXCHG -static inline unsigned int tid_to_cpu(unsigned long tid) -{ - return tid % TID_STEP; -} - -static inline unsigned long tid_to_event(unsigned long tid) -{ - return tid / TID_STEP; -} -#endif - -static inline unsigned int init_tid(int cpu) -{ - return cpu; -} - -static inline void note_cmpxchg_failure(const char *n, - const struct kmem_cache *s, unsigned long tid) -{ -#ifdef SLUB_DEBUG_CMPXCHG - unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); - - pr_info("%s %s: cmpxchg redo ", n, s->name); - - if (IS_ENABLED(CONFIG_PREEMPTION) && - tid_to_cpu(tid) != tid_to_cpu(actual_tid)) { - pr_warn("due to cpu change %d -> %d\n", - tid_to_cpu(tid), tid_to_cpu(actual_tid)); - } else if (tid_to_event(tid) != tid_to_event(actual_tid)) { - pr_warn("due to cpu running other code. 
Event %ld->%ld\n", - tid_to_event(tid), tid_to_event(actual_tid)); - } else { - pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n", - actual_tid, tid, next_tid(tid)); - } -#endif - stat(s, CMPXCHG_DOUBLE_CPU_FAIL); -} - -static void init_kmem_cache_cpus(struct kmem_cache *s) -{ -#ifdef CONFIG_PREEMPT_RT - /* - * Register lockdep key for non-boot kmem caches to avoid - * WARN_ON_ONCE(static_obj(key))) in lockdep_register_key() - */ - bool finegrain_lockdep = !init_section_contains(s, 1); -#else - /* - * Don't bother with different lockdep classes for each - * kmem_cache, since we only use local_trylock_irqsave(). - */ - bool finegrain_lockdep = false; -#endif - int cpu; - struct kmem_cache_cpu *c; - - if (finegrain_lockdep) - lockdep_register_key(&s->lock_key); - for_each_possible_cpu(cpu) { - c = per_cpu_ptr(s->cpu_slab, cpu); - local_trylock_init(&c->lock); - if (finegrain_lockdep) - lockdep_set_class(&c->lock, &s->lock_key); - c->tid = init_tid(cpu); - } -} - -/* - * Finishes removing the cpu slab. Merges cpu's freelist with slab's freelist, - * unfreezes the slabs and puts it on the proper list. - * Assumes the slab has been already safely taken away from kmem_cache_cpu - * by the caller. - */ -static void deactivate_slab(struct kmem_cache *s, struct slab *slab, - void *freelist) -{ - struct kmem_cache_node *n = get_node(s, slab_nid(slab)); - int free_delta = 0; - void *nextfree, *freelist_iter, *freelist_tail; - int tail = DEACTIVATE_TO_HEAD; - unsigned long flags = 0; - struct freelist_counters old, new; - - if (READ_ONCE(slab->freelist)) { - stat(s, DEACTIVATE_REMOTE_FREES); - tail = DEACTIVATE_TO_TAIL; - } - - /* - * Stage one: Count the objects on cpu's freelist as free_delta and - * remember the last object in freelist_tail for later splicing. - */ - freelist_tail = NULL; - freelist_iter = freelist; - while (freelist_iter) { - nextfree = get_freepointer(s, freelist_iter); - - /* - * If 'nextfree' is invalid, it is possible that the object at - * 'freelist_iter' is already corrupted. So isolate all objects - * starting at 'freelist_iter' by skipping them. - */ - if (freelist_corrupted(s, slab, &freelist_iter, nextfree)) - break; - - freelist_tail = freelist_iter; - free_delta++; - - freelist_iter = nextfree; - } - - /* - * Stage two: Unfreeze the slab while splicing the per-cpu - * freelist to the head of slab's freelist. - */ - do { - old.freelist = READ_ONCE(slab->freelist); - old.counters = READ_ONCE(slab->counters); - VM_BUG_ON(!old.frozen); - - /* Determine target state of the slab */ - new.counters = old.counters; - new.frozen = 0; - if (freelist_tail) { - new.inuse -= free_delta; - set_freepointer(s, freelist_tail, old.freelist); - new.freelist = freelist; - } else { - new.freelist = old.freelist; - } - } while (!slab_update_freelist(s, slab, &old, &new, "unfreezing slab")); - - /* - * Stage three: Manipulate the slab list based on the updated state. - */ - if (!new.inuse && n->nr_partial >= s->min_partial) { - stat(s, DEACTIVATE_EMPTY); - discard_slab(s, slab); - stat(s, FREE_SLAB); - } else if (new.freelist) { - spin_lock_irqsave(&n->list_lock, flags); - add_partial(n, slab, tail); - spin_unlock_irqrestore(&n->list_lock, flags); - stat(s, tail); - } else { - stat(s, DEACTIVATE_FULL); - } -} - -/* - * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock - * can be acquired without a deadlock before invoking the function. - * - * Without LOCKDEP we trust the code to be correct. 
kmalloc_nolock() is - * using local_lock_is_locked() properly before calling local_lock_cpu_slab(), - * and kmalloc() is not used in an unsupported context. - * - * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave(). - * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but - * lockdep_assert() will catch a bug in case: - * #1 - * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock() - * or - * #2 - * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock() - * - * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt - * disabled context. The lock will always be acquired and if needed it - * block and sleep until the lock is available. - * #1 is possible in !PREEMPT_RT only. - * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock: - * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) -> - * tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B) - * - * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B - */ -#if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP) -#define local_lock_cpu_slab(s, flags) \ - local_lock_irqsave(&(s)->cpu_slab->lock, flags) -#else -#define local_lock_cpu_slab(s, flags) \ - do { \ - bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \ - lockdep_assert(__l); \ - } while (0) -#endif - -#define local_unlock_cpu_slab(s, flags) \ - local_unlock_irqrestore(&(s)->cpu_slab->lock, flags) - -#ifdef CONFIG_SLUB_CPU_PARTIAL -static void __put_partials(struct kmem_cache *s, struct slab *partial_slab) -{ - struct kmem_cache_node *n = NULL, *n2 = NULL; - struct slab *slab, *slab_to_discard = NULL; - unsigned long flags = 0; - - while (partial_slab) { - slab = partial_slab; - partial_slab = slab->next; - - n2 = get_node(s, slab_nid(slab)); - if (n != n2) { - if (n) - spin_unlock_irqrestore(&n->list_lock, flags); - - n = n2; - spin_lock_irqsave(&n->list_lock, flags); - } - - if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) { - slab->next = slab_to_discard; - slab_to_discard = slab; - } else { - add_partial(n, slab, DEACTIVATE_TO_TAIL); - stat(s, FREE_ADD_PARTIAL); - } - } - - if (n) - spin_unlock_irqrestore(&n->list_lock, flags); + object = get_from_partial_node(s, get_node(s, searchnode), pc); + if (object || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE))) + return object; - while (slab_to_discard) { - slab = slab_to_discard; - slab_to_discard = slab_to_discard->next; - - stat(s, DEACTIVATE_EMPTY); - discard_slab(s, slab); - stat(s, FREE_SLAB); - } -} - -/* - * Put all the cpu partial slabs to the node partial list. - */ -static void put_partials(struct kmem_cache *s) -{ - struct slab *partial_slab; - unsigned long flags; - - local_lock_irqsave(&s->cpu_slab->lock, flags); - partial_slab = this_cpu_read(s->cpu_slab->partial); - this_cpu_write(s->cpu_slab->partial, NULL); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); - - if (partial_slab) - __put_partials(s, partial_slab); -} - -static void put_partials_cpu(struct kmem_cache *s, - struct kmem_cache_cpu *c) -{ - struct slab *partial_slab; - - partial_slab = slub_percpu_partial(c); - c->partial = NULL; - - if (partial_slab) - __put_partials(s, partial_slab); -} - -/* - * Put a slab into a partial slab slot if available. - * - * If we did not find a slot then simply move all the partials to the - * per node partial list. 
- */ -static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) -{ - struct slab *oldslab; - struct slab *slab_to_put = NULL; - unsigned long flags; - int slabs = 0; - - local_lock_cpu_slab(s, flags); - - oldslab = this_cpu_read(s->cpu_slab->partial); - - if (oldslab) { - if (drain && oldslab->slabs >= s->cpu_partial_slabs) { - /* - * Partial array is full. Move the existing set to the - * per node partial list. Postpone the actual unfreezing - * outside of the critical section. - */ - slab_to_put = oldslab; - oldslab = NULL; - } else { - slabs = oldslab->slabs; - } - } - - slabs++; - - slab->slabs = slabs; - slab->next = oldslab; - - this_cpu_write(s->cpu_slab->partial, slab); - - local_unlock_cpu_slab(s, flags); - - if (slab_to_put) { - __put_partials(s, slab_to_put); - stat(s, CPU_PARTIAL_DRAIN); - } -} - -#else /* CONFIG_SLUB_CPU_PARTIAL */ - -static inline void put_partials(struct kmem_cache *s) { } -static inline void put_partials_cpu(struct kmem_cache *s, - struct kmem_cache_cpu *c) { } - -#endif /* CONFIG_SLUB_CPU_PARTIAL */ - -static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) -{ - unsigned long flags; - struct slab *slab; - void *freelist; - - local_lock_irqsave(&s->cpu_slab->lock, flags); - - slab = c->slab; - freelist = c->freelist; - - c->slab = NULL; - c->freelist = NULL; - c->tid = next_tid(c->tid); - - local_unlock_irqrestore(&s->cpu_slab->lock, flags); - - if (slab) { - deactivate_slab(s, slab, freelist); - stat(s, CPUSLAB_FLUSH); - } -} - -static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) -{ - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - void *freelist = c->freelist; - struct slab *slab = c->slab; - - c->slab = NULL; - c->freelist = NULL; - c->tid = next_tid(c->tid); - - if (slab) { - deactivate_slab(s, slab, freelist); - stat(s, CPUSLAB_FLUSH); - } - - put_partials_cpu(s, c); -} - -static inline void flush_this_cpu_slab(struct kmem_cache *s) -{ - struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); - - if (c->slab) - flush_slab(s, c); - - put_partials(s); -} - -static bool has_cpu_slab(int cpu, struct kmem_cache *s) -{ - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - - return c->slab || slub_percpu_partial(c); + return get_from_any_partial(s, pc); } static bool has_pcs_used(int cpu, struct kmem_cache *s) { struct slub_percpu_sheaves *pcs; - if (!s->cpu_sheaves) + if (!cache_has_sheaves(s)) return false; pcs = per_cpu_ptr(s->cpu_sheaves, cpu); @@ -4042,11 +3876,11 @@ static bool has_pcs_used(int cpu, struct kmem_cache *s) } /* - * Flush cpu slab. + * Flush percpu sheaves * * Called from CPU work handler with migration disabled. 
*/ -static void flush_cpu_slab(struct work_struct *w) +static void flush_cpu_sheaves(struct work_struct *w) { struct kmem_cache *s; struct slub_flush_work *sfw; @@ -4055,10 +3889,8 @@ static void flush_cpu_slab(struct work_struct *w) s = sfw->s; - if (s->cpu_sheaves) + if (cache_has_sheaves(s)) pcs_flush_all(s); - - flush_this_cpu_slab(s); } static void flush_all_cpus_locked(struct kmem_cache *s) @@ -4071,11 +3903,11 @@ static void flush_all_cpus_locked(struct kmem_cache *s) for_each_online_cpu(cpu) { sfw = &per_cpu(slub_flush, cpu); - if (!has_cpu_slab(cpu, s) && !has_pcs_used(cpu, s)) { + if (!has_pcs_used(cpu, s)) { sfw->skip = true; continue; } - INIT_WORK(&sfw->work, flush_cpu_slab); + INIT_WORK(&sfw->work, flush_cpu_sheaves); sfw->skip = false; sfw->s = s; queue_work_on(cpu, flushwq, &sfw->work); @@ -4160,7 +3992,7 @@ void flush_all_rcu_sheaves(void) mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { - if (!s->cpu_sheaves) + if (!cache_has_sheaves(s)) continue; flush_rcu_sheaves_on_cache(s); } @@ -4181,27 +4013,13 @@ static int slub_cpu_dead(unsigned int cpu) mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { - __flush_cpu_slab(s, cpu); - if (s->cpu_sheaves) + if (cache_has_sheaves(s)) __pcs_flush_all_cpu(s, cpu); } mutex_unlock(&slab_mutex); return 0; } -/* - * Check if the objects in a per cpu structure fit numa - * locality expectations. - */ -static inline int node_match(struct slab *slab, int node) -{ -#ifdef CONFIG_NUMA - if (node != NUMA_NO_NODE && slab_nid(slab) != node) - return 0; -#endif - return 1; -} - #ifdef CONFIG_SLUB_DEBUG static int count_free(struct slab *slab) { @@ -4374,235 +4192,116 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags) return true; } -static inline bool -__update_cpu_freelist_fast(struct kmem_cache *s, - void *freelist_old, void *freelist_new, - unsigned long tid) -{ - struct freelist_tid old = { .freelist = freelist_old, .tid = tid }; - struct freelist_tid new = { .freelist = freelist_new, .tid = next_tid(tid) }; - - return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid, - &old.freelist_tid, new.freelist_tid); -} - /* - * Check the slab->freelist and either transfer the freelist to the - * per cpu freelist or deactivate the slab. + * Get the slab's freelist and do not freeze it. * - * The slab is still frozen if the return value is not NULL. + * Assumes the slab is isolated from node partial list and not frozen. * - * If this function returns NULL then the slab has been unfrozen. + * Assumes this is performed only for caches without debugging so we + * don't need to worry about adding the slab to the full list. */ -static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) +static inline void *get_freelist_nofreeze(struct kmem_cache *s, struct slab *slab) { struct freelist_counters old, new; - lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); - do { old.freelist = slab->freelist; old.counters = slab->counters; new.freelist = NULL; new.counters = old.counters; + VM_WARN_ON_ONCE(new.frozen); new.inuse = old.objects; - new.frozen = old.freelist != NULL; - - } while (!__slab_update_freelist(s, slab, &old, &new, "get_freelist")); + } while (!slab_update_freelist(s, slab, &old, &new, "get_freelist_nofreeze")); return old.freelist; } /* - * Freeze the partial slab and return the pointer to the freelist. + * If the object has been wiped upon free, make sure it's fully initialized by + * zeroing out freelist pointer. 
+ * + * Note that we also wipe custom freelist pointers. */ -static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab) +static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, + void *obj) { - struct freelist_counters old, new; - - do { - old.freelist = slab->freelist; - old.counters = slab->counters; - - new.freelist = NULL; - new.counters = old.counters; - VM_BUG_ON(new.frozen); - - new.inuse = old.objects; - new.frozen = 1; - - } while (!slab_update_freelist(s, slab, &old, &new, "freeze_slab")); - - return old.freelist; + if (unlikely(slab_want_init_on_free(s)) && obj && + !freeptr_outside_object(s)) + memset((void *)((char *)kasan_reset_tag(obj) + s->offset), + 0, sizeof(void *)); } -/* - * Slow path. The lockless freelist is empty or we need to perform - * debugging duties. - * - * Processing is still very fast if new objects have been freed to the - * regular freelist. In that case we simply take over the regular freelist - * as the lockless freelist and zap the regular freelist. - * - * If that is not working then we fall back to the partial lists. We take the - * first element of the freelist as the object to allocate now and move the - * rest of the freelist to the lockless freelist. - * - * And if we were unable to get a new slab from the partial slab lists then - * we need to allocate a new slab. This is the slowest path since it involves - * a call to the page allocator and the setup of a new slab. - * - * Version of __slab_alloc to use when we know that preemption is - * already disabled (which is the case for bulk allocation). - */ -static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, - unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) +static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab, + void **p, unsigned int count, bool allow_spin) { - bool allow_spin = gfpflags_allow_spinning(gfpflags); - void *freelist; - struct slab *slab; + unsigned int allocated = 0; + struct kmem_cache_node *n; + bool needs_add_partial; unsigned long flags; - struct partial_context pc; - bool try_thisnode = true; - - stat(s, ALLOC_SLOWPATH); - -reread_slab: - - slab = READ_ONCE(c->slab); - if (!slab) { - /* - * if the node is not online or has no normal memory, just - * ignore the node constraint - */ - if (unlikely(node != NUMA_NO_NODE && - !node_isset(node, slab_nodes))) - node = NUMA_NO_NODE; - goto new_slab; - } - - if (unlikely(!node_match(slab, node))) { - /* - * same as above but node_match() being false already - * implies node != NUMA_NO_NODE. - * - * We don't strictly honor pfmemalloc and NUMA preferences - * when !allow_spin because: - * - * 1. Most kmalloc() users allocate objects on the local node, - * so kmalloc_nolock() tries not to interfere with them by - * deactivating the cpu slab. - * - * 2. Deactivating due to NUMA or pfmemalloc mismatch may cause - * unnecessary slab allocations even when n->partial list - * is not empty. - */ - if (!node_isset(node, slab_nodes) || - !allow_spin) { - node = NUMA_NO_NODE; - } else { - stat(s, ALLOC_NODE_MISMATCH); - goto deactivate_slab; - } - } + void *object; /* - * By rights, we should be searching for a slab page that was - * PFMEMALLOC but right now, we are losing the pfmemalloc - * information when the page leaves the per-cpu allocator + * Are we going to put the slab on the partial list? + * Note slab->inuse is 0 on a new slab. 
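+	 * E.g. a fresh slab with slab->objects == 16 and count == 8 leaves
+	 * eight objects free, so the slab must go on the partial list;
+	 * with count >= 16 it would become fully used and stay off the list.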
	 */
+	needs_add_partial = (slab->objects > count);
+
+	if (!allow_spin && needs_add_partial) {
+
+		n = get_node(s, slab_nid(slab));
+
+		if (!spin_trylock_irqsave(&n->list_lock, flags)) {
+			/* Unlucky, discard newly allocated slab */
+			free_new_slab_nolock(s, slab);
+			return 0;
+		}
+	}
+
+	object = slab->freelist;
+	while (object && allocated < count) {
+		p[allocated] = object;
+		object = get_freepointer(s, object);
+		maybe_wipe_obj_freeptr(s, p[allocated]);
+
+		slab->inuse++;
+		allocated++;
+	}
+	slab->freelist = object;
+
+	if (needs_add_partial) {
+
+		if (allow_spin) {
+			n = get_node(s, slab_nid(slab));
+			spin_lock_irqsave(&n->list_lock, flags);
+		}
+
+		add_partial(n, slab, ADD_TO_HEAD);
+		spin_unlock_irqrestore(&n->list_lock, flags);
+	}
+
+	inc_slabs_node(s, slab_nid(slab), slab->objects);
+	return allocated;
+}
+
+/*
+ * Slow path. We failed to allocate from the percpu sheaves, or they are
+ * unavailable because of bootstrap, enabled debugging, or SLUB_TINY.
+ *
+ * We try to allocate from partial slab lists and fall back to allocating a new
+ * slab.
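+ *
+ * In outline (a sketch; the body below adds the NUMA retry logic and
+ * the !allow_spin trylock handling):
+ *
+ *	object = get_from_partial(s, node, &pc);
+ *	if (!object) {
+ *		slab = new_slab(s, pc.flags, node);
+ *		if (slab)
+ *			alloc_from_new_slab(s, slab, &object, 1, allow_spin);
+ *	}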
+ */ +static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + unsigned long addr, unsigned int orig_size) +{ + bool allow_spin = gfpflags_allow_spinning(gfpflags); + void *object; + struct slab *slab; + struct partial_context pc; + bool try_thisnode = true; - slab->next = NULL; - __put_partials(s, slab); - } -#endif + stat(s, ALLOC_SLOWPATH); new_objects: @@ -4611,12 +4310,12 @@ new_objects: * When a preferred node is indicated but no __GFP_THISNODE * * 1) try to get a partial slab from target node only by having - * __GFP_THISNODE in pc.flags for get_partial() + * __GFP_THISNODE in pc.flags for get_from_partial() * 2) if 1) failed, try to allocate a new slab from target node with * GPF_NOWAIT | __GFP_THISNODE opportunistically * 3) if 2) failed, retry with original gfpflags which will allow - * get_partial() try partial lists of other nodes before potentially - * allocating new page from other nodes + * get_from_partial() try partial lists of other nodes before + * potentially allocating new page from other nodes */ if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) && try_thisnode)) { @@ -4628,33 +4327,11 @@ new_objects: } pc.orig_size = orig_size; - slab = get_partial(s, node, &pc); - if (slab) { - if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { - freelist = pc.object; - /* - * For debug caches here we had to go through - * alloc_single_from_partial() so just store the - * tracking info and return the object. - * - * Due to disabled preemption we need to disallow - * blocking. The flags are further adjusted by - * gfp_nested_mask() in stack_depot itself. - */ - if (s->flags & SLAB_STORE_USER) - set_track(s, freelist, TRACK_ALLOC, addr, - gfpflags & ~(__GFP_DIRECT_RECLAIM)); - - return freelist; - } - - freelist = freeze_slab(s, slab); - goto retry_load_slab; - } + object = get_from_partial(s, node, &pc); + if (object) + goto success; - slub_put_cpu_ptr(s->cpu_slab); slab = new_slab(s, pc.flags, node); - c = slub_get_cpu_ptr(s->cpu_slab); if (unlikely(!slab)) { if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) @@ -4669,165 +4346,36 @@ new_objects: stat(s, ALLOC_SLAB); if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { - freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); - - if (unlikely(!freelist)) { - /* This could cause an endless loop. Fail instead. */ - if (!allow_spin) - return NULL; - goto new_objects; - } - - if (s->flags & SLAB_STORE_USER) - set_track(s, freelist, TRACK_ALLOC, addr, - gfpflags & ~(__GFP_DIRECT_RECLAIM)); - - return freelist; - } - - /* - * No other reference to the slab yet so we can - * muck around with it freely without cmpxchg - */ - freelist = slab->freelist; - slab->freelist = NULL; - slab->inuse = slab->objects; - slab->frozen = 1; + object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); - inc_slabs_node(s, slab_nid(slab), slab->objects); + if (likely(object)) + goto success; + } else { + alloc_from_new_slab(s, slab, &object, 1, allow_spin); - if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) { - /* - * For !pfmemalloc_match() case we don't load freelist so that - * we don't make further mismatched allocations easier. 
- */ - deactivate_slab(s, slab, get_freepointer(s, freelist)); - return freelist; + /* we don't need to check SLAB_STORE_USER here */ + if (likely(object)) + return object; } -retry_load_slab: - - local_lock_cpu_slab(s, flags); - if (unlikely(c->slab)) { - void *flush_freelist = c->freelist; - struct slab *flush_slab = c->slab; - - c->slab = NULL; - c->freelist = NULL; - c->tid = next_tid(c->tid); - - local_unlock_cpu_slab(s, flags); - - if (unlikely(!allow_spin)) { - /* Reentrant slub cannot take locks, defer */ - defer_deactivate_slab(flush_slab, flush_freelist); - } else { - deactivate_slab(s, flush_slab, flush_freelist); - } - - stat(s, CPUSLAB_FLUSH); - - goto retry_load_slab; - } - c->slab = slab; + if (allow_spin) + goto new_objects; - goto load_freelist; -} -/* - * We disallow kprobes in ___slab_alloc() to prevent reentrance - * - * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of - * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf -> - * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast() - * manipulating c->freelist without lock. - * - * This does not prevent kprobe in functions called from ___slab_alloc() such as - * local_lock_irqsave() itself, and that is fine, we only need to protect the - * c->freelist manipulation in ___slab_alloc() itself. - */ -NOKPROBE_SYMBOL(___slab_alloc); + /* This could cause an endless loop. Fail instead. */ + return NULL; -/* - * A wrapper for ___slab_alloc() for contexts where preemption is not yet - * disabled. Compensates for possible cpu changes by refetching the per cpu area - * pointer. - */ -static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, - unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) -{ - void *p; +success: + if (kmem_cache_debug_flags(s, SLAB_STORE_USER)) + set_track(s, object, TRACK_ALLOC, addr, gfpflags); -#ifdef CONFIG_PREEMPT_COUNT - /* - * We may have been preempted and rescheduled on a different - * cpu before disabling preemption. Need to reload cpu area - * pointer. - */ - c = slub_get_cpu_ptr(s->cpu_slab); -#endif - if (unlikely(!gfpflags_allow_spinning(gfpflags))) { - if (local_lock_is_locked(&s->cpu_slab->lock)) { - /* - * EBUSY is an internal signal to kmalloc_nolock() to - * retry a different bucket. It's not propagated - * to the caller. - */ - p = ERR_PTR(-EBUSY); - goto out; - } - } - p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size); -out: -#ifdef CONFIG_PREEMPT_COUNT - slub_put_cpu_ptr(s->cpu_slab); -#endif - return p; + return object; } static __always_inline void *__slab_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) { - struct kmem_cache_cpu *c; - struct slab *slab; - unsigned long tid; void *object; -redo: - /* - * Must read kmem_cache cpu data via this cpu ptr. Preemption is - * enabled. We may switch back and forth between cpus while - * reading from one cpu area. That does not matter as long - * as we end up on the original cpu again when doing the cmpxchg. - * - * We must guarantee that tid and kmem_cache_cpu are retrieved on the - * same cpu. We read first the kmem_cache_cpu pointer and use it to read - * the tid. If we are preempted and switched to another cpu between the - * two reads, it's OK as the two are still associated with the same cpu - * and cmpxchg later will validate the cpu. - */ - c = raw_cpu_ptr(s->cpu_slab); - tid = READ_ONCE(c->tid); - - /* - * Irqless object alloc/free algorithm used here depends on sequence - * of fetching cpu_slab's data. 
tid should be fetched before anything - * on c to guarantee that object and slab associated with previous tid - * won't be used with current tid. If we fetch tid first, object and - * slab could be one associated with next tid and our alloc/free - * request will be failed. In this case, we will retry. So, no problem. - */ - barrier(); - - /* - * The transaction ids are globally unique per cpu and per operation on - * a per cpu queue. Thus they can be guarantee that the cmpxchg_double - * occurs on the right processor and that there was no operation on the - * linked list in between. - */ - - object = c->freelist; - slab = c->slab; - #ifdef CONFIG_NUMA if (static_branch_unlikely(&strict_numa) && node == NUMA_NO_NODE) { @@ -4836,66 +4384,24 @@ redo: if (mpol) { /* - * Special BIND rule support. If existing slab + * Special BIND rule support. If the local node * is in permitted set then do not redirect * to a particular node. * Otherwise we apply the memory policy to get * the node we need to allocate on. */ - if (mpol->mode != MPOL_BIND || !slab || - !node_isset(slab_nid(slab), mpol->nodes)) - + if (mpol->mode != MPOL_BIND || + !node_isset(numa_mem_id(), mpol->nodes)) node = mempolicy_slab_node(); } } #endif - if (!USE_LOCKLESS_FAST_PATH() || - unlikely(!object || !slab || !node_match(slab, node))) { - object = __slab_alloc(s, gfpflags, node, addr, c, orig_size); - } else { - void *next_object = get_freepointer_safe(s, object); - - /* - * The cmpxchg will only match if there was no additional - * operation and if we are on the right processor. - * - * The cmpxchg does the following atomically (without lock - * semantics!) - * 1. Relocate first pointer to the current per cpu area. - * 2. Verify that tid and freelist have not been changed - * 3. If they were not changed replace tid and freelist - * - * Since this is without lock semantics the protection is only - * against code executing on this cpu *not* from access by - * other cpus. - */ - if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) { - note_cmpxchg_failure("slab_alloc", s, tid); - goto redo; - } - prefetch_freepointer(s, next_object); - stat(s, ALLOC_FASTPATH); - } + object = ___slab_alloc(s, gfpflags, node, addr, orig_size); return object; } -/* - * If the object has been wiped upon free, make sure it's fully initialized by - * zeroing out freelist pointer. - * - * Note that we also wipe custom freelist pointers. 
- */ -static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, - void *obj) -{ - if (unlikely(slab_want_init_on_free(s)) && obj && - !freeptr_outside_object(s)) - memset((void *)((char *)kasan_reset_tag(obj) + s->offset), - 0, sizeof(void *)); -} - static __fastpath_inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) { @@ -4982,6 +4488,12 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); + /* Bootstrap or debug cache, back off */ + if (unlikely(!cache_has_sheaves(s))) { + local_unlock(&s->cpu_sheaves->lock); + return NULL; + } + if (pcs->spare && pcs->spare->size > 0) { swap(pcs->main, pcs->spare); return pcs; @@ -4993,7 +4505,8 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, return NULL; } - full = barn_replace_empty_sheaf(barn, pcs->main); + full = barn_replace_empty_sheaf(barn, pcs->main, + gfpflags_allow_spinning(gfp)); if (full) { stat(s, BARN_GET); @@ -5010,7 +4523,7 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, empty = pcs->spare; pcs->spare = NULL; } else { - empty = barn_get_empty_sheaf(barn); + empty = barn_get_empty_sheaf(barn, true); } } @@ -5052,7 +4565,10 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, */ if (pcs->main->size == 0) { - barn_put_empty_sheaf(barn, pcs->main); + if (!pcs->spare) + pcs->spare = pcs->main; + else + barn_put_empty_sheaf(barn, pcs->main); pcs->main = full; return pcs; } @@ -5109,8 +4625,10 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node) * We assume the percpu sheaves contain only local objects although it's * not completely guaranteed, so we verify later. 
*/ - if (unlikely(node_requested && node != numa_mem_id())) + if (unlikely(node_requested && node != numa_mem_id())) { + stat(s, ALLOC_NODE_MISMATCH); return NULL; + } if (!local_trylock(&s->cpu_sheaves->lock)) return NULL; @@ -5133,6 +4651,7 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node) */ if (page_to_nid(virt_to_page(object)) != node) { local_unlock(&s->cpu_sheaves->lock); + stat(s, ALLOC_NODE_MISMATCH); return NULL; } } @@ -5141,13 +4660,14 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node) local_unlock(&s->cpu_sheaves->lock); - stat(s, ALLOC_PCS); + stat(s, ALLOC_FASTPATH); return object; } static __fastpath_inline -unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p) +unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, gfp_t gfp, size_t size, + void **p) { struct slub_percpu_sheaves *pcs; struct slab_sheaf *main; @@ -5165,6 +4685,11 @@ next_batch: struct slab_sheaf *full; struct node_barn *barn; + if (unlikely(!cache_has_sheaves(s))) { + local_unlock(&s->cpu_sheaves->lock); + return allocated; + } + if (pcs->spare && pcs->spare->size > 0) { swap(pcs->main, pcs->spare); goto do_alloc; @@ -5176,7 +4701,8 @@ next_batch: return allocated; } - full = barn_replace_empty_sheaf(barn, pcs->main); + full = barn_replace_empty_sheaf(barn, pcs->main, + gfpflags_allow_spinning(gfp)); if (full) { stat(s, BARN_GET); @@ -5206,7 +4732,7 @@ do_alloc: local_unlock(&s->cpu_sheaves->lock); - stat_add(s, ALLOC_PCS, batch); + stat_add(s, ALLOC_FASTPATH, batch); allocated += batch; @@ -5244,8 +4770,7 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list if (unlikely(object)) goto out; - if (s->cpu_sheaves) - object = alloc_from_pcs(s, gfpflags, node); + object = alloc_from_pcs(s, gfpflags, node); if (!object) object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); @@ -5340,6 +4865,9 @@ static int __prefill_sheaf_pfmemalloc(struct kmem_cache *s, return ret; } +static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, + size_t size, void **p); + /* * returns a sheaf that has at least the requested size * when prefilling is needed, do so with given gfp flags @@ -5353,18 +4881,10 @@ kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size) struct slab_sheaf *sheaf = NULL; struct node_barn *barn; - if (unlikely(size > s->sheaf_capacity)) { + if (unlikely(!size)) + return NULL; - /* - * slab_debug disables cpu sheaves intentionally so all - * prefilled sheaves become "oversize" and we give up on - * performance for the debugging. Same with SLUB_TINY. - * Creating a cache without sheaves and then requesting a - * prefilled sheaf is however not expected, so warn. 
- */ - WARN_ON_ONCE(s->sheaf_capacity == 0 && - !IS_ENABLED(CONFIG_SLUB_TINY) && - !(s->flags & SLAB_DEBUG_FLAGS)); + if (unlikely(size > s->sheaf_capacity)) { sheaf = kzalloc(struct_size(sheaf, objects, size), gfp); if (!sheaf) @@ -5686,7 +5206,7 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node) gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags; struct kmem_cache *s; bool can_retry = true; - void *ret = ERR_PTR(-EBUSY); + void *ret; VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO | __GFP_NO_OBJ_EXT)); @@ -5694,13 +5214,13 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node) if (unlikely(!size)) return ZERO_SIZE_PTR; - if (IS_ENABLED(CONFIG_PREEMPT_RT) && !preemptible()) - /* - * kmalloc_nolock() in PREEMPT_RT is not supported from - * non-preemptible context because local_lock becomes a - * sleeping lock on RT. - */ + /* + * See the comment for the same check in + * alloc_frozen_pages_nolock_noprof() + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq())) return NULL; + retry: if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) return NULL; @@ -5709,50 +5229,47 @@ retry: if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s)) /* * kmalloc_nolock() is not supported on architectures that - * don't implement cmpxchg16b, but debug caches don't use - * per-cpu slab and per-cpu partial slabs. They rely on - * kmem_cache_node->list_lock, so kmalloc_nolock() can - * attempt to allocate from debug caches by + * don't implement cmpxchg16b and thus need slab_lock() + * which could be preempted by a nmi. + * But debug caches don't use that and only rely on + * kmem_cache_node->list_lock, so kmalloc_nolock() can attempt + * to allocate from debug caches by * spin_trylock_irqsave(&n->list_lock, ...) */ return NULL; + ret = alloc_from_pcs(s, alloc_gfp, node); + if (ret) + goto success; + /* * Do not call slab_alloc_node(), since trylock mode isn't * compatible with slab_pre_alloc_hook/should_failslab and * kfence_alloc. Hence call __slab_alloc_node() (at most twice) * and slab_post_alloc_hook() directly. - * - * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair - * in irq saved region. It assumes that the same cpu will not - * __update_cpu_freelist_fast() into the same (freelist,tid) pair. - * Therefore use in_nmi() to check whether particular bucket is in - * irq protected section. - * - * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that - * this cpu was interrupted somewhere inside ___slab_alloc() after - * it did local_lock_irqsave(&s->cpu_slab->lock, flags). - * In this case fast path with __update_cpu_freelist_fast() is not safe. */ - if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock)) - ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size); + ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size); - if (PTR_ERR(ret) == -EBUSY) { - if (can_retry) { - /* pick the next kmalloc bucket */ - size = s->object_size + 1; - /* - * Another alternative is to - * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT; - * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT; - * to retry from bucket of the same size. - */ - can_retry = false; - goto retry; - } - ret = NULL; + /* + * It's possible we failed due to trylock as we preempted someone with + * the sheaves locked, and the list_lock is also held by another cpu. + * But it should be rare that multiple kmalloc buckets would have + * sheaves locked, so try a larger one. 
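
The retry policy is "one more attempt against a different bucket, then give up", never a spin. A self-contained sketch; bucket_size[] and the always-busy try_bucket() stub are invented stand-ins for the kmalloc buckets and a trylock-only allocation attempt:

#include <stddef.h>

/* Hypothetical size classes standing in for kmalloc buckets. */
static const size_t bucket_size[] = { 8, 16, 32, 64, 96, 128, 192, 256 };

/* Always-busy stub standing in for a trylock-only allocation attempt. */
static void *try_bucket(size_t size) { (void)size; return NULL; }

static size_t next_bucket(size_t size)
{
	for (size_t i = 0; i < sizeof(bucket_size) / sizeof(bucket_size[0]); i++)
		if (size <= bucket_size[i])
			return bucket_size[i] + 1;	/* land in the next class */
	return 0;
}

/* One retry against a different bucket, then give up. */
static void *alloc_nolock_model(size_t size)
{
	void *p = try_bucket(size);

	if (!p && (size = next_bucket(size)) != 0)
		p = try_bucket(size);
	return p;
}
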
+ */ + if (!ret && can_retry) { + /* pick the next kmalloc bucket */ + size = s->object_size + 1; + /* + * Another alternative is to + * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT; + * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT; + * to retry from bucket of the same size. + */ + can_retry = false; + goto retry; } +success: maybe_wipe_obj_freeptr(s, ret); slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret, slab_want_init_on_alloc(alloc_gfp, s), size); @@ -5834,7 +5351,7 @@ static noinline void free_to_partial_list( /* was on full list */ remove_full(s, n, slab); if (!slab_free) { - add_partial(n, slab, DEACTIVATE_TO_TAIL); + add_partial(n, slab, ADD_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } } else if (slab_free) { @@ -5872,26 +5389,17 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, unsigned long addr) { - bool was_frozen, was_full; + bool was_full; struct freelist_counters old, new; struct kmem_cache_node *n = NULL; unsigned long flags; bool on_node_partial; - stat(s, FREE_SLOWPATH); - if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { free_to_partial_list(s, slab, head, tail, cnt, addr); return; } - /* - * It is enough to test IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) below - * instead of kmem_cache_has_cpu_partial(s), because kmem_cache_debug(s) - * is the only other reason it can be false, and it is already handled - * above. - */ - do { if (unlikely(n)) { spin_unlock_irqrestore(&n->list_lock, flags); @@ -5902,7 +5410,6 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, old.counters = slab->counters; was_full = (old.freelist == NULL); - was_frozen = old.frozen; set_freepointer(s, tail, old.freelist); @@ -5915,53 +5422,29 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, * to (due to not being full anymore) the partial list. * Unless it's frozen. */ - if ((!new.inuse || was_full) && !was_frozen) { + if (!new.inuse || was_full) { + + n = get_node(s, slab_nid(slab)); /* - * If slab becomes non-full and we have cpu partial - * lists, we put it there unconditionally to avoid - * taking the list_lock. Otherwise we need it. + * Speculatively acquire the list_lock. + * If the cmpxchg does not succeed then we may + * drop the list_lock without any processing. + * + * Otherwise the list_lock will synchronize with + * other processors updating the list of slabs. */ - if (!(IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full)) { - - n = get_node(s, slab_nid(slab)); - /* - * Speculatively acquire the list_lock. - * If the cmpxchg does not succeed then we may - * drop the list_lock without any processing. - * - * Otherwise the list_lock will synchronize with - * other processors updating the list of slabs. - */ - spin_lock_irqsave(&n->list_lock, flags); + spin_lock_irqsave(&n->list_lock, flags); - on_node_partial = slab_test_node_partial(slab); - } + on_node_partial = slab_test_node_partial(slab); } } while (!slab_update_freelist(s, slab, &old, &new, "__slab_free")); if (likely(!n)) { - - if (likely(was_frozen)) { - /* - * The list lock was not taken therefore no list - * activity can be necessary. - */ - stat(s, FREE_FROZEN); - } else if (IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full) { - /* - * If we started with a full slab then put it onto the - * per cpu partial list. - */ - put_cpu_partial(s, slab, 1); - stat(s, CPU_PARTIAL_FREE); - } - /* - * In other cases we didn't take the list_lock because the slab - * was already on the partial list and will remain there. 
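
The cmpxchg retry loop at the heart of __slab_free() is, in shape, a lock-free list push. In C11 atomics the same shape looks like the sketch below; the kernel variant additionally folds the inuse counters into one double-word update and may speculatively take list_lock, as the hunk above shows:

#include <stdatomic.h>

struct obj { struct obj *next; };

static _Atomic(struct obj *) freelist;

/* Push [head, tail] onto the freelist; retry if another CPU raced us. */
static void freelist_push(struct obj *head, struct obj *tail)
{
	struct obj *old = atomic_load_explicit(&freelist, memory_order_relaxed);

	do {
		tail->next = old;	/* set_freepointer() analogue */
	} while (!atomic_compare_exchange_weak_explicit(&freelist, &old, head,
							memory_order_release,
							memory_order_relaxed));
}
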
+ * We didn't take the list_lock because the slab was already on + * the partial list and will remain there. */ - return; } @@ -5983,11 +5466,10 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, /* * Objects left in the slab. If it was not on the partial list before - * then add it. This can only happen when cache has no per cpu partial - * list otherwise we would have put it there. + * then add it. */ - if (!IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && unlikely(was_full)) { - add_partial(n, slab, DEACTIVATE_TO_TAIL); + if (unlikely(was_full)) { + add_partial(n, slab, ADD_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } spin_unlock_irqrestore(&n->list_lock, flags); @@ -6073,7 +5555,8 @@ static void __pcs_install_empty_sheaf(struct kmem_cache *s, * unlocked. */ static struct slub_percpu_sheaves * -__pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs) +__pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, + bool allow_spin) { struct slab_sheaf *empty; struct node_barn *barn; @@ -6082,6 +5565,12 @@ __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs) restart: lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); + /* Bootstrap or debug cache, back off */ + if (unlikely(!cache_has_sheaves(s))) { + local_unlock(&s->cpu_sheaves->lock); + return NULL; + } + barn = get_barn(s); if (!barn) { local_unlock(&s->cpu_sheaves->lock); @@ -6091,7 +5580,7 @@ restart: put_fail = false; if (!pcs->spare) { - empty = barn_get_empty_sheaf(barn); + empty = barn_get_empty_sheaf(barn, allow_spin); if (empty) { pcs->spare = pcs->main; pcs->main = empty; @@ -6105,7 +5594,7 @@ restart: return pcs; } - empty = barn_replace_full_sheaf(barn, pcs->main); + empty = barn_replace_full_sheaf(barn, pcs->main, allow_spin); if (!IS_ERR(empty)) { stat(s, BARN_PUT); @@ -6113,7 +5602,8 @@ restart: return pcs; } - if (PTR_ERR(empty) == -E2BIG) { + /* sheaf_flush_unused() doesn't support !allow_spin */ + if (PTR_ERR(empty) == -E2BIG && allow_spin) { /* Since we got here, spare exists and is full */ struct slab_sheaf *to_flush = pcs->spare; @@ -6138,6 +5628,14 @@ restart: alloc_empty: local_unlock(&s->cpu_sheaves->lock); + /* + * alloc_empty_sheaf() doesn't support !allow_spin and it's + * easier to fall back to freeing directly without sheaves + * than add the support (and to sheaf_flush_unused() above) + */ + if (!allow_spin) + return NULL; + empty = alloc_empty_sheaf(s, GFP_NOWAIT); if (empty) goto got_empty; @@ -6180,7 +5678,7 @@ got_empty: * The object is expected to have passed slab_free_hook() already. */ static __fastpath_inline -bool free_to_pcs(struct kmem_cache *s, void *object) +bool free_to_pcs(struct kmem_cache *s, void *object, bool allow_spin) { struct slub_percpu_sheaves *pcs; @@ -6191,7 +5689,7 @@ bool free_to_pcs(struct kmem_cache *s, void *object) if (unlikely(pcs->main->size == s->sheaf_capacity)) { - pcs = __pcs_replace_full_main(s, pcs); + pcs = __pcs_replace_full_main(s, pcs, allow_spin); if (unlikely(!pcs)) return false; } @@ -6200,7 +5698,7 @@ bool free_to_pcs(struct kmem_cache *s, void *object) local_unlock(&s->cpu_sheaves->lock); - stat(s, FREE_PCS); + stat(s, FREE_FASTPATH); return true; } @@ -6265,11 +5763,29 @@ empty: free_empty_sheaf(s, sheaf); } +/* + * kvfree_call_rcu() can be called while holding a raw_spinlock_t. Since + * __kfree_rcu_sheaf() may acquire a spinlock_t (sleeping lock on PREEMPT_RT), + * this would violate lock nesting rules. 
Therefore, kvfree_call_rcu() avoids + * this problem by bypassing the sheaves layer entirely on PREEMPT_RT. + * + * However, lockdep still complains that it is invalid to acquire spinlock_t + * while holding raw_spinlock_t, even on !PREEMPT_RT where spinlock_t is a + * spinning lock. Tell lockdep that acquiring spinlock_t is valid here + * by temporarily raising the wait-type to LD_WAIT_CONFIG. + */ +static DEFINE_WAIT_OVERRIDE_MAP(kfree_rcu_sheaf_map, LD_WAIT_CONFIG); + bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj) { struct slub_percpu_sheaves *pcs; struct slab_sheaf *rcu_sheaf; + if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT))) + return false; + + lock_map_acquire_try(&kfree_rcu_sheaf_map); + if (!local_trylock(&s->cpu_sheaves->lock)) goto fail; @@ -6280,6 +5796,12 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj) struct slab_sheaf *empty; struct node_barn *barn; + /* Bootstrap or debug cache, fall back */ + if (unlikely(!cache_has_sheaves(s))) { + local_unlock(&s->cpu_sheaves->lock); + goto fail; + } + if (pcs->spare && pcs->spare->size == 0) { pcs->rcu_free = pcs->spare; pcs->spare = NULL; @@ -6292,7 +5814,7 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj) goto fail; } - empty = barn_get_empty_sheaf(barn); + empty = barn_get_empty_sheaf(barn, true); if (empty) { pcs->rcu_free = empty; @@ -6346,10 +5868,12 @@ do_free: local_unlock(&s->cpu_sheaves->lock); stat(s, FREE_RCU_SHEAF); + lock_map_release(&kfree_rcu_sheaf_map); return true; fail: stat(s, FREE_RCU_SHEAF_FAIL); + lock_map_release(&kfree_rcu_sheaf_map); return false; } @@ -6410,7 +5934,7 @@ next_batch: goto no_empty; if (!pcs->spare) { - empty = barn_get_empty_sheaf(barn); + empty = barn_get_empty_sheaf(barn, true); if (!empty) goto no_empty; @@ -6424,7 +5948,7 @@ next_batch: goto do_free; } - empty = barn_replace_full_sheaf(barn, pcs->main); + empty = barn_replace_full_sheaf(barn, pcs->main, true); if (IS_ERR(empty)) { stat(s, BARN_PUT_FAIL); goto no_empty; @@ -6442,7 +5966,7 @@ do_free: local_unlock(&s->cpu_sheaves->lock); - stat_add(s, FREE_PCS, batch); + stat_add(s, FREE_FASTPATH, batch); if (batch < size) { p += batch; @@ -6464,10 +5988,12 @@ no_empty: */ fallback: __kmem_cache_free_bulk(s, size, p); + stat_add(s, FREE_SLOWPATH, size); flush_remote: if (remote_nr) { __kmem_cache_free_bulk(s, remote_nr, &remote_objects[0]); + stat_add(s, FREE_SLOWPATH, remote_nr); if (i < size) { remote_nr = 0; goto next_remote_batch; @@ -6477,7 +6003,6 @@ flush_remote: struct defer_free { struct llist_head objects; - struct llist_head slabs; struct irq_work work; }; @@ -6485,23 +6010,21 @@ static void free_deferred_objects(struct irq_work *work); static DEFINE_PER_CPU(struct defer_free, defer_free_objects) = { .objects = LLIST_HEAD_INIT(objects), - .slabs = LLIST_HEAD_INIT(slabs), .work = IRQ_WORK_INIT(free_deferred_objects), }; /* * In PREEMPT_RT irq_work runs in per-cpu kthread, so it's safe - * to take sleeping spin_locks from __slab_free() and deactivate_slab(). + * to take sleeping spin_locks from __slab_free(). * In !PREEMPT_RT irq_work will run after local_unlock_irqrestore(). 
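
The deferred-free machinery below leans on two llist properties: producers can push from any context without locks, and the drainer detaches the entire list in a single atomic swap. A userspace model of both halves (struct node and the function names are invented):

#include <stdatomic.h>
#include <stdbool.h>

struct node { struct node *next; };

static _Atomic(struct node *) deferred;

/* llist_add() analogue: returns true if the list was empty, i.e. the
 * caller should queue the irq_work to drain it. */
static bool defer_push(struct node *n)
{
	struct node *old = atomic_load_explicit(&deferred, memory_order_relaxed);

	do {
		n->next = old;
	} while (!atomic_compare_exchange_weak_explicit(&deferred, &old, n,
							memory_order_release,
							memory_order_relaxed));
	return old == NULL;
}

/* llist_del_all() analogue: the drainer takes the whole list in one swap,
 * then walks it at leisure. */
static struct node *drain_all(void)
{
	return atomic_exchange_explicit(&deferred, NULL, memory_order_acquire);
}
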
*/ static void free_deferred_objects(struct irq_work *work) { struct defer_free *df = container_of(work, struct defer_free, work); struct llist_head *objs = &df->objects; - struct llist_head *slabs = &df->slabs; struct llist_node *llnode, *pos, *t; - if (llist_empty(objs) && llist_empty(slabs)) + if (llist_empty(objs)) return; llnode = llist_del_all(objs); @@ -6524,16 +6047,7 @@ static void free_deferred_objects(struct irq_work *work) set_freepointer(s, x, NULL); __slab_free(s, slab, x, x, 1, _THIS_IP_); - } - - llnode = llist_del_all(slabs); - llist_for_each_safe(pos, t, llnode) { - struct slab *slab = container_of(pos, struct slab, llnode); - - if (slab->frozen) - deactivate_slab(slab->slab_cache, slab, slab->flush_freelist); - else - free_slab(slab->slab_cache, slab); + stat(s, FREE_SLOWPATH); } } @@ -6550,19 +6064,6 @@ static void defer_free(struct kmem_cache *s, void *head) irq_work_queue(&df->work); } -static void defer_deactivate_slab(struct slab *slab, void *flush_freelist) -{ - struct defer_free *df; - - slab->flush_freelist = flush_freelist; - - guard(preempt)(); - - df = this_cpu_ptr(&defer_free_objects); - if (llist_add(&slab->llnode, &df->slabs)) - irq_work_queue(&df->work); -} - void defer_free_barrier(void) { int cpu; @@ -6571,99 +6072,6 @@ void defer_free_barrier(void) irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work); } -/* - * Fastpath with forced inlining to produce a kfree and kmem_cache_free that - * can perform fastpath freeing without additional function calls. - * - * The fastpath is only possible if we are freeing to the current cpu slab - * of this processor. This typically the case if we have just allocated - * the item before. - * - * If fastpath is not possible then fall back to __slab_free where we deal - * with all sorts of special processing. - * - * Bulk free of a freelist with several objects (all pointing to the - * same slab) possible by specifying head and tail ptr, plus objects - * count (cnt). Bulk free indicated by tail pointer being set. - */ -static __always_inline void do_slab_free(struct kmem_cache *s, - struct slab *slab, void *head, void *tail, - int cnt, unsigned long addr) -{ - /* cnt == 0 signals that it's called from kfree_nolock() */ - bool allow_spin = cnt; - struct kmem_cache_cpu *c; - unsigned long tid; - void **freelist; - -redo: - /* - * Determine the currently cpus per cpu slab. - * The cpu may change afterward. However that does not matter since - * data is retrieved via this pointer. If we are on the same cpu - * during the cmpxchg then the free will succeed. - */ - c = raw_cpu_ptr(s->cpu_slab); - tid = READ_ONCE(c->tid); - - /* Same with comment on barrier() in __slab_alloc_node() */ - barrier(); - - if (unlikely(slab != c->slab)) { - if (unlikely(!allow_spin)) { - /* - * __slab_free() can locklessly cmpxchg16 into a slab, - * but then it might need to take spin_lock or local_lock - * in put_cpu_partial() for further processing. - * Avoid the complexity and simply add to a deferred list. - */ - defer_free(s, head); - } else { - __slab_free(s, slab, head, tail, cnt, addr); - } - return; - } - - if (unlikely(!allow_spin)) { - if ((in_nmi() || !USE_LOCKLESS_FAST_PATH()) && - local_lock_is_locked(&s->cpu_slab->lock)) { - defer_free(s, head); - return; - } - cnt = 1; /* restore cnt. 
kfree_nolock() frees one object at a time */ - } - - if (USE_LOCKLESS_FAST_PATH()) { - freelist = READ_ONCE(c->freelist); - - set_freepointer(s, tail, freelist); - - if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) { - note_cmpxchg_failure("slab_free", s, tid); - goto redo; - } - } else { - __maybe_unused unsigned long flags = 0; - - /* Update the free list under the local lock */ - local_lock_cpu_slab(s, flags); - c = this_cpu_ptr(s->cpu_slab); - if (unlikely(slab != c->slab)) { - local_unlock_cpu_slab(s, flags); - goto redo; - } - tid = c->tid; - freelist = c->freelist; - - set_freepointer(s, tail, freelist); - c->freelist = head; - c->tid = next_tid(tid); - - local_unlock_cpu_slab(s, flags); - } - stat_add(s, FREE_FASTPATH, cnt); -} - static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab, void *object, unsigned long addr) @@ -6674,14 +6082,14 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false))) return; - if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) || - slab_nid(slab) == numa_mem_id()) - && likely(!slab_test_pfmemalloc(slab))) { - if (likely(free_to_pcs(s, object))) + if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id()) + && likely(!slab_test_pfmemalloc(slab))) { + if (likely(free_to_pcs(s, object, true))) return; } - do_slab_free(s, slab, object, object, 1, addr); + __slab_free(s, slab, object, object, 1, addr); + stat(s, FREE_SLOWPATH); } #ifdef CONFIG_MEMCG @@ -6694,7 +6102,7 @@ void memcg_alloc_abort_single(struct kmem_cache *s, void *object) alloc_tagging_slab_free_hook(s, slab, &object, 1); if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false))) - do_slab_free(s, slab, object, object, 1, _RET_IP_); + __slab_free(s, slab, object, object, 1, _RET_IP_); } #endif @@ -6708,8 +6116,10 @@ void slab_free_bulk(struct kmem_cache *s, struct slab *slab, void *head, * With KASAN enabled slab_free_freelist_hook modifies the freelist * to remove objects, whose reuse must be delayed. 
*/ - if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt))) - do_slab_free(s, slab, head, tail, cnt, addr); + if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt))) { + __slab_free(s, slab, head, tail, cnt, addr); + stat_add(s, FREE_SLOWPATH, cnt); + } } #ifdef CONFIG_SLUB_RCU_DEBUG @@ -6734,42 +6144,41 @@ static void slab_free_after_rcu_debug(struct rcu_head *rcu_head) return; /* resume freeing */ - if (slab_free_hook(s, object, slab_want_init_on_free(s), true)) - do_slab_free(s, slab, object, object, 1, _THIS_IP_); + if (slab_free_hook(s, object, slab_want_init_on_free(s), true)) { + __slab_free(s, slab, object, object, 1, _THIS_IP_); + stat(s, FREE_SLOWPATH); + } } #endif /* CONFIG_SLUB_RCU_DEBUG */ #ifdef CONFIG_KASAN_GENERIC void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) { - do_slab_free(cache, virt_to_slab(x), x, x, 1, addr); + __slab_free(cache, virt_to_slab(x), x, x, 1, addr); + stat(cache, FREE_SLOWPATH); } #endif -static inline struct kmem_cache *virt_to_cache(const void *obj) +static noinline void warn_free_bad_obj(struct kmem_cache *s, void *obj) { + struct kmem_cache *cachep; struct slab *slab; slab = virt_to_slab(obj); - if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n", __func__)) - return NULL; - return slab->slab_cache; -} - -static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) -{ - struct kmem_cache *cachep; + if (WARN_ONCE(!slab, + "kmem_cache_free(%s, %p): object is not in a slab page\n", + s->name, obj)) + return; - if (!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) && - !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) - return s; + cachep = slab->slab_cache; - cachep = virt_to_cache(x); - if (WARN(cachep && cachep != s, - "%s: Wrong slab cache. %s but object is from %s\n", - __func__, s->name, cachep->name)) - print_tracking(cachep, x); - return cachep; + if (WARN_ONCE(cachep != s, + "kmem_cache_free(%s, %p): object belongs to different cache %s\n", + s->name, obj, cachep ? cachep->name : "(NULL)")) { + if (cachep) + print_tracking(cachep, obj); + return; + } } /** @@ -6782,14 +6191,118 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) */ void kmem_cache_free(struct kmem_cache *s, void *x) { - s = cache_from_obj(s, x); - if (!s) - return; + struct slab *slab; + + slab = virt_to_slab(x); + + if (IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) || + kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) { + + /* + * Intentionally leak the object in these cases, because it + * would be too dangerous to continue. + */ + if (unlikely(!slab || (slab->slab_cache != s))) { + warn_free_bad_obj(s, x); + return; + } + } + trace_kmem_cache_free(_RET_IP_, x, s); - slab_free(s, virt_to_slab(x), x, _RET_IP_); + slab_free(s, slab, x, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_free); +static inline size_t slab_ksize(struct slab *slab) +{ + struct kmem_cache *s = slab->slab_cache; + +#ifdef CONFIG_SLUB_DEBUG + /* + * Debugging requires use of the padding between object + * and whatever may come after it. + */ + if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) + return s->object_size; +#endif + if (s->flags & SLAB_KASAN) + return s->object_size; + /* + * If we have the need to store the freelist pointer + * or any other metadata back there then we can + * only use the space before that information. 
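
The slab_ksize() ladder above is compact enough to restate as plain C. A sketch with invented flag names (the real ones are the SLAB_* bits, and the in-object slabobj_ext case is omitted):

#include <stddef.h>

#define F_RED_ZONE	0x01u
#define F_POISON	0x02u
#define F_KASAN		0x04u
#define F_RCU		0x08u
#define F_STORE_USER	0x10u

struct cache_model {
	unsigned int flags;
	size_t object_size;	/* what the caller asked for */
	size_t inuse;		/* offset of metadata after the object */
	size_t size;		/* full per-object slot, incl. alignment */
};

/* How much of the slot may the caller actually use? */
static size_t ksize_model(const struct cache_model *s)
{
	/* debugging owns the padding behind the object */
	if (s->flags & (F_RED_ZONE | F_POISON | F_KASAN))
		return s->object_size;
	/* free pointer / user tracking live right after the object */
	if (s->flags & (F_RCU | F_STORE_USER))
		return s->inuse;
	/* otherwise the whole slot, padding included, is usable */
	return s->size;
}
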
+ */ + if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) + return s->inuse; + else if (obj_exts_in_object(s, slab)) + return s->inuse; + /* + * Else we can use all the padding etc for the allocation + */ + return s->size; +} + +static size_t __ksize(const void *object) +{ + struct page *page; + struct slab *slab; + + if (unlikely(object == ZERO_SIZE_PTR)) + return 0; + + page = virt_to_page(object); + + if (unlikely(PageLargeKmalloc(page))) + return large_kmalloc_size(page); + + slab = page_slab(page); + /* Delete this after we're sure there are no users */ + if (WARN_ON(!slab)) + return page_size(page); + +#ifdef CONFIG_SLUB_DEBUG + skip_orig_size_check(slab->slab_cache, object); +#endif + + return slab_ksize(slab); +} + +/** + * ksize -- Report full size of underlying allocation + * @objp: pointer to the object + * + * This should only be used internally to query the true size of allocations. + * It is not meant to be a way to discover the usable size of an allocation + * after the fact. Instead, use kmalloc_size_roundup(). Using memory beyond + * the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS, + * and/or FORTIFY_SOURCE. + * + * Return: size of the actual memory used by @objp in bytes + */ +size_t ksize(const void *objp) +{ + /* + * We need to first check that the pointer to the object is valid. + * The KASAN report printed from ksize() is more useful, then when + * it's printed later when the behaviour could be undefined due to + * a potential use-after-free or double-free. + * + * We use kasan_check_byte(), which is supported for the hardware + * tag-based KASAN mode, unlike kasan_check_read/write(). + * + * If the pointed to memory is invalid, we return 0 to avoid users of + * ksize() writing to and potentially corrupting the memory region. + * + * We want to perform the check before __ksize(), to avoid potentially + * crashing in __ksize() due to accessing invalid metadata. + */ + if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp)) + return 0; + + return kfence_ksize(objp) ?: __ksize(objp); +} +EXPORT_SYMBOL(ksize); + static void free_large_kmalloc(struct page *page, void *object) { unsigned int order = compound_order(page); @@ -6942,7 +6455,18 @@ void kfree_nolock(const void *object) * since kasan quarantine takes locks and not supported from NMI. */ kasan_slab_free(s, x, false, false, /* skip quarantine */true); - do_slab_free(s, slab, x, x, 0, _RET_IP_); + + if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id())) { + if (likely(free_to_pcs(s, x, false))) + return; + } + + /* + * __slab_free() can locklessly cmpxchg16 into a slab, but then it might + * need to take spin_lock for further processing. + * Avoid the complexity and simply add to a deferred list. 
+ */ + defer_free(s, x); } EXPORT_SYMBOL_GPL(kfree_nolock); @@ -7313,7 +6837,7 @@ int build_detached_freelist(struct kmem_cache *s, size_t size, df->s = slab->slab_cache; } else { df->slab = slab; - df->s = cache_from_obj(s, object); /* Support for memcg */ + df->s = s; } /* Start new detached freelist */ @@ -7368,7 +6892,7 @@ static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) if (kfence_free(df.freelist)) continue; - do_slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt, + __slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt, _RET_IP_); } while (likely(size)); } @@ -7383,7 +6907,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) * freeing to sheaves is so incompatible with the detached freelist so * once we go that way, we have to do everything differently */ - if (s && s->cpu_sheaves) { + if (s && cache_has_sheaves(s)) { free_to_pcs_bulk(s, size, p); return; } @@ -7401,72 +6925,224 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) } EXPORT_SYMBOL(kmem_cache_free_bulk); -static inline -int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - void **p) +static unsigned int +__refill_objects_node(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, + unsigned int max, struct kmem_cache_node *n, + bool allow_spin) { - struct kmem_cache_cpu *c; - unsigned long irqflags; - int i; + struct partial_bulk_context pc; + struct slab *slab, *slab2; + unsigned int refilled = 0; + unsigned long flags; + void *object; + + pc.flags = gfp; + pc.min_objects = min; + pc.max_objects = max; + + if (!get_partial_node_bulk(s, n, &pc, allow_spin)) + return 0; + + list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) { + + list_del(&slab->slab_list); + + object = get_freelist_nofreeze(s, slab); + + while (object && refilled < max) { + p[refilled] = object; + object = get_freepointer(s, object); + maybe_wipe_obj_freeptr(s, p[refilled]); + + refilled++; + } + + /* + * Freelist had more objects than we can accommodate, we need to + * free them back. We can treat it like a detached freelist, just + * need to find the tail object. 
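
Finding the tail is a single forward walk over the leftover freelist, producing exactly the (head, tail, cnt) triple that one bulk __slab_free() call wants. Sketched in userspace (struct obj is invented):

#include <stddef.h>

struct obj { struct obj *next; };

struct span { struct obj *head, *tail; int cnt; };

/* Walk a linked freelist once to learn its tail and length, so the
 * whole thing can be handed to a single bulk-free call. */
static struct span span_of(struct obj *head)
{
	struct span s = { .head = head, .tail = NULL, .cnt = 0 };

	for (struct obj *o = head; o; o = o->next) {
		s.tail = o;
		s.cnt++;
	}
	return s;
}
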
+ */ + if (unlikely(object)) { + void *head = object; + void *tail; + int cnt = 0; + + do { + tail = object; + cnt++; + object = get_freepointer(s, object); + } while (object); + __slab_free(s, slab, head, tail, cnt, _RET_IP_); + } + + if (refilled >= max) + break; + } + + if (unlikely(!list_empty(&pc.slabs))) { + spin_lock_irqsave(&n->list_lock, flags); + + list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) { + + if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) + continue; + + list_del(&slab->slab_list); + add_partial(n, slab, ADD_TO_HEAD); + } + + spin_unlock_irqrestore(&n->list_lock, flags); + + /* any slabs left are completely free and for discard */ + list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) { + + list_del(&slab->slab_list); + discard_slab(s, slab); + } + } + + return refilled; +} + +#ifdef CONFIG_NUMA +static unsigned int +__refill_objects_any(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, + unsigned int max) +{ + struct zonelist *zonelist; + struct zoneref *z; + struct zone *zone; + enum zone_type highest_zoneidx = gfp_zone(gfp); + unsigned int cpuset_mems_cookie; + unsigned int refilled = 0; + + /* see get_from_any_partial() for the defrag ratio description */ + if (!s->remote_node_defrag_ratio || + get_cycles() % 1024 > s->remote_node_defrag_ratio) + return 0; + + do { + cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist = node_zonelist(mempolicy_slab_node(), gfp); + for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { + struct kmem_cache_node *n; + unsigned int r; + + n = get_node(s, zone_to_nid(zone)); + + if (!n || !cpuset_zone_allowed(zone, gfp) || + n->nr_partial <= s->min_partial) + continue; + + r = __refill_objects_node(s, p, gfp, min, max, n, + /* allow_spin = */ false); + refilled += r; + + if (r >= min) { + /* + * Don't check read_mems_allowed_retry() here - + * if mems_allowed was updated in parallel, that + * was a harmless race between allocation and + * the cpuset update + */ + return refilled; + } + p += r; + min -= r; + max -= r; + } + } while (read_mems_allowed_retry(cpuset_mems_cookie)); + + return refilled; +} +#else +static inline unsigned int +__refill_objects_any(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, + unsigned int max) +{ + return 0; +} +#endif + +static unsigned int +refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, + unsigned int max) +{ + int local_node = numa_mem_id(); + unsigned int refilled; + struct slab *slab; + + if (WARN_ON_ONCE(!gfpflags_allow_spinning(gfp))) + return 0; + + refilled = __refill_objects_node(s, p, gfp, min, max, + get_node(s, local_node), + /* allow_spin = */ true); + if (refilled >= min) + return refilled; + + refilled += __refill_objects_any(s, p + refilled, gfp, min - refilled, + max - refilled); + if (refilled >= min) + return refilled; + +new_slab: + + slab = new_slab(s, gfp, local_node); + if (!slab) + goto out; + + stat(s, ALLOC_SLAB); /* - * Drain objects in the per cpu slab, while disabling local - * IRQs, which protects against PREEMPT and interrupts - * handlers invoking normal fastpath. + * TODO: possible optimization - if we know we will consume the whole + * slab we might skip creating the freelist? 
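
Stepping back, refill_objects() is a three-stage cascade: local partial slabs, then remote nodes (rate-limited by the defrag ratio in the real code), then brand-new slabs until at least min objects are in hand. A sketch with stub sources whose return values are arbitrary; the real stages can also fail outright, in which case the function returns whatever it collected:

/* Stubs standing in for the three object sources; each returns how
 * many objects it produced, up to max. */
static unsigned int from_local_node(unsigned int max)   { return max / 2; }
static unsigned int from_remote_nodes(unsigned int max) { return 0; }
static unsigned int from_new_slab(unsigned int max)     { return max; }

static unsigned int refill_model(unsigned int min, unsigned int max)
{
	unsigned int got = from_local_node(max);

	if (got >= min)
		return got;
	got += from_remote_nodes(max - got);
	while (got < min)
		got += from_new_slab(max - got);
	return got;
}
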
*/ - c = slub_get_cpu_ptr(s->cpu_slab); - local_lock_irqsave(&s->cpu_slab->lock, irqflags); + refilled += alloc_from_new_slab(s, slab, p + refilled, max - refilled, + /* allow_spin = */ true); - for (i = 0; i < size; i++) { - void *object = c->freelist; + if (refilled < min) + goto new_slab; - if (unlikely(!object)) { - /* - * We may have removed an object from c->freelist using - * the fastpath in the previous iteration; in that case, - * c->tid has not been bumped yet. - * Since ___slab_alloc() may reenable interrupts while - * allocating memory, we should bump c->tid now. - */ - c->tid = next_tid(c->tid); +out: + return refilled; +} - local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); +static inline +int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + int i; - /* - * Invoking slow path likely have side-effect - * of re-populating per CPU c->freelist - */ - p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, - _RET_IP_, c, s->object_size); + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { + for (i = 0; i < size; i++) { + + p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_, + s->object_size); if (unlikely(!p[i])) goto error; - c = this_cpu_ptr(s->cpu_slab); maybe_wipe_obj_freeptr(s, p[i]); - - local_lock_irqsave(&s->cpu_slab->lock, irqflags); - - continue; /* goto for-loop */ } - c->freelist = get_freepointer(s, object); - p[i] = object; - maybe_wipe_obj_freeptr(s, p[i]); - stat(s, ALLOC_FASTPATH); + } else { + i = refill_objects(s, p, flags, size, size); + if (i < size) + goto error; + stat_add(s, ALLOC_SLOWPATH, i); } - c->tid = next_tid(c->tid); - local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); - slub_put_cpu_ptr(s->cpu_slab); return i; error: - slub_put_cpu_ptr(s->cpu_slab); __kmem_cache_free_bulk(s, i, p); return 0; } -/* Note that interrupts must be enabled when calling this function. */ +/* + * Note that interrupts must be enabled when calling this function and gfp + * flags must allow spinning. + */ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, void **p) { @@ -7494,8 +7170,7 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, size--; } - if (s->cpu_sheaves) - i = alloc_from_pcs_bulk(s, size, p); + i = alloc_from_pcs_bulk(s, flags, size, p); if (i < size) { /* @@ -7683,29 +7358,25 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct node_barn *barn) barn_init(barn); } -static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) +#ifdef CONFIG_SLUB_STATS +static inline int alloc_kmem_cache_stats(struct kmem_cache *s) { BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < NR_KMALLOC_TYPES * KMALLOC_SHIFT_HIGH * - sizeof(struct kmem_cache_cpu)); + sizeof(struct kmem_cache_stats)); - /* - * Must align to double word boundary for the double cmpxchg - * instructions to work; see __pcpu_double_call_return_bool(). - */ - s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), - 2 * sizeof(void *)); + s->cpu_stats = alloc_percpu(struct kmem_cache_stats); - if (!s->cpu_slab) + if (!s->cpu_stats) return 0; - init_kmem_cache_cpus(s); - return 1; } +#endif static int init_percpu_sheaves(struct kmem_cache *s) { + static struct slab_sheaf bootstrap_sheaf = {}; int cpu; for_each_possible_cpu(cpu) { @@ -7715,7 +7386,28 @@ static int init_percpu_sheaves(struct kmem_cache *s) local_trylock_init(&pcs->lock); - pcs->main = alloc_empty_sheaf(s, GFP_KERNEL); + /* + * Bootstrap sheaf has zero size so fast-path allocation fails. 
+		 * It also has size == s->sheaf_capacity, so fast-path free
+		 * fails. In the slow paths we recognize the situation by
+		 * checking s->sheaf_capacity. This allows fast paths to assume
+		 * s->cpu_sheaves and pcs->main always exist and are valid.
+		 * It's also safe to share the single static bootstrap_sheaf
+		 * with a zero-sized objects array as it's never modified.
+		 *
+		 * bootstrap_sheaf also has a NULL pointer to kmem_cache so we
+		 * recognize it and do not attempt to free it when destroying
+		 * the cache.
+		 *
+		 * We keep bootstrap_sheaf for kmem_cache and kmem_cache_node,
+		 * caches with debug enabled, and all caches with SLUB_TINY.
+		 * For kmalloc caches it's used temporarily during the initial
+		 * bootstrap.
+		 */
+		if (!s->sheaf_capacity)
+			pcs->main = &bootstrap_sheaf;
+		else
+			pcs->main = alloc_empty_sheaf(s, GFP_KERNEL);
 		if (!pcs->main)
 			return -ENOMEM;
@@ -7766,7 +7458,7 @@ static void early_kmem_cache_node_alloc(int node)
 	 * No locks need to be taken here as it has just been
 	 * initialized and there is no concurrent access.
 	 */
-	__add_partial(n, slab, DEACTIVATE_TO_HEAD);
+	__add_partial(n, slab, ADD_TO_HEAD);
 }
 
 static void free_kmem_cache_nodes(struct kmem_cache *s)
@@ -7790,13 +7482,10 @@ void __kmem_cache_release(struct kmem_cache *s)
 {
 	cache_random_seq_destroy(s);
-	if (s->cpu_sheaves)
-		pcs_destroy(s);
-#ifdef CONFIG_PREEMPT_RT
-	if (s->cpu_slab)
-		lockdep_unregister_key(&s->lock_key);
+	pcs_destroy(s);
+#ifdef CONFIG_SLUB_STATS
+	free_percpu(s->cpu_stats);
 #endif
-	free_percpu(s->cpu_slab);
 	free_kmem_cache_nodes(s);
 }
@@ -7813,7 +7502,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s)
 			continue;
 		}
 
-		if (s->cpu_sheaves) {
+		if (cache_has_sheaves(s)) {
 			barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node);
 
 			if (!barn)
@@ -7834,37 +7523,51 @@
 	return 1;
 }
 
-static void set_cpu_partial(struct kmem_cache *s)
+static unsigned int calculate_sheaf_capacity(struct kmem_cache *s,
+					     struct kmem_cache_args *args)
 {
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-	unsigned int nr_objects;
+	unsigned int capacity;
+	size_t size;
+
+	if (IS_ENABLED(CONFIG_SLUB_TINY) || s->flags & SLAB_DEBUG_FLAGS)
+		return 0;
 
 	/*
-	 * cpu_partial determined the maximum number of objects kept in the
-	 * per cpu partial lists of a processor.
-	 *
-	 * Per cpu partial lists mainly contain slabs that just have one
-	 * object freed. If they are used for allocation then they can be
-	 * filled up again with minimal effort. The slab will never hit the
-	 * per node partial lists and therefore no locking will be required.
-	 *
-	 * For backwards compatibility reasons, this is determined as number
-	 * of objects, even though we now limit maximum number of pages, see
-	 * slub_set_cpu_partial()
+	 * Bootstrap caches can't have sheaves for now (SLAB_NO_OBJ_EXT).
+	 * SLAB_NOLEAKTRACE caches (e.g., kmemleak's object_cache) must not
+	 * have sheaves to avoid recursion when sheaf allocation triggers
+	 * kmemleak tracking.
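
The sentinel trick described in the comment above compresses into a few lines. A userspace sketch, with the flexible objects array reduced to a dummy: a zero-size sheaf fails the allocation fast path, and because the owning cache's capacity is also zero, size == capacity makes the free fast path fail too, so both land in the slow path where the sentinel is recognized:

#include <stdbool.h>
#include <stddef.h>

struct sheaf { size_t size; void *objs[1]; };

/* One static sentinel shared by every cache without real sheaves;
 * its size is 0 and the owning cache's sheaf_capacity is also 0. */
static struct sheaf bootstrap_sheaf;

static void *alloc_fast(struct sheaf *main)
{
	if (main->size == 0)		/* always true for the sentinel */
		return NULL;		/* -> slow path */
	return main->objs[--main->size];
}

static bool free_fast(struct sheaf *main, size_t capacity, void *obj)
{
	if (main->size == capacity)	/* 0 == 0 for the sentinel */
		return false;		/* -> slow path */
	main->objs[main->size++] = obj;
	return true;
}
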
+ */ + if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE)) + return 0; + + /* + * For now we use roughly similar formula (divided by two as there are + * two percpu sheaves) as what was used for percpu partial slabs, which + * should result in similar lock contention (barn or list_lock) */ - if (!kmem_cache_has_cpu_partial(s)) - nr_objects = 0; - else if (s->size >= PAGE_SIZE) - nr_objects = 6; + if (s->size >= PAGE_SIZE) + capacity = 4; else if (s->size >= 1024) - nr_objects = 24; + capacity = 12; else if (s->size >= 256) - nr_objects = 52; + capacity = 26; else - nr_objects = 120; + capacity = 60; - slub_set_cpu_partial(s, nr_objects); -#endif + /* Increment capacity to make sheaf exactly a kmalloc size bucket */ + size = struct_size_t(struct slab_sheaf, objects, capacity); + size = kmalloc_size_roundup(size); + capacity = (size - struct_size_t(struct slab_sheaf, objects, 0)) / sizeof(void *); + + /* + * Respect an explicit request for capacity that's typically motivated by + * expected maximum size of kmem_cache_prefill_sheaf() to not end up + * using low-performance oversize sheaves + */ + return max(capacity, args->sheaf_capacity); } /* @@ -7875,6 +7578,7 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) { slab_flags_t flags = s->flags; unsigned int size = s->object_size; + unsigned int aligned_size; unsigned int order; /* @@ -7898,9 +7602,9 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) /* - * If we are Redzoning then check if there is some space between the - * end of the object and the free pointer. If not then add an - * additional word to have some bytes to store Redzone information. + * If we are Redzoning and there is no space between the end of the + * object and the following fields, add one word so the right Redzone + * is non-empty. */ if ((flags & SLAB_RED_ZONE) && size == s->object_size) size += sizeof(void *); @@ -7913,7 +7617,8 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) s->inuse = size; if (((flags & SLAB_TYPESAFE_BY_RCU) && !args->use_freeptr_offset) || - (flags & SLAB_POISON) || s->ctor || + (flags & SLAB_POISON) || + (s->ctor && !args->use_freeptr_offset) || ((flags & SLAB_RED_ZONE) && (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) { /* @@ -7934,7 +7639,8 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) */ s->offset = size; size += sizeof(void *); - } else if ((flags & SLAB_TYPESAFE_BY_RCU) && args->use_freeptr_offset) { + } else if (((flags & SLAB_TYPESAFE_BY_RCU) || s->ctor) && + args->use_freeptr_offset) { s->offset = args->freeptr_offset; } else { /* @@ -7955,7 +7661,7 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) /* Save the original kmalloc request size */ if (flags & SLAB_KMALLOC) - size += sizeof(unsigned int); + size += sizeof(unsigned long); } #endif @@ -7982,7 +7688,14 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) * offset 0. In order to align the objects we have to simply size * each object to conform to the alignment. 
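
The "fill the bucket exactly" step is plain arithmetic. A sketch using a power-of-two stand-in for kmalloc_size_roundup() (the real buckets are not all powers of two, and sheaf_hdr is an invented stand-in for struct slab_sheaf's header):

#include <stddef.h>

struct sheaf_hdr { size_t size; };	/* bookkeeping before objects[] */

/* Stand-in for kmalloc_size_roundup(): next power of two. */
static size_t roundup_bucket(size_t n)
{
	size_t b = 16;

	while (b < n)
		b <<= 1;
	return b;
}

/* Grow the capacity so the sheaf fills its kmalloc bucket exactly,
 * instead of leaving the bucket's slack unused. */
static unsigned int round_capacity(unsigned int capacity)
{
	size_t size = sizeof(struct sheaf_hdr) + capacity * sizeof(void *);

	size = roundup_bucket(size);
	return (size - sizeof(struct sheaf_hdr)) / sizeof(void *);
}

For example, a nominal capacity of 60 gives 8 + 480 = 488 bytes, which rounds up to 512, and the recomputed capacity becomes 63.
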
*/ - size = ALIGN(size, s->align); + aligned_size = ALIGN(size, s->align); +#if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT) + if (slab_args_unmergeable(args, s->flags) && + (aligned_size - size >= sizeof(struct slabobj_ext))) + s->flags |= SLAB_OBJ_EXT_IN_OBJ; +#endif + size = aligned_size; + s->size = size; s->reciprocal_size = reciprocal_value(size); order = calculate_order(size); @@ -8002,6 +7715,13 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) s->allocflags |= __GFP_RECLAIMABLE; /* + * For KMALLOC_NORMAL caches we enable sheaves later by + * bootstrap_kmalloc_sheaves() to avoid recursion + */ + if (!is_kmalloc_normal(s)) + s->sheaf_capacity = calculate_sheaf_capacity(s, args); + + /* * Determine the number of objects per slab */ s->oo = oo_make(order, size); @@ -8085,7 +7805,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s) flush_all_cpus_locked(s); /* we might have rcu sheaves in flight */ - if (s->cpu_sheaves) + if (cache_has_sheaves(s)) rcu_barrier(); /* Attempt to free all objects */ @@ -8397,7 +8117,7 @@ static int slab_mem_going_online_callback(int nid) if (get_node(s, nid)) continue; - if (s->cpu_sheaves) { + if (cache_has_sheaves(s)) { barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, nid); if (!barn) { @@ -8472,12 +8192,6 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) memcpy(s, static_cache, kmem_cache->object_size); - /* - * This runs very early, and only the boot processor is supposed to be - * up. Even if it weren't true, IRQs are not up so we couldn't fire - * IPIs around. - */ - __flush_cpu_slab(s, smp_processor_id()); for_each_kmem_cache_node(s, node, n) { struct slab *p; @@ -8493,6 +8207,74 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) return s; } +/* + * Finish the sheaves initialization done normally by init_percpu_sheaves() and + * init_kmem_cache_nodes(). For normal kmalloc caches we have to bootstrap it + * since sheaves and barns are allocated by kmalloc. 
+ */ +static void __init bootstrap_cache_sheaves(struct kmem_cache *s) +{ + struct kmem_cache_args empty_args = {}; + unsigned int capacity; + bool failed = false; + int node, cpu; + + capacity = calculate_sheaf_capacity(s, &empty_args); + + /* capacity can be 0 due to debugging or SLUB_TINY */ + if (!capacity) + return; + + for_each_node_mask(node, slab_nodes) { + struct node_barn *barn; + + barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node); + + if (!barn) { + failed = true; + goto out; + } + + barn_init(barn); + get_node(s, node)->barn = barn; + } + + for_each_possible_cpu(cpu) { + struct slub_percpu_sheaves *pcs; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + pcs->main = __alloc_empty_sheaf(s, GFP_KERNEL, capacity); + + if (!pcs->main) { + failed = true; + break; + } + } + +out: + /* + * It's still early in boot so treat this like same as a failure to + * create the kmalloc cache in the first place + */ + if (failed) + panic("Out of memory when creating kmem_cache %s\n", s->name); + + s->sheaf_capacity = capacity; +} + +static void __init bootstrap_kmalloc_sheaves(void) +{ + enum kmalloc_cache_type type; + + for (type = KMALLOC_NORMAL; type <= KMALLOC_RANDOM_END; type++) { + for (int idx = 0; idx < KMALLOC_SHIFT_HIGH + 1; idx++) { + if (kmalloc_caches[type][idx]) + bootstrap_cache_sheaves(kmalloc_caches[type][idx]); + } + } +} + void __init kmem_cache_init(void) { static __initdata struct kmem_cache boot_kmem_cache, @@ -8536,6 +8318,8 @@ void __init kmem_cache_init(void) setup_kmalloc_cache_index_table(); create_kmalloc_caches(); + bootstrap_kmalloc_sheaves(); + /* Setup random freelists for each cache */ init_freelist_randomization(); @@ -8554,31 +8338,6 @@ void __init kmem_cache_init_late(void) WARN_ON(!flushwq); } -struct kmem_cache * -__kmem_cache_alias(const char *name, unsigned int size, unsigned int align, - slab_flags_t flags, void (*ctor)(void *)) -{ - struct kmem_cache *s; - - s = find_mergeable(size, align, flags, name, ctor); - if (s) { - if (sysfs_slab_alias(s, name)) - pr_err("SLUB: Unable to add cache alias %s to sysfs\n", - name); - - s->refcount++; - - /* - * Adjust the object sizes so that we clear - * the complete object on kzalloc. - */ - s->object_size = max(s->object_size, size); - s->inuse = max(s->inuse, ALIGN(size, sizeof(void *))); - } - - return s; -} - int do_kmem_cache_create(struct kmem_cache *s, const char *name, unsigned int size, struct kmem_cache_args *args, slab_flags_t flags) @@ -8628,17 +8387,10 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name, s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2); s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial); - set_cpu_partial(s); - - if (args->sheaf_capacity && !IS_ENABLED(CONFIG_SLUB_TINY) - && !(s->flags & SLAB_DEBUG_FLAGS)) { - s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves); - if (!s->cpu_sheaves) { - err = -ENOMEM; - goto out; - } - // TODO: increase capacity to grow slab_sheaf up to next kmalloc size? 
- s->sheaf_capacity = args->sheaf_capacity; + s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves); + if (!s->cpu_sheaves) { + err = -ENOMEM; + goto out; } #ifdef CONFIG_NUMA @@ -8654,14 +8406,14 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name, if (!init_kmem_cache_nodes(s)) goto out; - if (!alloc_kmem_cache_cpus(s)) +#ifdef CONFIG_SLUB_STATS + if (!alloc_kmem_cache_stats(s)) goto out; +#endif - if (s->cpu_sheaves) { - err = init_percpu_sheaves(s); - if (err) - goto out; - } + err = init_percpu_sheaves(s); + if (err) + goto out; err = 0; @@ -8976,47 +8728,6 @@ static ssize_t show_slab_objects(struct kmem_cache *s, if (!nodes) return -ENOMEM; - if (flags & SO_CPU) { - int cpu; - - for_each_possible_cpu(cpu) { - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, - cpu); - int node; - struct slab *slab; - - slab = READ_ONCE(c->slab); - if (!slab) - continue; - - node = slab_nid(slab); - if (flags & SO_TOTAL) - x = slab->objects; - else if (flags & SO_OBJECTS) - x = slab->inuse; - else - x = 1; - - total += x; - nodes[node] += x; - -#ifdef CONFIG_SLUB_CPU_PARTIAL - slab = slub_percpu_partial_read_once(c); - if (slab) { - node = slab_nid(slab); - if (flags & SO_TOTAL) - WARN_ON_ONCE(1); - else if (flags & SO_OBJECTS) - WARN_ON_ONCE(1); - else - x = data_race(slab->slabs); - total += x; - nodes[node] += x; - } -#endif - } - } - /* * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex" * already held which will conflict with an existing lock order: @@ -9148,12 +8859,7 @@ SLAB_ATTR(min_partial); static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) { - unsigned int nr_partial = 0; -#ifdef CONFIG_SLUB_CPU_PARTIAL - nr_partial = s->cpu_partial; -#endif - - return sysfs_emit(buf, "%u\n", nr_partial); + return sysfs_emit(buf, "0\n"); } static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, @@ -9165,11 +8871,9 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, err = kstrtouint(buf, 10, &objects); if (err) return err; - if (objects && !kmem_cache_has_cpu_partial(s)) + if (objects) return -EINVAL; - slub_set_cpu_partial(s, objects); - flush_all(s); return length; } SLAB_ATTR(cpu_partial); @@ -9208,42 +8912,7 @@ SLAB_ATTR_RO(objects_partial); static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) { - int objects = 0; - int slabs = 0; - int cpu __maybe_unused; - int len = 0; - -#ifdef CONFIG_SLUB_CPU_PARTIAL - for_each_online_cpu(cpu) { - struct slab *slab; - - slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); - - if (slab) - slabs += data_race(slab->slabs); - } -#endif - - /* Approximate half-full slabs, see slub_set_cpu_partial() */ - objects = (slabs * oo_objects(s->oo)) / 2; - len += sysfs_emit_at(buf, len, "%d(%d)", objects, slabs); - -#ifdef CONFIG_SLUB_CPU_PARTIAL - for_each_online_cpu(cpu) { - struct slab *slab; - - slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); - if (slab) { - slabs = data_race(slab->slabs); - objects = (slabs * oo_objects(s->oo)) / 2; - len += sysfs_emit_at(buf, len, " C%d=%d(%d)", - cpu, objects, slabs); - } - } -#endif - len += sysfs_emit_at(buf, len, "\n"); - - return len; + return sysfs_emit(buf, "0(0)\n"); } SLAB_ATTR_RO(slabs_cpu_partial); @@ -9429,7 +9098,7 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) return -ENOMEM; for_each_online_cpu(cpu) { - unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si]; + unsigned int x = per_cpu_ptr(s->cpu_stats, cpu)->stat[si]; data[cpu] = x; sum += x; @@ -9455,7 +9124,7 @@ static 
void clear_stat(struct kmem_cache *s, enum stat_item si) int cpu; for_each_online_cpu(cpu) - per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0; + per_cpu_ptr(s->cpu_stats, cpu)->stat[si] = 0; } #define STAT_ATTR(si, text) \ @@ -9473,36 +9142,19 @@ static ssize_t text##_store(struct kmem_cache *s, \ } \ SLAB_ATTR(text); \ -STAT_ATTR(ALLOC_PCS, alloc_cpu_sheaf); STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); -STAT_ATTR(FREE_PCS, free_cpu_sheaf); STAT_ATTR(FREE_RCU_SHEAF, free_rcu_sheaf); STAT_ATTR(FREE_RCU_SHEAF_FAIL, free_rcu_sheaf_fail); STAT_ATTR(FREE_FASTPATH, free_fastpath); STAT_ATTR(FREE_SLOWPATH, free_slowpath); -STAT_ATTR(FREE_FROZEN, free_frozen); STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial); STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); -STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); STAT_ATTR(ALLOC_SLAB, alloc_slab); -STAT_ATTR(ALLOC_REFILL, alloc_refill); STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch); STAT_ATTR(FREE_SLAB, free_slab); -STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); -STAT_ATTR(DEACTIVATE_FULL, deactivate_full); -STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); -STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); -STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); -STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); -STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); STAT_ATTR(ORDER_FALLBACK, order_fallback); -STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); -STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); -STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); -STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); -STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); STAT_ATTR(SHEAF_FLUSH, sheaf_flush); STAT_ATTR(SHEAF_REFILL, sheaf_refill); STAT_ATTR(SHEAF_ALLOC, sheaf_alloc); @@ -9578,36 +9230,19 @@ static struct attribute *slab_attrs[] = { &remote_node_defrag_ratio_attr.attr, #endif #ifdef CONFIG_SLUB_STATS - &alloc_cpu_sheaf_attr.attr, &alloc_fastpath_attr.attr, &alloc_slowpath_attr.attr, - &free_cpu_sheaf_attr.attr, &free_rcu_sheaf_attr.attr, &free_rcu_sheaf_fail_attr.attr, &free_fastpath_attr.attr, &free_slowpath_attr.attr, - &free_frozen_attr.attr, &free_add_partial_attr.attr, &free_remove_partial_attr.attr, - &alloc_from_partial_attr.attr, &alloc_slab_attr.attr, - &alloc_refill_attr.attr, &alloc_node_mismatch_attr.attr, &free_slab_attr.attr, - &cpuslab_flush_attr.attr, - &deactivate_full_attr.attr, - &deactivate_empty_attr.attr, - &deactivate_to_head_attr.attr, - &deactivate_to_tail_attr.attr, - &deactivate_remote_frees_attr.attr, - &deactivate_bypass_attr.attr, &order_fallback_attr.attr, &cmpxchg_double_fail_attr.attr, - &cmpxchg_double_cpu_fail_attr.attr, - &cpu_partial_alloc_attr.attr, - &cpu_partial_free_attr.attr, - &cpu_partial_node_attr.attr, - &cpu_partial_drain_attr.attr, &sheaf_flush_attr.attr, &sheaf_refill_attr.attr, &sheaf_alloc_attr.attr, @@ -9811,7 +9446,7 @@ struct saved_alias { static struct saved_alias *alias_list; -static int sysfs_slab_alias(struct kmem_cache *s, const char *name) +int sysfs_slab_alias(struct kmem_cache *s, const char *name) { struct saved_alias *al; |
