-rw-r--r--   fs/ext4/super.c          19
-rw-r--r--   include/linux/slab.h     40
-rw-r--r--   mm/Kconfig               11
-rw-r--r--   mm/internal.h             1
-rw-r--r--   mm/memcontrol.c          31
-rw-r--r--   mm/page_alloc.c           5
-rw-r--r--   mm/slab.h               213
-rw-r--r--   mm/slab_common.c        153
-rw-r--r--   mm/slub.c              3203
9 files changed, 1685 insertions, 1991 deletions
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a8d2460b527a..3c73b982a4f7 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1496,12 +1496,19 @@ static void init_once(void *foo)
static int __init init_inodecache(void)
{
- ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache",
- sizeof(struct ext4_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
- offsetof(struct ext4_inode_info, i_data),
- sizeof_field(struct ext4_inode_info, i_data),
- init_once);
+ struct kmem_cache_args args = {
+ .useroffset = offsetof(struct ext4_inode_info, i_data),
+ .usersize = sizeof_field(struct ext4_inode_info, i_data),
+ .use_freeptr_offset = true,
+ .freeptr_offset = offsetof(struct ext4_inode_info, i_flags),
+ .ctor = init_once,
+ };
+
+ ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
+ sizeof(struct ext4_inode_info),
+ &args,
+ SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT);
+
if (ext4_inode_cachep == NULL)
return -ENOMEM;
return 0;
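
For reference, a minimal sketch of the struct kmem_cache_args creation interface used in the hunk above. The cache name, struct foo, foo_ctor and the reserved field are hypothetical; the field chosen for freeptr_offset is assumed not to be initialized by the constructor:

	struct kmem_cache_args args = {
		.ctor = foo_ctor,
		/* reuse a field the ctor leaves untouched for the free pointer */
		.use_freeptr_offset = true,
		.freeptr_offset = offsetof(struct foo, reserved),
	};

	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
				       &args, SLAB_ACCOUNT);
	if (!foo_cachep)
		return -ENOMEM;
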
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 7701b38cedec..c5fde8740281 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -58,8 +58,9 @@ enum _slab_flag_bits {
#endif
_SLAB_OBJECT_POISON,
_SLAB_CMPXCHG_DOUBLE,
-#ifdef CONFIG_SLAB_OBJ_EXT
_SLAB_NO_OBJ_EXT,
+#if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT)
+ _SLAB_OBJ_EXT_IN_OBJ,
#endif
_SLAB_FLAGS_LAST_BIT
};
@@ -239,10 +240,12 @@ enum _slab_flag_bits {
#define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */
/* Slab created using create_boot_cache */
-#ifdef CONFIG_SLAB_OBJ_EXT
#define SLAB_NO_OBJ_EXT __SLAB_FLAG_BIT(_SLAB_NO_OBJ_EXT)
+
+#if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT)
+#define SLAB_OBJ_EXT_IN_OBJ __SLAB_FLAG_BIT(_SLAB_OBJ_EXT_IN_OBJ)
#else
-#define SLAB_NO_OBJ_EXT __SLAB_FLAG_UNUSED
+#define SLAB_OBJ_EXT_IN_OBJ __SLAB_FLAG_UNUSED
#endif
/*
@@ -300,24 +303,26 @@ struct kmem_cache_args {
unsigned int usersize;
/**
* @freeptr_offset: Custom offset for the free pointer
- * in &SLAB_TYPESAFE_BY_RCU caches
+ * in caches with &SLAB_TYPESAFE_BY_RCU or @ctor
*
- * By default &SLAB_TYPESAFE_BY_RCU caches place the free pointer
- * outside of the object. This might cause the object to grow in size.
- * Cache creators that have a reason to avoid this can specify a custom
- * free pointer offset in their struct where the free pointer will be
- * placed.
+ * By default, &SLAB_TYPESAFE_BY_RCU and @ctor caches place the free
+ * pointer outside of the object. This might cause the object to grow
+ * in size. Cache creators that have a reason to avoid this can specify
+ * a custom free pointer offset in their data structure where the free
+ * pointer will be placed.
*
- * Note that placing the free pointer inside the object requires the
- * caller to ensure that no fields are invalidated that are required to
- * guard against object recycling (See &SLAB_TYPESAFE_BY_RCU for
- * details).
+ * For caches with &SLAB_TYPESAFE_BY_RCU, the caller must ensure that
+ * the free pointer does not overlay fields required to guard against
+ * object recycling (See &SLAB_TYPESAFE_BY_RCU for details).
*
- * Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset
- * is specified, %use_freeptr_offset must be set %true.
+ * For caches with @ctor, the caller must ensure that the free pointer
+ * does not overlay fields initialized by the constructor.
*
- * Note that @ctor currently isn't supported with custom free pointers
- * as a @ctor requires an external free pointer.
+ * Currently, only caches with &SLAB_TYPESAFE_BY_RCU or @ctor
+ * may specify @freeptr_offset.
+ *
+ * Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset
+ * is specified, @use_freeptr_offset must be set %true.
*/
unsigned int freeptr_offset;
/**
@@ -508,7 +513,6 @@ void * __must_check krealloc_node_align_noprof(const void *objp, size_t new_size
void kfree(const void *objp);
void kfree_nolock(const void *objp);
void kfree_sensitive(const void *objp);
-size_t __ksize(const void *objp);
DEFINE_FREE(kfree, void *, if (!IS_ERR_OR_NULL(_T)) kfree(_T))
DEFINE_FREE(kfree_sensitive, void *, if (_T) kfree_sensitive(_T))
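
A hedged sketch of the other case documented for @freeptr_offset above, &SLAB_TYPESAFE_BY_RCU. Here struct bar, bar_cachep and the scratch field are hypothetical, and the field is assumed not to be one that readers use to re-validate a recycled object:

	struct kmem_cache_args rcu_args = {
		.use_freeptr_offset = true,
		/* must not overlay fields used to detect object recycling */
		.freeptr_offset = offsetof(struct bar, scratch),
	};

	bar_cachep = kmem_cache_create("bar_cache", sizeof(struct bar),
				       &rcu_args, SLAB_TYPESAFE_BY_RCU);
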
diff --git a/mm/Kconfig b/mm/Kconfig
index a992f2203eb9..fbac1dfc9943 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -247,17 +247,6 @@ config SLUB_STATS
out which slabs are relevant to a particular load.
Try running: slabinfo -DA
-config SLUB_CPU_PARTIAL
- default y
- depends on SMP && !SLUB_TINY
- bool "Enable per cpu partial caches"
- help
- Per cpu partial caches accelerate objects allocation and freeing
- that is local to a processor at the price of more indeterminism
- in the latency of the free. On overflow these caches will be cleared
- which requires the taking of locks that may cause latency spikes.
- Typically one would choose no for a realtime system.
-
config RANDOM_KMALLOC_CACHES
default n
depends on !SLUB_TINY
diff --git a/mm/internal.h b/mm/internal.h
index f35dbcf99a86..aacda4f79534 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -838,6 +838,7 @@ static inline struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned int ord
struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order);
#define alloc_frozen_pages_nolock(...) \
alloc_hooks(alloc_frozen_pages_nolock_noprof(__VA_ARGS__))
+void free_frozen_pages_nolock(struct page *page, unsigned int order);
extern void zone_pcp_reset(struct zone *zone);
extern void zone_pcp_disable(struct zone *zone);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8a9a1e797c2e..36ab9897b61b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2627,16 +2627,24 @@ struct mem_cgroup *mem_cgroup_from_obj_slab(struct slab *slab, void *p)
* Memcg membership data for each individual object is saved in
* slab->obj_exts.
*/
- struct slabobj_ext *obj_exts;
+ unsigned long obj_exts;
+ struct slabobj_ext *obj_ext;
unsigned int off;
obj_exts = slab_obj_exts(slab);
if (!obj_exts)
return NULL;
+ get_slab_obj_exts(obj_exts);
off = obj_to_index(slab->slab_cache, slab, p);
- if (obj_exts[off].objcg)
- return obj_cgroup_memcg(obj_exts[off].objcg);
+ obj_ext = slab_obj_ext(slab, obj_exts, off);
+ if (obj_ext->objcg) {
+ struct obj_cgroup *objcg = obj_ext->objcg;
+
+ put_slab_obj_exts(obj_exts);
+ return obj_cgroup_memcg(objcg);
+ }
+ put_slab_obj_exts(obj_exts);
return NULL;
}
@@ -3222,6 +3230,9 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
}
for (i = 0; i < size; i++) {
+ unsigned long obj_exts;
+ struct slabobj_ext *obj_ext;
+
slab = virt_to_slab(p[i]);
if (!slab_obj_exts(slab) &&
@@ -3244,29 +3255,35 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
slab_pgdat(slab), cache_vmstat_idx(s)))
return false;
+ obj_exts = slab_obj_exts(slab);
+ get_slab_obj_exts(obj_exts);
off = obj_to_index(s, slab, p[i]);
+ obj_ext = slab_obj_ext(slab, obj_exts, off);
obj_cgroup_get(objcg);
- slab_obj_exts(slab)[off].objcg = objcg;
+ obj_ext->objcg = objcg;
+ put_slab_obj_exts(obj_exts);
}
return true;
}
void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
- void **p, int objects, struct slabobj_ext *obj_exts)
+ void **p, int objects, unsigned long obj_exts)
{
size_t obj_size = obj_full_size(s);
for (int i = 0; i < objects; i++) {
struct obj_cgroup *objcg;
+ struct slabobj_ext *obj_ext;
unsigned int off;
off = obj_to_index(s, slab, p[i]);
- objcg = obj_exts[off].objcg;
+ obj_ext = slab_obj_ext(slab, obj_exts, off);
+ objcg = obj_ext->objcg;
if (!objcg)
continue;
- obj_exts[off].objcg = NULL;
+ obj_ext->objcg = NULL;
refill_obj_stock(objcg, obj_size, true, -obj_size,
slab_pgdat(slab), cache_vmstat_idx(s));
obj_cgroup_put(objcg);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cbf758e27aa2..d312ebaa1e77 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3011,6 +3011,11 @@ void free_frozen_pages(struct page *page, unsigned int order)
__free_frozen_pages(page, order, FPI_NONE);
}
+void free_frozen_pages_nolock(struct page *page, unsigned int order)
+{
+ __free_frozen_pages(page, order, FPI_TRYLOCK);
+}
+
/*
* Free a batch of folios
*/
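
The new helper pairs with alloc_frozen_pages_nolock() declared in mm/internal.h above. A hedged sketch of the pairing; the gfp value, nid, order and error handling are illustrative assumptions, not taken from this patch:

	/* hypothetical caller; nid and order come from the surrounding code */
	struct page *page = alloc_frozen_pages_nolock(__GFP_ZERO, nid, order);

	if (page)
		free_frozen_pages_nolock(page, order);
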
diff --git a/mm/slab.h b/mm/slab.h
index e767aa7e91b0..71c7261bf822 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -21,14 +21,12 @@
# define system_has_freelist_aba() system_has_cmpxchg128()
# define try_cmpxchg_freelist try_cmpxchg128
# endif
-#define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg128
typedef u128 freelist_full_t;
#else /* CONFIG_64BIT */
# ifdef system_has_cmpxchg64
# define system_has_freelist_aba() system_has_cmpxchg64()
# define try_cmpxchg_freelist try_cmpxchg64
# endif
-#define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg64
typedef u64 freelist_full_t;
#endif /* CONFIG_64BIT */
@@ -55,6 +53,14 @@ struct freelist_counters {
* that the slab was corrupted
*/
unsigned frozen:1;
+#ifdef CONFIG_64BIT
+ /*
+ * Some optimizations use free bits in 'counters' field
+ * to save memory. In case ->stride field is not available,
+ * such optimizations are disabled.
+ */
+ unsigned short stride;
+#endif
};
};
};
@@ -71,19 +77,7 @@ struct slab {
struct kmem_cache *slab_cache;
union {
struct {
- union {
- struct list_head slab_list;
- struct { /* For deferred deactivate_slab() */
- struct llist_node llnode;
- void *flush_freelist;
- };
-#ifdef CONFIG_SLUB_CPU_PARTIAL
- struct {
- struct slab *next;
- int slabs; /* Nr of slabs left */
- };
-#endif
- };
+ struct list_head slab_list;
/* Double-word boundary */
struct freelist_counters;
};
@@ -188,23 +182,6 @@ static inline size_t slab_size(const struct slab *slab)
return PAGE_SIZE << slab_order(slab);
}
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-#define slub_percpu_partial(c) ((c)->partial)
-
-#define slub_set_percpu_partial(c, p) \
-({ \
- slub_percpu_partial(c) = (p)->next; \
-})
-
-#define slub_percpu_partial_read_once(c) READ_ONCE(slub_percpu_partial(c))
-#else
-#define slub_percpu_partial(c) NULL
-
-#define slub_set_percpu_partial(c, p)
-
-#define slub_percpu_partial_read_once(c) NULL
-#endif // CONFIG_SLUB_CPU_PARTIAL
-
/*
* Word size structure that can be atomically updated or read and that
* contains both the order and the number of objects that a slab of the
@@ -218,8 +195,6 @@ struct kmem_cache_order_objects {
* Slab cache management.
*/
struct kmem_cache {
- struct kmem_cache_cpu __percpu *cpu_slab;
- struct lock_class_key lock_key;
struct slub_percpu_sheaves __percpu *cpu_sheaves;
/* Used for retrieving partial slabs, etc. */
slab_flags_t flags;
@@ -228,12 +203,6 @@ struct kmem_cache {
unsigned int object_size; /* Object size without metadata */
struct reciprocal_value reciprocal_size;
unsigned int offset; /* Free pointer offset */
-#ifdef CONFIG_SLUB_CPU_PARTIAL
- /* Number of per cpu partial objects to keep around */
- unsigned int cpu_partial;
- /* Number of per cpu partial slabs to keep around */
- unsigned int cpu_partial_slabs;
-#endif
unsigned int sheaf_capacity;
struct kmem_cache_order_objects oo;
@@ -274,16 +243,35 @@ struct kmem_cache {
unsigned int usersize; /* Usercopy region size */
#endif
+#ifdef CONFIG_SLUB_STATS
+ struct kmem_cache_stats __percpu *cpu_stats;
+#endif
+
struct kmem_cache_node *node[MAX_NUMNODES];
};
+/*
+ * Every cache has !NULL s->cpu_sheaves but they may point to the
+ * bootstrap_sheaf temporarily during init, or permanently for the boot caches
+ * and caches with debugging enabled, or all caches with CONFIG_SLUB_TINY. This
+ * helper distinguishes whether a cache has real non-bootstrap sheaves.
+ */
+static inline bool cache_has_sheaves(struct kmem_cache *s)
+{
+ /* Test CONFIG_SLUB_TINY for code elimination purposes */
+ return !IS_ENABLED(CONFIG_SLUB_TINY) && s->sheaf_capacity;
+}
+
#if defined(CONFIG_SYSFS) && !defined(CONFIG_SLUB_TINY)
#define SLAB_SUPPORTS_SYSFS 1
void sysfs_slab_unlink(struct kmem_cache *s);
void sysfs_slab_release(struct kmem_cache *s);
+int sysfs_slab_alias(struct kmem_cache *s, const char *name);
#else
static inline void sysfs_slab_unlink(struct kmem_cache *s) { }
static inline void sysfs_slab_release(struct kmem_cache *s) { }
+static inline int sysfs_slab_alias(struct kmem_cache *s, const char *name)
+ { return 0; }
#endif
void *fixup_red_left(struct kmem_cache *s, void *p);
@@ -400,11 +388,7 @@ extern void create_boot_cache(struct kmem_cache *, const char *name,
unsigned int useroffset, unsigned int usersize);
int slab_unmergeable(struct kmem_cache *s);
-struct kmem_cache *find_mergeable(unsigned size, unsigned align,
- slab_flags_t flags, const char *name, void (*ctor)(void *));
-struct kmem_cache *
-__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
- slab_flags_t flags, void (*ctor)(void *));
+bool slab_args_unmergeable(struct kmem_cache_args *args, slab_flags_t flags);
slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name);
@@ -502,6 +486,24 @@ bool slab_in_kunit_test(void);
static inline bool slab_in_kunit_test(void) { return false; }
#endif
+/*
+ * slub is about to manipulate internal object metadata. This memory lies
+ * outside the range of the allocated object, so accessing it would normally
+ * be reported by kasan as a bounds error. metadata_access_enable() is used
+ * to tell kasan that these accesses are OK.
+ */
+static inline void metadata_access_enable(void)
+{
+ kasan_disable_current();
+ kmsan_disable_current();
+}
+
+static inline void metadata_access_disable(void)
+{
+ kmsan_enable_current();
+ kasan_enable_current();
+}
+
#ifdef CONFIG_SLAB_OBJ_EXT
/*
@@ -509,10 +511,26 @@ static inline bool slab_in_kunit_test(void) { return false; }
* associated with a slab.
* @slab: a pointer to the slab struct
*
- * Returns a pointer to the object extension vector associated with the slab,
- * or NULL if no such vector has been associated yet.
+ * Returns the address of the object extension vector associated with the slab,
+ * or zero if no such vector has been associated yet.
+ * Do not dereference the return value directly; use get/put_slab_obj_exts()
+ * pair and slab_obj_ext() to access individual elements.
+ *
+ * Example usage:
+ *
+ * obj_exts = slab_obj_exts(slab);
+ * if (obj_exts) {
+ * get_slab_obj_exts(obj_exts);
+ * obj_ext = slab_obj_ext(slab, obj_exts, obj_to_index(s, slab, obj));
+ * // do something with obj_ext
+ * put_slab_obj_exts(obj_exts);
+ * }
+ *
+ * Note that the get/put semantics do not involve reference counting.
+ * Instead, they update the kasan/kmsan depth so that accesses to slabobj_ext
+ * won't be reported as access violations.
*/
-static inline struct slabobj_ext *slab_obj_exts(struct slab *slab)
+static inline unsigned long slab_obj_exts(struct slab *slab)
{
unsigned long obj_exts = READ_ONCE(slab->obj_exts);
@@ -525,7 +543,62 @@ static inline struct slabobj_ext *slab_obj_exts(struct slab *slab)
obj_exts != OBJEXTS_ALLOC_FAIL, slab_page(slab));
VM_BUG_ON_PAGE(obj_exts & MEMCG_DATA_KMEM, slab_page(slab));
#endif
- return (struct slabobj_ext *)(obj_exts & ~OBJEXTS_FLAGS_MASK);
+
+ return obj_exts & ~OBJEXTS_FLAGS_MASK;
+}
+
+static inline void get_slab_obj_exts(unsigned long obj_exts)
+{
+ VM_WARN_ON_ONCE(!obj_exts);
+ metadata_access_enable();
+}
+
+static inline void put_slab_obj_exts(unsigned long obj_exts)
+{
+ metadata_access_disable();
+}
+
+#ifdef CONFIG_64BIT
+static inline void slab_set_stride(struct slab *slab, unsigned short stride)
+{
+ slab->stride = stride;
+}
+static inline unsigned short slab_get_stride(struct slab *slab)
+{
+ return slab->stride;
+}
+#else
+static inline void slab_set_stride(struct slab *slab, unsigned short stride)
+{
+ VM_WARN_ON_ONCE(stride != sizeof(struct slabobj_ext));
+}
+static inline unsigned short slab_get_stride(struct slab *slab)
+{
+ return sizeof(struct slabobj_ext);
+}
+#endif
+
+/*
+ * slab_obj_ext - get the pointer to the slab object extension metadata
+ * associated with an object in a slab.
+ * @slab: a pointer to the slab struct
+ * @obj_exts: a pointer to the object extension vector
+ * @index: an index of the object
+ *
+ * Returns a pointer to the object extension associated with the object.
+ * Must be called within a section covered by get/put_slab_obj_exts().
+ */
+static inline struct slabobj_ext *slab_obj_ext(struct slab *slab,
+ unsigned long obj_exts,
+ unsigned int index)
+{
+ struct slabobj_ext *obj_ext;
+
+ VM_WARN_ON_ONCE(obj_exts != slab_obj_exts(slab));
+
+ obj_ext = (struct slabobj_ext *)(obj_exts +
+ slab_get_stride(slab) * index);
+ return kasan_reset_tag(obj_ext);
}
int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
@@ -533,11 +606,22 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
#else /* CONFIG_SLAB_OBJ_EXT */
-static inline struct slabobj_ext *slab_obj_exts(struct slab *slab)
+static inline unsigned long slab_obj_exts(struct slab *slab)
+{
+ return 0;
+}
+
+static inline struct slabobj_ext *slab_obj_ext(struct slab *slab,
+ unsigned long obj_exts,
+ unsigned int index)
{
return NULL;
}
+static inline void slab_set_stride(struct slab *slab, unsigned int stride) { }
+static inline unsigned int slab_get_stride(struct slab *slab) { return 0; }
+
+
#endif /* CONFIG_SLAB_OBJ_EXT */
static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s)
@@ -550,38 +634,11 @@ static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s)
bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
gfp_t flags, size_t size, void **p);
void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
- void **p, int objects, struct slabobj_ext *obj_exts);
+ void **p, int objects, unsigned long obj_exts);
#endif
void kvfree_rcu_cb(struct rcu_head *head);
-size_t __ksize(const void *objp);
-
-static inline size_t slab_ksize(const struct kmem_cache *s)
-{
-#ifdef CONFIG_SLUB_DEBUG
- /*
- * Debugging requires use of the padding between object
- * and whatever may come after it.
- */
- if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
- return s->object_size;
-#endif
- if (s->flags & SLAB_KASAN)
- return s->object_size;
- /*
- * If we have the need to store the freelist pointer
- * back there or track user information then we can
- * only use the space before that information.
- */
- if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER))
- return s->inuse;
- /*
- * Else we can use all the padding etc for the allocation
- */
- return s->size;
-}
-
static inline unsigned int large_kmalloc_order(const struct page *page)
{
return page[1].flags.f & 0xff;
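
A sketch of the accessor pattern established by the mm/slab.h changes above, mirroring the slab_obj_exts() documentation; s, slab and obj are assumed to be a valid cache, slab and object:

	unsigned long obj_exts = slab_obj_exts(slab);

	if (obj_exts) {
		struct slabobj_ext *ext;

		get_slab_obj_exts(obj_exts);
		ext = slab_obj_ext(slab, obj_exts, obj_to_index(s, slab, obj));
		/* inspect or update ext->objcg / ext->ref here */
		put_slab_obj_exts(obj_exts);
	}
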
diff --git a/mm/slab_common.c b/mm/slab_common.c
index eed7ea556cb1..d5a70a831a2a 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -43,11 +43,13 @@ DEFINE_MUTEX(slab_mutex);
struct kmem_cache *kmem_cache;
/*
- * Set of flags that will prevent slab merging
+ * Set of flags that will prevent slab merging.
+ * Any flag that adds per-object metadata should be included,
+ * since slab merging can update s->inuse, which affects the metadata layout.
*/
-#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
- SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \
- SLAB_FAILSLAB | SLAB_NO_MERGE)
+#define SLAB_NEVER_MERGE (SLAB_DEBUG_FLAGS | SLAB_TYPESAFE_BY_RCU | \
+ SLAB_NOLEAKTRACE | SLAB_FAILSLAB | SLAB_NO_MERGE | \
+ SLAB_OBJ_EXT_IN_OBJ)
#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
SLAB_CACHE_DMA32 | SLAB_ACCOUNT)
@@ -163,9 +165,6 @@ int slab_unmergeable(struct kmem_cache *s)
return 1;
#endif
- if (s->cpu_sheaves)
- return 1;
-
/*
* We may have set a slab to be unmergeable during bootstrap.
*/
@@ -175,24 +174,35 @@ int slab_unmergeable(struct kmem_cache *s)
return 0;
}
-struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
- slab_flags_t flags, const char *name, void (*ctor)(void *))
+bool slab_args_unmergeable(struct kmem_cache_args *args, slab_flags_t flags)
{
- struct kmem_cache *s;
-
if (slab_nomerge)
- return NULL;
+ return true;
- if (ctor)
- return NULL;
+ if (args->ctor)
+ return true;
- flags = kmem_cache_flags(flags, name);
+ if (IS_ENABLED(CONFIG_HARDENED_USERCOPY) && args->usersize)
+ return true;
if (flags & SLAB_NEVER_MERGE)
+ return true;
+
+ return false;
+}
+
+static struct kmem_cache *find_mergeable(unsigned int size, slab_flags_t flags,
+ const char *name, struct kmem_cache_args *args)
+{
+ struct kmem_cache *s;
+ unsigned int align;
+
+ flags = kmem_cache_flags(flags, name);
+ if (slab_args_unmergeable(args, flags))
return NULL;
size = ALIGN(size, sizeof(void *));
- align = calculate_alignment(flags, align, size);
+ align = calculate_alignment(flags, args->align, size);
size = ALIGN(size, align);
list_for_each_entry_reverse(s, &slab_caches, list) {
@@ -231,7 +241,7 @@ static struct kmem_cache *create_cache(const char *name,
err = -EINVAL;
if (args->use_freeptr_offset &&
(args->freeptr_offset >= object_size ||
- !(flags & SLAB_TYPESAFE_BY_RCU) ||
+ (!(flags & SLAB_TYPESAFE_BY_RCU) && !args->ctor) ||
!IS_ALIGNED(args->freeptr_offset, __alignof__(freeptr_t))))
goto out;
@@ -253,6 +263,31 @@ out:
return ERR_PTR(err);
}
+static struct kmem_cache *
+__kmem_cache_alias(const char *name, unsigned int size, slab_flags_t flags,
+ struct kmem_cache_args *args)
+{
+ struct kmem_cache *s;
+
+ s = find_mergeable(size, flags, name, args);
+ if (s) {
+ if (sysfs_slab_alias(s, name))
+ pr_err("SLUB: Unable to add cache alias %s to sysfs\n",
+ name);
+
+ s->refcount++;
+
+ /*
+ * Adjust the object sizes so that we clear
+ * the complete object on kzalloc.
+ */
+ s->object_size = max(s->object_size, size);
+ s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
+ }
+
+ return s;
+}
+
/**
* __kmem_cache_create_args - Create a kmem cache.
* @name: A string which is used in /proc/slabinfo to identify this cache.
@@ -305,6 +340,13 @@ struct kmem_cache *__kmem_cache_create_args(const char *name,
flags &= ~SLAB_DEBUG_FLAGS;
#endif
+ /*
+ * Caches with a specific sheaf_capacity are special enough. It's simpler to
+ * make them unmergeable.
+ */
+ if (args->sheaf_capacity)
+ flags |= SLAB_NO_MERGE;
+
mutex_lock(&slab_mutex);
err = kmem_cache_sanity_check(name, object_size);
@@ -324,9 +366,7 @@ struct kmem_cache *__kmem_cache_create_args(const char *name,
object_size - args->usersize < args->useroffset))
args->usersize = args->useroffset = 0;
- if (!args->usersize && !args->sheaf_capacity)
- s = __kmem_cache_alias(name, object_size, args->align, flags,
- args->ctor);
+ s = __kmem_cache_alias(name, object_size, flags, args);
if (s)
goto out_unlock;
@@ -983,43 +1023,6 @@ void __init create_kmalloc_caches(void)
0, SLAB_NO_MERGE, NULL);
}
-/**
- * __ksize -- Report full size of underlying allocation
- * @object: pointer to the object
- *
- * This should only be used internally to query the true size of allocations.
- * It is not meant to be a way to discover the usable size of an allocation
- * after the fact. Instead, use kmalloc_size_roundup(). Using memory beyond
- * the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS,
- * and/or FORTIFY_SOURCE.
- *
- * Return: size of the actual memory used by @object in bytes
- */
-size_t __ksize(const void *object)
-{
- const struct page *page;
- const struct slab *slab;
-
- if (unlikely(object == ZERO_SIZE_PTR))
- return 0;
-
- page = virt_to_page(object);
-
- if (unlikely(PageLargeKmalloc(page)))
- return large_kmalloc_size(page);
-
- slab = page_slab(page);
- /* Delete this after we're sure there are no users */
- if (WARN_ON(!slab))
- return page_size(page);
-
-#ifdef CONFIG_SLUB_DEBUG
- skip_orig_size_check(slab->slab_cache, object);
-#endif
-
- return slab_ksize(slab->slab_cache);
-}
-
gfp_t kmalloc_fix_flags(gfp_t flags)
{
gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
@@ -1235,30 +1238,6 @@ void kfree_sensitive(const void *p)
}
EXPORT_SYMBOL(kfree_sensitive);
-size_t ksize(const void *objp)
-{
- /*
- * We need to first check that the pointer to the object is valid.
- * The KASAN report printed from ksize() is more useful, then when
- * it's printed later when the behaviour could be undefined due to
- * a potential use-after-free or double-free.
- *
- * We use kasan_check_byte(), which is supported for the hardware
- * tag-based KASAN mode, unlike kasan_check_read/write().
- *
- * If the pointed to memory is invalid, we return 0 to avoid users of
- * ksize() writing to and potentially corrupting the memory region.
- *
- * We want to perform the check before __ksize(), to avoid potentially
- * crashing in __ksize() due to accessing invalid metadata.
- */
- if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp))
- return 0;
-
- return kfence_ksize(objp) ?: __ksize(objp);
-}
-EXPORT_SYMBOL(ksize);
-
#ifdef CONFIG_BPF_SYSCALL
#include <linux/btf.h>
@@ -1625,11 +1604,8 @@ static bool kfree_rcu_sheaf(void *obj)
return false;
s = slab->slab_cache;
- if (s->cpu_sheaves) {
- if (likely(!IS_ENABLED(CONFIG_NUMA) ||
- slab_nid(slab) == numa_mem_id()))
- return __kfree_rcu_sheaf(s, obj);
- }
+ if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id()))
+ return __kfree_rcu_sheaf(s, obj);
return false;
}
@@ -2133,8 +2109,11 @@ EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
*/
void kvfree_rcu_barrier_on_cache(struct kmem_cache *s)
{
- if (s->cpu_sheaves)
+ if (cache_has_sheaves(s)) {
flush_rcu_sheaves_on_cache(s);
+ rcu_barrier();
+ }
+
/*
* TODO: Introduce a version of __kvfree_rcu_barrier() that works
* on a specific slab cache.
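
For illustration, a worked example (made-up sizes) of the size adjustment __kmem_cache_alias() performs above when an alias is found:

	/*
	 * An existing mergeable cache with object_size = 96 and inuse = 96 is
	 * aliased by a request for a 104-byte cache:
	 *
	 *   s->object_size = max(96, 104)           = 104
	 *   s->inuse       = max(96, ALIGN(104, 8)) = 104
	 *
	 * so kzalloc() keeps clearing the complete object for all users.
	 */
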
diff --git a/mm/slub.c b/mm/slub.c
index cdc1e652ec52..18899017512c 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1,13 +1,15 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * SLUB: A slab allocator that limits cache line use instead of queuing
- * objects in per cpu and per node lists.
+ * SLUB: A slab allocator with low overhead percpu array caches and mostly
+ * lockless freeing of objects to slabs in the slowpath.
*
- * The allocator synchronizes using per slab locks or atomic operations
- * and only uses a centralized lock to manage a pool of partial slabs.
+ * The allocator synchronizes using spin_trylock for percpu arrays in the
+ * fastpath, and cmpxchg_double (or bit spinlock) for slowpath freeing.
+ * Uses a centralized lock to manage a pool of partial slabs.
*
* (C) 2007 SGI, Christoph Lameter
* (C) 2011 Linux Foundation, Christoph Lameter
+ * (C) 2025 SUSE, Vlastimil Babka
*/
#include <linux/mm.h>
@@ -53,11 +55,13 @@
/*
* Lock order:
- * 1. slab_mutex (Global Mutex)
- * 2. node->list_lock (Spinlock)
- * 3. kmem_cache->cpu_slab->lock (Local lock)
- * 4. slab_lock(slab) (Only on some arches)
- * 5. object_map_lock (Only for debugging)
+ * 0. cpu_hotplug_lock
+ * 1. slab_mutex (Global Mutex)
+ * 2a. kmem_cache->cpu_sheaves->lock (Local trylock)
+ * 2b. node->barn->lock (Spinlock)
+ * 2c. node->list_lock (Spinlock)
+ * 3. slab_lock(slab) (Only on some arches)
+ * 4. object_map_lock (Only for debugging)
*
* slab_mutex
*
@@ -78,31 +82,38 @@
* C. slab->objects -> Number of objects in slab
* D. slab->frozen -> frozen state
*
- * Frozen slabs
+ * SL_partial slabs
+ *
+ * Slabs on node partial list have at least one free object. A limited number
+ * of slabs on the list can be fully free (slab->inuse == 0), until we start
+ * discarding them. These slabs are marked with SL_partial, and the flag is
+ * cleared while removing them, usually to grab their freelist afterwards.
+ * This clearing also exempts them from list management. Please see
+ * __slab_free() for more details.
*
- * If a slab is frozen then it is exempt from list management. It is
- * the cpu slab which is actively allocated from by the processor that
- * froze it and it is not on any list. The processor that froze the
- * slab is the one who can perform list operations on the slab. Other
- * processors may put objects onto the freelist but the processor that
- * froze the slab is the only one that can retrieve the objects from the
- * slab's freelist.
+ * Full slabs
*
- * CPU partial slabs
+ * For caches without debugging enabled, full slabs (slab->inuse ==
+ * slab->objects and slab->freelist == NULL) are not placed on any list.
+ * The __slab_free() freeing the first object from such a slab will place
+ * it on the partial list. Caches with debugging enabled place such a slab
+ * on the full list and use different allocation and freeing paths.
*
- * The partially empty slabs cached on the CPU partial list are used
- * for performance reasons, which speeds up the allocation process.
- * These slabs are not frozen, but are also exempt from list management,
- * by clearing the SL_partial flag when moving out of the node
- * partial list. Please see __slab_free() for more details.
+ * Frozen slabs
+ *
+ * If a slab is frozen then it is exempt from list management. It is used to
+ * indicate a slab that has failed consistency checks and thus cannot be
+ * allocated from anymore - it is also marked as full. Any previously
+ * allocated objects will be simply leaked upon freeing instead of attempting
+ * to modify the potentially corrupted freelist and metadata.
*
* To sum up, the current scheme is:
- * - node partial slab: SL_partial && !frozen
- * - cpu partial slab: !SL_partial && !frozen
- * - cpu slab: !SL_partial && frozen
- * - full slab: !SL_partial && !frozen
+ * - node partial slab: SL_partial && !full && !frozen
+ * - taken off partial list: !SL_partial && !full && !frozen
+ * - full slab, not on any list: !SL_partial && full && !frozen
+ * - frozen due to inconsistency: !SL_partial && full && frozen
*
- * list_lock
+ * node->list_lock (spinlock)
*
* The list_lock protects the partial and full list on each node and
* the partial slab counter. If taken then no new slabs may be added or
@@ -112,47 +123,46 @@
*
* The list_lock is a centralized lock and thus we avoid taking it as
* much as possible. As long as SLUB does not have to handle partial
- * slabs, operations can continue without any centralized lock. F.e.
- * allocating a long series of objects that fill up slabs does not require
- * the list lock.
+ * slabs, operations can continue without any centralized lock.
*
* For debug caches, all allocations are forced to go through a list_lock
* protected region to serialize against concurrent validation.
*
- * cpu_slab->lock local lock
+ * cpu_sheaves->lock (local_trylock)
+ *
+ * This lock protects fastpath operations on the percpu sheaves. On !RT it
+ * only disables preemption and does no atomic operations. As long as the main
+ * or spare sheaf can handle the allocation or free, there is no other
+ * overhead.
*
- * This locks protect slowpath manipulation of all kmem_cache_cpu fields
- * except the stat counters. This is a percpu structure manipulated only by
- * the local cpu, so the lock protects against being preempted or interrupted
- * by an irq. Fast path operations rely on lockless operations instead.
+ * node->barn->lock (spinlock)
*
- * On PREEMPT_RT, the local lock neither disables interrupts nor preemption
- * which means the lockless fastpath cannot be used as it might interfere with
- * an in-progress slow path operations. In this case the local lock is always
- * taken but it still utilizes the freelist for the common operations.
+ * This lock protects the operations on the per-NUMA-node barn. It can quickly
+ * serve an empty or full sheaf if available, and avoid a more expensive refill
+ * or flush operation.
*
- * lockless fastpaths
+ * Lockless freeing
*
- * The fast path allocation (slab_alloc_node()) and freeing (do_slab_free())
- * are fully lockless when satisfied from the percpu slab (and when
- * cmpxchg_double is possible to use, otherwise slab_lock is taken).
- * They also don't disable preemption or migration or irqs. They rely on
- * the transaction id (tid) field to detect being preempted or moved to
- * another cpu.
+ * Objects may have to be freed to their slabs when they are from a remote
+ * node (where we want to avoid filling local sheaves with remote objects)
+ * or when there are too many full sheaves. On architectures supporting
+ * cmpxchg_double this is done by a lockless update of the slab's freelist and
+ * counters, otherwise slab_lock is taken. This only needs to take the
+ * list_lock if it's the first free to a full slab, or when a slab becomes empty
+ * after the free.
*
* irq, preemption, migration considerations
*
- * Interrupts are disabled as part of list_lock or local_lock operations, or
+ * Interrupts are disabled as part of list_lock or barn lock operations, or
* around the slab_lock operation, in order to make the slab allocator safe
* to use in the context of an irq.
+ * Preemption is disabled as part of local_trylock operations.
+ * kmalloc_nolock() and kfree_nolock() are safe in NMI context but see
+ * their limitations.
*
- * In addition, preemption (or migration on PREEMPT_RT) is disabled in the
- * allocation slowpath, bulk allocation, and put_cpu_partial(), so that the
- * local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer
- * doesn't have to be revalidated in each section protected by the local lock.
- *
- * SLUB assigns one slab for allocation to each processor.
- * Allocations only occur from these slabs called cpu slabs.
+ * SLUB assigns two object arrays called sheaves for caching allocations and
+ * frees on each cpu, with a NUMA node shared barn for balancing between cpus.
+ * Allocations and frees are primarily served from these sheaves.
*
* Slabs with free elements are kept on a partial list and during regular
* operations no list for full slabs is used. If an object in a full slab is
@@ -160,25 +170,8 @@
* We track full slabs for debugging purposes though because otherwise we
* cannot scan all objects.
*
- * Slabs are freed when they become empty. Teardown and setup is
- * minimal so we rely on the page allocators per cpu caches for
- * fast frees and allocs.
- *
- * slab->frozen The slab is frozen and exempt from list processing.
- * This means that the slab is dedicated to a purpose
- * such as satisfying allocations for a specific
- * processor. Objects may be freed in the slab while
- * it is frozen but slab_free will then skip the usual
- * list operations. It is up to the processor holding
- * the slab to integrate the slab into the slab lists
- * when the slab is no longer needed.
- *
- * One use of this flag is to mark slabs that are
- * used for allocations. Then such a slab becomes a cpu
- * slab. The cpu slab may be equipped with an additional
- * freelist that allows lockless access to
- * free objects in addition to the regular freelist
- * that requires the slab lock.
+ * Slabs are freed when they become empty. Teardown and setup is minimal so we
+ * rely on the page allocators per cpu caches for fast frees and allocs.
*
* SLAB_DEBUG_FLAGS Slab requires special handling due to debug
* options set. This moves slab handling out of
@@ -201,28 +194,6 @@ enum slab_flags {
SL_pfmemalloc = PG_active, /* Historical reasons for this bit */
};
-/*
- * We could simply use migrate_disable()/enable() but as long as it's a
- * function call even on !PREEMPT_RT, use inline preempt_disable() there.
- */
-#ifndef CONFIG_PREEMPT_RT
-#define slub_get_cpu_ptr(var) get_cpu_ptr(var)
-#define slub_put_cpu_ptr(var) put_cpu_ptr(var)
-#define USE_LOCKLESS_FAST_PATH() (true)
-#else
-#define slub_get_cpu_ptr(var) \
-({ \
- migrate_disable(); \
- this_cpu_ptr(var); \
-})
-#define slub_put_cpu_ptr(var) \
-do { \
- (void)(var); \
- migrate_enable(); \
-} while (0)
-#define USE_LOCKLESS_FAST_PATH() (false)
-#endif
-
#ifndef CONFIG_SLUB_TINY
#define __fastpath_inline __always_inline
#else
@@ -241,11 +212,18 @@ DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
static DEFINE_STATIC_KEY_FALSE(strict_numa);
#endif
-/* Structure holding parameters for get_partial() call chain */
+/* Structure holding parameters for get_from_partial() call chain */
struct partial_context {
gfp_t flags;
unsigned int orig_size;
- void *object;
+};
+
+/* Structure holding parameters for get_partial_node_bulk() */
+struct partial_bulk_context {
+ gfp_t flags;
+ unsigned int min_objects;
+ unsigned int max_objects;
+ struct list_head slabs;
};
static inline bool kmem_cache_debug(struct kmem_cache *s)
@@ -261,15 +239,6 @@ void *fixup_red_left(struct kmem_cache *s, void *p)
return p;
}
-static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
-{
-#ifdef CONFIG_SLUB_CPU_PARTIAL
- return !kmem_cache_debug(s);
-#else
- return false;
-#endif
-}
-
/*
* Issues still to be resolved:
*
@@ -350,11 +319,8 @@ enum track_item { TRACK_ALLOC, TRACK_FREE };
#ifdef SLAB_SUPPORTS_SYSFS
static int sysfs_slab_add(struct kmem_cache *);
-static int sysfs_slab_alias(struct kmem_cache *, const char *);
#else
static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
-static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
- { return 0; }
#endif
#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG)
@@ -363,37 +329,25 @@ static void debugfs_slab_add(struct kmem_cache *);
static inline void debugfs_slab_add(struct kmem_cache *s) { }
#endif
+enum add_mode {
+ ADD_TO_HEAD,
+ ADD_TO_TAIL,
+};
+
enum stat_item {
- ALLOC_PCS, /* Allocation from percpu sheaf */
- ALLOC_FASTPATH, /* Allocation from cpu slab */
- ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */
- FREE_PCS, /* Free to percpu sheaf */
+ ALLOC_FASTPATH, /* Allocation from percpu sheaves */
+ ALLOC_SLOWPATH, /* Allocation from partial or new slab */
FREE_RCU_SHEAF, /* Free to rcu_free sheaf */
FREE_RCU_SHEAF_FAIL, /* Failed to free to a rcu_free sheaf */
- FREE_FASTPATH, /* Free to cpu slab */
- FREE_SLOWPATH, /* Freeing not to cpu slab */
- FREE_FROZEN, /* Freeing to frozen slab */
+ FREE_FASTPATH, /* Free to percpu sheaves */
+ FREE_SLOWPATH, /* Free to a slab */
FREE_ADD_PARTIAL, /* Freeing moves slab to partial list */
FREE_REMOVE_PARTIAL, /* Freeing removes last object */
- ALLOC_FROM_PARTIAL, /* Cpu slab acquired from node partial list */
- ALLOC_SLAB, /* Cpu slab acquired from page allocator */
- ALLOC_REFILL, /* Refill cpu slab from slab freelist */
- ALLOC_NODE_MISMATCH, /* Switching cpu slab */
+ ALLOC_SLAB, /* New slab acquired from page allocator */
+ ALLOC_NODE_MISMATCH, /* Requested node different from cpu sheaf */
FREE_SLAB, /* Slab freed to the page allocator */
- CPUSLAB_FLUSH, /* Abandoning of the cpu slab */
- DEACTIVATE_FULL, /* Cpu slab was full when deactivated */
- DEACTIVATE_EMPTY, /* Cpu slab was empty when deactivated */
- DEACTIVATE_TO_HEAD, /* Cpu slab was moved to the head of partials */
- DEACTIVATE_TO_TAIL, /* Cpu slab was moved to the tail of partials */
- DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */
- DEACTIVATE_BYPASS, /* Implicit deactivation */
ORDER_FALLBACK, /* Number of times fallback was necessary */
- CMPXCHG_DOUBLE_CPU_FAIL,/* Failures of this_cpu_cmpxchg_double */
CMPXCHG_DOUBLE_FAIL, /* Failures of slab freelist update */
- CPU_PARTIAL_ALLOC, /* Used cpu partial on alloc */
- CPU_PARTIAL_FREE, /* Refill cpu partial on free */
- CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */
- CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */
SHEAF_FLUSH, /* Objects flushed from a sheaf */
SHEAF_REFILL, /* Objects refilled to a sheaf */
SHEAF_ALLOC, /* Allocation of an empty sheaf */
@@ -410,31 +364,11 @@ enum stat_item {
NR_SLUB_STAT_ITEMS
};
-struct freelist_tid {
- union {
- struct {
- void *freelist; /* Pointer to next available object */
- unsigned long tid; /* Globally unique transaction id */
- };
- freelist_full_t freelist_tid;
- };
-};
-
-/*
- * When changing the layout, make sure freelist and tid are still compatible
- * with this_cpu_cmpxchg_double() alignment requirements.
- */
-struct kmem_cache_cpu {
- struct freelist_tid;
- struct slab *slab; /* The slab from which we are allocating */
-#ifdef CONFIG_SLUB_CPU_PARTIAL
- struct slab *partial; /* Partially allocated slabs */
-#endif
- local_trylock_t lock; /* Protects the fields above */
#ifdef CONFIG_SLUB_STATS
+struct kmem_cache_stats {
unsigned int stat[NR_SLUB_STAT_ITEMS];
-#endif
};
+#endif
static inline void stat(const struct kmem_cache *s, enum stat_item si)
{
@@ -443,7 +377,7 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
* The rmw is racy on a preemptible kernel but this is acceptable, so
* avoid this_cpu_add()'s irq-disable overhead.
*/
- raw_cpu_inc(s->cpu_slab->stat[si]);
+ raw_cpu_inc(s->cpu_stats->stat[si]);
#endif
}
@@ -451,7 +385,7 @@ static inline
void stat_add(const struct kmem_cache *s, enum stat_item si, int v)
{
#ifdef CONFIG_SLUB_STATS
- raw_cpu_add(s->cpu_slab->stat[si], v);
+ raw_cpu_add(s->cpu_stats->stat[si], v);
#endif
}
@@ -540,7 +474,7 @@ static inline struct node_barn *get_barn(struct kmem_cache *s)
static nodemask_t slab_nodes;
/*
- * Workqueue used for flush_cpu_slab().
+ * Workqueue used for flushing cpu and kfree_rcu sheaves.
*/
static struct workqueue_struct *flushwq;
@@ -599,36 +533,6 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object)
return freelist_ptr_decode(s, p, ptr_addr);
}
-static void prefetch_freepointer(const struct kmem_cache *s, void *object)
-{
- prefetchw(object + s->offset);
-}
-
-/*
- * When running under KMSAN, get_freepointer_safe() may return an uninitialized
- * pointer value in the case the current thread loses the race for the next
- * memory chunk in the freelist. In that case this_cpu_cmpxchg_double() in
- * slab_alloc_node() will fail, so the uninitialized value won't be used, but
- * KMSAN will still check all arguments of cmpxchg because of imperfect
- * handling of inline assembly.
- * To work around this problem, we apply __no_kmsan_checks to ensure that
- * get_freepointer_safe() returns initialized memory.
- */
-__no_kmsan_checks
-static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
-{
- unsigned long freepointer_addr;
- freeptr_t p;
-
- if (!debug_pagealloc_enabled_static())
- return get_freepointer(s, object);
-
- object = kasan_reset_tag(object);
- freepointer_addr = (unsigned long)object + s->offset;
- copy_from_kernel_nofault(&p, (freeptr_t *)freepointer_addr, sizeof(p));
- return freelist_ptr_decode(s, p, freepointer_addr);
-}
-
static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
{
unsigned long freeptr_addr = (unsigned long)object + s->offset;
@@ -692,41 +596,6 @@ static inline unsigned int oo_objects(struct kmem_cache_order_objects x)
return x.x & OO_MASK;
}
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
-{
- unsigned int nr_slabs;
-
- s->cpu_partial = nr_objects;
-
- /*
- * We take the number of objects but actually limit the number of
- * slabs on the per cpu partial list, in order to limit excessive
- * growth of the list. For simplicity we assume that the slabs will
- * be half-full.
- */
- nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo));
- s->cpu_partial_slabs = nr_slabs;
-}
-
-static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
-{
- return s->cpu_partial_slabs;
-}
-#else
-#ifdef SLAB_SUPPORTS_SYSFS
-static inline void
-slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
-{
-}
-#endif
-
-static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
-{
- return 0;
-}
-#endif /* CONFIG_SLUB_CPU_PARTIAL */
-
/*
* If network-based swap is enabled, slub must keep track of whether memory
* were allocated from pfmemalloc reserves.
@@ -782,7 +651,8 @@ __update_freelist_slow(struct slab *slab, struct freelist_counters *old,
if (slab->freelist == old->freelist &&
slab->counters == old->counters) {
slab->freelist = new->freelist;
- slab->counters = new->counters;
+ /* prevent tearing for the read in get_partial_node_bulk() */
+ WRITE_ONCE(slab->counters, new->counters);
ret = true;
}
slab_unlock(slab);
@@ -802,7 +672,7 @@ static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *sla
{
bool ret;
- if (USE_LOCKLESS_FAST_PATH())
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
lockdep_assert_irqs_disabled();
if (s->flags & __CMPXCHG_DOUBLE)
@@ -857,7 +727,7 @@ static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab,
* request size in the meta data area, for better debug and sanity check.
*/
static inline void set_orig_size(struct kmem_cache *s,
- void *object, unsigned int orig_size)
+ void *object, unsigned long orig_size)
{
void *p = kasan_reset_tag(object);
@@ -867,10 +737,10 @@ static inline void set_orig_size(struct kmem_cache *s,
p += get_info_end(s);
p += sizeof(struct track) * 2;
- *(unsigned int *)p = orig_size;
+ *(unsigned long *)p = orig_size;
}
-static inline unsigned int get_orig_size(struct kmem_cache *s, void *object)
+static inline unsigned long get_orig_size(struct kmem_cache *s, void *object)
{
void *p = kasan_reset_tag(object);
@@ -883,8 +753,142 @@ static inline unsigned int get_orig_size(struct kmem_cache *s, void *object)
p += get_info_end(s);
p += sizeof(struct track) * 2;
- return *(unsigned int *)p;
+ return *(unsigned long *)p;
+}
+
+#ifdef CONFIG_SLAB_OBJ_EXT
+
+/*
+ * Check if memory cgroup or memory allocation profiling is enabled.
+ * If enabled, SLUB tries to reduce memory overhead of accounting
+ * slab objects. If neither is enabled when this function is called,
+ * the optimization is simply skipped to avoid affecting caches that do not
+ * need slabobj_ext metadata.
+ *
+ * However, this may disable the optimization when memory cgroup or memory
+ * allocation profiling is used but the cache was created too early in boot,
+ * before those subsystems were initialized.
+ */
+static inline bool need_slab_obj_exts(struct kmem_cache *s)
+{
+ if (s->flags & SLAB_NO_OBJ_EXT)
+ return false;
+
+ if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT))
+ return true;
+
+ if (mem_alloc_profiling_enabled())
+ return true;
+
+ return false;
+}
+
+static inline unsigned int obj_exts_size_in_slab(struct slab *slab)
+{
+ return sizeof(struct slabobj_ext) * slab->objects;
+}
+
+static inline unsigned long obj_exts_offset_in_slab(struct kmem_cache *s,
+ struct slab *slab)
+{
+ unsigned long objext_offset;
+
+ objext_offset = s->size * slab->objects;
+ objext_offset = ALIGN(objext_offset, sizeof(struct slabobj_ext));
+ return objext_offset;
+}
+
+static inline bool obj_exts_fit_within_slab_leftover(struct kmem_cache *s,
+ struct slab *slab)
+{
+ unsigned long objext_offset = obj_exts_offset_in_slab(s, slab);
+ unsigned long objext_size = obj_exts_size_in_slab(slab);
+
+ return objext_offset + objext_size <= slab_size(slab);
+}
+
+static inline bool obj_exts_in_slab(struct kmem_cache *s, struct slab *slab)
+{
+ unsigned long obj_exts;
+ unsigned long start;
+ unsigned long end;
+
+ obj_exts = slab_obj_exts(slab);
+ if (!obj_exts)
+ return false;
+
+ start = (unsigned long)slab_address(slab);
+ end = start + slab_size(slab);
+ return (obj_exts >= start) && (obj_exts < end);
+}
+#else
+static inline bool need_slab_obj_exts(struct kmem_cache *s)
+{
+ return false;
+}
+
+static inline unsigned int obj_exts_size_in_slab(struct slab *slab)
+{
+ return 0;
+}
+
+static inline unsigned long obj_exts_offset_in_slab(struct kmem_cache *s,
+ struct slab *slab)
+{
+ return 0;
+}
+
+static inline bool obj_exts_fit_within_slab_leftover(struct kmem_cache *s,
+ struct slab *slab)
+{
+ return false;
+}
+
+static inline bool obj_exts_in_slab(struct kmem_cache *s, struct slab *slab)
+{
+ return false;
+}
+
+#endif
+
+#if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT)
+static bool obj_exts_in_object(struct kmem_cache *s, struct slab *slab)
+{
+ /*
+ * Note we cannot rely on the SLAB_OBJ_EXT_IN_OBJ flag here and need to
+ * check the stride. A cache can have SLAB_OBJ_EXT_IN_OBJ set, but placing
+ * the vector in the slab's leftover space is preferred, and whether that is
+ * possible depends on the particular slab's size.
+ */
+ return obj_exts_in_slab(s, slab) &&
+ (slab_get_stride(slab) == s->size);
+}
+
+static unsigned int obj_exts_offset_in_object(struct kmem_cache *s)
+{
+ unsigned int offset = get_info_end(s);
+
+ if (kmem_cache_debug_flags(s, SLAB_STORE_USER))
+ offset += sizeof(struct track) * 2;
+
+ if (slub_debug_orig_size(s))
+ offset += sizeof(unsigned long);
+
+ offset += kasan_metadata_size(s, false);
+
+ return offset;
+}
+#else
+static inline bool obj_exts_in_object(struct kmem_cache *s, struct slab *slab)
+{
+ return false;
+}
+
+static inline unsigned int obj_exts_offset_in_object(struct kmem_cache *s)
+{
+ return 0;
}
+#endif
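
A hedged summary of the placement arithmetic defined above, with symbolic rather than real values:

	/*
	 * For a slab with N = slab->objects objects of s->size bytes each and
	 * total slab_size() = S:
	 *
	 *   obj_exts_offset_in_slab() = ALIGN(s->size * N,
	 *                                     sizeof(struct slabobj_ext));
	 *   obj_exts_size_in_slab()   = sizeof(struct slabobj_ext) * N;
	 *
	 * The vector is kept inside the slab's leftover space only when
	 * offset + size <= S. Otherwise, if the cache allows
	 * SLAB_OBJ_EXT_IN_OBJ, each object's extension is stored in its own
	 * padding and the stride becomes s->size; failing both, the vector is
	 * allocated with kmalloc as before.
	 */
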
#ifdef CONFIG_SLUB_DEBUG
@@ -976,24 +980,6 @@ static const char *slub_debug_string __ro_after_init;
static int disable_higher_order_debug;
/*
- * slub is about to manipulate internal object metadata. This memory lies
- * outside the range of the allocated object, so accessing it would normally
- * be reported by kasan as a bounds error. metadata_access_enable() is used
- * to tell kasan that these accesses are OK.
- */
-static inline void metadata_access_enable(void)
-{
- kasan_disable_current();
- kmsan_disable_current();
-}
-
-static inline void metadata_access_disable(void)
-{
- kmsan_enable_current();
- kasan_enable_current();
-}
-
-/*
* Object debugging
*/
@@ -1065,7 +1051,7 @@ static void set_track_update(struct kmem_cache *s, void *object,
p->handle = handle;
#endif
p->addr = addr;
- p->cpu = smp_processor_id();
+ p->cpu = raw_smp_processor_id();
p->pid = current->pid;
p->when = jiffies;
}
@@ -1198,10 +1184,13 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
off += 2 * sizeof(struct track);
if (slub_debug_orig_size(s))
- off += sizeof(unsigned int);
+ off += sizeof(unsigned long);
off += kasan_metadata_size(s, false);
+ if (obj_exts_in_object(s, slab))
+ off += sizeof(struct slabobj_ext);
+
if (off != size_from_object(s))
/* Beginning of the filler is the free pointer */
print_section(KERN_ERR, "Padding ", p + off,
@@ -1226,20 +1215,6 @@ static void object_err(struct kmem_cache *s, struct slab *slab,
WARN_ON(1);
}
-static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
- void **freelist, void *nextfree)
-{
- if ((s->flags & SLAB_CONSISTENCY_CHECKS) &&
- !check_valid_pointer(s, slab, nextfree) && freelist) {
- object_err(s, slab, *freelist, "Freechain corrupt");
- *freelist = NULL;
- slab_fix(s, "Isolate corrupted freechain");
- return true;
- }
-
- return false;
-}
-
static void __slab_err(struct slab *slab)
{
if (slab_in_kunit_test())
@@ -1347,44 +1322,63 @@ skip_bug_print:
}
/*
- * Object layout:
- *
- * object address
- * Bytes of the object to be managed.
- * If the freepointer may overlay the object then the free
- * pointer is at the middle of the object.
- *
- * Poisoning uses 0x6b (POISON_FREE) and the last byte is
- * 0xa5 (POISON_END)
+ * Object field layout:
*
- * object + s->object_size
- * Padding to reach word boundary. This is also used for Redzoning.
- * Padding is extended by another word if Redzoning is enabled and
- * object_size == inuse.
+ * [Left redzone padding] (if SLAB_RED_ZONE)
+ * - Field size: s->red_left_pad
+ * - Immediately precedes each object when SLAB_RED_ZONE is set.
+ * - Filled with 0xbb (SLUB_RED_INACTIVE) for inactive objects and
+ * 0xcc (SLUB_RED_ACTIVE) for objects in use when SLAB_RED_ZONE.
*
- * We fill with 0xbb (SLUB_RED_INACTIVE) for inactive objects and with
- * 0xcc (SLUB_RED_ACTIVE) for objects in use.
+ * [Object bytes] (object address starts here)
+ * - Field size: s->object_size
+ * - Object payload bytes.
+ * - If the freepointer may overlap the object, it is stored inside
+ * the object (typically near the middle).
+ * - Poisoning uses 0x6b (POISON_FREE) and the last byte is
+ * 0xa5 (POISON_END) when __OBJECT_POISON is enabled.
*
- * object + s->inuse
- * Meta data starts here.
+ * [Word-align padding] (right redzone when SLAB_RED_ZONE is set)
+ * - Field size: s->inuse - s->object_size
+ * - If redzoning is enabled and ALIGN(size, sizeof(void *)) adds no
+ * padding, explicitly extend by one word so the right redzone is
+ * non-empty.
+ * - Filled with 0xbb (SLUB_RED_INACTIVE) for inactive objects and
+ * 0xcc (SLUB_RED_ACTIVE) for objects in use when SLAB_RED_ZONE.
*
- * A. Free pointer (if we cannot overwrite object on free)
- * B. Tracking data for SLAB_STORE_USER
- * C. Original request size for kmalloc object (SLAB_STORE_USER enabled)
- * D. Padding to reach required alignment boundary or at minimum
- * one word if debugging is on to be able to detect writes
- * before the word boundary.
+ * [Metadata starts at object + s->inuse]
+ * - A. freelist pointer (if freeptr_outside_object)
+ * - B. alloc tracking (SLAB_STORE_USER)
+ * - C. free tracking (SLAB_STORE_USER)
+ * - D. original request size (SLAB_KMALLOC && SLAB_STORE_USER)
+ * - E. KASAN metadata (if enabled)
*
- * Padding is done using 0x5a (POISON_INUSE)
+ * [Mandatory padding] (if CONFIG_SLUB_DEBUG && SLAB_RED_ZONE)
+ * - One mandatory debug word to guarantee a minimum poisoned gap
+ * between metadata and the next object, independent of alignment.
+ * - Filled with 0x5a (POISON_INUSE) when SLAB_POISON is set.
+ * [Final alignment padding]
+ * - Bytes added by ALIGN(size, s->align) to reach s->size.
+ * - When the padding is large enough, it can be used to store
+ * struct slabobj_ext for accounting metadata (obj_exts_in_object()).
+ * - The remaining bytes (if any) are filled with 0x5a (POISON_INUSE)
+ * when SLAB_POISON is set.
*
- * object + s->size
- * Nothing is used beyond s->size.
+ * Notes:
+ * - Redzones are filled by init_object() with SLUB_RED_ACTIVE/INACTIVE.
+ * - Object contents are poisoned with POISON_FREE/END when __OBJECT_POISON.
+ * - The trailing padding is pre-filled with POISON_INUSE by
+ * setup_slab_debug() when SLAB_POISON is set, and is validated by
+ * check_pad_bytes().
+ * - The first object pointer is slab_address(slab) +
+ * (s->red_left_pad if redzoning); subsequent objects are reached by
+ * adding s->size each time.
*
- * If slabcaches are merged then the object_size and inuse boundaries are mostly
- * ignored. And therefore no slab options that rely on these boundaries
- * may be used with merged slabcaches.
+ * If a slab cache flag relies on specific metadata to exist at a fixed
+ * offset, the flag must be included in SLAB_NEVER_MERGE to prevent merging.
+ * Otherwise, the cache would misbehave as s->object_size and s->inuse are
+ * adjusted during cache merging (see __kmem_cache_alias()).
*/
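
A hedged illustration of the field order described in the comment above, for a hypothetical cache with SLAB_RED_ZONE | SLAB_STORE_USER | SLAB_KMALLOC (the exact sizes depend on calculate_sizes()):

	/*
	 *   [left redzone][object bytes][right redzone / word padding] <- s->inuse
	 *   [free pointer (if outside object)][alloc track][free track]
	 *   [orig size][kasan meta]
	 *   [debug word + alignment padding, may hold struct slabobj_ext] <- s->size
	 */
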
-
static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
{
unsigned long off = get_info_end(s); /* The end of info */
@@ -1394,11 +1388,14 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
off += 2 * sizeof(struct track);
if (s->flags & SLAB_KMALLOC)
- off += sizeof(unsigned int);
+ off += sizeof(unsigned long);
}
off += kasan_metadata_size(s, false);
+ if (obj_exts_in_object(s, slab))
+ off += sizeof(struct slabobj_ext);
+
if (size_from_object(s) == off)
return 1;
@@ -1423,7 +1420,15 @@ slab_pad_check(struct kmem_cache *s, struct slab *slab)
start = slab_address(slab);
length = slab_size(slab);
end = start + length;
- remainder = length % s->size;
+
+ if (obj_exts_in_slab(s, slab) && !obj_exts_in_object(s, slab)) {
+ remainder = length;
+ remainder -= obj_exts_offset_in_slab(s, slab);
+ remainder -= obj_exts_size_in_slab(slab);
+ } else {
+ remainder = length % s->size;
+ }
+
if (!remainder)
return;
@@ -2021,11 +2026,6 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node,
int objects) {}
static inline void dec_slabs_node(struct kmem_cache *s, int node,
int objects) {}
-static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
- void **freelist, void *nextfree)
-{
- return false;
-}
#endif /* CONFIG_SLUB_DEBUG */
/*
@@ -2042,21 +2042,27 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
static inline void mark_objexts_empty(struct slabobj_ext *obj_exts)
{
- struct slabobj_ext *slab_exts;
struct slab *obj_exts_slab;
+ unsigned long slab_exts;
obj_exts_slab = virt_to_slab(obj_exts);
slab_exts = slab_obj_exts(obj_exts_slab);
if (slab_exts) {
+ get_slab_obj_exts(slab_exts);
unsigned int offs = obj_to_index(obj_exts_slab->slab_cache,
obj_exts_slab, obj_exts);
+ struct slabobj_ext *ext = slab_obj_ext(obj_exts_slab,
+ slab_exts, offs);
- if (unlikely(is_codetag_empty(&slab_exts[offs].ref)))
+ if (unlikely(is_codetag_empty(&ext->ref))) {
+ put_slab_obj_exts(slab_exts);
return;
+ }
/* codetag should be NULL here */
- WARN_ON(slab_exts[offs].ref.ct);
- set_codetag_empty(&slab_exts[offs].ref);
+ WARN_ON(ext->ref.ct);
+ set_codetag_empty(&ext->ref);
+ put_slab_obj_exts(slab_exts);
}
}
@@ -2095,6 +2101,49 @@ static inline void init_slab_obj_exts(struct slab *slab)
slab->obj_exts = 0;
}
+/*
+ * Calculate the allocation size for slabobj_ext array.
+ *
+ * When memory allocation profiling is enabled, the obj_exts array
+ * could be allocated from the same slab cache it's being allocated for.
+ * This would prevent the slab from ever being freed because it would
+ * always contain at least one allocated object (its own obj_exts array).
+ *
+ * To avoid this, increase the allocation size when we detect the array
+ * may come from the same cache, forcing it to use a different cache.
+ */
+static inline size_t obj_exts_alloc_size(struct kmem_cache *s,
+ struct slab *slab, gfp_t gfp)
+{
+ size_t sz = sizeof(struct slabobj_ext) * slab->objects;
+ struct kmem_cache *obj_exts_cache;
+
+ /*
+	 * slabobj_ext arrays for KMALLOC_CGROUP allocations
+	 * are served from KMALLOC_NORMAL caches.
+ */
+ if (!mem_alloc_profiling_enabled())
+ return sz;
+
+ if (sz > KMALLOC_MAX_CACHE_SIZE)
+ return sz;
+
+ if (!is_kmalloc_normal(s))
+ return sz;
+
+ obj_exts_cache = kmalloc_slab(sz, NULL, gfp, 0);
+ /*
+ * We can't simply compare s with obj_exts_cache, because random kmalloc
+ * caches have multiple caches per size, selected by caller address.
+ * Since caller address may differ between kmalloc_slab() and actual
+ * allocation, bump size when sizes are equal.
+ */
+ if (s->object_size == obj_exts_cache->object_size)
+ return obj_exts_cache->object_size + 1;
+
+ return sz;
+}
+
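
A minimal sketch of the size-bumping idea above, under simplified assumptions: pure power-of-two size classes stand in for kmalloc_slab(), and per-caller random kmalloc caches are ignored.

#include <stddef.h>
#include <stdio.h>

/* Toy power-of-two size-class lookup, standing in for kmalloc_slab(). */
static size_t size_class(size_t sz)
{
	size_t class = 8;

	while (class < sz)
		class <<= 1;
	return class;
}

/*
 * If the obj_exts vector would land in the same size class as the cache
 * it describes, request one byte more so a larger class is used and the
 * described slab can still become empty and be freed.
 */
static size_t vector_alloc_size(size_t vector_bytes, size_t cache_object_size)
{
	if (size_class(vector_bytes) == size_class(cache_object_size))
		return cache_object_size + 1;
	return vector_bytes;
}

int main(void)
{
	printf("%zu\n", vector_alloc_size(96, 128));	/* same class -> 129 */
	printf("%zu\n", vector_alloc_size(96, 1024));	/* different -> 96 */
	return 0;
}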
int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
gfp_t gfp, bool new_slab)
{
@@ -2103,26 +2152,26 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
unsigned long new_exts;
unsigned long old_exts;
struct slabobj_ext *vec;
+ size_t sz;
gfp &= ~OBJCGS_CLEAR_MASK;
/* Prevent recursive extension vector allocation */
gfp |= __GFP_NO_OBJ_EXT;
+ sz = obj_exts_alloc_size(s, slab, gfp);
+
/*
* Note that allow_spin may be false during early boot and its
* restricted GFP_BOOT_MASK. Due to kmalloc_nolock() only supporting
* architectures with cmpxchg16b, early obj_exts will be missing for
* very early allocations on those.
*/
- if (unlikely(!allow_spin)) {
- size_t sz = objects * sizeof(struct slabobj_ext);
-
+ if (unlikely(!allow_spin))
vec = kmalloc_nolock(sz, __GFP_ZERO | __GFP_NO_OBJ_EXT,
slab_nid(slab));
- } else {
- vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp,
- slab_nid(slab));
- }
+ else
+ vec = kmalloc_node(sz, gfp | __GFP_ZERO, slab_nid(slab));
+
if (!vec) {
/*
* Try to mark vectors which failed to allocate.
@@ -2136,6 +2185,9 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
return -ENOMEM;
}
+ VM_WARN_ON_ONCE(virt_to_slab(vec) != NULL &&
+ virt_to_slab(vec)->slab_cache == s);
+
new_exts = (unsigned long)vec;
if (unlikely(!allow_spin))
new_exts |= OBJEXTS_NOSPIN_ALLOC;
@@ -2145,6 +2197,8 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
retry:
old_exts = READ_ONCE(slab->obj_exts);
handle_failed_objexts_alloc(old_exts, vec, objects);
+ slab_set_stride(slab, sizeof(struct slabobj_ext));
+
if (new_slab) {
/*
* If the slab is brand new and nobody can yet access its
@@ -2178,7 +2232,7 @@ static inline void free_slab_obj_exts(struct slab *slab)
{
struct slabobj_ext *obj_exts;
- obj_exts = slab_obj_exts(slab);
+ obj_exts = (struct slabobj_ext *)slab_obj_exts(slab);
if (!obj_exts) {
/*
* If obj_exts allocation failed, slab->obj_exts is set to
@@ -2189,6 +2243,11 @@ static inline void free_slab_obj_exts(struct slab *slab)
return;
}
+ if (obj_exts_in_slab(slab->slab_cache, slab)) {
+ slab->obj_exts = 0;
+ return;
+ }
+
/*
* obj_exts was created with __GFP_NO_OBJ_EXT flag, therefore its
* corresponding extension will be NULL. alloc_tag_sub() will throw a
@@ -2204,6 +2263,54 @@ static inline void free_slab_obj_exts(struct slab *slab)
slab->obj_exts = 0;
}
+/*
+ * Try to place the slabobj_ext array in the slab's unused space, or inside
+ * the objects themselves (SLAB_OBJ_EXT_IN_OBJ). This must be called on a
+ * freshly allocated slab so there are no concurrent users.
+ */
+static void alloc_slab_obj_exts_early(struct kmem_cache *s, struct slab *slab)
+{
+ void *addr;
+ unsigned long obj_exts;
+
+ if (!need_slab_obj_exts(s))
+ return;
+
+ if (obj_exts_fit_within_slab_leftover(s, slab)) {
+ addr = slab_address(slab) + obj_exts_offset_in_slab(s, slab);
+ addr = kasan_reset_tag(addr);
+ obj_exts = (unsigned long)addr;
+
+ get_slab_obj_exts(obj_exts);
+ memset(addr, 0, obj_exts_size_in_slab(slab));
+ put_slab_obj_exts(obj_exts);
+
+#ifdef CONFIG_MEMCG
+ obj_exts |= MEMCG_DATA_OBJEXTS;
+#endif
+ slab->obj_exts = obj_exts;
+ slab_set_stride(slab, sizeof(struct slabobj_ext));
+ } else if (s->flags & SLAB_OBJ_EXT_IN_OBJ) {
+ unsigned int offset = obj_exts_offset_in_object(s);
+
+ obj_exts = (unsigned long)slab_address(slab);
+ obj_exts += s->red_left_pad;
+ obj_exts += offset;
+
+ get_slab_obj_exts(obj_exts);
+ for_each_object(addr, s, slab_address(slab), slab->objects)
+ memset(kasan_reset_tag(addr) + offset, 0,
+ sizeof(struct slabobj_ext));
+ put_slab_obj_exts(obj_exts);
+
+#ifdef CONFIG_MEMCG
+ obj_exts |= MEMCG_DATA_OBJEXTS;
+#endif
+ slab->obj_exts = obj_exts;
+ slab_set_stride(slab, s->size);
+ }
+}
+
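
The two placements set up above differ only in the stride used to reach the extension record of object i: a dense vector uses sizeof(struct slabobj_ext), while in-object extensions use the object size. A standalone sketch of that indexing; the struct name and fields here are illustrative, not the kernel's definitions.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for struct slabobj_ext. */
struct obj_ext {
	void *ref;
	void *objcg;
};

/*
 * Extension record of object @idx: @base points at object 0's record and
 * @stride is either sizeof(struct obj_ext) (dense vector) or the object
 * size (extension embedded in each object).
 */
static struct obj_ext *obj_ext_at(uintptr_t base, size_t stride, unsigned int idx)
{
	return (struct obj_ext *)(base + (size_t)idx * stride);
}

int main(void)
{
	struct obj_ext vec[4] = { 0 };		/* dense vector case */
	char objects[4][128] = { 0 };		/* embedded case, record at offset 64 */

	printf("%p %p\n",
	       (void *)obj_ext_at((uintptr_t)vec, sizeof(struct obj_ext), 2),
	       (void *)obj_ext_at((uintptr_t)&objects[0][64], 128, 2));
	return 0;
}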
#else /* CONFIG_SLAB_OBJ_EXT */
static inline void init_slab_obj_exts(struct slab *slab)
@@ -2220,31 +2327,37 @@ static inline void free_slab_obj_exts(struct slab *slab)
{
}
+static inline void alloc_slab_obj_exts_early(struct kmem_cache *s,
+ struct slab *slab)
+{
+}
+
#endif /* CONFIG_SLAB_OBJ_EXT */
#ifdef CONFIG_MEM_ALLOC_PROFILING
-static inline struct slabobj_ext *
-prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p)
+static inline unsigned long
+prepare_slab_obj_exts_hook(struct kmem_cache *s, struct slab *slab,
+ gfp_t flags, void *p)
{
- struct slab *slab;
-
- slab = virt_to_slab(p);
if (!slab_obj_exts(slab) &&
alloc_slab_obj_exts(slab, s, flags, false)) {
pr_warn_once("%s, %s: Failed to create slab extension vector!\n",
__func__, s->name);
- return NULL;
+ return 0;
}
- return slab_obj_exts(slab) + obj_to_index(s, slab, p);
+ return slab_obj_exts(slab);
}
+
/* Should be called only if mem_alloc_profiling_enabled() */
static noinline void
__alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
{
- struct slabobj_ext *obj_exts;
+ unsigned long obj_exts;
+ struct slabobj_ext *obj_ext;
+ struct slab *slab;
if (!object)
return;
@@ -2255,16 +2368,23 @@ __alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags)
if (flags & __GFP_NO_OBJ_EXT)
return;
- obj_exts = prepare_slab_obj_exts_hook(s, flags, object);
+ slab = virt_to_slab(object);
+ obj_exts = prepare_slab_obj_exts_hook(s, slab, flags, object);
/*
* Currently obj_exts is used only for allocation profiling.
* If other users appear then mem_alloc_profiling_enabled()
* check should be added before alloc_tag_add().
*/
- if (likely(obj_exts))
- alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size);
- else
+ if (obj_exts) {
+ unsigned int obj_idx = obj_to_index(s, slab, object);
+
+ get_slab_obj_exts(obj_exts);
+ obj_ext = slab_obj_ext(slab, obj_exts, obj_idx);
+ alloc_tag_add(&obj_ext->ref, current->alloc_tag, s->size);
+ put_slab_obj_exts(obj_exts);
+ } else {
alloc_tag_set_inaccurate(current->alloc_tag);
+ }
}
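
The per-object record used by the hook above is found via the object's index within its slab, i.e. (object address - first object address) / object size. A standalone sketch of that mapping; it is simplified, and the kernel helper additionally uses reciprocal division for speed.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Index of @obj within a slab whose first object starts at @first. */
static unsigned int obj_index(const void *first, const void *obj, size_t obj_size)
{
	return (unsigned int)(((uintptr_t)obj - (uintptr_t)first) / obj_size);
}

int main(void)
{
	static char slab[4096];
	size_t obj_size = 256;
	void *third = slab + 2 * obj_size;

	/* The allocation hook uses this index into the extension vector. */
	printf("index=%u\n", obj_index(slab, third, obj_size));
	return 0;
}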
static inline void
@@ -2279,8 +2399,8 @@ static noinline void
__alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
int objects)
{
- struct slabobj_ext *obj_exts;
int i;
+ unsigned long obj_exts;
/* slab->obj_exts might not be NULL if it was created for MEMCG accounting. */
if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
@@ -2290,11 +2410,13 @@ __alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p
if (!obj_exts)
return;
+ get_slab_obj_exts(obj_exts);
for (i = 0; i < objects; i++) {
unsigned int off = obj_to_index(s, slab, p[i]);
- alloc_tag_sub(&obj_exts[off].ref, s->size);
+ alloc_tag_sub(&slab_obj_ext(slab, obj_exts, off)->ref, s->size);
}
+ put_slab_obj_exts(obj_exts);
}
static inline void
@@ -2352,7 +2474,7 @@ static __fastpath_inline
void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
int objects)
{
- struct slabobj_ext *obj_exts;
+ unsigned long obj_exts;
if (!memcg_kmem_online())
return;
@@ -2361,13 +2483,16 @@ void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
if (likely(!obj_exts))
return;
+ get_slab_obj_exts(obj_exts);
__memcg_slab_free_hook(s, slab, p, objects, obj_exts);
+ put_slab_obj_exts(obj_exts);
}
static __fastpath_inline
bool memcg_slab_post_charge(void *p, gfp_t flags)
{
- struct slabobj_ext *slab_exts;
+ unsigned long obj_exts;
+ struct slabobj_ext *obj_ext;
struct kmem_cache *s;
struct page *page;
struct slab *slab;
@@ -2408,11 +2533,16 @@ bool memcg_slab_post_charge(void *p, gfp_t flags)
return true;
/* Ignore already charged objects. */
- slab_exts = slab_obj_exts(slab);
- if (slab_exts) {
+ obj_exts = slab_obj_exts(slab);
+ if (obj_exts) {
+ get_slab_obj_exts(obj_exts);
off = obj_to_index(s, slab, p);
- if (unlikely(slab_exts[off].objcg))
+ obj_ext = slab_obj_ext(slab, obj_exts, off);
+ if (unlikely(obj_ext->objcg)) {
+ put_slab_obj_exts(obj_exts);
return true;
+ }
+ put_slab_obj_exts(obj_exts);
}
return __memcg_slab_post_alloc_hook(s, NULL, flags, 1, &p);
@@ -2596,7 +2726,8 @@ static void *setup_object(struct kmem_cache *s, void *object)
return object;
}
-static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp)
+static struct slab_sheaf *__alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp,
+ unsigned int capacity)
{
struct slab_sheaf *sheaf;
size_t sheaf_size;
@@ -2614,7 +2745,7 @@ static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp)
if (s->flags & SLAB_KMALLOC)
gfp |= __GFP_NO_OBJ_EXT;
- sheaf_size = struct_size(sheaf, objects, s->sheaf_capacity);
+ sheaf_size = struct_size(sheaf, objects, capacity);
sheaf = kzalloc(sheaf_size, gfp);
if (unlikely(!sheaf))
@@ -2627,6 +2758,12 @@ static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp)
return sheaf;
}
+static inline struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s,
+ gfp_t gfp)
+{
+ return __alloc_empty_sheaf(s, gfp, s->sheaf_capacity);
+}
+
static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf)
{
kfree(sheaf);
@@ -2634,9 +2771,9 @@ static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf)
stat(s, SHEAF_FREE);
}
-static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
- size_t size, void **p);
-
+static unsigned int
+refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
+ unsigned int max);
static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
gfp_t gfp)
@@ -2647,8 +2784,8 @@ static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
if (!to_fill)
return 0;
- filled = __kmem_cache_alloc_bulk(s, gfp, to_fill,
- &sheaf->objects[sheaf->size]);
+ filled = refill_objects(s, &sheaf->objects[sheaf->size], gfp, to_fill,
+ to_fill);
sheaf->size += filled;
@@ -2849,12 +2986,23 @@ static void pcs_destroy(struct kmem_cache *s)
{
int cpu;
+ /*
+	 * We may be unwinding a cache creation that failed before or during
+	 * the allocation of s->cpu_sheaves.
+ */
+ if (!s->cpu_sheaves)
+ return;
+
+ /* pcs->main can only point to the bootstrap sheaf, nothing to free */
+ if (!cache_has_sheaves(s))
+ goto free_pcs;
+
for_each_possible_cpu(cpu) {
struct slub_percpu_sheaves *pcs;
pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
- /* can happen when unwinding failed create */
+ /* This can happen when unwinding failed cache creation. */
if (!pcs->main)
continue;
@@ -2876,11 +3024,13 @@ static void pcs_destroy(struct kmem_cache *s)
}
}
+free_pcs:
free_percpu(s->cpu_sheaves);
s->cpu_sheaves = NULL;
}
-static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn)
+static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn,
+ bool allow_spin)
{
struct slab_sheaf *empty = NULL;
unsigned long flags;
@@ -2888,7 +3038,10 @@ static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn)
if (!data_race(barn->nr_empty))
return NULL;
- spin_lock_irqsave(&barn->lock, flags);
+ if (likely(allow_spin))
+ spin_lock_irqsave(&barn->lock, flags);
+ else if (!spin_trylock_irqsave(&barn->lock, flags))
+ return NULL;
if (likely(barn->nr_empty)) {
empty = list_first_entry(&barn->sheaves_empty,
@@ -2965,7 +3118,8 @@ static struct slab_sheaf *barn_get_full_or_empty_sheaf(struct node_barn *barn)
* change.
*/
static struct slab_sheaf *
-barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty)
+barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty,
+ bool allow_spin)
{
struct slab_sheaf *full = NULL;
unsigned long flags;
@@ -2973,7 +3127,10 @@ barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty)
if (!data_race(barn->nr_full))
return NULL;
- spin_lock_irqsave(&barn->lock, flags);
+ if (likely(allow_spin))
+ spin_lock_irqsave(&barn->lock, flags);
+ else if (!spin_trylock_irqsave(&barn->lock, flags))
+ return NULL;
if (likely(barn->nr_full)) {
full = list_first_entry(&barn->sheaves_full, struct slab_sheaf,
@@ -2994,7 +3151,8 @@ barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty)
* barn. But if there are too many full sheaves, reject this with -E2BIG.
*/
static struct slab_sheaf *
-barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full)
+barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full,
+ bool allow_spin)
{
struct slab_sheaf *empty;
unsigned long flags;
@@ -3005,7 +3163,10 @@ barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full)
if (!data_race(barn->nr_empty))
return ERR_PTR(-ENOMEM);
- spin_lock_irqsave(&barn->lock, flags);
+ if (likely(allow_spin))
+ spin_lock_irqsave(&barn->lock, flags);
+ else if (!spin_trylock_irqsave(&barn->lock, flags))
+ return ERR_PTR(-EBUSY);
if (likely(barn->nr_empty)) {
empty = list_first_entry(&barn->sheaves_empty, struct slab_sheaf,
@@ -3198,7 +3359,9 @@ static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
static __always_inline void account_slab(struct slab *slab, int order,
struct kmem_cache *s, gfp_t gfp)
{
- if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT))
+ if (memcg_kmem_online() &&
+ (s->flags & SLAB_ACCOUNT) &&
+ !slab_obj_exts(slab))
alloc_slab_obj_exts(slab, s, gfp, true);
mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
@@ -3262,9 +3425,6 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
slab->objects = oo_objects(oo);
slab->inuse = 0;
slab->frozen = 0;
- init_slab_obj_exts(slab);
-
- account_slab(slab, oo_order(oo), s, flags);
slab->slab_cache = s;
@@ -3273,6 +3433,13 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
start = slab_address(slab);
setup_slab_debug(s, slab, start);
+ init_slab_obj_exts(slab);
+ /*
+	 * The slab has been poisoned above; initialize the slabobj_ext array
+	 * only afterwards so that poisoning does not overwrite it.
+ */
+ alloc_slab_obj_exts_early(s, slab);
+ account_slab(slab, oo_order(oo), s, flags);
shuffle = shuffle_freelist(s, slab);
@@ -3303,7 +3470,7 @@ static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node)
flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
}
-static void __free_slab(struct kmem_cache *s, struct slab *slab)
+static void __free_slab(struct kmem_cache *s, struct slab *slab, bool allow_spin)
{
struct page *page = slab_page(slab);
int order = compound_order(page);
@@ -3314,14 +3481,26 @@ static void __free_slab(struct kmem_cache *s, struct slab *slab)
__ClearPageSlab(page);
mm_account_reclaimed_pages(pages);
unaccount_slab(slab, order, s);
- free_frozen_pages(page, order);
+ if (allow_spin)
+ free_frozen_pages(page, order);
+ else
+ free_frozen_pages_nolock(page, order);
+}
+
+static void free_new_slab_nolock(struct kmem_cache *s, struct slab *slab)
+{
+ /*
+ * Since it was just allocated, we can skip the actions in
+ * discard_slab() and free_slab().
+ */
+ __free_slab(s, slab, false);
}
static void rcu_free_slab(struct rcu_head *h)
{
struct slab *slab = container_of(h, struct slab, rcu_head);
- __free_slab(slab->slab_cache, slab);
+ __free_slab(slab->slab_cache, slab, true);
}
static void free_slab(struct kmem_cache *s, struct slab *slab)
@@ -3337,7 +3516,7 @@ static void free_slab(struct kmem_cache *s, struct slab *slab)
if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU))
call_rcu(&slab->rcu_head, rcu_free_slab);
else
- __free_slab(s, slab);
+ __free_slab(s, slab, true);
}
static void discard_slab(struct kmem_cache *s, struct slab *slab)
@@ -3365,10 +3544,10 @@ static inline void slab_clear_node_partial(struct slab *slab)
* Management of partially allocated slabs.
*/
static inline void
-__add_partial(struct kmem_cache_node *n, struct slab *slab, int tail)
+__add_partial(struct kmem_cache_node *n, struct slab *slab, enum add_mode mode)
{
n->nr_partial++;
- if (tail == DEACTIVATE_TO_TAIL)
+ if (mode == ADD_TO_TAIL)
list_add_tail(&slab->slab_list, &n->partial);
else
list_add(&slab->slab_list, &n->partial);
@@ -3376,10 +3555,10 @@ __add_partial(struct kmem_cache_node *n, struct slab *slab, int tail)
}
static inline void add_partial(struct kmem_cache_node *n,
- struct slab *slab, int tail)
+ struct slab *slab, enum add_mode mode)
{
lockdep_assert_held(&n->list_lock);
- __add_partial(n, slab, tail);
+ __add_partial(n, slab, mode);
}
static inline void remove_partial(struct kmem_cache_node *n,
@@ -3430,8 +3609,6 @@ static void *alloc_single_from_partial(struct kmem_cache *s,
return object;
}
-static void defer_deactivate_slab(struct slab *slab, void *flush_freelist);
-
/*
* Called only for kmem_cache_debug() caches to allocate from a freshly
* allocated slab. Allocate a single object instead of whole freelist
@@ -3447,8 +3624,8 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab,
void *object;
if (!allow_spin && !spin_trylock_irqsave(&n->list_lock, flags)) {
- /* Unlucky, discard newly allocated slab */
- defer_deactivate_slab(slab, NULL);
+ /* Unlucky, discard newly allocated slab. */
+ free_new_slab_nolock(s, slab);
return NULL;
}
@@ -3474,7 +3651,7 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab,
if (slab->inuse == slab->objects)
add_full(s, n, slab);
else
- add_partial(n, slab, DEACTIVATE_TO_HEAD);
+ add_partial(n, slab, ADD_TO_HEAD);
inc_slabs_node(s, nid, slab->objects);
spin_unlock_irqrestore(&n->list_lock, flags);
@@ -3482,29 +3659,78 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab,
return object;
}
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain);
-#else
-static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab,
- int drain) { }
-#endif
static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);
+static bool get_partial_node_bulk(struct kmem_cache *s,
+ struct kmem_cache_node *n,
+ struct partial_bulk_context *pc,
+ bool allow_spin)
+{
+ struct slab *slab, *slab2;
+ unsigned int total_free = 0;
+ unsigned long flags;
+
+ /* Racy check to avoid taking the lock unnecessarily. */
+ if (!n || data_race(!n->nr_partial))
+ return false;
+
+ INIT_LIST_HEAD(&pc->slabs);
+
+ if (allow_spin)
+ spin_lock_irqsave(&n->list_lock, flags);
+ else if (!spin_trylock_irqsave(&n->list_lock, flags))
+ return false;
+
+ list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
+ struct freelist_counters flc;
+ unsigned int slab_free;
+
+ if (!pfmemalloc_match(slab, pc->flags))
+ continue;
+
+ /*
+ * determine the number of free objects in the slab racily
+ *
+ * slab_free is a lower bound due to possible subsequent
+ * concurrent freeing, so the caller may get more objects than
+ * requested and must handle that
+ */
+ flc.counters = data_race(READ_ONCE(slab->counters));
+ slab_free = flc.objects - flc.inuse;
+
+ /* we have already min and this would get us over the max */
+ if (total_free >= pc->min_objects
+ && total_free + slab_free > pc->max_objects)
+ break;
+
+ remove_partial(n, slab);
+
+ list_add(&slab->slab_list, &pc->slabs);
+
+ total_free += slab_free;
+ if (total_free >= pc->max_objects)
+ break;
+ }
+
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ return total_free > 0;
+}
+
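
The accumulation rule above (keep taking slabs until at least min free objects are gathered, without taking one more slab that would overshoot max) can be sketched standalone. The packed counters layout here is illustrative only; the kernel's struct freelist_counters differs.

#include <stdio.h>

/* Illustrative packed counters; field widths here are made up. */
union counters {
	unsigned long word;
	struct {
		unsigned inuse   : 16;
		unsigned objects : 16;
		unsigned frozen  : 1;
	};
};

/* Accumulate slabs until >= min free objects, without far exceeding max. */
static unsigned int take_slabs(const union counters *slabs, unsigned int nr,
			       unsigned int min, unsigned int max,
			       unsigned int *taken)
{
	unsigned int total = 0, i;

	for (i = 0; i < nr; i++) {
		unsigned int free = slabs[i].objects - slabs[i].inuse;

		if (total >= min && total + free > max)
			break;
		total += free;
		(*taken)++;
		if (total >= max)
			break;
	}
	return total;
}

int main(void)
{
	union counters slabs[3] = {
		{ .objects = 32, .inuse = 30 },	/*  2 free */
		{ .objects = 32, .inuse = 16 },	/* 16 free */
		{ .objects = 32, .inuse = 0 },	/* 32 free */
	};
	unsigned int taken = 0;
	unsigned int total = take_slabs(slabs, 3, 4, 20, &taken);

	printf("taken=%u total_free=%u\n", taken, total);
	return 0;
}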
/*
- * Try to allocate a partial slab from a specific node.
+ * Try to allocate an object from a partial slab on a specific node.
*/
-static struct slab *get_partial_node(struct kmem_cache *s,
- struct kmem_cache_node *n,
- struct partial_context *pc)
+static void *get_from_partial_node(struct kmem_cache *s,
+ struct kmem_cache_node *n,
+ struct partial_context *pc)
{
- struct slab *slab, *slab2, *partial = NULL;
+ struct slab *slab, *slab2;
unsigned long flags;
- unsigned int partial_slabs = 0;
+ void *object = NULL;
/*
* Racy check. If we mistakenly see no partial slabs then we
* just allocate an empty slab. If we mistakenly try to get a
- * partial slab and there is none available then get_partial()
+ * partial slab and there is none available then get_from_partial()
* will return NULL.
*/
if (!n || !n->nr_partial)
@@ -3515,54 +3741,55 @@ static struct slab *get_partial_node(struct kmem_cache *s,
else if (!spin_trylock_irqsave(&n->list_lock, flags))
return NULL;
list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
+
+ struct freelist_counters old, new;
+
if (!pfmemalloc_match(slab, pc->flags))
continue;
if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
- void *object = alloc_single_from_partial(s, n, slab,
+ object = alloc_single_from_partial(s, n, slab,
pc->orig_size);
- if (object) {
- partial = slab;
- pc->object = object;
+ if (object)
break;
- }
continue;
}
- remove_partial(n, slab);
+ /*
+		 * Get a single object from the slab. This might race against
+		 * __slab_free(), which, however, must take the list_lock if
+		 * it is about to make the slab fully free.
+ */
+ do {
+ old.freelist = slab->freelist;
+ old.counters = slab->counters;
- if (!partial) {
- partial = slab;
- stat(s, ALLOC_FROM_PARTIAL);
+ new.freelist = get_freepointer(s, old.freelist);
+ new.counters = old.counters;
+ new.inuse++;
- if ((slub_get_cpu_partial(s) == 0)) {
- break;
- }
- } else {
- put_cpu_partial(s, slab, 0);
- stat(s, CPU_PARTIAL_NODE);
+ } while (!__slab_update_freelist(s, slab, &old, &new, "get_from_partial_node"));
- if (++partial_slabs > slub_get_cpu_partial(s) / 2) {
- break;
- }
- }
+ object = old.freelist;
+ if (!new.freelist)
+ remove_partial(n, slab);
+
+ break;
}
spin_unlock_irqrestore(&n->list_lock, flags);
- return partial;
+ return object;
}
/*
- * Get a slab from somewhere. Search in increasing NUMA distances.
+ * Get an object from somewhere. Search in increasing NUMA distances.
*/
-static struct slab *get_any_partial(struct kmem_cache *s,
- struct partial_context *pc)
+static void *get_from_any_partial(struct kmem_cache *s, struct partial_context *pc)
{
#ifdef CONFIG_NUMA
struct zonelist *zonelist;
struct zoneref *z;
struct zone *zone;
enum zone_type highest_zoneidx = gfp_zone(pc->flags);
- struct slab *slab;
unsigned int cpuset_mems_cookie;
/*
@@ -3597,8 +3824,10 @@ static struct slab *get_any_partial(struct kmem_cache *s,
if (n && cpuset_zone_allowed(zone, pc->flags) &&
n->nr_partial > s->min_partial) {
- slab = get_partial_node(s, n, pc);
- if (slab) {
+
+ void *object = get_from_partial_node(s, n, pc);
+
+ if (object) {
/*
* Don't check read_mems_allowed_retry()
* here - if mems_allowed was updated in
@@ -3606,7 +3835,7 @@ static struct slab *get_any_partial(struct kmem_cache *s,
* between allocation and the cpuset
* update
*/
- return slab;
+ return object;
}
}
}
@@ -3616,424 +3845,29 @@ static struct slab *get_any_partial(struct kmem_cache *s,
}
/*
- * Get a partial slab, lock it and return it.
+ * Get an object from a partial slab.
*/
-static struct slab *get_partial(struct kmem_cache *s, int node,
- struct partial_context *pc)
+static void *get_from_partial(struct kmem_cache *s, int node,
+ struct partial_context *pc)
{
- struct slab *slab;
int searchnode = node;
+ void *object;
if (node == NUMA_NO_NODE)
searchnode = numa_mem_id();
- slab = get_partial_node(s, get_node(s, searchnode), pc);
- if (slab || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE)))
- return slab;
-
- return get_any_partial(s, pc);
-}
-
-#ifdef CONFIG_PREEMPTION
-/*
- * Calculate the next globally unique transaction for disambiguation
- * during cmpxchg. The transactions start with the cpu number and are then
- * incremented by CONFIG_NR_CPUS.
- */
-#define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS)
-#else
-/*
- * No preemption supported therefore also no need to check for
- * different cpus.
- */
-#define TID_STEP 1
-#endif /* CONFIG_PREEMPTION */
-
-static inline unsigned long next_tid(unsigned long tid)
-{
- return tid + TID_STEP;
-}
-
-#ifdef SLUB_DEBUG_CMPXCHG
-static inline unsigned int tid_to_cpu(unsigned long tid)
-{
- return tid % TID_STEP;
-}
-
-static inline unsigned long tid_to_event(unsigned long tid)
-{
- return tid / TID_STEP;
-}
-#endif
-
-static inline unsigned int init_tid(int cpu)
-{
- return cpu;
-}
-
-static inline void note_cmpxchg_failure(const char *n,
- const struct kmem_cache *s, unsigned long tid)
-{
-#ifdef SLUB_DEBUG_CMPXCHG
- unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
-
- pr_info("%s %s: cmpxchg redo ", n, s->name);
-
- if (IS_ENABLED(CONFIG_PREEMPTION) &&
- tid_to_cpu(tid) != tid_to_cpu(actual_tid)) {
- pr_warn("due to cpu change %d -> %d\n",
- tid_to_cpu(tid), tid_to_cpu(actual_tid));
- } else if (tid_to_event(tid) != tid_to_event(actual_tid)) {
- pr_warn("due to cpu running other code. Event %ld->%ld\n",
- tid_to_event(tid), tid_to_event(actual_tid));
- } else {
- pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n",
- actual_tid, tid, next_tid(tid));
- }
-#endif
- stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
-}
-
-static void init_kmem_cache_cpus(struct kmem_cache *s)
-{
-#ifdef CONFIG_PREEMPT_RT
- /*
- * Register lockdep key for non-boot kmem caches to avoid
- * WARN_ON_ONCE(static_obj(key))) in lockdep_register_key()
- */
- bool finegrain_lockdep = !init_section_contains(s, 1);
-#else
- /*
- * Don't bother with different lockdep classes for each
- * kmem_cache, since we only use local_trylock_irqsave().
- */
- bool finegrain_lockdep = false;
-#endif
- int cpu;
- struct kmem_cache_cpu *c;
-
- if (finegrain_lockdep)
- lockdep_register_key(&s->lock_key);
- for_each_possible_cpu(cpu) {
- c = per_cpu_ptr(s->cpu_slab, cpu);
- local_trylock_init(&c->lock);
- if (finegrain_lockdep)
- lockdep_set_class(&c->lock, &s->lock_key);
- c->tid = init_tid(cpu);
- }
-}
-
-/*
- * Finishes removing the cpu slab. Merges cpu's freelist with slab's freelist,
- * unfreezes the slabs and puts it on the proper list.
- * Assumes the slab has been already safely taken away from kmem_cache_cpu
- * by the caller.
- */
-static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
- void *freelist)
-{
- struct kmem_cache_node *n = get_node(s, slab_nid(slab));
- int free_delta = 0;
- void *nextfree, *freelist_iter, *freelist_tail;
- int tail = DEACTIVATE_TO_HEAD;
- unsigned long flags = 0;
- struct freelist_counters old, new;
-
- if (READ_ONCE(slab->freelist)) {
- stat(s, DEACTIVATE_REMOTE_FREES);
- tail = DEACTIVATE_TO_TAIL;
- }
-
- /*
- * Stage one: Count the objects on cpu's freelist as free_delta and
- * remember the last object in freelist_tail for later splicing.
- */
- freelist_tail = NULL;
- freelist_iter = freelist;
- while (freelist_iter) {
- nextfree = get_freepointer(s, freelist_iter);
-
- /*
- * If 'nextfree' is invalid, it is possible that the object at
- * 'freelist_iter' is already corrupted. So isolate all objects
- * starting at 'freelist_iter' by skipping them.
- */
- if (freelist_corrupted(s, slab, &freelist_iter, nextfree))
- break;
-
- freelist_tail = freelist_iter;
- free_delta++;
-
- freelist_iter = nextfree;
- }
-
- /*
- * Stage two: Unfreeze the slab while splicing the per-cpu
- * freelist to the head of slab's freelist.
- */
- do {
- old.freelist = READ_ONCE(slab->freelist);
- old.counters = READ_ONCE(slab->counters);
- VM_BUG_ON(!old.frozen);
-
- /* Determine target state of the slab */
- new.counters = old.counters;
- new.frozen = 0;
- if (freelist_tail) {
- new.inuse -= free_delta;
- set_freepointer(s, freelist_tail, old.freelist);
- new.freelist = freelist;
- } else {
- new.freelist = old.freelist;
- }
- } while (!slab_update_freelist(s, slab, &old, &new, "unfreezing slab"));
-
- /*
- * Stage three: Manipulate the slab list based on the updated state.
- */
- if (!new.inuse && n->nr_partial >= s->min_partial) {
- stat(s, DEACTIVATE_EMPTY);
- discard_slab(s, slab);
- stat(s, FREE_SLAB);
- } else if (new.freelist) {
- spin_lock_irqsave(&n->list_lock, flags);
- add_partial(n, slab, tail);
- spin_unlock_irqrestore(&n->list_lock, flags);
- stat(s, tail);
- } else {
- stat(s, DEACTIVATE_FULL);
- }
-}
-
-/*
- * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock
- * can be acquired without a deadlock before invoking the function.
- *
- * Without LOCKDEP we trust the code to be correct. kmalloc_nolock() is
- * using local_lock_is_locked() properly before calling local_lock_cpu_slab(),
- * and kmalloc() is not used in an unsupported context.
- *
- * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave().
- * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but
- * lockdep_assert() will catch a bug in case:
- * #1
- * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock()
- * or
- * #2
- * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock()
- *
- * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt
- * disabled context. The lock will always be acquired and if needed it
- * block and sleep until the lock is available.
- * #1 is possible in !PREEMPT_RT only.
- * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock:
- * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) ->
- * tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B)
- *
- * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B
- */
-#if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP)
-#define local_lock_cpu_slab(s, flags) \
- local_lock_irqsave(&(s)->cpu_slab->lock, flags)
-#else
-#define local_lock_cpu_slab(s, flags) \
- do { \
- bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \
- lockdep_assert(__l); \
- } while (0)
-#endif
-
-#define local_unlock_cpu_slab(s, flags) \
- local_unlock_irqrestore(&(s)->cpu_slab->lock, flags)
-
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-static void __put_partials(struct kmem_cache *s, struct slab *partial_slab)
-{
- struct kmem_cache_node *n = NULL, *n2 = NULL;
- struct slab *slab, *slab_to_discard = NULL;
- unsigned long flags = 0;
-
- while (partial_slab) {
- slab = partial_slab;
- partial_slab = slab->next;
-
- n2 = get_node(s, slab_nid(slab));
- if (n != n2) {
- if (n)
- spin_unlock_irqrestore(&n->list_lock, flags);
-
- n = n2;
- spin_lock_irqsave(&n->list_lock, flags);
- }
-
- if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) {
- slab->next = slab_to_discard;
- slab_to_discard = slab;
- } else {
- add_partial(n, slab, DEACTIVATE_TO_TAIL);
- stat(s, FREE_ADD_PARTIAL);
- }
- }
-
- if (n)
- spin_unlock_irqrestore(&n->list_lock, flags);
+ object = get_from_partial_node(s, get_node(s, searchnode), pc);
+ if (object || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE)))
+ return object;
- while (slab_to_discard) {
- slab = slab_to_discard;
- slab_to_discard = slab_to_discard->next;
-
- stat(s, DEACTIVATE_EMPTY);
- discard_slab(s, slab);
- stat(s, FREE_SLAB);
- }
-}
-
-/*
- * Put all the cpu partial slabs to the node partial list.
- */
-static void put_partials(struct kmem_cache *s)
-{
- struct slab *partial_slab;
- unsigned long flags;
-
- local_lock_irqsave(&s->cpu_slab->lock, flags);
- partial_slab = this_cpu_read(s->cpu_slab->partial);
- this_cpu_write(s->cpu_slab->partial, NULL);
- local_unlock_irqrestore(&s->cpu_slab->lock, flags);
-
- if (partial_slab)
- __put_partials(s, partial_slab);
-}
-
-static void put_partials_cpu(struct kmem_cache *s,
- struct kmem_cache_cpu *c)
-{
- struct slab *partial_slab;
-
- partial_slab = slub_percpu_partial(c);
- c->partial = NULL;
-
- if (partial_slab)
- __put_partials(s, partial_slab);
-}
-
-/*
- * Put a slab into a partial slab slot if available.
- *
- * If we did not find a slot then simply move all the partials to the
- * per node partial list.
- */
-static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain)
-{
- struct slab *oldslab;
- struct slab *slab_to_put = NULL;
- unsigned long flags;
- int slabs = 0;
-
- local_lock_cpu_slab(s, flags);
-
- oldslab = this_cpu_read(s->cpu_slab->partial);
-
- if (oldslab) {
- if (drain && oldslab->slabs >= s->cpu_partial_slabs) {
- /*
- * Partial array is full. Move the existing set to the
- * per node partial list. Postpone the actual unfreezing
- * outside of the critical section.
- */
- slab_to_put = oldslab;
- oldslab = NULL;
- } else {
- slabs = oldslab->slabs;
- }
- }
-
- slabs++;
-
- slab->slabs = slabs;
- slab->next = oldslab;
-
- this_cpu_write(s->cpu_slab->partial, slab);
-
- local_unlock_cpu_slab(s, flags);
-
- if (slab_to_put) {
- __put_partials(s, slab_to_put);
- stat(s, CPU_PARTIAL_DRAIN);
- }
-}
-
-#else /* CONFIG_SLUB_CPU_PARTIAL */
-
-static inline void put_partials(struct kmem_cache *s) { }
-static inline void put_partials_cpu(struct kmem_cache *s,
- struct kmem_cache_cpu *c) { }
-
-#endif /* CONFIG_SLUB_CPU_PARTIAL */
-
-static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
-{
- unsigned long flags;
- struct slab *slab;
- void *freelist;
-
- local_lock_irqsave(&s->cpu_slab->lock, flags);
-
- slab = c->slab;
- freelist = c->freelist;
-
- c->slab = NULL;
- c->freelist = NULL;
- c->tid = next_tid(c->tid);
-
- local_unlock_irqrestore(&s->cpu_slab->lock, flags);
-
- if (slab) {
- deactivate_slab(s, slab, freelist);
- stat(s, CPUSLAB_FLUSH);
- }
-}
-
-static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
-{
- struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
- void *freelist = c->freelist;
- struct slab *slab = c->slab;
-
- c->slab = NULL;
- c->freelist = NULL;
- c->tid = next_tid(c->tid);
-
- if (slab) {
- deactivate_slab(s, slab, freelist);
- stat(s, CPUSLAB_FLUSH);
- }
-
- put_partials_cpu(s, c);
-}
-
-static inline void flush_this_cpu_slab(struct kmem_cache *s)
-{
- struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
-
- if (c->slab)
- flush_slab(s, c);
-
- put_partials(s);
-}
-
-static bool has_cpu_slab(int cpu, struct kmem_cache *s)
-{
- struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
-
- return c->slab || slub_percpu_partial(c);
+ return get_from_any_partial(s, pc);
}
static bool has_pcs_used(int cpu, struct kmem_cache *s)
{
struct slub_percpu_sheaves *pcs;
- if (!s->cpu_sheaves)
+ if (!cache_has_sheaves(s))
return false;
pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
@@ -4042,11 +3876,11 @@ static bool has_pcs_used(int cpu, struct kmem_cache *s)
}
/*
- * Flush cpu slab.
+ * Flush percpu sheaves
*
* Called from CPU work handler with migration disabled.
*/
-static void flush_cpu_slab(struct work_struct *w)
+static void flush_cpu_sheaves(struct work_struct *w)
{
struct kmem_cache *s;
struct slub_flush_work *sfw;
@@ -4055,10 +3889,8 @@ static void flush_cpu_slab(struct work_struct *w)
s = sfw->s;
- if (s->cpu_sheaves)
+ if (cache_has_sheaves(s))
pcs_flush_all(s);
-
- flush_this_cpu_slab(s);
}
static void flush_all_cpus_locked(struct kmem_cache *s)
@@ -4071,11 +3903,11 @@ static void flush_all_cpus_locked(struct kmem_cache *s)
for_each_online_cpu(cpu) {
sfw = &per_cpu(slub_flush, cpu);
- if (!has_cpu_slab(cpu, s) && !has_pcs_used(cpu, s)) {
+ if (!has_pcs_used(cpu, s)) {
sfw->skip = true;
continue;
}
- INIT_WORK(&sfw->work, flush_cpu_slab);
+ INIT_WORK(&sfw->work, flush_cpu_sheaves);
sfw->skip = false;
sfw->s = s;
queue_work_on(cpu, flushwq, &sfw->work);
@@ -4160,7 +3992,7 @@ void flush_all_rcu_sheaves(void)
mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list) {
- if (!s->cpu_sheaves)
+ if (!cache_has_sheaves(s))
continue;
flush_rcu_sheaves_on_cache(s);
}
@@ -4181,27 +4013,13 @@ static int slub_cpu_dead(unsigned int cpu)
mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list) {
- __flush_cpu_slab(s, cpu);
- if (s->cpu_sheaves)
+ if (cache_has_sheaves(s))
__pcs_flush_all_cpu(s, cpu);
}
mutex_unlock(&slab_mutex);
return 0;
}
-/*
- * Check if the objects in a per cpu structure fit numa
- * locality expectations.
- */
-static inline int node_match(struct slab *slab, int node)
-{
-#ifdef CONFIG_NUMA
- if (node != NUMA_NO_NODE && slab_nid(slab) != node)
- return 0;
-#endif
- return 1;
-}
-
#ifdef CONFIG_SLUB_DEBUG
static int count_free(struct slab *slab)
{
@@ -4374,235 +4192,116 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags)
return true;
}
-static inline bool
-__update_cpu_freelist_fast(struct kmem_cache *s,
- void *freelist_old, void *freelist_new,
- unsigned long tid)
-{
- struct freelist_tid old = { .freelist = freelist_old, .tid = tid };
- struct freelist_tid new = { .freelist = freelist_new, .tid = next_tid(tid) };
-
- return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid,
- &old.freelist_tid, new.freelist_tid);
-}
-
/*
- * Check the slab->freelist and either transfer the freelist to the
- * per cpu freelist or deactivate the slab.
+ * Get the slab's freelist and do not freeze it.
*
- * The slab is still frozen if the return value is not NULL.
+ * Assumes the slab is isolated from the node partial list and not frozen.
*
- * If this function returns NULL then the slab has been unfrozen.
+ * Assumes this is performed only for caches without debugging so we
+ * don't need to worry about adding the slab to the full list.
*/
-static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
+static inline void *get_freelist_nofreeze(struct kmem_cache *s, struct slab *slab)
{
struct freelist_counters old, new;
- lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
-
do {
old.freelist = slab->freelist;
old.counters = slab->counters;
new.freelist = NULL;
new.counters = old.counters;
+ VM_WARN_ON_ONCE(new.frozen);
new.inuse = old.objects;
- new.frozen = old.freelist != NULL;
-
- } while (!__slab_update_freelist(s, slab, &old, &new, "get_freelist"));
+ } while (!slab_update_freelist(s, slab, &old, &new, "get_freelist_nofreeze"));
return old.freelist;
}
/*
- * Freeze the partial slab and return the pointer to the freelist.
+ * If the object has been wiped upon free, make sure it's fully initialized by
+ * zeroing out the freelist pointer.
+ *
+ * Note that we also wipe custom freelist pointers.
*/
-static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab)
+static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
+ void *obj)
{
- struct freelist_counters old, new;
-
- do {
- old.freelist = slab->freelist;
- old.counters = slab->counters;
-
- new.freelist = NULL;
- new.counters = old.counters;
- VM_BUG_ON(new.frozen);
-
- new.inuse = old.objects;
- new.frozen = 1;
-
- } while (!slab_update_freelist(s, slab, &old, &new, "freeze_slab"));
-
- return old.freelist;
+ if (unlikely(slab_want_init_on_free(s)) && obj &&
+ !freeptr_outside_object(s))
+ memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
+ 0, sizeof(void *));
}
-/*
- * Slow path. The lockless freelist is empty or we need to perform
- * debugging duties.
- *
- * Processing is still very fast if new objects have been freed to the
- * regular freelist. In that case we simply take over the regular freelist
- * as the lockless freelist and zap the regular freelist.
- *
- * If that is not working then we fall back to the partial lists. We take the
- * first element of the freelist as the object to allocate now and move the
- * rest of the freelist to the lockless freelist.
- *
- * And if we were unable to get a new slab from the partial slab lists then
- * we need to allocate a new slab. This is the slowest path since it involves
- * a call to the page allocator and the setup of a new slab.
- *
- * Version of __slab_alloc to use when we know that preemption is
- * already disabled (which is the case for bulk allocation).
- */
-static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
- unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
+static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab,
+ void **p, unsigned int count, bool allow_spin)
{
- bool allow_spin = gfpflags_allow_spinning(gfpflags);
- void *freelist;
- struct slab *slab;
+ unsigned int allocated = 0;
+ struct kmem_cache_node *n;
+ bool needs_add_partial;
unsigned long flags;
- struct partial_context pc;
- bool try_thisnode = true;
-
- stat(s, ALLOC_SLOWPATH);
-
-reread_slab:
-
- slab = READ_ONCE(c->slab);
- if (!slab) {
- /*
- * if the node is not online or has no normal memory, just
- * ignore the node constraint
- */
- if (unlikely(node != NUMA_NO_NODE &&
- !node_isset(node, slab_nodes)))
- node = NUMA_NO_NODE;
- goto new_slab;
- }
-
- if (unlikely(!node_match(slab, node))) {
- /*
- * same as above but node_match() being false already
- * implies node != NUMA_NO_NODE.
- *
- * We don't strictly honor pfmemalloc and NUMA preferences
- * when !allow_spin because:
- *
- * 1. Most kmalloc() users allocate objects on the local node,
- * so kmalloc_nolock() tries not to interfere with them by
- * deactivating the cpu slab.
- *
- * 2. Deactivating due to NUMA or pfmemalloc mismatch may cause
- * unnecessary slab allocations even when n->partial list
- * is not empty.
- */
- if (!node_isset(node, slab_nodes) ||
- !allow_spin) {
- node = NUMA_NO_NODE;
- } else {
- stat(s, ALLOC_NODE_MISMATCH);
- goto deactivate_slab;
- }
- }
+ void *object;
/*
- * By rights, we should be searching for a slab page that was
- * PFMEMALLOC but right now, we are losing the pfmemalloc
- * information when the page leaves the per-cpu allocator
+ * Are we going to put the slab on the partial list?
+ * Note slab->inuse is 0 on a new slab.
*/
- if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin))
- goto deactivate_slab;
-
- /* must check again c->slab in case we got preempted and it changed */
- local_lock_cpu_slab(s, flags);
+ needs_add_partial = (slab->objects > count);
- if (unlikely(slab != c->slab)) {
- local_unlock_cpu_slab(s, flags);
- goto reread_slab;
- }
- freelist = c->freelist;
- if (freelist)
- goto load_freelist;
+ if (!allow_spin && needs_add_partial) {
- freelist = get_freelist(s, slab);
+ n = get_node(s, slab_nid(slab));
- if (!freelist) {
- c->slab = NULL;
- c->tid = next_tid(c->tid);
- local_unlock_cpu_slab(s, flags);
- stat(s, DEACTIVATE_BYPASS);
- goto new_slab;
+ if (!spin_trylock_irqsave(&n->list_lock, flags)) {
+ /* Unlucky, discard newly allocated slab */
+ free_new_slab_nolock(s, slab);
+ return 0;
+ }
}
- stat(s, ALLOC_REFILL);
-
-load_freelist:
-
- lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
+ object = slab->freelist;
+ while (object && allocated < count) {
+ p[allocated] = object;
+ object = get_freepointer(s, object);
+ maybe_wipe_obj_freeptr(s, p[allocated]);
- /*
- * freelist is pointing to the list of objects to be used.
- * slab is pointing to the slab from which the objects are obtained.
- * That slab must be frozen for per cpu allocations to work.
- */
- VM_BUG_ON(!c->slab->frozen);
- c->freelist = get_freepointer(s, freelist);
- c->tid = next_tid(c->tid);
- local_unlock_cpu_slab(s, flags);
- return freelist;
-
-deactivate_slab:
-
- local_lock_cpu_slab(s, flags);
- if (slab != c->slab) {
- local_unlock_cpu_slab(s, flags);
- goto reread_slab;
- }
- freelist = c->freelist;
- c->slab = NULL;
- c->freelist = NULL;
- c->tid = next_tid(c->tid);
- local_unlock_cpu_slab(s, flags);
- deactivate_slab(s, slab, freelist);
+ slab->inuse++;
+ allocated++;
+ }
+ slab->freelist = object;
-new_slab:
+ if (needs_add_partial) {
-#ifdef CONFIG_SLUB_CPU_PARTIAL
- while (slub_percpu_partial(c)) {
- local_lock_cpu_slab(s, flags);
- if (unlikely(c->slab)) {
- local_unlock_cpu_slab(s, flags);
- goto reread_slab;
- }
- if (unlikely(!slub_percpu_partial(c))) {
- local_unlock_cpu_slab(s, flags);
- /* we were preempted and partial list got empty */
- goto new_objects;
+ if (allow_spin) {
+ n = get_node(s, slab_nid(slab));
+ spin_lock_irqsave(&n->list_lock, flags);
}
+ add_partial(n, slab, ADD_TO_HEAD);
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ }
- slab = slub_percpu_partial(c);
- slub_set_percpu_partial(c, slab);
-
- if (likely(node_match(slab, node) &&
- pfmemalloc_match(slab, gfpflags)) ||
- !allow_spin) {
- c->slab = slab;
- freelist = get_freelist(s, slab);
- VM_BUG_ON(!freelist);
- stat(s, CPU_PARTIAL_ALLOC);
- goto load_freelist;
- }
+ inc_slabs_node(s, slab_nid(slab), slab->objects);
+ return allocated;
+}
- local_unlock_cpu_slab(s, flags);
+/*
+ * Slow path. We failed to allocate via percpu sheaves, or they are not
+ * available because of bootstrap, enabled debugging, or SLUB_TINY.
+ *
+ * We try to allocate from the partial slab lists and fall back to allocating
+ * a new slab.
+ */
+static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
+ unsigned long addr, unsigned int orig_size)
+{
+ bool allow_spin = gfpflags_allow_spinning(gfpflags);
+ void *object;
+ struct slab *slab;
+ struct partial_context pc;
+ bool try_thisnode = true;
- slab->next = NULL;
- __put_partials(s, slab);
- }
-#endif
+ stat(s, ALLOC_SLOWPATH);
new_objects:
@@ -4611,12 +4310,12 @@ new_objects:
* When a preferred node is indicated but no __GFP_THISNODE
*
* 1) try to get a partial slab from target node only by having
- * __GFP_THISNODE in pc.flags for get_partial()
+ * __GFP_THISNODE in pc.flags for get_from_partial()
* 2) if 1) failed, try to allocate a new slab from target node with
 *    GFP_NOWAIT | __GFP_THISNODE opportunistically
* 3) if 2) failed, retry with original gfpflags which will allow
- * get_partial() try partial lists of other nodes before potentially
- * allocating new page from other nodes
+ * get_from_partial() try partial lists of other nodes before
+ * potentially allocating new page from other nodes
*/
if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE)
&& try_thisnode)) {
@@ -4628,33 +4327,11 @@ new_objects:
}
pc.orig_size = orig_size;
- slab = get_partial(s, node, &pc);
- if (slab) {
- if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
- freelist = pc.object;
- /*
- * For debug caches here we had to go through
- * alloc_single_from_partial() so just store the
- * tracking info and return the object.
- *
- * Due to disabled preemption we need to disallow
- * blocking. The flags are further adjusted by
- * gfp_nested_mask() in stack_depot itself.
- */
- if (s->flags & SLAB_STORE_USER)
- set_track(s, freelist, TRACK_ALLOC, addr,
- gfpflags & ~(__GFP_DIRECT_RECLAIM));
-
- return freelist;
- }
-
- freelist = freeze_slab(s, slab);
- goto retry_load_slab;
- }
+ object = get_from_partial(s, node, &pc);
+ if (object)
+ goto success;
- slub_put_cpu_ptr(s->cpu_slab);
slab = new_slab(s, pc.flags, node);
- c = slub_get_cpu_ptr(s->cpu_slab);
if (unlikely(!slab)) {
if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE)
@@ -4669,165 +4346,36 @@ new_objects:
stat(s, ALLOC_SLAB);
if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
- freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
-
- if (unlikely(!freelist)) {
- /* This could cause an endless loop. Fail instead. */
- if (!allow_spin)
- return NULL;
- goto new_objects;
- }
-
- if (s->flags & SLAB_STORE_USER)
- set_track(s, freelist, TRACK_ALLOC, addr,
- gfpflags & ~(__GFP_DIRECT_RECLAIM));
-
- return freelist;
- }
-
- /*
- * No other reference to the slab yet so we can
- * muck around with it freely without cmpxchg
- */
- freelist = slab->freelist;
- slab->freelist = NULL;
- slab->inuse = slab->objects;
- slab->frozen = 1;
+ object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
- inc_slabs_node(s, slab_nid(slab), slab->objects);
+ if (likely(object))
+ goto success;
+ } else {
+ alloc_from_new_slab(s, slab, &object, 1, allow_spin);
- if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) {
- /*
- * For !pfmemalloc_match() case we don't load freelist so that
- * we don't make further mismatched allocations easier.
- */
- deactivate_slab(s, slab, get_freepointer(s, freelist));
- return freelist;
+		/* No need to check SLAB_STORE_USER here, debug caches are handled above. */
+ if (likely(object))
+ return object;
}
-retry_load_slab:
-
- local_lock_cpu_slab(s, flags);
- if (unlikely(c->slab)) {
- void *flush_freelist = c->freelist;
- struct slab *flush_slab = c->slab;
-
- c->slab = NULL;
- c->freelist = NULL;
- c->tid = next_tid(c->tid);
-
- local_unlock_cpu_slab(s, flags);
-
- if (unlikely(!allow_spin)) {
- /* Reentrant slub cannot take locks, defer */
- defer_deactivate_slab(flush_slab, flush_freelist);
- } else {
- deactivate_slab(s, flush_slab, flush_freelist);
- }
-
- stat(s, CPUSLAB_FLUSH);
-
- goto retry_load_slab;
- }
- c->slab = slab;
+ if (allow_spin)
+ goto new_objects;
- goto load_freelist;
-}
-/*
- * We disallow kprobes in ___slab_alloc() to prevent reentrance
- *
- * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of
- * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf ->
- * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast()
- * manipulating c->freelist without lock.
- *
- * This does not prevent kprobe in functions called from ___slab_alloc() such as
- * local_lock_irqsave() itself, and that is fine, we only need to protect the
- * c->freelist manipulation in ___slab_alloc() itself.
- */
-NOKPROBE_SYMBOL(___slab_alloc);
+ /* This could cause an endless loop. Fail instead. */
+ return NULL;
-/*
- * A wrapper for ___slab_alloc() for contexts where preemption is not yet
- * disabled. Compensates for possible cpu changes by refetching the per cpu area
- * pointer.
- */
-static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
- unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
-{
- void *p;
+success:
+ if (kmem_cache_debug_flags(s, SLAB_STORE_USER))
+ set_track(s, object, TRACK_ALLOC, addr, gfpflags);
-#ifdef CONFIG_PREEMPT_COUNT
- /*
- * We may have been preempted and rescheduled on a different
- * cpu before disabling preemption. Need to reload cpu area
- * pointer.
- */
- c = slub_get_cpu_ptr(s->cpu_slab);
-#endif
- if (unlikely(!gfpflags_allow_spinning(gfpflags))) {
- if (local_lock_is_locked(&s->cpu_slab->lock)) {
- /*
- * EBUSY is an internal signal to kmalloc_nolock() to
- * retry a different bucket. It's not propagated
- * to the caller.
- */
- p = ERR_PTR(-EBUSY);
- goto out;
- }
- }
- p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);
-out:
-#ifdef CONFIG_PREEMPT_COUNT
- slub_put_cpu_ptr(s->cpu_slab);
-#endif
- return p;
+ return object;
}
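
The node-preference retry described in the comment near the start of the slow path (first a non-blocking, node-constrained attempt, then the caller's original flags) can be sketched as follows. The flag names, values, and allocator are stand-ins, not the kernel API.

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for GFP_NOWAIT and __GFP_THISNODE; values are made up. */
#define F_NOWAIT	0x1u
#define F_THISNODE	0x2u

/* Toy allocator: pretend the preferred node has no memory available. */
static void *try_alloc(unsigned int flags, int node)
{
	(void)node;
	if (flags & F_THISNODE)
		return NULL;			/* node-local attempt fails */
	return (void *)&try_alloc;		/* any-node attempt succeeds */
}

/* Pass 1: target node only, non-blocking. Pass 2: caller's original flags. */
static void *alloc_pref_node(unsigned int caller_flags, int node)
{
	bool try_thisnode = true;
	void *obj;

retry:
	obj = try_alloc(try_thisnode ? (F_NOWAIT | F_THISNODE) : caller_flags,
			node);
	if (!obj && try_thisnode) {
		try_thisnode = false;
		goto retry;
	}
	return obj;
}

int main(void)
{
	printf("%s\n", alloc_pref_node(0, 1) ? "allocated" : "failed");
	return 0;
}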
static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
{
- struct kmem_cache_cpu *c;
- struct slab *slab;
- unsigned long tid;
void *object;
-redo:
- /*
- * Must read kmem_cache cpu data via this cpu ptr. Preemption is
- * enabled. We may switch back and forth between cpus while
- * reading from one cpu area. That does not matter as long
- * as we end up on the original cpu again when doing the cmpxchg.
- *
- * We must guarantee that tid and kmem_cache_cpu are retrieved on the
- * same cpu. We read first the kmem_cache_cpu pointer and use it to read
- * the tid. If we are preempted and switched to another cpu between the
- * two reads, it's OK as the two are still associated with the same cpu
- * and cmpxchg later will validate the cpu.
- */
- c = raw_cpu_ptr(s->cpu_slab);
- tid = READ_ONCE(c->tid);
-
- /*
- * Irqless object alloc/free algorithm used here depends on sequence
- * of fetching cpu_slab's data. tid should be fetched before anything
- * on c to guarantee that object and slab associated with previous tid
- * won't be used with current tid. If we fetch tid first, object and
- * slab could be one associated with next tid and our alloc/free
- * request will be failed. In this case, we will retry. So, no problem.
- */
- barrier();
-
- /*
- * The transaction ids are globally unique per cpu and per operation on
- * a per cpu queue. Thus they can be guarantee that the cmpxchg_double
- * occurs on the right processor and that there was no operation on the
- * linked list in between.
- */
-
- object = c->freelist;
- slab = c->slab;
-
#ifdef CONFIG_NUMA
if (static_branch_unlikely(&strict_numa) &&
node == NUMA_NO_NODE) {
@@ -4836,66 +4384,24 @@ redo:
if (mpol) {
/*
- * Special BIND rule support. If existing slab
+ * Special BIND rule support. If the local node
* is in permitted set then do not redirect
* to a particular node.
* Otherwise we apply the memory policy to get
* the node we need to allocate on.
*/
- if (mpol->mode != MPOL_BIND || !slab ||
- !node_isset(slab_nid(slab), mpol->nodes))
-
+ if (mpol->mode != MPOL_BIND ||
+ !node_isset(numa_mem_id(), mpol->nodes))
node = mempolicy_slab_node();
}
}
#endif
- if (!USE_LOCKLESS_FAST_PATH() ||
- unlikely(!object || !slab || !node_match(slab, node))) {
- object = __slab_alloc(s, gfpflags, node, addr, c, orig_size);
- } else {
- void *next_object = get_freepointer_safe(s, object);
-
- /*
- * The cmpxchg will only match if there was no additional
- * operation and if we are on the right processor.
- *
- * The cmpxchg does the following atomically (without lock
- * semantics!)
- * 1. Relocate first pointer to the current per cpu area.
- * 2. Verify that tid and freelist have not been changed
- * 3. If they were not changed replace tid and freelist
- *
- * Since this is without lock semantics the protection is only
- * against code executing on this cpu *not* from access by
- * other cpus.
- */
- if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) {
- note_cmpxchg_failure("slab_alloc", s, tid);
- goto redo;
- }
- prefetch_freepointer(s, next_object);
- stat(s, ALLOC_FASTPATH);
- }
+ object = ___slab_alloc(s, gfpflags, node, addr, orig_size);
return object;
}
-/*
- * If the object has been wiped upon free, make sure it's fully initialized by
- * zeroing out freelist pointer.
- *
- * Note that we also wipe custom freelist pointers.
- */
-static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
- void *obj)
-{
- if (unlikely(slab_want_init_on_free(s)) && obj &&
- !freeptr_outside_object(s))
- memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
- 0, sizeof(void *));
-}
-
static __fastpath_inline
struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
{
@@ -4982,6 +4488,12 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs,
lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
+ /* Bootstrap or debug cache, back off */
+ if (unlikely(!cache_has_sheaves(s))) {
+ local_unlock(&s->cpu_sheaves->lock);
+ return NULL;
+ }
+
if (pcs->spare && pcs->spare->size > 0) {
swap(pcs->main, pcs->spare);
return pcs;
@@ -4993,7 +4505,8 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs,
return NULL;
}
- full = barn_replace_empty_sheaf(barn, pcs->main);
+ full = barn_replace_empty_sheaf(barn, pcs->main,
+ gfpflags_allow_spinning(gfp));
if (full) {
stat(s, BARN_GET);
@@ -5010,7 +4523,7 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs,
empty = pcs->spare;
pcs->spare = NULL;
} else {
- empty = barn_get_empty_sheaf(barn);
+ empty = barn_get_empty_sheaf(barn, true);
}
}
@@ -5052,7 +4565,10 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs,
*/
if (pcs->main->size == 0) {
- barn_put_empty_sheaf(barn, pcs->main);
+ if (!pcs->spare)
+ pcs->spare = pcs->main;
+ else
+ barn_put_empty_sheaf(barn, pcs->main);
pcs->main = full;
return pcs;
}
@@ -5109,8 +4625,10 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node)
* We assume the percpu sheaves contain only local objects although it's
* not completely guaranteed, so we verify later.
*/
- if (unlikely(node_requested && node != numa_mem_id()))
+ if (unlikely(node_requested && node != numa_mem_id())) {
+ stat(s, ALLOC_NODE_MISMATCH);
return NULL;
+ }
if (!local_trylock(&s->cpu_sheaves->lock))
return NULL;
@@ -5133,6 +4651,7 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node)
*/
if (page_to_nid(virt_to_page(object)) != node) {
local_unlock(&s->cpu_sheaves->lock);
+ stat(s, ALLOC_NODE_MISMATCH);
return NULL;
}
}
@@ -5141,13 +4660,14 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node)
local_unlock(&s->cpu_sheaves->lock);
- stat(s, ALLOC_PCS);
+ stat(s, ALLOC_FASTPATH);
return object;
}
static __fastpath_inline
-unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p)
+unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, gfp_t gfp, size_t size,
+ void **p)
{
struct slub_percpu_sheaves *pcs;
struct slab_sheaf *main;
@@ -5165,6 +4685,11 @@ next_batch:
struct slab_sheaf *full;
struct node_barn *barn;
+ if (unlikely(!cache_has_sheaves(s))) {
+ local_unlock(&s->cpu_sheaves->lock);
+ return allocated;
+ }
+
if (pcs->spare && pcs->spare->size > 0) {
swap(pcs->main, pcs->spare);
goto do_alloc;
@@ -5176,7 +4701,8 @@ next_batch:
return allocated;
}
- full = barn_replace_empty_sheaf(barn, pcs->main);
+ full = barn_replace_empty_sheaf(barn, pcs->main,
+ gfpflags_allow_spinning(gfp));
if (full) {
stat(s, BARN_GET);
@@ -5206,7 +4732,7 @@ do_alloc:
local_unlock(&s->cpu_sheaves->lock);
- stat_add(s, ALLOC_PCS, batch);
+ stat_add(s, ALLOC_FASTPATH, batch);
allocated += batch;
@@ -5244,8 +4770,7 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list
if (unlikely(object))
goto out;
- if (s->cpu_sheaves)
- object = alloc_from_pcs(s, gfpflags, node);
+ object = alloc_from_pcs(s, gfpflags, node);
if (!object)
object = __slab_alloc_node(s, gfpflags, node, addr, orig_size);
@@ -5340,6 +4865,9 @@ static int __prefill_sheaf_pfmemalloc(struct kmem_cache *s,
return ret;
}
+static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
+ size_t size, void **p);
+
/*
* returns a sheaf that has at least the requested size
* when prefilling is needed, do so with given gfp flags
@@ -5353,18 +4881,10 @@ kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size)
struct slab_sheaf *sheaf = NULL;
struct node_barn *barn;
- if (unlikely(size > s->sheaf_capacity)) {
+ if (unlikely(!size))
+ return NULL;
- /*
- * slab_debug disables cpu sheaves intentionally so all
- * prefilled sheaves become "oversize" and we give up on
- * performance for the debugging. Same with SLUB_TINY.
- * Creating a cache without sheaves and then requesting a
- * prefilled sheaf is however not expected, so warn.
- */
- WARN_ON_ONCE(s->sheaf_capacity == 0 &&
- !IS_ENABLED(CONFIG_SLUB_TINY) &&
- !(s->flags & SLAB_DEBUG_FLAGS));
+ if (unlikely(size > s->sheaf_capacity)) {
sheaf = kzalloc(struct_size(sheaf, objects, size), gfp);
if (!sheaf)
@@ -5686,7 +5206,7 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags;
struct kmem_cache *s;
bool can_retry = true;
- void *ret = ERR_PTR(-EBUSY);
+ void *ret;
VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO |
__GFP_NO_OBJ_EXT));
@@ -5694,13 +5214,13 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
if (unlikely(!size))
return ZERO_SIZE_PTR;
- if (IS_ENABLED(CONFIG_PREEMPT_RT) && !preemptible())
- /*
- * kmalloc_nolock() in PREEMPT_RT is not supported from
- * non-preemptible context because local_lock becomes a
- * sleeping lock on RT.
- */
+ /*
+ * See the comment for the same check in
+ * alloc_frozen_pages_nolock_noprof()
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq()))
return NULL;
+
retry:
if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
return NULL;
@@ -5709,50 +5229,47 @@ retry:
if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s))
/*
* kmalloc_nolock() is not supported on architectures that
- * don't implement cmpxchg16b, but debug caches don't use
- * per-cpu slab and per-cpu partial slabs. They rely on
- * kmem_cache_node->list_lock, so kmalloc_nolock() can
- * attempt to allocate from debug caches by
+ * don't implement cmpxchg16b and thus need slab_lock()
+	 * which could be preempted by an NMI.
+ * But debug caches don't use that and only rely on
+ * kmem_cache_node->list_lock, so kmalloc_nolock() can attempt
+ * to allocate from debug caches by
* spin_trylock_irqsave(&n->list_lock, ...)
*/
return NULL;
+ ret = alloc_from_pcs(s, alloc_gfp, node);
+ if (ret)
+ goto success;
+
/*
* Do not call slab_alloc_node(), since trylock mode isn't
* compatible with slab_pre_alloc_hook/should_failslab and
* kfence_alloc. Hence call __slab_alloc_node() (at most twice)
* and slab_post_alloc_hook() directly.
- *
- * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair
- * in irq saved region. It assumes that the same cpu will not
- * __update_cpu_freelist_fast() into the same (freelist,tid) pair.
- * Therefore use in_nmi() to check whether particular bucket is in
- * irq protected section.
- *
- * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that
- * this cpu was interrupted somewhere inside ___slab_alloc() after
- * it did local_lock_irqsave(&s->cpu_slab->lock, flags).
- * In this case fast path with __update_cpu_freelist_fast() is not safe.
*/
- if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock))
- ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size);
+ ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size);
- if (PTR_ERR(ret) == -EBUSY) {
- if (can_retry) {
- /* pick the next kmalloc bucket */
- size = s->object_size + 1;
- /*
- * Another alternative is to
- * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT;
- * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT;
- * to retry from bucket of the same size.
- */
- can_retry = false;
- goto retry;
- }
- ret = NULL;
+ /*
+	 * It's possible we failed because of failed trylocks: we preempted someone
+	 * with the sheaves locked, and the list_lock is also held by another cpu.
+ * But it should be rare that multiple kmalloc buckets would have
+ * sheaves locked, so try a larger one.
+ */
+ if (!ret && can_retry) {
+ /* pick the next kmalloc bucket */
+ size = s->object_size + 1;
+ /*
+ * Another alternative is to
+ * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT;
+ * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT;
+ * to retry from bucket of the same size.
+ */
+ can_retry = false;
+ goto retry;
}
+success:
maybe_wipe_obj_freeptr(s, ret);
slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret,
slab_want_init_on_alloc(alloc_gfp, s), size);
@@ -5834,7 +5351,7 @@ static noinline void free_to_partial_list(
/* was on full list */
remove_full(s, n, slab);
if (!slab_free) {
- add_partial(n, slab, DEACTIVATE_TO_TAIL);
+ add_partial(n, slab, ADD_TO_TAIL);
stat(s, FREE_ADD_PARTIAL);
}
} else if (slab_free) {
@@ -5872,26 +5389,17 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
unsigned long addr)
{
- bool was_frozen, was_full;
+ bool was_full;
struct freelist_counters old, new;
struct kmem_cache_node *n = NULL;
unsigned long flags;
bool on_node_partial;
- stat(s, FREE_SLOWPATH);
-
if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
free_to_partial_list(s, slab, head, tail, cnt, addr);
return;
}
- /*
- * It is enough to test IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) below
- * instead of kmem_cache_has_cpu_partial(s), because kmem_cache_debug(s)
- * is the only other reason it can be false, and it is already handled
- * above.
- */
-
do {
if (unlikely(n)) {
spin_unlock_irqrestore(&n->list_lock, flags);
@@ -5902,7 +5410,6 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
old.counters = slab->counters;
was_full = (old.freelist == NULL);
- was_frozen = old.frozen;
set_freepointer(s, tail, old.freelist);
@@ -5915,53 +5422,29 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
* to (due to not being full anymore) the partial list.
* Unless it's frozen.
*/
- if ((!new.inuse || was_full) && !was_frozen) {
+ if (!new.inuse || was_full) {
+
+ n = get_node(s, slab_nid(slab));
/*
- * If slab becomes non-full and we have cpu partial
- * lists, we put it there unconditionally to avoid
- * taking the list_lock. Otherwise we need it.
+ * Speculatively acquire the list_lock.
+ * If the cmpxchg does not succeed then we may
+ * drop the list_lock without any processing.
+ *
+ * Otherwise the list_lock will synchronize with
+ * other processors updating the list of slabs.
*/
- if (!(IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full)) {
-
- n = get_node(s, slab_nid(slab));
- /*
- * Speculatively acquire the list_lock.
- * If the cmpxchg does not succeed then we may
- * drop the list_lock without any processing.
- *
- * Otherwise the list_lock will synchronize with
- * other processors updating the list of slabs.
- */
- spin_lock_irqsave(&n->list_lock, flags);
+ spin_lock_irqsave(&n->list_lock, flags);
- on_node_partial = slab_test_node_partial(slab);
- }
+ on_node_partial = slab_test_node_partial(slab);
}
} while (!slab_update_freelist(s, slab, &old, &new, "__slab_free"));
if (likely(!n)) {
-
- if (likely(was_frozen)) {
- /*
- * The list lock was not taken therefore no list
- * activity can be necessary.
- */
- stat(s, FREE_FROZEN);
- } else if (IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full) {
- /*
- * If we started with a full slab then put it onto the
- * per cpu partial list.
- */
- put_cpu_partial(s, slab, 1);
- stat(s, CPU_PARTIAL_FREE);
- }
-
/*
- * In other cases we didn't take the list_lock because the slab
- * was already on the partial list and will remain there.
+ * We didn't take the list_lock because the slab was already on
+ * the partial list and will remain there.
*/
-
return;
}
@@ -5983,11 +5466,10 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
/*
* Objects left in the slab. If it was not on the partial list before
- * then add it. This can only happen when cache has no per cpu partial
- * list otherwise we would have put it there.
+ * then add it.
*/
- if (!IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && unlikely(was_full)) {
- add_partial(n, slab, DEACTIVATE_TO_TAIL);
+ if (unlikely(was_full)) {
+ add_partial(n, slab, ADD_TO_TAIL);
stat(s, FREE_ADD_PARTIAL);
}
spin_unlock_irqrestore(&n->list_lock, flags);
@@ -6073,7 +5555,8 @@ static void __pcs_install_empty_sheaf(struct kmem_cache *s,
* unlocked.
*/
static struct slub_percpu_sheaves *
-__pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs)
+__pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs,
+ bool allow_spin)
{
struct slab_sheaf *empty;
struct node_barn *barn;
@@ -6082,6 +5565,12 @@ __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs)
restart:
lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
+ /* Bootstrap or debug cache, back off */
+ if (unlikely(!cache_has_sheaves(s))) {
+ local_unlock(&s->cpu_sheaves->lock);
+ return NULL;
+ }
+
barn = get_barn(s);
if (!barn) {
local_unlock(&s->cpu_sheaves->lock);
@@ -6091,7 +5580,7 @@ restart:
put_fail = false;
if (!pcs->spare) {
- empty = barn_get_empty_sheaf(barn);
+ empty = barn_get_empty_sheaf(barn, allow_spin);
if (empty) {
pcs->spare = pcs->main;
pcs->main = empty;
@@ -6105,7 +5594,7 @@ restart:
return pcs;
}
- empty = barn_replace_full_sheaf(barn, pcs->main);
+ empty = barn_replace_full_sheaf(barn, pcs->main, allow_spin);
if (!IS_ERR(empty)) {
stat(s, BARN_PUT);
@@ -6113,7 +5602,8 @@ restart:
return pcs;
}
- if (PTR_ERR(empty) == -E2BIG) {
+ /* sheaf_flush_unused() doesn't support !allow_spin */
+ if (PTR_ERR(empty) == -E2BIG && allow_spin) {
/* Since we got here, spare exists and is full */
struct slab_sheaf *to_flush = pcs->spare;
@@ -6138,6 +5628,14 @@ restart:
alloc_empty:
local_unlock(&s->cpu_sheaves->lock);
+ /*
+ * alloc_empty_sheaf() doesn't support !allow_spin and it's
+ * easier to fall back to freeing directly without sheaves
+	 * than to add the support (and to sheaf_flush_unused() above)
+ */
+ if (!allow_spin)
+ return NULL;
+
empty = alloc_empty_sheaf(s, GFP_NOWAIT);
if (empty)
goto got_empty;
@@ -6180,7 +5678,7 @@ got_empty:
* The object is expected to have passed slab_free_hook() already.
*/
static __fastpath_inline
-bool free_to_pcs(struct kmem_cache *s, void *object)
+bool free_to_pcs(struct kmem_cache *s, void *object, bool allow_spin)
{
struct slub_percpu_sheaves *pcs;
@@ -6191,7 +5689,7 @@ bool free_to_pcs(struct kmem_cache *s, void *object)
if (unlikely(pcs->main->size == s->sheaf_capacity)) {
- pcs = __pcs_replace_full_main(s, pcs);
+ pcs = __pcs_replace_full_main(s, pcs, allow_spin);
if (unlikely(!pcs))
return false;
}
@@ -6200,7 +5698,7 @@ bool free_to_pcs(struct kmem_cache *s, void *object)
local_unlock(&s->cpu_sheaves->lock);
- stat(s, FREE_PCS);
+ stat(s, FREE_FASTPATH);
return true;
}
@@ -6265,11 +5763,29 @@ empty:
free_empty_sheaf(s, sheaf);
}
+/*
+ * kvfree_call_rcu() can be called while holding a raw_spinlock_t. Since
+ * __kfree_rcu_sheaf() may acquire a spinlock_t (sleeping lock on PREEMPT_RT),
+ * this would violate lock nesting rules. Therefore, kvfree_call_rcu() avoids
+ * this problem by bypassing the sheaves layer entirely on PREEMPT_RT.
+ *
+ * However, lockdep still complains that it is invalid to acquire spinlock_t
+ * while holding raw_spinlock_t, even on !PREEMPT_RT where spinlock_t is a
+ * spinning lock. Tell lockdep that acquiring spinlock_t is valid here
+ * by temporarily raising the wait-type to LD_WAIT_CONFIG.
+ */
+static DEFINE_WAIT_OVERRIDE_MAP(kfree_rcu_sheaf_map, LD_WAIT_CONFIG);
+
bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
{
struct slub_percpu_sheaves *pcs;
struct slab_sheaf *rcu_sheaf;
+ if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
+ return false;
+
+ lock_map_acquire_try(&kfree_rcu_sheaf_map);
+
if (!local_trylock(&s->cpu_sheaves->lock))
goto fail;
@@ -6280,6 +5796,12 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
struct slab_sheaf *empty;
struct node_barn *barn;
+ /* Bootstrap or debug cache, fall back */
+ if (unlikely(!cache_has_sheaves(s))) {
+ local_unlock(&s->cpu_sheaves->lock);
+ goto fail;
+ }
+
if (pcs->spare && pcs->spare->size == 0) {
pcs->rcu_free = pcs->spare;
pcs->spare = NULL;
@@ -6292,7 +5814,7 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj)
goto fail;
}
- empty = barn_get_empty_sheaf(barn);
+ empty = barn_get_empty_sheaf(barn, true);
if (empty) {
pcs->rcu_free = empty;
@@ -6346,10 +5868,12 @@ do_free:
local_unlock(&s->cpu_sheaves->lock);
stat(s, FREE_RCU_SHEAF);
+ lock_map_release(&kfree_rcu_sheaf_map);
return true;
fail:
stat(s, FREE_RCU_SHEAF_FAIL);
+ lock_map_release(&kfree_rcu_sheaf_map);
return false;
}
@@ -6410,7 +5934,7 @@ next_batch:
goto no_empty;
if (!pcs->spare) {
- empty = barn_get_empty_sheaf(barn);
+ empty = barn_get_empty_sheaf(barn, true);
if (!empty)
goto no_empty;
@@ -6424,7 +5948,7 @@ next_batch:
goto do_free;
}
- empty = barn_replace_full_sheaf(barn, pcs->main);
+ empty = barn_replace_full_sheaf(barn, pcs->main, true);
if (IS_ERR(empty)) {
stat(s, BARN_PUT_FAIL);
goto no_empty;
@@ -6442,7 +5966,7 @@ do_free:
local_unlock(&s->cpu_sheaves->lock);
- stat_add(s, FREE_PCS, batch);
+ stat_add(s, FREE_FASTPATH, batch);
if (batch < size) {
p += batch;
@@ -6464,10 +5988,12 @@ no_empty:
*/
fallback:
__kmem_cache_free_bulk(s, size, p);
+ stat_add(s, FREE_SLOWPATH, size);
flush_remote:
if (remote_nr) {
__kmem_cache_free_bulk(s, remote_nr, &remote_objects[0]);
+ stat_add(s, FREE_SLOWPATH, remote_nr);
if (i < size) {
remote_nr = 0;
goto next_remote_batch;
@@ -6477,7 +6003,6 @@ flush_remote:
struct defer_free {
struct llist_head objects;
- struct llist_head slabs;
struct irq_work work;
};
@@ -6485,23 +6010,21 @@ static void free_deferred_objects(struct irq_work *work);
static DEFINE_PER_CPU(struct defer_free, defer_free_objects) = {
.objects = LLIST_HEAD_INIT(objects),
- .slabs = LLIST_HEAD_INIT(slabs),
.work = IRQ_WORK_INIT(free_deferred_objects),
};
/*
* In PREEMPT_RT irq_work runs in per-cpu kthread, so it's safe
- * to take sleeping spin_locks from __slab_free() and deactivate_slab().
+ * to take sleeping spin_locks from __slab_free().
* In !PREEMPT_RT irq_work will run after local_unlock_irqrestore().
*/
static void free_deferred_objects(struct irq_work *work)
{
struct defer_free *df = container_of(work, struct defer_free, work);
struct llist_head *objs = &df->objects;
- struct llist_head *slabs = &df->slabs;
struct llist_node *llnode, *pos, *t;
- if (llist_empty(objs) && llist_empty(slabs))
+ if (llist_empty(objs))
return;
llnode = llist_del_all(objs);
@@ -6524,16 +6047,7 @@ static void free_deferred_objects(struct irq_work *work)
set_freepointer(s, x, NULL);
__slab_free(s, slab, x, x, 1, _THIS_IP_);
- }
-
- llnode = llist_del_all(slabs);
- llist_for_each_safe(pos, t, llnode) {
- struct slab *slab = container_of(pos, struct slab, llnode);
-
- if (slab->frozen)
- deactivate_slab(slab->slab_cache, slab, slab->flush_freelist);
- else
- free_slab(slab->slab_cache, slab);
+ stat(s, FREE_SLOWPATH);
}
}
@@ -6550,19 +6064,6 @@ static void defer_free(struct kmem_cache *s, void *head)
irq_work_queue(&df->work);
}
-static void defer_deactivate_slab(struct slab *slab, void *flush_freelist)
-{
- struct defer_free *df;
-
- slab->flush_freelist = flush_freelist;
-
- guard(preempt)();
-
- df = this_cpu_ptr(&defer_free_objects);
- if (llist_add(&slab->llnode, &df->slabs))
- irq_work_queue(&df->work);
-}
-
void defer_free_barrier(void)
{
int cpu;
@@ -6571,99 +6072,6 @@ void defer_free_barrier(void)
irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work);
}
-/*
- * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
- * can perform fastpath freeing without additional function calls.
- *
- * The fastpath is only possible if we are freeing to the current cpu slab
- * of this processor. This typically the case if we have just allocated
- * the item before.
- *
- * If fastpath is not possible then fall back to __slab_free where we deal
- * with all sorts of special processing.
- *
- * Bulk free of a freelist with several objects (all pointing to the
- * same slab) possible by specifying head and tail ptr, plus objects
- * count (cnt). Bulk free indicated by tail pointer being set.
- */
-static __always_inline void do_slab_free(struct kmem_cache *s,
- struct slab *slab, void *head, void *tail,
- int cnt, unsigned long addr)
-{
- /* cnt == 0 signals that it's called from kfree_nolock() */
- bool allow_spin = cnt;
- struct kmem_cache_cpu *c;
- unsigned long tid;
- void **freelist;
-
-redo:
- /*
- * Determine the currently cpus per cpu slab.
- * The cpu may change afterward. However that does not matter since
- * data is retrieved via this pointer. If we are on the same cpu
- * during the cmpxchg then the free will succeed.
- */
- c = raw_cpu_ptr(s->cpu_slab);
- tid = READ_ONCE(c->tid);
-
- /* Same with comment on barrier() in __slab_alloc_node() */
- barrier();
-
- if (unlikely(slab != c->slab)) {
- if (unlikely(!allow_spin)) {
- /*
- * __slab_free() can locklessly cmpxchg16 into a slab,
- * but then it might need to take spin_lock or local_lock
- * in put_cpu_partial() for further processing.
- * Avoid the complexity and simply add to a deferred list.
- */
- defer_free(s, head);
- } else {
- __slab_free(s, slab, head, tail, cnt, addr);
- }
- return;
- }
-
- if (unlikely(!allow_spin)) {
- if ((in_nmi() || !USE_LOCKLESS_FAST_PATH()) &&
- local_lock_is_locked(&s->cpu_slab->lock)) {
- defer_free(s, head);
- return;
- }
- cnt = 1; /* restore cnt. kfree_nolock() frees one object at a time */
- }
-
- if (USE_LOCKLESS_FAST_PATH()) {
- freelist = READ_ONCE(c->freelist);
-
- set_freepointer(s, tail, freelist);
-
- if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) {
- note_cmpxchg_failure("slab_free", s, tid);
- goto redo;
- }
- } else {
- __maybe_unused unsigned long flags = 0;
-
- /* Update the free list under the local lock */
- local_lock_cpu_slab(s, flags);
- c = this_cpu_ptr(s->cpu_slab);
- if (unlikely(slab != c->slab)) {
- local_unlock_cpu_slab(s, flags);
- goto redo;
- }
- tid = c->tid;
- freelist = c->freelist;
-
- set_freepointer(s, tail, freelist);
- c->freelist = head;
- c->tid = next_tid(tid);
-
- local_unlock_cpu_slab(s, flags);
- }
- stat_add(s, FREE_FASTPATH, cnt);
-}
-
static __fastpath_inline
void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
unsigned long addr)
@@ -6674,14 +6082,14 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false)))
return;
- if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) ||
- slab_nid(slab) == numa_mem_id())
- && likely(!slab_test_pfmemalloc(slab))) {
- if (likely(free_to_pcs(s, object)))
+ if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id())
+ && likely(!slab_test_pfmemalloc(slab))) {
+ if (likely(free_to_pcs(s, object, true)))
return;
}
- do_slab_free(s, slab, object, object, 1, addr);
+ __slab_free(s, slab, object, object, 1, addr);
+ stat(s, FREE_SLOWPATH);
}
#ifdef CONFIG_MEMCG
@@ -6694,7 +6102,7 @@ void memcg_alloc_abort_single(struct kmem_cache *s, void *object)
alloc_tagging_slab_free_hook(s, slab, &object, 1);
if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false)))
- do_slab_free(s, slab, object, object, 1, _RET_IP_);
+ __slab_free(s, slab, object, object, 1, _RET_IP_);
}
#endif
@@ -6708,8 +6116,10 @@ void slab_free_bulk(struct kmem_cache *s, struct slab *slab, void *head,
* With KASAN enabled slab_free_freelist_hook modifies the freelist
* to remove objects, whose reuse must be delayed.
*/
- if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt)))
- do_slab_free(s, slab, head, tail, cnt, addr);
+ if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt))) {
+ __slab_free(s, slab, head, tail, cnt, addr);
+ stat_add(s, FREE_SLOWPATH, cnt);
+ }
}
#ifdef CONFIG_SLUB_RCU_DEBUG
@@ -6734,42 +6144,41 @@ static void slab_free_after_rcu_debug(struct rcu_head *rcu_head)
return;
/* resume freeing */
- if (slab_free_hook(s, object, slab_want_init_on_free(s), true))
- do_slab_free(s, slab, object, object, 1, _THIS_IP_);
+ if (slab_free_hook(s, object, slab_want_init_on_free(s), true)) {
+ __slab_free(s, slab, object, object, 1, _THIS_IP_);
+ stat(s, FREE_SLOWPATH);
+ }
}
#endif /* CONFIG_SLUB_RCU_DEBUG */
#ifdef CONFIG_KASAN_GENERIC
void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
{
- do_slab_free(cache, virt_to_slab(x), x, x, 1, addr);
+ __slab_free(cache, virt_to_slab(x), x, x, 1, addr);
+ stat(cache, FREE_SLOWPATH);
}
#endif
-static inline struct kmem_cache *virt_to_cache(const void *obj)
+static noinline void warn_free_bad_obj(struct kmem_cache *s, void *obj)
{
+ struct kmem_cache *cachep;
struct slab *slab;
slab = virt_to_slab(obj);
- if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n", __func__))
- return NULL;
- return slab->slab_cache;
-}
-
-static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
-{
- struct kmem_cache *cachep;
+ if (WARN_ONCE(!slab,
+ "kmem_cache_free(%s, %p): object is not in a slab page\n",
+ s->name, obj))
+ return;
- if (!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) &&
- !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS))
- return s;
+ cachep = slab->slab_cache;
- cachep = virt_to_cache(x);
- if (WARN(cachep && cachep != s,
- "%s: Wrong slab cache. %s but object is from %s\n",
- __func__, s->name, cachep->name))
- print_tracking(cachep, x);
- return cachep;
+ if (WARN_ONCE(cachep != s,
+ "kmem_cache_free(%s, %p): object belongs to different cache %s\n",
+ s->name, obj, cachep ? cachep->name : "(NULL)")) {
+ if (cachep)
+ print_tracking(cachep, obj);
+ return;
+ }
}
/**
@@ -6782,14 +6191,118 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
*/
void kmem_cache_free(struct kmem_cache *s, void *x)
{
- s = cache_from_obj(s, x);
- if (!s)
- return;
+ struct slab *slab;
+
+ slab = virt_to_slab(x);
+
+ if (IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) ||
+ kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
+
+ /*
+ * Intentionally leak the object in these cases, because it
+ * would be too dangerous to continue.
+ */
+ if (unlikely(!slab || (slab->slab_cache != s))) {
+ warn_free_bad_obj(s, x);
+ return;
+ }
+ }
+
trace_kmem_cache_free(_RET_IP_, x, s);
- slab_free(s, virt_to_slab(x), x, _RET_IP_);
+ slab_free(s, slab, x, _RET_IP_);
}
EXPORT_SYMBOL(kmem_cache_free);
+static inline size_t slab_ksize(struct slab *slab)
+{
+ struct kmem_cache *s = slab->slab_cache;
+
+#ifdef CONFIG_SLUB_DEBUG
+ /*
+ * Debugging requires use of the padding between object
+ * and whatever may come after it.
+ */
+ if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
+ return s->object_size;
+#endif
+ if (s->flags & SLAB_KASAN)
+ return s->object_size;
+ /*
+ * If we have the need to store the freelist pointer
+ * or any other metadata back there then we can
+ * only use the space before that information.
+ */
+ if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER))
+ return s->inuse;
+ else if (obj_exts_in_object(s, slab))
+ return s->inuse;
+ /*
+ * Else we can use all the padding etc for the allocation
+ */
+ return s->size;
+}
+
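+/* Internal helper for ksize(); handles both large kmallocs and slab objects. */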
+static size_t __ksize(const void *object)
+{
+ struct page *page;
+ struct slab *slab;
+
+ if (unlikely(object == ZERO_SIZE_PTR))
+ return 0;
+
+ page = virt_to_page(object);
+
+ if (unlikely(PageLargeKmalloc(page)))
+ return large_kmalloc_size(page);
+
+ slab = page_slab(page);
+ /* Delete this after we're sure there are no users */
+ if (WARN_ON(!slab))
+ return page_size(page);
+
+#ifdef CONFIG_SLUB_DEBUG
+ skip_orig_size_check(slab->slab_cache, object);
+#endif
+
+ return slab_ksize(slab);
+}
+
+/**
+ * ksize -- Report full size of underlying allocation
+ * @objp: pointer to the object
+ *
+ * This should only be used internally to query the true size of allocations.
+ * It is not meant to be a way to discover the usable size of an allocation
+ * after the fact. Instead, use kmalloc_size_roundup(). Using memory beyond
+ * the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS,
+ * and/or FORTIFY_SOURCE.
+ *
+ * Return: size of the actual memory used by @objp in bytes
+ */
+size_t ksize(const void *objp)
+{
+ /*
+ * We need to first check that the pointer to the object is valid.
+	 * The KASAN report printed from ksize() is more useful than one
+	 * printed later, when the behaviour could be undefined due to
+	 * a potential use-after-free or double-free.
+ *
+ * We use kasan_check_byte(), which is supported for the hardware
+ * tag-based KASAN mode, unlike kasan_check_read/write().
+ *
+ * If the pointed to memory is invalid, we return 0 to avoid users of
+ * ksize() writing to and potentially corrupting the memory region.
+ *
+ * We want to perform the check before __ksize(), to avoid potentially
+ * crashing in __ksize() due to accessing invalid metadata.
+ */
+ if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp))
+ return 0;
+
+ return kfence_ksize(objp) ?: __ksize(objp);
+}
+EXPORT_SYMBOL(ksize);
+
static void free_large_kmalloc(struct page *page, void *object)
{
unsigned int order = compound_order(page);
@@ -6942,7 +6455,18 @@ void kfree_nolock(const void *object)
* since kasan quarantine takes locks and not supported from NMI.
*/
kasan_slab_free(s, x, false, false, /* skip quarantine */true);
- do_slab_free(s, slab, x, x, 0, _RET_IP_);
+
+ if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id())) {
+ if (likely(free_to_pcs(s, x, false)))
+ return;
+ }
+
+ /*
+ * __slab_free() can locklessly cmpxchg16 into a slab, but then it might
+ * need to take spin_lock for further processing.
+ * Avoid the complexity and simply add to a deferred list.
+ */
+ defer_free(s, x);
}
EXPORT_SYMBOL_GPL(kfree_nolock);
@@ -7313,7 +6837,7 @@ int build_detached_freelist(struct kmem_cache *s, size_t size,
df->s = slab->slab_cache;
} else {
df->slab = slab;
- df->s = cache_from_obj(s, object); /* Support for memcg */
+ df->s = s;
}
/* Start new detached freelist */
@@ -7368,7 +6892,7 @@ static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
if (kfence_free(df.freelist))
continue;
- do_slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt,
+ __slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt,
_RET_IP_);
} while (likely(size));
}
@@ -7383,7 +6907,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
* freeing to sheaves is so incompatible with the detached freelist so
* once we go that way, we have to do everything differently
*/
- if (s && s->cpu_sheaves) {
+ if (s && cache_has_sheaves(s)) {
free_to_pcs_bulk(s, size, p);
return;
}
@@ -7401,72 +6925,224 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
}
EXPORT_SYMBOL(kmem_cache_free_bulk);
-static inline
-int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
- void **p)
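+/*
+ * Refill the array @p with at least @min and at most @max objects taken from
+ * the partial slabs of node @n. Objects and slabs we grabbed but could not
+ * use are given back to the node. Returns the number of objects placed in
+ * @p, which may be less than @min.
+ */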
+static unsigned int
+__refill_objects_node(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
+ unsigned int max, struct kmem_cache_node *n,
+ bool allow_spin)
{
- struct kmem_cache_cpu *c;
- unsigned long irqflags;
- int i;
+ struct partial_bulk_context pc;
+ struct slab *slab, *slab2;
+ unsigned int refilled = 0;
+ unsigned long flags;
+ void *object;
+
+ pc.flags = gfp;
+ pc.min_objects = min;
+ pc.max_objects = max;
+
+ if (!get_partial_node_bulk(s, n, &pc, allow_spin))
+ return 0;
+
+ list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
+
+ list_del(&slab->slab_list);
+
+ object = get_freelist_nofreeze(s, slab);
+
+ while (object && refilled < max) {
+ p[refilled] = object;
+ object = get_freepointer(s, object);
+ maybe_wipe_obj_freeptr(s, p[refilled]);
+
+ refilled++;
+ }
+
+ /*
+		 * Freelist had more objects than we can accommodate, so we
+		 * need to free them back. We can treat it like a detached
+		 * freelist; we just need to find the tail object.
+ */
+ if (unlikely(object)) {
+ void *head = object;
+ void *tail;
+ int cnt = 0;
+
+ do {
+ tail = object;
+ cnt++;
+ object = get_freepointer(s, object);
+ } while (object);
+ __slab_free(s, slab, head, tail, cnt, _RET_IP_);
+ }
+
+ if (refilled >= max)
+ break;
+ }
+
+ if (unlikely(!list_empty(&pc.slabs))) {
+ spin_lock_irqsave(&n->list_lock, flags);
+
+ list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
+
+ if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial))
+ continue;
+
+ list_del(&slab->slab_list);
+ add_partial(n, slab, ADD_TO_HEAD);
+ }
+
+ spin_unlock_irqrestore(&n->list_lock, flags);
+
+ /* any slabs left are completely free and for discard */
+ list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
+
+ list_del(&slab->slab_list);
+ discard_slab(s, slab);
+ }
+ }
+
+ return refilled;
+}
+
+#ifdef CONFIG_NUMA
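+/*
+ * Like __refill_objects_node(), but try the partial lists of remote nodes,
+ * subject to the remote defrag ratio heuristic and cpuset constraints.
+ */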
+static unsigned int
+__refill_objects_any(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
+ unsigned int max)
+{
+ struct zonelist *zonelist;
+ struct zoneref *z;
+ struct zone *zone;
+ enum zone_type highest_zoneidx = gfp_zone(gfp);
+ unsigned int cpuset_mems_cookie;
+ unsigned int refilled = 0;
+
+ /* see get_from_any_partial() for the defrag ratio description */
+ if (!s->remote_node_defrag_ratio ||
+ get_cycles() % 1024 > s->remote_node_defrag_ratio)
+ return 0;
+
+ do {
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ zonelist = node_zonelist(mempolicy_slab_node(), gfp);
+ for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
+ struct kmem_cache_node *n;
+ unsigned int r;
+
+ n = get_node(s, zone_to_nid(zone));
+
+ if (!n || !cpuset_zone_allowed(zone, gfp) ||
+ n->nr_partial <= s->min_partial)
+ continue;
+
+ r = __refill_objects_node(s, p, gfp, min, max, n,
+ /* allow_spin = */ false);
+ refilled += r;
+
+ if (r >= min) {
+ /*
+ * Don't check read_mems_allowed_retry() here -
+ * if mems_allowed was updated in parallel, that
+ * was a harmless race between allocation and
+ * the cpuset update
+ */
+ return refilled;
+ }
+ p += r;
+ min -= r;
+ max -= r;
+ }
+ } while (read_mems_allowed_retry(cpuset_mems_cookie));
+
+ return refilled;
+}
+#else
+static inline unsigned int
+__refill_objects_any(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
+ unsigned int max)
+{
+ return 0;
+}
+#endif
+
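+/*
+ * Refill at least @min and at most @max objects into @p: first from the local
+ * node's partial slabs, then from remote nodes, and finally by allocating new
+ * slabs. Callers must pass gfp flags that allow spinning.
+ */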
+static unsigned int
+refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
+ unsigned int max)
+{
+ int local_node = numa_mem_id();
+ unsigned int refilled;
+ struct slab *slab;
+
+ if (WARN_ON_ONCE(!gfpflags_allow_spinning(gfp)))
+ return 0;
+
+ refilled = __refill_objects_node(s, p, gfp, min, max,
+ get_node(s, local_node),
+ /* allow_spin = */ true);
+ if (refilled >= min)
+ return refilled;
+
+ refilled += __refill_objects_any(s, p + refilled, gfp, min - refilled,
+ max - refilled);
+ if (refilled >= min)
+ return refilled;
+
+new_slab:
+
+ slab = new_slab(s, gfp, local_node);
+ if (!slab)
+ goto out;
+
+ stat(s, ALLOC_SLAB);
/*
- * Drain objects in the per cpu slab, while disabling local
- * IRQs, which protects against PREEMPT and interrupts
- * handlers invoking normal fastpath.
+ * TODO: possible optimization - if we know we will consume the whole
+ * slab we might skip creating the freelist?
*/
- c = slub_get_cpu_ptr(s->cpu_slab);
- local_lock_irqsave(&s->cpu_slab->lock, irqflags);
+ refilled += alloc_from_new_slab(s, slab, p + refilled, max - refilled,
+ /* allow_spin = */ true);
- for (i = 0; i < size; i++) {
- void *object = c->freelist;
+ if (refilled < min)
+ goto new_slab;
- if (unlikely(!object)) {
- /*
- * We may have removed an object from c->freelist using
- * the fastpath in the previous iteration; in that case,
- * c->tid has not been bumped yet.
- * Since ___slab_alloc() may reenable interrupts while
- * allocating memory, we should bump c->tid now.
- */
- c->tid = next_tid(c->tid);
+out:
+ return refilled;
+}
- local_unlock_irqrestore(&s->cpu_slab->lock, irqflags);
+static inline
+int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
+ void **p)
+{
+ int i;
- /*
- * Invoking slow path likely have side-effect
- * of re-populating per CPU c->freelist
- */
- p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
- _RET_IP_, c, s->object_size);
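+	/*
+	 * SLUB_TINY and debug caches take the one-object-at-a-time slow path;
+	 * everything else is bulk-refilled from partial or newly allocated slabs.
+	 */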
+ if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
+ for (i = 0; i < size; i++) {
+
+ p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_,
+ s->object_size);
if (unlikely(!p[i]))
goto error;
- c = this_cpu_ptr(s->cpu_slab);
maybe_wipe_obj_freeptr(s, p[i]);
-
- local_lock_irqsave(&s->cpu_slab->lock, irqflags);
-
- continue; /* goto for-loop */
}
- c->freelist = get_freepointer(s, object);
- p[i] = object;
- maybe_wipe_obj_freeptr(s, p[i]);
- stat(s, ALLOC_FASTPATH);
+ } else {
+ i = refill_objects(s, p, flags, size, size);
+ if (i < size)
+ goto error;
+ stat_add(s, ALLOC_SLOWPATH, i);
}
- c->tid = next_tid(c->tid);
- local_unlock_irqrestore(&s->cpu_slab->lock, irqflags);
- slub_put_cpu_ptr(s->cpu_slab);
return i;
error:
- slub_put_cpu_ptr(s->cpu_slab);
__kmem_cache_free_bulk(s, i, p);
return 0;
}
-/* Note that interrupts must be enabled when calling this function. */
+/*
+ * Note that interrupts must be enabled when calling this function and gfp
+ * flags must allow spinning.
+ */
int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size,
void **p)
{
@@ -7494,8 +7170,7 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size,
size--;
}
- if (s->cpu_sheaves)
- i = alloc_from_pcs_bulk(s, size, p);
+ i = alloc_from_pcs_bulk(s, flags, size, p);
if (i < size) {
/*
@@ -7683,29 +7358,25 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct node_barn *barn)
barn_init(barn);
}
-static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
+#ifdef CONFIG_SLUB_STATS
+static inline int alloc_kmem_cache_stats(struct kmem_cache *s)
{
BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
NR_KMALLOC_TYPES * KMALLOC_SHIFT_HIGH *
- sizeof(struct kmem_cache_cpu));
+ sizeof(struct kmem_cache_stats));
- /*
- * Must align to double word boundary for the double cmpxchg
- * instructions to work; see __pcpu_double_call_return_bool().
- */
- s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
- 2 * sizeof(void *));
+ s->cpu_stats = alloc_percpu(struct kmem_cache_stats);
- if (!s->cpu_slab)
+ if (!s->cpu_stats)
return 0;
- init_kmem_cache_cpus(s);
-
return 1;
}
+#endif
static int init_percpu_sheaves(struct kmem_cache *s)
{
+ static struct slab_sheaf bootstrap_sheaf = {};
int cpu;
for_each_possible_cpu(cpu) {
@@ -7715,7 +7386,28 @@ static int init_percpu_sheaves(struct kmem_cache *s)
local_trylock_init(&pcs->lock);
- pcs->main = alloc_empty_sheaf(s, GFP_KERNEL);
+ /*
+ * Bootstrap sheaf has zero size so fast-path allocation fails.
+		 * Its size also equals s->sheaf_capacity, so fast-path free
+		 * fails too. In the slow paths we recognize the situation by
+		 * checking s->sheaf_capacity. This allows fast paths to assume
+		 * s->cpu_sheaves and pcs->main always exist and are valid.
+		 * It's also safe to share the single static bootstrap_sheaf
+		 * with a zero-sized objects array, as it's never modified.
+		 *
+		 * bootstrap_sheaf also has a NULL pointer to the kmem_cache, so
+		 * we recognize it and do not attempt to free it when destroying
+		 * the cache.
+ *
+ * We keep bootstrap_sheaf for kmem_cache and kmem_cache_node,
+ * caches with debug enabled, and all caches with SLUB_TINY.
+ * For kmalloc caches it's used temporarily during the initial
+ * bootstrap.
+ */
+ if (!s->sheaf_capacity)
+ pcs->main = &bootstrap_sheaf;
+ else
+ pcs->main = alloc_empty_sheaf(s, GFP_KERNEL);
if (!pcs->main)
return -ENOMEM;
@@ -7766,7 +7458,7 @@ static void early_kmem_cache_node_alloc(int node)
* No locks need to be taken here as it has just been
* initialized and there is no concurrent access.
*/
- __add_partial(n, slab, DEACTIVATE_TO_HEAD);
+ __add_partial(n, slab, ADD_TO_HEAD);
}
static void free_kmem_cache_nodes(struct kmem_cache *s)
@@ -7790,13 +7482,10 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
void __kmem_cache_release(struct kmem_cache *s)
{
cache_random_seq_destroy(s);
- if (s->cpu_sheaves)
- pcs_destroy(s);
-#ifdef CONFIG_PREEMPT_RT
- if (s->cpu_slab)
- lockdep_unregister_key(&s->lock_key);
+ pcs_destroy(s);
+#ifdef CONFIG_SLUB_STATS
+ free_percpu(s->cpu_stats);
#endif
- free_percpu(s->cpu_slab);
free_kmem_cache_nodes(s);
}
@@ -7813,7 +7502,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s)
continue;
}
- if (s->cpu_sheaves) {
+ if (cache_has_sheaves(s)) {
barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node);
if (!barn)
@@ -7834,37 +7523,51 @@ static int init_kmem_cache_nodes(struct kmem_cache *s)
return 1;
}
-static void set_cpu_partial(struct kmem_cache *s)
+static unsigned int calculate_sheaf_capacity(struct kmem_cache *s,
+ struct kmem_cache_args *args)
+
{
-#ifdef CONFIG_SLUB_CPU_PARTIAL
- unsigned int nr_objects;
+ unsigned int capacity;
+ size_t size;
+
+ if (IS_ENABLED(CONFIG_SLUB_TINY) || s->flags & SLAB_DEBUG_FLAGS)
+ return 0;
/*
- * cpu_partial determined the maximum number of objects kept in the
- * per cpu partial lists of a processor.
- *
- * Per cpu partial lists mainly contain slabs that just have one
- * object freed. If they are used for allocation then they can be
- * filled up again with minimal effort. The slab will never hit the
- * per node partial lists and therefore no locking will be required.
- *
- * For backwards compatibility reasons, this is determined as number
- * of objects, even though we now limit maximum number of pages, see
- * slub_set_cpu_partial()
+ * Bootstrap caches can't have sheaves for now (SLAB_NO_OBJ_EXT).
+ * SLAB_NOLEAKTRACE caches (e.g., kmemleak's object_cache) must not
+ * have sheaves to avoid recursion when sheaf allocation triggers
+ * kmemleak tracking.
+ */
+ if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
+ return 0;
+
+ /*
+	 * For now we use a roughly similar formula (divided by two as there
+	 * are two percpu sheaves) to what was used for percpu partial slabs, which
+ * should result in similar lock contention (barn or list_lock)
*/
- if (!kmem_cache_has_cpu_partial(s))
- nr_objects = 0;
- else if (s->size >= PAGE_SIZE)
- nr_objects = 6;
+ if (s->size >= PAGE_SIZE)
+ capacity = 4;
else if (s->size >= 1024)
- nr_objects = 24;
+ capacity = 12;
else if (s->size >= 256)
- nr_objects = 52;
+ capacity = 26;
else
- nr_objects = 120;
+ capacity = 60;
- slub_set_cpu_partial(s, nr_objects);
-#endif
+	/* Increase capacity so the sheaf exactly fills a kmalloc size bucket */
+ size = struct_size_t(struct slab_sheaf, objects, capacity);
+ size = kmalloc_size_roundup(size);
+ capacity = (size - struct_size_t(struct slab_sheaf, objects, 0)) / sizeof(void *);
+
+ /*
+	 * Respect an explicit capacity request, typically motivated by the
+	 * expected maximum size for kmem_cache_prefill_sheaf(), so we don't
+	 * end up using low-performance oversize sheaves
+ */
+ return max(capacity, args->sheaf_capacity);
}
/*
@@ -7875,6 +7578,7 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s)
{
slab_flags_t flags = s->flags;
unsigned int size = s->object_size;
+ unsigned int aligned_size;
unsigned int order;
/*
@@ -7898,9 +7602,9 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s)
/*
- * If we are Redzoning then check if there is some space between the
- * end of the object and the free pointer. If not then add an
- * additional word to have some bytes to store Redzone information.
+ * If we are Redzoning and there is no space between the end of the
+ * object and the following fields, add one word so the right Redzone
+ * is non-empty.
*/
if ((flags & SLAB_RED_ZONE) && size == s->object_size)
size += sizeof(void *);
@@ -7913,7 +7617,8 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s)
s->inuse = size;
if (((flags & SLAB_TYPESAFE_BY_RCU) && !args->use_freeptr_offset) ||
- (flags & SLAB_POISON) || s->ctor ||
+ (flags & SLAB_POISON) ||
+ (s->ctor && !args->use_freeptr_offset) ||
((flags & SLAB_RED_ZONE) &&
(s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) {
/*
@@ -7934,7 +7639,8 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s)
*/
s->offset = size;
size += sizeof(void *);
- } else if ((flags & SLAB_TYPESAFE_BY_RCU) && args->use_freeptr_offset) {
+ } else if (((flags & SLAB_TYPESAFE_BY_RCU) || s->ctor) &&
+ args->use_freeptr_offset) {
s->offset = args->freeptr_offset;
} else {
/*
@@ -7955,7 +7661,7 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s)
/* Save the original kmalloc request size */
if (flags & SLAB_KMALLOC)
- size += sizeof(unsigned int);
+ size += sizeof(unsigned long);
}
#endif
@@ -7982,7 +7688,14 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s)
* offset 0. In order to align the objects we have to simply size
* each object to conform to the alignment.
*/
- size = ALIGN(size, s->align);
+ aligned_size = ALIGN(size, s->align);
+#if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT)
+ if (slab_args_unmergeable(args, s->flags) &&
+ (aligned_size - size >= sizeof(struct slabobj_ext)))
+ s->flags |= SLAB_OBJ_EXT_IN_OBJ;
+#endif
+ size = aligned_size;
+
s->size = size;
s->reciprocal_size = reciprocal_value(size);
order = calculate_order(size);
@@ -8002,6 +7715,13 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s)
s->allocflags |= __GFP_RECLAIMABLE;
/*
+ * For KMALLOC_NORMAL caches we enable sheaves later by
+ * bootstrap_kmalloc_sheaves() to avoid recursion
+ */
+ if (!is_kmalloc_normal(s))
+ s->sheaf_capacity = calculate_sheaf_capacity(s, args);
+
+ /*
* Determine the number of objects per slab
*/
s->oo = oo_make(order, size);
@@ -8085,7 +7805,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
flush_all_cpus_locked(s);
/* we might have rcu sheaves in flight */
- if (s->cpu_sheaves)
+ if (cache_has_sheaves(s))
rcu_barrier();
/* Attempt to free all objects */
@@ -8397,7 +8117,7 @@ static int slab_mem_going_online_callback(int nid)
if (get_node(s, nid))
continue;
- if (s->cpu_sheaves) {
+ if (cache_has_sheaves(s)) {
barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, nid);
if (!barn) {
@@ -8472,12 +8192,6 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
memcpy(s, static_cache, kmem_cache->object_size);
- /*
- * This runs very early, and only the boot processor is supposed to be
- * up. Even if it weren't true, IRQs are not up so we couldn't fire
- * IPIs around.
- */
- __flush_cpu_slab(s, smp_processor_id());
for_each_kmem_cache_node(s, node, n) {
struct slab *p;
@@ -8493,6 +8207,74 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
return s;
}
+/*
+ * Finish the sheaf initialization normally done by init_percpu_sheaves()
+ * and init_kmem_cache_nodes(). For normal kmalloc caches we have to bootstrap
+ * it here, since sheaves and barns are themselves allocated by kmalloc.
+ */
+static void __init bootstrap_cache_sheaves(struct kmem_cache *s)
+{
+ struct kmem_cache_args empty_args = {};
+ unsigned int capacity;
+ bool failed = false;
+ int node, cpu;
+
+ capacity = calculate_sheaf_capacity(s, &empty_args);
+
+ /* capacity can be 0 due to debugging or SLUB_TINY */
+ if (!capacity)
+ return;
+
+ for_each_node_mask(node, slab_nodes) {
+ struct node_barn *barn;
+
+ barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node);
+
+ if (!barn) {
+ failed = true;
+ goto out;
+ }
+
+ barn_init(barn);
+ get_node(s, node)->barn = barn;
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct slub_percpu_sheaves *pcs;
+
+ pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
+
+ pcs->main = __alloc_empty_sheaf(s, GFP_KERNEL, capacity);
+
+ if (!pcs->main) {
+ failed = true;
+ break;
+ }
+ }
+
+out:
+ /*
+	 * It's still early in boot so treat this the same as a failure to
+ * create the kmalloc cache in the first place
+ */
+ if (failed)
+ panic("Out of memory when creating kmem_cache %s\n", s->name);
+
+ s->sheaf_capacity = capacity;
+}
+
+static void __init bootstrap_kmalloc_sheaves(void)
+{
+ enum kmalloc_cache_type type;
+
+ for (type = KMALLOC_NORMAL; type <= KMALLOC_RANDOM_END; type++) {
+ for (int idx = 0; idx < KMALLOC_SHIFT_HIGH + 1; idx++) {
+ if (kmalloc_caches[type][idx])
+ bootstrap_cache_sheaves(kmalloc_caches[type][idx]);
+ }
+ }
+}
+
void __init kmem_cache_init(void)
{
static __initdata struct kmem_cache boot_kmem_cache,
@@ -8536,6 +8318,8 @@ void __init kmem_cache_init(void)
setup_kmalloc_cache_index_table();
create_kmalloc_caches();
+ bootstrap_kmalloc_sheaves();
+
/* Setup random freelists for each cache */
init_freelist_randomization();
@@ -8554,31 +8338,6 @@ void __init kmem_cache_init_late(void)
WARN_ON(!flushwq);
}
-struct kmem_cache *
-__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
- slab_flags_t flags, void (*ctor)(void *))
-{
- struct kmem_cache *s;
-
- s = find_mergeable(size, align, flags, name, ctor);
- if (s) {
- if (sysfs_slab_alias(s, name))
- pr_err("SLUB: Unable to add cache alias %s to sysfs\n",
- name);
-
- s->refcount++;
-
- /*
- * Adjust the object sizes so that we clear
- * the complete object on kzalloc.
- */
- s->object_size = max(s->object_size, size);
- s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
- }
-
- return s;
-}
-
int do_kmem_cache_create(struct kmem_cache *s, const char *name,
unsigned int size, struct kmem_cache_args *args,
slab_flags_t flags)
@@ -8628,17 +8387,10 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name,
s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2);
s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial);
- set_cpu_partial(s);
-
- if (args->sheaf_capacity && !IS_ENABLED(CONFIG_SLUB_TINY)
- && !(s->flags & SLAB_DEBUG_FLAGS)) {
- s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves);
- if (!s->cpu_sheaves) {
- err = -ENOMEM;
- goto out;
- }
- // TODO: increase capacity to grow slab_sheaf up to next kmalloc size?
- s->sheaf_capacity = args->sheaf_capacity;
+ s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves);
+ if (!s->cpu_sheaves) {
+ err = -ENOMEM;
+ goto out;
}
#ifdef CONFIG_NUMA
@@ -8654,14 +8406,14 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name,
if (!init_kmem_cache_nodes(s))
goto out;
- if (!alloc_kmem_cache_cpus(s))
+#ifdef CONFIG_SLUB_STATS
+ if (!alloc_kmem_cache_stats(s))
goto out;
+#endif
- if (s->cpu_sheaves) {
- err = init_percpu_sheaves(s);
- if (err)
- goto out;
- }
+ err = init_percpu_sheaves(s);
+ if (err)
+ goto out;
err = 0;
@@ -8976,47 +8728,6 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
if (!nodes)
return -ENOMEM;
- if (flags & SO_CPU) {
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab,
- cpu);
- int node;
- struct slab *slab;
-
- slab = READ_ONCE(c->slab);
- if (!slab)
- continue;
-
- node = slab_nid(slab);
- if (flags & SO_TOTAL)
- x = slab->objects;
- else if (flags & SO_OBJECTS)
- x = slab->inuse;
- else
- x = 1;
-
- total += x;
- nodes[node] += x;
-
-#ifdef CONFIG_SLUB_CPU_PARTIAL
- slab = slub_percpu_partial_read_once(c);
- if (slab) {
- node = slab_nid(slab);
- if (flags & SO_TOTAL)
- WARN_ON_ONCE(1);
- else if (flags & SO_OBJECTS)
- WARN_ON_ONCE(1);
- else
- x = data_race(slab->slabs);
- total += x;
- nodes[node] += x;
- }
-#endif
- }
- }
-
/*
* It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex"
* already held which will conflict with an existing lock order:
@@ -9148,12 +8859,7 @@ SLAB_ATTR(min_partial);
static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
{
- unsigned int nr_partial = 0;
-#ifdef CONFIG_SLUB_CPU_PARTIAL
- nr_partial = s->cpu_partial;
-#endif
-
- return sysfs_emit(buf, "%u\n", nr_partial);
+ return sysfs_emit(buf, "0\n");
}
static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
@@ -9165,11 +8871,9 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
err = kstrtouint(buf, 10, &objects);
if (err)
return err;
- if (objects && !kmem_cache_has_cpu_partial(s))
+ if (objects)
return -EINVAL;
- slub_set_cpu_partial(s, objects);
- flush_all(s);
return length;
}
SLAB_ATTR(cpu_partial);
@@ -9208,42 +8912,7 @@ SLAB_ATTR_RO(objects_partial);
static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
{
- int objects = 0;
- int slabs = 0;
- int cpu __maybe_unused;
- int len = 0;
-
-#ifdef CONFIG_SLUB_CPU_PARTIAL
- for_each_online_cpu(cpu) {
- struct slab *slab;
-
- slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
-
- if (slab)
- slabs += data_race(slab->slabs);
- }
-#endif
-
- /* Approximate half-full slabs, see slub_set_cpu_partial() */
- objects = (slabs * oo_objects(s->oo)) / 2;
- len += sysfs_emit_at(buf, len, "%d(%d)", objects, slabs);
-
-#ifdef CONFIG_SLUB_CPU_PARTIAL
- for_each_online_cpu(cpu) {
- struct slab *slab;
-
- slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
- if (slab) {
- slabs = data_race(slab->slabs);
- objects = (slabs * oo_objects(s->oo)) / 2;
- len += sysfs_emit_at(buf, len, " C%d=%d(%d)",
- cpu, objects, slabs);
- }
- }
-#endif
- len += sysfs_emit_at(buf, len, "\n");
-
- return len;
+ return sysfs_emit(buf, "0(0)\n");
}
SLAB_ATTR_RO(slabs_cpu_partial);
@@ -9429,7 +9098,7 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
return -ENOMEM;
for_each_online_cpu(cpu) {
- unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si];
+ unsigned int x = per_cpu_ptr(s->cpu_stats, cpu)->stat[si];
data[cpu] = x;
sum += x;
@@ -9455,7 +9124,7 @@ static void clear_stat(struct kmem_cache *s, enum stat_item si)
int cpu;
for_each_online_cpu(cpu)
- per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0;
+ per_cpu_ptr(s->cpu_stats, cpu)->stat[si] = 0;
}
#define STAT_ATTR(si, text) \
@@ -9473,36 +9142,19 @@ static ssize_t text##_store(struct kmem_cache *s, \
} \
SLAB_ATTR(text); \
-STAT_ATTR(ALLOC_PCS, alloc_cpu_sheaf);
STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
-STAT_ATTR(FREE_PCS, free_cpu_sheaf);
STAT_ATTR(FREE_RCU_SHEAF, free_rcu_sheaf);
STAT_ATTR(FREE_RCU_SHEAF_FAIL, free_rcu_sheaf_fail);
STAT_ATTR(FREE_FASTPATH, free_fastpath);
STAT_ATTR(FREE_SLOWPATH, free_slowpath);
-STAT_ATTR(FREE_FROZEN, free_frozen);
STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
-STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
STAT_ATTR(ALLOC_SLAB, alloc_slab);
-STAT_ATTR(ALLOC_REFILL, alloc_refill);
STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
STAT_ATTR(FREE_SLAB, free_slab);
-STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
-STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
-STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
-STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
-STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
-STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
-STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
STAT_ATTR(ORDER_FALLBACK, order_fallback);
-STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
-STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
-STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
-STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
-STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
STAT_ATTR(SHEAF_FLUSH, sheaf_flush);
STAT_ATTR(SHEAF_REFILL, sheaf_refill);
STAT_ATTR(SHEAF_ALLOC, sheaf_alloc);
@@ -9578,36 +9230,19 @@ static struct attribute *slab_attrs[] = {
&remote_node_defrag_ratio_attr.attr,
#endif
#ifdef CONFIG_SLUB_STATS
- &alloc_cpu_sheaf_attr.attr,
&alloc_fastpath_attr.attr,
&alloc_slowpath_attr.attr,
- &free_cpu_sheaf_attr.attr,
&free_rcu_sheaf_attr.attr,
&free_rcu_sheaf_fail_attr.attr,
&free_fastpath_attr.attr,
&free_slowpath_attr.attr,
- &free_frozen_attr.attr,
&free_add_partial_attr.attr,
&free_remove_partial_attr.attr,
- &alloc_from_partial_attr.attr,
&alloc_slab_attr.attr,
- &alloc_refill_attr.attr,
&alloc_node_mismatch_attr.attr,
&free_slab_attr.attr,
- &cpuslab_flush_attr.attr,
- &deactivate_full_attr.attr,
- &deactivate_empty_attr.attr,
- &deactivate_to_head_attr.attr,
- &deactivate_to_tail_attr.attr,
- &deactivate_remote_frees_attr.attr,
- &deactivate_bypass_attr.attr,
&order_fallback_attr.attr,
&cmpxchg_double_fail_attr.attr,
- &cmpxchg_double_cpu_fail_attr.attr,
- &cpu_partial_alloc_attr.attr,
- &cpu_partial_free_attr.attr,
- &cpu_partial_node_attr.attr,
- &cpu_partial_drain_attr.attr,
&sheaf_flush_attr.attr,
&sheaf_refill_attr.attr,
&sheaf_alloc_attr.attr,
@@ -9811,7 +9446,7 @@ struct saved_alias {
static struct saved_alias *alias_list;
-static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
+int sysfs_slab_alias(struct kmem_cache *s, const char *name)
{
struct saved_alias *al;