From 68889dfd547bd8eabc5a98b58475d7b901cf5129 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima
Date: Fri, 15 Aug 2025 20:16:09 +0000
Subject: mptcp: Fix up subflow's memcg when CONFIG_SOCK_CGROUP_DATA=n.

When sk_alloc() allocates a socket, mem_cgroup_sk_alloc() sets
sk->sk_memcg based on the current task.

MPTCP subflow socket creation is triggered from userspace or an
in-kernel worker.  In the latter case, sk->sk_memcg is not what we
want.  So, we fix it up from the parent socket's sk->sk_memcg in
mptcp_attach_cgroup().

Although the code is placed under #ifdef CONFIG_MEMCG, it is buried
under #ifdef CONFIG_SOCK_CGROUP_DATA.  The two configs are orthogonal.
If CONFIG_MEMCG is enabled without CONFIG_SOCK_CGROUP_DATA, the
subflow's memory usage is not charged correctly.

Let's move the code out of the wrong ifdef guard.

Note that sk->sk_memcg is freed in sk_prot_free() and the parent sk
holds the refcnt of memcg->css here, so we don't need to use
css_tryget().

Fixes: 3764b0c5651e3 ("mptcp: attach subflow socket to parent cgroup")
Signed-off-by: Kuniyuki Iwashima
Reviewed-by: Eric Dumazet
Acked-by: Matthieu Baerts (NGI0)
Acked-by: Shakeel Butt
Link: https://patch.msgid.link/20250815201712.1745332-2-kuniyu@google.com
Signed-off-by: Jakub Kicinski
---
 include/linux/memcontrol.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux/memcontrol.h')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 785173aa0739..25921fbec685 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1604,6 +1604,7 @@ extern struct static_key_false memcg_sockets_enabled_key;
 #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key)
 void mem_cgroup_sk_alloc(struct sock *sk);
 void mem_cgroup_sk_free(struct sock *sk);
+void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk);
 
 #if BITS_PER_LONG < 64
 static inline void mem_cgroup_set_socket_pressure(struct mem_cgroup *memcg)
@@ -1661,6 +1662,11 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg);
 #define mem_cgroup_sockets_enabled 0
 static inline void mem_cgroup_sk_alloc(struct sock *sk) { };
 static inline void mem_cgroup_sk_free(struct sock *sk) { };
+
+static inline void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk)
+{
+}
+
 static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
 {
 	return false;
--
cgit v1.2.3
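The hunk above only adds the declaration and the !CONFIG_MEMCG stub; the
CONFIG_MEMCG body lives outside this header.  As a rough sketch (an
assumption for illustration, not code quoted from the series), it could
look like the following, leaning on the commit message's note that the
parent socket already holds the memcg css reference, so a plain css_get()
is sufficient:

	/* Hypothetical sketch of the out-of-line CONFIG_MEMCG implementation. */
	void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk)
	{
		if (sk->sk_memcg == newsk->sk_memcg)
			return;

		/* Drop whatever mem_cgroup_sk_alloc() picked up from current. */
		mem_cgroup_sk_free(newsk);

		if (sk->sk_memcg)
			css_get(&sk->sk_memcg->css);

		newsk->sk_memcg = sk->sk_memcg;
	}

mptcp_attach_cgroup() would then call this helper instead of copying
sk->sk_memcg by hand, so the fix-up works whether or not
CONFIG_SOCK_CGROUP_DATA is set.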
From bb178c6bc08525d758a57775458d644304011bf8 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima
Date: Fri, 15 Aug 2025 20:16:16 +0000
Subject: net-memcg: Pass struct sock to mem_cgroup_sk_(un)?charge().

We will store a flag in the lowest bit of sk->sk_memcg.  Then, we
cannot pass the raw pointer to mem_cgroup_charge_skmem() and
mem_cgroup_uncharge_skmem().

Let's pass struct sock to the functions.

While at it, they are renamed to match other functions starting with
mem_cgroup_sk_.

Signed-off-by: Kuniyuki Iwashima
Reviewed-by: Eric Dumazet
Acked-by: Roman Gushchin
Acked-by: Shakeel Butt
Link: https://patch.msgid.link/20250815201712.1745332-9-kuniyu@google.com
Signed-off-by: Jakub Kicinski
---
 include/linux/memcontrol.h | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

(limited to 'include/linux/memcontrol.h')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 25921fbec685..0837d3de3a68 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1596,15 +1596,16 @@ static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
 struct sock;
-bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
-			     gfp_t gfp_mask);
-void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages);
 #ifdef CONFIG_MEMCG
 extern struct static_key_false memcg_sockets_enabled_key;
 #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key)
+
 void mem_cgroup_sk_alloc(struct sock *sk);
 void mem_cgroup_sk_free(struct sock *sk);
 void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk);
+bool mem_cgroup_sk_charge(const struct sock *sk, unsigned int nr_pages,
+			  gfp_t gfp_mask);
+void mem_cgroup_sk_uncharge(const struct sock *sk, unsigned int nr_pages);
 
 #if BITS_PER_LONG < 64
 static inline void mem_cgroup_set_socket_pressure(struct mem_cgroup *memcg)
@@ -1660,13 +1661,31 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id);
 void reparent_shrinker_deferred(struct mem_cgroup *memcg);
 #else
 #define mem_cgroup_sockets_enabled 0
-static inline void mem_cgroup_sk_alloc(struct sock *sk) { };
-static inline void mem_cgroup_sk_free(struct sock *sk) { };
+
+static inline void mem_cgroup_sk_alloc(struct sock *sk)
+{
+}
+
+static inline void mem_cgroup_sk_free(struct sock *sk)
+{
+}
 
 static inline void mem_cgroup_sk_inherit(const struct sock *sk, struct sock *newsk)
 {
 }
 
+static inline bool mem_cgroup_sk_charge(const struct sock *sk,
+					unsigned int nr_pages,
+					gfp_t gfp_mask)
+{
+	return false;
+}
+
+static inline void mem_cgroup_sk_uncharge(const struct sock *sk,
+					  unsigned int nr_pages)
+{
+}
+
 static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
 {
 	return false;
--
cgit v1.2.3
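Only the prototypes change in this header; the new sock-based helpers
presumably just look the memcg up from the socket and forward to the
existing skmem charging logic.  A minimal sketch of that shape (assumed,
not quoted from the patch; it relies on mem_cgroup_from_sk() introduced
elsewhere in the series and on the old *_skmem helpers remaining
available internally):

	bool mem_cgroup_sk_charge(const struct sock *sk, unsigned int nr_pages,
				  gfp_t gfp_mask)
	{
		struct mem_cgroup *memcg = mem_cgroup_from_sk(sk);

		return mem_cgroup_charge_skmem(memcg, nr_pages, gfp_mask);
	}

	void mem_cgroup_sk_uncharge(const struct sock *sk, unsigned int nr_pages)
	{
		mem_cgroup_uncharge_skmem(mem_cgroup_from_sk(sk), nr_pages);
	}

Callers in net/core then pass the socket they already have instead of
dereferencing sk->sk_memcg themselves, which is what makes the later
low-bit flag possible.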
From b2ffd10cddde47cc6830e4981e91e3215def62b1 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima
Date: Fri, 15 Aug 2025 20:16:17 +0000
Subject: net-memcg: Pass struct sock to mem_cgroup_sk_under_memory_pressure().

We will store a flag in the lowest bit of sk->sk_memcg.  Then, we
cannot pass the raw pointer to mem_cgroup_under_socket_pressure().

Let's pass struct sock to it and rename the function to match other
functions starting with mem_cgroup_sk_.

Note that the helper is moved to sock.h to use mem_cgroup_from_sk().

Signed-off-by: Kuniyuki Iwashima
Reviewed-by: Eric Dumazet
Acked-by: Roman Gushchin
Acked-by: Shakeel Butt
Link: https://patch.msgid.link/20250815201712.1745332-10-kuniyu@google.com
Signed-off-by: Jakub Kicinski
---
 include/linux/memcontrol.h | 18 ------------------
 1 file changed, 18 deletions(-)

(limited to 'include/linux/memcontrol.h')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0837d3de3a68..fb27e3d2fdac 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1642,19 +1642,6 @@ static inline u64 mem_cgroup_get_socket_pressure(struct mem_cgroup *memcg)
 }
 #endif
 
-static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
-{
-#ifdef CONFIG_MEMCG_V1
-	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
-		return !!memcg->tcpmem_pressure;
-#endif /* CONFIG_MEMCG_V1 */
-	do {
-		if (time_before64(get_jiffies_64(), mem_cgroup_get_socket_pressure(memcg)))
-			return true;
-	} while ((memcg = parent_mem_cgroup(memcg)));
-	return false;
-}
-
 int alloc_shrinker_info(struct mem_cgroup *memcg);
 void free_shrinker_info(struct mem_cgroup *memcg);
 void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id);
@@ -1686,11 +1673,6 @@ static inline void mem_cgroup_sk_uncharge(const struct sock *sk,
 {
 }
 
-static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
-{
-	return false;
-}
-
 static inline void set_shrinker_bit(struct mem_cgroup *memcg,
 				    int nid, int shrinker_id)
 {
--
cgit v1.2.3
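The deleted header copy above is essentially the whole story; per the
commit message the same logic reappears in sock.h behind the socket.  A
sketch of what that sock.h helper plausibly looks like (the name follows
the subject line; the body is assumed to be the removed code with
mem_cgroup_from_sk() resolving the memcg, so treat it as illustrative
rather than the exact upstream text):

	static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk)
	{
		struct mem_cgroup *memcg = mem_cgroup_from_sk(sk);

	#ifdef CONFIG_MEMCG_V1
		if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
			return !!memcg->tcpmem_pressure;
	#endif
		do {
			if (time_before64(get_jiffies_64(),
					  mem_cgroup_get_socket_pressure(memcg)))
				return true;
		} while ((memcg = parent_mem_cgroup(memcg)));

		return false;
	}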
From ec45783fce52f358c9e8680d2837bc0d477f16ad Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 13 Aug 2025 16:57:55 +0200
Subject: memcg: optimize exit to user space

memcg uses TIF_NOTIFY_RESUME to handle reclaiming on exit to user space.

TIF_NOTIFY_RESUME is a multiplexing TIF bit, which is utilized by other
entities as well.  This results in an unconditional
mem_cgroup_handle_over_high() call for every invocation of
resume_user_mode_work(), which is a pointless exercise as most of the
time there is no reclaim work to do.

Especially since RSEQ is used by glibc, TIF_NOTIFY_RESUME is raised
quite frequently and the empty calls show up in exit path profiling.

Optimize this by doing a quick check of the reclaim condition before
invoking it.

[akpm@linux-foundation.org: remove now-unneeded test of memcg_nr_pages_over_high==0, per Shakeel]
Link: https://lkml.kernel.org/r/87tt2b6zgs.ffs@tglx
Signed-off-by: Thomas Gleixner
Reviewed-by: Roman Gushchin
Acked-by: Johannes Weiner
Acked-by: Shakeel Butt
Cc: Michal Hocko
Cc: Muchun Song
Cc: Peter Zijlstra
Signed-off-by: Andrew Morton
---
 include/linux/memcontrol.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux/memcontrol.h')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 785173aa0739..9fa3afc90dd5 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -900,7 +900,13 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
 	return READ_ONCE(mz->lru_zone_size[zone_idx][lru]);
 }
 
-void mem_cgroup_handle_over_high(gfp_t gfp_mask);
+void __mem_cgroup_handle_over_high(gfp_t gfp_mask);
+
+static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask)
+{
+	if (unlikely(current->memcg_nr_pages_over_high))
+		__mem_cgroup_handle_over_high(gfp_mask);
+}
 
 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);
 
--
cgit v1.2.3
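For context, the only caller the message is concerned with is the
exit-to-user path.  A sketch of that call site (from memory, not part of
this diff; the exact surrounding work and the GFP_KERNEL argument are
assumptions) shows why the new inline helps: the call stays
unconditional, but in the common case it now costs a single test of
current->memcg_nr_pages_over_high instead of a function call:

	/* Sketch of the relevant part of resume_user_mode_work(). */
	static inline void resume_user_mode_work(struct pt_regs *regs)
	{
		clear_thread_flag(TIF_NOTIFY_RESUME);

		/* Other TIF_NOTIFY_RESUME users (task_work, rseq, ...) elided. */

		/* Compiles down to one branch unless reclaim is actually pending. */
		mem_cgroup_handle_over_high(GFP_KERNEL);
	}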
From cf1dec76ba8a00b20e51d205f3c9f5c45bc96df2 Mon Sep 17 00:00:00 2001
From: Boris Burkov
Date: Thu, 21 Aug 2025 14:55:35 -0700
Subject: mm/filemap: add AS_KERNEL_FILE

Patch series "introduce kernel file mapped folios", v4.

Btrfs currently tracks its metadata pages in the page cache, using a
fake inode (fs_info->btree_inode) with offsets corresponding to where
the metadata is stored in the filesystem's full logical address space.

A consequence of this is that when btrfs uses filemap_add_folio(), this
usage is charged to the cgroup of whichever task happens to be running
at the time.  These folios don't belong to any particular user cgroup,
so I don't think it makes much sense for them to be charged in that
way.  Some negative consequences as a result:

- A task can be holding some important btrfs locks, then need to look up
  some metadata and go into reclaim, extending the duration it holds
  that lock for, and unfairly pushing its own reclaim pain onto other
  cgroups.
- If that cgroup goes into reclaim, it might reclaim these folios that a
  different non-reclaiming cgroup might need soon.  This is naturally
  offset by LRU reclaim, but still.

We have two options for how to manage such file pages:
1. charge them to the root cgroup.
2. don't charge them to any cgroup at all.

Option 2 breaks the invariant that every mapped page has a cgroup.  This
is workable, but unnecessarily risky.  Therefore, go with option 1.

A very similar proposal to use the root cgroup was previously made by
Qu, where he eventually proposed the idea of setting it per
address_space.  This makes good sense for the btrfs use case, as the
behavior should apply to all use of the address_space, not select
allocations.  I.e., if someone adds another filemap_add_folio() call
using btrfs's btree_inode, we would almost certainly want to account
that to the root cgroup as well.

This patch (of 3):

Add the flag AS_KERNEL_FILE to the address_space to indicate that this
mapping's memory is exempt from the usual memcg accounting.

[boris@bur.io: fix CONFIG_MEMCG build for AS_KERNEL_FILE]
Link: https://lkml.kernel.org/r/6de59ddeec81b5c294d337c001ba0061631d4ec6.1755816635.git.boris@bur.io
Link: https://lore.kernel.org/linux-mm/b5fef5372ae454a7b6da4f2f75c427aeab6a07d6.1727498749.git.wqu@suse.com/
Link: https://lkml.kernel.org/r/f09c4e2c90351d4cb30a1969f7a863b9238bd291.1755812945.git.boris@bur.io
Signed-off-by: Boris Burkov
Suggested-by: Qu Wenruo
Suggested-by: Shakeel Butt
Acked-by: Shakeel Butt
Cc: Johannes Weiner
Cc: Matthew Wilcox (Oracle)
Cc: Michal Hocko
Cc: Muchun Song
Cc: Roman Gushchin
Signed-off-by: Andrew Morton
---
 include/linux/memcontrol.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux/memcontrol.h')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 9fa3afc90dd5..e693978b2022 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1059,6 +1059,8 @@ extern int mem_cgroup_init(void);
 
 #define MEM_CGROUP_ID_SHIFT	0
 
+#define root_mem_cgroup (NULL)
+
 static inline struct mem_cgroup *folio_memcg(struct folio *folio)
 {
 	return NULL;
--
cgit v1.2.3

From 7612833192d56af86061de8ab51989b75daf5b0d Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Mon, 8 Sep 2025 18:00:06 -0700
Subject: slab: Reuse first bit for OBJEXTS_ALLOC_FAIL

Since the combination of valid upper bits in slab->obj_exts with the
OBJEXTS_ALLOC_FAIL bit can never happen, use OBJEXTS_ALLOC_FAIL ==
(1ull << 0) as a magic sentinel instead of (1ull << 2) to free up bit 2.

Signed-off-by: Alexei Starovoitov
Acked-by: Shakeel Butt
Reviewed-by: Harry Yoo
Signed-off-by: Vlastimil Babka
---
 include/linux/memcontrol.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/linux/memcontrol.h')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 785173aa0739..d254c0b96d0d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -341,17 +341,23 @@ enum page_memcg_data_flags {
 	__NR_MEMCG_DATA_FLAGS	= (1UL << 2),
 };
 
+#define __OBJEXTS_ALLOC_FAIL	MEMCG_DATA_OBJEXTS
 #define __FIRST_OBJEXT_FLAG	__NR_MEMCG_DATA_FLAGS
 
 #else /* CONFIG_MEMCG */
 
+#define __OBJEXTS_ALLOC_FAIL	(1UL << 0)
 #define __FIRST_OBJEXT_FLAG	(1UL << 0)
 
 #endif /* CONFIG_MEMCG */
 
 enum objext_flags {
-	/* slabobj_ext vector failed to allocate */
-	OBJEXTS_ALLOC_FAIL = __FIRST_OBJEXT_FLAG,
+	/*
+	 * Use bit 0 with zero other bits to signal that slabobj_ext vector
+	 * failed to allocate. The same bit 0 with valid upper bits means
+	 * MEMCG_DATA_OBJEXTS.
+	 */
+	OBJEXTS_ALLOC_FAIL = __OBJEXTS_ALLOC_FAIL,
 	/* the next bit after the last actual flag */
 	__NR_OBJEXTS_FLAGS = (__FIRST_OBJEXT_FLAG << 1),
 };
--
cgit v1.2.3
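With bit 0 now doing double duty, a reader of slab->obj_exts tells the
two cases apart by whether any pointer bits are set above the flag area.
A small sketch of such a reader (a hypothetical helper, not from the
patch; the slab allocator's real accessor is slab_obj_exts() in
mm/slab.h):

	static inline struct slabobj_ext *obj_exts_or_null(struct slab *slab)
	{
		unsigned long obj_exts = READ_ONCE(slab->obj_exts);

		/* Bit 0 alone, with no upper bits, is the allocation-failure sentinel. */
		if (obj_exts == OBJEXTS_ALLOC_FAIL)
			return NULL;

		/* Otherwise the upper bits hold the vector; mask off the flag bits. */
		return (struct slabobj_ext *)(obj_exts & ~(__NR_OBJEXTS_FLAGS - 1));
	}

Freeing bit 2 this way is what lets the next patch claim
__FIRST_OBJEXT_FLAG for a new flag without growing the flag area.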
From af92793e52c3a99b828ed4bdd277fd3e11c18d08 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Mon, 8 Sep 2025 18:00:07 -0700
Subject: slab: Introduce kmalloc_nolock() and kfree_nolock().

kmalloc_nolock() relies on the ability of local_trylock_t to detect
the situation when a per-cpu kmem_cache is locked.

In !PREEMPT_RT local_(try)lock_irqsave(&s->cpu_slab->lock, flags)
disables IRQs and marks s->cpu_slab->lock as acquired.
local_lock_is_locked(&s->cpu_slab->lock) returns true when slab is in
the middle of manipulating the per-cpu cache of that specific
kmem_cache.

kmalloc_nolock() can be called from any context and can re-enter into
___slab_alloc():

  kmalloc() -> ___slab_alloc(cache_A) -> irqsave -> NMI -> bpf ->
    kmalloc_nolock() -> ___slab_alloc(cache_B)
or
  kmalloc() -> ___slab_alloc(cache_A) -> irqsave -> tracepoint/kprobe ->
    bpf -> kmalloc_nolock() -> ___slab_alloc(cache_B)

Hence the caller of ___slab_alloc() checks if &s->cpu_slab->lock can be
acquired without a deadlock before invoking the function.  If that
specific per-cpu kmem_cache is busy, kmalloc_nolock() retries in a
different kmalloc bucket.  The second attempt will likely succeed,
since this cpu locked a different kmem_cache.

Similarly, in PREEMPT_RT local_lock_is_locked() returns true when the
per-cpu rt_spin_lock is locked by the current _task_.  In this case
re-entrance into the same kmalloc bucket is unsafe, and kmalloc_nolock()
tries a different bucket that is most likely not locked by the current
task.  Though it may be locked by a different task, it's safe to
rt_spin_lock() and sleep on it.

Similar to alloc_pages_nolock(), kmalloc_nolock() returns NULL
immediately if called from hard irq or NMI in PREEMPT_RT.

kfree_nolock() defers freeing to irq_work when local_lock_is_locked()
and (in_nmi() or in PREEMPT_RT).

SLUB_TINY config doesn't use local_lock_is_locked() and relies on
spin_trylock_irqsave(&n->list_lock) to allocate, while kfree_nolock()
always defers to irq_work.

Note, kfree_nolock() must be called _only_ for objects allocated with
kmalloc_nolock().  Debug checks (like kmemleak and kfence) were skipped
on allocation, hence

  obj = kmalloc(); kfree_nolock(obj);

will miss kmemleak/kfence book keeping and will cause false positives.
large_kmalloc is not supported by either kmalloc_nolock() or
kfree_nolock().

Signed-off-by: Alexei Starovoitov
Reviewed-by: Harry Yoo
Signed-off-by: Vlastimil Babka
---
 include/linux/memcontrol.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux/memcontrol.h')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index d254c0b96d0d..82563236f35c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -358,6 +358,8 @@ enum objext_flags {
 	 * MEMCG_DATA_OBJEXTS.
 	 */
 	OBJEXTS_ALLOC_FAIL = __OBJEXTS_ALLOC_FAIL,
+	/* slabobj_ext vector allocated with kmalloc_nolock() */
+	OBJEXTS_NOSPIN_ALLOC = __FIRST_OBJEXT_FLAG,
 	/* the next bit after the last actual flag */
 	__NR_OBJEXTS_FLAGS = (__FIRST_OBJEXT_FLAG << 1),
 };
--
cgit v1.2.3
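The memcontrol.h hunk is just the bookkeeping flag; the intended usage,
pieced together from the commit message, looks roughly as below.  The
prototypes are assumed here to be kmalloc_nolock(size, gfp_flags, node)
and kfree_nolock(obj) (they live in slab.h, not this header), and
struct my_elem is a placeholder:

	/* e.g. from a BPF program attached to a tracepoint or running in NMI */
	struct my_elem *e = kmalloc_nolock(sizeof(*e), __GFP_ZERO, NUMA_NO_NODE);

	if (!e)
		return -ENOMEM;	/* can fail where plain kmalloc() would retry or sleep */

	/*
	 * Objects from kmalloc_nolock() must go back through kfree_nolock(),
	 * never plain kfree_nolock() of a kmalloc() object: kmemleak/kfence
	 * bookkeeping was skipped at allocation time.
	 */
	kfree_nolock(e);

Note the pairing rule: mixing a regular kmalloc() with kfree_nolock() is
exactly the false-positive case the message warns about.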
From fcc0669c5aa681994c507b50f1c706c969d99730 Mon Sep 17 00:00:00 2001
From: Shakeel Butt
Date: Mon, 22 Sep 2025 15:02:03 -0700
Subject: memcg: skip cgroup_file_notify if spinning is not allowed

Generally memcg charging is allowed from all contexts, including NMI,
where even spinning on a spinlock can cause locking issues.  However one
call chain was missed during the addition of support for memcg charging
from any context.  That is:

  try_charge_memcg() -> memcg_memory_event() -> cgroup_file_notify()

The possible function call tree under cgroup_file_notify() can acquire
many different spin locks in spinning mode.  Some of them are
cgroup_file_kn_lock, kernfs_notify_lock and pool_workqueue's lock.  So,
let's just skip cgroup_file_notify() from memcg charging if the context
does not allow spinning.

An alternative approach was also explored where, instead of skipping
cgroup_file_notify(), we defer the memcg event processing to irq_work
[1].  However it adds complexity and it was decided to keep things
simple until we need more memcg events with the !allow_spinning
requirement.

Link: https://lore.kernel.org/all/5qi2llyzf7gklncflo6gxoozljbm4h3tpnuv4u4ej4ztysvi6f@x44v7nz2wdzd/ [1]
Link: https://lkml.kernel.org/r/20250922220203.261714-1-shakeel.butt@linux.dev
Fixes: 3ac4638a734a ("memcg: make memcg_rstat_updated nmi safe")
Signed-off-by: Shakeel Butt
Acked-by: Michal Hocko
Closes: https://lore.kernel.org/all/20250905061919.439648-1-yepeilin@google.com/
Cc: Alexei Starovoitov
Cc: Johannes Weiner
Cc: Kumar Kartikeya Dwivedi
Cc: Muchun Song
Cc: Peilin Ye
Cc: Roman Gushchin
Cc: Tejun Heo
Cc:
Signed-off-by: Andrew Morton
---
 include/linux/memcontrol.h | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

(limited to 'include/linux/memcontrol.h')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 16fe0306e50e..873e510d6f8d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1001,22 +1001,28 @@ static inline void count_memcg_event_mm(struct mm_struct *mm,
 	count_memcg_events_mm(mm, idx, 1);
 }
 
-static inline void memcg_memory_event(struct mem_cgroup *memcg,
-				      enum memcg_memory_event event)
+static inline void __memcg_memory_event(struct mem_cgroup *memcg,
+					enum memcg_memory_event event,
+					bool allow_spinning)
 {
 	bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX ||
 			  event == MEMCG_SWAP_FAIL;
 
+	/* For now only MEMCG_MAX can happen with !allow_spinning context. */
+	VM_WARN_ON_ONCE(!allow_spinning && event != MEMCG_MAX);
+
 	atomic_long_inc(&memcg->memory_events_local[event]);
-	if (!swap_event)
+	if (!swap_event && allow_spinning)
 		cgroup_file_notify(&memcg->events_local_file);
 
 	do {
 		atomic_long_inc(&memcg->memory_events[event]);
-		if (swap_event)
-			cgroup_file_notify(&memcg->swap_events_file);
-		else
-			cgroup_file_notify(&memcg->events_file);
+		if (allow_spinning) {
+			if (swap_event)
+				cgroup_file_notify(&memcg->swap_events_file);
+			else
+				cgroup_file_notify(&memcg->events_file);
+		}
 
 		if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
 			break;
@@ -1026,6 +1032,12 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg,
 		 !mem_cgroup_is_root(memcg));
 }
 
+static inline void memcg_memory_event(struct mem_cgroup *memcg,
+				      enum memcg_memory_event event)
+{
+	__memcg_memory_event(memcg, event, true);
+}
+
 static inline void memcg_memory_event_mm(struct mm_struct *mm,
 					 enum memcg_memory_event event)
 {
--
cgit v1.2.3
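On the caller side, the decision comes from the gfp mask at charge time.
A sketch of how try_charge_memcg() presumably feeds the new argument
(gfpflags_allow_spinning() is assumed to be the existing predicate from
the earlier nmi-safe charging work, not something added by this patch):

	/* inside try_charge_memcg(), sketch only */
	bool allow_spinning = gfpflags_allow_spinning(gfp_mask);

	/* was: memcg_memory_event(mem_over_limit, MEMCG_MAX); */
	__memcg_memory_event(mem_over_limit, MEMCG_MAX, allow_spinning);

Every other existing caller keeps using memcg_memory_event(), which the
new wrapper forwards with allow_spinning == true, so behaviour there is
unchanged.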