From 79f919a89c9d06816dbdbbd168fa41d27411a7f9 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Tue, 19 Aug 2025 01:07:24 +0000 Subject: cgroup: split cgroup_destroy_wq into 3 workqueues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A hung task can occur during [1] LTP cgroup testing when repeatedly mounting/unmounting perf_event and net_prio controllers with systemd.unified_cgroup_hierarchy=1. The hang manifests in cgroup_lock_and_drain_offline() during root destruction. Related case: cgroup_fj_function_perf_event cgroup_fj_function.sh perf_event cgroup_fj_function_net_prio cgroup_fj_function.sh net_prio Call Trace: cgroup_lock_and_drain_offline+0x14c/0x1e8 cgroup_destroy_root+0x3c/0x2c0 css_free_rwork_fn+0x248/0x338 process_one_work+0x16c/0x3b8 worker_thread+0x22c/0x3b0 kthread+0xec/0x100 ret_from_fork+0x10/0x20 Root Cause: CPU0 CPU1 mount perf_event umount net_prio cgroup1_get_tree cgroup_kill_sb rebind_subsystems // root destruction enqueues // cgroup_destroy_wq // kill all perf_event css // one perf_event css A is dying // css A offline enqueues cgroup_destroy_wq // root destruction will be executed first css_free_rwork_fn cgroup_destroy_root cgroup_lock_and_drain_offline // some perf descendants are dying // cgroup_destroy_wq max_active = 1 // waiting for css A to die Problem scenario: 1. CPU0 mounts perf_event (rebind_subsystems) 2. CPU1 unmounts net_prio (cgroup_kill_sb), queuing root destruction work 3. A dying perf_event CSS gets queued for offline after root destruction 4. Root destruction waits for offline completion, but offline work is blocked behind root destruction in cgroup_destroy_wq (max_active=1) Solution: Split cgroup_destroy_wq into three dedicated workqueues: cgroup_offline_wq – Handles CSS offline operations cgroup_release_wq – Manages resource release cgroup_free_wq – Performs final memory deallocation This separation eliminates blocking in the CSS free path while waiting for offline operations to complete. [1] https://github.com/linux-test-project/ltp/blob/master/runtest/controllers Fixes: 334c3679ec4b ("cgroup: reimplement rebind_subsystems() using cgroup_apply_control() and friends") Reported-by: Gao Yingjie Signed-off-by: Chen Ridong Suggested-by: Teju Heo Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 43 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 312c6a8b55bb..79b1d79f86a3 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -126,8 +126,31 @@ DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); * of concurrent destructions. Use a separate workqueue so that cgroup * destruction work items don't end up filling up max_active of system_wq * which may lead to deadlock. + * + * A cgroup destruction should enqueue work sequentially to: + * cgroup_offline_wq: use for css offline work + * cgroup_release_wq: use for css release work + * cgroup_free_wq: use for free work + * + * Rationale for using separate workqueues: + * The cgroup root free work may depend on completion of other css offline + * operations. If all tasks were enqueued to a single workqueue, this could + * create a deadlock scenario where: + * - Free work waits for other css offline work to complete. + * - But other css offline work is queued after free work in the same queue. + * + * Example deadlock scenario with single workqueue (cgroup_destroy_wq): + * 1. umount net_prio + * 2. net_prio root destruction enqueues work to cgroup_destroy_wq (CPUx) + * 3. perf_event CSS A offline enqueues work to same cgroup_destroy_wq (CPUx) + * 4. net_prio cgroup_destroy_root->cgroup_lock_and_drain_offline. + * 5. net_prio root destruction blocks waiting for perf_event CSS A offline, + * which can never complete as it's behind in the same queue and + * workqueue's max_active is 1. */ -static struct workqueue_struct *cgroup_destroy_wq; +static struct workqueue_struct *cgroup_offline_wq; +static struct workqueue_struct *cgroup_release_wq; +static struct workqueue_struct *cgroup_free_wq; /* generate an array of cgroup subsystem pointers */ #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, @@ -5558,7 +5581,7 @@ static void css_release_work_fn(struct work_struct *work) cgroup_unlock(); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); - queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); + queue_rcu_work(cgroup_free_wq, &css->destroy_rwork); } static void css_release(struct percpu_ref *ref) @@ -5567,7 +5590,7 @@ static void css_release(struct percpu_ref *ref) container_of(ref, struct cgroup_subsys_state, refcnt); INIT_WORK(&css->destroy_work, css_release_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + queue_work(cgroup_release_wq, &css->destroy_work); } static void init_and_link_css(struct cgroup_subsys_state *css, @@ -5701,7 +5724,7 @@ err_list_del: list_del_rcu(&css->sibling); err_free_css: INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); - queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); + queue_rcu_work(cgroup_free_wq, &css->destroy_rwork); return ERR_PTR(err); } @@ -5939,7 +5962,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref) if (atomic_dec_and_test(&css->online_cnt)) { INIT_WORK(&css->destroy_work, css_killed_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + queue_work(cgroup_offline_wq, &css->destroy_work); } } @@ -6325,8 +6348,14 @@ static int __init cgroup_wq_init(void) * We would prefer to do this in cgroup_init() above, but that * is called before init_workqueues(): so leave this until after. */ - cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); - BUG_ON(!cgroup_destroy_wq); + cgroup_offline_wq = alloc_workqueue("cgroup_offline", 0, 1); + BUG_ON(!cgroup_offline_wq); + + cgroup_release_wq = alloc_workqueue("cgroup_release", 0, 1); + BUG_ON(!cgroup_release_wq); + + cgroup_free_wq = alloc_workqueue("cgroup_free", 0, 1); + BUG_ON(!cgroup_free_wq); return 0; } core_initcall(cgroup_wq_init); -- cgit v1.2.3 From 94a4acfec14615e971eb2c9e1fa6c992c85ff6c6 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Fri, 22 Aug 2025 07:07:15 +0000 Subject: cgroup/psi: Set of->priv to NULL upon file release Setting of->priv to NULL when the file is released enables earlier bug detection. This allows potential bugs to manifest as NULL pointer dereferences rather than use-after-free errors[1], which are generally more difficult to diagnose. [1] https://lore.kernel.org/cgroups/38ef3ff9-b380-44f0-9315-8b3714b0948d@huaweicloud.com/T/#m8a3b3f88f0ff3da5925d342e90043394f8b2091b Signed-off-by: Chen Ridong Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 79b1d79f86a3..77d02f87f3f1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4182,6 +4182,7 @@ static void cgroup_file_release(struct kernfs_open_file *of) cft->release(of); put_cgroup_ns(ctx->ns); kfree(ctx); + of->priv = NULL; } static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, -- cgit v1.2.3 From e895f8e29119c8c966ea794af9e9100b10becb88 Mon Sep 17 00:00:00 2001 From: Xiongfeng Wang Date: Tue, 5 Aug 2025 16:10:25 +0800 Subject: hrtimers: Unconditionally update target CPU base after offline timer migration When testing softirq based hrtimers on an ARM32 board, with high resolution mode and NOHZ inactive, softirq based hrtimers fail to expire after being moved away from an offline CPU: CPU0 CPU1 hrtimer_start(..., HRTIMER_MODE_SOFT); cpu_down(CPU1) ... hrtimers_cpu_dying() // Migrate timers to CPU0 smp_call_function_single(CPU0, returgger_next_event); retrigger_next_event() if (!highres && !nohz) return; As retrigger_next_event() is a NOOP when both high resolution timers and NOHZ are inactive CPU0's hrtimer_cpu_base::softirq_expires_next is not updated and the migrated softirq timers never expire unless there is a softirq based hrtimer queued on CPU0 later. Fix this by removing the hrtimer_hres_active() and tick_nohz_active() check in retrigger_next_event(), which enforces a full update of the CPU base. As this is not a fast path the extra cost does not matter. [ tglx: Massaged change log ] Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier") Co-developed-by: Frederic Weisbecker Signed-off-by: Frederic Weisbecker Signed-off-by: Xiongfeng Wang Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250805081025.54235-1-wangxiongfeng2@huawei.com --- kernel/time/hrtimer.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 30899a8cc52c..e8c479329282 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -787,10 +787,10 @@ static void retrigger_next_event(void *arg) * of the next expiring timer is enough. The return from the SMP * function call will take care of the reprogramming in case the * CPU was in a NOHZ idle sleep. + * + * In periodic low resolution mode, the next softirq expiration + * must also be updated. */ - if (!hrtimer_hres_active(base) && !tick_nohz_active) - return; - raw_spin_lock(&base->lock); hrtimer_update_base(base); if (hrtimer_hres_active(base)) @@ -2295,11 +2295,6 @@ int hrtimers_cpu_dying(unsigned int dying_cpu) &new_base->clock_base[i]); } - /* - * The migration might have changed the first expiring softirq - * timer on this CPU. Update it. - */ - __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT); /* Tell the other CPU to retrigger the next event */ smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); -- cgit v1.2.3 From 0b47b6c3543efd65f2e620e359b05f4938314fbd Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Fri, 12 Sep 2025 18:14:38 +0200 Subject: Revert "sched_ext: Skip per-CPU tasks in scx_bpf_reenqueue_local()" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit scx_bpf_reenqueue_local() can be called from ops.cpu_release() when a CPU is taken by a higher scheduling class to give tasks queued to the CPU's local DSQ a chance to be migrated somewhere else, instead of waiting indefinitely for that CPU to become available again. In doing so, we decided to skip migration-disabled tasks, under the assumption that they cannot be migrated anyway. However, when a higher scheduling class preempts a CPU, the running task is always inserted at the head of the local DSQ as a migration-disabled task. This means it is always skipped by scx_bpf_reenqueue_local(), and ends up being confined to the same CPU even if that CPU is heavily contended by other higher scheduling class tasks. As an example, let's consider the following scenario: $ schedtool -a 0,1, -e yes > /dev/null $ sudo schedtool -F -p 99 -a 0, -e \ stress-ng -c 1 --cpu-load 99 --cpu-load-slice 1000 The first task (SCHED_EXT) can run on CPU0 or CPU1. The second task (SCHED_FIFO) is pinned to CPU0 and consumes ~99% of it. If the SCHED_EXT task initially runs on CPU0, it will remain there because it always sees CPU0 as "idle" in the short gaps left by the RT task, resulting in ~1% utilization while CPU1 stays idle: 0[||||||||||||||||||||||100.0%] 8[ 0.0%] 1[ 0.0%] 9[ 0.0%] 2[ 0.0%] 10[ 0.0%] 3[ 0.0%] 11[ 0.0%] 4[ 0.0%] 12[ 0.0%] 5[ 0.0%] 13[ 0.0%] 6[ 0.0%] 14[ 0.0%] 7[ 0.0%] 15[ 0.0%] PID USER PRI NI S CPU CPU%▽MEM% TIME+ Command 1067 root RT 0 R 0 99.0 0.2 0:31.16 stress-ng-cpu [run] 975 arighi 20 0 R 0 1.0 0.0 0:26.32 yes By allowing scx_bpf_reenqueue_local() to re-enqueue migration-disabled tasks, the scheduler can choose to migrate them to other CPUs (CPU1 in this case) via ops.enqueue(), leading to better CPU utilization: 0[||||||||||||||||||||||100.0%] 8[ 0.0%] 1[||||||||||||||||||||||100.0%] 9[ 0.0%] 2[ 0.0%] 10[ 0.0%] 3[ 0.0%] 11[ 0.0%] 4[ 0.0%] 12[ 0.0%] 5[ 0.0%] 13[ 0.0%] 6[ 0.0%] 14[ 0.0%] 7[ 0.0%] 15[ 0.0%] PID USER PRI NI S CPU CPU%▽MEM% TIME+ Command 577 root RT 0 R 0 100.0 0.2 0:23.17 stress-ng-cpu [run] 555 arighi 20 0 R 1 100.0 0.0 0:28.67 yes It's debatable whether per-CPU tasks should be re-enqueued as well, but doing so is probably safer: the scheduler can recognize re-enqueued tasks through the %SCX_ENQ_REENQ flag, reassess their placement, and either put them back at the head of the local DSQ or let another task attempt to take the CPU. This also prevents giving per-CPU tasks an implicit priority boost, which would otherwise make them more likely to reclaim CPUs preempted by higher scheduling classes. Fixes: 97e13ecb02668 ("sched_ext: Skip per-CPU tasks in scx_bpf_reenqueue_local()") Cc: stable@vger.kernel.org # v6.15+ Signed-off-by: Andrea Righi Acked-by: Changwoo Min Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 4ae32ef179dd..088ceff38c8a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6788,12 +6788,8 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void) * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to * the current local DSQ for running tasks and thus are not * visible to the BPF scheduler. - * - * Also skip re-enqueueing tasks that can only run on this - * CPU, as they would just be re-added to the same local - * DSQ without any benefit. */ - if (p->migration_pending || is_migration_disabled(p) || p->nr_cpus_allowed == 1) + if (p->migration_pending) continue; dispatch_dequeue(rq, p); -- cgit v1.2.3 From a1eab4d813f7b6e606ed21381b8cfda5c59a87e5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 16 Sep 2025 11:06:42 -1000 Subject: sched_ext, sched/core: Fix build failure when !FAIR_GROUP_SCHED && EXT_GROUP_SCHED While collecting SCX related fields in struct task_group into struct scx_task_group, 6e6558a6bc41 ("sched_ext, sched/core: Factor out struct scx_task_group") forgot update tg->scx_weight usage in tg_weight(), which leads to build failure when CONFIG_FAIR_GROUP_SCHED is disabled but CONFIG_EXT_GROUP_SCHED is enabled. Fix it. Fixes: 6e6558a6bc41 ("sched_ext, sched/core: Factor out struct scx_task_group") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202509170230.MwZsJSWa-lkp@intel.com/ Tested-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index be00629f0ba4..ccba6fc3c3fe 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9551,7 +9551,7 @@ static unsigned long tg_weight(struct task_group *tg) #ifdef CONFIG_FAIR_GROUP_SCHED return scale_load_down(tg->shares); #else - return sched_weight_from_cgroup(tg->scx_weight); + return sched_weight_from_cgroup(tg->scx.weight); #endif } -- cgit v1.2.3 From dc3382fffdec2c1d6df5836c88fa37b39cd8651e Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Tue, 16 Sep 2025 15:58:16 +0800 Subject: tracing: kprobe-event: Fix null-ptr-deref in trace_kprobe_create_internal() A crash was observed with the following output: Oops: general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] SMP KASAN PTI KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] CPU: 1 UID: 0 PID: 2899 Comm: syz.2.399 Not tainted 6.17.0-rc5+ #5 PREEMPT(none) RIP: 0010:trace_kprobe_create_internal+0x3fc/0x1440 kernel/trace/trace_kprobe.c:911 Call Trace: trace_kprobe_create_cb+0xa2/0xf0 kernel/trace/trace_kprobe.c:1089 trace_probe_create+0xf1/0x110 kernel/trace/trace_probe.c:2246 dyn_event_create+0x45/0x70 kernel/trace/trace_dynevent.c:128 create_or_delete_trace_kprobe+0x5e/0xc0 kernel/trace/trace_kprobe.c:1107 trace_parse_run_command+0x1a5/0x330 kernel/trace/trace.c:10785 vfs_write+0x2b6/0xd00 fs/read_write.c:684 ksys_write+0x129/0x240 fs/read_write.c:738 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0x5d/0x2d0 arch/x86/entry/syscall_64.c:94 Function kmemdup() may return NULL in trace_kprobe_create_internal(), add check for it's return value. Link: https://lore.kernel.org/all/20250916075816.3181175-1-wangliang74@huawei.com/ Fixes: 33b4e38baa03 ("tracing: kprobe-event: Allocate string buffers from heap") Signed-off-by: Wang Liang Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_kprobe.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ccae62d4fb91..fa60362a3f31 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -908,6 +908,8 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], return -EINVAL; } buf = kmemdup(&argv[0][1], len + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; buf[len] = '\0'; ret = kstrtouint(buf, 0, &maxactive); if (ret || !maxactive) { -- cgit v1.2.3