From 4f6fc782128355931527cefe3eb45338abd8ab39 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Jun 2025 12:31:45 +0200 Subject: perf: Fix sample vs do_exit() Baisheng Gao reported an ARM64 crash, which Mark decoded as being a synchronous external abort -- most likely due to trying to access MMIO in bad ways. The crash further shows perf trying to do a user stack sample while in exit_mmap()'s tlb_finish_mmu() -- i.e. while tearing down the address space it is trying to access. It turns out that we stop perf after we tear down the userspace mm; a recipe for disaster, since perf likes to access userspace for various reasons. Flip this order by moving up where we stop perf in do_exit(). Additionally, harden PERF_SAMPLE_CALLCHAIN and PERF_SAMPLE_STACK_USER to abort when the current task does not have an mm (exit_mm() makes sure to set current->mm = NULL; before commencing with the actual teardown), such that CPU-wide events don't trip on this same problem. Fixes: c5ebcedb566e ("perf: Add ability to attach user stack dump to sample") Reported-by: Baisheng Gao Suggested-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250605110815.GQ39944@noisy.programming.kicks-ass.net --- kernel/events/core.c | 7 +++++++ kernel/exit.c | 17 +++++++++-------- 2 files changed, 16 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f34c99f8ce8f..26dec1d8a5ab 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7439,6 +7439,10 @@ perf_sample_ustack_size(u16 stack_size, u16 header_size, if (!regs) return 0; + /* No mm, no stack, no dump. */ + if (!current->mm) + return 0; + /* * Check if we fit in with the requested stack size into the: * - TASK_SIZE @@ -8150,6 +8154,9 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) const u32 max_stack = event->attr.sample_max_stack; struct perf_callchain_entry *callchain; + if (!current->mm) + user = false; + if (!kernel && !user) return &__empty_callchain; diff --git a/kernel/exit.c b/kernel/exit.c index 38645039dd8f..1883150c9422 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -944,6 +944,15 @@ void __noreturn do_exit(long code) taskstats_exit(tsk, group_dead); trace_sched_process_exit(tsk, group_dead); + /* + * Since sampling can touch ->mm, make sure to stop everything before we + * tear it down. + * + * Also flushes inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_event_exit_task(tsk); + exit_mm(); if (group_dead) @@ -959,14 +968,6 @@ void __noreturn do_exit(long code) exit_task_work(tsk); exit_thread(tsk); - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. - * - * because of cgroup mode, must be called before cgroup_exit() - */ - perf_event_exit_task(tsk); - sched_autogroup_exit_task(tsk); cgroup_exit(tsk); -- cgit v1.2.3 From 61988e36dc5457cdff7ae7927e8d9ad1419ee998 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Jun 2025 12:37:11 +0200 Subject: perf: Fix cgroup state vs ERROR While chasing down a missing perf_cgroup_event_disable() elsewhere, Leo Yan found that both perf_put_aux_event() and perf_remove_sibling_event() were also missing one. Specifically, the rule is that events that switch to OFF,ERROR need to call perf_cgroup_event_disable(). Unify the disable paths to ensure this.
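[Editor's note] To make the rule above concrete, here is a minimal user-space model of the unified disable path. It is an illustrative sketch with made-up names, not kernel code; it only mirrors the ordering that the patch's __event_disable() helper enforces:

#include <stdio.h>

enum state { STATE_ERROR = -2, STATE_OFF = -1, STATE_INACTIVE = 0, STATE_ACTIVE = 1 };

struct event {
	enum state state;
	int cgrp_active;	/* stands in for the cgroup accounting */
};

static void cgroup_event_disable(struct event *e) { e->cgrp_active = 0; }

static void event_sched_out(struct event *e)
{
	if (e->state > STATE_INACTIVE)
		e->state = STATE_INACTIVE;
}

/* Single choke point, like the patch's __event_disable(): schedule the
 * event out, run the cgroup hook, then set the final OFF/ERROR state. */
static void event_disable(struct event *e, enum state final_state)
{
	event_sched_out(e);
	cgroup_event_disable(e);
	e->state = final_state;
}

int main(void)
{
	struct event e = { STATE_ACTIVE, 1 };

	event_disable(&e, STATE_ERROR);
	printf("state=%d cgrp_active=%d\n", e.state, e.cgrp_active);
	return 0;	/* prints: state=-2 cgrp_active=0 */
}

Because every disable path in the patch (perf_put_aux_event(), perf_group_detach(), __perf_event_disable()) funnels through the one helper, the cgroup hook can no longer be missed on an OFF or ERROR transition.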
Fixes: ab43762ef010 ("perf: Allow normal events to output AUX data") Fixes: 9f0c4fa111dc ("perf/core: Add a new PERF_EV_CAP_SIBLING event capability") Reported-by: Leo Yan Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250605123343.GD35970@noisy.programming.kicks-ass.net --- kernel/events/core.c | 51 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 26dec1d8a5ab..1cc98b9b3c0b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2149,8 +2149,9 @@ perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event) } static void put_event(struct perf_event *event); -static void event_sched_out(struct perf_event *event, - struct perf_event_context *ctx); +static void __event_disable(struct perf_event *event, + struct perf_event_context *ctx, + enum perf_event_state state); static void perf_put_aux_event(struct perf_event *event) { @@ -2183,8 +2184,7 @@ static void perf_put_aux_event(struct perf_event *event) * state so that we don't try to schedule it again. Note * that perf_event_enable() will clear the ERROR status. */ - event_sched_out(iter, ctx); - perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + __event_disable(iter, ctx, PERF_EVENT_STATE_ERROR); } } @@ -2242,18 +2242,6 @@ static inline struct list_head *get_event_list(struct perf_event *event) &event->pmu_ctx->flexible_active; } -/* - * Events that have PERF_EV_CAP_SIBLING require being part of a group and - * cannot exist on their own, schedule them out and move them into the ERROR - * state. Also see _perf_event_enable(), it will not be able to recover - * this ERROR state. - */ -static inline void perf_remove_sibling_event(struct perf_event *event) -{ - event_sched_out(event, event->ctx); - perf_event_set_state(event, PERF_EVENT_STATE_ERROR); -} - static void perf_group_detach(struct perf_event *event) { struct perf_event *leader = event->group_leader; @@ -2289,8 +2277,15 @@ static void perf_group_detach(struct perf_event *event) */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { + /* + * Events that have PERF_EV_CAP_SIBLING require being part of + * a group and cannot exist on their own, schedule them out + * and move them into the ERROR state. Also see + * _perf_event_enable(), it will not be able to recover this + * ERROR state. + */ if (sibling->event_caps & PERF_EV_CAP_SIBLING) - perf_remove_sibling_event(sibling); + __event_disable(sibling, ctx, PERF_EVENT_STATE_ERROR); sibling->group_leader = sibling; list_del_init(&sibling->sibling_list); @@ -2562,6 +2557,15 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla event_function_call(event, __perf_remove_from_context, (void *)flags); } +static void __event_disable(struct perf_event *event, + struct perf_event_context *ctx, + enum perf_event_state state) +{ + event_sched_out(event, ctx); + perf_cgroup_event_disable(event, ctx); + perf_event_set_state(event, state); +} + /* * Cross CPU call to disable a performance event */ @@ -2576,13 +2580,18 @@ static void __perf_event_disable(struct perf_event *event, perf_pmu_disable(event->pmu_ctx->pmu); ctx_time_update_event(ctx, event); + /* + * When disabling a group leader, the whole group becomes ineligible + * to run, so schedule out the full group. 
+ */ if (event == event->group_leader) group_sched_out(event, ctx); - else - event_sched_out(event, ctx); - perf_event_set_state(event, PERF_EVENT_STATE_OFF); - perf_cgroup_event_disable(event, ctx); + /* + * But only mark the leader OFF; the siblings will remain + * INACTIVE. + */ + __event_disable(event, ctx, PERF_EVENT_STATE_OFF); perf_pmu_enable(event->pmu_ctx->pmu); } -- cgit v1.2.3 From 3b7a34aebbdf2a4b7295205bf0c654294283ec82 Mon Sep 17 00:00:00 2001 From: Yeoreum Yun Date: Mon, 2 Jun 2025 19:40:49 +0100 Subject: perf: Fix dangling cgroup pointer in cpuctx Commit a3c3c6667 ("perf/core: Fix child_total_time_enabled accounting bug at task exit") moves the event->state update to before list_del_event(). This makes the event->state test in list_del_event() always false, so perf_cgroup_event_disable() is never called. As a result, cpuctx->cgrp won't be cleared properly, causing havoc. Fixes: a3c3c6667 ("perf/core: Fix child_total_time_enabled accounting bug at task exit") Signed-off-by: Yeoreum Yun Signed-off-by: Peter Zijlstra (Intel) Tested-by: David Wang <00107082@163.com> Link: https://lore.kernel.org/all/aD2TspKH%2F7yvfYoO@e129823.arm.com/ --- kernel/events/core.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1cc98b9b3c0b..d78608323916 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2120,18 +2120,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (event->group_leader == event) del_event_from_groups(event, ctx); - /* - * If event was in error state, then keep it - * that way, otherwise bogus counts will be - * returned on read(). The only way to get out - * of error state is by explicit re-enabling - * of the event - */ - if (event->state > PERF_EVENT_STATE_OFF) { - perf_cgroup_event_disable(event, ctx); - perf_event_set_state(event, PERF_EVENT_STATE_OFF); - } - ctx->generation++; event->pmu_ctx->nr_events--; } @@ -2488,11 +2476,14 @@ __perf_remove_from_context(struct perf_event *event, state = PERF_EVENT_STATE_EXIT; if (flags & DETACH_REVOKE) state = PERF_EVENT_STATE_REVOKED; - if (flags & DETACH_DEAD) { - event->pending_disable = 1; + if (flags & DETACH_DEAD) state = PERF_EVENT_STATE_DEAD; - } + event_sched_out(event, ctx); + + if (event->state > PERF_EVENT_STATE_OFF) + perf_cgroup_event_disable(event, ctx); + perf_event_set_state(event, min(event->state, state)); if (flags & DETACH_GROUP) -- cgit v1.2.3 From 3172fb986666dfb71bf483b6d3539e1e587fa197 Mon Sep 17 00:00:00 2001 From: Luo Gengkun Date: Wed, 4 Jun 2025 03:39:24 +0000 Subject: perf/core: Fix WARN in perf_cgroup_switch() There may be concurrency between perf_cgroup_switch and perf_cgroup_event_disable. Consider the following scenario: after a new perf cgroup event is created on CPU0, the new event may not trigger a reprogramming, causing ctx->is_active to be 0. In this case, when CPU1 disables this perf event, it executes __perf_remove_from_context -> list_del_event -> perf_cgroup_event_disable on CPU1, which causes a race with perf_cgroup_switch running on CPU0. The following describes the details of this concurrency scenario:

CPU0                                            CPU1

perf_cgroup_switch:
   ...
   # cpuctx->cgrp is not NULL here
   if (READ_ONCE(cpuctx->cgrp) == NULL)
        return;
                                                perf_remove_from_context:
                                                   ...
                                                   raw_spin_lock_irq(&ctx->lock);
                                                   ...
                                                   # ctx->is_active == 0 because a reprogramming
                                                   # was not triggered, so CPU1 can do
                                                   # __perf_remove_from_context for CPU0
                                                   __perf_remove_from_context:
                                                      perf_cgroup_event_disable:
                                                         ...
                                                         if (--ctx->nr_cgroups)
                                                         ...

   # this warning happens because CPU1 changed
   # ctx.nr_cgroups to 0.
   WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);

[peterz: use guard instead of goto unlock] Fixes: db4a835601b7 ("perf/core: Set cgroup in CPU contexts for new cgroup events") Signed-off-by: Luo Gengkun Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250604033924.3914647-3-luogengkun@huaweicloud.com --- kernel/events/core.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d78608323916..d7cf008f3d75 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -207,6 +207,19 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, __perf_ctx_unlock(&cpuctx->ctx); } +typedef struct { + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; +} class_perf_ctx_lock_t; + +static inline void class_perf_ctx_lock_destructor(class_perf_ctx_lock_t *_T) +{ perf_ctx_unlock(_T->cpuctx, _T->ctx); } + +static inline class_perf_ctx_lock_t +class_perf_ctx_lock_constructor(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ perf_ctx_lock(cpuctx, ctx); return (class_perf_ctx_lock_t){ cpuctx, ctx }; } + #define TASK_TOMBSTONE ((void *)-1L) static bool is_kernel_event(struct perf_event *event) @@ -944,7 +957,13 @@ static void perf_cgroup_switch(struct task_struct *task) if (READ_ONCE(cpuctx->cgrp) == cgrp) return; - perf_ctx_lock(cpuctx, cpuctx->task_ctx); + guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx); + /* + * Re-check, could've raced vs perf_remove_from_context(). + */ + if (READ_ONCE(cpuctx->cgrp) == NULL) + return; + perf_ctx_disable(&cpuctx->ctx, true); ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); @@ -962,7 +981,6 @@ static void perf_cgroup_switch(struct task_struct *task) ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); perf_ctx_enable(&cpuctx->ctx, true); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } static int perf_cgroup_ensure_storage(struct perf_event *event, -- cgit v1.2.3 From 8337204c58899fe422db765b481711eb2d95eb0b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 28 May 2025 10:55:21 +0200 Subject: futex: Handle invalid node numbers supplied by user syzbot used a negative node number which was not rejected early and led to invalid memory access in node_possible(). Reject negative node numbers except for FUTEX_NO_NODE.
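[Editor's note] The fix below relies on the common kernel idiom of range-checking a possibly negative integer with a single unsigned comparison: casting a negative int to unsigned int yields a huge value, so one ">= MAX_NUMNODES" test rejects both negative and too-large nodes. A small stand-alone sketch; the MAX_NUMNODES value here is made up for illustration:

#include <stdbool.h>
#include <stdio.h>

#define FUTEX_NO_NODE	(-1)	/* matches the uapi sentinel */
#define MAX_NUMNODES	64	/* assumption: example value only */

/* One unsigned comparison covers both "node < 0" and "node too big":
 * e.g. (unsigned int)-2 == 4294967294, which is >= MAX_NUMNODES. */
static bool futex_node_in_range(int node)
{
	if (node == FUTEX_NO_NODE)
		return true;
	return (unsigned int)node < MAX_NUMNODES;
}

int main(void)
{
	printf("%d %d %d %d\n",
	       futex_node_in_range(FUTEX_NO_NODE),	/* 1 */
	       futex_node_in_range(3),			/* 1 */
	       futex_node_in_range(-2),			/* 0 */
	       futex_node_in_range(4096));		/* 0 */
	return 0;
}

The actual patch still calls node_possible() afterwards; the cast only makes the bounds check safe before that bitmap lookup.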
[bigeasy: Keep the FUTEX_NO_NODE check] Closes: https://lore.kernel.org/all/6835bfe3.a70a0220.253bc2.00b5.GAE@google.com/ Fixes: cec199c5e39bd ("futex: Implement FUTEX2_NUMA") Signed-off-by: Peter Zijlstra (Intel) Reported-by: syzbot+9afaf6749e3a7aa1bdf3@syzkaller.appspotmail.com Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250528085521.1938355-4-bigeasy@linutronix.de --- kernel/futex/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 565f9717c6ca..b652d2f60c40 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -583,8 +583,8 @@ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, if (futex_get_value(&node, naddr)) return -EFAULT; - if (node != FUTEX_NO_NODE && - (node >= MAX_NUMNODES || !node_possible(node))) + if ((node != FUTEX_NO_NODE) && + ((unsigned int)node >= MAX_NUMNODES || !node_possible(node))) return -EINVAL; } -- cgit v1.2.3 From 2fe1c59347369fa856ae259e2fac3c8c8dd9d335 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Tue, 3 Jun 2025 23:43:07 +0800 Subject: bpf: Add cookie to raw_tp bpf_link_info After commit 68ca5d4eebb8 ("bpf: support BPF cookie in raw tracepoint (raw_tp, tp_btf) programs"), we can show the cookie in bpf_link_info like kprobe etc. Signed-off-by: Tao Chen Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20250603154309.3063644-1-chen.dylane@linux.dev --- kernel/bpf/syscall.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dd5304c6ac3c..89d027cd7ca0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3688,6 +3688,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link, return -EINVAL; info->raw_tracepoint.tp_name_len = tp_len + 1; + info->raw_tracepoint.cookie = raw_tp_link->cookie; if (!ubuf) return 0; -- cgit v1.2.3 From 97744b4971d81bf7336dfd782a08188c9e09f4ee Mon Sep 17 00:00:00 2001 From: Luis Gerhorst Date: Tue, 3 Jun 2025 22:45:57 +0200 Subject: bpf: Clarify sanitize_check_bounds() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As is, it appears as if pointer arithmetic is allowed for everything except PTR_TO_{STACK,MAP_VALUE} if one only looks at sanitize_check_bounds(). However, this is misleading as the function only works together with retrieve_ptr_limit() and the two must be kept in sync. This patch documents the interdependency and adds a check to ensure they stay in sync. adjust_ptr_min_max_vals(): Because the preceding switch returns -EACCES for every opcode except for ADD/SUB, the sanitize_needed() following the sanitize_check_bounds() call is always true if reached. This means, unless sanitize_check_bounds() detected that the pointer goes OOB because of the ADD/SUB and returns -EACCES, sanitize_ptr_alu() always executes after sanitize_check_bounds(). The following shows that this also implies that retrieve_ptr_limit() runs in all relevant cases. Note that there are two calls to sanitize_ptr_alu(), these are simply needed to easily calculate the correct alu_limit as explained in commit 7fedb63a8307 ("bpf: Tighten speculative pointer arithmetic mask"). The truncation-simulation is already performed on the first call. 
In the second sanitize_ptr_alu(commit_window = true), we always run retrieve_ptr_limit(), unless:

* can_skip_alu_sanitation() is true, notably `BPF_SRC(insn->code) == BPF_K`. BPF_K is fine because it means that there is no scalar register (which could be subject to speculative scalar confusion due to Spectre v4) that goes into the ALU operation. The pointer register cannot be subject to v4-based value confusion due to the nospec added. Thus, in this case it would have been fine to also skip sanitize_check_bounds().

* If we are on a speculative path (`vstate->speculative`) and in the second "commit" phase, sanitize_ptr_alu() always just returns 0. This makes sense because there are no ALU sanitization limits to be learned from speculative paths. Furthermore, because the sanitization will ensure that pointer arithmetic stays in (architectural) bounds, the sanitize_check_bounds() on the speculative path could also be skipped.

The second case needs more attention: Assume we have some ALU operation that is used with scalars architecturally, but with a non-PTR_TO_{STACK,MAP_VALUE} pointer (e.g., PTR_TO_PACKET) speculatively. It might appear as if this would allow an unsanitized pointer ALU operation, but this cannot happen because one of the following two always holds:

* The type mismatch stems from Spectre v4, then it is prevented by a nospec after the possibly-bypassed store involving the pointer. There is no speculative path simulated for this case, thus it never happens.

* The type mismatch stems from a Spectre v1 gadget like the following:

  r1 = slow(0)
  r4 = fast(0)
  r3 = SCALAR // Spectre v4 scalar confusion
  if (r1) {
    r2 = PTR_TO_PACKET
  } else {
    r2 = 42
  }
  if (r4) {
    r2 += r3
    *r2
  }

If `r2 = PTR_TO_PACKET` is indeed dead code, it will be sanitized to `goto -1` (as is the case for the r4-if block). If it is not (e.g., if `r1 = r4 = 1` is possible), it will also be explored on an architectural path and retrieve_ptr_limit() will reject it.

To summarize, the exception for `vstate->speculative` is safe.

Back to retrieve_ptr_limit(): It only allows the ALU operation if the involved pointer register (can be either source or destination for ADD) is PTR_TO_STACK or PTR_TO_MAP_VALUE. Otherwise, it returns -EOPNOTSUPP. Therefore, sanitize_check_bounds() returning 0 for non-PTR_TO_{STACK,MAP_VALUE} is fine because retrieve_ptr_limit() also runs for all relevant cases and prevents unsafe operations.

To summarize, we allow unsanitized pointer arithmetic with 64-bit ADD/SUB for the following instructions if the requirements from retrieve_ptr_limit() AND sanitize_check_bounds() hold:

* ptr -=/+= imm32 (i.e. `BPF_SRC(insn->code) == BPF_K`)
* PTR_TO_{STACK,MAP_VALUE} -= scalar
* PTR_TO_{STACK,MAP_VALUE} += scalar
* scalar += PTR_TO_{STACK,MAP_VALUE}

To document the interdependency between sanitize_check_bounds() and retrieve_ptr_limit(), add a verifier_bug_if() to make sure they stay in sync.
Signed-off-by: Luis Gerhorst Reported-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/CAP01T76HZ+s5h+_REqRFkRjjoKwnZZn9YswpSVinGicah1pGJw@mail.gmail.com/ Link: https://lore.kernel.org/bpf/CAP01T75oU0zfZCiymEcH3r-GQ5A6GOc6GmYzJEnMa3=53XuUQQ@mail.gmail.com/ Link: https://lore.kernel.org/r/20250603204557.332447-1-luis.gerhorst@fau.de Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a7d6e0c5928b..e31f6b0ccb30 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14284,7 +14284,7 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env, } break; default: - break; + return -EOPNOTSUPP; } return 0; @@ -14311,7 +14311,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, struct bpf_sanitize_info info = {}; u8 opcode = BPF_OP(insn->code); u32 dst = insn->dst_reg; - int ret; + int ret, bounds_ret; dst_reg = &regs[dst]; @@ -14511,11 +14511,19 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type)) return -EINVAL; reg_bounds_sync(dst_reg); - if (sanitize_check_bounds(env, insn, dst_reg) < 0) - return -EACCES; + bounds_ret = sanitize_check_bounds(env, insn, dst_reg); + if (bounds_ret == -EACCES) + return bounds_ret; if (sanitize_needed(opcode)) { ret = sanitize_ptr_alu(env, insn, dst_reg, off_reg, dst_reg, &info, true); + if (verifier_bug_if(!can_skip_alu_sanitation(env, insn) + && !env->cur_state->speculative + && bounds_ret + && !ret, + env, "Pointer type unsupported by sanitize_check_bounds() not rejected by retrieve_ptr_limit() as required")) { + return -EFAULT; + } if (ret < 0) return sanitize_err(env, insn, ret, off_reg, dst_reg); } -- cgit v1.2.3 From cf5543870186d6f99b631faaeca27beaa996d52f Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 4 Jun 2025 16:20:45 +0200 Subject: printk: Allow to use the printk kthread immediately even for 1st nbcon The kthreads for nbcon consoles are created by nbcon_alloc() at the beginning of the console registration. But it currently works only for the 2nd or later nbcon console because the code checks @printk_kthreads_running. The kthread for the 1st registered nbcon console is created at the very end of register_console() by printk_kthreads_check_locked(). As a result, the entire log is replayed synchronously when the "enabled" message gets printed. It might block the boot for a long time with a slow serial console. Prevent the synchronous flush by creating the kthread even for the 1st nbcon console when it is safe (kthreads ready and no boot consoles). Also inform printk() to use the kthread by setting @printk_kthreads_running. Note that the kthreads already must be running when it is safe and this is not the 1st nbcon console. Symmetrically, clear @printk_kthreads_running when the last nbcon console was unregistered by nbcon_free(). This requires updating @have_nbcon_console before nbcon_free() gets called. Note that there is _no_ problem when the 1st nbcon console replaces boot consoles. In this case, the kthread will be started at the end of registration after the boot consoles are removed. But the console does not replay the entire log buffer in this case. Note that the flag CON_PRINTBUFFER is always cleared when the boot consoles are removed and vice versa.
Closes: https://lore.kernel.org/r/20250514173514.2117832-1-mcobb@thegoodpenguin.co.uk Tested-by: Michael Cobb Reviewed-by: John Ogness Link: https://patch.msgid.link/20250604142045.253301-1-pmladek@suse.com Signed-off-by: Petr Mladek --- kernel/printk/internal.h | 2 ++ kernel/printk/nbcon.c | 26 ++++++++++++++++++++++++-- kernel/printk/printk.c | 20 +++++++++++--------- 3 files changed, 37 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 48a24e7b309d..567c9e100d47 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -64,6 +64,7 @@ struct dev_printk_info; extern struct printk_ringbuffer *prb; extern bool printk_kthreads_running; +extern bool printk_kthreads_ready; extern bool debug_non_panic_cpus; __printf(4, 0) @@ -180,6 +181,7 @@ static inline void nbcon_kthread_wake(struct console *con) #define PRINTKRB_RECORD_MAX 0 #define printk_kthreads_running (false) +#define printk_kthreads_ready (false) /* * In !PRINTK builds we still export console_sem diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index fd12efcc4aed..d60596777d27 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -1671,6 +1671,9 @@ bool nbcon_alloc(struct console *con) { struct nbcon_state state = { }; + /* Synchronize the kthread start. */ + lockdep_assert_console_list_lock_held(); + /* The write_thread() callback is mandatory. */ if (WARN_ON(!con->write_thread)) return false; @@ -1701,12 +1704,15 @@ bool nbcon_alloc(struct console *con) return false; } - if (printk_kthreads_running) { + if (printk_kthreads_ready && !have_boot_console) { if (!nbcon_kthread_create(con)) { kfree(con->pbufs); con->pbufs = NULL; return false; } + + /* Might be the first kthread. */ + printk_kthreads_running = true; } } @@ -1716,14 +1722,30 @@ bool nbcon_alloc(struct console *con) /** * nbcon_free - Free and cleanup the nbcon console specific data * @con: Console to free/cleanup nbcon data + * + * Important: @have_nbcon_console must be updated before calling + * this function. In particular, it can be set only when there + * is still another nbcon console registered. */ void nbcon_free(struct console *con) { struct nbcon_state state = { }; - if (printk_kthreads_running) + /* Synchronize the kthread stop. */ + lockdep_assert_console_list_lock_held(); + + if (printk_kthreads_running) { nbcon_kthread_stop(con); + /* Might be the last nbcon console. + * + * Do not rely on printk_kthreads_check_locked(). It is not + * called in some code paths, see nbcon_free() callers. + */ + if (!have_nbcon_console) + printk_kthreads_running = false; + } + nbcon_state_set(con, &state); /* Boot consoles share global printk buffers. */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 1eea80d0648e..0efbcdda9aab 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3574,7 +3574,7 @@ EXPORT_SYMBOL(console_resume); static int unregister_console_locked(struct console *console); /* True when system boot is far enough to create printer threads. */ -static bool printk_kthreads_ready __ro_after_init; +bool printk_kthreads_ready __ro_after_init; static struct task_struct *printk_legacy_kthread; @@ -3713,6 +3713,7 @@ static void printk_kthreads_check_locked(void) if (!printk_kthreads_ready) return; + /* Start or stop the legacy kthread when needed. 
 */ if (have_legacy_console || have_boot_console) { if (!printk_legacy_kthread && force_legacy_kthread() && @@ -4204,14 +4205,6 @@ static int unregister_console_locked(struct console *console) */ synchronize_srcu(&console_srcu); - if (console->flags & CON_NBCON) - nbcon_free(console); - - console_sysfs_notify(); - - if (console->exit) - res = console->exit(console); - /* * With this console gone, the global flags tracking registered * console types may have changed. Update them. @@ -4232,6 +4225,15 @@ static int unregister_console_locked(struct console *console) if (!found_nbcon_con) have_nbcon_console = found_nbcon_con; + /* @have_nbcon_console must be updated before calling nbcon_free(). */ + if (console->flags & CON_NBCON) + nbcon_free(console); + + console_sysfs_notify(); + + if (console->exit) + res = console->exit(console); + /* Changed console list, may require printer threads to start/stop. */ printk_kthreads_check_locked(); -- cgit v1.2.3 From c68ea8243c5cc901cea62f695504bec73195d906 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 4 Jun 2025 16:33:11 +0200 Subject: sched_ext: idle: Remove unnecessary ifdef in scx_bpf_cpu_node() There's no need to make scx_bpf_cpu_node() dependent on CONFIG_NUMA, since cpu_to_node() can also be used on systems with CONFIG_NUMA disabled. This also makes it possible to always validate the @cpu argument regardless of the CONFIG_NUMA settings. Fixes: 01059219b0cfd ("sched_ext: idle: Introduce node-aware idle cpu kfunc helpers") Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext_idle.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 6d29d3cbc670..1598681b681e 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -929,14 +929,10 @@ s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags, */ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu) { -#ifdef CONFIG_NUMA if (!kf_cpu_valid(cpu, NULL)) return NUMA_NO_NODE; return cpu_to_node(cpu); -#else - return 0; -#endif } /** -- cgit v1.2.3 From 353656eb84fef8ffece3b1be4345cbacbbb5267f Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 4 Jun 2025 16:33:12 +0200 Subject: sched_ext: idle: Make local functions static in ext_idle.c Functions that are only used within ext_idle.c can be marked as static to limit their scope. No functional changes. Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext_idle.c | 24 +++++++++++++++++------- kernel/sched/ext_idle.h | 7 ------- 2 files changed, 17 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 1598681b681e..17802693e304 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -75,7 +75,7 @@ static int scx_cpu_node_if_enabled(int cpu) return cpu_to_node(cpu); } -bool scx_idle_test_and_clear_cpu(int cpu) +static bool scx_idle_test_and_clear_cpu(int cpu) { int node = scx_cpu_node_if_enabled(cpu); struct cpumask *idle_cpus = idle_cpumask(node)->cpu; @@ -198,7 +198,7 @@ pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, int node, u6 /* * Find an idle CPU in the system, starting from @node.
*/ -s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags) +static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags) { s32 cpu; @@ -794,6 +794,16 @@ static void reset_idle_masks(struct sched_ext_ops *ops) cpumask_and(idle_cpumask(node)->smt, cpu_online_mask, node_mask); } } +#else /* !CONFIG_SMP */ +static bool scx_idle_test_and_clear_cpu(int cpu) +{ + return -EBUSY; +} + +static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags) +{ + return -EBUSY; +} #endif /* CONFIG_SMP */ void scx_idle_enable(struct sched_ext_ops *ops) @@ -860,8 +870,8 @@ static bool check_builtin_idle_enabled(void) return false; } -s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags, - const struct cpumask *allowed, u64 flags) +static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + const struct cpumask *allowed, u64 flags) { struct rq *rq; struct rq_flags rf; @@ -1121,10 +1131,10 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) if (!check_builtin_idle_enabled()) return false; - if (kf_cpu_valid(cpu, NULL)) - return scx_idle_test_and_clear_cpu(cpu); - else + if (!kf_cpu_valid(cpu, NULL)) return false; + + return scx_idle_test_and_clear_cpu(cpu); } /** diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h index 37be78a7502b..05e389ed72e4 100644 --- a/kernel/sched/ext_idle.h +++ b/kernel/sched/ext_idle.h @@ -15,16 +15,9 @@ struct sched_ext_ops; #ifdef CONFIG_SMP void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops); void scx_idle_init_masks(void); -bool scx_idle_test_and_clear_cpu(int cpu); -s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags); #else /* !CONFIG_SMP */ static inline void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) {} static inline void scx_idle_init_masks(void) {} -static inline bool scx_idle_test_and_clear_cpu(int cpu) { return false; } -static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags) -{ - return -EBUSY; -} #endif /* CONFIG_SMP */ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, -- cgit v1.2.3 From e212743bd727c3fcffcd73b6c1d906546ee83805 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 4 Jun 2025 16:33:13 +0200 Subject: sched_ext: Make scx_rq_bypassing() inline scx_rq_bypassing() is used both from ext.c and ext_idle.c, move it to ext.h as a static inline function. No functional changes. 
Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 5 ----- kernel/sched/ext.h | 5 +++++ 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 2c41c78be61e..3e483138dff6 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1701,11 +1701,6 @@ static bool scx_tryset_enable_state(enum scx_enable_state to, return atomic_try_cmpxchg(&scx_enable_state_var, &from_v, to); } -static bool scx_rq_bypassing(struct rq *rq) -{ - return unlikely(rq->scx.flags & SCX_RQ_BYPASSING); -} - /** * wait_ops_state - Busy-wait the specified ops state to end * @p: target task diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 6e5072f57771..d30f2d1bc00d 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -13,6 +13,11 @@ static inline bool scx_kf_allowed_if_unlocked(void) return !current->scx.kf_mask; } +static inline bool scx_rq_bypassing(struct rq *rq) +{ + return unlikely(rq->scx.flags & SCX_RQ_BYPASSING); +} + DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup); void scx_tick(struct rq *rq); -- cgit v1.2.3 From 086ed90a6453873d4c5d51a18c26b3548af4fa24 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Thu, 5 Jun 2025 11:30:26 +0200 Subject: sched_ext: Make scx_locked_rq() inline scx_locked_rq() is used both from ext.c and ext_idle.c, move it to ext.h as a static inline function. No functional changes. v2: Rename locked_rq to scx_locked_rq_state, expose it and make scx_locked_rq() inline, as suggested by Tejun. Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 13 ++----------- kernel/sched/ext.h | 11 +++++++++++ 2 files changed, 13 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 3e483138dff6..3623ba98d7d8 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1247,7 +1247,7 @@ static void scx_kf_disallow(u32 mask) * This allows kfuncs to safely operate on rq from any scx ops callback, * knowing which rq is already locked. */ -static DEFINE_PER_CPU(struct rq *, locked_rq); +DEFINE_PER_CPU(struct rq *, scx_locked_rq_state); static inline void update_locked_rq(struct rq *rq) { @@ -1258,16 +1258,7 @@ static inline void update_locked_rq(struct rq *rq) */ if (rq) lockdep_assert_rq_held(rq); - __this_cpu_write(locked_rq, rq); -} - -/* - * Return the rq currently locked from an scx callback, or NULL if no rq is - * locked. - */ -static inline struct rq *scx_locked_rq(void) -{ - return __this_cpu_read(locked_rq); + __this_cpu_write(scx_locked_rq_state, rq); } #define SCX_CALL_OP(sch, mask, op, rq, args...) \ diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index d30f2d1bc00d..6d6d00e9de20 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -20,6 +20,17 @@ static inline bool scx_rq_bypassing(struct rq *rq) DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup); +DECLARE_PER_CPU(struct rq *, scx_locked_rq_state); + +/* + * Return the rq currently locked from an scx callback, or NULL if no rq is + * locked. 
+ */ +static inline struct rq *scx_locked_rq(void) +{ + return __this_cpu_read(scx_locked_rq_state); +} + void scx_tick(struct rq *rq); void init_scx_entity(struct sched_ext_entity *scx); void scx_pre_fork(struct task_struct *p); -- cgit v1.2.3 From 0998f0ac308c14f7a0750e9c24fe2083a817d8fb Mon Sep 17 00:00:00 2001 From: "Yury Norov [NVIDIA]" Date: Wed, 4 Jun 2025 20:21:37 -0400 Subject: workqueue: fix opencoded cpumask_next_and_wrap() in wq_select_unbound_cpu() The dedicated helper is less verbose and more efficient compared to cpumask_first() followed by cpumask_next(). Signed-off-by: Yury Norov [NVIDIA] Signed-off-by: Tejun Heo --- kernel/workqueue.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 97f37b5bae66..d9de0f2a2e00 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2221,12 +2221,9 @@ static int wq_select_unbound_cpu(int cpu) } new_cpu = __this_cpu_read(wq_rr_cpu_last); - new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask); - if (unlikely(new_cpu >= nr_cpu_ids)) { - new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask); - if (unlikely(new_cpu >= nr_cpu_ids)) - return cpu; - } + new_cpu = cpumask_next_and_wrap(new_cpu, wq_unbound_cpumask, cpu_online_mask); + if (unlikely(new_cpu >= nr_cpu_ids)) + return cpu; __this_cpu_write(wq_rr_cpu_last, new_cpu); return new_cpu; -- cgit v1.2.3 From 9b8367b604c739947ec308874f087fe0eb80f412 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 6 Jun 2025 09:31:36 -0700 Subject: cgroup: Add bpf prog revisions to struct cgroup_bpf One of the key items in the mprog API is the revision for the prog list. The revision number will be increased if the prog list changes, e.g. on attach, detach or replace. Add a 'revisions' field to struct cgroup_bpf, representing revisions for all cgroup related attachment types. The initial revision value is set to 1, the same as in the kernel mprog implementations. Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250606163136.2428732-1-yonghong.song@linux.dev --- kernel/cgroup/cgroup.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a723b7dc6e4e..312c6a8b55bb 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2074,6 +2074,11 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); +#ifdef CONFIG_CGROUP_BPF + for (int i = 0; i < ARRAY_SIZE(cgrp->bpf.revisions); i++) + cgrp->bpf.revisions[i] = 1; +#endif + init_waitqueue_head(&cgrp->offline_waitq); INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); } -- cgit v1.2.3 From 1209339844601ec1766f4ff430673fbcfe42bb51 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 6 Jun 2025 09:31:41 -0700 Subject: bpf: Implement mprog API on top of existing cgroup progs Currently, cgroup progs are simply appended at attachment time. This is not ideal. In some cases, users want specific ordering at a particular cgroup level. To address this, the existing mprog API seems an ideal solution, supporting the BPF_F_BEFORE and BPF_F_AFTER flags. But there are a few obstacles to directly using the kernel mprog interface. Currently cgroup bpf progs already support prog attach/detach/replace and link-based attach/detach/replace. For example, in struct bpf_prog_array_item, the cgroup_storage field needs to be together with the bpf prog.
But the mprog API struct bpf_mprog_fp only has bpf_prog as the member, which makes it difficult to use the kernel mprog interface. In another case, the current cgroup prog detach tries to use the same flag as in attach. This is different from the kernel mprog interface, which uses flags passed from user space. So to avoid modifying existing behavior, I made the following changes to support the mprog API for cgroup progs:

- The support is for the prog list at the cgroup level. Cross-level prog lists (a.k.a. effective prog lists) are not supported.
- Previously, BPF_F_PREORDER was supported only for prog attach; now BPF_F_PREORDER is also supported by link-based attach.
- For attach, BPF_F_BEFORE/BPF_F_AFTER/BPF_F_ID/BPF_F_LINK is supported similar to kernel mprog but with a different implementation.
- For detach and replace, use the existing implementation.
- For attach, detach and replace, the revision for a particular prog list, associated with a particular attach type, will be updated by increasing the count by 1.

Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250606163141.2428937-1-yonghong.song@linux.dev --- kernel/bpf/cgroup.c | 182 ++++++++++++++++++++++++++++++++++++++++++++------- kernel/bpf/syscall.c | 46 ++++++++----- 2 files changed, 191 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9122c39870bf..ffbafbef5010 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -658,6 +658,116 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs, return NULL; } +static struct bpf_link *bpf_get_anchor_link(u32 flags, u32 id_or_fd) +{ + struct bpf_link *link = ERR_PTR(-EINVAL); + + if (flags & BPF_F_ID) + link = bpf_link_by_id(id_or_fd); + else if (id_or_fd) + link = bpf_link_get_from_fd(id_or_fd); + return link; +} + +static struct bpf_prog *bpf_get_anchor_prog(u32 flags, u32 id_or_fd) +{ + struct bpf_prog *prog = ERR_PTR(-EINVAL); + + if (flags & BPF_F_ID) + prog = bpf_prog_by_id(id_or_fd); + else if (id_or_fd) + prog = bpf_prog_get(id_or_fd); + return prog; +} + +static struct bpf_prog_list *get_prog_list(struct hlist_head *progs, struct bpf_prog *prog, + struct bpf_cgroup_link *link, u32 flags, u32 id_or_fd) +{ + bool is_link = flags & BPF_F_LINK, is_id = flags & BPF_F_ID; + struct bpf_prog_list *pltmp, *pl = ERR_PTR(-EINVAL); + bool preorder = flags & BPF_F_PREORDER; + struct bpf_link *anchor_link = NULL; + struct bpf_prog *anchor_prog = NULL; + bool is_before, is_after; + + is_before = flags & BPF_F_BEFORE; + is_after = flags & BPF_F_AFTER; + if (is_link || is_id || id_or_fd) { + /* flags must have either BPF_F_BEFORE or BPF_F_AFTER */ + if (is_before == is_after) + return ERR_PTR(-EINVAL); + if ((is_link && !link) || (!is_link && !prog)) + return ERR_PTR(-EINVAL); + } else if (!hlist_empty(progs)) { + /* flags cannot have both BPF_F_BEFORE and BPF_F_AFTER */ + if (is_before && is_after) + return ERR_PTR(-EINVAL); + } + + if (is_link) { + anchor_link = bpf_get_anchor_link(flags, id_or_fd); + if (IS_ERR(anchor_link)) + return ERR_PTR(PTR_ERR(anchor_link)); + } else if (is_id || id_or_fd) { + anchor_prog = bpf_get_anchor_prog(flags, id_or_fd); + if (IS_ERR(anchor_prog)) + return ERR_PTR(PTR_ERR(anchor_prog)); + } + + if (!anchor_prog && !anchor_link) { + /* if there is no anchor_prog/anchor_link, then BPF_F_PREORDER + * doesn't matter since either prepend or append to a combined + * list of progs will end up with correct result.
+ */ + hlist_for_each_entry(pltmp, progs, node) { + if (is_before) + return pltmp; + if (pltmp->node.next) + continue; + return pltmp; + } + return NULL; + } + + hlist_for_each_entry(pltmp, progs, node) { + if ((anchor_prog && anchor_prog == pltmp->prog) || + (anchor_link && anchor_link == &pltmp->link->link)) { + if (!!(pltmp->flags & BPF_F_PREORDER) != preorder) + goto out; + pl = pltmp; + goto out; + } + } + + pl = ERR_PTR(-ENOENT); +out: + if (anchor_link) + bpf_link_put(anchor_link); + else + bpf_prog_put(anchor_prog); + return pl; +} + +static int insert_pl_to_hlist(struct bpf_prog_list *pl, struct hlist_head *progs, + struct bpf_prog *prog, struct bpf_cgroup_link *link, + u32 flags, u32 id_or_fd) +{ + struct bpf_prog_list *pltmp; + + pltmp = get_prog_list(progs, prog, link, flags, id_or_fd); + if (IS_ERR(pltmp)) + return PTR_ERR(pltmp); + + if (!pltmp) + hlist_add_head(&pl->node, progs); + else if (flags & BPF_F_BEFORE) + hlist_add_before(&pl->node, &pltmp->node); + else + hlist_add_behind(&pl->node, &pltmp->node); + + return 0; +} + /** * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and * propagate the change to descendants @@ -667,6 +777,8 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs, * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set * @type: Type of attach operation * @flags: Option flags + * @id_or_fd: Relative prog id or fd + * @revision: bpf_prog_list revision * * Exactly one of @prog or @link can be non-null. * Must be called with cgroup_mutex held. @@ -674,7 +786,8 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs, static int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_prog *replace_prog, struct bpf_cgroup_link *link, - enum bpf_attach_type type, u32 flags) + enum bpf_attach_type type, u32 flags, u32 id_or_fd, + u64 revision) { u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); struct bpf_prog *old_prog = NULL; @@ -690,6 +803,9 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI))) /* invalid combination */ return -EINVAL; + if ((flags & BPF_F_REPLACE) && (flags & (BPF_F_BEFORE | BPF_F_AFTER))) + /* only either replace or insertion with before/after */ + return -EINVAL; if (link && (prog || replace_prog)) /* only either link or prog/replace_prog can be specified */ return -EINVAL; @@ -700,6 +816,8 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id); if (atype < 0) return -EINVAL; + if (revision && revision != cgrp->bpf.revisions[atype]) + return -ESTALE; progs = &cgrp->bpf.progs[atype]; @@ -728,22 +846,18 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, if (pl) { old_prog = pl->prog; } else { - struct hlist_node *last = NULL; - pl = kmalloc(sizeof(*pl), GFP_KERNEL); if (!pl) { bpf_cgroup_storages_free(new_storage); return -ENOMEM; } - if (hlist_empty(progs)) - hlist_add_head(&pl->node, progs); - else - hlist_for_each(last, progs) { - if (last->next) - continue; - hlist_add_behind(&pl->node, last); - break; - } + + err = insert_pl_to_hlist(pl, progs, prog, link, flags, id_or_fd); + if (err) { + kfree(pl); + bpf_cgroup_storages_free(new_storage); + return err; + } } pl->prog = prog; @@ -762,6 +876,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, if (err) goto cleanup_trampoline; + cgrp->bpf.revisions[atype] += 1; if (old_prog) { if (type == BPF_LSM_CGROUP) 
bpf_trampoline_unlink_cgroup_shim(old_prog); @@ -793,12 +908,13 @@ static int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_prog *replace_prog, struct bpf_cgroup_link *link, enum bpf_attach_type type, - u32 flags) + u32 flags, u32 id_or_fd, u64 revision) { int ret; cgroup_lock(); - ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags); + ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags, + id_or_fd, revision); cgroup_unlock(); return ret; } @@ -886,6 +1002,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, if (!found) return -ENOENT; + cgrp->bpf.revisions[atype] += 1; old_prog = xchg(&link->link.prog, new_prog); replace_effective_prog(cgrp, atype, link); bpf_prog_put(old_prog); @@ -1011,12 +1128,14 @@ found: * @prog: A program to detach or NULL * @link: A link to detach or NULL * @type: Type of detach operation + * @revision: bpf_prog_list revision * * At most one of @prog or @link can be non-NULL. * Must be called with cgroup_mutex held. */ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - struct bpf_cgroup_link *link, enum bpf_attach_type type) + struct bpf_cgroup_link *link, enum bpf_attach_type type, + u64 revision) { enum cgroup_bpf_attach_type atype; struct bpf_prog *old_prog; @@ -1034,6 +1153,9 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, if (atype < 0) return -EINVAL; + if (revision && revision != cgrp->bpf.revisions[atype]) + return -ESTALE; + progs = &cgrp->bpf.progs[atype]; flags = cgrp->bpf.flags[atype]; @@ -1059,6 +1181,7 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, /* now can actually delete it from this cgroup list */ hlist_del(&pl->node); + cgrp->bpf.revisions[atype] += 1; kfree(pl); if (hlist_empty(progs)) @@ -1074,12 +1197,12 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, } static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, - enum bpf_attach_type type) + enum bpf_attach_type type, u64 revision) { int ret; cgroup_lock(); - ret = __cgroup_bpf_detach(cgrp, prog, NULL, type); + ret = __cgroup_bpf_detach(cgrp, prog, NULL, type, revision); cgroup_unlock(); return ret; } @@ -1097,6 +1220,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, struct bpf_prog_array *effective; int cnt, ret = 0, i; int total_cnt = 0; + u64 revision = 0; u32 flags; if (effective_query && prog_attach_flags) @@ -1134,6 +1258,10 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, return -EFAULT; if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt))) return -EFAULT; + if (!effective_query && from_atype == to_atype) + revision = cgrp->bpf.revisions[from_atype]; + if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision))) + return -EFAULT; if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt) /* return early if user requested only program count + flags */ return 0; @@ -1216,7 +1344,8 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr, } ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL, - attr->attach_type, attr->attach_flags); + attr->attach_type, attr->attach_flags, + attr->relative_fd, attr->expected_revision); if (replace_prog) bpf_prog_put(replace_prog); @@ -1238,7 +1367,7 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) if (IS_ERR(prog)) prog = NULL; - ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type); + ret = cgroup_bpf_detach(cgrp, prog, 
attr->attach_type, attr->expected_revision); if (prog) bpf_prog_put(prog); @@ -1267,7 +1396,7 @@ static void bpf_cgroup_link_release(struct bpf_link *link) } WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, - cg_link->type)); + cg_link->type, 0)); if (cg_link->type == BPF_LSM_CGROUP) bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog); @@ -1339,6 +1468,13 @@ static const struct bpf_link_ops bpf_cgroup_link_lops = { .fill_link_info = bpf_cgroup_link_fill_link_info, }; +#define BPF_F_LINK_ATTACH_MASK \ + (BPF_F_ID | \ + BPF_F_BEFORE | \ + BPF_F_AFTER | \ + BPF_F_PREORDER | \ + BPF_F_LINK) + int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_link_primer link_primer; @@ -1346,7 +1482,7 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) struct cgroup *cgrp; int err; - if (attr->link_create.flags) + if (attr->link_create.flags & (~BPF_F_LINK_ATTACH_MASK)) return -EINVAL; cgrp = cgroup_get_from_fd(attr->link_create.target_fd); @@ -1370,7 +1506,9 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) } err = cgroup_bpf_attach(cgrp, NULL, NULL, link, - link->type, BPF_F_ALLOW_MULTI); + link->type, BPF_F_ALLOW_MULTI | attr->link_create.flags, + attr->link_create.cgroup.relative_fd, + attr->link_create.cgroup.expected_revision); if (err) { bpf_link_cleanup(&link_primer); goto out_put_cgroup; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 89d027cd7ca0..97ad57ffc404 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4186,6 +4186,25 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, } } +static bool is_cgroup_prog_type(enum bpf_prog_type ptype, enum bpf_attach_type atype, + bool check_atype) +{ + switch (ptype) { + case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SKB: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_SOCK_OPS: + return true; + case BPF_PROG_TYPE_LSM: + return check_atype ? 
atype == BPF_LSM_CGROUP : true; + default: + return false; + } +} + #define BPF_PROG_ATTACH_LAST_FIELD expected_revision #define BPF_F_ATTACH_MASK_BASE \ @@ -4216,6 +4235,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) if (bpf_mprog_supported(ptype)) { if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG) return -EINVAL; + } else if (is_cgroup_prog_type(ptype, 0, false)) { + if (attr->attach_flags & ~(BPF_F_ATTACH_MASK_BASE | BPF_F_ATTACH_MASK_MPROG)) + return -EINVAL; } else { if (attr->attach_flags & ~BPF_F_ATTACH_MASK_BASE) return -EINVAL; @@ -4233,6 +4255,11 @@ static int bpf_prog_attach(const union bpf_attr *attr) return -EINVAL; } + if (is_cgroup_prog_type(ptype, prog->expected_attach_type, true)) { + ret = cgroup_bpf_prog_attach(attr, ptype, prog); + goto out; + } + switch (ptype) { case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: @@ -4244,20 +4271,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_PROG_TYPE_FLOW_DISSECTOR: ret = netns_bpf_prog_attach(attr, prog); break; - case BPF_PROG_TYPE_CGROUP_DEVICE: - case BPF_PROG_TYPE_CGROUP_SKB: - case BPF_PROG_TYPE_CGROUP_SOCK: - case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: - case BPF_PROG_TYPE_CGROUP_SOCKOPT: - case BPF_PROG_TYPE_CGROUP_SYSCTL: - case BPF_PROG_TYPE_SOCK_OPS: - case BPF_PROG_TYPE_LSM: - if (ptype == BPF_PROG_TYPE_LSM && - prog->expected_attach_type != BPF_LSM_CGROUP) - ret = -EINVAL; - else - ret = cgroup_bpf_prog_attach(attr, ptype, prog); - break; case BPF_PROG_TYPE_SCHED_CLS: if (attr->attach_type == BPF_TCX_INGRESS || attr->attach_type == BPF_TCX_EGRESS) @@ -4268,7 +4281,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) default: ret = -EINVAL; } - +out: if (ret) bpf_prog_put(prog); return ret; @@ -4296,6 +4309,9 @@ static int bpf_prog_detach(const union bpf_attr *attr) if (IS_ERR(prog)) return PTR_ERR(prog); } + } else if (is_cgroup_prog_type(ptype, 0, false)) { + if (attr->attach_flags || attr->relative_fd) + return -EINVAL; } else if (attr->attach_flags || attr->relative_fd || attr->expected_revision) { -- cgit v1.2.3 From 97ebac58865dec1bca31140252386daefef1654f Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Fri, 6 Jun 2025 23:02:58 +0800 Subject: bpf: Add show_fdinfo for perf_event Since commit 1b715e1b0ec5 ("bpf: Support ->fill_link_info for perf_event") added perf_event info, we can also show the info via cat /proc/[fd]/fdinfo.
kprobe fdinfo:
link_type:	perf
link_id:	10
prog_tag:	bcf7977d3b93787c
prog_id:	20
name:	bpf_fentry_test1
offset:	0x0
missed:	0
addr:	0xffffffffa28a2904
event_type:	kprobe
cookie:	3735928559

uprobe fdinfo:
link_type:	perf
link_id:	13
prog_tag:	bcf7977d3b93787c
prog_id:	21
name:	/proc/self/exe
offset:	0x63dce4
ref_ctr_offset:	0x33eee2a
event_type:	uprobe
cookie:	3735928559

tracepoint fdinfo:
link_type:	perf
link_id:	11
prog_tag:	bcf7977d3b93787c
prog_id:	22
tp_name:	sched_switch
event_type:	tracepoint
cookie:	3735928559

perf_event fdinfo:
link_type:	perf
link_id:	12
prog_tag:	bcf7977d3b93787c
prog_id:	23
type:	1
config:	2
event_type:	event
cookie:	3735928559

Signed-off-by: Tao Chen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250606150258.3385166-1-chen.dylane@linux.dev --- kernel/bpf/syscall.c | 120 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 97ad57ffc404..0c267f37775b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3795,6 +3795,32 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event, info->perf_event.kprobe.cookie = event->bpf_cookie; return 0; } + +static void bpf_perf_link_fdinfo_kprobe(const struct perf_event *event, + struct seq_file *seq) +{ + const char *name; + int err; + u32 prog_id, type; + u64 offset, addr; + unsigned long missed; + + err = bpf_get_perf_event_info(event, &prog_id, &type, &name, + &offset, &addr, &missed); + if (err) + return; + + seq_printf(seq, + "name:\t%s\n" + "offset:\t%#llx\n" + "missed:\t%lu\n" + "addr:\t%#llx\n" + "event_type:\t%s\n" + "cookie:\t%llu\n", + name, offset, missed, addr, + type == BPF_FD_TYPE_KRETPROBE ? "kretprobe" : "kprobe", + event->bpf_cookie); +} #endif #ifdef CONFIG_UPROBE_EVENTS @@ -3823,6 +3849,31 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event, info->perf_event.uprobe.ref_ctr_offset = ref_ctr_offset; return 0; } + +static void bpf_perf_link_fdinfo_uprobe(const struct perf_event *event, + struct seq_file *seq) +{ + const char *name; + int err; + u32 prog_id, type; + u64 offset, ref_ctr_offset; + unsigned long missed; + + err = bpf_get_perf_event_info(event, &prog_id, &type, &name, + &offset, &ref_ctr_offset, &missed); + if (err) + return; + + seq_printf(seq, + "name:\t%s\n" + "offset:\t%#llx\n" + "ref_ctr_offset:\t%#llx\n" + "event_type:\t%s\n" + "cookie:\t%llu\n", + name, offset, ref_ctr_offset, + type == BPF_FD_TYPE_URETPROBE ?
"uretprobe" : "uprobe", + event->bpf_cookie); +} #endif static int bpf_perf_link_fill_probe(const struct perf_event *event, @@ -3891,10 +3942,79 @@ static int bpf_perf_link_fill_link_info(const struct bpf_link *link, } } +static void bpf_perf_event_link_show_fdinfo(const struct perf_event *event, + struct seq_file *seq) +{ + seq_printf(seq, + "type:\t%u\n" + "config:\t%llu\n" + "event_type:\t%s\n" + "cookie:\t%llu\n", + event->attr.type, event->attr.config, + "event", event->bpf_cookie); +} + +static void bpf_tracepoint_link_show_fdinfo(const struct perf_event *event, + struct seq_file *seq) +{ + int err; + const char *name; + u32 prog_id; + + err = bpf_get_perf_event_info(event, &prog_id, NULL, &name, NULL, + NULL, NULL); + if (err) + return; + + seq_printf(seq, + "tp_name:\t%s\n" + "event_type:\t%s\n" + "cookie:\t%llu\n", + name, "tracepoint", event->bpf_cookie); +} + +static void bpf_probe_link_show_fdinfo(const struct perf_event *event, + struct seq_file *seq) +{ +#ifdef CONFIG_KPROBE_EVENTS + if (event->tp_event->flags & TRACE_EVENT_FL_KPROBE) + return bpf_perf_link_fdinfo_kprobe(event, seq); +#endif + +#ifdef CONFIG_UPROBE_EVENTS + if (event->tp_event->flags & TRACE_EVENT_FL_UPROBE) + return bpf_perf_link_fdinfo_uprobe(event, seq); +#endif +} + +static void bpf_perf_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_perf_link *perf_link; + const struct perf_event *event; + + perf_link = container_of(link, struct bpf_perf_link, link); + event = perf_get_event(perf_link->perf_file); + if (IS_ERR(event)) + return; + + switch (event->prog->type) { + case BPF_PROG_TYPE_PERF_EVENT: + return bpf_perf_event_link_show_fdinfo(event, seq); + case BPF_PROG_TYPE_TRACEPOINT: + return bpf_tracepoint_link_show_fdinfo(event, seq); + case BPF_PROG_TYPE_KPROBE: + return bpf_probe_link_show_fdinfo(event, seq); + default: + return; + } +} + static const struct bpf_link_ops bpf_perf_link_lops = { .release = bpf_perf_link_release, .dealloc = bpf_perf_link_dealloc, .fill_link_info = bpf_perf_link_fill_link_info, + .show_fdinfo = bpf_perf_link_show_fdinfo, }; static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) -- cgit v1.2.3 From 5534e58f2e9bd72b253d033ee0af6e68eb8ac96b Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Mon, 9 Jun 2025 11:30:22 -0700 Subject: bpf: Make reg_not_null() true for CONST_PTR_TO_MAP When reg->type is CONST_PTR_TO_MAP, it can not be null. However the verifier explores the branches under rX == 0 in check_cond_jmp_op() even if reg->type is CONST_PTR_TO_MAP, because it was not checked for in reg_not_null(). Fix this by adding CONST_PTR_TO_MAP to the set of types that are considered non nullable in reg_not_null(). An old "unpriv: cmp map pointer with zero" selftest fails with this change, because now early out correctly triggers in check_cond_jmp_op(), making the verification to pass. In practice verifier may allow pointer to null comparison in unpriv, since in many cases the relevant branch and comparison op are removed as dead code. So change the expected test result to __success_unpriv. 
Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250609183024.359974-2-isolodrai@meta.com --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e31f6b0ccb30..48a3241cda0f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -405,7 +405,8 @@ static bool reg_not_null(const struct bpf_reg_state *reg) type == PTR_TO_MAP_KEY || type == PTR_TO_SOCK_COMMON || (type == PTR_TO_BTF_ID && is_trusted_reg(reg)) || - type == PTR_TO_MEM; + type == PTR_TO_MEM || + type == CONST_PTR_TO_MAP; } static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg) -- cgit v1.2.3 From c7beb48344d2ea0f3f1869b078309dbeb2ed4c96 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Sat, 7 Jun 2025 00:58:14 +0800 Subject: bpf: Add cookie to tracing bpf_link_info bpf_tramp_link includes cookie info, we can add it in bpf_link_info. Signed-off-by: Tao Chen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250606165818.3394397-1-chen.dylane@linux.dev --- kernel/bpf/syscall.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0c267f37775b..85e080c3333d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3416,6 +3416,7 @@ static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, container_of(link, struct bpf_tracing_link, link.link); info->tracing.attach_type = tr_link->attach_type; + info->tracing.cookie = tr_link->link.cookie; bpf_trampoline_unpack_key(tr_link->trampoline->key, &info->tracing.target_obj_id, &info->tracing.target_btf_id); -- cgit v1.2.3 From 380cb6dfa2bffea6fc14291e8f789b46dfb6a713 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Sat, 7 Jun 2025 00:58:17 +0800 Subject: bpf: Add cookie in fdinfo for tracing Add cookie in fdinfo for tracing, the info as follows: link_type: tracing link_id: 6 prog_tag: 9dfdf8ef453843bf prog_id: 35 attach_type: 25 target_obj_id: 1 target_btf_id: 60355 cookie: 9007199254740992 Signed-off-by: Tao Chen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250606165818.3394397-4-chen.dylane@linux.dev --- kernel/bpf/syscall.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 85e080c3333d..185cf662451d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3403,10 +3403,12 @@ static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, seq_printf(seq, "attach_type:\t%d\n" "target_obj_id:\t%u\n" - "target_btf_id:\t%u\n", + "target_btf_id:\t%u\n" + "cookie:\t%llu\n", tr_link->attach_type, target_obj_id, - target_btf_id); + target_btf_id, + tr_link->link.cookie); } static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, -- cgit v1.2.3 From 2bc0575fec3647b204ad3a438661605117a60146 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Sat, 7 Jun 2025 00:58:18 +0800 Subject: bpf: Add cookie in fdinfo for raw_tp Add cookie in fdinfo for raw_tp, the info as follows: link_type: raw_tracepoint link_id: 31 prog_tag: 9dfdf8ef453843bf prog_id: 32 tp_name: sys_enter cookie: 23925373020405760 Signed-off-by: Tao Chen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250606165818.3394397-5-chen.dylane@linux.dev --- kernel/bpf/syscall.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git 
a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 185cf662451d..56500381c28a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3654,8 +3654,10 @@ static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link, container_of(link, struct bpf_raw_tp_link, link); seq_printf(seq, - "tp_name:\t%s\n", - raw_tp_link->btp->tp->name); + "tp_name:\t%s\n" + "cookie:\t%llu\n", + raw_tp_link->btp->tp->name, + raw_tp_link->cookie); } static int bpf_copy_to_user(char __user *ubuf, const char *buf, u32 ulen, -- cgit v1.2.3 From 8b7df50fd40d7ab3b467c9739965b0e5a02e6113 Mon Sep 17 00:00:00 2001 From: Luis Gerhorst Date: Tue, 3 Jun 2025 22:57:52 +0200 Subject: bpf: Move insn if/else into do_check_insn() This is required to catch the errors later and fall back to a nospec if on a speculative path. Eliminate the regs variable as it is only used once and insn_idx is not modified in-between the definition and usage. Do not pass insn but compute it in the function itself. As Eduard points out [1], insn is assumed to correspond to env->insn_idx in many places (e.g, __check_reg_arg()). Move code into do_check_insn(), replace * "continue" with "return 0" after modifying insn_idx * "goto process_bpf_exit" with "return PROCESS_BPF_EXIT" * "goto process_bpf_exit_full" with "return process_bpf_exit_full()" * "do_print_state = " with "*do_print_state = " [1] https://lore.kernel.org/all/293dbe3950a782b8eb3b87b71d7a967e120191fd.camel@gmail.com/ Signed-off-by: Luis Gerhorst Acked-by: Kumar Kartikeya Dwivedi Acked-by: Henriette Herzog Cc: Maximilian Ott Cc: Milan Stephan Link: https://lore.kernel.org/r/20250603205800.334980-2-luis.gerhorst@fau.de Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 428 ++++++++++++++++++++++++++------------------------ 1 file changed, 223 insertions(+), 205 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 48a3241cda0f..48982172565b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19430,20 +19430,223 @@ static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type typ return 0; } +enum { + PROCESS_BPF_EXIT = 1 +}; + +static int process_bpf_exit_full(struct bpf_verifier_env *env, + bool *do_print_state, + bool exception_exit) +{ + /* We must do check_reference_leak here before + * prepare_func_exit to handle the case when + * state->curframe > 0, it may be a callback function, + * for which reference_state must match caller reference + * state when it exits. + */ + int err = check_resource_leak(env, exception_exit, + !env->cur_state->curframe, + "BPF_EXIT instruction in main prog"); + if (err) + return err; + + /* The side effect of the prepare_func_exit which is + * being skipped is that it frees bpf_func_state. + * Typically, process_bpf_exit will only be hit with + * outermost exit. copy_verifier_state in pop_stack will + * handle freeing of any extra bpf_func_state left over + * from not processing all nested function exits. We + * also skip return code checks as they are not needed + * for exceptional exits. 
+ */ + if (exception_exit) + return PROCESS_BPF_EXIT; + + if (env->cur_state->curframe) { + /* exit from nested function */ + err = prepare_func_exit(env, &env->insn_idx); + if (err) + return err; + *do_print_state = true; + return 0; + } + + err = check_return_code(env, BPF_REG_0, "R0"); + if (err) + return err; + return PROCESS_BPF_EXIT; +} + +static int do_check_insn(struct bpf_verifier_env *env, bool *do_print_state) +{ + int err; + struct bpf_insn *insn = &env->prog->insnsi[env->insn_idx]; + u8 class = BPF_CLASS(insn->code); + + if (class == BPF_ALU || class == BPF_ALU64) { + err = check_alu_op(env, insn); + if (err) + return err; + + } else if (class == BPF_LDX) { + bool is_ldsx = BPF_MODE(insn->code) == BPF_MEMSX; + + /* Check for reserved fields is already done in + * resolve_pseudo_ldimm64(). + */ + err = check_load_mem(env, insn, false, is_ldsx, true, "ldx"); + if (err) + return err; + } else if (class == BPF_STX) { + if (BPF_MODE(insn->code) == BPF_ATOMIC) { + err = check_atomic(env, insn); + if (err) + return err; + env->insn_idx++; + return 0; + } + + if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) { + verbose(env, "BPF_STX uses reserved fields\n"); + return -EINVAL; + } + + err = check_store_reg(env, insn, false); + if (err) + return err; + } else if (class == BPF_ST) { + enum bpf_reg_type dst_reg_type; + + if (BPF_MODE(insn->code) != BPF_MEM || + insn->src_reg != BPF_REG_0) { + verbose(env, "BPF_ST uses reserved fields\n"); + return -EINVAL; + } + /* check src operand */ + err = check_reg_arg(env, insn->dst_reg, SRC_OP); + if (err) + return err; + + dst_reg_type = cur_regs(env)[insn->dst_reg].type; + + /* check that memory (dst_reg + off) is writeable */ + err = check_mem_access(env, env->insn_idx, insn->dst_reg, + insn->off, BPF_SIZE(insn->code), + BPF_WRITE, -1, false, false); + if (err) + return err; + + err = save_aux_ptr_type(env, dst_reg_type, false); + if (err) + return err; + } else if (class == BPF_JMP || class == BPF_JMP32) { + u8 opcode = BPF_OP(insn->code); + + env->jmps_processed++; + if (opcode == BPF_CALL) { + if (BPF_SRC(insn->code) != BPF_K || + (insn->src_reg != BPF_PSEUDO_KFUNC_CALL && + insn->off != 0) || + (insn->src_reg != BPF_REG_0 && + insn->src_reg != BPF_PSEUDO_CALL && + insn->src_reg != BPF_PSEUDO_KFUNC_CALL) || + insn->dst_reg != BPF_REG_0 || class == BPF_JMP32) { + verbose(env, "BPF_CALL uses reserved fields\n"); + return -EINVAL; + } + + if (env->cur_state->active_locks) { + if ((insn->src_reg == BPF_REG_0 && + insn->imm != BPF_FUNC_spin_unlock) || + (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && + (insn->off != 0 || !kfunc_spin_allowed(insn->imm)))) { + verbose(env, + "function calls are not allowed while holding a lock\n"); + return -EINVAL; + } + } + if (insn->src_reg == BPF_PSEUDO_CALL) { + err = check_func_call(env, insn, &env->insn_idx); + } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { + err = check_kfunc_call(env, insn, &env->insn_idx); + if (!err && is_bpf_throw_kfunc(insn)) + return process_bpf_exit_full(env, do_print_state, true); + } else { + err = check_helper_call(env, insn, &env->insn_idx); + } + if (err) + return err; + + mark_reg_scratched(env, BPF_REG_0); + } else if (opcode == BPF_JA) { + if (BPF_SRC(insn->code) != BPF_K || + insn->src_reg != BPF_REG_0 || + insn->dst_reg != BPF_REG_0 || + (class == BPF_JMP && insn->imm != 0) || + (class == BPF_JMP32 && insn->off != 0)) { + verbose(env, "BPF_JA uses reserved fields\n"); + return -EINVAL; + } + + if (class == BPF_JMP) + env->insn_idx += insn->off + 1; + else + 
env->insn_idx += insn->imm + 1; + return 0; + } else if (opcode == BPF_EXIT) { + if (BPF_SRC(insn->code) != BPF_K || + insn->imm != 0 || + insn->src_reg != BPF_REG_0 || + insn->dst_reg != BPF_REG_0 || + class == BPF_JMP32) { + verbose(env, "BPF_EXIT uses reserved fields\n"); + return -EINVAL; + } + return process_bpf_exit_full(env, do_print_state, false); + } else { + err = check_cond_jmp_op(env, insn, &env->insn_idx); + if (err) + return err; + } + } else if (class == BPF_LD) { + u8 mode = BPF_MODE(insn->code); + + if (mode == BPF_ABS || mode == BPF_IND) { + err = check_ld_abs(env, insn); + if (err) + return err; + + } else if (mode == BPF_IMM) { + err = check_ld_imm(env, insn); + if (err) + return err; + + env->insn_idx++; + sanitize_mark_insn_seen(env); + } else { + verbose(env, "invalid BPF_LD mode\n"); + return -EINVAL; + } + } else { + verbose(env, "unknown insn class %d\n", class); + return -EINVAL; + } + + env->insn_idx++; + return 0; +} + static int do_check(struct bpf_verifier_env *env) { bool pop_log = !(env->log.level & BPF_LOG_LEVEL2); struct bpf_verifier_state *state = env->cur_state; struct bpf_insn *insns = env->prog->insnsi; - struct bpf_reg_state *regs; int insn_cnt = env->prog->len; bool do_print_state = false; int prev_insn_idx = -1; for (;;) { - bool exception_exit = false; struct bpf_insn *insn; - u8 class; int err; /* reset current history entry on each new instruction */ @@ -19457,7 +19660,6 @@ static int do_check(struct bpf_verifier_env *env) } insn = &insns[env->insn_idx]; - class = BPF_CLASS(insn->code); if (++env->insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { verbose(env, @@ -19527,215 +19729,31 @@ static int do_check(struct bpf_verifier_env *env) return err; } - regs = cur_regs(env); sanitize_mark_insn_seen(env); prev_insn_idx = env->insn_idx; - if (class == BPF_ALU || class == BPF_ALU64) { - err = check_alu_op(env, insn); - if (err) - return err; - - } else if (class == BPF_LDX) { - bool is_ldsx = BPF_MODE(insn->code) == BPF_MEMSX; - - /* Check for reserved fields is already done in - * resolve_pseudo_ldimm64(). 
- */ - err = check_load_mem(env, insn, false, is_ldsx, true, - "ldx"); - if (err) - return err; - } else if (class == BPF_STX) { - if (BPF_MODE(insn->code) == BPF_ATOMIC) { - err = check_atomic(env, insn); - if (err) - return err; - env->insn_idx++; - continue; - } - - if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) { - verbose(env, "BPF_STX uses reserved fields\n"); - return -EINVAL; - } - - err = check_store_reg(env, insn, false); - if (err) - return err; - } else if (class == BPF_ST) { - enum bpf_reg_type dst_reg_type; - - if (BPF_MODE(insn->code) != BPF_MEM || - insn->src_reg != BPF_REG_0) { - verbose(env, "BPF_ST uses reserved fields\n"); - return -EINVAL; - } - /* check src operand */ - err = check_reg_arg(env, insn->dst_reg, SRC_OP); - if (err) - return err; - - dst_reg_type = regs[insn->dst_reg].type; - - /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, env->insn_idx, insn->dst_reg, - insn->off, BPF_SIZE(insn->code), - BPF_WRITE, -1, false, false); - if (err) - return err; - - err = save_aux_ptr_type(env, dst_reg_type, false); - if (err) - return err; - } else if (class == BPF_JMP || class == BPF_JMP32) { - u8 opcode = BPF_OP(insn->code); - - env->jmps_processed++; - if (opcode == BPF_CALL) { - if (BPF_SRC(insn->code) != BPF_K || - (insn->src_reg != BPF_PSEUDO_KFUNC_CALL - && insn->off != 0) || - (insn->src_reg != BPF_REG_0 && - insn->src_reg != BPF_PSEUDO_CALL && - insn->src_reg != BPF_PSEUDO_KFUNC_CALL) || - insn->dst_reg != BPF_REG_0 || - class == BPF_JMP32) { - verbose(env, "BPF_CALL uses reserved fields\n"); - return -EINVAL; - } - - if (env->cur_state->active_locks) { - if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) || - (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && - (insn->off != 0 || !kfunc_spin_allowed(insn->imm)))) { - verbose(env, "function calls are not allowed while holding a lock\n"); - return -EINVAL; - } - } - if (insn->src_reg == BPF_PSEUDO_CALL) { - err = check_func_call(env, insn, &env->insn_idx); - } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { - err = check_kfunc_call(env, insn, &env->insn_idx); - if (!err && is_bpf_throw_kfunc(insn)) { - exception_exit = true; - goto process_bpf_exit_full; - } - } else { - err = check_helper_call(env, insn, &env->insn_idx); - } - if (err) - return err; - - mark_reg_scratched(env, BPF_REG_0); - } else if (opcode == BPF_JA) { - if (BPF_SRC(insn->code) != BPF_K || - insn->src_reg != BPF_REG_0 || - insn->dst_reg != BPF_REG_0 || - (class == BPF_JMP && insn->imm != 0) || - (class == BPF_JMP32 && insn->off != 0)) { - verbose(env, "BPF_JA uses reserved fields\n"); - return -EINVAL; - } - - if (class == BPF_JMP) - env->insn_idx += insn->off + 1; - else - env->insn_idx += insn->imm + 1; - continue; - - } else if (opcode == BPF_EXIT) { - if (BPF_SRC(insn->code) != BPF_K || - insn->imm != 0 || - insn->src_reg != BPF_REG_0 || - insn->dst_reg != BPF_REG_0 || - class == BPF_JMP32) { - verbose(env, "BPF_EXIT uses reserved fields\n"); - return -EINVAL; - } -process_bpf_exit_full: - /* We must do check_reference_leak here before - * prepare_func_exit to handle the case when - * state->curframe > 0, it may be a callback - * function, for which reference_state must - * match caller reference state when it exits. - */ - err = check_resource_leak(env, exception_exit, !env->cur_state->curframe, - "BPF_EXIT instruction in main prog"); - if (err) - return err; - - /* The side effect of the prepare_func_exit - * which is being skipped is that it frees - * bpf_func_state. 
Typically, process_bpf_exit - * will only be hit with outermost exit. - * copy_verifier_state in pop_stack will handle - * freeing of any extra bpf_func_state left over - * from not processing all nested function - * exits. We also skip return code checks as - * they are not needed for exceptional exits. - */ - if (exception_exit) - goto process_bpf_exit; - - if (state->curframe) { - /* exit from nested function */ - err = prepare_func_exit(env, &env->insn_idx); - if (err) - return err; - do_print_state = true; - continue; - } - - err = check_return_code(env, BPF_REG_0, "R0"); - if (err) - return err; + err = do_check_insn(env, &do_print_state); + if (err < 0) { + return err; + } else if (err == PROCESS_BPF_EXIT) { process_bpf_exit: - mark_verifier_state_scratched(env); - update_branch_counts(env, env->cur_state); - err = pop_stack(env, &prev_insn_idx, - &env->insn_idx, pop_log); - if (err < 0) { - if (err != -ENOENT) - return err; - break; - } else { - if (verifier_bug_if(env->cur_state->loop_entry, env, - "broken loop detection")) - return -EFAULT; - do_print_state = true; - continue; - } - } else { - err = check_cond_jmp_op(env, insn, &env->insn_idx); - if (err) - return err; - } - } else if (class == BPF_LD) { - u8 mode = BPF_MODE(insn->code); - - if (mode == BPF_ABS || mode == BPF_IND) { - err = check_ld_abs(env, insn); - if (err) - return err; - - } else if (mode == BPF_IMM) { - err = check_ld_imm(env, insn); - if (err) + mark_verifier_state_scratched(env); + update_branch_counts(env, env->cur_state); + err = pop_stack(env, &prev_insn_idx, &env->insn_idx, + pop_log); + if (err < 0) { + if (err != -ENOENT) return err; - - env->insn_idx++; - sanitize_mark_insn_seen(env); + break; } else { - verbose(env, "invalid BPF_LD mode\n"); - return -EINVAL; + if (verifier_bug_if(env->cur_state->loop_entry, env, + "broken loop detection")) + return -EFAULT; + do_print_state = true; + continue; } - } else { - verbose(env, "unknown insn class %d\n", class); - return -EINVAL; } - - env->insn_idx++; + WARN_ON_ONCE(err); } return 0; -- cgit v1.2.3 From fd508bde5d646fe8b8e664ae7c523d2d467d6c76 Mon Sep 17 00:00:00 2001 From: Luis Gerhorst Date: Tue, 3 Jun 2025 22:57:53 +0200 Subject: bpf: Return -EFAULT on misconfigurations Mark these cases as non-recoverable to later prevent them from being caught when they occur during speculative path verification. Eduard writes [1]: The only pace I'm aware of that might act upon specific error code from verifier syscall is libbpf. Looking through libbpf code, it seems that this change does not interfere with libbpf. 
[1] https://lore.kernel.org/all/785b4531ce3b44a84059a4feb4ba458c68fce719.camel@gmail.com/ Signed-off-by: Luis Gerhorst Reviewed-by: Eduard Zingerman Acked-by: Kumar Kartikeya Dwivedi Acked-by: Henriette Herzog Cc: Maximilian Ott Cc: Milan Stephan Link: https://lore.kernel.org/r/20250603205800.334980-3-luis.gerhorst@fau.de Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 48982172565b..464facb1e283 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8955,7 +8955,7 @@ static int resolve_map_arg_type(struct bpf_verifier_env *env, if (!meta->map_ptr) { /* kernel subsystem misconfigured verifier */ verbose(env, "invalid map_ptr to access map->type\n"); - return -EACCES; + return -EFAULT; } switch (meta->map_ptr->map_type) { @@ -9643,7 +9643,7 @@ skip_type_check: * that kernel subsystem misconfigured verifier */ verbose(env, "invalid map_ptr to access map->key\n"); - return -EACCES; + return -EFAULT; } key_size = meta->map_ptr->key_size; err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL); @@ -9670,7 +9670,7 @@ skip_type_check: if (!meta->map_ptr) { /* kernel subsystem misconfigured verifier */ verbose(env, "invalid map_ptr to access map->value\n"); - return -EACCES; + return -EFAULT; } meta->raw_mode = arg_type & MEM_UNINIT; err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, @@ -10966,7 +10966,7 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, if (map == NULL) { verbose(env, "kernel subsystem misconfigured verifier\n"); - return -EINVAL; + return -EFAULT; } /* In case of read-only, some additional restrictions @@ -11005,7 +11005,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, return 0; if (!map || map->map_type != BPF_MAP_TYPE_PROG_ARRAY) { verbose(env, "kernel subsystem misconfigured verifier\n"); - return -EINVAL; + return -EFAULT; } reg = ®s[BPF_REG_3]; @@ -11259,7 +11259,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) { verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n", func_id_name(func_id), func_id); - return -EINVAL; + return -EFAULT; } memset(&meta, 0, sizeof(meta)); @@ -11561,7 +11561,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (meta.map_ptr == NULL) { verbose(env, "kernel subsystem misconfigured verifier\n"); - return -EINVAL; + return -EFAULT; } if (func_id == BPF_FUNC_map_lookup_elem && @@ -16738,7 +16738,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) dst_reg->type = CONST_PTR_TO_MAP; } else { verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; + return -EFAULT; } return 0; @@ -16785,7 +16785,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) if (!env->ops->gen_ld_abs) { verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; + return -EFAULT; } if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || @@ -20825,7 +20825,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) -(subprogs[0].stack_depth + 8)); if (epilogue_cnt >= INSN_BUF_SIZE) { verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; + return -EFAULT; } else if (epilogue_cnt) { /* Save the ARG_PTR_TO_CTX for the epilogue to use */ cnt = 0; @@ -20848,13 
+20848,13 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (ops->gen_prologue || env->seen_direct_write) { if (!ops->gen_prologue) { verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; + return -EFAULT; } cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, env->prog); if (cnt >= INSN_BUF_SIZE) { verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; + return -EFAULT; } else if (cnt) { new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); if (!new_prog) @@ -21011,7 +21011,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (type == BPF_WRITE) { verbose(env, "bpf verifier narrow ctx access misconfigured\n"); - return -EINVAL; + return -EFAULT; } size_code = BPF_H; @@ -21030,7 +21030,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (cnt == 0 || cnt >= INSN_BUF_SIZE || (ctx_field_size && !target_size)) { verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; + return -EFAULT; } if (is_narrower_load && size < target_size) { @@ -21038,7 +21038,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) off, size, size_default) * 8; if (shift && cnt + 1 >= INSN_BUF_SIZE) { verbose(env, "bpf verifier narrow ctx load misconfigured\n"); - return -EINVAL; + return -EFAULT; } if (ctx_field_size <= 4) { if (shift) @@ -21803,7 +21803,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) cnt = env->ops->gen_ld_abs(insn, insn_buf); if (cnt == 0 || cnt >= INSN_BUF_SIZE) { verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; + return -EFAULT; } new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); @@ -22139,7 +22139,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) goto patch_map_ops_generic; if (cnt <= 0 || cnt >= INSN_BUF_SIZE) { verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; + return -EFAULT; } new_prog = bpf_patch_insn_data(env, i + delta, @@ -22499,7 +22499,7 @@ next_insn: !map_ptr->ops->map_poke_untrack || !map_ptr->ops->map_poke_run) { verbose(env, "bpf verifier is misconfigured\n"); - return -EINVAL; + return -EFAULT; } ret = map_ptr->ops->map_poke_track(map_ptr, prog->aux); -- cgit v1.2.3 From 6b84d7895d78079c76c5c7de052d8db3ec6680c9 Mon Sep 17 00:00:00 2001 From: Luis Gerhorst Date: Tue, 3 Jun 2025 22:57:54 +0200 Subject: bpf: Return -EFAULT on internal errors This prevents us from trying to recover from these on speculative paths in the future. 
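For context, a short sketch of the distinction this commit is building towards
(the helper below is only added later in this series; it is shown here purely to
illustrate why the error code matters):

    /* Sketch, mirroring error_recoverable_with_nospec() added later in this
     * series: only these errors may be "healed" on a speculative path by
     * inserting a nospec.  -EFAULT deliberately stays fatal, so verifier
     * internal errors always abort verification.
     */
    static bool error_recoverable_with_nospec(int err)
    {
        return err == -EPERM || err == -EACCES || err == -EINVAL;
    }
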
Signed-off-by: Luis Gerhorst Reviewed-by: Eduard Zingerman Acked-by: Kumar Kartikeya Dwivedi Acked-by: Henriette Herzog Cc: Maximilian Ott Cc: Milan Stephan Link: https://lore.kernel.org/r/20250603205800.334980-4-luis.gerhorst@fau.de Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 464facb1e283..04465e317f10 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11654,7 +11654,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn verbose(env, "verifier internal error:"); verbose(env, "func %s has non-overwritten BPF_PTR_POISON return type\n", func_id_name(func_id)); - return -EINVAL; + return -EFAULT; } ret_btf = btf_vmlinux; ret_btf_id = *fn->ret_btf_id; @@ -15287,12 +15287,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, if (WARN_ON_ONCE(ptr_reg)) { print_verifier_state(env, vstate, vstate->curframe, true); verbose(env, "verifier internal error: unexpected ptr_reg\n"); - return -EINVAL; + return -EFAULT; } if (WARN_ON(!src_reg)) { print_verifier_state(env, vstate, vstate->curframe, true); verbose(env, "verifier internal error: no src_reg\n"); - return -EINVAL; + return -EFAULT; } err = adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); if (err) -- cgit v1.2.3 From 03c68a0f8c68936a0bb915b030693923784724cb Mon Sep 17 00:00:00 2001 From: Luis Gerhorst Date: Tue, 3 Jun 2025 23:13:18 +0200 Subject: bpf, arm64, powerpc: Add bpf_jit_bypass_spec_v1/v4() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit JITs can set bpf_jit_bypass_spec_v1/v4() if they want the verifier to skip analysis/patching for the respective vulnerability. For v4, this will reduce the number of barriers the verifier inserts. For v1, it allows more programs to be accepted. The primary motivation for this is to not regress unpriv BPF's performance on ARM64 in a future commit where BPF_NOSPEC is also used against Spectre v1. This has the user-visible change that v1-induced rejections on non-vulnerable PowerPC CPUs are avoided. For now, this does not change the semantics of BPF_NOSPEC. It is still a v4-only barrier and must not be implemented if bypass_spec_v4 is always true for the arch. Changing it to a v1 AND v4-barrier is done in a future commit. As an alternative to bypass_spec_v1/v4, one could introduce NOSPEC_V1 AND NOSPEC_V4 instructions and allow backends to skip their lowering as suggested by commit f5e81d111750 ("bpf: Introduce BPF nospec instruction for mitigating Spectre v4"). Adding bpf_jit_bypass_spec_v1/v4() was found to be preferable for the following reason: * bypass_spec_v1/v4 benefits non-vulnerable CPUs: Always performing the same analysis (not taking into account whether the current CPU is vulnerable), needlessly restricts users of CPUs that are not vulnerable. The only use case for this would be portability-testing, but this can later be added easily when needed by allowing users to force bypass_spec_v1/v4 to false. * Portability is still acceptable: Directly disabling the analysis instead of skipping the lowering of BPF_NOSPEC(_V1/V4) might allow programs on non-vulnerable CPUs to be accepted while the program will be rejected on vulnerable CPUs. With the fallback to speculation barriers for Spectre v1 implemented in a future commit, this will only affect programs that do variable stack-accesses or are very complex. 
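As an illustration (hypothetical arch code, not part of this patch; the real
PowerPC wiring follows below), a backend that knows the CPU is not affected
would simply override the weak helpers:

    /* Hypothetical override in arch/<arch>/net/bpf_jit_comp.c; the
     * cpu_vulnerable_to_* predicates are made up for this sketch.
     */
    bool bpf_jit_bypass_spec_v1(void)
    {
        return !cpu_vulnerable_to_spectre_v1();
    }

    bool bpf_jit_bypass_spec_v4(void)
    {
        return !cpu_vulnerable_to_spectre_v4();
    }
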
For PowerPC, the SEC_FTR checking in bpf_jit_bypass_spec_v4() is based on the check that was previously located in the BPF_NOSPEC case. For LoongArch, it would likely be safe to set both bpf_jit_bypass_spec_v1() and _v4() according to commit a6f6a95f2580 ("LoongArch, bpf: Fix jit to skip speculation barrier opcode"). This is omitted here as I am unable to do any testing for LoongArch. Hari's ack concerns the PowerPC part only. Signed-off-by: Luis Gerhorst Acked-by: Hari Bathini Cc: Henriette Herzog Cc: Maximilian Ott Cc: Milan Stephan Link: https://lore.kernel.org/r/20250603211318.337474-1-luis.gerhorst@fau.de Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c20babbf998f..f9bd9625438b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -3034,6 +3034,21 @@ bool __weak bpf_jit_needs_zext(void) return false; } +/* By default, enable the verifier's mitigations against Spectre v1 and v4 for + * all archs. The value returned must not change at runtime as there is + * currently no support for reloading programs that were loaded without + * mitigations. + */ +bool __weak bpf_jit_bypass_spec_v1(void) +{ + return false; +} + +bool __weak bpf_jit_bypass_spec_v4(void) +{ + return false; +} + /* Return true if the JIT inlines the call to the helper corresponding to * the imm. * -- cgit v1.2.3 From dff883d9e93a7f2f2fa4e38a9444b2c79d6da91a Mon Sep 17 00:00:00 2001 From: Luis Gerhorst Date: Tue, 3 Jun 2025 23:17:03 +0200 Subject: bpf, arm64, powerpc: Change nospec to include v1 barrier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This changes the semantics of BPF_NOSPEC (previously a v4-only barrier) to always emit a speculation barrier that works against both Spectre v1 AND v4. If mitigation is not needed on an architecture, the backend should set bpf_jit_bypass_spec_v4/v1(). As of now, this commit only has the user-visible implication that unpriv BPF's performance on PowerPC is reduced. This is the case because we have to emit additional v1 barrier instructions for BPF_NOSPEC now. This commit is required for a future commit to allow us to rely on BPF_NOSPEC for Spectre v1 mitigation. As of this commit, the feature that nospec acts as a v1 barrier is unused. Commit f5e81d111750 ("bpf: Introduce BPF nospec instruction for mitigating Spectre v4") noted that mitigation instructions for v1 and v4 might be different on some archs. While this would potentially offer improved performance on PowerPC, it was dismissed after the following considerations: * Only having one barrier simplifies the verifier and allows us to easily rely on v4-induced barriers for reducing the complexity of v1-induced speculative path verification. * For the architectures that implemented BPF_NOSPEC, only PowerPC has distinct instructions for v1 and v4. Even there, some insns may be shared between the barriers for v1 and v4 (e.g., 'ori 31,31,0' and 'sync'). If this is still found to impact performance in an unacceptable way, BPF_NOSPEC can be split into BPF_NOSPEC_V1 and BPF_NOSPEC_V4 later. As an optimization, we can already skip v1/v4 insns from being emitted for PowerPC with this setup if bypass_spec_v1/v4 is set. 
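To sketch that optimization (illustrative only; emit_v1_barrier()/emit_v4_barrier()
are made-up helpers, the actual PowerPC JIT uses its STF-barrier and ori/sync
sequences), a backend with distinct barrier sequences can emit:

    case BPF_ST | BPF_NOSPEC:
        /* BPF_NOSPEC is now a combined v1+v4 barrier; skip whichever
         * part this CPU does not need.
         */
        if (!bpf_jit_bypass_spec_v4())
            emit_v4_barrier(ctx);
        if (!bpf_jit_bypass_spec_v1())
            emit_v1_barrier(ctx);
        break;
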
Vulnerability-status for BPF_NOSPEC-based Spectre mitigations (v4 as of this commit, v1 in the future) is therefore: * x86 (32-bit and 64-bit), ARM64, and PowerPC (64-bit): Mitigated - This patch implements BPF_NOSPEC for these architectures. The previous v4-only version was supported since commit f5e81d111750 ("bpf: Introduce BPF nospec instruction for mitigating Spectre v4") and commit b7540d625094 ("powerpc/bpf: Emit stf barrier instruction sequences for BPF_NOSPEC"). * LoongArch: Not Vulnerable - Commit a6f6a95f2580 ("LoongArch, bpf: Fix jit to skip speculation barrier opcode") is the only other past commit related to BPF_NOSPEC and indicates that the insn is not required there. * MIPS: Vulnerable (if unprivileged BPF is enabled) - Commit a6f6a95f2580 ("LoongArch, bpf: Fix jit to skip speculation barrier opcode") indicates that it is not vulnerable, but this contradicts the kernel and Debian documentation. Therefore, I assume that there exist vulnerable MIPS CPUs (but maybe not from Loongson?). In the future, BPF_NOSPEC could be implemented for MIPS based on the GCC speculation_barrier [1]. For now, we rely on unprivileged BPF being disabled by default. * Other: Unknown - To the best of my knowledge there is no definitive information available that indicates that any other arch is vulnerable. They are therefore left untouched (BPF_NOSPEC is not implemented, but bypass_spec_v1/v4 is also not set). I did the following testing to ensure the insn encoding is correct: * ARM64: * 'dsb nsh; isb' was successfully tested with the BPF CI in [2] * 'sb' locally using QEMU v7.2.15 -cpu max (emitted sb insn is executed for example with './test_progs -t verifier_array_access') * PowerPC: The following configs were tested locally with ppc64le QEMU v8.2 '-machine pseries -cpu POWER9': * STF_BARRIER_EIEIO + CONFIG_PPC_BOOK32_64 * STF_BARRIER_SYNC_ORI (forced on) + CONFIG_PPC_BOOK32_64 * STF_BARRIER_FALLBACK (forced on) + CONFIG_PPC_BOOK32_64 * CONFIG_PPC_E500 (forced on) + STF_BARRIER_EIEIO * CONFIG_PPC_E500 (forced on) + STF_BARRIER_SYNC_ORI (forced on) * CONFIG_PPC_E500 (forced on) + STF_BARRIER_FALLBACK (forced on) * CONFIG_PPC_E500 (forced on) + STF_BARRIER_NONE (forced on) Most of those cobinations should not occur in practice, but I was not able to get an PPC e6500 rootfs (for testing PPC_E500 without forcing it on). In any case, this should ensure that there are no unexpected conflicts between the insns when combined like this. Individual v1/v4 barriers were already emitted elsewhere. Hari's ack is for the PowerPC changes only. [1] https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=29b74545531f6afbee9fc38c267524326dbfbedf ("MIPS: Add speculation_barrier support") [2] https://github.com/kernel-patches/bpf/pull/8576 Signed-off-by: Luis Gerhorst Acked-by: Hari Bathini Cc: Henriette Herzog Cc: Maximilian Ott Cc: Milan Stephan Link: https://lore.kernel.org/r/20250603211703.337860-1-luis.gerhorst@fau.de Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f9bd9625438b..e536a34a32c8 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2102,14 +2102,15 @@ out: #undef COND_JMP /* ST, STX and LDX*/ ST_NOSPEC: - /* Speculation barrier for mitigating Speculative Store Bypass. - * In case of arm64, we rely on the firmware mitigation as - * controlled via the ssbd kernel parameter. 
Whenever the - * mitigation is enabled, it works for all of the kernel code - * with no need to provide any additional instructions here. - * In case of x86, we use 'lfence' insn for mitigation. We - * reuse preexisting logic from Spectre v1 mitigation that - * happens to produce the required code on x86 for v4 as well. + /* Speculation barrier for mitigating Speculative Store Bypass, + * Bounds-Check Bypass and Type Confusion. In case of arm64, we + * rely on the firmware mitigation as controlled via the ssbd + * kernel parameter. Whenever the mitigation is enabled, it + * works for all of the kernel code with no need to provide any + * additional instructions here. In case of x86, we use 'lfence' + * insn for mitigation. We reuse preexisting logic from Spectre + * v1 mitigation that happens to produce the required code on + * x86 for v4 as well. */ barrier_nospec(); CONT; -- cgit v1.2.3 From 9124a4508007f146206a279f0c5e81dde314bda1 Mon Sep 17 00:00:00 2001 From: Luis Gerhorst Date: Tue, 3 Jun 2025 23:20:24 +0200 Subject: bpf: Rename sanitize_stack_spill to nospec_result This is made to clarify that this flag will cause a nospec to be added after this insn and can therefore be relied upon to reduce speculative path analysis. Signed-off-by: Luis Gerhorst Acked-by: Kumar Kartikeya Dwivedi Cc: Henriette Herzog Cc: Maximilian Ott Cc: Milan Stephan Link: https://lore.kernel.org/r/20250603212024.338154-1-luis.gerhorst@fau.de Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 04465e317f10..79ae0ee395b0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5027,7 +5027,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, } if (sanitize) - env->insn_aux_data[insn_idx].sanitize_stack_spill = true; + env->insn_aux_data[insn_idx].nospec_result = true; } err = destroy_if_dynptr_stack_slot(env, state, spi); @@ -20930,7 +20930,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) } if (type == BPF_WRITE && - env->insn_aux_data[i + delta].sanitize_stack_spill) { + env->insn_aux_data[i + delta].nospec_result) { struct bpf_insn patch[] = { *insn, BPF_ST_NOSPEC(), -- cgit v1.2.3 From d6f1c85f22534d2d9fea9b32645da19c91ebe7d2 Mon Sep 17 00:00:00 2001 From: Luis Gerhorst Date: Tue, 3 Jun 2025 23:24:28 +0200 Subject: bpf: Fall back to nospec for Spectre v1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This implements the core of the series and causes the verifier to fall back to mitigating Spectre v1 using speculation barriers. The approach was presented at LPC'24 [1] and RAID'24 [2]. If we find any forbidden behavior on a speculative path, we insert a nospec (e.g., lfence speculation barrier on x86) before the instruction and stop verifying the path. While verifying a speculative path, we can furthermore stop verification of that path whenever we encounter a nospec instruction. 
A minimal example program would look as follows: A = true B = true if A goto e f() if B goto e unsafe() e: exit There are the following speculative and non-speculative paths (`cur->speculative` and `speculative` referring to the value of the push_stack() parameters): - A = true - B = true - if A goto e - A && !cur->speculative && !speculative - exit - !A && !cur->speculative && speculative - f() - if B goto e - B && cur->speculative && !speculative - exit - !B && cur->speculative && speculative - unsafe() If f() contains any unsafe behavior under Spectre v1 and the unsafe behavior matches `state->speculative && error_recoverable_with_nospec(err)`, do_check() will now add a nospec before f() instead of rejecting the program: A = true B = true if A goto e nospec f() if B goto e unsafe() e: exit Alternatively, the algorithm also takes advantage of nospec instructions inserted for other reasons (e.g., Spectre v4). Taking the program above as an example, speculative path exploration can stop before f() if a nospec was inserted there because of Spectre v4 sanitization. In this example, all instructions after the nospec are dead code (and with the nospec they are also dead code speculatively). For this, it relies on the fact that speculation barriers generally prevent all later instructions from executing if the speculation was not correct: * On Intel x86_64, lfence acts as full speculation barrier, not only as a load fence [3]: An LFENCE instruction or a serializing instruction will ensure that no later instructions execute, even speculatively, until all prior instructions complete locally. [...] Inserting an LFENCE instruction after a bounds check prevents later operations from executing before the bound check completes. This was experimentally confirmed in [4]. * On AMD x86_64, lfence is dispatch-serializing [5] (requires MSR C001_1029[1] to be set if the MSR is supported, this happens in init_amd()). AMD further specifies "A dispatch serializing instruction forces the processor to retire the serializing instruction and all previous instructions before the next instruction is executed" [8]. As dispatch is not specific to memory loads or branches, lfence therefore also affects all instructions there. Also, if retiring a branch means it's PC change becomes architectural (should be), this means any "wrong" speculation is aborted as required for this series. * ARM's SB speculation barrier instruction also affects "any instruction that appears later in the program order than the barrier" [6]. * PowerPC's barrier also affects all subsequent instructions [7]: [...] executing an ori R31,R31,0 instruction ensures that all instructions preceding the ori R31,R31,0 instruction have completed before the ori R31,R31,0 instruction completes, and that no subsequent instructions are initiated, even out-of-order, until after the ori R31,R31,0 instruction completes. The ori R31,R31,0 instruction may complete before storage accesses associated with instructions preceding the ori R31,R31,0 instruction have been performed Regarding the example, this implies that `if B goto e` will not execute before `if A goto e` completes. Once `if A goto e` completes, the CPU should find that the speculation was wrong and continue with `exit`. If there is any other path that leads to `if B goto e` (and therefore `unsafe()`) without going through `if A goto e`, then a nospec will still be needed there. 
However, this patch assumes this other path will be explored separately and therefore be discovered by the verifier even if the exploration discussed here stops at the nospec. This patch furthermore has the unfortunate consequence that Spectre v1 mitigations now only support architectures which implement BPF_NOSPEC. Before this commit, Spectre v1 mitigations prevented exploits by rejecting the programs on all architectures. Because some JITs do not implement BPF_NOSPEC, this patch therefore may regress unpriv BPF's security to a limited extent: * The regression is limited to systems vulnerable to Spectre v1, have unprivileged BPF enabled, and do NOT emit insns for BPF_NOSPEC. The latter is not the case for x86 64- and 32-bit, arm64, and powerpc 64-bit and they are therefore not affected by the regression. According to commit a6f6a95f2580 ("LoongArch, bpf: Fix jit to skip speculation barrier opcode"), LoongArch is not vulnerable to Spectre v1 and therefore also not affected by the regression. * To the best of my knowledge this regression may therefore only affect MIPS. This is deemed acceptable because unpriv BPF is still disabled there by default. As stated in a previous commit, BPF_NOSPEC could be implemented for MIPS based on GCC's speculation_barrier implementation. * It is unclear which other architectures (besides x86 64- and 32-bit, ARM64, PowerPC 64-bit, LoongArch, and MIPS) supported by the kernel are vulnerable to Spectre v1. Also, it is not clear if barriers are available on these architectures. Implementing BPF_NOSPEC on these architectures therefore is non-trivial. Searching GCC and the kernel for speculation barrier implementations for these architectures yielded no result. * If any of those regressed systems is also vulnerable to Spectre v4, the system was already vulnerable to Spectre v4 attacks based on unpriv BPF before this patch and the impact is therefore further limited. As an alternative to regressing security, one could still reject programs if the architecture does not emit BPF_NOSPEC (e.g., by removing the empty BPF_NOSPEC-case from all JITs except for LoongArch where it appears justified). However, this will cause rejections on these archs that are likely unfounded in the vast majority of cases. In the tests, some are now successful where we previously had a false-positive (i.e., rejection). Change them to reflect where the nospec should be inserted (using __xlated_unpriv) and modify the error message if the nospec is able to mitigate a problem that previously shadowed another problem (in that case __xlated_unpriv does not work, therefore just add a comment). Define SPEC_V1 to avoid duplicating this ifdef whenever we check for nospec insns using __xlated_unpriv, define it here once. This also improves readability. PowerPC can probably also be added here. However, omit it for now because the BPF CI currently does not include a test. Limit it to EPERM, EACCES, and EINVAL (and not everything except for EFAULT and ENOMEM) as it already has the desired effect for most real-world programs. Briefly went through all the occurrences of EPERM, EINVAL, and EACCESS in verifier.c to validate that catching them like this makes sense. Thanks to Dustin for their help in checking the vendor documentation. 
[1] https://lpc.events/event/18/contributions/1954/ ("Mitigating Spectre-PHT using Speculation Barriers in Linux eBPF") [2] https://arxiv.org/pdf/2405.00078 ("VeriFence: Lightweight and Precise Spectre Defenses for Untrusted Linux Kernel Extensions") [3] https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/runtime-speculative-side-channel-mitigations.html ("Managed Runtime Speculative Execution Side Channel Mitigations") [4] https://dl.acm.org/doi/pdf/10.1145/3359789.3359837 ("Speculator: a tool to analyze speculative execution attacks and mitigations" - Section 4.6 "Stopping Speculative Execution") [5] https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/programmer-references/software-techniques-for-managing-speculation.pdf ("White Paper - SOFTWARE TECHNIQUES FOR MANAGING SPECULATION ON AMD PROCESSORS - REVISION 5.09.23") [6] https://developer.arm.com/documentation/ddi0597/2020-12/Base-Instructions/SB--Speculation-Barrier- ("SB - Speculation Barrier - Arm Armv8-A A32/T32 Instruction Set Architecture (2020-12)") [7] https://wiki.raptorcs.com/w/images/5/5f/OPF_PowerISA_v3.1C.pdf ("Power ISA™ - Version 3.1C - May 26, 2024 - Section 9.2.1 of Book III") [8] https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/programmer-references/40332.pdf ("AMD64 Architecture Programmer’s Manual Volumes 1–5 - Revision 4.08 - April 2024 - 7.6.4 Serializing Instructions") Signed-off-by: Luis Gerhorst Acked-by: Kumar Kartikeya Dwivedi Acked-by: Henriette Herzog Cc: Dustin Nguyen Cc: Maximilian Ott Cc: Milan Stephan Link: https://lore.kernel.org/r/20250603212428.338473-1-luis.gerhorst@fau.de Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 74 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 79ae0ee395b0..b1f797616f20 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2013,6 +2013,18 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, return 0; } +static bool error_recoverable_with_nospec(int err) +{ + /* Should only return true for non-fatal errors that are allowed to + * occur during speculative verification. For these we can insert a + * nospec and the program might still be accepted. Do not include + * something like ENOMEM because it is likely to re-occur for the next + * architectural path once it has been recovered-from in all speculative + * paths. 
+ */ + return err == -EPERM || err == -EACCES || err == -EINVAL; +} + static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx, bool speculative) @@ -11147,7 +11159,7 @@ static int check_get_func_ip(struct bpf_verifier_env *env) return -ENOTSUPP; } -static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) +static struct bpf_insn_aux_data *cur_aux(const struct bpf_verifier_env *env) { return &env->insn_aux_data[env->insn_idx]; } @@ -14015,7 +14027,9 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env, const struct bpf_insn *insn) { - return env->bypass_spec_v1 || BPF_SRC(insn->code) == BPF_K; + return env->bypass_spec_v1 || + BPF_SRC(insn->code) == BPF_K || + cur_aux(env)->nospec; } static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, @@ -19732,10 +19746,41 @@ static int do_check(struct bpf_verifier_env *env) sanitize_mark_insn_seen(env); prev_insn_idx = env->insn_idx; + /* Reduce verification complexity by stopping speculative path + * verification when a nospec is encountered. + */ + if (state->speculative && cur_aux(env)->nospec) + goto process_bpf_exit; + err = do_check_insn(env, &do_print_state); - if (err < 0) { + if (state->speculative && error_recoverable_with_nospec(err)) { + /* Prevent this speculative path from ever reaching the + * insn that would have been unsafe to execute. + */ + cur_aux(env)->nospec = true; + /* If it was an ADD/SUB insn, potentially remove any + * markings for alu sanitization. + */ + cur_aux(env)->alu_state = 0; + goto process_bpf_exit; + } else if (err < 0) { return err; } else if (err == PROCESS_BPF_EXIT) { + goto process_bpf_exit; + } + WARN_ON_ONCE(err); + + if (state->speculative && cur_aux(env)->nospec_result) { + /* If we are on a path that performed a jump-op, this + * may skip a nospec patched-in after the jump. This can + * currently never happen because nospec_result is only + * used for the write-ops + * `*(size*)(dst_reg+off)=src_reg|imm32` which must + * never skip the following insn. Still, add a warning + * to document this in case nospec_result is used + * elsewhere in the future. + */ + WARN_ON_ONCE(env->insn_idx != prev_insn_idx + 1); process_bpf_exit: mark_verifier_state_scratched(env); update_branch_counts(env, env->cur_state); @@ -19753,7 +19798,6 @@ process_bpf_exit: continue; } } - WARN_ON_ONCE(err); } return 0; @@ -20881,6 +20925,29 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) bpf_convert_ctx_access_t convert_ctx_access; u8 mode; + if (env->insn_aux_data[i + delta].nospec) { + WARN_ON_ONCE(env->insn_aux_data[i + delta].alu_state); + struct bpf_insn patch[] = { + BPF_ST_NOSPEC(), + *insn, + }; + + cnt = ARRAY_SIZE(patch); + new_prog = bpf_patch_insn_data(env, i + delta, patch, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = new_prog; + insn = new_prog->insnsi + i + delta; + /* This can not be easily merged with the + * nospec_result-case, because an insn may require a + * nospec before and after itself. Therefore also do not + * 'continue' here but potentially apply further + * patching to insn. *insn should equal patch[1] now. 
+ */ + } + if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || insn->code == (BPF_LDX | BPF_MEM | BPF_H) || insn->code == (BPF_LDX | BPF_MEM | BPF_W) || @@ -20931,6 +20998,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (type == BPF_WRITE && env->insn_aux_data[i + delta].nospec_result) { + /* nospec_result is only used to mitigate Spectre v4 and + * to limit verification-time for Spectre v1. + */ struct bpf_insn patch[] = { *insn, BPF_ST_NOSPEC(), -- cgit v1.2.3 From 3d7e10188ae0b68dadd60f611ca81ecf9d991f77 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 23 May 2025 18:26:21 +0200 Subject: sched: Make clangd usable Due to the weird Makefile setup of sched the various files do not compile as stand alone units. The new generation of editors are trying to do just this -- mostly to offer fancy things like completions but also better syntax highlighting and code navigation. Specifically, I've been playing around with neovim and clangd. Setting up clangd on the kernel source is a giant pain in the arse (this really should be improved), but once you do manage, you run into dumb stuff like the above. Fix up the scheduler files to at least pretend to work. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Ingo Molnar Tested-by: Juri Lelli Link: https://lkml.kernel.org/r/20250523164348.GN39944@noisy.programming.kicks-ass.net --- kernel/sched/autogroup.c | 3 +++ kernel/sched/autogroup.h | 2 ++ kernel/sched/clock.c | 3 +++ kernel/sched/completion.c | 5 +++++ kernel/sched/core_sched.c | 2 ++ kernel/sched/cpuacct.c | 2 ++ kernel/sched/cpudeadline.c | 1 + kernel/sched/cpudeadline.h | 2 ++ kernel/sched/cpufreq.c | 1 + kernel/sched/cpufreq_schedutil.c | 2 ++ kernel/sched/cpupri.c | 1 + kernel/sched/cpupri.h | 3 +++ kernel/sched/cputime.c | 3 +++ kernel/sched/deadline.c | 4 ++++ kernel/sched/debug.c | 3 +++ kernel/sched/idle.c | 5 +++++ kernel/sched/isolation.c | 2 ++ kernel/sched/loadavg.c | 2 ++ kernel/sched/membarrier.c | 2 ++ kernel/sched/pelt.c | 1 + kernel/sched/pelt.h | 7 ++++++- kernel/sched/psi.c | 4 ++++ kernel/sched/rt.c | 3 +++ kernel/sched/sched-pelt.h | 1 + kernel/sched/sched.h | 1 + kernel/sched/smp.h | 7 +++++++ kernel/sched/stats.c | 1 + kernel/sched/stop_task.c | 1 + kernel/sched/swait.c | 1 + kernel/sched/topology.c | 2 ++ kernel/sched/wait.c | 1 + kernel/sched/wait_bit.c | 3 +++ 32 files changed, 80 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 2b331822c7e7..e96a167c3f28 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -4,6 +4,9 @@ * Auto-group scheduling implementation: */ +#include "autogroup.h" +#include "sched.h" + unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; static struct autogroup autogroup_default; static atomic_t autogroup_seq_nr; diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h index 90d69f2c5eaf..5c1796a463af 100644 --- a/kernel/sched/autogroup.h +++ b/kernel/sched/autogroup.h @@ -2,6 +2,8 @@ #ifndef _KERNEL_SCHED_AUTOGROUP_H #define _KERNEL_SCHED_AUTOGROUP_H +#include "sched.h" + #ifdef CONFIG_SCHED_AUTOGROUP struct autogroup { diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index a09655b48140..e62b5510cd2e 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -54,6 +54,9 @@ * */ +#include +#include "sched.h" + /* * Scheduler clock - returns current time in nanosec units. * This is default implementation. 
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 3561ab533dd4..19ee702273c0 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -13,6 +13,11 @@ * Waiting for completion is a typically sync point, but not an exclusion point. */ +#include +#include +#include +#include "sched.h" + static void complete_with_flags(struct completion *x, int wake_flags) { unsigned long flags; diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index c4606ca89210..9ede71ecba7f 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -4,6 +4,8 @@ * A simple wrapper around refcount. An allocated sched_core_cookie's * address is used to compute the cookie of the task. */ +#include "sched.h" + struct sched_core_cookie { refcount_t refcnt; }; diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 0de9dda09949..23a56ba12d81 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -6,6 +6,8 @@ * Based on the work by Paul Menage (menage@google.com) and Balbir Singh * (balbir@in.ibm.com). */ +#include +#include "sched.h" /* Time spent by the tasks of the CPU accounting group executing in ... */ enum cpuacct_stat_index { diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 95baa12a1029..cdd740b3f774 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -6,6 +6,7 @@ * * Author: Juri Lelli */ +#include "sched.h" static inline int parent(int i) { diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 0adeda93b5fb..3f7c73d1d189 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -1,4 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include +#include #define IDX_INVALID -1 diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 5252fb191fae..742fb9e62e1a 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -5,6 +5,7 @@ * Copyright (C) 2016, Intel Corporation * Author: Rafael J. Wysocki */ +#include "sched.h" DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 461242ec958a..3d7e9ccb99a0 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -5,6 +5,8 @@ * Copyright (C) 2016, Intel Corporation * Author: Rafael J. Wysocki */ +#include +#include "sched.h" #define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8) diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 42c40cfdf836..76a9ac5eb794 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -22,6 +22,7 @@ * worst case complexity of O(min(101, nr_domcpus)), though the scenario that * yields the worst case search is fairly contrived. 
*/ +#include "sched.h" /* * p->rt_priority p->prio newpri cpupri diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index d6cba0020064..f0f5a734c0c1 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -1,4 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO+1) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 6dab4854c6c0..f01f17acb4a8 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -2,6 +2,9 @@ /* * Simple CPU accounting cgroup controller */ +#include +#include +#include "sched.h" #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE #include diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index ad45a8fea245..ff5be80c5c9d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -17,6 +17,10 @@ */ #include +#include +#include +#include "sched.h" +#include "pelt.h" /* * Default limits for DL period; on the top end we guard against small util diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 9d71baf08075..b384124d89d3 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -6,6 +6,9 @@ * * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar */ +#include +#include +#include "sched.h" /* * This allows printing both to /sys/kernel/debug/sched/debug and diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 2c85c86b455f..cd3298653ef4 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -6,6 +6,11 @@ * (NOTE: these are not related to SCHED_IDLE batch scheduled * tasks which are handled in sched/fair.c ) */ +#include +#include +#include +#include "sched.h" +#include "smp.h" /* Linker adds these: start and end of __cpuidle functions */ extern char __cpuidle_text_start[], __cpuidle_text_end[]; diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 93b038d48900..a4cf17b1fab0 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -7,6 +7,8 @@ * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker * */ +#include +#include "sched.h" enum hk_flags { HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN), diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index c48900b856a2..f6df84ca6925 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -6,6 +6,8 @@ * figure. Its a silly number but people think its important. We go through * great pains to make it work on big machines and tickless kernels. 
*/ +#include +#include "sched.h" /* * Global load-average calculations diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 809194cd779f..62fba83b7bb1 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -4,6 +4,8 @@ * * membarrier system call */ +#include +#include "sched.h" /* * For documentation purposes, here are some membarrier ordering diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 7a8534a2deff..09be6a83e45a 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -23,6 +23,7 @@ * Move PELT related code from fair.c into this pelt.c file * Author: Vincent Guittot */ +#include "pelt.h" /* * Approximate: diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index f4f6a0875c66..19592077452e 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -1,3 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef _KERNEL_SCHED_PELT_H +#define _KERNEL_SCHED_PELT_H +#include "sched.h" + #ifdef CONFIG_SMP #include "sched-pelt.h" @@ -233,4 +238,4 @@ update_idle_rq_clock_pelt(struct rq *rq) { } static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { } #endif - +#endif /* _KERNEL_SCHED_PELT_H */ diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index ad04a5c3162a..333a7ba39668 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -136,6 +136,10 @@ * cost-wise, yet way more sensitive and accurate than periodic * sampling of the aggregate task states would be. */ +#include +#include +#include +#include "sched.h" static int psi_bug __read_mostly; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e40422c37033..16008ac33e14 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -4,6 +4,9 @@ * policies) */ +#include "sched.h" +#include "pelt.h" + int sched_rr_timeslice = RR_TIMESLICE; /* More than 4 hours if BW_SHIFT equals 20. */ static const u64 max_rt_runtime = MAX_BW; diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h index c529706bed11..6803cfec7a1e 100644 --- a/kernel/sched/sched-pelt.h +++ b/kernel/sched/sched-pelt.h @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Generated by Documentation/scheduler/sched-pelt; do not modify. 
*/ +#include static const u32 runnable_avg_yN_inv[] __maybe_unused = { 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 475bb5998295..f3a41487c96d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -69,6 +69,7 @@ #include #include #include +#include #include #include diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h index 21ac44428bb0..7f151d96dba9 100644 --- a/kernel/sched/smp.h +++ b/kernel/sched/smp.h @@ -1,8 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _KERNEL_SCHED_SMP_H +#define _KERNEL_SCHED_SMP_H + /* * Scheduler internal SMP callback types and methods between the scheduler * and other internal parts of the core kernel: */ +#include extern void sched_ttwu_pending(void *arg); @@ -13,3 +18,5 @@ extern void flush_smp_call_function_queue(void); #else static inline void flush_smp_call_function_queue(void) { } #endif + +#endif /* _KERNEL_SCHED_SMP_H */ diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 4346fd81c31f..1faea875d0b3 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -2,6 +2,7 @@ /* * /proc/schedstat implementation */ +#include "sched.h" void __update_stats_wait_start(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 058dd42e3d9b..1c1bbc6e33b6 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -7,6 +7,7 @@ * * See kernel/stop_machine.c */ +#include "sched.h" #ifdef CONFIG_SMP static int diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 72505cd3b60a..0fef6496c4c8 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -2,6 +2,7 @@ /* * (simple wait queues ) implementation: */ +#include "sched.h" void __init_swait_queue_head(struct swait_queue_head *q, const char *name, struct lock_class_key *key) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b958fe48e020..9026d325d0fd 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -3,7 +3,9 @@ * Scheduler topology setup/handling methods */ +#include #include +#include "sched.h" DEFINE_MUTEX(sched_domains_mutex); void sched_domains_mutex_lock(void) diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 51e38f5f4701..a6f00012c72e 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -4,6 +4,7 @@ * * (C) 2004 Nadia Yvette Chambers, Oracle */ +#include "sched.h" void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) { diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index b410b61cec95..1088d3b7012c 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -1,5 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only +#include +#include "sched.h" + /* * The implementation of the wait_bit*() and related waiting APIs: */ -- cgit v1.2.3 From b01f2d9597250e9c4011cb78d8d46287deaa6a69 Mon Sep 17 00:00:00 2001 From: wang wei Date: Thu, 5 Jun 2025 23:29:31 +0800 Subject: sched/eevdf: Correct the comment in place_entity Correct "l" to "vl_i". 
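As a quick sanity check of the corrected identity (numbers chosen arbitrarily, not from the source): with W = 3, V = 10, w_i = 1 and vl_i = 4, the third line gives V' = (3*10 + 1*10 - 1*4) / (3 + 1) = 36/4 = 9, the corrected fourth line gives (10*(3 + 1) - 1*4) / (3 + 1) = 36/4 = 9, and the final line gives 10 - 1*4/(3 + 1) = 9. With the old "w_i*l" the fourth line referenced a symbol "l" that appears nowhere else in the derivation.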
Signed-off-by: wang wei Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250605152931.22804-1-a929244872@163.com --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7a14da5396fb..83157de5b808 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5253,7 +5253,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i) * = (W*V + w_i*(V - vl_i)) / (W + w_i) * = (W*V + w_i*V - w_i*vl_i) / (W + w_i) - * = (V*(W + w_i) - w_i*l) / (W + w_i) + * = (V*(W + w_i) - w_i*vl_i) / (W + w_i) * = V - w_i*vl_i / (W + w_i) * * And the actual lag after adding an entity with vl_i is: -- cgit v1.2.3 From bc4394e5e79cdda1b0997e0be1d65e242f523f02 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 6 Jun 2025 12:25:46 -0700 Subject: perf: Fix the throttle error of some clock events Both the ARM and IBM CI report an RCU stall, which can be reproduced with the perf command below. perf record -a -e cpu-clock -- sleep 2 The issue was introduced by the generic throttle patch set, which unconditionally invokes event_stop() when the throttle is triggered. The cpu-clock and task-clock events are two special SW events which rely on the hrtimer. The throttle is invoked in the hrtimer handler. The event_stop()->hrtimer_cancel() waits for the handler to finish, which is a deadlock. Instead of invoking stop(), HRTIMER_NORESTART should be used to stop the timer. There are two possible ways to fix it: - Introduce a PMU flag to track the case. Avoid the event_stop() in perf_event_throttle() if the flag is detected. This has been implemented in https://lore.kernel.org/lkml/20250528175832.2999139-1-kan.liang@linux.intel.com/ but the new flag was deemed overkill for the issue. - Add a check in the event_stop(). Return immediately if the throttle is invoked in the hrtimer handler. Rely on the existing HRTIMER_NORESTART method to stop the timer. The latter is implemented here. Move event->hw.interrupts = MAX_INTERRUPTS before the stop(). It makes the order the same as perf_event_unthrottle(). Apart from this patch, nothing checks hw.interrupts in the stop(), so there is no impact from the order change. When stopping for a throttle, the event should not be updated, stop(event, 0). But cpu_clock_event_stop() doesn't handle the flag, which is logically wrong. It didn't cause any problems with the old code only because the stop() was not invoked when handling the throttle. Check the flag before updating the event.
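For readers unfamiliar with the hrtimer constraint the fix relies on, here is a minimal sketch of the safe stop pattern; should_throttle() and sample_period_ns are illustrative stand-ins, not the actual perf code:

#include <linux/hrtimer.h>

/* Hypothetical sampling handler, mirroring the shape of an hrtimer callback. */
static enum hrtimer_restart sample_hrtimer_fn(struct hrtimer *t)
{
	if (should_throttle()) {
		/*
		 * hrtimer_cancel() waits for the running handler -- i.e.
		 * this very function -- to finish, so any stop() path that
		 * ends in hrtimer_cancel() self-deadlocks when invoked from
		 * here. Returning HRTIMER_NORESTART stops the timer safely.
		 */
		return HRTIMER_NORESTART;
	}

	/* Re-arm for the next sampling period and keep running. */
	hrtimer_forward_now(t, ns_to_ktime(sample_period_ns));
	return HRTIMER_RESTART;
}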
Fixes: 9734e25fbf5a ("perf: Fix the throttle logic for a group") Closes: https://lore.kernel.org/lkml/20250527161656.GJ2566836@e132581.arm.com/ Closes: https://lore.kernel.org/lkml/djxlh5fx326gcenwrr52ry3pk4wxmugu4jccdjysza7tlc5fef@ktp4rffawgcw/ Closes: https://lore.kernel.org/lkml/8e8f51d8-af64-4d9e-934b-c0ee9f131293@linux.ibm.com/ Closes: https://lore.kernel.org/lkml/4ce106d0-950c-aadc-0b6a-f0215cd39913@maine.edu/ Reported-by: Leo Yan Reported-by: Aishwarya TCV Reported-by: Alexei Starovoitov Reported-by: Venkat Rao Bagalkote Reported-by: Vince Weaver Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ian Rogers Link: https://lkml.kernel.org/r/20250606192546.915765-1-kan.liang@linux.intel.com --- kernel/events/core.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d7cf008f3d75..1f746469fda5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2674,8 +2674,8 @@ static void perf_event_unthrottle(struct perf_event *event, bool start) static void perf_event_throttle(struct perf_event *event) { - event->pmu->stop(event, 0); event->hw.interrupts = MAX_INTERRUPTS; + event->pmu->stop(event, 0); if (event == event->group_leader) perf_log_throttle(event, 0); } @@ -11774,7 +11774,12 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - if (is_sampling_event(event)) { + /* + * The throttle can be triggered in the hrtimer handler. + * The HRTIMER_NORESTART should be used to stop the timer, + * rather than hrtimer_cancel(). See perf_swevent_hrtimer() + */ + if (is_sampling_event(event) && (hwc->interrupts != MAX_INTERRUPTS)) { ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); local64_set(&hwc->period_left, ktime_to_ns(remaining)); @@ -11829,7 +11834,8 @@ static void cpu_clock_event_start(struct perf_event *event, int flags) static void cpu_clock_event_stop(struct perf_event *event, int flags) { perf_swevent_cancel_hrtimer(event); - cpu_clock_event_update(event); + if (flags & PERF_EF_UPDATE) + cpu_clock_event_update(event); } static int cpu_clock_event_add(struct perf_event *event, int flags) @@ -11907,7 +11913,8 @@ static void task_clock_event_start(struct perf_event *event, int flags) static void task_clock_event_stop(struct perf_event *event, int flags) { perf_swevent_cancel_hrtimer(event); - task_clock_event_update(event, event->ctx->time); + if (flags & PERF_EF_UPDATE) + task_clock_event_update(event, event->ctx->time); } static int task_clock_event_add(struct perf_event *event, int flags) -- cgit v1.2.3 From 69a14d146f3b87819f3fb73ed5d1de3e1fa680c1 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 2 Jun 2025 13:00:27 +0200 Subject: futex: Verify under the lock if hash can be replaced Once the global hash is requested there is no way to switch back to the per-task private hash. This is checked at the beginning of the function. It is possible for two threads to simultaneously request the global hash: both pass the initial check and later block on the mm::futex_hash_lock. In this case the first thread performs the switch to the global hash. The second thread will also attempt to switch to the global hash and, while doing so, access the nonexistent slot 1 of the struct futex_private_hash. The same applies if the hash is made immutable: there is no reference counting and the hash must not be replaced.
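The fix follows the classic check-then-recheck-under-lock pattern; a minimal sketch under simplifying assumptions (already_global() and do_switch() are illustrative stand-ins, and the lock is assumed to be a mutex -- this is not the actual futex code):

#include <linux/mutex.h>

static int futex_switch_to_global(struct mm_struct *mm)
{
	/* Unlocked pre-check: two racing threads may both pass this. */
	if (already_global(mm))
		return -EBUSY;

	mutex_lock(&mm->futex_hash_lock);
	/* Re-check under the lock: the loser of the race bails out here. */
	if (already_global(mm)) {
		mutex_unlock(&mm->futex_hash_lock);
		return -EBUSY;
	}
	do_switch(mm);		/* only one thread ever performs the switch */
	mutex_unlock(&mm->futex_hash_lock);
	return 0;
}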
Verify under mm_struct::futex_phash that neither the global hash nor an immutable hash is in use. Tested-by: "Lai, Yi" Reported-by: "Lai, Yi" Closes: https://lore.kernel.org/all/aDwDw9Aygqo6oAx+@ly-workstation/ Fixes: bd54df5ea7cad ("futex: Allow to resize the private local hash") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20250610104400.1077266-5-bigeasy@linutronix.de/ --- kernel/futex/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/futex/core.c b/kernel/futex/core.c index b652d2f60c40..90d53fb0ee9e 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -1629,6 +1629,16 @@ again: mm->futex_phash_new = NULL; if (fph) { + if (cur && (!cur->hash_mask || cur->immutable)) { + /* + * If two threads simultaneously request the global + * hash then the first one performs the switch, + * the second one returns here. + */ + free = fph; + mm->futex_phash_new = new; + return -EBUSY; + } if (cur && !new) { /* * If we have an existing hash, but do not yet have -- cgit v1.2.3 From aa807b9f22df2eee28593cbbabba0f93f4aa26c1 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 12 Jun 2025 10:14:17 +0800 Subject: dma-contiguous: honor the cma address limit set up by the user When porting a CMA-related usage from an x86_64 server to an arm64 server, the "cma=4G@4G" setup failed on arm64. The reason is that arm64 and some other architectures impose a specific physical address limit on the reserved CMA area, such as 4GB, due to some devices' need for 32-bit DMA. Many platforms of those architectures don't actually have this device DMA limit, but still have to obey it, and are therefore unable to reserve a huge CMA pool. This situation can be improved by honoring the user-supplied CMA physical address limit over the arch limit: when users specify it, they already know what the default is, and it evidently doesn't suit them. Suggested-by: Robin Murphy Signed-off-by: Feng Tang Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20250612021417.44929-1-feng.tang@linux.alibaba.com --- kernel/dma/contiguous.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 8df0dfaaca18..67af8a55185d 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -222,7 +222,10 @@ void __init dma_contiguous_reserve(phys_addr_t limit) if (size_cmdline != -1) { selected_size = size_cmdline; selected_base = base_cmdline; - selected_limit = min_not_zero(limit_cmdline, limit); + + /* Honor the user-specified dma address limit */ + selected_limit = limit_cmdline ?: limit; + if (base_cmdline + size_cmdline == limit_cmdline) fixed = true; } else { -- cgit v1.2.3 From ff56a3e2a8613e8524f40ef2efa2c0169659e99e Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Fri, 6 Jun 2025 14:48:18 +0200 Subject: timers/migration: Clean up the loop in tmigr_quick_check() Make the logic easier to follow: - Remove the final return statement, which is never reached, and move the actual walk-terminating return statement out of the do-while loop. - Remove the else-clause to reduce indentation. If a non-lonely group is encountered during the walk, the loop is immediately terminated with a return statement anyway; no need for an else.
Signed-off-by: Petr Tesarik Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20250606124818.455560-1-ptesarik@suse.com --- kernel/time/timer_migration.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 2f6330831f08..c0c54dc5314c 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -1405,23 +1405,20 @@ u64 tmigr_quick_check(u64 nextevt) return KTIME_MAX; do { - if (!tmigr_check_lonely(group)) { + if (!tmigr_check_lonely(group)) return KTIME_MAX; - } else { - /* - * Since current CPU is active, events may not be sorted - * from bottom to the top because the CPU's event is ignored - * up to the top and its sibling's events not propagated upwards. - * Thus keep track of the lowest observed expiry. - */ - nextevt = min_t(u64, nextevt, READ_ONCE(group->next_expiry)); - if (!group->parent) - return nextevt; - } + + /* + * Since current CPU is active, events may not be sorted + * from bottom to the top because the CPU's event is ignored + * up to the top and its sibling's events not propagated upwards. + * Thus keep track of the lowest observed expiry. + */ + nextevt = min_t(u64, nextevt, READ_ONCE(group->next_expiry)); group = group->parent; } while (group); - return KTIME_MAX; + return nextevt; } /* -- cgit v1.2.3 From baaebe0928bf321a1cd980d569e308dec66be94c Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Wed, 11 Jun 2025 13:08:26 -0700 Subject: Revert "bpf: use common instruction history across all states" This reverts commit 96a30e469ca1d2b8cc7811b40911f8614b558241. Next patches in the series modify propagate_precision() to allow arbitrary starting state. Precision propagation requires access to jump history, and arbitrary states represent history not belonging to `env->cur_state`. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250611200836.4135542-1-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 109 ++++++++++++++++++++++++++------------------------ 1 file changed, 56 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b1f797616f20..92f2dad5f453 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1660,6 +1660,13 @@ static void free_func_state(struct bpf_func_state *state) kfree(state); } +static void clear_jmp_history(struct bpf_verifier_state *state) +{ + kfree(state->jmp_history); + state->jmp_history = NULL; + state->jmp_history_cnt = 0; +} + static void free_verifier_state(struct bpf_verifier_state *state, bool free_self) { @@ -1670,6 +1677,7 @@ static void free_verifier_state(struct bpf_verifier_state *state, state->frame[i] = NULL; } kfree(state->refs); + clear_jmp_history(state); if (free_self) kfree(state); } @@ -1734,6 +1742,13 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, struct bpf_func_state *dst; int i, err; + dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history, + src->jmp_history_cnt, sizeof(*dst_state->jmp_history), + GFP_USER); + if (!dst_state->jmp_history) + return -ENOMEM; + dst_state->jmp_history_cnt = src->jmp_history_cnt; + /* if dst has more stack frames then src frame, free them, this is also * necessary in case of exceptional exits using bpf_throw. 
*/ @@ -1751,8 +1766,6 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->parent = src->parent; dst_state->first_insn_idx = src->first_insn_idx; dst_state->last_insn_idx = src->last_insn_idx; - dst_state->insn_hist_start = src->insn_hist_start; - dst_state->insn_hist_end = src->insn_hist_end; dst_state->dfs_depth = src->dfs_depth; dst_state->callback_unroll_depth = src->callback_unroll_depth; dst_state->used_as_loop_entry = src->used_as_loop_entry; @@ -2820,14 +2833,9 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, * The caller state doesn't matter. * This is async callback. It starts in a fresh stack. * Initialize it similar to do_check_common(). - * But we do need to make sure to not clobber insn_hist, so we keep - * chaining insn_hist_start/insn_hist_end indices as for a normal - * child state. */ elem->st.branches = 1; elem->st.in_sleepable = is_sleepable; - elem->st.insn_hist_start = env->cur_state->insn_hist_end; - elem->st.insn_hist_end = elem->st.insn_hist_start; frame = kzalloc(sizeof(*frame), GFP_KERNEL); if (!frame) goto err; @@ -3856,10 +3864,11 @@ static void linked_regs_unpack(u64 val, struct linked_regs *s) } /* for any branch, call, exit record the history of jmps in the given state */ -static int push_insn_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, - int insn_flags, u64 linked_regs) +static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, + int insn_flags, u64 linked_regs) { - struct bpf_insn_hist_entry *p; + u32 cnt = cur->jmp_history_cnt; + struct bpf_jmp_history_entry *p; size_t alloc_size; /* combine instruction flags if we already recorded this instruction */ @@ -3879,32 +3888,29 @@ static int push_insn_history(struct bpf_verifier_env *env, struct bpf_verifier_s return 0; } - if (cur->insn_hist_end + 1 > env->insn_hist_cap) { - alloc_size = size_mul(cur->insn_hist_end + 1, sizeof(*p)); - p = kvrealloc(env->insn_hist, alloc_size, GFP_USER); - if (!p) - return -ENOMEM; - env->insn_hist = p; - env->insn_hist_cap = alloc_size / sizeof(*p); - } + cnt++; + alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p))); + p = krealloc(cur->jmp_history, alloc_size, GFP_USER); + if (!p) + return -ENOMEM; + cur->jmp_history = p; - p = &env->insn_hist[cur->insn_hist_end]; + p = &cur->jmp_history[cnt - 1]; p->idx = env->insn_idx; p->prev_idx = env->prev_insn_idx; p->flags = insn_flags; p->linked_regs = linked_regs; - - cur->insn_hist_end++; + cur->jmp_history_cnt = cnt; env->cur_hist_ent = p; return 0; } -static struct bpf_insn_hist_entry *get_insn_hist_entry(struct bpf_verifier_env *env, - u32 hist_start, u32 hist_end, int insn_idx) +static struct bpf_jmp_history_entry *get_jmp_hist_entry(struct bpf_verifier_state *st, + u32 hist_end, int insn_idx) { - if (hist_end > hist_start && env->insn_hist[hist_end - 1].idx == insn_idx) - return &env->insn_hist[hist_end - 1]; + if (hist_end > 0 && st->jmp_history[hist_end - 1].idx == insn_idx) + return &st->jmp_history[hist_end - 1]; return NULL; } @@ -3921,26 +3927,25 @@ static struct bpf_insn_hist_entry *get_insn_hist_entry(struct bpf_verifier_env * * history entry recording a jump from last instruction of parent state and * first instruction of given state. 
*/ -static int get_prev_insn_idx(const struct bpf_verifier_env *env, - struct bpf_verifier_state *st, - int insn_idx, u32 hist_start, u32 *hist_endp) +static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, + u32 *history) { - u32 hist_end = *hist_endp; - u32 cnt = hist_end - hist_start; + u32 cnt = *history; - if (insn_idx == st->first_insn_idx) { + if (i == st->first_insn_idx) { if (cnt == 0) return -ENOENT; - if (cnt == 1 && env->insn_hist[hist_start].idx == insn_idx) + if (cnt == 1 && st->jmp_history[0].idx == i) return -ENOENT; } - if (cnt && env->insn_hist[hist_end - 1].idx == insn_idx) { - (*hist_endp)--; - return env->insn_hist[hist_end - 1].prev_idx; + if (cnt && st->jmp_history[cnt - 1].idx == i) { + i = st->jmp_history[cnt - 1].prev_idx; + (*history)--; } else { - return insn_idx - 1; + i--; } + return i; } static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) @@ -4121,7 +4126,7 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask) /* If any register R in hist->linked_regs is marked as precise in bt, * do bt_set_frame_{reg,slot}(bt, R) for all registers in hist->linked_regs. */ -static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_insn_hist_entry *hist) +static void bt_sync_linked_regs(struct backtrack_state *bt, struct bpf_jmp_history_entry *hist) { struct linked_regs linked_regs; bool some_precise = false; @@ -4166,7 +4171,7 @@ static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); * - *was* processed previously during backtracking. */ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, - struct bpf_insn_hist_entry *hist, struct backtrack_state *bt) + struct bpf_jmp_history_entry *hist, struct backtrack_state *bt) { struct bpf_insn *insn = env->prog->insnsi + idx; u8 class = BPF_CLASS(insn->code); @@ -4584,7 +4589,7 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_ * SCALARS, as well as any other registers and slots that contribute to * a tracked state of given registers/stack slots, depending on specific BPF * assembly instructions (see backtrack_insns() for exact instruction handling - * logic). This backtracking relies on recorded insn_hist and is able to + * logic). This backtracking relies on recorded jmp_history and is able to * traverse entire chain of parent states. This process ends only when all the * necessary registers/slots and their transitive dependencies are marked as * precise. 
@@ -4701,9 +4706,8 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) for (;;) { DECLARE_BITMAP(mask, 64); - u32 hist_start = st->insn_hist_start; - u32 hist_end = st->insn_hist_end; - struct bpf_insn_hist_entry *hist; + u32 history = st->jmp_history_cnt; + struct bpf_jmp_history_entry *hist; if (env->log.level & BPF_LOG_LEVEL2) { verbose(env, "mark_precise: frame%d: last_idx %d first_idx %d subseq_idx %d \n", @@ -4741,7 +4745,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) err = 0; skip_first = false; } else { - hist = get_insn_hist_entry(env, hist_start, hist_end, i); + hist = get_jmp_hist_entry(st, history, i); err = backtrack_insn(env, i, subseq_idx, hist, bt); } if (err == -ENOTSUPP) { @@ -4758,7 +4762,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) */ return 0; subseq_idx = i; - i = get_prev_insn_idx(env, st, i, hist_start, &hist_end); + i = get_prev_insn_idx(st, i, &history); if (i == -ENOENT) break; if (i >= env->prog->len) { @@ -5122,7 +5126,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, } if (insn_flags) - return push_insn_history(env, env->cur_state, insn_flags, 0); + return push_jmp_history(env, env->cur_state, insn_flags, 0); return 0; } @@ -5429,7 +5433,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env, insn_flags = 0; /* we are not restoring spilled register */ } if (insn_flags) - return push_insn_history(env, env->cur_state, insn_flags, 0); + return push_jmp_history(env, env->cur_state, insn_flags, 0); return 0; } @@ -16496,7 +16500,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } if (insn_flags) { - err = push_insn_history(env, this_branch, insn_flags, 0); + err = push_jmp_history(env, this_branch, insn_flags, 0); if (err) return err; } @@ -16554,7 +16558,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (dst_reg->type == SCALAR_VALUE && dst_reg->id) collect_linked_regs(this_branch, dst_reg->id, &linked_regs); if (linked_regs.cnt > 1) { - err = push_insn_history(env, this_branch, 0, linked_regs_pack(&linked_regs)); + err = push_jmp_history(env, this_branch, 0, linked_regs_pack(&linked_regs)); if (err) return err; } @@ -19052,7 +19056,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) || /* Avoid accumulating infinitely long jmp history */ - cur->insn_hist_end - cur->insn_hist_start > 40; + cur->jmp_history_cnt > 40; /* bpf progs typically have pruning point every 4 instructions * http://vger.kernel.org/bpfconf2019.html#session-1 @@ -19251,7 +19255,7 @@ hit: * the current state. */ if (is_jmp_point(env, env->insn_idx)) - err = err ? : push_insn_history(env, cur, 0, 0); + err = err ? : push_jmp_history(env, cur, 0, 0); err = err ? : propagate_precision(env, &sl->state); if (err) return err; @@ -19333,8 +19337,8 @@ miss: cur->parent = new; cur->first_insn_idx = insn_idx; - cur->insn_hist_start = cur->insn_hist_end; cur->dfs_depth = new->dfs_depth + 1; + clear_jmp_history(cur); list_add(&new_sl->node, head); /* connect new state to parentage chain. 
Current frame needs all @@ -19704,7 +19708,7 @@ static int do_check(struct bpf_verifier_env *env) } if (is_jmp_point(env, env->insn_idx)) { - err = push_insn_history(env, state, 0, 0); + err = push_jmp_history(env, state, 0, 0); if (err) return err; } @@ -24291,7 +24295,6 @@ err_unlock: if (!is_priv) mutex_unlock(&bpf_verifier_lock); vfree(env->insn_aux_data); - kvfree(env->insn_hist); err_free_env: kvfree(env->cfg.insn_postorder); kvfree(env); -- cgit v1.2.3 From 96c6aa4c63af0bb0675c41b3e61a2fc7f6fed998 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Wed, 11 Jun 2025 13:08:27 -0700 Subject: bpf: compute SCCs in program control flow graph Compute strongly connected components in the program CFG. Assign an SCC number to each instruction, recorded in env->insn_aux[*].scc. Use Tarjan's algorithm for SCC computation adapted to run non-recursively. For debug purposes print out computed SCCs as a part of full program dump in compute_live_registers() at log level 2, e.g.: func#0 @0 Live regs before insn: 0: .......... (b4) w6 = 10 2 1: ......6... (18) r1 = 0xffff88810bbb5565 2 3: .1....6... (b4) w2 = 2 2 4: .12...6... (85) call bpf_trace_printk#6 2 5: ......6... (04) w6 += -1 2 6: ......6... (56) if w6 != 0x0 goto pc-6 7: .......... (b4) w6 = 5 1 8: ......6... (18) r1 = 0xffff88810bbb5567 1 10: .1....6... (b4) w2 = 2 1 11: .12...6... (85) call bpf_trace_printk#6 1 12: ......6... (04) w6 += -1 1 13: ......6... (56) if w6 != 0x0 goto pc-6 14: .......... (b4) w0 = 0 15: 0......... (95) exit ^^^ SCC number for the instruction Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250611200836.4135542-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 182 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 182 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 92f2dad5f453..75e4f6544b2a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -24013,6 +24013,10 @@ static int compute_live_registers(struct bpf_verifier_env *env) if (env->log.level & BPF_LOG_LEVEL2) { verbose(env, "Live regs before insn:\n"); for (i = 0; i < insn_cnt; ++i) { + if (env->insn_aux_data[i].scc) + verbose(env, "%3d ", env->insn_aux_data[i].scc); + else + verbose(env, " "); verbose(env, "%3d: ", i); for (j = BPF_REG_0; j < BPF_REG_10; ++j) if (insn_aux[i].live_regs_before & BIT(j)) @@ -24034,6 +24038,180 @@ out: return err; } +/* + * Compute strongly connected components (SCCs) on the CFG. + * Assign an SCC number to each instruction, recorded in env->insn_aux[*].scc. + * If instruction is a sole member of its SCC and there are no self edges, + * assign it SCC number of zero. + * Uses a non-recursive adaptation of Tarjan's algorithm for SCC computation. + */ +static int compute_scc(struct bpf_verifier_env *env) +{ + const u32 NOT_ON_STACK = U32_MAX; + + struct bpf_insn_aux_data *aux = env->insn_aux_data; + const u32 insn_cnt = env->prog->len; + int stack_sz, dfs_sz, err = 0; + u32 *stack, *pre, *low, *dfs; + u32 succ_cnt, i, j, t, w; + u32 next_preorder_num; + u32 next_scc_id; + bool assign_scc; + u32 succ[2]; + + next_preorder_num = 1; + next_scc_id = 1; + /* + * - 'stack' accumulates vertices in DFS order, see invariant comment below; + * - 'pre[t] == p' => preorder number of vertex 't' is 'p'; + * - 'low[t] == n' => smallest preorder number of the vertex reachable from 't' is 'n'; + * - 'dfs' DFS traversal stack, used to emulate explicit recursion. 
+ */ + stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + pre = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + low = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + dfs = kvcalloc(insn_cnt, sizeof(*dfs), GFP_KERNEL); + if (!stack || !pre || !low || !dfs) { + err = -ENOMEM; + goto exit; + } + /* + * References: + * [1] R. Tarjan "Depth-First Search and Linear Graph Algorithms" + * [2] D. J. Pearce "A Space-Efficient Algorithm for Finding Strongly Connected Components" + * + * The algorithm maintains the following invariant: + * - suppose there is a path 'u' ~> 'v', such that 'pre[v] < pre[u]'; + * - then, vertex 'u' remains on stack while vertex 'v' is on stack. + * + * Consequently: + * - If 'low[v] < pre[v]', there is a path from 'v' to some vertex 'u', + * such that 'pre[u] == low[v]'; vertex 'u' is currently on the stack, + * and thus there is an SCC (loop) containing both 'u' and 'v'. + * - If 'low[v] == pre[v]', loops containing 'v' have been explored, + * and 'v' can be considered the root of some SCC. + * + * Here is pseudo-code for an explicitly recursive version of the algorithm: + * + * NOT_ON_STACK = insn_cnt + 1 + * pre = [0] * insn_cnt + * low = [0] * insn_cnt + * scc = [0] * insn_cnt + * stack = [] + * + * next_preorder_num = 1 + * next_scc_id = 1 + * + * def recur(w): + * nonlocal next_preorder_num + * nonlocal next_scc_id + * + * pre[w] = next_preorder_num + * low[w] = next_preorder_num + * next_preorder_num += 1 + * stack.append(w) + * for s in successors(w): + * # Note: for the classic algorithm the block below should look as: + * # + * # if pre[s] == 0: + * # recur(s) + * # low[w] = min(low[w], low[s]) + * # elif low[s] != NOT_ON_STACK: + * # low[w] = min(low[w], pre[s]) + * # + * # But replacing both 'min' instructions with 'low[w] = min(low[w], low[s])' + * # does not break the invariant and makes the iterative version of the algorithm + * # simpler. See 'Algorithm #3' from [2]. + * + * # 's' not yet visited + * if pre[s] == 0: + * recur(s) + * # if 's' is on stack, pick lowest reachable preorder number from it; + * # if 's' is not on stack 'low[s] == NOT_ON_STACK > low[w]', + * # so 'min' would be a noop. + * low[w] = min(low[w], low[s]) + * + * if low[w] == pre[w]: + * # 'w' is the root of an SCC, pop all vertices + * # below 'w' on stack and assign same SCC to them. + * while True: + * t = stack.pop() + * low[t] = NOT_ON_STACK + * scc[t] = next_scc_id + * if t == w: + * break + * next_scc_id += 1 + * + * for i in range(0, insn_cnt): + * if pre[i] == 0: + * recur(i) + * + * The implementation below replaces explicit recursion with the array 'dfs'. + */ + for (i = 0; i < insn_cnt; i++) { + if (pre[i]) + continue; + stack_sz = 0; + dfs_sz = 1; + dfs[0] = i; +dfs_continue: + while (dfs_sz) { + w = dfs[dfs_sz - 1]; + if (pre[w] == 0) { + low[w] = next_preorder_num; + pre[w] = next_preorder_num; + next_preorder_num++; + stack[stack_sz++] = w; + } + /* Visit 'w' successors */ + succ_cnt = insn_successors(env->prog, w, succ); + for (j = 0; j < succ_cnt; ++j) { + if (pre[succ[j]]) { + low[w] = min(low[w], low[succ[j]]); + } else { + dfs[dfs_sz++] = succ[j]; + goto dfs_continue; + } + } + /* + * Preserve the invariant: if some vertex above in the stack + * is reachable from 'w', keep 'w' on the stack. + */ + if (low[w] < pre[w]) { + dfs_sz--; + goto dfs_continue; + } + /* + * Assign SCC number only if component has two or more elements, + * or if component has a self reference.
+ */ + assign_scc = stack[stack_sz - 1] != w; + for (j = 0; j < succ_cnt; ++j) { + if (succ[j] == w) { + assign_scc = true; + break; + } + } + /* Pop component elements from stack */ + do { + t = stack[--stack_sz]; + low[t] = NOT_ON_STACK; + if (assign_scc) + aux[t].scc = next_scc_id; + } while (t != w); + if (assign_scc) + next_scc_id++; + dfs_sz--; + } + } +exit: + kvfree(stack); + kvfree(pre); + kvfree(low); + kvfree(dfs); + return err; +} + int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) { u64 start_time = ktime_get_ns(); @@ -24155,6 +24333,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret) goto skip_full_check; + ret = compute_scc(env); + if (ret < 0) + goto skip_full_check; + ret = compute_live_registers(env); if (ret < 0) goto skip_full_check; -- cgit v1.2.3 From 13f843c0177eeb367ac63467c538046b90785583 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Wed, 11 Jun 2025 13:08:28 -0700 Subject: bpf: frame_insn_idx() utility function Add a function that returns the IP for a given frame in a call stack of a state. It will be used by the next patch. The `state->insn_idx = env->insn_idx;` assignment in do_check() allows using frame_insn_idx() with env->cur_state. At the moment bpf_verifier_state->insn_idx is set when a new cached state is added in is_state_visited() and accessed only in contexts where the state is already in the cache. Hence this assignment does not change verifier behaviour. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250611200836.4135542-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 75e4f6544b2a..ebb98a78c919 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1964,6 +1964,14 @@ static void update_loop_entry(struct bpf_verifier_env *env, } } +/* Return IP for a given frame in a call stack */ +static u32 frame_insn_idx(struct bpf_verifier_state *st, u32 frame) +{ + return frame == st->curframe + ? st->insn_idx + : st->frame[frame + 1]->callsite; +} + static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { struct bpf_verifier_state_list *sl = NULL, *parent_sl; @@ -18790,9 +18798,7 @@ static bool states_equal(struct bpf_verifier_env *env, * and all frame states need to be equivalent */ for (i = 0; i <= old->curframe; i++) { - insn_idx = i == old->curframe - ? env->insn_idx - : old->frame[i + 1]->callsite; + insn_idx = frame_insn_idx(old, i); if (old->frame[i]->callsite != cur->frame[i]->callsite) return false; if (!func_states_equal(env, old->frame[i], cur->frame[i], insn_idx, exact)) @@ -19687,6 +19693,7 @@ static int do_check(struct bpf_verifier_env *env) } state->last_insn_idx = env->prev_insn_idx; + state->insn_idx = env->insn_idx; if (is_prune_point(env, env->insn_idx)) { err = is_state_visited(env, env->insn_idx); -- cgit v1.2.3 From 9a2a0d79244d27fae6b5174e199b34fe17db0316 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Wed, 11 Jun 2025 13:08:29 -0700 Subject: bpf: starting_state parameter for __mark_chain_precision() Allow `mark_chain_precision()` to run from an arbitrary starting state by replacing direct references to `env->cur_state` with a parameter.
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250611200836.4135542-4-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ebb98a78c919..cca8858c3caa 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4677,12 +4677,13 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_ * mark_all_scalars_imprecise() to hopefully get more permissive and generic * finalized states which help in short circuiting more future states. */ -static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) +static int __mark_chain_precision(struct bpf_verifier_env *env, + struct bpf_verifier_state *starting_state, int regno) { + struct bpf_verifier_state *st = starting_state; struct backtrack_state *bt = &env->bt; - struct bpf_verifier_state *st = env->cur_state; int first_idx = st->first_insn_idx; - int last_idx = env->insn_idx; + int last_idx = starting_state->insn_idx; int subseq_idx = -1; struct bpf_func_state *func; struct bpf_reg_state *reg; @@ -4693,7 +4694,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) return 0; /* set frame number from which we are starting to backtrack */ - bt_init(bt, env->cur_state->curframe); + bt_init(bt, starting_state->curframe); /* Do sanity checks against current state of register and/or stack * slot, but don't set precise flag in current state, as precision @@ -4757,7 +4758,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) err = backtrack_insn(env, i, subseq_idx, hist, bt); } if (err == -ENOTSUPP) { - mark_all_scalars_precise(env, env->cur_state); + mark_all_scalars_precise(env, starting_state); bt_reset(bt); return 0; } else if (err) { @@ -4845,7 +4846,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) * fallback to marking all precise */ if (!bt_empty(bt)) { - mark_all_scalars_precise(env, env->cur_state); + mark_all_scalars_precise(env, starting_state); bt_reset(bt); } @@ -4854,15 +4855,16 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno) int mark_chain_precision(struct bpf_verifier_env *env, int regno) { - return __mark_chain_precision(env, regno); + return __mark_chain_precision(env, env->cur_state, regno); } /* mark_chain_precision_batch() assumes that env->bt is set in the caller to * desired reg and stack masks across all relevant frames */ -static int mark_chain_precision_batch(struct bpf_verifier_env *env) +static int mark_chain_precision_batch(struct bpf_verifier_env *env, + struct bpf_verifier_state *starting_state) { - return __mark_chain_precision(env, -1); + return __mark_chain_precision(env, starting_state, -1); } static bool is_spillable_regtype(enum bpf_reg_type type) @@ -9515,7 +9517,7 @@ static int get_constant_map_key(struct bpf_verifier_env *env, * to prevent pruning on it. 
*/ bt_set_frame_slot(&env->bt, key->frameno, spi); - err = mark_chain_precision_batch(env); + err = mark_chain_precision_batch(env, env->cur_state); if (err < 0) return err; @@ -18939,7 +18941,7 @@ static int propagate_precision(struct bpf_verifier_env *env, verbose(env, "\n"); } - err = mark_chain_precision_batch(env); + err = mark_chain_precision_batch(env, env->cur_state); if (err < 0) return err; -- cgit v1.2.3 From 23b37d616565c89dd202febc68d926345727d092 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Wed, 11 Jun 2025 13:08:30 -0700 Subject: bpf: set 'changed' status if propagate_precision() did any updates Add an out parameter to `propagate_precision()` to record whether any new precision bits were set during its execution. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250611200836.4135542-5-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cca8858c3caa..b00769ceef8c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4678,7 +4678,9 @@ static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_ * finalized states which help in short circuiting more future states. */ static int __mark_chain_precision(struct bpf_verifier_env *env, - struct bpf_verifier_state *starting_state, int regno) + struct bpf_verifier_state *starting_state, + int regno, + bool *changed) { struct bpf_verifier_state *st = starting_state; struct backtrack_state *bt = &env->bt; @@ -4686,13 +4688,14 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int last_idx = starting_state->insn_idx; int subseq_idx = -1; struct bpf_func_state *func; + bool tmp, skip_first = true; struct bpf_reg_state *reg; - bool skip_first = true; int i, fr, err; if (!env->bpf_capable) return 0; + changed = changed ?: &tmp; /* set frame number from which we are starting to backtrack */ bt_init(bt, starting_state->curframe); @@ -4738,8 +4741,10 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, for_each_set_bit(i, mask, 32) { reg = &st->frame[0]->regs[i]; bt_clear_reg(bt, i); - if (reg->type == SCALAR_VALUE) + if (reg->type == SCALAR_VALUE) { reg->precise = true; + *changed = true; + } } return 0; } @@ -4798,10 +4803,12 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, bt_clear_frame_reg(bt, fr, i); continue; } - if (reg->precise) + if (reg->precise) { bt_clear_frame_reg(bt, fr, i); - else + } else { reg->precise = true; + *changed = true; + } } bitmap_from_u64(mask, bt_frame_stack_mask(bt, fr)); @@ -4816,10 +4823,12 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, continue; } reg = &func->stack[i].spilled_ptr; - if (reg->precise) + if (reg->precise) { bt_clear_frame_slot(bt, fr, i); - else + } else { reg->precise = true; + *changed = true; + } } if (env->log.level & BPF_LOG_LEVEL2) { fmt_reg_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, @@ -4855,7 +4864,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int mark_chain_precision(struct bpf_verifier_env *env, int regno) { - return __mark_chain_precision(env, env->cur_state, regno); + return __mark_chain_precision(env, env->cur_state, regno, NULL); } /* mark_chain_precision_batch() assumes that env->bt is set in the caller to @@ -4864,7 +4873,7 @@ int mark_chain_precision(struct bpf_verifier_env *env, int regno) static int mark_chain_precision_batch(struct 
bpf_verifier_env *env, struct bpf_verifier_state *starting_state) { - return __mark_chain_precision(env, starting_state, -1); + return __mark_chain_precision(env, starting_state, -1, NULL); } static bool is_spillable_regtype(enum bpf_reg_type type) @@ -18893,7 +18902,9 @@ static int propagate_liveness(struct bpf_verifier_env *env, * propagate them into the current state */ static int propagate_precision(struct bpf_verifier_env *env, - const struct bpf_verifier_state *old) + const struct bpf_verifier_state *old, + struct bpf_verifier_state *cur, + bool *changed) { struct bpf_reg_state *state_reg; struct bpf_func_state *state; @@ -18941,7 +18952,7 @@ static int propagate_precision(struct bpf_verifier_env *env, verbose(env, "\n"); } - err = mark_chain_precision_batch(env, env->cur_state); + err = __mark_chain_precision(env, cur, -1, changed); if (err < 0) return err; @@ -19264,7 +19275,7 @@ hit: */ if (is_jmp_point(env, env->insn_idx)) err = err ? : push_jmp_history(env, cur, 0, 0); - err = err ? : propagate_precision(env, &sl->state); + err = err ? : propagate_precision(env, &sl->state, cur, NULL); if (err) return err; return 1; -- cgit v1.2.3 From dfb2d4c64b82ac1e7a03e0b28b4326458705d26f Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Wed, 11 Jun 2025 13:08:31 -0700 Subject: bpf: set 'changed' status if propagate_liveness() did any updates Add an out parameter to `propagate_liveness()` to record whether any new liveness bits were set during its execution. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250611200836.4135542-6-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b00769ceef8c..e49bdce05049 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18856,12 +18856,15 @@ static int propagate_liveness_reg(struct bpf_verifier_env *env, */ static int propagate_liveness(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, - struct bpf_verifier_state *vparent) + struct bpf_verifier_state *vparent, + bool *changed) { struct bpf_reg_state *state_reg, *parent_reg; struct bpf_func_state *state, *parent; int i, frame, err = 0; + bool tmp; + changed = changed ?: &tmp; if (vparent->curframe != vstate->curframe) { WARN(1, "propagate_live: parent frame %d current frame %d\n", vparent->curframe, vstate->curframe); @@ -18880,6 +18883,7 @@ static int propagate_liveness(struct bpf_verifier_env *env, &parent_reg[i]); if (err < 0) return err; + *changed |= err > 0; if (err == REG_LIVE_READ64) mark_insn_zext(env, &parent_reg[i]); } @@ -18891,6 +18895,7 @@ static int propagate_liveness(struct bpf_verifier_env *env, state_reg = &state->stack[i].spilled_ptr; err = propagate_liveness_reg(env, state_reg, parent_reg); + *changed |= err > 0; if (err < 0) return err; } @@ -19266,7 +19271,7 @@ hit: * they'll be immediately forgotten as we're pruning * this state and will pop a new one. 
*/ - err = propagate_liveness(env, &sl->state, cur); + err = propagate_liveness(env, &sl->state, cur, NULL); /* if previous state reached the exit with precision and * current state is equivalent to it (except precision marks) -- cgit v1.2.3 From b5c677d8d9e58b9f6c6478ba0850580883588d3c Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Wed, 11 Jun 2025 13:08:32 -0700 Subject: bpf: move REG_LIVE_DONE check to clean_live_states() The next patch will add a relatively heavy-weight operation to clean_live_states(); this operation can be skipped if REG_LIVE_DONE is set. Move the check from clean_verifier_state() to clean_live_states() as a small refactoring commit. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250611200836.4135542-7-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e49bdce05049..90b3d1a0bd86 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18305,10 +18305,6 @@ static void clean_verifier_state(struct bpf_verifier_env *env, { int i; - if (st->frame[0]->regs[0].live & REG_LIVE_DONE) - /* all regs in this state in all frames were already marked */ - return; - for (i = 0; i <= st->curframe; i++) clean_func_state(env, st->frame[i]); } @@ -18363,6 +18359,9 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, if (sl->state.insn_idx != insn || !same_callsites(&sl->state, cur)) continue; + if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) + /* all regs in this state in all frames were already marked */ + continue; clean_verifier_state(env, &sl->state); } } -- cgit v1.2.3 From c9e31900b54cadf5398dfb838c0a63effa1defec Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Wed, 11 Jun 2025 13:08:33 -0700 Subject: bpf: propagate read/precision marks over state graph backedges The current loop_entry-based exact states comparison logic does not handle the following case: .-> A --. Assume the states are visited in the order A, B, C. | | | Assume that state B reaches a state equivalent to state A. | v v At this point, state C is not processed yet, so state A '-- B C has not received any read or precision marks from C. As a result, these marks won't be propagated to B. If B has incomplete marks, it is unsafe to use it in states_equal() checks. This commit replaces the existing logic with the following: - Strongly connected components (SCCs) are computed over the program's control flow graph (intraprocedurally). - When a verifier state enters an SCC, that state is recorded as the SCC entry point. - When a verifier state is found equivalent to another (e.g., B to A in the example), it is recorded as a states graph backedge. Backedges are accumulated per SCC. - When an SCC entry state reaches `branches == 0`, read and precision marks are propagated through the backedges (e.g., from A to B, from C to A, and then again from A to B). To support nested subprogram calls, the entry state and backedge list are associated not with the SCC itself but with an object called `bpf_scc_callchain`. A callchain is a tuple `(callsite*, scc_id)`, where `callsite` is the index of a call instruction for each frame except the last. See the comments added in `is_state_visited()` and `compute_scc_callchain()` for more details.
Fixes: 2a0992829ea3 ("bpf: correct loop detection for iterators convergence") Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250611200836.4135542-8-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 452 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 384 insertions(+), 68 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 90b3d1a0bd86..aa1bb4be7b8b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1700,6 +1700,9 @@ static struct bpf_verifier_state_list *state_loop_entry_as_list(struct bpf_verif return NULL; } +static bool incomplete_read_marks(struct bpf_verifier_env *env, + struct bpf_verifier_state *st); + /* A state can be freed if it is no longer referenced: * - is in the env->free_list; * - has no children states; @@ -1710,20 +1713,14 @@ static struct bpf_verifier_state_list *state_loop_entry_as_list(struct bpf_verif static void maybe_free_verifier_state(struct bpf_verifier_env *env, struct bpf_verifier_state_list *sl) { - struct bpf_verifier_state_list *loop_entry_sl; - - while (sl && sl->in_free_list && - sl->state.branches == 0 && - sl->state.used_as_loop_entry == 0) { - loop_entry_sl = state_loop_entry_as_list(&sl->state); - if (loop_entry_sl) - loop_entry_sl->state.used_as_loop_entry--; - list_del(&sl->node); - free_verifier_state(&sl->state, false); - kfree(sl); - env->free_list_size--; - sl = loop_entry_sl; - } + if (!sl->in_free_list + || sl->state.branches != 0 + || incomplete_read_marks(env, &sl->state)) + return; + list_del(&sl->node); + free_verifier_state(&sl->state, false); + kfree(sl); + env->free_list_size--; } /* copy verifier state from src to dst growing dst stack space @@ -1771,6 +1768,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->used_as_loop_entry = src->used_as_loop_entry; dst_state->may_goto_depth = src->may_goto_depth; dst_state->loop_entry = src->loop_entry; + dst_state->equal_state = src->equal_state; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { @@ -1972,22 +1970,218 @@ static u32 frame_insn_idx(struct bpf_verifier_state *st, u32 frame) : st->frame[frame + 1]->callsite; } -static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +/* For state @st look for a topmost frame with frame_insn_idx() in some SCC, + * if such frame exists form a corresponding @callchain as an array of + * call sites leading to this frame and SCC id. + * E.g.: + * + * void foo() { A: loop {... SCC#1 ...}; } + * void bar() { B: loop { C: foo(); ... SCC#2 ... } + * D: loop { E: foo(); ... SCC#3 ... } } + * void main() { F: bar(); } + * + * @callchain at (A) would be either (F,SCC#2) or (F,SCC#3) depending + * on @st frame call sites being (F,C,A) or (F,E,A). + */ +static bool compute_scc_callchain(struct bpf_verifier_env *env, + struct bpf_verifier_state *st, + struct bpf_scc_callchain *callchain) +{ + u32 i, scc, insn_idx; + + memset(callchain, 0, sizeof(*callchain)); + for (i = 0; i <= st->curframe; i++) { + insn_idx = frame_insn_idx(st, i); + scc = env->insn_aux_data[insn_idx].scc; + if (scc) { + callchain->scc = scc; + break; + } else if (i < st->curframe) { + callchain->callsites[i] = insn_idx; + } else { + return false; + } + } + return true; +} + +/* Check if bpf_scc_visit instance for @callchain exists. 
*/ +static struct bpf_scc_visit *scc_visit_lookup(struct bpf_verifier_env *env, + struct bpf_scc_callchain *callchain) +{ + struct bpf_scc_info *info = env->scc_info[callchain->scc]; + struct bpf_scc_visit *visits; + u32 i; + + if (!info) + return NULL; + visits = info->visits; + for (i = 0; i < info->num_visits; i++) + if (memcmp(callchain, &visits[i].callchain, sizeof(*callchain)) == 0) + return &visits[i]; + return NULL; +} + +/* Allocate a new bpf_scc_visit instance corresponding to @callchain. + * Allocated instances are alive for the duration of the do_check_common() + * call and are freed by free_states(). + */ +static struct bpf_scc_visit *scc_visit_alloc(struct bpf_verifier_env *env, + struct bpf_scc_callchain *callchain) +{ + struct bpf_scc_visit *visit; + struct bpf_scc_info *info; + u32 scc, num_visits; + u64 new_sz; + + scc = callchain->scc; + info = env->scc_info[scc]; + num_visits = info ? info->num_visits : 0; + new_sz = sizeof(*info) + sizeof(struct bpf_scc_visit) * (num_visits + 1); + info = kvrealloc(env->scc_info[scc], new_sz, GFP_KERNEL); + if (!info) + return NULL; + env->scc_info[scc] = info; + info->num_visits = num_visits + 1; + visit = &info->visits[num_visits]; + memset(visit, 0, sizeof(*visit)); + memcpy(&visit->callchain, callchain, sizeof(*callchain)); + return visit; +} + +/* Form a string '(callsite#1,callsite#2,...,scc)' in env->tmp_str_buf */ +static char *format_callchain(struct bpf_verifier_env *env, struct bpf_scc_callchain *callchain) +{ + char *buf = env->tmp_str_buf; + int i, delta = 0; + + delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "("); + for (i = 0; i < ARRAY_SIZE(callchain->callsites); i++) { + if (!callchain->callsites[i]) + break; + delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u,", + callchain->callsites[i]); + } + delta += snprintf(buf + delta, TMP_STR_BUF_LEN - delta, "%u)", callchain->scc); + return env->tmp_str_buf; +} + +/* If callchain for @st exists (@st is in some SCC), ensure that + * bpf_scc_visit instance for this callchain exists. + * If instance does not exist or is empty, assign visit->entry_state to @st. + */ +static int maybe_enter_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +{ + struct bpf_scc_callchain callchain; + struct bpf_scc_visit *visit; + + if (!compute_scc_callchain(env, st, &callchain)) + return 0; + visit = scc_visit_lookup(env, &callchain); + visit = visit ?: scc_visit_alloc(env, &callchain); + if (!visit) + return -ENOMEM; + if (!visit->entry_state) { + visit->entry_state = st; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "SCC enter %s\n", format_callchain(env, &callchain)); + } + return 0; +} + +static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit); + +/* If callchain for @st exists (@st is in some SCC), make it empty: + * - set visit->entry_state to NULL; + * - flush accumulated backedges.
+ */ +static int maybe_exit_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +{ + struct bpf_scc_callchain callchain; + struct bpf_scc_visit *visit; + + if (!compute_scc_callchain(env, st, &callchain)) + return 0; + visit = scc_visit_lookup(env, &callchain); + if (!visit) { + verifier_bug(env, "scc exit: no visit info for call chain %s", + format_callchain(env, &callchain)); + return -EFAULT; + } + if (visit->entry_state != st) + return 0; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "SCC exit %s\n", format_callchain(env, &callchain)); + visit->entry_state = NULL; + return propagate_backedges(env, visit); +} + +/* Look up a bpf_scc_visit instance corresponding to @st's callchain + * and add @backedge to visit->backedges. @st's callchain must exist. + */ +static int add_scc_backedge(struct bpf_verifier_env *env, + struct bpf_verifier_state *st, + struct bpf_scc_backedge *backedge) +{ + struct bpf_scc_callchain callchain; + struct bpf_scc_visit *visit; + + if (!compute_scc_callchain(env, st, &callchain)) { + verifier_bug(env, "add backedge: no SCC in verification path, insn_idx %d", + st->insn_idx); + return -EFAULT; + } + visit = scc_visit_lookup(env, &callchain); + if (!visit) { + verifier_bug(env, "add backedge: no visit info for call chain %s", + format_callchain(env, &callchain)); + return -EFAULT; + } + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "SCC backedge %s\n", format_callchain(env, &callchain)); + backedge->next = visit->backedges; + visit->backedges = backedge; + return 0; +} + +/* bpf_reg_state->live marks for registers in a state @st are incomplete + * if state @st is in some SCC and not all execution paths starting at this + * SCC are fully explored. + */ +static bool incomplete_read_marks(struct bpf_verifier_env *env, + struct bpf_verifier_state *st) +{ + struct bpf_scc_callchain callchain; + struct bpf_scc_visit *visit; + + if (!compute_scc_callchain(env, st, &callchain)) + return false; + visit = scc_visit_lookup(env, &callchain); + if (!visit) + return false; + return !!visit->backedges; +} + +static void free_backedges(struct bpf_scc_visit *visit) +{ + struct bpf_scc_backedge *backedge, *next; + + for (backedge = visit->backedges; backedge; backedge = next) { + free_verifier_state(&backedge->state, false); + next = backedge->next; + kvfree(backedge); + } + visit->backedges = NULL; +} + +static int update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { struct bpf_verifier_state_list *sl = NULL, *parent_sl; struct bpf_verifier_state *parent; + int err; while (st) { u32 br = --st->branches; - /* br == 0 signals that DFS exploration for 'st' is finished, * thus it is necessary to update parent's loop entry if it * turned out that st is a part of some loop. * This is a part of 'case A' in get_loop_entry() comment.
- */ - if (br == 0 && st->parent && st->loop_entry) - update_loop_entry(env, st->parent, st->loop_entry); - /* WARN_ON(br > 1) technically makes sense here, * but see comment in push_stack(), hence: */ @@ -1996,6 +2190,9 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi br); if (br) break; + err = maybe_exit_scc(env, st); + if (err) + return err; parent = st->parent; parent_sl = state_parent_as_list(st); if (sl) @@ -2003,6 +2200,7 @@ static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifi st = parent; sl = parent_sl; } + return 0; } static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, @@ -18344,7 +18542,6 @@ static void clean_verifier_state(struct bpf_verifier_env *env, static void clean_live_states(struct bpf_verifier_env *env, int insn, struct bpf_verifier_state *cur) { - struct bpf_verifier_state *loop_entry; struct bpf_verifier_state_list *sl; struct list_head *pos, *head; @@ -18353,15 +18550,14 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, sl = container_of(pos, struct bpf_verifier_state_list, node); if (sl->state.branches) continue; - loop_entry = get_loop_entry(env, &sl->state); - if (!IS_ERR_OR_NULL(loop_entry) && loop_entry->branches) - continue; if (sl->state.insn_idx != insn || !same_callsites(&sl->state, cur)) continue; if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) /* all regs in this state in all frames were already marked */ continue; + if (incomplete_read_marks(env, &sl->state)) + continue; clean_verifier_state(env, &sl->state); } } @@ -18963,6 +19159,46 @@ static int propagate_precision(struct bpf_verifier_env *env, return 0; } +#define MAX_BACKEDGE_ITERS 64 + +/* Propagate read and precision marks from visit->backedges[*].state->equal_state + * to corresponding parent states of visit->backedges[*].state until fixed point is reached, + * then free visit->backedges. + * After execution of this function incomplete_read_marks() will return false + * for all states corresponding to @visit->callchain. 
+ */ +static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visit *visit) +{ + struct bpf_scc_backedge *backedge; + struct bpf_verifier_state *st; + bool changed; + int i, err; + + i = 0; + do { + if (i++ > MAX_BACKEDGE_ITERS) { + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "%s: too many iterations\n", __func__); + for (backedge = visit->backedges; backedge; backedge = backedge->next) + mark_all_scalars_precise(env, &backedge->state); + break; + } + changed = false; + for (backedge = visit->backedges; backedge; backedge = backedge->next) { + st = &backedge->state; + err = propagate_liveness(env, st->equal_state, st, &changed); + if (err) + return err; + err = propagate_precision(env, st->equal_state, st, &changed); + if (err) + return err; + } + } while (changed); + + free_backedges(visit); + return 0; +} + static bool states_maybe_looping(struct bpf_verifier_state *old, struct bpf_verifier_state *cur) { @@ -19072,9 +19308,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; - struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry; + struct bpf_verifier_state *cur = env->cur_state, *new; + bool force_new_state, add_new_state, loop; int i, j, n, err, states_cnt = 0; - bool force_new_state, add_new_state, force_exact; struct list_head *pos, *tmp, *head; force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) || @@ -19096,6 +19332,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) clean_live_states(env, insn_idx, cur); + loop = false; head = explored_state(env, insn_idx); list_for_each_safe(pos, tmp, head) { sl = container_of(pos, struct bpf_verifier_state_list, node); @@ -19175,7 +19412,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) spi = __get_spi(iter_reg->off + iter_reg->var_off.value); iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr; if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE) { - update_loop_entry(env, cur, &sl->state); + loop = true; goto hit; } } @@ -19184,7 +19421,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) if (is_may_goto_insn_at(env, insn_idx)) { if (sl->state.may_goto_depth != cur->may_goto_depth && states_equal(env, &sl->state, cur, RANGE_WITHIN)) { - update_loop_entry(env, cur, &sl->state); + loop = true; goto hit; } } @@ -19226,38 +19463,9 @@ skip_inf_loop_check: add_new_state = false; goto miss; } - /* If sl->state is a part of a loop and this loop's entry is a part of - * current verification path then states have to be compared exactly. - * 'force_exact' is needed to catch the following case: - * - * initial Here state 'succ' was processed first, - * | it was eventually tracked to produce a - * V state identical to 'hdr'. - * .---------> hdr All branches from 'succ' had been explored - * | | and thus 'succ' has its .branches == 0. - * | V - * | .------... Suppose states 'cur' and 'succ' correspond - * | | | to the same instruction + callsites. - * | V V In such case it is necessary to check - * | ... ... if 'succ' and 'cur' are states_equal(). - * | | | If 'succ' and 'cur' are a part of the - * | V V same loop exact flag has to be set. - * | succ <- cur To check if that is the case, verify - * | | if loop entry of 'succ' is in current - * | V DFS path. - * | ... - * | | - * '----' - * - * Additional details are in the comment before get_loop_entry(). 
- */ - loop_entry = get_loop_entry(env, &sl->state); - if (IS_ERR(loop_entry)) - return PTR_ERR(loop_entry); - force_exact = loop_entry && loop_entry->branches > 0; - if (states_equal(env, &sl->state, cur, force_exact ? RANGE_WITHIN : NOT_EXACT)) { - if (force_exact) - update_loop_entry(env, cur, loop_entry); + /* See comments for mark_all_regs_read_and_precise() */ + loop = incomplete_read_marks(env, &sl->state); + if (states_equal(env, &sl->state, cur, loop ? RANGE_WITHIN : NOT_EXACT)) { hit: sl->hit_cnt++; /* reached equivalent register/stack state, @@ -19282,6 +19490,94 @@ hit: err = err ? : propagate_precision(env, &sl->state, cur, NULL); if (err) return err; + /* When processing iterator based loops above propagate_liveness and + * propagate_precision calls are not sufficient to transfer all relevant + * read and precision marks. E.g. consider the following case: + * + * .-> A --. Assume the states are visited in the order A, B, C. + * | | | Assume that state B reaches a state equivalent to state A. + * | v v At this point, state C is not processed yet, so state A + * '-- B C has not received any read or precision marks from C. + * Thus, marks propagated from A to B are incomplete. + * + * The verifier mitigates this by performing the following steps: + * + * - Prior to the main verification pass, strongly connected components + * (SCCs) are computed over the program's control flow graph, + * intraprocedurally. + * + * - During the main verification pass, `maybe_enter_scc()` checks + * whether the current verifier state is entering an SCC. If so, an + * instance of a `bpf_scc_visit` object is created, and the state + * entering the SCC is recorded as the entry state. + * + * - This instance is associated not with the SCC itself, but with a + * `bpf_scc_callchain`: a tuple consisting of the call sites leading to + * the SCC and the SCC id. See `compute_scc_callchain()`. + * + * - When a verification path encounters a `states_equal(..., + * RANGE_WITHIN)` condition, there exists a call chain describing the + * current state and a corresponding `bpf_scc_visit` instance. A copy + * of the current state is created and added to + * `bpf_scc_visit->backedges`. + * + * - When a verification path terminates, `maybe_exit_scc()` is called + * from `update_branch_counts()`. For states with `branches == 0`, it + * checks whether the state is the entry state of any `bpf_scc_visit` + * instance. If it is, this indicates that all paths originating from + * this SCC visit have been explored. `propagate_backedges()` is then + * called, which propagates read and precision marks through the + * backedges until a fixed point is reached. + * (In the earlier example, this would propagate marks from A to B, + * from C to A, and then again from A to B.) + * + * A note on callchains + * -------------------- + * + * Consider the following example: + * + * void foo() { loop { ... SCC#1 ... } } + * void main() { + * A: foo(); + * B: ... + * C: foo(); + * } + * + * Here, there are two distinct callchains leading to SCC#1: + * - (A, SCC#1) + * - (C, SCC#1) + * + * Each callchain identifies a separate `bpf_scc_visit` instance that + * accumulates backedge states. The `propagate_{liveness,precision}()` + * functions traverse the parent state of each backedge state, which + * means these parent states must remain valid (i.e., not freed) while + * the corresponding `bpf_scc_visit` instance exists. 
+			 *
+			 * Associating `bpf_scc_visit` instances directly with SCCs instead of
+			 * callchains would break this invariant:
+			 * - States explored during `C: foo()` would contribute backedges to
+			 *   SCC#1, but SCC#1 would only be exited once the exploration of
+			 *   `A: foo()` completes.
+			 * - By that time, the states explored between `A: foo()` and `C: foo()`
+			 *   (i.e., `B: ...`) may have already been freed, causing the parent
+			 *   links for states from `C: foo()` to become invalid.
+			 */
+			if (loop) {
+				struct bpf_scc_backedge *backedge;
+
+				backedge = kzalloc(sizeof(*backedge), GFP_KERNEL);
+				if (!backedge)
+					return -ENOMEM;
+				err = copy_verifier_state(&backedge->state, cur);
+				backedge->state.equal_state = &sl->state;
+				backedge->state.insn_idx = insn_idx;
+				err = err ?: add_scc_backedge(env, &sl->state, backedge);
+				if (err) {
+					free_verifier_state(&backedge->state, false);
+					kvfree(backedge);
+					return err;
+				}
+			}
 			return 1;
 		}
 miss:
@@ -19357,6 +19653,12 @@ miss:
 	new->insn_idx = insn_idx;
 	WARN_ONCE(new->branches != 1,
 		  "BUG is_state_visited:branches_to_explore=%d insn %d\n",
 		  new->branches, insn_idx);
+	err = maybe_enter_scc(env, new);
+	if (err) {
+		free_verifier_state(new, false);
+		kvfree(new_sl);
+		return err;
+	}
 
 	cur->parent = new;
 	cur->first_insn_idx = insn_idx;
@@ -19811,7 +20113,9 @@ static int do_check(struct bpf_verifier_env *env)
 				WARN_ON_ONCE(env->insn_idx != prev_insn_idx + 1);
 process_bpf_exit:
 				mark_verifier_state_scratched(env);
-				update_branch_counts(env, env->cur_state);
+				err = update_branch_counts(env, env->cur_state);
+				if (err)
+					return err;
 				err = pop_stack(env, &prev_insn_idx,
 						&env->insn_idx, pop_log);
 				if (err < 0) {
@@ -19819,9 +20123,6 @@ process_bpf_exit:
 						return err;
 					break;
 				} else {
-					if (verifier_bug_if(env->cur_state->loop_entry, env,
-							    "broken loop detection"))
-						return -EFAULT;
 					do_print_state = true;
 					continue;
 				}
@@ -22787,7 +23088,8 @@ static void free_states(struct bpf_verifier_env *env)
 {
 	struct bpf_verifier_state_list *sl;
 	struct list_head *head, *pos, *tmp;
-	int i;
+	struct bpf_scc_info *info;
+	int i, j;
 
 	list_for_each_safe(pos, tmp, &env->free_list) {
 		sl = container_of(pos, struct bpf_verifier_state_list, node);
 		free_verifier_state(&sl->state, false);
 		kvfree(sl);
 	}
 	INIT_LIST_HEAD(&env->free_list);
 
+	for (i = 0; i < env->scc_cnt; ++i) {
+		info = env->scc_info[i];
+		for (j = 0; j < (info ? info->num_visits : 0); j++)
+			free_backedges(&info->visits[j]);
+		kvfree(info);
+		env->scc_info[i] = NULL;
+	}
+
 	if (!env->explored_states)
 		return;
@@ -24228,6 +24538,11 @@ dfs_continue:
 			dfs_sz--;
 		}
 	}
+	env->scc_info = kvcalloc(next_scc_id, sizeof(*env->scc_info), GFP_KERNEL);
+	if (!env->scc_info) {
+		err = -ENOMEM;
+		goto exit;
+	}
 exit:
 	kvfree(stack);
 	kvfree(pre);
@@ -24503,6 +24818,7 @@ err_unlock:
 	vfree(env->insn_aux_data);
 err_free_env:
 	kvfree(env->cfg.insn_postorder);
+	kvfree(env->scc_info);
 	kvfree(env);
 	return ret;
 }
--
cgit v1.2.3


From 0e0da5f901f582b97bfeefbf1f36a27e9d427ff4 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman
Date: Wed, 11 Jun 2025 13:08:34 -0700
Subject: bpf: remove {update,get}_loop_entry functions

The previous patch switched read and precision tracking for
iterator-based loops from state-graph-based loop tracking to
control-flow-graph-based loop tracking.

This patch removes the now-unused `update_loop_entry()` and
`get_loop_entry()` functions, which were part of the state-graph-based
logic.
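With loop tracking based on the control flow graph, the question "is this
state inside a loop?" reduces to a lookup of the SCC id precomputed for the
state's instruction, instead of chasing ->loop_entry links through the state
graph. For readers unfamiliar with the technique, a minimal, self-contained
sketch of an SCC computation follows (textbook recursive Tarjan over a toy
CFG; illustrative only, not the verifier's code, which must avoid recursion):

    #include <stdio.h>

    #define N 6                      /* toy CFG: 6 "instructions" */
    static const int succ[N][2] = {  /* up to two successors, -1 = none */
            {1, -1}, {2, 4}, {3, -1}, {1, -1}, {5, -1}, {-1, -1},
    };

    static int dfn[N], low[N], scc_id[N], on_stack[N], stk[N];
    static int dfs_clock = 1, top, next_scc = 1;

    static void dfs(int u)
    {
            int i, v;

            dfn[u] = low[u] = dfs_clock++;
            stk[top++] = u;
            on_stack[u] = 1;
            for (i = 0; i < 2; i++) {
                    v = succ[u][i];
                    if (v < 0)
                            continue;
                    if (!dfn[v]) {
                            /* tree edge: explore, then inherit reachability */
                            dfs(v);
                            if (low[v] < low[u])
                                    low[u] = low[v];
                    } else if (on_stack[v] && dfn[v] < low[u]) {
                            /* back edge into the current DFS path */
                            low[u] = dfn[v];
                    }
            }
            if (low[u] == dfn[u]) {  /* u is the root of an SCC: pop it */
                    do {
                            v = stk[--top];
                            on_stack[v] = 0;
                            scc_id[v] = next_scc;
                    } while (v != u);
                    next_scc++;
            }
    }

    int main(void)
    {
            int u;

            for (u = 0; u < N; u++)
                    if (!dfn[u])
                            dfs(u);
            for (u = 0; u < N; u++)
                    printf("insn %d: scc %d\n", u, scc_id[u]);
            return 0;
    }

In this toy CFG only instructions 1, 2 and 3 form a cycle (1 -> 2 -> 3 -> 1),
so they are the only nodes that end up sharing an SCC id; states at those
instructions would be the ones subject to backedge accumulation.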
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250611200836.4135542-9-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 165 +------------------------------------------------- 1 file changed, 1 insertion(+), 164 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index aa1bb4be7b8b..48847f8da5b1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1682,7 +1682,7 @@ static void free_verifier_state(struct bpf_verifier_state *state, kfree(state); } -/* struct bpf_verifier_state->{parent,loop_entry} refer to states +/* struct bpf_verifier_state->parent refers to states * that are in either of env->{expored_states,free_list}. * In both cases the state is contained in struct bpf_verifier_state_list. */ @@ -1693,22 +1693,12 @@ static struct bpf_verifier_state_list *state_parent_as_list(struct bpf_verifier_ return NULL; } -static struct bpf_verifier_state_list *state_loop_entry_as_list(struct bpf_verifier_state *st) -{ - if (st->loop_entry) - return container_of(st->loop_entry, struct bpf_verifier_state_list, state); - return NULL; -} - static bool incomplete_read_marks(struct bpf_verifier_env *env, struct bpf_verifier_state *st); /* A state can be freed if it is no longer referenced: * - is in the env->free_list; * - has no children states; - * - is not used as loop_entry. - * - * Freeing a state can make it's loop_entry free-able. */ static void maybe_free_verifier_state(struct bpf_verifier_env *env, struct bpf_verifier_state_list *sl) @@ -1765,9 +1755,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->last_insn_idx = src->last_insn_idx; dst_state->dfs_depth = src->dfs_depth; dst_state->callback_unroll_depth = src->callback_unroll_depth; - dst_state->used_as_loop_entry = src->used_as_loop_entry; dst_state->may_goto_depth = src->may_goto_depth; - dst_state->loop_entry = src->loop_entry; dst_state->equal_state = src->equal_state; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; @@ -1811,157 +1799,6 @@ static bool same_callsites(struct bpf_verifier_state *a, struct bpf_verifier_sta return true; } -/* Open coded iterators allow back-edges in the state graph in order to - * check unbounded loops that iterators. - * - * In is_state_visited() it is necessary to know if explored states are - * part of some loops in order to decide whether non-exact states - * comparison could be used: - * - non-exact states comparison establishes sub-state relation and uses - * read and precision marks to do so, these marks are propagated from - * children states and thus are not guaranteed to be final in a loop; - * - exact states comparison just checks if current and explored states - * are identical (and thus form a back-edge). - * - * Paper "A New Algorithm for Identifying Loops in Decompilation" - * by Tao Wei, Jian Mao, Wei Zou and Yu Chen [1] presents a convenient - * algorithm for loop structure detection and gives an overview of - * relevant terminology. It also has helpful illustrations. - * - * [1] https://api.semanticscholar.org/CorpusID:15784067 - * - * We use a similar algorithm but because loop nested structure is - * irrelevant for verifier ours is significantly simpler and resembles - * strongly connected components algorithm from Sedgewick's textbook. - * - * Define topmost loop entry as a first node of the loop traversed in a - * depth first search starting from initial state. 
The goal of the loop - * tracking algorithm is to associate topmost loop entries with states - * derived from these entries. - * - * For each step in the DFS states traversal algorithm needs to identify - * the following situations: - * - * initial initial initial - * | | | - * V V V - * ... ... .---------> hdr - * | | | | - * V V | V - * cur .-> succ | .------... - * | | | | | | - * V | V | V V - * succ '-- cur | ... ... - * | | | - * | V V - * | succ <- cur - * | | - * | V - * | ... - * | | - * '----' - * - * (A) successor state of cur (B) successor state of cur or it's entry - * not yet traversed are in current DFS path, thus cur and succ - * are members of the same outermost loop - * - * initial initial - * | | - * V V - * ... ... - * | | - * V V - * .------... .------... - * | | | | - * V V V V - * .-> hdr ... ... ... - * | | | | | - * | V V V V - * | succ <- cur succ <- cur - * | | | - * | V V - * | ... ... - * | | | - * '----' exit - * - * (C) successor state of cur is a part of some loop but this loop - * does not include cur or successor state is not in a loop at all. - * - * Algorithm could be described as the following python code: - * - * traversed = set() # Set of traversed nodes - * entries = {} # Mapping from node to loop entry - * depths = {} # Depth level assigned to graph node - * path = set() # Current DFS path - * - * # Find outermost loop entry known for n - * def get_loop_entry(n): - * h = entries.get(n, None) - * while h in entries: - * h = entries[h] - * return h - * - * # Update n's loop entry if h comes before n in current DFS path. - * def update_loop_entry(n, h): - * if h in path and depths[entries.get(n, n)] < depths[n]: - * entries[n] = h1 - * - * def dfs(n, depth): - * traversed.add(n) - * path.add(n) - * depths[n] = depth - * for succ in G.successors(n): - * if succ not in traversed: - * # Case A: explore succ and update cur's loop entry - * # only if succ's entry is in current DFS path. - * dfs(succ, depth + 1) - * h = entries.get(succ, None) - * update_loop_entry(n, h) - * else: - * # Case B or C depending on `h1 in path` check in update_loop_entry(). - * update_loop_entry(n, succ) - * path.remove(n) - * - * To adapt this algorithm for use with verifier: - * - use st->branch == 0 as a signal that DFS of succ had been finished - * and cur's loop entry has to be updated (case A), handle this in - * update_branch_counts(); - * - use st->branch > 0 as a signal that st is in the current DFS path; - * - handle cases B and C in is_state_visited(). - */ -static struct bpf_verifier_state *get_loop_entry(struct bpf_verifier_env *env, - struct bpf_verifier_state *st) -{ - struct bpf_verifier_state *topmost = st->loop_entry; - u32 steps = 0; - - while (topmost && topmost->loop_entry) { - if (verifier_bug_if(steps++ > st->dfs_depth, env, "infinite loop")) - return ERR_PTR(-EFAULT); - topmost = topmost->loop_entry; - } - return topmost; -} - -static void update_loop_entry(struct bpf_verifier_env *env, - struct bpf_verifier_state *cur, struct bpf_verifier_state *hdr) -{ - /* The hdr->branches check decides between cases B and C in - * comment for get_loop_entry(). If hdr->branches == 0 then - * head's topmost loop entry is not in current DFS path, - * hence 'cur' and 'hdr' are not in the same loop and there is - * no need to update cur->loop_entry. 
- */ - if (hdr->branches && hdr->dfs_depth < (cur->loop_entry ?: cur)->dfs_depth) { - if (cur->loop_entry) { - cur->loop_entry->used_as_loop_entry--; - maybe_free_verifier_state(env, state_loop_entry_as_list(cur)); - } - cur->loop_entry = hdr; - hdr->used_as_loop_entry++; - } -} - /* Return IP for a given frame in a call stack */ static u32 frame_insn_idx(struct bpf_verifier_state *st, u32 frame) { -- cgit v1.2.3 From 0f54ff54700315caa8ed3bea36fa0ff3ebc53f56 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Wed, 11 Jun 2025 13:08:35 -0700 Subject: bpf: include backedges in peak_states stat Count states accumulated in bpf_scc_visit->backedges in env->peak_states. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250611200836.4135542-10-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 48847f8da5b1..1d3277bf935e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1648,7 +1648,7 @@ static void update_peak_states(struct bpf_verifier_env *env) { u32 cur_states; - cur_states = env->explored_states_size + env->free_list_size; + cur_states = env->explored_states_size + env->free_list_size + env->num_backedges; env->peak_states = max(env->peak_states, cur_states); } @@ -1949,6 +1949,9 @@ static int maybe_exit_scc(struct bpf_verifier_env *env, struct bpf_verifier_stat if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "SCC exit %s\n", format_callchain(env, &callchain)); visit->entry_state = NULL; + env->num_backedges -= visit->num_backedges; + visit->num_backedges = 0; + update_peak_states(env); return propagate_backedges(env, visit); } @@ -1977,6 +1980,9 @@ static int add_scc_backedge(struct bpf_verifier_env *env, verbose(env, "SCC backedge %s\n", format_callchain(env, &callchain)); backedge->next = visit->backedges; visit->backedges = backedge; + visit->num_backedges++; + env->num_backedges++; + update_peak_states(env); return 0; } -- cgit v1.2.3 From 3d71b8b9abae68f6dfc434f779e1139370fbe891 Mon Sep 17 00:00:00 2001 From: Luis Gerhorst Date: Wed, 11 Jun 2025 23:07:28 +0200 Subject: bpf: Fix state use-after-free on push_stack() err Without this, `state->speculative` is used after the cleanup cycles in push_stack() or push_async_cb() freed `env->cur_state` (i.e., `state`). Avoid this by relying on the short-circuit logic to only access `state` if the error is recoverable (and make sure it never is after push_*() failed). push_*() callers must always return an error for which error_recoverable_with_nospec(err) is false if push_*() returns NULL, otherwise we try to recover and access the stale `state`. This is only violated by sanitize_ptr_alu(), thus also fix this case to return -ENOMEM. state->speculative does not make sense if the error path of push_*() ran. In that case, `state->speculative && error_recoverable_with_nospec(err)` as a whole should already never evaluate to true (because all cases where push_stack() fails must return -ENOMEM/-EFAULT). As mentioned, this is only violated by the push_stack() call in sanitize_speculative_path() which returns -EACCES without [1] (through REASON_STACK in sanitize_err() after sanitize_ptr_alu()). To fix this, return -ENOMEM for REASON_STACK (which is also the behavior we will have after [1]). Checked that it fixes the syzbot reproducer as expected. 
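The hazard is small enough to model in a standalone toy (not the kernel
code; error_recoverable() is a stand-in for error_recoverable_with_nospec(),
with the kernel's negative-errno convention kept):

    #include <errno.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct vstate { bool speculative; };

    /* stand-in: "soft" verification errors may be recovered via nospec;
     * allocation failures (-ENOMEM/-EFAULT) must never be */
    static bool error_recoverable(int err)
    {
            return err == -EACCES;
    }

    int main(void)
    {
            struct vstate *state = malloc(sizeof(*state));
            int err;

            if (!state)
                    return 1;
            state->speculative = true;

            /* model push_stack() failing: its error path frees the current
             * state and hands the caller a non-recoverable error code */
            free(state);
            state = NULL;
            err = -ENOMEM;

            /* "state->speculative && error_recoverable(err)" would read the
             * freed (here: NULLed) state first; testing the error code first
             * lets && short-circuit before state is ever dereferenced */
            if (error_recoverable(err) && state->speculative)
                    puts("re-verify along a nospec path");
            else
                    printf("abort verification: %d\n", err);
            return 0;
    }

The invariant the fix establishes is the same as in this sketch: whenever
push_*() fails, the error code alone already rules recovery out, so the
possibly stale state pointer is never consulted.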
[1] https://lore.kernel.org/all/20250603213232.339242-1-luis.gerhorst@fau.de/ Fixes: d6f1c85f2253 ("bpf: Fall back to nospec for Spectre v1") Reported-by: syzbot+b5eb72a560b8149a1885@syzkaller.appspotmail.com Reported-by: Eduard Zingerman Link: https://lore.kernel.org/all/38862a832b91382cddb083dddd92643bed0723b8.camel@gmail.com/ Signed-off-by: Luis Gerhorst Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250611210728.266563-1-luis.gerhorst@fau.de Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1d3277bf935e..14dd836acb13 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14293,7 +14293,7 @@ static int sanitize_err(struct bpf_verifier_env *env, case REASON_STACK: verbose(env, "R%d could not be pushed for speculative verification, %s\n", dst, err); - break; + return -ENOMEM; default: verbose(env, "verifier internal error: unknown reason (%d)\n", reason); @@ -19926,7 +19926,7 @@ static int do_check(struct bpf_verifier_env *env) goto process_bpf_exit; err = do_check_insn(env, &do_print_state); - if (state->speculative && error_recoverable_with_nospec(err)) { + if (error_recoverable_with_nospec(err) && state->speculative) { /* Prevent this speculative path from ever reaching the * insn that would have been unsafe to execute. */ -- cgit v1.2.3 From fa6932577c073497379a1f5901ea5b208a38da10 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 12 Jun 2025 15:11:00 -0700 Subject: bpf: Initialize used but uninit variable in propagate_liveness() With input changed == NULL, a local variable is used for "changed". Initialize tmp properly, so that it can be used in the following: *changed |= err > 0; Otherwise, UBSAN will complain: UBSAN: invalid-load in kernel/bpf/verifier.c:18924:4 load of value is not a valid value for type '_Bool' Fixes: dfb2d4c64b82 ("bpf: set 'changed' status if propagate_liveness() did any updates") Signed-off-by: Song Liu Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250612221100.2153401-1-song@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 14dd836acb13..c378074516cf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18900,7 +18900,7 @@ static int propagate_liveness(struct bpf_verifier_env *env, struct bpf_reg_state *state_reg, *parent_reg; struct bpf_func_state *state, *parent; int i, frame, err = 0; - bool tmp; + bool tmp = false; changed = changed ?: &tmp; if (vparent->curframe != vstate->curframe) { -- cgit v1.2.3 From 69ab14ee525649a875ca187cad2f05a2bec1ac3c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:42 +0200 Subject: sched: Clean up and standardize #if/#else/#endif markers in sched/autogroup.[ch] - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... 
#endif /* !CONFIG_FOO */ Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-2-mingo@kernel.org --- kernel/sched/autogroup.c | 6 +++--- kernel/sched/autogroup.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index e96a167c3f28..cdea931aae30 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -28,9 +28,9 @@ static void __init sched_autogroup_sysctl_init(void) { register_sysctl_init("kernel", sched_autogroup_sysctls); } -#else +#else /* !CONFIG_SYSCTL: */ #define sched_autogroup_sysctl_init() do { } while (0) -#endif +#endif /* !CONFIG_SYSCTL */ void __init autogroup_init(struct task_struct *init_task) { @@ -111,7 +111,7 @@ static inline struct autogroup *autogroup_create(void) free_rt_sched_group(tg); tg->rt_se = root_task_group.rt_se; tg->rt_rq = root_task_group.rt_rq; -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ tg->autogroup = ag; sched_online_group(tg, &root_task_group); diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h index 5c1796a463af..06c82b2bdfb5 100644 --- a/kernel/sched/autogroup.h +++ b/kernel/sched/autogroup.h @@ -43,7 +43,7 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg) extern int autogroup_path(struct task_group *tg, char *buf, int buflen); -#else /* !CONFIG_SCHED_AUTOGROUP */ +#else /* !CONFIG_SCHED_AUTOGROUP: */ static inline void autogroup_init(struct task_struct *init_task) { } static inline void autogroup_free(struct task_group *tg) { } @@ -63,6 +63,6 @@ static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) return 0; } -#endif /* CONFIG_SCHED_AUTOGROUP */ +#endif /* !CONFIG_SCHED_AUTOGROUP */ #endif /* _KERNEL_SCHED_AUTOGROUP_H */ -- cgit v1.2.3 From bbb1b274e85b739ade5f9bc060a8c4fa00388e29 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:43 +0200 Subject: sched: Clean up and standardize #if/#else/#endif markers in sched/clock.c - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... 
#endif /* !CONFIG_FOO */ Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-3-mingo@kernel.org --- kernel/sched/clock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index e62b5510cd2e..f5e6dd6a6b3a 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -474,7 +474,7 @@ notrace void sched_clock_idle_wakeup_event(void) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ +#else /* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK: */ void __init sched_clock_init(void) { @@ -492,7 +492,7 @@ notrace u64 sched_clock_cpu(int cpu) return sched_clock(); } -#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ +#endif /* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ /* * Running clock - returns the time that has elapsed while a guest has been -- cgit v1.2.3 From b7ebb758568b60ae816b0c21df70a67eecb84a71 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:44 +0200 Subject: sched: Clean up and standardize #if/#else/#endif markers in sched/core.c - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... #endif /* !CONFIG_FOO */ - Apply this simplification: -#if defined(CONFIG_FOO) +#ifdef CONFIG_FOO - Fix whitespace noise. - Use vertical alignment to better visualize nested #ifdef blocks, where appropriate: #ifdef CONFIG_FOO # ifdef CONFIG_BAR ... # endif #endif Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-4-mingo@kernel.org --- kernel/sched/core.c | 186 ++++++++++++++++++++++++++-------------------------- 1 file changed, 93 insertions(+), 93 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dce50fa57471..f1ef6d29792c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -481,13 +481,13 @@ void sched_core_put(void) schedule_work(&_work); } -#else /* !CONFIG_SCHED_CORE */ +#else /* !CONFIG_SCHED_CORE: */ static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { } static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } -#endif /* CONFIG_SCHED_CORE */ +#endif /* !CONFIG_SCHED_CORE */ /* need a wrapper since we may need to trace from modules */ EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp); @@ -667,7 +667,7 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2) double_rq_clock_clear_update(rq1, rq2); } -#endif +#endif /* CONFIG_SMP */ /* * __task_rq_lock - lock the rq @p resides on. @@ -899,7 +899,7 @@ void hrtick_start(struct rq *rq, u64 delay) smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); } -#else +#else /* !CONFIG_SMP: */ /* * Called to set the hrtick timer state. 
* @@ -916,7 +916,7 @@ void hrtick_start(struct rq *rq, u64 delay) HRTIMER_MODE_REL_PINNED_HARD); } -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP */ static void hrtick_rq_init(struct rq *rq) { @@ -925,7 +925,7 @@ static void hrtick_rq_init(struct rq *rq) #endif hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } -#else /* CONFIG_SCHED_HRTICK */ +#else /* !CONFIG_SCHED_HRTICK: */ static inline void hrtick_clear(struct rq *rq) { } @@ -933,7 +933,7 @@ static inline void hrtick_clear(struct rq *rq) static inline void hrtick_rq_init(struct rq *rq) { } -#endif /* CONFIG_SCHED_HRTICK */ +#endif /* !CONFIG_SCHED_HRTICK */ /* * try_cmpxchg based fetch_or() macro so it works for different integer types: @@ -1971,7 +1971,7 @@ undo: sysctl_sched_uclamp_util_min_rt_default = old_min_rt; return result; } -#endif +#endif /* CONFIG_SYSCTL */ static void uclamp_fork(struct task_struct *p) { @@ -2037,13 +2037,13 @@ static void __init init_uclamp(void) } } -#else /* !CONFIG_UCLAMP_TASK */ +#else /* !CONFIG_UCLAMP_TASK: */ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p, int flags) { } static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } static inline void uclamp_fork(struct task_struct *p) { } static inline void uclamp_post_fork(struct task_struct *p) { } static inline void init_uclamp(void) { } -#endif /* CONFIG_UCLAMP_TASK */ +#endif /* !CONFIG_UCLAMP_TASK */ bool sched_task_on_rq(struct task_struct *p) { @@ -3661,7 +3661,7 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) } } -#else /* CONFIG_SMP */ +#else /* !CONFIG_SMP: */ static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } @@ -3770,7 +3770,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, rq->idle_stamp = 0; } -#endif +#endif /* CONFIG_SMP */ } /* @@ -3992,14 +3992,14 @@ static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) return false; } -#else /* !CONFIG_SMP */ +#else /* !CONFIG_SMP: */ static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) { return false; } -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP */ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) { @@ -4335,9 +4335,9 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) psi_ttwu_dequeue(p); set_task_cpu(p, cpu); } -#else +#else /* !CONFIG_SMP: */ cpu = task_cpu(p); -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP */ ttwu_queue(p, cpu, wake_flags); } @@ -4599,8 +4599,8 @@ static int sysctl_numa_balancing(const struct ctl_table *table, int write, } return err; } -#endif -#endif +#endif /* CONFIG_PROC_SYSCTL */ +#endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_SCHEDSTATS @@ -4787,7 +4787,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif -#if defined(CONFIG_SMP) +#ifdef CONFIG_SMP p->on_cpu = 0; #endif init_task_preempt_count(p); @@ -4978,7 +4978,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, __fire_sched_out_preempt_notifiers(curr, next); } -#else /* !CONFIG_PREEMPT_NOTIFIERS */ +#else /* !CONFIG_PREEMPT_NOTIFIERS: */ static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) { @@ -4990,7 +4990,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, { } -#endif /* CONFIG_PREEMPT_NOTIFIERS */ +#endif /* !CONFIG_PREEMPT_NOTIFIERS */ static inline void prepare_task(struct task_struct *next) { @@ 
-5107,13 +5107,13 @@ void balance_callbacks(struct rq *rq, struct balance_callback *head) } } -#else +#else /* !CONFIG_SMP: */ static inline void __balance_callbacks(struct rq *rq) { } -#endif +#endif /* !CONFIG_SMP */ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf) @@ -5527,7 +5527,7 @@ void sched_exec(void) stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); } -#endif +#endif /* CONFIG_SMP */ DEFINE_PER_CPU(struct kernel_stat, kstat); DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); @@ -5835,10 +5835,10 @@ int __init sched_tick_offload_init(void) return 0; } -#else /* !CONFIG_NO_HZ_FULL */ +#else /* !CONFIG_NO_HZ_FULL: */ static inline void sched_tick_start(int cpu) { } static inline void sched_tick_stop(int cpu) { } -#endif +#endif /* !CONFIG_NO_HZ_FULL */ #if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_TRACE_PREEMPT_TOGGLE)) @@ -6553,7 +6553,7 @@ static inline void sched_core_cpu_dying(unsigned int cpu) rq->core = rq; } -#else /* !CONFIG_SCHED_CORE */ +#else /* !CONFIG_SCHED_CORE: */ static inline void sched_core_cpu_starting(unsigned int cpu) {} static inline void sched_core_cpu_deactivate(unsigned int cpu) {} @@ -6565,7 +6565,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return __pick_next_task(rq, prev, rf); } -#endif /* CONFIG_SCHED_CORE */ +#endif /* !CONFIG_SCHED_CORE */ /* * Constants for the sched_mode argument of __schedule(). @@ -6992,14 +6992,14 @@ NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); #ifdef CONFIG_PREEMPT_DYNAMIC -#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -#ifndef preempt_schedule_dynamic_enabled -#define preempt_schedule_dynamic_enabled preempt_schedule -#define preempt_schedule_dynamic_disabled NULL -#endif +# ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL +# ifndef preempt_schedule_dynamic_enabled +# define preempt_schedule_dynamic_enabled preempt_schedule +# define preempt_schedule_dynamic_disabled NULL +# endif DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled); EXPORT_STATIC_CALL_TRAMP(preempt_schedule); -#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule); void __sched notrace dynamic_preempt_schedule(void) { @@ -7009,8 +7009,8 @@ void __sched notrace dynamic_preempt_schedule(void) } NOKPROBE_SYMBOL(dynamic_preempt_schedule); EXPORT_SYMBOL(dynamic_preempt_schedule); -#endif -#endif +# endif +#endif /* CONFIG_PREEMPT_DYNAMIC */ /** * preempt_schedule_notrace - preempt_schedule called by tracing @@ -7065,14 +7065,14 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) EXPORT_SYMBOL_GPL(preempt_schedule_notrace); #ifdef CONFIG_PREEMPT_DYNAMIC -#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -#ifndef preempt_schedule_notrace_dynamic_enabled -#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace -#define preempt_schedule_notrace_dynamic_disabled NULL -#endif +# if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +# ifndef preempt_schedule_notrace_dynamic_enabled +# define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace +# define preempt_schedule_notrace_dynamic_disabled NULL +# endif DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled); EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); -#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) static 
DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace); void __sched notrace dynamic_preempt_schedule_notrace(void) { @@ -7082,7 +7082,7 @@ void __sched notrace dynamic_preempt_schedule_notrace(void) } NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace); EXPORT_SYMBOL(dynamic_preempt_schedule_notrace); -#endif +# endif #endif #endif /* CONFIG_PREEMPTION */ @@ -7301,7 +7301,7 @@ out_unlock: preempt_enable(); } -#endif +#endif /* CONFIG_RT_MUTEXES */ #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) int __sched __cond_resched(void) @@ -7332,17 +7332,17 @@ EXPORT_SYMBOL(__cond_resched); #endif #ifdef CONFIG_PREEMPT_DYNAMIC -#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -#define cond_resched_dynamic_enabled __cond_resched -#define cond_resched_dynamic_disabled ((void *)&__static_call_return0) +# ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL +# define cond_resched_dynamic_enabled __cond_resched +# define cond_resched_dynamic_disabled ((void *)&__static_call_return0) DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched); EXPORT_STATIC_CALL_TRAMP(cond_resched); -#define might_resched_dynamic_enabled __cond_resched -#define might_resched_dynamic_disabled ((void *)&__static_call_return0) +# define might_resched_dynamic_enabled __cond_resched +# define might_resched_dynamic_disabled ((void *)&__static_call_return0) DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched); EXPORT_STATIC_CALL_TRAMP(might_resched); -#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched); int __sched dynamic_cond_resched(void) { @@ -7360,8 +7360,8 @@ int __sched dynamic_might_resched(void) return __cond_resched(); } EXPORT_SYMBOL(dynamic_might_resched); -#endif -#endif +# endif +#endif /* CONFIG_PREEMPT_DYNAMIC */ /* * __cond_resched_lock() - if a reschedule is pending, drop the given lock, @@ -7427,9 +7427,9 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); #ifdef CONFIG_PREEMPT_DYNAMIC -#ifdef CONFIG_GENERIC_ENTRY -#include -#endif +# ifdef CONFIG_GENERIC_ENTRY +# include +# endif /* * SC:cond_resched @@ -7484,37 +7484,37 @@ int preempt_dynamic_mode = preempt_dynamic_undefined; int sched_dynamic_mode(const char *str) { -#ifndef CONFIG_PREEMPT_RT +# ifndef CONFIG_PREEMPT_RT if (!strcmp(str, "none")) return preempt_dynamic_none; if (!strcmp(str, "voluntary")) return preempt_dynamic_voluntary; -#endif +# endif if (!strcmp(str, "full")) return preempt_dynamic_full; -#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY +# ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY if (!strcmp(str, "lazy")) return preempt_dynamic_lazy; -#endif +# endif return -EINVAL; } -#define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key) -#define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key) +# define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key) +# define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key) -#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) -#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) -#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) -#define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f) -#define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f) -#else -#error "Unsupported PREEMPT_DYNAMIC mechanism" -#endif +# if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +# define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) +# define 
preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) +# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +# define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f) +# define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f) +# else +# error "Unsupported PREEMPT_DYNAMIC mechanism" +# endif static DEFINE_MUTEX(sched_dynamic_mutex); @@ -7618,7 +7618,7 @@ static void __init preempt_dynamic_init(void) } } -#define PREEMPT_MODEL_ACCESSOR(mode) \ +# define PREEMPT_MODEL_ACCESSOR(mode) \ bool preempt_model_##mode(void) \ { \ WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \ @@ -8123,7 +8123,7 @@ static void balance_hotplug_wait(void) TASK_UNINTERRUPTIBLE); } -#else +#else /* !CONFIG_HOTPLUG_CPU: */ static inline void balance_push(struct rq *rq) { @@ -8137,7 +8137,7 @@ static inline void balance_hotplug_wait(void) { } -#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* !CONFIG_HOTPLUG_CPU */ void set_rq_online(struct rq *rq) { @@ -8446,7 +8446,7 @@ int sched_cpu_dying(unsigned int cpu) sched_core_cpu_dying(cpu); return 0; } -#endif +#endif /* CONFIG_HOTPLUG_CPU */ void __init sched_init_smp(void) { @@ -8480,12 +8480,12 @@ static int __init migration_init(void) } early_initcall(migration_init); -#else +#else /* !CONFIG_SMP: */ void __init sched_init_smp(void) { sched_init_granularity(); } -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP */ int in_sched_functions(unsigned long addr) { @@ -8637,15 +8637,15 @@ void __init sched_init(void) INIT_LIST_HEAD(&rq->cfs_tasks); rq_attach_root(rq, &def_root_domain); -#ifdef CONFIG_NO_HZ_COMMON +# ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; atomic_set(&rq->nohz_flags, 0); INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq); -#endif -#ifdef CONFIG_HOTPLUG_CPU +# endif +# ifdef CONFIG_HOTPLUG_CPU rcuwait_init(&rq->hotplug_wait); -#endif +# endif #endif /* CONFIG_SMP */ hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); @@ -8830,7 +8830,7 @@ void __cant_sleep(const char *file, int line, int preempt_offset) } EXPORT_SYMBOL_GPL(__cant_sleep); -#ifdef CONFIG_SMP +# ifdef CONFIG_SMP void __cant_migrate(const char *file, int line) { static unsigned long prev_jiffy; @@ -8861,8 +8861,8 @@ void __cant_migrate(const char *file, int line) add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } EXPORT_SYMBOL_GPL(__cant_migrate); -#endif -#endif +# endif /* CONFIG_SMP */ +#endif /* CONFIG_DEBUG_ATOMIC_SLEEP */ #ifdef CONFIG_MAGIC_SYSRQ void normalize_rt_tasks(void) @@ -8902,7 +8902,7 @@ void normalize_rt_tasks(void) #endif /* CONFIG_MAGIC_SYSRQ */ -#if defined(CONFIG_KGDB_KDB) +#ifdef CONFIG_KGDB_KDB /* * These functions are only useful for KDB. 
* @@ -8926,7 +8926,7 @@ struct task_struct *curr_task(int cpu) return cpu_curr(cpu); } -#endif /* defined(CONFIG_KGDB_KDB) */ +#endif /* CONFIG_KGDB_KDB */ #ifdef CONFIG_CGROUP_SCHED /* task_group_lock serializes the addition/removal of task groups */ @@ -9807,7 +9807,7 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, scx_group_set_idle(css_tg(css), idle); return ret; } -#endif +#endif /* CONFIG_GROUP_SCHED_WEIGHT */ static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_GROUP_SCHED_WEIGHT @@ -9935,7 +9935,7 @@ static int cpu_extra_stat_show(struct seq_file *sf, cfs_b->nr_periods, cfs_b->nr_throttled, throttled_usec, cfs_b->nr_burst, burst_usec); } -#endif +#endif /* CONFIG_CFS_BANDWIDTH */ return 0; } @@ -10076,7 +10076,7 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, ret = tg_set_cfs_bandwidth(tg, period, quota, burst); return ret ?: nbytes; } -#endif +#endif /* CONFIG_CFS_BANDWIDTH */ static struct cftype cpu_files[] = { #ifdef CONFIG_GROUP_SCHED_WEIGHT @@ -10112,7 +10112,7 @@ static struct cftype cpu_files[] = { .read_u64 = cpu_cfs_burst_read_u64, .write_u64 = cpu_cfs_burst_write_u64, }, -#endif +#endif /* CONFIG_CFS_BANDWIDTH */ #ifdef CONFIG_UCLAMP_TASK_GROUP { .name = "uclamp.min", @@ -10126,7 +10126,7 @@ static struct cftype cpu_files[] = { .seq_show = cpu_uclamp_max_show, .write = cpu_uclamp_max_write, }, -#endif +#endif /* CONFIG_UCLAMP_TASK_GROUP */ { } /* terminate */ }; @@ -10147,7 +10147,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { .threaded = true, }; -#endif /* CONFIG_CGROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */ void dump_cpu_task(int cpu) { @@ -10733,7 +10733,7 @@ void sched_mm_cid_fork(struct task_struct *t) WARN_ON_ONCE(!t->mm || t->mm_cid != -1); t->mm_cid_active = 1; } -#endif +#endif /* CONFIG_SCHED_MM_CID */ #ifdef CONFIG_SCHED_CLASS_EXT void sched_deq_and_put_task(struct task_struct *p, int queue_flags, @@ -10768,4 +10768,4 @@ void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx) if (ctx->running) set_next_task(rq, ctx->p); } -#endif /* CONFIG_SCHED_CLASS_EXT */ +#endif /* CONFIG_SCHED_CLASS_EXT */ -- cgit v1.2.3 From 8bb9b0c5aeb9594bc1039e38d15b14d2a055cf3a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:45 +0200 Subject: sched: Clean up and standardize #if/#else/#endif markers in sched/cpufreq_schedutil.c - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... 
#endif /* !CONFIG_FOO */ Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-5-mingo@kernel.org --- kernel/sched/cpufreq_schedutil.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 3d7e9ccb99a0..0ab5f9d4bc59 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -382,9 +382,9 @@ static bool sugov_hold_freq(struct sugov_cpu *sg_cpu) sg_cpu->saved_idle_calls = idle_calls; return ret; } -#else +#else /* !CONFIG_NO_HZ_COMMON: */ static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; } -#endif /* CONFIG_NO_HZ_COMMON */ +#endif /* !CONFIG_NO_HZ_COMMON */ /* * Make sugov_should_update_freq() ignore the rate limit when DL -- cgit v1.2.3 From 79af17344c2728c58e254219ff47840c67211cd9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:46 +0200 Subject: sched: Clean up and standardize #if/#else/#endif markers in sched/cpupri.h - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... #endif /* !CONFIG_FOO */ Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-6-mingo@kernel.org --- kernel/sched/cpupri.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index f0f5a734c0c1..24add19625ff 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -29,4 +29,4 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, void cpupri_set(struct cpupri *cp, int cpu, int pri); int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); -#endif +#endif /* CONFIG_SMP */ -- cgit v1.2.3 From 4aec8669ff3c7ea7ab62c10bb6442c70c0d1b1eb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:47 +0200 Subject: sched: Clean up and standardize #if/#else/#endif markers in sched/cputime.c - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... 
#endif /* !CONFIG_FOO */ Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-7-mingo@kernel.org --- kernel/sched/cputime.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index f01f17acb4a8..7097de2c8cda 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -91,7 +91,7 @@ static u64 irqtime_tick_accounted(u64 maxtime) return delta; } -#else /* CONFIG_IRQ_TIME_ACCOUNTING */ +#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ static u64 irqtime_tick_accounted(u64 dummy) { @@ -244,7 +244,7 @@ void __account_forceidle_time(struct task_struct *p, u64 delta) task_group_account_field(p, CPUTIME_FORCEIDLE, delta); } -#endif +#endif /* CONFIG_SCHED_CORE */ /* * When a guest is interrupted for a longer amount of time, missed clock @@ -265,7 +265,7 @@ static __always_inline u64 steal_account_process_time(u64 maxtime) return steal; } -#endif +#endif /* CONFIG_PARAVIRT */ return 0; } @@ -291,7 +291,7 @@ static inline u64 read_sum_exec_runtime(struct task_struct *t) { return t->se.sum_exec_runtime; } -#else +#else /* !CONFIG_64BIT: */ static u64 read_sum_exec_runtime(struct task_struct *t) { u64 ns; @@ -304,7 +304,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) return ns; } -#endif +#endif /* !CONFIG_64BIT */ /* * Accumulate raw cputime values of dead tasks (sig->[us]time) and live @@ -414,11 +414,11 @@ static void irqtime_account_idle_ticks(int ticks) { irqtime_account_process_tick(current, 0, ticks); } -#else /* CONFIG_IRQ_TIME_ACCOUNTING */ +#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ static inline void irqtime_account_idle_ticks(int ticks) { } static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, int nr_ticks) { } -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ +#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ /* * Use precise platform statistics if available: -- cgit v1.2.3 From c503c3dc2d49dbca2cec531ca8f1b8e9143c5087 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:48 +0200 Subject: sched: Clean up and standardize #if/#else/#endif markers in sched/deadline.c - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... #endif /* !CONFIG_FOO */ - Fix whitespace noise and other inconsistencies. 
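For reference, the marker convention in compilable form (CONFIG_FOO and
CONFIG_BAR are stand-in symbols here, normally generated by Kconfig),
including the nested '# ifdef' style used elsewhere in this series:

    #include <stdio.h>

    #define CONFIG_FOO 1    /* stand-in; kernel code gets this from Kconfig */

    #ifdef CONFIG_FOO
    static const char *mode(void) { return "foo"; }
    # ifdef CONFIG_BAR
    static const char *submode(void) { return "bar"; }
    # endif /* CONFIG_BAR */
    #else /* !CONFIG_FOO: */
    static const char *mode(void) { return "default"; }
    #endif /* !CONFIG_FOO */

    int main(void)
    {
            printf("%s\n", mode());
            return 0;
    }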
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-8-mingo@kernel.org --- kernel/sched/deadline.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index ff5be80c5c9d..a2cea68b2198 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -55,7 +55,7 @@ static int __init sched_dl_sysctl_init(void) return 0; } late_initcall(sched_dl_sysctl_init); -#endif +#endif /* CONFIG_SYSCTL */ static bool dl_server(struct sched_dl_entity *dl_se) { @@ -103,7 +103,7 @@ static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) { return pi_of(dl_se) != dl_se; } -#else +#else /* !CONFIG_RT_MUTEXES: */ static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se) { return dl_se; @@ -113,7 +113,7 @@ static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) { return false; } -#endif +#endif /* !CONFIG_RT_MUTEXES */ #ifdef CONFIG_SMP static inline struct dl_bw *dl_bw_of(int i) @@ -195,7 +195,7 @@ void __dl_update(struct dl_bw *dl_b, s64 bw) rq->dl.extra_bw += bw; } } -#else +#else /* !CONFIG_SMP: */ static inline struct dl_bw *dl_bw_of(int i) { return &cpu_rq(i)->dl.dl_bw; @@ -223,7 +223,7 @@ void __dl_update(struct dl_bw *dl_b, s64 bw) dl->extra_bw += bw; } -#endif +#endif /* !CONFIG_SMP */ static inline void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus) @@ -757,7 +757,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p return later_rq; } -#else +#else /* !CONFIG_SMP: */ static inline void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) @@ -786,7 +786,7 @@ static inline void deadline_queue_push_tasks(struct rq *rq) static inline void deadline_queue_pull_task(struct rq *rq) { } -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP */ static void enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags); @@ -1213,7 +1213,7 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf) push_dl_task(rq); rq_repin_lock(rq, rf); } -#endif +#endif /* CONFIG_SMP */ } /* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */ @@ -1360,7 +1360,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * there. 
*/ } -#endif +#endif /* CONFIG_SMP */ enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); if (dl_task(rq->donor)) @@ -1602,7 +1602,7 @@ throttle: rt_rq->rt_time += delta_exec; raw_spin_unlock(&rt_rq->rt_runtime_lock); } -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ } /* @@ -1885,12 +1885,12 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) } } -#else +#else /* !CONFIG_SMP: */ static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP */ static inline void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) @@ -2379,11 +2379,11 @@ static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se) { hrtick_start(rq, dl_se->runtime); } -#else /* !CONFIG_SCHED_HRTICK */ +#else /* !CONFIG_SCHED_HRTICK: */ static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se) { } -#endif +#endif /* !CONFIG_SCHED_HRTICK */ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) { @@ -3125,13 +3125,13 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, dl_time_before(p->dl.deadline, rq->curr->dl.deadline)) resched_curr(rq); } -#else +#else /* !CONFIG_SMP: */ /* * We don't know if p has a earlier or later deadline, so let's blindly * set a (maybe not needed) rescheduling point. */ resched_curr(rq); -#endif +#endif /* !CONFIG_SMP */ } #ifdef CONFIG_SCHED_CORE @@ -3162,7 +3162,7 @@ DEFINE_SCHED_CLASS(dl) = { .rq_offline = rq_offline_dl, .task_woken = task_woken_dl, .find_lock_rq = find_lock_later_rq, -#endif +#endif /* CONFIG_SMP */ .task_tick = task_tick_dl, .task_fork = task_fork_dl, @@ -3574,7 +3574,7 @@ void dl_bw_free(int cpu, u64 dl_bw) { dl_bw_manage(dl_bw_req_free, cpu, dl_bw); } -#endif +#endif /* CONFIG_SMP */ void print_dl_stats(struct seq_file *m, int cpu) { -- cgit v1.2.3 From 29dd6f8cd285360cc5a3e1dc696a960541020705 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:49 +0200 Subject: sched: Clean up and standardize #if/#else/#endif markers in sched/debug.c - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... #endif /* !CONFIG_FOO */ - Fix whitespace noise and other inconsistencies. 
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-9-mingo@kernel.org --- kernel/sched/debug.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index b384124d89d3..748709c03214 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -93,10 +93,10 @@ static void sched_feat_enable(int i) { static_key_enable_cpuslocked(&sched_feat_keys[i]); } -#else +#else /* !CONFIG_JUMP_LABEL: */ static void sched_feat_disable(int i) { }; static void sched_feat_enable(int i) { }; -#endif /* CONFIG_JUMP_LABEL */ +#endif /* !CONFIG_JUMP_LABEL */ static int sched_feat_set(char *cmp) { @@ -217,7 +217,7 @@ static const struct file_operations sched_scaling_fops = { .release = single_release, }; -#endif /* SMP */ +#endif /* CONFIG_SMP */ #ifdef CONFIG_PREEMPT_DYNAMIC @@ -314,9 +314,9 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf, return result; } -#else -#define sched_verbose_write debugfs_write_file_bool -#endif +#else /* !CONFIG_SMP: */ +# define sched_verbose_write debugfs_write_file_bool +#endif /* !CONFIG_SMP */ static const struct file_operations sched_verbose_fops = { .read = debugfs_read_file_bool, @@ -523,7 +523,7 @@ static __init int sched_init_debug(void) sched_domains_mutex_lock(); update_sched_domain_debugfs(); sched_domains_mutex_unlock(); -#endif +#endif /* CONFIG_SMP */ #ifdef CONFIG_NUMA_BALANCING numa = debugfs_create_dir("numa_balancing", debugfs_sched); @@ -533,7 +533,7 @@ static __init int sched_init_debug(void) debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max); debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size); debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold); -#endif +#endif /* CONFIG_NUMA_BALANCING */ debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); @@ -697,14 +697,14 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group P(se->avg.load_avg); P(se->avg.util_avg); P(se->avg.runnable_avg); -#endif +#endif /* CONFIG_SMP */ #undef PN_SCHEDSTAT #undef PN #undef P_SCHEDSTAT #undef P } -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_CGROUP_SCHED static DEFINE_SPINLOCK(sched_debug_lock); @@ -877,8 +877,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->tg_load_avg_contrib); SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", atomic_long_read(&cfs_rq->tg->load_avg)); -#endif -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_SMP */ #ifdef CONFIG_CFS_BANDWIDTH SEQ_printf(m, " .%-30s: %d\n", "throttled", cfs_rq->throttled); @@ -954,9 +954,9 @@ static void print_cpu(struct seq_file *m, int cpu) SEQ_printf(m, "cpu#%d, %u.%03u MHz\n", cpu, freq / 1000, (freq % 1000)); } -#else +#else /* !CONFIG_X86: */ SEQ_printf(m, "cpu#%d\n", cpu); -#endif +#endif /* !CONFIG_X86 */ #define P(x) \ do { \ @@ -984,7 +984,7 @@ do { \ P64(avg_idle); P64(max_idle_balance_cost); #undef P64 -#endif +#endif /* CONFIG_SMP */ #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n)); if (schedstat_enabled()) { @@ -1166,7 +1166,7 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) 
SEQ_printf(m, "current_node=%d, numa_group_id=%d\n", task_node(p), task_numa_group_id(p)); show_numa_stats(p, m); -#endif +#endif /* CONFIG_NUMA_BALANCING */ } void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, @@ -1263,13 +1263,13 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(se.avg.util_avg); P(se.avg.last_update_time); PM(se.avg.util_est, ~UTIL_AVG_UNCHANGED); -#endif +#endif /* CONFIG_SMP */ #ifdef CONFIG_UCLAMP_TASK __PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value); __PS("uclamp.max", p->uclamp_req[UCLAMP_MAX].value); __PS("effective uclamp.min", uclamp_eff_value(p, UCLAMP_MIN)); __PS("effective uclamp.max", uclamp_eff_value(p, UCLAMP_MAX)); -#endif +#endif /* CONFIG_UCLAMP_TASK */ P(policy); P(prio); if (task_has_dl_policy(p)) { -- cgit v1.2.3 From 416d5f78e4d3b010734248cb0aad9dc54b7589fa Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:50 +0200 Subject: sched: Clean up and standardize #if/#else/#endif markers in sched/fair.c - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... #endif /* !CONFIG_FOO */ - Fix whitespace noise and other inconsistencies. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-10-mingo@kernel.org --- kernel/sched/fair.c | 111 ++++++++++++++++++++++++++-------------------------- 1 file changed, 56 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 83157de5b808..1fabbe01bf93 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -111,7 +111,7 @@ int __weak arch_asym_cpu_priority(int cpu) * (default: ~5%) */ #define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078) -#endif +#endif /* CONFIG_SMP */ #ifdef CONFIG_CFS_BANDWIDTH /* @@ -162,7 +162,7 @@ static int __init sched_fair_sysctl_init(void) return 0; } late_initcall(sched_fair_sysctl_init); -#endif +#endif /* CONFIG_SYSCTL */ static inline void update_load_add(struct load_weight *lw, unsigned long inc) { @@ -471,7 +471,7 @@ static int se_is_idle(struct sched_entity *se) return cfs_rq_is_idle(group_cfs_rq(se)); } -#else /* !CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED: */ #define for_each_sched_entity(se) \ for (; se; se = NULL) @@ -517,7 +517,7 @@ static int se_is_idle(struct sched_entity *se) return task_has_idle_policy(task_of(se)); } -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* !CONFIG_FAIR_GROUP_SCHED */ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); @@ -1008,7 +1008,7 @@ int sched_update_scaling(void) return 0; } -#endif +#endif /* CONFIG_SMP */ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); @@ -1041,6 +1041,7 @@ static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) } #include "pelt.h" + #ifdef CONFIG_SMP static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); @@ -1131,7 +1132,7 @@ void post_init_entity_util_avg(struct task_struct *p) sa->runnable_avg = sa->util_avg; } -#else /* !CONFIG_SMP */ +#else /* !CONFIG_SMP: */ void init_entity_runnable_average(struct sched_entity *se) { } @@ -1141,7 +1142,7 @@ void post_init_entity_util_avg(struct task_struct *p) static void update_tg_load_avg(struct cfs_rq 
*cfs_rq) { } -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP */ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) { @@ -2114,12 +2115,12 @@ static inline int numa_idle_core(int idle_core, int cpu) return idle_core; } -#else +#else /* !CONFIG_SCHED_SMT: */ static inline int numa_idle_core(int idle_core, int cpu) { return idle_core; } -#endif +#endif /* !CONFIG_SCHED_SMT */ /* * Gather all necessary information to make NUMA balancing placement @@ -3673,7 +3674,8 @@ static void update_scan_period(struct task_struct *p, int new_cpu) p->numa_scan_period = task_scan_start(p); } -#else +#else /* !CONFIG_NUMA_BALANCING: */ + static void task_tick_numa(struct rq *rq, struct task_struct *curr) { } @@ -3690,7 +3692,7 @@ static inline void update_scan_period(struct task_struct *p, int new_cpu) { } -#endif /* CONFIG_NUMA_BALANCING */ +#endif /* !CONFIG_NUMA_BALANCING */ static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -3785,12 +3787,12 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); } -#else +#else /* !CONFIG_SMP: */ static inline void enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } -#endif +#endif /* !CONFIG_SMP */ static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); @@ -4000,11 +4002,11 @@ static void update_cfs_group(struct sched_entity *se) reweight_entity(cfs_rq_of(se), se, shares); } -#else /* CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline void update_cfs_group(struct sched_entity *se) { } -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* !CONFIG_FAIR_GROUP_SCHED */ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) { @@ -4481,7 +4483,7 @@ static inline bool skip_blocked_update(struct sched_entity *se) return true; } -#else /* CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {} @@ -4494,7 +4496,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {} -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* !CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_NO_HZ_COMMON static inline void migrate_se_pelt_lag(struct sched_entity *se) @@ -4575,9 +4577,9 @@ static inline void migrate_se_pelt_lag(struct sched_entity *se) __update_load_avg_blocked_se(now, se); } -#else +#else /* !CONFIG_NO_HZ_COMMON: */ static void migrate_se_pelt_lag(struct sched_entity *se) {} -#endif +#endif /* !CONFIG_NO_HZ_COMMON */ /** * update_cfs_rq_load_avg - update the cfs_rq's load/util averages @@ -5144,7 +5146,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); } -#else /* CONFIG_SMP */ +#else /* !CONFIG_SMP: */ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) { @@ -5184,7 +5186,7 @@ util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) {} static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP */ void __setparam_fair(struct task_struct *p, const struct sched_attr *attr) { @@ -5685,7 +5687,7 @@ void cfs_bandwidth_usage_dec(void) { static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used); } -#else /* 
CONFIG_JUMP_LABEL */ +#else /* !CONFIG_JUMP_LABEL: */ static bool cfs_bandwidth_used(void) { return true; @@ -5693,7 +5695,7 @@ static bool cfs_bandwidth_used(void) void cfs_bandwidth_usage_inc(void) {} void cfs_bandwidth_usage_dec(void) {} -#endif /* CONFIG_JUMP_LABEL */ +#endif /* !CONFIG_JUMP_LABEL */ /* * default period for cfs group bandwidth. @@ -6147,12 +6149,12 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) if (first) smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd); } -#else +#else /* !CONFIG_SMP: */ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) { unthrottle_cfs_rq(cfs_rq); } -#endif +#endif /* !CONFIG_SMP */ static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) { @@ -6733,9 +6735,9 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) if (cfs_task_bw_constrained(p)) tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); } -#endif +#endif /* CONFIG_NO_HZ_FULL */ -#else /* CONFIG_CFS_BANDWIDTH */ +#else /* !CONFIG_CFS_BANDWIDTH: */ static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } @@ -6777,7 +6779,7 @@ bool cfs_task_bw_constrained(struct task_struct *p) return false; } #endif -#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* !CONFIG_CFS_BANDWIDTH */ #if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL) static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {} @@ -6822,7 +6824,7 @@ static void hrtick_update(struct rq *rq) hrtick_start_fair(rq, donor); } -#else /* !CONFIG_SCHED_HRTICK */ +#else /* !CONFIG_SCHED_HRTICK: */ static inline void hrtick_start_fair(struct rq *rq, struct task_struct *p) { @@ -6831,7 +6833,7 @@ hrtick_start_fair(struct rq *rq, struct task_struct *p) static inline void hrtick_update(struct rq *rq) { } -#endif +#endif /* !CONFIG_SCHED_HRTICK */ #ifdef CONFIG_SMP static inline bool cpu_overutilized(int cpu) @@ -6875,9 +6877,9 @@ static inline void check_update_overutilized_status(struct rq *rq) if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu)) set_rd_overutilized(rq->rd, 1); } -#else +#else /* !CONFIG_SMP: */ static inline void check_update_overutilized_status(struct rq *rq) { } -#endif +#endif /* !CONFIG_SMP */ /* Runqueue only has SCHED_IDLE tasks enqueued */ static int sched_idle_rq(struct rq *rq) @@ -7677,7 +7679,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t return -1; } -#else /* CONFIG_SCHED_SMT */ +#else /* !CONFIG_SCHED_SMT: */ static inline void set_idle_cores(int cpu, int val) { @@ -7698,7 +7700,7 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd return -1; } -#endif /* CONFIG_SCHED_SMT */ +#endif /* !CONFIG_SCHED_SMT */ /* * Scan the LLC domain for idle CPUs; this is dynamically regulated by @@ -8743,9 +8745,9 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return sched_balance_newidle(rq, rf) != 0; } -#else +#else /* !CONFIG_SMP: */ static inline void set_task_max_allowed_capacity(struct task_struct *p) {} -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP */ static void set_next_buddy(struct sched_entity *se) { @@ -8939,7 +8941,7 @@ again: return p; simple: -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ put_prev_set_next_task(rq, prev, p); return p; @@ -9357,13 +9359,13 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) return src_weight - dst_weight; } -#else +#else /* 
!CONFIG_NUMA_BALANCING: */ static inline long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { return 0; } -#endif +#endif /* !CONFIG_NUMA_BALANCING */ /* * Check whether the task is ineligible on the destination cpu @@ -9772,12 +9774,12 @@ static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) if (!has_blocked) rq->has_blocked_load = 0; } -#else +#else /* !CONFIG_NO_HZ_COMMON: */ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; } static inline bool others_have_blocked(struct rq *rq) { return false; } static inline void update_blocked_load_tick(struct rq *rq) {} static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} -#endif +#endif /* !CONFIG_NO_HZ_COMMON */ static bool __update_blocked_others(struct rq *rq, bool *done) { @@ -9886,7 +9888,7 @@ static unsigned long task_h_load(struct task_struct *p) return div64_ul(p->se.avg.load_avg * cfs_rq->h_load, cfs_rq_load_avg(cfs_rq) + 1); } -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static bool __update_blocked_fair(struct rq *rq, bool *done) { struct cfs_rq *cfs_rq = &rq->cfs; @@ -9903,7 +9905,7 @@ static unsigned long task_h_load(struct task_struct *p) { return p->se.avg.load_avg; } -#endif +#endif /* !CONFIG_FAIR_GROUP_SCHED */ static void sched_balance_update_blocked_averages(int cpu) { @@ -10616,7 +10618,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) return remote; return all; } -#else +#else /* !CONFIG_NUMA_BALANCING: */ static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) { return all; @@ -10626,7 +10628,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) { return regular; } -#endif /* CONFIG_NUMA_BALANCING */ +#endif /* !CONFIG_NUMA_BALANCING */ struct sg_lb_stats; @@ -12772,7 +12774,7 @@ static void nohz_newidle_balance(struct rq *this_rq) atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu)); } -#else /* !CONFIG_NO_HZ_COMMON */ +#else /* !CONFIG_NO_HZ_COMMON: */ static inline void nohz_balancer_kick(struct rq *rq) { } static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) @@ -12781,7 +12783,7 @@ static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle } static inline void nohz_newidle_balance(struct rq *this_rq) { } -#endif /* CONFIG_NO_HZ_COMMON */ +#endif /* !CONFIG_NO_HZ_COMMON */ /* * sched_balance_newidle is called by schedule() if this_cpu is about to become @@ -13076,10 +13078,10 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, cfs_rqa = sea->cfs_rq; cfs_rqb = seb->cfs_rq; -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ cfs_rqa = &task_rq(a)->cfs; cfs_rqb = &task_rq(b)->cfs; -#endif +#endif /* !CONFIG_FAIR_GROUP_SCHED */ /* * Find delta after normalizing se's vruntime with its cfs_rq's @@ -13103,9 +13105,9 @@ static int task_is_throttled_fair(struct task_struct *p, int cpu) #endif return throttled_hierarchy(cfs_rq); } -#else +#else /* !CONFIG_SCHED_CORE: */ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} -#endif +#endif /* !CONFIG_SCHED_CORE */ /* * scheduler tick hitting a task of our scheduling class. 
@@ -13199,9 +13201,9 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) list_add_leaf_cfs_rq(cfs_rq); } } -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static void propagate_entity_cfs_rq(struct sched_entity *se) { } -#endif +#endif /* !CONFIG_FAIR_GROUP_SCHED */ static void detach_entity_cfs_rq(struct sched_entity *se) { @@ -13737,6 +13739,5 @@ __init void init_sched_fair_class(void) nohz.next_blocked = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); #endif -#endif /* SMP */ - +#endif /* CONFIG_SMP */ } -- cgit v1.2.3 From 833840a94f4dfd86ed32f1b6222fe8d9ba1790a9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:51 +0200 Subject: sched: Clean up and standardize #if/#else/#endif markers in sched/idle.c - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... #endif /* !CONFIG_FOO */ - Fix whitespace noise and other inconsistencies. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-11-mingo@kernel.org --- kernel/sched/idle.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index cd3298653ef4..43d6e006cff1 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -52,7 +52,7 @@ static int __init cpu_idle_nopoll_setup(char *__unused) return 1; } __setup("hlt", cpu_idle_nopoll_setup); -#endif +#endif /* CONFIG_GENERIC_IDLE_POLL_SETUP */ static noinline int __cpuidle cpu_idle_poll(void) { @@ -100,10 +100,10 @@ static inline void cond_tick_broadcast_exit(void) if (static_branch_unlikely(&arch_needs_tick_broadcast)) tick_broadcast_exit(); } -#else +#else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE: */ static inline void cond_tick_broadcast_enter(void) { } static inline void cond_tick_broadcast_exit(void) { } -#endif +#endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE */ /** * default_idle_call - Default CPU idle routine. @@ -444,7 +444,7 @@ balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { return WARN_ON_ONCE(1); } -#endif +#endif /* CONFIG_SMP */ /* * Idle tasks are unconditionally rescheduled: -- cgit v1.2.3 From c215dff7f80caab4a25160f0ded369eba5cb9190 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:52 +0200 Subject: sched: Clean up and standardize #if/#else/#endif markers in sched/loadavg.c - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... #endif /* !CONFIG_FOO */ - Fix whitespace noise and other inconsistencies. 
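Worth noting (an interpretation, not text from the patch): markers that were already annotated get normalized too — the #endif of a block that has an #else branch names the negated condition, since it terminates the #else branch rather than the #if branch. Condensed from the calc_global_nohz() hunk below:

    #ifdef CONFIG_NO_HZ_COMMON
    /* real calc_load_nohz_read() and calc_global_nohz() live here */
    #else /* !CONFIG_NO_HZ_COMMON: */
    static inline long calc_load_nohz_read(void) { return 0; }
    static inline void calc_global_nohz(void) { }
    #endif /* !CONFIG_NO_HZ_COMMON */  /* previously: CONFIG_NO_HZ_COMMON */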
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-12-mingo@kernel.org --- kernel/sched/loadavg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index f6df84ca6925..e3a1ce923f9f 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -335,12 +335,12 @@ static void calc_global_nohz(void) smp_wmb(); calc_load_idx++; } -#else /* !CONFIG_NO_HZ_COMMON */ +#else /* !CONFIG_NO_HZ_COMMON: */ static inline long calc_load_nohz_read(void) { return 0; } static inline void calc_global_nohz(void) { } -#endif /* CONFIG_NO_HZ_COMMON */ +#endif /* !CONFIG_NO_HZ_COMMON */ /* * calc_load - update the avenrun load estimates 10 ticks after the -- cgit v1.2.3 From 311bb3f7b78e944e831ffb07cb58455b47bf2269 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:53 +0200 Subject: sched: Clean up and standardize #if/#else/#endif markers in sched/pelt.[ch] - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... #endif /* !CONFIG_FOO */ - Fix whitespace noise and other inconsistencies. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-13-mingo@kernel.org --- kernel/sched/pelt.c | 4 ++-- kernel/sched/pelt.h | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 09be6a83e45a..fa83bbaf4f3e 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -414,7 +414,7 @@ int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) return 0; } -#endif +#endif /* CONFIG_SCHED_HW_PRESSURE */ #ifdef CONFIG_HAVE_SCHED_AVG_IRQ /* @@ -467,7 +467,7 @@ int update_irq_load_avg(struct rq *rq, u64 running) return ret; } -#endif +#endif /* CONFIG_HAVE_SCHED_AVG_IRQ */ /* * Load avg and utiliztion metrics need to be updated periodically and before diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 19592077452e..a5d4933e6b70 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -20,7 +20,7 @@ static inline u64 hw_load_avg(struct rq *rq) { return READ_ONCE(rq->avg_hw.load_avg); } -#else +#else /* !CONFIG_SCHED_HW_PRESSURE: */ static inline int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) { @@ -31,7 +31,7 @@ static inline u64 hw_load_avg(struct rq *rq) { return 0; } -#endif +#endif /* !CONFIG_SCHED_HW_PRESSURE */ #ifdef CONFIG_HAVE_SCHED_AVG_IRQ int update_irq_load_avg(struct rq *rq, u64 running); @@ -179,15 +179,15 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time; } -#else +#else /* !CONFIG_CFS_BANDWIDTH: */ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { } static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { return rq_clock_pelt(rq_of(cfs_rq)); } -#endif +#endif /* !CONFIG_CFS_BANDWIDTH */ -#else +#else /* !CONFIG_SMP: */ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) @@ -236,6 +236,6 @@ static inline void 
update_idle_rq_clock_pelt(struct rq *rq) { } static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { } -#endif +#endif /* !CONFIG_SMP */ #endif /* _KERNEL_SCHED_PELT_H */ -- cgit v1.2.3 From fd3db705f7496c5d6b56ce8d78570dd1da11cd30 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:54 +0200 Subject: sched: Clean up and standardize #if/#else/#endif markers in sched/psi.c - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... #endif /* !CONFIG_FOO */ Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-14-mingo@kernel.org --- kernel/sched/psi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 333a7ba39668..5837cd8e7b97 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1039,7 +1039,7 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st psi_schedule_rtpoll_work(group, 1, false); } while ((group = group->parent)); } -#endif +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ /** * psi_memstall_enter - mark the beginning of a memory stall section @@ -1655,7 +1655,7 @@ static const struct proc_ops psi_irq_proc_ops = { .proc_poll = psi_fop_poll, .proc_release = psi_fop_release, }; -#endif +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ static int __init psi_proc_init(void) { -- cgit v1.2.3 From 3eca109a7885903f1bf07099ae89025069017cc0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:55 +0200 Subject: sched: Clean up and standardize #if/#else/#endif markers in sched/rt.c - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... #endif /* !CONFIG_FOO */ - Fix whitespace noise and other inconsistencies. 
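The same scheme extends to compound guards. A condensed sketch of the inc_rt_prio()/dec_rt_prio() hunk below, with the SMP/RT-group bodies elided to placeholder comments:

    #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
    static void inc_rt_prio(struct rt_rq *rt_rq, int prio) { /* real prio accounting */ }
    static void dec_rt_prio(struct rt_rq *rt_rq, int prio) { /* real prio accounting */ }
    #else /* !(CONFIG_SMP || CONFIG_RT_GROUP_SCHED): */
    static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
    static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
    #endif /* !(CONFIG_SMP || CONFIG_RT_GROUP_SCHED) */

The negated, parenthesized !(A || B) form spells out exactly which combination of options the stub branch covers.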
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-15-mingo@kernel.org --- kernel/sched/rt.c | 54 +++++++++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 16008ac33e14..7e8ed05d302a 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -63,7 +63,7 @@ static int __init sched_rt_sysctl_init(void) return 0; } late_initcall(sched_rt_sysctl_init); -#endif +#endif /* CONFIG_SYSCTL */ void init_rt_rq(struct rt_rq *rt_rq) { @@ -294,7 +294,7 @@ err: return 0; } -#else /* CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ #define rt_entity_is_task(rt_se) (1) @@ -330,7 +330,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { return 1; } -#endif /* CONFIG_RT_GROUP_SCHED */ +#endif /* !CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_SMP @@ -433,7 +433,7 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) } } -#else +#else /* !CONFIG_SMP: */ static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) { @@ -446,7 +446,7 @@ static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p) static inline void rt_queue_push_tasks(struct rq *rq) { } -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP */ static void enqueue_top_rt_rq(struct rt_rq *rt_rq); static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count); @@ -488,12 +488,12 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) return cpu_cap >= min(min_cap, max_cap); } -#else +#else /* !CONFIG_UCLAMP_TASK: */ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) { return true; } -#endif +#endif /* !CONFIG_UCLAMP_TASK */ #ifdef CONFIG_RT_GROUP_SCHED @@ -801,9 +801,9 @@ static void balance_runtime(struct rt_rq *rt_rq) raw_spin_lock(&rt_rq->rt_runtime_lock); } } -#else /* !CONFIG_SMP */ +#else /* !CONFIG_SMP: */ static inline void balance_runtime(struct rt_rq *rt_rq) {} -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP */ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { @@ -933,7 +933,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) return 0; } -#else /* !CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ typedef struct rt_rq *rt_rq_iter_t; @@ -985,7 +985,7 @@ static void __enable_runtime(struct rq *rq) { } static void __disable_runtime(struct rq *rq) { } #endif -#endif /* CONFIG_RT_GROUP_SCHED */ +#endif /* !CONFIG_RT_GROUP_SCHED */ static inline int rt_se_prio(struct sched_rt_entity *rt_se) { @@ -1036,7 +1036,7 @@ static void update_curr_rt(struct rq *rq) do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq)); } } -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ } static void @@ -1110,14 +1110,14 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); } -#else /* CONFIG_SMP */ +#else /* !CONFIG_SMP: */ static inline void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} static inline void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP */ #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED static void @@ -1158,12 +1158,12 @@ dec_rt_prio(struct rt_rq *rt_rq, int 
prio) dec_rt_prio_smp(rt_rq, prio, prev_prio); } -#else +#else /* !(CONFIG_SMP || CONFIG_RT_GROUP_SCHED): */ static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {} static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {} -#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */ +#endif /* !(CONFIG_SMP || CONFIG_RT_GROUP_SCHED) */ #ifdef CONFIG_RT_GROUP_SCHED @@ -1185,7 +1185,7 @@ dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted); } -#else /* CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ static void inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) @@ -1195,7 +1195,7 @@ inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) static inline void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} -#endif /* CONFIG_RT_GROUP_SCHED */ +#endif /* !CONFIG_RT_GROUP_SCHED */ static inline unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se) @@ -1685,7 +1685,7 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) */ if (p->prio == donor->prio && !test_tsk_need_resched(rq->curr)) check_preempt_equal_prio(rq, p); -#endif +#endif /* CONFIG_SMP */ } static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first) @@ -2512,11 +2512,11 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) */ if (p->prio > rq->rt.highest_prio.curr) resched_curr(rq); -#else +#else /* !CONFIG_SMP: */ /* For UP simply resched on drop of prio */ if (oldprio < p->prio) resched_curr(rq); -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP */ } else { /* * This task is not running, but if it is @@ -2552,9 +2552,9 @@ static void watchdog(struct rq *rq, struct task_struct *p) } } } -#else +#else /* !CONFIG_POSIX_TIMERS: */ static inline void watchdog(struct rq *rq, struct task_struct *p) { } -#endif +#endif /* !CONFIG_POSIX_TIMERS */ /* * scheduler tick hitting a task of our scheduling class. @@ -2623,7 +2623,7 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu) return rt_rq_throttled(rt_rq); } -#endif +#endif /* CONFIG_SCHED_CORE */ DEFINE_SCHED_CLASS(rt) = { @@ -2646,7 +2646,7 @@ DEFINE_SCHED_CLASS(rt) = { .task_woken = task_woken_rt, .switched_from = switched_from_rt, .find_lock_rq = find_lock_lowest_rq, -#endif +#endif /* !CONFIG_SMP */ .task_tick = task_tick_rt, @@ -2890,7 +2890,7 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) return 1; } -#else /* !CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ #ifdef CONFIG_SYSCTL static int sched_rt_global_constraints(void) @@ -2898,7 +2898,7 @@ static int sched_rt_global_constraints(void) return 0; } #endif /* CONFIG_SYSCTL */ -#endif /* CONFIG_RT_GROUP_SCHED */ +#endif /* !CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_SYSCTL static int sched_rt_global_validate(void) -- cgit v1.2.3 From fdccd0c79280da0a332f1370d2ad17192d563c95 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:56 +0200 Subject: sched: Clean up and standardize #if/#else/#endif markers in sched/sched.h - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... #endif /* !CONFIG_FOO */ - Fix whitespace noise and other inconsistencies. 
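Nested guards follow the same pattern, with the inner level's '#' indented by one space so the nesting depth stays visible at a glance. Condensed from the rt_group_sched hunk below, declarations only:

    #ifdef CONFIG_RT_GROUP_SCHED
    # ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED
    DECLARE_STATIC_KEY_FALSE(rt_group_sched);
    # else /* !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED: */
    DECLARE_STATIC_KEY_TRUE(rt_group_sched);
    # endif /* !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED */
    #else /* !CONFIG_RT_GROUP_SCHED: */
    # define rt_group_sched_enabled()	false
    #endif /* !CONFIG_RT_GROUP_SCHED */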
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-16-mingo@kernel.org --- kernel/sched/sched.h | 82 +++++++++++++++++++++++++++------------------------- 1 file changed, 42 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f3a41487c96d..2bf804b8c89b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -425,7 +425,7 @@ struct cfs_bandwidth { int nr_burst; u64 throttled_time; u64 burst_time; -#endif +#endif /* CONFIG_CFS_BANDWIDTH */ }; /* Task group related information */ @@ -443,15 +443,15 @@ struct task_group { /* runqueue "owned" by this group on each CPU */ struct cfs_rq **cfs_rq; unsigned long shares; -#ifdef CONFIG_SMP +#ifdef CONFIG_SMP /* * load_avg can be heavily contended at clock tick time, so put * it in its own cache-line separated from the fields above which * will also be accessed at each tick. */ atomic_long_t load_avg ____cacheline_aligned; -#endif -#endif +#endif /* CONFIG_SMP */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity **rt_se; @@ -532,7 +532,7 @@ extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); extern void online_fair_sched_group(struct task_group *tg); extern void unregister_fair_sched_group(struct task_group *tg); -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline void free_fair_sched_group(struct task_group *tg) { } static inline int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { @@ -540,7 +540,7 @@ static inline int alloc_fair_sched_group(struct task_group *tg, struct task_grou } static inline void online_fair_sched_group(struct task_group *tg) { } static inline void unregister_fair_sched_group(struct task_group *tg) { } -#endif +#endif /* !CONFIG_FAIR_GROUP_SCHED */ extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *se, int cpu, @@ -577,22 +577,22 @@ extern int sched_group_set_idle(struct task_group *tg, long idle); #ifdef CONFIG_SMP extern void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next); -#else /* !CONFIG_SMP */ +#else /* !CONFIG_SMP: */ static inline void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next) { } -#endif /* CONFIG_SMP */ -#else /* !CONFIG_FAIR_GROUP_SCHED */ +#endif /* !CONFIG_SMP */ +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) { return 0; } static inline int sched_group_set_idle(struct task_group *tg, long idle) { return 0; } -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* !CONFIG_FAIR_GROUP_SCHED */ -#else /* CONFIG_CGROUP_SCHED */ +#else /* !CONFIG_CGROUP_SCHED: */ struct cfs_bandwidth { }; static inline bool cfs_task_bw_constrained(struct task_struct *p) { return false; } -#endif /* CONFIG_CGROUP_SCHED */ +#endif /* !CONFIG_CGROUP_SCHED */ extern void unregister_rt_sched_group(struct task_group *tg); extern void free_rt_sched_group(struct task_group *tg); @@ -860,9 +860,9 @@ struct dl_rq { * of the leftmost (earliest deadline) element. 
*/ struct rb_root_cached pushable_dl_tasks_root; -#else +#else /* !CONFIG_SMP: */ struct dl_bw dl_bw; -#endif +#endif /* !CONFIG_SMP */ /* * "Active utilization" for this runqueue: increased when a * task wakes up (becomes TASK_RUNNING) and decreased when a @@ -1009,7 +1009,7 @@ struct root_domain { /* These atomics are updated outside of a lock */ atomic_t rto_loop_next; atomic_t rto_loop_start; -#endif +#endif /* HAVE_RT_PUSH_IPI */ /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. @@ -1295,7 +1295,7 @@ struct rq { unsigned int core_forceidle_seq; unsigned int core_forceidle_occupation; u64 core_forceidle_start; -#endif +#endif /* CONFIG_SCHED_CORE */ /* Scratch cpumask to be temporarily used under rq_lock */ cpumask_var_t scratch_mask; @@ -1314,13 +1314,13 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) return cfs_rq->rq; } -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) { return container_of(cfs_rq, struct rq, cfs); } -#endif +#endif /* !CONFIG_FAIR_GROUP_SCHED */ static inline int cpu_of(struct rq *rq) { @@ -1501,6 +1501,7 @@ static inline bool sched_group_cookie_match(struct rq *rq, } #endif /* !CONFIG_SCHED_CORE */ + #ifdef CONFIG_RT_GROUP_SCHED # ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED DECLARE_STATIC_KEY_FALSE(rt_group_sched); @@ -1508,16 +1509,16 @@ static inline bool rt_group_sched_enabled(void) { return static_branch_unlikely(&rt_group_sched); } -# else +# else /* !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED: */ DECLARE_STATIC_KEY_TRUE(rt_group_sched); static inline bool rt_group_sched_enabled(void) { return static_branch_likely(&rt_group_sched); } -# endif /* CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED */ -#else +# endif /* !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ # define rt_group_sched_enabled() false -#endif /* CONFIG_RT_GROUP_SCHED */ +#endif /* !CONFIG_RT_GROUP_SCHED */ static inline void lockdep_assert_rq_held(struct rq *rq) { @@ -1575,9 +1576,9 @@ static inline void update_idle_core(struct rq *rq) __update_idle_core(rq); } -#else +#else /* !CONFIG_SCHED_SMT: */ static inline void update_idle_core(struct rq *rq) { } -#endif +#endif /* !CONFIG_SCHED_SMT */ #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1758,7 +1759,7 @@ static inline void scx_rq_clock_invalidate(struct rq *rq) WRITE_ONCE(rq->scx.flags, rq->scx.flags & ~SCX_RQ_CLK_VALID); } -#else /* !CONFIG_SCHED_CLASS_EXT */ +#else /* !CONFIG_SCHED_CLASS_EXT: */ #define scx_enabled() false #define scx_switched_all() false @@ -2175,7 +2176,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) tg = &root_task_group; p->rt.rt_rq = tg->rt_rq[cpu]; p->rt.parent = tg->rt_se[cpu]; -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ } #else /* !CONFIG_CGROUP_SCHED: */ @@ -2201,7 +2202,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) smp_wmb(); WRITE_ONCE(task_thread_info(p)->cpu, cpu); p->wake_cpu = cpu; -#endif +#endif /* CONFIG_SMP */ } /* @@ -2430,7 +2431,7 @@ struct sched_class { void (*rq_offline)(struct rq *rq); struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); -#endif +#endif /* CONFIG_SMP */ void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); void (*task_fork)(struct task_struct *p); @@ -2955,7 +2956,7 @@ static inline bool rq_order_less(struct rq *rq1, struct rq *rq2) /* * __sched_core_flip() relies on SMT having cpu-id lock order. 
*/ -#endif +#endif /* CONFIG_SCHED_CORE */ return rq1->cpu < rq2->cpu; } @@ -3146,6 +3147,7 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); extern void resched_latency_warn(int cpu, u64 latency); + #ifdef CONFIG_NUMA_BALANCING extern void show_numa_stats(struct task_struct *p, struct seq_file *m); extern void @@ -3255,14 +3257,14 @@ static inline u64 irq_time_read(int cpu) return total; } -#else +#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ static inline int irqtime_enabled(void) { return 0; } -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ +#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ #ifdef CONFIG_CPU_FREQ @@ -3356,9 +3358,9 @@ static inline unsigned long cpu_util_rt(struct rq *rq) return READ_ONCE(rq->avg_rt.util_avg); } -#else /* !CONFIG_SMP */ +#else /* !CONFIG_SMP: */ static inline bool update_other_load_avgs(struct rq *rq) { return false; } -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP */ #ifdef CONFIG_UCLAMP_TASK @@ -3536,13 +3538,13 @@ static inline bool sched_energy_enabled(void) return static_branch_unlikely(&sched_energy_present); } -#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ +#else /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL): */ #define perf_domain_span(pd) NULL static inline bool sched_energy_enabled(void) { return false; } -#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ +#endif /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ #ifdef CONFIG_MEMBARRIER @@ -3568,7 +3570,7 @@ static inline void membarrier_switch_mm(struct rq *rq, WRITE_ONCE(rq->membarrier_state, membarrier_state); } -#else /* !CONFIG_MEMBARRIER :*/ +#else /* !CONFIG_MEMBARRIER: */ static inline void membarrier_switch_mm(struct rq *rq, struct mm_struct *prev_mm, @@ -3589,7 +3591,7 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) return true; } -#endif +#endif /* CONFIG_SMP */ extern void swake_up_all_locked(struct swait_queue_head *q); extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); @@ -3909,7 +3911,7 @@ bool task_is_pushable(struct rq *rq, struct task_struct *p, int cpu) return false; } -#endif +#endif /* CONFIG_SMP */ #ifdef CONFIG_RT_MUTEXES @@ -3953,7 +3955,7 @@ extern void check_class_changed(struct rq *rq, struct task_struct *p, #ifdef CONFIG_SMP extern struct balance_callback *splice_balance_callbacks(struct rq *rq); extern void balance_callbacks(struct rq *rq, struct balance_callback *head); -#else +#else /* !CONFIG_SMP: */ static inline struct balance_callback *splice_balance_callbacks(struct rq *rq) { @@ -3964,7 +3966,7 @@ static inline void balance_callbacks(struct rq *rq, struct balance_callback *hea { } -#endif +#endif /* !CONFIG_SMP */ #ifdef CONFIG_SCHED_CLASS_EXT /* -- cgit v1.2.3 From 91433cd6e46828044a64520dd1dce817be41940e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:57 +0200 Subject: sched: Clean up and standardize #if/#else/#endif markers in sched/stats.[ch] - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... 
#endif /* !CONFIG_FOO */ Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-17-mingo@kernel.org --- kernel/sched/stats.c | 2 +- kernel/sched/stats.h | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 1faea875d0b3..86acd374fc4d 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -164,7 +164,7 @@ static int show_schedstat(struct seq_file *seq, void *v) sd->ttwu_move_balance); } rcu_read_unlock(); -#endif +#endif /* CONFIG_SMP */ } return 0; } diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 452826df6ae1..26f3fd4d34ce 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -112,10 +112,10 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, bool sleep); #ifdef CONFIG_IRQ_TIME_ACCOUNTING void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev); -#else +#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev) {} -#endif /*CONFIG_IRQ_TIME_ACCOUNTING */ +#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ /* * PSI tracks state that persists across sleeps, such as iowaits and * memory stalls. As a result, it has to distinguish between sleeps, @@ -220,7 +220,7 @@ static inline void psi_sched_switch(struct task_struct *prev, psi_task_switch(prev, next, sleep); } -#else /* CONFIG_PSI */ +#else /* !CONFIG_PSI: */ static inline void psi_enqueue(struct task_struct *p, bool migrate) {} static inline void psi_dequeue(struct task_struct *p, bool migrate) {} static inline void psi_ttwu_dequeue(struct task_struct *p) {} @@ -229,7 +229,7 @@ static inline void psi_sched_switch(struct task_struct *prev, bool sleep) {} static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev) {} -#endif /* CONFIG_PSI */ +#endif /* !CONFIG_PSI */ #ifdef CONFIG_SCHED_INFO /* @@ -334,6 +334,6 @@ sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *n # define sched_info_enqueue(rq, t) do { } while (0) # define sched_info_dequeue(rq, t) do { } while (0) # define sched_info_switch(rq, t, next) do { } while (0) -#endif /* CONFIG_SCHED_INFO */ +#endif /* !CONFIG_SCHED_INFO */ #endif /* _KERNEL_STATS_H */ -- cgit v1.2.3 From 23d27e2cfbee69d85b867a427c3021234bcf09ab Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:58 +0200 Subject: sched: Clean up and standardize #if/#else/#endif markers in sched/syscalls.c - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... 
#endif /* !CONFIG_FOO */ Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-18-mingo@kernel.org --- kernel/sched/syscalls.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 547c1f05b667..5cb5e9487f0d 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -174,7 +174,7 @@ SYSCALL_DEFINE1(nice, int, increment) return 0; } -#endif +#endif /* __ARCH_WANT_SYS_NICE */ /** * task_prio - return the priority value of a given task. @@ -255,8 +255,7 @@ int sched_core_idle_cpu(int cpu) return idle_cpu(cpu); } - -#endif +#endif /* CONFIG_SCHED_CORE */ /** * find_process_by_pid - find a process with a matching PID value. @@ -448,7 +447,7 @@ static inline int uclamp_validate(struct task_struct *p, } static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *attr) { } -#endif +#endif /* !CONFIG_UCLAMP_TASK */ /* * Allow unprivileged RT tasks to decrease priority. @@ -658,7 +657,7 @@ change: goto unlock; } } -#endif +#endif /* CONFIG_SMP */ } /* Re-check policy now with rq lock held: */ -- cgit v1.2.3 From f1c6b957f7f4091d822b93ca2833e83bf728e001 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:59 +0200 Subject: sched: Clean up and standardize #if/#else/#endif markers in sched/topology.c - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... #endif /* !CONFIG_FOO */ - Fix whitespace noise and other inconsistencies. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-19-mingo@kernel.org --- kernel/sched/topology.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 9026d325d0fd..2352caf4f942 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -315,7 +315,7 @@ static int __init sched_energy_aware_sysctl_init(void) } late_initcall(sched_energy_aware_sysctl_init); -#endif +#endif /* CONFIG_PROC_SYSCTL */ static void free_pd(struct perf_domain *pd) { @@ -451,9 +451,9 @@ free: return false; } -#else +#else /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL): */ static void free_pd(struct perf_domain *pd) { } -#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/ +#endif /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ static void free_rootdomain(struct rcu_head *rcu) { @@ -1600,7 +1600,7 @@ static int sched_domains_curr_level; int sched_max_numa_distance; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; -#endif +#endif /* CONFIG_NUMA */ /* * SD_flags allowed in topology descriptions. 
@@ -1716,7 +1716,7 @@ sd_init(struct sched_domain_topology_level *tl, SD_WAKE_AFFINE); } -#endif +#endif /* CONFIG_NUMA */ } else { sd->cache_nice_tries = 1; } -- cgit v1.2.3 From 5202c25dd17c54cd4c21f266d9a51b644d7cd682 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:00 +0200 Subject: sched/smp: Always define sched_domains_mutex_lock()/unlock(), def_root_domain and sched_domains_mutex Simplify the scheduler by making CONFIG_SMP=y primitives and data structures unconditional. Unconditionally build kernel/sched/topology.c and the main sched-domains locking primitives. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-20-mingo@kernel.org --- kernel/sched/build_utility.c | 3 ++- kernel/sched/topology.c | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c index bf9d8db94b70..5c485b2dfb95 100644 --- a/kernel/sched/build_utility.c +++ b/kernel/sched/build_utility.c @@ -83,9 +83,10 @@ #ifdef CONFIG_SMP # include "cpupri.c" # include "stop_task.c" -# include "topology.c" #endif +#include "topology.c" + #ifdef CONFIG_SCHED_CORE # include "core_sched.c" #endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 2352caf4f942..ee347d9c5df4 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -17,6 +17,8 @@ void sched_domains_mutex_unlock(void) mutex_unlock(&sched_domains_mutex); } +#ifdef CONFIG_SMP + /* Protected by sched_domains_mutex: */ static cpumask_var_t sched_domains_tmpmask; static cpumask_var_t sched_domains_tmpmask2; @@ -2842,3 +2844,5 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); sched_domains_mutex_unlock(); } + +#endif /* CONFIG_SMP */ -- cgit v1.2.3 From cac5cefbade90ff0bb0b393d301fa3b5234cf056 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:01 +0200 Subject: sched/smp: Make SMP unconditional Simplify the scheduler by making CONFIG_SMP=y primitives and data structures unconditional. Introduce transitory wrappers for functionality not yet converted to SMP. Note that this patch is pretty large, because there's no clear separation between various aspects of the SMP scheduler, it's basically a huge block of #ifdef CONFIG_SMP. A fair amount of it has to be switched on for it to boot and work on UP systems. 
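To make the "transitory wrapper" idea concrete, here is the __ttwu_queue_wakelist() hunk below condensed into full-function form (context lines filled in; a sketch, not a verbatim quote): the function is now built on UP kernels too, and only the genuinely SMP-specific IPI call stays behind #ifdef until that machinery is converted as well:

    static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
    {
    	struct rq *rq = cpu_rq(cpu);

    	p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);

    	WRITE_ONCE(rq->ttwu_pending, 1);
    #ifdef CONFIG_SMP
    	/* Queue the remote wakeup IPI; SMP-only until converted. */
    	__smp_call_single_queue(cpu, &p->wake_entry.llist);
    #endif
    }

On UP this path should never actually be taken (the target CPU is always the local one), so the #ifdef merely keeps the build working while letting callers drop guards of their own.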
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-21-mingo@kernel.org --- kernel/sched/build_policy.c | 6 +-- kernel/sched/build_utility.c | 6 +-- kernel/sched/core.c | 106 ++++++------------------------------- kernel/sched/cpudeadline.h | 2 - kernel/sched/cpupri.h | 2 - kernel/sched/deadline.c | 95 --------------------------------- kernel/sched/debug.c | 12 ----- kernel/sched/fair.c | 115 ---------------------------------------- kernel/sched/pelt.h | 52 ------------------- kernel/sched/rt.c | 6 ++- kernel/sched/sched.h | 121 ++----------------------------------------- kernel/sched/syscalls.c | 2 - kernel/sched/topology.c | 10 +--- 13 files changed, 29 insertions(+), 506 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index 72d97aa8b726..c4a488e67aa7 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -50,11 +50,9 @@ #include "idle.c" #include "rt.c" +#include "cpudeadline.c" -#ifdef CONFIG_SMP -# include "cpudeadline.c" -# include "pelt.c" -#endif +#include "pelt.c" #include "cputime.c" #include "deadline.c" diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c index 5c485b2dfb95..e2cf3b08d4e9 100644 --- a/kernel/sched/build_utility.c +++ b/kernel/sched/build_utility.c @@ -80,10 +80,8 @@ #include "wait_bit.c" #include "wait.c" -#ifdef CONFIG_SMP -# include "cpupri.c" -# include "stop_task.c" -#endif +#include "cpupri.c" +#include "stop_task.c" #include "topology.c" diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f1ef6d29792c..9fc44f4b779a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -650,7 +650,6 @@ void raw_spin_rq_unlock(struct rq *rq) raw_spin_unlock(rq_lockp(rq)); } -#ifdef CONFIG_SMP /* * double_rq_lock - safely lock two runqueues */ @@ -667,7 +666,6 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2) double_rq_clock_clear_update(rq1, rq2); } -#endif /* CONFIG_SMP */ /* * __task_rq_lock - lock the rq @p resides on. @@ -949,7 +947,7 @@ static inline void hrtick_rq_init(struct rq *rq) _val; \ }) -#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) +#ifdef TIF_POLLING_NRFLAG /* * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, * this avoids any races wrt polling state changes and thereby avoids @@ -988,13 +986,11 @@ static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif) return true; } -#ifdef CONFIG_SMP static inline bool set_nr_if_polling(struct task_struct *p) { return false; } #endif -#endif static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) { @@ -1167,7 +1163,6 @@ void resched_cpu(int cpu) raw_spin_rq_unlock_irqrestore(rq, flags); } -#ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ_COMMON /* * In the semi idle case, use the nearest busy CPU for migrating timers @@ -1374,10 +1369,8 @@ bool sched_can_stop_tick(struct rq *rq) return true; } #endif /* CONFIG_NO_HZ_FULL */ -#endif /* CONFIG_SMP */ -#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ - (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_FAIR_GROUP_SCHED) /* * Iterate task_group tree rooted at *from, calling @down when first entering a * node and @up when leaving it for the final time. 
@@ -2353,8 +2346,6 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state return ncsw; } -#ifdef CONFIG_SMP - static void __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx); @@ -3305,6 +3296,8 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p) WARN_ON_ONCE(ret); } +#ifdef CONFIG_SMP + void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { unsigned int state = READ_ONCE(p->__state); @@ -3358,6 +3351,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) __set_task_cpu(p, new_cpu); } +#endif /* CONFIG_SMP */ #ifdef CONFIG_NUMA_BALANCING static void __migrate_swap_task(struct task_struct *p, int cpu) @@ -3661,17 +3655,6 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) } } -#else /* !CONFIG_SMP: */ - -static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } - -static inline bool rq_has_pinned_tasks(struct rq *rq) -{ - return false; -} - -#endif /* !CONFIG_SMP */ - static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags) { @@ -3682,7 +3665,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) rq = this_rq(); -#ifdef CONFIG_SMP if (cpu == rq->cpu) { __schedstat_inc(rq->ttwu_local); __schedstat_inc(p->stats.nr_wakeups_local); @@ -3702,7 +3684,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) if (wake_flags & WF_MIGRATED) __schedstat_inc(p->stats.nr_wakeups_migrate); -#endif /* CONFIG_SMP */ __schedstat_inc(rq->ttwu_count); __schedstat_inc(p->stats.nr_wakeups); @@ -3731,13 +3712,11 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, if (p->sched_contributes_to_load) rq->nr_uninterruptible--; -#ifdef CONFIG_SMP if (wake_flags & WF_RQ_SELECTED) en_flags |= ENQUEUE_RQ_SELECTED; if (wake_flags & WF_MIGRATED) en_flags |= ENQUEUE_MIGRATED; else -#endif if (p->in_iowait) { delayacct_blkio_end(p); atomic_dec(&task_rq(p)->nr_iowait); @@ -3748,7 +3727,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, ttwu_do_wakeup(p); -#ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* * Our task @p is fully woken up and running; so it's safe to @@ -3770,7 +3748,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, rq->idle_stamp = 0; } -#endif /* CONFIG_SMP */ } /* @@ -3824,7 +3801,6 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) return ret; } -#ifdef CONFIG_SMP void sched_ttwu_pending(void *arg) { struct llist_node *llist = arg; @@ -3891,7 +3867,9 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); WRITE_ONCE(rq->ttwu_pending, 1); +#ifdef CONFIG_SMP __smp_call_single_queue(cpu, &p->wake_entry.llist); +#endif } void wake_up_if_idle(int cpu) @@ -3992,15 +3970,6 @@ static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) return false; } -#else /* !CONFIG_SMP: */ - -static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) -{ - return false; -} - -#endif /* !CONFIG_SMP */ - static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); @@ -4533,10 +4502,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->capture_control = NULL; #endif init_numa_balancing(clone_flags, p); -#ifdef CONFIG_SMP p->wake_entry.u_flags = CSD_TYPE_TTWU; p->migration_pending = NULL; -#endif init_sched_mm_cid(p); } @@ -4787,14 +4754,11 @@ int sched_fork(unsigned long clone_flags, struct 
task_struct *p) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif -#ifdef CONFIG_SMP p->on_cpu = 0; -#endif init_task_preempt_count(p); -#ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); RB_CLEAR_NODE(&p->pushable_dl_tasks); -#endif + return 0; } @@ -4871,7 +4835,6 @@ void wake_up_new_task(struct task_struct *p) raw_spin_lock_irqsave(&p->pi_lock, rf.flags); WRITE_ONCE(p->__state, TASK_RUNNING); -#ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: * - cpus_ptr can change in the fork path @@ -4883,7 +4846,6 @@ void wake_up_new_task(struct task_struct *p) p->recent_used_cpu = task_cpu(p); rseq_migrate(p); __set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags)); -#endif rq = __task_rq_lock(p, &rf); update_rq_clock(rq); post_init_entity_util_avg(p); @@ -4994,7 +4956,6 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, static inline void prepare_task(struct task_struct *next) { -#ifdef CONFIG_SMP /* * Claim the task as running, we do this before switching to it * such that any running task will have this set. @@ -5003,12 +4964,10 @@ static inline void prepare_task(struct task_struct *next) * its ordering comment. */ WRITE_ONCE(next->on_cpu, 1); -#endif } static inline void finish_task(struct task_struct *prev) { -#ifdef CONFIG_SMP /* * This must be the very last reference to @prev from this CPU. After * p->on_cpu is cleared, the task can be moved to a different CPU. We @@ -5021,11 +4980,8 @@ static inline void finish_task(struct task_struct *prev) * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). */ smp_store_release(&prev->on_cpu, 0); -#endif } -#ifdef CONFIG_SMP - static void do_balance_callbacks(struct rq *rq, struct balance_callback *head) { void (*func)(struct rq *rq); @@ -5107,14 +5063,6 @@ void balance_callbacks(struct rq *rq, struct balance_callback *head) } } -#else /* !CONFIG_SMP: */ - -static inline void __balance_callbacks(struct rq *rq) -{ -} - -#endif /* !CONFIG_SMP */ - static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf) { @@ -5563,7 +5511,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) struct rq *rq; u64 ns; -#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) +#ifdef CONFIG_64BIT /* * 64-bit doesn't need locks to atomically read a 64-bit value. * So we have a optimization chance when the task's delta_exec is 0. @@ -5690,12 +5638,10 @@ void sched_tick(void) if (donor->flags & PF_WQ_WORKER) wq_worker_tick(donor); -#ifdef CONFIG_SMP if (!scx_switched_all()) { rq->idle_balance = idle_cpu(cpu); sched_balance_trigger(rq); } -#endif } #ifdef CONFIG_NO_HZ_FULL @@ -7819,12 +7765,10 @@ void show_state_filter(unsigned int state_filter) */ void __init init_idle(struct task_struct *idle, int cpu) { -#ifdef CONFIG_SMP struct affinity_context ac = (struct affinity_context) { .new_mask = cpumask_of(cpu), .flags = 0, }; -#endif struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -7840,13 +7784,11 @@ void __init init_idle(struct task_struct *idle, int cpu) idle->flags |= PF_KTHREAD | PF_NO_SETAFFINITY; kthread_set_per_cpu(idle, cpu); -#ifdef CONFIG_SMP /* * No validation and serialization required at boot time and for * setting up the idle tasks of not yet online CPUs. 
*/ set_cpus_allowed_common(idle, &ac); -#endif /* * We're having a chicken and egg problem, even though we are * holding rq->lock, the CPU isn't yet set to this CPU so the @@ -7865,9 +7807,7 @@ void __init init_idle(struct task_struct *idle, int cpu) rq_set_donor(rq, idle); rcu_assign_pointer(rq->curr, idle); idle->on_rq = TASK_ON_RQ_QUEUED; -#ifdef CONFIG_SMP idle->on_cpu = 1; -#endif raw_spin_rq_unlock(rq); raw_spin_unlock_irqrestore(&idle->pi_lock, flags); @@ -7880,13 +7820,9 @@ void __init init_idle(struct task_struct *idle, int cpu) idle->sched_class = &idle_sched_class; ftrace_graph_init_idle_task(idle, cpu); vtime_init_idle(idle, cpu); -#ifdef CONFIG_SMP sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); -#endif } -#ifdef CONFIG_SMP - int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial) { @@ -8480,13 +8416,6 @@ static int __init migration_init(void) } early_initcall(migration_init); -#else /* !CONFIG_SMP: */ -void __init sched_init_smp(void) -{ - sched_init_granularity(); -} -#endif /* !CONFIG_SMP */ - int in_sched_functions(unsigned long addr) { return in_lock_functions(addr) || @@ -8512,9 +8441,7 @@ void __init sched_init(void) int i; /* Make sure the linker didn't screw up */ -#ifdef CONFIG_SMP BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class)); -#endif BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class)); BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class)); BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class)); @@ -8557,9 +8484,7 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ } -#ifdef CONFIG_SMP init_defrootdomain(); -#endif #ifdef CONFIG_RT_GROUP_SCHED init_rt_bandwidth(&root_task_group.rt_bandwidth, @@ -8620,7 +8545,6 @@ void __init sched_init(void) rq->rt.rt_runtime = global_rt_runtime(); init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif -#ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; rq->cpu_capacity = SCHED_CAPACITY_SCALE; @@ -8637,16 +8561,15 @@ void __init sched_init(void) INIT_LIST_HEAD(&rq->cfs_tasks); rq_attach_root(rq, &def_root_domain); -# ifdef CONFIG_NO_HZ_COMMON +#ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; atomic_set(&rq->nohz_flags, 0); INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq); -# endif -# ifdef CONFIG_HOTPLUG_CPU +#endif +#ifdef CONFIG_HOTPLUG_CPU rcuwait_init(&rq->hotplug_wait); -# endif -#endif /* CONFIG_SMP */ +#endif hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); fair_server_init(rq); @@ -8696,8 +8619,9 @@ void __init sched_init(void) #ifdef CONFIG_SMP idle_thread_set_boot_cpu(); - balance_push_set(smp_processor_id(), false); #endif + + balance_push_set(smp_processor_id(), false); init_sched_fair_class(); init_sched_ext_class(); diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 3f7c73d1d189..11c0f1faa7e1 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -17,7 +17,6 @@ struct cpudl { struct cpudl_item *elements; }; -#ifdef CONFIG_SMP int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); void cpudl_set(struct cpudl *cp, int cpu, u64 dl); void cpudl_clear(struct cpudl *cp, int cpu); @@ -25,4 +24,3 @@ int cpudl_init(struct cpudl *cp); void cpudl_set_freecpu(struct cpudl *cp, int cpu); void cpudl_clear_freecpu(struct cpudl *cp, int cpu); void cpudl_cleanup(struct cpudl *cp); -#endif /* CONFIG_SMP */ diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index 24add19625ff..6f562088c056 100644 --- a/kernel/sched/cpupri.h +++ 
b/kernel/sched/cpupri.h @@ -20,7 +20,6 @@ struct cpupri { int *cpu_to_pri; }; -#ifdef CONFIG_SMP int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, @@ -29,4 +28,3 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, void cpupri_set(struct cpupri *cp, int cpu, int pri); int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); -#endif /* CONFIG_SMP */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a2cea68b2198..bf9b70a3ff95 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -115,7 +115,6 @@ static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) } #endif /* !CONFIG_RT_MUTEXES */ -#ifdef CONFIG_SMP static inline struct dl_bw *dl_bw_of(int i) { RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), @@ -195,35 +194,6 @@ void __dl_update(struct dl_bw *dl_b, s64 bw) rq->dl.extra_bw += bw; } } -#else /* !CONFIG_SMP: */ -static inline struct dl_bw *dl_bw_of(int i) -{ - return &cpu_rq(i)->dl.dl_bw; -} - -static inline int dl_bw_cpus(int i) -{ - return 1; -} - -static inline unsigned long dl_bw_capacity(int i) -{ - return SCHED_CAPACITY_SCALE; -} - -bool dl_bw_visited(int cpu, u64 cookie) -{ - return false; -} - -static inline -void __dl_update(struct dl_bw *dl_b, s64 bw) -{ - struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw); - - dl->extra_bw += bw; -} -#endif /* !CONFIG_SMP */ static inline void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus) @@ -556,23 +526,17 @@ void init_dl_rq(struct dl_rq *dl_rq) { dl_rq->root = RB_ROOT_CACHED; -#ifdef CONFIG_SMP /* zero means no -deadline tasks */ dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0; dl_rq->overloaded = 0; dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED; -#else - init_dl_bw(&dl_rq->dl_bw); -#endif dl_rq->running_bw = 0; dl_rq->this_bw = 0; init_dl_rq_bw_ratio(dl_rq); } -#ifdef CONFIG_SMP - static inline int dl_overloaded(struct rq *rq) { return atomic_read(&rq->rd->dlo_count); @@ -757,37 +721,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p return later_rq; } -#else /* !CONFIG_SMP: */ - -static inline -void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) -{ -} - -static inline -void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) -{ -} - -static inline -void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) -{ -} - -static inline -void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) -{ -} - -static inline void deadline_queue_push_tasks(struct rq *rq) -{ -} - -static inline void deadline_queue_pull_task(struct rq *rq) -{ -} -#endif /* !CONFIG_SMP */ - static void enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags); static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); @@ -1199,7 +1132,6 @@ static int start_dl_timer(struct sched_dl_entity *dl_se) static void __push_dl_task(struct rq *rq, struct rq_flags *rf) { -#ifdef CONFIG_SMP /* * Queueing this task back might have overloaded rq, check if we need * to kick someone away. 
@@ -1213,7 +1145,6 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf) push_dl_task(rq); rq_repin_lock(rq, rf); } -#endif /* CONFIG_SMP */ } /* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */ @@ -1343,7 +1274,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) goto unlock; } -#ifdef CONFIG_SMP if (unlikely(!rq->online)) { /* * If the runqueue is no longer available, migrate the @@ -1360,7 +1290,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * there. */ } -#endif /* CONFIG_SMP */ enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); if (dl_task(rq->donor)) @@ -1848,8 +1777,6 @@ static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) #define __node_2_dle(node) \ rb_entry((node), struct sched_dl_entity, rb_node) -#ifdef CONFIG_SMP - static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) { struct rq *rq = rq_of_dl_rq(dl_rq); @@ -1885,13 +1812,6 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) } } -#else /* !CONFIG_SMP: */ - -static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} -static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} - -#endif /* !CONFIG_SMP */ - static inline void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { @@ -2218,8 +2138,6 @@ static void yield_task_dl(struct rq *rq) rq_clock_skip_update(rq); } -#ifdef CONFIG_SMP - static inline bool dl_task_is_earliest_deadline(struct task_struct *p, struct rq *rq) { @@ -2349,7 +2267,6 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) return sched_stop_runnable(rq) || sched_dl_runnable(rq); } -#endif /* CONFIG_SMP */ /* * Only called when both the current and waking task are -deadline @@ -2363,7 +2280,6 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, return; } -#ifdef CONFIG_SMP /* * In the unlikely case current and p have the same deadline * let us try to decide what's the best thing to do... 
@@ -2371,7 +2287,6 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, if ((p->dl.deadline == rq->donor->dl.deadline) && !test_tsk_need_resched(rq->curr)) check_preempt_equal_dl(rq, p); -#endif /* CONFIG_SMP */ } #ifdef CONFIG_SCHED_HRTICK @@ -2504,8 +2419,6 @@ static void task_fork_dl(struct task_struct *p) */ } -#ifdef CONFIG_SMP - /* Only try algorithms three times */ #define DL_MAX_TRIES 3 @@ -2999,8 +2912,6 @@ void dl_clear_root_domain_cpu(int cpu) dl_clear_root_domain(cpu_rq(cpu)->rd); } -#endif /* CONFIG_SMP */ - static void switched_from_dl(struct rq *rq, struct task_struct *p) { /* @@ -3073,10 +2984,8 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) } if (rq->donor != p) { -#ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) deadline_queue_push_tasks(rq); -#endif if (dl_task(rq->donor)) wakeup_preempt_dl(rq, p, 0); else @@ -3153,7 +3062,6 @@ DEFINE_SCHED_CLASS(dl) = { .put_prev_task = put_prev_task_dl, .set_next_task = set_next_task_dl, -#ifdef CONFIG_SMP .balance = balance_dl, .select_task_rq = select_task_rq_dl, .migrate_task_rq = migrate_task_rq_dl, @@ -3162,7 +3070,6 @@ DEFINE_SCHED_CLASS(dl) = { .rq_offline = rq_offline_dl, .task_woken = task_woken_dl, .find_lock_rq = find_lock_later_rq, -#endif /* CONFIG_SMP */ .task_tick = task_tick_dl, .task_fork = task_fork_dl, @@ -3462,7 +3369,6 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) return false; } -#ifdef CONFIG_SMP int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial) { @@ -3574,7 +3480,6 @@ void dl_bw_free(int cpu, u64 dl_bw) { dl_bw_manage(dl_bw_req_free, cpu, dl_bw); } -#endif /* CONFIG_SMP */ void print_dl_stats(struct seq_file *m, int cpu) { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 748709c03214..04c0354f05e4 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -286,7 +286,6 @@ static const struct file_operations sched_dynamic_fops = { __read_mostly bool sched_debug_verbose; -#ifdef CONFIG_SMP static struct dentry *sd_dentry; @@ -314,9 +313,6 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf, return result; } -#else /* !CONFIG_SMP: */ -# define sched_verbose_write debugfs_write_file_bool -#endif /* !CONFIG_SMP */ static const struct file_operations sched_verbose_fops = { .read = debugfs_read_file_bool, @@ -543,8 +539,6 @@ static __init int sched_init_debug(void) } late_initcall(sched_init_debug); -#ifdef CONFIG_SMP - static cpumask_var_t sd_sysctl_cpus; static int sd_flags_show(struct seq_file *m, void *v) @@ -655,8 +649,6 @@ void dirty_sched_domain_sysctl(int cpu) __cpumask_set_cpu(cpu, sd_sysctl_cpus); } -#endif /* CONFIG_SMP */ - #ifdef CONFIG_FAIR_GROUP_SCHED static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) { @@ -932,11 +924,7 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(dl_rq->x)) PU(dl_nr_running); -#ifdef CONFIG_SMP dl_bw = &cpu_rq(cpu)->rd->dl_bw; -#else - dl_bw = &dl_rq->dl_bw; -#endif SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw); SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1fabbe01bf93..6b17d3da034a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -88,7 +88,6 @@ static int __init setup_sched_thermal_decay_shift(char *str) } __setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift); -#ifdef 
CONFIG_SMP /* * For asym packing, by default the lower numbered CPU has higher priority. */ @@ -111,7 +110,6 @@ int __weak arch_asym_cpu_priority(int cpu) * (default: ~5%) */ #define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078) -#endif /* CONFIG_SMP */ #ifdef CONFIG_CFS_BANDWIDTH /* @@ -996,7 +994,6 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) /************************************************************** * Scheduling class statistics methods: */ -#ifdef CONFIG_SMP int sched_update_scaling(void) { unsigned int factor = get_update_sysctl_factor(); @@ -1008,7 +1005,6 @@ int sched_update_scaling(void) return 0; } -#endif /* CONFIG_SMP */ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); @@ -1042,8 +1038,6 @@ static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) #include "pelt.h" -#ifdef CONFIG_SMP - static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); static unsigned long capacity_of(int cpu); @@ -1132,18 +1126,6 @@ void post_init_entity_util_avg(struct task_struct *p) sa->runnable_avg = sa->util_avg; } -#else /* !CONFIG_SMP: */ -void init_entity_runnable_average(struct sched_entity *se) -{ -} -void post_init_entity_util_avg(struct task_struct *p) -{ -} -static void update_tg_load_avg(struct cfs_rq *cfs_rq) -{ -} -#endif /* !CONFIG_SMP */ - static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) { u64 now = rq_clock_task(rq); @@ -3698,14 +3680,12 @@ static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); -#ifdef CONFIG_SMP if (entity_is_task(se)) { struct rq *rq = rq_of(cfs_rq); account_numa_enqueue(rq, task_of(se)); list_add(&se->group_node, &rq->cfs_tasks); } -#endif cfs_rq->nr_queued++; } @@ -3713,12 +3693,10 @@ static void account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_sub(&cfs_rq->load, se->load.weight); -#ifdef CONFIG_SMP if (entity_is_task(se)) { account_numa_dequeue(rq_of(cfs_rq), task_of(se)); list_del_init(&se->group_node); } -#endif cfs_rq->nr_queued--; } @@ -3770,7 +3748,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) *ptr -= min_t(typeof(*ptr), *ptr, _val); \ } while (0) -#ifdef CONFIG_SMP static inline void enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -3787,12 +3764,6 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); } -#else /* !CONFIG_SMP: */ -static inline void -enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } -static inline void -dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } -#endif /* !CONFIG_SMP */ static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); @@ -3824,13 +3795,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, update_load_set(&se->load, weight); -#ifdef CONFIG_SMP do { u32 divider = get_pelt_divider(&se->avg); se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); } while (0); -#endif enqueue_load_avg(cfs_rq, se); if (se->on_rq) { @@ -3865,7 +3834,6 @@ static void reweight_task_fair(struct rq *rq, struct task_struct *p, static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); #ifdef CONFIG_FAIR_GROUP_SCHED -#ifdef CONFIG_SMP /* * All this does is approximate the hierarchical proportion 
which includes that * global sum we all love to hate. @@ -3972,7 +3940,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq) */ return clamp_t(long, shares, MIN_SHARES, tg_shares); } -#endif /* CONFIG_SMP */ /* * Recomputes the group entity based on the current state of its group @@ -3993,11 +3960,7 @@ static void update_cfs_group(struct sched_entity *se) if (throttled_hierarchy(gcfs_rq)) return; -#ifndef CONFIG_SMP - shares = READ_ONCE(gcfs_rq->tg->shares); -#else shares = calc_group_shares(gcfs_rq); -#endif if (unlikely(se->load.weight != shares)) reweight_entity(cfs_rq_of(se), se, shares); } @@ -4031,7 +3994,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) } } -#ifdef CONFIG_SMP static inline bool load_avg_is_decayed(struct sched_avg *sa) { if (sa->load_sum) @@ -5146,48 +5108,6 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); } -#else /* !CONFIG_SMP: */ - -static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) -{ - return !cfs_rq->nr_queued; -} - -#define UPDATE_TG 0x0 -#define SKIP_AGE_LOAD 0x0 -#define DO_ATTACH 0x0 -#define DO_DETACH 0x0 - -static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) -{ - cfs_rq_util_change(cfs_rq, 0); -} - -static inline void remove_entity_load_avg(struct sched_entity *se) {} - -static inline void -attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} -static inline void -detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} - -static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf) -{ - return 0; -} - -static inline void -util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} - -static inline void -util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {} - -static inline void -util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, - bool task_sleep) {} -static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} - -#endif /* !CONFIG_SMP */ - void __setparam_fair(struct task_struct *p, const struct sched_attr *attr) { struct sched_entity *se = &p->se; @@ -6090,7 +6010,6 @@ unthrottle_throttle: resched_curr(rq); } -#ifdef CONFIG_SMP static void __cfsb_csd_unthrottle(void *arg) { struct cfs_rq *cursor, *tmp; @@ -6149,12 +6068,6 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) if (first) smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd); } -#else /* !CONFIG_SMP: */ -static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) -{ - unthrottle_cfs_rq(cfs_rq); -} -#endif /* !CONFIG_SMP */ static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) { @@ -6610,7 +6523,6 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) * guaranteed at this point that no additional cfs_rq of this group can * join a CSD list. 
*/ -#ifdef CONFIG_SMP for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); unsigned long flags; @@ -6622,7 +6534,6 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) __cfsb_csd_unthrottle(rq); local_irq_restore(flags); } -#endif } /* @@ -6835,7 +6746,6 @@ static inline void hrtick_update(struct rq *rq) } #endif /* !CONFIG_SCHED_HRTICK */ -#ifdef CONFIG_SMP static inline bool cpu_overutilized(int cpu) { unsigned long rq_util_min, rq_util_max; @@ -6877,9 +6787,6 @@ static inline void check_update_overutilized_status(struct rq *rq) if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu)) set_rd_overutilized(rq->rd, 1); } -#else /* !CONFIG_SMP: */ -static inline void check_update_overutilized_status(struct rq *rq) { } -#endif /* !CONFIG_SMP */ /* Runqueue only has SCHED_IDLE tasks enqueued */ static int sched_idle_rq(struct rq *rq) @@ -6888,12 +6795,10 @@ static int sched_idle_rq(struct rq *rq) rq->nr_running); } -#ifdef CONFIG_SMP static int sched_idle_cpu(int cpu) { return sched_idle_rq(cpu_rq(cpu)); } -#endif static void requeue_delayed_entity(struct sched_entity *se) @@ -7208,8 +7113,6 @@ static inline unsigned int cfs_h_nr_delayed(struct rq *rq) return (rq->cfs.h_nr_queued - rq->cfs.h_nr_runnable); } -#ifdef CONFIG_SMP - /* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */ static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); @@ -8745,9 +8648,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return sched_balance_newidle(rq, rf) != 0; } -#else /* !CONFIG_SMP: */ -static inline void set_task_max_allowed_capacity(struct task_struct *p) {} -#endif /* !CONFIG_SMP */ static void set_next_buddy(struct sched_entity *se) { @@ -9057,7 +8957,6 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) return true; } -#ifdef CONFIG_SMP /************************************************** * Fair scheduling class load-balancing methods. * @@ -12980,8 +12879,6 @@ static void rq_offline_fair(struct rq *rq) clear_tg_offline_cfs_rqs(rq); } -#endif /* CONFIG_SMP */ - #ifdef CONFIG_SCHED_CORE static inline bool __entity_slice_used(struct sched_entity *se, int min_nr_tasks) @@ -13209,7 +13106,6 @@ static void detach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); -#ifdef CONFIG_SMP /* * In case the task sched_avg hasn't been attached: * - A forked task which hasn't been woken up by wake_up_new_task(). 
@@ -13218,7 +13114,6 @@ static void detach_entity_cfs_rq(struct sched_entity *se) */ if (!se->avg.last_update_time) return; -#endif /* Catch up with the cfs_rq and remove our load when we leave */ update_load_avg(cfs_rq, se, 0); @@ -13282,7 +13177,6 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs { struct sched_entity *se = &p->se; -#ifdef CONFIG_SMP if (task_on_rq_queued(p)) { /* * Move the next running task to the front of the list, so our @@ -13290,7 +13184,6 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs */ list_move(&se->group_node, &rq->cfs_tasks); } -#endif if (!first) return; @@ -13328,9 +13221,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT_CACHED; cfs_rq->min_vruntime = (u64)(-(1LL << 20)); -#ifdef CONFIG_SMP raw_spin_lock_init(&cfs_rq->removed.lock); -#endif } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -13345,10 +13236,8 @@ static void task_change_group_fair(struct task_struct *p) detach_task_cfs_rq(p); -#ifdef CONFIG_SMP /* Tell se's cfs_rq has been changed -- migrated */ p->se.avg.last_update_time = 0; -#endif set_task_rq(p, task_cpu(p)); attach_task_cfs_rq(p); } @@ -13644,7 +13533,6 @@ DEFINE_SCHED_CLASS(fair) = { .put_prev_task = put_prev_task_fair, .set_next_task = set_next_task_fair, -#ifdef CONFIG_SMP .balance = balance_fair, .select_task_rq = select_task_rq_fair, .migrate_task_rq = migrate_task_rq_fair, @@ -13654,7 +13542,6 @@ DEFINE_SCHED_CLASS(fair) = { .task_dead = task_dead_fair, .set_cpus_allowed = set_cpus_allowed_fair, -#endif .task_tick = task_tick_fair, .task_fork = task_fork_fair, @@ -13717,7 +13604,6 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) __init void init_sched_fair_class(void) { -#ifdef CONFIG_SMP int i; for_each_possible_cpu(i) { @@ -13739,5 +13625,4 @@ __init void init_sched_fair_class(void) nohz.next_blocked = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); #endif -#endif /* CONFIG_SMP */ } diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index a5d4933e6b70..62c3fa543c0f 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -3,7 +3,6 @@ #define _KERNEL_SCHED_PELT_H #include "sched.h" -#ifdef CONFIG_SMP #include "sched-pelt.h" int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); @@ -187,55 +186,4 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) } #endif /* !CONFIG_CFS_BANDWIDTH */ -#else /* !CONFIG_SMP: */ - -static inline int -update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) -{ - return 0; -} - -static inline int -update_rt_rq_load_avg(u64 now, struct rq *rq, int running) -{ - return 0; -} - -static inline int -update_dl_rq_load_avg(u64 now, struct rq *rq, int running) -{ - return 0; -} - -static inline int -update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) -{ - return 0; -} - -static inline u64 hw_load_avg(struct rq *rq) -{ - return 0; -} - -static inline int -update_irq_load_avg(struct rq *rq, u64 running) -{ - return 0; -} - -static inline u64 rq_clock_pelt(struct rq *rq) -{ - return rq_clock_task(rq); -} - -static inline void -update_rq_clock_pelt(struct rq *rq, s64 delta) { } - -static inline void -update_idle_rq_clock_pelt(struct rq *rq) { } - -static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { } -#endif /* !CONFIG_SMP */ - #endif /* _KERNEL_SCHED_PELT_H */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 7e8ed05d302a..ab211706b160 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2454,7 +2454,11 @@ void __init 
init_sched_rt_class(void) GFP_KERNEL, cpu_to_node(i)); } } -#endif /* CONFIG_SMP */ +#else /* !CONFIG_SMP: */ +void __init init_sched_rt_class(void) +{ +} +#endif /* !CONFIG_SMP */ /* * When switching a task to RT, we may overload the runqueue diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2bf804b8c89b..7a7ebc2a3675 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -443,14 +443,12 @@ struct task_group { /* runqueue "owned" by this group on each CPU */ struct cfs_rq **cfs_rq; unsigned long shares; -#ifdef CONFIG_SMP /* * load_avg can be heavily contended at clock tick time, so put * it in its own cache-line separated from the fields above which * will also be accessed at each tick. */ atomic_long_t load_avg ____cacheline_aligned; -#endif /* CONFIG_SMP */ #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED @@ -574,13 +572,8 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); extern int sched_group_set_idle(struct task_group *tg, long idle); -#ifdef CONFIG_SMP extern void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next); -#else /* !CONFIG_SMP: */ -static inline void set_task_rq_fair(struct sched_entity *se, - struct cfs_rq *prev, struct cfs_rq *next) { } -#endif /* !CONFIG_SMP */ #else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) { return 0; } static inline int sched_group_set_idle(struct task_group *tg, long idle) { return 0; } @@ -668,7 +661,6 @@ struct cfs_rq { struct sched_entity *curr; struct sched_entity *next; -#ifdef CONFIG_SMP /* * CFS load tracking */ @@ -700,7 +692,6 @@ struct cfs_rq { u64 last_h_load_update; struct sched_entity *h_load_next; #endif /* CONFIG_FAIR_GROUP_SCHED */ -#endif /* CONFIG_SMP */ #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */ @@ -797,14 +788,10 @@ struct rt_rq { struct rt_prio_array active; unsigned int rt_nr_running; unsigned int rr_nr_running; -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED struct { int curr; /* highest queued rt task prio */ -#ifdef CONFIG_SMP int next; /* next highest */ -#endif } highest_prio; -#endif #ifdef CONFIG_SMP bool overloaded; struct plist_head pushable_tasks; @@ -840,7 +827,6 @@ struct dl_rq { unsigned int dl_nr_running; -#ifdef CONFIG_SMP /* * Deadline values of the currently executing and the * earliest ready task on this rq. Caching these facilitates @@ -860,9 +846,7 @@ struct dl_rq { * of the leftmost (earliest deadline) element. */ struct rb_root_cached pushable_dl_tasks_root; -#else /* !CONFIG_SMP: */ - struct dl_bw dl_bw; -#endif /* !CONFIG_SMP */ + /* * "Active utilization" for this runqueue: increased when a * task wakes up (becomes TASK_RUNNING) and decreased when a @@ -933,7 +917,6 @@ static inline long se_runnable(struct sched_entity *se) #endif /* !CONFIG_FAIR_GROUP_SCHED */ -#ifdef CONFIG_SMP /* * XXX we want to get rid of these helpers and use the full load resolution. 
*/ @@ -1044,7 +1027,6 @@ static inline void set_rd_overloaded(struct root_domain *rd, int status) #ifdef HAVE_RT_PUSH_IPI extern void rto_push_irq_work_func(struct irq_work *work); #endif -#endif /* CONFIG_SMP */ #ifdef CONFIG_UCLAMP_TASK /* @@ -1108,18 +1090,14 @@ struct rq { unsigned int numa_migrate_on; #endif #ifdef CONFIG_NO_HZ_COMMON -#ifdef CONFIG_SMP unsigned long last_blocked_load_update_tick; unsigned int has_blocked_load; call_single_data_t nohz_csd; -#endif /* CONFIG_SMP */ unsigned int nohz_tick_stopped; atomic_t nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ -#ifdef CONFIG_SMP unsigned int ttwu_pending; -#endif u64 nr_switches; #ifdef CONFIG_UCLAMP_TASK @@ -1184,7 +1162,6 @@ struct rq { int membarrier_state; #endif -#ifdef CONFIG_SMP struct root_domain *rd; struct sched_domain __rcu *sd; @@ -1225,7 +1202,6 @@ struct rq { #ifdef CONFIG_HOTPLUG_CPU struct rcuwait hotplug_wait; #endif -#endif /* CONFIG_SMP */ #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; @@ -1272,9 +1248,7 @@ struct rq { struct cpuidle_state *idle_state; #endif -#ifdef CONFIG_SMP unsigned int nr_pinned; -#endif unsigned int push_busy; struct cpu_stop_work push_work; @@ -1300,7 +1274,7 @@ struct rq { /* Scratch cpumask to be temporarily used under rq_lock */ cpumask_var_t scratch_mask; -#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP) +#ifdef CONFIG_CFS_BANDWIDTH call_single_data_t cfsb_csd; struct list_head cfsb_csd_list; #endif @@ -1963,8 +1937,6 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p) #endif /* !CONFIG_NUMA_BALANCING */ -#ifdef CONFIG_SMP - static inline void queue_balance_callback(struct rq *rq, struct balance_callback *head, @@ -2130,8 +2102,6 @@ static inline const struct cpumask *task_user_cpus(struct task_struct *p) return p->user_cpus_ptr; } -#endif /* CONFIG_SMP */ - #ifdef CONFIG_CGROUP_SCHED /* @@ -2418,7 +2388,6 @@ struct sched_class { void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next); void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); -#ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); void (*migrate_task_rq)(struct task_struct *p, int new_cpu); @@ -2431,7 +2400,6 @@ struct sched_class { void (*rq_offline)(struct rq *rq); struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); -#endif /* CONFIG_SMP */ void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); void (*task_fork)(struct task_struct *p); @@ -2583,8 +2551,6 @@ extern struct task_struct *pick_task_idle(struct rq *rq); #define SCA_MIGRATE_ENABLE 0x04 #define SCA_USER 0x08 -#ifdef CONFIG_SMP - extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void sched_balance_trigger(struct rq *rq); @@ -2636,26 +2602,6 @@ static inline struct task_struct *get_push_task(struct rq *rq) extern int push_cpu_stop(void *arg); -#else /* !CONFIG_SMP: */ - -static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu) -{ - return true; -} - -static inline int __set_cpus_allowed_ptr(struct task_struct *p, - struct affinity_context *ctx) -{ - return set_cpus_allowed_ptr(p, ctx->new_mask); -} - -static inline cpumask_t *alloc_user_cpus_ptr(int node) -{ - return NULL; -} - -#endif /* !CONFIG_SMP */ - #ifdef CONFIG_CPU_IDLE static inline void idle_set_state(struct rq *rq, @@ -2932,8 +2878,6 @@ static inline class_##name##_t class_##name##_constructor(type *lock, type *lock { class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \ _lock; return _t; } -#ifdef 
CONFIG_SMP - static inline bool rq_order_less(struct rq *rq1, struct rq *rq2) { #ifdef CONFIG_SCHED_CORE @@ -3093,42 +3037,6 @@ extern void set_rq_offline(struct rq *rq); extern bool sched_smp_initialized; -#else /* !CONFIG_SMP: */ - -/* - * double_rq_lock - safely lock two runqueues - * - * Note this does not disable interrupts like task_rq_lock, - * you need to do so manually before calling. - */ -static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) - __acquires(rq1->lock) - __acquires(rq2->lock) -{ - WARN_ON_ONCE(!irqs_disabled()); - WARN_ON_ONCE(rq1 != rq2); - raw_spin_rq_lock(rq1); - __acquire(rq2->lock); /* Fake it out ;) */ - double_rq_clock_clear_update(rq1, rq2); -} - -/* - * double_rq_unlock - safely unlock two runqueues - * - * Note this does not restore interrupts like task_rq_unlock, - * you need to do so manually after calling. - */ -static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) - __releases(rq1->lock) - __releases(rq2->lock) -{ - WARN_ON_ONCE(rq1 != rq2); - raw_spin_rq_unlock(rq1); - __release(rq2->lock); -} - -#endif /* !CONFIG_SMP */ - DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq, double_rq_lock(_T->lock, _T->lock2), double_rq_unlock(_T->lock, _T->lock2)) @@ -3187,7 +3095,7 @@ extern void nohz_balance_exit_idle(struct rq *rq); static inline void nohz_balance_exit_idle(struct rq *rq) { } #endif /* !CONFIG_NO_HZ_COMMON */ -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +#ifdef CONFIG_NO_HZ_COMMON extern void nohz_run_idle_balance(int cpu); #else static inline void nohz_run_idle_balance(int cpu) { } @@ -3313,8 +3221,6 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { } # define arch_scale_freq_invariant() false #endif -#ifdef CONFIG_SMP - unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, unsigned long *min, unsigned long *max); @@ -3358,10 +3264,6 @@ static inline unsigned long cpu_util_rt(struct rq *rq) return READ_ONCE(rq->avg_rt.util_avg); } -#else /* !CONFIG_SMP: */ -static inline bool update_other_load_avgs(struct rq *rq) { return false; } -#endif /* !CONFIG_SMP */ - #ifdef CONFIG_UCLAMP_TASK unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); @@ -3580,7 +3482,6 @@ static inline void membarrier_switch_mm(struct rq *rq, #endif /* !CONFIG_MEMBARRIER */ -#ifdef CONFIG_SMP static inline bool is_per_cpu_kthread(struct task_struct *p) { if (!(p->flags & PF_KTHREAD)) @@ -3591,7 +3492,6 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) return true; } -#endif /* CONFIG_SMP */ extern void swake_up_all_locked(struct swait_queue_head *q); extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); @@ -3890,7 +3790,6 @@ static inline void init_sched_mm_cid(struct task_struct *t) { } extern u64 avg_vruntime(struct cfs_rq *cfs_rq); extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); -#ifdef CONFIG_SMP static inline void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_struct *task) { @@ -3911,7 +3810,6 @@ bool task_is_pushable(struct rq *rq, struct task_struct *p, int cpu) return false; } -#endif /* CONFIG_SMP */ #ifdef CONFIG_RT_MUTEXES @@ -3952,21 +3850,8 @@ extern void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, int oldprio); -#ifdef CONFIG_SMP extern struct balance_callback *splice_balance_callbacks(struct rq *rq); extern void balance_callbacks(struct rq *rq, struct balance_callback *head); -#else /* !CONFIG_SMP: */ - -static 
inline struct balance_callback *splice_balance_callbacks(struct rq *rq) -{ - return NULL; -} - -static inline void balance_callbacks(struct rq *rq, struct balance_callback *head) -{ -} - -#endif /* !CONFIG_SMP */ #ifdef CONFIG_SCHED_CLASS_EXT /* diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 5cb5e9487f0d..d7fccf871c7d 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -1119,7 +1119,6 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, return copy_struct_to_user(uattr, usize, &kattr, sizeof(kattr), NULL); } -#ifdef CONFIG_SMP int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) { /* @@ -1148,7 +1147,6 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) return 0; } -#endif /* CONFIG_SMP */ int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx) { diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index ee347d9c5df4..f2c10167f2bc 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -17,8 +17,6 @@ void sched_domains_mutex_unlock(void) mutex_unlock(&sched_domains_mutex); } -#ifdef CONFIG_SMP - /* Protected by sched_domains_mutex: */ static cpumask_var_t sched_domains_tmpmask; static cpumask_var_t sched_domains_tmpmask2; @@ -1322,11 +1320,10 @@ next: update_group_capacity(sd, cpu); } -#ifdef CONFIG_SMP - /* Update the "asym_prefer_cpu" when arch_asym_cpu_priority() changes. */ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) { +#ifdef CONFIG_SMP int asym_prefer_cpu = cpu; struct sched_domain *sd; @@ -1376,9 +1373,8 @@ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) WRITE_ONCE(sg->asym_prefer_cpu, asym_prefer_cpu); } -} - #endif /* CONFIG_SMP */ +} /* * Set of available CPUs grouped by their corresponding capacities @@ -2844,5 +2840,3 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); sched_domains_mutex_unlock(); } - -#endif /* CONFIG_SMP */ -- cgit v1.2.3 From a1416303d108befb4e2b62c863ae1defe4eb1ba3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:03 +0200 Subject: sched/smp: Always define rq->hrtick_csd Simplify the scheduler by making CONFIG_SMP=y data structure of rq->hrtick_csd unconditional. Adjust hrtick_start() accordingly, which was split due to the ::hrtick_csd asymmetry and use the SMP version there too. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-23-mingo@kernel.org --- kernel/sched/core.c | 23 ----------------------- kernel/sched/sched.h | 2 -- 2 files changed, 25 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9fc44f4b779a..c155ee0bd7a8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -851,8 +851,6 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) return HRTIMER_NORESTART; } -#ifdef CONFIG_SMP - static void __hrtick_restart(struct rq *rq) { struct hrtimer *timer = &rq->hrtick_timer; @@ -897,30 +895,9 @@ void hrtick_start(struct rq *rq, u64 delay) smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); } -#else /* !CONFIG_SMP: */ -/* - * Called to set the hrtick timer state. 
- * - * called with rq->lock held and IRQs disabled - */ -void hrtick_start(struct rq *rq, u64 delay) -{ - /* - * Don't schedule slices shorter than 10000ns, that just - * doesn't make sense. Rely on vruntime for fairness. - */ - delay = max_t(u64, delay, 10000LL); - hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), - HRTIMER_MODE_REL_PINNED_HARD); -} - -#endif /* !CONFIG_SMP */ - static void hrtick_rq_init(struct rq *rq) { -#ifdef CONFIG_SMP INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); -#endif hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } #else /* !CONFIG_SCHED_HRTICK: */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7a7ebc2a3675..3e7151ed5e5e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1219,9 +1219,7 @@ struct rq { long calc_load_active; #ifdef CONFIG_SCHED_HRTICK -#ifdef CONFIG_SMP call_single_data_t hrtick_csd; -#endif struct hrtimer hrtick_timer; ktime_t hrtick_time; #endif -- cgit v1.2.3 From d0a0a055a58617ac8dd9418f08346e94310f1096 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:04 +0200 Subject: sched/smp: Use the SMP version of try_to_wake_up() Simplify the scheduler by making CONFIG_SMP=y logic within try_to_wake_up() unconditional. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-24-mingo@kernel.org --- kernel/sched/core.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c155ee0bd7a8..d20fea6a380e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4202,7 +4202,6 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) break; -#ifdef CONFIG_SMP /* * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be * possible to, falsely, observe p->on_cpu == 0. @@ -4281,9 +4280,6 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) psi_ttwu_dequeue(p); set_task_cpu(p, cpu); } -#else /* !CONFIG_SMP: */ - cpu = task_cpu(p); -#endif /* !CONFIG_SMP */ ttwu_queue(p, cpu, wake_flags); } -- cgit v1.2.3 From 8039addbe5a56e4108b1bea57aa5b3f70750fed9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:05 +0200 Subject: sched/smp: Use the SMP version of __task_needs_rq_lock() Simplify the scheduler by making CONFIG_SMP=y code in __task_needs_rq_lock() unconditional. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-25-mingo@kernel.org --- kernel/sched/core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d20fea6a380e..82b7cdb353d8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4312,14 +4312,12 @@ static bool __task_needs_rq_lock(struct task_struct *p) if (p->on_rq) return true; -#ifdef CONFIG_SMP /* * Ensure the task has finished __schedule() and will not be referenced * anymore. Again, see try_to_wake_up() for a longer comment. 
*/ smp_rmb(); smp_cond_load_acquire(&p->on_cpu, !VAL); -#endif return false; } -- cgit v1.2.3 From 588467616c88b12657f6ebe7306d830c272fe054 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:06 +0200 Subject: sched/smp: Use the SMP version of wake_up_new_task() Simplify the scheduler by making CONFIG_SMP=y code in wake_up_new_task() unconditional. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-26-mingo@kernel.org --- kernel/sched/core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 82b7cdb353d8..c108b5c2e115 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4824,7 +4824,6 @@ void wake_up_new_task(struct task_struct *p) activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL); trace_sched_wakeup_new(p); wakeup_preempt(rq, p, wake_flags); -#ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* * Nothing relies on rq->lock after this, so it's fine to @@ -4834,7 +4833,6 @@ void wake_up_new_task(struct task_struct *p) p->sched_class->task_woken(rq, p); rq_repin_lock(rq, &rf); } -#endif task_rq_unlock(rq, p, &rf); } -- cgit v1.2.3 From 1f25730e5a780b33f78e3ea23e64d3f75e0b2042 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:07 +0200 Subject: sched/smp: Use the SMP version of sched_exec() Simplify the scheduler making CONFIG_SMP=y sched_exec() code unconditional. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-27-mingo@kernel.org --- kernel/sched/core.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c108b5c2e115..fa89006e05e9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5419,8 +5419,6 @@ unsigned int nr_iowait(void) return sum; } -#ifdef CONFIG_SMP - /* * sched_exec - execve() is a valuable balancing opportunity, because at * this point the task has the smallest effective memory and cache footprint. @@ -5444,8 +5442,6 @@ void sched_exec(void) stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); } -#endif /* CONFIG_SMP */ - DEFINE_PER_CPU(struct kernel_stat, kstat); DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); -- cgit v1.2.3 From 74063c1755ca990440a9c61c91bb7a873a40deeb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:08 +0200 Subject: sched/smp: Use the SMP version of idle_thread_set_boot_cpu() Simplify the scheduler by making the CONFIG_SMP=y version of idle_thread_set_boot_cpu() unconditional. Note that idle_thread_set_boot_cpu() is already conditional on CONFIG_GENERIC_SMP_IDLE_THREAD, which most architectures select unconditionally on both UP and SMP kernels. 
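[Aside: the reason the sched_init() call site can drop its own CONFIG_SMP guard is that the helper's declaration/stub pair already degrades to a no-op when generic idle threads are not configured. A sketch of that arrangement, modelled on kernel/smpboot.h and shown for illustration only:

        #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
        void idle_thread_set_boot_cpu(void);
        void idle_threads_init(void);
        #else
        static inline void idle_thread_set_boot_cpu(void) { }
        static inline void idle_threads_init(void) { }
        #endif

With the inline stub in place, an unconditional call compiles away to nothing on kernels without generic idle threads.]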
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-28-mingo@kernel.org --- kernel/sched/core.c | 2 -- kernel/smpboot.c | 4 ---- 2 files changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fa89006e05e9..a03c3c1d7f50 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8582,9 +8582,7 @@ void __init sched_init(void) calc_load_update = jiffies + LOAD_FREQ; -#ifdef CONFIG_SMP idle_thread_set_boot_cpu(); -#endif balance_push_set(smp_processor_id(), false); init_sched_fair_class(); diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 1992b62e980b..4503b60ce9bd 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -18,8 +18,6 @@ #include "smpboot.h" -#ifdef CONFIG_SMP - #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD /* * For the hotplug case we keep the task structs around and reuse @@ -76,8 +74,6 @@ void __init idle_threads_init(void) } #endif -#endif /* #ifdef CONFIG_SMP */ - static LIST_HEAD(hotplug_threads); static DEFINE_MUTEX(smpboot_threads_lock); -- cgit v1.2.3 From 15125a229abc2404a264ce493e64a9ffa7850f6e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:09 +0200 Subject: sched/smp: Use the SMP version of the RT scheduling class Simplify the scheduler by making CONFIG_SMP=y primitives and data structures unconditional in the RT policies scheduler. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-29-mingo@kernel.org --- kernel/sched/rt.c | 72 ---------------------------------------------------- kernel/sched/sched.h | 2 -- 2 files changed, 74 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ab211706b160..15d5855c542c 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -78,12 +78,10 @@ void init_rt_rq(struct rt_rq *rt_rq) /* delimiter for bitsearch: */ __set_bit(MAX_RT_PRIO, array->bitmap); -#if defined CONFIG_SMP rt_rq->highest_prio.curr = MAX_RT_PRIO-1; rt_rq->highest_prio.next = MAX_RT_PRIO-1; rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); -#endif /* CONFIG_SMP */ /* We start is dequeued state, because no RT tasks are queued */ rt_rq->rt_queued = 0; @@ -332,8 +330,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) } #endif /* !CONFIG_RT_GROUP_SCHED */ -#ifdef CONFIG_SMP - static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) { /* Try to pull RT tasks here if we lower this rq's prio */ @@ -433,21 +429,6 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) } } -#else /* !CONFIG_SMP: */ - -static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) -{ -} - -static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p) -{ -} - -static inline void rt_queue_push_tasks(struct rq *rq) -{ -} -#endif /* !CONFIG_SMP */ - static void enqueue_top_rt_rq(struct rt_rq *rt_rq); static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count); @@ -597,17 +578,10 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se) return p->prio != p->normal_prio; } -#ifdef CONFIG_SMP 
static inline const struct cpumask *sched_rt_period_mask(void) { return this_rq()->rd->span; } -#else -static inline const struct cpumask *sched_rt_period_mask(void) -{ - return cpu_online_mask; -} -#endif static inline struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) @@ -628,7 +602,6 @@ bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) rt_rq->rt_time < rt_b->rt_runtime); } -#ifdef CONFIG_SMP /* * We ran out of runtime, see if we can borrow some from our neighbours. */ @@ -801,9 +774,6 @@ static void balance_runtime(struct rt_rq *rt_rq) raw_spin_lock(&rt_rq->rt_runtime_lock); } } -#else /* !CONFIG_SMP: */ -static inline void balance_runtime(struct rt_rq *rt_rq) {} -#endif /* !CONFIG_SMP */ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { @@ -980,10 +950,8 @@ struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) return &cpu_rq(cpu)->rt; } -#ifdef CONFIG_SMP static void __enable_runtime(struct rq *rq) { } static void __disable_runtime(struct rq *rq) { } -#endif #endif /* !CONFIG_RT_GROUP_SCHED */ @@ -1078,8 +1046,6 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq) cpufreq_update_util(rq, 0); } -#if defined CONFIG_SMP - static void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) { @@ -1110,16 +1076,6 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); } -#else /* !CONFIG_SMP: */ - -static inline -void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} -static inline -void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} - -#endif /* !CONFIG_SMP */ - -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED static void inc_rt_prio(struct rt_rq *rt_rq, int prio) { @@ -1158,13 +1114,6 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio) dec_rt_prio_smp(rt_rq, prio, prev_prio); } -#else /* !(CONFIG_SMP || CONFIG_RT_GROUP_SCHED): */ - -static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {} -static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {} - -#endif /* !(CONFIG_SMP || CONFIG_RT_GROUP_SCHED) */ - #ifdef CONFIG_RT_GROUP_SCHED static void @@ -1541,7 +1490,6 @@ static void yield_task_rt(struct rq *rq) requeue_task_rt(rq, rq->curr, 0); } -#ifdef CONFIG_SMP static int find_lowest_rq(struct task_struct *task); static int @@ -1656,7 +1604,6 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq); } -#endif /* CONFIG_SMP */ /* * Preempt the current task with a newly woken task if needed: @@ -1670,7 +1617,6 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) return; } -#ifdef CONFIG_SMP /* * If: * @@ -1685,7 +1631,6 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) */ if (p->prio == donor->prio && !test_tsk_need_resched(rq->curr)) check_preempt_equal_prio(rq, p); -#endif /* CONFIG_SMP */ } static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first) @@ -1779,8 +1724,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_s enqueue_pushable_task(rq, p); } -#ifdef CONFIG_SMP - /* Only try algorithms three times */ #define RT_MAX_TRIES 3 @@ -2454,11 +2397,6 @@ void __init init_sched_rt_class(void) GFP_KERNEL, cpu_to_node(i)); } } -#else /* !CONFIG_SMP: */ -void __init init_sched_rt_class(void) -{ -} -#endif /* !CONFIG_SMP */ /* * When switching a task to RT, we may overload the runqueue @@ 
-2482,10 +2420,8 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) * then see if we can move to another run queue. */ if (task_on_rq_queued(p)) { -#ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) rt_queue_push_tasks(rq); -#endif /* CONFIG_SMP */ if (p->prio < rq->donor->prio && cpu_online(cpu_of(rq))) resched_curr(rq); } @@ -2502,7 +2438,6 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) return; if (task_current_donor(rq, p)) { -#ifdef CONFIG_SMP /* * If our priority decreases while running, we * may need to pull tasks to this runqueue. @@ -2516,11 +2451,6 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) */ if (p->prio > rq->rt.highest_prio.curr) resched_curr(rq); -#else /* !CONFIG_SMP: */ - /* For UP simply resched on drop of prio */ - if (oldprio < p->prio) - resched_curr(rq); -#endif /* !CONFIG_SMP */ } else { /* * This task is not running, but if it is @@ -2641,7 +2571,6 @@ DEFINE_SCHED_CLASS(rt) = { .put_prev_task = put_prev_task_rt, .set_next_task = set_next_task_rt, -#ifdef CONFIG_SMP .balance = balance_rt, .select_task_rq = select_task_rq_rt, .set_cpus_allowed = set_cpus_allowed_common, @@ -2650,7 +2579,6 @@ DEFINE_SCHED_CLASS(rt) = { .task_woken = task_woken_rt, .switched_from = switched_from_rt, .find_lock_rq = find_lock_lowest_rq, -#endif /* !CONFIG_SMP */ .task_tick = task_tick_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3e7151ed5e5e..6bc8e426f174 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -792,11 +792,9 @@ struct rt_rq { int curr; /* highest queued rt task prio */ int next; /* next highest */ } highest_prio; -#ifdef CONFIG_SMP bool overloaded; struct plist_head pushable_tasks; -#endif /* CONFIG_SMP */ int rt_queued; #ifdef CONFIG_RT_GROUP_SCHED -- cgit v1.2.3 From 6324dce8f6262ec2049494af311e5418bc733341 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:10 +0200 Subject: sched/smp: Use the SMP version of the deadline scheduling class Simplify the scheduler by making CONFIG_SMP=y code in prio_changed_dl() unconditional. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-30-mingo@kernel.org --- kernel/sched/deadline.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index bf9b70a3ff95..0f30697ad795 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -3005,7 +3005,6 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, if (!task_on_rq_queued(p)) return; -#ifdef CONFIG_SMP /* * This might be too much, but unfortunately * we don't have the old deadline value, and @@ -3034,13 +3033,6 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, dl_time_before(p->dl.deadline, rq->curr->dl.deadline)) resched_curr(rq); } -#else /* !CONFIG_SMP: */ - /* - * We don't know if p has a earlier or later deadline, so let's blindly - * set a (maybe not needed) rescheduling point. 
- */ - resched_curr(rq); -#endif /* !CONFIG_SMP */ } #ifdef CONFIG_SCHED_CORE -- cgit v1.2.3 From 02fb885ebdc4ad029aabff4b85dcbc540d7cdb32 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:11 +0200 Subject: sched/smp: Use the SMP version of scheduler debugging data Simplify the scheduler by making CONFIG_SMP=y debug output unconditional. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-31-mingo@kernel.org --- kernel/sched/debug.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 04c0354f05e4..f16d53928bb9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -169,8 +169,6 @@ static const struct file_operations sched_feat_fops = { .release = single_release, }; -#ifdef CONFIG_SMP - static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -217,8 +215,6 @@ static const struct file_operations sched_scaling_fops = { .release = single_release, }; -#endif /* CONFIG_SMP */ - #ifdef CONFIG_PREEMPT_DYNAMIC static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, @@ -511,7 +507,6 @@ static __init int sched_init_debug(void) debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); -#ifdef CONFIG_SMP debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); @@ -519,7 +514,6 @@ static __init int sched_init_debug(void) sched_domains_mutex_lock(); update_sched_domain_debugfs(); sched_domains_mutex_unlock(); -#endif /* CONFIG_SMP */ #ifdef CONFIG_NUMA_BALANCING numa = debugfs_create_dir("numa_balancing", debugfs_sched); @@ -685,11 +679,9 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group } P(se->load.weight); -#ifdef CONFIG_SMP P(se->avg.load_avg); P(se->avg.util_avg); P(se->avg.runnable_avg); -#endif /* CONFIG_SMP */ #undef PN_SCHEDSTAT #undef PN @@ -849,7 +841,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued); SEQ_printf(m, " .%-30s: %d\n", "h_nr_idle", cfs_rq->h_nr_idle); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); -#ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %lu\n", "load_avg", cfs_rq->avg.load_avg); SEQ_printf(m, " .%-30s: %lu\n", "runnable_avg", @@ -870,7 +861,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", atomic_long_read(&cfs_rq->tg->load_avg)); #endif /* CONFIG_FAIR_GROUP_SCHED */ -#endif /* CONFIG_SMP */ #ifdef CONFIG_CFS_BANDWIDTH SEQ_printf(m, " .%-30s: %d\n", "throttled", cfs_rq->throttled); @@ -967,12 +957,10 @@ do { \ #undef P #undef PN -#ifdef CONFIG_SMP #define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); P64(avg_idle); P64(max_idle_balance_cost); #undef P64 -#endif /* CONFIG_SMP */ #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n)); if (schedstat_enabled()) { @@ -1242,7 +1230,6 @@ void 
proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, __PS("nr_involuntary_switches", p->nivcsw); P(se.load.weight); -#ifdef CONFIG_SMP P(se.avg.load_sum); P(se.avg.runnable_sum); P(se.avg.util_sum); @@ -1251,7 +1238,6 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(se.avg.util_avg); P(se.avg.last_update_time); PM(se.avg.util_est, ~UTIL_AVG_UNCHANGED); -#endif /* CONFIG_SMP */ #ifdef CONFIG_UCLAMP_TASK __PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value); __PS("uclamp.max", p->uclamp_req[UCLAMP_MAX].value); -- cgit v1.2.3 From 9d9af2372f2a46242fd5e827973235f40f31a706 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:12 +0200 Subject: sched/smp: Use the SMP version of schedstats Simplify the scheduler by making CONFIG_SMP=y schedstats debugging output unconditional. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-32-mingo@kernel.org --- kernel/sched/stats.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 86acd374fc4d..d1c9429a4ac5 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -115,10 +115,8 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "timestamp %lu\n", jiffies); } else { struct rq *rq; -#ifdef CONFIG_SMP struct sched_domain *sd; int dcount = 0; -#endif cpu = (unsigned long)(v - 2); rq = cpu_rq(cpu); @@ -133,7 +131,6 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "\n"); -#ifdef CONFIG_SMP /* domain-specific stats */ rcu_read_lock(); for_each_domain(cpu, sd) { @@ -164,7 +161,6 @@ static int show_schedstat(struct seq_file *seq, void *v) sd->ttwu_move_balance); } rcu_read_unlock(); -#endif /* CONFIG_SMP */ } return 0; } -- cgit v1.2.3 From 8a9246ddc16c0feaa3b09ca9d2e30fbfa88d09de Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:13 +0200 Subject: sched/smp: Use the SMP version of the scheduler syscalls Simplify the scheduler by making CONFIG_SMP=y code in idle_cpu(), __sched_setscheduler() and sched_setaffinity() unconditional. 
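For illustration (a sketch, not part of the patch): IS_ENABLED(CONFIG_SMP) is a compile-time constant, so on CONFIG_SMP=y kernels the sched_setaffinity() hunk in the diff below is behavior-neutral; the old conditional already reduced to the new unconditional form:

	user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
	if (user_mask) {
		cpumask_copy(user_mask, in_mask);
	} else if (IS_ENABLED(CONFIG_SMP)) {	/* constant-folds to a plain 'else' */
		return -ENOMEM;
	}

Only UP builds, which previously continued with a NULL user_mask on allocation failure, saw different behavior here.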
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-33-mingo@kernel.org --- kernel/sched/syscalls.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index d7fccf871c7d..77ae87f36e84 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -209,10 +209,8 @@ int idle_cpu(int cpu) if (rq->nr_running) return 0; -#ifdef CONFIG_SMP if (rq->ttwu_pending) return 0; -#endif return 1; } @@ -641,7 +639,6 @@ change: goto unlock; } #endif /* CONFIG_RT_GROUP_SCHED */ -#ifdef CONFIG_SMP if (dl_bandwidth_enabled() && dl_policy(policy) && !(attr->sched_flags & SCHED_FLAG_SUGOV)) { cpumask_t *span = rq->rd->span; @@ -657,7 +654,6 @@ change: goto unlock; } } -#endif /* CONFIG_SMP */ } /* Re-check policy now with rq lock held: */ @@ -1239,7 +1235,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE); if (user_mask) { cpumask_copy(user_mask, in_mask); - } else if (IS_ENABLED(CONFIG_SMP)) { + } else { return -ENOMEM; } -- cgit v1.2.3 From 6c8d251621c131274fa05b46fc138f296b00f6e4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:14 +0200 Subject: sched/smp: Use the SMP version of sched_update_asym_prefer_cpu() Simplify the scheduler by making CONFIG_SMP=y code in sched_update_asym_prefer_cpu() unconditional. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-34-mingo@kernel.org --- kernel/sched/topology.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index f2c10167f2bc..8e06b1d22e91 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1323,7 +1323,6 @@ next: /* Update the "asym_prefer_cpu" when arch_asym_cpu_priority() changes. */ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) { -#ifdef CONFIG_SMP int asym_prefer_cpu = cpu; struct sched_domain *sd; @@ -1373,7 +1372,6 @@ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) WRITE_ONCE(sg->asym_prefer_cpu, asym_prefer_cpu); } -#endif /* CONFIG_SMP */ } /* -- cgit v1.2.3 From 482c4dae75cbe3a3bc7109d6c2346e400facbea8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:15 +0200 Subject: sched/smp: Use the SMP version of the idle scheduling class Simplify the scheduler by making CONFIG_SMP=y code in the idle scheduling class unconditional. 
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-35-mingo@kernel.org --- kernel/sched/idle.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 43d6e006cff1..c39b089d4f09 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -432,7 +432,6 @@ void cpu_startup_entry(enum cpuhp_state state) * idle-task scheduling class. */ -#ifdef CONFIG_SMP static int select_task_rq_idle(struct task_struct *p, int cpu, int flags) { @@ -444,7 +443,6 @@ balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { return WARN_ON_ONCE(1); } -#endif /* CONFIG_SMP */ /* * Idle tasks are unconditionally rescheduled: @@ -531,11 +529,9 @@ DEFINE_SCHED_CLASS(idle) = { .put_prev_task = put_prev_task_idle, .set_next_task = set_next_task_idle, -#ifdef CONFIG_SMP .balance = balance_idle, .select_task_rq = select_task_rq_idle, .set_cpus_allowed = set_cpus_allowed_common, -#endif .task_tick = task_tick_idle, -- cgit v1.2.3 From caf5bde9c542f0cbdf2c91db7ea9743e33941d91 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:16 +0200 Subject: sched/smp: Use the SMP version of the stop-CPU scheduling class Simplify the scheduler by making CONFIG_SMP=y code in the stop-CPU scheduling class unconditional. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-36-mingo@kernel.org --- kernel/sched/stop_task.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 1c1bbc6e33b6..2d4e279f05ee 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -9,7 +9,6 @@ */ #include "sched.h" -#ifdef CONFIG_SMP static int select_task_rq_stop(struct task_struct *p, int cpu, int flags) { @@ -21,7 +20,6 @@ balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { return sched_stop_runnable(rq); } -#endif /* CONFIG_SMP */ static void wakeup_preempt_stop(struct rq *rq, struct task_struct *p, int flags) @@ -107,11 +105,9 @@ DEFINE_SCHED_CLASS(stop) = { .put_prev_task = put_prev_task_stop, .set_next_task = set_next_task_stop, -#ifdef CONFIG_SMP .balance = balance_stop, .select_task_rq = select_task_rq_stop, .set_cpus_allowed = set_cpus_allowed_common, -#endif .task_tick = task_tick_stop, -- cgit v1.2.3 From 172408811961d7facbd1ec9af66893b058d5d542 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:17 +0200 Subject: sched/smp: Use the SMP version of cpu_of() Simplify the scheduler by making CONFIG_SMP=y code in cpu_of() unconditional. 
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-37-mingo@kernel.org --- kernel/sched/sched.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6bc8e426f174..234dd28e2bec 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1294,11 +1294,7 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) static inline int cpu_of(struct rq *rq) { -#ifdef CONFIG_SMP return rq->cpu; -#else - return 0; -#endif } #define MDF_PUSH 0x01 -- cgit v1.2.3 From 9fd5da7989ba6175fe71439738ddcf4c030d3d51 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:18 +0200 Subject: sched/smp: Use the SMP version of is_migration_disabled() Simplify the scheduler by making the CONFIG_SMP-only code in is_migration_disabled() unconditional. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-38-mingo@kernel.org --- kernel/sched/sched.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 234dd28e2bec..b184c3d61ba2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1301,11 +1301,7 @@ static inline int cpu_of(struct rq *rq) static inline bool is_migration_disabled(struct task_struct *p) { -#ifdef CONFIG_SMP return p->migration_disabled; -#else - return false; -#endif } DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -- cgit v1.2.3 From 703b8e8545c7ca88002164d6c119c49e8ee9b137 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:19 +0200 Subject: sched/smp: Use the SMP version of rq_pin_lock() Simplify the scheduler by making a CONFIG_SMP-only warning in rq_pin_lock() unconditional. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-39-mingo@kernel.org --- kernel/sched/sched.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b184c3d61ba2..f4bac9fcb36b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1745,9 +1745,7 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); rf->clock_update_flags = 0; -#ifdef CONFIG_SMP WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback); -#endif } static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) -- cgit v1.2.3 From ea100b31eed459ea32d6df4489cf70219fd83a07 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:20 +0200 Subject: sched/smp: Use the SMP version of task_on_cpu() Simplify the scheduler by making CONFIG_SMP=y code in task_on_cpu() unconditional. 
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-40-mingo@kernel.org --- kernel/sched/sched.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f4bac9fcb36b..7490473a73aa 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2238,11 +2238,7 @@ static inline int task_current_donor(struct rq *rq, struct task_struct *p) static inline int task_on_cpu(struct rq *rq, struct task_struct *p) { -#ifdef CONFIG_SMP return p->on_cpu; -#else - return task_current(rq, p); -#endif } static inline int task_on_rq_queued(struct task_struct *p) -- cgit v1.2.3 From 0203244600b2f48c7074a9d439789d8d1152d3e1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:21 +0200 Subject: sched/smp: Use the SMP version of WF_ and SD_ flag sanity checks Simplify the scheduler by making CONFIG_SMP=y asserts related to WF_ and SD_ flags unconditional. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-41-mingo@kernel.org --- kernel/sched/sched.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7490473a73aa..d8c78ee73d12 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2261,11 +2261,9 @@ static inline int task_on_rq_migrating(struct task_struct *p) #define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */ #define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */ -#ifdef CONFIG_SMP static_assert(WF_EXEC == SD_BALANCE_EXEC); static_assert(WF_FORK == SD_BALANCE_FORK); static_assert(WF_TTWU == SD_BALANCE_WAKE); -#endif /* * To aid in avoiding the subversion of "niceness" due to uneven distribution -- cgit v1.2.3 From 241c307b05b00478bbb65bf440ece464aeac9208 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:22 +0200 Subject: sched/smp: Use the SMP version of ENQUEUE_MIGRATED Simplify the scheduler by making the CONFIG_SMP-only ENQUEUE_MIGRATED flag unconditional. 
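To sketch why a single unconditional value is safe (illustrative snippet; enqueue() and reset_load_tracking() are hypothetical stand-ins, not scheduler functions): the flag is only ever tested by consumers, and only task-migration paths set it, so a UP build simply never observes the bit:

	#define ENQUEUE_MIGRATED	0x40	/* now this value everywhere */

	static void enqueue(struct sched_entity *se, int flags)
	{
		/*
		 * Only migration paths pass ENQUEUE_MIGRATED; on UP there
		 * are none, so this branch is dead there rather than being
		 * compiled out by the old 0x00 definition.
		 */
		if (flags & ENQUEUE_MIGRATED)
			reset_load_tracking(se);	/* hypothetical helper */
	}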
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-42-mingo@kernel.org --- kernel/sched/sched.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d8c78ee73d12..45245f4057ce 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2319,11 +2319,7 @@ extern const u32 sched_prio_to_wmult[40]; #define ENQUEUE_HEAD 0x10 #define ENQUEUE_REPLENISH 0x20 -#ifdef CONFIG_SMP #define ENQUEUE_MIGRATED 0x40 -#else -#define ENQUEUE_MIGRATED 0x00 -#endif #define ENQUEUE_INITIAL 0x80 #define ENQUEUE_MIGRATING 0x100 #define ENQUEUE_DELAYED 0x200 -- cgit v1.2.3 From fc75ac3c918d512952f7c132ef635cedfc4900a6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:23 +0200 Subject: sched/smp: Use the SMP version of add_nr_running() Simplify the scheduler by making CONFIG_SMP=y code in add_nr_running() unconditional. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-43-mingo@kernel.org --- kernel/sched/sched.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 45245f4057ce..aa08103f8584 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2673,10 +2673,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count) call_trace_sched_update_nr_running(rq, count); } -#ifdef CONFIG_SMP if (prev_nr < 2 && rq->nr_running >= 2) set_rd_overloaded(rq->rd, 1); -#endif sched_update_tick_dependency(rq); } -- cgit v1.2.3 From dabe1be4e84c05db9341eb8c6c410e18a5ffeaa5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:24 +0200 Subject: sched/smp: Use the SMP version of double_rq_clock_clear_update() Simplify the scheduler by making CONFIG_SMP=y code in double_rq_clock_clear_update() unconditional. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-44-mingo@kernel.org --- kernel/sched/sched.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index aa08103f8584..c323d015486c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2840,10 +2840,7 @@ unsigned long arch_scale_freq_capacity(int cpu) static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) { rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); - /* rq1 == rq2 for !CONFIG_SMP, so just clear RQCF_UPDATED once. */ -#ifdef CONFIG_SMP rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); -#endif } #define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \ -- cgit v1.2.3 From 71203f68c7749609d7fc8ae6ad054bdedeb24f91 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 24 May 2025 20:32:20 +0800 Subject: padata: Fix pd UAF once and for all There is a race condition/UAF in padata_reorder that goes back to the initial commit. 
A reference count is taken at the start of the process in padata_do_parallel, and released at the end in padata_serial_worker. This reference count is required only for padata_replace to function correctly. If padata_replace is never called then there is no issue. In the function padata_reorder, which serves as the core of padata, as soon as a padata is added to squeue->serial.list and the associated spin lock is released, that padata may be processed and the reference count on pd can go away. Fix this by getting the next padata before the squeue->serial lock is released. In order to make this possible, simplify padata_reorder by only calling it once the next padata arrives. Fixes: 16295bec6398 ("padata: Generic parallelization/serialization interface") Signed-off-by: Herbert Xu --- kernel/padata.c | 132 ++++++++++++++++----------------------------------------- 1 file changed, 37 insertions(+), 95 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 7eee94166357..25cd3406477a 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -261,20 +261,17 @@ EXPORT_SYMBOL(padata_do_parallel); * be parallel processed by another cpu and is not yet present in * the cpu's reorder queue. */ -static struct padata_priv *padata_find_next(struct parallel_data *pd, - bool remove_object) +static struct padata_priv *padata_find_next(struct parallel_data *pd, int cpu, + unsigned int processed) { struct padata_priv *padata; struct padata_list *reorder; - int cpu = pd->cpu; reorder = per_cpu_ptr(pd->reorder_list, cpu); spin_lock(&reorder->lock); - if (list_empty(&reorder->list)) { - spin_unlock(&reorder->lock); - return NULL; - } + if (list_empty(&reorder->list)) + goto notfound; padata = list_entry(reorder->list.next, struct padata_priv, list); @@ -282,97 +279,52 @@ static struct padata_priv *padata_find_next(struct parallel_data *pd, * Checks the rare case where two or more parallel jobs have hashed to * the same CPU and one of the later ones finishes first. */ - if (padata->seq_nr != pd->processed) { - spin_unlock(&reorder->lock); - return NULL; - } - - if (remove_object) { - list_del_init(&padata->list); - ++pd->processed; - pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu); - } + if (padata->seq_nr != processed) + goto notfound; + list_del_init(&padata->list); spin_unlock(&reorder->lock); return padata; + +notfound: + pd->processed = processed; + pd->cpu = cpu; + spin_unlock(&reorder->lock); + return NULL; } -static void padata_reorder(struct parallel_data *pd) +static void padata_reorder(struct padata_priv *padata) { + struct parallel_data *pd = padata->pd; struct padata_instance *pinst = pd->ps->pinst; - int cb_cpu; - struct padata_priv *padata; - struct padata_serial_queue *squeue; - struct padata_list *reorder; + unsigned int processed; + int cpu; - /* - * We need to ensure that only one cpu can work on dequeueing of - * the reorder queue the time. Calculating in which percpu reorder - * queue the next object will arrive takes some time. A spinlock - * would be highly contended. Also it is not clear in which order - * the objects arrive to the reorder queues. So a cpu could wait to - * get the lock just to notice that there is nothing to do at the - * moment. Therefore we use a trylock and let the holder of the lock - * care for all the objects enqueued during the holdtime of the lock. 
- */ - if (!spin_trylock_bh(&pd->lock)) - return; + processed = pd->processed; + cpu = pd->cpu; - while (1) { - padata = padata_find_next(pd, true); + do { + struct padata_serial_queue *squeue; + int cb_cpu; - /* - * If the next object that needs serialization is parallel - * processed by another cpu and is still on it's way to the - * cpu's reorder queue, nothing to do for now. - */ - if (!padata) - break; + cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu); + processed++; cb_cpu = padata->cb_cpu; squeue = per_cpu_ptr(pd->squeue, cb_cpu); spin_lock(&squeue->serial.lock); list_add_tail(&padata->list, &squeue->serial.list); - spin_unlock(&squeue->serial.lock); - queue_work_on(cb_cpu, pinst->serial_wq, &squeue->work); - } - spin_unlock_bh(&pd->lock); - - /* - * The next object that needs serialization might have arrived to - * the reorder queues in the meantime. - * - * Ensure reorder queue is read after pd->lock is dropped so we see - * new objects from another task in padata_do_serial. Pairs with - * smp_mb in padata_do_serial. - */ - smp_mb(); - - reorder = per_cpu_ptr(pd->reorder_list, pd->cpu); - if (!list_empty(&reorder->list) && padata_find_next(pd, false)) { /* - * Other context(eg. the padata_serial_worker) can finish the request. - * To avoid UAF issue, add pd ref here, and put pd ref after reorder_work finish. + * If the next object that needs serialization is parallel + * processed by another cpu and is still on it's way to the + * cpu's reorder queue, end the loop. */ - padata_get_pd(pd); - if (!queue_work(pinst->serial_wq, &pd->reorder_work)) - padata_put_pd(pd); - } -} - -static void invoke_padata_reorder(struct work_struct *work) -{ - struct parallel_data *pd; - - local_bh_disable(); - pd = container_of(work, struct parallel_data, reorder_work); - padata_reorder(pd); - local_bh_enable(); - /* Pairs with putting the reorder_work in the serial_wq */ - padata_put_pd(pd); + padata = padata_find_next(pd, cpu, processed); + spin_unlock(&squeue->serial.lock); + } while (padata); } static void padata_serial_worker(struct work_struct *serial_work) @@ -423,6 +375,7 @@ void padata_do_serial(struct padata_priv *padata) struct padata_list *reorder = per_cpu_ptr(pd->reorder_list, hashed_cpu); struct padata_priv *cur; struct list_head *pos; + bool gotit = true; spin_lock(&reorder->lock); /* Sort in ascending order of sequence number. */ @@ -432,17 +385,14 @@ void padata_do_serial(struct padata_priv *padata) if ((signed int)(cur->seq_nr - padata->seq_nr) < 0) break; } - list_add(&padata->list, pos); + if (padata->seq_nr != pd->processed) { + gotit = false; + list_add(&padata->list, pos); + } spin_unlock(&reorder->lock); - /* - * Ensure the addition to the reorder list is ordered correctly - * with the trylock of pd->lock in padata_reorder. Pairs with smp_mb - * in padata_reorder. - */ - smp_mb(); - - padata_reorder(pd); + if (gotit) + padata_reorder(padata); } EXPORT_SYMBOL(padata_do_serial); @@ -632,9 +582,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps) padata_init_squeues(pd); pd->seq_nr = -1; refcount_set(&pd->refcnt, 1); - spin_lock_init(&pd->lock); pd->cpu = cpumask_first(pd->cpumask.pcpu); - INIT_WORK(&pd->reorder_work, invoke_padata_reorder); return pd; @@ -1144,12 +1092,6 @@ void padata_free_shell(struct padata_shell *ps) if (!ps) return; - /* - * Wait for all _do_serial calls to finish to avoid touching - * freed pd's and ps's. 
- */ - synchronize_rcu(); - mutex_lock(&ps->pinst->lock); list_del(&ps->list); pd = rcu_dereference_protected(ps->pd, 1); -- cgit v1.2.3 From 758f5bdf1bef2c3820de38283bc35cf668b85f9c Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Wed, 4 Jun 2025 19:58:01 -0400 Subject: padata: use cpumask_nth() padata_do_parallel() and padata_index_to_cpu() duplicate cpumask_nth(). Fix both and use the generic helper. Signed-off-by: Yury Norov Signed-off-by: Herbert Xu --- kernel/padata.c | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 25cd3406477a..f85f8bd788d0 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -63,17 +63,6 @@ static inline void padata_put_pd(struct parallel_data *pd) padata_put_pd_cnt(pd, 1); } -static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) -{ - int cpu, target_cpu; - - target_cpu = cpumask_first(pd->cpumask.pcpu); - for (cpu = 0; cpu < cpu_index; cpu++) - target_cpu = cpumask_next(target_cpu, pd->cpumask.pcpu); - - return target_cpu; -} - static int padata_cpu_hash(struct parallel_data *pd, unsigned int seq_nr) { /* @@ -82,7 +71,7 @@ static int padata_cpu_hash(struct parallel_data *pd, unsigned int seq_nr) */ int cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu); - return padata_index_to_cpu(pd, cpu_index); + return cpumask_nth(cpu_index, pd->cpumask.pcpu); } static struct padata_work *padata_work_alloc(void) @@ -192,9 +181,9 @@ int padata_do_parallel(struct padata_shell *ps, struct padata_priv *padata, int *cb_cpu) { struct padata_instance *pinst = ps->pinst; - int i, cpu, cpu_index, err; struct parallel_data *pd; struct padata_work *pw; + int cpu_index, err; rcu_read_lock_bh(); @@ -210,12 +199,7 @@ int padata_do_parallel(struct padata_shell *ps, /* Select an alternate fallback CPU and notify the caller. */ cpu_index = *cb_cpu % cpumask_weight(pd->cpumask.cbcpu); - - cpu = cpumask_first(pd->cpumask.cbcpu); - for (i = 0; i < cpu_index; i++) - cpu = cpumask_next(cpu, pd->cpumask.cbcpu); - - *cb_cpu = cpu; + *cb_cpu = cpumask_nth(cpu_index, pd->cpumask.cbcpu); } err = -EBUSY; -- cgit v1.2.3 From 2b32fc8ff08deac3aa509f321a28e21b1eea5525 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Thu, 12 Jun 2025 11:32:51 -0700 Subject: genirq/cpuhotplug: Rebalance managed interrupts across multi-CPU hotplug Commit 788019eb559f ("genirq: Retain disable depth for managed interrupts across CPU hotplug") intended to only decrement the disable depth once per managed shutdown, but instead it decrements for each CPU hotplug in the affinity mask, until its depth reaches a point where it finally gets re-started. For example, consider: 1. Interrupt is affine to CPU {M,N} 2. disable_irq() -> depth is 1 3. CPU M goes offline -> interrupt migrates to CPU N / depth is still 1 4. CPU N goes offline -> irq_shutdown() / depth is 2 5. CPU N goes online -> irq_restore_affinity_of_irq() -> irqd_is_managed_and_shutdown()==true -> irq_startup_managed() -> depth is 1 6. CPU M goes online -> irq_restore_affinity_of_irq() -> irqd_is_managed_and_shutdown()==true -> irq_startup_managed() -> depth is 0 *** BUG: driver expects the interrupt is still disabled *** -> irq_startup() -> irqd_clr_managed_shutdown() 7. enable_irq() -> depth underflow / unbalanced enable_irq() warning This should clear the managed-shutdown flag at step 6, so that further hotplugs don't cause further imbalance. 
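Condensed into API calls, the corrected behavior is (sketch only; M and N stand for the two CPUs in the affinity mask, and the depth values in the comments are illustrative):

	disable_irq(irq);	/* depth 1 */
	remove_cpu(M);		/* interrupt migrates to CPU N, depth still 1 */
	remove_cpu(N);		/* managed shutdown, depth 2 */
	add_cpu(N);		/* irq_startup_managed(), depth back to 1 */
	add_cpu(M);		/* flag already cleared, depth stays 1 */
	enable_irq(irq);	/* depth 0, the interrupt is enabled exactly here */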
Note: It might be cleaner to also remove the irqd_clr_managed_shutdown() invocation from __irq_startup_managed(). But this is currently not possible because of irq_update_affinity_desc() as it sets IRQD_MANAGED_SHUTDOWN and expects irq_startup() to clear it. Fixes: 788019eb559f ("genirq: Retain disable depth for managed interrupts across CPU hotplug") Reported-by: Aleksandrs Vinarskis Signed-off-by: Brian Norris Signed-off-by: Thomas Gleixner Tested-by: Aleksandrs Vinarskis Link: https://lore.kernel.org/all/20250612183303.3433234-2-briannorris@chromium.org --- kernel/irq/chip.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b0e0a7332993..2b274007e8ba 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -205,6 +205,14 @@ __irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff, void irq_startup_managed(struct irq_desc *desc) { + struct irq_data *d = irq_desc_get_irq_data(desc); + + /* + * Clear managed-shutdown flag, so we don't repeat managed-startup for + * multiple hotplugs, and cause imbalanced disable depth. + */ + irqd_clr_managed_shutdown(d); + /* * Only start it up when the disable depth is 1, so that a disable, * hotunplug, hotplug sequence does not end up enabling it during -- cgit v1.2.3 From 72218d74c9c57b8ea36c2a58875dff406fc10462 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Thu, 12 Jun 2025 11:32:52 -0700 Subject: genirq/cpuhotplug: Restore affinity even for suspended IRQ Commit 788019eb559f ("genirq: Retain disable depth for managed interrupts across CPU hotplug") tried to make managed shutdown/startup properly reference counted, but it missed the fact that the unplug and hotplug code has an intentional imbalance by skipping IRQS_SUSPENDED interrupts on the "restore" path. This means that if a managed-affinity interrupt was both suspended and managed-shutdown (such as may happen during system suspend / S3), resume skips calling irq_startup_managed(), and would again have an unbalanced depth this time, with a positive value (i.e., remaining unexpectedly masked). This IRQS_SUSPENDED check was introduced in commit a60dd06af674 ("genirq/cpuhotplug: Skip suspended interrupts when restoring affinity") for essentially the same reason as commit 788019eb559f, to prevent irq_startup() from unconditionally re-enabling an interrupt too early. Because irq_startup_managed() now respects the disable-depth count, the IRQS_SUSPENDED check is no longer needed, and instead, it causes harm. Thus, drop the IRQS_SUSPENDED check, and restore balance. This effectively reverts commit a60dd06af674 ("genirq/cpuhotplug: Skip suspended interrupts when restoring affinity"), because it is replaced by commit 788019eb559f ("genirq: Retain disable depth for managed interrupts across CPU hotplug"). 
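Spelled out for the system suspend case (sketch; the depth values are illustrative and the CPU offlining/onlining is performed by the suspend core, not by the driver):

	disable_irq(irq);	/* driver: depth 1 */
	suspend_device_irqs();	/* marks IRQS_SUSPENDED, depth 2 */
	/* nonboot CPUs go offline: managed shutdown, depth 3 */
	/* nonboot CPUs return: with this fix the restore path calls
	   irq_startup_managed() even though IRQS_SUSPENDED is set, depth 2 */
	resume_device_irqs();	/* depth 1 */
	enable_irq(irq);	/* depth 0, balanced again */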
Fixes: 788019eb559f ("genirq: Retain disable depth for managed interrupts across CPU hotplug") Reported-by: Aleksandrs Vinarskis Signed-off-by: Brian Norris Signed-off-by: Thomas Gleixner Tested-by: Aleksandrs Vinarskis Link: https://lore.kernel.org/all/20250612183303.3433234-3-briannorris@chromium.org Closes: https://lore.kernel.org/lkml/24ec4adc-7c80-49e9-93ee-19908a97ab84@gmail.com/ --- kernel/irq/cpuhotplug.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index f07529ae4895..755346ea9819 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -210,13 +210,6 @@ static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu) !irq_data_get_irq_chip(data) || !cpumask_test_cpu(cpu, affinity)) return; - /* - * Don't restore suspended interrupts here when a system comes back - * from S3. They are reenabled via resume_device_irqs(). - */ - if (desc->istate & IRQS_SUSPENDED) - return; - if (irqd_is_managed_and_shutdown(data)) irq_startup_managed(desc); -- cgit v1.2.3 From 66067c3c8a1ee097e9c30e8bbd643b12ba54a6b0 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Thu, 22 May 2025 14:08:01 -0700 Subject: genirq: Add kunit tests for depth counts There have been a few bugs and/or misunderstandings about the reference counting, and startup/shutdown behaviors in the IRQ core and related CPU hotplug code. These 4 test cases try to capture a few interesting cases. * irq_disable_depth_test: basic request/disable/enable sequence * irq_free_disabled_test: request/disable/free/re-request sequence - this catches errors on previous revisions of my work * irq_cpuhotplug_test: exercises managed-affinity IRQ + CPU hotplug. This captures a problematic test case which was fixed recently. This test requires CONFIG_SMP and a hotpluggable CPU#1. * irq_shutdown_depth_test: exercises similar behavior from irq_cpuhotplug_test, but directly using irq_*() APIs instead of going through CPU hotplug. This still requires CONFIG_SMP, because managed-affinity is stubbed out (and not all APIs are even present) without it. Note the use of 'imply SMP': ARCH=um doesn't support SMP, and kunit is often exercised there. Thus, 'imply' will force SMP on where possible (such as ARCH=x86_64), but leave it off where it's not. Behavior on various SMP and ARCH configurations: $ tools/testing/kunit/kunit.py run 'irq_test_cases*' --arch x86_64 --qemu_args '-smp 2' [...] [11:12:24] Testing complete. Ran 4 tests: passed: 4 $ tools/testing/kunit/kunit.py run 'irq_test_cases*' --arch x86_64 [...] [11:13:27] [SKIPPED] irq_cpuhotplug_test [11:13:27] ================= [PASSED] irq_test_cases ================== [11:13:27] ============================================================ [11:13:27] Testing complete. Ran 4 tests: passed: 3, skipped: 1 # default: ARCH=um $ tools/testing/kunit/kunit.py run 'irq_test_cases*' [11:14:26] [SKIPPED] irq_shutdown_depth_test [11:14:26] [SKIPPED] irq_cpuhotplug_test [11:14:26] ================= [PASSED] irq_test_cases ================== [11:14:26] ============================================================ [11:14:26] Testing complete. 
Ran 4 tests: passed: 2, skipped: 2 Without commit 788019eb559f ("genirq: Retain disable depth for managed interrupts across CPU hotplug"), this fails as follows: [11:18:55] =============== irq_test_cases (4 subtests) ================ [11:18:55] [PASSED] irq_disable_depth_test [11:18:55] [PASSED] irq_free_disabled_test [11:18:55] # irq_shutdown_depth_test: EXPECTATION FAILED at kernel/irq/irq_test.c:147 [11:18:55] Expected desc->depth == 1, but [11:18:55] desc->depth == 0 (0x0) [11:18:55] ------------[ cut here ]------------ [11:18:55] Unbalanced enable for IRQ 26 [11:18:55] WARNING: CPU: 1 PID: 36 at kernel/irq/manage.c:792 __enable_irq+0x36/0x60 ... [11:18:55] [FAILED] irq_shutdown_depth_test [11:18:55] #1 [11:18:55] # irq_cpuhotplug_test: EXPECTATION FAILED at kernel/irq/irq_test.c:202 [11:18:55] Expected irqd_is_activated(data) to be false, but is true [11:18:55] # irq_cpuhotplug_test: EXPECTATION FAILED at kernel/irq/irq_test.c:203 [11:18:55] Expected irqd_is_started(data) to be false, but is true [11:18:55] # irq_cpuhotplug_test: EXPECTATION FAILED at kernel/irq/irq_test.c:204 [11:18:55] Expected desc->depth == 1, but [11:18:55] desc->depth == 0 (0x0) [11:18:55] ------------[ cut here ]------------ [11:18:55] Unbalanced enable for IRQ 27 [11:18:55] WARNING: CPU: 0 PID: 38 at kernel/irq/manage.c:792 __enable_irq+0x36/0x60 ... [11:18:55] [FAILED] irq_cpuhotplug_test [11:18:55] # module: irq_test [11:18:55] # irq_test_cases: pass:2 fail:2 skip:0 total:4 [11:18:55] # Totals: pass:2 fail:2 skip:0 total:4 [11:18:55] ================= [FAILED] irq_test_cases ================== [11:18:55] ============================================================ [11:18:55] Testing complete. Ran 4 tests: passed: 2, failed: 2 Signed-off-by: Brian Norris Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250522210837.4135244-1-briannorris@chromium.org --- kernel/irq/Kconfig | 11 +++ kernel/irq/Makefile | 1 + kernel/irq/irq_test.c | 229 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 241 insertions(+) create mode 100644 kernel/irq/irq_test.c (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 3f02a0e45254..1da5e9d9da71 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -144,6 +144,17 @@ config GENERIC_IRQ_DEBUGFS config GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD bool +config IRQ_KUNIT_TEST + bool "KUnit tests for IRQ management APIs" if !KUNIT_ALL_TESTS + depends on KUNIT=y + default KUNIT_ALL_TESTS + imply SMP + help + This option enables KUnit tests for the IRQ subsystem API. These are + only for development and testing, not for regular kernel use cases. + + If unsure, say N. 
+ endmenu config GENERIC_IRQ_MULTI_HANDLER diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index c0f44c06d69d..6ab3a4055667 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_GENERIC_IRQ_IPI_MUX) += ipi-mux.o obj-$(CONFIG_SMP) += affinity.o obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o +obj-$(CONFIG_IRQ_KUNIT_TEST) += irq_test.o diff --git a/kernel/irq/irq_test.c b/kernel/irq/irq_test.c new file mode 100644 index 000000000000..5161b56a12f9 --- /dev/null +++ b/kernel/irq/irq_test.c @@ -0,0 +1,229 @@ +// SPDX-License-Identifier: LGPL-2.1+ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internals.h" + +static irqreturn_t noop_handler(int irq, void *data) +{ + return IRQ_HANDLED; +} + +static void noop(struct irq_data *data) { } +static unsigned int noop_ret(struct irq_data *data) { return 0; } + +static int noop_affinity(struct irq_data *data, const struct cpumask *dest, + bool force) +{ + irq_data_update_effective_affinity(data, dest); + + return 0; +} + +static struct irq_chip fake_irq_chip = { + .name = "fake", + .irq_startup = noop_ret, + .irq_shutdown = noop, + .irq_enable = noop, + .irq_disable = noop, + .irq_ack = noop, + .irq_mask = noop, + .irq_unmask = noop, + .irq_set_affinity = noop_affinity, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +static void irq_disable_depth_test(struct kunit *test) +{ + struct irq_desc *desc; + int virq, ret; + + virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, NULL); + KUNIT_ASSERT_GE(test, virq, 0); + + irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq); + + desc = irq_to_desc(virq); + KUNIT_ASSERT_PTR_NE(test, desc, NULL); + + ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); + KUNIT_EXPECT_EQ(test, ret, 0); + + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + disable_irq(virq); + KUNIT_EXPECT_EQ(test, desc->depth, 1); + + enable_irq(virq); + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + free_irq(virq, NULL); +} + +static void irq_free_disabled_test(struct kunit *test) +{ + struct irq_desc *desc; + int virq, ret; + + virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, NULL); + KUNIT_ASSERT_GE(test, virq, 0); + + irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq); + + desc = irq_to_desc(virq); + KUNIT_ASSERT_PTR_NE(test, desc, NULL); + + ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); + KUNIT_EXPECT_EQ(test, ret, 0); + + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + disable_irq(virq); + KUNIT_EXPECT_EQ(test, desc->depth, 1); + + free_irq(virq, NULL); + KUNIT_EXPECT_GE(test, desc->depth, 1); + + ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); + KUNIT_EXPECT_EQ(test, ret, 0); + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + free_irq(virq, NULL); +} + +static void irq_shutdown_depth_test(struct kunit *test) +{ + struct irq_desc *desc; + struct irq_data *data; + int virq, ret; + struct irq_affinity_desc affinity = { + .is_managed = 1, + .mask = CPU_MASK_ALL, + }; + + if (!IS_ENABLED(CONFIG_SMP)) + kunit_skip(test, "requires CONFIG_SMP for managed shutdown"); + + virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, &affinity); + KUNIT_ASSERT_GE(test, virq, 0); + + irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq); + + desc = irq_to_desc(virq); + KUNIT_ASSERT_PTR_NE(test, desc, NULL); + + data = irq_desc_get_irq_data(desc); + KUNIT_ASSERT_PTR_NE(test, data, NULL); + + ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); + 
KUNIT_EXPECT_EQ(test, ret, 0); + + KUNIT_EXPECT_TRUE(test, irqd_is_activated(data)); + KUNIT_EXPECT_TRUE(test, irqd_is_started(data)); + KUNIT_EXPECT_TRUE(test, irqd_affinity_is_managed(data)); + + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + disable_irq(virq); + KUNIT_EXPECT_EQ(test, desc->depth, 1); + + irq_shutdown_and_deactivate(desc); + + KUNIT_EXPECT_FALSE(test, irqd_is_activated(data)); + KUNIT_EXPECT_FALSE(test, irqd_is_started(data)); + + KUNIT_EXPECT_EQ(test, irq_activate(desc), 0); +#ifdef CONFIG_SMP + irq_startup_managed(desc); +#endif + + KUNIT_EXPECT_EQ(test, desc->depth, 1); + + enable_irq(virq); + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + free_irq(virq, NULL); +} + +static void irq_cpuhotplug_test(struct kunit *test) +{ + struct irq_desc *desc; + struct irq_data *data; + int virq, ret; + struct irq_affinity_desc affinity = { + .is_managed = 1, + }; + + if (!IS_ENABLED(CONFIG_SMP)) + kunit_skip(test, "requires CONFIG_SMP for CPU hotplug"); + if (!get_cpu_device(1)) + kunit_skip(test, "requires more than 1 CPU for CPU hotplug"); + if (!cpu_is_hotpluggable(1)) + kunit_skip(test, "CPU 1 must be hotpluggable"); + + cpumask_copy(&affinity.mask, cpumask_of(1)); + + virq = irq_domain_alloc_descs(-1, 1, 0, NUMA_NO_NODE, &affinity); + KUNIT_ASSERT_GE(test, virq, 0); + + irq_set_chip_and_handler(virq, &fake_irq_chip, handle_simple_irq); + + desc = irq_to_desc(virq); + KUNIT_ASSERT_PTR_NE(test, desc, NULL); + + data = irq_desc_get_irq_data(desc); + KUNIT_ASSERT_PTR_NE(test, data, NULL); + + ret = request_irq(virq, noop_handler, 0, "test_irq", NULL); + KUNIT_EXPECT_EQ(test, ret, 0); + + KUNIT_EXPECT_TRUE(test, irqd_is_activated(data)); + KUNIT_EXPECT_TRUE(test, irqd_is_started(data)); + KUNIT_EXPECT_TRUE(test, irqd_affinity_is_managed(data)); + + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + disable_irq(virq); + KUNIT_EXPECT_EQ(test, desc->depth, 1); + + KUNIT_EXPECT_EQ(test, remove_cpu(1), 0); + KUNIT_EXPECT_FALSE(test, irqd_is_activated(data)); + KUNIT_EXPECT_FALSE(test, irqd_is_started(data)); + KUNIT_EXPECT_GE(test, desc->depth, 1); + KUNIT_EXPECT_EQ(test, add_cpu(1), 0); + + KUNIT_EXPECT_FALSE(test, irqd_is_activated(data)); + KUNIT_EXPECT_FALSE(test, irqd_is_started(data)); + KUNIT_EXPECT_EQ(test, desc->depth, 1); + + enable_irq(virq); + KUNIT_EXPECT_TRUE(test, irqd_is_activated(data)); + KUNIT_EXPECT_TRUE(test, irqd_is_started(data)); + KUNIT_EXPECT_EQ(test, desc->depth, 0); + + free_irq(virq, NULL); +} + +static struct kunit_case irq_test_cases[] = { + KUNIT_CASE(irq_disable_depth_test), + KUNIT_CASE(irq_free_disabled_test), + KUNIT_CASE(irq_shutdown_depth_test), + KUNIT_CASE(irq_cpuhotplug_test), + {} +}; + +static struct kunit_suite irq_test_suite = { + .name = "irq_test_cases", + .test_cases = irq_test_cases, +}; + +kunit_test_suite(irq_test_suite); +MODULE_DESCRIPTION("IRQ unit test suite"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 8a2277a3c9e4cc5398f80821afe7ecbe9bdf2819 Mon Sep 17 00:00:00 2001 From: Gyeyoung Baek Date: Thu, 12 Jun 2025 21:48:27 +0900 Subject: genirq/irq_sim: Initialize work context pointers properly Initialize `ops` member's pointers properly by using kzalloc() instead of kmalloc() when allocating the simulation work context. Otherwise the pointers contain random content leading to invalid dereferencing. 
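A userspace analogue of the bug (assumption: the work context embeds a set of callback pointers that are NULL-checked before they are ever assigned; the names below are made up for illustration). calloc() plays the role of kzalloc() and malloc() the role of kmalloc():

	#include <stdio.h>
	#include <stdlib.h>

	struct sim_ops {
		void (*irq_hold)(int irq);
		void (*irq_release)(int irq);
	};

	struct sim_work_ctx {
		int irq_count;
		struct sim_ops ops;
	};

	int main(void)
	{
		/* calloc(), like kzalloc(), zeroes the whole allocation... */
		struct sim_work_ctx *ctx = calloc(1, sizeof(*ctx));

		if (!ctx)
			return 1;
		/*
		 * ...so this NULL check is meaningful. With malloc()
		 * (i.e. kmalloc()) the pointer would be heap garbage and
		 * the indirect call below could jump anywhere.
		 */
		if (ctx->ops.irq_hold)
			ctx->ops.irq_hold(0);
		free(ctx);
		return 0;
	}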
Signed-off-by: Gyeyoung Baek Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250612124827.63259-1-gye976@gmail.com --- kernel/irq/irq_sim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index 1a3d483548e2..ae4c9cbd1b4b 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -202,7 +202,7 @@ struct irq_domain *irq_domain_create_sim_full(struct fwnode_handle *fwnode, void *data) { struct irq_sim_work_ctx *work_ctx __free(kfree) = - kmalloc(sizeof(*work_ctx), GFP_KERNEL); + kzalloc(sizeof(*work_ctx), GFP_KERNEL); if (!work_ctx) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From a2fc422ed75748eef2985454e97847fb22f873c2 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Wed, 21 May 2025 17:04:29 +0200 Subject: syscall_user_dispatch: Add PR_SYS_DISPATCH_INCLUSIVE_ON There are two possible scenarios for syscall filtering: - having a trusted/allowed range of PCs, and intercepting everything else - or the opposite: a single untrusted/intercepted range and allowing everything else (this is relevant for any kind of sandboxing scenario, or monitoring behavior of a single library) The current API only allows the former use case due to allowed range wrap-around check. Add PR_SYS_DISPATCH_INCLUSIVE_ON that enables the second use case. Add PR_SYS_DISPATCH_EXCLUSIVE_ON alias for PR_SYS_DISPATCH_ON to make it clear how it's different from the new PR_SYS_DISPATCH_INCLUSIVE_ON. Signed-off-by: Dmitry Vyukov Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/97947cc8e205ff49675826d7b0327ef2e2c66eea.1747839857.git.dvyukov@google.com --- kernel/entry/syscall_user_dispatch.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c index 5340c5aa89e7..a9055eccb27e 100644 --- a/kernel/entry/syscall_user_dispatch.c +++ b/kernel/entry/syscall_user_dispatch.c @@ -78,7 +78,7 @@ static int task_set_syscall_user_dispatch(struct task_struct *task, unsigned lon if (offset || len || selector) return -EINVAL; break; - case PR_SYS_DISPATCH_ON: + case PR_SYS_DISPATCH_EXCLUSIVE_ON: /* * Validate the direct dispatcher region just for basic * sanity against overflow and a 0-sized dispatcher @@ -87,30 +87,40 @@ static int task_set_syscall_user_dispatch(struct task_struct *task, unsigned lon */ if (offset && offset + len <= offset) return -EINVAL; - + break; + case PR_SYS_DISPATCH_INCLUSIVE_ON: + if (len == 0 || offset + len <= offset) + return -EINVAL; /* - * access_ok() will clear memory tags for tagged addresses - * if current has memory tagging enabled. - - * To enable a tracer to set a tracees selector the - * selector address must be untagged for access_ok(), - * otherwise an untagged tracer will always fail to set a - * tagged tracees selector. + * Invert the range, the check in syscall_user_dispatch() + * supports wrap-around. */ - if (selector && !access_ok(untagged_addr(selector), sizeof(*selector))) - return -EFAULT; - + offset = offset + len; + len = -len; break; default: return -EINVAL; } + /* + * access_ok() will clear memory tags for tagged addresses + * if current has memory tagging enabled. + * + * To enable a tracer to set a tracees selector the + * selector address must be untagged for access_ok(), + * otherwise an untagged tracer will always fail to set a + * tagged tracees selector. 
+ */ + if (mode != PR_SYS_DISPATCH_OFF && selector && + !access_ok(untagged_addr(selector), sizeof(*selector))) + return -EFAULT; + task->syscall_dispatch.selector = selector; task->syscall_dispatch.offset = offset; task->syscall_dispatch.len = len; task->syscall_dispatch.on_dispatch = false; - if (mode == PR_SYS_DISPATCH_ON) + if (mode != PR_SYS_DISPATCH_OFF) set_task_syscall_work(task, SYSCALL_USER_DISPATCH); else clear_task_syscall_work(task, SYSCALL_USER_DISPATCH); -- cgit v1.2.3 From 43736ec3e02789795d4e7b9cb49a005fa56c0171 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 13 Jun 2025 00:21:46 -0700 Subject: bpf: Include verifier memory allocations in memcg statistics This commit adds the __GFP_ACCOUNT flag to verifier-induced memory allocations. The intent is to account for all allocations reachable from the BPF_PROG_LOAD command, which is needed to track verifier memory consumption in veristat. This includes allocations done in verifier.c and some allocations in btf.c; functions in log.c do not allocate. There is also a utility function bpf_memcg_flags() which selectively adds the __GFP_ACCOUNT flag depending on the `cgroup.memory=nobpf` option. As far as I understand [1], the idea is to remove bpf_prog instances and maps from memcg accounting as these objects do not strictly belong to a cgroup, hence it should not apply here. (btf_parse_fields() is reachable from both program load and map creation, but the allocated record is not persistent, as it is freed as soon as map_check_btf() exits). [1] https://lore.kernel.org/all/20230210154734.4416-1-laoar.shao@gmail.com/ Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250613072147.3938139-2-eddyz87@gmail.com --- kernel/bpf/btf.c | 15 +++++------ kernel/bpf/verifier.c | 69 ++++++++++++++++++++++++++------------------------- 2 files changed, 43 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 1d2cf898e21e..682acb1ed234 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3443,7 +3443,8 @@ btf_find_graph_root(const struct btf *btf, const struct btf_type *pt, node_field_name = strstr(value_type, ":"); if (!node_field_name) return -EINVAL; - value_type = kstrndup(value_type, node_field_name - value_type, GFP_KERNEL | __GFP_NOWARN); + value_type = kstrndup(value_type, node_field_name - value_type, + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!value_type) return -ENOMEM; id = btf_find_by_name_kind(btf, value_type, BTF_KIND_STRUCT); @@ -3958,7 +3959,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type /* This needs to be kzalloc to zero out padding and unused fields, see * comment in btf_record_equal. */ - rec = kzalloc(struct_size(rec, fields, cnt), GFP_KERNEL | __GFP_NOWARN); + rec = kzalloc(struct_size(rec, fields, cnt), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!rec) return ERR_PTR(-ENOMEM); @@ -9019,7 +9020,7 @@ static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands, bpf_free_cands_from_cache(*cc); *cc = NULL; } - new_cands = kmemdup(cands, sizeof_cands(cands->cnt), GFP_KERNEL); + new_cands = kmemdup(cands, sizeof_cands(cands->cnt), GFP_KERNEL_ACCOUNT); if (!new_cands) { bpf_free_cands(cands); return ERR_PTR(-ENOMEM); @@ -9027,7 +9028,7 @@ static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands, /* strdup the name, since it will stay in cache. * the cands->name points to strings in prog's BTF and the prog can be unloaded. 
*/ - new_cands->name = kmemdup_nul(cands->name, cands->name_len, GFP_KERNEL); + new_cands->name = kmemdup_nul(cands->name, cands->name_len, GFP_KERNEL_ACCOUNT); bpf_free_cands(cands); if (!new_cands->name) { kfree(new_cands); @@ -9111,7 +9112,7 @@ bpf_core_add_cands(struct bpf_cand_cache *cands, const struct btf *targ_btf, continue; /* most of the time there is only one candidate for a given kind+name pair */ - new_cands = kmalloc(sizeof_cands(cands->cnt + 1), GFP_KERNEL); + new_cands = kmalloc(sizeof_cands(cands->cnt + 1), GFP_KERNEL_ACCOUNT); if (!new_cands) { bpf_free_cands(cands); return ERR_PTR(-ENOMEM); @@ -9228,7 +9229,7 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, /* ~4k of temp memory necessary to convert LLVM spec like "0:1:0:5" * into arrays of btf_ids of struct fields and array indices. */ - specs = kcalloc(3, sizeof(*specs), GFP_KERNEL); + specs = kcalloc(3, sizeof(*specs), GFP_KERNEL_ACCOUNT); if (!specs) return -ENOMEM; @@ -9253,7 +9254,7 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, goto out; } if (cc->cnt) { - cands.cands = kcalloc(cc->cnt, sizeof(*cands.cands), GFP_KERNEL); + cands.cands = kcalloc(cc->cnt, sizeof(*cands.cands), GFP_KERNEL_ACCOUNT); if (!cands.cands) { err = -ENOMEM; goto out; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c378074516cf..fd81cff11617 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1404,7 +1404,7 @@ static void *realloc_array(void *arr, size_t old_n, size_t new_n, size_t size) goto out; alloc_size = kmalloc_size_roundup(size_mul(new_n, size)); - new_arr = krealloc(arr, alloc_size, GFP_KERNEL); + new_arr = krealloc(arr, alloc_size, GFP_KERNEL_ACCOUNT); if (!new_arr) { kfree(arr); return NULL; @@ -1421,7 +1421,7 @@ out: static int copy_reference_state(struct bpf_verifier_state *dst, const struct bpf_verifier_state *src) { dst->refs = copy_array(dst->refs, src->refs, src->acquired_refs, - sizeof(struct bpf_reference_state), GFP_KERNEL); + sizeof(struct bpf_reference_state), GFP_KERNEL_ACCOUNT); if (!dst->refs) return -ENOMEM; @@ -1440,7 +1440,7 @@ static int copy_stack_state(struct bpf_func_state *dst, const struct bpf_func_st size_t n = src->allocated_stack / BPF_REG_SIZE; dst->stack = copy_array(dst->stack, src->stack, n, sizeof(struct bpf_stack_state), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!dst->stack) return -ENOMEM; @@ -1731,7 +1731,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->jmp_history = copy_array(dst_state->jmp_history, src->jmp_history, src->jmp_history_cnt, sizeof(*dst_state->jmp_history), - GFP_USER); + GFP_KERNEL_ACCOUNT); if (!dst_state->jmp_history) return -ENOMEM; dst_state->jmp_history_cnt = src->jmp_history_cnt; @@ -1760,7 +1760,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { - dst = kzalloc(sizeof(*dst), GFP_KERNEL); + dst = kzalloc(sizeof(*dst), GFP_KERNEL_ACCOUNT); if (!dst) return -ENOMEM; dst_state->frame[i] = dst; @@ -1874,7 +1874,7 @@ static struct bpf_scc_visit *scc_visit_alloc(struct bpf_verifier_env *env, info = env->scc_info[scc]; num_visits = info ? 
info->num_visits : 0; new_sz = sizeof(*info) + sizeof(struct bpf_scc_visit) * (num_visits + 1); - info = kvrealloc(env->scc_info[scc], new_sz, GFP_KERNEL); + info = kvrealloc(env->scc_info[scc], new_sz, GFP_KERNEL_ACCOUNT); if (!info) return NULL; env->scc_info[scc] = info; @@ -2095,7 +2095,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, struct bpf_verifier_stack_elem *elem; int err; - elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); + elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL_ACCOUNT); if (!elem) goto err; @@ -2862,7 +2862,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, struct bpf_verifier_stack_elem *elem; struct bpf_func_state *frame; - elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); + elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL_ACCOUNT); if (!elem) goto err; @@ -2885,7 +2885,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, */ elem->st.branches = 1; elem->st.in_sleepable = is_sleepable; - frame = kzalloc(sizeof(*frame), GFP_KERNEL); + frame = kzalloc(sizeof(*frame), GFP_KERNEL_ACCOUNT); if (!frame) goto err; init_func_state(env, frame, @@ -3237,7 +3237,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return -EINVAL; } - tab = kzalloc(sizeof(*tab), GFP_KERNEL); + tab = kzalloc(sizeof(*tab), GFP_KERNEL_ACCOUNT); if (!tab) return -ENOMEM; prog_aux->kfunc_tab = tab; @@ -3253,7 +3253,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return 0; if (!btf_tab && offset) { - btf_tab = kzalloc(sizeof(*btf_tab), GFP_KERNEL); + btf_tab = kzalloc(sizeof(*btf_tab), GFP_KERNEL_ACCOUNT); if (!btf_tab) return -ENOMEM; prog_aux->kfunc_btf_tab = btf_tab; @@ -3939,7 +3939,7 @@ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_st cnt++; alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p))); - p = krealloc(cur->jmp_history, alloc_size, GFP_USER); + p = krealloc(cur->jmp_history, alloc_size, GFP_KERNEL_ACCOUNT); if (!p) return -ENOMEM; cur->jmp_history = p; @@ -10356,7 +10356,7 @@ static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int calls } caller = state->frame[state->curframe]; - callee = kzalloc(sizeof(*callee), GFP_KERNEL); + callee = kzalloc(sizeof(*callee), GFP_KERNEL_ACCOUNT); if (!callee) return -ENOMEM; state->frame[state->curframe + 1] = callee; @@ -17693,17 +17693,18 @@ static int check_cfg(struct bpf_verifier_env *env) int *insn_stack, *insn_state, *insn_postorder; int ex_insn_beg, i, ret = 0; - insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); if (!insn_state) return -ENOMEM; - insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); if (!insn_stack) { kvfree(insn_state); return -ENOMEM; } - insn_postorder = env->cfg.insn_postorder = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + insn_postorder = env->cfg.insn_postorder = + kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT); if (!insn_postorder) { kvfree(insn_state); kvfree(insn_stack); @@ -17837,7 +17838,7 @@ static int check_btf_func_early(struct bpf_verifier_env *env, urecord = make_bpfptr(attr->func_info, uattr.is_kernel); min_size = min_t(u32, krec_size, urec_size); - krecord = kvcalloc(nfuncs, 
krec_size, GFP_KERNEL | __GFP_NOWARN); + krecord = kvcalloc(nfuncs, krec_size, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!krecord) return -ENOMEM; @@ -17937,7 +17938,7 @@ static int check_btf_func(struct bpf_verifier_env *env, urecord = make_bpfptr(attr->func_info, uattr.is_kernel); krecord = prog->aux->func_info; - info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL | __GFP_NOWARN); + info_aux = kcalloc(nfuncs, sizeof(*info_aux), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!info_aux) return -ENOMEM; @@ -18023,7 +18024,7 @@ static int check_btf_line(struct bpf_verifier_env *env, * pass in a smaller bpf_line_info object. */ linfo = kvcalloc(nr_linfo, sizeof(struct bpf_line_info), - GFP_KERNEL | __GFP_NOWARN); + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!linfo) return -ENOMEM; @@ -19408,7 +19409,7 @@ hit: if (loop) { struct bpf_scc_backedge *backedge; - backedge = kzalloc(sizeof(*backedge), GFP_KERNEL); + backedge = kzalloc(sizeof(*backedge), GFP_KERNEL_ACCOUNT); if (!backedge) return -ENOMEM; err = copy_verifier_state(&backedge->state, cur); @@ -19472,7 +19473,7 @@ miss: * When looping the sl->state.branches will be > 0 and this state * will not be considered for equivalence until branches == 0. */ - new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); + new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL_ACCOUNT); if (!new_sl) return -ENOMEM; env->total_states++; @@ -22976,13 +22977,13 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) env->prev_linfo = NULL; env->pass_cnt++; - state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); + state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL_ACCOUNT); if (!state) return -ENOMEM; state->curframe = 0; state->speculative = false; state->branches = 1; - state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); + state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL_ACCOUNT); if (!state->frame[0]) { kfree(state); return -ENOMEM; @@ -23208,7 +23209,7 @@ static void print_verification_stats(struct bpf_verifier_env *env) int bpf_prog_ctx_arg_info_init(struct bpf_prog *prog, const struct bpf_ctx_arg_aux *info, u32 cnt) { - prog->aux->ctx_arg_info = kmemdup_array(info, cnt, sizeof(*info), GFP_KERNEL); + prog->aux->ctx_arg_info = kmemdup_array(info, cnt, sizeof(*info), GFP_KERNEL_ACCOUNT); prog->aux->ctx_arg_info_size = cnt; return prog->aux->ctx_arg_info ? 0 : -ENOMEM; @@ -24152,7 +24153,7 @@ static int compute_live_registers(struct bpf_verifier_env *env) * - repeat the computation while {in,out} fields changes for * any instruction. */ - state = kvcalloc(insn_cnt, sizeof(*state), GFP_KERNEL); + state = kvcalloc(insn_cnt, sizeof(*state), GFP_KERNEL_ACCOUNT); if (!state) { err = -ENOMEM; goto out; @@ -24244,10 +24245,10 @@ static int compute_scc(struct bpf_verifier_env *env) * - 'low[t] == n' => smallest preorder number of the vertex reachable from 't' is 'n'; * - 'dfs' DFS traversal stack, used to emulate explicit recursion. 
 	 */
-	stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
-	pre = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
-	low = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
-	dfs = kvcalloc(insn_cnt, sizeof(*dfs), GFP_KERNEL);
+	stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
+	pre = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
+	low = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL_ACCOUNT);
+	dfs = kvcalloc(insn_cnt, sizeof(*dfs), GFP_KERNEL_ACCOUNT);
 	if (!stack || !pre || !low || !dfs) {
 		err = -ENOMEM;
 		goto exit;
@@ -24381,7 +24382,7 @@ dfs_continue:
 			dfs_sz--;
 		}
 	}
-	env->scc_info = kvcalloc(next_scc_id, sizeof(*env->scc_info), GFP_KERNEL);
+	env->scc_info = kvcalloc(next_scc_id, sizeof(*env->scc_info), GFP_KERNEL_ACCOUNT);
 	if (!env->scc_info) {
 		err = -ENOMEM;
 		goto exit;
@@ -24409,7 +24410,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 	/* 'struct bpf_verifier_env' can be global, but since it's not small,
 	 * allocate/free it every time bpf_check() is called
 	 */
-	env = kvzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
+	env = kvzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL_ACCOUNT);
 	if (!env)
 		return -ENOMEM;
 
@@ -24472,7 +24473,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 
 	env->explored_states = kvcalloc(state_htab_size(env),
 				       sizeof(struct list_head),
-				       GFP_USER);
+				       GFP_KERNEL_ACCOUNT);
 	ret = -ENOMEM;
 	if (!env->explored_states)
 		goto skip_full_check;
@@ -24603,7 +24604,7 @@ skip_full_check:
 		/* if program passed verifier, update used_maps in bpf_prog_info */
 		env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
 							  sizeof(env->used_maps[0]),
-							  GFP_KERNEL);
+							  GFP_KERNEL_ACCOUNT);
 
 		if (!env->prog->aux->used_maps) {
 			ret = -ENOMEM;
@@ -24618,7 +24619,7 @@ skip_full_check:
 		/* if program passed verifier, update used_btfs in bpf_prog_aux */
 		env->prog->aux->used_btfs = kmalloc_array(env->used_btf_cnt,
 							  sizeof(env->used_btfs[0]),
-							  GFP_KERNEL);
+							  GFP_KERNEL_ACCOUNT);
 		if (!env->prog->aux->used_btfs) {
 			ret = -ENOMEM;
 			goto err_release_maps;
-- cgit v1.2.3


From 3157f7e2999616ac91f4d559a8566214f74000a5 Mon Sep 17 00:00:00 2001
From: Eduard Zingerman
Date: Fri, 13 Jun 2025 10:53:30 -0700
Subject: bpf: handle jset (if a & b ...) as a jump in CFG computation

BPF_JSET is a conditional jump and currently verifier.c:can_jump()
does not know about that. This can lead to incorrect live registers
and SCC computation. E.g. in the following example:

  1: r0 = 1;
  2: r2 = 2;
  3: if r1 & 0x7 goto +1;
  4: exit;
  5: r0 = r2;
  6: exit;

Without this fix, insn_successors(3) will return only (4); the jump to
(5) would be missed and r2 won't be marked as alive at (3).
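
For reference, BPF_JSET has branch semantics: "if (dst & src) goto
pc + off", so instruction (3) above has two successors, the fall-through
and the jump target. A sketch of the effect on the liveness analysis
(illustrative only, not part of the patch):

  /* before the fix: insn_successors(3) = {4}     => r2 considered dead at (3)
   * after the fix:  insn_successors(3) = {4, 5}  => r2 live at (3),
   *                                                 since (5) reads r2
   */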
Fixes: 14c8552db644 ("bpf: simple DFA-based live registers analysis") Reported-by: syzbot+a36aac327960ff474804@syzkaller.appspotmail.com Suggested-by: Alexei Starovoitov Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250613175331.3238739-1-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fd81cff11617..a1c312e216ca 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -23951,6 +23951,7 @@ static bool can_jump(struct bpf_insn *insn) case BPF_JSLT: case BPF_JSLE: case BPF_JCOND: + case BPF_JSET: return true; } -- cgit v1.2.3 From f66b4aaff2548bed5eedded0f47ae3a9ac933cec Mon Sep 17 00:00:00 2001 From: Luis Gerhorst Date: Fri, 13 Jun 2025 11:01:58 +0200 Subject: bpf: Remove redundant free_verifier_state()/pop_stack() This patch removes duplicated code. Eduard points out [1]: Same cleanup cycles are done in push_stack() and push_async_cb(), both functions are only reachable from do_check_common() via do_check() -> do_check_insn(). Hence, I think that cur state should not be freed in push_*() functions and pop_stack() loop there is not needed. This would also fix the 'symptom' for [2], but the issue also has a simpler fix which was sent separately. This fix also makes sure the push_*() callers always return an error for which error_recoverable_with_nospec(err) is false. This is required because otherwise we try to recover and access the stale `state`. Moving free_verifier_state() and pop_stack(..., pop_log=false) to happen after the bpf_vlog_reset() call in do_check_common() is fine because the pop_stack() call that is moved does not call bpf_vlog_reset() with the pop_log=false parameter. 
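
For illustration, a sketch of the caller convention this relies on
(hypothetical call site; the push_stack() signature is taken from the
diff below):

	branch = push_stack(env, insn_idx + 1, insn_idx, false);
	if (!branch)
		return -ENOMEM;	/* cur_state and the verifier stack are now
				 * torn down once, in free_states(), not here */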
[1] https://lore.kernel.org/all/b6931bd0dd72327c55287862f821ca6c4c3eb69a.camel@gmail.com/ [2] https://lore.kernel.org/all/68497853.050a0220.33aa0e.036a.GAE@google.com/ Reported-by: Eduard Zingerman Link: https://lore.kernel.org/all/b6931bd0dd72327c55287862f821ca6c4c3eb69a.camel@gmail.com/ Acked-by: Eduard Zingerman Signed-off-by: Luis Gerhorst Link: https://lore.kernel.org/r/20250613090157.568349-2-luis.gerhorst@fau.de Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 36 ++++++++++-------------------------- 1 file changed, 10 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a1c312e216ca..279a64933262 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2097,7 +2097,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL_ACCOUNT); if (!elem) - goto err; + return NULL; elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; @@ -2107,12 +2107,12 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, env->stack_size++; err = copy_verifier_state(&elem->st, cur); if (err) - goto err; + return NULL; elem->st.speculative |= speculative; if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) { verbose(env, "The sequence of %d jumps is too complex.\n", env->stack_size); - goto err; + return NULL; } if (elem->st.parent) { ++elem->st.parent->branches; @@ -2127,12 +2127,6 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, */ } return &elem->st; -err: - free_verifier_state(env->cur_state, true); - env->cur_state = NULL; - /* pop all elements and return */ - while (!pop_stack(env, NULL, NULL, false)); - return NULL; } #define CALLER_SAVED_REGS 6 @@ -2864,7 +2858,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL_ACCOUNT); if (!elem) - goto err; + return NULL; elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; @@ -2876,7 +2870,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, verbose(env, "The sequence of %d jumps is too complex for async cb.\n", env->stack_size); - goto err; + return NULL; } /* Unlike push_stack() do not copy_verifier_state(). * The caller state doesn't matter. 
@@ -2887,19 +2881,13 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env, elem->st.in_sleepable = is_sleepable; frame = kzalloc(sizeof(*frame), GFP_KERNEL_ACCOUNT); if (!frame) - goto err; + return NULL; init_func_state(env, frame, BPF_MAIN_FUNC /* callsite */, 0 /* frameno within this callchain */, subprog /* subprog number within this prog */); elem->st.frame[0] = frame; return &elem->st; -err: - free_verifier_state(env->cur_state, true); - env->cur_state = NULL; - /* pop all elements and return */ - while (!pop_stack(env, NULL, NULL, false)); - return NULL; } @@ -22935,6 +22923,10 @@ static void free_states(struct bpf_verifier_env *env) struct bpf_scc_info *info; int i, j; + free_verifier_state(env->cur_state, true); + env->cur_state = NULL; + while (!pop_stack(env, NULL, NULL, false)); + list_for_each_safe(pos, tmp, &env->free_list) { sl = container_of(pos, struct bpf_verifier_state_list, node); free_verifier_state(&sl->state, false); @@ -23086,14 +23078,6 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) ret = do_check(env); out: - /* check for NULL is necessary, since cur_state can be freed inside - * do_check() under memory pressure. - */ - if (env->cur_state) { - free_verifier_state(env->cur_state, true); - env->cur_state = NULL; - } - while (!pop_stack(env, NULL, NULL, false)); if (!ret && pop_log) bpf_vlog_reset(&env->log, 0); free_states(env); -- cgit v1.2.3 From 165af415168568f386709cf8cbc542036cb57fdd Mon Sep 17 00:00:00 2001 From: Cheng-Yang Chou Date: Wed, 11 Jun 2025 21:54:01 +0800 Subject: sched_ext: Always use SMP versions in kernel/sched/ext.c Simplify the scheduler by making formerly SMP-only primitives and data structures unconditional. tj: Updated subject for clarity. Signed-off-by: Cheng-Yang Chou Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 3623ba98d7d8..28bb6810e5d1 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1778,12 +1778,10 @@ static void run_deferred(struct rq *rq) process_ddsp_deferred_locals(rq); } -#ifdef CONFIG_SMP static void deferred_bal_cb_workfn(struct rq *rq) { run_deferred(rq); } -#endif static void deferred_irq_workfn(struct irq_work *irq_work) { @@ -1806,7 +1804,6 @@ static void schedule_deferred(struct rq *rq) { lockdep_assert_rq_held(rq); -#ifdef CONFIG_SMP /* * If in the middle of waking up a task, task_woken_scx() will be called * afterwards which will then run the deferred actions, no need to @@ -1824,7 +1821,7 @@ static void schedule_deferred(struct rq *rq) deferred_bal_cb_workfn); return; } -#endif + /* * No scheduler hooks available. Queue an irq work. They are executed on * IRQ re-enable which may take a bit longer than the scheduler hooks. 
@@ -2528,7 +2525,6 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags, p->scx.dsq = dst_dsq; } -#ifdef CONFIG_SMP /** * move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ * @p: task to move @@ -2695,11 +2691,6 @@ static bool consume_remote_task(struct rq *this_rq, struct task_struct *p, return false; } } -#else /* CONFIG_SMP */ -static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); } -static inline bool task_can_run_on_remote_rq(struct scx_sched *sch, struct task_struct *p, struct rq *rq, bool enforce) { return false; } -static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; } -#endif /* CONFIG_SMP */ /** * move_task_between_dsqs() - Move a task from one DSQ to another @@ -2872,9 +2863,7 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq, { struct rq *src_rq = task_rq(p); struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); -#ifdef CONFIG_SMP struct rq *locked_rq = rq; -#endif /* * We're synchronized against dequeue through DISPATCHING. As @p can't @@ -2888,7 +2877,6 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq, return; } -#ifdef CONFIG_SMP if (src_rq != dst_rq && unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { dispatch_enqueue(sch, find_global_dsq(p), p, @@ -2948,9 +2936,6 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq, raw_spin_rq_unlock(locked_rq); raw_spin_rq_lock(rq); } -#else /* CONFIG_SMP */ - BUG(); /* control can not reach here on UP */ -#endif /* CONFIG_SMP */ } /** @@ -3274,10 +3259,8 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) static enum scx_cpu_preempt_reason preempt_reason_from_class(const struct sched_class *class) { -#ifdef CONFIG_SMP if (class == &stop_sched_class) return SCX_CPU_PREEMPT_STOP; -#endif if (class == &dl_sched_class) return SCX_CPU_PREEMPT_DL; if (class == &rt_sched_class) @@ -3290,14 +3273,12 @@ static void switch_class(struct rq *rq, struct task_struct *next) struct scx_sched *sch = scx_root; const struct sched_class *next_class = next->sched_class; -#ifdef CONFIG_SMP /* * Pairs with the smp_load_acquire() issued by a CPU in * kick_cpus_irq_workfn() who is waiting for this CPU to perform a * resched. 
*/ smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); -#endif if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT)) return; @@ -3494,8 +3475,6 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, } #endif /* CONFIG_SCHED_CORE */ -#ifdef CONFIG_SMP - static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) { struct scx_sched *sch = scx_root; @@ -3625,7 +3604,6 @@ static void rq_offline_scx(struct rq *rq) rq->scx.flags &= ~SCX_RQ_ONLINE; } -#endif /* CONFIG_SMP */ static bool check_rq_for_timeouts(struct rq *rq) { @@ -4285,14 +4263,12 @@ DEFINE_SCHED_CLASS(ext) = { .put_prev_task = put_prev_task_scx, .set_next_task = set_next_task_scx, -#ifdef CONFIG_SMP .select_task_rq = select_task_rq_scx, .task_woken = task_woken_scx, .set_cpus_allowed = set_cpus_allowed_scx, .rq_online = rq_online_scx, .rq_offline = rq_offline_scx, -#endif .task_tick = task_tick_scx, -- cgit v1.2.3 From 6a1cda143c239475018e7f72f4359ed3c265653c Mon Sep 17 00:00:00 2001 From: Cheng-Yang Chou Date: Wed, 11 Jun 2025 21:54:02 +0800 Subject: sched_ext: Always use SMP versions in kernel/sched/ext.h Simplify the scheduler by making formerly SMP-only primitives and data structures unconditional. tj: Updated subject for clarity. Replace #if defined() with #ifdef. Signed-off-by: Cheng-Yang Chou Signed-off-by: Tejun Heo --- kernel/sched/ext.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 6d6d00e9de20..621efa8f0fe9 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -81,7 +81,7 @@ static inline void init_sched_ext_class(void) {} #endif /* CONFIG_SCHED_CLASS_EXT */ -#if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP) +#ifdef CONFIG_SCHED_CLASS_EXT void __scx_update_idle(struct rq *rq, bool idle, bool do_notify); static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) -- cgit v1.2.3 From 8834ace4a86db0a85cb003c2efd98e6a4389243c Mon Sep 17 00:00:00 2001 From: Cheng-Yang Chou Date: Wed, 11 Jun 2025 21:54:03 +0800 Subject: sched_ext: Always use SMP versions in kernel/sched/ext_idle.c Simplify the scheduler by making formerly SMP-only primitives and data structures unconditional. tj: Updated subject for clarity. Fixed stray #else block which wasn't removed causing build failure. 
Signed-off-by: Cheng-Yang Chou Signed-off-by: Tejun Heo --- kernel/sched/ext_idle.c | 35 +---------------------------------- 1 file changed, 1 insertion(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 17802693e304..b79cbdb7999a 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -17,7 +17,6 @@ static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); /* Enable/disable per-node idle cpumasks */ static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_per_node); -#ifdef CONFIG_SMP /* Enable/disable LLC aware optimizations */ static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc); @@ -794,17 +793,6 @@ static void reset_idle_masks(struct sched_ext_ops *ops) cpumask_and(idle_cpumask(node)->smt, cpu_online_mask, node_mask); } } -#else /* !CONFIG_SMP */ -static bool scx_idle_test_and_clear_cpu(int cpu) -{ - return -EBUSY; -} - -static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags) -{ - return -EBUSY; -} -#endif /* CONFIG_SMP */ void scx_idle_enable(struct sched_ext_ops *ops) { @@ -818,9 +806,7 @@ void scx_idle_enable(struct sched_ext_ops *ops) else static_branch_disable_cpuslocked(&scx_builtin_idle_per_node); -#ifdef CONFIG_SMP reset_idle_masks(ops); -#endif } void scx_idle_disable(void) @@ -906,7 +892,6 @@ static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_f if (!rq) lockdep_assert_held(&p->pi_lock); -#ifdef CONFIG_SMP /* * This may also be called from ops.enqueue(), so we need to handle * per-CPU tasks as well. For these tasks, we can skip all idle CPU @@ -923,9 +908,7 @@ static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_f cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, allowed ?: p->cpus_ptr, flags); } -#else - cpu = -EBUSY; -#endif + if (scx_kf_allowed_if_unlocked()) task_rq_unlock(rq, p, &rf); @@ -1016,11 +999,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) if (node < 0) return cpu_none_mask; -#ifdef CONFIG_SMP return idle_cpumask(node)->cpu; -#else - return cpu_none_mask; -#endif } /** @@ -1040,11 +1019,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) if (!check_builtin_idle_enabled()) return cpu_none_mask; -#ifdef CONFIG_SMP return idle_cpumask(NUMA_NO_NODE)->cpu; -#else - return cpu_none_mask; -#endif } /** @@ -1063,14 +1038,10 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) if (node < 0) return cpu_none_mask; -#ifdef CONFIG_SMP if (sched_smt_active()) return idle_cpumask(node)->smt; else return idle_cpumask(node)->cpu; -#else - return cpu_none_mask; -#endif } /** @@ -1091,14 +1062,10 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) if (!check_builtin_idle_enabled()) return cpu_none_mask; -#ifdef CONFIG_SMP if (sched_smt_active()) return idle_cpumask(NUMA_NO_NODE)->smt; else return idle_cpumask(NUMA_NO_NODE)->cpu; -#else - return cpu_none_mask; -#endif } /** -- cgit v1.2.3 From 545b343015ed1d34ee3e38dc48c6405097b5ac8d Mon Sep 17 00:00:00 2001 From: Cheng-Yang Chou Date: Wed, 11 Jun 2025 21:54:04 +0800 Subject: sched_ext: Always use SMP versions in kernel/sched/ext_idle.h Simplify the scheduler by making formerly SMP-only primitives and data structures unconditional. tj: Updated subject for clarity. 
Signed-off-by: Cheng-Yang Chou
Signed-off-by: Tejun Heo
---
 kernel/sched/ext_idle.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h
index 05e389ed72e4..fa583f141f35 100644
--- a/kernel/sched/ext_idle.h
+++ b/kernel/sched/ext_idle.h
@@ -12,13 +12,8 @@
 
 struct sched_ext_ops;
 
-#ifdef CONFIG_SMP
 void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops);
 void scx_idle_init_masks(void);
-#else /* !CONFIG_SMP */
-static inline void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) {}
-static inline void scx_idle_init_masks(void) {}
-#endif /* CONFIG_SMP */
 
 s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
 		       const struct cpumask *cpus_allowed, u64 flags);
-- cgit v1.2.3


From 4fa7d61d5a02ad57a05c69365db293afddf678fc Mon Sep 17 00:00:00 2001
From: "Yury Norov [NVIDIA]"
Date: Sat, 14 Jun 2025 11:50:29 -0400
Subject: clocksource: Use cpumask_any_but() in clocksource_verify_choose_cpus()

cpumask_any_but() is less verbose than cpumask_first() followed by
cpumask_next(). Use it in clocksource_verify_choose_cpus().

Signed-off-by: Yury Norov [NVIDIA]
Signed-off-by: Thomas Gleixner
Acked-by: John Stultz
Link: https://lore.kernel.org/all/20250614155031.340988-2-yury.norov@gmail.com
---
 kernel/time/clocksource.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 6a8bc7da9062..a2f2e9f4d37b 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -323,9 +323,7 @@ static void clocksource_verify_choose_cpus(void)
 		return;
 
 	/* Make sure to select at least one CPU other than the current CPU. */
-	cpu = cpumask_first(cpu_online_mask);
-	if (cpu == smp_processor_id())
-		cpu = cpumask_next(cpu, cpu_online_mask);
+	cpu = cpumask_any_but(cpu_online_mask, smp_processor_id());
 	if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
 		return;
 	cpumask_set_cpu(cpu, &cpus_chosen);
-- cgit v1.2.3


From bfa788dc2ddaea7d7930f63a5c7c8f3668a3f2c5 Mon Sep 17 00:00:00 2001
From: "Yury Norov [NVIDIA]"
Date: Sat, 14 Jun 2025 11:50:30 -0400
Subject: clocksource: Use cpumask_next_wrap() in clocksource_watchdog()

cpumask_next_wrap() is less verbose and more efficient compared to
cpumask_next() followed by cpumask_first().

Signed-off-by: Yury Norov [NVIDIA]
Signed-off-by: Thomas Gleixner
Acked-by: John Stultz
Link: https://lore.kernel.org/all/20250614155031.340988-3-yury.norov@gmail.com
---
 kernel/time/clocksource.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index a2f2e9f4d37b..e400fe150f9d 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -587,9 +587,7 @@ static void clocksource_watchdog(struct timer_list *unused)
 	 * Cycle through CPUs to check if the CPUs stay synchronized
 	 * to each other.
 	 */
-	next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
-	if (next_cpu >= nr_cpu_ids)
-		next_cpu = cpumask_first(cpu_online_mask);
+	next_cpu = cpumask_next_wrap(raw_smp_processor_id(), cpu_online_mask);
 
 	/*
 	 * Arm timer if not already pending: could race with concurrent
-- cgit v1.2.3


From 70e3ee31282d293c794fb5bbec8efe495c32044b Mon Sep 17 00:00:00 2001
From: Christian Brauner
Date: Thu, 12 Jun 2025 15:25:23 +0200
Subject: coredump: rename do_coredump() to vfs_coredump()

Align the naming with the rest of our helpers exposed outside of core
vfs.
Link: https://lore.kernel.org/20250612-work-coredump-massage-v1-9-315c0c34ba94@kernel.org Signed-off-by: Christian Brauner --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 148082db9a55..e2c928de7d2c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3016,7 +3016,7 @@ relock: * first and our do_group_exit call below will use * that value and ignore the one we pass it. */ - do_coredump(&ksig->info); + vfs_coredump(&ksig->info); } /* -- cgit v1.2.3 From f479fee3827aa8a532b62b41025075b25259117e Mon Sep 17 00:00:00 2001 From: Cheng-Yang Chou Date: Mon, 16 Jun 2025 04:04:14 +0800 Subject: sched_ext: Return NULL in llc_span Use NULL instead of 0 to signal no LLC domain, matching numa_span() and the function comment. No functional change. Signed-off-by: Cheng-Yang Chou Signed-off-by: Tejun Heo --- kernel/sched/ext_idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index b79cbdb7999a..dfacb0ad9b6f 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -249,7 +249,7 @@ static struct cpumask *llc_span(s32 cpu) sd = rcu_dereference(per_cpu(sd_llc, cpu)); if (!sd) - return 0; + return NULL; return sched_domain_span(sd); } -- cgit v1.2.3 From fda6add9243867486f8cd456d7b05395d2132e0a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 16 Jun 2025 13:59:20 -0400 Subject: workqueue: Basic memory allocation profiling support Hook alloc_workqueue and alloc_workqueue_attrs() so that they're accounted to the callsite. Since we're doing allocations on behalf of another subsystem, this helps when using memory allocation profiling to check for leaks. Cc: Tejun Heo Cc: Lai Jiangshan Signed-off-by: Kent Overstreet Signed-off-by: Tejun Heo --- kernel/workqueue.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d9de0f2a2e00..c24844afaa98 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4626,7 +4626,7 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs) * * Return: The allocated new workqueue_attr on success. %NULL on failure. */ -struct workqueue_attrs *alloc_workqueue_attrs(void) +struct workqueue_attrs *alloc_workqueue_attrs_noprof(void) { struct workqueue_attrs *attrs; @@ -5679,12 +5679,12 @@ static struct workqueue_struct *__alloc_workqueue(const char *fmt, else wq_size = sizeof(*wq); - wq = kzalloc(wq_size, GFP_KERNEL); + wq = kzalloc_noprof(wq_size, GFP_KERNEL); if (!wq) return NULL; if (flags & WQ_UNBOUND) { - wq->unbound_attrs = alloc_workqueue_attrs(); + wq->unbound_attrs = alloc_workqueue_attrs_noprof(); if (!wq->unbound_attrs) goto err_free_wq; } @@ -5774,9 +5774,9 @@ err_destroy: } __printf(1, 4) -struct workqueue_struct *alloc_workqueue(const char *fmt, - unsigned int flags, - int max_active, ...) +struct workqueue_struct *alloc_workqueue_noprof(const char *fmt, + unsigned int flags, + int max_active, ...) 
{
 	struct workqueue_struct *wq;
 	va_list args;
@@ -5791,7 +5791,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
 
 	return wq;
 }
-EXPORT_SYMBOL_GPL(alloc_workqueue);
+EXPORT_SYMBOL_GPL(alloc_workqueue_noprof);
 
 #ifdef CONFIG_LOCKDEP
 __printf(1, 5)
-- cgit v1.2.3


From ae1ae11fb277f1335d6bcd4935ba0ea985af3c32 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs
Date: Fri, 13 Jun 2025 15:58:00 -0400
Subject: audit,module: restore audit logging in load failure case

The move of the module sanity check to earlier in the load sequence
skipped the audit logging call in the failure case and moved it to a
place where the previously used context is unavailable. Add an audit
logging call for the module loading failure case and get the module
name when possible.

Link: https://issues.redhat.com/browse/RHEL-52839
Fixes: 02da2cbab452 ("module: move check_modinfo() early to early_mod_check()")
Signed-off-by: Richard Guy Briggs
Reviewed-by: Petr Pavlu
Signed-off-by: Paul Moore
---
 kernel/audit.h       | 2 +-
 kernel/auditsc.c     | 2 +-
 kernel/module/main.c | 6 ++++--
 3 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.h b/kernel/audit.h
index 0211cb307d30..2a24d01c5fb0 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -200,7 +200,7 @@ struct audit_context {
 			int			argc;
 		} execve;
 		struct {
-			char			*name;
+			const char		*name;
 		} module;
 		struct {
 			struct audit_ntp_data	ntp_data;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 78fd876a5473..eb98cd6fe91f 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2864,7 +2864,7 @@ void __audit_openat2_how(struct open_how *how)
 	context->type = AUDIT_OPENAT2;
 }
 
-void __audit_log_kern_module(char *name)
+void __audit_log_kern_module(const char *name)
 {
 	struct audit_context *context = audit_context();
 
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 413ac6ea3702..1ac487d2f7c4 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -3368,7 +3368,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
 
 	module_allocated = true;
 
-	audit_log_kern_module(mod->name);
+	audit_log_kern_module(info->name);
 
 	/* Reserve our place in the list. */
 	err = add_unformed_module(mod);
@@ -3532,8 +3532,10 @@ static int load_module(struct load_info *info, const char __user *uargs,
 	 * failures once the proper module was allocated and
 	 * before that.
 	 */
-	if (!module_allocated)
+	if (!module_allocated) {
+		audit_log_kern_module(info->name ? info->name : "?");
 		mod_stat_bump_becoming(info, flags);
+	}
 	free_copy(info, flags);
 	return err;
 }
-- cgit v1.2.3


From 2fbe82037ab2513275b9d97fe4fd9947df26e960 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?=
Date: Fri, 30 May 2025 05:54:37 +0200
Subject: sysfs: treewide: switch back to bin_attribute::read()/write()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The bin_attribute argument of bin_attribute::read() is now const.
This makes the _new() callbacks unnecessary. Switch all users back.
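
For illustration, a declaration using the restored callback (a sketch;
"example" is a made-up attribute name, the pattern mirrors the hunks
below):

	static struct bin_attribute bin_attr_example __ro_after_init = {
		.attr = { .name = "example", .mode = 0444, },
		.read = sysfs_bin_attr_simple_read,
	};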
Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20250530-sysfs-const-bin_attr-final-v3-3-724bfcf05b99@weissschuh.net Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/btf.c | 2 +- kernel/bpf/sysfs_btf.c | 2 +- kernel/module/sysfs.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 1d2cf898e21e..e8e63bd025c7 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -8183,7 +8183,7 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op, attr->attr.mode = 0444; attr->size = btf->data_size; attr->private = btf->data; - attr->read_new = sysfs_bin_attr_simple_read; + attr->read = sysfs_bin_attr_simple_read; err = sysfs_create_bin_file(btf_kobj, attr); if (err) { diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 941d0d2427e3..8951d8a2f2a3 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -45,7 +45,7 @@ static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj, static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = { .attr = { .name = "vmlinux", .mode = 0444, }, - .read_new = sysfs_bin_attr_simple_read, + .read = sysfs_bin_attr_simple_read, .mmap = btf_sysfs_vmlinux_mmap, }; diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c index b401ff4b02d2..5183ae86e05e 100644 --- a/kernel/module/sysfs.c +++ b/kernel/module/sysfs.c @@ -101,7 +101,7 @@ static int add_sect_attrs(struct module *mod, const struct load_info *info) ret = -ENOMEM; goto out; } - sattr->read_new = module_sect_read; + sattr->read = module_sect_read; sattr->private = (void *)sec->sh_addr; sattr->size = MODULE_SECT_READ_SIZE; sattr->attr.mode = 0400; @@ -190,7 +190,7 @@ static int add_notes_attrs(struct module *mod, const struct load_info *info) nattr->attr.mode = 0444; nattr->size = info->sechdrs[i].sh_size; nattr->private = (void *)info->sechdrs[i].sh_addr; - nattr->read_new = sysfs_bin_attr_simple_read; + nattr->read = sysfs_bin_attr_simple_read; *(gattr++) = nattr++; } ++loaded; -- cgit v1.2.3 From fb506e31b3d52f7faaec00352c2732ce31c1f930 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Fri, 30 May 2025 05:54:38 +0200 Subject: sysfs: treewide: switch back to attribute_group::bin_attrs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The normal bin_attrs field can now handle const pointers. This makes the _new variant unnecessary. Switch all users back. Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20250530-sysfs-const-bin_attr-final-v3-4-724bfcf05b99@weissschuh.net Signed-off-by: Greg Kroah-Hartman --- kernel/module/sysfs.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c index 5183ae86e05e..c7622ff5226a 100644 --- a/kernel/module/sysfs.c +++ b/kernel/module/sysfs.c @@ -56,9 +56,9 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs) { const struct bin_attribute *const *bin_attr; - for (bin_attr = sect_attrs->grp.bin_attrs_new; *bin_attr; bin_attr++) + for (bin_attr = sect_attrs->grp.bin_attrs; *bin_attr; bin_attr++) kfree((*bin_attr)->attr.name); - kfree(sect_attrs->grp.bin_attrs_new); + kfree(sect_attrs->grp.bin_attrs); kfree(sect_attrs); } @@ -86,7 +86,7 @@ static int add_sect_attrs(struct module *mod, const struct load_info *info) /* Setup section attributes. 
 */
 	sect_attrs->grp.name = "sections";
-	sect_attrs->grp.bin_attrs_new = gattr;
+	sect_attrs->grp.bin_attrs = gattr;
 
 	sattr = &sect_attrs->attrs[0];
 	for (i = 0; i < info->hdr->e_shnum; i++) {
@@ -144,7 +144,7 @@ struct module_notes_attrs {
 
 static void free_notes_attrs(struct module_notes_attrs *notes_attrs)
 {
-	kfree(notes_attrs->grp.bin_attrs_new);
+	kfree(notes_attrs->grp.bin_attrs);
 	kfree(notes_attrs);
 }
 
@@ -178,7 +178,7 @@ static int add_notes_attrs(struct module *mod, const struct load_info *info)
 	}
 
 	notes_attrs->grp.name = "notes";
-	notes_attrs->grp.bin_attrs_new = gattr;
+	notes_attrs->grp.bin_attrs = gattr;
 
 	nattr = &notes_attrs->attrs[0];
 	for (loaded = i = 0; i < info->hdr->e_shnum; ++i) {
-- cgit v1.2.3


From 571c1ea91a73db56bd94054fabecd0f070dc90db Mon Sep 17 00:00:00 2001
From: John Ogness
Date: Fri, 6 Jun 2025 21:01:49 +0206
Subject: printk: nbcon: Allow reacquire during panic

If a console printer is interrupted during panic, it will never be
able to reacquire ownership in order to perform any cleanup. That in
itself is not a problem, since the non-panic CPU will simply quiesce
in an endless loop within nbcon_reacquire_nobuf(). However, in this
state, platforms that do not support a true NMI to interrupt the
quiesced CPU will not be able to shut down that CPU from within
panic(). This then causes problems, such as being unable to load and
run a kdump kernel.

Fix this by allowing non-panic CPUs to reacquire ownership using a
direct acquire. Then the non-panic CPUs can successfully exit the
nbcon_reacquire_nobuf() loop and the console driver can perform any
necessary cleanup. But more importantly, the CPU is no longer quiesced
and is free to process any interrupts necessary for panic() to shut
down the CPU.

All other forms of acquire are still not allowed for non-panic CPUs
since it is safer to have them avoid gaining console ownership that is
not strictly necessary.

Reported-by: Michael Kelley
Closes: https://lore.kernel.org/r/SN6PR02MB4157A4C5E8CB219A75263A17D46DA@SN6PR02MB4157.namprd02.prod.outlook.com
Signed-off-by: John Ogness
Reviewed-by: Petr Mladek
Tested-by: Michael Kelley
Link: https://patch.msgid.link/20250606185549.900611-1-john.ogness@linutronix.de
Signed-off-by: Petr Mladek
---
 kernel/printk/nbcon.c | 63 +++++++++++++++++++++++++++++++++------------------
 1 file changed, 41 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c
index fd12efcc4aed..e7a3af81b173 100644
--- a/kernel/printk/nbcon.c
+++ b/kernel/printk/nbcon.c
@@ -214,8 +214,9 @@ static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq)
 
 /**
  * nbcon_context_try_acquire_direct - Try to acquire directly
- * @ctxt:	The context of the caller
- * @cur:	The current console state
+ * @ctxt:		The context of the caller
+ * @cur:		The current console state
+ * @is_reacquire:	This acquire is a reacquire
  *
  * Acquire the console when it is released. Also acquire the console when
  * the current owner has a lower priority and the console is in a safe state.
@@ -225,17 +226,17 @@ static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq)
  *
  * Errors:
  *
- *	-EPERM:		A panic is in progress and this is not the panic CPU.
- *			Or the current owner or waiter has the same or higher
- *			priority. No acquire method can be successful in
- *			this case.
+ *	-EPERM:		A panic is in progress and this is neither the panic
+ *			CPU nor is this a reacquire. Or the current owner or
+ *			waiter has the same or higher priority.
No acquire + * method can be successful in these cases. * * -EBUSY: The current owner has a lower priority but the console * in an unsafe state. The caller should try using * the handover acquire method. */ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt, - struct nbcon_state *cur) + struct nbcon_state *cur, bool is_reacquire) { unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; @@ -243,14 +244,20 @@ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt, do { /* - * Panic does not imply that the console is owned. However, it - * is critical that non-panic CPUs during panic are unable to - * acquire ownership in order to satisfy the assumptions of - * nbcon_waiter_matches(). In particular, the assumption that - * lower priorities are ignored during panic. + * Panic does not imply that the console is owned. However, + * since all non-panic CPUs are stopped during panic(), it + * is safer to have them avoid gaining console ownership. + * + * If this acquire is a reacquire (and an unsafe takeover + * has not previously occurred) then it is allowed to attempt + * a direct acquire in panic. This gives console drivers an + * opportunity to perform any necessary cleanup if they were + * interrupted by the panic CPU while printing. */ - if (other_cpu_in_panic()) + if (other_cpu_in_panic() && + (!is_reacquire || cur->unsafe_takeover)) { return -EPERM; + } if (ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio) return -EPERM; @@ -301,8 +308,9 @@ static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio) * Event #1 implies this context is EMERGENCY. * Event #2 implies the new context is PANIC. * Event #3 occurs when panic() has flushed the console. - * Events #4 and #5 are not possible due to the other_cpu_in_panic() - * check in nbcon_context_try_acquire_direct(). + * Event #4 occurs when a non-panic CPU reacquires. + * Event #5 is not possible due to the other_cpu_in_panic() check + * in nbcon_context_try_acquire_handover(). */ return (cur->req_prio == expected_prio); @@ -431,6 +439,16 @@ static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt, WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio); WARN_ON_ONCE(!cur->unsafe); + /* + * Panic does not imply that the console is owned. However, it + * is critical that non-panic CPUs during panic are unable to + * wait for a handover in order to satisfy the assumptions of + * nbcon_waiter_matches(). In particular, the assumption that + * lower priorities are ignored during panic. + */ + if (other_cpu_in_panic()) + return -EPERM; + /* Handover is not possible on the same CPU. */ if (cur->cpu == cpu) return -EBUSY; @@ -558,7 +576,8 @@ static struct printk_buffers panic_nbcon_pbufs; /** * nbcon_context_try_acquire - Try to acquire nbcon console - * @ctxt: The context of the caller + * @ctxt: The context of the caller + * @is_reacquire: This acquire is a reacquire * * Context: Under @ctxt->con->device_lock() or local_irq_save(). * Return: True if the console was acquired. False otherwise. @@ -568,7 +587,7 @@ static struct printk_buffers panic_nbcon_pbufs; * in an unsafe state. Otherwise, on success the caller may assume * the console is not in an unsafe state. 
 */
-static bool nbcon_context_try_acquire(struct nbcon_context *ctxt)
+static bool nbcon_context_try_acquire(struct nbcon_context *ctxt, bool is_reacquire)
 {
 	unsigned int cpu = smp_processor_id();
 	struct console *con = ctxt->console;
@@ -577,7 +596,7 @@ static bool nbcon_context_try_acquire(struct nbcon_context *ctxt)
 	nbcon_state_read(con, &cur);
 try_again:
-	err = nbcon_context_try_acquire_direct(ctxt, &cur);
+	err = nbcon_context_try_acquire_direct(ctxt, &cur, is_reacquire);
 	if (err != -EBUSY)
 		goto out;
 
@@ -913,7 +932,7 @@ void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt)
 {
 	struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
 
-	while (!nbcon_context_try_acquire(ctxt))
+	while (!nbcon_context_try_acquire(ctxt, true))
 		cpu_relax();
 
 	nbcon_write_context_set_buf(wctxt, NULL, 0);
@@ -1101,7 +1120,7 @@ static bool nbcon_emit_one(struct nbcon_write_context *wctxt, bool use_atomic)
 		cant_migrate();
 	}
 
-	if (!nbcon_context_try_acquire(ctxt))
+	if (!nbcon_context_try_acquire(ctxt, false))
 		goto out;
 
 	/*
@@ -1486,7 +1505,7 @@ static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq,
 	ctxt->prio			= nbcon_get_default_prio();
 	ctxt->allow_unsafe_takeover	= allow_unsafe_takeover;
 
-	if (!nbcon_context_try_acquire(ctxt))
+	if (!nbcon_context_try_acquire(ctxt, false))
 		return -EPERM;
 
 	while (nbcon_seq_read(con) < stop_seq) {
@@ -1762,7 +1781,7 @@ bool nbcon_device_try_acquire(struct console *con)
 	ctxt->console	= con;
 	ctxt->prio	= NBCON_PRIO_NORMAL;
 
-	if (!nbcon_context_try_acquire(ctxt))
+	if (!nbcon_context_try_acquire(ctxt, false))
 		return false;
 
 	if (!nbcon_context_enter_unsafe(ctxt))
-- cgit v1.2.3


From a766cfbbeb3a74397965a8fa2e9a402026d3e1d8 Mon Sep 17 00:00:00 2001
From: Song Liu
Date: Thu, 12 Jun 2025 22:28:56 -0700
Subject: bpf: Mark dentry->d_inode as trusted_or_null

LSM hooks such as security_path_mknod() and security_inode_rename()
have access to a newly allocated negative dentry, which has a NULL
d_inode. Therefore, it is necessary to do the NULL pointer check for
d_inode. Also add selftests that check that the verifier enforces the
NULL pointer check.
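
A sketch of the check the verifier now enforces (hypothetical BPF LSM
program; hook choice and names are for illustration only):

	SEC("lsm/path_mknod")
	int BPF_PROG(mknod_check, const struct path *dir, struct dentry *dentry)
	{
		struct inode *inode = dentry->d_inode;

		if (!inode)	/* negative dentry: d_inode is NULL */
			return 0;
		/* inode may now be used as a trusted, non-NULL pointer */
		return 0;
	}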
Signed-off-by: Song Liu Reviewed-by: Matt Bobrowski Link: https://lore.kernel.org/r/20250613052857.1992233-1-song@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a7d6e0c5928b..169845710c7e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7027,8 +7027,7 @@ BTF_TYPE_SAFE_TRUSTED(struct file) { struct inode *f_inode; }; -BTF_TYPE_SAFE_TRUSTED(struct dentry) { - /* no negative dentry-s in places where bpf can see it */ +BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry) { struct inode *d_inode; }; @@ -7066,7 +7065,6 @@ static bool type_is_trusted(struct bpf_verifier_env *env, BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file)); - BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted"); } @@ -7076,6 +7074,7 @@ static bool type_is_trusted_or_null(struct bpf_verifier_env *env, const char *field_name, u32 btf_id) { BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted_or_null"); -- cgit v1.2.3 From c50784e99f0e7199cdb12dbddf02229b102744ef Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Jun 2025 13:23:07 -1000 Subject: sched_ext: Make scx_group_set_weight() always update tg->scx.weight Otherwise, tg->scx.weight can go out of sync while scx_cgroup is not enabled and ops.cgroup_init() may be called with a stale weight value. Signed-off-by: Tejun Heo Fixes: 819513666966 ("sched_ext: Add cgroup support") Cc: stable@vger.kernel.org # v6.12+ --- kernel/sched/ext.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 2c41c78be61e..33a0d8c6ff95 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4241,12 +4241,12 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) percpu_down_read(&scx_cgroup_rwsem); - if (scx_cgroup_enabled && tg->scx_weight != weight) { - if (SCX_HAS_OP(sch, cgroup_set_weight)) - SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL, - tg_cgrp(tg), weight); - tg->scx_weight = weight; - } + if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) && + tg->scx_weight != weight) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL, + tg_cgrp(tg), weight); + + tg->scx_weight = weight; percpu_up_read(&scx_cgroup_rwsem); } -- cgit v1.2.3 From 33796b91871ad4010c8188372dd1faf97cf0f1c0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 16 Jun 2025 10:13:25 -1000 Subject: sched_ext, sched/core: Don't call scx_group_set_weight() prematurely from sched_create_group() During task_group creation, sched_create_group() calls scx_group_set_weight() with CGROUP_WEIGHT_DFL to initialize the sched_ext portion. This is premature and ends up calling ops.cgroup_set_weight() with an incorrect @cgrp before ops.cgroup_init() is called. sched_create_group() should just initialize SCX related fields in the new task_group. Fix it by factoring out scx_tg_init() from sched_init() and making sched_create_group() call that function instead of scx_group_set_weight(). 
v2: Retain CONFIG_EXT_GROUP_SCHED ifdef in sched_init() as removing it
    leads to build failures on !CONFIG_GROUP_SCHED configs.

Signed-off-by: Tejun Heo
Fixes: 819513666966 ("sched_ext: Add cgroup support")
Cc: stable@vger.kernel.org # v6.12+
---
 kernel/sched/core.c | 4 ++--
 kernel/sched/ext.c  | 5 +++++
 kernel/sched/ext.h  | 2 ++
 3 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index dce50fa57471..8988d38d46a3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8545,7 +8545,7 @@ void __init sched_init(void)
 		init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 #ifdef CONFIG_EXT_GROUP_SCHED
-		root_task_group.scx_weight = CGROUP_WEIGHT_DFL;
+		scx_tg_init(&root_task_group);
 #endif /* CONFIG_EXT_GROUP_SCHED */
 #ifdef CONFIG_RT_GROUP_SCHED
 		root_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -8985,7 +8985,7 @@ struct task_group *sched_create_group(struct task_group *parent)
 	if (!alloc_rt_sched_group(tg, parent))
 		goto err;
 
-	scx_group_set_weight(tg, CGROUP_WEIGHT_DFL);
+	scx_tg_init(tg);
 	alloc_uclamp_sched_group(tg, parent);
 
 	return tg;
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 33a0d8c6ff95..b498d867ba21 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4092,6 +4092,11 @@ bool scx_can_stop_tick(struct rq *rq)
 DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem);
 static bool scx_cgroup_enabled;
 
+void scx_tg_init(struct task_group *tg)
+{
+	tg->scx_weight = CGROUP_WEIGHT_DFL;
+}
+
 int scx_tg_online(struct task_group *tg)
 {
 	struct scx_sched *sch = scx_root;
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index 6e5072f57771..a75835c23f15 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -79,6 +79,7 @@ static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {}
 #ifdef CONFIG_CGROUP_SCHED
 #ifdef CONFIG_EXT_GROUP_SCHED
+void scx_tg_init(struct task_group *tg);
 int scx_tg_online(struct task_group *tg);
 void scx_tg_offline(struct task_group *tg);
 int scx_cgroup_can_attach(struct cgroup_taskset *tset);
@@ -88,6 +89,7 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
 void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
 void scx_group_set_idle(struct task_group *tg, bool idle);
 #else	/* CONFIG_EXT_GROUP_SCHED */
+static inline void scx_tg_init(struct task_group *tg) {}
 static inline int scx_tg_online(struct task_group *tg) { return 0; }
 static inline void scx_tg_offline(struct task_group *tg) {}
 static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; }
-- cgit v1.2.3


From 128ea9f6ccfb6960293ae4212f4f97165e42222d Mon Sep 17 00:00:00 2001
From: Marco Crivellari
Date: Sat, 14 Jun 2025 15:35:29 +0200
Subject: workqueue: Add system_percpu_wq and system_dfl_wq

Currently, if a user enqueues a work item using schedule_delayed_work(),
the wq used is "system_wq" (a per-CPU wq), while queue_delayed_work()
uses WORK_CPU_UNBOUND (used when a CPU is not specified). The same
applies to schedule_work(), which uses system_wq, and queue_work(),
which again makes use of WORK_CPU_UNBOUND.

This lack of consistency cannot be addressed without refactoring the
API.

system_wq is a per-CPU workqueue, yet nothing in its name tells of that
CPU affinity constraint, which is very often not required by users. Make
it clear by adding a system_percpu_wq.

system_unbound_wq should be the default workqueue so as not to enforce
locality constraints for random work whenever it's not required.
Add system_dfl_wq to encourage its use when unbound work should be used.

Suggested-by: Tejun Heo
Signed-off-by: Marco Crivellari
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index cf6203282737..71a4dd59977c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -505,12 +505,16 @@ static struct kthread_worker *pwq_release_worker __ro_after_init;
 
 struct workqueue_struct *system_wq __ro_after_init;
 EXPORT_SYMBOL(system_wq);
+struct workqueue_struct *system_percpu_wq __ro_after_init;
+EXPORT_SYMBOL(system_percpu_wq);
 struct workqueue_struct *system_highpri_wq __ro_after_init;
 EXPORT_SYMBOL_GPL(system_highpri_wq);
 struct workqueue_struct *system_long_wq __ro_after_init;
 EXPORT_SYMBOL_GPL(system_long_wq);
 struct workqueue_struct *system_unbound_wq __ro_after_init;
 EXPORT_SYMBOL_GPL(system_unbound_wq);
+struct workqueue_struct *system_dfl_wq __ro_after_init;
+EXPORT_SYMBOL_GPL(system_dfl_wq);
 struct workqueue_struct *system_freezable_wq __ro_after_init;
 EXPORT_SYMBOL_GPL(system_freezable_wq);
 struct workqueue_struct *system_power_efficient_wq __ro_after_init;
@@ -7816,10 +7820,11 @@ void __init workqueue_init_early(void)
 	}
 
 	system_wq = alloc_workqueue("events", 0, 0);
+	system_percpu_wq = alloc_workqueue("events", 0, 0);
 	system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
 	system_long_wq = alloc_workqueue("events_long", 0, 0);
-	system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
-					    WQ_MAX_ACTIVE);
+	system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
+	system_dfl_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
 	system_freezable_wq = alloc_workqueue("events_freezable",
 					      WQ_FREEZABLE, 0);
 	system_power_efficient_wq = alloc_workqueue("events_power_efficient",
@@ -7830,8 +7835,8 @@ void __init workqueue_init_early(void)
 	system_bh_wq = alloc_workqueue("events_bh", WQ_BH, 0);
 	system_bh_highpri_wq = alloc_workqueue("events_bh_highpri",
 					       WQ_BH | WQ_HIGHPRI, 0);
-	BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
-	       !system_unbound_wq || !system_freezable_wq ||
+	BUG_ON(!system_wq || !system_percpu_wq|| !system_highpri_wq || !system_long_wq ||
+	       !system_unbound_wq || !system_freezable_wq || !system_dfl_wq ||
 	       !system_power_efficient_wq ||
 	       !system_freezable_power_efficient_wq ||
 	       !system_bh_wq || !system_bh_highpri_wq);
-- cgit v1.2.3


From 261dce3d64021e7ec828a17b4975ce9182e54ceb Mon Sep 17 00:00:00 2001
From: Chuyi Zhou
Date: Tue, 17 Jun 2025 12:42:16 +0800
Subject: workqueue: Initialize wq_isolated_cpumask in workqueue_init_early()

Now when isolcpus is enabled via the cmdline, wq_isolated_cpumask does
not include these isolated CPUs, even though wq_unbound_cpumask has
already excluded them. It is only when we successfully configure an
isolated cpuset partition that wq_isolated_cpumask gets overwritten by
workqueue_unbound_exclude_cpumask(), including both the
cmdline-specified isolated CPUs and the isolated CPUs within the cpuset
partitions.

Fix this issue by initializing wq_isolated_cpumask properly in
workqueue_init_early().
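
Illustrative example (assumed boot parameters, not from the patch):
booting an 8-CPU machine with isolcpus=2-3, right after early init:

	/* housekeeping_cpumask(HK_TYPE_DOMAIN) == 0-1,4-7
	 * wq_isolated_cpumask == possible & ~housekeeping == 2-3
	 * (previously it stayed empty until an isolated cpuset
	 *  partition was configured)
	 */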
Fixes: fe28f631fa94 ("workqueue: Add workqueue_unbound_exclude_cpumask() to exclude CPUs from wq_unbound_cpumask")
Signed-off-by: Chuyi Zhou
Reviewed-by: Waiman Long
Signed-off-by: Tejun Heo
---
 kernel/workqueue.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 97f37b5bae66..9f9148075828 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -7767,7 +7767,8 @@ void __init workqueue_init_early(void)
 		restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask);
 
 	cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask);
-
+	cpumask_andnot(wq_isolated_cpumask, cpu_possible_mask,
+		       housekeeping_cpumask(HK_TYPE_DOMAIN));
 	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
 
 	unbound_wq_update_pwq_attrs_buf = alloc_workqueue_attrs();
-- cgit v1.2.3


From 1257b8786ac689a2ce5fe3e1741c65038035adc6 Mon Sep 17 00:00:00 2001
From: Shakeel Butt
Date: Tue, 17 Jun 2025 12:57:22 -0700
Subject: cgroup: support to enable nmi-safe css_rstat_updated

Add necessary infrastructure to enable the nmi-safe execution of
css_rstat_updated().

Currently css_rstat_updated() takes a per-cpu per-css raw spinlock to
add the given css to the per-cpu per-css update tree. However the
kernel cannot spin in nmi context, so we need to remove the spinning
on the raw spinlock in css_rstat_updated().

To support lockless css_rstat_updated(), let's add necessary data
structures in the css and ss structures.

Signed-off-by: Shakeel Butt
Tested-by: JP Kobryn
Signed-off-by: Tejun Heo
---
 kernel/cgroup/rstat.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index ce4752ab9e09..bfa6366d2325 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -11,6 +11,7 @@
 
 static DEFINE_SPINLOCK(rstat_base_lock);
 static DEFINE_PER_CPU(raw_spinlock_t, rstat_base_cpu_lock);
+static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list);
 
 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
 
@@ -45,6 +46,13 @@ static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss)
 	return &rstat_base_lock;
 }
 
+static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu)
+{
+	if (ss)
+		return per_cpu_ptr(ss->lhead, cpu);
+	return per_cpu_ptr(&rstat_backlog_list, cpu);
+}
+
 static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys *ss, int cpu)
 {
 	if (ss)
@@ -456,7 +464,8 @@ int css_rstat_init(struct cgroup_subsys_state *css)
 	for_each_possible_cpu(cpu) {
 		struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
 
-		rstatc->updated_children = css;
+		rstatc->owner = rstatc->updated_children = css;
+		init_llist_node(&rstatc->lnode);
 
 		if (is_self) {
 			struct cgroup_rstat_base_cpu *rstatbc;
@@ -525,9 +534,19 @@ int __init ss_rstat_init(struct cgroup_subsys *ss)
 	}
 #endif
 
+	if (ss) {
+		ss->lhead = alloc_percpu(struct llist_head);
+		if (!ss->lhead) {
+			free_percpu(ss->rstat_ss_cpu_lock);
+			return -ENOMEM;
+		}
+	}
+
 	spin_lock_init(ss_rstat_lock(ss));
-	for_each_possible_cpu(cpu)
+	for_each_possible_cpu(cpu) {
 		raw_spin_lock_init(ss_rstat_cpu_lock(ss, cpu));
+		init_llist_head(ss_lhead_cpu(ss, cpu));
+	}
 
 	return 0;
 }
-- cgit v1.2.3


From 36df6e3dbd7e7b074e55fec080012184e2fa3a46 Mon Sep 17 00:00:00 2001
From: Shakeel Butt
Date: Tue, 17 Jun 2025 12:57:23 -0700
Subject: cgroup: make css_rstat_updated nmi safe

To make css_rstat_updated() able to safely run in nmi context, let's
move the rstat update tree creation to the flush side and use
per-cpu lockless lists in struct cgroup_subsys to track the css whose stats are updated on that cpu.

The struct cgroup_subsys_state now has a per-cpu lnode which needs to be inserted into the corresponding per-cpu lhead of struct cgroup_subsys.

Since we want the insertion to be nmi safe, there can be multiple inserters on the same cpu for the same lnode, where the multiple inserters come from stacked contexts like softirq, hardirq and nmi. The current llist does not provide a function to protect against the scenario where multiple inserters use the same lnode, so using a bare llist_node out of the box is not safe for this scenario.

However, we can protect against multiple inserters using the same lnode by exploiting the fact that an llist node points to itself when it is not on a list: atomically reset that pointer and select the winner of the race as the single inserter.

Signed-off-by: Shakeel Butt
Tested-by: JP Kobryn
Signed-off-by: Tejun Heo
---
 kernel/cgroup/rstat.c | 65 +++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 53 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index bfa6366d2325..823a4c7c3fea 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -126,13 +126,16 @@ void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu,
  * @css: target cgroup subsystem state
  * @cpu: cpu on which rstat_cpu was updated
  *
- * @css's rstat_cpu on @cpu was updated. Put it on the parent's matching
- * rstat_cpu->updated_children list. See the comment on top of
- * css_rstat_cpu definition for details.
+ * Atomically inserts the css in the ss's llist for the given cpu. This is
+ * reentrant safe i.e. safe against softirq, hardirq and nmi. The ss's llist
+ * will be processed at the flush time to create the update tree.
  */
 __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
 {
-	unsigned long flags;
+	struct llist_head *lhead;
+	struct css_rstat_cpu *rstatc;
+	struct css_rstat_cpu __percpu *rstatc_pcpu;
+	struct llist_node *self;

 	/*
 	 * Since bpf programs can call this function, prevent access to
@@ -141,19 +144,44 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
 	if (!css_uses_rstat(css))
 		return;

+	lockdep_assert_preemption_disabled();
+
+	/*
+	 * For archs without nmi safe cmpxchg or percpu ops support, ignore
+	 * the requests from nmi context.
+	 */
+	if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) ||
+	     !IS_ENABLED(CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS)) && in_nmi())
+		return;
+
+	rstatc = css_rstat_cpu(css, cpu);
+	/* If already on list return. */
+	if (llist_on_list(&rstatc->lnode))
+		return;
+
 	/*
-	 * Speculative already-on-list test. This may race leading to
-	 * temporary inaccuracies, which is fine.
+	 * This function can be reentered by irqs and nmis for the same cgroup
+	 * and may try to insert the same per-cpu lnode into the llist. Note
+	 * that llist_add() does not protect against such scenarios.
 	 *
-	 * Because @parent's updated_children is terminated with @parent
-	 * instead of NULL, we can tell whether @css is on the list by
-	 * testing the next pointer for NULL.
+	 * To protect against such stacked contexts of irqs/nmis, we use the
+	 * fact that lnode points to itself when not on a list and then use
+	 * this_cpu_cmpxchg() to atomically set to NULL to select the winner
+	 * which will call llist_add(). The losers can assume the insertion is
+	 * successful and the winner will eventually add the per-cpu lnode to
+	 * the llist.
*/ - if (data_race(css_rstat_cpu(css, cpu)->updated_next)) + self = &rstatc->lnode; + rstatc_pcpu = css->rstat_cpu; + if (this_cpu_cmpxchg(rstatc_pcpu->lnode.next, self, NULL) != self) return; - flags = _css_rstat_cpu_lock(css, cpu, true); + lhead = ss_lhead_cpu(css->ss, cpu); + llist_add(&rstatc->lnode, lhead); +} +static void __css_process_update_tree(struct cgroup_subsys_state *css, int cpu) +{ /* put @css and all ancestors on the corresponding updated lists */ while (true) { struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu); @@ -179,8 +207,19 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) css = parent; } +} + +static void css_process_update_tree(struct cgroup_subsys *ss, int cpu) +{ + struct llist_head *lhead = ss_lhead_cpu(ss, cpu); + struct llist_node *lnode; + + while ((lnode = llist_del_first_init(lhead))) { + struct css_rstat_cpu *rstatc; - _css_rstat_cpu_unlock(css, cpu, flags, true); + rstatc = container_of(lnode, struct css_rstat_cpu, lnode); + __css_process_update_tree(rstatc->owner, cpu); + } } /** @@ -288,6 +327,8 @@ static struct cgroup_subsys_state *css_rstat_updated_list( flags = _css_rstat_cpu_lock(root, cpu, false); + css_process_update_tree(root->ss, cpu); + /* Return NULL if this subtree is not on-list */ if (!rstatc->updated_next) goto unlock_ret; -- cgit v1.2.3 From 6af89c6ca71742e9227e6f8172a86ce1ee16aa85 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 17 Jun 2025 12:57:24 -0700 Subject: cgroup: remove per-cpu per-subsystem locks The rstat update side used to insert the cgroup whose stats are updated in the update tree and the read side flush the update tree to get the latest uptodate stats. The per-cpu per-subsystem locks were used to synchronize the update and flush side. However now the update side does not access update tree but uses per-cpu lockless lists. So there is no need for locks to synchronize update and flush side. Let's remove them. Suggested-by: JP Kobryn Signed-off-by: Shakeel Butt Tested-by: JP Kobryn Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 100 ++------------------------------------------------ 1 file changed, 4 insertions(+), 96 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 823a4c7c3fea..c8a48cf83878 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -10,7 +10,6 @@ #include static DEFINE_SPINLOCK(rstat_base_lock); -static DEFINE_PER_CPU(raw_spinlock_t, rstat_base_cpu_lock); static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list); static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu); @@ -53,74 +52,6 @@ static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu) return per_cpu_ptr(&rstat_backlog_list, cpu); } -static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys *ss, int cpu) -{ - if (ss) - return per_cpu_ptr(ss->rstat_ss_cpu_lock, cpu); - - return per_cpu_ptr(&rstat_base_cpu_lock, cpu); -} - -/* - * Helper functions for rstat per CPU locks. - * - * This makes it easier to diagnose locking issues and contention in - * production environments. The parameter @fast_path determine the - * tracepoints being added, allowing us to diagnose "flush" related - * operations without handling high-frequency fast-path "update" events. 
- */ -static __always_inline -unsigned long _css_rstat_cpu_lock(struct cgroup_subsys_state *css, int cpu, - const bool fast_path) -{ - struct cgroup *cgrp = css->cgroup; - raw_spinlock_t *cpu_lock; - unsigned long flags; - bool contended; - - /* - * The _irqsave() is needed because the locks used for flushing are - * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring this lock - * with the _irq() suffix only disables interrupts on a non-PREEMPT_RT - * kernel. The raw_spinlock_t below disables interrupts on both - * configurations. The _irqsave() ensures that interrupts are always - * disabled and later restored. - */ - cpu_lock = ss_rstat_cpu_lock(css->ss, cpu); - contended = !raw_spin_trylock_irqsave(cpu_lock, flags); - if (contended) { - if (fast_path) - trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended); - else - trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended); - - raw_spin_lock_irqsave(cpu_lock, flags); - } - - if (fast_path) - trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended); - else - trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended); - - return flags; -} - -static __always_inline -void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu, - unsigned long flags, const bool fast_path) -{ - struct cgroup *cgrp = css->cgroup; - raw_spinlock_t *cpu_lock; - - if (fast_path) - trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false); - else - trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false); - - cpu_lock = ss_rstat_cpu_lock(css->ss, cpu); - raw_spin_unlock_irqrestore(cpu_lock, flags); -} - /** * css_rstat_updated - keep track of updated rstat_cpu * @css: target cgroup subsystem state @@ -323,15 +254,12 @@ static struct cgroup_subsys_state *css_rstat_updated_list( { struct css_rstat_cpu *rstatc = css_rstat_cpu(root, cpu); struct cgroup_subsys_state *head = NULL, *parent, *child; - unsigned long flags; - - flags = _css_rstat_cpu_lock(root, cpu, false); css_process_update_tree(root->ss, cpu); /* Return NULL if this subtree is not on-list */ if (!rstatc->updated_next) - goto unlock_ret; + return NULL; /* * Unlink @root from its parent. As the updated_children list is @@ -363,8 +291,7 @@ static struct cgroup_subsys_state *css_rstat_updated_list( rstatc->updated_children = root; if (child != root) head = css_rstat_push_children(head, child, cpu); -unlock_ret: - _css_rstat_cpu_unlock(root, cpu, flags, false); + return head; } @@ -560,34 +487,15 @@ int __init ss_rstat_init(struct cgroup_subsys *ss) { int cpu; -#ifdef CONFIG_SMP - /* - * On uniprocessor machines, arch_spinlock_t is defined as an empty - * struct. Avoid allocating a size of zero by having this block - * excluded in this case. It's acceptable to leave the subsystem locks - * unitialized since the associated lock functions are no-ops in the - * non-smp case. 
- */ - if (ss) { - ss->rstat_ss_cpu_lock = alloc_percpu(raw_spinlock_t); - if (!ss->rstat_ss_cpu_lock) - return -ENOMEM; - } -#endif - if (ss) { ss->lhead = alloc_percpu(struct llist_head); - if (!ss->lhead) { - free_percpu(ss->rstat_ss_cpu_lock); + if (!ss->lhead) return -ENOMEM; - } } spin_lock_init(ss_rstat_lock(ss)); - for_each_possible_cpu(cpu) { - raw_spin_lock_init(ss_rstat_cpu_lock(ss, cpu)); + for_each_possible_cpu(cpu) init_llist_head(ss_lhead_cpu(ss, cpu)); - } return 0; } -- cgit v1.2.3 From f5527f0171f049e73c0aed21507662abd78821b8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 15 Jun 2025 01:47:19 +0100 Subject: bpf: Get rid of redundant 3rd argument of prepare_seq_file() Remove 3rd argument in prepare_seq_file() to clean up the code a bit. Signed-off-by: Al Viro Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20250615004719.GE3011112@ZenIV Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_iter.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 380e9a7cac75..303ab1f42d3a 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -38,8 +38,7 @@ static DEFINE_MUTEX(link_mutex); /* incremented on every opened seq_file */ static atomic64_t session_id; -static int prepare_seq_file(struct file *file, struct bpf_iter_link *link, - const struct bpf_iter_seq_info *seq_info); +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link); static void bpf_iter_inc_seq_num(struct seq_file *seq) { @@ -257,7 +256,7 @@ static int iter_open(struct inode *inode, struct file *file) { struct bpf_iter_link *link = inode->i_private; - return prepare_seq_file(file, link, __get_seq_info(link)); + return prepare_seq_file(file, link); } static int iter_release(struct inode *inode, struct file *file) @@ -586,9 +585,9 @@ static void init_seq_meta(struct bpf_iter_priv_data *priv_data, priv_data->done_stop = false; } -static int prepare_seq_file(struct file *file, struct bpf_iter_link *link, - const struct bpf_iter_seq_info *seq_info) +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) { + const struct bpf_iter_seq_info *seq_info = __get_seq_info(link); struct bpf_iter_priv_data *priv_data; struct bpf_iter_target_info *tinfo; struct bpf_prog *prog; @@ -653,7 +652,7 @@ int bpf_iter_new_fd(struct bpf_link *link) } iter_link = container_of(link, struct bpf_iter_link, link); - err = prepare_seq_file(file, iter_link, __get_seq_info(iter_link)); + err = prepare_seq_file(file, iter_link); if (err) goto free_file; -- cgit v1.2.3 From bd07bd12f2c17d3e13d58c09f5eac5d021ec14ea Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Tue, 17 Jun 2025 10:57:36 -0400 Subject: bpf: Fix key serial argument of bpf_lookup_user_key() The underlying lookup_user_key() function uses a signed 32 bit integer for key serial numbers because legitimate serial numbers are positive (and > 3) and keyrings are negative. Using a u32 for the keyring in the bpf function doesn't currently cause any conversion problems but will start to trip the signed to unsigned conversion warnings when the kernel enables them, so convert the argument to signed (and update the tests accordingly) before it acquires more users. 
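A short illustration of why the signed type matters (a hypothetical BPF program, not part of this series): the KEY_SPEC_* keyring IDs are small negative values, so a u32 parameter would force an implicit sign conversion at every such call site.

	#include <vmlinux.h>
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	#define KEY_SPEC_SESSION_KEYRING -3	/* from uapi/linux/keyctl.h */

	extern struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags) __ksym;
	extern void bpf_key_put(struct bpf_key *bkey) __ksym;

	char LICENSE[] SEC("license") = "GPL";

	/* Sketch: look up the session keyring from a sleepable LSM program. */
	SEC("lsm.s/bpf")
	int BPF_PROG(lookup_session_keyring)
	{
		struct bpf_key *bkey;

		/* A negative serial now passes through without conversion. */
		bkey = bpf_lookup_user_key(KEY_SPEC_SESSION_KEYRING, 0);
		if (!bkey)
			return 0;

		bpf_key_put(bkey);
		return 0;
	}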
Signed-off-by: James Bottomley
Reviewed-by: Roberto Sassu
Link: https://lore.kernel.org/r/84cdb0775254d297d75e21f577089f64abdfbd28.camel@HansenPartnership.com
Signed-off-by: Alexei Starovoitov
---
 kernel/trace/bpf_trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 132c8be6f635..0a06ea6638fe 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1270,7 +1270,7 @@ __bpf_kfunc_start_defs();
  * Return: a bpf_key pointer with a valid key pointer if the key is found, a
  * NULL pointer otherwise.
  */
-__bpf_kfunc struct bpf_key *bpf_lookup_user_key(u32 serial, u64 flags)
+__bpf_kfunc struct bpf_key *bpf_lookup_user_key(s32 serial, u64 flags)
 {
 	key_ref_t key_ref;
 	struct bpf_key *bkey;
--
cgit v1.2.3

From 327e28664307d49ce3fa71ba30dcc0007c270974 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Wed, 18 Jun 2025 07:38:01 -0400
Subject: fgraph: Do not enable function_graph tracer when setting funcgraph-args

When the funcgraph-args option is set while the function graph tracer is not enabled, it incorrectly enables the tracer. Worse, it unregisters itself when it was never registered. Then when it gets enabled again, it will register itself a second time, causing a WARNing.

 ~# echo 1 > /sys/kernel/tracing/options/funcgraph-args
 ~# head -20 /sys/kernel/tracing/trace
 # tracer: nop
 #
 # entries-in-buffer/entries-written: 813/26317372   #P:8
 #
 #                                _-----=> irqs-off/BH-disabled
 #                               / _----=> need-resched
 #                              | / _---=> hardirq/softirq
 #                              || / _--=> preempt-depth
 #                              ||| / _-=> migrate-disable
 #                              |||| /     delay
 #           TASK-PID     CPU#  |||||  TIMESTAMP  FUNCTION
 #              | |         |   |||||     |         |
           <idle>-0     [007] d..4.   358.966010:   7)  1.692 us  |  fetch_next_timer_interrupt(basej=4294981640, basem=357956000000, base_local=0xffff88823c3ae040, base_global=0xffff88823c3af300, tevt=0xffff888100e47cb8);
           <idle>-0     [007] d..4.   358.966012:   7)            |  tmigr_cpu_deactivate(nextexp=357988000000) {
           <idle>-0     [007] d..4.   358.966013:   7)            |    _raw_spin_lock(lock=0xffff88823c3b2320) {
           <idle>-0     [007] d..4.   358.966014:   7)  0.981 us  |      preempt_count_add(val=1);
           <idle>-0     [007] d..5.   358.966017:   7)  1.058 us  |      do_raw_spin_lock(lock=0xffff88823c3b2320);
           <idle>-0     [007] d..4.   358.966019:   7)  5.824 us  |    }
           <idle>-0     [007] d..5.   358.966021:   7)            |    tmigr_inactive_up(group=0xffff888100cb9000, child=0x0, data=0xffff888100e47bc0) {
           <idle>-0     [007] d..5.   358.966022:   7)            |      tmigr_update_events(group=0xffff888100cb9000, child=0x0, data=0xffff888100e47bc0) {

Notice the "tracer: nop" at the top there. The current tracer is the "nop" tracer, but the content is obviously the function graph tracer.
Enabling function graph tracing will cause it to register again and trigger a warning in the accounting: ~# echo function_graph > /sys/kernel/tracing/current_tracer -bash: echo: write error: Device or resource busy With the dmesg of: ------------[ cut here ]------------ WARNING: CPU: 7 PID: 1095 at kernel/trace/ftrace.c:3509 ftrace_startup_subops+0xc1e/0x1000 Modules linked in: kvm_intel kvm irqbypass CPU: 7 UID: 0 PID: 1095 Comm: bash Not tainted 6.16.0-rc2-test-00006-gea03de4105d3 #24 PREEMPT Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 RIP: 0010:ftrace_startup_subops+0xc1e/0x1000 Code: 48 b8 22 01 00 00 00 00 ad de 49 89 84 24 88 01 00 00 8b 44 24 08 89 04 24 e9 c3 f7 ff ff c7 04 24 ed ff ff ff e9 b7 f7 ff ff <0f> 0b c7 04 24 f0 ff ff ff e9 a9 f7 ff ff c7 04 24 f4 ff ff ff e9 RSP: 0018:ffff888133cff948 EFLAGS: 00010202 RAX: 0000000000000001 RBX: 1ffff1102679ff31 RCX: 0000000000000000 RDX: 1ffffffff0b27a60 RSI: ffffffff8593d2f0 RDI: ffffffff85941140 RBP: 00000000000c2041 R08: ffffffffffffffff R09: ffffed1020240221 R10: ffff88810120110f R11: ffffed1020240214 R12: ffffffff8593d2f0 R13: ffffffff8593d300 R14: ffffffff85941140 R15: ffffffff85631100 FS: 00007f7ec6f28740(0000) GS:ffff8882b5251000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f7ec6f181c0 CR3: 000000012f1d0005 CR4: 0000000000172ef0 Call Trace: ? __pfx_ftrace_startup_subops+0x10/0x10 ? find_held_lock+0x2b/0x80 ? ftrace_stub_direct_tramp+0x10/0x10 ? ftrace_stub_direct_tramp+0x10/0x10 ? trace_preempt_on+0xd0/0x110 ? __pfx_trace_graph_entry_args+0x10/0x10 register_ftrace_graph+0x4d2/0x1020 ? tracing_reset_online_cpus+0x14b/0x1e0 ? __pfx_register_ftrace_graph+0x10/0x10 ? ring_buffer_record_enable+0x16/0x20 ? tracing_reset_online_cpus+0x153/0x1e0 ? __pfx_tracing_reset_online_cpus+0x10/0x10 ? __pfx_trace_graph_return+0x10/0x10 graph_trace_init+0xfd/0x160 tracing_set_tracer+0x500/0xa80 ? __pfx_tracing_set_tracer+0x10/0x10 ? lock_release+0x181/0x2d0 ? _copy_from_user+0x26/0xa0 tracing_set_trace_write+0x132/0x1e0 ? __pfx_tracing_set_trace_write+0x10/0x10 ? ftrace_graph_func+0xcc/0x140 ? ftrace_stub_direct_tramp+0x10/0x10 ? ftrace_stub_direct_tramp+0x10/0x10 ? ftrace_stub_direct_tramp+0x10/0x10 vfs_write+0x1d0/0xe90 ? __pfx_vfs_write+0x10/0x10 Have the setting of the funcgraph-args check if function_graph tracer is the current tracer of the instance, and if not, do nothing, as there's nothing to do (the option is checked when function_graph tracing starts). 
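With the guard in place, the same sequence leaves the current tracer untouched; expected behavior after the fix (hypothetical session):

 ~# cat /sys/kernel/tracing/current_tracer
 nop
 ~# echo 1 > /sys/kernel/tracing/options/funcgraph-args
 ~# cat /sys/kernel/tracing/current_tracer
 nop
 ~# echo function_graph > /sys/kernel/tracing/current_tracer
 ~# cat /sys/kernel/tracing/current_tracer
 function_graph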
Cc: stable@vger.kernel.org Cc: Mathieu Desnoyers Cc: Mark Rutland Link: https://lore.kernel.org/20250618073801.057ea636@gandalf.local.home Fixes: c7a60a733c373 ("ftrace: Have funcgraph-args take affect during tracing") Closes: https://lore.kernel.org/all/4ab1a7bdd0174ab09c7b0d68cdbff9a4@huawei.com/ Reported-by: Changbin Du Tested-by: Changbin Du Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_functions_graph.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 9234e2c39abf..14d74a7491b8 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -455,10 +455,16 @@ static int graph_trace_init(struct trace_array *tr) return 0; } +static struct tracer graph_trace; + static int ftrace_graph_trace_args(struct trace_array *tr, int set) { trace_func_graph_ent_t entry; + /* Do nothing if the current tracer is not this tracer */ + if (tr->current_trace != &graph_trace) + return 0; + if (set) entry = trace_graph_entry_args; else -- cgit v1.2.3 From d403a3689af5c3a3e3ac6e282958d0eaa69ca47f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Jun 2025 15:23:27 -1000 Subject: sched/fair: Move max_cfs_quota_period decl and default_cfs_period() def from fair.c to sched.h max_cfs_quota_period is defined in core.c but has a declaration in fair.c. Move the declaration to kernel/sched/sched.h. Also, move default_cfs_period() from fair.c to sched.h. No functional changes. Signed-off-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250614012346.2358261-2-tj@kernel.org --- kernel/sched/fair.c | 11 ----------- kernel/sched/sched.h | 13 +++++++++++++ 2 files changed, 13 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6b17d3da034a..707be4570430 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5617,15 +5617,6 @@ void cfs_bandwidth_usage_inc(void) {} void cfs_bandwidth_usage_dec(void) {} #endif /* !CONFIG_JUMP_LABEL */ -/* - * default period for cfs group bandwidth. - * default: 0.1s, units: nanoseconds - */ -static inline u64 default_cfs_period(void) -{ - return 100000000ULL; -} - static inline u64 sched_cfs_bandwidth_slice(void) { return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC; @@ -6405,8 +6396,6 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) return HRTIMER_NORESTART; } -extern const u64 max_cfs_quota_period; - static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c323d015486c..e00b80cd998e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -402,6 +402,19 @@ static inline bool dl_server_active(struct sched_dl_entity *dl_se) extern struct list_head task_groups; +#ifdef CONFIG_CFS_BANDWIDTH +extern const u64 max_cfs_quota_period; + +/* + * default period for cfs group bandwidth. 
+ * default: 0.1s, units: nanoseconds + */ +static inline u64 default_cfs_period(void) +{ + return 100000000ULL; +} +#endif /* CONFIG_CFS_BANDWIDTH */ + struct cfs_bandwidth { #ifdef CONFIG_CFS_BANDWIDTH raw_spinlock_t lock; -- cgit v1.2.3 From de4c80c6963e130707ead16a544a387f811dbd87 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Jun 2025 15:23:28 -1000 Subject: sched/core: Relocate tg_get_cfs_*() and cpu_cfs_*_read_*() Collect the getters, relocate the trivial interface file wrappers, and put all of them in period, quota, burst order to prepare for future changes. Pure reordering. No functional changes. Signed-off-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250614012346.2358261-3-tj@kernel.org --- kernel/sched/core.c | 134 ++++++++++++++++++++++++++-------------------------- 1 file changed, 67 insertions(+), 67 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a03c3c1d7f50..dc9668a6592f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9404,20 +9404,14 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, return 0; } -static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) +static long tg_get_cfs_period(struct task_group *tg) { - u64 quota, period, burst; + u64 cfs_period_us; - period = ktime_to_ns(tg->cfs_bandwidth.period); - burst = tg->cfs_bandwidth.burst; - if (cfs_quota_us < 0) - quota = RUNTIME_INF; - else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC) - quota = (u64)cfs_quota_us * NSEC_PER_USEC; - else - return -EINVAL; + cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); + do_div(cfs_period_us, NSEC_PER_USEC); - return tg_set_cfs_bandwidth(tg, period, quota, burst); + return cfs_period_us; } static long tg_get_cfs_quota(struct task_group *tg) @@ -9433,6 +9427,16 @@ static long tg_get_cfs_quota(struct task_group *tg) return quota_us; } +static long tg_get_cfs_burst(struct task_group *tg) +{ + u64 burst_us; + + burst_us = tg->cfs_bandwidth.burst; + do_div(burst_us, NSEC_PER_USEC); + + return burst_us; +} + static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) { u64 quota, period, burst; @@ -9447,14 +9451,20 @@ static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) return tg_set_cfs_bandwidth(tg, period, quota, burst); } -static long tg_get_cfs_period(struct task_group *tg) +static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) { - u64 cfs_period_us; + u64 quota, period, burst; - cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); - do_div(cfs_period_us, NSEC_PER_USEC); + period = ktime_to_ns(tg->cfs_bandwidth.period); + burst = tg->cfs_bandwidth.burst; + if (cfs_quota_us < 0) + quota = RUNTIME_INF; + else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC) + quota = (u64)cfs_quota_us * NSEC_PER_USEC; + else + return -EINVAL; - return cfs_period_us; + return tg_set_cfs_bandwidth(tg, period, quota, burst); } static int tg_set_cfs_burst(struct task_group *tg, long cfs_burst_us) @@ -9471,52 +9481,6 @@ static int tg_set_cfs_burst(struct task_group *tg, long cfs_burst_us) return tg_set_cfs_bandwidth(tg, period, quota, burst); } -static long tg_get_cfs_burst(struct task_group *tg) -{ - u64 burst_us; - - burst_us = tg->cfs_bandwidth.burst; - do_div(burst_us, NSEC_PER_USEC); - - return burst_us; -} - -static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return tg_get_cfs_quota(css_tg(css)); -} - -static int cpu_cfs_quota_write_s64(struct 
cgroup_subsys_state *css, - struct cftype *cftype, s64 cfs_quota_us) -{ - return tg_set_cfs_quota(css_tg(css), cfs_quota_us); -} - -static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return tg_get_cfs_period(css_tg(css)); -} - -static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, - struct cftype *cftype, u64 cfs_period_us) -{ - return tg_set_cfs_period(css_tg(css), cfs_period_us); -} - -static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return tg_get_cfs_burst(css_tg(css)); -} - -static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css, - struct cftype *cftype, u64 cfs_burst_us) -{ - return tg_set_cfs_burst(css_tg(css), cfs_burst_us); -} - struct cfs_schedulable_data { struct task_group *tg; u64 period, quota; @@ -9649,6 +9613,42 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v) return 0; } + +static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return tg_get_cfs_period(css_tg(css)); +} + +static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return tg_get_cfs_quota(css_tg(css)); +} + +static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return tg_get_cfs_burst(css_tg(css)); +} + +static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 cfs_period_us) +{ + return tg_set_cfs_period(css_tg(css), cfs_period_us); +} + +static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css, + struct cftype *cftype, s64 cfs_quota_us) +{ + return tg_set_cfs_quota(css_tg(css), cfs_quota_us); +} + +static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 cfs_burst_us) +{ + return tg_set_cfs_burst(css_tg(css), cfs_burst_us); +} #endif /* CONFIG_CFS_BANDWIDTH */ #ifdef CONFIG_RT_GROUP_SCHED @@ -9710,16 +9710,16 @@ static struct cftype cpu_legacy_files[] = { }, #endif #ifdef CONFIG_CFS_BANDWIDTH - { - .name = "cfs_quota_us", - .read_s64 = cpu_cfs_quota_read_s64, - .write_s64 = cpu_cfs_quota_write_s64, - }, { .name = "cfs_period_us", .read_u64 = cpu_cfs_period_read_u64, .write_u64 = cpu_cfs_period_write_u64, }, + { + .name = "cfs_quota_us", + .read_s64 = cpu_cfs_quota_read_s64, + .write_s64 = cpu_cfs_quota_write_s64, + }, { .name = "cfs_burst_us", .read_u64 = cpu_cfs_burst_read_u64, -- cgit v1.2.3 From 43e33f53e25687ca870248d1939cfade0164426c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Jun 2025 15:23:29 -1000 Subject: sched/core: Reorganize cgroup bandwidth control interface file reads - Update tg_get_cfs_*() to return u64 values. These are now used as the low level accessors to the fair's bandwidth configuration parameters. Translation to usecs takes place in these functions. - Add tg_bandwidth() which reads all three bandwidth parameters using tg_get_cfs_*(). - Reimplement cgroup interface read functions using tg_bandwidth(). Drop cfs from the function names. This is to prepare for adding bandwidth control support to sched_ext. tg_bandwidth() will be used as the muxing point similar to tg_weight(). No functional changes. 
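The optional out-parameters are what make tg_bandwidth() usable as a single muxing getter; a minimal usage sketch (names taken from the patch below, callers pass NULL for values they do not need):

	u64 period_us, quota_us;

	/* cpu.cfs_period_us only needs the period ... */
	tg_bandwidth(tg, &period_us, NULL, NULL);

	/* ... while cpu.max reads period and quota together. */
	tg_bandwidth(tg, &period_us, &quota_us, NULL);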
Signed-off-by: Tejun Heo
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lore.kernel.org/r/20250614012346.2358261-4-tj@kernel.org
---
 kernel/sched/core.c | 58 ++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 40 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index dc9668a6592f..8de93a3bfa27 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9404,7 +9404,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
 	return 0;
 }

-static long tg_get_cfs_period(struct task_group *tg)
+static u64 tg_get_cfs_period(struct task_group *tg)
 {
 	u64 cfs_period_us;

@@ -9414,12 +9414,12 @@ static long tg_get_cfs_period(struct task_group *tg)
 	return cfs_period_us;
 }

-static long tg_get_cfs_quota(struct task_group *tg)
+static u64 tg_get_cfs_quota(struct task_group *tg)
 {
 	u64 quota_us;

 	if (tg->cfs_bandwidth.quota == RUNTIME_INF)
-		return -1;
+		return RUNTIME_INF;

 	quota_us = tg->cfs_bandwidth.quota;
 	do_div(quota_us, NSEC_PER_USEC);
@@ -9427,7 +9427,7 @@ static long tg_get_cfs_quota(struct task_group *tg)
 	return quota_us;
 }

-static long tg_get_cfs_burst(struct task_group *tg)
+static u64 tg_get_cfs_burst(struct task_group *tg)
 {
 	u64 burst_us;

@@ -9614,22 +9614,42 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v)
 	return 0;
 }

-static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
-				   struct cftype *cft)
+static void tg_bandwidth(struct task_group *tg,
+			 u64 *period_us_p, u64 *quota_us_p, u64 *burst_us_p)
 {
-	return tg_get_cfs_period(css_tg(css));
+	if (period_us_p)
+		*period_us_p = tg_get_cfs_period(tg);
+	if (quota_us_p)
+		*quota_us_p = tg_get_cfs_quota(tg);
+	if (burst_us_p)
+		*burst_us_p = tg_get_cfs_burst(tg);
 }

-static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
-				  struct cftype *cft)
+static u64 cpu_period_read_u64(struct cgroup_subsys_state *css,
+			       struct cftype *cft)
 {
-	return tg_get_cfs_quota(css_tg(css));
+	u64 period_us;
+
+	tg_bandwidth(css_tg(css), &period_us, NULL, NULL);
+	return period_us;
 }

-static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css,
-				  struct cftype *cft)
+static s64 cpu_quota_read_s64(struct cgroup_subsys_state *css,
+			      struct cftype *cft)
 {
-	return tg_get_cfs_burst(css_tg(css));
+	u64 quota_us;
+
+	tg_bandwidth(css_tg(css), NULL, &quota_us, NULL);
+	return quota_us;	/* (s64)RUNTIME_INF becomes -1 */
+}
+
+static u64 cpu_burst_read_u64(struct cgroup_subsys_state *css,
+			      struct cftype *cft)
+{
+	u64 burst_us;
+
+	tg_bandwidth(css_tg(css), NULL, NULL, &burst_us);
+	return burst_us;
 }

 static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
@@ -9712,17 +9732,17 @@ static struct cftype cpu_legacy_files[] = {
 #ifdef CONFIG_CFS_BANDWIDTH
 	{
 		.name = "cfs_period_us",
-		.read_u64 = cpu_cfs_period_read_u64,
+		.read_u64 = cpu_period_read_u64,
 		.write_u64 = cpu_cfs_period_write_u64,
 	},
 	{
 		.name = "cfs_quota_us",
-		.read_s64 = cpu_cfs_quota_read_s64,
+		.read_s64 = cpu_quota_read_s64,
 		.write_s64 = cpu_cfs_quota_write_s64,
 	},
 	{
 		.name = "cfs_burst_us",
-		.read_u64 = cpu_cfs_burst_read_u64,
+		.read_u64 = cpu_burst_read_u64,
 		.write_u64 = cpu_cfs_burst_write_u64,
 	},
 	{
@@ -9944,8 +9964,10 @@ static int __maybe_unused cpu_period_quota_parse(char *buf,
 static int cpu_max_show(struct seq_file *sf, void *v)
 {
 	struct task_group *tg = css_tg(seq_css(sf));
+	u64 period_us, quota_us;

-	cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
+	tg_bandwidth(tg, &period_us, &quota_us, NULL);
+	cpu_period_quota_print(sf, period_us, quota_us);
 	return 0;
 }

@@ -9996,7 +10018,7 @@ static struct cftype cpu_files[] = {
 	{
 		.name = "max.burst",
 		.flags = CFTYPE_NOT_ON_ROOT,
-		.read_u64 = cpu_cfs_burst_read_u64,
+		.read_u64 = cpu_burst_read_u64,
 		.write_u64 = cpu_cfs_burst_write_u64,
 	},
 #endif /* CONFIG_CFS_BANDWIDTH */
--
cgit v1.2.3

From 5bc34be478d09c4d16009e665e020ad0fcd0deea Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Fri, 13 Jun 2025 15:23:30 -1000
Subject: sched/core: Reorganize cgroup bandwidth control interface file writes

- Move input parameter validation from tg_set_cfs_bandwidth() to the new outer function tg_set_bandwidth(). The outer function handles parameters in usecs, validates them and calls tg_set_cfs_bandwidth() which converts them into nsecs. This matches tg_bandwidth() on the read side.

- max/min_cfs_* consts are now used by tg_set_bandwidth(). Relocate, convert into usecs and drop "cfs" from the names.

- Reimplement cpu_cfs_{period|quota|burst}_write_*() using tg_bandwidth() and tg_set_bandwidth() and replace "cfs" in the names with "bw".

- Update cpu_max_write() to use tg_set_bandwidth(). cpu_period_quota_parse() is updated to drop nsec conversion accordingly. This aligns the behavior with cpu_period_quota_print().

- Drop now unused tg_set_cfs_{period|quota|burst}().

- While at it, for consistency, rename default_cfs_period() to default_bw_period_us() and make it return usecs.

This is to prepare for adding bandwidth control support to sched_ext. tg_set_bandwidth() will be used as the muxing point. No functional changes intended.

Signed-off-by: Tejun Heo
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lore.kernel.org/r/20250614012346.2358261-5-tj@kernel.org
---
 kernel/sched/core.c  | 205 +++++++++++++++++++++++++--------------------------
 kernel/sched/fair.c  |   4 +-
 kernel/sched/sched.h |  10 +--
 3 files changed, 106 insertions(+), 113 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8de93a3bfa27..2f8caa9db78d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9309,47 +9309,23 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
 #ifdef CONFIG_CFS_BANDWIDTH
 static DEFINE_MUTEX(cfs_constraints_mutex);

-const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
-static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
-/* More than 203 days if BW_SHIFT equals 20. */
-static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
-
 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);

-static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
-				u64 burst)
+static int tg_set_cfs_bandwidth(struct task_group *tg,
+				u64 period_us, u64 quota_us, u64 burst_us)
 {
 	int i, ret = 0, runtime_enabled, runtime_was_enabled;
 	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+	u64 period, quota, burst;

-	if (tg == &root_task_group)
-		return -EINVAL;
-
-	/*
-	 * Ensure we have at some amount of bandwidth every period. This is
-	 * to prevent reaching a state of large arrears when throttled via
-	 * entity_tick() resulting in prolonged exit starvation.
-	 */
-	if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
-		return -EINVAL;
+	period = (u64)period_us * NSEC_PER_USEC;

-	/*
-	 * Likewise, bound things on the other side by preventing insane quota
-	 * periods. This also allows us to normalize in computing quota
-	 * feasibility.
-	 */
-	if (period > max_cfs_quota_period)
-		return -EINVAL;
-
-	/*
-	 * Bound quota to defend quota against overflow during bandwidth shift.
- */ - if (quota != RUNTIME_INF && quota > max_cfs_runtime) - return -EINVAL; + if (quota_us == RUNTIME_INF) + quota = RUNTIME_INF; + else + quota = (u64)quota_us * NSEC_PER_USEC; - if (quota != RUNTIME_INF && (burst > quota || - burst + quota > max_cfs_runtime)) - return -EINVAL; + burst = (u64)burst_us * NSEC_PER_USEC; /* * Prevent race between setting of cfs_rq->runtime_enabled and @@ -9437,50 +9413,6 @@ static u64 tg_get_cfs_burst(struct task_group *tg) return burst_us; } -static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) -{ - u64 quota, period, burst; - - if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC) - return -EINVAL; - - period = (u64)cfs_period_us * NSEC_PER_USEC; - quota = tg->cfs_bandwidth.quota; - burst = tg->cfs_bandwidth.burst; - - return tg_set_cfs_bandwidth(tg, period, quota, burst); -} - -static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) -{ - u64 quota, period, burst; - - period = ktime_to_ns(tg->cfs_bandwidth.period); - burst = tg->cfs_bandwidth.burst; - if (cfs_quota_us < 0) - quota = RUNTIME_INF; - else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC) - quota = (u64)cfs_quota_us * NSEC_PER_USEC; - else - return -EINVAL; - - return tg_set_cfs_bandwidth(tg, period, quota, burst); -} - -static int tg_set_cfs_burst(struct task_group *tg, long cfs_burst_us) -{ - u64 quota, period, burst; - - if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC) - return -EINVAL; - - burst = (u64)cfs_burst_us * NSEC_PER_USEC; - period = ktime_to_ns(tg->cfs_bandwidth.period); - quota = tg->cfs_bandwidth.quota; - - return tg_set_cfs_bandwidth(tg, period, quota, burst); -} - struct cfs_schedulable_data { struct task_group *tg; u64 period, quota; @@ -9614,6 +9546,11 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v) return 0; } +const u64 max_bw_quota_period_us = 1 * USEC_PER_SEC; /* 1s */ +static const u64 min_bw_quota_period_us = 1 * USEC_PER_MSEC; /* 1ms */ +/* More than 203 days if BW_SHIFT equals 20. */ +static const u64 max_bw_runtime_us = MAX_BW; + static void tg_bandwidth(struct task_group *tg, u64 *period_us_p, u64 *quota_us_p, u64 *burst_us_p) { @@ -9634,6 +9571,50 @@ static u64 cpu_period_read_u64(struct cgroup_subsys_state *css, return period_us; } +static int tg_set_bandwidth(struct task_group *tg, + u64 period_us, u64 quota_us, u64 burst_us) +{ + const u64 max_usec = U64_MAX / NSEC_PER_USEC; + + if (tg == &root_task_group) + return -EINVAL; + + /* Values should survive translation to nsec */ + if (period_us > max_usec || + (quota_us != RUNTIME_INF && quota_us > max_usec) || + burst_us > max_usec) + return -EINVAL; + + /* + * Ensure we have some amount of bandwidth every period. This is to + * prevent reaching a state of large arrears when throttled via + * entity_tick() resulting in prolonged exit starvation. + */ + if (quota_us < min_bw_quota_period_us || + period_us < min_bw_quota_period_us) + return -EINVAL; + + /* + * Likewise, bound things on the other side by preventing insane quota + * periods. This also allows us to normalize in computing quota + * feasibility. + */ + if (period_us > max_bw_quota_period_us) + return -EINVAL; + + /* + * Bound quota to defend quota against overflow during bandwidth shift. 
+	 */
+	if (quota_us != RUNTIME_INF && quota_us > max_bw_runtime_us)
+		return -EINVAL;
+
+	if (quota_us != RUNTIME_INF && (burst_us > quota_us ||
+					burst_us + quota_us > max_bw_runtime_us))
+		return -EINVAL;
+
+	return tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us);
+}
+
 static s64 cpu_quota_read_s64(struct cgroup_subsys_state *css,
 			      struct cftype *cft)
 {
@@ -9652,22 +9633,37 @@ static u64 cpu_burst_read_u64(struct cgroup_subsys_state *css,
 	return burst_us;
 }

-static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
-				    struct cftype *cftype, u64 cfs_period_us)
+static int cpu_period_write_u64(struct cgroup_subsys_state *css,
+				struct cftype *cftype, u64 period_us)
 {
-	return tg_set_cfs_period(css_tg(css), cfs_period_us);
+	struct task_group *tg = css_tg(css);
+	u64 quota_us, burst_us;
+
+	tg_bandwidth(tg, NULL, &quota_us, &burst_us);
+	return tg_set_bandwidth(tg, period_us, quota_us, burst_us);
 }

-static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
-				   struct cftype *cftype, s64 cfs_quota_us)
+static int cpu_quota_write_s64(struct cgroup_subsys_state *css,
+			       struct cftype *cftype, s64 quota_us)
 {
-	return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
+	struct task_group *tg = css_tg(css);
+	u64 period_us, burst_us;
+
+	if (quota_us < 0)
+		quota_us = RUNTIME_INF;
+
+	tg_bandwidth(tg, &period_us, NULL, &burst_us);
+	return tg_set_bandwidth(tg, period_us, quota_us, burst_us);
 }

-static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css,
-				   struct cftype *cftype, u64 cfs_burst_us)
+static int cpu_burst_write_u64(struct cgroup_subsys_state *css,
+			       struct cftype *cftype, u64 burst_us)
 {
-	return tg_set_cfs_burst(css_tg(css), cfs_burst_us);
+	struct task_group *tg = css_tg(css);
+	u64 period_us, quota_us;
+
+	tg_bandwidth(tg, &period_us, &quota_us, NULL);
+	return tg_set_bandwidth(tg, period_us, quota_us, burst_us);
 }
 #endif /* CONFIG_CFS_BANDWIDTH */

@@ -9733,17 +9729,17 @@ static struct cftype cpu_legacy_files[] = {
 	{
 		.name = "cfs_period_us",
 		.read_u64 = cpu_period_read_u64,
-		.write_u64 = cpu_cfs_period_write_u64,
+		.write_u64 = cpu_period_write_u64,
 	},
 	{
 		.name = "cfs_quota_us",
 		.read_s64 = cpu_quota_read_s64,
-		.write_s64 = cpu_cfs_quota_write_s64,
+		.write_s64 = cpu_quota_write_s64,
 	},
 	{
 		.name = "cfs_burst_us",
 		.read_u64 = cpu_burst_read_u64,
-		.write_u64 = cpu_cfs_burst_write_u64,
+		.write_u64 = cpu_burst_write_u64,
 	},
 	{
 		.name = "stat",
@@ -9940,22 +9936,20 @@ static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
 }

 /* caller should put the current value in *@periodp before calling */
-static int __maybe_unused cpu_period_quota_parse(char *buf,
-						 u64 *periodp, u64 *quotap)
+static int __maybe_unused cpu_period_quota_parse(char *buf, u64 *period_us_p,
+						 u64 *quota_us_p)
 {
 	char tok[21];	/* U64_MAX */

-	if (sscanf(buf, "%20s %llu", tok, periodp) < 1)
+	if (sscanf(buf, "%20s %llu", tok, period_us_p) < 1)
 		return -EINVAL;

-	*periodp *= NSEC_PER_USEC;
-
-	if (sscanf(tok, "%llu", quotap))
-		*quotap *= NSEC_PER_USEC;
-	else if (!strcmp(tok, "max"))
-		*quotap = RUNTIME_INF;
-	else
-		return -EINVAL;
+	if (sscanf(tok, "%llu", quota_us_p) < 1) {
+		if (!strcmp(tok, "max"))
+			*quota_us_p = RUNTIME_INF;
+		else
+			return -EINVAL;
+	}

 	return 0;
 }
@@ -9975,14 +9969,13 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of,
 			     char *buf, size_t nbytes, loff_t off)
 {
 	struct task_group *tg = css_tg(of_css(of));
-	u64 period = tg_get_cfs_period(tg);
-	u64 burst = tg->cfs_bandwidth.burst;
-	u64 quota;
+	u64 period_us, quota_us, burst_us;
 	int ret;

-	ret = cpu_period_quota_parse(buf, &period, &quota);
+	tg_bandwidth(tg, &period_us, NULL, &burst_us);
+	ret = cpu_period_quota_parse(buf, &period_us, &quota_us);
 	if (!ret)
-		ret = tg_set_cfs_bandwidth(tg, period, quota, burst);
+		ret = tg_set_bandwidth(tg, period_us, quota_us, burst_us);
 	return ret ?: nbytes;
 }
 #endif /* CONFIG_CFS_BANDWIDTH */
@@ -10019,7 +10012,7 @@ static struct cftype cpu_files[] = {
 		.name = "max.burst",
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.read_u64 = cpu_burst_read_u64,
-		.write_u64 = cpu_cfs_burst_write_u64,
+		.write_u64 = cpu_burst_write_u64,
 	},
 #endif /* CONFIG_CFS_BANDWIDTH */
 #ifdef CONFIG_UCLAMP_TASK_GROUP
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 707be4570430..7e2963efe800 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6422,7 +6422,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
 		 * to fail.
 		 */
 		new = old * 2;
-		if (new < max_cfs_quota_period) {
+		if (new < max_bw_quota_period_us * NSEC_PER_USEC) {
 			cfs_b->period = ns_to_ktime(new);
 			cfs_b->quota *= 2;
 			cfs_b->burst *= 2;
@@ -6456,7 +6456,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *paren
 	raw_spin_lock_init(&cfs_b->lock);
 	cfs_b->runtime = 0;
 	cfs_b->quota = RUNTIME_INF;
-	cfs_b->period = ns_to_ktime(default_cfs_period());
+	cfs_b->period = us_to_ktime(default_bw_period_us());
 	cfs_b->burst = 0;
 	cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e00b80cd998e..105190b18020 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -403,15 +403,15 @@ static inline bool dl_server_active(struct sched_dl_entity *dl_se)
 extern struct list_head task_groups;

 #ifdef CONFIG_CFS_BANDWIDTH
-extern const u64 max_cfs_quota_period;
+extern const u64 max_bw_quota_period_us;

 /*
- * default period for cfs group bandwidth.
- * default: 0.1s, units: nanoseconds
+ * default period for group bandwidth.
+ * default: 0.1s, units: microseconds
  */
-static inline u64 default_cfs_period(void)
+static inline u64 default_bw_period_us(void)
 {
-	return 100000000ULL;
+	return 100000ULL;
 }
 #endif /* CONFIG_CFS_BANDWIDTH */
--
cgit v1.2.3

From 37fb58a7273726e59f9429c89ade5116083a213d Mon Sep 17 00:00:00 2001
From: Chen Ridong
Date: Wed, 18 Jun 2025 07:32:17 +0000
Subject: cgroup,freezer: fix incomplete freezing when attaching tasks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

An issue was found:

 # cd /sys/fs/cgroup/freezer/
 # mkdir test
 # echo FROZEN > test/freezer.state
 # cat test/freezer.state
 FROZEN
 # sleep 1000 &
 [1] 863
 # echo 863 > test/cgroup.procs
 # cat test/freezer.state
 FREEZING

When tasks are migrated to a frozen cgroup, the freezer fails to immediately freeze the tasks, leaving the cgroup stuck in the "FREEZING" state.

The freeze_task() function is called before clearing the CGROUP_FROZEN flag. This causes the freezing() check to incorrectly return false, preventing __freeze_task() from being invoked for the migrated task.

To fix this issue, clear the CGROUP_FROZEN state before calling freeze_task().
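With the reordering, a task migrated into a frozen cgroup is frozen immediately; expected behavior after the fix (hypothetical session mirroring the reproducer above):

 # echo 863 > test/cgroup.procs
 # cat test/freezer.state
 FROZEN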
Fixes: f5d39b020809 ("freezer,sched: Rewrite core freezer logic") Cc: stable@vger.kernel.org # v6.1+ Reported-by: Zhong Jiawei Signed-off-by: Chen Ridong Acked-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/legacy_freezer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 039d1eb2f215..507b8f19a262 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -188,13 +188,12 @@ static void freezer_attach(struct cgroup_taskset *tset) if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { - freeze_task(task); - /* clear FROZEN and propagate upwards */ while (freezer && (freezer->state & CGROUP_FROZEN)) { freezer->state &= ~CGROUP_FROZEN; freezer = parent_freezer(freezer); } + freeze_task(task); } } -- cgit v1.2.3 From d4adf1c9ee7722545450608bcb095fb31512f0c6 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 18 Jun 2025 17:57:40 -0400 Subject: bpf: Adjust free target to avoid global starvation of LRU map BPF_MAP_TYPE_LRU_HASH can recycle most recent elements well before the map is full, due to percpu reservations and force shrink before neighbor stealing. Once a CPU is unable to borrow from the global map, it will once steal one elem from a neighbor and after that each time flush this one element to the global list and immediately recycle it. Batch value LOCAL_FREE_TARGET (128) will exhaust a 10K element map with 79 CPUs. CPU 79 will observe this behavior even while its neighbors hold 78 * 127 + 1 * 15 == 9921 free elements (99%). CPUs need not be active concurrently. The issue can appear with affinity migration, e.g., irqbalance. Each CPU can reserve and then hold onto its 128 elements indefinitely. Avoid global list exhaustion by limiting aggregate percpu caches to half of map size, by adjusting LOCAL_FREE_TARGET based on cpu count. This change has no effect on sufficiently large tables. Similar to LOCAL_NR_SCANS and lru->nr_scans, introduce a map variable lru->free_target. The extra field fits in a hole in struct bpf_lru. The cacheline is already warm where read in the hot path. The field is only accessed with the lru lock held. 
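A worked example of the new clamp (using the figures from the report, rounded to an assumed 80 possible CPUs; LOCAL_FREE_TARGET is 128):

	/* Sketch: how the per-cpu free target now scales with map size. */
	static unsigned int target_free_sketch(unsigned int nr_elems,
					       unsigned int nr_cpus)
	{
		return clamp((nr_elems / nr_cpus) / 2, 1u, 128u);
	}

	/*
	 * target_free_sketch(10240, 80) == 64, so 80 CPUs cache at most
	 * 80 * 64 == 5120 elements -- half the map -- and the global list
	 * can no longer be drained by percpu reservations alone. Maps with
	 * >= 256 elements per CPU keep the old target of 128.
	 */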
Tested-by: Anton Protopopov Signed-off-by: Willem de Bruijn Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20250618215803.3587312-1-willemdebruijn.kernel@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/bpf_lru_list.c | 9 ++++++--- kernel/bpf/bpf_lru_list.h | 1 + 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index 3dabdd137d10..2d6e1c98d8ad 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -337,12 +337,12 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, list) { __bpf_lru_node_move_to_free(l, node, local_free_list(loc_l), BPF_LRU_LOCAL_LIST_T_FREE); - if (++nfree == LOCAL_FREE_TARGET) + if (++nfree == lru->target_free) break; } - if (nfree < LOCAL_FREE_TARGET) - __bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree, + if (nfree < lru->target_free) + __bpf_lru_list_shrink(lru, l, lru->target_free - nfree, local_free_list(loc_l), BPF_LRU_LOCAL_LIST_T_FREE); @@ -577,6 +577,9 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); buf += elem_size; } + + lru->target_free = clamp((nr_elems / num_possible_cpus()) / 2, + 1, LOCAL_FREE_TARGET); } static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index cbd8d3720c2b..fe2661a58ea9 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -58,6 +58,7 @@ struct bpf_lru { del_from_htab_func del_from_htab; void *del_arg; unsigned int hash_offset; + unsigned int target_free; unsigned int nr_scans; bool percpu; }; -- cgit v1.2.3 From 990518eb3a71c357ca4ff1ad3e747fb844d8094c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 19 May 2025 10:33:15 +0200 Subject: timekeeping: Remove hardcoded access to tk_core This was overlooked in the initial conversion. Use the provided pointer to access the shadow timekeeper. 
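The one-liner matters once more than one tk_data instance exists; a sketch of the pattern (the auxiliary instance below is hypothetical, foreshadowing the aux timekeepers added later in this series):

	static struct tk_data tk_core;
	static struct tk_data tk_aux;	/* hypothetical second instance */

	static void update_from_shadow_sketch(struct tk_data *tkd)
	{
		/* Correct for whichever instance is passed in ... */
		struct timekeeper *tk = &tkd->shadow_timekeeper;

		/*
		 * ... whereas &tk_core.shadow_timekeeper would silently
		 * operate on the core instance even when tkd == &tk_aux.
		 */
		(void)tk;
	}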
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250519083025.652611452@linutronix.de --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index a009c91f7b05..2ad78fbdc9ff 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -663,7 +663,7 @@ static void timekeeping_restore_shadow(struct tk_data *tkd) static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action) { - struct timekeeper *tk = &tk_core.shadow_timekeeper; + struct timekeeper *tk = &tkd->shadow_timekeeper; lockdep_assert_held(&tkd->lock); -- cgit v1.2.3 From 506a54a0316ee4854b0ed113a8001477f5211d50 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 19 May 2025 10:33:16 +0200 Subject: timekeeping: Cleanup kernel doc of __ktime_get_real_seconds() Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250519083025.715836017@linutronix.de --- kernel/time/timekeeping.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2ad78fbdc9ff..d88d19fb794c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -975,9 +975,14 @@ time64_t ktime_get_real_seconds(void) EXPORT_SYMBOL_GPL(ktime_get_real_seconds); /** - * __ktime_get_real_seconds - The same as ktime_get_real_seconds - * but without the sequence counter protect. This internal function - * is called just when timekeeping lock is already held. + * __ktime_get_real_seconds - Unprotected access to CLOCK_REALTIME seconds + * + * The same as ktime_get_real_seconds() but without the sequence counter + * protection. This function is used in restricted contexts like the x86 MCE + * handler and in KGDB. It's unprotected on 32-bit vs. concurrent half + * completed modification and only to be used for such critical contexts. + * + * Returns: Racy snapshot of the CLOCK_REALTIME seconds value */ noinstr time64_t __ktime_get_real_seconds(void) { -- cgit v1.2.3 From 7e55b6ba1fe6987638160e5f8216288f38043759 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 19 May 2025 10:33:17 +0200 Subject: timekeeping: Avoid double notification in do_adjtimex() Consolidate do_adjtimex() so that it does not notify about clock changes twice. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250519083025.779267274@linutronix.de --- kernel/time/timekeeping.c | 98 +++++++++++++++++++++++++++-------------------- 1 file changed, 56 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d88d19fb794c..fb1da87a92f1 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1418,40 +1418,49 @@ int do_settimeofday64(const struct timespec64 *ts) EXPORT_SYMBOL(do_settimeofday64); /** - * timekeeping_inject_offset - Adds or subtracts from the current time. + * __timekeeping_inject_offset - Adds or subtracts from the current time. * @ts: Pointer to the timespec variable containing the offset * * Adds or subtracts an offset value from the current time. 
*/ -static int timekeeping_inject_offset(const struct timespec64 *ts) +static int __timekeeping_inject_offset(const struct timespec64 *ts) { + struct timekeeper *tks = &tk_core.shadow_timekeeper; + struct timespec64 tmp; + if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { - struct timekeeper *tks = &tk_core.shadow_timekeeper; - struct timespec64 tmp; - timekeeping_forward_now(tks); - - /* Make sure the proposed value is valid */ - tmp = timespec64_add(tk_xtime(tks), *ts); - if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 || - !timespec64_valid_settod(&tmp)) { - timekeeping_restore_shadow(&tk_core); - return -EINVAL; - } + timekeeping_forward_now(tks); - tk_xtime_add(tks, ts); - tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts)); - timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); + /* Make sure the proposed value is valid */ + tmp = timespec64_add(tk_xtime(tks), *ts); + if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 || + !timespec64_valid_settod(&tmp)) { + timekeeping_restore_shadow(&tk_core); + return -EINVAL; } - /* Signal hrtimers about time change */ - clock_was_set(CLOCK_SET_WALL); + tk_xtime_add(tks, ts); + tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts)); + timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); return 0; } +static int timekeeping_inject_offset(const struct timespec64 *ts) +{ + int ret; + + scoped_guard (raw_spinlock_irqsave, &tk_core.lock) + ret = __timekeeping_inject_offset(ts); + + /* Signal hrtimers about time change */ + if (!ret) + clock_was_set(CLOCK_SET_WALL); + return ret; +} + /* * Indicates if there is an offset between the system clock and the hardware * clock/persistent clock/rtc. 
@@ -2186,7 +2195,7 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, * timekeeping_advance - Updates the timekeeper to the current time and * current NTP tick length */ -static bool timekeeping_advance(enum timekeeping_adv_mode mode) +static bool __timekeeping_advance(enum timekeeping_adv_mode mode) { struct timekeeper *tk = &tk_core.shadow_timekeeper; struct timekeeper *real_tk = &tk_core.timekeeper; @@ -2194,8 +2203,6 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) int shift = 0, maxshift; u64 offset, orig_offset; - guard(raw_spinlock_irqsave)(&tk_core.lock); - /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) return false; @@ -2249,6 +2256,12 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) return !!clock_set; } +static bool timekeeping_advance(enum timekeeping_adv_mode mode) +{ + guard(raw_spinlock_irqsave)(&tk_core.lock); + return __timekeeping_advance(mode); +} + /** * update_wall_time - Uses the current clocksource to increment the wall time * @@ -2537,10 +2550,10 @@ EXPORT_SYMBOL_GPL(random_get_entropy_fallback); */ int do_adjtimex(struct __kernel_timex *txc) { + struct timespec64 delta, ts; struct audit_ntp_data ad; bool offset_set = false; bool clock_set = false; - struct timespec64 ts; int ret; /* Validate the data before disabling interrupts */ @@ -2549,21 +2562,6 @@ int do_adjtimex(struct __kernel_timex *txc) return ret; add_device_randomness(txc, sizeof(*txc)); - if (txc->modes & ADJ_SETOFFSET) { - struct timespec64 delta; - - delta.tv_sec = txc->time.tv_sec; - delta.tv_nsec = txc->time.tv_usec; - if (!(txc->modes & ADJ_NANO)) - delta.tv_nsec *= 1000; - ret = timekeeping_inject_offset(&delta); - if (ret) - return ret; - - offset_set = delta.tv_sec != 0; - audit_tk_injoffset(delta); - } - audit_ntp_init(&ad); ktime_get_real_ts64(&ts); @@ -2573,6 +2571,19 @@ int do_adjtimex(struct __kernel_timex *txc) struct timekeeper *tks = &tk_core.shadow_timekeeper; s32 orig_tai, tai; + if (txc->modes & ADJ_SETOFFSET) { + delta.tv_sec = txc->time.tv_sec; + delta.tv_nsec = txc->time.tv_usec; + if (!(txc->modes & ADJ_NANO)) + delta.tv_nsec *= 1000; + ret = __timekeeping_inject_offset(&delta); + if (ret) + return ret; + + offset_set = delta.tv_sec != 0; + clock_set = true; + } + orig_tai = tai = tks->tai_offset; ret = __do_adjtimex(txc, &ts, &tai, &ad); @@ -2583,13 +2594,16 @@ int do_adjtimex(struct __kernel_timex *txc) } else { tk_update_leap_state_all(&tk_core); } + + /* Update the multiplier immediately if frequency was set directly */ + if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) + clock_set |= __timekeeping_advance(TK_ADV_FREQ); } - audit_ntp_log(&ad); + if (txc->modes & ADJ_SETOFFSET) + audit_tk_injoffset(delta); - /* Update the multiplier immediately if frequency was set directly */ - if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) - clock_set |= timekeeping_advance(TK_ADV_FREQ); + audit_ntp_log(&ad); if (clock_set) clock_was_set(CLOCK_SET_WALL); -- cgit v1.2.3 From f12b45862c4dcb9c2937b83ed730e473b9a76cbf Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 19 May 2025 10:33:19 +0200 Subject: timekeeping: Introduce timekeeper ID As long as there is only a single timekeeper, there is no need to clarify which timekeeper is used. But with the upcoming reusage of the timekeeper infrastructure for auxiliary clock timekeepers, an ID is required to differentiate. Introduce an enum for timekeeper IDs, introduce a field in struct tk_data to store this timekeeper id and add also initialization. 
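The enum itself lands in a header outside kernel/ and is therefore not visible in the diff below; a plausible shape, inferred from the TIMEKEEPER_CORE and TIMEKEEPERS_MAX identifiers used by this and later patches in the series, is:

	enum timekeeper_ids {
		TIMEKEEPER_CORE,
	#ifdef CONFIG_POSIX_AUX_CLOCKS
		TIMEKEEPER_AUX_FIRST,
		TIMEKEEPER_AUX_LAST = TIMEKEEPER_AUX_FIRST + MAX_AUX_CLOCKS - 1,
	#endif
		TIMEKEEPERS_MAX,
	};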
The id struct field is added at the end of the second cacheline, as there is a 4 byte hole anyway. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250519083025.842476378@linutronix.de --- kernel/time/timekeeping.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index fb1da87a92f1..f4692fc2ea6b 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1663,10 +1663,11 @@ read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, *boot_offset = ns_to_timespec64(local_clock()); } -static __init void tkd_basic_setup(struct tk_data *tkd) +static __init void tkd_basic_setup(struct tk_data *tkd, enum timekeeper_ids tk_id) { raw_spin_lock_init(&tkd->lock); seqcount_raw_spinlock_init(&tkd->seq, &tkd->lock); + tkd->timekeeper.id = tkd->shadow_timekeeper.id = tk_id; } /* @@ -1696,7 +1697,7 @@ void __init timekeeping_init(void) struct timekeeper *tks = &tk_core.shadow_timekeeper; struct clocksource *clock; - tkd_basic_setup(&tk_core); + tkd_basic_setup(&tk_core, TIMEKEEPER_CORE); read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); if (timespec64_valid_settod(&wall_time) && -- cgit v1.2.3 From 9094c72c3d81bf2416b7c79d12c8494ab8fbac20 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 19 May 2025 10:33:20 +0200 Subject: time: Introduce auxiliary POSIX clocks To support auxiliary timekeeping and the related user space interfaces, it's required to define a clock ID range for them. Reserve 8 auxiliary clock IDs after the regular timekeeping clock ID space. This is the maximum number of auxiliary clocks the kernel can support. The actual number of supported clocks obviously depends on the presence of related devices and might be constrained by the available VDSO space. Add the corresponding timekeeper IDs as well. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250519083025.905800695@linutronix.de --- kernel/time/Kconfig | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index b0b97a60aaa6..7c6a52f7836c 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -82,9 +82,9 @@ config CONTEXT_TRACKING_IDLE help Tracks idle state on behalf of RCU. -if GENERIC_CLOCKEVENTS menu "Timers subsystem" +if GENERIC_CLOCKEVENTS # Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is # only related to the tick functionality. Oneshot clockevent devices # are supported independent of this. @@ -208,6 +208,17 @@ config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US interval and NTP's maximum frequency drift of 500 parts per million. If the clocksource is good enough for NTP, it is good enough for the clocksource watchdog! +endif + +config POSIX_AUX_CLOCKS + bool "Enable auxiliary POSIX clocks" + depends on POSIX_TIMERS + help + Auxiliary POSIX clocks are clocks which can be steered + independently of the core timekeeper, which controls the + MONOTONIC, REALTIME, BOOTTIME and TAI clocks. They are useful to + provide e.g. lockless time accessors to independent PTP clocks + and other clock domains, which are not correlated to the TAI/NTP + notion of time.
endmenu -endif -- cgit v1.2.3 From 8515714b0f88a698a4c26f0f0ce7d43ad14dce16 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 19 May 2025 10:33:21 +0200 Subject: ntp: Add support for auxiliary timekeepers If auxiliary clocks are enabled, provide an array of NTP data so that the auxiliary timekeepers can be steered independently of the core timekeeper. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250519083025.969000914@linutronix.de --- kernel/time/ntp.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index b837d3d9d325..5b5a0f76866d 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "ntp_internal.h" #include "timekeeping_internal.h" @@ -86,14 +87,16 @@ struct ntp_data { #endif }; -static struct ntp_data tk_ntp_data = { - .tick_usec = USER_TICK_USEC, - .time_state = TIME_OK, - .time_status = STA_UNSYNC, - .time_constant = 2, - .time_maxerror = NTP_PHASE_LIMIT, - .time_esterror = NTP_PHASE_LIMIT, - .ntp_next_leap_sec = TIME64_MAX, +static struct ntp_data tk_ntp_data[TIMEKEEPERS_MAX] = { + [ 0 ... TIMEKEEPERS_MAX - 1 ] = { + .tick_usec = USER_TICK_USEC, + .time_state = TIME_OK, + .time_status = STA_UNSYNC, + .time_constant = 2, + .time_maxerror = NTP_PHASE_LIMIT, + .time_esterror = NTP_PHASE_LIMIT, + .ntp_next_leap_sec = TIME64_MAX, + }, }; #define SECS_PER_DAY 86400 @@ -351,13 +354,13 @@ static void __ntp_clear(struct ntp_data *ntpdata) */ void ntp_clear(void) { - __ntp_clear(&tk_ntp_data); + __ntp_clear(&tk_ntp_data[TIMEKEEPER_CORE]); } u64 ntp_tick_length(void) { - return tk_ntp_data.tick_length; + return tk_ntp_data[TIMEKEEPER_CORE].tick_length; } /** @@ -368,7 +371,7 @@ u64 ntp_tick_length(void) */ ktime_t ntp_get_next_leap(void) { - struct ntp_data *ntpdata = &tk_ntp_data; + struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE]; ktime_t ret; if ((ntpdata->time_state == TIME_INS) && (ntpdata->time_status & STA_INS)) @@ -389,7 +392,7 @@ ktime_t ntp_get_next_leap(void) */ int second_overflow(time64_t secs) { - struct ntp_data *ntpdata = &tk_ntp_data; + struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE]; s64 delta; int leap = 0; s32 rem; @@ -605,7 +608,7 @@ static inline int update_rtc(struct timespec64 *to_set, unsigned long *offset_ns */ static inline bool ntp_synced(void) { - return !(tk_ntp_data.time_status & STA_UNSYNC); + return !(tk_ntp_data[TIMEKEEPER_CORE].time_status & STA_UNSYNC); } /* @@ -762,7 +765,7 @@ static inline void process_adjtimex_modes(struct ntp_data *ntpdata, const struct int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, s32 *time_tai, struct audit_ntp_data *ad) { - struct ntp_data *ntpdata = &tk_ntp_data; + struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE]; int result; if (txc->modes & ADJ_ADJTIME) { @@ -1031,8 +1034,8 @@ static void hardpps_update_phase(struct ntp_data *ntpdata, long error) */ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { + struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE]; struct pps_normtime pts_norm, freq_norm; - struct ntp_data *ntpdata = &tk_ntp_data; pts_norm = pps_normalize_ts(*phase_ts); @@ -1083,18 +1086,18 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t static int __init ntp_tick_adj_setup(char *str) { - int rc = kstrtos64(str, 0, &tk_ntp_data.ntp_tick_adj); + int rc = 
kstrtos64(str, 0, &tk_ntp_data[TIMEKEEPER_CORE].ntp_tick_adj); if (rc) return rc; - tk_ntp_data.ntp_tick_adj <<= NTP_SCALE_SHIFT; + tk_ntp_data[TIMEKEEPER_CORE].ntp_tick_adj <<= NTP_SCALE_SHIFT; return 1; } - __setup("ntp_tick_adj=", ntp_tick_adj_setup); void __init ntp_init(void) { - ntp_clear(); + for (int id = 0; id < TIMEKEEPERS_MAX; id++) + __ntp_clear(tk_ntp_data + id); ntp_init_cmos_sync(); } -- cgit v1.2.3 From 5ffa25f573cf524ff53660c5ff7a158ee10f23c7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 19 May 2025 10:33:22 +0200 Subject: ntp: Add timekeeper ID arguments to public functions In preparation for supporting auxiliary POSIX clocks, add a timekeeper ID to the relevant functions. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250519083026.032425931@linutronix.de --- kernel/time/ntp.c | 33 +++++++++++++++++++-------------- kernel/time/ntp_internal.h | 11 +++++------ kernel/time/timekeeping.c | 12 ++++++------ 3 files changed, 30 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5b5a0f76866d..e28dc53194a7 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -351,33 +351,38 @@ static void __ntp_clear(struct ntp_data *ntpdata) /** * ntp_clear - Clears the NTP state variables + * @tkid: Timekeeper ID to be able to select proper ntp data array member */ -void ntp_clear(void) +void ntp_clear(unsigned int tkid) { - __ntp_clear(&tk_ntp_data[TIMEKEEPER_CORE]); + __ntp_clear(&tk_ntp_data[tkid]); } -u64 ntp_tick_length(void) +u64 ntp_tick_length(unsigned int tkid) { - return tk_ntp_data[TIMEKEEPER_CORE].tick_length; + return tk_ntp_data[tkid].tick_length; } /** * ntp_get_next_leap - Returns the next leapsecond in CLOCK_REALTIME ktime_t + * @tkid: Timekeeper ID * - * Provides the time of the next leapsecond against CLOCK_REALTIME in - * a ktime_t format. Returns KTIME_MAX if no leapsecond is pending. + * Returns: For @tkid == TIMEKEEPER_CORE this provides the time of the next + * leap second against CLOCK_REALTIME in a ktime_t format if a + * leap second is pending. KTIME_MAX otherwise. */ -ktime_t ntp_get_next_leap(void) +ktime_t ntp_get_next_leap(unsigned int tkid) { struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE]; - ktime_t ret; + + if (tkid != TIMEKEEPER_CORE) + return KTIME_MAX; if ((ntpdata->time_state == TIME_INS) && (ntpdata->time_status & STA_INS)) return ktime_set(ntpdata->ntp_next_leap_sec, 0); - ret = KTIME_MAX; - return ret; + + return KTIME_MAX; } /* @@ -390,9 +395,9 @@ ktime_t ntp_get_next_leap(void) * * Also handles leap second processing, and returns leap offset */ -int second_overflow(time64_t secs) +int second_overflow(unsigned int tkid, time64_t secs) { - struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE]; + struct ntp_data *ntpdata = &tk_ntp_data[tkid]; s64 delta; int leap = 0; s32 rem; @@ -762,10 +767,10 @@ static inline void process_adjtimex_modes(struct ntp_data *ntpdata, const struct * adjtimex() mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. 
*/ -int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, +int __do_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts, s32 *time_tai, struct audit_ntp_data *ad) { - struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE]; + struct ntp_data *ntpdata = &tk_ntp_data[tkid]; int result; if (txc->modes & ADJ_ADJTIME) { diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 5a633dce9057..2d3e9669730b 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -3,13 +3,12 @@ #define _LINUX_NTP_INTERNAL_H extern void ntp_init(void); -extern void ntp_clear(void); +extern void ntp_clear(unsigned int tkid); /* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */ -extern u64 ntp_tick_length(void); -extern ktime_t ntp_get_next_leap(void); -extern int second_overflow(time64_t secs); -extern int __do_adjtimex(struct __kernel_timex *txc, - const struct timespec64 *ts, +extern u64 ntp_tick_length(unsigned int tkid); +extern ktime_t ntp_get_next_leap(unsigned int tkid); +extern int second_overflow(unsigned int tkid, time64_t secs); +extern int __do_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts, s32 *time_tai, struct audit_ntp_data *ad); extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f4692fc2ea6b..e1b8e2618ca7 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -601,7 +601,7 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); */ static inline void tk_update_leap_state(struct timekeeper *tk) { - tk->next_leap_ktime = ntp_get_next_leap(); + tk->next_leap_ktime = ntp_get_next_leap(tk->id); if (tk->next_leap_ktime != KTIME_MAX) /* Convert to monotonic time */ tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real); @@ -678,7 +678,7 @@ static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act if (action & TK_CLEAR_NTP) { tk->ntp_error = 0; - ntp_clear(); + ntp_clear(tk->id); } tk_update_leap_state(tk); @@ -2049,7 +2049,7 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, */ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) { - u64 ntp_tl = ntp_tick_length(); + u64 ntp_tl = ntp_tick_length(tk->id); u32 mult; /* @@ -2130,7 +2130,7 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) } /* Figure out if its a leap sec and apply if needed */ - leap = second_overflow(tk->xtime_sec); + leap = second_overflow(tk->id, tk->xtime_sec); if (unlikely(leap)) { struct timespec64 ts; @@ -2227,7 +2227,7 @@ static bool __timekeeping_advance(enum timekeeping_adv_mode mode) shift = ilog2(offset) - ilog2(tk->cycle_interval); shift = max(0, shift); /* Bound shift to one less than what overflows tick_length */ - maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; + maxshift = (64 - (ilog2(ntp_tick_length(tk->id)) + 1)) - 1; shift = min(shift, maxshift); while (offset >= tk->cycle_interval) { offset = logarithmic_accumulation(tk, offset, shift, &clock_set); @@ -2586,7 +2586,7 @@ int do_adjtimex(struct __kernel_timex *txc) } orig_tai = tai = tks->tai_offset; - ret = __do_adjtimex(txc, &ts, &tai, &ad); + ret = __do_adjtimex(tks->id, txc, &ts, &tai, &ad); if (tai != orig_tai) { __timekeeping_set_tai_offset(tks, tai); -- cgit v1.2.3 From c7ebfbc440151ae4a66a03b0f879cbece45174c8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 19 May 2025 
10:33:23 +0200 Subject: ntp: Rename __do_adjtimex() to ntp_adjtimex() Clean up the name space. No functional change. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250519083026.095637820@linutronix.de --- kernel/time/ntp.c | 4 ++-- kernel/time/ntp_internal.h | 4 ++-- kernel/time/timekeeping.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index e28dc53194a7..9aba1bc7b2a7 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -767,8 +767,8 @@ static inline void process_adjtimex_modes(struct ntp_data *ntpdata, const struct * adjtimex() mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. */ -int __do_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts, - s32 *time_tai, struct audit_ntp_data *ad) +int ntp_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts, + s32 *time_tai, struct audit_ntp_data *ad) { struct ntp_data *ntpdata = &tk_ntp_data[tkid]; int result; diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 2d3e9669730b..7084d839c207 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -8,8 +8,8 @@ extern void ntp_clear(unsigned int tkid); extern u64 ntp_tick_length(unsigned int tkid); extern ktime_t ntp_get_next_leap(unsigned int tkid); extern int second_overflow(unsigned int tkid, time64_t secs); -extern int __do_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts, - s32 *time_tai, struct audit_ntp_data *ad); +extern int ntp_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts, + s32 *time_tai, struct audit_ntp_data *ad); extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts); #if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e1b8e2618ca7..99b4749f0665 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2586,7 +2586,7 @@ int do_adjtimex(struct __kernel_timex *txc) } orig_tai = tai = tks->tai_offset; - ret = __do_adjtimex(tks->id, txc, &ts, &tai, &ad); + ret = ntp_adjtimex(tks->id, txc, &ts, &tai, &ad); if (tai != orig_tai) { __timekeeping_set_tai_offset(tks, tai); -- cgit v1.2.3 From 926ad475169f5b24868438e4bff61ec6a73efd19 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 19 May 2025 10:33:25 +0200 Subject: timekeeping: Make __timekeeping_advance() reusable In __timekeeping_advance() the pointer to struct tk_data is hardcoded by the use of &tk_core. As long as there is only a single timekeeper (tk_core), this is not a problem. But once __timekeeping_advance() is reused for the auxiliary timekeepers, it needs to be generalized. Add a pointer to struct tk_data as function argument of __timekeeping_advance() and adapt all call sites.
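The shape of the resulting split, as a hedged sketch (illustrative tkd_demo_* names, not the verbatim kernel code): the inner function operates on whichever struct tk_data it is handed and assumes the caller holds the lock, while a thin wrapper preserves the old single-instance entry point.

static bool __tkd_demo_advance(struct tk_data *tkd, enum timekeeping_adv_mode mode)
{
        lockdep_assert_held(&tkd->lock);        /* caller must hold tkd->lock */
        /* ... accumulate cycles for this particular timekeeper instance ... */
        return true;
}

static bool tkd_demo_advance(enum timekeeping_adv_mode mode)
{
        /* old entry point: lock the core instance and delegate */
        guard(raw_spinlock_irqsave)(&tk_core.lock);
        return __tkd_demo_advance(&tk_core, mode);
}

Callers that already hold the lock (such as do_adjtimex() above) can then call the inner function directly and pick the instance themselves.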
Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250519083026.160967312@linutronix.de --- kernel/time/timekeeping.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 99b4749f0665..153f760dffb4 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2196,10 +2196,10 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, * timekeeping_advance - Updates the timekeeper to the current time and * current NTP tick length */ -static bool __timekeeping_advance(enum timekeeping_adv_mode mode) +static bool __timekeeping_advance(struct tk_data *tkd, enum timekeeping_adv_mode mode) { - struct timekeeper *tk = &tk_core.shadow_timekeeper; - struct timekeeper *real_tk = &tk_core.timekeeper; + struct timekeeper *tk = &tkd->shadow_timekeeper; + struct timekeeper *real_tk = &tkd->timekeeper; unsigned int clock_set = 0; int shift = 0, maxshift; u64 offset, orig_offset; @@ -2252,7 +2252,7 @@ static bool __timekeeping_advance(enum timekeeping_adv_mode mode) if (orig_offset != offset) tk_update_coarse_nsecs(tk); - timekeeping_update_from_shadow(&tk_core, clock_set); + timekeeping_update_from_shadow(tkd, clock_set); return !!clock_set; } @@ -2260,7 +2260,7 @@ static bool __timekeeping_advance(enum timekeeping_adv_mode mode) static bool timekeeping_advance(enum timekeeping_adv_mode mode) { guard(raw_spinlock_irqsave)(&tk_core.lock); - return __timekeeping_advance(mode); + return __timekeeping_advance(&tk_core, mode); } /** @@ -2598,7 +2598,7 @@ int do_adjtimex(struct __kernel_timex *txc) /* Update the multiplier immediately if frequency was set directly */ if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) - clock_set |= __timekeeping_advance(TK_ADV_FREQ); + clock_set |= __timekeeping_advance(&tk_core, TK_ADV_FREQ); } if (txc->modes & ADJ_SETOFFSET) -- cgit v1.2.3 From 8c782acd3f47e21f9b03fd3720172d1f8e4fb796 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 19 May 2025 10:33:26 +0200 Subject: timekeeping: Prepare timekeeping_update_from_shadow() Don't invoke the VDSO and paravirt updates when utilized for auxiliary clocks. This is a temporary workaround until the VDSO and paravirt interfaces have been worked out. 
Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250519083026.223876435@linutronix.de --- kernel/time/timekeeping.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 153f760dffb4..e3c1a1c1d8c5 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -683,13 +683,15 @@ static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act tk_update_leap_state(tk); tk_update_ktime_data(tk); + tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; - update_vsyscall(tk); - update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); + if (tk->id == TIMEKEEPER_CORE) { + update_vsyscall(tk); + update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); - tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; - update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); - update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); + update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); + update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); + } if (action & TK_CLOCK_WAS_SET) tk->clock_was_set_seq++; -- cgit v1.2.3 From 6168024604236cb2bb1004ea8459c8ece2c4ef5f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 19 May 2025 10:33:27 +0200 Subject: timekeeping: Add clock_valid flag to timekeeper In preparation for supporting independent auxiliary timekeepers, add a clock valid field and set it to true for the system timekeeper. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250519083026.287145536@linutronix.de --- kernel/time/timekeeping.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e3c1a1c1d8c5..bf59bacc97db 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1665,11 +1665,12 @@ read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, *boot_offset = ns_to_timespec64(local_clock()); } -static __init void tkd_basic_setup(struct tk_data *tkd, enum timekeeper_ids tk_id) +static __init void tkd_basic_setup(struct tk_data *tkd, enum timekeeper_ids tk_id, bool valid) { raw_spin_lock_init(&tkd->lock); seqcount_raw_spinlock_init(&tkd->seq, &tkd->lock); tkd->timekeeper.id = tkd->shadow_timekeeper.id = tk_id; + tkd->timekeeper.clock_valid = tkd->shadow_timekeeper.clock_valid = valid; } /* @@ -1699,7 +1700,7 @@ void __init timekeeping_init(void) struct timekeeper *tks = &tk_core.shadow_timekeeper; struct clocksource *clock; - tkd_basic_setup(&tk_core, TIMEKEEPER_CORE); + tkd_basic_setup(&tk_core, TIMEKEEPER_CORE, true); read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); if (timespec64_valid_settod(&wall_time) && -- cgit v1.2.3 From 22c62b9a84b8f16ca0277e133a0cd62a259fee7c Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Mon, 19 May 2025 10:33:28 +0200 Subject: timekeeping: Introduce auxiliary timekeepers Provide timekeepers for auxiliary clocks and initialize them during boot. 
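The conversion pattern used by the diff below can be seen in isolation in this small stand-alone sketch (demo names only, compilable as ordinary userspace C): the former singleton becomes one slot of an array, and a macro keeps the old identifier working so existing call sites compile unchanged.

#include <stdio.h>

enum demo_tk_ids { DEMO_TK_CORE, DEMO_TK_AUX_FIRST, DEMO_TK_AUX_LAST = 3, DEMO_TK_MAX };

struct demo_tk { int id; int valid; };

static struct demo_tk demo_tks[DEMO_TK_MAX];

/* Legacy accessor: existing users of "demo_core" need no changes. */
#define demo_core (demo_tks[DEMO_TK_CORE])

int main(void)
{
        /* boot-time style init: only the core instance starts out valid */
        for (int i = 0; i < DEMO_TK_MAX; i++)
                demo_tks[i] = (struct demo_tk){ .id = i, .valid = (i == DEMO_TK_CORE) };
        printf("core: id=%d valid=%d\n", demo_core.id, demo_core.valid);
        return 0;
}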
Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250519083026.350061049@linutronix.de --- kernel/time/timekeeping.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index bf59bacc97db..19f4af1a37ea 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -53,7 +53,11 @@ struct tk_data { raw_spinlock_t lock; } ____cacheline_aligned; -static struct tk_data tk_core; +static struct tk_data timekeeper_data[TIMEKEEPERS_MAX]; + +/* The core timekeeper */ +#define tk_core (timekeeper_data[TIMEKEEPER_CORE]) + /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -113,6 +117,12 @@ static struct tk_fast tk_fast_raw ____cacheline_aligned = { .base[1] = FAST_TK_INIT, }; +#ifdef CONFIG_POSIX_AUX_CLOCKS +static __init void tk_aux_setup(void); +#else +static inline void tk_aux_setup(void) { } +#endif + unsigned long timekeeper_lock_irqsave(void) { unsigned long flags; @@ -1589,7 +1599,6 @@ void ktime_get_raw_ts64(struct timespec64 *ts) } EXPORT_SYMBOL(ktime_get_raw_ts64); - /** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres */ @@ -1701,6 +1710,7 @@ void __init timekeeping_init(void) struct clocksource *clock; tkd_basic_setup(&tk_core, TIMEKEEPER_CORE, true); + tk_aux_setup(); read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); if (timespec64_valid_settod(&wall_time) && @@ -2630,3 +2640,11 @@ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) } EXPORT_SYMBOL(hardpps); #endif /* CONFIG_NTP_PPS */ + +#ifdef CONFIG_POSIX_AUX_CLOCKS +static __init void tk_aux_setup(void) +{ + for (int i = TIMEKEEPER_AUX_FIRST; i <= TIMEKEEPER_AUX_LAST; i++) + tkd_basic_setup(&timekeeper_data[i], i, false); +} +#endif /* CONFIG_POSIX_AUX_CLOCKS */ -- cgit v1.2.3 From ffa0519baaed48ca953bd201e1b17f15dae21b2d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 19 May 2025 10:33:30 +0200 Subject: timekeeping: Provide ktime_get_ntp_seconds() ntp_adjtimex() requires access to the actual time keeper per timekeeper ID. Provide an interface. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250519083026.411809421@linutronix.de --- kernel/time/timekeeping.c | 9 +++++++++ kernel/time/timekeeping_internal.h | 3 +++ 2 files changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 19f4af1a37ea..7d3693a72a01 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2627,6 +2627,15 @@ int do_adjtimex(struct __kernel_timex *txc) return ret; } +/* + * Invoked from NTP with the time keeper lock held, so lockless access is + * fine. 
+ */ +long ktime_get_ntp_seconds(unsigned int id) +{ + return timekeeper_data[id].timekeeper.xtime_sec; +} + #ifdef CONFIG_NTP_PPS /** * hardpps() - Accessor function to NTP __hardpps function diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index 8c9079108ffb..973ede670a36 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -45,4 +45,7 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask, u64 max_delta) unsigned long timekeeper_lock_irqsave(void); void timekeeper_unlock_irqrestore(unsigned long flags); +/* NTP specific interface to access the current seconds value */ +long ktime_get_ntp_seconds(unsigned int id); + #endif /* _TIMEKEEPING_INTERNAL_H */ -- cgit v1.2.3 From 8ec7c826d97b390879df2a03dfb035c70af86779 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 18 Jun 2025 22:53:39 +0200 Subject: pidfs: persist information Persist exit and coredump information independent of whether anyone currently holds a pidfd for the struct pid. The current scheme allocated pidfs dentries on-demand repeatedly. This scheme is reaching its limits as it makes it impossible to pin information that needs to be available after the task has exited or coredumped and that should not be lost simply because the pidfd got closed temporarily. The next opener should still see the stashed information. This is also a prerequisite for supporting extended attributes on pidfds to allow attaching meta information to them. If someone opens a pidfd for a struct pid a pidfs dentry is allocated and stashed in pid->stashed. Once the last pidfd for the struct pid is closed the pidfs dentry is released and removed from pid->stashed. So if 10 callers create a pidfs dentry for the same struct pid sequentially, i.e., each closing the pidfd before the other creates a new one, then a new pidfs dentry is allocated every time. Because multiple tasks acquiring and releasing a pidfd for the same struct pid can race with one another, a task may still find a valid pidfs entry from the previous task in pid->stashed and reuse it. Or it might find a dead dentry in there and fail to reuse it and so stashes a new pidfs dentry. Multiple tasks may race to stash a new pidfs dentry but only one will succeed, the other ones will put their dentry. The current scheme aims to ensure that a pidfs dentry for a struct pid can only be created if the task is still alive or if a pidfs dentry already existed before the task was reaped and so exit information had been stashed in the pidfs inode. That's great except that it's buggy. If a pidfs dentry is stashed in pid->stashed after pidfs_exit() but before __unhash_process() is called we will return a pidfd for a reaped task without exit information being available. The pidfds_pid_valid() check does not guard against this race as it doesn't sync at all with pidfs_exit(). The pid_has_task() check might be successful simply because we're before __unhash_process() but after pidfs_exit(). Introduce a new scheme where the lifetime of information associated with a pidfs entry (coredump and exit information) isn't bound to the lifetime of the pidfs inode but the struct pid itself. The first time a pidfs dentry is allocated for a struct pid a struct pidfs_attr will be allocated which will be used to store exit and coredump information. If all pidfds for the pidfs dentry are closed, the dentry and inode can be cleaned up but the struct pidfs_attr will stick until the struct pid itself is freed.
This will ensure minimal memory usage while persisting relevant information. The new scheme has various advantages. First, it closes the race where we end up handing out a pidfd for a reaped task for which no exit information is available. Second, it minimizes memory usage. Third, it allows removing complex lifetime tracking via dentries when registering a struct pid with pidfs. There's no need to get or put a reference. Instead, the lifetime of exit and coredump information associated with a struct pid is bound to the lifetime of struct pid itself. Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-5-98f3456fd552@kernel.org Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- kernel/pid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 8317bcbc7cf7..07db7d8d066c 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -100,7 +100,7 @@ void put_pid(struct pid *pid) ns = pid->numbers[pid->level].ns; if (refcount_dec_and_test(&pid->count)) { - WARN_ON_ONCE(pid->stashed); + pidfs_free_pid(pid); kmem_cache_free(ns->pid_cachep, pid); put_pid_ns(ns); } -- cgit v1.2.3 From c85f5ab60820bde1510110e403d17456fbb8c266 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 19 May 2025 10:33:31 +0200 Subject: ntp: Use ktime_get_ntp_seconds() Use ktime_get_ntp_seconds() to prepare for auxiliary clocks so that the readout becomes per timekeeper. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250519083026.472512636@linutronix.de --- kernel/time/ntp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 9aba1bc7b2a7..97fa99b96dd0 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -303,7 +303,7 @@ static void ntp_update_offset(struct ntp_data *ntpdata, long offset) * Select how the frequency is to be controlled * and in which mode (PLL or FLL). */ - real_secs = __ktime_get_real_seconds(); + real_secs = ktime_get_ntp_seconds(ntpdata - tk_ntp_data); secs = (long)(real_secs - ntpdata->time_reftime); if (unlikely(ntpdata->time_status & STA_FREQHOLD)) secs = 0; @@ -710,7 +710,7 @@ static inline void process_adj_status(struct ntp_data *ntpdata, const struct __k * reference time to current time. */ if (!(ntpdata->time_status & STA_PLL) && (txc->status & STA_PLL)) - ntpdata->time_reftime = __ktime_get_real_seconds(); + ntpdata->time_reftime = ktime_get_ntp_seconds(ntpdata - tk_ntp_data); /* only set allowed bits */ ntpdata->time_status &= STA_RONLY; -- cgit v1.2.3 From 12b9a2c05d1b474518b0f5fac4a50b7f93b16930 Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Thu, 5 Jun 2025 19:11:41 +0200 Subject: kho: initialize tail pages for higher order folios properly Currently, when restoring higher order folios, kho_restore_folio() only calls prep_compound_page() on all the pages. That is not enough to properly initialize the folios. The managed page count does not get updated, the reserved flag does not get dropped, and page count does not get initialized properly.
Restoring a higher order folio with it results in the following BUG with CONFIG_DEBUG_VM when attempting to free the folio: BUG: Bad page state in process test pfn:104e2b page: refcount:1 mapcount:0 mapping:0000000000000000 index:0xffffffffffffffff pfn:0x104e2b flags: 0x2fffff80000000(node=0|zone=2|lastcpupid=0x1fffff) raw: 002fffff80000000 0000000000000000 00000000ffffffff 0000000000000000 raw: ffffffffffffffff 0000000000000000 00000001ffffffff 0000000000000000 page dumped because: nonzero _refcount [...] Call Trace: dump_stack_lvl+0x4b/0x70 bad_page.cold+0x97/0xb2 __free_frozen_pages+0x616/0x850 [...] Combine the path for 0-order and higher order folios, initialize the tail pages with a count of zero, and call adjust_managed_page_count() to account for all the pages instead of just missing them. In addition, since all the KHO-preserved pages get marked with MEMBLOCK_RSRV_NOINIT by deserialize_bitmap(), the reserved flag is not actually set (as can also be seen from the flags of the dumped page in the logs above). So drop the ClearPageReserved() calls. [ptyadav@amazon.de: declare i in the loop instead of at the top] Link: https://lkml.kernel.org/r/20250613125916.39272-1-pratyush@kernel.org Link: https://lkml.kernel.org/r/20250605171143.76963-1-pratyush@kernel.org Fixes: fc33e4b44b27 ("kexec: enable KHO support for memory preservation") Signed-off-by: Pratyush Yadav Reviewed-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Baoquan He Cc: Changyuan Lyu Signed-off-by: Andrew Morton --- kernel/kexec_handover.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 69b953551677..5a21dbe17950 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -164,11 +164,21 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, } /* almost as free_reserved_page(), just don't free the page */ -static void kho_restore_page(struct page *page) +static void kho_restore_page(struct page *page, unsigned int order) { - ClearPageReserved(page); - init_page_count(page); - adjust_managed_page_count(page, 1); + unsigned int nr_pages = (1 << order); + + /* Head page gets refcount of 1. */ + set_page_count(page, 1); + + /* For higher order folios, tail pages get a page count of zero. */ + for (unsigned int i = 1; i < nr_pages; i++) + set_page_count(page + i, 0); + + if (order > 0) + prep_compound_page(page, order); + + adjust_managed_page_count(page, nr_pages); } /** @@ -186,15 +196,10 @@ struct folio *kho_restore_folio(phys_addr_t phys) return NULL; order = page->private; - if (order) { - if (order > MAX_PAGE_ORDER) - return NULL; - - prep_compound_page(page, order); - } else { - kho_restore_page(page); - } + if (order > MAX_PAGE_ORDER) + return NULL; + kho_restore_page(page, order); return page_folio(page); } EXPORT_SYMBOL_GPL(kho_restore_folio); -- cgit v1.2.3 From 33b6a1f155d627f5bd80c7485c598ce45428f74f Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 10 Jun 2025 19:34:48 +0200 Subject: rcu: Return early if callback is not specified Currently the call_rcu() API does not check whether a callback pointer is NULL. If NULL is passed, rcu_core() will try to invoke it, resulting in NULL pointer dereference and a kernel crash. To prevent this and improve debuggability, this patch adds a check for NULL and emits a kernel stack trace to help identify a faulty caller. 
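A hedged caller-side sketch of what the check changes (demo names, not from the patch): before this change a NULL function pointer was only dereferenced later, when rcu_core() ran the callback; now the request is rejected at call time with a one-shot warning and stack trace.

static struct rcu_head demo_head;

static void demo_buggy_caller(void)
{
        /*
         * Previously this crashed later inside rcu_core(); with the check
         * it triggers WARN_ON_ONCE() and returns without queueing anything.
         */
        call_rcu(&demo_head, NULL);
}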
Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Joel Fernandes Signed-off-by: Joel Fernandes --- kernel/rcu/tree.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e8a4b720d7d2..14d4499c6fc3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3072,6 +3072,10 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) /* Misaligned rcu_head! */ WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); + /* Avoid NULL dereference if callback is NULL. */ + if (WARN_ON_ONCE(!func)) + return; + if (debug_rcu_head_queue(head)) { /* * Probable double call_rcu(), so leak the callback. -- cgit v1.2.3 From 6e6558a6bc418f1478c5dc8609d03805364e0cb9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Jun 2025 15:33:10 -1000 Subject: sched_ext, sched/core: Factor out struct scx_task_group More sched_ext fields will be added to struct task_group. In preparation, factor out sched_ext fields into struct scx_task_group to reduce clutter in the common header. No functional changes. Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 32 ++++++++++++++++---------------- kernel/sched/sched.h | 5 +---- 2 files changed, 17 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 4db51e708f86..6732e50e0679 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4058,7 +4058,7 @@ static bool scx_cgroup_enabled; void scx_tg_init(struct task_group *tg) { - tg->scx_weight = CGROUP_WEIGHT_DFL; + tg->scx.weight = CGROUP_WEIGHT_DFL; } int scx_tg_online(struct task_group *tg) @@ -4066,14 +4066,14 @@ int scx_tg_online(struct task_group *tg) struct scx_sched *sch = scx_root; int ret = 0; - WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED)); + WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED)); percpu_down_read(&scx_cgroup_rwsem); if (scx_cgroup_enabled) { if (SCX_HAS_OP(sch, cgroup_init)) { struct scx_cgroup_init_args args = - { .weight = tg->scx_weight }; + { .weight = tg->scx.weight }; ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL, tg->css.cgroup, &args); @@ -4081,9 +4081,9 @@ int scx_tg_online(struct task_group *tg) ret = ops_sanitize_err(sch, "cgroup_init", ret); } if (ret == 0) - tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; + tg->scx.flags |= SCX_TG_ONLINE | SCX_TG_INITED; } else { - tg->scx_flags |= SCX_TG_ONLINE; + tg->scx.flags |= SCX_TG_ONLINE; } percpu_up_read(&scx_cgroup_rwsem); @@ -4094,15 +4094,15 @@ void scx_tg_offline(struct task_group *tg) { struct scx_sched *sch = scx_root; - WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE)); + WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE)); percpu_down_read(&scx_cgroup_rwsem); if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) && - (tg->scx_flags & SCX_TG_INITED)) + (tg->scx.flags & SCX_TG_INITED)) SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, tg->css.cgroup); - tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); + tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); percpu_up_read(&scx_cgroup_rwsem); } @@ -4211,11 +4211,11 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) percpu_down_read(&scx_cgroup_rwsem); if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) && - tg->scx_weight != weight) + tg->scx.weight != weight) SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL, tg_cgrp(tg), weight); - tg->scx_weight = weight; + tg->scx.weight = weight; percpu_up_read(&scx_cgroup_rwsem); } @@ -4366,9 +4366,9 @@ static void 
scx_cgroup_exit(struct scx_sched *sch) css_for_each_descendant_post(css, &root_task_group.css) { struct task_group *tg = css_tg(css); - if (!(tg->scx_flags & SCX_TG_INITED)) + if (!(tg->scx.flags & SCX_TG_INITED)) continue; - tg->scx_flags &= ~SCX_TG_INITED; + tg->scx.flags &= ~SCX_TG_INITED; if (!sch->ops.cgroup_exit) continue; @@ -4400,14 +4400,14 @@ static int scx_cgroup_init(struct scx_sched *sch) rcu_read_lock(); css_for_each_descendant_pre(css, &root_task_group.css) { struct task_group *tg = css_tg(css); - struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; + struct scx_cgroup_init_args args = { .weight = tg->scx.weight }; - if ((tg->scx_flags & + if ((tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) continue; if (!sch->ops.cgroup_init) { - tg->scx_flags |= SCX_TG_INITED; + tg->scx.flags |= SCX_TG_INITED; continue; } @@ -4422,7 +4422,7 @@ static int scx_cgroup_init(struct scx_sched *sch) scx_error(sch, "ops.cgroup_init() failed (%d)", ret); return ret; } - tg->scx_flags |= SCX_TG_INITED; + tg->scx.flags |= SCX_TG_INITED; rcu_read_lock(); css_put(css); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 105190b18020..fdf5f52b54a3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -471,10 +471,7 @@ struct task_group { struct rt_bandwidth rt_bandwidth; #endif -#ifdef CONFIG_EXT_GROUP_SCHED - u32 scx_flags; /* SCX_TG_* */ - u32 scx_weight; -#endif + struct scx_task_group scx; struct rcu_head rcu; struct list_head list; -- cgit v1.2.3 From ddceadce63d9cb752c2472e220ded05cabaf7971 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Jun 2025 15:34:22 -1000 Subject: sched_ext: Add support for cgroup bandwidth control interface - Add CONFIG_GROUP_SCHED_BANDWIDTH which is selected by both CONFIG_CFS_BANDWIDTH and EXT_GROUP_SCHED. - Put bandwidth control interface files for both cgroup v1 and v2 under CONFIG_GROUP_SCHED_BANDWIDTH. - Update tg_bandwidth() to fetch configuration parameters from fair if CONFIG_CFS_BANDWIDTH, SCX otherwise. - Update tg_set_bandwidth() to update the parameters for both fair and SCX. - Add bandwidth control parameters to struct scx_cgroup_init_args. - Add sched_ext_ops.cgroup_set_bandwidth() which is invoked on bandwidth control parameter updates. - Update scx_qmap and maximal selftest to test the new feature. Signed-off-by: Tejun Heo --- kernel/sched/core.c | 29 +++++++++++++++++---- kernel/sched/ext.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++--- kernel/sched/ext.h | 2 ++ kernel/sched/sched.h | 4 ++-- 4 files changed, 91 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0e3a00e2a2cc..91845d00a1cd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9545,7 +9545,9 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v) return 0; } +#endif /* CONFIG_CFS_BANDWIDTH */ +#ifdef CONFIG_GROUP_SCHED_BANDWIDTH const u64 max_bw_quota_period_us = 1 * USEC_PER_SEC; /* 1s */ static const u64 min_bw_quota_period_us = 1 * USEC_PER_MSEC; /* 1ms */ /* More than 203 days if BW_SHIFT equals 20.
*/ @@ -9554,12 +9556,21 @@ static const u64 max_bw_runtime_us = MAX_BW; static void tg_bandwidth(struct task_group *tg, u64 *period_us_p, u64 *quota_us_p, u64 *burst_us_p) { +#ifdef CONFIG_CFS_BANDWIDTH if (period_us_p) *period_us_p = tg_get_cfs_period(tg); if (quota_us_p) *quota_us_p = tg_get_cfs_quota(tg); if (burst_us_p) *burst_us_p = tg_get_cfs_burst(tg); +#else /* !CONFIG_CFS_BANDWIDTH */ + if (period_us_p) + *period_us_p = tg->scx.bw_period_us; + if (quota_us_p) + *quota_us_p = tg->scx.bw_quota_us; + if (burst_us_p) + *burst_us_p = tg->scx.bw_burst_us; +#endif /* CONFIG_CFS_BANDWIDTH */ } static u64 cpu_period_read_u64(struct cgroup_subsys_state *css, @@ -9575,6 +9586,7 @@ static int tg_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us) { const u64 max_usec = U64_MAX / NSEC_PER_USEC; + int ret = 0; if (tg == &root_task_group) return -EINVAL; @@ -9612,7 +9624,12 @@ static int tg_set_bandwidth(struct task_group *tg, burst_us + quota_us > max_bw_runtime_us)) return -EINVAL; - return tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us); +#ifdef CONFIG_CFS_BANDWIDTH + ret = tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us); +#endif /* CONFIG_CFS_BANDWIDTH */ + if (!ret) + scx_group_set_bandwidth(tg, period_us, quota_us, burst_us); + return ret; } static s64 cpu_quota_read_s64(struct cgroup_subsys_state *css, @@ -9665,7 +9682,7 @@ static int cpu_burst_write_u64(struct cgroup_subsys_state *css, tg_bandwidth(tg, &period_us, "a_us, NULL); return tg_set_bandwidth(tg, period_us, quota_us, burst_us); } -#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* CONFIG_GROUP_SCHED_BANDWIDTH */ #ifdef CONFIG_RT_GROUP_SCHED static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, @@ -9725,7 +9742,7 @@ static struct cftype cpu_legacy_files[] = { .write_s64 = cpu_idle_write_s64, }, #endif -#ifdef CONFIG_CFS_BANDWIDTH +#ifdef CONFIG_GROUP_SCHED_BANDWIDTH { .name = "cfs_period_us", .read_u64 = cpu_period_read_u64, @@ -9741,6 +9758,8 @@ static struct cftype cpu_legacy_files[] = { .read_u64 = cpu_burst_read_u64, .write_u64 = cpu_burst_write_u64, }, +#endif +#ifdef CONFIG_CFS_BANDWIDTH { .name = "stat", .seq_show = cpu_cfs_stat_show, @@ -9954,7 +9973,7 @@ static int __maybe_unused cpu_period_quota_parse(char *buf, u64 *period_us_p, return 0; } -#ifdef CONFIG_CFS_BANDWIDTH +#ifdef CONFIG_GROUP_SCHED_BANDWIDTH static int cpu_max_show(struct seq_file *sf, void *v) { struct task_group *tg = css_tg(seq_css(sf)); @@ -10001,7 +10020,7 @@ static struct cftype cpu_files[] = { .write_s64 = cpu_idle_write_s64, }, #endif -#ifdef CONFIG_CFS_BANDWIDTH +#ifdef CONFIG_GROUP_SCHED_BANDWIDTH { .name = "max", .flags = CFTYPE_NOT_ON_ROOT, diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 6732e50e0679..39cba11688a9 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -203,6 +203,11 @@ struct scx_exit_task_args { struct scx_cgroup_init_args { /* the weight of the cgroup [1..10000] */ u32 weight; + + /* bandwidth control parameters from cpu.max and cpu.max.burst */ + u64 bw_period_us; + u64 bw_quota_us; + u64 bw_burst_us; }; enum scx_cpu_preempt_reason { @@ -664,9 +669,31 @@ struct sched_ext_ops { * @cgrp: cgroup whose weight is being updated * @weight: new weight [1..10000] * - * Update @tg's weight to @weight. + * Update @cgrp's weight to @weight. 
*/ void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); + + /** + * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed + * @cgrp: cgroup whose bandwidth is being updated + * @period_us: bandwidth control period + * @quota_us: bandwidth control quota + * @burst_us: bandwidth control burst + * + * Update @cgrp's bandwidth control parameters. This is from the cpu.max + * cgroup interface. + * + * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled + * to. For example, if @period_us is 1_000_000 and @quota_us is + * 2_500_000, @cgrp is entitled to 2.5 CPUs. @burst_us can be + * interpreted in the same fashion and specifies how much @cgrp can + * burst temporarily. The specific control mechanism and thus the + * interpretation of @period_us and burstiness is up to the BPF + * scheduler. + */ + void (*cgroup_set_bandwidth)(struct cgroup *cgrp, + u64 period_us, u64 quota_us, u64 burst_us); + #endif /* CONFIG_EXT_GROUP_SCHED */ /* @@ -4059,6 +4086,8 @@ static bool scx_cgroup_enabled; void scx_tg_init(struct task_group *tg) { tg->scx.weight = CGROUP_WEIGHT_DFL; + tg->scx.bw_period_us = default_bw_period_us(); + tg->scx.bw_quota_us = RUNTIME_INF; } int scx_tg_online(struct task_group *tg) @@ -4073,7 +4102,10 @@ int scx_tg_online(struct task_group *tg) if (scx_cgroup_enabled) { if (SCX_HAS_OP(sch, cgroup_init)) { struct scx_cgroup_init_args args = - { .weight = tg->scx.weight }; + { .weight = tg->scx.weight, + .bw_period_us = tg->scx.bw_period_us, + .bw_quota_us = tg->scx.bw_quota_us, + .bw_burst_us = tg->scx.bw_burst_us }; ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL, tg->css.cgroup, &args); @@ -4225,6 +4257,27 @@ void scx_group_set_idle(struct task_group *tg, bool idle) /* TODO: Implement ops->cgroup_set_idle() */ } +void scx_group_set_bandwidth(struct task_group *tg, + u64 period_us, u64 quota_us, u64 burst_us) +{ + struct scx_sched *sch = scx_root; + + percpu_down_read(&scx_cgroup_rwsem); + + if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) && + (tg->scx.bw_period_us != period_us || + tg->scx.bw_quota_us != quota_us || + tg->scx.bw_burst_us != burst_us)) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_bandwidth, NULL, + tg_cgrp(tg), period_us, quota_us, burst_us); + + tg->scx.bw_period_us = period_us; + tg->scx.bw_quota_us = quota_us; + tg->scx.bw_burst_us = burst_us; + + percpu_up_read(&scx_cgroup_rwsem); +} + static void scx_cgroup_lock(void) { percpu_down_write(&scx_cgroup_rwsem); @@ -4400,7 +4453,12 @@ static int scx_cgroup_init(struct scx_sched *sch) rcu_read_lock(); css_for_each_descendant_pre(css, &root_task_group.css) { struct task_group *tg = css_tg(css); - struct scx_cgroup_init_args args = { .weight = tg->scx.weight }; + struct scx_cgroup_init_args args = { + .weight = tg->scx.weight, + .bw_period_us = tg->scx.bw_period_us, + .bw_quota_us = tg->scx.bw_quota_us, + .bw_burst_us = tg->scx.bw_burst_us, + }; if ((tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) @@ -5902,6 +5960,7 @@ static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {} +static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {} #endif static void
sched_ext_ops__cpu_online(s32 cpu) {} static void sched_ext_ops__cpu_offline(s32 cpu) {} @@ -5939,6 +5998,7 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = { .cgroup_move = sched_ext_ops__cgroup_move, .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move, .cgroup_set_weight = sched_ext_ops__cgroup_set_weight, + .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth, #endif .cpu_online = sched_ext_ops__cpu_online, .cpu_offline = sched_ext_ops__cpu_offline, diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index e7bcaa02ea56..292bb41a242e 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -104,6 +104,7 @@ void scx_cgroup_finish_attach(void); void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); void scx_group_set_idle(struct task_group *tg, bool idle); +void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us); #else /* CONFIG_EXT_GROUP_SCHED */ static inline void scx_tg_init(struct task_group *tg) {} static inline int scx_tg_online(struct task_group *tg) { return 0; } @@ -114,5 +115,6 @@ static inline void scx_cgroup_finish_attach(void) {} static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} static inline void scx_group_set_idle(struct task_group *tg, bool idle) {} +static inline void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us) {} #endif /* CONFIG_EXT_GROUP_SCHED */ #endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fdf5f52b54a3..06767a210717 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -402,7 +402,7 @@ static inline bool dl_server_active(struct sched_dl_entity *dl_se) extern struct list_head task_groups; -#ifdef CONFIG_CFS_BANDWIDTH +#ifdef CONFIG_GROUP_SCHED_BANDWIDTH extern const u64 max_bw_quota_period_us; /* @@ -413,7 +413,7 @@ static inline u64 default_bw_period_us(void) { return 100000ULL; } -#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* CONFIG_GROUP_SCHED_BANDWIDTH */ struct cfs_bandwidth { #ifdef CONFIG_CFS_BANDWIDTH -- cgit v1.2.3 From 535b070f4a807bbd26a30994aba8dfb4011fd447 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Sun, 22 Jun 2025 23:38:52 -0700 Subject: bpf: Introduce bpf_cgroup_read_xattr to read xattr of cgroup's node BPF programs, such as LSM and sched_ext, would benefit from tags on cgroups. One common practice to apply such tags is to set xattrs on cgroupfs folders. Introduce kfunc bpf_cgroup_read_xattr, which allows reading cgroup's xattr. Note that we already have bpf_get_[file|dentry]_xattr. However, these two APIs are not ideal for reading cgroupfs xattrs, because: 1) These two APIs only work in sleepable contexts; 2) There is no kfunc that matches current cgroup to cgroupfs dentry. bpf_cgroup_read_xattr is generic and can be useful for many program types. It is also safe, because it requires a trusted or RCU-protected argument (KF_RCU). Therefore, we make it available to all program types.
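A hedged BPF-side usage sketch: the exact kfunc prototype is not shown in this log, so the declaration below is an assumption based on the description (trusted/RCU cgroup pointer, xattr name, output via dynptr); the attach point and the "user.tag" xattr name are likewise arbitrary.

// Assumed prototype and demo attach point -- illustration only.
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

extern int bpf_cgroup_read_xattr(struct cgroup *cgroup, const char *name__str,
                                 struct bpf_dynptr *value_p) __ksym;
extern void bpf_rcu_read_lock(void) __ksym;
extern void bpf_rcu_read_unlock(void) __ksym;

char buf[64];

SEC("tp_btf/task_newtask")
int BPF_PROG(demo_read_tag, struct task_struct *task, u64 clone_flags)
{
        struct bpf_dynptr ptr;
        struct cgroup *cgrp;

        bpf_rcu_read_lock();
        cgrp = task->cgroups->dfl_cgrp;  /* RCU-safe per the verifier annotations below */
        if (cgrp) {
                bpf_dynptr_from_mem(buf, sizeof(buf), 0, &ptr);
                bpf_cgroup_read_xattr(cgrp, "user.tag", &ptr);
        }
        bpf_rcu_read_unlock();
        return 0;
}

char LICENSE[] SEC("license") = "GPL";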
Signed-off-by: Song Liu Link: https://lore.kernel.org/20250623063854.1896364-3-song@kernel.org Acked-by: Tejun Heo Signed-off-by: Christian Brauner --- kernel/bpf/helpers.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index b71e428ad936..9ff1b4090289 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3397,6 +3397,9 @@ BTF_ID_FLAGS(func, bpf_iter_dmabuf_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPAB BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) #endif BTF_ID_FLAGS(func, __bpf_trap) +#ifdef CONFIG_CGROUPS +BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) +#endif BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { -- cgit v1.2.3 From 1504d8c7c702cc3697ad1a690c2d6bb4c8687927 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Sun, 22 Jun 2025 23:38:53 -0700 Subject: bpf: Mark cgroup_subsys_state->cgroup RCU safe Mark struct cgroup_subsys_state->cgroup as safe under RCU read lock. This will enable accessing css->cgroup from a bpf css iterator. Signed-off-by: Song Liu Link: https://lore.kernel.org/20250623063854.1896364-4-song@kernel.org Acked-by: Tejun Heo Signed-off-by: Christian Brauner --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a7d6e0c5928b..db01bf51c792 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6993,6 +6993,10 @@ BTF_TYPE_SAFE_RCU(struct css_set) { struct cgroup *dfl_cgrp; }; +BTF_TYPE_SAFE_RCU(struct cgroup_subsys_state) { + struct cgroup *cgroup; +}; + /* RCU trusted: these fields are trusted in RCU CS and can be NULL */ BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct) { struct file __rcu *exe_file; @@ -7043,6 +7047,7 @@ static bool type_is_rcu(struct bpf_verifier_env *env, BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct task_struct)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct cgroup)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct css_set)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct cgroup_subsys_state)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu"); } -- cgit v1.2.3 From c11f34e30088b25b7e240e12566fcf28f7ad07cc Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 21 Jun 2025 12:55:01 +0800 Subject: bpf: Make update_prog_stats() always_inline The function update_prog_stats() will be called in the bpf trampoline. In most cases, it will be optimized by the compiler by making it inline. However, we can't rely on the compiler all the time, and just make it __always_inline to reduce the possible overhead. 
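The generic shape of this optimization, as a hedged sketch with made-up demo_* names: keep only the static-branch test in a forced-inline stub so the common (stats-off) path costs a single patched jump, and push the cold bookkeeping out of line.

static void notrace demo_stats_slowpath(struct bpf_prog *prog, u64 start)
{
        /* cold: entered only while stats collection is switched on */
}

static __always_inline void notrace demo_stats_update(struct bpf_prog *prog, u64 start)
{
        /* hot: compiles to one patched jump while stats are off */
        if (static_branch_unlikely(&bpf_stats_enabled_key))
                demo_stats_slowpath(prog, start);
}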
Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20250621045501.101187-1-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- kernel/bpf/trampoline.c | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index c4b1a98ff726..b1e358c16eeb 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -911,27 +911,32 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram return bpf_prog_start_time(); } -static void notrace update_prog_stats(struct bpf_prog *prog, - u64 start) +static void notrace __update_prog_stats(struct bpf_prog *prog, u64 start) { struct bpf_prog_stats *stats; + unsigned long flags; + u64 duration; - if (static_branch_unlikely(&bpf_stats_enabled_key) && - /* static_key could be enabled in __bpf_prog_enter* - * and disabled in __bpf_prog_exit*. - * And vice versa. - * Hence check that 'start' is valid. - */ - start > NO_START_TIME) { - u64 duration = sched_clock() - start; - unsigned long flags; - - stats = this_cpu_ptr(prog->stats); - flags = u64_stats_update_begin_irqsave(&stats->syncp); - u64_stats_inc(&stats->cnt); - u64_stats_add(&stats->nsecs, duration); - u64_stats_update_end_irqrestore(&stats->syncp, flags); - } + /* + * static_key could be enabled in __bpf_prog_enter* and disabled in + * __bpf_prog_exit*. And vice versa. Check that 'start' is valid. + */ + if (start <= NO_START_TIME) + return; + + duration = sched_clock() - start; + stats = this_cpu_ptr(prog->stats); + flags = u64_stats_update_begin_irqsave(&stats->syncp); + u64_stats_inc(&stats->cnt); + u64_stats_add(&stats->nsecs, duration); + u64_stats_update_end_irqrestore(&stats->syncp, flags); +} + +static __always_inline void notrace update_prog_stats(struct bpf_prog *prog, + u64 start) +{ + if (static_branch_unlikely(&bpf_stats_enabled_key)) + __update_prog_stats(prog, start); } static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start, -- cgit v1.2.3 From 867347bb21e18551b48eb147c0b2d8635909c8b5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 May 2025 16:52:16 -0700 Subject: sched/wait: Drop WQ_FLAG_EXCLUSIVE from add_wait_queue_priority() Drop the setting of WQ_FLAG_EXCLUSIVE from add_wait_queue_priority() and instead have callers manually add the flag prior to adding their structure to the queue. Blindly setting WQ_FLAG_EXCLUSIVE is flawed, as the nature of exclusive, priority waiters means that only the first waiter added will ever receive notifications. Pushing the flawed behavior to callers will allow fixing the problem one hypervisor at a time (KVM added the flawed API, and then KVM's code was copy+pasted nearly verbatim by Xen and Hyper-V), and will also allow for adding an API that provides true exclusivity, i.e. that guarantees at most one priority waiter is in the queue. Opportunistically add a comment in Hyper-V to call out the mess. Xen privcmd's irqfd_wakeup() doesn't actually operate in exclusive mode, i.e. can be "fixed" simply by dropping WQ_FLAG_EXCLUSIVE. And KVM is primed to switch to the aforementioned fully exclusive API, i.e. won't be carrying the flawed code for long. No functional change intended.
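A hedged caller-side sketch of the new contract (demo wrapper, not from the patch): a waiter that genuinely wants exclusive priority semantics now sets WQ_FLAG_EXCLUSIVE itself before queueing, instead of having it forced by the helper.

static void demo_queue_priority_waiter(struct wait_queue_head *wqh,
                                       struct wait_queue_entry *wait,
                                       bool exclusive)
{
        if (exclusive)
                wait->flags |= WQ_FLAG_EXCLUSIVE;       /* now the caller's choice */
        add_wait_queue_priority(wqh, wait);             /* only adds WQ_FLAG_PRIORITY */
}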
Tested-by: K Prateek Nayak Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250522235223.3178519-7-seanjc@google.com Signed-off-by: Sean Christopherson --- kernel/sched/wait.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 51e38f5f4701..4ab3ab195277 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -40,7 +40,7 @@ void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_ { unsigned long flags; - wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY; + wq_entry->flags |= WQ_FLAG_PRIORITY; spin_lock_irqsave(&wq_head->lock, flags); __add_wait_queue(wq_head, wq_entry); spin_unlock_irqrestore(&wq_head->lock, flags); @@ -64,7 +64,7 @@ EXPORT_SYMBOL(remove_wait_queue); * the non-exclusive tasks. Normally, exclusive tasks will be at the end of * the list and any non-exclusive tasks will be woken first. A priority task * may be at the head of the list, and can consume the event without any other - * tasks being woken. + * tasks being woken if it's also an exclusive task. * * There are circumstances in which we can try to wake a task which has already * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns -- cgit v1.2.3 From 0d09582b3a607436fd91d6ce813048a048ecbf10 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 May 2025 16:52:18 -0700 Subject: sched/wait: Add a waitqueue helper for fully exclusive priority waiters Add a waitqueue helper to add a priority waiter that requires exclusive wakeups, i.e. that requires that it be the _only_ priority waiter. The API will be used by KVM to ensure that at most one of KVM's irqfds is bound to a single eventfd (across the entire kernel). Open code the helper instead of using __add_wait_queue() so that the common path doesn't need to "handle" impossible failures. 
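A hedged usage sketch (hypothetical caller, kernel context, assuming <linux/wait.h> and <linux/printk.h>) of the helper added below: the -EBUSY return lets a caller reject a duplicate binding up front instead of silently queueing a priority waiter that would never be woken.

static int bind_exclusive_priority_waiter(struct wait_queue_head *wqh,
                                          struct wait_queue_entry *wqe)
{
        int ret = add_wait_queue_priority_exclusive(wqh, wqe);

        if (ret == -EBUSY)      /* another priority waiter is already queued */
                pr_warn("wait queue already has a priority waiter\n");
        return ret;
}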
Cc: K Prateek Nayak Reviewed-by: K Prateek Nayak Tested-by: K Prateek Nayak Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250522235223.3178519-9-seanjc@google.com Signed-off-by: Sean Christopherson --- kernel/sched/wait.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 4ab3ab195277..15632c89c4f2 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -47,6 +47,24 @@ void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_ } EXPORT_SYMBOL_GPL(add_wait_queue_priority); +int add_wait_queue_priority_exclusive(struct wait_queue_head *wq_head, + struct wait_queue_entry *wq_entry) +{ + struct list_head *head = &wq_head->head; + + wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY; + + guard(spinlock_irqsave)(&wq_head->lock); + + if (!list_empty(head) && + (list_first_entry(head, typeof(*wq_entry), entry)->flags & WQ_FLAG_PRIORITY)) + return -EBUSY; + + list_add(&wq_entry->entry, head); + return 0; +} +EXPORT_SYMBOL_GPL(add_wait_queue_priority_exclusive); + void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; -- cgit v1.2.3 From 7484e15dbb016d9d40f8c6e0475810212ae181db Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 17 Jun 2025 00:09:51 -0400 Subject: replace collect_mounts()/drop_collected_mounts() with a safer variant collect_mounts() has several problems - one can't iterate over the results directly, so it has to be done with callback passed to iterate_mounts(); it has an oopsable race with d_invalidate(); it creates temporary clones of mounts invisibly for sync umount (IOW, you can have non-lazy umount succeed leaving filesystem not mounted anywhere and yet still busy). A saner approach is to give caller an array of struct path that would pin every mount in a subtree, without cloning any mounts. * collect_mounts()/drop_collected_mounts()/iterate_mounts() is gone * collect_paths(where, preallocated, size) gives either ERR_PTR(-E...) or a pointer to array of struct path, one for each chunk of tree visible under 'where' (i.e. the first element is a copy of where, followed by (mount,root) for everything mounted under it - the same set collect_mounts() would give). Unlike collect_mounts(), the mounts are *not* cloned - we just get pinning references to the roots of subtrees in the caller's namespace. Array is terminated by {NULL, NULL} struct path. If it fits into preallocated array (on-stack, normally), that's where it goes; otherwise it's allocated by kmalloc_array(). Passing 0 as size means that 'preallocated' is ignored (and expected to be NULL). * drop_collected_paths(paths, preallocated) is given the array returned by an earlier call of collect_paths() and the preallocated array passed to that call. All mount/dentry references are dropped and array is kfree'd if it's not equal to 'preallocated'. * instead of iterate_mounts(), users should just iterate over array of struct path - nothing exotic is needed for that. Existing users (all in audit_tree.c) are converted. 
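A hedged sketch of the new calling convention, with visit() as a hypothetical per-mount action; it mirrors the audit_tree.c conversions in the diff below (iterate the NULL-terminated array directly, no callback needed):

/* visit() is an assumed helper acting on one (mount, root) pair */
static int visit(struct path *p);

static int walk_subtree(struct path *where)
{
        struct path array[16];
        struct path *paths = collect_paths(where, array, 16);
        struct path *p;
        int err = 0;

        if (IS_ERR(paths))
                return PTR_ERR(paths);
        for (p = paths; p->dentry; p++) {       /* {NULL, NULL} terminates */
                err = visit(p);
                if (err)
                        break;
        }
        drop_collected_paths(paths, array);
        return err;
}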
[folded a fix for braino reported by Venkat Rao Bagalkote ] Fixes: 80b5dce8c59b0 ("vfs: Add a function to lazily unmount all mounts from any dentry") Tested-by: Venkat Rao Bagalkote Signed-off-by: Al Viro --- kernel/audit_tree.c | 63 +++++++++++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index f2f38903b2fe..b0eae2a3c895 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -668,12 +668,6 @@ int audit_remove_tree_rule(struct audit_krule *rule) return 0; } -static int compare_root(struct vfsmount *mnt, void *arg) -{ - return inode_to_key(d_backing_inode(mnt->mnt_root)) == - (unsigned long)arg; -} - void audit_trim_trees(void) { struct list_head cursor; @@ -683,8 +677,9 @@ void audit_trim_trees(void) while (cursor.next != &tree_list) { struct audit_tree *tree; struct path path; - struct vfsmount *root_mnt; struct audit_node *node; + struct path *paths; + struct path array[16]; int err; tree = container_of(cursor.next, struct audit_tree, list); @@ -696,9 +691,9 @@ void audit_trim_trees(void) if (err) goto skip_it; - root_mnt = collect_mounts(&path); + paths = collect_paths(&path, array, 16); path_put(&path); - if (IS_ERR(root_mnt)) + if (IS_ERR(paths)) goto skip_it; spin_lock(&hash_lock); @@ -706,14 +701,17 @@ void audit_trim_trees(void) struct audit_chunk *chunk = find_chunk(node); /* this could be NULL if the watch is dying else where... */ node->index |= 1U<<31; - if (iterate_mounts(compare_root, - (void *)(chunk->key), - root_mnt)) - node->index &= ~(1U<<31); + for (struct path *p = paths; p->dentry; p++) { + struct inode *inode = p->dentry->d_inode; + if (inode_to_key(inode) == chunk->key) { + node->index &= ~(1U<<31); + break; + } + } } spin_unlock(&hash_lock); trim_marked(tree); - drop_collected_mounts(root_mnt); + drop_collected_paths(paths, array); skip_it: put_tree(tree); mutex_lock(&audit_filter_mutex); @@ -742,9 +740,14 @@ void audit_put_tree(struct audit_tree *tree) put_tree(tree); } -static int tag_mount(struct vfsmount *mnt, void *arg) +static int tag_mounts(struct path *paths, struct audit_tree *tree) { - return tag_chunk(d_backing_inode(mnt->mnt_root), arg); + for (struct path *p = paths; p->dentry; p++) { + int err = tag_chunk(p->dentry->d_inode, tree); + if (err) + return err; + } + return 0; } /* @@ -801,7 +804,8 @@ int audit_add_tree_rule(struct audit_krule *rule) { struct audit_tree *seed = rule->tree, *tree; struct path path; - struct vfsmount *mnt; + struct path array[16]; + struct path *paths; int err; rule->tree = NULL; @@ -828,16 +832,16 @@ int audit_add_tree_rule(struct audit_krule *rule) err = kern_path(tree->pathname, 0, &path); if (err) goto Err; - mnt = collect_mounts(&path); + paths = collect_paths(&path, array, 16); path_put(&path); - if (IS_ERR(mnt)) { - err = PTR_ERR(mnt); + if (IS_ERR(paths)) { + err = PTR_ERR(paths); goto Err; } get_tree(tree); - err = iterate_mounts(tag_mount, tree, mnt); - drop_collected_mounts(mnt); + err = tag_mounts(paths, tree); + drop_collected_paths(paths, array); if (!err) { struct audit_node *node; @@ -872,20 +876,21 @@ int audit_tag_tree(char *old, char *new) struct list_head cursor, barrier; int failed = 0; struct path path1, path2; - struct vfsmount *tagged; + struct path array[16]; + struct path *paths; int err; err = kern_path(new, 0, &path2); if (err) return err; - tagged = collect_mounts(&path2); + paths = collect_paths(&path2, array, 16); path_put(&path2); - if (IS_ERR(tagged)) - return 
PTR_ERR(tagged); + if (IS_ERR(paths)) + return PTR_ERR(paths); err = kern_path(old, 0, &path1); if (err) { - drop_collected_mounts(tagged); + drop_collected_paths(paths, array); return err; } @@ -914,7 +919,7 @@ int audit_tag_tree(char *old, char *new) continue; } - failed = iterate_mounts(tag_mount, tree, tagged); + failed = tag_mounts(paths, tree); if (failed) { put_tree(tree); mutex_lock(&audit_filter_mutex); @@ -955,7 +960,7 @@ int audit_tag_tree(char *old, char *new) list_del(&cursor); mutex_unlock(&audit_filter_mutex); path_put(&path1); - drop_collected_mounts(tagged); + drop_collected_paths(paths, array); return failed; } -- cgit v1.2.3 From fc2898ea793a48bc4b74b61cde2d8656f20efdf4 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 23 Jun 2025 01:30:49 +0100 Subject: workqueue: Remove unused work_on_cpu_safe The last use of the work_on_cpu_safe() macro was removed recently by commit 9cda46babdfe ("crypto: n2 - remove Niagara2 SPU driver") Remove it, and the work_on_cpu_safe_key() function it calls. Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Tejun Heo --- kernel/workqueue.c | 25 ------------------------- 1 file changed, 25 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 086452e339c4..12c3080332bd 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -6771,31 +6771,6 @@ long work_on_cpu_key(int cpu, long (*fn)(void *), return wfc.ret; } EXPORT_SYMBOL_GPL(work_on_cpu_key); - -/** - * work_on_cpu_safe_key - run a function in thread context on a particular cpu - * @cpu: the cpu to run on - * @fn: the function to run - * @arg: the function argument - * @key: The lock class key for lock debugging purposes - * - * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold - * any locks which would prevent @fn from completing. - * - * Return: The value @fn returns. - */ -long work_on_cpu_safe_key(int cpu, long (*fn)(void *), - void *arg, struct lock_class_key *key) -{ - long ret = -ENODEV; - - cpus_read_lock(); - if (cpu_online(cpu)) - ret = work_on_cpu_key(cpu, fn, arg, key); - cpus_read_unlock(); - return ret; -} -EXPORT_SYMBOL_GPL(work_on_cpu_safe_key); #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER -- cgit v1.2.3 From e2a37c277c64078d5439693963fb9813fa1e6e9c Mon Sep 17 00:00:00 2001 From: Ke Ma Date: Thu, 19 Jun 2025 23:11:28 +0200 Subject: kernel/sched/ext.c: fix typo "occured" -> "occurred" in comments Fixes a minor spelling mistake in two comment lines Signed-off-by: Ke Ma Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 39cba11688a9..bee98fdcdd01 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -1655,7 +1655,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) * scx_add_event - Increase an event counter for 'name' by 'cnt' * @sch: scx_sched to account events for * @name: an event name defined in struct scx_event_stats - * @cnt: the number of the event occured + * @cnt: the number of the event occurred * * This can be used when preemption is not disabled. 
*/ @@ -1668,7 +1668,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) * __scx_add_event - Increase an event counter for 'name' by 'cnt' * @sch: scx_sched to account events for * @name: an event name defined in struct scx_event_stats - * @cnt: the number of the event occured + * @cnt: the number of the event occurred * * This should be used only when preemption is disabled. */ -- cgit v1.2.3 From 2eb7648558a7911f2208e8940cd22ca40e93cc76 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Thu, 19 Jun 2025 16:06:02 +0200 Subject: bpf: Specify access type of bpf_sysctl_get_name args The second argument of the bpf_sysctl_get_name() helper is a pointer to a buffer that is being written to. However, that isn't specified in the prototype. Until commit 37cce22dbd51a ("bpf: verifier: Refactor helper access type tracking"), all helper accesses were considered as a possible write access by the verifier, so no big harm was done. However, since then, the verifier might make wrong assumptions about the content of that address, which might lead it to make faulty optimizations (such as removing code that was wrongly labeled dead). This is what happens in the test_sysctl selftest for the tests related to sysctl_get_name. Add the MEM_WRITE flag to the second argument of bpf_sysctl_get_name(). Signed-off-by: Jerome Marchand Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20250619140603.148942-2-jmarchan@redhat.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9122c39870bf..f4885514f007 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -2134,7 +2134,7 @@ static const struct bpf_func_proto bpf_sysctl_get_name_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; -- cgit v1.2.3 From 626c54af35764b0b8a4ed5c446458ba6ddfe9cc8 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 9 Jun 2025 01:59:55 +0900 Subject: kheaders: rebuild kheaders_data.tar.xz when a file is modified within a minute When a header file is changed, kernel/gen_kheaders.sh may fail to update kernel/kheaders_data.tar.xz. [steps to reproduce] [1] Build kernel/kheaders_data.tar.xz $ make -j$(nproc) kernel/kheaders.o DESCEND objtool INSTALL libsubcmd_headers CALL scripts/checksyscalls.sh CHK kernel/kheaders_data.tar.xz GEN kernel/kheaders_data.tar.xz CC kernel/kheaders.o [2] Modify a header without changing the file size $ sed -i s/0xdeadbeef/0xfeedbeef/ include/linux/elfnote.h [3] Rebuild kernel/kheaders_data.tar.xz $ make -j$(nproc) kernel/kheaders.o DESCEND objtool INSTALL libsubcmd_headers CALL scripts/checksyscalls.sh CHK kernel/kheaders_data.tar.xz kernel/kheaders_data.tar.xz is not updated if steps [1] - [3] are run within the same minute. The headers_md5 variable stores the MD5 hash of the 'ls -l' output for all header files. This hash value is used to determine whether kheaders_data.tar.xz needs to be rebuilt. However, 'ls -l' prints the modification times with minute-level granularity. If a file is modified within the same minute and its size remains the same, the MD5 hash does not change. To reliably detect file modifications, this commit rewrites kernel/gen_kheaders.sh to output header dependencies to kernel/.kheaders_data.tar.xz.cmd. 
Then, Make compares the timestamps and reruns kernel/gen_kheaders.sh when necessary. This is the standard mechanism used by Make and Kbuild. Signed-off-by: Masahiro Yamada --- kernel/.gitignore | 2 ++ kernel/Makefile | 47 ++++++++++++++++++++++--- kernel/gen_kheaders.sh | 94 ++++++++++++-------------------------------------- 3 files changed, 66 insertions(+), 77 deletions(-) (limited to 'kernel') diff --git a/kernel/.gitignore b/kernel/.gitignore index c6b299a6b786..a501bfc80694 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -1,3 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only /config_data /kheaders.md5 +/kheaders-objlist +/kheaders-srclist diff --git a/kernel/Makefile b/kernel/Makefile index 32e80dd626af..9a9ff405ea89 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -158,11 +158,48 @@ filechk_cat = cat $< $(obj)/config_data: $(KCONFIG_CONFIG) FORCE $(call filechk,cat) +# kheaders_data.tar.xz $(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz -quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz - cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_kheaders.sh $@ -$(obj)/kheaders_data.tar.xz: FORCE - $(call cmd,genikh) +quiet_cmd_kheaders_data = GEN $@ + cmd_kheaders_data = "$<" "$@" "$(obj)/kheaders-srclist" "$(obj)/kheaders-objlist" + cmd_kheaders_data_dep = cat $(depfile) >> $(dot-target).cmd; rm -f $(depfile) -clean-files := kheaders_data.tar.xz kheaders.md5 +define rule_kheaders_data + $(call cmd_and_savecmd,kheaders_data) + $(call cmd,kheaders_data_dep) +endef + +targets += kheaders_data.tar.xz +$(obj)/kheaders_data.tar.xz: $(src)/gen_kheaders.sh $(obj)/kheaders-srclist $(obj)/kheaders-objlist $(obj)/kheaders.md5 FORCE + $(call if_changed_rule,kheaders_data) + +# generated headers in objtree +# +# include/generated/utsversion.h is ignored because it is generated +# after gen_kheaders.sh is executed. (utsversion.h is unneeded for kheaders) +filechk_kheaders_objlist = \ + for d in include "arch/$(SRCARCH)/include"; do \ + find "$${d}/generated" ! -path "include/generated/utsversion.h" -a -name "*.h" -print; \ + done + +$(obj)/kheaders-objlist: FORCE + $(call filechk,kheaders_objlist) + +# non-generated headers in srctree +filechk_kheaders_srclist = \ + for d in include "arch/$(SRCARCH)/include"; do \ + find "$(srctree)/$${d}" -path "$(srctree)/$${d}/generated" -prune -o -name "*.h" -print; \ + done + +$(obj)/kheaders-srclist: FORCE + $(call filechk,kheaders_srclist) + +# Some files are symlinks. If symlinks are changed, kheaders_data.tar.xz should +# be rebuilt. +filechk_kheaders_md5sum = xargs -r -a $< stat -c %N | md5sum + +$(obj)/kheaders.md5: $(obj)/kheaders-srclist FORCE + $(call filechk,kheaders_md5sum) + +clean-files := kheaders.md5 kheaders-srclist kheaders-objlist diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index c9e5dc068e85..0ff7beabb21a 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -4,79 +4,33 @@ # This script generates an archive consisting of kernel headers # for CONFIG_IKHEADERS. set -e -sfile="$(readlink -f "$0")" -outdir="$(pwd)" tarfile=$1 -tmpdir=$outdir/${tarfile%/*}/.tmp_dir - -dir_list=" -include/ -arch/$SRCARCH/include/ -" - -# Support incremental builds by skipping archive generation -# if timestamps of files being archived are not changed. - -# This block is useful for debugging the incremental builds. -# Uncomment it for debugging. -# if [ ! 
-f /tmp/iter ]; then iter=1; echo 1 > /tmp/iter; -# else iter=$(($(cat /tmp/iter) + 1)); echo $iter > /tmp/iter; fi -# find $all_dirs -name "*.h" | xargs ls -l > /tmp/ls-$iter - -all_dirs= -if [ "$building_out_of_srctree" ]; then - for d in $dir_list; do - all_dirs="$all_dirs $srctree/$d" - done -fi -all_dirs="$all_dirs $dir_list" - -# include/generated/utsversion.h is ignored because it is generated after this -# script is executed. (utsversion.h is unneeded for kheaders) -# -# When Kconfig regenerates include/generated/autoconf.h, its timestamp is -# updated, but the contents might be still the same. When any CONFIG option is -# changed, Kconfig touches the corresponding timestamp file include/config/*. -# Hence, the md5sum detects the configuration change anyway. We do not need to -# check include/generated/autoconf.h explicitly. -# -# Ignore them for md5 calculation to avoid pointless regeneration. -headers_md5="$(find $all_dirs -name "*.h" -a \ - ! -path include/generated/utsversion.h -a \ - ! -path include/generated/autoconf.h | - xargs ls -l | md5sum | cut -d ' ' -f1)" - -# Any changes to this script will also cause a rebuild of the archive. -this_file_md5="$(ls -l $sfile | md5sum | cut -d ' ' -f1)" -if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi -if [ -f kernel/kheaders.md5 ] && - [ "$(head -n 1 kernel/kheaders.md5)" = "$headers_md5" ] && - [ "$(head -n 2 kernel/kheaders.md5 | tail -n 1)" = "$this_file_md5" ] && - [ "$(tail -n 1 kernel/kheaders.md5)" = "$tarfile_md5" ]; then - exit -fi - -echo " GEN $tarfile" +srclist=$2 +objlist=$3 + +dir=$(dirname "${tarfile}") +tmpdir=${dir}/.tmp_dir +depfile=${dir}/.$(basename "${tarfile}").d + +# generate dependency list. +{ + echo + echo "deps_${tarfile} := \\" + sed 's:\(.*\): \1 \\:' "${srclist}" + sed -n '/^include\/generated\/autoconf\.h$/!s:\(.*\): \1 \\:p' "${objlist}" + echo + echo "${tarfile}: \$(deps_${tarfile})" + echo + echo "\$(deps_${tarfile}):" + +} > "${depfile}" rm -rf "${tmpdir}" mkdir "${tmpdir}" -if [ "$building_out_of_srctree" ]; then - ( - cd $srctree - for f in $dir_list - do find "$f" -name "*.h"; - done | tar -c -f - -T - | tar -xf - -C "${tmpdir}" - ) -fi - -for f in $dir_list; - do find "$f" -name "*.h"; -done | tar -c -f - -T - | tar -xf - -C "${tmpdir}" - -# Always exclude include/generated/utsversion.h -# Otherwise, the contents of the tarball may vary depending on the build steps. -rm -f "${tmpdir}/include/generated/utsversion.h" +# shellcheck disable=SC2154 # srctree is passed as an env variable +sed "s:^${srctree}/::" "${srclist}" | tar -c -f - -C "${srctree}" -T - | tar -xf - -C "${tmpdir}" +tar -c -f - -T "${objlist}" | tar -xf - -C "${tmpdir}" # Remove comments except SDPX lines # Use a temporary file to store directory contents to prevent find/xargs from @@ -92,8 +46,4 @@ tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ --owner=0 --group=0 --sort=name --numeric-owner --mode=u=rw,go=r,a+X \ -I $XZ -cf $tarfile -C "${tmpdir}/" . 
> /dev/null -echo $headers_md5 > kernel/kheaders.md5 -echo "$this_file_md5" >> kernel/kheaders.md5 -echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 - rm -rf "${tmpdir}" -- cgit v1.2.3 From 1a0faff2833b59a74c8389bcdc390af99dc9d2cf Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 9 Jun 2025 01:59:56 +0900 Subject: kheaders: rebuild kheaders_data.tar.xz when KBUILD_BUILD_TIMESTAMP is changed This problem is similar to commit 7f8256ae0efb ("initramfs: Encode dependency on KBUILD_BUILD_TIMESTAMP"): kernel/gen_kheaders.sh has an internal dependency on KBUILD_BUILD_TIMESTAMP that is not exposed to make, so changing KBUILD_BUILD_TIMESTAMP will not trigger a rebuild of the archive. Move $(KBUILD_BUILD_TIMESTAMP) to the Makefile so that it is recorded in the *.cmd file. Signed-off-by: Masahiro Yamada --- kernel/Makefile | 2 +- kernel/gen_kheaders.sh | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 9a9ff405ea89..c486f17e669a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -162,7 +162,7 @@ $(obj)/config_data: $(KCONFIG_CONFIG) FORCE $(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz quiet_cmd_kheaders_data = GEN $@ - cmd_kheaders_data = "$<" "$@" "$(obj)/kheaders-srclist" "$(obj)/kheaders-objlist" + cmd_kheaders_data = "$<" "$@" "$(obj)/kheaders-srclist" "$(obj)/kheaders-objlist" "$(KBUILD_BUILD_TIMESTAMP)" cmd_kheaders_data_dep = cat $(depfile) >> $(dot-target).cmd; rm -f $(depfile) define rule_kheaders_data diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 0ff7beabb21a..919bdcf989f4 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -7,6 +7,7 @@ set -e tarfile=$1 srclist=$2 objlist=$3 +timestamp=$4 dir=$(dirname "${tarfile}") tmpdir=${dir}/.tmp_dir @@ -42,7 +43,7 @@ xargs -0 -P8 -n1 \ rm -f "${tmpdir}.contents.txt" # Create archive and try to normalize metadata for reproducibility. -tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ +tar "${timestamp:+--mtime=$timestamp}" \ --owner=0 --group=0 --sort=name --numeric-owner --mode=u=rw,go=r,a+X \ -I $XZ -cf $tarfile -C "${tmpdir}/" . > /dev/null -- cgit v1.2.3 From f4363dfc900a7ffda96587d38982a1f3ea3d10bd Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 9 Jun 2025 01:59:57 +0900 Subject: kheaders: double-quote variables to satisfy shellcheck Fix the following: In kernel/gen_kheaders.sh line 48: -I $XZ -cf $tarfile -C "${tmpdir}/" . > /dev/null ^-^ SC2086 (info): Double quote to prevent globbing and word splitting. ^------^ SC2086 (info): Double quote to prevent globbing and word splitting. Signed-off-by: Masahiro Yamada --- kernel/gen_kheaders.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 919bdcf989f4..c64e5a00a3d9 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -45,6 +45,6 @@ rm -f "${tmpdir}.contents.txt" # Create archive and try to normalize metadata for reproducibility. tar "${timestamp:+--mtime=$timestamp}" \ --owner=0 --group=0 --sort=name --numeric-owner --mode=u=rw,go=r,a+X \ - -I $XZ -cf $tarfile -C "${tmpdir}/" . + -I "${XZ}" -cf "${tarfile}" -C "${tmpdir}/" . 
> /dev/null rm -rf "${tmpdir}" -- cgit v1.2.3 From cb444006a625c60e6d4dd3753863c3c74f96aac3 Mon Sep 17 00:00:00 2001 From: David Dai Date: Tue, 24 Jun 2025 15:49:06 -0700 Subject: sched_ext, rcu: Eject BPF scheduler on RCU CPU stall panic For systems that use a sched_ext scheduler and have panic_on_rcu_stall enabled, try kicking out the current scheduler before issuing a panic. While there are numerous reasons for RCU CPU stalls that are not directly attributable to the scheduler, deferring the panic gives sched_ext an opportunity to provide additional debug info when ejecting the current scheduler. Also, handling the event more gracefully allows us to potentially recover the system instead of incurring additional downtime. Suggested-by: Tejun Heo Reviewed-by: Paul E. McKenney Signed-off-by: David Dai Signed-off-by: Tejun Heo --- kernel/rcu/tree_stall.h | 7 +++++++ kernel/sched/ext.c | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 486c00536207..af61b2d0d311 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -134,6 +134,13 @@ static void panic_on_rcu_stall(void) { static int cpu_stall; + /* + * Attempt to kick out the BPF scheduler if it's installed and defer + * the panic to give the system a chance to recover. + */ + if (scx_rcu_cpu_stall()) + return; + if (++cpu_stall < sysctl_max_rcu_stall_to_panic) return; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index bee98fdcdd01..df5b2c952cf7 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4672,6 +4672,41 @@ bool scx_allow_ttwu_queue(const struct task_struct *p) p->sched_class != &ext_sched_class; } +/** + * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler + * + * While there are various reasons why RCU CPU stalls can occur on a system + * that may not be caused by the current BPF scheduler, try kicking out the + * current scheduler in an attempt to recover the system to a good state before + * issuing panics. + */ +bool scx_rcu_cpu_stall(void) +{ + struct scx_sched *sch; + + rcu_read_lock(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) { + rcu_read_unlock(); + return false; + } + + switch (scx_enable_state()) { + case SCX_ENABLING: + case SCX_ENABLED: + break; + default: + rcu_read_unlock(); + return false; + } + + scx_error(sch, "RCU CPU stall detected!"); + rcu_read_unlock(); + + return true; +} + /** * scx_softlockup - sched_ext softlockup handler * @dur_s: number of seconds of CPU stuck due to soft lockup -- cgit v1.2.3 From 7a998a73162773d5d8dca4c9f46af26a499d8f19 Mon Sep 17 00:00:00 2001 From: Harishankar Vishwanathan Date: Mon, 23 Jun 2025 00:03:56 -0400 Subject: bpf, verifier: Improve precision for BPF_ADD and BPF_SUB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch improves the precision of the scalar(32)_min_max_add and scalar(32)_min_max_sub functions, which update the u(32)_min/u(32)_max ranges for the BPF_ADD and BPF_SUB instructions. We discovered this more precise operator using a technique we are developing for automatically synthesizing functions for updating tnums and ranges. According to the BPF ISA [1], "Underflow and overflow are allowed during arithmetic operations, meaning the 64-bit or 32-bit value will wrap". Our patch leverages the wrap-around semantics of unsigned overflow and underflow to improve precision. 
Below is an example of our patch for scalar_min_max_add; the idea is analogous for all four functions. There are three cases to consider when adding two u64 ranges [dst_umin, dst_umax] and [src_umin, src_umax]. Consider a value x in the range [dst_umin, dst_umax] and another value y in the range [src_umin, src_umax]. (a) No overflow: No addition x + y overflows. This occurs when even the largest possible sum does not overflow, i.e., dst_umax + src_umax <= U64_MAX. (b) Partial overflow: Some additions x + y overflow. This occurs when the largest possible sum overflows (dst_umax + src_umax > U64_MAX), but the smallest possible sum does not overflow (dst_umin + src_umin <= U64_MAX). (c) Full overflow: All additions x + y overflow. This occurs when both the smallest possible sum and the largest possible sum overflow, i.e., both (dst_umin + src_umin) and (dst_umax + src_umax) are > U64_MAX. The current implementation conservatively sets the output bounds to unbounded, i.e., [umin=0, umax=U64_MAX], whenever there is *any* possibility of overflow, i.e., in cases (b) and (c). Otherwise it computes tight bounds as [dst_umin + src_umin, dst_umax + src_umax]: if (check_add_overflow(*dst_umin, src_reg->umin_value, dst_umin) || check_add_overflow(*dst_umax, src_reg->umax_value, dst_umax)) { *dst_umin = 0; *dst_umax = U64_MAX; } Our synthesis-based technique discovered a more precise operator. Particularly, in case (c), all possible additions x + y overflow and wrap around according to eBPF semantics, and the computation of the output range as [dst_umin + src_umin, dst_umax + src_umax] continues to work. Only in case (b) do we need to set the output bounds to unbounded, i.e., [0, U64_MAX]. Case (b) can be checked by seeing if the minimum possible sum does *not* overflow and the maximum possible sum *does* overflow, and when that happens, we set the output to unbounded: min_overflow = check_add_overflow(*dst_umin, src_reg->umin_value, dst_umin); max_overflow = check_add_overflow(*dst_umax, src_reg->umax_value, dst_umax); if (!min_overflow && max_overflow) { *dst_umin = 0; *dst_umax = U64_MAX; } Below is an example eBPF program and the corresponding log from the verifier. The current implementation of scalar_min_max_add() sets r3's bounds to [0, U64_MAX] at instruction 5: (0f) r3 += r3, due to conservative overflow handling. 0: R1=ctx() R10=fp0 0: (b7) r4 = 0 ; R4_w=0 1: (87) r4 = -r4 ; R4_w=scalar() 2: (18) r3 = 0xa000000000000000 ; R3_w=0xa000000000000000 4: (4f) r3 |= r4 ; R3_w=scalar(smin=0xa000000000000000,smax=-1,umin=0xa000000000000000,var_off=(0xa000000000000000; 0x5fffffffffffffff)) R4_w=scalar() 5: (0f) r3 += r3 ; R3_w=scalar() 6: (b7) r0 = 1 ; R0_w=1 7: (95) exit With our patch, r3's bounds after instruction 5 are set to a much more precise [0x4000000000000000,0xfffffffffffffffe]. ... 5: (0f) r3 += r3 ; R3_w=scalar(umin=0x4000000000000000,umax=0xfffffffffffffffe) 6: (b7) r0 = 1 ; R0_w=1 7: (95) exit The logic for scalar32_min_max_add is analogous. For the scalar(32)_min_max_sub functions, the reasoning is similar but applied to detecting underflow instead of overflow. We verified the correctness of the new implementations using Agni [3,4]. We since also discovered that a similar technique has been used to calculate output ranges for unsigned interval addition and subtraction in Hacker's Delight [2]. 
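As a hedged user-space sketch of the new u64 addition rule (using the compiler builtin that the kernel's check_add_overflow() wraps), bounds stay tight unless exactly one endpoint wraps:

#include <stdbool.h>
#include <stdint.h>

static void add_u64_bounds(uint64_t *umin, uint64_t *umax,
                           uint64_t src_umin, uint64_t src_umax)
{
        bool min_ovf = __builtin_add_overflow(*umin, src_umin, umin);
        bool max_ovf = __builtin_add_overflow(*umax, src_umax, umax);

        /* case (b): only some sums wrap, so nothing can be concluded */
        if (!min_ovf && max_ovf) {
                *umin = 0;
                *umax = UINT64_MAX;
        }
        /* cases (a) and (c): the wrapped endpoints are already exact */
}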
[1] https://docs.kernel.org/bpf/standardization/instruction-set.html [2] Hacker's Delight Ch.4-2, Propagating Bounds through Add’s and Subtract’s [3] https://github.com/bpfverif/agni [4] https://people.cs.rutgers.edu/~sn349/papers/sas24-preprint.pdf Co-developed-by: Matan Shachnai Signed-off-by: Matan Shachnai Co-developed-by: Srinivas Narayana Signed-off-by: Srinivas Narayana Co-developed-by: Santosh Nagarakatte Signed-off-by: Santosh Nagarakatte Signed-off-by: Harishankar Vishwanathan Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250623040359.343235-2-harishankar.vishwanathan@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 76 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 56 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 279a64933262..f403524bd215 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14605,14 +14605,25 @@ static void scalar32_min_max_add(struct bpf_reg_state *dst_reg, s32 *dst_smax = &dst_reg->s32_max_value; u32 *dst_umin = &dst_reg->u32_min_value; u32 *dst_umax = &dst_reg->u32_max_value; + u32 umin_val = src_reg->u32_min_value; + u32 umax_val = src_reg->u32_max_value; + bool min_overflow, max_overflow; if (check_add_overflow(*dst_smin, src_reg->s32_min_value, dst_smin) || check_add_overflow(*dst_smax, src_reg->s32_max_value, dst_smax)) { *dst_smin = S32_MIN; *dst_smax = S32_MAX; } - if (check_add_overflow(*dst_umin, src_reg->u32_min_value, dst_umin) || - check_add_overflow(*dst_umax, src_reg->u32_max_value, dst_umax)) { + + /* If either all additions overflow or no additions overflow, then + * it is okay to set: dst_umin = dst_umin + src_umin, dst_umax = + * dst_umax + src_umax. Otherwise (some additions overflow), set + * the output bounds to unbounded. + */ + min_overflow = check_add_overflow(*dst_umin, umin_val, dst_umin); + max_overflow = check_add_overflow(*dst_umax, umax_val, dst_umax); + + if (!min_overflow && max_overflow) { *dst_umin = 0; *dst_umax = U32_MAX; } @@ -14625,14 +14636,25 @@ static void scalar_min_max_add(struct bpf_reg_state *dst_reg, s64 *dst_smax = &dst_reg->smax_value; u64 *dst_umin = &dst_reg->umin_value; u64 *dst_umax = &dst_reg->umax_value; + u64 umin_val = src_reg->umin_value; + u64 umax_val = src_reg->umax_value; + bool min_overflow, max_overflow; if (check_add_overflow(*dst_smin, src_reg->smin_value, dst_smin) || check_add_overflow(*dst_smax, src_reg->smax_value, dst_smax)) { *dst_smin = S64_MIN; *dst_smax = S64_MAX; } - if (check_add_overflow(*dst_umin, src_reg->umin_value, dst_umin) || - check_add_overflow(*dst_umax, src_reg->umax_value, dst_umax)) { + + /* If either all additions overflow or no additions overflow, then + * it is okay to set: dst_umin = dst_umin + src_umin, dst_umax = + * dst_umax + src_umax. Otherwise (some additions overflow), set + * the output bounds to unbounded. 
+ */ + min_overflow = check_add_overflow(*dst_umin, umin_val, dst_umin); + max_overflow = check_add_overflow(*dst_umax, umax_val, dst_umax); + + if (!min_overflow && max_overflow) { *dst_umin = 0; *dst_umax = U64_MAX; } @@ -14643,8 +14665,11 @@ static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg, { s32 *dst_smin = &dst_reg->s32_min_value; s32 *dst_smax = &dst_reg->s32_max_value; + u32 *dst_umin = &dst_reg->u32_min_value; + u32 *dst_umax = &dst_reg->u32_max_value; u32 umin_val = src_reg->u32_min_value; u32 umax_val = src_reg->u32_max_value; + bool min_underflow, max_underflow; if (check_sub_overflow(*dst_smin, src_reg->s32_max_value, dst_smin) || check_sub_overflow(*dst_smax, src_reg->s32_min_value, dst_smax)) { @@ -14652,14 +14677,18 @@ static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg, *dst_smin = S32_MIN; *dst_smax = S32_MAX; } - if (dst_reg->u32_min_value < umax_val) { - /* Overflow possible, we know nothing */ - dst_reg->u32_min_value = 0; - dst_reg->u32_max_value = U32_MAX; - } else { - /* Cannot overflow (as long as bounds are consistent) */ - dst_reg->u32_min_value -= umax_val; - dst_reg->u32_max_value -= umin_val; + + /* If either all subtractions underflow or no subtractions + * underflow, it is okay to set: dst_umin = dst_umin - src_umax, + * dst_umax = dst_umax - src_umin. Otherwise (some subtractions + * underflow), set the output bounds to unbounded. + */ + min_underflow = check_sub_overflow(*dst_umin, umax_val, dst_umin); + max_underflow = check_sub_overflow(*dst_umax, umin_val, dst_umax); + + if (min_underflow && !max_underflow) { + *dst_umin = 0; + *dst_umax = U32_MAX; } } @@ -14668,8 +14697,11 @@ static void scalar_min_max_sub(struct bpf_reg_state *dst_reg, { s64 *dst_smin = &dst_reg->smin_value; s64 *dst_smax = &dst_reg->smax_value; + u64 *dst_umin = &dst_reg->umin_value; + u64 *dst_umax = &dst_reg->umax_value; u64 umin_val = src_reg->umin_value; u64 umax_val = src_reg->umax_value; + bool min_underflow, max_underflow; if (check_sub_overflow(*dst_smin, src_reg->smax_value, dst_smin) || check_sub_overflow(*dst_smax, src_reg->smin_value, dst_smax)) { @@ -14677,14 +14709,18 @@ static void scalar_min_max_sub(struct bpf_reg_state *dst_reg, *dst_smin = S64_MIN; *dst_smax = S64_MAX; } - if (dst_reg->umin_value < umax_val) { - /* Overflow possible, we know nothing */ - dst_reg->umin_value = 0; - dst_reg->umax_value = U64_MAX; - } else { - /* Cannot overflow (as long as bounds are consistent) */ - dst_reg->umin_value -= umax_val; - dst_reg->umax_value -= umin_val; + + /* If either all subtractions underflow or no subtractions + * underflow, it is okay to set: dst_umin = dst_umin - src_umax, + * dst_umax = dst_umax - src_umin. Otherwise (some subtractions + * underflow), set the output bounds to unbounded. + */ + min_underflow = check_sub_overflow(*dst_umin, umax_val, dst_umin); + max_underflow = check_sub_overflow(*dst_umax, umin_val, dst_umax); + + if (min_underflow && !max_underflow) { + *dst_umin = 0; + *dst_umax = U64_MAX; } } -- cgit v1.2.3 From f2e555fc04ba238cd1dd21040325fa0629ee433d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Apr 2025 15:43:00 +0200 Subject: rcu/exp: Protect against early QS report When a grace period is started, the ->expmask of each node is set up from sync_exp_reset_tree(). Then later on, each leaf node also initializes its ->exp_tasks pointer. This means that the initialization of the quiescent state of a node and the initialization of its blocking tasks happen with an unlocked node gap in-between. 
It happens to be fine because nothing is expected to report an exp quiescent state within this gap, since no IPIs have been issued yet and every rdp's ->cpu_no_qs.b.exp should be false. However, if it were to happen by accident, the quiescent state could be reported and propagated while ignoring tasks that blocked _before_ the start of the grace period. Prevent such trouble from happening in the future and initialize both the quiescent states mask to report and the blocked tasks head within the same node-locked block. If a task blocks within an RCU read side critical section before sync_exp_reset_tree() is called and is then unblocked between sync_exp_reset_tree() and __sync_rcu_exp_select_node_cpus(), the QS won't be reported because no RCU exp IPI had been issued to request it through the setting of srdp->cpu_no_qs.b.exp. Reviewed-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Signed-off-by: Joel Fernandes Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tree_exp.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index c36c7d5575ca..2fa7aa9155bd 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -141,6 +141,13 @@ static void __maybe_unused sync_exp_reset_tree(void) raw_spin_lock_irqsave_rcu_node(rnp, flags); WARN_ON_ONCE(rnp->expmask); WRITE_ONCE(rnp->expmask, rnp->expmaskinit); + /* + * Need to wait for any blocked tasks as well. Note that + * additional blocking tasks will also block the expedited GP + * until such time as the ->expmask bits are cleared. + */ + if (rcu_is_leaf_node(rnp) && rcu_preempt_has_tasks(rnp)) + WRITE_ONCE(rnp->exp_tasks, rnp->blkd_tasks.next); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -393,13 +400,6 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp) } mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; - /* - * Need to wait for any blocked tasks as well. Note that - * additional blocking tasks will also block the expedited GP - * until such time as the ->expmask bits are cleared. - */ - if (rcu_preempt_has_tasks(rnp)) - WRITE_ONCE(rnp->exp_tasks, rnp->blkd_tasks.next); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. */ -- cgit v1.2.3 From 3dfdfaff2d49aa7895ced8a54e7d61755eac6830 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Apr 2025 12:08:40 +0200 Subject: rcu: Robustify rcu_is_cpu_rrupt_from_idle() RCU relies on the context tracking nesting counter in order to determine if it is running in an extended quiescent state. However, the context tracking nesting counter is not completely synchronized with the actual context tracking state: * The nesting counter is set to 1 or incremented further _after_ the actual state is set to RCU watching. * The nesting counter is set to 0 or decremented further _before_ the actual state is set to RCU not watching. Therefore it is safe to assume that if ct_nesting() > 0, RCU is watching. But if ct_nesting() <= 0, RCU is not watching except for tiny windows. This hasn't been a problem so far because rcu_is_cpu_rrupt_from_idle() has only been called from interrupts. However, the code is confusing and abuses the role of the context tracking nesting counter while there are more accurate indicators available. Clarify and robustify accordingly. 
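A hedged user-space restatement of the decision ladder the patch below introduces, where nmi_nesting stands in for ct_nmi_nesting() and rcu_watching for rcu_is_watching_curr_cpu() (illustration only, not kernel code):

#include <stdbool.h>

static bool irq_from_idle_sketch(long nmi_nesting, bool rcu_watching)
{
        if (nmi_nesting > 1)    /* non-idle or nested idle interrupt */
                return false;
        if (nmi_nesting == 1)   /* first interrupt over a non-watching section */
                return true;
        if (nmi_nesting == 0)   /* not in an interrupt: ask the real RCU state */
                return !rcu_watching;
        return false;           /* negative: counter underflow, warned about */
}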
Signed-off-by: Frederic Weisbecker Signed-off-by: Joel Fernandes Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tree.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 14d4499c6fc3..f83bbb408895 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -377,7 +377,7 @@ EXPORT_SYMBOL_GPL(rcu_momentary_eqs); */ static int rcu_is_cpu_rrupt_from_idle(void) { - long nesting; + long nmi_nesting = ct_nmi_nesting(); /* * Usually called from the tick; but also used from smp_function_call() @@ -389,21 +389,28 @@ static int rcu_is_cpu_rrupt_from_idle(void) /* Check for counter underflows */ RCU_LOCKDEP_WARN(ct_nesting() < 0, "RCU nesting counter underflow!"); - RCU_LOCKDEP_WARN(ct_nmi_nesting() <= 0, - "RCU nmi_nesting counter underflow/zero!"); - /* Are we at first interrupt nesting level? */ - nesting = ct_nmi_nesting(); - if (nesting > 1) + /* Non-idle interrupt or nested idle interrupt */ + if (nmi_nesting > 1) return false; /* - * If we're not in an interrupt, we must be in the idle task! + * Non nested idle interrupt (interrupting section where RCU + * wasn't watching). */ - WARN_ON_ONCE(!nesting && !is_idle_task(current)); + if (nmi_nesting == 1) + return true; + + /* Not in an interrupt */ + if (!nmi_nesting) { + RCU_LOCKDEP_WARN(!in_task() || !is_idle_task(current), + "RCU nmi_nesting counter not in idle task!"); + return !rcu_is_watching_curr_cpu(); + } - /* Does CPU appear to be idle from an RCU standpoint? */ - return ct_nesting() == 0; + RCU_LOCKDEP_WARN(1, "RCU nmi_nesting counter underflow/zero!"); + + return false; } #define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10) -- cgit v1.2.3 From 9ea40db969115022335a553b4f6fe731dcd90c2a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 8 May 2025 16:45:02 -0700 Subject: rcutorture: Print only one rtort_pipe_count splat The rcu_torture_writer() function scans the memory blocks after a stutter (or forced idle) interval, complaining about any that have not passed through ten grace periods since the start of the stutter interval. But one splat suffices, so this commit therefore stops at the first splat. Signed-off-by: Paul E. McKenney Signed-off-by: Joel Fernandes Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/rcutorture.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 70ec0f21abc3..d1e0d61d8815 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1722,6 +1722,7 @@ rcu_torture_writer(void *arg) cur_ops->gp_kthread_dbg(); WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); rcu_ftrace_dump(DUMP_ALL); + break; } if (stutter_waited) sched_set_normal(current, oldnice); -- cgit v1.2.3 From 635bdb9d22796fc6ba1958ec73ab0a0a693c58d6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 8 May 2025 16:45:01 -0700 Subject: rcutorture: Start rcu_torture_writer() after rcu_torture_reader() Testing of rcutorture's SRCU-P scenario on a large arm64 system resulted in rcu_torture_writer() forward-progress failures, but these same tests passed on x86. After some off-list discussion of possible memory-ordering causes for these failures, Boqun showed that these were in fact due to reordering, but by the scheduler, not by the memory system. 
On x86, rcu_torture_writer() would have run quickly enough that by the time the rcu_torture_updown() kthread started, the rcu_torture_current variable would already be initialized, thus avoiding a bug in which a NULL value would cause rcu_torture_updown() to do an extra call to srcu_up_read_fast(). This commit therefore moves creation of the rcu_torture_writer() kthread after that of the rcu_torture_reader() kthreads. This results in deterministic failures on x86. What about the double-srcu_up_read_fast() bug? Boqun has the fix. But let's also fix the test while we are at it! Reported-by: Joel Fernandes Reported-by: Boqun Feng Signed-off-by: Paul E. McKenney Signed-off-by: Joel Fernandes Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/rcutorture.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d1e0d61d8815..a209d2419cfd 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -4246,11 +4246,6 @@ rcu_torture_init(void) /* Start up the kthreads. */ rcu_torture_write_types(); - firsterr = torture_create_kthread(rcu_torture_writer, NULL, - writer_task); - if (torture_init_error(firsterr)) - goto unwind; - if (nrealfakewriters > 0) { fakewriter_tasks = kcalloc(nrealfakewriters, sizeof(fakewriter_tasks[0]), @@ -4283,6 +4278,12 @@ rcu_torture_init(void) if (torture_init_error(firsterr)) goto unwind; } + + firsterr = torture_create_kthread(rcu_torture_writer, NULL, + writer_task); + if (torture_init_error(firsterr)) + goto unwind; + nrealnocbers = nocbs_nthreads; if (WARN_ON(nrealnocbers < 0)) nrealnocbers = 1; -- cgit v1.2.3 From eec1f94cf77f8a7f7078365f5029a56b0ce16580 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 Jun 2025 12:42:14 -0700 Subject: rcutorture: Make rcutorture_one_extend_check() account for hard IRQs This commit retrospectively prepares for testing of RCU readers invoked from hardware interrupt handlers (for example, HRTIMER_MODE_HARD hrtimer handlers) in kernels built with CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE=y, which is rarely used but sometimes extremely useful. This preparation involves taking early exits if in_hardirq(), and, while we are in the area, a very early exit if in_nmi(). This means that a number of insoftirq parameters are no longer needed, but that is the subject of a later commit. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202505140917.8ee62cc6-lkp@intel.com Signed-off-by: Paul E. McKenney Tested-by: kernel test robot Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/rcutorture.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index a209d2419cfd..7944cc69321d 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1920,10 +1920,10 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, { int mask; - if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE)) + if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE) || in_nmi()) return; - WARN_ONCE(!(curstate & RCUTORTURE_RDR_IRQ) && irqs_disabled(), ROEC_ARGS); + WARN_ONCE(!(curstate & RCUTORTURE_RDR_IRQ) && irqs_disabled() && !in_hardirq(), ROEC_ARGS); WARN_ONCE((curstate & RCUTORTURE_RDR_IRQ) && !irqs_disabled(), ROEC_ARGS); // If CONFIG_PREEMPT_COUNT=n, further checks are unreliable. 
@@ -1938,9 +1938,9 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, (curstate & (RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2)) && cur_ops->readlock_nesting() == 0, ROEC_ARGS); - // Timer handlers have all sorts of stuff disabled, so ignore + // Interrupt handlers have all sorts of stuff disabled, so ignore // unintended disabling. - if (insoftirq) + if (in_serving_softirq() || in_hardirq()) return; WARN_ONCE(cur_ops->extendables && -- cgit v1.2.3 From 1b67e031d478bf386282c7c5ff3f19cd9c1e5a39 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 7 Feb 2025 11:37:55 -0800 Subject: rcutorture: Add tests for SRCU up/down reader primitives This commit adds a new rcutorture.n_up_down kernel boot parameter that specifies the number of outstanding SRCU up/down readers, which begin in kthread context and end in an hrtimer handler. There is a new kthread ("rcu_torture_updown") that scans an per-reader array looking for elements whose readers have ended. This kthread sleeps between one and two milliseconds between consecutive scans. [ paulmck: Apply kernel test robot feedback. ] [ paulmck: Apply Z qiang feedback. ] [ joel: Fix build error: hrtimer_init is replaced by hrtimer_setup. ] [ joel: Apply Boqun bug fix to drop extra up_read() call in rcu_torture_updown()]. Signed-off-by: Paul E. McKenney Signed-off-by: Joel Fernandes Tested-by: kernel test robot Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/rcutorture.c | 225 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 206 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7944cc69321d..bf87c0c4e77e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -55,22 +55,24 @@ MODULE_DESCRIPTION("Read-Copy Update module-based torture test facility"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); -/* Bits for ->extendables field, extendables param, and related definitions. */ -#define RCUTORTURE_RDR_SHIFT_1 8 /* Put SRCU index in upper bits. */ -#define RCUTORTURE_RDR_MASK_1 (0xff << RCUTORTURE_RDR_SHIFT_1) -#define RCUTORTURE_RDR_SHIFT_2 16 /* Put SRCU index in upper bits. */ -#define RCUTORTURE_RDR_MASK_2 (0xff << RCUTORTURE_RDR_SHIFT_2) -#define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */ -#define RCUTORTURE_RDR_IRQ 0x02 /* ... disabling interrupts. */ -#define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */ -#define RCUTORTURE_RDR_RBH 0x08 /* ... rcu_read_lock_bh(). */ -#define RCUTORTURE_RDR_SCHED 0x10 /* ... rcu_read_lock_sched(). */ -#define RCUTORTURE_RDR_RCU_1 0x20 /* ... entering another RCU reader. */ -#define RCUTORTURE_RDR_RCU_2 0x40 /* ... entering another RCU reader. */ -#define RCUTORTURE_RDR_NBITS 7 /* Number of bits defined above. */ -#define RCUTORTURE_MAX_EXTEND \ +// Bits for ->extendables field, extendables param, and related definitions. +#define RCUTORTURE_RDR_SHIFT_1 8 // Put SRCU index in upper bits. +#define RCUTORTURE_RDR_MASK_1 (0xff << RCUTORTURE_RDR_SHIFT_1) +#define RCUTORTURE_RDR_SHIFT_2 16 // Put SRCU index in upper bits. +#define RCUTORTURE_RDR_MASK_2 (0xff << RCUTORTURE_RDR_SHIFT_2) +#define RCUTORTURE_RDR_BH 0x01 // Extend readers by disabling bh. +#define RCUTORTURE_RDR_IRQ 0x02 // ... disabling interrupts. +#define RCUTORTURE_RDR_PREEMPT 0x04 // ... disabling preemption. +#define RCUTORTURE_RDR_RBH 0x08 // ... rcu_read_lock_bh(). +#define RCUTORTURE_RDR_SCHED 0x10 // ... rcu_read_lock_sched(). 
+#define RCUTORTURE_RDR_RCU_1 0x20 // ... entering another RCU reader. +#define RCUTORTURE_RDR_RCU_2 0x40 // ... entering another RCU reader. +#define RCUTORTURE_RDR_UPDOWN 0x80 // ... up-read from task, down-read from timer. + // Note: Manual start, automatic end. +#define RCUTORTURE_RDR_NBITS 8 // Number of bits defined above. +#define RCUTORTURE_MAX_EXTEND \ (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \ - RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED) + RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED) // Intentionally omit RCUTORTURE_RDR_UPDOWN. #define RCUTORTURE_RDR_ALLBITS \ (RCUTORTURE_MAX_EXTEND | RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2 | \ RCUTORTURE_RDR_MASK_1 | RCUTORTURE_RDR_MASK_2) @@ -110,6 +112,7 @@ torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives"); torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers"); torture_param(int, n_barrier_cbs, 0, "# of callbacks/kthreads for barrier testing"); +torture_param(int, n_up_down, 32, "# of concurrent up/down hrtimer-based RCU readers"); torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, object_debug, 0, "Enable debug-object double call_rcu() testing"); @@ -156,6 +159,7 @@ static int nrealfakewriters; static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; +static struct task_struct *updown_task; static struct task_struct **nocb_tasks; static struct task_struct *stats_task; static struct task_struct *fqs_task; @@ -378,6 +382,8 @@ struct rcu_torture_ops { void (*readunlock)(int idx); int (*readlock_held)(void); // lockdep. int (*readlock_nesting)(void); // actual nesting, if available, -1 if not. 
+ int (*down_read)(void); + void (*up_read)(int idx); unsigned long (*get_gp_seq)(void); unsigned long (*gp_diff)(unsigned long new, unsigned long old); void (*deferred_free)(struct rcu_torture *p); @@ -427,6 +433,7 @@ struct rcu_torture_ops { int no_pi_lock; int debug_objects; int start_poll_irqsoff; + int have_up_down; const char *name; }; @@ -762,6 +769,50 @@ static int torture_srcu_read_lock_held(void) return srcu_read_lock_held(srcu_ctlp); } +static bool srcu_torture_have_up_down(void) +{ + int rf = reader_flavor; + + if (!rf) + rf = SRCU_READ_FLAVOR_NORMAL; + return !!(cur_ops->have_up_down & rf); +} + +static int srcu_torture_down_read(void) +{ + int idx; + struct srcu_ctr __percpu *scp; + + WARN_ON_ONCE(reader_flavor & ~SRCU_READ_FLAVOR_ALL); + WARN_ON_ONCE(reader_flavor & (reader_flavor - 1)); + + if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) { + idx = srcu_down_read(srcu_ctlp); + WARN_ON_ONCE(idx & ~0x1); + return idx; + } + if (reader_flavor & SRCU_READ_FLAVOR_FAST) { + scp = srcu_down_read_fast(srcu_ctlp); + idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); + WARN_ON_ONCE(idx & ~0x1); + return idx << 3; + } + WARN_ON_ONCE(1); + return 0; +} + +static void srcu_torture_up_read(int idx) +{ + WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); + if (reader_flavor & SRCU_READ_FLAVOR_FAST) + srcu_up_read_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3)); + else if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || + !(reader_flavor & SRCU_READ_FLAVOR_ALL)) + srcu_up_read(srcu_ctlp, idx & 0x1); + else + WARN_ON_ONCE(1); +} + static unsigned long srcu_torture_completed(void) { return srcu_batches_completed(srcu_ctlp); @@ -819,6 +870,8 @@ static struct rcu_torture_ops srcu_ops = { .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, + .down_read = srcu_torture_down_read, + .up_read = srcu_torture_up_read, .readlock_held = torture_srcu_read_lock_held, .get_gp_seq = srcu_torture_completed, .gp_diff = rcu_seq_diff, @@ -839,6 +892,8 @@ static struct rcu_torture_ops srcu_ops = { .irq_capable = 1, .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .debug_objects = 1, + .have_up_down = IS_ENABLED(CONFIG_TINY_SRCU) + ? 0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST, .name = "srcu" }; @@ -864,6 +919,8 @@ static struct rcu_torture_ops srcud_ops = { .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, .readlock_held = torture_srcu_read_lock_held, + .down_read = srcu_torture_down_read, + .up_read = srcu_torture_up_read, .get_gp_seq = srcu_torture_completed, .gp_diff = rcu_seq_diff, .deferred_free = srcu_torture_deferred_free, @@ -883,6 +940,8 @@ static struct rcu_torture_ops srcud_ops = { .irq_capable = 1, .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .debug_objects = 1, + .have_up_down = IS_ENABLED(CONFIG_TINY_SRCU) + ? 
0 : SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_FAST, .name = "srcud" }; @@ -1994,7 +2053,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, first = idxold1 == 0; WARN_ON_ONCE(idxold2 < 0); - WARN_ON_ONCE(idxold2 & ~RCUTORTURE_RDR_ALLBITS); + WARN_ON_ONCE(idxold2 & ~(RCUTORTURE_RDR_ALLBITS | RCUTORTURE_RDR_UPDOWN)); rcutorture_one_extend_check("before change", idxold1, statesnew, statesold, insoftirq); rtrsp->rt_readstate = newstate; @@ -2070,6 +2129,11 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, if (lockit) raw_spin_unlock_irqrestore(¤t->pi_lock, flags); } + if (statesold & RCUTORTURE_RDR_UPDOWN) { + cur_ops->up_read((idxold1 & RCUTORTURE_RDR_MASK_1) >> RCUTORTURE_RDR_SHIFT_1); + WARN_ON_ONCE(idxnew1 != -1); + idxold1 = 0; + } /* Delay if neither beginning nor end and there was a change. */ if ((statesnew || statesold) && *readstate && newstate) @@ -2210,7 +2274,8 @@ static bool rcu_torture_one_read_start(struct rcu_torture_one_read_state *rtorsp rtorsp->started = cur_ops->get_gp_seq(); rtorsp->ts = rcu_trace_clock_local(); rtorsp->p = rcu_dereference_check(rcu_torture_current, - !cur_ops->readlock_held || cur_ops->readlock_held()); + !cur_ops->readlock_held || cur_ops->readlock_held() || + (rtorsp->readstate & RCUTORTURE_RDR_UPDOWN)); if (rtorsp->p == NULL) { /* Wait for rcu_torture_writer to get underway */ rcutorture_one_extend(&rtorsp->readstate, 0, myid < 0, trsp, rtorsp->rtrsp); @@ -2379,6 +2444,121 @@ rcu_torture_reader(void *arg) return 0; } +struct rcu_torture_one_read_state_updown { + struct hrtimer rtorsu_hrt; + bool rtorsu_inuse; + struct torture_random_state rtorsu_trs; + struct rcu_torture_one_read_state rtorsu_rtors; +}; + +static struct rcu_torture_one_read_state_updown *updownreaders; +static DEFINE_TORTURE_RANDOM(rcu_torture_updown_rand); +static int rcu_torture_updown(void *arg); + +static enum hrtimer_restart rcu_torture_updown_hrt(struct hrtimer *hrtp) +{ + struct rcu_torture_one_read_state_updown *rtorsup; + + rtorsup = container_of(hrtp, struct rcu_torture_one_read_state_updown, rtorsu_hrt); + rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs, -1); + smp_store_release(&rtorsup->rtorsu_inuse, false); + return HRTIMER_NORESTART; +} + +static int rcu_torture_updown_init(void) +{ + int i; + struct torture_random_state *rand = &rcu_torture_updown_rand; + int ret; + + if (n_up_down < 0) + return 0; + if (!srcu_torture_have_up_down()) { + VERBOSE_TOROUT_STRING("rcu_torture_updown_init: Disabling up/down reader tests due to lack of primitives"); + return 0; + } + updownreaders = kcalloc(n_up_down, sizeof(*updownreaders), GFP_KERNEL); + if (!updownreaders) { + VERBOSE_TOROUT_STRING("rcu_torture_updown_init: Out of memory, disabling up/down reader tests"); + return -ENOMEM; + } + for (i = 0; i < n_up_down; i++) { + init_rcu_torture_one_read_state(&updownreaders[i].rtorsu_rtors, rand); + hrtimer_setup(&updownreaders[i].rtorsu_hrt, rcu_torture_updown_hrt, CLOCK_MONOTONIC, + HRTIMER_MODE_REL | HRTIMER_MODE_SOFT); + torture_random_init(&updownreaders[i].rtorsu_trs); + init_rcu_torture_one_read_state(&updownreaders[i].rtorsu_rtors, + &updownreaders[i].rtorsu_trs); + } + ret = torture_create_kthread(rcu_torture_updown, rand, updown_task); + if (ret) { + kfree(updownreaders); + updownreaders = NULL; + } + return ret; +} + +static void rcu_torture_updown_cleanup(void) +{ + struct rcu_torture_one_read_state_updown *rtorsup; + + for (rtorsup = updownreaders; rtorsup < &updownreaders[n_up_down]; 
rtorsup++) { + if (!smp_load_acquire(&rtorsup->rtorsu_inuse)) + continue; + if (!hrtimer_cancel(&rtorsup->rtorsu_hrt)) + WARN_ON_ONCE(rtorsup->rtorsu_inuse); + + } + kfree(updownreaders); + updownreaders = NULL; +} + +/* + * RCU torture up/down reader kthread, starting RCU readers in kthread + * context and ending them in hrtimer handlers. Otherwise similar to + * rcu_torture_reader(). + */ +static int +rcu_torture_updown(void *arg) +{ + int idx; + int rawidx; + struct rcu_torture_one_read_state_updown *rtorsup; + ktime_t t; + + VERBOSE_TOROUT_STRING("rcu_torture_updown task started"); + do { + for (rtorsup = updownreaders; rtorsup < &updownreaders[n_up_down]; rtorsup++) { + if (torture_must_stop()) + break; + if (smp_load_acquire(&rtorsup->rtorsu_inuse)) + continue; + init_rcu_torture_one_read_state(&rtorsup->rtorsu_rtors, + &rtorsup->rtorsu_trs); + rawidx = cur_ops->down_read(); + idx = (rawidx << RCUTORTURE_RDR_SHIFT_1) & RCUTORTURE_RDR_MASK_1; + rtorsup->rtorsu_rtors.readstate = idx | RCUTORTURE_RDR_UPDOWN; + rtorsup->rtorsu_rtors.rtrsp++; + if (!rcu_torture_one_read_start(&rtorsup->rtorsu_rtors, + &rtorsup->rtorsu_trs, -1)) { + schedule_timeout_idle(HZ); + continue; + } + smp_store_release(&rtorsup->rtorsu_inuse, true); + t = torture_random(&rtorsup->rtorsu_trs) & 0xfffff; // One per million. + if (t < 10 * 1000) + t = 200 * 1000 * 1000; + hrtimer_start(&rtorsup->rtorsu_hrt, t, + HRTIMER_MODE_REL | HRTIMER_MODE_SOFT); + } + torture_hrtimeout_ms(1, 1000, &rcu_torture_updown_rand); + stutter_wait("rcu_torture_updown"); + } while (!torture_must_stop()); + rcu_torture_updown_cleanup(); + torture_kthread_stopping("rcu_torture_updown"); + return 0; +} + /* * Randomly Toggle CPUs' callback-offload state. This uses hrtimers to * increase race probabilities and fuzzes the interval between toggling. @@ -2633,7 +2813,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "reader_flavor=%x " "nocbs_nthreads=%d nocbs_toggle=%d " "test_nmis=%d " - "preempt_duration=%d preempt_interval=%d\n", + "preempt_duration=%d preempt_interval=%d n_up_down=%d\n", torture_type, tag, nrealreaders, nrealfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, @@ -2647,7 +2827,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) reader_flavor, nocbs_nthreads, nocbs_toggle, test_nmis, - preempt_duration, preempt_interval); + preempt_duration, preempt_interval, n_up_down); } static int rcutorture_booster_cleanup(unsigned int cpu) @@ -3750,6 +3930,10 @@ rcu_torture_cleanup(void) nocb_tasks = NULL; } + if (updown_task) { + torture_stop_kthread(rcu_torture_updown, updown_task); + updown_task = NULL; + } if (reader_tasks) { for (i = 0; i < nrealreaders; i++) torture_stop_kthread(rcu_torture_reader, @@ -4284,6 +4468,9 @@ rcu_torture_init(void) if (torture_init_error(firsterr)) goto unwind; + firsterr = rcu_torture_updown_init(); + if (torture_init_error(firsterr)) + goto unwind; nrealnocbers = nocbs_nthreads; if (WARN_ON(nrealnocbers < 0)) nrealnocbers = 1; -- cgit v1.2.3 From 065de24265155fe662aa9f69a49e0697a5b3b4b5 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Sun, 9 Feb 2025 04:58:36 -0800 Subject: rcutorture: Pull rcu_torture_updown() loop body into new function This is strictly a code-movement commit, pulling that part of the rcu_torture_updown() function's loop body that processes one rcu_torture_one_read_state_updown structure into a new rcu_torture_updown_one() function. The checks for the end of the torture test and the current structure being in use remain in the rcu_torture_updown() function. Signed-off-by: Paul E. McKenney Signed-off-by: Joel Fernandes Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/rcutorture.c | 44 ++++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index bf87c0c4e77e..13cf0e472868 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2513,6 +2513,29 @@ static void rcu_torture_updown_cleanup(void) updownreaders = NULL; } +// Do one reader for rcu_torture_updown(). +static void rcu_torture_updown_one(struct rcu_torture_one_read_state_updown *rtorsup) +{ + int idx; + int rawidx; + ktime_t t; + + init_rcu_torture_one_read_state(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs); + rawidx = cur_ops->down_read(); + idx = (rawidx << RCUTORTURE_RDR_SHIFT_1) & RCUTORTURE_RDR_MASK_1; + rtorsup->rtorsu_rtors.readstate = idx | RCUTORTURE_RDR_UPDOWN; + rtorsup->rtorsu_rtors.rtrsp++; + if (!rcu_torture_one_read_start(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs, -1)) { + schedule_timeout_idle(HZ); + return; + } + smp_store_release(&rtorsup->rtorsu_inuse, true); + t = torture_random(&rtorsup->rtorsu_trs) & 0xfffff; // One per million. + if (t < 10 * 1000) + t = 200 * 1000 * 1000; + hrtimer_start(&rtorsup->rtorsu_hrt, t, HRTIMER_MODE_REL | HRTIMER_MODE_SOFT); +} + /* * RCU torture up/down reader kthread, starting RCU readers in kthread * context and ending them in hrtimer handlers. Otherwise similar to @@ -2521,10 +2544,7 @@ static void rcu_torture_updown_cleanup(void) static int rcu_torture_updown(void *arg) { - int idx; - int rawidx; struct rcu_torture_one_read_state_updown *rtorsup; - ktime_t t; VERBOSE_TOROUT_STRING("rcu_torture_updown task started"); do { @@ -2533,23 +2553,7 @@ rcu_torture_updown(void *arg) break; if (smp_load_acquire(&rtorsup->rtorsu_inuse)) continue; - init_rcu_torture_one_read_state(&rtorsup->rtorsu_rtors, - &rtorsup->rtorsu_trs); - rawidx = cur_ops->down_read(); - idx = (rawidx << RCUTORTURE_RDR_SHIFT_1) & RCUTORTURE_RDR_MASK_1; - rtorsup->rtorsu_rtors.readstate = idx | RCUTORTURE_RDR_UPDOWN; - rtorsup->rtorsu_rtors.rtrsp++; - if (!rcu_torture_one_read_start(&rtorsup->rtorsu_rtors, - &rtorsup->rtorsu_trs, -1)) { - schedule_timeout_idle(HZ); - continue; - } - smp_store_release(&rtorsup->rtorsu_inuse, true); - t = torture_random(&rtorsup->rtorsu_trs) & 0xfffff; // One per million. - if (t < 10 * 1000) - t = 200 * 1000 * 1000; - hrtimer_start(&rtorsup->rtorsu_hrt, t, - HRTIMER_MODE_REL | HRTIMER_MODE_SOFT); + rcu_torture_updown_one(rtorsup); } torture_hrtimeout_ms(1, 1000, &rcu_torture_updown_rand); stutter_wait("rcu_torture_updown"); -- cgit v1.2.3 From 62d92c9b87db43cc255411fe2827b9de42fa0523 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 10 Feb 2025 06:59:24 -0800 Subject: rcutorture: Complain if an ->up_read() is delayed more than 10 seconds The down/up SRCU reader testing uses an hrtimer handler to exit the SRCU read-side critical section. 
This might be delayed, and if delayed for too long, it can prevent the rcutorture run from completing. This commit therefore complains if the hrtimer handler is delayed for more than ten seconds. [ paulmck, joel: Apply kernel test robot feedback to avoid false-positive complaint of excessive ->up_read() delays by using HRTIMER_MODE_HARD ] Tested-by: kernel test robot Signed-off-by: Paul E. McKenney Signed-off-by: Joel Fernandes Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/rcutorture.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 13cf0e472868..22288efc8c67 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2447,6 +2447,8 @@ rcu_torture_reader(void *arg) struct rcu_torture_one_read_state_updown { struct hrtimer rtorsu_hrt; bool rtorsu_inuse; + ktime_t rtorsu_kt; + unsigned long rtorsu_j; struct torture_random_state rtorsu_trs; struct rcu_torture_one_read_state rtorsu_rtors; }; @@ -2485,7 +2487,7 @@ static int rcu_torture_updown_init(void) for (i = 0; i < n_up_down; i++) { init_rcu_torture_one_read_state(&updownreaders[i].rtorsu_rtors, rand); hrtimer_setup(&updownreaders[i].rtorsu_hrt, rcu_torture_updown_hrt, CLOCK_MONOTONIC, - HRTIMER_MODE_REL | HRTIMER_MODE_SOFT); + HRTIMER_MODE_REL | HRTIMER_MODE_HARD); torture_random_init(&updownreaders[i].rtorsu_trs); init_rcu_torture_one_read_state(&updownreaders[i].rtorsu_rtors, &updownreaders[i].rtorsu_trs); @@ -2533,7 +2535,10 @@ static void rcu_torture_updown_one(struct rcu_torture_one_read_state_updown *rto t = torture_random(&rtorsup->rtorsu_trs) & 0xfffff; // One per million. if (t < 10 * 1000) t = 200 * 1000 * 1000; - hrtimer_start(&rtorsup->rtorsu_hrt, t, HRTIMER_MODE_REL | HRTIMER_MODE_SOFT); + hrtimer_start(&rtorsup->rtorsu_hrt, t, HRTIMER_MODE_REL | HRTIMER_MODE_HARD); + smp_mb(); // Sample jiffies after posting hrtimer. + rtorsup->rtorsu_j = jiffies; // Not used by hrtimer handler. + rtorsup->rtorsu_kt = t; } /* @@ -2544,6 +2549,7 @@ static void rcu_torture_updown_one(struct rcu_torture_one_read_state_updown *rto static int rcu_torture_updown(void *arg) { + unsigned long j; struct rcu_torture_one_read_state_updown *rtorsup; VERBOSE_TOROUT_STRING("rcu_torture_updown task started"); @@ -2551,8 +2557,12 @@ rcu_torture_updown(void *arg) for (rtorsup = updownreaders; rtorsup < &updownreaders[n_up_down]; rtorsup++) { if (torture_must_stop()) break; - if (smp_load_acquire(&rtorsup->rtorsu_inuse)) + j = smp_load_acquire(&jiffies); // Time before ->rtorsu_inuse. + if (smp_load_acquire(&rtorsup->rtorsu_inuse)) { + WARN_ONCE(time_after(j, rtorsup->rtorsu_j + 1 + HZ * 10), + "hrtimer queued at jiffies %lu for %lld ns took %lu jiffies\n", rtorsup->rtorsu_j, rtorsup->rtorsu_kt, j - rtorsup->rtorsu_j); continue; + } rcu_torture_updown_one(rtorsup); } torture_hrtimeout_ms(1, 1000, &rcu_torture_updown_rand); -- cgit v1.2.3 From 93856948be8f5d784a28cfb2eb16884a5522faaa Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 10 Feb 2025 09:29:35 -0800 Subject: rcutorture: Check for ->up_read() without matching ->down_read() This commit creates counters in the rcu_torture_one_read_state_updown structure that check for a call to ->up_read() that lacks a matching call to ->down_read(). While in the area, add end-of-run cleanup code that prevents calls to rcu_torture_updown_hrt() from happening after the test has moved on. 
Yes, the srcu_barrier() at the end of the test will wait for them, but this could result in confusing states, statistics, and diagnostic information. So explicitly wait for them before we get to the end-of-test output. [ paulmck: Apply kernel test robot feedback. ] [ joel: Apply Boqun's fix for counter increment ordering. ] Signed-off-by: Paul E. McKenney Tested-by: kernel test robot Signed-off-by: Joel Fernandes Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/rcutorture.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 22288efc8c67..f773d4f8f2ae 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2449,6 +2449,8 @@ struct rcu_torture_one_read_state_updown { bool rtorsu_inuse; ktime_t rtorsu_kt; unsigned long rtorsu_j; + unsigned long rtorsu_ndowns; + unsigned long rtorsu_nups; struct torture_random_state rtorsu_trs; struct rcu_torture_one_read_state rtorsu_rtors; }; @@ -2463,6 +2465,8 @@ static enum hrtimer_restart rcu_torture_updown_hrt(struct hrtimer *hrtp) rtorsup = container_of(hrtp, struct rcu_torture_one_read_state_updown, rtorsu_hrt); rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs, -1); + WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders); + rtorsup->rtorsu_nups++; smp_store_release(&rtorsup->rtorsu_inuse, false); return HRTIMER_NORESTART; } @@ -2507,8 +2511,12 @@ static void rcu_torture_updown_cleanup(void) for (rtorsup = updownreaders; rtorsup < &updownreaders[n_up_down]; rtorsup++) { if (!smp_load_acquire(&rtorsup->rtorsu_inuse)) continue; - if (!hrtimer_cancel(&rtorsup->rtorsu_hrt)) - WARN_ON_ONCE(rtorsup->rtorsu_inuse); + if (hrtimer_cancel(&rtorsup->rtorsu_hrt) || WARN_ON_ONCE(rtorsup->rtorsu_inuse)) { + rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs, -1); + WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders); + rtorsup->rtorsu_nups++; + smp_store_release(&rtorsup->rtorsu_inuse, false); + } } kfree(updownreaders); @@ -2524,10 +2532,13 @@ static void rcu_torture_updown_one(struct rcu_torture_one_read_state_updown *rto init_rcu_torture_one_read_state(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs); rawidx = cur_ops->down_read(); + rtorsup->rtorsu_ndowns++; idx = (rawidx << RCUTORTURE_RDR_SHIFT_1) & RCUTORTURE_RDR_MASK_1; rtorsup->rtorsu_rtors.readstate = idx | RCUTORTURE_RDR_UPDOWN; rtorsup->rtorsu_rtors.rtrsp++; if (!rcu_torture_one_read_start(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs, -1)) { + WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders); + rtorsup->rtorsu_nups++; schedule_timeout_idle(HZ); return; } -- cgit v1.2.3 From f6c8785f50443db4c70faebfc22e59e8064c35a5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 8 May 2025 16:45:00 -0700 Subject: rcutorture: Check for no up/down readers at task level The design of testing of up/down readers such as srcu_down_read() and srcu_up_read() assumes that these are tested only by the rcu_torture_updown() kthread, and never by the rcu_torture_reader() kthread. Because we all know which road is paved with good intentions, this commit adds WARN_ON_ONCE() to verify that things are going to plan. Signed-off-by: Paul E. 
McKenney Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/rcutorture.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f773d4f8f2ae..10f3cc4861ee 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2232,6 +2232,7 @@ rcutorture_loop_extend(int *readstate, bool insoftirq, struct torture_random_sta i = ((i | (i >> 3)) & RCUTORTURE_RDR_MAX_LOOPS) + 1; for (j = 0; j < i; j++) { mask = rcutorture_extend_mask(*readstate, trsp); + WARN_ON_ONCE(mask & RCUTORTURE_RDR_UPDOWN); rcutorture_one_extend(readstate, mask, insoftirq, trsp, &rtrsp[j]); } return &rtrsp[j]; @@ -2368,6 +2369,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) WARN_ON_ONCE(!rcu_is_watching()); init_rcu_torture_one_read_state(&rtors, trsp); newstate = rcutorture_extend_mask(rtors.readstate, trsp); + WARN_ON_ONCE(newstate & RCUTORTURE_RDR_UPDOWN); rcutorture_one_extend(&rtors.readstate, newstate, myid < 0, trsp, rtors.rtrsp++); if (!rcu_torture_one_read_start(&rtors, trsp, myid)) { rcutorture_one_extend(&rtors.readstate, 0, myid < 0, trsp, rtors.rtrsp); -- cgit v1.2.3 From cacba0bf6d9f7568e115f3ffe32a95c33f5fd885 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Apr 2025 11:22:49 -0700 Subject: rcutorture: Print number of RCU up/down readers and migrations This commit prints the number of RCU up/down readers and the number of such readers that migrated from one CPU to another, along with the rest of the periodic rcu_torture_stats_print() output. These statistics are currently used only by srcu_down_read{,_fast}() and srcu_up_read{,_fast}(). Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/rcutorture.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 10f3cc4861ee..3f644b1f5823 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2450,9 +2450,11 @@ struct rcu_torture_one_read_state_updown { struct hrtimer rtorsu_hrt; bool rtorsu_inuse; ktime_t rtorsu_kt; + int rtorsu_cpu; unsigned long rtorsu_j; unsigned long rtorsu_ndowns; unsigned long rtorsu_nups; + unsigned long rtorsu_nmigrates; struct torture_random_state rtorsu_trs; struct rcu_torture_one_read_state rtorsu_rtors; }; @@ -2463,12 +2465,15 @@ static int rcu_torture_updown(void *arg); static enum hrtimer_restart rcu_torture_updown_hrt(struct hrtimer *hrtp) { + int cpu = raw_smp_processor_id(); struct rcu_torture_one_read_state_updown *rtorsup; rtorsup = container_of(hrtp, struct rcu_torture_one_read_state_updown, rtorsu_hrt); rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs, -1); WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders); - rtorsup->rtorsu_nups++; + WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1); + WRITE_ONCE(rtorsup->rtorsu_nmigrates, + rtorsup->rtorsu_nmigrates + (cpu != rtorsup->rtorsu_cpu)); smp_store_release(&rtorsup->rtorsu_inuse, false); return HRTIMER_NORESTART; } @@ -2516,7 +2521,7 @@ static void rcu_torture_updown_cleanup(void) if (hrtimer_cancel(&rtorsup->rtorsu_hrt) || WARN_ON_ONCE(rtorsup->rtorsu_inuse)) { rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs, -1); WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders); -
rtorsup->rtorsu_nups++; + WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1); smp_store_release(&rtorsup->rtorsu_inuse, false); } @@ -2534,13 +2539,14 @@ static void rcu_torture_updown_one(struct rcu_torture_one_read_state_updown *rto init_rcu_torture_one_read_state(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs); rawidx = cur_ops->down_read(); - rtorsup->rtorsu_ndowns++; + WRITE_ONCE(rtorsup->rtorsu_ndowns, rtorsup->rtorsu_ndowns + 1); idx = (rawidx << RCUTORTURE_RDR_SHIFT_1) & RCUTORTURE_RDR_MASK_1; rtorsup->rtorsu_rtors.readstate = idx | RCUTORTURE_RDR_UPDOWN; rtorsup->rtorsu_rtors.rtrsp++; + rtorsup->rtorsu_cpu = raw_smp_processor_id(); if (!rcu_torture_one_read_start(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs, -1)) { WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders); - rtorsup->rtorsu_nups++; + WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1); schedule_timeout_idle(HZ); return; } @@ -2649,6 +2655,10 @@ rcu_torture_stats_print(void) long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; long n_gpwraps = 0; + unsigned long ndowns = 0; + unsigned long nunexpired = 0; + unsigned long nmigrates = 0; + unsigned long nups = 0; struct rcu_torture *rtcp; static unsigned long rtcv_snap = ULONG_MAX; static bool splatted; @@ -2662,10 +2672,18 @@ rcu_torture_stats_print(void) if (cur_ops->get_gpwrap_count) n_gpwraps += cur_ops->get_gpwrap_count(cpu); } + if (updownreaders) { + for (i = 0; i < n_up_down; i++) { + ndowns += READ_ONCE(updownreaders[i].rtorsu_ndowns); + nups += READ_ONCE(updownreaders[i].rtorsu_nups); + nunexpired += READ_ONCE(updownreaders[i].rtorsu_inuse); + nmigrates += READ_ONCE(updownreaders[i].rtorsu_nmigrates); + } + } for (i = RCU_TORTURE_PIPE_LEN; i >= 0; i--) { if (pipesummary[i] != 0) break; - } + } // The value of variable "i" is used later, so don't clobber it! pr_alert("%s%s ", torture_type, TORTURE_FLAG); rtcp = rcu_access_pointer(rcu_torture_current); @@ -2686,6 +2704,8 @@ rcu_torture_stats_print(void) n_rcu_torture_boost_failure, n_rcu_torture_boosts, atomic_long_read(&n_rcu_torture_timers)); + if (updownreaders) + pr_cont("ndowns: %lu nups: %lu nhrt: %lu nmigrates: %lu ", ndowns, nups, nunexpired, nmigrates); torture_onoff_stats(); pr_cont("barrier: %ld/%ld:%ld ", data_race(n_barrier_successes), -- cgit v1.2.3 From f32367d96eba8fc50a0bbd2a792df3199089d13b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 14 May 2025 21:09:05 -0700 Subject: rcutorture: Drop redundant "insoftirq" parameters Given that the rcutorture_one_extend_check() function now uses in_serving_softirq() and in_hardirq(), it is no longer necessary to pass insoftirq flags down the function-call stack. This commit therefore removes those flags, and, while in the area, does a bit of whitespace cleanup. Signed-off-by: Paul E. McKenney Tested-by: kernel test robot Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/rcutorture.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 3f644b1f5823..470b5a117602 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1975,7 +1975,7 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp, // Verify the specified RCUTORTURE_RDR* state. 
#define ROEC_ARGS "%s %s: Current %#x To add %#x To remove %#x preempt_count() %#x\n", __func__, s, curstate, new, old, preempt_count() -static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, bool insoftirq) +static void rcutorture_one_extend_check(char *s, int curstate, int new, int old) { int mask; @@ -2038,8 +2038,7 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, * beginning or end of the critical section and if there was actually a * change, do a ->read_delay(). */ -static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, - struct torture_random_state *trsp, +static void rcutorture_one_extend(int *readstate, int newstate, struct torture_random_state *trsp, struct rt_read_seg *rtrsp) { bool first; @@ -2054,7 +2053,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, first = idxold1 == 0; WARN_ON_ONCE(idxold2 < 0); WARN_ON_ONCE(idxold2 & ~(RCUTORTURE_RDR_ALLBITS | RCUTORTURE_RDR_UPDOWN)); - rcutorture_one_extend_check("before change", idxold1, statesnew, statesold, insoftirq); + rcutorture_one_extend_check("before change", idxold1, statesnew, statesold); rtrsp->rt_readstate = newstate; /* First, put new protection in place to avoid critical-section gap. */ @@ -2074,8 +2073,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, idxnew2 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_2) & RCUTORTURE_RDR_MASK_2; // Complain unless both the old and the new protection is in place. - rcutorture_one_extend_check("during change", - idxold1 | statesnew, statesnew, statesold, insoftirq); + rcutorture_one_extend_check("during change", idxold1 | statesnew, statesnew, statesold); // Sample CPU under both sets of protections to reduce confusion. if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) { @@ -2150,7 +2148,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq, WARN_ON_ONCE(*readstate < 0); if (WARN_ON_ONCE(*readstate & ~RCUTORTURE_RDR_ALLBITS)) pr_info("Unexpected readstate value of %#x\n", *readstate); - rcutorture_one_extend_check("after change", *readstate, statesnew, statesold, insoftirq); + rcutorture_one_extend_check("after change", *readstate, statesnew, statesold); } /* Return the biggest extendables mask given current RCU and boot parameters. */ @@ -2217,8 +2215,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) * critical section. 
*/ static struct rt_read_seg * -rcutorture_loop_extend(int *readstate, bool insoftirq, struct torture_random_state *trsp, - struct rt_read_seg *rtrsp) +rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, struct rt_read_seg *rtrsp) { int i; int j; @@ -2233,7 +2230,7 @@ rcutorture_loop_extend(int *readstate, bool insoftirq, struct torture_random_sta for (j = 0; j < i; j++) { mask = rcutorture_extend_mask(*readstate, trsp); WARN_ON_ONCE(mask & RCUTORTURE_RDR_UPDOWN); - rcutorture_one_extend(readstate, mask, insoftirq, trsp, &rtrsp[j]); + rcutorture_one_extend(readstate, mask, trsp, &rtrsp[j]); } return &rtrsp[j]; } @@ -2279,7 +2276,7 @@ static bool rcu_torture_one_read_start(struct rcu_torture_one_read_state *rtorsp (rtorsp->readstate & RCUTORTURE_RDR_UPDOWN)); if (rtorsp->p == NULL) { /* Wait for rcu_torture_writer to get underway */ - rcutorture_one_extend(&rtorsp->readstate, 0, myid < 0, trsp, rtorsp->rtrsp); + rcutorture_one_extend(&rtorsp->readstate, 0, trsp, rtorsp->rtrsp); return false; } if (rtorsp->p->rtort_mbtest == 0) @@ -2293,7 +2290,7 @@ static bool rcu_torture_one_read_start(struct rcu_torture_one_read_state *rtorsp * critical sections and check for errors. */ static void rcu_torture_one_read_end(struct rcu_torture_one_read_state *rtorsp, - struct torture_random_state *trsp, long myid) + struct torture_random_state *trsp) { int i; unsigned long completed; @@ -2340,7 +2337,7 @@ static void rcu_torture_one_read_end(struct rcu_torture_one_read_state *rtorsp, } if (cur_ops->reader_blocked) preempted = cur_ops->reader_blocked(); - rcutorture_one_extend(&rtorsp->readstate, 0, myid < 0, trsp, rtorsp->rtrsp); + rcutorture_one_extend(&rtorsp->readstate, 0, trsp, rtorsp->rtrsp); WARN_ON_ONCE(rtorsp->readstate); // This next splat is expected behavior if leakpointer, especially // for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels. 
@@ -2370,13 +2367,13 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) init_rcu_torture_one_read_state(&rtors, trsp); newstate = rcutorture_extend_mask(rtors.readstate, trsp); WARN_ON_ONCE(newstate & RCUTORTURE_RDR_UPDOWN); - rcutorture_one_extend(&rtors.readstate, newstate, myid < 0, trsp, rtors.rtrsp++); + rcutorture_one_extend(&rtors.readstate, newstate, trsp, rtors.rtrsp++); if (!rcu_torture_one_read_start(&rtors, trsp, myid)) { - rcutorture_one_extend(&rtors.readstate, 0, myid < 0, trsp, rtors.rtrsp); + rcutorture_one_extend(&rtors.readstate, 0, trsp, rtors.rtrsp); return false; } - rtors.rtrsp = rcutorture_loop_extend(&rtors.readstate, myid < 0, trsp, rtors.rtrsp); - rcu_torture_one_read_end(&rtors, trsp, myid); + rtors.rtrsp = rcutorture_loop_extend(&rtors.readstate, trsp, rtors.rtrsp); + rcu_torture_one_read_end(&rtors, trsp); return true; } @@ -2469,7 +2466,7 @@ static enum hrtimer_restart rcu_torture_updown_hrt(struct hrtimer *hrtp) struct rcu_torture_one_read_state_updown *rtorsup; rtorsup = container_of(hrtp, struct rcu_torture_one_read_state_updown, rtorsu_hrt); - rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs, -1); + rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs); WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders); WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1); WRITE_ONCE(rtorsup->rtorsu_nmigrates, @@ -2519,7 +2516,7 @@ static void rcu_torture_updown_cleanup(void) if (!smp_load_acquire(&rtorsup->rtorsu_inuse)) continue; if (hrtimer_cancel(&rtorsup->rtorsu_hrt) || WARN_ON_ONCE(rtorsup->rtorsu_inuse)) { - rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs, -1); + rcu_torture_one_read_end(&rtorsup->rtorsu_rtors, &rtorsup->rtorsu_trs); WARN_ONCE(rtorsup->rtorsu_nups >= rtorsup->rtorsu_ndowns, "%s: Up without matching down #%zu.\n", __func__, rtorsup - updownreaders); WRITE_ONCE(rtorsup->rtorsu_nups, rtorsup->rtorsu_nups + 1); smp_store_release(&rtorsup->rtorsu_inuse, false); -- cgit v1.2.3 From 5f2417ba05541332f334b557febc3af355317f2b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 16 May 2025 14:20:46 -0700 Subject: rcutorture: Make Trivial RCU ignore onoff_interval and shuffle_interval Trivial RCU is a textbook implementation that is not used in the Linux kernel, but tested to keep textbooks (and presentations) honest. It is so trivial that it cannot deal with either CPU hotplug or external migration from one CPU to another. This commit therefore splats whenever onoff_interval or shuffle_interval are non-zero, and then sets them to zero in order to avoid false-positive failures. Those wishing to set these module parameters in order to force failures in Trivial RCU are free to revert this commit. Just don't expect me to be sympathetic to any resulting bug reports! Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202505131651.af6e81d7-lkp@intel.com Signed-off-by: Paul E. McKenney Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/rcutorture.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 470b5a117602..7a07b2590c27 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -969,7 +969,8 @@ static struct rcu_torture_ops busted_srcud_ops = { /* * Definitions for trivial CONFIG_PREEMPT=n-only torture testing. 
- * This implementation does not necessarily work well with CPU hotplug. + * This implementation does not work well with CPU hotplug nor + * with rcutorture's shuffling. */ static void synchronize_rcu_trivial(void) @@ -982,6 +983,16 @@ static void synchronize_rcu_trivial(void) } } +static void rcu_sync_torture_init_trivial(void) +{ + rcu_sync_torture_init(); + // if (onoff_interval || shuffle_interval) { + if (WARN_ONCE(onoff_interval || shuffle_interval, "%s: Non-zero onoff_interval (%d) or shuffle_interval (%d) breaks trivial RCU, resetting to zero", __func__, onoff_interval, shuffle_interval)) { + onoff_interval = 0; + shuffle_interval = 0; + } +} + static int rcu_torture_read_lock_trivial(void) { preempt_disable(); @@ -995,7 +1006,7 @@ static void rcu_torture_read_unlock_trivial(int idx) static struct rcu_torture_ops trivial_ops = { .ttype = RCU_TRIVIAL_FLAVOR, - .init = rcu_sync_torture_init, + .init = rcu_sync_torture_init_trivial, .readlock = rcu_torture_read_lock_trivial, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_torture_read_unlock_trivial, -- cgit v1.2.3 From 8d71351d88e478d3c4e945e3218e97ec677fd807 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Wed, 7 May 2025 19:26:03 +0800 Subject: rcutorture: Fix rcutorture_one_extend_check() splat in RT kernels For kernels built with CONFIG_PREEMPT_RT=y, running rcutorture tests resulted in the following splat: [ 68.797425] rcutorture_one_extend_check during change: Current 0x1 To add 0x1 To remove 0x0 preempt_count() 0x0 [ 68.797533] WARNING: CPU: 2 PID: 512 at kernel/rcu/rcutorture.c:1993 rcutorture_one_extend_check+0x419/0x560 [rcutorture] [ 68.797601] Call Trace: [ 68.797602] [ 68.797619] ? lockdep_softirqs_off+0xa5/0x160 [ 68.797631] rcutorture_one_extend+0x18e/0xcc0 [rcutorture 2466dbd2ff34dbaa36049cb323a80c3306ac997c] [ 68.797646] ? local_clock+0x19/0x40 [ 68.797659] rcu_torture_one_read+0xf0/0x280 [rcutorture 2466dbd2ff34dbaa36049cb323a80c3306ac997c] [ 68.797678] ? __pfx_rcu_torture_one_read+0x10/0x10 [rcutorture 2466dbd2ff34dbaa36049cb323a80c3306ac997c] [ 68.797804] ? __pfx_rcu_torture_timer+0x10/0x10 [rcutorture 2466dbd2ff34dbaa36049cb323a80c3306ac997c] [ 68.797815] rcu-torture: rcu_torture_reader task started [ 68.797824] rcu-torture: Creating rcu_torture_reader task [ 68.797824] rcu_torture_reader+0x238/0x580 [rcutorture 2466dbd2ff34dbaa36049cb323a80c3306ac997c] [ 68.797836] ? kvm_sched_clock_read+0x15/0x30 Disabling BH does not change the corresponding SOFTIRQ bits in preempt_count() on RT kernels; this commit therefore uses softirq_count() to check whether BH is disabled. Reviewed-by: Paul E. McKenney Signed-off-by: Zqiang Signed-off-by: Joel Fernandes Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/rcutorture.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7a07b2590c27..213f23f20a64 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -471,7 +471,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp) !(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) { started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); - if (preempt_count() & (SOFTIRQ_MASK | HARDIRQ_MASK)) + if ((preempt_count() & HARDIRQ_MASK) || softirq_count()) longdelay_ms = 5; /* Avoid triggering BH limits.
*/ mdelay(longdelay_ms); rtrsp->rt_delay_ms = longdelay_ms; @@ -2001,7 +2001,7 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old) return; WARN_ONCE((curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) && - !(preempt_count() & SOFTIRQ_MASK), ROEC_ARGS); + !softirq_count(), ROEC_ARGS); WARN_ONCE((curstate & (RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED)) && !(preempt_count() & PREEMPT_MASK), ROEC_ARGS); WARN_ONCE(cur_ops->readlock_nesting && @@ -2015,7 +2015,7 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old) WARN_ONCE(cur_ops->extendables && !(curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) && - (preempt_count() & SOFTIRQ_MASK), ROEC_ARGS); + softirq_count(), ROEC_ARGS); /* * non-preemptible RCU in a preemptible kernel uses preempt_disable() @@ -2036,6 +2036,9 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old) if (!IS_ENABLED(CONFIG_PREEMPT_RCU)) mask |= RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED; + if (IS_ENABLED(CONFIG_PREEMPT_RT) && softirq_count()) + mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH; + WARN_ONCE(cur_ops->readlock_nesting && !(curstate & mask) && cur_ops->readlock_nesting() > 0, ROEC_ARGS); } -- cgit v1.2.3 From aced132599b3c8884c050218d4c48eef203678f6 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 25 Jun 2025 09:40:24 -0700 Subject: bpf: Add range tracking for BPF_NEG Add range tracking for instruction BPF_NEG. Without this logic, a trivial program like the following will fail: volatile bool found_value_b; SEC("lsm.s/socket_connect") int BPF_PROG(test_socket_connect) { if (!found_value_b) return -1; return 0; } with verifier log: "At program exit the register R0 has smin=0 smax=4294967295 should have been in [-4095, 0]". This is because range information is lost in BPF_NEG: 0: R1=ctx() R10=fp0 ; if (!found_value_b) @ xxxx.c:24 0: (18) r1 = 0xffa00000011e7048 ; R1_w=map_value(...) 2: (71) r0 = *(u8 *)(r1 +0) ; R0_w=scalar(smin32=0,smax=255) 3: (a4) w0 ^= 1 ; R0_w=scalar(smin32=0,smax=255) 4: (84) w0 = -w0 ; R0_w=scalar(range info lost) Note that the log above was manually modified to highlight the relevant bits. Fix this by maintaining proper range information with BPF_NEG, so that the verifier will know: 4: (84) w0 = -w0 ; R0_w=scalar(smin32=-255,smax=0) Also updated selftests based on the expected behavior.
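[Editor's note: for intuition, the range math here is just subtraction from zero: a value known to lie in [smin, smax] negates to [-smax, -smin], which is why the patch below implements tnum_neg() as tnum_sub(TNUM(0, 0), a) and reuses the scalar subtraction helpers. A minimal userspace sketch of that interval rule, illustration only and not kernel code; overflow at the extreme 64-bit values is ignored here:

	#include <stdio.h>

	struct range { long long smin, smax; };

	/* Negate a tracked range: 0 - [smin, smax] = [-smax, -smin]. */
	static struct range range_neg(struct range r)
	{
		return (struct range){ .smin = -r.smax, .smax = -r.smin };
	}

	int main(void)
	{
		struct range r = { 0, 255 };	/* e.g. the u8 load in the log above */
		struct range n = range_neg(r);

		printf("[%lld, %lld] -> [%lld, %lld]\n", r.smin, r.smax, n.smin, n.smax);
		/* prints: [0, 255] -> [-255, 0], matching the fixed verifier output */
		return 0;
	}
]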
Signed-off-by: Song Liu Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250625164025.3310203-2-song@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/tnum.c | 5 +++++ kernel/bpf/verifier.c | 17 ++++++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index 9dbc31b25e3d..fa353c5d550f 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c @@ -83,6 +83,11 @@ struct tnum tnum_sub(struct tnum a, struct tnum b) return TNUM(dv & ~mu, mu); } +struct tnum tnum_neg(struct tnum a) +{ + return tnum_sub(TNUM(0, 0), a); +} + struct tnum tnum_and(struct tnum a, struct tnum b) { u64 alpha, beta, v; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f403524bd215..2ff22ef42348 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15182,6 +15182,7 @@ static bool is_safe_to_compute_dst_reg_range(struct bpf_insn *insn, switch (BPF_OP(insn->code)) { case BPF_ADD: case BPF_SUB: + case BPF_NEG: case BPF_AND: case BPF_XOR: case BPF_OR: @@ -15250,6 +15251,13 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, scalar_min_max_sub(dst_reg, &src_reg); dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off); break; + case BPF_NEG: + env->fake_reg[0] = *dst_reg; + __mark_reg_known(dst_reg, 0); + scalar32_min_max_sub(dst_reg, &env->fake_reg[0]); + scalar_min_max_sub(dst_reg, &env->fake_reg[0]); + dst_reg->var_off = tnum_neg(env->fake_reg[0].var_off); + break; case BPF_MUL: dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off); scalar32_min_max_mul(dst_reg, &src_reg); @@ -15473,7 +15481,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } /* check dest operand */ - err = check_reg_arg(env, insn->dst_reg, DST_OP); + if (opcode == BPF_NEG) { + err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); + err = err ?: adjust_scalar_min_max_vals(env, insn, + &regs[insn->dst_reg], + regs[insn->dst_reg]); + } else { + err = check_reg_arg(env, insn->dst_reg, DST_OP); + } if (err) return err; -- cgit v1.2.3 From b23e97ffc25294025a68924b6feac9cd431c3eb5 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Wed, 25 Jun 2025 11:24:12 -0700 Subject: bpf: add bpf_features enum This commit adds a kernel-side enum for use in conjunction with BTF CO-RE bpf_core_enum_value_exists. The goal of the enum is to assist with detection of available BPF features. Intended usage looks as follows: if (bpf_core_enum_value_exists(enum bpf_features, BPF_FEAT_)) ... use feature f ...
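[Editor's note: a sketch of the intended detection pattern from the BPF program side, illustration only and not part of the patch; it assumes vmlinux.h carries the new enum and uses libbpf's bpf_core_enum_value_exists() macro. The value BPF_FEAT_RDONLY_CAST_TO_VOID is added by the next commit:

	#include "vmlinux.h"		/* assumed to contain enum bpf_features */
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_core_read.h>

	static __always_inline bool have_rdonly_cast_to_void(void)
	{
		/* Resolved by libbpf at load time via a BTF CO-RE relocation. */
		return bpf_core_enum_value_exists(enum bpf_features,
						  BPF_FEAT_RDONLY_CAST_TO_VOID);
	}
]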
Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250625182414.30659-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2ff22ef42348..527fa80be429 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -44,6 +44,10 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { #undef BPF_LINK_TYPE }; +enum bpf_features { + __MAX_BPF_FEAT, +}; + struct bpf_mem_alloc bpf_global_percpu_ma; static bool bpf_global_percpu_ma_set; @@ -24439,6 +24443,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 u32 log_true_size; bool is_priv; + BTF_TYPE_EMIT(enum bpf_features); + /* no program is valid */ if (ARRAY_SIZE(bpf_verifier_ops) == 0) return -EINVAL; -- cgit v1.2.3 From f2362a57aefff5816dc7cc078dab23de0a2918c4 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Wed, 25 Jun 2025 11:24:13 -0700 Subject: bpf: allow void* cast using bpf_rdonly_cast() Introduce support for `bpf_rdonly_cast(v, 0)`, which casts the value `v` to an untyped, untrusted pointer, logically similar to a `void *`. The memory pointed to by such a pointer is treated as read-only. As with other untrusted pointers, memory access violations on loads return zero instead of causing a fault. Technically: - The resulting pointer is represented as a register of type `PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED` with size zero. - Offsets within such pointers are not tracked. - Same load instructions are allowed to have both `PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED` and `PTR_TO_BTF_ID` as the base pointer types. In such cases, `bpf_insn_aux_data->ptr_type` is considered the weaker of the two: `PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED`. The following constraints apply to the new pointer type: - can be used as a base for LDX instructions; - can't be used as a base for ST/STX or atomic instructions; - can't be used as parameter for kfuncs or helpers. These constraints are enforced by existing handling of `MEM_RDONLY` flag and `PTR_TO_MEM` of size zero. Suggested-by: Alexei Starovoitov Suggested-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250625182414.30659-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 73 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 61 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 527fa80be429..42e497a26d6f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -45,6 +45,7 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { }; enum bpf_features { + BPF_FEAT_RDONLY_CAST_TO_VOID = 0, __MAX_BPF_FEAT, }; @@ -7539,6 +7540,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } } else if (base_type(reg->type) == PTR_TO_MEM) { bool rdonly_mem = type_is_rdonly_mem(reg->type); + bool rdonly_untrusted = rdonly_mem && (reg->type & PTR_UNTRUSTED); if (type_may_be_null(reg->type)) { verbose(env, "R%d invalid mem access '%s'\n", regno, @@ -7558,8 +7560,13 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; } - err = check_mem_region_access(env, regno, off, size, - reg->mem_size, false); + /* + * Accesses to untrusted PTR_TO_MEM are done through probe + * instructions, hence no need to check bounds in that case. 
+ */ + if (!rdonly_untrusted) + err = check_mem_region_access(env, regno, off, size, + reg->mem_size, false); if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem)) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { @@ -13606,16 +13613,24 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca regs[BPF_REG_0].btf_id = meta->ret_btf_id; } else if (meta->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { ret_t = btf_type_by_id(desc_btf, meta->arg_constant.value); - if (!ret_t || !btf_type_is_struct(ret_t)) { + if (!ret_t) { + verbose(env, "Unknown type ID %lld passed to kfunc bpf_rdonly_cast\n", + meta->arg_constant.value); + return -EINVAL; + } else if (btf_type_is_struct(ret_t)) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED; + regs[BPF_REG_0].btf = desc_btf; + regs[BPF_REG_0].btf_id = meta->arg_constant.value; + } else if (btf_type_is_void(ret_t)) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED; + regs[BPF_REG_0].mem_size = 0; + } else { verbose(env, - "kfunc bpf_rdonly_cast type ID argument must be of a struct\n"); + "kfunc bpf_rdonly_cast type ID argument must be of a struct or void\n"); return -EINVAL; } - - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED; - regs[BPF_REG_0].btf = desc_btf; - regs[BPF_REG_0].btf_id = meta->arg_constant.value; } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice] || meta->func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) { enum bpf_type_flag type_flag = get_dynptr_type_flag(meta->initialized_dynptr.type); @@ -14414,6 +14429,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; } + /* + * Accesses to untrusted PTR_TO_MEM are done through probe + * instructions, hence no need to track offsets. + */ + if (base_type(ptr_reg->type) == PTR_TO_MEM && (ptr_reg->type & PTR_UNTRUSTED)) + return 0; + switch (base_type(ptr_reg->type)) { case PTR_TO_CTX: case PTR_TO_MAP_VALUE: @@ -19622,10 +19644,27 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev) !reg_type_mismatch_ok(prev)); } +static bool is_ptr_to_mem_or_btf_id(enum bpf_reg_type type) +{ + switch (base_type(type)) { + case PTR_TO_MEM: + case PTR_TO_BTF_ID: + return true; + default: + return false; + } +} + +static bool is_ptr_to_mem(enum bpf_reg_type type) +{ + return base_type(type) == PTR_TO_MEM; +} + static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, bool allow_trust_mismatch) { enum bpf_reg_type *prev_type = &env->insn_aux_data[env->insn_idx].ptr_type; + enum bpf_reg_type merged_type; if (*prev_type == NOT_INIT) { /* Saw a valid insn @@ -19642,15 +19681,24 @@ static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type typ * Reject it. */ if (allow_trust_mismatch && - base_type(type) == PTR_TO_BTF_ID && - base_type(*prev_type) == PTR_TO_BTF_ID) { + is_ptr_to_mem_or_btf_id(type) && + is_ptr_to_mem_or_btf_id(*prev_type)) { /* * Have to support a use case when one path through * the program yields TRUSTED pointer while another * is UNTRUSTED. Fallback to UNTRUSTED to generate * BPF_PROBE_MEM/BPF_PROBE_MEMSX. + * Same behavior of MEM_RDONLY flag. 
*/ - *prev_type = PTR_TO_BTF_ID | PTR_UNTRUSTED; + if (is_ptr_to_mem(type) || is_ptr_to_mem(*prev_type)) + merged_type = PTR_TO_MEM; + else + merged_type = PTR_TO_BTF_ID; + if ((type & PTR_UNTRUSTED) || (*prev_type & PTR_UNTRUSTED)) + merged_type |= PTR_UNTRUSTED; + if ((type & MEM_RDONLY) || (*prev_type & MEM_RDONLY)) + merged_type |= MEM_RDONLY; + *prev_type = merged_type; } else { verbose(env, "same insn cannot be used with different pointers\n"); return -EINVAL; @@ -21258,6 +21306,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) * for this case. */ case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED: + case PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED: if (type == BPF_READ) { if (BPF_MODE(insn->code) == BPF_MEM) insn->code = BPF_LDX | BPF_PROBE_MEM | -- cgit v1.2.3 From d83caf7c8dad96051267c18786b7bc446b537f3c Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Wed, 25 Jun 2025 15:16:21 +0000 Subject: bpf: add btf_type_is_i{32,64} helpers There are places in BPF code which check if a BTF type is an integer of particular size. This code can be made simpler by using helpers. Add new btf_type_is_i{32,64} helpers, and simplify code in a few files. (Suggested by Eduard for a patch which copy-pasted such a check [1].) v1 -> v2: * export less generic helpers (Eduard) * make subject less generic than in [v1] (Eduard) [1] https://lore.kernel.org/bpf/7edb47e73baa46705119a23c6bf4af26517a640f.camel@gmail.com/ [v1] https://lore.kernel.org/bpf/20250624193655.733050-1-a.s.protopopov@gmail.com/ Suggested-by: Eduard Zingerman Signed-off-by: Anton Protopopov Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250625151621.1000584-1-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 11 +++-------- kernel/bpf/bpf_local_storage.c | 8 +------- kernel/bpf/btf.c | 41 ++++++++++++++++++++++++++--------------- kernel/bpf/local_storage.c | 9 +-------- 4 files changed, 31 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index eb28c0f219ee..3d080916faf9 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -530,8 +530,6 @@ static int array_map_check_btf(const struct bpf_map *map, const struct btf_type *key_type, const struct btf_type *value_type) { - u32 int_data; - /* One exception for keyless BTF: .bss/.data/.rodata map */ if (btf_type_is_void(key_type)) { if (map->map_type != BPF_MAP_TYPE_ARRAY || @@ -544,14 +542,11 @@ static int array_map_check_btf(const struct bpf_map *map, return 0; } - if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) - return -EINVAL; - - int_data = *(u32 *)(key_type + 1); - /* bpf array can only take a u32 key. This check makes sure + /* + * Bpf array can only take a u32 key. This check makes sure * that the btf matches the attr used during map_create. 
*/ - if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) + if (!btf_type_is_i32(key_type)) return -EINVAL; return 0; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index fa56c30833ff..b931fbceb54d 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -722,13 +722,7 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, const struct btf_type *key_type, const struct btf_type *value_type) { - u32 int_data; - - if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) - return -EINVAL; - - int_data = *(u32 *)(key_type + 1); - if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) + if (!btf_type_is_i32(key_type)) return -EINVAL; return 0; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 682acb1ed234..05fd64a371af 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -858,26 +858,37 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) EXPORT_SYMBOL_GPL(btf_type_by_id); /* - * Regular int is not a bit field and it must be either - * u8/u16/u32/u64 or __int128. + * Check that the type @t is a regular int. This means that @t is not + * a bit field and it has the same size as either of u8/u16/u32/u64 + * or __int128. If @expected_size is not zero, then size of @t should + * be the same. A caller should already have checked that the type @t + * is an integer. */ +static bool __btf_type_int_is_regular(const struct btf_type *t, size_t expected_size) +{ + u32 int_data = btf_type_int(t); + u8 nr_bits = BTF_INT_BITS(int_data); + u8 nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); + + return BITS_PER_BYTE_MASKED(nr_bits) == 0 && + BTF_INT_OFFSET(int_data) == 0 && + (nr_bytes <= 16 && is_power_of_2(nr_bytes)) && + (expected_size == 0 || nr_bytes == expected_size); +} + static bool btf_type_int_is_regular(const struct btf_type *t) { - u8 nr_bits, nr_bytes; - u32 int_data; + return __btf_type_int_is_regular(t, 0); +} - int_data = btf_type_int(t); - nr_bits = BTF_INT_BITS(int_data); - nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); - if (BITS_PER_BYTE_MASKED(nr_bits) || - BTF_INT_OFFSET(int_data) || - (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) && - nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64) && - nr_bytes != (2 * sizeof(u64)))) { - return false; - } +bool btf_type_is_i32(const struct btf_type *t) +{ + return btf_type_is_int(t) && __btf_type_int_is_regular(t, 4); +} - return true; +bool btf_type_is_i64(const struct btf_type *t) +{ + return btf_type_is_int(t) && __btf_type_int_is_regular(t, 8); } /* diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 3969eb0382af..632d51b05fe9 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -394,17 +394,10 @@ static int cgroup_storage_check_btf(const struct bpf_map *map, if (!btf_member_is_reg_int(btf, key_type, m, offset, size)) return -EINVAL; } else { - u32 int_data; - /* * Key is expected to be u64, which stores the cgroup_inode_id */ - - if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) - return -EINVAL; - - int_data = *(u32 *)(key_type + 1); - if (BTF_INT_BITS(int_data) != 64 || BTF_INT_OFFSET(int_data)) + if (!btf_type_is_i64(key_type)) return -EINVAL; } -- cgit v1.2.3 From b6f5e748587063d4ad3294d07ca29886851d9cd7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 20 Jun 2025 13:21:22 +0200 Subject: crashdump: add CONFIG_KEYS dependency The dm_crypt code fails to build without CONFIG_KEYS: kernel/crash_dump_dm_crypt.c: In function 'restore_dm_crypt_keys_to_thread_keyring': 
kernel/crash_dump_dm_crypt.c:105:9: error: unknown type name 'key_ref_t'; did you mean 'key_ref_put'? There is a mix of 'select KEYS' and 'depends on KEYS' in Kconfig, so there is no single obvious solution here, but generally using 'depends on' makes more sense and is less likely to cause dependency loops. Link: https://lkml.kernel.org/r/20250620112140.3396316-1-arnd@kernel.org Fixes: 62f17d9df692 ("crash_dump: retrieve dm crypt keys in kdump kernel") Signed-off-by: Arnd Bergmann Cc: Alexander Graf Cc: Baoquan He Cc: Coiby Xu Cc: Dave Vasilevsky Cc: Eric Biggers Cc: Michael Ellerman Signed-off-by: Andrew Morton --- kernel/Kconfig.kexec | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index e64ce21f9a80..2ee603a98813 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -134,6 +134,7 @@ config CRASH_DM_CRYPT depends on KEXEC_FILE depends on CRASH_DUMP depends on DM_CRYPT + depends on KEYS help With this option enabled, user space can intereact with /sys/kernel/config/crash_dm_crypt_keys to make the dm crypt keys -- cgit v1.2.3 From 4ecf83741401c70d4420588ee1f3b1ca04ef58d5 Mon Sep 17 00:00:00 2001 From: Jake Hillion Date: Wed, 25 Jun 2025 18:05:46 +0100 Subject: sched_ext: Drop kfuncs marked for removal in 6.15 sched_ext performed a kfunc renaming pass in 6.13 and kept the old names around for compatibility with old binaries. These were scheduled for cleanup in 6.15 but were missed. Submitting for cleanup in for-next. Removed the kfuncs, their flags, and any references I could find to them in doc comments. Left the entries in include/scx/compat.bpf.h as they're still useful to make new binaries compatible with old kernels. Tested by applying to my kernel. It builds and a modern version of scx_lavd loads fine. Signed-off-by: Jake Hillion Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 71 ++---------------------------------------------------- 1 file changed, 2 insertions(+), 69 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index df5b2c952cf7..512474eabea6 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6391,7 +6391,8 @@ __bpf_kfunc_start_defs(); * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id * and this function can be called upto ops.dispatch_max_batch times to insert * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the - * remaining slots. scx_bpf_consume() flushes the batch and resets the counter. + * remaining slots. scx_bpf_dsq_move_to_local() flushes the batch and resets the + * counter. * * This function doesn't have any locking restrictions and may be called under * BPF locks (in the future when BPF introduces more flexible locking). 
@@ -6415,14 +6416,6 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice scx_dsq_insert_commit(p, dsq_id, enq_flags); } -/* for backward compatibility, will be removed in v6.15 */ -__bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, - u64 enq_flags) -{ - printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()"); - scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags); -} - /** * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ * @p: task_struct to insert @@ -6460,21 +6453,11 @@ __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, scx_dsq_insert_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); } -/* for backward compatibility, will be removed in v6.15 */ -__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, - u64 slice, u64 vtime, u64 enq_flags) -{ - printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()"); - scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags); -} - __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { @@ -6647,13 +6630,6 @@ __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id) } } -/* for backward compatibility, will be removed in v6.15 */ -__bpf_kfunc bool scx_bpf_consume(u64 dsq_id) -{ - printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()"); - return scx_bpf_dsq_move_to_local(dsq_id); -} - /** * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs * @it__iter: DSQ iterator in progress @@ -6672,14 +6648,6 @@ __bpf_kfunc void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE; } -/* for backward compatibility, will be removed in v6.15 */ -__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice( - struct bpf_iter_scx_dsq *it__iter, u64 slice) -{ - printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_slice() renamed to scx_bpf_dsq_move_set_slice()"); - scx_bpf_dsq_move_set_slice(it__iter, slice); -} - /** * scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs * @it__iter: DSQ iterator in progress @@ -6699,14 +6667,6 @@ __bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME; } -/* for backward compatibility, will be removed in v6.15 */ -__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime( - struct bpf_iter_scx_dsq *it__iter, u64 vtime) -{ - printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_vtime() renamed to scx_bpf_dsq_move_set_vtime()"); - scx_bpf_dsq_move_set_vtime(it__iter, vtime); -} - /** * scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ * @it__iter: DSQ iterator in progress @@ -6739,15 +6699,6 @@ __bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, p, dsq_id, enq_flags); } -/* for backward compatibility, will be removed in v6.15 */ -__bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, - struct task_struct *p, u64 dsq_id, - u64 enq_flags) -{ - printk_deferred_once(KERN_WARNING "sched_ext: 
scx_bpf_dispatch_from_dsq() renamed to scx_bpf_dsq_move()"); - return scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags); -} - /** * scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ * @it__iter: DSQ iterator in progress @@ -6773,30 +6724,16 @@ __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); } -/* for backward compatibility, will be removed in v6.15 */ -__bpf_kfunc bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, - struct task_struct *p, u64 dsq_id, - u64 enq_flags) -{ - printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_vtime() renamed to scx_bpf_dsq_move_vtime()"); - return scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags); -} - __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_dispatch) BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel) BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local) -BTF_ID_FLAGS(func, scx_bpf_consume) BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice) BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime) BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice) -BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime) -BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_dispatch) static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { @@ -6927,10 +6864,6 @@ BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice) BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime) BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice) -BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime) -BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_unlocked) static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = { -- cgit v1.2.3 From 1476b218327b89bbb64c14619a2d34f0c320f2c3 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Wed, 25 Jun 2025 18:07:37 +0100 Subject: perf/aux: Fix pending disable flow when the AUX ring buffer overruns If an AUX event overruns, the event core layer intends to disable the event by setting the 'pending_disable' flag. Unfortunately, the event is not actually disabled afterwards. In commit: ca6c21327c6a ("perf: Fix missing SIGTRAPs") the 'pending_disable' flag was changed to a boolean. However, the AUX event code was not updated accordingly. The flag ends up holding a CPU number. If this number is zero, the flag is taken as false and the IRQ work is never triggered. Later, with commit: 2b84def990d3 ("perf: Split __perf_pending_irq() out of perf_pending_irq()") a new IRQ work 'pending_disable_irq' was introduced to handle event disabling. The AUX event path was not updated to kick off the work queue. To fix this bug, when an AUX ring buffer overrun is detected, call perf_event_disable_inatomic() to initiate the pending disable flow. Also update the outdated comment for setting the flag, to reflect the boolean values (0 or 1). 
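[Editor's note: the failure mode is easy to reproduce in miniature. Once 'pending_disable' carries boolean semantics, storing a CPU number into it silently breaks for events that fire on CPU 0. A standalone sketch of the bug class, illustration only and not kernel code:

	#include <stdio.h>

	static int pending_disable;	/* new semantics: 0 = idle, 1 = work queued */

	static void aux_overflow_old_path(int cpu)
	{
		pending_disable = cpu;	/* old semantics: CPU to disable the event on */
	}

	int main(void)
	{
		aux_overflow_old_path(0);	/* AUX overrun handled on CPU 0 */
		if (pending_disable)
			printf("irq work queued, event gets disabled\n");
		else
			printf("flag reads as false: event is never disabled\n");
		return 0;
	}
]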
Fixes: 2b84def990d3 ("perf: Split __perf_pending_irq() out of perf_pending_irq()") Fixes: ca6c21327c6a ("perf: Fix missing SIGTRAPs") Signed-off-by: Leo Yan Signed-off-by: Ingo Molnar Reviewed-by: James Clark Reviewed-by: Yeoreum Yun Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Ian Rogers Cc: Jiri Olsa Cc: Liang Kan Cc: Marco Elver Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: linux-perf-users@vger.kernel.org Link: https://lore.kernel.org/r/20250625170737.2918295-1-leo.yan@arm.com --- kernel/events/core.c | 6 +++--- kernel/events/ring_buffer.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1f746469fda5..7281230044d0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7251,15 +7251,15 @@ static void __perf_pending_disable(struct perf_event *event) * CPU-A CPU-B * * perf_event_disable_inatomic() - * @pending_disable = CPU-A; + * @pending_disable = 1; * irq_work_queue(); * * sched-out - * @pending_disable = -1; + * @pending_disable = 0; * * sched-in * perf_event_disable_inatomic() - * @pending_disable = CPU-B; + * @pending_disable = 1; * irq_work_queue(); // FAILS * * irq_work_run() diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index d2aef87c7e9f..aa9a759e824f 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -441,7 +441,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, * store that will be enabled on successful return */ if (!handle->size) { /* A, matches D */ - event->pending_disable = smp_processor_id(); + perf_event_disable_inatomic(handle->event); perf_output_wakeup(handle); WRITE_ONCE(rb->aux_nest, 0); goto err_put; @@ -526,7 +526,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) if (wakeup) { if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) - handle->event->pending_disable = smp_processor_id(); + perf_event_disable_inatomic(handle->event); perf_output_wakeup(handle); } -- cgit v1.2.3 From e91370550f1fe6fa3b02e8bf9762e3dc0a02fcad Mon Sep 17 00:00:00 2001 From: Viktor Malik Date: Thu, 26 Jun 2025 08:08:29 +0200 Subject: bpf: Add kfuncs for read-only string operations String operations are commonly used, so this exposes the most common ones to BPF programs. For now, we limit ourselves to operations which do not copy memory around. Unfortunately, most in-kernel implementations assume that strings are %NUL-terminated, which is not necessarily true, and therefore we cannot use them directly in the BPF context. Instead, we open-code them using __get_kernel_nofault instead of plain dereference to make them safe and limit the string length to XATTR_SIZE_MAX to make sure the functions terminate. When __get_kernel_nofault fails, functions return -EFAULT. Similarly, when the size bound is reached, the functions return -E2BIG. In addition, we return -ERANGE when the passed strings are outside of the kernel address space. Note that thanks to these dynamic safety checks, no other constraints are put on the kfunc args (they are marked with the "__ign" suffix to skip any verifier checks for them). All of the functions return integers, including functions which normally (in kernel or libc) return pointers to the strings. The reason is that since the strings are generally treated as unsafe, the pointers couldn't be dereferenced anyway. So, instead, we return an index to the string and let the user decide what to do with it.
This also nicely fits with returning various error codes when necessary (see above). Suggested-by: Alexei Starovoitov Signed-off-by: Viktor Malik Link: https://lore.kernel.org/r/4b008a6212852c1b056a413f86e3efddac73551c.1750917800.git.vmalik@redhat.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 382 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 382 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index b71e428ad936..2cdcf7b2c91e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "../../lib/kstrtox.h" @@ -3278,6 +3279,376 @@ __bpf_kfunc void __bpf_trap(void) { } +/* + * Kfuncs for string operations. + * + * Since strings are not necessarily %NUL-terminated, we cannot directly call + * in-kernel implementations. Instead, we open-code the implementations using + * __get_kernel_nofault instead of plain dereference to make them safe. + */ + +/** + * bpf_strcmp - Compare two strings + * @s1__ign: One string + * @s2__ign: Another string + * + * Return: + * * %0 - Strings are equal + * * %-1 - @s1__ign is smaller + * * %1 - @s2__ign is smaller + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of strings is too large + * * %-ERANGE - One of strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strcmp(const char *s1__ign, const char *s2__ign) +{ + char c1, c2; + int i; + + if (!copy_from_kernel_nofault_allowed(s1__ign, 1) || + !copy_from_kernel_nofault_allowed(s2__ign, 1)) { + return -ERANGE; + } + + guard(pagefault)(); + for (i = 0; i < XATTR_SIZE_MAX; i++) { + __get_kernel_nofault(&c1, s1__ign, char, err_out); + __get_kernel_nofault(&c2, s2__ign, char, err_out); + if (c1 != c2) + return c1 < c2 ? -1 : 1; + if (c1 == '\0') + return 0; + s1__ign++; + s2__ign++; + } + return -E2BIG; +err_out: + return -EFAULT; +} + +/** + * bpf_strnchr - Find a character in a length limited string + * @s__ign: The string to be searched + * @count: The number of characters to be searched + * @c: The character to search for + * + * Note that the %NUL-terminator is considered part of the string, and can + * be searched for. + * + * Return: + * * >=0 - Index of the first occurrence of @c within @s__ign + * * %-ENOENT - @c not found in the first @count characters of @s__ign + * * %-EFAULT - Cannot read @s__ign + * * %-E2BIG - @s__ign is too large + * * %-ERANGE - @s__ign is outside of kernel address space + */ +__bpf_kfunc int bpf_strnchr(const char *s__ign, size_t count, char c) +{ + char sc; + int i; + + if (!copy_from_kernel_nofault_allowed(s__ign, 1)) + return -ERANGE; + + guard(pagefault)(); + for (i = 0; i < count && i < XATTR_SIZE_MAX; i++) { + __get_kernel_nofault(&sc, s__ign, char, err_out); + if (sc == c) + return i; + if (sc == '\0') + return -ENOENT; + s__ign++; + } + return i == XATTR_SIZE_MAX ? -E2BIG : -ENOENT; +err_out: + return -EFAULT; +} + +/** + * bpf_strchr - Find the first occurrence of a character in a string + * @s__ign: The string to be searched + * @c: The character to search for + * + * Note that the %NUL-terminator is considered part of the string, and can + * be searched for. 
+ * + * Return: + * * >=0 - The index of the first occurrence of @c within @s__ign + * * %-ENOENT - @c not found in @s__ign + * * %-EFAULT - Cannot read @s__ign + * * %-E2BIG - @s__ign is too large + * * %-ERANGE - @s__ign is outside of kernel address space + */ +__bpf_kfunc int bpf_strchr(const char *s__ign, char c) +{ + return bpf_strnchr(s__ign, XATTR_SIZE_MAX, c); +} + +/** + * bpf_strchrnul - Find and return a character in a string, or end of string + * @s__ign: The string to be searched + * @c: The character to search for + * + * Return: + * * >=0 - Index of the first occurrence of @c within @s__ign or index of + * the null byte at the end of @s__ign when @c is not found + * * %-EFAULT - Cannot read @s__ign + * * %-E2BIG - @s__ign is too large + * * %-ERANGE - @s__ign is outside of kernel address space + */ +__bpf_kfunc int bpf_strchrnul(const char *s__ign, char c) +{ + char sc; + int i; + + if (!copy_from_kernel_nofault_allowed(s__ign, 1)) + return -ERANGE; + + guard(pagefault)(); + for (i = 0; i < XATTR_SIZE_MAX; i++) { + __get_kernel_nofault(&sc, s__ign, char, err_out); + if (sc == '\0' || sc == c) + return i; + s__ign++; + } + return -E2BIG; +err_out: + return -EFAULT; +} + +/** + * bpf_strrchr - Find the last occurrence of a character in a string + * @s__ign: The string to be searched + * @c: The character to search for + * + * Return: + * * >=0 - Index of the last occurrence of @c within @s__ign + * * %-ENOENT - @c not found in @s__ign + * * %-EFAULT - Cannot read @s__ign + * * %-E2BIG - @s__ign is too large + * * %-ERANGE - @s__ign is outside of kernel address space + */ +__bpf_kfunc int bpf_strrchr(const char *s__ign, int c) +{ + char sc; + int i, last = -ENOENT; + + if (!copy_from_kernel_nofault_allowed(s__ign, 1)) + return -ERANGE; + + guard(pagefault)(); + for (i = 0; i < XATTR_SIZE_MAX; i++) { + __get_kernel_nofault(&sc, s__ign, char, err_out); + if (sc == c) + last = i; + if (sc == '\0') + return last; + s__ign++; + } + return -E2BIG; +err_out: + return -EFAULT; +} + +/** + * bpf_strlen - Calculate the length of a length-limited string + * @s__ign: The string + * @count: The maximum number of characters to count + * + * Return: + * * >=0 - The length of @s__ign + * * %-EFAULT - Cannot read @s__ign + * * %-E2BIG - @s__ign is too large + * * %-ERANGE - @s__ign is outside of kernel address space + */ +__bpf_kfunc int bpf_strnlen(const char *s__ign, size_t count) +{ + char c; + int i; + + if (!copy_from_kernel_nofault_allowed(s__ign, 1)) + return -ERANGE; + + guard(pagefault)(); + for (i = 0; i < count && i < XATTR_SIZE_MAX; i++) { + __get_kernel_nofault(&c, s__ign, char, err_out); + if (c == '\0') + return i; + s__ign++; + } + return i == XATTR_SIZE_MAX ? 
-E2BIG : i; +err_out: + return -EFAULT; +} + +/** + * bpf_strlen - Calculate the length of a string + * @s__ign: The string + * + * Return: + * * >=0 - The length of @s__ign + * * %-EFAULT - Cannot read @s__ign + * * %-E2BIG - @s__ign is too large + * * %-ERANGE - @s__ign is outside of kernel address space + */ +__bpf_kfunc int bpf_strlen(const char *s__ign) +{ + return bpf_strnlen(s__ign, XATTR_SIZE_MAX); +} + +/** + * bpf_strspn - Calculate the length of the initial substring of @s__ign which + * only contains letters in @accept__ign + * @s__ign: The string to be searched + * @accept__ign: The string to search for + * + * Return: + * * >=0 - The length of the initial substring of @s__ign which only + * contains letters from @accept__ign + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of the strings is too large + * * %-ERANGE - One of the strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strspn(const char *s__ign, const char *accept__ign) +{ + char cs, ca; + int i, j; + + if (!copy_from_kernel_nofault_allowed(s__ign, 1) || + !copy_from_kernel_nofault_allowed(accept__ign, 1)) { + return -ERANGE; + } + + guard(pagefault)(); + for (i = 0; i < XATTR_SIZE_MAX; i++) { + __get_kernel_nofault(&cs, s__ign, char, err_out); + if (cs == '\0') + return i; + for (j = 0; j < XATTR_SIZE_MAX; j++) { + __get_kernel_nofault(&ca, accept__ign + j, char, err_out); + if (cs == ca || ca == '\0') + break; + } + if (j == XATTR_SIZE_MAX) + return -E2BIG; + if (ca == '\0') + return i; + s__ign++; + } + return -E2BIG; +err_out: + return -EFAULT; +} + +/** + * strcspn - Calculate the length of the initial substring of @s__ign which + * does not contain letters in @reject__ign + * @s__ign: The string to be searched + * @reject__ign: The string to search for + * + * Return: + * * >=0 - The length of the initial substring of @s__ign which does not + * contain letters from @reject__ign + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of the strings is too large + * * %-ERANGE - One of the strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strcspn(const char *s__ign, const char *reject__ign) +{ + char cs, cr; + int i, j; + + if (!copy_from_kernel_nofault_allowed(s__ign, 1) || + !copy_from_kernel_nofault_allowed(reject__ign, 1)) { + return -ERANGE; + } + + guard(pagefault)(); + for (i = 0; i < XATTR_SIZE_MAX; i++) { + __get_kernel_nofault(&cs, s__ign, char, err_out); + if (cs == '\0') + return i; + for (j = 0; j < XATTR_SIZE_MAX; j++) { + __get_kernel_nofault(&cr, reject__ign + j, char, err_out); + if (cs == cr || cr == '\0') + break; + } + if (j == XATTR_SIZE_MAX) + return -E2BIG; + if (cr != '\0') + return i; + s__ign++; + } + return -E2BIG; +err_out: + return -EFAULT; +} + +/** + * bpf_strnstr - Find the first substring in a length-limited string + * @s1__ign: The string to be searched + * @s2__ign: The string to search for + * @len: the maximum number of characters to search + * + * Return: + * * >=0 - Index of the first character of the first occurrence of @s2__ign + * within the first @len characters of @s1__ign + * * %-ENOENT - @s2__ign not found in the first @len characters of @s1__ign + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of the strings is too large + * * %-ERANGE - One of the strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len) +{ + char c1, c2; + int i, j; + + if (!copy_from_kernel_nofault_allowed(s1__ign, 1) || + 
!copy_from_kernel_nofault_allowed(s2__ign, 1)) { + return -ERANGE; + } + + guard(pagefault)(); + for (i = 0; i < XATTR_SIZE_MAX; i++) { + for (j = 0; i + j < len && j < XATTR_SIZE_MAX; j++) { + __get_kernel_nofault(&c2, s2__ign + j, char, err_out); + if (c2 == '\0') + return i; + __get_kernel_nofault(&c1, s1__ign + j, char, err_out); + if (c1 == '\0') + return -ENOENT; + if (c1 != c2) + break; + } + if (j == XATTR_SIZE_MAX) + return -E2BIG; + if (i + j == len) + return -ENOENT; + s1__ign++; + } + return -E2BIG; +err_out: + return -EFAULT; +} + +/** + * bpf_strstr - Find the first substring in a string + * @s1__ign: The string to be searched + * @s2__ign: The string to search for + * + * Return: + * * >=0 - Index of the first character of the first occurrence of @s2__ign + * within @s1__ign + * * %-ENOENT - @s2__ign is not a substring of @s1__ign + * * %-EFAULT - Cannot read one of the strings + * * %-E2BIG - One of the strings is too large + * * %-ERANGE - One of the strings is outside of kernel address space + */ +__bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign) +{ + return bpf_strnstr(s1__ign, s2__ign, XATTR_SIZE_MAX); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(generic_btf_ids) @@ -3397,6 +3768,17 @@ BTF_ID_FLAGS(func, bpf_iter_dmabuf_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPAB BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) #endif BTF_ID_FLAGS(func, __bpf_trap) +BTF_ID_FLAGS(func, bpf_strcmp); +BTF_ID_FLAGS(func, bpf_strchr); +BTF_ID_FLAGS(func, bpf_strchrnul); +BTF_ID_FLAGS(func, bpf_strnchr); +BTF_ID_FLAGS(func, bpf_strrchr); +BTF_ID_FLAGS(func, bpf_strlen); +BTF_ID_FLAGS(func, bpf_strnlen); +BTF_ID_FLAGS(func, bpf_strspn); +BTF_ID_FLAGS(func, bpf_strcspn); +BTF_ID_FLAGS(func, bpf_strstr); +BTF_ID_FLAGS(func, bpf_strnstr); BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { -- cgit v1.2.3 From 12ffc3b1513ebc1f11ae77d053948504a94a68a6 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 13 Jun 2025 16:43:44 -0500 Subject: PM: Restrict swap use to later in the suspend sequence Currently swap is restricted before drivers have had a chance to do their prepare() PM callbacks. Restricting swap this early means that if a driver needs to evict some content from memory into swap in its prepare callback, it won't be able to. On AMD dGPUs this can lead to failed suspends under memory pressure as all VRAM must be evicted to system memory or swap. Move the swap restriction to right after all devices have had a chance to do the prepare() callback. If there is any problem with the sequence, restore swap in the appropriate dpm resume callbacks or error handling paths. Closes: https://github.com/ROCm/ROCK-Kernel-Driver/issues/174 Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/2362 Signed-off-by: Mario Limonciello Tested-by: Nat Wittstock Tested-by: Lucian Langa Link: https://patch.msgid.link/20250613214413.4127087-1-superm1@kernel.org Signed-off-by: Rafael J.
Wysocki --- kernel/kexec_core.c | 1 + kernel/power/hibernate.c | 3 --- kernel/power/power.h | 5 ----- kernel/power/suspend.c | 3 +-- 4 files changed, 2 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 9c59fa480b0b..3a9a9f240dbc 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1136,6 +1136,7 @@ int kernel_kexec(void) Resume_devices: dpm_resume_end(PMSG_RESTORE); Resume_console: + pm_restore_gfp_mask(); console_resume_all(); thaw_processes(); Restore_console: diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 519fb09de5e0..9216e3b91d3b 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -423,7 +423,6 @@ int hibernation_snapshot(int platform_mode) } console_suspend_all(); - pm_restrict_gfp_mask(); error = dpm_suspend(PMSG_FREEZE); @@ -559,7 +558,6 @@ int hibernation_restore(int platform_mode) pm_prepare_console(); console_suspend_all(); - pm_restrict_gfp_mask(); error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { error = resume_target_kernel(platform_mode); @@ -571,7 +569,6 @@ int hibernation_restore(int platform_mode) BUG_ON(!error); } dpm_resume_end(PMSG_RECOVER); - pm_restore_gfp_mask(); console_resume_all(); pm_restore_console(); return error; diff --git a/kernel/power/power.h b/kernel/power/power.h index cb1d71562002..7ccd709af93f 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -239,11 +239,6 @@ static inline void suspend_test_finish(const char *label) {} /* kernel/power/main.c */ extern int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down); extern int pm_notifier_call_chain(unsigned long val); -void pm_restrict_gfp_mask(void); -void pm_restore_gfp_mask(void); -#else -static inline void pm_restrict_gfp_mask(void) {} -static inline void pm_restore_gfp_mask(void) {} #endif #ifdef CONFIG_HIGHMEM diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 76b141b9aac0..bb608b68fb30 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -540,6 +540,7 @@ int suspend_devices_and_enter(suspend_state_t state) return error; Recover_platform: + pm_restore_gfp_mask(); platform_recover(state); goto Resume_devices; } @@ -606,9 +607,7 @@ static int enter_state(suspend_state_t state) trace_suspend_resume(TPS("suspend_enter"), state, false); pm_pr_dbg("Suspending system (%s)\n", mem_sleep_labels[state]); - pm_restrict_gfp_mask(); error = suspend_devices_and_enter(state); - pm_restore_gfp_mask(); Finish: events_check_enabled = false; -- cgit v1.2.3 From 5f295519b42f100c735a1e8e1a70060e26f30c3f Mon Sep 17 00:00:00 2001 From: "Yury Norov [NVIDIA]" Date: Sun, 22 Jun 2025 20:00:06 -0400 Subject: smp: Improve locality in smp_call_function_any() smp_call_function_any() tries to make a local call as it's the cheapest option, or switches to a CPU in the same node. If it's not possible, the algorithm gives up and searches for any CPU, in a numerical order. Instead, it can search for the best CPU based on NUMA locality, including the 2nd nearest hop (a set of equidistant nodes), and higher. sched_numa_find_nth_cpu() does exactly that, and also helps to drop most of the housekeeping code. 
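For reference, a hypothetical caller of the reworked helper looks like this (function and variable names are illustrative, not part of the patch):

	/* Execute probe() once, on the current CPU if it is in @mask,
	 * otherwise on the NUMA-nearest online CPU in @mask. */
	static void probe(void *info)
	{
		*(int *)info = smp_processor_id();
	}

	static int probe_nearest(const struct cpumask *mask)
	{
		int ran_on = -1;

		/* wait=1: return only after probe() has completed */
		smp_call_function_any(mask, probe, &ran_on, 1);
		return ran_on;
	}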
Signed-off-by: Yury Norov [NVIDIA] Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250623000010.10124-2-yury.norov@gmail.com --- kernel/smp.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 974f3a3962e8..7c8cfab0ce55 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -741,32 +741,19 @@ EXPORT_SYMBOL_GPL(smp_call_function_single_async); * * Selection preference: * 1) current cpu if in @mask - * 2) any cpu of current node if in @mask - * 3) any other online cpu in @mask + * 2) nearest cpu in @mask, based on NUMA topology */ int smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, void *info, int wait) { unsigned int cpu; - const struct cpumask *nodemask; int ret; /* Try for same CPU (cheapest) */ cpu = get_cpu(); - if (cpumask_test_cpu(cpu, mask)) - goto call; - - /* Try for same node. */ - nodemask = cpumask_of_node(cpu_to_node(cpu)); - for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids; - cpu = cpumask_next_and(cpu, nodemask, mask)) { - if (cpu_online(cpu)) - goto call; - } + if (!cpumask_test_cpu(cpu, mask)) + cpu = sched_numa_find_nth_cpu(mask, 0, cpu_to_node(cpu)); - /* Any online will do: smp_call_function_single handles nr_cpu_ids. */ - cpu = cpumask_any_and(mask, cpu_online_mask); -call: ret = smp_call_function_single(cpu, func, info, wait); put_cpu(); return ret; -- cgit v1.2.3 From 976e0e3103e463725e19a5493d02ce7b7b380663 Mon Sep 17 00:00:00 2001 From: "Yury Norov [NVIDIA]" Date: Sun, 22 Jun 2025 20:00:07 -0400 Subject: smp: Use cpumask_any_but() in smp_call_function_many_cond() smp_call_function_many_cond() opencodes cpumask_any_but(). Signed-off-by: Yury Norov [NVIDIA] Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250623000010.10124-3-yury.norov@gmail.com --- kernel/smp.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 7c8cfab0ce55..5871acf3cd45 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -807,13 +807,8 @@ static void smp_call_function_many_cond(const struct cpumask *mask, run_local = true; /* Check if we need remote execution, i.e., any CPU excluding this one. */ - cpu = cpumask_first_and(mask, cpu_online_mask); - if (cpu == this_cpu) - cpu = cpumask_next_and(cpu, mask, cpu_online_mask); - if (cpu < nr_cpu_ids) + if (cpumask_any_and_but(mask, cpu_online_mask, this_cpu) < nr_cpu_ids) { run_remote = true; - - if (run_remote) { cfd = this_cpu_ptr(&cfd_data); cpumask_and(cfd->cpumask, mask, cpu_online_mask); __cpumask_clear_cpu(this_cpu, cfd->cpumask); -- cgit v1.2.3 From 5272b51367ea001c56a4be32342d1da1a3206fcb Mon Sep 17 00:00:00 2001 From: Viktor Malik Date: Fri, 27 Jun 2025 10:20:01 +0200 Subject: bpf: Fix string kfuncs names in doc comments Documentation comments for bpf_strnlen and bpf_strcspn contained incorrect function names. 
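For context, a BPF program consumes these string kfuncs roughly as in the sketch below (hypothetical: the attach point, the extern declarations and the searched string are illustrative and not part of either patch):

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	/* kfuncs added by e91370550f1f; negative returns are the -E* codes */
	extern int bpf_strnlen(const char *s, size_t count) __ksym;
	extern int bpf_strstr(const char *s1, const char *s2) __ksym;

	char LICENSE[] SEC("license") = "GPL";

	SEC("tp_btf/sched_process_exec")
	int BPF_PROG(on_exec, struct task_struct *p)
	{
		int len = bpf_strnlen(p->comm, sizeof(p->comm));
		int pos = bpf_strstr(p->comm, "sh");

		if (len >= 0 && pos >= 0)
			bpf_printk("comm len=%d, \"sh\" at index %d", len, pos);
		return 0;
	}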
Fixes: e91370550f1f ("bpf: Add kfuncs for read-only string operations") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/bpf/20250627174759.3a435f86@canb.auug.org.au/T/#u Signed-off-by: Viktor Malik Link: https://lore.kernel.org/r/20250627082001.237606-1-vmalik@redhat.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6d051416c184..b4117681137e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3451,7 +3451,7 @@ err_out: } /** - * bpf_strlen - Calculate the length of a length-limited string + * bpf_strnlen - Calculate the length of a length-limited string * @s__ign: The string * @count: The maximum number of characters to count * @@ -3541,8 +3541,8 @@ err_out: } /** - * strcspn - Calculate the length of the initial substring of @s__ign which - * does not contain letters in @reject__ign + * bpf_strcspn - Calculate the length of the initial substring of @s__ign which + * does not contain letters in @reject__ign * @s__ign: The string to be searched * @reject__ign: The string to search for * -- cgit v1.2.3 From 9f7729480a2c771bbe49b7eab034a8eaa5e27bfb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Jun 2025 20:38:29 +0200 Subject: timekeeping: Update auxiliary timekeepers on clocksource change Propagate a system clocksource change to the auxiliary timekeepers so that they can pick up the new clocksource. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250625183757.803890875@linutronix.de --- kernel/time/timekeeping.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 7d3693a72a01..ee9757018341 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -119,8 +119,10 @@ static struct tk_fast tk_fast_raw ____cacheline_aligned = { #ifdef CONFIG_POSIX_AUX_CLOCKS static __init void tk_aux_setup(void); +static void tk_aux_update_clocksource(void); #else static inline void tk_aux_setup(void) { } +static inline void tk_aux_update_clocksource(void) { } #endif unsigned long timekeeper_lock_irqsave(void) @@ -1548,6 +1550,8 @@ static int change_clocksource(void *data) timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); } + tk_aux_update_clocksource(); + if (old) { if (old->disable) old->disable(old); @@ -2651,6 +2655,35 @@ EXPORT_SYMBOL(hardpps); #endif /* CONFIG_NTP_PPS */ #ifdef CONFIG_POSIX_AUX_CLOCKS + +/* + * Bitmap for the activated auxiliary timekeepers to allow lockless quick + * checks in the hot paths without touching extra cache lines. If set, then + * the state of the corresponding timekeeper has to be re-checked under + * timekeeper::lock. 
+ */ +static unsigned long aux_timekeepers; + +/* Invoked from timekeeping after a clocksource change */ +static void tk_aux_update_clocksource(void) +{ + unsigned long active = READ_ONCE(aux_timekeepers); + unsigned int id; + + for_each_set_bit(id, &active, BITS_PER_LONG) { + struct tk_data *tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST]; + struct timekeeper *tks = &tkd->shadow_timekeeper; + + guard(raw_spinlock_irqsave)(&tkd->lock); + if (!tks->clock_valid) + continue; + + timekeeping_forward_now(tks); + tk_setup_internals(tks, tk_core.timekeeper.tkr_mono.clock); + timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL); + } +} + static __init void tk_aux_setup(void) { for (int i = TIMEKEEPER_AUX_FIRST; i <= TIMEKEEPER_AUX_LAST; i++) -- cgit v1.2.3 From 05bc6e6290f91d2d40086ab4ef52da21c14ec4b6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Jun 2025 20:38:31 +0200 Subject: timekeeping: Provide time getters for auxiliary clocks Provide interfaces similar to the ktime_get*() family which provide access to the auxiliary clocks. These interfaces have a boolean return value, which indicates whether the accessed clock is valid or not. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250625183757.868342628@linutronix.de --- kernel/time/timekeeping.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index ee9757018341..c7d2913e68c3 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2664,6 +2664,18 @@ EXPORT_SYMBOL(hardpps); */ static unsigned long aux_timekeepers; +static inline unsigned int clockid_to_tkid(unsigned int id) +{ + return TIMEKEEPER_AUX_FIRST + id - CLOCK_AUX; +} + +static inline struct tk_data *aux_get_tk_data(clockid_t id) +{ + if (!clockid_aux_valid(id)) + return NULL; + return &timekeeper_data[clockid_to_tkid(id)]; +} + /* Invoked from timekeeping after a clocksource change */ static void tk_aux_update_clocksource(void) { @@ -2684,6 +2696,59 @@ static void tk_aux_update_clocksource(void) } } +/** + * ktime_get_aux - Get time for a AUX clock + * @id: ID of the clock to read (CLOCK_AUX...) + * @kt: Pointer to ktime_t to store the time stamp + * + * Returns: True if the timestamp is valid, false otherwise + */ +bool ktime_get_aux(clockid_t id, ktime_t *kt) +{ + struct tk_data *aux_tkd = aux_get_tk_data(id); + struct timekeeper *aux_tk; + unsigned int seq; + ktime_t base; + u64 nsecs; + + WARN_ON(timekeeping_suspended); + + if (!aux_tkd) + return false; + + aux_tk = &aux_tkd->timekeeper; + do { + seq = read_seqcount_begin(&aux_tkd->seq); + if (!aux_tk->clock_valid) + return false; + + base = ktime_add(aux_tk->tkr_mono.base, aux_tk->offs_aux); + nsecs = timekeeping_get_ns(&aux_tk->tkr_mono); + } while (read_seqcount_retry(&aux_tkd->seq, seq)); + + *kt = ktime_add_ns(base, nsecs); + return true; +} +EXPORT_SYMBOL_GPL(ktime_get_aux); + +/** + * ktime_get_aux_ts64 - Get time for a AUX clock + * @id: ID of the clock to read (CLOCK_AUX...) 
+ * @ts: Pointer to timespec64 to store the time stamp + * + * Returns: True if the timestamp is valid, false otherwise + */ +bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *ts) +{ + ktime_t now; + + if (!ktime_get_aux(id, &now)) + return false; + *ts = ktime_to_timespec64(now); + return true; +} +EXPORT_SYMBOL_GPL(ktime_get_aux_ts64); + static __init void tk_aux_setup(void) { for (int i = TIMEKEEPER_AUX_FIRST; i <= TIMEKEEPER_AUX_LAST; i++) -- cgit v1.2.3 From 606424bf4ffd9d27865c45b5707c1edac6b187ed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Jun 2025 20:38:32 +0200 Subject: timekeeping: Add minimal posix-timers support for auxiliary clocks Provide clock_getres(2) and clock_gettime(2) for auxiliary clocks. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250625183757.932220594@linutronix.de --- kernel/time/posix-timers.c | 3 +++ kernel/time/posix-timers.h | 1 + kernel/time/timekeeping.c | 21 +++++++++++++++++++++ 3 files changed, 25 insertions(+) (limited to 'kernel') diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 2053b1a4c9e4..8b582174b1f9 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1526,6 +1526,9 @@ static const struct k_clock * const posix_clocks[] = { [CLOCK_REALTIME_ALARM] = &alarm_clock, [CLOCK_BOOTTIME_ALARM] = &alarm_clock, [CLOCK_TAI] = &clock_tai, +#ifdef CONFIG_POSIX_AUX_CLOCKS + [CLOCK_AUX ... CLOCK_AUX_LAST] = &clock_aux, +#endif }; static const struct k_clock *clockid_to_kclock(const clockid_t id) diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index 61906f0688c1..7f259e845d24 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -41,6 +41,7 @@ extern const struct k_clock clock_posix_dynamic; extern const struct k_clock clock_process; extern const struct k_clock clock_thread; extern const struct k_clock alarm_clock; +extern const struct k_clock clock_aux; void posix_timer_queue_signal(struct k_itimer *timr); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c7d2913e68c3..10c6e37dc0dc 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2655,6 +2655,7 @@ EXPORT_SYMBOL(hardpps); #endif /* CONFIG_NTP_PPS */ #ifdef CONFIG_POSIX_AUX_CLOCKS +#include "posix-timers.h" /* * Bitmap for the activated auxiliary timekeepers to allow lockless quick @@ -2749,6 +2750,26 @@ bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *ts) } EXPORT_SYMBOL_GPL(ktime_get_aux_ts64); +static int aux_get_res(clockid_t id, struct timespec64 *tp) +{ + if (!clockid_aux_valid(id)) + return -ENODEV; + + tp->tv_sec = 0; + tp->tv_nsec = 1; + return 0; +} + +static int aux_get_timespec(clockid_t id, struct timespec64 *tp) +{ + return ktime_get_aux_ts64(id, tp) ? 0 : -ENODEV; +} + +const struct k_clock clock_aux = { + .clock_getres = aux_get_res, + .clock_get_timespec = aux_get_timespec, +}; + static __init void tk_aux_setup(void) { for (int i = TIMEKEEPER_AUX_FIRST; i <= TIMEKEEPER_AUX_LAST; i++) -- cgit v1.2.3 From 60ecc26ec5af567a55f362ad92c0cac8b894541c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Jun 2025 20:38:34 +0200 Subject: timekeeping: Provide time setter for auxiliary clocks Add clock_settime(2) support for auxiliary clocks. The function affects the AUX offset which is added to the "monotonic" clock readout of these clocks. 
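Together with the getters above this gives a plain POSIX flow from user space, sketched below (assumes CLOCK_AUX reaches userspace via the uapi headers and that auxiliary clock 0 has been enabled; error handling trimmed):

	#include <stdio.h>
	#include <time.h>

	int main(void)
	{
		struct timespec ts = { .tv_sec = 1000, .tv_nsec = 0 };

		/* adjusts the per-clock AUX offset, CLOCK_REALTIME is untouched */
		if (clock_settime(CLOCK_AUX, &ts))
			perror("clock_settime");

		if (!clock_gettime(CLOCK_AUX, &ts))
			printf("aux0: %lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
		return 0;
	}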
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250625183757.995688714@linutronix.de --- kernel/time/timekeeping.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 10c6e37dc0dc..b6ac7847bc0a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2765,9 +2765,53 @@ static int aux_get_timespec(clockid_t id, struct timespec64 *tp) return ktime_get_aux_ts64(id, tp) ? 0 : -ENODEV; } +static int aux_clock_set(const clockid_t id, const struct timespec64 *tnew) +{ + struct tk_data *aux_tkd = aux_get_tk_data(id); + struct timekeeper *aux_tks; + ktime_t tnow, nsecs; + + if (!timespec64_valid_settod(tnew)) + return -EINVAL; + if (!aux_tkd) + return -ENODEV; + + aux_tks = &aux_tkd->shadow_timekeeper; + + guard(raw_spinlock_irq)(&aux_tkd->lock); + if (!aux_tks->clock_valid) + return -ENODEV; + + /* Forward the timekeeper base time */ + timekeeping_forward_now(aux_tks); + /* + * Get the updated base time. tkr_mono.base has not been + * updated yet, so do that first. That makes the update + * in timekeeping_update_from_shadow() redundant, but + * that's harmless. After that @tnow can be calculated + * by using tkr_mono::cycle_last, which has been set + * by timekeeping_forward_now(). + */ + tk_update_ktime_data(aux_tks); + nsecs = timekeeping_cycles_to_ns(&aux_tks->tkr_mono, aux_tks->tkr_mono.cycle_last); + tnow = ktime_add(aux_tks->tkr_mono.base, nsecs); + + /* + * Calculate the new AUX offset as delta to @tnow ("monotonic"). + * That avoids all the tk::xtime back and forth conversions as + * xtime ("realtime") is not applicable for auxiliary clocks and + * kept in sync with "monotonic". + */ + aux_tks->offs_aux = ktime_sub(timespec64_to_ktime(*tnew), tnow); + + timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); + return 0; +} + const struct k_clock clock_aux = { .clock_getres = aux_get_res, .clock_get_timespec = aux_get_timespec, + .clock_set = aux_clock_set, }; static __init void tk_aux_setup(void) -- cgit v1.2.3 From e8db3a55798d70f2c222c6103990776fca6a6ebc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Jun 2025 20:38:40 +0200 Subject: timekeeping: Make timekeeping_inject_offset() reusable Split out the inner workings for auxiliary clock support and feed the core time keeper into it. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250625183758.059934561@linutronix.de --- kernel/time/timekeeping.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b6ac7847bc0a..2d294cfe185e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1433,32 +1433,32 @@ EXPORT_SYMBOL(do_settimeofday64); /** * __timekeeping_inject_offset - Adds or subtracts from the current time. + * @tkd: Pointer to the timekeeper to modify * @ts: Pointer to the timespec variable containing the offset * * Adds or subtracts an offset value from the current time. 
*/ -static int __timekeeping_inject_offset(const struct timespec64 *ts) +static int __timekeeping_inject_offset(struct tk_data *tkd, const struct timespec64 *ts) { - struct timekeeper *tks = &tk_core.shadow_timekeeper; + struct timekeeper *tks = &tkd->shadow_timekeeper; struct timespec64 tmp; if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - timekeeping_forward_now(tks); /* Make sure the proposed value is valid */ tmp = timespec64_add(tk_xtime(tks), *ts); if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 || !timespec64_valid_settod(&tmp)) { - timekeeping_restore_shadow(&tk_core); + timekeeping_restore_shadow(tkd); return -EINVAL; } tk_xtime_add(tks, ts); tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts)); - timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); + timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL); return 0; } @@ -1467,7 +1467,7 @@ static int timekeeping_inject_offset(const struct timespec64 *ts) int ret; scoped_guard (raw_spinlock_irqsave, &tk_core.lock) - ret = __timekeeping_inject_offset(ts); + ret = __timekeeping_inject_offset(&tk_core, ts); /* Signal hrtimers about time change */ if (!ret) @@ -2568,6 +2568,7 @@ EXPORT_SYMBOL_GPL(random_get_entropy_fallback); */ int do_adjtimex(struct __kernel_timex *txc) { + struct tk_data *tkd = &tk_core; struct timespec64 delta, ts; struct audit_ntp_data ad; bool offset_set = false; @@ -2585,16 +2586,19 @@ int do_adjtimex(struct __kernel_timex *txc) ktime_get_real_ts64(&ts); add_device_randomness(&ts, sizeof(ts)); - scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { - struct timekeeper *tks = &tk_core.shadow_timekeeper; + scoped_guard (raw_spinlock_irqsave, &tkd->lock) { + struct timekeeper *tks = &tkd->shadow_timekeeper; s32 orig_tai, tai; + if (!tks->clock_valid) + return -ENODEV; + if (txc->modes & ADJ_SETOFFSET) { delta.tv_sec = txc->time.tv_sec; delta.tv_nsec = txc->time.tv_usec; if (!(txc->modes & ADJ_NANO)) delta.tv_nsec *= 1000; - ret = __timekeeping_inject_offset(&delta); + ret = __timekeeping_inject_offset(tkd, &delta); if (ret) return ret; @@ -2607,7 +2611,7 @@ int do_adjtimex(struct __kernel_timex *txc) if (tai != orig_tai) { __timekeeping_set_tai_offset(tks, tai); - timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET); + timekeeping_update_from_shadow(tkd, TK_CLOCK_WAS_SET); clock_set = true; } else { tk_update_leap_state_all(&tk_core); @@ -2615,7 +2619,7 @@ int do_adjtimex(struct __kernel_timex *txc) /* Update the multiplier immediately if frequency was set directly */ if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) - clock_set |= __timekeeping_advance(&tk_core, TK_ADV_FREQ); + clock_set |= __timekeeping_advance(tkd, TK_ADV_FREQ); } if (txc->modes & ADJ_SETOFFSET) -- cgit v1.2.3 From 2c8aea59c206b12b436373861590baeda728be12 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Jun 2025 20:38:42 +0200 Subject: timekeeping: Add auxiliary clock support to __timekeeping_inject_offset() Redirect the relative offset adjustment to the auxiliary clock offset instead of modifying CLOCK_REALTIME, which has no meaning in context of these clocks. 
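From user space this aux branch is reached via ADJ_SETOFFSET once clock_adjtime(2) is wired up for these clocks later in this series; a hypothetical sketch (again assuming CLOCK_AUX is visible in the uapi headers):

	#include <stdio.h>
	#include <sys/timex.h>

	int main(void)
	{
		struct timex tx = {
			.modes = ADJ_SETOFFSET | ADJ_NANO,
			/* +2.5s; with ADJ_NANO, .tv_usec carries nanoseconds */
			.time  = { .tv_sec = 2, .tv_usec = 500000000 },
		};

		/* shifts only offs_aux; the core timekeeper stays untouched */
		if (clock_adjtime(CLOCK_AUX, &tx) < 0)
			perror("clock_adjtime");
		return 0;
	}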
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250625183758.124057787@linutronix.de --- kernel/time/timekeeping.c | 39 +++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2d294cfe185e..e893557cd53f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1431,6 +1431,11 @@ int do_settimeofday64(const struct timespec64 *ts) } EXPORT_SYMBOL(do_settimeofday64); +static inline bool timekeeper_is_core_tk(struct timekeeper *tk) +{ + return !IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS) || tk->id == TIMEKEEPER_CORE; +} + /** * __timekeeping_inject_offset - Adds or subtracts from the current time. * @tkd: Pointer to the timekeeper to modify @@ -1448,16 +1453,34 @@ static int __timekeeping_inject_offset(struct tk_data *tkd, const struct timespe timekeeping_forward_now(tks); - /* Make sure the proposed value is valid */ - tmp = timespec64_add(tk_xtime(tks), *ts); - if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 || - !timespec64_valid_settod(&tmp)) { - timekeeping_restore_shadow(tkd); - return -EINVAL; + if (timekeeper_is_core_tk(tks)) { + /* Make sure the proposed value is valid */ + tmp = timespec64_add(tk_xtime(tks), *ts); + if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 || + !timespec64_valid_settod(&tmp)) { + timekeeping_restore_shadow(tkd); + return -EINVAL; + } + + tk_xtime_add(tks, ts); + tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts)); + } else { + struct tk_read_base *tkr_mono = &tks->tkr_mono; + ktime_t now, offs; + + /* Get the current time */ + now = ktime_add_ns(tkr_mono->base, timekeeping_get_ns(tkr_mono)); + /* Add the relative offset change */ + offs = ktime_add(tks->offs_aux, timespec64_to_ktime(*ts)); + + /* Prevent that the resulting time becomes negative */ + if (ktime_add(now, offs) < 0) { + timekeeping_restore_shadow(tkd); + return -EINVAL; + } + tks->offs_aux = offs; } - tk_xtime_add(tks, ts); - tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts)); timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL); return 0; } -- cgit v1.2.3 From 775f71ebedd382da390dc16a4c28cffa5b937f79 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Jun 2025 20:38:43 +0200 Subject: timekeeping: Make do_adjtimex() reusable Split out the actual functionality of adjtimex() and make do_adjtimex() a wrapper which feeds the core timekeeper into it and handles the result including audit at the call site. This allows to reuse the actual functionality for auxiliary clocks. 
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250625183758.187322876@linutronix.de --- kernel/time/timekeeping.c | 102 +++++++++++++++++++++++++--------------------- 1 file changed, 56 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e893557cd53f..0de61315b40c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2585,17 +2585,18 @@ unsigned long random_get_entropy_fallback(void) } EXPORT_SYMBOL_GPL(random_get_entropy_fallback); -/** - * do_adjtimex() - Accessor function to NTP __do_adjtimex function - * @txc: Pointer to kernel_timex structure containing NTP parameters - */ -int do_adjtimex(struct __kernel_timex *txc) +struct adjtimex_result { + struct audit_ntp_data ad; + struct timespec64 delta; + bool clock_set; +}; + +static int __do_adjtimex(struct tk_data *tkd, struct __kernel_timex *txc, + struct adjtimex_result *result) { - struct tk_data *tkd = &tk_core; - struct timespec64 delta, ts; - struct audit_ntp_data ad; - bool offset_set = false; - bool clock_set = false; + struct timekeeper *tks = &tkd->shadow_timekeeper; + struct timespec64 ts; + s32 orig_tai, tai; int ret; /* Validate the data before disabling interrupts */ @@ -2604,56 +2605,65 @@ int do_adjtimex(struct __kernel_timex *txc) return ret; add_device_randomness(txc, sizeof(*txc)); - audit_ntp_init(&ad); - ktime_get_real_ts64(&ts); add_device_randomness(&ts, sizeof(ts)); - scoped_guard (raw_spinlock_irqsave, &tkd->lock) { - struct timekeeper *tks = &tkd->shadow_timekeeper; - s32 orig_tai, tai; + guard(raw_spinlock_irqsave)(&tkd->lock); - if (!tks->clock_valid) - return -ENODEV; - - if (txc->modes & ADJ_SETOFFSET) { - delta.tv_sec = txc->time.tv_sec; - delta.tv_nsec = txc->time.tv_usec; - if (!(txc->modes & ADJ_NANO)) - delta.tv_nsec *= 1000; - ret = __timekeeping_inject_offset(tkd, &delta); - if (ret) - return ret; - - offset_set = delta.tv_sec != 0; - clock_set = true; - } + if (!tks->clock_valid) + return -ENODEV; - orig_tai = tai = tks->tai_offset; - ret = ntp_adjtimex(tks->id, txc, &ts, &tai, &ad); + if (txc->modes & ADJ_SETOFFSET) { + result->delta.tv_sec = txc->time.tv_sec; + result->delta.tv_nsec = txc->time.tv_usec; + if (!(txc->modes & ADJ_NANO)) + result->delta.tv_nsec *= 1000; + ret = __timekeeping_inject_offset(tkd, &result->delta); + if (ret) + return ret; + result->clock_set = true; + } - if (tai != orig_tai) { - __timekeeping_set_tai_offset(tks, tai); - timekeeping_update_from_shadow(tkd, TK_CLOCK_WAS_SET); - clock_set = true; - } else { - tk_update_leap_state_all(&tk_core); - } + orig_tai = tai = tks->tai_offset; + ret = ntp_adjtimex(tks->id, txc, &ts, &tai, &result->ad); - /* Update the multiplier immediately if frequency was set directly */ - if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) - clock_set |= __timekeeping_advance(tkd, TK_ADV_FREQ); + if (tai != orig_tai) { + __timekeeping_set_tai_offset(tks, tai); + timekeeping_update_from_shadow(tkd, TK_CLOCK_WAS_SET); + result->clock_set = true; + } else { + tk_update_leap_state_all(&tk_core); } + /* Update the multiplier immediately if frequency was set directly */ + if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) + result->clock_set |= __timekeeping_advance(tkd, TK_ADV_FREQ); + + return ret; +} + +/** + * do_adjtimex() - Accessor function to NTP __do_adjtimex function + * @txc: Pointer to kernel_timex structure containing NTP parameters + */ +int do_adjtimex(struct __kernel_timex *txc) +{ + struct adjtimex_result result = { 
}; + int ret; + + ret = __do_adjtimex(&tk_core, txc, &result); + if (ret < 0) + return ret; + if (txc->modes & ADJ_SETOFFSET) - audit_tk_injoffset(delta); + audit_tk_injoffset(result.delta); - audit_ntp_log(&ad); + audit_ntp_log(&result.ad); - if (clock_set) + if (result.clock_set) clock_was_set(CLOCK_SET_WALL); - ntp_notify_cmos_timer(offset_set); + ntp_notify_cmos_timer(result.delta.tv_sec != 0); return ret; } -- cgit v1.2.3 From 4eca49d0b621b314ac7c80f363932ec6f6c8abc8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Jun 2025 20:38:45 +0200 Subject: timekeeping: Prepare do_adtimex() for auxiliary clocks Exclude ADJ_TAI, leap seconds and PPS functionality as they make no sense in the context of auxiliary clocks and provide a time stamp based on the actual clock. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250625183758.253203783@linutronix.de --- kernel/time/timekeeping.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 0de61315b40c..6770544f8c0e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -58,6 +58,17 @@ static struct tk_data timekeeper_data[TIMEKEEPERS_MAX]; /* The core timekeeper */ #define tk_core (timekeeper_data[TIMEKEEPER_CORE]) +#ifdef CONFIG_POSIX_AUX_CLOCKS +static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts) +{ + return ktime_get_aux_ts64(CLOCK_AUX + tkid - TIMEKEEPER_AUX_FIRST, ts); +} +#else +static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts) +{ + return false; +} +#endif /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -2508,7 +2519,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, /* * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex */ -static int timekeeping_validate_timex(const struct __kernel_timex *txc) +static int timekeeping_validate_timex(const struct __kernel_timex *txc, bool aux_clock) { if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ @@ -2567,6 +2578,20 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc) return -EINVAL; } + if (aux_clock) { + /* Auxiliary clocks are similar to TAI and do not have leap seconds */ + if (txc->status & (STA_INS | STA_DEL)) + return -EINVAL; + + /* No TAI offset setting */ + if (txc->modes & ADJ_TAI) + return -EINVAL; + + /* No PPS support either */ + if (txc->status & (STA_PPSFREQ | STA_PPSTIME)) + return -EINVAL; + } + return 0; } @@ -2595,17 +2620,22 @@ static int __do_adjtimex(struct tk_data *tkd, struct __kernel_timex *txc, struct adjtimex_result *result) { struct timekeeper *tks = &tkd->shadow_timekeeper; + bool aux_clock = !timekeeper_is_core_tk(tks); struct timespec64 ts; s32 orig_tai, tai; int ret; /* Validate the data before disabling interrupts */ - ret = timekeeping_validate_timex(txc); + ret = timekeeping_validate_timex(txc, aux_clock); if (ret) return ret; add_device_randomness(txc, sizeof(*txc)); - ktime_get_real_ts64(&ts); + if (!aux_clock) + ktime_get_real_ts64(&ts); + else + tk_get_aux_ts64(tkd->timekeeper.id, &ts); + add_device_randomness(&ts, sizeof(ts)); guard(raw_spinlock_irqsave)(&tkd->lock); -- cgit v1.2.3 From ecf3e70304911be1c14cd21baa0bc611a53ec50b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Jun 2025 20:38:46 +0200 Subject: timekeeping: Provide adjtimex() for 
auxiliary clocks The behaviour is close to clock_adjtime(CLOCK_REALTIME) with the following differences: 1) ADJ_SETOFFSET adjusts the auxiliary clock offset 2) ADJ_TAI is not supported 3) Leap seconds are not supported 4) PPS is not supported Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250625183758.317946543@linutronix.de --- kernel/time/timekeeping.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6770544f8c0e..523670ec0d2e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2875,10 +2875,26 @@ static int aux_clock_set(const clockid_t id, const struct timespec64 *tnew) return 0; } +static int aux_clock_adj(const clockid_t id, struct __kernel_timex *txc) +{ + struct tk_data *aux_tkd = aux_get_tk_data(id); + struct adjtimex_result result = { }; + + if (!aux_tkd) + return -ENODEV; + + /* + * @result is ignored for now as there are neither hrtimers nor a + * RTC related to auxiliary clocks for now. + */ + return __do_adjtimex(aux_tkd, txc, &result); +} + const struct k_clock clock_aux = { .clock_getres = aux_get_res, .clock_get_timespec = aux_get_timespec, .clock_set = aux_clock_set, + .clock_adj = aux_clock_adj, }; -- cgit v1.2.3 From e6d4c00719a6b1dda3fb358b4c973595f9dfd455 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Jun 2025 20:38:47 +0200 Subject: timekeeping: Provide update for auxiliary timekeepers Update the auxiliary timekeepers periodically. For now this is tied to the system timekeeper update from the tick. This might be revisited and moved out of the tick. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250625183758.382451331@linutronix.de --- kernel/time/timekeeping.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 523670ec0d2e..568ba1ffba0b 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -131,9 +131,11 @@ static struct tk_fast tk_fast_raw ____cacheline_aligned = { #ifdef CONFIG_POSIX_AUX_CLOCKS static __init void tk_aux_setup(void); static void tk_aux_update_clocksource(void); +static void tk_aux_advance(void); #else static inline void tk_aux_setup(void) { } static inline void tk_aux_update_clocksource(void) { } +static inline void tk_aux_advance(void) { } #endif unsigned long timekeeper_lock_irqsave(void) @@ -2317,11 +2319,13 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) /** * update_wall_time - Uses the current clocksource to increment the wall time * + * It also updates the enabled auxiliary clock timekeepers */ void update_wall_time(void) { if (timekeeping_advance(TK_ADV_TICK)) clock_was_set_delayed(); + tk_aux_advance(); } /** @@ -2764,6 +2768,21 @@ static void tk_aux_update_clocksource(void) } } +static void tk_aux_advance(void) +{ + unsigned long active = READ_ONCE(aux_timekeepers); + unsigned int id; + + /* Lockless quick check to avoid extra cache lines */ + for_each_set_bit(id, &active, BITS_PER_LONG) { + struct tk_data *aux_tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST]; + + guard(raw_spinlock)(&aux_tkd->lock); + if (aux_tkd->shadow_timekeeper.clock_valid) + __timekeeping_advance(aux_tkd, TK_ADV_TICK); + } +} + /** * ktime_get_aux - Get time for a AUX clock + * @id: ID of the clock to read (CLOCK_AUX...)
-- cgit v1.2.3 From 7b95663a3d96b39b40f169dba5faef3e20163c5c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Jun 2025 20:38:49 +0200 Subject: timekeeping: Provide interface to control auxiliary clocks Auxiliary clocks are disabled by default and attempts to access them fail. Provide an interface to enable/disable them at run-time. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Link: https://lore.kernel.org/all/20250625183758.444626478@linutronix.de --- kernel/time/timekeeping.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 568ba1ffba0b..6a61887eb87e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -2916,6 +2917,121 @@ const struct k_clock clock_aux = { .clock_adj = aux_clock_adj, }; +static void aux_clock_enable(clockid_t id) +{ + struct tk_read_base *tkr_raw = &tk_core.timekeeper.tkr_raw; + struct tk_data *aux_tkd = aux_get_tk_data(id); + struct timekeeper *aux_tks = &aux_tkd->shadow_timekeeper; + + /* Prevent the core timekeeper from changing. */ + guard(raw_spinlock_irq)(&tk_core.lock); + + /* + * Setup the auxiliary clock assuming that the raw core timekeeper + * clock frequency conversion is close enough. Userspace has to + * adjust for the deviation via clock_adjtime(2). + */ + guard(raw_spinlock_nested)(&aux_tkd->lock); + + /* Remove leftovers of a previous registration */ + memset(aux_tks, 0, sizeof(*aux_tks)); + /* Restore the timekeeper id */ + aux_tks->id = aux_tkd->timekeeper.id; + /* Setup the timekeeper based on the current system clocksource */ + tk_setup_internals(aux_tks, tkr_raw->clock); + + /* Mark it valid and set it live */ + aux_tks->clock_valid = true; + timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); +} + +static void aux_clock_disable(clockid_t id) +{ + struct tk_data *aux_tkd = aux_get_tk_data(id); + + guard(raw_spinlock_irq)(&aux_tkd->lock); + aux_tkd->shadow_timekeeper.clock_valid = false; + timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); +} + +static DEFINE_MUTEX(aux_clock_mutex); + +static ssize_t aux_clock_enable_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + /* Lazy atoi() as name is "0..7" */ + int id = kobj->name[0] & 0x7; + bool enable; + + if (!capable(CAP_SYS_TIME)) + return -EPERM; + + if (kstrtobool(buf, &enable) < 0) + return -EINVAL; + + guard(mutex)(&aux_clock_mutex); + if (enable == test_bit(id, &aux_timekeepers)) + return count; + + if (enable) { + aux_clock_enable(CLOCK_AUX + id); + set_bit(id, &aux_timekeepers); + } else { + aux_clock_disable(CLOCK_AUX + id); + clear_bit(id, &aux_timekeepers); + } + return count; +} + +static ssize_t aux_clock_enable_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + unsigned long active = READ_ONCE(aux_timekeepers); + /* Lazy atoi() as name is "0..7" */ + int id = kobj->name[0] & 0x7; + + return sysfs_emit(buf, "%d\n", test_bit(id, &active)); +} + +static struct kobj_attribute aux_clock_enable_attr = __ATTR_RW(aux_clock_enable); + +static struct attribute *aux_clock_enable_attrs[] = { + &aux_clock_enable_attr.attr, + NULL +}; + +static const struct attribute_group aux_clock_enable_attr_group = { + .attrs = aux_clock_enable_attrs, +}; + +static int __init tk_aux_sysfs_init(void) +{ + struct kobject *auxo, *tko = kobject_create_and_add("time", kernel_kobj); + + if (!tko) + 
return -ENOMEM; + + auxo = kobject_create_and_add("aux_clocks", tko); + if (!auxo) { + kobject_put(tko); + return -ENOMEM; + } + + for (int i = 0; i <= MAX_AUX_CLOCKS; i++) { + char id[2] = { [0] = '0' + i, }; + struct kobject *clk = kobject_create_and_add(id, auxo); + + if (!clk) + return -ENOMEM; + + int ret = sysfs_create_group(clk, &aux_clock_enable_attr_group); + + if (ret) + return ret; + } + return 0; +} +late_initcall(tk_aux_sysfs_init); + static __init void tk_aux_setup(void) { for (int i = TIMEKEEPER_AUX_FIRST; i <= TIMEKEEPER_AUX_LAST; i++) -- cgit v1.2.3 From a5a7b25d7535ce5d8db0c121ef588bc6ca3a39dc Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 27 Jun 2025 10:53:09 -0700 Subject: bpf: guard BTF_ID_FLAGS(bpf_cgroup_read_xattr) with CONFIG_BPF_LSM Function bpf_cgroup_read_xattr is defined in fs/bpf_fs_kfuncs.c, which is compiled only when CONFIG_BPF_LSM is set. Add CONFIG_BPF_LSM check to bpf_cgroup_read_xattr spec in common_btf_ids in kernel/bpf/helpers.c to avoid build failures for configs w/o CONFIG_BPF_LSM. Build failure example: BTF .tmp_vmlinux1.btf.o btf_encoder__tag_kfunc: failed to find kfunc 'bpf_cgroup_read_xattr' in BTF ... WARN: resolve_btfids: unresolved symbol bpf_cgroup_read_xattr make[2]: *** [scripts/Makefile.vmlinux:91: vmlinux.unstripped] Error 255 Fixes: 535b070f4a80 ("bpf: Introduce bpf_cgroup_read_xattr to read xattr of cgroup's node") Reported-by: Jake Hillion Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250627175309.2710973-1-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index b4117681137e..f48fa3fe8dec 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3779,7 +3779,7 @@ BTF_ID_FLAGS(func, bpf_strspn); BTF_ID_FLAGS(func, bpf_strcspn); BTF_ID_FLAGS(func, bpf_strstr); BTF_ID_FLAGS(func, bpf_strnstr); -#ifdef CONFIG_CGROUPS +#if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS) BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif BTF_KFUNCS_END(common_btf_ids) -- cgit v1.2.3 From 6921d1e07cb5eddec830801087b419194fde0803 Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Tue, 24 Jun 2025 14:38:46 +0800 Subject: tracing: Fix filter logic error If the processing of the tr->events loop fails, the filter that has been added to filter_head will be released twice in free_filter_list(&head->rcu) and __free_filter(filter). To avoid triggering the use-after-free, add the filter to filter_head only after the tr->events loop has been processed.
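Condensed sketch of the pre-fix error path (not the literal kernel code; the failure hook inside the loop stands in for the real allocation failures there):

	item->filter = filter;
	list_add_tail(&item->list, &head->list);   /* the list now owns filter  */

	list_for_each_entry(file, &tr->events, list) {
		if (per_event_setup_fails(file))   /* hypothetical failure hook */
			goto free_now;
	}
	...
free_now:
	free_filter_list(&head->rcu);              /* frees filter via the list */
	__free_filter(filter);                     /* second free: use-after-free */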
Link: https://lore.kernel.org/tencent_4EF87A626D702F816CD0951CE956EC32CD0A@qq.com
Fixes: a9d0aab5eb33 ("tracing: Fix regression of filter waiting a long time on RCU synchronization")
Reported-by: syzbot+daba72c4af9915e9c894@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=daba72c4af9915e9c894
Tested-by: syzbot+daba72c4af9915e9c894@syzkaller.appspotmail.com
Acked-by: Masami Hiramatsu (Google)
Signed-off-by: Edward Adam Davis
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/trace_events_filter.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 08141f105c95..3885aadc434d 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1436,13 +1436,6 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir,

 	INIT_LIST_HEAD(&head->list);

-	item = kmalloc(sizeof(*item), GFP_KERNEL);
-	if (!item)
-		goto free_now;
-
-	item->filter = filter;
-	list_add_tail(&item->list, &head->list);
-
 	list_for_each_entry(file, &tr->events, list) {
 		if (file->system != dir)
 			continue;
@@ -1454,6 +1447,13 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir,
 		event_clear_filter(file);
 	}

+	item = kmalloc(sizeof(*item), GFP_KERNEL);
+	if (!item)
+		goto free_now;
+
+	item->filter = filter;
+	list_add_tail(&item->list, &head->list);
+
 	delay_free_filter(head);
 	return;
 free_now:
--
cgit v1.2.3


From 7b4c5a37544ba22c6ebe72c0d4ea56c953459fa5 Mon Sep 17 00:00:00 2001
From: Luo Gengkun
Date: Thu, 26 Jun 2025 13:54:03 +0000
Subject: perf/core: Fix WARN_ON_ONCE() placed outside the lock-protected region

commit 3172fb986666 ("perf/core: Fix WARN in perf_cgroup_switch()")
tried to fix a concurrency problem between perf_cgroup_switch() and
perf_cgroup_event_disable(), but it did not move the WARN_ON_ONCE()
into the lock-protected region, so the warning can still be triggered.

Fixes: 3172fb986666 ("perf/core: Fix WARN in perf_cgroup_switch()")
Signed-off-by: Luo Gengkun
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lkml.kernel.org/r/20250626135403.2454105-1-luogengkun@huaweicloud.com
---
 kernel/events/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7281230044d0..bf2118c22126 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -951,8 +951,6 @@ static void perf_cgroup_switch(struct task_struct *task)
 	if (READ_ONCE(cpuctx->cgrp) == NULL)
 		return;

-	WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
-
 	cgrp = perf_cgroup_from_task(task, NULL);
 	if (READ_ONCE(cpuctx->cgrp) == cgrp)
 		return;
@@ -964,6 +962,8 @@ static void perf_cgroup_switch(struct task_struct *task)
 	if (READ_ONCE(cpuctx->cgrp) == NULL)
 		return;

+	WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
+
 	perf_ctx_disable(&cpuctx->ctx, true);

 	ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
--
cgit v1.2.3


From a70e9f647f501e36a6a092888b1ea7386b7c5664 Mon Sep 17 00:00:00 2001
From: Jinjie Ruan
Date: Tue, 24 Jun 2025 20:35:56 +0200
Subject: entry: Split generic entry into generic exception and syscall entry

Currently CONFIG_GENERIC_ENTRY enables both the generic exception
entry logic and the generic syscall entry logic, which are otherwise
loosely coupled.

Introduce separate config options for these so that architectures can
select the two independently. This will make it easier for
architectures to migrate to generic entry code.
Suggested-by: Mark Rutland Signed-off-by: Jinjie Ruan Signed-off-by: Linus Walleij Signed-off-by: Thomas Gleixner Acked-by: Linus Walleij Link: https://lore.kernel.org/20250213130007.1418890-2-ruanjinjie@huawei.com Link: https://lore.kernel.org/all/20250624-generic-entry-split-v1-1-53d5ef4f94df@linaro.org [Linus Walleij: rebase onto v6.16-rc1] --- kernel/entry/Makefile | 3 +- kernel/entry/common.c | 113 +----------------------------------------- kernel/entry/syscall-common.c | 112 +++++++++++++++++++++++++++++++++++++++++ kernel/sched/core.c | 8 +-- 4 files changed, 119 insertions(+), 117 deletions(-) create mode 100644 kernel/entry/syscall-common.c (limited to 'kernel') diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile index d4b8bd0af79b..77fcd83dd663 100644 --- a/kernel/entry/Makefile +++ b/kernel/entry/Makefile @@ -12,5 +12,6 @@ ccflags-$(CONFIG_TRACE_BRANCH_PROFILING) += -DDISABLE_BRANCH_PROFILING CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong CFLAGS_common.o += -fno-stack-protector -obj-$(CONFIG_GENERIC_ENTRY) += common.o syscall_user_dispatch.o +obj-$(CONFIG_GENERIC_IRQ_ENTRY) += common.o +obj-$(CONFIG_GENERIC_SYSCALL) += syscall-common.o syscall_user_dispatch.o obj-$(CONFIG_KVM_XFER_TO_GUEST_WORK) += kvm.o diff --git a/kernel/entry/common.c b/kernel/entry/common.c index a8dd1f27417c..b82032777310 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -1,84 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 -#include -#include +#include #include #include #include #include #include -#include #include -#include "common.h" - -#define CREATE_TRACE_POINTS -#include - -static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) -{ - if (unlikely(audit_context())) { - unsigned long args[6]; - - syscall_get_arguments(current, regs, args); - audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]); - } -} - -long syscall_trace_enter(struct pt_regs *regs, long syscall, - unsigned long work) -{ - long ret = 0; - - /* - * Handle Syscall User Dispatch. This must comes first, since - * the ABI here can be something that doesn't make sense for - * other syscall_work features. - */ - if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { - if (syscall_user_dispatch(regs)) - return -1L; - } - - /* Handle ptrace */ - if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { - ret = ptrace_report_syscall_entry(regs); - if (ret || (work & SYSCALL_WORK_SYSCALL_EMU)) - return -1L; - } - - /* Do seccomp after ptrace, to catch any tracer changes. */ - if (work & SYSCALL_WORK_SECCOMP) { - ret = __secure_computing(); - if (ret == -1L) - return ret; - } - - /* Either of the above might have changed the syscall number */ - syscall = syscall_get_nr(current, regs); - - if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) { - trace_sys_enter(regs, syscall); - /* - * Probes or BPF hooks in the tracepoint may have changed the - * system call number as well. - */ - syscall = syscall_get_nr(current, regs); - } - - syscall_enter_audit(regs, syscall); - - return ret ? 
: syscall; -} - -noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) -{ - enter_from_user_mode(regs); - instrumentation_begin(); - local_irq_enable(); - instrumentation_end(); -} - /* Workaround to allow gradual conversion of architecture code */ void __weak arch_do_signal_or_restart(struct pt_regs *regs) { } @@ -133,46 +62,6 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, return ti_work; } -/* - * If SYSCALL_EMU is set, then the only reason to report is when - * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall - * instruction has been already reported in syscall_enter_from_user_mode(). - */ -static inline bool report_single_step(unsigned long work) -{ - if (work & SYSCALL_WORK_SYSCALL_EMU) - return false; - - return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP; -} - -void syscall_exit_work(struct pt_regs *regs, unsigned long work) -{ - bool step; - - /* - * If the syscall was rolled back due to syscall user dispatching, - * then the tracers below are not invoked for the same reason as - * the entry side was not invoked in syscall_trace_enter(): The ABI - * of these syscalls is unknown. - */ - if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { - if (unlikely(current->syscall_dispatch.on_dispatch)) { - current->syscall_dispatch.on_dispatch = false; - return; - } - } - - audit_syscall_exit(regs); - - if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT) - trace_sys_exit(regs, syscall_get_return_value(current, regs)); - - step = report_single_step(work); - if (step || work & SYSCALL_WORK_SYSCALL_TRACE) - ptrace_report_syscall_exit(regs, step); -} - noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) { enter_from_user_mode(regs); diff --git a/kernel/entry/syscall-common.c b/kernel/entry/syscall-common.c new file mode 100644 index 000000000000..66e6ba7fa80c --- /dev/null +++ b/kernel/entry/syscall-common.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "common.h" + +#define CREATE_TRACE_POINTS +#include + +static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) +{ + if (unlikely(audit_context())) { + unsigned long args[6]; + + syscall_get_arguments(current, regs, args); + audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]); + } +} + +long syscall_trace_enter(struct pt_regs *regs, long syscall, + unsigned long work) +{ + long ret = 0; + + /* + * Handle Syscall User Dispatch. This must comes first, since + * the ABI here can be something that doesn't make sense for + * other syscall_work features. + */ + if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { + if (syscall_user_dispatch(regs)) + return -1L; + } + + /* Handle ptrace */ + if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { + ret = ptrace_report_syscall_entry(regs); + if (ret || (work & SYSCALL_WORK_SYSCALL_EMU)) + return -1L; + } + + /* Do seccomp after ptrace, to catch any tracer changes. */ + if (work & SYSCALL_WORK_SECCOMP) { + ret = __secure_computing(); + if (ret == -1L) + return ret; + } + + /* Either of the above might have changed the syscall number */ + syscall = syscall_get_nr(current, regs); + + if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) { + trace_sys_enter(regs, syscall); + /* + * Probes or BPF hooks in the tracepoint may have changed the + * system call number as well. + */ + syscall = syscall_get_nr(current, regs); + } + + syscall_enter_audit(regs, syscall); + + return ret ? 
: syscall;
+}
+
+noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
+{
+	enter_from_user_mode(regs);
+	instrumentation_begin();
+	local_irq_enable();
+	instrumentation_end();
+}
+
+/*
+ * If SYSCALL_EMU is set, then the only reason to report is when
+ * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
+ * instruction has been already reported in syscall_enter_from_user_mode().
+ */
+static inline bool report_single_step(unsigned long work)
+{
+	if (work & SYSCALL_WORK_SYSCALL_EMU)
+		return false;
+
+	return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
+}
+
+void syscall_exit_work(struct pt_regs *regs, unsigned long work)
+{
+	bool step;
+
+	/*
+	 * If the syscall was rolled back due to syscall user dispatching,
+	 * then the tracers below are not invoked for the same reason as
+	 * the entry side was not invoked in syscall_trace_enter(): The ABI
+	 * of these syscalls is unknown.
+	 */
+	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
+		if (unlikely(current->syscall_dispatch.on_dispatch)) {
+			current->syscall_dispatch.on_dispatch = false;
+			return;
+		}
+	}
+
+	audit_syscall_exit(regs);
+
+	if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
+		trace_sys_exit(regs, syscall_get_return_value(current, regs));
+
+	step = report_single_step(work);
+	if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
+		ptrace_report_syscall_exit(regs, step);
+}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index dce50fa57471..e6269e7954f3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -69,8 +69,8 @@
 #include

 #ifdef CONFIG_PREEMPT_DYNAMIC
-# ifdef CONFIG_GENERIC_ENTRY
-#  include
+# ifdef CONFIG_GENERIC_IRQ_ENTRY
+#  include
 # endif
 #endif

@@ -7427,8 +7427,8 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write);

 #ifdef CONFIG_PREEMPT_DYNAMIC

-#ifdef CONFIG_GENERIC_ENTRY
-#include
+#ifdef CONFIG_GENERIC_IRQ_ENTRY
+#include
 #endif

 /*
--
cgit v1.2.3


From 3ebb1b6522392f64902b4e96954e35927354aa27 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?=
Date: Thu, 26 Jun 2025 11:23:44 +0200
Subject: sched: Fix preemption string of preempt_dynamic_none
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Zero is a valid value for "preempt_dynamic_mode", namely
"preempt_dynamic_none".

Fix the off-by-one in preempt_model_str(), so that
"preempt_dynamic_none" is correctly formatted as PREEMPT(none) instead
of PREEMPT(undef).

Fixes: 8bdc5daaa01e ("sched: Add a generic function to return the preemption string")
Signed-off-by: Thomas Weißschuh
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Sebastian Andrzej Siewior
Tested-by: Shrikanth Hegde
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20250626-preempt-str-none-v2-1-526213b70a89@linutronix.de
---
 kernel/sched/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8988d38d46a3..cd80b66a960a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7663,7 +7663,7 @@ const char *preempt_model_str(void)

 	if (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC)) {
 		seq_buf_printf(&s, "(%s)%s",
-			       preempt_dynamic_mode > 0 ?
+			       preempt_dynamic_mode >= 0 ?
			       preempt_modes[preempt_dynamic_mode] : "undef",
			       brace ? "}" : "");
		return seq_buf_str(&s);
--
cgit v1.2.3


From 009836b4fa52f92cba33618e773b1094affa8cd2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 5 Jun 2025 12:00:09 +0200
Subject: sched/core: Fix migrate_swap() vs.
hotplug On Mon, Jun 02, 2025 at 03:22:13PM +0800, Kuyo Chang wrote: > So, the potential race scenario is: > > CPU0 CPU1 > // doing migrate_swap(cpu0/cpu1) > stop_two_cpus() > ... > // doing _cpu_down() > sched_cpu_deactivate() > set_cpu_active(cpu, false); > balance_push_set(cpu, true); > cpu_stop_queue_two_works > __cpu_stop_queue_work(stopper1,...); > __cpu_stop_queue_work(stopper2,..); > stop_cpus_in_progress -> true > preempt_enable(); > ... > 1st balance_push > stop_one_cpu_nowait > cpu_stop_queue_work > __cpu_stop_queue_work > list_add_tail -> 1st add push_work > wake_up_q(&wakeq); -> "wakeq is empty. > This implies that the stopper is at wakeq@migrate_swap." > preempt_disable > wake_up_q(&wakeq); > wake_up_process // wakeup migrate/0 > try_to_wake_up > ttwu_queue > ttwu_queue_cond ->meet below case > if (cpu == smp_processor_id()) > return false; > ttwu_do_activate > //migrate/0 wakeup done > wake_up_process // wakeup migrate/1 > try_to_wake_up > ttwu_queue > ttwu_queue_cond > ttwu_queue_wakelist > __ttwu_queue_wakelist > __smp_call_single_queue > preempt_enable(); > > 2nd balance_push > stop_one_cpu_nowait > cpu_stop_queue_work > __cpu_stop_queue_work > list_add_tail -> 2nd add push_work, so the double list add is detected > ... > ... > cpu1 get ipi, do sched_ttwu_pending, wakeup migrate/1 > So this balance_push() is part of schedule(), and schedule() is supposed to switch to stopper task, but because of this race condition, stopper task is stuck in WAKING state and not actually visible to be picked. Therefore CPU1 can do another schedule() and end up doing another balance_push() even though the last one hasn't been done yet. This is a confluence of fail, where both wake_q and ttwu_wakelist can cause crucial wakeups to be delayed, resulting in the malfunction of balance_push. Since there is only a single stopper thread to be woken, the wake_q doesn't really add anything here, and can be removed in favour of direct wakeups of the stopper thread. Then add a clause to ttwu_queue_cond() to ensure the stopper threads are never queued / delayed. Of all 3 moving parts, the last addition was the balance_push() machinery, so pick that as the point the bug was introduced. Fixes: 2558aacff858 ("sched/hotplug: Ensure only per-cpu kthreads run during hotplug") Reported-by: Kuyo Chang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Kuyo Chang Link: https://lkml.kernel.org/r/20250605100009.GO39944@noisy.programming.kicks-ass.net --- kernel/sched/core.c | 5 +++++ kernel/stop_machine.c | 20 ++++++++++---------- 2 files changed, 15 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cd80b66a960a..ec68fc686bd7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3943,6 +3943,11 @@ static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) if (!scx_allow_ttwu_queue(p)) return false; +#ifdef CONFIG_SMP + if (p->sched_class == &stop_sched_class) + return false; +#endif + /* * Do not complicate things with the async wake_list while the CPU is * in hotplug state. 
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 5d2d0562115b..3fe6b0c99f3d 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -82,18 +82,15 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done)
 }

 static void __cpu_stop_queue_work(struct cpu_stopper *stopper,
-					struct cpu_stop_work *work,
-					struct wake_q_head *wakeq)
+					struct cpu_stop_work *work)
 {
	list_add_tail(&work->list, &stopper->works);
-	wake_q_add(wakeq, stopper->thread);
 }

 /* queue @work to @stopper.  if offline, @work is completed immediately */
 static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
 {
	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
-	DEFINE_WAKE_Q(wakeq);
	unsigned long flags;
	bool enabled;

@@ -101,12 +98,13 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
	raw_spin_lock_irqsave(&stopper->lock, flags);
	enabled = stopper->enabled;
	if (enabled)
-		__cpu_stop_queue_work(stopper, work, &wakeq);
+		__cpu_stop_queue_work(stopper, work);
	else if (work->done)
		cpu_stop_signal_done(work->done);
	raw_spin_unlock_irqrestore(&stopper->lock, flags);

-	wake_up_q(&wakeq);
+	if (enabled)
+		wake_up_process(stopper->thread);
	preempt_enable();

	return enabled;
@@ -264,7 +262,6 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
 {
	struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
	struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
-	DEFINE_WAKE_Q(wakeq);
	int err;

 retry:
@@ -300,8 +297,8 @@ retry:
	}

	err = 0;
-	__cpu_stop_queue_work(stopper1, work1, &wakeq);
-	__cpu_stop_queue_work(stopper2, work2, &wakeq);
+	__cpu_stop_queue_work(stopper1, work1);
+	__cpu_stop_queue_work(stopper2, work2);

 unlock:
	raw_spin_unlock(&stopper2->lock);
@@ -316,7 +313,10 @@ unlock:
		goto retry;
	}

-	wake_up_q(&wakeq);
+	if (!err) {
+		wake_up_process(stopper1->thread);
+		wake_up_process(stopper2->thread);
+	}
	preempt_enable();

	return err;
--
cgit v1.2.3


From e78f70bad29c5ae1e1076698b690b15794e9b81e Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Tue, 1 Jul 2025 14:32:25 +0200
Subject: time/timecounter: Fix the lie that struct cyclecounter is const

In both the read callback for struct cyclecounter, and in struct
timecounter, struct cyclecounter is declared as a const pointer.
Unfortunately, a number of users of this pointer treat it as a
non-const pointer as it is buried in a larger structure that is
heavily modified by the callback function when accessed.

This lie had been hidden by the fact that container_of() "casts away"
a const attribute of a pointer without any compiler warning happening
at all.

Fix this all up by removing the const attribute in the needed places
so that everyone can see that the structure really isn't const, but
can, and is, modified by the users of it.
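The silent const-stripping is easy to reproduce outside the kernel. A
minimal sketch (hypothetical structures, not the timecounter code) that
compiles without a single warning:

	#include <stddef.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct cyclecounter {
		unsigned long long (*read)(const struct cyclecounter *cc);
	};

	struct dev_state {
		struct cyclecounter cc;
		unsigned long long last;	/* mutated on every read */
	};

	static unsigned long long dev_read(const struct cyclecounter *cc)
	{
		/* The (char *) cast inside container_of() drops the const
		 * qualifier silently, so this "const" callback argument
		 * yields a mutable pointer to the enclosing structure. */
		struct dev_state *st = container_of(cc, struct dev_state, cc);

		return ++st->last;	/* modifies "const" state */
	}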
Signed-off-by: Greg Kroah-Hartman
Signed-off-by: Thomas Gleixner
Link: https://lore.kernel.org/all/2025070124-backyard-hurt-783a@gregkh
---
 kernel/time/timecounter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c
index e6285288d765..3d2a354cfe1c 100644
--- a/kernel/time/timecounter.c
+++ b/kernel/time/timecounter.c
@@ -6,7 +6,7 @@
 #include

 void timecounter_init(struct timecounter *tc,
-		      const struct cyclecounter *cc,
+		      struct cyclecounter *cc,
		      u64 start_tstamp)
 {
	tc->cc = cc;
--
cgit v1.2.3


From b6139a6abf673029008f80d42abd3848d80a9108 Mon Sep 17 00:00:00 2001
From: Daniel Wagner
Date: Tue, 17 Jun 2025 15:43:23 +0200
Subject: lib/group_cpus: Let group_cpus_evenly() return the number of initialized masks

group_cpus_evenly() might have allocated fewer groups than requested:

group_cpus_evenly()
  __group_cpus_evenly()
    alloc_nodes_groups()
      # allocated total groups may be less than numgrps when
      # active total CPU number is less than numgrps

In this case, the caller will do an out-of-bounds access because it
assumes the returned masks array has numgrps entries.

Return the number of groups created so the caller can limit the access
range accordingly.

Acked-by: Thomas Gleixner
Reviewed-by: Hannes Reinecke
Reviewed-by: Ming Lei
Signed-off-by: Daniel Wagner
Reviewed-by: Chaitanya Kulkarni
Reviewed-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20250617-isolcpus-queue-counters-v1-1-13923686b54b@kernel.org
Signed-off-by: Jens Axboe
---
 kernel/irq/affinity.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 44a4eba80315..4013e6ad2b2f 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -69,21 +69,20 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
	 * have multiple sets, build each sets affinity mask separately.
	 */
	for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
-		unsigned int this_vecs = affd->set_size[i];
-		int j;
-		struct cpumask *result = group_cpus_evenly(this_vecs);
+		unsigned int nr_masks, this_vecs = affd->set_size[i];
+		struct cpumask *result = group_cpus_evenly(this_vecs, &nr_masks);

		if (!result) {
			kfree(masks);
			return NULL;
		}

-		for (j = 0; j < this_vecs; j++)
+		for (int j = 0; j < nr_masks; j++)
			cpumask_copy(&masks[curvec + j].mask, &result[j]);
		kfree(result);

-		curvec += this_vecs;
-		usedvecs += this_vecs;
+		curvec += nr_masks;
+		usedvecs += nr_masks;
	}

	/* Fill out vectors at the end that don't need affinity */
--
cgit v1.2.3


From 0df1a55afa832f463f9ad68ddc5de92230f1bc8a Mon Sep 17 00:00:00 2001
From: Paul Chaignon
Date: Tue, 1 Jul 2025 20:36:15 +0200
Subject: bpf: Warn on internal verifier errors

This patch is a follow up to commit 1cb0f56d9618 ("bpf: WARN_ONCE on
verifier bugs"). It generalizes the use of verifier_bug() throughout
the verifier, in particular for logs previously marked "verifier
internal error". As a consequence, all of those verifier bugs will now
come with a kernel warning (under CONFIG_DEBUG_KERNEL) detectable by
fuzzers.

While at it, some error messages that were too generic (e.g., "bpf
verifier is misconfigured") have been reworded.
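The exact definition of verifier_bug() lives outside this diff; a
plausible sketch of the shape such a helper takes (an assumption for
illustration, not the in-tree macro) is:

	/* Sketch only: pairs the verifier log line with a one-time
	 * kernel warning so fuzzers can catch internal errors. */
	#define verifier_bug_if(cond, env, fmt, args...)			\
		({								\
			bool __cond = (cond);					\
										\
			if (unlikely(__cond)) {					\
				WARN_ONCE(1, "verifier bug: " fmt "\n", ##args); \
				verbose(env, "verifier bug: " fmt "\n", ##args); \
			}							\
			__cond;							\
		})
	#define verifier_bug(env, fmt, args...) \
		verifier_bug_if(1, env, fmt, ##args)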
Signed-off-by: Paul Chaignon Link: https://lore.kernel.org/r/aGQqnzMyeagzgkCK@Tunnel Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 211 ++++++++++++++++++++++++-------------------------- 1 file changed, 102 insertions(+), 109 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 90e688f81a48..a352b35be479 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -861,7 +861,7 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re * dynptr */ if (state->stack[i].slot_type[0] != STACK_DYNPTR) { - verbose(env, "verifier internal error: misconfigured ref_obj_id\n"); + verifier_bug(env, "misconfigured ref_obj_id"); return -EFAULT; } if (state->stack[i].spilled_ptr.dynptr.first_slot) @@ -2030,12 +2030,10 @@ static int update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifie while (st) { u32 br = --st->branches; - /* WARN_ON(br > 1) technically makes sense here, + /* verifier_bug_if(br > 1, ...) technically makes sense here, * but see comment in push_stack(), hence: */ - WARN_ONCE((int)br < 0, - "BUG update_branch_counts:branches_to_explore=%d\n", - br); + verifier_bug_if((int)br < 0, env, "%s:branches_to_explore=%d", __func__, br); if (br) break; err = maybe_exit_scc(env, st); @@ -2681,13 +2679,13 @@ static int reg_bounds_sanity_check(struct bpf_verifier_env *env, return 0; out: - verbose(env, "REG INVARIANTS VIOLATION (%s): %s u64=[%#llx, %#llx] " - "s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)\n", - ctx, msg, reg->umin_value, reg->umax_value, - reg->smin_value, reg->smax_value, - reg->u32_min_value, reg->u32_max_value, - reg->s32_min_value, reg->s32_max_value, - reg->var_off.value, reg->var_off.mask); + verifier_bug(env, "REG INVARIANTS VIOLATION (%s): %s u64=[%#llx, %#llx] " + "s64=[%#llx, %#llx] u32=[%#x, %#x] s32=[%#x, %#x] var_off=(%#llx, %#llx)", + ctx, msg, reg->umin_value, reg->umax_value, + reg->smin_value, reg->smax_value, + reg->u32_min_value, reg->u32_max_value, + reg->s32_min_value, reg->s32_max_value, + reg->var_off.value, reg->var_off.mask); if (env->test_reg_invariants) return -EFAULT; __mark_reg_unbounded(reg); @@ -4741,7 +4739,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, if (regno >= 0) { reg = &func->regs[regno]; if (reg->type != SCALAR_VALUE) { - WARN_ONCE(1, "backtracing misuse"); + verifier_bug(env, "backtracking misuse"); return -EFAULT; } bt_set_reg(bt, regno); @@ -7213,7 +7211,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (env->ops->btf_struct_access && !type_is_alloc(reg->type) && atype == BPF_WRITE) { if (!btf_is_kernel(reg->btf)) { - verbose(env, "verifier internal error: reg->btf must be kernel btf\n"); + verifier_bug(env, "reg->btf must be kernel btf"); return -EFAULT; } ret = env->ops->btf_struct_access(&env->log, reg, off, size); @@ -7229,7 +7227,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) && !(reg->type & MEM_RCU) && !reg->ref_obj_id) { - verbose(env, "verifier internal error: ref_obj_id for allocated object must be non-zero\n"); + verifier_bug(env, "ref_obj_id for allocated object must be non-zero"); return -EFAULT; } @@ -8597,7 +8595,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*): */ if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) { - verbose(env, "verifier 
internal error: misconfigured dynptr helper type flags\n"); + verifier_bug(env, "misconfigured dynptr helper type flags"); return -EFAULT; } @@ -8963,8 +8961,8 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE && cur_iter->iter.state != BPF_ITER_STATE_DRAINED) { - verbose(env, "verifier internal error: unexpected iterator state %d (%s)\n", - cur_iter->iter.state, iter_state_str(cur_iter->iter.state)); + verifier_bug(env, "unexpected iterator state %d (%s)", + cur_iter->iter.state, iter_state_str(cur_iter->iter.state)); return -EFAULT; } @@ -8974,7 +8972,7 @@ static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx, */ if (!cur_st->parent || cur_st->parent->insn_idx != insn_idx || !same_callsites(cur_st->parent, cur_st)) { - verbose(env, "bug: bad parent state for iter next call"); + verifier_bug(env, "bad parent state for iter next call"); return -EFAULT; } /* Note cur_st->parent in the call below, it is necessary to skip @@ -9033,7 +9031,7 @@ static int resolve_map_arg_type(struct bpf_verifier_env *env, { if (!meta->map_ptr) { /* kernel subsystem misconfigured verifier */ - verbose(env, "invalid map_ptr to access map->type\n"); + verifier_bug(env, "invalid map_ptr to access map->type"); return -EFAULT; } @@ -9180,7 +9178,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, compatible = compatible_reg_types[base_type(arg_type)]; if (!compatible) { - verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type); + verifier_bug(env, "unsupported arg type %d", arg_type); return -EFAULT; } @@ -9262,7 +9260,7 @@ found: if (!arg_btf_id) { if (!compatible->btf_id) { - verbose(env, "verifier internal error: missing arg compatible BTF ID\n"); + verifier_bug(env, "missing arg compatible BTF ID"); return -EFAULT; } arg_btf_id = compatible->btf_id; @@ -9294,7 +9292,7 @@ found: case PTR_TO_BTF_ID | MEM_PERCPU | MEM_ALLOC: if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock && meta->func_id != BPF_FUNC_kptr_xchg) { - verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n"); + verifier_bug(env, "unimplemented handling of MEM_ALLOC"); return -EFAULT; } /* Check if local kptr in src arg matches kptr in dst arg */ @@ -9309,7 +9307,7 @@ found: /* Handled by helper specific checks */ break; default: - verbose(env, "verifier internal error: invalid PTR_TO_BTF_ID register for type match\n"); + verifier_bug(env, "invalid PTR_TO_BTF_ID register for type match"); return -EFAULT; } return 0; @@ -9667,7 +9665,7 @@ skip_type_check: return -EINVAL; } if (meta->release_regno) { - verbose(env, "verifier internal error: more than one release argument\n"); + verifier_bug(env, "more than one release argument"); return -EFAULT; } meta->release_regno = regno; @@ -9675,9 +9673,9 @@ skip_type_check: if (reg->ref_obj_id && base_type(arg_type) != ARG_KPTR_XCHG_DEST) { if (meta->ref_obj_id) { - verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", - regno, reg->ref_obj_id, - meta->ref_obj_id); + verifier_bug(env, "more than one arg with ref_obj_id R%d %u %u", + regno, reg->ref_obj_id, + meta->ref_obj_id); return -EFAULT; } meta->ref_obj_id = reg->ref_obj_id; @@ -9721,7 +9719,7 @@ skip_type_check: * we have to check map_key here. 
Otherwise it means * that kernel subsystem misconfigured verifier */ - verbose(env, "invalid map_ptr to access map->key\n"); + verifier_bug(env, "invalid map_ptr to access map->key"); return -EFAULT; } key_size = meta->map_ptr->key_size; @@ -9748,7 +9746,7 @@ skip_type_check: */ if (!meta->map_ptr) { /* kernel subsystem misconfigured verifier */ - verbose(env, "invalid map_ptr to access map->value\n"); + verifier_bug(env, "invalid map_ptr to access map->value"); return -EFAULT; } meta->raw_mode = arg_type & MEM_UNINIT; @@ -9778,7 +9776,7 @@ skip_type_check: if (err) return err; } else { - verbose(env, "verifier internal error\n"); + verifier_bug(env, "spin lock arg on unexpected helper"); return -EFAULT; } break; @@ -10936,8 +10934,8 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return -EINVAL; } if (!calls_callback(env, callee->callsite)) { - verbose(env, "BUG: in callback at %d, callsite %d !calls_callback\n", - *insn_idx, callee->callsite); + verifier_bug(env, "in callback at %d, callsite %d !calls_callback", + *insn_idx, callee->callsite); return -EFAULT; } } else { @@ -11044,7 +11042,7 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, return 0; if (map == NULL) { - verbose(env, "kernel subsystem misconfigured verifier\n"); + verifier_bug(env, "expected map for helper call"); return -EFAULT; } @@ -11083,7 +11081,7 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, if (func_id != BPF_FUNC_tail_call) return 0; if (!map || map->map_type != BPF_MAP_TYPE_PROG_ARRAY) { - verbose(env, "kernel subsystem misconfigured verifier\n"); + verifier_bug(env, "expected array map for tail call"); return -EFAULT; } @@ -11336,8 +11334,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn /* With LD_ABS/IND some JITs save/restore skb from r1. 
*/ changes_data = bpf_helper_changes_pkt_data(func_id); if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) { - verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n", - func_id_name(func_id), func_id); + verifier_bug(env, "func %s#%d: r1 != ctx", func_id_name(func_id), func_id); return -EFAULT; } @@ -11346,8 +11343,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn err = check_func_proto(fn, func_id); if (err) { - verbose(env, "kernel subsystem misconfigured func %s#%d\n", - func_id_name(func_id), func_id); + verifier_bug(env, "incorrect func proto %s#%d", func_id_name(func_id), func_id); return err; } @@ -11420,7 +11416,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn */ if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) { if (regs[meta.release_regno].type == CONST_PTR_TO_DYNPTR) { - verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be released\n"); + verifier_bug(env, "CONST_PTR_TO_DYNPTR cannot be released"); return -EFAULT; } err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]); @@ -11537,23 +11533,23 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (meta.dynptr_id) { - verbose(env, "verifier internal error: meta.dynptr_id already set\n"); + verifier_bug(env, "meta.dynptr_id already set"); return -EFAULT; } if (meta.ref_obj_id) { - verbose(env, "verifier internal error: meta.ref_obj_id already set\n"); + verifier_bug(env, "meta.ref_obj_id already set"); return -EFAULT; } id = dynptr_id(env, reg); if (id < 0) { - verbose(env, "verifier internal error: failed to obtain dynptr id\n"); + verifier_bug(env, "failed to obtain dynptr id"); return id; } ref_obj_id = dynptr_ref_obj_id(env, reg); if (ref_obj_id < 0) { - verbose(env, "verifier internal error: failed to obtain dynptr ref_obj_id\n"); + verifier_bug(env, "failed to obtain dynptr ref_obj_id"); return ref_obj_id; } @@ -11638,8 +11634,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn * to map element returned from bpf_map_lookup_elem() */ if (meta.map_ptr == NULL) { - verbose(env, - "kernel subsystem misconfigured verifier\n"); + verifier_bug(env, "unexpected null map_ptr"); return -EFAULT; } @@ -11730,9 +11725,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } } else { if (fn->ret_btf_id == BPF_PTR_POISON) { - verbose(env, "verifier internal error:"); - verbose(env, "func %s has non-overwritten BPF_PTR_POISON return type\n", - func_id_name(func_id)); + verifier_bug(env, "func %s has non-overwritten BPF_PTR_POISON return type", + func_id_name(func_id)); return -EFAULT; } ret_btf = btf_vmlinux; @@ -11758,8 +11752,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].id = ++env->id_gen; if (helper_multiple_ref_obj_use(func_id, meta.map_ptr)) { - verbose(env, "verifier internal error: func %s#%d sets ref_obj_id more than once\n", - func_id_name(func_id), func_id); + verifier_bug(env, "func %s#%d sets ref_obj_id more than once", + func_id_name(func_id), func_id); return -EFAULT; } @@ -12471,7 +12465,7 @@ static int process_irq_flag(struct bpf_verifier_env *env, int regno, if (meta->func_id == special_kfunc_list[KF_bpf_res_spin_unlock_irqrestore]) kfunc_class = IRQ_LOCK_KFUNC; } else { - verbose(env, "verifier internal error: unknown irq flags kfunc\n"); + verifier_bug(env, "unknown irq flags kfunc"); return -EFAULT; } @@ -12512,12 +12506,12 @@ static int 
ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state struct btf_record *rec = reg_btf_record(reg); if (!env->cur_state->active_locks) { - verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n"); + verifier_bug(env, "%s w/o active lock", __func__); return -EFAULT; } if (type_flag(reg->type) & NON_OWN_REF) { - verbose(env, "verifier internal error: NON_OWN_REF already set\n"); + verifier_bug(env, "NON_OWN_REF already set"); return -EFAULT; } @@ -12536,8 +12530,7 @@ static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_o int i; if (!ref_obj_id) { - verbose(env, "verifier internal error: ref_obj_id is zero for " - "owning -> non-owning conversion\n"); + verifier_bug(env, "ref_obj_id is zero for owning -> non-owning conversion"); return -EFAULT; } @@ -12557,7 +12550,7 @@ static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_o return 0; } - verbose(env, "verifier internal error: ref state missing for ref_obj_id\n"); + verifier_bug(env, "ref state missing for ref_obj_id"); return -EFAULT; } @@ -12619,7 +12612,7 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_ ptr = reg->btf; break; default: - verbose(env, "verifier internal error: unknown reg type for lock check\n"); + verifier_bug(env, "unknown reg type for lock check"); return -EFAULT; } id = reg->id; @@ -12780,7 +12773,7 @@ __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env, u32 head_off; if (meta->btf != btf_vmlinux) { - verbose(env, "verifier internal error: unexpected btf mismatch in kfunc call\n"); + verifier_bug(env, "unexpected btf mismatch in kfunc call"); return -EFAULT; } @@ -12811,7 +12804,7 @@ __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env, } if (*head_field) { - verbose(env, "verifier internal error: repeating %s arg\n", head_type_name); + verifier_bug(env, "repeating %s arg", head_type_name); return -EFAULT; } *head_field = field; @@ -12848,7 +12841,7 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env, u32 node_off; if (meta->btf != btf_vmlinux) { - verbose(env, "verifier internal error: unexpected btf mismatch in kfunc call\n"); + verifier_bug(env, "unexpected btf mismatch in kfunc call"); return -EFAULT; } @@ -12976,7 +12969,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (is_kfunc_arg_prog(btf, &args[i])) { /* Used to reject repeated use of __prog. 
*/ if (meta->arg_prog) { - verbose(env, "Only 1 prog->aux argument supported per-kfunc\n"); + verifier_bug(env, "Only 1 prog->aux argument supported per-kfunc"); return -EFAULT; } meta->arg_prog = true; @@ -12992,7 +12985,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (is_kfunc_arg_constant(meta->btf, &args[i])) { if (meta->arg_constant.found) { - verbose(env, "verifier internal error: only one constant argument permitted\n"); + verifier_bug(env, "only one constant argument permitted"); return -EFAULT; } if (!tnum_is_const(reg->var_off)) { @@ -13044,9 +13037,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (reg->ref_obj_id) { if (is_kfunc_release(meta) && meta->ref_obj_id) { - verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", - regno, reg->ref_obj_id, - meta->ref_obj_id); + verifier_bug(env, "more than one arg with ref_obj_id R%d %u %u", + regno, reg->ref_obj_id, + meta->ref_obj_id); return -EFAULT; } meta->ref_obj_id = reg->ref_obj_id; @@ -13126,7 +13119,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_RES_SPIN_LOCK: break; default: - WARN_ON_ONCE(1); + verifier_bug(env, "unknown kfunc arg type %d", kf_arg_type); return -EFAULT; } @@ -13195,14 +13188,14 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ enum bpf_dynptr_type parent_type = meta->initialized_dynptr.type; if (parent_type == BPF_DYNPTR_TYPE_INVALID) { - verbose(env, "verifier internal error: no dynptr type for parent of clone\n"); + verifier_bug(env, "no dynptr type for parent of clone"); return -EFAULT; } dynptr_arg_type |= (unsigned int)get_dynptr_type_flag(parent_type); clone_ref_obj_id = meta->initialized_dynptr.ref_obj_id; if (dynptr_type_refcounted(parent_type) && !clone_ref_obj_id) { - verbose(env, "verifier internal error: missing ref obj id for parent of clone\n"); + verifier_bug(env, "missing ref obj id for parent of clone"); return -EFAULT; } } @@ -13215,7 +13208,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ int id = dynptr_id(env, reg); if (id < 0) { - verbose(env, "verifier internal error: failed to obtain dynptr id\n"); + verifier_bug(env, "failed to obtain dynptr id"); return id; } meta->initialized_dynptr.id = id; @@ -13351,7 +13344,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (is_kfunc_arg_const_mem_size(meta->btf, size_arg, size_reg)) { if (meta->arg_constant.found) { - verbose(env, "verifier internal error: only one constant argument permitted\n"); + verifier_bug(env, "only one constant argument permitted"); return -EFAULT; } if (!tnum_is_const(size_reg->var_off)) { @@ -13383,7 +13376,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ rec = reg_btf_record(reg); if (!rec) { - verbose(env, "verifier internal error: Couldn't find btf_record\n"); + verifier_bug(env, "Couldn't find btf_record"); return -EFAULT; } @@ -13642,7 +13635,7 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca mark_reg_known_zero(env, regs, BPF_REG_0); if (!meta->arg_constant.found) { - verbose(env, "verifier internal error: bpf_dynptr_slice(_rdwr) no constant size\n"); + verifier_bug(env, "bpf_dynptr_slice(_rdwr) no constant size"); return -EFAULT; } @@ -13662,7 +13655,7 @@ static int check_special_kfunc(struct bpf_verifier_env *env, struct bpf_kfunc_ca } if 
(!meta->initialized_dynptr.id) { - verbose(env, "verifier internal error: no dynptr id\n"); + verifier_bug(env, "no dynptr id"); return -EFAULT; } regs[BPF_REG_0].dynptr_id = meta->initialized_dynptr.id; @@ -14306,8 +14299,7 @@ static int sanitize_err(struct bpf_verifier_env *env, dst, err); return -ENOMEM; default: - verbose(env, "verifier internal error: unknown reason (%d)\n", - reason); + verifier_bug(env, "unknown reason (%d)", reason); break; } @@ -16842,7 +16834,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) dst_reg->btf_id = aux->btf_var.btf_id; break; default: - verbose(env, "bpf verifier is misconfigured\n"); + verifier_bug(env, "pseudo btf id: unexpected dst reg type"); return -EFAULT; } return 0; @@ -16884,7 +16876,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) insn->src_reg == BPF_PSEUDO_MAP_IDX) { dst_reg->type = CONST_PTR_TO_MAP; } else { - verbose(env, "bpf verifier is misconfigured\n"); + verifier_bug(env, "unexpected src reg value for ldimm64"); return -EFAULT; } @@ -16931,7 +16923,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) } if (!env->ops->gen_ld_abs) { - verbose(env, "bpf verifier is misconfigured\n"); + verifier_bug(env, "gen_ld_abs is null"); return -EFAULT; } @@ -17342,7 +17334,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) /* forward- or cross-edge */ insn_state[t] = DISCOVERED | e; } else { - verbose(env, "insn state internal bug\n"); + verifier_bug(env, "insn state internal bug"); return -EFAULT; } return DONE_EXPLORING; @@ -17803,7 +17795,7 @@ walk_cfg: break; default: if (ret > 0) { - verbose(env, "visit_insn internal bug\n"); + verifier_bug(env, "visit_insn internal bug"); ret = -EFAULT; } goto err_free; @@ -17811,7 +17803,7 @@ walk_cfg: } if (env->cfg.cur_stack < 0) { - verbose(env, "pop stack internal bug\n"); + verifier_bug(env, "pop stack internal bug"); ret = -EFAULT; goto err_free; } @@ -19564,8 +19556,9 @@ miss: return err; } new->insn_idx = insn_idx; - WARN_ONCE(new->branches != 1, - "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx); + verifier_bug_if(new->branches != 1, env, + "%s:branches_to_explore=%d insn %d", + __func__, new->branches, insn_idx); err = maybe_enter_scc(env, new); if (err) { free_verifier_state(new, false); @@ -21136,7 +21129,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) epilogue_cnt = ops->gen_epilogue(epilogue_buf, env->prog, -(subprogs[0].stack_depth + 8)); if (epilogue_cnt >= INSN_BUF_SIZE) { - verbose(env, "bpf verifier is misconfigured\n"); + verifier_bug(env, "epilogue is too long"); return -EFAULT; } else if (epilogue_cnt) { /* Save the ARG_PTR_TO_CTX for the epilogue to use */ @@ -21159,13 +21152,13 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (ops->gen_prologue || env->seen_direct_write) { if (!ops->gen_prologue) { - verbose(env, "bpf verifier is misconfigured\n"); + verifier_bug(env, "gen_prologue is null"); return -EFAULT; } cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, env->prog); if (cnt >= INSN_BUF_SIZE) { - verbose(env, "bpf verifier is misconfigured\n"); + verifier_bug(env, "prologue is too long"); return -EFAULT; } else if (cnt) { new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); @@ -21349,7 +21342,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) u8 size_code; if (type == BPF_WRITE) { - verbose(env, "bpf verifier narrow ctx access misconfigured\n"); + verifier_bug(env, 
"narrow ctx access misconfigured"); return -EFAULT; } @@ -21368,7 +21361,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) &target_size); if (cnt == 0 || cnt >= INSN_BUF_SIZE || (ctx_field_size && !target_size)) { - verbose(env, "bpf verifier is misconfigured\n"); + verifier_bug(env, "error during ctx access conversion"); return -EFAULT; } @@ -21376,7 +21369,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) u8 shift = bpf_ctx_narrow_access_offset( off, size, size_default) * 8; if (shift && cnt + 1 >= INSN_BUF_SIZE) { - verbose(env, "bpf verifier narrow ctx load misconfigured\n"); + verifier_bug(env, "narrow ctx load misconfigured"); return -EFAULT; } if (ctx_field_size <= 4) { @@ -21807,8 +21800,8 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, */ desc = find_kfunc_desc(env->prog, insn->imm, insn->off); if (!desc) { - verbose(env, "verifier internal error: kernel function descriptor not found for func_id %u\n", - insn->imm); + verifier_bug(env, "kernel function descriptor not found for func_id %u", + insn->imm); return -EFAULT; } @@ -21823,8 +21816,8 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size; if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl] && kptr_struct_meta) { - verbose(env, "verifier internal error: NULL kptr_struct_meta expected at insn_idx %d\n", - insn_idx); + verifier_bug(env, "NULL kptr_struct_meta expected at insn_idx %d", + insn_idx); return -EFAULT; } @@ -21840,15 +21833,15 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) }; if (desc->func_id == special_kfunc_list[KF_bpf_percpu_obj_drop_impl] && kptr_struct_meta) { - verbose(env, "verifier internal error: NULL kptr_struct_meta expected at insn_idx %d\n", - insn_idx); + verifier_bug(env, "NULL kptr_struct_meta expected at insn_idx %d", + insn_idx); return -EFAULT; } if (desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl] && !kptr_struct_meta) { - verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n", - insn_idx); + verifier_bug(env, "kptr_struct_meta expected at insn_idx %d", + insn_idx); return -EFAULT; } @@ -21870,8 +21863,8 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } if (!kptr_struct_meta) { - verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n", - insn_idx); + verifier_bug(env, "kptr_struct_meta expected at insn_idx %d", + insn_idx); return -EFAULT; } @@ -21905,7 +21898,7 @@ static int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *pat /* We only reserve one slot for hidden subprogs in subprog_info. 
*/ if (env->hidden_subprog_cnt) { - verbose(env, "verifier internal error: only one hidden subprog supported\n"); + verifier_bug(env, "only one hidden subprog supported"); return -EFAULT; } /* We're not patching any existing instruction, just appending the new @@ -22141,7 +22134,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) BPF_MODE(insn->code) == BPF_IND)) { cnt = env->ops->gen_ld_abs(insn, insn_buf); if (cnt == 0 || cnt >= INSN_BUF_SIZE) { - verbose(env, "bpf verifier is misconfigured\n"); + verifier_bug(env, "%d insns generated for ld_abs", cnt); return -EFAULT; } @@ -22477,7 +22470,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) if (cnt == -EOPNOTSUPP) goto patch_map_ops_generic; if (cnt <= 0 || cnt >= INSN_BUF_SIZE) { - verbose(env, "bpf verifier is misconfigured\n"); + verifier_bug(env, "%d insns generated for map lookup", cnt); return -EFAULT; } @@ -22765,9 +22758,9 @@ patch_call_imm: * programs to call them, must be real in-kernel functions */ if (!fn->func) { - verbose(env, - "kernel subsystem misconfigured func %s#%d\n", - func_id_name(insn->imm), insn->imm); + verifier_bug(env, + "not inlined functions %s#%d is missing func", + func_id_name(insn->imm), insn->imm); return -EFAULT; } insn->imm = fn->func - __bpf_call_base; @@ -22837,7 +22830,7 @@ next_insn: if (!map_ptr->ops->map_poke_track || !map_ptr->ops->map_poke_untrack || !map_ptr->ops->map_poke_run) { - verbose(env, "bpf verifier is misconfigured\n"); + verifier_bug(env, "poke tab is misconfigured"); return -EFAULT; } @@ -23155,8 +23148,8 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) /* caller can pass either PTR_TO_ARENA or SCALAR */ mark_reg_unknown(env, regs, i); } else { - WARN_ONCE(1, "BUG: unhandled arg#%d type %d\n", - i - BPF_REG_1, arg->arg_type); + verifier_bug(env, "unhandled arg#%d type %d", + i - BPF_REG_1, arg->arg_type); ret = -EFAULT; goto out; } -- cgit v1.2.3 From f8242745871f81a3ac37f9f51853d12854fd0b58 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Tue, 1 Jul 2025 21:47:30 +0200 Subject: bpf: Reject %p% format string in bprintf-like helpers static const char fmt[] = "%p%"; bpf_trace_printk(fmt, sizeof(fmt)); The above BPF program isn't rejected and causes a kernel warning at runtime: Please remove unsupported %\x00 in format string WARNING: CPU: 1 PID: 7244 at lib/vsprintf.c:2680 format_decode+0x49c/0x5d0 This happens because bpf_bprintf_prepare skips over the second %, detected as punctuation, while processing %p. This patch fixes it by not skipping over punctuation. %\x00 is then processed in the next iteration and rejected. 
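The faulty control flow is small enough to show in isolation. A reduced,
hypothetical sketch of the format scanner (heavily simplified, not
bpf_bprintf_prepare itself) showing the fixed behaviour:

	#include <ctype.h>
	#include <stdbool.h>

	static bool fmt_ok(const char *fmt)
	{
		for (int i = 0; fmt[i]; i++) {
			if (fmt[i] != '%')
				continue;
			i++;				/* char after '%' */
			if (fmt[i] == 'p') {
				char nxt = fmt[i + 1];

				/* Bare %p (followed by NUL, space or
				 * punctuation such as '%') is accepted
				 * WITHOUT consuming the next character,
				 * so a trailing '%' is re-examined on the
				 * next iteration and rejected as "%\0"
				 * below. The old code advanced i here,
				 * swallowing the second '%'. */
				if (nxt == '\0' || isspace((unsigned char)nxt) ||
				    ispunct((unsigned char)nxt))
					continue;
				i++;		/* %pK, %px, %ps, ... */
				continue;
			}
			if (fmt[i] == '\0')
				return false;	/* "%\0" is invalid */
			/* other conversions elided */
		}
		return true;
	}

With "%p%", the trailing '%' is now left for the next loop iteration,
where it is seen as "%\0" and rejected instead of being silently skipped.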
Reported-by: syzbot+e2c932aec5c8a6e1d31c@syzkaller.appspotmail.com
Fixes: 48cac3f4a96d ("bpf: Implement formatted output helpers with bstr_printf")
Acked-by: Yonghong Song
Signed-off-by: Paul Chaignon
Link: https://lore.kernel.org/r/a0e06cc479faec9e802ae51ba5d66420523251ee.1751395489.git.paul.chaignon@gmail.com
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/helpers.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index b71e428ad936..ad6df48b540c 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -884,6 +884,13 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
		if (fmt[i] == 'p') {
			sizeof_cur_arg = sizeof(long);

+			if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) ||
+			    ispunct(fmt[i + 1])) {
+				if (tmp_buf)
+					cur_arg = raw_args[num_spec];
+				goto nocopy_fmt;
+			}
+
			if ((fmt[i + 1] == 'k' || fmt[i + 1] == 'u') &&
			    fmt[i + 2] == 's') {
				fmt_ptype = fmt[i + 1];
@@ -891,11 +898,9 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
				goto fmt_str;
			}

-			if (fmt[i + 1] == 0 || isspace(fmt[i + 1]) ||
-			    ispunct(fmt[i + 1]) || fmt[i + 1] == 'K' ||
+			if (fmt[i + 1] == 'K' ||
			    fmt[i + 1] == 'x' || fmt[i + 1] == 's' ||
			    fmt[i + 1] == 'S') {
-				/* just kernel pointers */
				if (tmp_buf)
					cur_arg = raw_args[num_spec];
				i++;
--
cgit v1.2.3


From ddb017ec9c3346e511f16dbae784a72eb91994dd Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)"
Date: Sat, 29 Mar 2025 15:39:59 +0900
Subject: tracing: probe-events: Cleanup entry-arg storing code

Clean up __store_entry_arg() so that it is easier to understand.

The main complexity comes from combining, in a single loop, the lookup
of an already-stored entry-arg, the search for the max offset, and the
appending of a new entry. Split those into 3 separate parts: look up
the same entry-arg, find the max offset, and append the new entry.

Link: https://lore.kernel.org/all/174323039929.348535.4705349977127704120.stgit@devnote2/

Signed-off-by: Masami Hiramatsu (Google)
---
 kernel/trace/trace_probe.c | 128 ++++++++++++++++++++++++---------------------
 1 file changed, 69 insertions(+), 59 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 424751cdf31f..abfab8957a6c 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -779,6 +779,36 @@ static int check_prepare_btf_string_fetch(char *typename,

 #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API

+static void store_entry_arg_at(struct fetch_insn *code, int argnum, int offset)
+{
+	code[0].op = FETCH_OP_ARG;
+	code[0].param = argnum;
+	code[1].op = FETCH_OP_ST_EDATA;
+	code[1].offset = offset;
+}
+
+static int get_entry_arg_max_offset(struct probe_entry_arg *earg)
+{
+	int i, max_offset = 0;
+
+	/*
+	 * earg->code[] array has an operation sequence which is run in
+	 * the entry handler.
+	 * The sequence stopped by FETCH_OP_END and each data stored in
+	 * the entry data buffer by FETCH_OP_ST_EDATA. The FETCH_OP_ST_EDATA
+	 * stores the data at the data buffer + its offset, and all data are
+	 * "unsigned long" size. The offset must be increased when a data is
+	 * stored. Thus we need to find the last FETCH_OP_ST_EDATA in the
+	 * code array.
+ */ + for (i = 0; i < earg->size - 1 && earg->code[i].op != FETCH_OP_END; i++) { + if (earg->code[i].op == FETCH_OP_ST_EDATA) + if (earg->code[i].offset > max_offset) + max_offset = earg->code[i].offset; + } + return max_offset; +} + /* * Add the entry code to store the 'argnum'th parameter and return the offset * in the entry data buffer where the data will be stored. @@ -786,8 +816,7 @@ static int check_prepare_btf_string_fetch(char *typename, static int __store_entry_arg(struct trace_probe *tp, int argnum) { struct probe_entry_arg *earg = tp->entry_arg; - bool match = false; - int i, offset; + int i, offset, last_offset = 0; if (!earg) { earg = kzalloc(sizeof(*tp->entry_arg), GFP_KERNEL); @@ -804,78 +833,59 @@ static int __store_entry_arg(struct trace_probe *tp, int argnum) for (i = 0; i < earg->size; i++) earg->code[i].op = FETCH_OP_END; tp->entry_arg = earg; + store_entry_arg_at(earg->code, argnum, 0); + return 0; } /* - * The entry code array is repeating the pair of - * [FETCH_OP_ARG(argnum)][FETCH_OP_ST_EDATA(offset of entry data buffer)] - * and the rest of entries are filled with [FETCH_OP_END]. + * NOTE: if anyone change the following rule, please rewrite this. + * The entry code array is filled with the pair of + * + * [FETCH_OP_ARG(argnum)] + * [FETCH_OP_ST_EDATA(offset of entry data buffer)] * - * To reduce the redundant function parameter fetching, we scan the entry - * code array to find the FETCH_OP_ARG which already fetches the 'argnum' - * parameter. If it doesn't match, update 'offset' to find the last - * offset. - * If we find the FETCH_OP_END without matching FETCH_OP_ARG entry, we - * will save the entry with FETCH_OP_ARG and FETCH_OP_ST_EDATA, and - * return data offset so that caller can find the data offset in the entry - * data buffer. + * and the rest of entries are filled with [FETCH_OP_END]. + * The offset should be incremented, thus the last pair should + * have the largest offset. */ - offset = 0; - for (i = 0; i < earg->size - 1; i++) { - switch (earg->code[i].op) { - case FETCH_OP_END: - earg->code[i].op = FETCH_OP_ARG; - earg->code[i].param = argnum; - earg->code[i + 1].op = FETCH_OP_ST_EDATA; - earg->code[i + 1].offset = offset; - return offset; - case FETCH_OP_ARG: - match = (earg->code[i].param == argnum); - break; - case FETCH_OP_ST_EDATA: - offset = earg->code[i].offset; - if (match) - return offset; - offset += sizeof(unsigned long); - break; - default: - break; - } + + /* Search the offset for the sprcified argnum. */ + for (i = 0; i < earg->size - 1 && earg->code[i].op != FETCH_OP_END; i += 2) { + if (WARN_ON_ONCE(earg->code[i].op != FETCH_OP_ARG)) + return -EINVAL; + + if (earg->code[i].param != argnum) + continue; + + if (WARN_ON_ONCE(earg->code[i + 1].op != FETCH_OP_ST_EDATA)) + return -EINVAL; + + return earg->code[i + 1].offset; } - return -ENOSPC; + /* Not found, append new entry if possible. */ + if (i >= earg->size - 1) + return -ENOSPC; + + /* The last entry must have the largest offset. */ + if (i != 0) { + if (WARN_ON_ONCE(earg->code[i - 1].op != FETCH_OP_ST_EDATA)) + return -EINVAL; + last_offset = earg->code[i - 1].offset; + } + + offset = last_offset + sizeof(unsigned long); + store_entry_arg_at(&earg->code[i], argnum, offset); + return offset; } int traceprobe_get_entry_data_size(struct trace_probe *tp) { struct probe_entry_arg *earg = tp->entry_arg; - int i, size = 0; if (!earg) return 0; - /* - * earg->code[] array has an operation sequence which is run in - * the entry handler. 
- * The sequence stopped by FETCH_OP_END and each data stored in - * the entry data buffer by FETCH_OP_ST_EDATA. The FETCH_OP_ST_EDATA - * stores the data at the data buffer + its offset, and all data are - * "unsigned long" size. The offset must be increased when a data is - * stored. Thus we need to find the last FETCH_OP_ST_EDATA in the - * code array. - */ - for (i = 0; i < earg->size; i++) { - switch (earg->code[i].op) { - case FETCH_OP_END: - goto out; - case FETCH_OP_ST_EDATA: - size = earg->code[i].offset + sizeof(unsigned long); - break; - default: - break; - } - } -out: - return size; + return get_entry_arg_max_offset(earg) + sizeof(unsigned long); } void store_trace_entry_data(void *edata, struct trace_probe *tp, struct pt_regs *regs) -- cgit v1.2.3 From c135ab4a96e3f65abf83621a0efe007e64f224cf Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 1 Apr 2025 00:35:53 +0900 Subject: tracing: tprobe-events: Remove mod field from tprobe-event Remove unneeded 'mod' struct module pointer field from trace_fprobe because we don't need to save this info. Link: https://lore.kernel.org/all/174343535351.843280.5868426549023332120.stgit@devnote2/ Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_fprobe.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index b40fa59159ac..017d6ebfdf8b 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -46,7 +46,6 @@ struct trace_fprobe { struct fprobe fp; const char *symbol; struct tracepoint *tpoint; - struct module *mod; struct trace_probe tp; }; @@ -426,7 +425,6 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group, const char *event, const char *symbol, struct tracepoint *tpoint, - struct module *mod, int nargs, bool is_return) { struct trace_fprobe *tf __free(free_trace_fprobe) = NULL; @@ -446,7 +444,6 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group, tf->fp.entry_handler = fentry_dispatcher; tf->tpoint = tpoint; - tf->mod = mod; ret = trace_probe_init(&tf->tp, event, group, false, nargs); if (ret < 0) @@ -776,7 +773,6 @@ static void __unregister_trace_fprobe(struct trace_fprobe *tf) tracepoint_probe_unregister(tf->tpoint, tf->tpoint->probestub, NULL); tf->tpoint = NULL; - tf->mod = NULL; } } } @@ -1001,23 +997,23 @@ static int __tracepoint_probe_module_cb(struct notifier_block *self, mutex_lock(&event_mutex); for_each_trace_fprobe(tf, pos) { + if (!trace_fprobe_is_tracepoint(tf)) + continue; if (val == MODULE_STATE_COMING && tf->tpoint == TRACEPOINT_STUB) { tpoint = find_tracepoint_in_module(tp_mod->mod, tf->symbol); if (tpoint) { tf->tpoint = tpoint; - tf->mod = tp_mod->mod; if (!WARN_ON_ONCE(__regsiter_tracepoint_fprobe(tf)) && trace_probe_is_enabled(&tf->tp)) reenable_trace_fprobe(tf); } - } else if (val == MODULE_STATE_GOING && tp_mod->mod == tf->mod) { + } else if (val == MODULE_STATE_GOING && + tf->tpoint != TRACEPOINT_STUB && + within_module((unsigned long)tf->tpoint->probestub, tp_mod->mod)) { unregister_fprobe(&tf->fp); - if (trace_fprobe_is_tracepoint(tf)) { - tracepoint_probe_unregister(tf->tpoint, - tf->tpoint->probestub, NULL); - tf->tpoint = TRACEPOINT_STUB; - tf->mod = NULL; - } + tracepoint_probe_unregister(tf->tpoint, + tf->tpoint->probestub, NULL); + tf->tpoint = TRACEPOINT_STUB; } } mutex_unlock(&event_mutex); @@ -1218,8 +1214,7 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], return ret; /* setup a 
probe */ - tf = alloc_trace_fprobe(group, event, symbol, tpoint, tp_mod, - argc, is_return); + tf = alloc_trace_fprobe(group, event, symbol, tpoint, argc, is_return); if (IS_ERR(tf)) { ret = PTR_ERR(tf); /* This must return -ENOMEM, else there is a bug */ -- cgit v1.2.3 From e3d6e1b9a34c745b635f122ac471a198867cd0ec Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 1 Apr 2025 00:36:02 +0900 Subject: tracing: tprobe-events: Support multiple tprobes on the same tracepoint Allow user to set multiple tracepoint-probe events on the same tracepoint. After the last tprobe-event is removed, the tracepoint callback is unregistered. Link: https://lore.kernel.org/all/174343536245.843280.6548776576601537671.stgit@devnote2/ Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_fprobe.c | 251 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 201 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index 017d6ebfdf8b..4a9ce7b0e084 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -21,7 +21,6 @@ #define FPROBE_EVENT_SYSTEM "fprobes" #define TRACEPOINT_EVENT_SYSTEM "tracepoints" #define RETHOOK_MAXACTIVE_MAX 4096 -#define TRACEPOINT_STUB ERR_PTR(-ENOENT) static int trace_fprobe_create(const char *raw_command); static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev); @@ -38,6 +37,89 @@ static struct dyn_event_operations trace_fprobe_ops = { .match = trace_fprobe_match, }; +struct tracepoint_user { + struct tracepoint *tpoint; + unsigned int refcount; +}; + +static bool tracepoint_user_is_registered(struct tracepoint_user *tuser) +{ + return tuser && tuser->tpoint; +} + +static int tracepoint_user_register(struct tracepoint_user *tuser) +{ + struct tracepoint *tpoint = tuser->tpoint; + + if (!tpoint) + return 0; + + return tracepoint_probe_register_prio_may_exist(tpoint, + tpoint->probestub, NULL, 0); +} + +static void tracepoint_user_unregister(struct tracepoint_user *tuser) +{ + if (!tuser->tpoint) + return; + + WARN_ON_ONCE(tracepoint_probe_unregister(tuser->tpoint, tuser->tpoint->probestub, NULL)); + tuser->tpoint = NULL; +} + +static unsigned long tracepoint_user_ip(struct tracepoint_user *tuser) +{ + if (!tuser->tpoint) + return 0UL; + + return (unsigned long)tuser->tpoint->probestub; +} + +static bool tracepoint_user_within_module(struct tracepoint_user *tuser, + struct module *mod) +{ + return within_module(tracepoint_user_ip(tuser), mod); +} + +static struct tracepoint_user *tracepoint_user_allocate(struct tracepoint *tpoint) +{ + struct tracepoint_user *tuser; + + tuser = kzalloc(sizeof(*tuser), GFP_KERNEL); + if (!tuser) + return NULL; + tuser->tpoint = tpoint; + tuser->refcount = 1; + + return tuser; +} + +/* These must be called with event_mutex */ +static void tracepoint_user_get(struct tracepoint_user *tuser) +{ + tuser->refcount++; +} + +static void tracepoint_user_put(struct tracepoint_user *tuser) +{ + if (--tuser->refcount > 0) + return; + + if (tracepoint_user_is_registered(tuser)) + tracepoint_user_unregister(tuser); + kfree(tuser); +} + +static const char *tracepoint_user_lookup(struct tracepoint_user *tuser, char *buf) +{ + struct tracepoint *tpoint = tuser->tpoint; + + if (!tpoint) + return NULL; + + return kallsyms_lookup((unsigned long)tpoint->probestub, NULL, NULL, NULL, buf); +} + /* * Fprobe event core functions */ @@ -45,7 +127,7 @@ struct trace_fprobe { struct dyn_event devent; struct fprobe fp; const char *symbol; - struct 
tracepoint *tpoint; + struct tracepoint_user *tuser; struct trace_probe tp; }; @@ -75,7 +157,7 @@ static bool trace_fprobe_is_return(struct trace_fprobe *tf) static bool trace_fprobe_is_tracepoint(struct trace_fprobe *tf) { - return tf->tpoint != NULL; + return tf->tuser != NULL; } static const char *trace_fprobe_symbol(struct trace_fprobe *tf) @@ -125,6 +207,56 @@ static bool trace_fprobe_is_registered(struct trace_fprobe *tf) return fprobe_is_registered(&tf->fp); } +static struct tracepoint *find_tracepoint(const char *tp_name, + struct module **tp_mod); + +/* + * Get tracepoint_user if exist, or allocate new one. If tracepoint is on a + * module, get its refcounter. + */ +static struct tracepoint_user * +trace_fprobe_get_tracepoint_user(const char *name, struct module **pmod) +{ + struct tracepoint_user *tuser __free(kfree) = NULL; + struct tracepoint *tpoint; + struct trace_fprobe *tf; + struct dyn_event *dpos; + struct module *mod __free(module_put) = NULL; + int ret; + + /* + * Find appropriate tracepoint and locking module. + * Note: tpoint can be NULL if it is unloaded (or failed to get module.) + */ + tpoint = find_tracepoint(name, &mod); + + /* Search existing tracepoint_user */ + for_each_trace_fprobe(tf, dpos) { + if (!trace_fprobe_is_tracepoint(tf)) + continue; + if (!strcmp(tf->symbol, name)) { + tracepoint_user_get(tf->tuser); + *pmod = no_free_ptr(mod); + return tf->tuser; + } + } + + /* Not found, allocate and register new tracepoint_user. */ + tuser = tracepoint_user_allocate(tpoint); + if (!tuser) + return NULL; + + if (tpoint) { + /* If the tracepoint is not loaded, tpoint can be NULL. */ + ret = tracepoint_user_register(tuser); + if (ret) + return ERR_PTR(ret); + } + + *pmod = no_free_ptr(mod); + return_ptr(tuser); +} + /* * Note that we don't verify the fetch_insn code, since it does not come * from user space. @@ -410,6 +542,8 @@ static void free_trace_fprobe(struct trace_fprobe *tf) { if (tf) { trace_probe_cleanup(&tf->tp); + if (tf->tuser) + tracepoint_user_put(tf->tuser); kfree(tf->symbol); kfree(tf); } @@ -424,7 +558,7 @@ DEFINE_FREE(free_trace_fprobe, struct trace_fprobe *, if (!IS_ERR_OR_NULL(_T)) f static struct trace_fprobe *alloc_trace_fprobe(const char *group, const char *event, const char *symbol, - struct tracepoint *tpoint, + struct tracepoint_user *tuser, int nargs, bool is_return) { struct trace_fprobe *tf __free(free_trace_fprobe) = NULL; @@ -443,7 +577,7 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group, else tf->fp.entry_handler = fentry_dispatcher; - tf->tpoint = tpoint; + tf->tuser = tuser; ret = trace_probe_init(&tf->tp, event, group, false, nargs); if (ret < 0) @@ -709,19 +843,11 @@ static int unregister_fprobe_event(struct trace_fprobe *tf) static int __regsiter_tracepoint_fprobe(struct trace_fprobe *tf) { - struct tracepoint *tpoint = tf->tpoint; - unsigned long ip = (unsigned long)tpoint->probestub; - int ret; + unsigned long ip = tracepoint_user_ip(tf->tuser); + + if (!ip) + return -ENOENT; - /* - * Here, we do 2 steps to enable fprobe on a tracepoint. - * At first, put __probestub_##TP function on the tracepoint - * and put a fprobe on the stub function. 
- */ - ret = tracepoint_probe_register_prio_may_exist(tpoint, - tpoint->probestub, NULL, 0); - if (ret < 0) - return ret; return register_fprobe_ips(&tf->fp, &ip, 1); } @@ -753,7 +879,7 @@ static int __register_trace_fprobe(struct trace_fprobe *tf) if (trace_fprobe_is_tracepoint(tf)) { /* This tracepoint is not loaded yet */ - if (tf->tpoint == TRACEPOINT_STUB) + if (!tracepoint_user_is_registered(tf->tuser)) return 0; return __regsiter_tracepoint_fprobe(tf); @@ -770,9 +896,8 @@ static void __unregister_trace_fprobe(struct trace_fprobe *tf) unregister_fprobe(&tf->fp); memset(&tf->fp, 0, sizeof(tf->fp)); if (trace_fprobe_is_tracepoint(tf)) { - tracepoint_probe_unregister(tf->tpoint, - tf->tpoint->probestub, NULL); - tf->tpoint = NULL; + tracepoint_user_put(tf->tuser); + tf->tuser = NULL; } } } @@ -988,7 +1113,7 @@ static int __tracepoint_probe_module_cb(struct notifier_block *self, unsigned long val, void *data) { struct tp_module *tp_mod = data; - struct tracepoint *tpoint; + struct tracepoint_user *tuser; struct trace_fprobe *tf; struct dyn_event *pos; @@ -999,21 +1124,46 @@ static int __tracepoint_probe_module_cb(struct notifier_block *self, for_each_trace_fprobe(tf, pos) { if (!trace_fprobe_is_tracepoint(tf)) continue; - if (val == MODULE_STATE_COMING && tf->tpoint == TRACEPOINT_STUB) { + + if (val == MODULE_STATE_COMING) { + /* + * If any tracepoint used by tprobe is in the module, + * register the stub. + */ + struct tracepoint *tpoint; + tpoint = find_tracepoint_in_module(tp_mod->mod, tf->symbol); - if (tpoint) { - tf->tpoint = tpoint; - if (!WARN_ON_ONCE(__regsiter_tracepoint_fprobe(tf)) && - trace_probe_is_enabled(&tf->tp)) - reenable_trace_fprobe(tf); + /* This is not a tracepoint in this module. Skip it. */ + if (!tpoint) + continue; + + tuser = tf->tuser; + /* If the tracepoint is not registered yet, register it. */ + if (!tracepoint_user_is_registered(tuser)) { + tuser->tpoint = tpoint; + if (WARN_ON_ONCE(tracepoint_user_register(tuser))) + continue; } - } else if (val == MODULE_STATE_GOING && - tf->tpoint != TRACEPOINT_STUB && - within_module((unsigned long)tf->tpoint->probestub, tp_mod->mod)) { - unregister_fprobe(&tf->fp); - tracepoint_probe_unregister(tf->tpoint, - tf->tpoint->probestub, NULL); - tf->tpoint = TRACEPOINT_STUB; + + /* Finally enable fprobe on this module. */ + if (!WARN_ON_ONCE(__regsiter_tracepoint_fprobe(tf)) && + trace_probe_is_enabled(&tf->tp)) + reenable_trace_fprobe(tf); + } else if (val == MODULE_STATE_GOING) { + tuser = tf->tuser; + /* Unregister all tracepoint_user in this module. */ + if (tracepoint_user_is_registered(tuser) && + tracepoint_user_within_module(tuser, tp_mod->mod)) + tracepoint_user_unregister(tuser); + + /* + * Here we need to handle shared tracepoint_user case. + * Such tuser is unregistered, but trace_fprobe itself + * is registered. (Note this only handles tprobes.) + */ + if (!tracepoint_user_is_registered(tuser) && + trace_fprobe_is_registered(tf)) + unregister_fprobe(&tf->fp); } } mutex_unlock(&event_mutex); @@ -1082,7 +1232,9 @@ static int parse_symbol_and_return(int argc, const char *argv[], return 0; } -DEFINE_FREE(module_put, struct module *, if (_T) module_put(_T)) +DEFINE_FREE(tuser_put, struct tracepoint_user *, + if (!IS_ERR_OR_NULL(_T)) + tracepoint_user_put(_T)) static int trace_fprobe_create_internal(int argc, const char *argv[], struct traceprobe_parse_context *ctx) @@ -1112,6 +1264,8 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], * FETCHARG:TYPE : use TYPE instead of unsigned long. 
*/ struct trace_fprobe *tf __free(free_trace_fprobe) = NULL; + struct tracepoint_user *tuser __free(tuser_put) = NULL; + struct module *mod __free(module_put) = NULL; int i, new_argc = 0, ret = 0; bool is_return = false; char *symbol __free(kfree) = NULL; @@ -1123,8 +1277,6 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], char abuf[MAX_BTF_ARGS_LEN]; char *dbuf __free(kfree) = NULL; bool is_tracepoint = false; - struct module *tp_mod __free(module_put) = NULL; - struct tracepoint *tpoint = NULL; if ((argv[0][0] != 'f' && argv[0][0] != 't') || argc < 2) return -ECANCELED; @@ -1177,20 +1329,18 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], if (is_tracepoint) { ctx->flags |= TPARG_FL_TPOINT; - tpoint = find_tracepoint(symbol, &tp_mod); - if (tpoint) { - ctx->funcname = kallsyms_lookup( - (unsigned long)tpoint->probestub, - NULL, NULL, NULL, sbuf); - } else if (IS_ENABLED(CONFIG_MODULES)) { - /* This *may* be loaded afterwards */ - tpoint = TRACEPOINT_STUB; - ctx->funcname = symbol; - } else { + tuser = trace_fprobe_get_tracepoint_user(symbol, &mod); + if (!tuser) + return -ENOMEM; + if (IS_ERR(tuser)) { trace_probe_log_set_index(1); trace_probe_log_err(0, NO_TRACEPOINT); - return -EINVAL; + return PTR_ERR(tuser); } + ctx->funcname = tracepoint_user_lookup(tuser, sbuf); + /* If tracepoint is not loaded yet, use symbol name as funcname. */ + if (!ctx->funcname) + ctx->funcname = symbol; } else ctx->funcname = symbol; @@ -1214,13 +1364,14 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], return ret; /* setup a probe */ - tf = alloc_trace_fprobe(group, event, symbol, tpoint, argc, is_return); + tf = alloc_trace_fprobe(group, event, symbol, tuser, argc, is_return); if (IS_ERR(tf)) { ret = PTR_ERR(tf); /* This must return -ENOMEM, else there is a bug */ WARN_ON_ONCE(ret != -ENOMEM); return ret; } + tuser = NULL; /* Move tuser to tf. */ /* parse arguments */ for (i = 0; i < argc; i++) { -- cgit v1.2.3 From 2db832ec9090d3b5f726f49ad4d0322d6b68a490 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 1 Apr 2025 00:36:11 +0900 Subject: tracing: fprobe-events: Register fprobe-events only when it is enabled Currently fprobe events are registered when it is defined. Thus it will give some overhead even if it is disabled. This changes it to register the fprobe only when it is enabled. Link: https://lore.kernel.org/all/174343537128.843280.16131300052837035043.stgit@devnote2/ Suggested-by: Steven Rostedt Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/fprobe.c | 5 + kernel/trace/trace_fprobe.c | 237 ++++++++++++++++++++++---------------------- 2 files changed, 121 insertions(+), 121 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index ba7ff14f5339..b78fce0982c7 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -648,6 +648,11 @@ static int fprobe_init(struct fprobe *fp, unsigned long *addrs, int num) #define FPROBE_IPS_MAX INT_MAX +int fprobe_count_ips_from_filter(const char *filter, const char *notfilter) +{ + return get_ips_from_filter(filter, notfilter, NULL, NULL, FPROBE_IPS_MAX); +} + /** * register_fprobe() - Register fprobe to ftrace by pattern. * @fp: A fprobe data structure to be registered. 
diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index 4a9ce7b0e084..79ffc1b2d519 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -600,98 +600,6 @@ static struct trace_fprobe *find_trace_fprobe(const char *event, return NULL; } -static inline int __enable_trace_fprobe(struct trace_fprobe *tf) -{ - if (trace_fprobe_is_registered(tf)) - enable_fprobe(&tf->fp); - - return 0; -} - -static void __disable_trace_fprobe(struct trace_probe *tp) -{ - struct trace_fprobe *tf; - - list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { - if (!trace_fprobe_is_registered(tf)) - continue; - disable_fprobe(&tf->fp); - } -} - -/* - * Enable trace_probe - * if the file is NULL, enable "perf" handler, or enable "trace" handler. - */ -static int enable_trace_fprobe(struct trace_event_call *call, - struct trace_event_file *file) -{ - struct trace_probe *tp; - struct trace_fprobe *tf; - bool enabled; - int ret = 0; - - tp = trace_probe_primary_from_call(call); - if (WARN_ON_ONCE(!tp)) - return -ENODEV; - enabled = trace_probe_is_enabled(tp); - - /* This also changes "enabled" state */ - if (file) { - ret = trace_probe_add_file(tp, file); - if (ret) - return ret; - } else - trace_probe_set_flag(tp, TP_FLAG_PROFILE); - - if (!enabled) { - list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { - /* TODO: check the fprobe is gone */ - __enable_trace_fprobe(tf); - } - } - - return 0; -} - -/* - * Disable trace_probe - * if the file is NULL, disable "perf" handler, or disable "trace" handler. - */ -static int disable_trace_fprobe(struct trace_event_call *call, - struct trace_event_file *file) -{ - struct trace_probe *tp; - - tp = trace_probe_primary_from_call(call); - if (WARN_ON_ONCE(!tp)) - return -ENODEV; - - if (file) { - if (!trace_probe_get_file_link(tp, file)) - return -ENOENT; - if (!trace_probe_has_single_file(tp)) - goto out; - trace_probe_clear_flag(tp, TP_FLAG_TRACE); - } else - trace_probe_clear_flag(tp, TP_FLAG_PROFILE); - - if (!trace_probe_is_enabled(tp)) - __disable_trace_fprobe(tp); - - out: - if (file) - /* - * Synchronization is done in below function. For perf event, - * file == NULL and perf_trace_event_unreg() calls - * tracepoint_synchronize_unregister() to ensure synchronize - * event. We don't need to care about it. - */ - trace_probe_remove_file(tp, file); - - return 0; -} - /* Event entry printers */ static enum print_line_t print_fentry_event(struct trace_iterator *iter, int flags, @@ -851,6 +759,29 @@ static int __regsiter_tracepoint_fprobe(struct trace_fprobe *tf) return register_fprobe_ips(&tf->fp, &ip, 1); } +/* Returns an error if the target function is not available, or 0 */ +static int trace_fprobe_verify_target(struct trace_fprobe *tf) +{ + int ret; + + if (trace_fprobe_is_tracepoint(tf)) { + + /* This tracepoint is not loaded yet */ + if (!tracepoint_user_is_registered(tf->tuser)) + return 0; + + /* We assume all stab function is tracable. */ + return tracepoint_user_ip(tf->tuser) ? 0 : -ENOENT; + } + + /* + * Note: since we don't lock the module, even if this succeeded, + * register_fprobe() later can fail. + */ + ret = fprobe_count_ips_from_filter(tf->symbol, NULL); + return (ret < 0) ? 
ret : 0; +} + /* Internal register function - just handle fprobe and flags */ static int __register_trace_fprobe(struct trace_fprobe *tf) { @@ -870,11 +801,7 @@ static int __register_trace_fprobe(struct trace_fprobe *tf) return ret; } - /* Set/clear disabled flag according to tp->flag */ - if (trace_probe_is_enabled(&tf->tp)) - tf->fp.flags &= ~FPROBE_FL_DISABLED; - else - tf->fp.flags |= FPROBE_FL_DISABLED; + tf->fp.flags &= ~FPROBE_FL_DISABLED; if (trace_fprobe_is_tracepoint(tf)) { @@ -895,10 +822,10 @@ static void __unregister_trace_fprobe(struct trace_fprobe *tf) if (trace_fprobe_is_registered(tf)) { unregister_fprobe(&tf->fp); memset(&tf->fp, 0, sizeof(tf->fp)); - if (trace_fprobe_is_tracepoint(tf)) { - tracepoint_user_put(tf->tuser); - tf->tuser = NULL; - } + } + if (trace_fprobe_is_tracepoint(tf)) { + tracepoint_user_put(tf->tuser); + tf->tuser = NULL; } } @@ -958,7 +885,7 @@ static bool trace_fprobe_has_same_fprobe(struct trace_fprobe *orig, return false; } -static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to) +static int append_trace_fprobe_event(struct trace_fprobe *tf, struct trace_fprobe *to) { int ret; @@ -986,7 +913,7 @@ static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to) if (ret) return ret; - ret = __register_trace_fprobe(tf); + ret = trace_fprobe_verify_target(tf); if (ret) trace_probe_unlink(&tf->tp); else @@ -995,8 +922,8 @@ static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to) return ret; } -/* Register a trace_probe and probe_event */ -static int register_trace_fprobe(struct trace_fprobe *tf) +/* Register a trace_probe and probe_event, and check the fprobe is available. */ +static int register_trace_fprobe_event(struct trace_fprobe *tf) { struct trace_fprobe *old_tf; int ret; @@ -1006,7 +933,7 @@ static int register_trace_fprobe(struct trace_fprobe *tf) old_tf = find_trace_fprobe(trace_probe_name(&tf->tp), trace_probe_group_name(&tf->tp)); if (old_tf) - return append_trace_fprobe(tf, old_tf); + return append_trace_fprobe_event(tf, old_tf); /* Register new event */ ret = register_fprobe_event(tf); @@ -1019,8 +946,8 @@ static int register_trace_fprobe(struct trace_fprobe *tf) return ret; } - /* Register fprobe */ - ret = __register_trace_fprobe(tf); + /* Verify fprobe is sane. */ + ret = trace_fprobe_verify_target(tf); if (ret < 0) unregister_fprobe_event(tf); else @@ -1084,15 +1011,6 @@ static struct tracepoint *find_tracepoint(const char *tp_name, } #ifdef CONFIG_MODULES -static void reenable_trace_fprobe(struct trace_fprobe *tf) -{ - struct trace_probe *tp = &tf->tp; - - list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { - __enable_trace_fprobe(tf); - } -} - /* * Find a tracepoint from specified module. In this case, this does not get the * module's refcount. The caller must ensure the module is not freed. @@ -1146,9 +1064,8 @@ static int __tracepoint_probe_module_cb(struct notifier_block *self, } /* Finally enable fprobe on this module. */ - if (!WARN_ON_ONCE(__regsiter_tracepoint_fprobe(tf)) && - trace_probe_is_enabled(&tf->tp)) - reenable_trace_fprobe(tf); + if (trace_probe_is_enabled(&tf->tp) && !trace_fprobe_is_registered(tf)) + WARN_ON_ONCE(__regsiter_tracepoint_fprobe(tf)); } else if (val == MODULE_STATE_GOING) { tuser = tf->tuser; /* Unregister all tracepoint_user in this module. 
*/ @@ -1397,7 +1314,7 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], if (ret < 0) return ret; - ret = register_trace_fprobe(tf); + ret = register_trace_fprobe_event(tf); if (ret) { trace_probe_log_set_index(1); if (ret == -EILSEQ) @@ -1466,6 +1383,84 @@ static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev) return 0; } +/* + * Enable trace_probe + * if the file is NULL, enable "perf" handler, or enable "trace" handler. + */ +static int enable_trace_fprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_probe *tp; + struct trace_fprobe *tf; + bool enabled; + int ret = 0; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + enabled = trace_probe_is_enabled(tp); + + /* This also changes "enabled" state */ + if (file) { + ret = trace_probe_add_file(tp, file); + if (ret) + return ret; + } else + trace_probe_set_flag(tp, TP_FLAG_PROFILE); + + if (!enabled) { + list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { + ret = __register_trace_fprobe(tf); + if (ret < 0) + return ret; + } + } + + return 0; +} + +/* + * Disable trace_probe + * if the file is NULL, disable "perf" handler, or disable "trace" handler. + */ +static int disable_trace_fprobe(struct trace_event_call *call, + struct trace_event_file *file) +{ + struct trace_fprobe *tf; + struct trace_probe *tp; + + tp = trace_probe_primary_from_call(call); + if (WARN_ON_ONCE(!tp)) + return -ENODEV; + + if (file) { + if (!trace_probe_get_file_link(tp, file)) + return -ENOENT; + if (!trace_probe_has_single_file(tp)) + goto out; + trace_probe_clear_flag(tp, TP_FLAG_TRACE); + } else + trace_probe_clear_flag(tp, TP_FLAG_PROFILE); + + if (!trace_probe_is_enabled(tp)) { + list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { + unregister_fprobe(&tf->fp); + } + } + + out: + if (file) + /* + * Synchronization is done in below function. For perf event, + * file == NULL and perf_trace_event_unreg() calls + * tracepoint_synchronize_unregister() to ensure synchronize + * event. We don't need to care about it. + */ + trace_probe_remove_file(tp, file); + + return 0; +} + /* * called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex. */ -- cgit v1.2.3 From 2867495dea86324e984fbafa09474bf92534b652 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 1 Apr 2025 00:36:29 +0900 Subject: tracing: tprobe-events: Register tracepoint when enable tprobe event As same as fprobe, register tracepoint stub function only when enabling tprobe events. The major changes are introducing a list of tracepoint_user and its lock, and tprobe_event_module_nb, which is another module notifier for module loading/unloading. By spliting the lock from event_mutex and a module notifier for trace_fprobe, it solved AB-BA lock dependency issue between event_mutex and tracepoint_module_list_mutex. 
Link: https://lore.kernel.org/all/174343538901.843280.423773753642677941.stgit@devnote2/ Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_fprobe.c | 382 +++++++++++++++++++++++++------------------- 1 file changed, 217 insertions(+), 165 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index 79ffc1b2d519..dbf9d413125a 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -7,7 +7,9 @@ #include #include +#include #include +#include #include #include #include @@ -37,15 +39,21 @@ static struct dyn_event_operations trace_fprobe_ops = { .match = trace_fprobe_match, }; +/* List of tracepoint_user */ +static LIST_HEAD(tracepoint_user_list); +static DEFINE_MUTEX(tracepoint_user_mutex); + +/* While living tracepoint_user, @tpoint can be NULL and @refcount != 0. */ struct tracepoint_user { + struct list_head list; + const char *name; struct tracepoint *tpoint; unsigned int refcount; }; -static bool tracepoint_user_is_registered(struct tracepoint_user *tuser) -{ - return tuser && tuser->tpoint; -} +/* NOTE: you must lock tracepoint_user_mutex. */ +#define for_each_tracepoint_user(tuser) \ + list_for_each_entry(tuser, &tracepoint_user_list, list) static int tracepoint_user_register(struct tracepoint_user *tuser) { @@ -75,58 +83,111 @@ static unsigned long tracepoint_user_ip(struct tracepoint_user *tuser) return (unsigned long)tuser->tpoint->probestub; } -static bool tracepoint_user_within_module(struct tracepoint_user *tuser, - struct module *mod) +static void __tracepoint_user_free(struct tracepoint_user *tuser) { - return within_module(tracepoint_user_ip(tuser), mod); + if (!tuser) + return; + kfree(tuser->name); + kfree(tuser); } -static struct tracepoint_user *tracepoint_user_allocate(struct tracepoint *tpoint) +DEFINE_FREE(tuser_free, struct tracepoint_user *, __tracepoint_user_free(_T)) + +static struct tracepoint_user *__tracepoint_user_init(const char *name, struct tracepoint *tpoint) { - struct tracepoint_user *tuser; + struct tracepoint_user *tuser __free(tuser_free) = NULL; + int ret; tuser = kzalloc(sizeof(*tuser), GFP_KERNEL); if (!tuser) return NULL; + tuser->name = kstrdup(name, GFP_KERNEL); + if (!tuser->name) + return NULL; + + if (tpoint) { + ret = tracepoint_user_register(tuser); + if (ret) + return ERR_PTR(ret); + } + tuser->tpoint = tpoint; tuser->refcount = 1; + INIT_LIST_HEAD(&tuser->list); + list_add(&tuser->list, &tracepoint_user_list); - return tuser; + return_ptr(tuser); } -/* These must be called with event_mutex */ -static void tracepoint_user_get(struct tracepoint_user *tuser) -{ - tuser->refcount++; -} +static struct tracepoint *find_tracepoint(const char *tp_name, + struct module **tp_mod); -static void tracepoint_user_put(struct tracepoint_user *tuser) +/* + * Get tracepoint_user if exist, or allocate new one and register it. + * If tracepoint is on a module, get its refcounter too. + * This returns errno or NULL (not loaded yet) or tracepoint_user. + */ +static struct tracepoint_user *tracepoint_user_find_get(const char *name, struct module **pmod) { - if (--tuser->refcount > 0) - return; + struct module *mod __free(module_put) = NULL; + struct tracepoint_user *tuser; + struct tracepoint *tpoint; - if (tracepoint_user_is_registered(tuser)) - tracepoint_user_unregister(tuser); - kfree(tuser); + if (!name || !pmod) + return ERR_PTR(-EINVAL); + + /* Get and lock the module which has tracepoint. 
*/ + tpoint = find_tracepoint(name, &mod); + + guard(mutex)(&tracepoint_user_mutex); + /* Search existing tracepoint_user */ + for_each_tracepoint_user(tuser) { + if (!strcmp(tuser->name, name)) { + tuser->refcount++; + *pmod = no_free_ptr(mod); + return tuser; + } + } + + /* The corresponding tracepoint_user is not found. */ + tuser = __tracepoint_user_init(name, tpoint); + if (!IS_ERR_OR_NULL(tuser)) + *pmod = no_free_ptr(mod); + + return tuser; } -static const char *tracepoint_user_lookup(struct tracepoint_user *tuser, char *buf) +static void tracepoint_user_put(struct tracepoint_user *tuser) { - struct tracepoint *tpoint = tuser->tpoint; + scoped_guard(mutex, &tracepoint_user_mutex) { + if (--tuser->refcount > 0) + return; - if (!tpoint) - return NULL; + list_del(&tuser->list); + tracepoint_user_unregister(tuser); + } - return kallsyms_lookup((unsigned long)tpoint->probestub, NULL, NULL, NULL, buf); + __tracepoint_user_free(tuser); } +DEFINE_FREE(tuser_put, struct tracepoint_user *, + if (!IS_ERR_OR_NULL(_T)) + tracepoint_user_put(_T)) + /* * Fprobe event core functions */ + +/* + * @tprobe is true for tracepoint probe. + * @tuser can be NULL if the trace_fprobe is disabled or the tracepoint is not + * loaded with a module. If @tuser != NULL, this trace_fprobe is enabled. + */ struct trace_fprobe { struct dyn_event devent; struct fprobe fp; const char *symbol; + bool tprobe; struct tracepoint_user *tuser; struct trace_probe tp; }; @@ -157,7 +218,7 @@ static bool trace_fprobe_is_return(struct trace_fprobe *tf) static bool trace_fprobe_is_tracepoint(struct trace_fprobe *tf) { - return tf->tuser != NULL; + return tf->tprobe; } static const char *trace_fprobe_symbol(struct trace_fprobe *tf) @@ -207,56 +268,6 @@ static bool trace_fprobe_is_registered(struct trace_fprobe *tf) return fprobe_is_registered(&tf->fp); } -static struct tracepoint *find_tracepoint(const char *tp_name, - struct module **tp_mod); - -/* - * Get tracepoint_user if exist, or allocate new one. If tracepoint is on a - * module, get its refcounter. - */ -static struct tracepoint_user * -trace_fprobe_get_tracepoint_user(const char *name, struct module **pmod) -{ - struct tracepoint_user *tuser __free(kfree) = NULL; - struct tracepoint *tpoint; - struct trace_fprobe *tf; - struct dyn_event *dpos; - struct module *mod __free(module_put) = NULL; - int ret; - - /* - * Find appropriate tracepoint and locking module. - * Note: tpoint can be NULL if it is unloaded (or failed to get module.) - */ - tpoint = find_tracepoint(name, &mod); - - /* Search existing tracepoint_user */ - for_each_trace_fprobe(tf, dpos) { - if (!trace_fprobe_is_tracepoint(tf)) - continue; - if (!strcmp(tf->symbol, name)) { - tracepoint_user_get(tf->tuser); - *pmod = no_free_ptr(mod); - return tf->tuser; - } - } - - /* Not found, allocate and register new tracepoint_user. */ - tuser = tracepoint_user_allocate(tpoint); - if (!tuser) - return NULL; - - if (tpoint) { - /* If the tracepoint is not loaded, tpoint can be NULL. */ - ret = tracepoint_user_register(tuser); - if (ret) - return ERR_PTR(ret); - } - - *pmod = no_free_ptr(mod); - return_ptr(tuser); -} - /* * Note that we don't verify the fetch_insn code, since it does not come * from user space. 
@@ -558,8 +569,8 @@ DEFINE_FREE(free_trace_fprobe, struct trace_fprobe *, if (!IS_ERR_OR_NULL(_T)) f static struct trace_fprobe *alloc_trace_fprobe(const char *group, const char *event, const char *symbol, - struct tracepoint_user *tuser, - int nargs, bool is_return) + int nargs, bool is_return, + bool is_tracepoint) { struct trace_fprobe *tf __free(free_trace_fprobe) = NULL; int ret = -ENOMEM; @@ -577,7 +588,7 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group, else tf->fp.entry_handler = fentry_dispatcher; - tf->tuser = tuser; + tf->tprobe = is_tracepoint; ret = trace_probe_init(&tf->tp, event, group, false, nargs); if (ret < 0) @@ -751,12 +762,35 @@ static int unregister_fprobe_event(struct trace_fprobe *tf) static int __regsiter_tracepoint_fprobe(struct trace_fprobe *tf) { - unsigned long ip = tracepoint_user_ip(tf->tuser); + struct tracepoint_user *tuser __free(tuser_put) = NULL; + struct module *mod __free(module_put) = NULL; + unsigned long ip; + int ret; - if (!ip) - return -ENOENT; + if (WARN_ON_ONCE(tf->tuser)) + return -EINVAL; + + /* If the tracepoint is in a module, it must be locked in this function. */ + tuser = tracepoint_user_find_get(tf->symbol, &mod); + /* This tracepoint is not loaded yet */ + if (IS_ERR(tuser)) + return PTR_ERR(tuser); + if (!tuser) + return -ENOMEM; - return register_fprobe_ips(&tf->fp, &ip, 1); + /* Register fprobe only if the tracepoint is loaded. */ + if (tuser->tpoint) { + ip = tracepoint_user_ip(tuser); + if (WARN_ON_ONCE(!ip)) + return -ENOENT; + + ret = register_fprobe_ips(&tf->fp, &ip, 1); + if (ret < 0) + return ret; + } + + tf->tuser = no_free_ptr(tuser); + return 0; } /* Returns an error if the target function is not available, or 0 */ @@ -764,15 +798,9 @@ static int trace_fprobe_verify_target(struct trace_fprobe *tf) { int ret; - if (trace_fprobe_is_tracepoint(tf)) { - - /* This tracepoint is not loaded yet */ - if (!tracepoint_user_is_registered(tf->tuser)) - return 0; - - /* We assume all stab function is tracable. */ - return tracepoint_user_ip(tf->tuser) ? 0 : -ENOENT; - } + /* Tracepoint should have a stub function. */ + if (trace_fprobe_is_tracepoint(tf)) + return 0; /* * Note: since we don't lock the module, even if this succeeded, @@ -803,14 +831,8 @@ static int __register_trace_fprobe(struct trace_fprobe *tf) tf->fp.flags &= ~FPROBE_FL_DISABLED; - if (trace_fprobe_is_tracepoint(tf)) { - - /* This tracepoint is not loaded yet */ - if (!tracepoint_user_is_registered(tf->tuser)) - return 0; - + if (trace_fprobe_is_tracepoint(tf)) return __regsiter_tracepoint_fprobe(tf); - } /* TODO: handle filter, nofilter or symbol list */ return register_fprobe(&tf->fp, tf->symbol, NULL); @@ -819,11 +841,9 @@ static int __register_trace_fprobe(struct trace_fprobe *tf) /* Internal unregister function - just handle fprobe and flags */ static void __unregister_trace_fprobe(struct trace_fprobe *tf) { - if (trace_fprobe_is_registered(tf)) { + if (trace_fprobe_is_registered(tf)) unregister_fprobe(&tf->fp); - memset(&tf->fp, 0, sizeof(tf->fp)); - } - if (trace_fprobe_is_tracepoint(tf)) { + if (tf->tuser) { tracepoint_user_put(tf->tuser); tf->tuser = NULL; } @@ -1027,63 +1047,52 @@ static struct tracepoint *find_tracepoint_in_module(struct module *mod, return data.tpoint; } +/* These are CONFIG_MODULES=y specific functions. 
*/ +static bool tracepoint_user_within_module(struct tracepoint_user *tuser, + struct module *mod) +{ + return within_module(tracepoint_user_ip(tuser), mod); +} + +static int tracepoint_user_register_again(struct tracepoint_user *tuser, + struct tracepoint *tpoint) +{ + tuser->tpoint = tpoint; + return tracepoint_user_register(tuser); +} + +static void tracepoint_user_unregister_clear(struct tracepoint_user *tuser) +{ + tracepoint_user_unregister(tuser); + tuser->tpoint = NULL; +} + +/* module callback for tracepoint_user */ static int __tracepoint_probe_module_cb(struct notifier_block *self, unsigned long val, void *data) { struct tp_module *tp_mod = data; struct tracepoint_user *tuser; - struct trace_fprobe *tf; - struct dyn_event *pos; + struct tracepoint *tpoint; if (val != MODULE_STATE_GOING && val != MODULE_STATE_COMING) return NOTIFY_DONE; - mutex_lock(&event_mutex); - for_each_trace_fprobe(tf, pos) { - if (!trace_fprobe_is_tracepoint(tf)) - continue; - + mutex_lock(&tracepoint_user_mutex); + for_each_tracepoint_user(tuser) { if (val == MODULE_STATE_COMING) { - /* - * If any tracepoint used by tprobe is in the module, - * register the stub. - */ - struct tracepoint *tpoint; - - tpoint = find_tracepoint_in_module(tp_mod->mod, tf->symbol); /* This is not a tracepoint in this module. Skip it. */ + tpoint = find_tracepoint_in_module(tp_mod->mod, tuser->name); if (!tpoint) continue; - - tuser = tf->tuser; - /* If the tracepoint is not registered yet, register it. */ - if (!tracepoint_user_is_registered(tuser)) { - tuser->tpoint = tpoint; - if (WARN_ON_ONCE(tracepoint_user_register(tuser))) - continue; - } - - /* Finally enable fprobe on this module. */ - if (trace_probe_is_enabled(&tf->tp) && !trace_fprobe_is_registered(tf)) - WARN_ON_ONCE(__regsiter_tracepoint_fprobe(tf)); - } else if (val == MODULE_STATE_GOING) { - tuser = tf->tuser; + WARN_ON_ONCE(tracepoint_user_register_again(tuser, tpoint)); + } else if (val == MODULE_STATE_GOING && + tracepoint_user_within_module(tuser, tp_mod->mod)) { /* Unregister all tracepoint_user in this module. */ - if (tracepoint_user_is_registered(tuser) && - tracepoint_user_within_module(tuser, tp_mod->mod)) - tracepoint_user_unregister(tuser); - - /* - * Here we need to handle shared tracepoint_user case. - * Such tuser is unregistered, but trace_fprobe itself - * is registered. (Note this only handles tprobes.) - */ - if (!tracepoint_user_is_registered(tuser) && - trace_fprobe_is_registered(tf)) - unregister_fprobe(&tf->fp); + tracepoint_user_unregister_clear(tuser); } } - mutex_unlock(&event_mutex); + mutex_unlock(&tracepoint_user_mutex); return NOTIFY_DONE; } @@ -1091,6 +1100,54 @@ static int __tracepoint_probe_module_cb(struct notifier_block *self, static struct notifier_block tracepoint_module_nb = { .notifier_call = __tracepoint_probe_module_cb, }; + +/* module callback for tprobe events */ +static int __tprobe_event_module_cb(struct notifier_block *self, + unsigned long val, void *data) +{ + struct trace_fprobe *tf; + struct dyn_event *pos; + struct module *mod = data; + + if (val != MODULE_STATE_GOING && val != MODULE_STATE_COMING) + return NOTIFY_DONE; + + mutex_lock(&event_mutex); + for_each_trace_fprobe(tf, pos) { + /* Skip fprobe and disabled tprobe events. */ + if (!trace_fprobe_is_tracepoint(tf) || !tf->tuser) + continue; + + /* Before this notification, tracepoint notifier has already done. 
*/ + if (val == MODULE_STATE_COMING && + tracepoint_user_within_module(tf->tuser, mod)) { + unsigned long ip = tracepoint_user_ip(tf->tuser); + + WARN_ON_ONCE(register_fprobe_ips(&tf->fp, &ip, 1)); + } else if (val == MODULE_STATE_GOING && + /* + * tracepoint_user_within_module() does not work here because + * tracepoint_user is already unregistered and cleared tpoint. + * Instead, checking whether the fprobe is registered but + * tpoint is cleared(unregistered). Such unbalance probes + * must be adjusted anyway. + */ + trace_fprobe_is_registered(tf) && + !tf->tuser->tpoint) { + unregister_fprobe(&tf->fp); + } + } + mutex_unlock(&event_mutex); + + return NOTIFY_DONE; +} + +/* NOTE: this must be called after tracepoint callback */ +static struct notifier_block tprobe_event_module_nb = { + .notifier_call = __tprobe_event_module_cb, + /* Make sure this is later than tracepoint module notifier. */ + .priority = -10, +}; #endif /* CONFIG_MODULES */ static int parse_symbol_and_return(int argc, const char *argv[], @@ -1149,10 +1206,6 @@ static int parse_symbol_and_return(int argc, const char *argv[], return 0; } -DEFINE_FREE(tuser_put, struct tracepoint_user *, - if (!IS_ERR_OR_NULL(_T)) - tracepoint_user_put(_T)) - static int trace_fprobe_create_internal(int argc, const char *argv[], struct traceprobe_parse_context *ctx) { @@ -1181,7 +1234,6 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], * FETCHARG:TYPE : use TYPE instead of unsigned long. */ struct trace_fprobe *tf __free(free_trace_fprobe) = NULL; - struct tracepoint_user *tuser __free(tuser_put) = NULL; struct module *mod __free(module_put) = NULL; int i, new_argc = 0, ret = 0; bool is_return = false; @@ -1244,21 +1296,19 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], else ctx->flags |= TPARG_FL_FENTRY; + ctx->funcname = NULL; if (is_tracepoint) { + /* Get tracepoint and lock its module until the end of the registration. */ + struct tracepoint *tpoint; + ctx->flags |= TPARG_FL_TPOINT; - tuser = trace_fprobe_get_tracepoint_user(symbol, &mod); - if (!tuser) - return -ENOMEM; - if (IS_ERR(tuser)) { - trace_probe_log_set_index(1); - trace_probe_log_err(0, NO_TRACEPOINT); - return PTR_ERR(tuser); - } - ctx->funcname = tracepoint_user_lookup(tuser, sbuf); - /* If tracepoint is not loaded yet, use symbol name as funcname. */ - if (!ctx->funcname) - ctx->funcname = symbol; - } else + mod = NULL; + tpoint = find_tracepoint(symbol, &mod); + if (tpoint) + ctx->funcname = kallsyms_lookup((unsigned long)tpoint->probestub, + NULL, NULL, NULL, sbuf); + } + if (!ctx->funcname) ctx->funcname = symbol; argc -= 2; argv += 2; @@ -1281,14 +1331,13 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], return ret; /* setup a probe */ - tf = alloc_trace_fprobe(group, event, symbol, tuser, argc, is_return); + tf = alloc_trace_fprobe(group, event, symbol, argc, is_return, is_tracepoint); if (IS_ERR(tf)) { ret = PTR_ERR(tf); /* This must return -ENOMEM, else there is a bug */ WARN_ON_ONCE(ret != -ENOMEM); return ret; } - tuser = NULL; /* Move tuser to tf. 
*/ /* parse arguments */ for (i = 0; i < argc; i++) { @@ -1506,6 +1555,9 @@ static __init int init_fprobe_trace_early(void) ret = register_tracepoint_module_notifier(&tracepoint_module_nb); if (ret) return ret; + ret = register_module_notifier(&tprobe_event_module_nb); + if (ret) + return ret; #endif return 0; -- cgit v1.2.3 From b95ee9049c93b05da66599d6e0d1fcfea291360b Mon Sep 17 00:00:00 2001 From: Song Liu Date: Sun, 22 Jun 2025 23:38:52 -0700 Subject: bpf: Introduce bpf_cgroup_read_xattr to read xattr of cgroup's node BPF programs, such as LSM and sched_ext, would benefit from tags on cgroups. One common practice to apply such tags is to set xattrs on cgroupfs folders. Introduce kfunc bpf_cgroup_read_xattr, which allows reading cgroup's xattr. Note that, we already have bpf_get_[file|dentry]_xattr. However, these two APIs are not ideal for reading cgroupfs xattrs, because: 1) These two APIs only works in sleepable contexts; 2) There is no kfunc that matches current cgroup to cgroupfs dentry. bpf_cgroup_read_xattr is generic and can be useful for many program types. It is also safe, because it requires trusted or rcu protected argument (KF_RCU). Therefore, we make it available to all program types. Signed-off-by: Song Liu Link: https://lore.kernel.org/20250623063854.1896364-3-song@kernel.org Acked-by: Tejun Heo Signed-off-by: Christian Brauner --- kernel/bpf/helpers.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index b71e428ad936..9ff1b4090289 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3397,6 +3397,9 @@ BTF_ID_FLAGS(func, bpf_iter_dmabuf_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPAB BTF_ID_FLAGS(func, bpf_iter_dmabuf_destroy, KF_ITER_DESTROY | KF_SLEEPABLE) #endif BTF_ID_FLAGS(func, __bpf_trap) +#ifdef CONFIG_CGROUPS +BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) +#endif BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { -- cgit v1.2.3 From 5bc9557c9f17c38ed8c9070c1d369d49eabea1a9 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Sun, 22 Jun 2025 23:38:53 -0700 Subject: bpf: Mark cgroup_subsys_state->cgroup RCU safe Mark struct cgroup_subsys_state->cgroup as safe under RCU read lock. This will enable accessing css->cgroup from a bpf css iterator. 
Signed-off-by: Song Liu Link: https://lore.kernel.org/20250623063854.1896364-4-song@kernel.org Acked-by: Tejun Heo Signed-off-by: Christian Brauner --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a7d6e0c5928b..db01bf51c792 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6993,6 +6993,10 @@ BTF_TYPE_SAFE_RCU(struct css_set) { struct cgroup *dfl_cgrp; }; +BTF_TYPE_SAFE_RCU(struct cgroup_subsys_state) { + struct cgroup *cgroup; +}; + /* RCU trusted: these fields are trusted in RCU CS and can be NULL */ BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct) { struct file __rcu *exe_file; @@ -7043,6 +7047,7 @@ static bool type_is_rcu(struct bpf_verifier_env *env, BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct task_struct)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct cgroup)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct css_set)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct cgroup_subsys_state)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu"); } -- cgit v1.2.3 From e0e9506523fea415e0d5abaa103fd67dc8a39696 Mon Sep 17 00:00:00 2001 From: "Yury Norov [NVIDIA]" Date: Sun, 22 Jun 2025 20:00:09 -0400 Subject: smp: Defer check for local execution in smp_call_function_many_cond() Defer check for local execution to the actual place where it is needed, which removes the extra local variable. Signed-off-by: Yury Norov [NVIDIA] Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250623000010.10124-5-yury.norov@gmail.com --- kernel/smp.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 5871acf3cd45..99d1fd0e9e0e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -779,7 +779,6 @@ static void smp_call_function_many_cond(const struct cpumask *mask, bool wait = scf_flags & SCF_WAIT; int nr_cpus = 0; bool run_remote = false; - bool run_local = false; lockdep_assert_preemption_disabled(); @@ -801,11 +800,6 @@ static void smp_call_function_many_cond(const struct cpumask *mask, */ WARN_ON_ONCE(!in_task()); - /* Check if we need local execution. */ - if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask) && - (!cond_func || cond_func(this_cpu, info))) - run_local = true; - /* Check if we need remote execution, i.e., any CPU excluding this one. */ if (cpumask_any_and_but(mask, cpu_online_mask, this_cpu) < nr_cpu_ids) { run_remote = true; @@ -851,7 +845,9 @@ static void smp_call_function_many_cond(const struct cpumask *mask, send_call_function_ipi_mask(cfd->cpumask_ipi); } - if (run_local) { + /* Check if we need local execution. */ + if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask) && + (!cond_func || cond_func(this_cpu, info))) { unsigned long flags; local_irq_save(flags); -- cgit v1.2.3 From c3b9faac9bd690263e03b78eb78c75cae5ff7509 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Wed, 2 Jul 2025 00:36:19 -0700 Subject: bpf: avoid jump misprediction for PTR_TO_MEM | PTR_UNTRUSTED Commit f2362a57aeff ("bpf: allow void* cast using bpf_rdonly_cast()") added a notion of PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED type. This simultaneously introduced a bug in jump prediction logic for situations like below: p = bpf_rdonly_cast(..., 0); if (p) a(); else b(); Here verifier would wrongly predict that else branch cannot be taken. This happens because: - Function reg_not_null() assumes that PTR_TO_MEM w/o PTR_MAYBE_NULL flag cannot be zero. 
- The call to bpf_rdonly_cast() returns a rdonly_untrusted_mem value w/o PTR_MAYBE_NULL flag. Tracking of PTR_MAYBE_NULL flag for untrusted PTR_TO_MEM does not make sense, as the main purpose of the flag is to catch null pointer access errors. Such errors are not possible on load of PTR_UNTRUSTED values and verifier makes sure that PTR_UNTRUSTED can't be passed to helpers or kfuncs. Hence, modify reg_not_null() to assume that nullness of untrusted PTR_TO_MEM is not known. Fixes: f2362a57aeff ("bpf: allow void* cast using bpf_rdonly_cast()") Suggested-by: Alexei Starovoitov Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250702073620.897517-1-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a352b35be479..525a21dbcce3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -410,7 +410,7 @@ static bool reg_not_null(const struct bpf_reg_state *reg) type == PTR_TO_MAP_KEY || type == PTR_TO_SOCK_COMMON || (type == PTR_TO_BTF_ID && is_trusted_reg(reg)) || - type == PTR_TO_MEM || + (type == PTR_TO_MEM && !(reg->type & PTR_UNTRUSTED)) || type == CONST_PTR_TO_MAP; } -- cgit v1.2.3 From 65fdafd6765f677494ab0cb35b8237eb742a18e5 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Wed, 2 Jul 2025 15:54:54 +0200 Subject: bpf: Avoid warning on multiple referenced args in call The description of full helper calls in syzkaller [1] and the addition of kernel warnings in commit 0df1a55afa83 ("bpf: Warn on internal verifier errors") allowed syzbot to reach a verifier state that was thought to indicate a verifier bug [2]: 12: (85) call bpf_tcp_raw_gen_syncookie_ipv4#204 verifier bug: more than one arg with ref_obj_id R2 2 2 This error can be reproduced with the program from the previous commit: 0: (b7) r2 = 20 1: (b7) r3 = 0 2: (18) r1 = 0xffff92cee3cbc600 4: (85) call bpf_ringbuf_reserve#131 5: (55) if r0 == 0x0 goto pc+3 6: (bf) r1 = r0 7: (bf) r2 = r0 8: (85) call bpf_tcp_raw_gen_syncookie_ipv4#204 9: (95) exit bpf_tcp_raw_gen_syncookie_ipv4 expects R1 and R2 to be ARG_PTR_TO_FIXED_SIZE_MEM (with a size of at least sizeof(struct iphdr) for R1). R0 is a ring buffer payload of 20B and therefore matches this requirement. The verifier reaches the check on ref_obj_id while verifying R2 and rejects the program because the helper isn't supposed to take two referenced arguments. This case is a legitimate rejection and doesn't indicate a kernel bug, so we shouldn't log it as such and shouldn't emit a kernel warning. 
Link: https://github.com/google/syzkaller/pull/4313 [1] Link: https://lore.kernel.org/all/686491d6.a70a0220.3b7e22.20ea.GAE@google.com/T/ [2] Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Fixes: 0df1a55afa83 ("bpf: Warn on internal verifier errors") Reported-by: syzbot+69014a227f8edad4d8c6@syzkaller.appspotmail.com Signed-off-by: Paul Chaignon Link: https://lore.kernel.org/r/cd09afbfd7bef10bbc432d72693f78ffdc1e8ee5.1751463262.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko --- kernel/bpf/verifier.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 525a21dbcce3..52e36fd23f40 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9673,10 +9673,10 @@ skip_type_check: if (reg->ref_obj_id && base_type(arg_type) != ARG_KPTR_XCHG_DEST) { if (meta->ref_obj_id) { - verifier_bug(env, "more than one arg with ref_obj_id R%d %u %u", - regno, reg->ref_obj_id, - meta->ref_obj_id); - return -EFAULT; + verbose(env, "more than one arg with ref_obj_id R%d %u %u", + regno, reg->ref_obj_id, + meta->ref_obj_id); + return -EACCES; } meta->ref_obj_id = reg->ref_obj_id; } -- cgit v1.2.3 From ba677dbe77af5ffe6204e0f3f547f3ba059c6302 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Jul 2025 18:21:44 +0200 Subject: perf: Revert to requiring CAP_SYS_ADMIN for uprobes Jann reports that uprobes can be used destructively when used in the middle of an instruction. The kernel only verifies there is a valid instruction at the requested offset, but due to variable instruction length cannot determine if this is an instruction as seen by the intended execution stream. Additionally, Mark Rutland notes that on architectures that mix data in the text segment (like arm64), a similar things can be done if the data word is 'mistaken' for an instruction. As such, require CAP_SYS_ADMIN for uprobes. Fixes: c9e0924e5c2b ("perf/core: open access to probes for CAP_PERFMON privileged process") Reported-by: Jann Horn Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/CAG48ez1n4520sq0XrWYDHKiKxE_+WCfAK+qt9qkY4ZiBGmL-5g@mail.gmail.com --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index bf2118c22126..0db36b2b2448 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11116,7 +11116,7 @@ static int perf_uprobe_event_init(struct perf_event *event) if (event->attr.type != perf_uprobe.type) return -ENOENT; - if (!perfmon_capable()) + if (!capable(CAP_SYS_ADMIN)) return -EACCES; /* -- cgit v1.2.3 From 5b605dbee07dda8fd538af1f07cbf1baf0a49cbc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Jul 2025 15:26:58 +0200 Subject: timekeeping: Provide ktime_get_clock_ts64() PTP implements an inline switch case for taking timestamps from various POSIX clock IDs, which already consumes quite some text space. Expanding it for auxiliary clocks really becomes too big for inlining. Provide a out of line version. The function invalidates the timestamp in case the clock is invalid. The invalidation allows to implement a validation check without the need to propagate a return value through deep existing call chains. 
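As a rough illustration of the intended call-site pattern, a hypothetical driver-side helper could look like the sketch below (not part of this patch; it assumes the declaration is exposed to drivers via linux/timekeeping.h as part of this series):

  #include <linux/ktime.h>
  #include <linux/timekeeping.h>

  /* Hypothetical helper: returns false if the requested clock is unavailable. */
  static bool example_read_clock(clockid_t id, struct timespec64 *ts)
  {
  	ktime_get_clock_ts64(id, ts);

  	/* Unsupported or unavailable clock IDs come back invalidated. */
  	return ts->tv_sec != -1;
  }

The error information stays in the timestamp itself, which is what avoids propagating a return value through the existing call chains.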
Due to merge logistics this temporarily defines CLOCK_AUX[_LAST] if undefined, so that the plain branch, which does not contain any of the core timekeeper changes, can be pulled into the networking tree as prerequisite for the PTP side changes. These temporary defines are removed after that branch is merged into the tip::timers/ptp branch. That way the result in -next or upstream in the next merge window has zero dependencies. Signed-off-by: Thomas Gleixner Reviewed-by: Vadim Fedorenko Acked-by: John Stultz Link: https://lore.kernel.org/all/20250701132628.357686408@linutronix.de --- kernel/time/timekeeping.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index a009c91f7b05..572e3bd0cc94 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1573,6 +1573,39 @@ void ktime_get_raw_ts64(struct timespec64 *ts) } EXPORT_SYMBOL(ktime_get_raw_ts64); +/** + * ktime_get_clock_ts64 - Returns time of a clock in a timespec + * @id: POSIX clock ID of the clock to read + * @ts: Pointer to the timespec64 to be set + * + * The timestamp is invalidated (@ts->sec is set to -1) if the + * clock @id is not available. + */ +void ktime_get_clock_ts64(clockid_t id, struct timespec64 *ts) +{ + /* Invalidate time stamp */ + ts->tv_sec = -1; + ts->tv_nsec = 0; + + switch (id) { + case CLOCK_REALTIME: + ktime_get_real_ts64(ts); + return; + case CLOCK_MONOTONIC: + ktime_get_ts64(ts); + return; + case CLOCK_MONOTONIC_RAW: + ktime_get_raw_ts64(ts); + return; + case CLOCK_AUX ... CLOCK_AUX_LAST: + if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) + ktime_get_aux_ts64(id, ts); + return; + default: + WARN_ON_ONCE(1); + } +} +EXPORT_SYMBOL_GPL(ktime_get_clock_ts64); /** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres -- cgit v1.2.3 From 858e65af91351f8842bbe2c5ae6f100778783f42 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 26 Jun 2025 16:48:58 +0200 Subject: irqdomain: Add device pointer to irq_domain_info and msi_domain_info Add device pointer to irq_domain_info and msi_domain_info, so that the device can be specified at domain creation time. 
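For illustration, a driver could now hand its device over when creating the domain, roughly as sketched below (hypothetical code, not part of this patch; apart from .dev the fields are just a typical irq_domain_info setup):

  #include <linux/err.h>
  #include <linux/irqdomain.h>
  #include <linux/platform_device.h>
  #include <linux/property.h>

  static int example_create_domain(struct platform_device *pdev,
  				 const struct irq_domain_ops *ops,
  				 void *host_data)
  {
  	struct irq_domain_info info = {
  		.fwnode		= dev_fwnode(&pdev->dev),
  		.size		= 32,		/* illustrative hwirq count */
  		.hwirq_max	= 32,
  		.ops		= ops,
  		.host_data	= host_data,
  		.dev		= &pdev->dev,	/* new: known at creation time */
  	};
  	struct irq_domain *domain = irq_domain_instantiate(&info);

  	return IS_ERR(domain) ? PTR_ERR(domain) : 0;
  }

MSI domains get the same treatment via msi_domain_info::dev, which msi_create_device_irq_domain() now fills from its device argument as seen in the diff below.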
Signed-off-by: Thomas Gleixner Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/943e52403b20cf13c320d55bd4446b4562466aab.1750860131.git.namcao@linutronix.de --- kernel/irq/irqdomain.c | 1 + kernel/irq/msi.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index c8b6de09047b..4afbd3ac532f 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -317,6 +317,7 @@ static struct irq_domain *__irq_domain_instantiate(const struct irq_domain_info domain->flags |= info->domain_flags; domain->exit = info->exit; + domain->dev = info->dev; #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY if (info->parent) { diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 9febe797a5f6..9b09ad3f9914 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -889,6 +889,7 @@ static struct irq_domain *__msi_create_irq_domain(struct fwnode_handle *fwnode, if (domain) { irq_domain_update_bus_token(domain, info->bus_token); + domain->dev = info->dev; if (info->flags & MSI_FLAG_PARENT_PM_DEV) domain->pm_dev = parent->pm_dev; } @@ -1051,6 +1052,7 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, bundle->info.data = domain_data; bundle->info.chip_data = chip_data; bundle->info.alloc_data = &bundle->alloc_info; + bundle->info.dev = dev; pops = parent->msi_parent_ops; snprintf(bundle->name, sizeof(bundle->name), "%s%s-%s", @@ -1089,7 +1091,6 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, if (!domain) return false; - domain->dev = dev; dev->msi.data->__domains[domid].domain = domain; if (msi_domain_prepare_irqs(domain, dev, hwsize, &bundle->alloc_info)) { -- cgit v1.2.3 From 4266e8fa56d3d982bf451d382a410b9db432015c Mon Sep 17 00:00:00 2001 From: tuhaowen Date: Wed, 11 Jun 2025 11:23:45 +0800 Subject: PM: sleep: console: Fix the black screen issue When the computer enters sleep status without a monitor connected, the system switches the console to the virtual terminal tty63(SUSPEND_CONSOLE). If a monitor is subsequently connected before waking up, the system skips the required VT restoration process during wake-up, leaving the console on tty63 instead of switching back to tty1. To fix this issue, a global flag vt_switch_done is introduced to record whether the system has successfully switched to the suspend console via vt_move_to_console() during suspend. If the switch was completed, vt_switch_done is set to 1. Later during resume, this flag is checked to ensure that the original console is restored properly by calling vt_move_to_console(orig_fgconsole, 0). This prevents scenarios where the resume logic skips console restoration due to incorrect detection of the console state, especially when a monitor is reconnected before waking up. Signed-off-by: tuhaowen Link: https://patch.msgid.link/20250611032345.29962-1-tuhaowen@uniontech.com Signed-off-by: Rafael J. 
Wysocki --- kernel/power/console.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/console.c b/kernel/power/console.c index fcdf0e14a47d..19c48aa5355d 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -16,6 +16,7 @@ #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) static int orig_fgconsole, orig_kmsg; +static bool vt_switch_done; static DEFINE_MUTEX(vt_switch_mutex); @@ -136,17 +137,21 @@ void pm_prepare_console(void) if (orig_fgconsole < 0) return; + vt_switch_done = true; + orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE); return; } void pm_restore_console(void) { - if (!pm_vt_switch()) + if (!pm_vt_switch() && !vt_switch_done) return; if (orig_fgconsole >= 0) { vt_move_to_console(orig_fgconsole, 0); vt_kmsg_redirect(orig_kmsg); } + + vt_switch_done = false; } -- cgit v1.2.3 From 5fc5d8fded57e5de0c0b56b00c4cfc522aa8c572 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Wed, 2 Jul 2025 14:03:08 -0700 Subject: bpf: Add bpf_dynptr_memset() kfunc Currently there is no straightforward way to fill dynptr memory with a value (most commonly zero). One can do it with bpf_dynptr_write(), but a temporary buffer is necessary for that. Implement bpf_dynptr_memset() - an analogue of memset() from libc. Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250702210309.3115903-2-isolodrai@meta.com --- kernel/bpf/helpers.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index f48fa3fe8dec..5269381d6d3d 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2907,6 +2907,52 @@ __bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, return 0; } +/** + * bpf_dynptr_memset() - Fill dynptr memory with a constant byte. + * @p: Destination dynptr - where data will be filled + * @offset: Offset into the dynptr to start filling from + * @size: Number of bytes to fill + * @val: Constant byte to fill the memory with + * + * Fills the @size bytes of the memory area pointed to by @p + * at @offset with the constant byte @val. + * Returns 0 on success; negative error, otherwise. 
+ */ + __bpf_kfunc int bpf_dynptr_memset(struct bpf_dynptr *p, u32 offset, u32 size, u8 val) + { + struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p; + u32 chunk_sz, write_off; + char buf[256]; + void* slice; + int err; + + slice = bpf_dynptr_slice_rdwr(p, offset, NULL, size); + if (likely(slice)) { + memset(slice, val, size); + return 0; + } + + if (__bpf_dynptr_is_rdonly(ptr)) + return -EINVAL; + + err = bpf_dynptr_check_off_len(ptr, offset, size); + if (err) + return err; + + /* Non-linear data under the dynptr, write from a local buffer */ + chunk_sz = min_t(u32, sizeof(buf), size); + memset(buf, val, chunk_sz); + + for (write_off = 0; write_off < size; write_off += chunk_sz) { + chunk_sz = min_t(u32, sizeof(buf), size - write_off); + err = __bpf_dynptr_write(ptr, offset + write_off, buf, chunk_sz, 0); + if (err) + return err; + } + + return 0; +} + __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj) { return obj; @@ -3735,6 +3781,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) BTF_ID_FLAGS(func, bpf_dynptr_copy) +BTF_ID_FLAGS(func, bpf_dynptr_memset) #ifdef CONFIG_NET BTF_ID_FLAGS(func, bpf_modify_return_test_tp) #endif -- cgit v1.2.3 From 803f0700a3bbf528c4c624a22f87d08178ca0fbe Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Wed, 2 Jul 2025 23:39:56 +0800 Subject: bpf: Show precise link_type for {uprobe,kprobe}_multi fdinfo Alexei suggested, 'link_type' can be more precise and differentiate for human in fdinfo. In fact BPF_LINK_TYPE_KPROBE_MULTI includes kretprobe_multi type, the same as BPF_LINK_TYPE_UPROBE_MULTI, so we can show it more concretely. link_type: kprobe_multi link_id: 1 prog_tag: d2b307e915f0dd37 ... link_type: kretprobe_multi link_id: 2 prog_tag: ab9ea0545870781d ... link_type: uprobe_multi link_id: 9 prog_tag: e729f789e34a8eca ... link_type: uretprobe_multi link_id: 10 prog_tag: 7db356c03e61a4d4 Co-developed-by: Jiri Olsa Signed-off-by: Jiri Olsa Signed-off-by: Tao Chen Link: https://lore.kernel.org/r/20250702153958.639852-1-chen.dylane@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 9 ++++++++- kernel/trace/bpf_trace.c | 10 ++++------ 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 56500381c28a..f1d9ee9717a1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3228,7 +3228,14 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; if (type < ARRAY_SIZE(bpf_link_type_strs) && bpf_link_type_strs[type]) { - seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]); + if (link->type == BPF_LINK_TYPE_KPROBE_MULTI) + seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_KPROBE_MULTI_RETURN ? + "kretprobe_multi" : "kprobe_multi"); + else if (link->type == BPF_LINK_TYPE_UPROBE_MULTI) + seq_printf(m, "link_type:\t%s\n", link->flags == BPF_F_UPROBE_MULTI_RETURN ? + "uretprobe_multi" : "uprobe_multi"); + else + seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]); } else { WARN_ONCE(1, "missing BPF_LINK_TYPE(...) 
for link type %u\n", type); seq_printf(m, "link_type:\t<%u>\n", type); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0a06ea6638fe..81d7a4e5ae15 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2466,7 +2466,6 @@ struct bpf_kprobe_multi_link { u32 cnt; u32 mods_cnt; struct module **mods; - u32 flags; }; struct bpf_kprobe_multi_run_ctx { @@ -2586,7 +2585,7 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); info->kprobe_multi.count = kmulti_link->cnt; - info->kprobe_multi.flags = kmulti_link->flags; + info->kprobe_multi.flags = kmulti_link->link.flags; info->kprobe_multi.missed = kmulti_link->fp.nmissed; if (!uaddrs) @@ -2976,7 +2975,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr link->addrs = addrs; link->cookies = cookies; link->cnt = cnt; - link->flags = flags; + link->link.flags = flags; if (cookies) { /* @@ -3045,7 +3044,6 @@ struct bpf_uprobe_multi_link { struct path path; struct bpf_link link; u32 cnt; - u32 flags; struct bpf_uprobe *uprobes; struct task_struct *task; }; @@ -3109,7 +3107,7 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); info->uprobe_multi.count = umulti_link->cnt; - info->uprobe_multi.flags = umulti_link->flags; + info->uprobe_multi.flags = umulti_link->link.flags; info->uprobe_multi.pid = umulti_link->task ? task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0; @@ -3369,7 +3367,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr link->uprobes = uprobes; link->path = path; link->task = task; - link->flags = flags; + link->link.flags = flags; bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI, &bpf_uprobe_multi_link_lops, prog); -- cgit v1.2.3 From b4dfe26fbf56f7b3e50764268e14b73f1632bfc0 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Wed, 2 Jul 2025 23:39:57 +0800 Subject: bpf: Add show_fdinfo for uprobe_multi Show uprobe_multi link info with fdinfo, the info as follows: link_type: uprobe_multi link_id: 9 prog_tag: e729f789e34a8eca prog_id: 39 uprobe_cnt: 3 pid: 0 path: /home/dylane/bpf/tools/testing/selftests/bpf/test_progs cookie offset ref_ctr_offset 3 0xa69f13 0x0 1 0xa69f1e 0x0 2 0xa69f29 0x0 Signed-off-by: Tao Chen Link: https://lore.kernel.org/r/20250702153958.639852-2-chen.dylane@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 81d7a4e5ae15..b2737642c42a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -3152,10 +3152,54 @@ static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link, return err; } +#ifdef CONFIG_PROC_FS +static void bpf_uprobe_multi_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_uprobe_multi_link *umulti_link; + char *p, *buf; + pid_t pid; + + umulti_link = container_of(link, struct bpf_uprobe_multi_link, link); + + buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!buf) + return; + + p = d_path(&umulti_link->path, buf, PATH_MAX); + if (IS_ERR(p)) { + kfree(buf); + return; + } + + pid = umulti_link->task ? 
+ task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0; + seq_printf(seq, + "uprobe_cnt:\t%u\n" + "pid:\t%u\n" + "path:\t%s\n", + umulti_link->cnt, pid, p); + + seq_printf(seq, "%s\t %s\t %s\n", "cookie", "offset", "ref_ctr_offset"); + for (int i = 0; i < umulti_link->cnt; i++) { + seq_printf(seq, + "%llu\t %#llx\t %#lx\n", + umulti_link->uprobes[i].cookie, + umulti_link->uprobes[i].offset, + umulti_link->uprobes[i].ref_ctr_offset); + } + + kfree(buf); +} +#endif + static const struct bpf_link_ops bpf_uprobe_multi_link_lops = { .release = bpf_uprobe_multi_link_release, .dealloc_deferred = bpf_uprobe_multi_link_dealloc, .fill_link_info = bpf_uprobe_multi_link_fill_link_info, +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_uprobe_multi_show_fdinfo, +#endif }; static int uprobe_prog_run(struct bpf_uprobe *uprobe, -- cgit v1.2.3 From da7e9c0a7fbc5b8552575db16c991e82b7c5fa5c Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Wed, 2 Jul 2025 23:39:58 +0800 Subject: bpf: Add show_fdinfo for kprobe_multi Show kprobe_multi link info with fdinfo, the info as follows: link_type: kprobe_multi link_id: 1 prog_tag: a69740b9746f7da8 prog_id: 21 kprobe_cnt: 8 missed: 0 cookie func 1 bpf_fentry_test1+0x0/0x20 7 bpf_fentry_test2+0x0/0x20 2 bpf_fentry_test3+0x0/0x20 3 bpf_fentry_test4+0x0/0x20 4 bpf_fentry_test5+0x0/0x20 5 bpf_fentry_test6+0x0/0x20 6 bpf_fentry_test7+0x0/0x20 8 bpf_fentry_test8+0x0/0x10 Signed-off-by: Tao Chen Link: https://lore.kernel.org/r/20250702153958.639852-3-chen.dylane@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b2737642c42a..e7f97a9a8bbd 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2619,10 +2619,37 @@ static int bpf_kprobe_multi_link_fill_link_info(const struct bpf_link *link, return err; } +#ifdef CONFIG_PROC_FS +static void bpf_kprobe_multi_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_kprobe_multi_link *kmulti_link; + + kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link); + + seq_printf(seq, + "kprobe_cnt:\t%u\n" + "missed:\t%lu\n", + kmulti_link->cnt, + kmulti_link->fp.nmissed); + + seq_printf(seq, "%s\t %s\n", "cookie", "func"); + for (int i = 0; i < kmulti_link->cnt; i++) { + seq_printf(seq, + "%llu\t %pS\n", + kmulti_link->cookies[i], + (void *)kmulti_link->addrs[i]); + } +} +#endif + static const struct bpf_link_ops bpf_kprobe_multi_link_lops = { .release = bpf_kprobe_multi_link_release, .dealloc_deferred = bpf_kprobe_multi_link_dealloc, .fill_link_info = bpf_kprobe_multi_link_fill_link_info, +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_kprobe_multi_show_fdinfo, +#endif }; static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void *priv) -- cgit v1.2.3 From 0426729f46cd1f6354fad07267a21579186a5757 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 3 Jul 2025 13:48:07 -0700 Subject: bpf: Refactor bprintf buffer support Refactor code to be able to get and put bprintf buffers and use bpf_printf_prepare independently. This will be used in the next patch to implement BPF streams support, particularly as a staging buffer for strings that need to be formatted and then allocated and pushed into a stream. 
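For orientation, the exported pair is intended to be used bracket-style around any formatting into the shared per-CPU buffers, which is how the streams code in the next patch stages its strings. A minimal sketch, assuming only the names visible in the hunk below (this caller and its error handling are illustrative, not part of the patch):

/* Illustrative caller, not from the patch: format into the per-CPU
 * bprintf buffer without needing a stack-allocated scratch buffer.
 */
static int example_format_into_bufs(const char *fmt, ...)
{
	struct bpf_bprintf_buffers *bufs;
	va_list args;
	int len;

	if (bpf_try_get_buffers(&bufs))
		return -EBUSY;	/* per-CPU nesting limit hit */

	va_start(args, fmt);
	len = vsnprintf(bufs->buf, sizeof(bufs->buf), fmt, args);
	va_end(args);

	/* ... hand bufs->buf (len bytes) to the consumer ... */

	bpf_put_buffers();
	return len;
}
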
Reviewed-by: Emil Tsalapatis Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250703204818.925464-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5269381d6d3d..da66ce307e75 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -764,22 +764,13 @@ static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, return -EINVAL; } -/* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary - * arguments representation. - */ -#define MAX_BPRINTF_BIN_ARGS 512 - /* Support executing three nested bprintf helper calls on a given CPU */ #define MAX_BPRINTF_NEST_LEVEL 3 -struct bpf_bprintf_buffers { - char bin_args[MAX_BPRINTF_BIN_ARGS]; - char buf[MAX_BPRINTF_BUF]; -}; static DEFINE_PER_CPU(struct bpf_bprintf_buffers[MAX_BPRINTF_NEST_LEVEL], bpf_bprintf_bufs); static DEFINE_PER_CPU(int, bpf_bprintf_nest_level); -static int try_get_buffers(struct bpf_bprintf_buffers **bufs) +int bpf_try_get_buffers(struct bpf_bprintf_buffers **bufs) { int nest_level; @@ -795,16 +786,21 @@ static int try_get_buffers(struct bpf_bprintf_buffers **bufs) return 0; } -void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) +void bpf_put_buffers(void) { - if (!data->bin_args && !data->buf) - return; if (WARN_ON_ONCE(this_cpu_read(bpf_bprintf_nest_level) == 0)) return; this_cpu_dec(bpf_bprintf_nest_level); preempt_enable(); } +void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) +{ + if (!data->bin_args && !data->buf) + return; + bpf_put_buffers(); +} + /* * bpf_bprintf_prepare - Generic pass on format strings for bprintf-like helpers * @@ -819,7 +815,7 @@ void bpf_bprintf_cleanup(struct bpf_bprintf_data *data) * In argument preparation mode, if 0 is returned, safe temporary buffers are * allocated and bpf_bprintf_cleanup should be called to free them after use. */ -int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, +int bpf_bprintf_prepare(const char *fmt, u32 fmt_size, const u64 *raw_args, u32 num_args, struct bpf_bprintf_data *data) { bool get_buffers = (data->get_bin_args && num_args) || data->get_buf; @@ -835,7 +831,7 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, return -EINVAL; fmt_size = fmt_end - fmt; - if (get_buffers && try_get_buffers(&buffers)) + if (get_buffers && bpf_try_get_buffers(&buffers)) return -EBUSY; if (data->get_bin_args) { -- cgit v1.2.3 From 5ab154f1463a111e1dc8fd5d31eaa7a2a71fe2e6 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 3 Jul 2025 13:48:08 -0700 Subject: bpf: Introduce BPF standard streams Add support for a stream API to the kernel and expose related kfuncs to BPF programs. Two streams are exposed, BPF_STDOUT and BPF_STDERR. These can be used for printing messages that can be consumed from user space, thus it's similar in spirit to existing trace_pipe interface. The kernel will use the BPF_STDERR stream to notify the program of any errors encountered at runtime. BPF programs themselves may use both streams for writing debug messages. BPF library-like code may use BPF_STDERR to print warnings or errors on misuse at runtime. The implementation of a stream is as follows. Everytime a message is emitted from the kernel (directly, or through a BPF program), a record is allocated by bump allocating from per-cpu region backed by a page obtained using alloc_pages_nolock(). 
This ensures that we can allocate memory from any context. The eventual plan is to discard this scheme in favor of Alexei's kmalloc_nolock() [0]. This record is then locklessly inserted into a list (llist_add()) so that the printing side doesn't require holding any locks, and works in any context. Each stream has a maximum capacity of 4MB of text, and each printed message is accounted against this limit. Messages from a program are emitted using the bpf_stream_vprintk kfunc, which takes a stream_id argument in addition to working otherwise similar to bpf_trace_vprintk. The bprintf buffer helpers are extracted out to be reused for printing the string into them before copying it into the stream, so that we can (with the defined max limit) format a string and know its true length before performing allocations of the stream element. For consuming elements from a stream, we expose a bpf(2) syscall command named BPF_PROG_STREAM_READ_BY_FD, which allows reading data from the stream of a given prog_fd into a user space buffer. The main logic is implemented in bpf_stream_read(). The log messages are queued in bpf_stream::log by the bpf_stream_vprintk kfunc, and then pulled and ordered correctly in the stream backlog. For this purpose, we hold a lock around bpf_stream_backlog_peek(), as llist_del_first() (if we maintained a second lockless list for the backlog) wouldn't be safe from multiple threads anyway. Then, if we fail to find something in the backlog log, we splice out everything from the lockless log, and place it in the backlog log, and then return the head of the backlog. Once the full length of the element is consumed, we will pop it and free it. The lockless list bpf_stream::log is a LIFO stack. Elements obtained using a llist_del_all() operation are in LIFO order, thus would break the chronological ordering if printed directly. Hence, this batch of messages is first reversed. Then, it is stashed into a separate list in the stream, i.e. the backlog_log. The head of this list is the actual message that should always be returned to the caller. All of this is done in bpf_stream_backlog_fill(). From the kernel side, the writing into the stream will be a bit more involved than the typical printk. First, the kernel typically may print a collection of messages into the stream, and parallel writers into the stream may suffer from interleaving of messages. To ensure each group of messages is visible atomically, we can lift the advantage of using a lockless list for pushing in messages. To enable this, we add a bpf_stream_stage() macro, and require kernel users to use bpf_stream_printk statements for the passed expression to write into the stream. Underneath the macro, we have a message staging API, where a bpf_stream_stage object on the stack accumulates the messages being printed into a local llist_head, and then a commit operation splices the whole batch into the stream's lockless log list. This is especially pertinent for rqspinlock deadlock messages printed to program streams. After this change, we see each deadlock invocation as a non-interleaving contiguous message without any confusion on the reader's part, improving their user experience in debugging the fault. While programs cannot benefit from this staged stream writing API, they could just as well hold an rqspinlock around their print statements to serialize messages, hence this is kept kernel-internal for now. Overall, this infrastructure provides NMI-safe any context printing of messages to two dedicated streams. 
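As a rough sketch of the consumer side (not part of this patch, and assuming the uapi header exposes the new command and the BPF_STDOUT/BPF_STDERR ids), a user-space reader drives the command with the attr fields added in the syscall.c hunk below:

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* Read up to 'len' bytes from a program's stderr stream. Returns the number
 * of bytes copied, 0 when the stream is currently empty, or a negative error.
 */
static int read_prog_stderr(int prog_fd, char *buf, unsigned int len)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_stream_read.prog_fd = prog_fd;
	attr.prog_stream_read.stream_id = BPF_STDERR;
	attr.prog_stream_read.stream_buf = (__u64)(unsigned long)buf;
	attr.prog_stream_read.stream_buf_len = len;

	return syscall(__NR_bpf, BPF_PROG_STREAM_READ_BY_FD, &attr, sizeof(attr));
}
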
Later patches will add support for printing splats in case of BPF arena page faults, rqspinlock deadlocks, and cond_break timeouts, and integration of this facility into bpftool for dumping messages to user space. [0]: https://lore.kernel.org/bpf/20250501032718.65476-1-alexei.starovoitov@gmail.com Reviewed-by: Eduard Zingerman Reviewed-by: Emil Tsalapatis Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250703204818.925464-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/Makefile | 2 +- kernel/bpf/core.c | 5 + kernel/bpf/helpers.c | 1 + kernel/bpf/stream.c | 478 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 25 +++ kernel/bpf/verifier.c | 1 + 6 files changed, 511 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/stream.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 3a335c50e6e3..269c04a24664 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -14,7 +14,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o obj-$(CONFIG_BPF_JIT) += trampoline.o -obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o rqspinlock.o +obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o rqspinlock.o stream.o ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy) obj-$(CONFIG_BPF_SYSCALL) += arena.o range_tree.o endif diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index e536a34a32c8..f0def24573ae 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -134,6 +134,10 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag mutex_init(&fp->aux->ext_mutex); mutex_init(&fp->aux->dst_mutex); +#ifdef CONFIG_BPF_SYSCALL + bpf_prog_stream_init(fp); +#endif + return fp; } @@ -2862,6 +2866,7 @@ static void bpf_prog_free_deferred(struct work_struct *work) aux = container_of(work, struct bpf_prog_aux, work); #ifdef CONFIG_BPF_SYSCALL bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab); + bpf_prog_stream_free(aux->prog); #endif #ifdef CONFIG_CGROUP_BPF if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index da66ce307e75..6bfcbcdf6588 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3825,6 +3825,7 @@ BTF_ID_FLAGS(func, bpf_strnstr); #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS) BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif +BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c new file mode 100644 index 000000000000..e434541358db --- /dev/null +++ b/kernel/bpf/stream.c @@ -0,0 +1,478 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Simple per-CPU NMI-safe bump allocation mechanism, backed by the NMI-safe + * try_alloc_pages()/free_pages_nolock() primitives. We allocate a page and + * stash it in a local per-CPU variable, and bump allocate from the page + * whenever items need to be printed to a stream. Each page holds a global + * atomic refcount in its first 4 bytes, and then records of variable length + * that describe the printed messages. 
Once the global refcount has dropped to + * zero, it is a signal to free the page back to the kernel's page allocator, + * given all the individual records in it have been consumed. + * + * It is possible the same page is used to serve allocations across different + * programs, which may be consumed at different times individually, hence + * maintaining a reference count per-page is critical for correct lifetime + * tracking. + * + * The bpf_stream_page code will be replaced to use kmalloc_nolock() once it + * lands. + */ +struct bpf_stream_page { + refcount_t ref; + u32 consumed; + char buf[]; +}; + +/* Available room to add data to a refcounted page. */ +#define BPF_STREAM_PAGE_SZ (PAGE_SIZE - offsetofend(struct bpf_stream_page, consumed)) + +static DEFINE_PER_CPU(local_trylock_t, stream_local_lock) = INIT_LOCAL_TRYLOCK(stream_local_lock); +static DEFINE_PER_CPU(struct bpf_stream_page *, stream_pcpu_page); + +static bool bpf_stream_page_local_lock(unsigned long *flags) +{ + return local_trylock_irqsave(&stream_local_lock, *flags); +} + +static void bpf_stream_page_local_unlock(unsigned long *flags) +{ + local_unlock_irqrestore(&stream_local_lock, *flags); +} + +static void bpf_stream_page_free(struct bpf_stream_page *stream_page) +{ + struct page *p; + + if (!stream_page) + return; + p = virt_to_page(stream_page); + free_pages_nolock(p, 0); +} + +static void bpf_stream_page_get(struct bpf_stream_page *stream_page) +{ + refcount_inc(&stream_page->ref); +} + +static void bpf_stream_page_put(struct bpf_stream_page *stream_page) +{ + if (refcount_dec_and_test(&stream_page->ref)) + bpf_stream_page_free(stream_page); +} + +static void bpf_stream_page_init(struct bpf_stream_page *stream_page) +{ + refcount_set(&stream_page->ref, 1); + stream_page->consumed = 0; +} + +static struct bpf_stream_page *bpf_stream_page_replace(void) +{ + struct bpf_stream_page *stream_page, *old_stream_page; + struct page *page; + + page = alloc_pages_nolock(NUMA_NO_NODE, 0); + if (!page) + return NULL; + stream_page = page_address(page); + bpf_stream_page_init(stream_page); + + old_stream_page = this_cpu_read(stream_pcpu_page); + if (old_stream_page) + bpf_stream_page_put(old_stream_page); + this_cpu_write(stream_pcpu_page, stream_page); + return stream_page; +} + +static int bpf_stream_page_check_room(struct bpf_stream_page *stream_page, int len) +{ + int min = offsetof(struct bpf_stream_elem, str[0]); + int consumed = stream_page->consumed; + int total = BPF_STREAM_PAGE_SZ; + int rem = max(0, total - consumed - min); + + /* Let's give room of at least 8 bytes. */ + WARN_ON_ONCE(rem % 8 != 0); + rem = rem < 8 ? 
0 : rem; + return min(len, rem); +} + +static void bpf_stream_elem_init(struct bpf_stream_elem *elem, int len) +{ + init_llist_node(&elem->node); + elem->total_len = len; + elem->consumed_len = 0; +} + +static struct bpf_stream_page *bpf_stream_page_from_elem(struct bpf_stream_elem *elem) +{ + unsigned long addr = (unsigned long)elem; + + return (struct bpf_stream_page *)PAGE_ALIGN_DOWN(addr); +} + +static struct bpf_stream_elem *bpf_stream_page_push_elem(struct bpf_stream_page *stream_page, int len) +{ + u32 consumed = stream_page->consumed; + + stream_page->consumed += round_up(offsetof(struct bpf_stream_elem, str[len]), 8); + return (struct bpf_stream_elem *)&stream_page->buf[consumed]; +} + +static struct bpf_stream_elem *bpf_stream_page_reserve_elem(int len) +{ + struct bpf_stream_elem *elem = NULL; + struct bpf_stream_page *page; + int room = 0; + + page = this_cpu_read(stream_pcpu_page); + if (!page) + page = bpf_stream_page_replace(); + if (!page) + return NULL; + + room = bpf_stream_page_check_room(page, len); + if (room != len) + page = bpf_stream_page_replace(); + if (!page) + return NULL; + bpf_stream_page_get(page); + room = bpf_stream_page_check_room(page, len); + WARN_ON_ONCE(room != len); + + elem = bpf_stream_page_push_elem(page, room); + bpf_stream_elem_init(elem, room); + return elem; +} + +static struct bpf_stream_elem *bpf_stream_elem_alloc(int len) +{ + const int max_len = ARRAY_SIZE((struct bpf_bprintf_buffers){}.buf); + struct bpf_stream_elem *elem; + unsigned long flags; + + BUILD_BUG_ON(max_len > BPF_STREAM_PAGE_SZ); + /* + * Length denotes the amount of data to be written as part of stream element, + * thus includes '\0' byte. We're capped by how much bpf_bprintf_buffers can + * accomodate, therefore deny allocations that won't fit into them. + */ + if (len < 0 || len > max_len) + return NULL; + + if (!bpf_stream_page_local_lock(&flags)) + return NULL; + elem = bpf_stream_page_reserve_elem(len); + bpf_stream_page_local_unlock(&flags); + return elem; +} + +static int __bpf_stream_push_str(struct llist_head *log, const char *str, int len) +{ + struct bpf_stream_elem *elem = NULL; + + /* + * Allocate a bpf_prog_stream_elem and push it to the bpf_prog_stream + * log, elements will be popped at once and reversed to print the log. 
+ */ + elem = bpf_stream_elem_alloc(len); + if (!elem) + return -ENOMEM; + + memcpy(elem->str, str, len); + llist_add(&elem->node, log); + + return 0; +} + +static int bpf_stream_consume_capacity(struct bpf_stream *stream, int len) +{ + if (atomic_read(&stream->capacity) >= BPF_STREAM_MAX_CAPACITY) + return -ENOSPC; + if (atomic_add_return(len, &stream->capacity) >= BPF_STREAM_MAX_CAPACITY) { + atomic_sub(len, &stream->capacity); + return -ENOSPC; + } + return 0; +} + +static void bpf_stream_release_capacity(struct bpf_stream *stream, struct bpf_stream_elem *elem) +{ + int len = elem->total_len; + + atomic_sub(len, &stream->capacity); +} + +static int bpf_stream_push_str(struct bpf_stream *stream, const char *str, int len) +{ + int ret = bpf_stream_consume_capacity(stream, len); + + return ret ?: __bpf_stream_push_str(&stream->log, str, len); +} + +static struct bpf_stream *bpf_stream_get(enum bpf_stream_id stream_id, struct bpf_prog_aux *aux) +{ + if (stream_id != BPF_STDOUT && stream_id != BPF_STDERR) + return NULL; + return &aux->stream[stream_id - 1]; +} + +static void bpf_stream_free_elem(struct bpf_stream_elem *elem) +{ + struct bpf_stream_page *p; + + p = bpf_stream_page_from_elem(elem); + bpf_stream_page_put(p); +} + +static void bpf_stream_free_list(struct llist_node *list) +{ + struct bpf_stream_elem *elem, *tmp; + + llist_for_each_entry_safe(elem, tmp, list, node) + bpf_stream_free_elem(elem); +} + +static struct llist_node *bpf_stream_backlog_peek(struct bpf_stream *stream) +{ + return stream->backlog_head; +} + +static struct llist_node *bpf_stream_backlog_pop(struct bpf_stream *stream) +{ + struct llist_node *node; + + node = stream->backlog_head; + if (stream->backlog_head == stream->backlog_tail) + stream->backlog_head = stream->backlog_tail = NULL; + else + stream->backlog_head = node->next; + return node; +} + +static void bpf_stream_backlog_fill(struct bpf_stream *stream) +{ + struct llist_node *head, *tail; + + if (llist_empty(&stream->log)) + return; + tail = llist_del_all(&stream->log); + if (!tail) + return; + head = llist_reverse_order(tail); + + if (!stream->backlog_head) { + stream->backlog_head = head; + stream->backlog_tail = tail; + } else { + stream->backlog_tail->next = head; + stream->backlog_tail = tail; + } + + return; +} + +static bool bpf_stream_consume_elem(struct bpf_stream_elem *elem, int *len) +{ + int rem = elem->total_len - elem->consumed_len; + int used = min(rem, *len); + + elem->consumed_len += used; + *len -= used; + + return elem->consumed_len == elem->total_len; +} + +static int bpf_stream_read(struct bpf_stream *stream, void __user *buf, int len) +{ + int rem_len = len, cons_len, ret = 0; + struct bpf_stream_elem *elem = NULL; + struct llist_node *node; + + mutex_lock(&stream->lock); + + while (rem_len) { + int pos = len - rem_len; + bool cont; + + node = bpf_stream_backlog_peek(stream); + if (!node) { + bpf_stream_backlog_fill(stream); + node = bpf_stream_backlog_peek(stream); + } + if (!node) + break; + elem = container_of(node, typeof(*elem), node); + + cons_len = elem->consumed_len; + cont = bpf_stream_consume_elem(elem, &rem_len) == false; + + ret = copy_to_user(buf + pos, elem->str + cons_len, + elem->consumed_len - cons_len); + /* Restore in case of error. */ + if (ret) { + ret = -EFAULT; + elem->consumed_len = cons_len; + break; + } + + if (cont) + continue; + bpf_stream_backlog_pop(stream); + bpf_stream_release_capacity(stream, elem); + bpf_stream_free_elem(elem); + } + + mutex_unlock(&stream->lock); + return ret ? 
ret : len - rem_len; +} + +int bpf_prog_stream_read(struct bpf_prog *prog, enum bpf_stream_id stream_id, void __user *buf, int len) +{ + struct bpf_stream *stream; + + stream = bpf_stream_get(stream_id, prog->aux); + if (!stream) + return -ENOENT; + return bpf_stream_read(stream, buf, len); +} + +__bpf_kfunc_start_defs(); + +/* + * Avoid using enum bpf_stream_id so that kfunc users don't have to pull in the + * enum in headers. + */ +__bpf_kfunc int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, u32 len__sz, void *aux__prog) +{ + struct bpf_bprintf_data data = { + .get_bin_args = true, + .get_buf = true, + }; + struct bpf_prog_aux *aux = aux__prog; + u32 fmt_size = strlen(fmt__str) + 1; + struct bpf_stream *stream; + u32 data_len = len__sz; + int ret, num_args; + + stream = bpf_stream_get(stream_id, aux); + if (!stream) + return -ENOENT; + + if (data_len & 7 || data_len > MAX_BPRINTF_VARARGS * 8 || + (data_len && !args)) + return -EINVAL; + num_args = data_len / 8; + + ret = bpf_bprintf_prepare(fmt__str, fmt_size, args, num_args, &data); + if (ret < 0) + return ret; + + ret = bstr_printf(data.buf, MAX_BPRINTF_BUF, fmt__str, data.bin_args); + /* Exclude NULL byte during push. */ + ret = bpf_stream_push_str(stream, data.buf, ret); + bpf_bprintf_cleanup(&data); + + return ret; +} + +__bpf_kfunc_end_defs(); + +/* Added kfunc to common_btf_ids */ + +void bpf_prog_stream_init(struct bpf_prog *prog) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(prog->aux->stream); i++) { + atomic_set(&prog->aux->stream[i].capacity, 0); + init_llist_head(&prog->aux->stream[i].log); + mutex_init(&prog->aux->stream[i].lock); + prog->aux->stream[i].backlog_head = NULL; + prog->aux->stream[i].backlog_tail = NULL; + } +} + +void bpf_prog_stream_free(struct bpf_prog *prog) +{ + struct llist_node *list; + int i; + + for (i = 0; i < ARRAY_SIZE(prog->aux->stream); i++) { + list = llist_del_all(&prog->aux->stream[i].log); + bpf_stream_free_list(list); + bpf_stream_free_list(prog->aux->stream[i].backlog_head); + } +} + +void bpf_stream_stage_init(struct bpf_stream_stage *ss) +{ + init_llist_head(&ss->log); + ss->len = 0; +} + +void bpf_stream_stage_free(struct bpf_stream_stage *ss) +{ + struct llist_node *node; + + node = llist_del_all(&ss->log); + bpf_stream_free_list(node); +} + +int bpf_stream_stage_printk(struct bpf_stream_stage *ss, const char *fmt, ...) +{ + struct bpf_bprintf_buffers *buf; + va_list args; + int ret; + + if (bpf_try_get_buffers(&buf)) + return -EBUSY; + + va_start(args, fmt); + ret = vsnprintf(buf->buf, ARRAY_SIZE(buf->buf), fmt, args); + va_end(args); + ss->len += ret; + /* Exclude NULL byte during push. 
*/ + ret = __bpf_stream_push_str(&ss->log, buf->buf, ret); + bpf_put_buffers(); + return ret; +} + +int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog, + enum bpf_stream_id stream_id) +{ + struct llist_node *list, *head, *tail; + struct bpf_stream *stream; + int ret; + + stream = bpf_stream_get(stream_id, prog->aux); + if (!stream) + return -EINVAL; + + ret = bpf_stream_consume_capacity(stream, ss->len); + if (ret) + return ret; + + list = llist_del_all(&ss->log); + head = tail = list; + + if (!list) + return 0; + while (llist_next(list)) { + tail = llist_next(list); + list = tail; + } + llist_add_batch(head, tail, &stream->log); + return 0; +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f1d9ee9717a1..7db7182a3057 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5943,6 +5943,28 @@ static int token_create(union bpf_attr *attr) return bpf_token_create(attr); } +#define BPF_PROG_STREAM_READ_BY_FD_LAST_FIELD prog_stream_read.prog_fd + +static int prog_stream_read(union bpf_attr *attr) +{ + char __user *buf = u64_to_user_ptr(attr->prog_stream_read.stream_buf); + u32 len = attr->prog_stream_read.stream_buf_len; + struct bpf_prog *prog; + int ret; + + if (CHECK_ATTR(BPF_PROG_STREAM_READ_BY_FD)) + return -EINVAL; + + prog = bpf_prog_get(attr->prog_stream_read.prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + ret = bpf_prog_stream_read(prog, attr->prog_stream_read.stream_id, buf, len); + bpf_prog_put(prog); + + return ret; +} + static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) { union bpf_attr attr; @@ -6079,6 +6101,9 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) case BPF_TOKEN_CREATE: err = token_create(&attr); break; + case BPF_PROG_STREAM_READ_BY_FD: + err = prog_stream_read(&attr); + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 52e36fd23f40..9f09dcd2eabb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -46,6 +46,7 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { enum bpf_features { BPF_FEAT_RDONLY_CAST_TO_VOID = 0, + BPF_FEAT_STREAMS = 1, __MAX_BPF_FEAT, }; -- cgit v1.2.3 From 0e521efaf36350b8f783984541efa56f560c90b0 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 3 Jul 2025 13:48:09 -0700 Subject: bpf: Add function to extract program source info Prepare a function for use in future patches that can extract the file info, line info, and the source line number for a given BPF program provided it's program counter. Only the basename of the file path is provided, given it can be excessively long in some cases. This will be used in later patches to print source info to the BPF stream. 
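A minimal usage sketch (the real consumer is the stream stack-dump helper added a few patches later; this caller and its printk sink are illustrative only):

static void report_bpf_ip(struct bpf_prog *prog, unsigned long ip)
{
	const char *file, *line;
	int num;

	/* Fails when the program carries no BTF or line info for this ip. */
	if (bpf_prog_get_file_line(prog, ip, &file, &line, &num))
		return;

	pr_info("%s:%d %s\n", file, num, line);
}
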
Reviewed-by: Emil Tsalapatis Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250703204818.925464-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f0def24573ae..2dc5b846ae50 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -3213,3 +3213,50 @@ EXPORT_SYMBOL(bpf_stats_enabled_key); EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx); + +#ifdef CONFIG_BPF_SYSCALL + +int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep, + const char **linep, int *nump) +{ + int idx = -1, insn_start, insn_end, len; + struct bpf_line_info *linfo; + void **jited_linfo; + struct btf *btf; + + btf = prog->aux->btf; + linfo = prog->aux->linfo; + jited_linfo = prog->aux->jited_linfo; + + if (!btf || !linfo || !jited_linfo) + return -EINVAL; + len = prog->aux->func ? prog->aux->func[prog->aux->func_idx]->len : prog->len; + + linfo = &prog->aux->linfo[prog->aux->linfo_idx]; + jited_linfo = &prog->aux->jited_linfo[prog->aux->linfo_idx]; + + insn_start = linfo[0].insn_off; + insn_end = insn_start + len; + + for (int i = 0; i < prog->aux->nr_linfo && + linfo[i].insn_off >= insn_start && linfo[i].insn_off < insn_end; i++) { + if (jited_linfo[i] >= (void *)ip) + break; + idx = i; + } + + if (idx == -1) + return -ENOENT; + + /* Get base component of the file path. */ + *filep = btf_name_by_offset(btf, linfo[idx].file_name_off); + *filep = kbasename(*filep); + /* Obtain the source line, and strip whitespace in prefix. */ + *linep = btf_name_by_offset(btf, linfo[idx].line_off); + while (isspace(**linep)) + *linep += 1; + *nump = BPF_LINE_INFO_LINE_NUM(linfo[idx].line_col); + return 0; +} + +#endif -- cgit v1.2.3 From d090326860096df9dac6f27cff76d3f8df44d4f1 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 3 Jul 2025 13:48:10 -0700 Subject: bpf: Ensure RCU lock is held around bpf_prog_ksym_find Add a warning to ensure RCU lock is held around tree lookup, and then fix one of the invocations in bpf_stack_walker. The program has an active stack frame and won't disappear. Use the opportunity to remove unneeded invocation of is_bpf_text_address. Fixes: f18b03fabaa9 ("bpf: Implement BPF exceptions") Reviewed-by: Emil Tsalapatis Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250703204818.925464-5-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 5 ++++- kernel/bpf/helpers.c | 11 +++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2dc5b846ae50..833442661742 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -782,7 +782,10 @@ bool is_bpf_text_address(unsigned long addr) struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) { - struct bpf_ksym *ksym = bpf_ksym_find(addr); + struct bpf_ksym *ksym; + + WARN_ON_ONCE(!rcu_read_lock_held()); + ksym = bpf_ksym_find(addr); return ksym && ksym->prog ? 
container_of(ksym, struct bpf_prog_aux, ksym)->prog : diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6bfcbcdf6588..3d33181d5e67 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2981,9 +2981,16 @@ static bool bpf_stack_walker(void *cookie, u64 ip, u64 sp, u64 bp) struct bpf_throw_ctx *ctx = cookie; struct bpf_prog *prog; - if (!is_bpf_text_address(ip)) - return !ctx->cnt; + /* + * The RCU read lock is held to safely traverse the latch tree, but we + * don't need its protection when accessing the prog, since it has an + * active stack frame on the current stack trace, and won't disappear. + */ + rcu_read_lock(); prog = bpf_prog_ksym_find(ip); + rcu_read_unlock(); + if (!prog) + return !ctx->cnt; ctx->cnt++; if (bpf_is_subprog(prog)) return true; -- cgit v1.2.3 From f0c53fd4a742f957da7077a691a85ef9775907dc Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 3 Jul 2025 13:48:11 -0700 Subject: bpf: Add function to find program from stack trace In preparation of figuring out the closest program that led to the current point in the kernel, implement a function that scans through the stack trace and finds out the closest BPF program when walking down the stack trace. Special care needs to be taken to skip over kernel and BPF subprog frames. We basically scan until we find a BPF main prog frame. The assumption is that if a program calls into us transitively, we'll hit it along the way. If not, we end up returning NULL. Contextually the function will be used in places where we know the program may have called into us. Due to reliance on arch_bpf_stack_walk(), this function only works on x86 with CONFIG_UNWINDER_ORC, arm64, and s390. Remove the warning from arch_bpf_stack_walk as well since we call it outside bpf_throw() context. Acked-by: Eduard Zingerman Reviewed-by: Emil Tsalapatis Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250703204818.925464-6-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 833442661742..037d67cf5fb1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -3262,4 +3262,37 @@ int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char * return 0; } +struct walk_stack_ctx { + struct bpf_prog *prog; +}; + +static bool find_from_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp) +{ + struct walk_stack_ctx *ctxp = cookie; + struct bpf_prog *prog; + + /* + * The RCU read lock is held to safely traverse the latch tree, but we + * don't need its protection when accessing the prog, since it has an + * active stack frame on the current stack trace, and won't disappear. + */ + rcu_read_lock(); + prog = bpf_prog_ksym_find(ip); + rcu_read_unlock(); + if (!prog) + return true; + if (bpf_is_subprog(prog)) + return true; + ctxp->prog = prog; + return false; +} + +struct bpf_prog *bpf_prog_find_from_stack(void) +{ + struct walk_stack_ctx ctx = {}; + + arch_bpf_stack_walk(find_from_stack_cb, &ctx); + return ctx.prog; +} + #endif -- cgit v1.2.3 From d7c431cafcb4917b0d87b5cd10637cd47b6c8d79 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 3 Jul 2025 13:48:12 -0700 Subject: bpf: Add dump_stack() analogue to print to BPF stderr Introduce a kernel function which is the analogue of dump_stack() printing some useful information and the stack trace. 
This is not exposed to BPF programs yet, but can be made available in the future. When we have a program counter for a BPF program in the stack trace, also additionally output the filename and line number to make the trace helpful. The rest of the trace can be passed into ./decode_stacktrace.sh to obtain the line numbers for kernel symbols. Reviewed-by: Emil Tsalapatis Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250703204818.925464-7-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/stream.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c index e434541358db..8c842f845245 100644 --- a/kernel/bpf/stream.c +++ b/kernel/bpf/stream.c @@ -2,6 +2,7 @@ /* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ #include +#include #include #include #include @@ -476,3 +477,50 @@ int bpf_stream_stage_commit(struct bpf_stream_stage *ss, struct bpf_prog *prog, llist_add_batch(head, tail, &stream->log); return 0; } + +struct dump_stack_ctx { + struct bpf_stream_stage *ss; + int err; +}; + +static bool dump_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp) +{ + struct dump_stack_ctx *ctxp = cookie; + const char *file = "", *line = ""; + struct bpf_prog *prog; + int num, ret; + + rcu_read_lock(); + prog = bpf_prog_ksym_find(ip); + rcu_read_unlock(); + if (prog) { + ret = bpf_prog_get_file_line(prog, ip, &file, &line, &num); + if (ret < 0) + goto end; + ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n %s @ %s:%d\n", + (void *)ip, line, file, num); + return !ctxp->err; + } +end: + ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n", (void *)ip); + return !ctxp->err; +} + +int bpf_stream_stage_dump_stack(struct bpf_stream_stage *ss) +{ + struct dump_stack_ctx ctx = { .ss = ss }; + int ret; + + ret = bpf_stream_stage_printk(ss, "CPU: %d UID: %d PID: %d Comm: %s\n", + raw_smp_processor_id(), __kuid_val(current_real_cred()->euid), + current->pid, current->comm); + if (ret) + return ret; + ret = bpf_stream_stage_printk(ss, "Call trace:\n"); + if (ret) + return ret; + arch_bpf_stack_walk(dump_stack_cb, &ctx); + if (ctx.err) + return ctx.err; + return bpf_stream_stage_printk(ss, "\n"); +} -- cgit v1.2.3 From e8d01330225210a8af55f5683fb2ca726717ee16 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 3 Jul 2025 13:48:13 -0700 Subject: bpf: Report may_goto timeout to BPF stderr Begin reporting may_goto timeouts to BPF program's stderr stream. 
Acked-by: Eduard Zingerman Reviewed-by: Emil Tsalapatis Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250703204818.925464-8-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 037d67cf5fb1..fe8a53f3c5bc 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -3168,6 +3168,22 @@ u64 __weak arch_bpf_timed_may_goto(void) return 0; } +static noinline void bpf_prog_report_may_goto_violation(void) +{ +#ifdef CONFIG_BPF_SYSCALL + struct bpf_stream_stage ss; + struct bpf_prog *prog; + + prog = bpf_prog_find_from_stack(); + if (!prog) + return; + bpf_stream_stage(ss, prog, BPF_STDERR, ({ + bpf_stream_printk(ss, "ERROR: Timeout detected for may_goto instruction\n"); + bpf_stream_dump_stack(ss); + })); +#endif +} + u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p) { u64 time = ktime_get_mono_fast_ns(); @@ -3178,8 +3194,10 @@ u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p) return BPF_MAX_TIMED_LOOPS; } /* Check if we've exhausted our time slice, and zero count. */ - if (time - p->timestamp >= (NSEC_PER_SEC / 4)) + if (unlikely(time - p->timestamp >= (NSEC_PER_SEC / 4))) { + bpf_prog_report_may_goto_violation(); return 0; + } /* Refresh the count for the stack frame. */ return BPF_MAX_TIMED_LOOPS; } -- cgit v1.2.3 From ecec5b5743bf964f3e2bb5486daee058344d9066 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 3 Jul 2025 13:48:14 -0700 Subject: bpf: Report rqspinlock deadlocks/timeout to BPF stderr Begin reporting rqspinlock deadlocks and timeout to BPF program's stderr. Reviewed-by: Emil Tsalapatis Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250703204818.925464-9-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/rqspinlock.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c index 338305c8852c..5ab354d55d82 100644 --- a/kernel/bpf/rqspinlock.c +++ b/kernel/bpf/rqspinlock.c @@ -666,6 +666,27 @@ EXPORT_SYMBOL_GPL(resilient_queued_spin_lock_slowpath); __bpf_kfunc_start_defs(); +static void bpf_prog_report_rqspinlock_violation(const char *str, void *lock, bool irqsave) +{ + struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); + struct bpf_stream_stage ss; + struct bpf_prog *prog; + + prog = bpf_prog_find_from_stack(); + if (!prog) + return; + bpf_stream_stage(ss, prog, BPF_STDERR, ({ + bpf_stream_printk(ss, "ERROR: %s for bpf_res_spin_lock%s\n", str, irqsave ? "_irqsave" : ""); + bpf_stream_printk(ss, "Attempted lock = 0x%px\n", lock); + bpf_stream_printk(ss, "Total held locks = %d\n", rqh->cnt); + for (int i = 0; i < min(RES_NR_HELD, rqh->cnt); i++) + bpf_stream_printk(ss, "Held lock[%2d] = 0x%px\n", i, rqh->locks[i]); + bpf_stream_dump_stack(ss); + })); +} + +#define REPORT_STR(ret) ({ (ret) == -ETIMEDOUT ? 
"Timeout detected" : "AA or ABBA deadlock detected"; }) + __bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) { int ret; @@ -676,6 +697,7 @@ __bpf_kfunc int bpf_res_spin_lock(struct bpf_res_spin_lock *lock) preempt_disable(); ret = res_spin_lock((rqspinlock_t *)lock); if (unlikely(ret)) { + bpf_prog_report_rqspinlock_violation(REPORT_STR(ret), lock, false); preempt_enable(); return ret; } @@ -698,6 +720,7 @@ __bpf_kfunc int bpf_res_spin_lock_irqsave(struct bpf_res_spin_lock *lock, unsign local_irq_save(flags); ret = res_spin_lock((rqspinlock_t *)lock); if (unlikely(ret)) { + bpf_prog_report_rqspinlock_violation(REPORT_STR(ret), lock, true); local_irq_restore(flags); preempt_enable(); return ret; -- cgit v1.2.3 From 032547272eb0884470165e6a8fd73b80688e847f Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Thu, 3 Jul 2025 23:35:09 +0200 Subject: bpf: Avoid warning on unexpected map for tail call Before handling the tail call in record_func_key(), we check that the map is of the expected type and log a verifier error if it isn't. Such an error however doesn't indicate anything wrong with the verifier. The check for map<>func compatibility is done after record_func_key(), by check_map_func_compatibility(). Therefore, this patch logs the error as a typical reject instead of a verifier error. Fixes: d2e4c1e6c294 ("bpf: Constant map key tracking for prog array pokes") Fixes: 0df1a55afa83 ("bpf: Warn on internal verifier errors") Reported-by: syzbot+efb099d5833bca355e51@syzkaller.appspotmail.com Signed-off-by: Paul Chaignon Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/1f395b74e73022e47e04a31735f258babf305420.1751578055.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9f09dcd2eabb..29e0dddc22f8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11082,8 +11082,8 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, if (func_id != BPF_FUNC_tail_call) return 0; if (!map || map->map_type != BPF_MAP_TYPE_PROG_ARRAY) { - verifier_bug(env, "expected array map for tail call"); - return -EFAULT; + verbose(env, "expected prog array map for tail call"); + return -EINVAL; } reg = ®s[BPF_REG_3]; -- cgit v1.2.3 From 3b87251439b2abf207b9b2cc03bf8d29ae6bb257 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 3 Jul 2025 07:11:06 -0700 Subject: bpf: Simplify assignment to struct bpf_insn pointer in do_misc_fixups() In verifier.c, the following code patterns (in two places) struct bpf_insn *patch = &insn_buf[0]; can be simplified to struct bpf_insn *patch = insn_buf; which is easier to understand. 
Acked-by: Eduard Zingerman Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20250703141106.1483216-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 29e0dddc22f8..3ecb91c7e4f8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -22103,7 +22103,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) if (BPF_CLASS(insn->code) == BPF_LDX && (BPF_MODE(insn->code) == BPF_PROBE_MEM || BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) { - struct bpf_insn *patch = &insn_buf[0]; + struct bpf_insn *patch = insn_buf; u64 uaddress_limit = bpf_arch_uaddress_limit(); if (!uaddress_limit) @@ -22154,7 +22154,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) { const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X; const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X; - struct bpf_insn *patch = &insn_buf[0]; + struct bpf_insn *patch = insn_buf; bool issrc, isneg, isimm; u32 off_reg; -- cgit v1.2.3 From 45e9cd38aa8df9f4673d73bb86f7c9626d513a76 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 3 Jul 2025 07:11:11 -0700 Subject: bpf: Reduce stack frame size by using env->insn_buf for bpf insns Arnd Bergmann reported an issue ([1]) where clang compiler (less than llvm18) may trigger an error where the stack frame size exceeds the limit. I can reproduce the error like below: kernel/bpf/verifier.c:24491:5: error: stack frame size (2552) exceeds limit (1280) in 'bpf_check' [-Werror,-Wframe-larger-than] kernel/bpf/verifier.c:19921:12: error: stack frame size (1368) exceeds limit (1280) in 'do_check' [-Werror,-Wframe-larger-than] Use env->insn_buf for bpf insns instead of putting these insns on the stack. This can resolve the above 'bpf_check' error. The 'do_check' error will be resolved in the next patch. 
[1] https://lore.kernel.org/bpf/20250620113846.3950478-1-arnd@kernel.org/ Reported-by: Arnd Bergmann Tested-by: Arnd Bergmann Acked-by: Jiri Olsa Acked-by: Eduard Zingerman Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20250703141111.1484521-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 190 ++++++++++++++++++++++++-------------------------- 1 file changed, 92 insertions(+), 98 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3ecb91c7e4f8..92dba3c9664f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21011,7 +21011,10 @@ static int opt_remove_nops(struct bpf_verifier_env *env) static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, const union bpf_attr *attr) { - struct bpf_insn *patch, zext_patch[2], rnd_hi32_patch[4]; + struct bpf_insn *patch; + /* use env->insn_buf as two independent buffers */ + struct bpf_insn *zext_patch = env->insn_buf; + struct bpf_insn *rnd_hi32_patch = &env->insn_buf[2]; struct bpf_insn_aux_data *aux = env->insn_aux_data; int i, patch_len, delta = 0, len = env->prog->len; struct bpf_insn *insns = env->prog->insnsi; @@ -21189,13 +21192,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) if (env->insn_aux_data[i + delta].nospec) { WARN_ON_ONCE(env->insn_aux_data[i + delta].alu_state); - struct bpf_insn patch[] = { - BPF_ST_NOSPEC(), - *insn, - }; + struct bpf_insn *patch = insn_buf; - cnt = ARRAY_SIZE(patch); - new_prog = bpf_patch_insn_data(env, i + delta, patch, cnt); + *patch++ = BPF_ST_NOSPEC(); + *patch++ = *insn; + cnt = patch - insn_buf; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; @@ -21263,13 +21265,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) /* nospec_result is only used to mitigate Spectre v4 and * to limit verification-time for Spectre v1. */ - struct bpf_insn patch[] = { - *insn, - BPF_ST_NOSPEC(), - }; + struct bpf_insn *patch = insn_buf; - cnt = ARRAY_SIZE(patch); - new_prog = bpf_patch_insn_data(env, i + delta, patch, cnt); + *patch++ = *insn; + *patch++ = BPF_ST_NOSPEC(); + cnt = patch - insn_buf; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; @@ -21939,13 +21940,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env) u16 stack_depth_extra = 0; if (env->seen_exception && !env->exception_callback_subprog) { - struct bpf_insn patch[] = { - env->prog->insnsi[insn_cnt - 1], - BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), - BPF_EXIT_INSN(), - }; + struct bpf_insn *patch = insn_buf; - ret = add_hidden_subprog(env, patch, ARRAY_SIZE(patch)); + *patch++ = env->prog->insnsi[insn_cnt - 1]; + *patch++ = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); + *patch++ = BPF_EXIT_INSN(); + ret = add_hidden_subprog(env, insn_buf, patch - insn_buf); if (ret < 0) return ret; prog = env->prog; @@ -21981,20 +21981,18 @@ static int do_misc_fixups(struct bpf_verifier_env *env) insn->off == 1 && insn->imm == -1) { bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; bool isdiv = BPF_OP(insn->code) == BPF_DIV; - struct bpf_insn *patchlet; - struct bpf_insn chk_and_sdiv[] = { - BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | - BPF_NEG | BPF_K, insn->dst_reg, - 0, 0, 0), - }; - struct bpf_insn chk_and_smod[] = { - BPF_MOV32_IMM(insn->dst_reg, 0), - }; + struct bpf_insn *patch = insn_buf; - patchlet = isdiv ? chk_and_sdiv : chk_and_smod; - cnt = isdiv ? 
ARRAY_SIZE(chk_and_sdiv) : ARRAY_SIZE(chk_and_smod); + if (isdiv) + *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | + BPF_NEG | BPF_K, insn->dst_reg, + 0, 0, 0); + else + *patch++ = BPF_MOV32_IMM(insn->dst_reg, 0); + + cnt = patch - insn_buf; - new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; @@ -22013,83 +22011,79 @@ static int do_misc_fixups(struct bpf_verifier_env *env) bool isdiv = BPF_OP(insn->code) == BPF_DIV; bool is_sdiv = isdiv && insn->off == 1; bool is_smod = !isdiv && insn->off == 1; - struct bpf_insn *patchlet; - struct bpf_insn chk_and_div[] = { - /* [R,W]x div 0 -> 0 */ - BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | - BPF_JNE | BPF_K, insn->src_reg, - 0, 2, 0), - BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg), - BPF_JMP_IMM(BPF_JA, 0, 0, 1), - *insn, - }; - struct bpf_insn chk_and_mod[] = { - /* [R,W]x mod 0 -> [R,W]x */ - BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | - BPF_JEQ | BPF_K, insn->src_reg, - 0, 1 + (is64 ? 0 : 1), 0), - *insn, - BPF_JMP_IMM(BPF_JA, 0, 0, 1), - BPF_MOV32_REG(insn->dst_reg, insn->dst_reg), - }; - struct bpf_insn chk_and_sdiv[] = { + struct bpf_insn *patch = insn_buf; + + if (is_sdiv) { /* [R,W]x sdiv 0 -> 0 * LLONG_MIN sdiv -1 -> LLONG_MIN * INT_MIN sdiv -1 -> INT_MIN */ - BPF_MOV64_REG(BPF_REG_AX, insn->src_reg), - BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | - BPF_ADD | BPF_K, BPF_REG_AX, - 0, 0, 1), - BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | - BPF_JGT | BPF_K, BPF_REG_AX, - 0, 4, 1), - BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | - BPF_JEQ | BPF_K, BPF_REG_AX, - 0, 1, 0), - BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | - BPF_MOV | BPF_K, insn->dst_reg, - 0, 0, 0), + *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg); + *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | + BPF_ADD | BPF_K, BPF_REG_AX, + 0, 0, 1); + *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JGT | BPF_K, BPF_REG_AX, + 0, 4, 1); + *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JEQ | BPF_K, BPF_REG_AX, + 0, 1, 0); + *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | + BPF_MOV | BPF_K, insn->dst_reg, + 0, 0, 0); /* BPF_NEG(LLONG_MIN) == -LLONG_MIN == LLONG_MIN */ - BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | - BPF_NEG | BPF_K, insn->dst_reg, - 0, 0, 0), - BPF_JMP_IMM(BPF_JA, 0, 0, 1), - *insn, - }; - struct bpf_insn chk_and_smod[] = { + *patch++ = BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | + BPF_NEG | BPF_K, insn->dst_reg, + 0, 0, 0); + *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *patch++ = *insn; + cnt = patch - insn_buf; + } else if (is_smod) { /* [R,W]x mod 0 -> [R,W]x */ /* [R,W]x mod -1 -> 0 */ - BPF_MOV64_REG(BPF_REG_AX, insn->src_reg), - BPF_RAW_INSN((is64 ? BPF_ALU64 : BPF_ALU) | - BPF_ADD | BPF_K, BPF_REG_AX, - 0, 0, 1), - BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | - BPF_JGT | BPF_K, BPF_REG_AX, - 0, 3, 1), - BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | - BPF_JEQ | BPF_K, BPF_REG_AX, - 0, 3 + (is64 ? 0 : 1), 1), - BPF_MOV32_IMM(insn->dst_reg, 0), - BPF_JMP_IMM(BPF_JA, 0, 0, 1), - *insn, - BPF_JMP_IMM(BPF_JA, 0, 0, 1), - BPF_MOV32_REG(insn->dst_reg, insn->dst_reg), - }; - - if (is_sdiv) { - patchlet = chk_and_sdiv; - cnt = ARRAY_SIZE(chk_and_sdiv); - } else if (is_smod) { - patchlet = chk_and_smod; - cnt = ARRAY_SIZE(chk_and_smod) - (is64 ? 2 : 0); + *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg); + *patch++ = BPF_RAW_INSN((is64 ? 
BPF_ALU64 : BPF_ALU) | + BPF_ADD | BPF_K, BPF_REG_AX, + 0, 0, 1); + *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JGT | BPF_K, BPF_REG_AX, + 0, 3, 1); + *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JEQ | BPF_K, BPF_REG_AX, + 0, 3 + (is64 ? 0 : 1), 1); + *patch++ = BPF_MOV32_IMM(insn->dst_reg, 0); + *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *patch++ = *insn; + + if (!is64) { + *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *patch++ = BPF_MOV32_REG(insn->dst_reg, insn->dst_reg); + } + cnt = patch - insn_buf; + } else if (isdiv) { + /* [R,W]x div 0 -> 0 */ + *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JNE | BPF_K, insn->src_reg, + 0, 2, 0); + *patch++ = BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg); + *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *patch++ = *insn; + cnt = patch - insn_buf; } else { - patchlet = isdiv ? chk_and_div : chk_and_mod; - cnt = isdiv ? ARRAY_SIZE(chk_and_div) : - ARRAY_SIZE(chk_and_mod) - (is64 ? 2 : 0); + /* [R,W]x mod 0 -> [R,W]x */ + *patch++ = BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JEQ | BPF_K, insn->src_reg, + 0, 1 + (is64 ? 0 : 1), 0); + *patch++ = *insn; + + if (!is64) { + *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *patch++ = BPF_MOV32_REG(insn->dst_reg, insn->dst_reg); + } + cnt = patch - insn_buf; } - new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; -- cgit v1.2.3 From 82bc4abf28d8147dd5da9ba52f0aa1bac23c125e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 3 Jul 2025 07:11:17 -0700 Subject: bpf: Avoid putting struct bpf_scc_callchain variables on the stack Add a 'struct bpf_scc_callchain callchain_buf' field in bpf_verifier_env. This way, the previous bpf_scc_callchain local variables can be replaced by taking address of env->callchain_buf. 
This can reduce stack usage and fix the following error: kernel/bpf/verifier.c:19921:12: error: stack frame size (1368) exceeds limit (1280) in 'do_check' [-Werror,-Wframe-larger-than] Reported-by: Arnd Bergmann Acked-by: Jiri Olsa Acked-by: Eduard Zingerman Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20250703141117.1485108-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 92dba3c9664f..0f6cc2275695 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1914,19 +1914,19 @@ static char *format_callchain(struct bpf_verifier_env *env, struct bpf_scc_callc */ static int maybe_enter_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { - struct bpf_scc_callchain callchain; + struct bpf_scc_callchain *callchain = &env->callchain_buf; struct bpf_scc_visit *visit; - if (!compute_scc_callchain(env, st, &callchain)) + if (!compute_scc_callchain(env, st, callchain)) return 0; - visit = scc_visit_lookup(env, &callchain); - visit = visit ?: scc_visit_alloc(env, &callchain); + visit = scc_visit_lookup(env, callchain); + visit = visit ?: scc_visit_alloc(env, callchain); if (!visit) return -ENOMEM; if (!visit->entry_state) { visit->entry_state = st; if (env->log.level & BPF_LOG_LEVEL2) - verbose(env, "SCC enter %s\n", format_callchain(env, &callchain)); + verbose(env, "SCC enter %s\n", format_callchain(env, callchain)); } return 0; } @@ -1939,21 +1939,21 @@ static int propagate_backedges(struct bpf_verifier_env *env, struct bpf_scc_visi */ static int maybe_exit_scc(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { - struct bpf_scc_callchain callchain; + struct bpf_scc_callchain *callchain = &env->callchain_buf; struct bpf_scc_visit *visit; - if (!compute_scc_callchain(env, st, &callchain)) + if (!compute_scc_callchain(env, st, callchain)) return 0; - visit = scc_visit_lookup(env, &callchain); + visit = scc_visit_lookup(env, callchain); if (!visit) { verifier_bug(env, "scc exit: no visit info for call chain %s", - format_callchain(env, &callchain)); + format_callchain(env, callchain)); return -EFAULT; } if (visit->entry_state != st) return 0; if (env->log.level & BPF_LOG_LEVEL2) - verbose(env, "SCC exit %s\n", format_callchain(env, &callchain)); + verbose(env, "SCC exit %s\n", format_callchain(env, callchain)); visit->entry_state = NULL; env->num_backedges -= visit->num_backedges; visit->num_backedges = 0; @@ -1968,22 +1968,22 @@ static int add_scc_backedge(struct bpf_verifier_env *env, struct bpf_verifier_state *st, struct bpf_scc_backedge *backedge) { - struct bpf_scc_callchain callchain; + struct bpf_scc_callchain *callchain = &env->callchain_buf; struct bpf_scc_visit *visit; - if (!compute_scc_callchain(env, st, &callchain)) { + if (!compute_scc_callchain(env, st, callchain)) { verifier_bug(env, "add backedge: no SCC in verification path, insn_idx %d", st->insn_idx); return -EFAULT; } - visit = scc_visit_lookup(env, &callchain); + visit = scc_visit_lookup(env, callchain); if (!visit) { verifier_bug(env, "add backedge: no visit info for call chain %s", - format_callchain(env, &callchain)); + format_callchain(env, callchain)); return -EFAULT; } if (env->log.level & BPF_LOG_LEVEL2) - verbose(env, "SCC backedge %s\n", format_callchain(env, &callchain)); + verbose(env, "SCC backedge %s\n", format_callchain(env, callchain)); backedge->next = 
visit->backedges; visit->backedges = backedge; visit->num_backedges++; @@ -1999,12 +1999,12 @@ static int add_scc_backedge(struct bpf_verifier_env *env, static bool incomplete_read_marks(struct bpf_verifier_env *env, struct bpf_verifier_state *st) { - struct bpf_scc_callchain callchain; + struct bpf_scc_callchain *callchain = &env->callchain_buf; struct bpf_scc_visit *visit; - if (!compute_scc_callchain(env, st, &callchain)) + if (!compute_scc_callchain(env, st, callchain)) return false; - visit = scc_visit_lookup(env, &callchain); + visit = scc_visit_lookup(env, callchain); if (!visit) return false; return !!visit->backedges; -- cgit v1.2.3 From fc975cfb36393db1db517fbbe366e550bcdcff14 Mon Sep 17 00:00:00 2001 From: kuyo chang Date: Wed, 2 Jul 2025 10:12:25 +0800 Subject: sched/deadline: Fix dl_server runtime calculation formula In our testing with a 6.12-based kernel on a big.LITTLE system, we were seeing instances of RT tasks being blocked from running on the LITTLE cpus for multiple seconds of time, apparently by the dl_server. This far exceeds the default configured 50ms per second runtime. This is due to the fair dl_server runtime calculation being scaled for frequency & capacity of the cpu. Consider the following case under a big.LITTLE architecture: Assume the runtime is 50,000,000 ns, and the frequency/capacity scale-invariance factors are defined as below: Frequency scale-invariance: 100 Capacity scale-invariance: 50 First, by frequency scale-invariance, the runtime is scaled to 50,000,000 * 100 >> 10 = 4,882,812. Then, by capacity scale-invariance, it is further scaled to 4,882,812 * 50 >> 10 = 238,418. So it ends up scaled down to 238,418 ns. This smaller "accounted runtime" value is what ends up being subtracted against the fair server's runtime for the current period. Thus after 50ms of real time, we've only accounted ~238us against the fair server's runtime. The 209:1 ratio in this example means that on the smaller cpu the fair server is allowed to continue running, blocking RT tasks, for over 10 seconds before it exhausts its supposed 50ms of runtime. And on other hardware configurations it can be even worse. For the fair deadline server, to prevent realtime tasks from being unexpectedly delayed, we really do want to use fixed time, not scaled time, on smaller capacity/frequency cpus. So remove the scaling from the fair server's accounting to fix this.
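To make the arithmetic above concrete, here is a minimal illustrative sketch of the double scaling (plain C, illustration only; in the kernel this arithmetic lives in dl_scaled_delta_exec(), with SCHED_CAPACITY_SHIFT == 10):

	u64 delta_exec = 50000000ULL;	/* 50ms of real runtime, in ns */
	u64 scale_freq = 100;		/* frequency scale factor, out of 1024 */
	u64 scale_cpu  = 50;		/* capacity scale factor, out of 1024 */
	u64 scaled;

	scaled = (delta_exec * scale_freq) >> 10;	/* 4,882,812 ns */
	scaled = (scaled * scale_cpu) >> 10;		/*   238,418 ns */

Only ~238us is charged against the fair server for 50ms of wall-clock time, which is the 209:1 ratio described above.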
Fixes: a110a81c52a9 ("sched/deadline: Deferrable dl server") Suggested-by: Peter Zijlstra Suggested-by: John Stultz Signed-off-by: kuyo chang Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Acked-by: John Stultz Tested-by: John Stultz Link: https://lore.kernel.org/r/20250702021440.2594736-1-kuyo.chang@mediatek.com --- kernel/sched/deadline.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index ad45a8fea245..89019a140826 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1504,7 +1504,9 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 if (dl_entity_is_special(dl_se)) return; - scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec); + scaled_delta_exec = delta_exec; + if (!dl_server(dl_se)) + scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec); dl_se->runtime -= scaled_delta_exec; @@ -1611,7 +1613,7 @@ throttle: */ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) { - s64 delta_exec, scaled_delta_exec; + s64 delta_exec; if (!rq->fair_server.dl_defer) return; @@ -1624,9 +1626,7 @@ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) if (delta_exec < 0) return; - scaled_delta_exec = dl_scaled_delta_exec(rq, &rq->fair_server, delta_exec); - - rq->fair_server.runtime -= scaled_delta_exec; + rq->fair_server.runtime -= delta_exec; if (rq->fair_server.runtime < 0) { rq->fair_server.dl_defer_running = 0; -- cgit v1.2.3 From 60bc47b5a0b164082d448815d7db3066266aa3ed Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Tue, 1 Jul 2025 19:02:13 +0800 Subject: watchdog/perf: Provide function for adjusting the event period Architectures using perf events for hard lockup detection need to convert watchdog_thresh to the event's period. Some architectures, for example arm64, perform this conversion using the CPU's maximum frequency, which is acquired via cpufreq. However, by the time the lockup detector is initialized, the cpufreq driver may not be, thus launching the watchdog with an inaccurate period. Provide a function, hardlockup_detector_perf_adjust_period(), to allow adjusting the event period. An architecture can then update to a more accurate period once cpufreq is initialized.
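As a rough sketch of the intended arch-side usage (hypothetical code, not part of this patch; it assumes cpufreq_get_hw_max_freq() as the frequency source), an architecture could recompute the period once cpufreq knows the real maximum frequency:

	/* Must run on the CPU whose watchdog event is to be updated. */
	static void example_update_hld_period(void *unused)
	{
		u64 max_khz = cpufreq_get_hw_max_freq(smp_processor_id());

		if (max_khz)
			/* cycles elapsed in watchdog_thresh seconds at max frequency */
			hardlockup_detector_perf_adjust_period(max_khz * 1000 *
							       watchdog_thresh);
	}

Note that hardlockup_detector_perf_adjust_period() reads the per-CPU watchdog event, so the caller must invoke it on the CPU in question (e.g. via smp_call_function_single()).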
Reviewed-by: Douglas Anderson Signed-off-by: Yicong Yang Link: https://lore.kernel.org/r/20250701110214.27242-2-yangyicong@huawei.com Signed-off-by: Will Deacon --- kernel/watchdog_perf.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index 75af12ff774e..9c58f5b4381d 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -186,6 +186,28 @@ void watchdog_hardlockup_disable(unsigned int cpu) } } +/** + * hardlockup_detector_perf_adjust_period - Adjust the event period due + * to current cpu frequency change + * @period: The target period to be set + */ +void hardlockup_detector_perf_adjust_period(u64 period) +{ + struct perf_event *event = this_cpu_read(watchdog_ev); + + if (!(watchdog_enabled & WATCHDOG_HARDLOCKUP_ENABLED)) + return; + + if (!event) + return; + + if (event->attr.sample_period == period) + return; + + if (perf_event_period(event, period)) + pr_err("failed to change period to %llu\n", period); +} + /** * hardlockup_detector_perf_stop - Globally stop watchdog events * -- cgit v1.2.3 From b86ced882b8e667758afddffd8d6354197842110 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 30 Jun 2025 09:06:37 -0700 Subject: lib/crypto: sha256: Make library API use strongly-typed contexts Currently the SHA-224 and SHA-256 library functions can be mixed arbitrarily, even in ways that are incorrect, for example using sha224_init() and sha256_final(). This is because they operate on the same structure, sha256_state. Introduce stronger typing, as I did for SHA-384 and SHA-512. Also as I did for SHA-384 and SHA-512, use the names *_ctx instead of *_state. The *_ctx names have the following small benefits: - They're shorter. - They avoid an ambiguity with the compression function state. - They're consistent with the well-known OpenSSL API. - Users usually name the variable 'sctx' anyway, which suggests that *_ctx would be the more natural name for the actual struct. Therefore: update the SHA-224 and SHA-256 APIs, implementation, and calling code accordingly. In the new structs, also strongly-type the compression function state. 
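The updated library API then reads as follows (a minimal sketch; mixing in sha224_* calls on a struct sha256_ctx no longer type-checks):

	#include <crypto/sha2.h>

	static void digest_buf(const u8 *data, size_t len,
			       u8 digest[SHA256_DIGEST_SIZE])
	{
		struct sha256_ctx sctx;

		sha256_init(&sctx);
		sha256_update(&sctx, data, len);	/* may be called repeatedly */
		sha256_final(&sctx, digest);
	}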
Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250630160645.3198-7-ebiggers@kernel.org Signed-off-by: Eric Biggers --- kernel/kexec_file.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 69fe76fd9233..b835033c65eb 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -751,7 +751,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf) /* Calculate and store the digest of segments */ static int kexec_calculate_store_digests(struct kimage *image) { - struct sha256_state state; + struct sha256_ctx sctx; int ret = 0, i, j, zero_buf_sz, sha_region_sz; size_t nullsz; u8 digest[SHA256_DIGEST_SIZE]; @@ -770,7 +770,7 @@ static int kexec_calculate_store_digests(struct kimage *image) if (!sha_regions) return -ENOMEM; - sha256_init(&state); + sha256_init(&sctx); for (j = i = 0; i < image->nr_segments; i++) { struct kexec_segment *ksegment; @@ -796,7 +796,7 @@ static int kexec_calculate_store_digests(struct kimage *image) if (check_ima_segment_index(image, i)) continue; - sha256_update(&state, ksegment->kbuf, ksegment->bufsz); + sha256_update(&sctx, ksegment->kbuf, ksegment->bufsz); /* * Assume rest of the buffer is filled with zero and @@ -808,7 +808,7 @@ static int kexec_calculate_store_digests(struct kimage *image) if (bytes > zero_buf_sz) bytes = zero_buf_sz; - sha256_update(&state, zero_buf, bytes); + sha256_update(&sctx, zero_buf, bytes); nullsz -= bytes; } @@ -817,7 +817,7 @@ static int kexec_calculate_store_digests(struct kimage *image) j++; } - sha256_final(&state, digest); + sha256_final(&sctx, digest); ret = kexec_purgatory_get_set_symbol(image, "purgatory_sha_regions", sha_regions, sha_region_sz, 0); -- cgit v1.2.3 From 946a7281982530d333eaee62bd1726f25908b3a9 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 2 Jul 2025 13:52:54 -0400 Subject: smp: Wait only if work was enqueued Whenever work is enqueued for a remote CPU, smp_call_function_many_cond() may need to wait for that work to be completed. However, if no work is enqueued for a remote CPU, because the condition func() evaluated to false for all CPUs, there is no need to wait. Set run_remote only if work was enqueued on remote CPUs. Document the difference between "work enqueued", and "CPU needs to be woken up" Suggested-by: Jann Horn Signed-off-by: Rik van Riel Signed-off-by: Thomas Gleixner Reviewed-by: Yury Norov (NVIDIA) Link: https://lore.kernel.org/all/20250703203019.11331ac3@fangorn --- kernel/smp.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 99d1fd0e9e0e..c5e1da7a88da 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -802,7 +802,6 @@ static void smp_call_function_many_cond(const struct cpumask *mask, /* Check if we need remote execution, i.e., any CPU excluding this one. */ if (cpumask_any_and_but(mask, cpu_online_mask, this_cpu) < nr_cpu_ids) { - run_remote = true; cfd = this_cpu_ptr(&cfd_data); cpumask_and(cfd->cpumask, mask, cpu_online_mask); __cpumask_clear_cpu(this_cpu, cfd->cpumask); @@ -816,6 +815,9 @@ static void smp_call_function_many_cond(const struct cpumask *mask, continue; } + /* Work is enqueued on a remote CPU. */ + run_remote = true; + csd_lock(csd); if (wait) csd->node.u_flags |= CSD_TYPE_SYNC; @@ -827,6 +829,10 @@ static void smp_call_function_many_cond(const struct cpumask *mask, #endif trace_csd_queue_cpu(cpu, _RET_IP_, func, csd); + /* + * Kick the remote CPU if this is the first work + * item enqueued. 
+ */ if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) { __cpumask_set_cpu(cpu, cfd->cpumask_ipi); nr_cpus++; -- cgit v1.2.3 From a33ad03aaed2c39d29a27a64ee55ff919810de59 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 18 Mar 2025 10:23:36 +0100 Subject: rcu/nocb: Dump gp state even if rdp gp itself is not offloaded When a stall is detected, the state of each NOCB CPU is dumped along with the state of each NOCB group. The latter part however is incidentally ignored if the NOCB group leader happens not to be offloaded itself. Fix this to make sure the related precious information isn't lost from a stall report. Reported-by: "Paul E. McKenney" Signed-off-by: Frederic Weisbecker Reviewed-by: "Paul E. McKenney" Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tree_nocb.h | 3 +++ kernel/rcu/tree_stall.h | 3 +-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index b473ff056f49..cb29b6bb0ed4 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1564,6 +1564,9 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) if (rdp->nocb_gp_rdp == rdp) show_rcu_nocb_gp_state(rdp); + if (!rcu_segcblist_is_offloaded(&rdp->cblist)) + return; + nocb_next_rdp = list_next_or_null_rcu(&rdp->nocb_gp_rdp->nocb_head_rdp, &rdp->nocb_entry_rdp, typeof(*rdp), diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 486c00536207..4fa64c959083 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -953,8 +953,7 @@ void show_rcu_gp_kthreads(void) for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); cbs += data_race(READ_ONCE(rdp->n_cbs_invoked)); - if (rcu_segcblist_is_offloaded(&rdp->cblist)) - show_rcu_nocb_state(rdp); + show_rcu_nocb_state(rdp); } pr_info("RCU callbacks invoked since boot: %lu\n", cbs); show_rcu_tasks_gp_kthreads(); -- cgit v1.2.3 From 005b6187705bc9723518ce19c5cb911fc1f7ef07 Mon Sep 17 00:00:00 2001 From: Artem Sadovnikov Date: Sun, 29 Jun 2025 23:12:12 +0000 Subject: refscale: Check that nreaders and loops multiplication doesn't overflow The nreaders and loops variables are exposed as module parameters, which, in certain combinations, can lead to multiplication overflow. Besides, the loops parameter is defined as long, while throughout the code it is used as an int, which can cause truncation on 64-bit kernels and possible zeroes where they shouldn't appear. Since the code uses the result of the multiplication as an int anyway, it only makes sense to make loops an int as well. A multiplication overflow check is also added, since two very big numbers may be multiplied. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 653ed64b01dc ("refperf: Add a test to measure performance of read-side synchronization") Signed-off-by: Artem Sadovnikov Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/refscale.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index f11a7c2af778..ab7fcdc94cc0 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -85,7 +85,7 @@ torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0, // Number of typesafe_lookup structures, that is, the degree of concurrency. torture_param(long, lookup_instances, 0, "Number of typesafe_lookup structures."); // Number of loops per experiment, all readers execute operations concurrently.
-torture_param(long, loops, 10000, "Number of loops per experiment."); +torture_param(int, loops, 10000, "Number of loops per experiment."); // Number of readers, with -1 defaulting to about 75% of the CPUs. torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs."); // Number of runs. @@ -1140,7 +1140,7 @@ static void ref_scale_print_module_parms(const struct ref_scale_ops *cur_ops, const char *tag) { pr_alert("%s" SCALE_FLAG - "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, + "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%d nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, verbose, verbose_batched, shutdown, holdoff, lookup_instances, loops, nreaders, nruns, readdelay); } @@ -1238,12 +1238,16 @@ ref_scale_init(void) // Reader tasks (default to ~75% of online CPUs). if (nreaders < 0) nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2); - if (WARN_ONCE(loops <= 0, "%s: loops = %ld, adjusted to 1\n", __func__, loops)) + if (WARN_ONCE(loops <= 0, "%s: loops = %d, adjusted to 1\n", __func__, loops)) loops = 1; if (WARN_ONCE(nreaders <= 0, "%s: nreaders = %d, adjusted to 1\n", __func__, nreaders)) nreaders = 1; if (WARN_ONCE(nruns <= 0, "%s: nruns = %d, adjusted to 1\n", __func__, nruns)) nruns = 1; + if (WARN_ONCE(loops > INT_MAX / nreaders, + "%s: nreaders * loops will overflow, adjusted loops to %d", + __func__, INT_MAX / nreaders)) + loops = INT_MAX / nreaders; reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]), GFP_KERNEL); if (!reader_tasks) { -- cgit v1.2.3 From b9d44bc9fd30550052d0854d71b0c0731dc9f053 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 4 Jul 2025 16:03:47 -0700 Subject: bpf: make mark_btf_ld_reg return error for unexpected reg types Non-functional change: mark_btf_ld_reg() expects the 'reg_type' parameter to be either SCALAR_VALUE or PTR_TO_BTF_ID. The next commit expands this set, so update this function to fail if an unexpected type is passed. Also update callers to propagate the error.
Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250704230354.1323244-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 59 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0f6cc2275695..9e8328f40b88 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2796,22 +2796,28 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, __mark_reg_not_init(env, regs + regno); } -static void mark_btf_ld_reg(struct bpf_verifier_env *env, - struct bpf_reg_state *regs, u32 regno, - enum bpf_reg_type reg_type, - struct btf *btf, u32 btf_id, - enum bpf_type_flag flag) +static int mark_btf_ld_reg(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno, + enum bpf_reg_type reg_type, + struct btf *btf, u32 btf_id, + enum bpf_type_flag flag) { - if (reg_type == SCALAR_VALUE) { + switch (reg_type) { + case SCALAR_VALUE: mark_reg_unknown(env, regs, regno); - return; + return 0; + case PTR_TO_BTF_ID: + mark_reg_known_zero(env, regs, regno); + regs[regno].type = PTR_TO_BTF_ID | flag; + regs[regno].btf = btf; + regs[regno].btf_id = btf_id; + if (type_may_be_null(flag)) + regs[regno].id = ++env->id_gen; + return 0; + default: + verifier_bug(env, "unexpected reg_type %d in %s\n", reg_type, __func__); + return -EFAULT; } - mark_reg_known_zero(env, regs, regno); - regs[regno].type = PTR_TO_BTF_ID | flag; - regs[regno].btf = btf; - regs[regno].btf_id = btf_id; - if (type_may_be_null(flag)) - regs[regno].id = ++env->id_gen; } #define DEF_NOT_SUBREG (0) @@ -5965,6 +5971,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; int class = BPF_CLASS(insn->code); struct bpf_reg_state *val_reg; + int ret; /* Things we already checked for in check_map_access and caller: * - Reject cases where variable offset may touch kptr @@ -5998,8 +6005,11 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, /* We can simply mark the value_regno receiving the pointer * value from map as PTR_TO_BTF_ID, with the correct type. 
*/ - mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf, - kptr_field->kptr.btf_id, btf_ld_kptr_type(env, kptr_field)); + ret = mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, + kptr_field->kptr.btf, kptr_field->kptr.btf_id, + btf_ld_kptr_type(env, kptr_field)); + if (ret < 0) + return ret; } else if (class == BPF_STX) { val_reg = reg_state(env, value_regno); if (!register_is_null(val_reg) && @@ -7298,8 +7308,11 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, clear_trusted_flags(&flag); } - if (atype == BPF_READ && value_regno >= 0) - mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag); + if (atype == BPF_READ && value_regno >= 0) { + ret = mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag); + if (ret < 0) + return ret; + } return 0; } @@ -7353,13 +7366,19 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, /* Simulate access to a PTR_TO_BTF_ID */ memset(&map_reg, 0, sizeof(map_reg)); - mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, btf_vmlinux, *map->ops->map_btf_id, 0); + ret = mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, + btf_vmlinux, *map->ops->map_btf_id, 0); + if (ret < 0) + return ret; ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag, NULL); if (ret < 0) return ret; - if (value_regno >= 0) - mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id, flag); + if (value_regno >= 0) { + ret = mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id, flag); + if (ret < 0) + return ret; + } return 0; } -- cgit v1.2.3 From 2d5c91e1cc14c3511d163ac36ee869ecf3a29cc2 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 4 Jul 2025 16:03:48 -0700 Subject: bpf: rdonly_untrusted_mem for btf id walk pointer leafs When processing a load from a PTR_TO_BTF_ID, the verifier calculates the type of the loaded structure field based on the load offset. For example, given the following types: struct foo { struct foo *a; int *b; } *p; The verifier would calculate the type of `p->a` as a pointer to `struct foo`. However, the type of `p->b` is currently calculated as a SCALAR_VALUE. This commit updates the logic for processing PTR_TO_BTF_ID to instead calculate the type of p->b as PTR_TO_MEM|MEM_RDONLY|PTR_UNTRUSTED. This change allows further dereferencing of such pointers (using probe memory instructions). 
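Given the struct foo from the changelog, a sketch of what this enables on the BPF side (illustration only; the register-state names are those described in this message):

	int read_b(struct foo *p)
	{
		struct foo *a = p->a;	/* still PTR_TO_BTF_ID, as before */
		int *b = p->b;		/* was SCALAR_VALUE, now PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED */

		return (a && b) ? *b : 0;	/* the load is emitted as a probe read,
						 * so a bad pointer faults safely */
	}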
Suggested-by: Alexei Starovoitov Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250704230354.1323244-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 6 ++++++ kernel/bpf/verifier.c | 5 +++++ 2 files changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 05fd64a371af..b3c8a95d38fb 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6915,6 +6915,7 @@ enum bpf_struct_walk_result { /* < 0 error */ WALK_SCALAR = 0, WALK_PTR, + WALK_PTR_UNTRUSTED, WALK_STRUCT, }; @@ -7156,6 +7157,8 @@ error: *field_name = mname; return WALK_PTR; } + + return WALK_PTR_UNTRUSTED; } /* Allow more flexible access within an int as long as @@ -7228,6 +7231,9 @@ int btf_struct_access(struct bpf_verifier_log *log, *next_btf_id = id; *flag = tmp_flag; return PTR_TO_BTF_ID; + case WALK_PTR_UNTRUSTED: + *flag = MEM_RDONLY | PTR_UNTRUSTED; + return PTR_TO_MEM; case WALK_SCALAR: return SCALAR_VALUE; case WALK_STRUCT: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9e8328f40b88..87ab00b40d9f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2814,6 +2814,11 @@ static int mark_btf_ld_reg(struct bpf_verifier_env *env, if (type_may_be_null(flag)) regs[regno].id = ++env->id_gen; return 0; + case PTR_TO_MEM: + mark_reg_known_zero(env, regs, regno); + regs[regno].type = PTR_TO_MEM | flag; + regs[regno].mem_size = 0; + return 0; default: verifier_bug(env, "unexpected reg_type %d in %s\n", reg_type, __func__); return -EFAULT; -- cgit v1.2.3 From 182f7df70419f368c4310dc151677d574e53c44a Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 4 Jul 2025 16:03:50 -0700 Subject: bpf: attribute __arg_untrusted for global function parameters Add support for PTR_TO_BTF_ID | PTR_UNTRUSTED global function parameters. Anything is allowed to pass to such parameters, as these are read-only and probe read instructions would protect against invalid memory access. 
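On the BPF side, a global subprogram could then be declared roughly like this (a sketch; the __arg_untrusted macro is assumed to expand to the usual decl tag, mirroring libbpf's other per-argument tags):

	#define __arg_untrusted __attribute__((btf_decl_tag("arg:untrusted")))

	/* Any pointer, trusted or not, may be passed as @t. */
	__weak int get_pid(struct task_struct *t __arg_untrusted)
	{
		return t ? t->pid : 0;	/* loads through @t become probe reads */
	}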
Suggested-by: Alexei Starovoitov Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250704230354.1323244-5-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 38 +++++++++++++++++++++++++++++++++----- kernel/bpf/verifier.c | 6 ++++++ 2 files changed, 39 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index b3c8a95d38fb..e0414d9f5e29 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -7646,11 +7646,12 @@ cand_cache_unlock: } enum btf_arg_tag { - ARG_TAG_CTX = BIT_ULL(0), - ARG_TAG_NONNULL = BIT_ULL(1), - ARG_TAG_TRUSTED = BIT_ULL(2), - ARG_TAG_NULLABLE = BIT_ULL(3), - ARG_TAG_ARENA = BIT_ULL(4), + ARG_TAG_CTX = BIT_ULL(0), + ARG_TAG_NONNULL = BIT_ULL(1), + ARG_TAG_TRUSTED = BIT_ULL(2), + ARG_TAG_UNTRUSTED = BIT_ULL(3), + ARG_TAG_NULLABLE = BIT_ULL(4), + ARG_TAG_ARENA = BIT_ULL(5), }; /* Process BTF of a function to produce high-level expectation of function @@ -7758,6 +7759,8 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) tags |= ARG_TAG_CTX; } else if (strcmp(tag, "trusted") == 0) { tags |= ARG_TAG_TRUSTED; + } else if (strcmp(tag, "untrusted") == 0) { + tags |= ARG_TAG_UNTRUSTED; } else if (strcmp(tag, "nonnull") == 0) { tags |= ARG_TAG_NONNULL; } else if (strcmp(tag, "nullable") == 0) { @@ -7818,6 +7821,31 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) sub->args[i].btf_id = kern_type_id; continue; } + if (tags & ARG_TAG_UNTRUSTED) { + struct btf *vmlinux_btf; + int kern_type_id; + + if (tags & ~ARG_TAG_UNTRUSTED) { + bpf_log(log, "arg#%d untrusted cannot be combined with any other tags\n", i); + return -EINVAL; + } + + kern_type_id = btf_get_ptr_to_btf_id(log, i, btf, t); + if (kern_type_id < 0) + return kern_type_id; + + vmlinux_btf = bpf_get_btf_vmlinux(); + ref_t = btf_type_by_id(vmlinux_btf, kern_type_id); + if (!btf_type_is_struct(ref_t)) { + tname = __btf_name_by_offset(vmlinux_btf, t->name_off); + bpf_log(log, "arg#%d has type %s '%s', but only struct types are allowed\n", + i, btf_type_str(ref_t), tname); + return -EINVAL; + } + sub->args[i].arg_type = ARG_PTR_TO_BTF_ID | PTR_UNTRUSTED; + sub->args[i].btf_id = kern_type_id; + continue; + } if (tags & ARG_TAG_ARENA) { if (tags & ~ARG_TAG_ARENA) { bpf_log(log, "arg#%d arena cannot be combined with any other tags\n", i); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 87ab00b40d9f..7af902c3ecc3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10437,6 +10437,12 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, bpf_log(log, "R%d is not a scalar\n", regno); return -EINVAL; } + } else if (arg->arg_type & PTR_UNTRUSTED) { + /* + * Anything is allowed for untrusted arguments, as these are + * read-only and probe read instructions would protect against + * invalid memory access. + */ } else if (arg->arg_type == ARG_PTR_TO_CTX) { ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE); if (ret < 0) -- cgit v1.2.3 From c4aa454c64ae022e5a9d55b3c31e9b8dd8a1544f Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 4 Jul 2025 16:03:53 -0700 Subject: bpf: support for void/primitive __arg_untrusted global func params Allow specifying __arg_untrusted for void */char */int */long * parameters. Treat such parameters as PTR_TO_MEM|MEM_RDONLY|PTR_UNTRUSTED of size zero. 
Intended usage is as follows: int memcmp(char *a __arg_untrusted, char *b __arg_untrusted, size_t n) { bpf_for(i, 0, n) { if (a[i] - b[i]) // load at any offset is allowed return a[i] - b[i]; } return 0; } Allocate register id for ARG_PTR_TO_MEM parameters only when PTR_MAYBE_NULL is set. Register id for PTR_TO_MEM is used only to propagate non-null status after conditionals. Suggested-by: Alexei Starovoitov Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250704230354.1323244-8-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 15 ++++++++++++++- kernel/bpf/verifier.c | 7 ++++--- 2 files changed, 18 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index e0414d9f5e29..2dd13eea7b0e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -891,6 +891,12 @@ bool btf_type_is_i64(const struct btf_type *t) return btf_type_is_int(t) && __btf_type_int_is_regular(t, 8); } +bool btf_type_is_primitive(const struct btf_type *t) +{ + return (btf_type_is_int(t) && btf_type_int_is_regular(t)) || + btf_is_any_enum(t); +} + /* * Check that given struct member is a regular int with expected * offset and size. @@ -7830,6 +7836,13 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) return -EINVAL; } + ref_t = btf_type_skip_modifiers(btf, t->type, NULL); + if (btf_type_is_void(ref_t) || btf_type_is_primitive(ref_t)) { + sub->args[i].arg_type = ARG_PTR_TO_MEM | MEM_RDONLY | PTR_UNTRUSTED; + sub->args[i].mem_size = 0; + continue; + } + kern_type_id = btf_get_ptr_to_btf_id(log, i, btf, t); if (kern_type_id < 0) return kern_type_id; @@ -7838,7 +7851,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog) ref_t = btf_type_by_id(vmlinux_btf, kern_type_id); if (!btf_type_is_struct(ref_t)) { tname = __btf_name_by_offset(vmlinux_btf, t->name_off); - bpf_log(log, "arg#%d has type %s '%s', but only struct types are allowed\n", + bpf_log(log, "arg#%d has type %s '%s', but only struct or primitive types are allowed\n", i, btf_type_str(ref_t), tname); return -EINVAL; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7af902c3ecc3..1e567fff6f23 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -23152,11 +23152,12 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) __mark_dynptr_reg(reg, BPF_DYNPTR_TYPE_LOCAL, true, ++env->id_gen); } else if (base_type(arg->arg_type) == ARG_PTR_TO_MEM) { reg->type = PTR_TO_MEM; - if (arg->arg_type & PTR_MAYBE_NULL) - reg->type |= PTR_MAYBE_NULL; + reg->type |= arg->arg_type & + (PTR_MAYBE_NULL | PTR_UNTRUSTED | MEM_RDONLY); mark_reg_known_zero(env, regs, i); reg->mem_size = arg->mem_size; - reg->id = ++env->id_gen; + if (arg->arg_type & PTR_MAYBE_NULL) + reg->id = ++env->id_gen; } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) { reg->type = PTR_TO_BTF_ID; if (arg->arg_type & PTR_MAYBE_NULL) -- cgit v1.2.3 From 116c8f474722bd06f314ca88fb1a01e5526e3366 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 4 Jul 2025 22:30:34 -0700 Subject: bpf: Fix bounds for bpf_prog_get_file_line linfo loop We may overrun the bounds because linfo and jited_linfo are already advanced to prog->aux->linfo_idx, hence we must only iterate the remaining elements until we reach prog->aux->nr_linfo. Adjust the nr_linfo calculation to fix this. Reported in [0]. 
[0]: https://lore.kernel.org/bpf/f3527af3b0620ce36e299e97e7532d2555018de2.camel@gmail.com Reported-by: Eduard Zingerman Fixes: 0e521efaf363 ("bpf: Add function to extract program source info") Signed-off-by: Kumar Kartikeya Dwivedi Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250705053035.3020320-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index fe8a53f3c5bc..61613785bdd0 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -3244,6 +3244,7 @@ int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char * struct bpf_line_info *linfo; void **jited_linfo; struct btf *btf; + int nr_linfo; btf = prog->aux->btf; linfo = prog->aux->linfo; @@ -3258,8 +3259,9 @@ int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char * insn_start = linfo[0].insn_off; insn_end = insn_start + len; + nr_linfo = prog->aux->nr_linfo - prog->aux->linfo_idx; - for (int i = 0; i < prog->aux->nr_linfo && + for (int i = 0; i < nr_linfo && linfo[i].insn_off >= insn_start && linfo[i].insn_off < insn_end; i++) { if (jited_linfo[i] >= (void *)ip) break; -- cgit v1.2.3 From bfa2bb9abd99beff078eaf9d9b59dbd4eb726040 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 4 Jul 2025 22:30:35 -0700 Subject: bpf: Fix improper int-to-ptr cast in dump_stack_cb On 32-bit platforms, we'll try to convert a u64 directly to a pointer type which is 32-bit, which causes the compiler to complain about cast from an integer of a different size to a pointer type. Cast to long before casting to the pointer type to match the pointer width. Reported-by: kernelci.org bot Reported-by: Randy Dunlap Fixes: d7c431cafcb4 ("bpf: Add dump_stack() analogue to print to BPF stderr") Signed-off-by: Kumar Kartikeya Dwivedi Acked-by: Randy Dunlap Tested-by: Randy Dunlap Link: https://lore.kernel.org/r/20250705053035.3020320-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/stream.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c index 8c842f845245..ab592db4a4bf 100644 --- a/kernel/bpf/stream.c +++ b/kernel/bpf/stream.c @@ -498,11 +498,11 @@ static bool dump_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp) if (ret < 0) goto end; ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n %s @ %s:%d\n", - (void *)ip, line, file, num); + (void *)(long)ip, line, file, num); return !ctxp->err; } end: - ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n", (void *)ip); + ctxp->err = bpf_stream_stage_printk(ctxp->ss, "%pS\n", (void *)(long)ip); return !ctxp->err; } -- cgit v1.2.3 From dadb59104c6441f54d0c42bba3e4bd11e25fc6d9 Mon Sep 17 00:00:00 2001 From: Luis Gerhorst Date: Sat, 5 Jul 2025 21:09:07 +0200 Subject: bpf: Fix aux usage after do_check_insn() We must terminate the speculative analysis if the just-analyzed insn had nospec_result set. Using cur_aux() here is wrong because insn_idx might have been incremented by do_check_insn(). Therefore, introduce and use insn_aux variable. Also change cur_aux(env)->nospec in case do_check_insn() ever manages to increment insn_idx but still fail. Change the warning to check the insn class (which prevents it from triggering for ldimm64, for which nospec_result would not be problematic) and use verifier_bug_if(). 
In line with Eduard's suggestion, do not introduce prev_aux() because that requires one to understand that after the do_check_insn() call what was current became previous. This would at least require a comment. Fixes: d6f1c85f2253 ("bpf: Fall back to nospec for Spectre v1") Reported-by: Paul Chaignon Reported-by: Eduard Zingerman Reported-by: syzbot+dc27c5fb8388e38d2d37@syzkaller.appspotmail.com Link: https://lore.kernel.org/bpf/685b3c1b.050a0220.2303ee.0010.GAE@google.com/ Link: https://lore.kernel.org/bpf/4266fd5de04092aa4971cbef14f1b4b96961f432.camel@gmail.com/ Suggested-by: Eduard Zingerman Signed-off-by: Luis Gerhorst Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250705190908.1756862-2-luis.gerhorst@fau.de Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1e567fff6f23..53007182b46b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19953,6 +19953,7 @@ static int do_check(struct bpf_verifier_env *env) for (;;) { struct bpf_insn *insn; + struct bpf_insn_aux_data *insn_aux; int err; /* reset current history entry on each new instruction */ @@ -19966,6 +19967,7 @@ static int do_check(struct bpf_verifier_env *env) } insn = &insns[env->insn_idx]; + insn_aux = &env->insn_aux_data[env->insn_idx]; if (++env->insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { verbose(env, @@ -20042,7 +20044,7 @@ static int do_check(struct bpf_verifier_env *env) /* Reduce verification complexity by stopping speculative path * verification when a nospec is encountered. */ - if (state->speculative && cur_aux(env)->nospec) + if (state->speculative && insn_aux->nospec) goto process_bpf_exit; err = do_check_insn(env, &do_print_state); @@ -20050,11 +20052,11 @@ static int do_check(struct bpf_verifier_env *env) /* Prevent this speculative path from ever reaching the * insn that would have been unsafe to execute. */ - cur_aux(env)->nospec = true; + insn_aux->nospec = true; /* If it was an ADD/SUB insn, potentially remove any * markings for alu sanitization. */ - cur_aux(env)->alu_state = 0; + insn_aux->alu_state = 0; goto process_bpf_exit; } else if (err < 0) { return err; @@ -20063,7 +20065,7 @@ } WARN_ON_ONCE(err); - if (state->speculative && cur_aux(env)->nospec_result) { + if (state->speculative && insn_aux->nospec_result) { /* If we are on a path that performed a jump-op, this * may skip a nospec patched-in after the jump. This can * currently never happen because nospec_result is only * used for the write-ops * never skip the following insn. Still, add a warning * to document this in case nospec_result is used * elsewhere in the future. + * + * All non-branch instructions have a single + * fall-through edge. For these, nospec_result should + * already work.
*/ - WARN_ON_ONCE(env->insn_idx != prev_insn_idx + 1); + if (verifier_bug_if(BPF_CLASS(insn->code) == BPF_JMP || + BPF_CLASS(insn->code) == BPF_JMP32, env, + "speculation barrier after jump instruction may not have the desired effect")) + return -EFAULT; process_bpf_exit: mark_verifier_state_scratched(env); err = update_branch_counts(env, env->cur_state); -- cgit v1.2.3 From 3413bc0cf16e38db53830c837b4bf639fb2126ef Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Fri, 4 Jul 2025 00:37:00 +0800 Subject: bpf: Clean code with bpf_copy_to_user() No logic change, use bpf_copy_to_user() to clean code. Signed-off-by: Tao Chen Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20250703163700.677628-1-chen.dylane@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7db7182a3057..3f36bfe13266 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5344,21 +5344,10 @@ static int bpf_task_fd_query_copy(const union bpf_attr *attr, if (put_user(zero, ubuf)) return -EFAULT; - } else if (input_len >= len + 1) { - /* ubuf can hold the string with NULL terminator */ - if (copy_to_user(ubuf, buf, len + 1)) - return -EFAULT; } else { - /* ubuf cannot hold the string with NULL terminator, - * do a partial copy with NULL terminator. - */ - char zero = '\0'; - - err = -ENOSPC; - if (copy_to_user(ubuf, buf, input_len - 1)) - return -EFAULT; - if (put_user(zero, ubuf + input_len - 1)) - return -EFAULT; + err = bpf_copy_to_user(ubuf, buf, input_len, len); + if (err == -EFAULT) + return err; } } -- cgit v1.2.3 From a683a5b2ba23598ad343e5ec10a4ef4077497fc9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 2 Jul 2025 06:34:37 +0100 Subject: fold fs_struct->{lock,seq} into a seqlock The combination of spinlock_t lock and seqcount_spinlock_t seq in struct fs_struct is an open-coded seqlock_t (see linux/seqlock_types.h). Combine and switch to equivalent seqlock_t primitives. AFAICS, that does end up with the same sequence of underlying operations in all cases. While we are at it, get_fs_pwd() is open-coded verbatim in get_path_from_fd(); rather than applying conversion to it, replace with the call of get_fs_pwd() there. Not worth splitting the commit for that, IMO... A bit of historical background - conversion of seqlock_t to use of seqcount_spinlock_t happened several months after the same had been done to struct fs_struct; switching fs_struct to seqlock_t could've been done immediately after that, but it looks like nobody had gotten around to that until now. Signed-off-by: Al Viro Link: https://lore.kernel.org/20250702053437.GC1880847@ZenIV Acked-by: Ahmed S. 
Darwish Acked-by: Peter Zijlstra (Intel) Reviewed-by: Christian Brauner Signed-off-by: Christian Brauner --- kernel/fork.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 1ee8eb11f38b..6318a25a16ba 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1542,14 +1542,14 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) struct fs_struct *fs = current->fs; if (clone_flags & CLONE_FS) { /* tsk->fs is already what we want */ - spin_lock(&fs->lock); + read_seqlock_excl(&fs->seq); /* "users" and "in_exec" locked for check_unsafe_exec() */ if (fs->in_exec) { - spin_unlock(&fs->lock); + read_sequnlock_excl(&fs->seq); return -EAGAIN; } fs->users++; - spin_unlock(&fs->lock); + read_sequnlock_excl(&fs->seq); return 0; } tsk->fs = copy_fs_struct(fs); @@ -3149,13 +3149,13 @@ int ksys_unshare(unsigned long unshare_flags) if (new_fs) { fs = current->fs; - spin_lock(&fs->lock); + read_seqlock_excl(&fs->seq); current->fs = new_fs; if (--fs->users) new_fs = NULL; else new_fs = fs; - spin_unlock(&fs->lock); + read_sequnlock_excl(&fs->seq); } if (new_fd) -- cgit v1.2.3 From 4b9432ed65cb3a692e8d8bd86abb93f5f2dc5b69 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Apr 2025 15:43:01 +0200 Subject: rcu/exp: Remove confusing needless full barrier on task unblock A full memory barrier in the RCU-PREEMPT task unblock path advertises ordering of the context switch (or rather the accesses prior to rcu_read_unlock()) against the expedited grace period fastpath. However, the grace period cannot complete without rcu_report_exp_rnp() being called with the rnp node locked. This reports the quiescent state in a fully ordered fashion against the updater's accesses thanks to: 1) The READ-SIDE smp_mb__after_unlock_lock() barrier across node locking while propagating QS up to the root. 2) The UPDATE-SIDE smp_mb__after_unlock_lock() barrier while holding the root rnp to wait/check for the GP completion. 3) The (perhaps redundant, given steps 1) and 2)) smp_mb() in rcu_seq_end() before the grace period completes. This makes the explicit barrier in this place superfluous. Therefore remove it, as it is confusing. Acked-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Signed-off-by: Joel Fernandes Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tree_plugin.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0b0f56f6abc8..0532a13cb75e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -534,7 +534,6 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq && (!empty_norm || rnp->qsmask)); empty_exp = sync_rcu_exp_done(rnp); - smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ np = rcu_next_node_entry(t, rnp); list_del_init(&t->rcu_node_entry); t->rcu_blocked_node = NULL; -- cgit v1.2.3 From bf0a57445d3fddfb47046aeccf4f5cb938a4e996 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Apr 2025 15:43:02 +0200 Subject: rcu/exp: Remove needless CPU up quiescent state report A CPU coming online checks for an ongoing grace period and reports a quiescent state accordingly if needed.
This special treatment, which shortcuts the expedited IPI, originates as an optimization introduced by the following commit: 338b0f760e84 ("rcu: Better hotplug handling for synchronize_sched_expedited()") The point is to avoid an IPI while waiting for a CPU to become online or failing to become offline. However, this is pointless and even error-prone, for several reasons: * If the CPU has been seen offline in the first round scanning offline and idle CPUs, no IPI is even tried and the quiescent state is reported on behalf of the CPU. * This means that if the IPI fails, the CPU just became offline. So it's unlikely to become online right away, unless the cpu hotplug operation failed and rolled back, which is a rare event that can wait a jiffy for a new IPI to be issued. * Moreover, the "optimization" for a failing CPU-hotplug-down operation only applies to !PREEMPT_RCU. * This forcibly reports a quiescent state even if ->cpu_no_qs.b.exp is not set. As a result it can race with remote QS reports on the same rdp. Fortunately it happens to be OK, but an accident is waiting to happen. For all those reasons, remove this optimization, which doesn't look worth keeping around. Reported-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Reviewed-by: Paul E. McKenney Signed-off-by: Joel Fernandes Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tree.c | 2 -- kernel/rcu/tree_exp.h | 45 ++------------------------------------------- 2 files changed, 2 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 14d4499c6fc3..0bda23fec690 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -160,7 +160,6 @@ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, unsigned long gps, unsigned long flags); static void invoke_rcu_core(void); static void rcu_report_exp_rdp(struct rcu_data *rdp); -static void sync_sched_exp_online_cleanup(int cpu); static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp); static bool rcu_rdp_is_offloaded(struct rcu_data *rdp); static bool rcu_rdp_cpu_online(struct rcu_data *rdp); @@ -4268,7 +4267,6 @@ int rcutree_online_cpu(unsigned int cpu) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) return 0; /* Too early in boot for scheduler work. */ - sync_sched_exp_online_cleanup(cpu); // Stop-machine done, so allow nohz_full to disable tick. tick_dep_clear(TICK_DEP_BIT_RCU); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 2fa7aa9155bd..6058a734090c 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -751,12 +751,8 @@ static void rcu_exp_handler(void *unused) struct task_struct *t = current; /* - * First, is there no need for a quiescent state from this CPU, - * or is this CPU already looking for a quiescent state for the - * current grace period? If either is the case, just leave. - * However, this should not happen due to the preemptible - * sync_sched_exp_online_cleanup() implementation being a no-op, - * so warn if this does happen. + * WARN if the CPU is unexpectedly already looking for a + * QS or has already reported one. */ ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp); if (WARN_ON_ONCE(!(READ_ONCE(rnp->expmask) & rdp->grpmask) || @@ -803,11 +799,6 @@ static void rcu_exp_handler(void *unused) WARN_ON_ONCE(1); } -/* PREEMPTION=y, so no PREEMPTION=n expedited grace period to clean up after.
*/ -static void sync_sched_exp_online_cleanup(int cpu) -{ -} /* * Scan the current list of tasks blocked within RCU read-side critical * sections, printing out the tid of each that is blocking the current @@ -885,38 +876,6 @@ static void rcu_exp_handler(void *unused) rcu_exp_need_qs(); } -/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ -static void sync_sched_exp_online_cleanup(int cpu) -{ - unsigned long flags; - int my_cpu; - struct rcu_data *rdp; - int ret; - struct rcu_node *rnp; - - rdp = per_cpu_ptr(&rcu_data, cpu); - rnp = rdp->mynode; - my_cpu = get_cpu(); - /* Quiescent state either not needed or already requested, leave. */ - if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || - READ_ONCE(rdp->cpu_no_qs.b.exp)) { - put_cpu(); - return; - } - /* Quiescent state needed on current CPU, so set it up locally. */ - if (my_cpu == cpu) { - local_irq_save(flags); - rcu_exp_need_qs(); - local_irq_restore(flags); - put_cpu(); - return; - } - /* Quiescent state needed on some other CPU, send IPI. */ - ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); - put_cpu(); - WARN_ON_ONCE(ret); -} - /* * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections that are -- cgit v1.2.3 From fc39760cd0f432cd399a209c43dcb887fffdb6dc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Apr 2025 15:43:03 +0200 Subject: rcu/exp: Warn on QS requested on dying CPU It is not possible to send an IPI to a dying CPU that has passed the CPUHP_TEARDOWN_CPU stage. Remaining unhandled IPIs are handled later at the CPUHP_AP_SMPCFD_DYING stage by stop machine. This is the last opportunity for the RCU exp handler to request an expedited quiescent state. And the upcoming final context switch between stop machine and idle must have reported the requested quiescent state. Therefore, it should not be possible to observe a pending requested expedited quiescent state when RCU finally stops watching the outgoing CPU. Once IPIs aren't possible anymore, the QS for the target CPU will be reported on its behalf by the RCU exp kworker. Provide an assertion to verify those expectations. Reviewed-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Signed-off-by: Joel Fernandes Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tree.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0bda23fec690..00c182b3f978 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4356,6 +4356,12 @@ void rcutree_report_cpu_dead(void) * may introduce a new READ-side while it is actually off the QS masks. */ lockdep_assert_irqs_disabled(); + /* + * CPUHP_AP_SMPCFD_DYING was the last call for rcu_exp_handler() execution. + * The requested QS must have been reported on the last context switch + * from stop machine to idle. + */ + WARN_ON_ONCE(rdp->cpu_no_qs.b.exp); // Do any dangling deferred wakeups.
do_nocb_deferred_wakeup(rdp); -- cgit v1.2.3 From 1bba3900ca18bdae28d1b9fa10f16a8f8cb2ada1 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Wed, 7 May 2025 19:26:05 +0800 Subject: rcu/nocb: Fix possible invalid rdp->nocb_cb_kthread pointer access In the preparation stage of CPU online, the corresponding rdp->nocb_cb_kthread is created if it does not yet exist. There is a situation where creation of the rdp's rcuop kthread fails and this CPU's rdp is then de-offloaded; in that case this CPU's rdp->nocb_cb_kthread pointer is never assigned, but the rdp's ->nocb_gp_rdp and rdp->rdp_gp->nocb_gp_kthread remain valid. A subsequent re-offload operation for this offline CPU will then pass the conditional check, and kthread_unpark() will access the invalid rdp->nocb_cb_kthread pointer. This commit therefore uses rdp->nocb_gp_kthread instead of rdp_gp->nocb_gp_kthread for the safety check. Signed-off-by: Zqiang Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tree_nocb.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index cb29b6bb0ed4..08eb9b0e2fab 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1146,7 +1146,6 @@ static bool rcu_nocb_rdp_offload_wait_cond(struct rcu_data *rdp) static int rcu_nocb_rdp_offload(struct rcu_data *rdp) { int wake_gp; - struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; WARN_ON_ONCE(cpu_online(rdp->cpu)); /* @@ -1156,7 +1155,7 @@ static int rcu_nocb_rdp_offload(struct rcu_data *rdp) if (!rdp->nocb_gp_rdp) return -EINVAL; - if (WARN_ON_ONCE(!rdp_gp->nocb_gp_kthread)) + if (WARN_ON_ONCE(!rdp->nocb_gp_kthread)) return -EINVAL; pr_info("Offloading %d\n", rdp->cpu); @@ -1166,7 +1165,7 @@ static int rcu_nocb_rdp_offload(struct rcu_data *rdp) wake_gp = rcu_nocb_queue_toggle_rdp(rdp); if (wake_gp) - wake_up_process(rdp_gp->nocb_gp_kthread); + wake_up_process(rdp->nocb_gp_kthread); swait_event_exclusive(rdp->nocb_state_wq, rcu_nocb_rdp_offload_wait_cond(rdp)); -- cgit v1.2.3 From ca3881f6fd8e9b6eb2d51e8718d07d3b8029d886 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Wed, 18 Jun 2025 14:26:26 +0200 Subject: module: Fix memory deallocation on error path in move_module() The function move_module() uses the variable t to track how many memory types it has allocated and consequently how many should be freed if an error occurs. The variable is initially set to 0 and is updated when a call to module_memory_alloc() fails. However, move_module() can fail for other reasons as well, in which case t remains set to 0 and no memory is freed. Fix the problem by initializing t to MOD_MEM_NUM_TYPES. Additionally, make the deallocation loop more robust by not relying on the mod_mem_type_t enum having a signed integer as its underlying type.
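The signedness hazard mentioned in the last sentence, sketched (the two loops below are alternatives; mod_mem_type_t's underlying integer type is up to the compiler):

	enum mod_mem_type t = 0;	/* e.g. the very first allocation failed */

	/* Fragile: if the enum's underlying type is unsigned, t-- wraps to
	 * a huge value and "t >= 0" is always true. */
	for (t--; t >= 0; t--)
		module_memory_free(mod, t);

	/* Robust: visits t-1, ..., 0 and then stops, for signed and
	 * unsigned counters alike. */
	while (t--)
		module_memory_free(mod, t);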
Fixes: c7ee8aebf6c0 ("module: add stop-grap sanity check on module memcpy()") Signed-off-by: Petr Pavlu Reviewed-by: Sami Tolvanen Reviewed-by: Daniel Gomez Link: https://lore.kernel.org/r/20250618122730.51324-2-petr.pavlu@suse.com Signed-off-by: Daniel Gomez Message-ID: <20250618122730.51324-2-petr.pavlu@suse.com> --- kernel/module/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module/main.c b/kernel/module/main.c index 413ac6ea3702..9ac994b2f354 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -2697,7 +2697,7 @@ static int find_module_sections(struct module *mod, struct load_info *info) static int move_module(struct module *mod, struct load_info *info) { int i; - enum mod_mem_type t = 0; + enum mod_mem_type t = MOD_MEM_NUM_TYPES; int ret = -ENOMEM; bool codetag_section_found = false; @@ -2776,7 +2776,7 @@ static int move_module(struct module *mod, struct load_info *info) return 0; out_err: module_memory_restore_rox(mod); - for (t--; t >= 0; t--) + while (t--) module_memory_free(mod, t); if (codetag_section_found) codetag_free_module_sections(mod); -- cgit v1.2.3 From eb0994a954978f0edd3efb38d0cbe6744df8b83d Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Wed, 18 Jun 2025 14:26:27 +0200 Subject: module: Avoid unnecessary return value initialization in move_module() All error conditions in move_module() set the return value by updating the ret variable. Therefore, it is not necessary to initialize the variable when declaring it. Remove the unnecessary initialization. Signed-off-by: Petr Pavlu Reviewed-by: Sami Tolvanen Reviewed-by: Daniel Gomez Link: https://lore.kernel.org/r/20250618122730.51324-3-petr.pavlu@suse.com Signed-off-by: Daniel Gomez Message-ID: <20250618122730.51324-3-petr.pavlu@suse.com> --- kernel/module/main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module/main.c b/kernel/module/main.c index 9ac994b2f354..7822b91fca6b 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -2696,9 +2696,8 @@ static int find_module_sections(struct module *mod, struct load_info *info) static int move_module(struct module *mod, struct load_info *info) { - int i; + int i, ret; enum mod_mem_type t = MOD_MEM_NUM_TYPES; - int ret = -ENOMEM; bool codetag_section_found = false; for_each_mod_mem_type(type) { -- cgit v1.2.3 From 570db4b39f535a8bb722adb8be0280d09e34ca99 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 10 Jun 2025 18:33:28 +0200 Subject: module: Make sure relocations are applied to the per-CPU section The per-CPU data section is handled differently than the other sections. The memory allocation requires a special __percpu pointer, and then the section is copied into the view of each CPU. Therefore the SHF_ALLOC flag is removed to ensure move_module() skips it. Later, relocations are applied and apply_relocations() skips sections without SHF_ALLOC because they have not been copied. This also skips the per-CPU data section. The missing relocations result in a NULL pointer on x86-64 and very small values on x86-32. This results in a crash because such a value is not skipped the way a genuine NULL pointer would be, yet it cannot be dereferenced. Such an assignment happens during static per-CPU lock initialisation with lockdep enabled. Allow relocation processing for the per-CPU section even if SHF_ALLOC is missing.
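For reference, a sketch of the kind of module code that trips over this (hypothetical module source; the exact initializer contents depend on the lockdep configuration):

	#include <linux/percpu.h>
	#include <linux/spinlock.h>

	/* Placed in the per-CPU data section; with lockdep enabled the
	 * static initializer embeds pointers (e.g. into the dep_map) that
	 * only exist as relocations against this section — exactly what
	 * was previously left unprocessed. */
	static DEFINE_PER_CPU(spinlock_t, my_lock) = __SPIN_LOCK_UNLOCKED(my_lock);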
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202506041623.e45e4f7d-lkp@intel.com Fixes: 1a6100caae425 ("Don't relocate non-allocated regions in modules.") #v2.6.1-rc3 Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Petr Pavlu Link: https://lore.kernel.org/r/20250610163328.URcsSUC1@linutronix.de Signed-off-by: Daniel Gomez Message-ID: <20250610163328.URcsSUC1@linutronix.de> --- kernel/module/main.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module/main.c b/kernel/module/main.c index 7822b91fca6b..c2c08007029d 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1573,8 +1573,14 @@ static int apply_relocations(struct module *mod, const struct load_info *info) if (infosec >= info->hdr->e_shnum) continue; - /* Don't bother with non-allocated sections */ - if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC)) + /* + * Don't bother with non-allocated sections. + * An exception is the percpu section, which has separate allocations + * for individual CPUs. We relocate the percpu section in the initial + * ELF template and subsequently copy it to the per-CPU destinations. + */ + if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC) && + (!infosec || infosec != info->index.pcpu)) continue; if (info->sechdrs[i].sh_flags & SHF_RELA_LIVEPATCH) -- cgit v1.2.3 From 88c79ecfb68fac057a0201db883518b662a0417d Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Wed, 4 Jun 2025 20:06:50 -0400 Subject: tracing: Replace opencoded cpumask_next_wrap() in move_to_next_cpu() The dedicated cpumask_next_wrap() is less verbose and more effective than cpumask_next() followed by cpumask_first(). Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250605000651.45281-1-yury.norov@gmail.com Signed-off-by: Yury Norov Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_hwlat.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index b65353ec2837..2f7b94e98317 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -325,12 +325,9 @@ static void move_to_next_cpu(void) cpus_read_lock(); cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask); - next_cpu = cpumask_next(raw_smp_processor_id(), current_mask); + next_cpu = cpumask_next_wrap(raw_smp_processor_id(), current_mask); cpus_read_unlock(); - if (next_cpu >= nr_cpu_ids) - next_cpu = cpumask_first(current_mask); - if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */ goto change_mode; -- cgit v1.2.3 From 3aceaa539cfe3a2e62bd92e6697d9fae1c20c0be Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 9 Jun 2025 13:17:32 -0400 Subject: tracing: Use queue_rcu_work() to free filters Freeing filters requires waiting for both an RCU grace period and an RCU task trace wait period after they have been detached from their lists. The task trace period can be quite long, so the freeing of the filters was moved to the call_rcu*() routines. The problem with that is that call_rcu*() callbacks run from softirq context and can cause latencies if a callback takes a bit of time. The filters are freed per event in a system, and the syscalls system contains an event per system call, which can be over 700 events. Freeing 700 filters in a bottom half is undesirable. Instead, move the freeing to queue_rcu_work(), which runs in task context.
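For reference, the general shape of the queue_rcu_work() deferral pattern the patch adopts, as a sketch with hypothetical names (my_obj/my_obj_free are illustrative, not from the patch): the object embeds a struct rcu_work, and the worker runs in process context only after a full grace period has elapsed.

#include <linux/workqueue.h>
#include <linux/slab.h>

struct my_obj {
	struct rcu_work rwork;
	/* ... payload ... */
};

static void my_obj_free(struct work_struct *work)
{
	struct my_obj *obj = container_of(to_rcu_work(work), struct my_obj, rwork);

	kfree(obj);	/* runs in task context, not softirq */
}

static void my_obj_defer_free(struct my_obj *obj)
{
	INIT_RCU_WORK(&obj->rwork, my_obj_free);
	queue_rcu_work(system_wq, &obj->rwork);	/* grace period, then worker */
}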
Link: https://lore.kernel.org/all/9a2f0cd0-1561-4206-8966-f93ccd25927f@paulmck-laptop/ Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250609131732.04fd303b@gandalf.local.home Fixes: a9d0aab5eb33 ("tracing: Fix regression of filter waiting a long time on RCU synchronization") Suggested-by: "Paul E. McKenney" Reviewed-by: Paul E. McKenney Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_filter.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 3885aadc434d..196c8bf34970 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1344,13 +1344,14 @@ struct filter_list { struct filter_head { struct list_head list; - struct rcu_head rcu; + union { + struct rcu_head rcu; + struct rcu_work rwork; + }; }; - -static void free_filter_list(struct rcu_head *rhp) +static void free_filter_list(struct filter_head *filter_list) { - struct filter_head *filter_list = container_of(rhp, struct filter_head, rcu); struct filter_list *filter_item, *tmp; list_for_each_entry_safe(filter_item, tmp, &filter_list->list, list) { @@ -1361,9 +1362,20 @@ static void free_filter_list(struct rcu_head *rhp) kfree(filter_list); } +static void free_filter_list_work(struct work_struct *work) +{ + struct filter_head *filter_list; + + filter_list = container_of(to_rcu_work(work), struct filter_head, rwork); + free_filter_list(filter_list); +} + static void free_filter_list_tasks(struct rcu_head *rhp) { - call_rcu(rhp, free_filter_list); + struct filter_head *filter_list = container_of(rhp, struct filter_head, rcu); + + INIT_RCU_WORK(&filter_list->rwork, free_filter_list_work); + queue_rcu_work(system_wq, &filter_list->rwork); } /* @@ -1460,7 +1472,7 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir, tracepoint_synchronize_unregister(); if (head) - free_filter_list(&head->rcu); + free_filter_list(head); list_for_each_entry(file, &tr->events, list) { if (file->system != dir || !file->filter) @@ -2305,7 +2317,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir, return 0; fail: /* No call succeeded */ - free_filter_list(&filter_list->rcu); + free_filter_list(filter_list); parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); return -EINVAL; fail_mem: @@ -2315,7 +2327,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir, if (!fail) delay_free_filter(filter_list); else - free_filter_list(&filter_list->rcu); + free_filter_list(filter_list); return -ENOMEM; } -- cgit v1.2.3 From adc353c0bfb243ebfd29b6222fa3bf149169a6de Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 20 Jun 2025 13:12:12 +0200 Subject: kernel: trace: preemptirq_delay_test: use offstack cpu mask MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A CPU mask on the stack is broken for large values of CONFIG_NR_CPUS: kernel/trace/preemptirq_delay_test.c: In function ‘preemptirq_delay_run’: kernel/trace/preemptirq_delay_test.c:143:1: error: the frame size of 8512 bytes is larger than 1536 bytes [-Werror=frame-larger-than=] Fall back to dynamic allocation here. 
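Condensed, the offstack idiom looks like this (hypothetical function name): with CONFIG_CPUMASK_OFFSTACK=y, cpumask_var_t is a pointer and alloc_cpumask_var() allocates the mask from the heap, otherwise it is a normal array and the allocation is a no-op that always succeeds, so the same code works for both configurations.

#include <linux/cpumask.h>
#include <linux/sched.h>

static int pin_to_cpu(int cpu)
{
	cpumask_var_t mask;
	int ret = 0;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(mask);
	cpumask_set_cpu(cpu, mask);
	if (set_cpus_allowed_ptr(current, mask))
		ret = -EINVAL;

	free_cpumask_var(mask);
	return ret;
}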
Cc: Masami Hiramatsu Cc: Song Chen Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250620111215.3365305-1-arnd@kernel.org Fixes: 4b9091e1c194 ("kernel: trace: preemptirq_delay_test: add cpu affinity") Signed-off-by: Arnd Bergmann Signed-off-by: Steven Rostedt (Google) --- kernel/trace/preemptirq_delay_test.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c index 314ffc143039..acb0c971a408 100644 --- a/kernel/trace/preemptirq_delay_test.c +++ b/kernel/trace/preemptirq_delay_test.c @@ -117,12 +117,15 @@ static int preemptirq_delay_run(void *data) { int i; int s = MIN(burst_size, NR_TEST_FUNCS); - struct cpumask cpu_mask; + cpumask_var_t cpu_mask; + + if (!alloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; if (cpu_affinity > -1) { - cpumask_clear(&cpu_mask); - cpumask_set_cpu(cpu_affinity, &cpu_mask); - if (set_cpus_allowed_ptr(current, &cpu_mask)) + cpumask_clear(cpu_mask); + cpumask_set_cpu(cpu_affinity, cpu_mask); + if (set_cpus_allowed_ptr(current, cpu_mask)) pr_err("cpu_affinity:%d, failed\n", cpu_affinity); } @@ -139,6 +142,8 @@ static int preemptirq_delay_run(void *data) __set_current_state(TASK_RUNNING); + free_cpumask_var(cpu_mask); + return 0; } -- cgit v1.2.3 From 6fedaf682a5e1866efdaddc70ff0ada329825d53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 1 Jul 2025 10:57:56 +0200 Subject: vdso/vsyscall: Introduce a helper to fill clock configurations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The logic to configure a 'struct vdso_clock' from a 'struct tk_read_base' is copied two times. Split it into a shared function to reduce the duplication, especially as another user will be added for auxiliary clocks. 
Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250701-vdso-auxclock-v1-2-df7d9f87b9b8@linutronix.de --- kernel/time/vsyscall.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 32ef27c71b57..d655df259733 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -15,26 +15,25 @@ #include "timekeeping_internal.h" +static inline void fill_clock_configuration(struct vdso_clock *vc, const struct tk_read_base *base) +{ + vc->cycle_last = base->cycle_last; +#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT + vc->max_cycles = base->clock->max_cycles; +#endif + vc->mask = base->mask; + vc->mult = base->mult; + vc->shift = base->shift; +} + static inline void update_vdso_time_data(struct vdso_time_data *vdata, struct timekeeper *tk) { struct vdso_clock *vc = vdata->clock_data; struct vdso_timestamp *vdso_ts; u64 nsec, sec; - vc[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; -#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT - vc[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles; -#endif - vc[CS_HRES_COARSE].mask = tk->tkr_mono.mask; - vc[CS_HRES_COARSE].mult = tk->tkr_mono.mult; - vc[CS_HRES_COARSE].shift = tk->tkr_mono.shift; - vc[CS_RAW].cycle_last = tk->tkr_raw.cycle_last; -#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT - vc[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles; -#endif - vc[CS_RAW].mask = tk->tkr_raw.mask; - vc[CS_RAW].mult = tk->tkr_raw.mult; - vc[CS_RAW].shift = tk->tkr_raw.shift; + fill_clock_configuration(&vc[CS_HRES_COARSE], &tk->tkr_mono); + fill_clock_configuration(&vc[CS_RAW], &tk->tkr_raw); /* CLOCK_MONOTONIC */ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; -- cgit v1.2.3 From 76164ca0d113e6a9f3033f948c739586fc606ed1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 1 Jul 2025 10:57:57 +0200 Subject: vdso/vsyscall: Split up __arch_update_vsyscall() into __arch_update_vdso_clock() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The upcoming auxiliary clocks need this hook, too. To separate the architecture hooks from the timekeeper internals, refactor the hook to only operate on a single vDSO clock. While at it, use a more robust #define for the hook override. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250701-vdso-auxclock-v1-3-df7d9f87b9b8@linutronix.de --- kernel/time/vsyscall.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index d655df259733..df6bada2d58e 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -118,7 +118,8 @@ void update_vsyscall(struct timekeeper *tk) if (clock_mode != VDSO_CLOCKMODE_NONE) update_vdso_time_data(vdata, tk); - __arch_update_vsyscall(vdata); + __arch_update_vdso_clock(&vc[CS_HRES_COARSE]); + __arch_update_vdso_clock(&vc[CS_RAW]); vdso_write_end(vdata); -- cgit v1.2.3 From 3da6bb419750f3ad834786d6ba7c9d5d062c770b Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 9 Jul 2025 20:27:52 +0900 Subject: perf/core: Fix WARN in perf_sigtrap() Since exit_task_work() runs after perf_event_exit_task_context() updated ctx->task to TASK_TOMBSTONE, perf_sigtrap() from perf_pending_task() might observe event->ctx->task == TASK_TOMBSTONE. Swap the early exit tests in order not to hit WARN_ON_ONCE(). 
Closes: https://syzkaller.appspot.com/bug?extid=2fe61cb2a86066be6985 Reported-by: syzbot Signed-off-by: Tetsuo Handa Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/b1c224bd-97f9-462c-a3e3-125d5e19c983@I-love.SAKURA.ne.jp --- kernel/events/core.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 0db36b2b2448..22fdf0c187cd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7204,18 +7204,18 @@ void perf_event_wakeup(struct perf_event *event) static void perf_sigtrap(struct perf_event *event) { /* - * We'd expect this to only occur if the irq_work is delayed and either - * ctx->task or current has changed in the meantime. This can be the - * case on architectures that do not implement arch_irq_work_raise(). + * Both perf_pending_task() and perf_pending_irq() can race with the + * task exiting. */ - if (WARN_ON_ONCE(event->ctx->task != current)) + if (current->flags & PF_EXITING) return; /* - * Both perf_pending_task() and perf_pending_irq() can race with the - * task exiting. + * We'd expect this to only occur if the irq_work is delayed and either + * ctx->task or current has changed in the meantime. This can be the + * case on architectures that do not implement arch_irq_work_raise(). */ - if (current->flags & PF_EXITING) + if (WARN_ON_ONCE(event->ctx->task != current)) return; send_sig_perf((void __user *)event->pending_addr, -- cgit v1.2.3 From 155213a2aed42c85361bf4f5c817f5cb68951c3b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 26 Jun 2025 07:39:10 -0700 Subject: sched/fair: Bump sd->max_newidle_lb_cost when newidle balance fails schbench (https://github.com/masoncl/schbench.git) is showing a regression from previous production kernels that bisected down to: sched/fair: Remove sysctl_sched_migration_cost condition (c5b0a7eefc) The schbench command line was: schbench -L -m 4 -M auto -t 256 -n 0 -r 0 -s 0 This creates 4 message threads pinned to CPUs 0-3, and 256x4 worker threads spread across the rest of the CPUs. Neither the worker threads nor the message threads do any work, they just wake each other up and go back to sleep as soon as possible. The end result is the first 4 CPUs are pegged waking up those 1024 workers, and the rest of the CPUs are constantly banging in and out of idle. If I take a v6.9 Linus kernel and revert that one commit, performance goes from 3.4M RPS to 5.4M RPS. schedstat shows there are ~100x more new idle balance operations, and profiling shows the worker threads are spending ~20% of their CPU time on new idle balance. schedstats also shows that almost all of these new idle balance attempts are failing to find busy groups. The fix used here is to crank up the cost of the newidle balance whenever it fails. Since we don't want sd->max_newidle_lb_cost to grow out of control, this also changes update_newidle_cost() to use sysctl_sched_migration_cost as the upper limit on max_newidle_lb_cost.
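With illustrative numbers (assuming the usual sysctl_sched_migration_cost default of 500000 ns): if sd->max_newidle_lb_cost is 400000 ns, a failed newidle balance now reports a cost of 3 * 400000 / 2 = 600000 ns, and update_newidle_cost() clamps the stored maximum to 500000 + 200 = 500200 ns. Repeated failures therefore drive the domain to the cap quickly, backing off newidle balancing, without max_newidle_lb_cost ever growing unbounded.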
Signed-off-by: Chris Mason Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vincent Guittot Link: https://lkml.kernel.org/r/20250626144017.1510594-2-clm@fb.com --- kernel/sched/fair.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7e2963efe800..ab0822cc51c2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12064,8 +12064,14 @@ static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) /* * Track max cost of a domain to make sure to not delay the * next wakeup on the CPU. + * + * sched_balance_newidle() bumps the cost whenever newidle + * balance fails, and we don't want things to grow out of + * control. Use the sysctl_sched_migration_cost as the upper + * limit, plus a little extra to avoid off by ones. */ - sd->max_newidle_lb_cost = cost; + sd->max_newidle_lb_cost = + min(cost, sysctl_sched_migration_cost + 200); sd->last_decay_max_lb_cost = jiffies; } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) { /* @@ -12757,10 +12763,17 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) t1 = sched_clock_cpu(this_cpu); domain_cost = t1 - t0; - update_newidle_cost(sd, domain_cost); - curr_cost += domain_cost; t0 = t1; + + /* + * Failing newidle means it is not effective; + * bump the cost so we end up doing less of it. + */ + if (!pulled_task) + domain_cost = (3 * sd->max_newidle_lb_cost) / 2; + + update_newidle_cost(sd, domain_cost); } /* -- cgit v1.2.3 From 570c8efd5eb79c3725ba439ce105ed1bedc5acd9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 23 May 2025 17:28:00 +0200 Subject: sched/psi: Optimize psi_group_change() cpu_clock() usage Dietmar reported that commit 3840cbe24cf0 ("sched: psi: fix bogus pressure spikes from aggregation race") caused a regression for him on a high context switch rate benchmark (schbench) due to the now repeating cpu_clock() calls. In particular the problem is that get_recent_times() will extrapolate the current state to 'now'. But if an update uses a timestamp from before the start of the update, it is possible to get two reads with inconsistent results. It is effectively back-dating an update. (note that this all hard-relies on the clock being synchronized across CPUs -- if this is not the case, all bets are off). Combine this problem with the fact that there are per-group-per-cpu seqcounts: the commit in question pushed the clock read into the group iteration, causing tree-depth cpu_clock() calls. On architectures where cpu_clock() has appreciable overhead, this hurts. Instead move to a per-cpu seqcount, which allows us to have a single clock read for all group updates, increasing internal consistency and lowering update overhead. This comes at the cost of a longer update side (proportional to the tree depth) which can cause the read side to retry more often.
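Reduced to its bones, the choreography the patch below centralizes in its psi_write_*()/psi_read_*() helpers looks roughly like this (a sketch shadowing the patch; the real writer additionally runs under the runqueue lock):

#include <linux/seqlock.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(seqcount_t, demo_seq);
static DEFINE_PER_CPU(u64, demo_state);

/* writer: one section and one clock read cover the whole group tree */
static void demo_write(int cpu, u64 delta)
{
	write_seqcount_begin(per_cpu_ptr(&demo_seq, cpu));
	*per_cpu_ptr(&demo_state, cpu) += delta;
	write_seqcount_end(per_cpu_ptr(&demo_seq, cpu));
}

/* reader: retry the snapshot if a writer overlapped it */
static u64 demo_read(int cpu)
{
	unsigned int seq;
	u64 snap;

	do {
		seq = read_seqcount_begin(per_cpu_ptr(&demo_seq, cpu));
		snap = *per_cpu_ptr(&demo_state, cpu);
	} while (read_seqcount_retry(per_cpu_ptr(&demo_seq, cpu), seq));

	return snap;
}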
Fixes: 3840cbe24cf0 ("sched: psi: fix bogus pressure spikes from aggregation race") Reported-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Tested-by: Dietmar Eggemann , Link: https://lkml.kernel.org/20250522084844.GC31726@noisy.programming.kicks-ass.net --- kernel/sched/psi.c | 121 +++++++++++++++++++++++++++++------------------------ 1 file changed, 66 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 5837cd8e7b97..2024c1d36402 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -176,6 +176,28 @@ struct psi_group psi_system = { .pcpu = &system_group_pcpu, }; +static DEFINE_PER_CPU(seqcount_t, psi_seq); + +static inline void psi_write_begin(int cpu) +{ + write_seqcount_begin(per_cpu_ptr(&psi_seq, cpu)); +} + +static inline void psi_write_end(int cpu) +{ + write_seqcount_end(per_cpu_ptr(&psi_seq, cpu)); +} + +static inline u32 psi_read_begin(int cpu) +{ + return read_seqcount_begin(per_cpu_ptr(&psi_seq, cpu)); +} + +static inline bool psi_read_retry(int cpu, u32 seq) +{ + return read_seqcount_retry(per_cpu_ptr(&psi_seq, cpu), seq); +} + static void psi_avgs_work(struct work_struct *work); static void poll_timer_fn(struct timer_list *t); @@ -186,7 +208,7 @@ static void group_init(struct psi_group *group) group->enabled = true; for_each_possible_cpu(cpu) - seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); + seqcount_init(per_cpu_ptr(&psi_seq, cpu)); group->avg_last_update = sched_clock(); group->avg_next_update = group->avg_last_update + psi_period; mutex_init(&group->avgs_lock); @@ -266,14 +288,14 @@ static void get_recent_times(struct psi_group *group, int cpu, /* Snapshot a coherent view of the CPU state */ do { - seq = read_seqcount_begin(&groupc->seq); + seq = psi_read_begin(cpu); now = cpu_clock(cpu); memcpy(times, groupc->times, sizeof(groupc->times)); state_mask = groupc->state_mask; state_start = groupc->state_start; if (cpu == current_cpu) memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); - } while (read_seqcount_retry(&groupc->seq, seq)); + } while (psi_read_retry(cpu, seq)); /* Calculate state time deltas against the previous snapshot */ for (s = 0; s < NR_PSI_STATES; s++) { @@ -772,30 +794,20 @@ static void record_times(struct psi_group_cpu *groupc, u64 now) groupc->times[PSI_NONIDLE] += delta; } +#define for_each_group(iter, group) \ + for (typeof(group) iter = group; iter; iter = iter->parent) + static void psi_group_change(struct psi_group *group, int cpu, unsigned int clear, unsigned int set, - bool wake_clock) + u64 now, bool wake_clock) { struct psi_group_cpu *groupc; unsigned int t, m; u32 state_mask; - u64 now; lockdep_assert_rq_held(cpu_rq(cpu)); groupc = per_cpu_ptr(group->pcpu, cpu); - /* - * First we update the task counts according to the state - * change requested through the @clear and @set bits. - * - * Then if the cgroup PSI stats accounting enabled, we - * assess the aggregate resource states this CPU's tasks - * have been in since the last change, and account any - * SOME and FULL time these may have resulted in. 
- */ - write_seqcount_begin(&groupc->seq); - now = cpu_clock(cpu); - /* * Start with TSK_ONCPU, which doesn't have a corresponding * task count - it's just a boolean flag directly encoded in @@ -847,7 +859,6 @@ static void psi_group_change(struct psi_group *group, int cpu, groupc->state_mask = state_mask; - write_seqcount_end(&groupc->seq); return; } @@ -868,8 +879,6 @@ static void psi_group_change(struct psi_group *group, int cpu, groupc->state_mask = state_mask; - write_seqcount_end(&groupc->seq); - if (state_mask & group->rtpoll_states) psi_schedule_rtpoll_work(group, 1, false); @@ -904,24 +913,29 @@ static void psi_flags_change(struct task_struct *task, int clear, int set) void psi_task_change(struct task_struct *task, int clear, int set) { int cpu = task_cpu(task); - struct psi_group *group; + u64 now; if (!task->pid) return; psi_flags_change(task, clear, set); - group = task_psi_group(task); - do { - psi_group_change(group, cpu, clear, set, true); - } while ((group = group->parent)); + psi_write_begin(cpu); + now = cpu_clock(cpu); + for_each_group(group, task_psi_group(task)) + psi_group_change(group, cpu, clear, set, now, true); + psi_write_end(cpu); } void psi_task_switch(struct task_struct *prev, struct task_struct *next, bool sleep) { - struct psi_group *group, *common = NULL; + struct psi_group *common = NULL; int cpu = task_cpu(prev); + u64 now; + + psi_write_begin(cpu); + now = cpu_clock(cpu); if (next->pid) { psi_flags_change(next, 0, TSK_ONCPU); @@ -930,16 +944,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, * ancestors with @prev, those will already have @prev's * TSK_ONCPU bit set, and we can stop the iteration there. */ - group = task_psi_group(next); - do { - if (per_cpu_ptr(group->pcpu, cpu)->state_mask & - PSI_ONCPU) { + for_each_group(group, task_psi_group(next)) { + struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + + if (groupc->state_mask & PSI_ONCPU) { common = group; break; } - - psi_group_change(group, cpu, 0, TSK_ONCPU, true); - } while ((group = group->parent)); + psi_group_change(group, cpu, 0, TSK_ONCPU, now, true); + } } if (prev->pid) { @@ -972,12 +985,11 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, psi_flags_change(prev, clear, set); - group = task_psi_group(prev); - do { + for_each_group(group, task_psi_group(prev)) { if (group == common) break; - psi_group_change(group, cpu, clear, set, wake_clock); - } while ((group = group->parent)); + psi_group_change(group, cpu, clear, set, now, wake_clock); + } /* * TSK_ONCPU is handled up to the common ancestor. 
If there are @@ -987,20 +999,21 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, */ if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) { clear &= ~TSK_ONCPU; - for (; group; group = group->parent) - psi_group_change(group, cpu, clear, set, wake_clock); + for_each_group(group, common) + psi_group_change(group, cpu, clear, set, now, wake_clock); } } + psi_write_end(cpu); } #ifdef CONFIG_IRQ_TIME_ACCOUNTING void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev) { int cpu = task_cpu(curr); - struct psi_group *group; struct psi_group_cpu *groupc; s64 delta; u64 irq; + u64 now; if (static_branch_likely(&psi_disabled) || !irqtime_enabled()) return; @@ -1009,8 +1022,7 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st return; lockdep_assert_rq_held(rq); - group = task_psi_group(curr); - if (prev && task_psi_group(prev) == group) + if (prev && task_psi_group(prev) == task_psi_group(curr)) return; irq = irq_time_read(cpu); @@ -1019,25 +1031,22 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st return; rq->psi_irq_time = irq; - do { - u64 now; + psi_write_begin(cpu); + now = cpu_clock(cpu); + for_each_group(group, task_psi_group(curr)) { if (!group->enabled) continue; groupc = per_cpu_ptr(group->pcpu, cpu); - write_seqcount_begin(&groupc->seq); - now = cpu_clock(cpu); - record_times(groupc, now); groupc->times[PSI_IRQ_FULL] += delta; - write_seqcount_end(&groupc->seq); - if (group->rtpoll_states & (1 << PSI_IRQ_FULL)) psi_schedule_rtpoll_work(group, 1, false); - } while ((group = group->parent)); + } + psi_write_end(cpu); } #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ @@ -1225,12 +1234,14 @@ void psi_cgroup_restart(struct psi_group *group) return; for_each_possible_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); - struct rq_flags rf; + u64 now; - rq_lock_irq(rq, &rf); - psi_group_change(group, cpu, 0, 0, true); - rq_unlock_irq(rq, &rf); + guard(rq_lock_irq)(cpu_rq(cpu)); + + psi_write_begin(cpu); + now = cpu_clock(cpu); + psi_group_change(group, cpu, 0, 0, now, true); + psi_write_end(cpu); } } #endif /* CONFIG_CGROUPS */ -- cgit v1.2.3 From cccb45d7c4295bbfeba616582d0249f2d21e6df5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 20 May 2025 11:19:30 +0200 Subject: sched/deadline: Less agressive dl_server handling Chris reported that commit 5f6bd380c7bd ("sched/rt: Remove default bandwidth control") caused a significant dip in his favourite benchmark of the day. Simply disabling dl_server cured things. His workload hammers the 0->1, 1->0 transitions, and the dl_server_{start,stop}() overhead kills it -- fairly obviously a bad idea in hindsight and all that. Change things around to only disable the dl_server when there has not been a fair task around for a whole period. Since the default period is 1 second, this ensures the benchmark never trips this, overhead gone.
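Concretely, per the diff below: dl_server_update() clears dl_se->dl_server_idle every time a fair task consumes runtime, and the periodic dl_server timer only stops the server once it finds the flag still set, i.e. after a full period with no fair activity. A workload like the schbench case above, flipping between 0 and 1 runnable fair tasks thousands of times per second, never keeps the flag set across a whole 1 second period, so dl_server_stop() is never reached on that path and the start/stop overhead disappears.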
Fixes: 557a6bfc662c ("sched/fair: Add trivial fair server") Reported-by: Chris Mason Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juri Lelli Acked-by: Juri Lelli Link: https://lkml.kernel.org/r/20250702121158.465086194@infradead.org --- kernel/sched/deadline.c | 25 ++++++++++++++++++++++--- kernel/sched/fair.c | 9 --------- 2 files changed, 22 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0f30697ad795..23668fc60bd3 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1150,6 +1150,8 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf) /* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */ static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC; +static bool dl_server_stopped(struct sched_dl_entity *dl_se); + static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se) { struct rq *rq = rq_of_dl_se(dl_se); @@ -1169,6 +1171,7 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_ if (!dl_se->server_has_tasks(dl_se)) { replenish_dl_entity(dl_se); + dl_server_stopped(dl_se); return HRTIMER_NORESTART; } @@ -1572,8 +1575,10 @@ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) { /* 0 runtime = fair server disabled */ - if (dl_se->dl_runtime) + if (dl_se->dl_runtime) { + dl_se->dl_server_idle = 0; update_curr_dl_se(dl_se->rq, dl_se, delta_exec); + } } void dl_server_start(struct sched_dl_entity *dl_se) @@ -1596,7 +1601,7 @@ void dl_server_start(struct sched_dl_entity *dl_se) setup_new_dl_entity(dl_se); } - if (!dl_se->dl_runtime) + if (!dl_se->dl_runtime || dl_se->dl_server_active) return; dl_se->dl_server_active = 1; @@ -1617,6 +1622,20 @@ void dl_server_stop(struct sched_dl_entity *dl_se) dl_se->dl_server_active = 0; } +static bool dl_server_stopped(struct sched_dl_entity *dl_se) +{ + if (!dl_se->dl_server_active) + return false; + + if (dl_se->dl_server_idle) { + dl_server_stop(dl_se); + return true; + } + + dl_se->dl_server_idle = 1; + return false; +} + void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, dl_server_has_tasks_f has_tasks, dl_server_pick_f pick_task) @@ -2354,7 +2373,7 @@ again: if (dl_server(dl_se)) { p = dl_se->server_pick_task(dl_se); if (!p) { - if (dl_server_active(dl_se)) { + if (!dl_server_stopped(dl_se)) { dl_se->dl_yielded = 1; update_curr_dl_se(rq, dl_se, 0); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ab0822cc51c2..a1350c513a87 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5802,7 +5802,6 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long queued_delta, runnable_delta, idle_delta, dequeue = 1; - long rq_h_nr_queued = rq->cfs.h_nr_queued; raw_spin_lock(&cfs_b->lock); /* This will start the period timer if necessary */ @@ -5886,10 +5885,6 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) /* At this point se is NULL and we are at root level*/ sub_nr_running(rq, queued_delta); - - /* Stop the fair server if throttling resulted in no runnable tasks */ - if (rq_h_nr_queued && !rq->cfs.h_nr_queued) - dl_server_stop(&rq->fair_server); done: /* * Note: distribution will already see us throttled via the @@ -6966,7 +6961,6 @@ static void set_next_buddy(struct sched_entity *se); static int dequeue_entities(struct rq *rq, struct sched_entity *se, 
int flags) { bool was_sched_idle = sched_idle_rq(rq); - int rq_h_nr_queued = rq->cfs.h_nr_queued; bool task_sleep = flags & DEQUEUE_SLEEP; bool task_delayed = flags & DEQUEUE_DELAYED; struct task_struct *p = NULL; @@ -7050,9 +7044,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) sub_nr_running(rq, h_nr_queued); - if (rq_h_nr_queued && !rq->cfs.h_nr_queued) - dl_server_stop(&rq->fair_server); - /* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) rq->next_balance = jiffies; -- cgit v1.2.3 From 9cdb4fe20cd239c848b5c3f5753d83a9443ba329 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Jul 2025 18:56:25 +0200 Subject: sched/fair: Use protect_slice() instead of direct comparison Replace the test by the relevant protect_slice() function. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dhaval Giani (AMD) Link: https://lkml.kernel.org/r/20250708165630.1948751-2-vincent.guittot@linaro.org --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a1350c513a87..43fe5c831dd5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1161,7 +1161,7 @@ static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity if (!sched_feat(PREEMPT_SHORT)) return false; - if (curr->vlag == curr->deadline) + if (protect_slice(curr)) return false; return !entity_eligible(cfs_rq, curr); -- cgit v1.2.3 From 74eec63661d46a7153d04c2e0249eeb76cc76d44 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Jul 2025 18:56:26 +0200 Subject: sched/fair: Fix NO_RUN_TO_PARITY case EEVDF expects the scheduler to allocate a time quantum to the selected entity and then pick a new entity for next quantum. Although this notion of time quantum is not strictly doable in our case, we can ensure a minimum runtime for each task most of the time and pick a new entity after a minimum time has elapsed. Reuse the slice protection of run to parity to ensure such runtime quantum. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250708165630.1948751-3-vincent.guittot@linaro.org --- kernel/sched/fair.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 43fe5c831dd5..8d288df79a6b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -882,23 +882,35 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) } /* - * HACK, stash a copy of deadline at the point of pick in vlag, - * which isn't used until dequeue. + * Set the vruntime up to which an entity can run before looking + * for another entity to pick. + * In case of run to parity, we protect the entity up to its deadline. + * When run to parity is disabled, we give a minimum quantum to the running + * entity to ensure progress. 
*/ static inline void set_protect_slice(struct sched_entity *se) { - se->vlag = se->deadline; + u64 slice = se->slice; + u64 vprot = se->deadline; + + if (!sched_feat(RUN_TO_PARITY)) + slice = min(slice, normalized_sysctl_sched_base_slice); + + if (slice != se->slice) + vprot = min_vruntime(vprot, se->vruntime + calc_delta_fair(slice, se)); + + se->vprot = vprot; } static inline bool protect_slice(struct sched_entity *se) { - return se->vlag == se->deadline; + return ((s64)(se->vprot - se->vruntime) > 0); } static inline void cancel_protect_slice(struct sched_entity *se) { if (protect_slice(se)) - se->vlag = se->deadline + 1; + se->vprot = se->vruntime; } /* @@ -937,7 +949,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) curr = NULL; - if (sched_feat(RUN_TO_PARITY) && curr && protect_slice(curr)) + if (curr && protect_slice(curr)) return curr; /* Pick the leftmost entity if it's eligible */ @@ -1156,11 +1168,8 @@ static inline void update_curr_task(struct task_struct *p, s64 delta_exec) cgroup_account_cputime(p, delta_exec); } -static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr) +static inline bool resched_next_slice(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - if (!sched_feat(PREEMPT_SHORT)) - return false; - if (protect_slice(curr)) return false; @@ -1248,7 +1257,7 @@ static void update_curr(struct cfs_rq *cfs_rq) if (cfs_rq->nr_queued == 1) return; - if (resched || did_preempt_short(cfs_rq, curr)) { + if (resched || resched_next_slice(cfs_rq, curr)) { resched_curr_lazy(rq); clear_buddies(cfs_rq, curr); } -- cgit v1.2.3 From 9de74a9850b9468ac2f515bfbe0844e0bfae869d Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Jul 2025 18:56:27 +0200 Subject: sched/fair: Remove spurious shorter slice preemption Even if the waking task can preempt current, it might not be the one selected by pick_task_fair. Check that the waking task will be selected if we cancel the slice protection before doing so. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250708165630.1948751-4-vincent.guittot@linaro.org --- kernel/sched/fair.c | 44 ++++++++++++++------------------------------ 1 file changed, 14 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8d288df79a6b..96718b3b4bd1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -932,7 +932,7 @@ static inline void cancel_protect_slice(struct sched_entity *se) * * Which allows tree pruning through eligibility. 
*/ -static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) +static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect) { struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; struct sched_entity *se = __pick_first_entity(cfs_rq); @@ -949,7 +949,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) curr = NULL; - if (curr && protect_slice(curr)) + if (curr && protect && protect_slice(curr)) return curr; /* Pick the leftmost entity if it's eligible */ @@ -993,6 +993,11 @@ found: return best; } +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) +{ + return __pick_eevdf(cfs_rq, true); +} + struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root); @@ -1176,27 +1181,6 @@ static inline bool resched_next_slice(struct cfs_rq *cfs_rq, struct sched_entity return !entity_eligible(cfs_rq, curr); } -static inline bool do_preempt_short(struct cfs_rq *cfs_rq, - struct sched_entity *pse, struct sched_entity *se) -{ - if (!sched_feat(PREEMPT_SHORT)) - return false; - - if (pse->slice >= se->slice) - return false; - - if (!entity_eligible(cfs_rq, pse)) - return false; - - if (entity_before(pse, se)) - return true; - - if (!entity_eligible(cfs_rq, se)) - return true; - - return false; -} - /* * Used by other classes to account runtime. */ @@ -8658,6 +8642,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int struct sched_entity *se = &donor->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(donor); int cse_is_idle, pse_is_idle; + bool do_preempt_short = false; if (unlikely(se == pse)) return; @@ -8706,7 +8691,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * When non-idle entity preempt an idle entity, * don't give idle entity slice protection. */ - cancel_protect_slice(se); + do_preempt_short = true; goto preempt; } @@ -8724,22 +8709,21 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int /* * If @p has a shorter slice than current and @p is eligible, override * current's slice protection in order to allow preemption. - * - * Note that even if @p does not turn out to be the most eligible - * task at this moment, current's slice protection will be lost. */ - if (do_preempt_short(cfs_rq, pse, se)) - cancel_protect_slice(se); + do_preempt_short = sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice); /* * If @p has become the most eligible task, force preemption. */ - if (pick_eevdf(cfs_rq) == pse) + if (__pick_eevdf(cfs_rq, !do_preempt_short) == pse) goto preempt; return; preempt: + if (do_preempt_short) + cancel_protect_slice(se); + resched_curr_lazy(rq); } -- cgit v1.2.3 From 052c3d87c82ea4ee83232b747512847b4e8c9976 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Jul 2025 18:56:28 +0200 Subject: sched/fair: Limit run to parity to the min slice of enqueued entities Run to parity ensures that current will get a chance to run its full slice in one go but this can create large latency and/or lag for entities with shorter slice that have exhausted their previous slice and wait to run their next slice. Clamp the run to parity to the shortest slice of all enqueued entities. 
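With illustrative numbers: if current has a 3 ms slice while another queued entity has a 0.7 ms slice, set_protect_slice() in the patch below derives the protected window from min(3 ms, 0.7 ms), so current is protected for at most roughly 0.7 ms of weighted runtime rather than its full 3 ms, bounding the latency and lag seen by the short-slice entity.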
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250708165630.1948751-5-vincent.guittot@linaro.org --- kernel/sched/fair.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 96718b3b4bd1..45e057fc2354 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -884,18 +884,20 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) /* * Set the vruntime up to which an entity can run before looking * for another entity to pick. - * In case of run to parity, we protect the entity up to its deadline. + * In case of run to parity, we use the shortest slice of the enqueued + * entities to set the protected period. * When run to parity is disabled, we give a minimum quantum to the running * entity to ensure progress. */ static inline void set_protect_slice(struct sched_entity *se) { - u64 slice = se->slice; + u64 slice = normalized_sysctl_sched_base_slice; u64 vprot = se->deadline; - if (!sched_feat(RUN_TO_PARITY)) - slice = min(slice, normalized_sysctl_sched_base_slice); + if (sched_feat(RUN_TO_PARITY)) + slice = cfs_rq_min_slice(cfs_rq_of(se)); + slice = min(slice, se->slice); if (slice != se->slice) vprot = min_vruntime(vprot, se->vruntime + calc_delta_fair(slice, se)); -- cgit v1.2.3 From 3a0baa8e6c570c252999cb651398a88f8f990b4a Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Jul 2025 18:56:29 +0200 Subject: sched/fair: Fix entity's lag with run to parity When an entity is enqueued without preempting current, we must ensure that the slice protection is updated to take into account the slice duration of the newly enqueued task so that its lag will not exceed its slice (+ tick). Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250708165630.1948751-6-vincent.guittot@linaro.org --- kernel/sched/fair.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 45e057fc2354..1660960d64af 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -889,13 +889,13 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) * When run to parity is disabled, we give a minimum quantum to the running * entity to ensure progress. 
*/ -static inline void set_protect_slice(struct sched_entity *se) +static inline void set_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { u64 slice = normalized_sysctl_sched_base_slice; u64 vprot = se->deadline; if (sched_feat(RUN_TO_PARITY)) - slice = cfs_rq_min_slice(cfs_rq_of(se)); + slice = cfs_rq_min_slice(cfs_rq); slice = min(slice, se->slice); if (slice != se->slice) @@ -904,6 +904,13 @@ static inline void set_protect_slice(struct sched_entity *se) se->vprot = vprot; } +static inline void update_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + u64 slice = cfs_rq_min_slice(cfs_rq); + + se->vprot = min_vruntime(se->vprot, se->vruntime + calc_delta_fair(slice, se)); +} + static inline bool protect_slice(struct sched_entity *se) { return ((s64)(se->vprot - se->vruntime) > 0); @@ -5467,7 +5474,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) __dequeue_entity(cfs_rq, se); update_load_avg(cfs_rq, se, UPDATE_TG); - set_protect_slice(se); + set_protect_slice(cfs_rq, se); } update_stats_curr_start(cfs_rq, se); @@ -8720,6 +8727,9 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int if (__pick_eevdf(cfs_rq, !do_preempt_short) == pse) goto preempt; + if (sched_feat(RUN_TO_PARITY) && do_preempt_short) + update_protect_slice(cfs_rq, se); + return; preempt: -- cgit v1.2.3 From 0b9ca2dcabc3c8816a6ee75599cab7bef3330609 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Jul 2025 18:56:30 +0200 Subject: sched/fair: Always trigger resched at the end of a protected period Always trigger a resched after a protected period even if the entity is still eligible. It can happen that an entity remains eligible at the end of the protected period but must let an entity with a shorter slice run in order to keep its lag shorter than slice. This is particularly true with run to parity which tries to maximize the lag. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250708165630.1948751-7-vincent.guittot@linaro.org --- kernel/sched/fair.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1660960d64af..20a845697c1d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1182,14 +1182,6 @@ static inline void update_curr_task(struct task_struct *p, s64 delta_exec) cgroup_account_cputime(p, delta_exec); } -static inline bool resched_next_slice(struct cfs_rq *cfs_rq, struct sched_entity *curr) -{ - if (protect_slice(curr)) - return false; - - return !entity_eligible(cfs_rq, curr); -} - /* * Used by other classes to account runtime. */ @@ -1250,7 +1242,7 @@ static void update_curr(struct cfs_rq *cfs_rq) if (cfs_rq->nr_queued == 1) return; - if (resched || resched_next_slice(cfs_rq, curr)) { + if (resched || !protect_slice(curr)) { resched_curr_lazy(rq); clear_buddies(cfs_rq, curr); } -- cgit v1.2.3 From 2d088762631b212eb0809e112642843844ef64eb Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 9 Jul 2025 21:21:12 +0200 Subject: rv: Add #undef TRACE_INCLUDE_FILE Without "#undef TRACE_INCLUDE_FILE", there could be a build error due to TRACE_INCLUDE_FILE being redefined. Therefore add it. Also fix a typo while at it.
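For context, the conventional tail of a tracepoint header reads roughly as below (a generic sketch; the "example" names are placeholders). Each trace header is processed twice, once normally and once via define_trace.h, so everything after the include guard must be re-#define-able; a missing #undef lets a previously included trace header's TRACE_INCLUDE_FILE value leak through and trigger a redefinition error.

#endif /* _TRACE_EXAMPLE_H */

/* This part must be outside protection */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE example_trace
#include <trace/define_trace.h>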
Cc: John Ogness Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/f805e074581e927bb176c742c981fa7675b6ebe5.1752088709.git.namcao@linutronix.de Reviewed-by: Gabriele Monaco Signed-off-by: Nam Cao Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/rv_trace.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h index 422b75f58891..99c3801616d4 100644 --- a/kernel/trace/rv/rv_trace.h +++ b/kernel/trace/rv/rv_trace.h @@ -129,8 +129,9 @@ DECLARE_EVENT_CLASS(error_da_monitor_id, #endif /* CONFIG_DA_MON_EVENTS_ID */ #endif /* _TRACE_RV_H */ -/* This part ust be outside protection */ +/* This part must be outside protection */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE rv_trace #include -- cgit v1.2.3 From 0af3ecdde58676f6c42eeec07d6816d5bf87ff88 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 9 Jul 2025 21:21:13 +0200 Subject: printk: Make vprintk_deferred() public vprintk_deferred() is useful for implementing runtime verification reactors. Make it public. Signed-off-by: Nam Cao Reviewed-by: Petr Mladek Signed-off-by: Steven Rostedt (Google) --- kernel/printk/internal.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 48a24e7b309d..bbed41ad29cf 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -72,7 +72,6 @@ int vprintk_store(int facility, int level, const char *fmt, va_list args); __printf(1, 0) int vprintk_default(const char *fmt, va_list args); -__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args); void __printk_safe_enter(void); void __printk_safe_exit(void); -- cgit v1.2.3 From 3f045de7f557850ca6b3632c6d45c2cdaf948694 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 9 Jul 2025 21:21:14 +0200 Subject: panic: Add vpanic() vpanic() is useful for implementing runtime verification reactors. Add it. Signed-off-by: Nam Cao Reviewed-by: Petr Mladek Signed-off-by: Steven Rostedt (Google) --- kernel/panic.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index b0b9a8bf4560..6a1823c383d0 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -309,13 +309,13 @@ static void panic_other_cpus_shutdown(bool crash_kexec) /** * panic - halt the system * @fmt: The text string to print + * @args: Arguments for the format string * * Display a message, then perform cleanups. This function never returns. */ -void panic(const char *fmt, ...) +void vpanic(const char *fmt, va_list args) { static char buf[1024]; - va_list args; long i, i_next = 0, len; int state = 0; int old_cpu, this_cpu; @@ -366,9 +366,7 @@ void panic(const char *fmt, ...) console_verbose(); bust_spinlocks(1); - va_start(args, fmt); len = vscnprintf(buf, sizeof(buf), fmt, args); - va_end(args); if (len && buf[len - 1] == '\n') buf[len - 1] = '\0'; @@ -505,7 +503,17 @@ void panic(const char *fmt, ...) mdelay(PANIC_TIMER_STEP); } } +EXPORT_SYMBOL(vpanic); +/* Identical to vpanic(), except it takes variadic arguments instead of va_list */ +void panic(const char *fmt, ...) 
+{ + va_list args; + + va_start(args, fmt); + vpanic(fmt, args); + va_end(args); +} EXPORT_SYMBOL(panic); #define TAINT_FLAG(taint, _c_true, _c_false, _module) \ -- cgit v1.2.3 From ff4e233d8ab70fe6ae460ecc8c0e5b24dd0fedb0 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 9 Jul 2025 21:21:15 +0200 Subject: rv: Let the reactors take care of buffers Each RV monitor has one static buffer to send to the reactors. If multiple errors are detected simultaneously, the one buffer could be overwritten. Instead, leave it to the reactors to handle buffering. Reviewed-by: Gabriele Monaco Signed-off-by: Nam Cao Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/reactor_panic.c | 8 ++++++-- kernel/trace/rv/reactor_printk.c | 8 ++++++-- kernel/trace/rv/rv_reactors.c | 2 +- 3 files changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/rv/reactor_panic.c b/kernel/trace/rv/reactor_panic.c index 0186ff4cbd0b..74c6bcc2c749 100644 --- a/kernel/trace/rv/reactor_panic.c +++ b/kernel/trace/rv/reactor_panic.c @@ -13,9 +13,13 @@ #include #include -static void rv_panic_reaction(char *msg) +__printf(1, 2) static void rv_panic_reaction(const char *msg, ...) { - panic(msg); + va_list args; + + va_start(args, msg); + vpanic(msg, args); + va_end(args); } static struct rv_reactor rv_panic = { diff --git a/kernel/trace/rv/reactor_printk.c b/kernel/trace/rv/reactor_printk.c index 178759dbf89f..2dae2916c05f 100644 --- a/kernel/trace/rv/reactor_printk.c +++ b/kernel/trace/rv/reactor_printk.c @@ -12,9 +12,13 @@ #include #include -static void rv_printk_reaction(char *msg) +__printf(1, 2) static void rv_printk_reaction(const char *msg, ...) { - printk_deferred(msg); + va_list args; + + va_start(args, msg); + vprintk_deferred(msg, args); + va_end(args); } static struct rv_reactor rv_printk = { diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c index 9501ca886d83..740603670dd1 100644 --- a/kernel/trace/rv/rv_reactors.c +++ b/kernel/trace/rv/rv_reactors.c @@ -490,7 +490,7 @@ void reactor_cleanup_monitor(struct rv_monitor_def *mdef) /* * Nop reactor register */ -static void rv_nop_reaction(char *msg) +__printf(1, 2) static void rv_nop_reaction(const char *msg, ...) { } -- cgit v1.2.3 From c94d27c01b1ff2e26ca347524b3527534e285a39 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 9 Jul 2025 21:21:16 +0200 Subject: rv: rename CONFIG_DA_MON_EVENTS to CONFIG_RV_MON_EVENTS CONFIG_DA_MON_EVENTS is not specific to deterministic automaton. It could be used for other monitor types. Therefore rename it to CONFIG_RV_MON_EVENTS. This prepares for the introduction of linear temporal logic monitor. 
Cc: John Ogness Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/507210517123d887c1d208aa2fd45ec69765d3f0.1752088709.git.namcao@linutronix.de Reviewed-by: Gabriele Monaco Signed-off-by: Nam Cao Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/Kconfig | 6 +++--- kernel/trace/rv/rv.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index b39f36013ef2..6cdffc04b73c 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -1,14 +1,14 @@ # SPDX-License-Identifier: GPL-2.0-only # -config DA_MON_EVENTS +config RV_MON_EVENTS bool config DA_MON_EVENTS_IMPLICIT - select DA_MON_EVENTS + select RV_MON_EVENTS bool config DA_MON_EVENTS_ID - select DA_MON_EVENTS + select RV_MON_EVENTS bool menuconfig RV diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index e4077500a91d..e25d65fe432a 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -143,7 +143,7 @@ #include #include -#ifdef CONFIG_DA_MON_EVENTS +#ifdef CONFIG_RV_MON_EVENTS #define CREATE_TRACE_POINTS #include #endif -- cgit v1.2.3 From a9769a5b987838f03f3dd57b097794cd4c691098 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 9 Jul 2025 21:21:17 +0200 Subject: rv: Add support for LTL monitors While attempting to implement DA monitors for some complex specifications, deterministic automaton is found to be inappropriate as the specification language. The automaton is complicated, hard to understand, and error-prone. For these cases, linear temporal logic is more suitable as the specification language. Add support for linear temporal logic runtime verification monitor. Cc: John Ogness Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Gabriele Monaco Link: https://lore.kernel.org/d366c1fed60ed4e8f6451f3c15a99755f2740b5f.1752088709.git.namcao@linutronix.de Signed-off-by: Nam Cao Signed-off-by: Steven Rostedt (Google) --- kernel/fork.c | 5 +---- kernel/trace/rv/Kconfig | 7 +++++++ kernel/trace/rv/rv_trace.h | 47 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 1ee8eb11f38b..1f06559d17bf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1886,10 +1886,7 @@ static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk) #ifdef CONFIG_RV static void rv_task_fork(struct task_struct *p) { - int i; - - for (i = 0; i < RV_PER_TASK_MONITORS; i++) - p->rv[i].da_mon.monitoring = false; + memset(&p->rv, 0, sizeof(p->rv)); } #else #define rv_task_fork(p) do {} while (0) diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index 6cdffc04b73c..6e157f964991 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -11,6 +11,13 @@ config DA_MON_EVENTS_ID select RV_MON_EVENTS bool +config LTL_MON_EVENTS_ID + select RV_MON_EVENTS + bool + +config RV_LTL_MONITOR + bool + menuconfig RV bool "Runtime Verification" depends on TRACING diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h index 99c3801616d4..fd3111ad1d51 100644 --- a/kernel/trace/rv/rv_trace.h +++ b/kernel/trace/rv/rv_trace.h @@ -127,6 +127,53 @@ DECLARE_EVENT_CLASS(error_da_monitor_id, // Add new monitors based on CONFIG_DA_MON_EVENTS_ID here #endif /* CONFIG_DA_MON_EVENTS_ID */ +#ifdef CONFIG_LTL_MON_EVENTS_ID +DECLARE_EVENT_CLASS(event_ltl_monitor_id, + + TP_PROTO(struct task_struct *task, char *states, char *atoms, char *next), + + TP_ARGS(task, states, atoms, next), + + TP_STRUCT__entry( + 
__string(comm, task->comm) + __field(pid_t, pid) + __string(states, states) + __string(atoms, atoms) + __string(next, next) + ), + + TP_fast_assign( + __assign_str(comm); + __entry->pid = task->pid; + __assign_str(states); + __assign_str(atoms); + __assign_str(next); + ), + + TP_printk("%s[%d]: (%s) x (%s) -> (%s)", __get_str(comm), __entry->pid, + __get_str(states), __get_str(atoms), __get_str(next)) +); + +DECLARE_EVENT_CLASS(error_ltl_monitor_id, + + TP_PROTO(struct task_struct *task), + + TP_ARGS(task), + + TP_STRUCT__entry( + __string(comm, task->comm) + __field(pid_t, pid) + ), + + TP_fast_assign( + __assign_str(comm); + __entry->pid = task->pid; + ), + + TP_printk("%s[%d]: violation detected", __get_str(comm), __entry->pid) +); +// Add new monitors based on CONFIG_LTL_MON_EVENTS_ID here +#endif /* CONFIG_LTL_MON_EVENTS_ID */ #endif /* _TRACE_RV_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 886fc86e9419a2bf61713ba566eb6edd8e6aa989 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 9 Jul 2025 21:21:18 +0200 Subject: rv: Add rtapp container monitor Add the container "rtapp" which is the monitor collection for detecting problems with real-time applications. The monitors will be added in the follow-up commits. Cc: John Ogness Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/fb18b87631d386271de00959d8d4826f23fcd1cd.1752088709.git.namcao@linutronix.de Reviewed-by: Gabriele Monaco Signed-off-by: Nam Cao Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/Kconfig | 1 + kernel/trace/rv/Makefile | 1 + kernel/trace/rv/monitors/rtapp/Kconfig | 10 ++++++++++ kernel/trace/rv/monitors/rtapp/rtapp.c | 33 +++++++++++++++++++++++++++++++++ kernel/trace/rv/monitors/rtapp/rtapp.h | 3 +++ 5 files changed, 48 insertions(+) create mode 100644 kernel/trace/rv/monitors/rtapp/Kconfig create mode 100644 kernel/trace/rv/monitors/rtapp/rtapp.c create mode 100644 kernel/trace/rv/monitors/rtapp/rtapp.h (limited to 'kernel') diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index 6e157f964991..5c407d291661 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -41,6 +41,7 @@ source "kernel/trace/rv/monitors/snroc/Kconfig" source "kernel/trace/rv/monitors/scpd/Kconfig" source "kernel/trace/rv/monitors/snep/Kconfig" source "kernel/trace/rv/monitors/sncid/Kconfig" +source "kernel/trace/rv/monitors/rtapp/Kconfig" # Add new monitors here config RV_REACTORS diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index f9b2cd0483c3..9b28c2419995 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_RV_MON_SNROC) += monitors/snroc/snroc.o obj-$(CONFIG_RV_MON_SCPD) += monitors/scpd/scpd.o obj-$(CONFIG_RV_MON_SNEP) += monitors/snep/snep.o obj-$(CONFIG_RV_MON_SNCID) += monitors/sncid/sncid.o +obj-$(CONFIG_RV_MON_RTAPP) += monitors/rtapp/rtapp.o # Add new monitors here obj-$(CONFIG_RV_REACTORS) += rv_reactors.o obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o diff --git a/kernel/trace/rv/monitors/rtapp/Kconfig b/kernel/trace/rv/monitors/rtapp/Kconfig new file mode 100644 index 000000000000..b7415c3570bb --- /dev/null +++ b/kernel/trace/rv/monitors/rtapp/Kconfig @@ -0,0 +1,10 @@ +config RV_MON_RTAPP + depends on RV + bool "rtapp monitor" + help + Collection of monitors to check for common problems with real-time + application that may cause unexpected latency. 
+ + If you are developing a real-time system and not entirely sure whether + the applications are designed correctly for real-time, you want to say + Y here. diff --git a/kernel/trace/rv/monitors/rtapp/rtapp.c b/kernel/trace/rv/monitors/rtapp/rtapp.c new file mode 100644 index 000000000000..fd75fc927d65 --- /dev/null +++ b/kernel/trace/rv/monitors/rtapp/rtapp.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +#define MODULE_NAME "rtapp" + +#include "rtapp.h" + +struct rv_monitor rv_rtapp; + +struct rv_monitor rv_rtapp = { + .name = "rtapp", + .description = "Collection of monitors for detecting problems with real-time applications", +}; + +static int __init register_rtapp(void) +{ + return rv_register_monitor(&rv_rtapp, NULL); +} + +static void __exit unregister_rtapp(void) +{ + rv_unregister_monitor(&rv_rtapp); +} + +module_init(register_rtapp); +module_exit(unregister_rtapp); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Nam Cao "); +MODULE_DESCRIPTION("Collection of monitors for detecting problems with real-time applications"); diff --git a/kernel/trace/rv/monitors/rtapp/rtapp.h b/kernel/trace/rv/monitors/rtapp/rtapp.h new file mode 100644 index 000000000000..4c200d67c7f6 --- /dev/null +++ b/kernel/trace/rv/monitors/rtapp/rtapp.h @@ -0,0 +1,3 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +extern struct rv_monitor rv_rtapp; -- cgit v1.2.3 From 9162620eb604d7461da5b02ec379bb50c3c3b604 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 9 Jul 2025 21:21:20 +0200 Subject: rv: Add rtapp_pagefault monitor Userspace real-time applications may have design flaws whereby they raise page faults from real-time threads, and thus incur unexpected latencies. Add a linear temporal logic monitor to detect this scenario. Cc: John Ogness Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/78fea8a2de6d058241d3c6502c1a92910772b0ed.1752088709.git.namcao@linutronix.de Reviewed-by: Gabriele Monaco Signed-off-by: Nam Cao Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/Kconfig | 1 + kernel/trace/rv/Makefile | 1 + kernel/trace/rv/monitors/pagefault/Kconfig | 20 +++++ kernel/trace/rv/monitors/pagefault/pagefault.c | 88 ++++++++++++++++++++++ kernel/trace/rv/monitors/pagefault/pagefault.h | 64 ++++++++++++++++ .../trace/rv/monitors/pagefault/pagefault_trace.h | 14 ++++ kernel/trace/rv/rv_trace.h | 1 + 7 files changed, 189 insertions(+) create mode 100644 kernel/trace/rv/monitors/pagefault/Kconfig create mode 100644 kernel/trace/rv/monitors/pagefault/pagefault.c create mode 100644 kernel/trace/rv/monitors/pagefault/pagefault.h create mode 100644 kernel/trace/rv/monitors/pagefault/pagefault_trace.h (limited to 'kernel') diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index 5c407d291661..6f86d8501e87 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -42,6 +42,7 @@ source "kernel/trace/rv/monitors/scpd/Kconfig" source "kernel/trace/rv/monitors/snep/Kconfig" source "kernel/trace/rv/monitors/sncid/Kconfig" source "kernel/trace/rv/monitors/rtapp/Kconfig" +source "kernel/trace/rv/monitors/pagefault/Kconfig" # Add new monitors here config RV_REACTORS diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index 9b28c2419995..353ecf939d0e 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -13,6 +13,7 @@ obj-$(CONFIG_RV_MON_SCPD) += monitors/scpd/scpd.o obj-$(CONFIG_RV_MON_SNEP) += monitors/snep/snep.o obj-$(CONFIG_RV_MON_SNCID) += monitors/sncid/sncid.o obj-$(CONFIG_RV_MON_RTAPP) +=
monitors/rtapp/rtapp.o +obj-$(CONFIG_RV_MON_PAGEFAULT) += monitors/pagefault/pagefault.o # Add new monitors here obj-$(CONFIG_RV_REACTORS) += rv_reactors.o obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o diff --git a/kernel/trace/rv/monitors/pagefault/Kconfig b/kernel/trace/rv/monitors/pagefault/Kconfig new file mode 100644 index 000000000000..5e16625f1653 --- /dev/null +++ b/kernel/trace/rv/monitors/pagefault/Kconfig @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_PAGEFAULT + depends on RV + select RV_LTL_MONITOR + depends on RV_MON_RTAPP + depends on X86 || RISCV + default y + select LTL_MON_EVENTS_ID + bool "pagefault monitor" + help + Monitor that real-time tasks do not raise page faults, causing + undesirable latency. + + If you are developing a real-time system and not entirely sure whether + the applications are designed correctly for real-time, you want to say + Y here. + + This monitor does not affect execution speed while it is not running, + therefore it is safe to enable this in production kernel. diff --git a/kernel/trace/rv/monitors/pagefault/pagefault.c b/kernel/trace/rv/monitors/pagefault/pagefault.c new file mode 100644 index 000000000000..9fe6123b2200 --- /dev/null +++ b/kernel/trace/rv/monitors/pagefault/pagefault.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MODULE_NAME "pagefault" + +#include +#include +#include + +#include "pagefault.h" +#include + +static void ltl_atoms_fetch(struct task_struct *task, struct ltl_monitor *mon) +{ + /* + * This includes "actual" real-time tasks and also PI-boosted + * tasks. A task being PI-boosted means it is blocking an "actual" + * real-task, therefore it should also obey the monitor's rule, + * otherwise the "actual" real-task may be delayed. 
+ */ + ltl_atom_set(mon, LTL_RT, rt_or_dl_task(task)); +} + +static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bool task_creation) +{ + if (task_creation) + ltl_atom_set(mon, LTL_PAGEFAULT, false); +} + +static void handle_page_fault(void *data, unsigned long address, struct pt_regs *regs, + unsigned long error_code) +{ + ltl_atom_pulse(current, LTL_PAGEFAULT, true); +} + +static int enable_pagefault(void) +{ + int retval; + + retval = ltl_monitor_init(); + if (retval) + return retval; + + rv_attach_trace_probe("rtapp_pagefault", page_fault_kernel, handle_page_fault); + rv_attach_trace_probe("rtapp_pagefault", page_fault_user, handle_page_fault); + + return 0; +} + +static void disable_pagefault(void) +{ + rv_detach_trace_probe("rtapp_pagefault", page_fault_kernel, handle_page_fault); + rv_detach_trace_probe("rtapp_pagefault", page_fault_user, handle_page_fault); + + ltl_monitor_destroy(); +} + +static struct rv_monitor rv_pagefault = { + .name = "pagefault", + .description = "Monitor that RT tasks do not raise page faults", + .enable = enable_pagefault, + .disable = disable_pagefault, +}; + +static int __init register_pagefault(void) +{ + return rv_register_monitor(&rv_pagefault, &rv_rtapp); +} + +static void __exit unregister_pagefault(void) +{ + rv_unregister_monitor(&rv_pagefault); +} + +module_init(register_pagefault); +module_exit(unregister_pagefault); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Nam Cao "); +MODULE_DESCRIPTION("pagefault: Monitor that RT tasks do not raise page faults"); diff --git a/kernel/trace/rv/monitors/pagefault/pagefault.h b/kernel/trace/rv/monitors/pagefault/pagefault.h new file mode 100644 index 000000000000..c580ec194009 --- /dev/null +++ b/kernel/trace/rv/monitors/pagefault/pagefault.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * C implementation of Buchi automaton, automatically generated by + * tools/verification/rvgen from the linear temporal logic specification. 
+ * For further information, see kernel documentation: + * Documentation/trace/rv/linear_temporal_logic.rst + */ + +#include + +#define MONITOR_NAME pagefault + +enum ltl_atom { + LTL_PAGEFAULT, + LTL_RT, + LTL_NUM_ATOM +}; +static_assert(LTL_NUM_ATOM <= RV_MAX_LTL_ATOM); + +static const char *ltl_atom_str(enum ltl_atom atom) +{ + static const char *const names[] = { + "pa", + "rt", + }; + + return names[atom]; +} + +enum ltl_buchi_state { + S0, + RV_NUM_BA_STATES +}; +static_assert(RV_NUM_BA_STATES <= RV_MAX_BA_STATES); + +static void ltl_start(struct task_struct *task, struct ltl_monitor *mon) +{ + bool pagefault = test_bit(LTL_PAGEFAULT, mon->atoms); + bool val3 = !pagefault; + bool rt = test_bit(LTL_RT, mon->atoms); + bool val1 = !rt; + bool val4 = val1 || val3; + + if (val4) + __set_bit(S0, mon->states); +} + +static void +ltl_possible_next_states(struct ltl_monitor *mon, unsigned int state, unsigned long *next) +{ + bool pagefault = test_bit(LTL_PAGEFAULT, mon->atoms); + bool val3 = !pagefault; + bool rt = test_bit(LTL_RT, mon->atoms); + bool val1 = !rt; + bool val4 = val1 || val3; + + switch (state) { + case S0: + if (val4) + __set_bit(S0, next); + break; + } +} diff --git a/kernel/trace/rv/monitors/pagefault/pagefault_trace.h b/kernel/trace/rv/monitors/pagefault/pagefault_trace.h new file mode 100644 index 000000000000..fe1f82597b1a --- /dev/null +++ b/kernel/trace/rv/monitors/pagefault/pagefault_trace.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_PAGEFAULT +DEFINE_EVENT(event_ltl_monitor_id, event_pagefault, + TP_PROTO(struct task_struct *task, char *states, char *atoms, char *next), + TP_ARGS(task, states, atoms, next)); +DEFINE_EVENT(error_ltl_monitor_id, error_pagefault, + TP_PROTO(struct task_struct *task), + TP_ARGS(task)); +#endif /* CONFIG_RV_MON_PAGEFAULT */ diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h index fd3111ad1d51..98eee8ec96e4 100644 --- a/kernel/trace/rv/rv_trace.h +++ b/kernel/trace/rv/rv_trace.h @@ -172,6 +172,7 @@ DECLARE_EVENT_CLASS(error_ltl_monitor_id, TP_printk("%s[%d]: violation detected", __get_str(comm), __entry->pid) ); +#include // Add new monitors based on CONFIG_LTL_MON_EVENTS_ID here #endif /* CONFIG_LTL_MON_EVENTS_ID */ #endif /* _TRACE_RV_H */ -- cgit v1.2.3 From f74f8bb246cf22f27752977da62079cb615f55b2 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 9 Jul 2025 21:21:21 +0200 Subject: rv: Add rtapp_sleep monitor Add a monitor for checking that real-time tasks do not go to sleep in a manner that may cause undesirable latency. Also change RV depends on TRACING to RV select TRACING to avoid the following recursive dependency: error: recursive dependency detected! 
symbol TRACING is selected by PREEMPTIRQ_TRACEPOINTS symbol PREEMPTIRQ_TRACEPOINTS depends on TRACE_IRQFLAGS symbol TRACE_IRQFLAGS is selected by RV_MON_SLEEP symbol RV_MON_SLEEP depends on RV symbol RV depends on TRACING Cc: John Ogness Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/75bc5bcc741d153aa279c95faf778dff35c5c8ad.1752088709.git.namcao@linutronix.de Reviewed-by: Gabriele Monaco Signed-off-by: Nam Cao Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/Kconfig | 3 +- kernel/trace/rv/Makefile | 1 + kernel/trace/rv/monitors/sleep/Kconfig | 22 +++ kernel/trace/rv/monitors/sleep/sleep.c | 237 ++++++++++++++++++++++++ kernel/trace/rv/monitors/sleep/sleep.h | 257 +++++++++++++++++++++++++++ kernel/trace/rv/monitors/sleep/sleep_trace.h | 14 ++ kernel/trace/rv/rv_trace.h | 1 + 7 files changed, 534 insertions(+), 1 deletion(-) create mode 100644 kernel/trace/rv/monitors/sleep/Kconfig create mode 100644 kernel/trace/rv/monitors/sleep/sleep.c create mode 100644 kernel/trace/rv/monitors/sleep/sleep.h create mode 100644 kernel/trace/rv/monitors/sleep/sleep_trace.h (limited to 'kernel') diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index 6f86d8501e87..942d57575e67 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -20,7 +20,7 @@ config RV_LTL_MONITOR menuconfig RV bool "Runtime Verification" - depends on TRACING + select TRACING help Enable the kernel runtime verification infrastructure. RV is a lightweight (yet rigorous) method that complements classical @@ -43,6 +43,7 @@ source "kernel/trace/rv/monitors/snep/Kconfig" source "kernel/trace/rv/monitors/sncid/Kconfig" source "kernel/trace/rv/monitors/rtapp/Kconfig" source "kernel/trace/rv/monitors/pagefault/Kconfig" +source "kernel/trace/rv/monitors/sleep/Kconfig" # Add new monitors here config RV_REACTORS diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index 353ecf939d0e..13ec2944c665 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -14,6 +14,7 @@ obj-$(CONFIG_RV_MON_SNEP) += monitors/snep/snep.o obj-$(CONFIG_RV_MON_SNCID) += monitors/sncid/sncid.o obj-$(CONFIG_RV_MON_RTAPP) += monitors/rtapp/rtapp.o obj-$(CONFIG_RV_MON_PAGEFAULT) += monitors/pagefault/pagefault.o +obj-$(CONFIG_RV_MON_SLEEP) += monitors/sleep/sleep.o # Add new monitors here obj-$(CONFIG_RV_REACTORS) += rv_reactors.o obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o diff --git a/kernel/trace/rv/monitors/sleep/Kconfig b/kernel/trace/rv/monitors/sleep/Kconfig new file mode 100644 index 000000000000..6b7a122e7b47 --- /dev/null +++ b/kernel/trace/rv/monitors/sleep/Kconfig @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SLEEP + depends on RV + select RV_LTL_MONITOR + depends on HAVE_SYSCALL_TRACEPOINTS + depends on RV_MON_RTAPP + select TRACE_IRQFLAGS + default y + select LTL_MON_EVENTS_ID + bool "sleep monitor" + help + Monitor that real-time tasks do not sleep in a manner that may + cause undesirable latency. + + If you are developing a real-time system and not entirely sure whether + the applications are designed correctly for real-time, you want to say + Y here. + + Enabling this monitor may have performance impact (due to select + TRACE_IRQFLAGS). Therefore, you probably should say N for + production kernel. 
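For orientation before the implementation below: the sleep monitor's sched_waking handler classifies the waker by priority, and task_struct::prio is inverted in the kernel (a numerically lower ->prio means a higher scheduling priority). A minimal sketch of that check, using only fields that appear in the diff that follows:

	/*
	 * Sketch: kernel priority values are inverted, so "<=" on ->prio
	 * means the waker has equal or higher scheduling priority than
	 * the task being woken.
	 */
	static bool woken_by_equal_or_higher_prio(struct task_struct *waker,
						  struct task_struct *wakee)
	{
		return waker->prio <= wakee->prio;
	}

This is why handle_sched_waking() below pulses LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO only when current->prio <= task->prio.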
diff --git a/kernel/trace/rv/monitors/sleep/sleep.c b/kernel/trace/rv/monitors/sleep/sleep.c new file mode 100644 index 000000000000..eea447b06907 --- /dev/null +++ b/kernel/trace/rv/monitors/sleep/sleep.c @@ -0,0 +1,237 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MODULE_NAME "sleep" + +#include +#include +#include +#include +#include +#include + +#include "sleep.h" +#include + +static void ltl_atoms_fetch(struct task_struct *task, struct ltl_monitor *mon) +{ + /* + * This includes "actual" real-time tasks and also PI-boosted + * tasks. A task being PI-boosted means it is blocking an "actual" + * real-task, therefore it should also obey the monitor's rule, + * otherwise the "actual" real-task may be delayed. + */ + ltl_atom_set(mon, LTL_RT, rt_or_dl_task(task)); +} + +static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bool task_creation) +{ + ltl_atom_set(mon, LTL_SLEEP, false); + ltl_atom_set(mon, LTL_WAKE, false); + ltl_atom_set(mon, LTL_ABORT_SLEEP, false); + ltl_atom_set(mon, LTL_WOKEN_BY_HARDIRQ, false); + ltl_atom_set(mon, LTL_WOKEN_BY_NMI, false); + ltl_atom_set(mon, LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, false); + + if (task_creation) { + ltl_atom_set(mon, LTL_KTHREAD_SHOULD_STOP, false); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false); + ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false); + ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false); + ltl_atom_set(mon, LTL_FUTEX_WAIT, false); + ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false); + ltl_atom_set(mon, LTL_BLOCK_ON_RT_MUTEX, false); + } + + if (task->flags & PF_KTHREAD) { + ltl_atom_set(mon, LTL_KERNEL_THREAD, true); + + /* kernel tasks do not do syscall */ + ltl_atom_set(mon, LTL_FUTEX_WAIT, false); + ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false); + ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false); + ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false); + + if (strstarts(task->comm, "migration/")) + ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, true); + else + ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, false); + + if (strstarts(task->comm, "rcu")) + ltl_atom_set(mon, LTL_TASK_IS_RCU, true); + else + ltl_atom_set(mon, LTL_TASK_IS_RCU, false); + } else { + ltl_atom_set(mon, LTL_KTHREAD_SHOULD_STOP, false); + ltl_atom_set(mon, LTL_KERNEL_THREAD, false); + ltl_atom_set(mon, LTL_TASK_IS_RCU, false); + ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, false); + } + +} + +static void handle_sched_set_state(void *data, struct task_struct *task, int state) +{ + if (state & TASK_INTERRUPTIBLE) + ltl_atom_pulse(task, LTL_SLEEP, true); + else if (state == TASK_RUNNING) + ltl_atom_pulse(task, LTL_ABORT_SLEEP, true); +} + +static void handle_sched_wakeup(void *data, struct task_struct *task) +{ + ltl_atom_pulse(task, LTL_WAKE, true); +} + +static void handle_sched_waking(void *data, struct task_struct *task) +{ + if (this_cpu_read(hardirq_context)) { + ltl_atom_pulse(task, LTL_WOKEN_BY_HARDIRQ, true); + } else if (in_task()) { + if (current->prio <= task->prio) + ltl_atom_pulse(task, LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, true); + } else if (in_nmi()) { + ltl_atom_pulse(task, LTL_WOKEN_BY_NMI, true); + } +} + +static void handle_contention_begin(void *data, void *lock, unsigned int flags) +{ + if (flags & LCB_F_RT) + ltl_atom_update(current, LTL_BLOCK_ON_RT_MUTEX, true); 
+} + +static void handle_contention_end(void *data, void *lock, int ret) +{ + ltl_atom_update(current, LTL_BLOCK_ON_RT_MUTEX, false); +} + +static void handle_sys_enter(void *data, struct pt_regs *regs, long id) +{ + struct ltl_monitor *mon; + unsigned long args[6]; + int op, cmd; + + mon = ltl_get_monitor(current); + + switch (id) { + case __NR_clock_nanosleep: +#ifdef __NR_clock_nanosleep_time64 + case __NR_clock_nanosleep_time64: +#endif + syscall_get_arguments(current, regs, args); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, args[0] == CLOCK_MONOTONIC); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, args[0] == CLOCK_TAI); + ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, args[1] == TIMER_ABSTIME); + ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, true); + break; + + case __NR_futex: +#ifdef __NR_futex_time64 + case __NR_futex_time64: +#endif + syscall_get_arguments(current, regs, args); + op = args[1]; + cmd = op & FUTEX_CMD_MASK; + + switch (cmd) { + case FUTEX_LOCK_PI: + case FUTEX_LOCK_PI2: + ltl_atom_update(current, LTL_FUTEX_LOCK_PI, true); + break; + case FUTEX_WAIT: + case FUTEX_WAIT_BITSET: + case FUTEX_WAIT_REQUEUE_PI: + ltl_atom_update(current, LTL_FUTEX_WAIT, true); + break; + } + break; + } +} + +static void handle_sys_exit(void *data, struct pt_regs *regs, long ret) +{ + struct ltl_monitor *mon = ltl_get_monitor(current); + + ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false); + ltl_atom_set(mon, LTL_FUTEX_WAIT, false); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false); + ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false); + ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false); + ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, false); +} + +static void handle_kthread_stop(void *data, struct task_struct *task) +{ + /* FIXME: this could race with other tracepoint handlers */ + ltl_atom_update(task, LTL_KTHREAD_SHOULD_STOP, true); +} + +static int enable_sleep(void) +{ + int retval; + + retval = ltl_monitor_init(); + if (retval) + return retval; + + rv_attach_trace_probe("rtapp_sleep", sched_waking, handle_sched_waking); + rv_attach_trace_probe("rtapp_sleep", sched_wakeup, handle_sched_wakeup); + rv_attach_trace_probe("rtapp_sleep", sched_set_state_tp, handle_sched_set_state); + rv_attach_trace_probe("rtapp_sleep", contention_begin, handle_contention_begin); + rv_attach_trace_probe("rtapp_sleep", contention_end, handle_contention_end); + rv_attach_trace_probe("rtapp_sleep", sched_kthread_stop, handle_kthread_stop); + rv_attach_trace_probe("rtapp_sleep", sys_enter, handle_sys_enter); + rv_attach_trace_probe("rtapp_sleep", sys_exit, handle_sys_exit); + return 0; +} + +static void disable_sleep(void) +{ + rv_detach_trace_probe("rtapp_sleep", sched_waking, handle_sched_waking); + rv_detach_trace_probe("rtapp_sleep", sched_wakeup, handle_sched_wakeup); + rv_detach_trace_probe("rtapp_sleep", sched_set_state_tp, handle_sched_set_state); + rv_detach_trace_probe("rtapp_sleep", contention_begin, handle_contention_begin); + rv_detach_trace_probe("rtapp_sleep", contention_end, handle_contention_end); + rv_detach_trace_probe("rtapp_sleep", sched_kthread_stop, handle_kthread_stop); + rv_detach_trace_probe("rtapp_sleep", sys_enter, handle_sys_enter); + rv_detach_trace_probe("rtapp_sleep", sys_exit, handle_sys_exit); + + ltl_monitor_destroy(); +} + +static struct rv_monitor rv_sleep = { + .name = "sleep", + .description = "Monitor that RT tasks do not undesirably sleep", + .enable = enable_sleep, + .disable = disable_sleep, +}; + +static int __init register_sleep(void) +{ + return 
rv_register_monitor(&rv_sleep, &rv_rtapp); +} + +static void __exit unregister_sleep(void) +{ + rv_unregister_monitor(&rv_sleep); +} + +module_init(register_sleep); +module_exit(unregister_sleep); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Nam Cao "); +MODULE_DESCRIPTION("sleep: Monitor that RT tasks do not undesirably sleep"); diff --git a/kernel/trace/rv/monitors/sleep/sleep.h b/kernel/trace/rv/monitors/sleep/sleep.h new file mode 100644 index 000000000000..2ab46fd218d2 --- /dev/null +++ b/kernel/trace/rv/monitors/sleep/sleep.h @@ -0,0 +1,257 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * C implementation of Buchi automaton, automatically generated by + * tools/verification/rvgen from the linear temporal logic specification. + * For further information, see kernel documentation: + * Documentation/trace/rv/linear_temporal_logic.rst + */ + +#include + +#define MONITOR_NAME sleep + +enum ltl_atom { + LTL_ABORT_SLEEP, + LTL_BLOCK_ON_RT_MUTEX, + LTL_CLOCK_NANOSLEEP, + LTL_FUTEX_LOCK_PI, + LTL_FUTEX_WAIT, + LTL_KERNEL_THREAD, + LTL_KTHREAD_SHOULD_STOP, + LTL_NANOSLEEP_CLOCK_MONOTONIC, + LTL_NANOSLEEP_CLOCK_TAI, + LTL_NANOSLEEP_TIMER_ABSTIME, + LTL_RT, + LTL_SLEEP, + LTL_TASK_IS_MIGRATION, + LTL_TASK_IS_RCU, + LTL_WAKE, + LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, + LTL_WOKEN_BY_HARDIRQ, + LTL_WOKEN_BY_NMI, + LTL_NUM_ATOM +}; +static_assert(LTL_NUM_ATOM <= RV_MAX_LTL_ATOM); + +static const char *ltl_atom_str(enum ltl_atom atom) +{ + static const char *const names[] = { + "ab_sl", + "bl_on_rt_mu", + "cl_na", + "fu_lo_pi", + "fu_wa", + "ker_th", + "kth_sh_st", + "na_cl_mo", + "na_cl_ta", + "na_ti_ab", + "rt", + "sl", + "ta_mi", + "ta_rc", + "wak", + "wo_eq_hi_pr", + "wo_ha", + "wo_nm", + }; + + return names[atom]; +} + +enum ltl_buchi_state { + S0, + S1, + S2, + S3, + S4, + S5, + S6, + S7, + RV_NUM_BA_STATES +}; +static_assert(RV_NUM_BA_STATES <= RV_MAX_BA_STATES); + +static void ltl_start(struct task_struct *task, struct ltl_monitor *mon) +{ + bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms); + bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms); + bool val40 = task_is_rcu || task_is_migration; + bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms); + bool val41 = futex_lock_pi || val40; + bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms); + bool val5 = block_on_rt_mutex || val41; + bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms); + bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms); + bool val32 = abort_sleep || kthread_should_stop; + bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms); + bool val33 = woken_by_nmi || val32; + bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms); + bool val34 = woken_by_hardirq || val33; + bool woken_by_equal_or_higher_prio = test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, + mon->atoms); + bool val14 = woken_by_equal_or_higher_prio || val34; + bool wake = test_bit(LTL_WAKE, mon->atoms); + bool val13 = !wake; + bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms); + bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon->atoms); + bool nanosleep_clock_monotonic = test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms); + bool val24 = nanosleep_clock_monotonic || nanosleep_clock_tai; + bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms); + bool val25 = nanosleep_timer_abstime && val24; + bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms); + bool val18 = clock_nanosleep && val25; + bool futex_wait = 
test_bit(LTL_FUTEX_WAIT, mon->atoms); + bool val9 = futex_wait || val18; + bool val11 = val9 || kernel_thread; + bool sleep = test_bit(LTL_SLEEP, mon->atoms); + bool val2 = !sleep; + bool rt = test_bit(LTL_RT, mon->atoms); + bool val1 = !rt; + bool val3 = val1 || val2; + + if (val3) + __set_bit(S0, mon->states); + if (val11 && val13) + __set_bit(S1, mon->states); + if (val11 && val14) + __set_bit(S4, mon->states); + if (val5) + __set_bit(S5, mon->states); +} + +static void +ltl_possible_next_states(struct ltl_monitor *mon, unsigned int state, unsigned long *next) +{ + bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms); + bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms); + bool val40 = task_is_rcu || task_is_migration; + bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms); + bool val41 = futex_lock_pi || val40; + bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms); + bool val5 = block_on_rt_mutex || val41; + bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms); + bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms); + bool val32 = abort_sleep || kthread_should_stop; + bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms); + bool val33 = woken_by_nmi || val32; + bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms); + bool val34 = woken_by_hardirq || val33; + bool woken_by_equal_or_higher_prio = test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, + mon->atoms); + bool val14 = woken_by_equal_or_higher_prio || val34; + bool wake = test_bit(LTL_WAKE, mon->atoms); + bool val13 = !wake; + bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms); + bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon->atoms); + bool nanosleep_clock_monotonic = test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms); + bool val24 = nanosleep_clock_monotonic || nanosleep_clock_tai; + bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms); + bool val25 = nanosleep_timer_abstime && val24; + bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms); + bool val18 = clock_nanosleep && val25; + bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms); + bool val9 = futex_wait || val18; + bool val11 = val9 || kernel_thread; + bool sleep = test_bit(LTL_SLEEP, mon->atoms); + bool val2 = !sleep; + bool rt = test_bit(LTL_RT, mon->atoms); + bool val1 = !rt; + bool val3 = val1 || val2; + + switch (state) { + case S0: + if (val3) + __set_bit(S0, next); + if (val11 && val13) + __set_bit(S1, next); + if (val11 && val14) + __set_bit(S4, next); + if (val5) + __set_bit(S5, next); + break; + case S1: + if (val11 && val13) + __set_bit(S1, next); + if (val13 && val3) + __set_bit(S2, next); + if (val14 && val3) + __set_bit(S3, next); + if (val11 && val14) + __set_bit(S4, next); + if (val13 && val5) + __set_bit(S6, next); + if (val14 && val5) + __set_bit(S7, next); + break; + case S2: + if (val11 && val13) + __set_bit(S1, next); + if (val13 && val3) + __set_bit(S2, next); + if (val14 && val3) + __set_bit(S3, next); + if (val11 && val14) + __set_bit(S4, next); + if (val13 && val5) + __set_bit(S6, next); + if (val14 && val5) + __set_bit(S7, next); + break; + case S3: + if (val3) + __set_bit(S0, next); + if (val11 && val13) + __set_bit(S1, next); + if (val11 && val14) + __set_bit(S4, next); + if (val5) + __set_bit(S5, next); + break; + case S4: + if (val3) + __set_bit(S0, next); + if (val11 && val13) + __set_bit(S1, next); + if (val11 && val14) + __set_bit(S4, next); + if (val5) + __set_bit(S5, next); + break; + 
case S5: + if (val3) + __set_bit(S0, next); + if (val11 && val13) + __set_bit(S1, next); + if (val11 && val14) + __set_bit(S4, next); + if (val5) + __set_bit(S5, next); + break; + case S6: + if (val11 && val13) + __set_bit(S1, next); + if (val13 && val3) + __set_bit(S2, next); + if (val14 && val3) + __set_bit(S3, next); + if (val11 && val14) + __set_bit(S4, next); + if (val13 && val5) + __set_bit(S6, next); + if (val14 && val5) + __set_bit(S7, next); + break; + case S7: + if (val3) + __set_bit(S0, next); + if (val11 && val13) + __set_bit(S1, next); + if (val11 && val14) + __set_bit(S4, next); + if (val5) + __set_bit(S5, next); + break; + } +} diff --git a/kernel/trace/rv/monitors/sleep/sleep_trace.h b/kernel/trace/rv/monitors/sleep/sleep_trace.h new file mode 100644 index 000000000000..22eaf31da987 --- /dev/null +++ b/kernel/trace/rv/monitors/sleep/sleep_trace.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SLEEP +DEFINE_EVENT(event_ltl_monitor_id, event_sleep, + TP_PROTO(struct task_struct *task, char *states, char *atoms, char *next), + TP_ARGS(task, states, atoms, next)); +DEFINE_EVENT(error_ltl_monitor_id, error_sleep, + TP_PROTO(struct task_struct *task), + TP_ARGS(task)); +#endif /* CONFIG_RV_MON_SLEEP */ diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h index 98eee8ec96e4..b6f310498466 100644 --- a/kernel/trace/rv/rv_trace.h +++ b/kernel/trace/rv/rv_trace.h @@ -173,6 +173,7 @@ DECLARE_EVENT_CLASS(error_ltl_monitor_id, TP_printk("%s[%d]: violation detected", __get_str(comm), __entry->pid) ); #include +#include // Add new monitors based on CONFIG_LTL_MON_EVENTS_ID here #endif /* CONFIG_LTL_MON_EVENTS_ID */ #endif /* _TRACE_RV_H */ -- cgit v1.2.3 From fac5493251a680cb74343895d0e76843624a90d8 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Wed, 9 Jul 2025 21:21:23 +0200 Subject: rv: Allow to configure the number of per-task monitor Now that there are 2 monitors for real-time applications, users may want to enable both of them simultaneously. Make the number of per-task monitor configurable. Default it to 2 for now. Cc: John Ogness Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/93e83313fc4ba7f6e66f4abe80ca5f5494d658d0.1752088709.git.namcao@linutronix.de Reviewed-by: Gabriele Monaco Signed-off-by: Nam Cao Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/Kconfig | 9 +++++++++ kernel/trace/rv/monitors/rtapp/Kconfig | 1 + kernel/trace/rv/rv.c | 8 ++++---- 3 files changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index 942d57575e67..c11bf7e61ebf 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -32,6 +32,15 @@ menuconfig RV For further information, see: Documentation/trace/rv/runtime-verification.rst +config RV_PER_TASK_MONITORS + int "Maximum number of per-task monitor" + depends on RV + range 1 8 + default 2 + help + This option configures the maximum number of per-task RV monitors that can run + simultaneously. 
+ source "kernel/trace/rv/monitors/wip/Kconfig" source "kernel/trace/rv/monitors/wwnr/Kconfig" source "kernel/trace/rv/monitors/sched/Kconfig" diff --git a/kernel/trace/rv/monitors/rtapp/Kconfig b/kernel/trace/rv/monitors/rtapp/Kconfig index b7415c3570bb..1ce9370a9ba8 100644 --- a/kernel/trace/rv/monitors/rtapp/Kconfig +++ b/kernel/trace/rv/monitors/rtapp/Kconfig @@ -1,5 +1,6 @@ config RV_MON_RTAPP depends on RV + depends on RV_PER_TASK_MONITORS >= 2 bool "rtapp monitor" help Collection of monitors to check for common problems with real-time diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index e25d65fe432a..108429d16ec1 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -165,7 +165,7 @@ struct dentry *get_monitors_root(void) LIST_HEAD(rv_monitors_list); static int task_monitor_count; -static bool task_monitor_slots[RV_PER_TASK_MONITORS]; +static bool task_monitor_slots[CONFIG_RV_PER_TASK_MONITORS]; int rv_get_task_monitor_slot(void) { @@ -173,12 +173,12 @@ int rv_get_task_monitor_slot(void) lockdep_assert_held(&rv_interface_lock); - if (task_monitor_count == RV_PER_TASK_MONITORS) + if (task_monitor_count == CONFIG_RV_PER_TASK_MONITORS) return -EBUSY; task_monitor_count++; - for (i = 0; i < RV_PER_TASK_MONITORS; i++) { + for (i = 0; i < CONFIG_RV_PER_TASK_MONITORS; i++) { if (task_monitor_slots[i] == false) { task_monitor_slots[i] = true; return i; @@ -194,7 +194,7 @@ void rv_put_task_monitor_slot(int slot) { lockdep_assert_held(&rv_interface_lock); - if (slot < 0 || slot >= RV_PER_TASK_MONITORS) { + if (slot < 0 || slot >= CONFIG_RV_PER_TASK_MONITORS) { WARN_ONCE(1, "RV releasing an invalid slot!: %d\n", slot); return; } -- cgit v1.2.3 From ca296d32ece38b07113bad64e08add75073a0e2b Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Wed, 4 Jun 2025 09:10:21 +0900 Subject: tracing: ring_buffer: Rewind persistent ring buffer on reboot Rewind persistent ring buffer pages which have been read in the previous boot. Those pages are highly possible to be lost before writing it to the disk if the previous kernel crashed. In this case, the trace data is kept on the persistent ring buffer, but it can not be read because its commit size has been reset after read. This skips clearing the commit size of each sub-buffer and recover it after reboot. Note: If you read the previous boot data via trace_pipe, that is not accessible in that time. But reboot without clearing (or reusing) the read data, the read data is recovered again in the next boot. Thus, when you read the previous boot data, clear it by `echo > trace`. 
Cc: Mathieu Desnoyers Link: https://lore.kernel.org/174899582116.955054.773265393511190051.stgit@mhiramat.tok.corp.google.com Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 103 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 100 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 00fc38d70e86..a99ed4716de9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1358,6 +1358,13 @@ static inline void rb_inc_page(struct buffer_page **bpage) *bpage = list_entry(p, struct buffer_page, list); } +static inline void rb_dec_page(struct buffer_page **bpage) +{ + struct list_head *p = rb_list_head((*bpage)->list.prev); + + *bpage = list_entry(p, struct buffer_page, list); +} + static struct buffer_page * rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer) { @@ -1866,10 +1873,11 @@ static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu) static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) { struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta; - struct buffer_page *head_page; + struct buffer_page *head_page, *orig_head; unsigned long entry_bytes = 0; unsigned long entries = 0; int ret; + u64 ts; int i; if (!meta || !meta->head_buffer) @@ -1885,8 +1893,98 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer) entry_bytes += local_read(&cpu_buffer->reader_page->page->commit); local_set(&cpu_buffer->reader_page->entries, ret); - head_page = cpu_buffer->head_page; + orig_head = head_page = cpu_buffer->head_page; + ts = head_page->page->time_stamp; + + /* + * Try to rewind the head so that we can read the pages which already + * read in the previous boot. + */ + if (head_page == cpu_buffer->tail_page) + goto skip_rewind; + + rb_dec_page(&head_page); + for (i = 0; i < meta->nr_subbufs + 1; i++, rb_dec_page(&head_page)) { + + /* Rewind until tail (writer) page. */ + if (head_page == cpu_buffer->tail_page) + break; + + /* Ensure the page has older data than head. */ + if (ts < head_page->page->time_stamp) + break; + + ts = head_page->page->time_stamp; + /* Ensure the page has correct timestamp and some data. */ + if (!ts || rb_page_commit(head_page) == 0) + break; + /* Stop rewind if the page is invalid. */ + ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu); + if (ret < 0) + break; + + /* Recover the number of entries and update stats. */ + local_set(&head_page->entries, ret); + if (ret) + local_inc(&cpu_buffer->pages_touched); + entries += ret; + entry_bytes += rb_page_commit(head_page); + } + if (i) + pr_info("Ring buffer [%d] rewound %d pages\n", cpu_buffer->cpu, i); + + /* The last rewound page must be skipped. */ + if (head_page != orig_head) + rb_inc_page(&head_page); + + /* + * If the ring buffer was rewound, then inject the reader page + * into the location just before the original head page. + */ + if (head_page != orig_head) { + struct buffer_page *bpage = orig_head; + + rb_dec_page(&bpage); + /* + * Insert the reader_page before the original head page. + * Since the list encode RB_PAGE flags, general list + * operations should be avoided. 
+ */ + cpu_buffer->reader_page->list.next = &orig_head->list; + cpu_buffer->reader_page->list.prev = orig_head->list.prev; + orig_head->list.prev = &cpu_buffer->reader_page->list; + bpage->list.next = &cpu_buffer->reader_page->list; + + /* Make the head_page the reader page */ + cpu_buffer->reader_page = head_page; + bpage = head_page; + rb_inc_page(&head_page); + head_page->list.prev = bpage->list.prev; + rb_dec_page(&bpage); + bpage->list.next = &head_page->list; + rb_set_list_to_head(&bpage->list); + cpu_buffer->pages = &head_page->list; + + cpu_buffer->head_page = head_page; + meta->head_buffer = (unsigned long)head_page->page; + + /* Reset all the indexes */ + bpage = cpu_buffer->reader_page; + meta->buffers[0] = rb_meta_subbuf_idx(meta, bpage->page); + bpage->id = 0; + + for (i = 1, bpage = head_page; i < meta->nr_subbufs; + i++, rb_inc_page(&bpage)) { + meta->buffers[i] = rb_meta_subbuf_idx(meta, bpage->page); + bpage->id = i; + } + + /* We'll restart verifying from orig_head */ + head_page = orig_head; + } + + skip_rewind: /* If the commit_buffer is the reader page, update the commit page */ if (meta->commit_buffer == (unsigned long)cpu_buffer->reader_page->page) { cpu_buffer->commit_page = cpu_buffer->reader_page; @@ -5342,7 +5440,6 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) */ local_set(&cpu_buffer->reader_page->write, 0); local_set(&cpu_buffer->reader_page->entries, 0); - local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->real_end = 0; spin: -- cgit v1.2.3 From 437889b926b30093048eba31cd1fde00ca8df316 Mon Sep 17 00:00:00 2001 From: Artem Sadovnikov Date: Tue, 17 Jun 2025 15:21:09 +0000 Subject: fgraph: Make pid_str size match the comment The comment above buffer mentions sign, 10 bytes width for number and null terminator, but buffer itself isn't large enough to hold that much data. This is a cosmetic change, since PID cannot be negative, other than -1. Found by Linux Verification Center (linuxtesting.org) with SVACE. Link: https://lore.kernel.org/20250617152110.2530-1-a.sadovnikov@ispras.ru Signed-off-by: Artem Sadovnikov Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_functions_graph.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 14d74a7491b8..66e1a527cf1a 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -513,7 +513,7 @@ static void print_graph_proc(struct trace_seq *s, pid_t pid) { char comm[TASK_COMM_LEN]; /* sign + log10(MAX_INT) + '\0' */ - char pid_str[11]; + char pid_str[12]; int spaces = 0; int len; int i; -- cgit v1.2.3 From db6cc3f4ac2e6cdc898fc9cbc8b32ae1bf56bdad Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Fri, 4 Jul 2025 21:56:20 +0800 Subject: Revert "sched/numa: add statistics of numa balance task" This reverts commit ad6b26b6a0a79166b53209df2ca1cf8636296382. This commit introduces per-memcg/task NUMA balance statistics, but unfortunately it introduced a NULL pointer exception due to the following race condition: After a swap task candidate was chosen, its mm_struct pointer was set to NULL due to task exit. Later, when performing the actual task swapping, the p->mm caused the problem. CPU0 CPU1 : ... 
task_numa_migrate
    task_numa_find_cpu
        task_numa_compare
            # a normal task p is chosen
            env->best_task = p

                                        # p exit:
                                        exit_signals(p);
                                            p->flags |= PF_EXITING
                                        exit_mm
                                            p->mm = NULL;

    migrate_swap_stop
        __migrate_swap_task(arg->src_task, arg->dst_cpu)
            count_memcg_event_mm(p->mm, NUMA_TASK_SWAP)  # p->mm is NULL

task_lock() should be held and the PF_EXITING flag needs to be checked to prevent this from happening. After discussion, the conclusion was that adding a lock is not worthwhile for some statistics calculations. Revert the change and rely on the tracepoint for this purpose. Link: https://lkml.kernel.org/r/20250704135620.685752-1-yu.c.chen@intel.com Link: https://lkml.kernel.org/r/20250708064917.BBD13C4CEED@smtp.kernel.org Fixes: ad6b26b6a0a7 ("sched/numa: add statistics of numa balance task") Signed-off-by: Chen Yu Reported-by: Jirka Hladky Closes: https://lore.kernel.org/all/CAE4VaGBLJxpd=NeRJXpSCuw=REhC5LWJpC29kDy-Zh2ZDyzQZA@mail.gmail.com/ Reported-by: Srikanth Aithal Reported-by: Suneeth D Acked-by: Michal Hocko Cc: Borislav Petkov Cc: Ingo Molnar Cc: Jiri Hladky Cc: Libo Chen Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- kernel/sched/core.c | 9 ++------- kernel/sched/debug.c | 4 ---- 2 files changed, 2 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ec68fc686bd7..81c6df746df1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3362,10 +3362,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) #ifdef CONFIG_NUMA_BALANCING static void __migrate_swap_task(struct task_struct *p, int cpu) { - __schedstat_inc(p->stats.numa_task_swapped); - count_vm_numa_event(NUMA_TASK_SWAP); - count_memcg_event_mm(p->mm, NUMA_TASK_SWAP); - if (task_on_rq_queued(p)) { struct rq *src_rq, *dst_rq; struct rq_flags srf, drf; @@ -7939,9 +7935,8 @@ int migrate_task_to(struct task_struct *p, int target_cpu) if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) return -EINVAL; - __schedstat_inc(p->stats.numa_task_migrated); - count_vm_numa_event(NUMA_TASK_MIGRATE); - count_memcg_event_mm(p->mm, NUMA_TASK_MIGRATE); + /* TODO: This is not properly updating schedstats */ + trace_sched_move_numa(p, curr_cpu, target_cpu); return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); } diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 9d71baf08075..557246880a7e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1210,10 +1210,6 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P_SCHEDSTAT(nr_failed_migrations_running); P_SCHEDSTAT(nr_failed_migrations_hot); P_SCHEDSTAT(nr_forced_migrations); -#ifdef CONFIG_NUMA_BALANCING - P_SCHEDSTAT(numa_task_migrated); - P_SCHEDSTAT(numa_task_swapped); -#endif P_SCHEDSTAT(nr_wakeups); P_SCHEDSTAT(nr_wakeups_sync); P_SCHEDSTAT(nr_wakeups_migrate); -- cgit v1.2.3 From a984f16fba2cbadfd8f3310cf506a484dc5bdeb6 Mon Sep 17 00:00:00 2001 From: Shivank Garg Date: Wed, 11 Jun 2025 05:27:07 +0000 Subject: mm: use folio_expected_ref_count() helper for reference counting Replace open-coded folio reference count calculations with folio_expected_ref_count(). No functional changes intended.
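For readers of the uprobes hunk below, the open-coded expectation being replaced can be restated as a helper; this is a sketch of the formula for an anonymous folio as used at this call site, not the actual implementation of folio_expected_ref_count():

	/*
	 * Sketch: each mapping holds one reference, and the swap cache
	 * holds one reference per page. Anything beyond this plus the
	 * caller's own reference is an unexpected user of the folio.
	 */
	static int anon_folio_expected_refs(struct folio *folio)
	{
		return folio_mapcount(folio) +
		       folio_test_swapcache(folio) * folio_nr_pages(folio);
	}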
Link: https://lkml.kernel.org/r/20250611052706.515408-2-shivankg@amd.com Signed-off-by: Shivank Garg Acked-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Alistair Popple Cc: Arnaldo Carvalho de Melo Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Marc Rutland Cc: "Masami Hiramatsu (Google)" Cc: Matthew Wilcox (Oracle) Cc: Namhyung kim Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Signed-off-by: Andrew Morton --- kernel/events/uprobes.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4c965ba77f9f..8a601df87072 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -436,8 +436,7 @@ static int __uprobe_write_opcode(struct vm_area_struct *vma, * there are no unexpected folio references ... */ if (is_register || userfaultfd_missing(vma) || - (folio_ref_count(folio) != folio_mapcount(folio) + 1 + - folio_test_swapcache(folio) * folio_nr_pages(folio))) + (folio_ref_count(folio) != folio_expected_ref_count(folio) + 1)) goto remap; /* -- cgit v1.2.3 From 6b4a80e424cd2f99ca7262a8c0c725ec82e57b8a Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Thu, 19 Jun 2025 18:57:54 +1000 Subject: mm: filter zone device pages returned from folio_walk_start() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously dax pages were skipped by the pagewalk code as pud_special() or vm_normal_page{_pmd}() would be false for DAX pages. Now that dax pages are refcounted normally that is no longer the case, so the pagewalk code will start returning them. Most callers already explicitly filter for DAX or zone device pages so don't need updating. However some don't, so add checks to those callers. Link: https://lkml.kernel.org/r/4ecb7b357fc5b435588024770b88bbb695c30090.1750323463.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Cc: Balbir Singh Cc: Björn Töpel Cc: Björn Töpel Cc: Christoph Hellwig Cc: Chunyan Zhang Cc: Dan Williams Cc: David Hildenbrand Cc: Deepak Gupta Cc: Gerald Schaefer Cc: Inki Dae Cc: Jason Gunthorpe Cc: John Groves Cc: John Hubbard Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 8a601df87072..f774367c8e71 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -539,7 +539,7 @@ retry: } ret = 0; - if (unlikely(!folio_test_anon(folio))) { + if (unlikely(!folio_test_anon(folio) || folio_is_zone_device(folio))) { VM_WARN_ON_ONCE(is_register); folio_put(folio); goto out; -- cgit v1.2.3 From 449e0b4ed5a16c72289a786c5333fc97520402bf Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 9 May 2025 08:29:27 +0200 Subject: fork: clean-up naming of vm_stack/vm_struct variables in vmap stacks code There are two data types: "struct vm_struct" and "struct vm_stack" that have the same local variable names: vm_stack, or vm, or s, which makes the code confusing to read. Change the code so the naming is consistent: struct vm_struct is always called vm_area struct vm_stack is always called vm_stack One change altering vfree(vm_stack) to vfree(vm_area->addr) may look like a semantic change but it is not: vm_area->addr points to the vm_stack. This was done to improve readability. 
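The equivalence of vfree(vm_stack) and vfree(vm_area->addr) follows from where the vm_stack lives; a sketch of the aliasing, assuming (as the message states) that the struct vm_stack carrying the RCU head is placed at the base of the stack allocation itself:

	struct vm_stack *vm_stack = tsk->stack;	/* RCU head stored in the stack area */
	struct vm_struct *vm_area = task_stack_vm_area(tsk);

	vm_stack->stack_vm_area = vm_area;
	/* ... after the RCU grace period ... */
	vfree(vm_area->addr);			/* vm_area->addr == vm_stack */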
[linus.walleij@linaro.org: rebased and added new users of the variable names, address review comments] Link: https://lore.kernel.org/20240311164638.2015063-4-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20250509-fork-fixes-v3-2-e6c69dd356f2@linaro.org Signed-off-by: Pasha Tatashin Signed-off-by: Linus Walleij Acked-by: Mike Rapoport (Microsoft) Cc: Mateusz Guzik Signed-off-by: Andrew Morton --- kernel/fork.c | 60 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 29 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 1ee8eb11f38b..5fd893c907a5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -207,14 +207,14 @@ struct vm_stack { struct vm_struct *stack_vm_area; }; -static bool try_release_thread_stack_to_cache(struct vm_struct *vm) +static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area) { unsigned int i; for (i = 0; i < NR_CACHED_STACKS; i++) { struct vm_struct *tmp = NULL; - if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm)) + if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area)) return true; } return false; @@ -223,11 +223,12 @@ static bool try_release_thread_stack_to_cache(struct vm_struct *vm) static void thread_stack_free_rcu(struct rcu_head *rh) { struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu); + struct vm_struct *vm_area = vm_stack->stack_vm_area; if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area)) return; - vfree(vm_stack); + vfree(vm_area->addr); } static void thread_stack_delayed_free(struct task_struct *tsk) @@ -240,32 +241,32 @@ static void thread_stack_delayed_free(struct task_struct *tsk) static int free_vm_stack_cache(unsigned int cpu) { - struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu); + struct vm_struct **cached_vm_stack_areas = per_cpu_ptr(cached_stacks, cpu); int i; for (i = 0; i < NR_CACHED_STACKS; i++) { - struct vm_struct *vm_stack = cached_vm_stacks[i]; + struct vm_struct *vm_area = cached_vm_stack_areas[i]; - if (!vm_stack) + if (!vm_area) continue; - vfree(vm_stack->addr); - cached_vm_stacks[i] = NULL; + vfree(vm_area->addr); + cached_vm_stack_areas[i] = NULL; } return 0; } -static int memcg_charge_kernel_stack(struct vm_struct *vm) +static int memcg_charge_kernel_stack(struct vm_struct *vm_area) { int i; int ret; int nr_charged = 0; - BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); + BUG_ON(vm_area->nr_pages != THREAD_SIZE / PAGE_SIZE); for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { - ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0); + ret = memcg_kmem_charge_page(vm_area->pages[i], GFP_KERNEL, 0); if (ret) goto err; nr_charged++; @@ -273,38 +274,35 @@ static int memcg_charge_kernel_stack(struct vm_struct *vm) return 0; err: for (i = 0; i < nr_charged; i++) - memcg_kmem_uncharge_page(vm->pages[i], 0); + memcg_kmem_uncharge_page(vm_area->pages[i], 0); return ret; } static int alloc_thread_stack_node(struct task_struct *tsk, int node) { - struct vm_struct *vm; + struct vm_struct *vm_area; void *stack; int i; for (i = 0; i < NR_CACHED_STACKS; i++) { - struct vm_struct *s; - - s = this_cpu_xchg(cached_stacks[i], NULL); - - if (!s) + vm_area = this_cpu_xchg(cached_stacks[i], NULL); + if (!vm_area) continue; /* Reset stack metadata. */ - kasan_unpoison_range(s->addr, THREAD_SIZE); + kasan_unpoison_range(vm_area->addr, THREAD_SIZE); - stack = kasan_reset_tag(s->addr); + stack = kasan_reset_tag(vm_area->addr); /* Clear stale pointers from reused stack. 
*/ memset(stack, 0, THREAD_SIZE); - if (memcg_charge_kernel_stack(s)) { - vfree(s->addr); + if (memcg_charge_kernel_stack(vm_area)) { + vfree(vm_area->addr); return -ENOMEM; } - tsk->stack_vm_area = s; + tsk->stack_vm_area = vm_area; tsk->stack = stack; return 0; } @@ -320,8 +318,8 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) if (!stack) return -ENOMEM; - vm = find_vm_area(stack); - if (memcg_charge_kernel_stack(vm)) { + vm_area = find_vm_area(stack); + if (memcg_charge_kernel_stack(vm_area)) { vfree(stack); return -ENOMEM; } @@ -330,7 +328,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) * free_thread_stack() can be called in interrupt context, * so cache the vm_struct. */ - tsk->stack_vm_area = vm; + tsk->stack_vm_area = vm_area; stack = kasan_reset_tag(stack); tsk->stack = stack; return 0; @@ -437,11 +435,11 @@ static struct kmem_cache *mm_cachep; static void account_kernel_stack(struct task_struct *tsk, int account) { if (IS_ENABLED(CONFIG_VMAP_STACK)) { - struct vm_struct *vm = task_stack_vm_area(tsk); + struct vm_struct *vm_area = task_stack_vm_area(tsk); int i; for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) - mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB, + mod_lruvec_page_state(vm_area->pages[i], NR_KERNEL_STACK_KB, account * (PAGE_SIZE / 1024)); } else { void *stack = task_stack_page(tsk); @@ -457,12 +455,12 @@ void exit_task_stack_account(struct task_struct *tsk) account_kernel_stack(tsk, -1); if (IS_ENABLED(CONFIG_VMAP_STACK)) { - struct vm_struct *vm; + struct vm_struct *vm_area; int i; - vm = task_stack_vm_area(tsk); + vm_area = task_stack_vm_area(tsk); for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) - memcg_kmem_uncharge_page(vm->pages[i], 0); + memcg_kmem_uncharge_page(vm_area->pages[i], 0); } } -- cgit v1.2.3 From f7b0ff2bc91d8bb2ba9fdb182da39dd9733b1c50 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 9 May 2025 09:25:09 +0200 Subject: fork: define a local GFP_VMAP_STACK The current allocation of VMAP stack memory is using (THREADINFO_GFP & ~__GFP_ACCOUNT) which is a complicated way of saying (GFP_KERNEL | __GFP_ZERO): : define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) : define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT) This is an unfortunate side-effect of independent changes blurring the picture: commit 19809c2da28aee5860ad9a2eff760730a0710df0 changed (THREADINFO_GFP | __GFP_HIGHMEM) to just THREADINFO_GFP since highmem became implicit. commit 9b6f7e163cd0f468d1b9696b785659d3c27c8667 then added stack caching and rewrote the allocation to (THREADINFO_GFP & ~__GFP_ACCOUNT) as cached stacks need to be accounted separately. However that code, when it eventually accounts the memory does this: ret = memcg_kmem_charge(vm->pages[i], GFP_KERNEL, 0) so the memory is charged as a GFP_KERNEL allocation. Define a unique GFP_VMAP_STACK to use GFP_KERNEL | __GFP_ZERO and move the comment there. 
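The flag identity the message walks through can be checked at compile time; a sketch under the definitions quoted above (worth re-verifying against the tree at hand):

	/*
	 * THREADINFO_GFP     == GFP_KERNEL_ACCOUNT | __GFP_ZERO
	 * GFP_KERNEL_ACCOUNT == GFP_KERNEL | __GFP_ACCOUNT
	 * so masking out __GFP_ACCOUNT leaves exactly GFP_KERNEL | __GFP_ZERO.
	 */
	static_assert((THREADINFO_GFP & ~__GFP_ACCOUNT) ==
		      (GFP_KERNEL | __GFP_ZERO));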
Link: https://lkml.kernel.org/r/20250509-gfp-stack-v1-1-82f6f7efc210@linaro.org Signed-off-by: Linus Walleij Reported-by: Mateusz Guzik Cc: Pasha Tatashin Cc: Mike Rapoport (Microsoft) Signed-off-by: Andrew Morton --- kernel/fork.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 5fd893c907a5..6616d173307a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -201,6 +201,12 @@ static inline void free_task_struct(struct task_struct *tsk) */ #define NR_CACHED_STACKS 2 static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); +/* + * Allocated stacks are cached and later reused by new threads, so memcg + * accounting is performed by the code assigning/releasing stacks to tasks. + * We need a zeroed memory without __GFP_ACCOUNT. + */ +#define GFP_VMAP_STACK (GFP_KERNEL | __GFP_ZERO) struct vm_stack { struct rcu_head rcu; @@ -307,13 +313,8 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) return 0; } - /* - * Allocated stacks are cached and later reused by new threads, - * so memcg accounting is performed manually on assigning/releasing - * stacks to tasks. Drop __GFP_ACCOUNT. - */ stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, - THREADINFO_GFP & ~__GFP_ACCOUNT, + GFP_VMAP_STACK, node, __builtin_return_address(0)); if (!stack) return -ENOMEM; -- cgit v1.2.3 From 0ba5a25ad1c951fa25baa8c30a526b647ab50d47 Mon Sep 17 00:00:00 2001 From: Elijah Wright Date: Tue, 10 Jun 2025 15:56:28 -0700 Subject: kernel: relay: use __GFP_ZERO in relay_alloc_buf Passing the __GFP_ZERO flag to alloc_page() should result in less overhead than using memset(). Link: https://lkml.kernel.org/r/20250610225639.314970-3-git@elijahs.space Signed-off-by: Elijah Wright Signed-off-by: Andrew Morton --- kernel/relay.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index c0c93a04d4ce..3ee5b038d0d9 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -118,7 +118,7 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) return NULL; for (i = 0; i < n_pages; i++) { - buf->page_array[i] = alloc_page(GFP_KERNEL); + buf->page_array[i] = alloc_page(GFP_KERNEL | __GFP_ZERO); if (unlikely(!buf->page_array[i])) goto depopulate; set_page_private(buf->page_array[i], (unsigned long)buf); @@ -127,7 +127,6 @@ static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) if (!mem) goto depopulate; - memset(mem, 0, *size); buf->page_count = n_pages; return mem; -- cgit v1.2.3 From 2489e958129ff7cbf26a34ee33cdc9ccbd68fe3c Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Thu, 12 Jun 2025 14:11:57 +0800 Subject: relayfs: abolish prev_padding Patch series "relayfs: misc changes", v5. The series mostly focuses on error counters, which help every user debug their own kernel module. This patch (of 5): prev_padding represents the unused space of a given subbuffer. If the content of a relay_write() call exceeds the space remaining in the current subbuffer, relay_switch_subbuf() skips the rest of that subbuffer and records the start of the skipped region as buf->prev_padding. Since the buf is a big per-cpu buffer, keeping prev_padding as a single global value for the whole buffer rather than per subbuffer (the per-subbuffer padding info is already stored in buf->padding[]) is meaningless for real use cases, so don't bother recording it any more.
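With the global prev_padding gone, the per-subbuffer array carries the same information; a hedged sketch of how the unused tail of a finished subbuffer can still be derived (helper name illustrative, not part of the relay API):

	/* Sketch: bytes of real data in finished subbuffer idx; the unused
	 * tail recorded by relay_switch_subbuf() lives in buf->padding[].
	 */
	static size_t subbuf_data_bytes(struct rchan_buf *buf, size_t idx)
	{
		return buf->chan->subbuf_size - buf->padding[idx];
	}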
Link: https://lkml.kernel.org/r/20250612061201.34272-1-kerneljasonxing@gmail.com Link: https://lkml.kernel.org/r/20250612061201.34272-2-kerneljasonxing@gmail.com Signed-off-by: Jason Xing Reviewed-by: Yushan Zhou Reviewed-by: Masami Hiramatsu (Google) Cc: Jens Axboe Cc: Mathieu Desnoyers Cc: Steven Rostedt Signed-off-by: Andrew Morton --- kernel/relay.c | 14 ++++++++------ kernel/trace/blktrace.c | 2 +- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 3ee5b038d0d9..fc6ad76b789d 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -249,13 +249,13 @@ EXPORT_SYMBOL_GPL(relay_buf_full); */ static int relay_subbuf_start(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, size_t prev_padding) + void *prev_subbuf) { if (!buf->chan->cb->subbuf_start) return !relay_buf_full(buf); return buf->chan->cb->subbuf_start(buf, subbuf, - prev_subbuf, prev_padding); + prev_subbuf); } /** @@ -301,7 +301,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) for (i = 0; i < buf->chan->n_subbufs; i++) buf->padding[i] = 0; - relay_subbuf_start(buf, buf->data, NULL, 0); + relay_subbuf_start(buf, buf->data, NULL); } /** @@ -554,9 +554,11 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) goto toobig; if (buf->offset != buf->chan->subbuf_size + 1) { - buf->prev_padding = buf->chan->subbuf_size - buf->offset; + size_t prev_padding; + + prev_padding = buf->chan->subbuf_size - buf->offset; old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; - buf->padding[old_subbuf] = buf->prev_padding; + buf->padding[old_subbuf] = prev_padding; buf->subbufs_produced++; if (buf->dentry) d_inode(buf->dentry)->i_size += @@ -581,7 +583,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; new = buf->start + new_subbuf * buf->chan->subbuf_size; buf->offset = 0; - if (!relay_subbuf_start(buf, new, old, buf->prev_padding)) { + if (!relay_subbuf_start(buf, new, old)) { buf->offset = buf->chan->subbuf_size + 1; return 0; } diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 3f6a7bdc6edf..d3083c88474e 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -461,7 +461,7 @@ static const struct file_operations blk_msg_fops = { * the user space app in telling how many lost events there were. */ static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, size_t prev_padding) + void *prev_subbuf) { struct blk_trace *bt; -- cgit v1.2.3 From ca01a90ae7bf9bb22137e719366bdc0f387675c2 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Thu, 12 Jun 2025 14:11:58 +0800 Subject: relayfs: support a counter tracking if per-cpu buffers is full When using relay mechanism, we often encounter the case where new data are lost or old unconsumed data are overwritten because of slow reader. Add 'full' field in per-cpu buffer structure to detect if the above case is happening. Relay has two modes: 1) non-overwrite mode, 2) overwrite mode. So buffer being full here respectively means: 1) relayfs doesn't intend to accept new data and then simply drop them, or 2) relayfs is going to start over again and overwrite old unread data with new data. Note: this counter doesn't need any explicit lock to protect from being modified by different threads for the better performance consideration. 
Writers calling __relay_write/relay_write should consider how to use the lock and ensure it performs under the lock protection, thus it's not necessary to add a new small lock here. Link: https://lkml.kernel.org/r/20250612061201.34272-3-kerneljasonxing@gmail.com Signed-off-by: Jason Xing Reviewed-by: Yushan Zhou Reviewed-by: Jens Axboe Reviewed-by: Masami Hiramatsu (Google) Cc: Mathieu Desnoyers Cc: Steven Rostedt Signed-off-by: Andrew Morton --- kernel/relay.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index fc6ad76b789d..4b07efddc2cf 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -251,8 +251,13 @@ EXPORT_SYMBOL_GPL(relay_buf_full); static int relay_subbuf_start(struct rchan_buf *buf, void *subbuf, void *prev_subbuf) { + int full = relay_buf_full(buf); + + if (full) + buf->stats.full_count++; + if (!buf->chan->cb->subbuf_start) - return !relay_buf_full(buf); + return !full; return buf->chan->cb->subbuf_start(buf, subbuf, prev_subbuf); @@ -297,6 +302,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) buf->finalized = 0; buf->data = buf->start; buf->offset = 0; + buf->stats.full_count = 0; for (i = 0; i < buf->chan->n_subbufs; i++) buf->padding[i] = 0; -- cgit v1.2.3 From a53202ce7fbafd24f854865b02eff891e246c550 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Thu, 12 Jun 2025 14:11:59 +0800 Subject: relayfs: introduce getting relayfs statistics function In this version, only support getting the counter for buffer full and implement the framework of how it works. Users can pass certain flag to fetch what field/statistics they expect to know. Each time it only returns one result. So do not pass multiple flags. Link: https://lkml.kernel.org/r/20250612061201.34272-4-kerneljasonxing@gmail.com Signed-off-by: Jason Xing Reviewed-by: Yushan Zhou Reviewed-by: Masami Hiramatsu (Google) Cc: Jens Axboe Cc: Mathieu Desnoyers Cc: Steven Rostedt Signed-off-by: Andrew Morton --- kernel/relay.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 4b07efddc2cf..2fc27c0e771e 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -700,6 +700,36 @@ void relay_flush(struct rchan *chan) } EXPORT_SYMBOL_GPL(relay_flush); +/** + * relay_stats - get channel buffer statistics + * @chan: the channel + * @flags: select particular information to get + * + * Returns the count of certain field that caller specifies. + */ +size_t relay_stats(struct rchan *chan, int flags) +{ + unsigned int i, count = 0; + struct rchan_buf *rbuf; + + if (!chan || flags > RELAY_STATS_LAST) + return 0; + + if (chan->is_global) { + rbuf = *per_cpu_ptr(chan->buf, 0); + if (flags & RELAY_STATS_BUF_FULL) + count = rbuf->stats.full_count; + } else { + for_each_online_cpu(i) { + rbuf = *per_cpu_ptr(chan->buf, i); + if (rbuf && flags & RELAY_STATS_BUF_FULL) + count += rbuf->stats.full_count; + } + } + + return count; +} + /** * relay_file_open - open file op for relay files * @inode: the inode -- cgit v1.2.3 From 7f2173894f7bfe63bcb241f419b15ed5ce79f0d1 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Thu, 12 Jun 2025 14:12:00 +0800 Subject: blktrace: use rbuf->stats.full as a drop indicator in relayfs Replace internal subbuf_start in blktrace with the default policy in relayfs. Remove dropped field from struct blktrace. Correspondingly, call the common helper in relay. 
By incrementing full_count to keep track of how many times we encountered a full buffer issue, user space will know how many events were lost. Link: https://lkml.kernel.org/r/20250612061201.34272-5-kerneljasonxing@gmail.com Signed-off-by: Jason Xing Reviewed-by: Yushan Zhou Reviewed-by: Jens Axboe Reviewed-by: Masami Hiramatsu (Google) Cc: Mathieu Desnoyers Cc: Steven Rostedt Signed-off-by: Andrew Morton --- kernel/trace/blktrace.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index d3083c88474e..5401b9006135 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -415,9 +415,10 @@ static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, size_t count, loff_t *ppos) { struct blk_trace *bt = filp->private_data; + size_t dropped = relay_stats(bt->rchan, RELAY_STATS_BUF_FULL); char buf[16]; - snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped)); + snprintf(buf, sizeof(buf), "%zu\n", dropped); return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); } @@ -456,23 +457,6 @@ static const struct file_operations blk_msg_fops = { .llseek = noop_llseek, }; -/* - * Keep track of how many times we encountered a full subbuffer, to aid - * the user space app in telling how many lost events there were. - */ -static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf) -{ - struct blk_trace *bt; - - if (!relay_buf_full(buf)) - return 1; - - bt = buf->chan->private_data; - atomic_inc(&bt->dropped); - return 0; -} - static int blk_remove_buf_file_callback(struct dentry *dentry) { debugfs_remove(dentry); @@ -491,7 +475,6 @@ static struct dentry *blk_create_buf_file_callback(const char *filename, } static const struct rchan_callbacks blk_relay_callbacks = { - .subbuf_start = blk_subbuf_start_callback, .create_buf_file = blk_create_buf_file_callback, .remove_buf_file = blk_remove_buf_file_callback, }; @@ -580,7 +563,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, } bt->dev = dev; - atomic_set(&bt->dropped, 0); INIT_LIST_HEAD(&bt->running_list); ret = -EIO; -- cgit v1.2.3 From 19f3cb64a25b80db667a00182785577fae465b3e Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Thu, 12 Jun 2025 14:12:01 +0800 Subject: relayfs: support a counter tracking if data is too big to write It really doesn't matter if the user/admin knows what the last too big value is. Record how many times this case is triggered would be helpful. Solve the existing issue where relay_reset() doesn't restore the value. Store the counter in the per-cpu buffer structure instead of the global buffer structure. It also solves the racy condition which is likely to happen when a few of per-cpu buffers encounter the too big data case and then access the global field last_toobig without lock protection. Remove the printk in relay_close() since kernel module can directly call relay_stats() as they want. 
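As a hedged usage sketch (not part of the patch; the channel pointer 'chan' and the pr_info() text are assumptions), a client module can read both counters added in this series, passing one flag per call as relay_stats() expects:

	size_t full = relay_stats(chan, RELAY_STATS_BUF_FULL);
	size_t big  = relay_stats(chan, RELAY_STATS_WRT_BIG);
	pr_info("relay: %zu full-buffer drops, %zu oversized writes\n", full, big);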
Link: https://lkml.kernel.org/r/20250612061201.34272-6-kerneljasonxing@gmail.com Signed-off-by: Jason Xing Reviewed-by: Yushan Zhou Reviewed-by: Masami Hiramatsu (Google) Cc: Jens Axboe Cc: Mathieu Desnoyers Cc: Steven Rostedt Signed-off-by: Andrew Morton --- kernel/relay.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 2fc27c0e771e..8d915fe98198 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -303,6 +303,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) buf->data = buf->start; buf->offset = 0; buf->stats.full_count = 0; + buf->stats.big_count = 0; for (i = 0; i < buf->chan->n_subbufs; i++) buf->padding[i] = 0; @@ -602,7 +603,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) return length; toobig: - buf->chan->last_toobig = length; + buf->stats.big_count++; return 0; } EXPORT_SYMBOL_GPL(relay_switch_subbuf); @@ -662,11 +663,6 @@ void relay_close(struct rchan *chan) if ((buf = *per_cpu_ptr(chan->buf, i))) relay_close_buf(buf); - if (chan->last_toobig) - printk(KERN_WARNING "relay: one or more items not logged " - "[item size (%zd) > sub-buffer size (%zd)]\n", - chan->last_toobig, chan->subbuf_size); - list_del(&chan->list); kref_put(&chan->kref, relay_destroy_channel); mutex_unlock(&relay_channels_mutex); @@ -719,11 +715,17 @@ size_t relay_stats(struct rchan *chan, int flags) rbuf = *per_cpu_ptr(chan->buf, 0); if (flags & RELAY_STATS_BUF_FULL) count = rbuf->stats.full_count; + else if (flags & RELAY_STATS_WRT_BIG) + count = rbuf->stats.big_count; } else { for_each_online_cpu(i) { rbuf = *per_cpu_ptr(chan->buf, i); - if (rbuf && flags & RELAY_STATS_BUF_FULL) - count += rbuf->stats.full_count; + if (rbuf) { + if (flags & RELAY_STATS_BUF_FULL) + count += rbuf->stats.full_count; + else if (flags & RELAY_STATS_WRT_BIG) + count += rbuf->stats.big_count; + } } } -- cgit v1.2.3 From ad2c8079e9d5637f6d66cb5ce5cf49768ae87658 Mon Sep 17 00:00:00 2001 From: Wei Nanxin Date: Sun, 15 Jun 2025 20:32:37 +0800 Subject: kcov: fix typo in comment of kcov_fault_in_area change '__santizer_cov_trace_pc()' to '__sanitizer_cov_trace_pc()' Link: https://lkml.kernel.org/r/20250615123237.110144-1-n9winx@163.com Signed-off-by: Wei Nanxin Cc: Andrey Konovalov Cc: Macro Elver Signed-off-by: Andrew Morton --- kernel/kcov.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 187ba1b80bda..1d85597057e1 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -552,7 +552,7 @@ static int kcov_get_mode(unsigned long arg) /* * Fault in a lazily-faulted vmalloc area before it can be used by - * __santizer_cov_trace_pc(), to avoid recursion issues if any code on the + * __sanitizer_cov_trace_pc(), to avoid recursion issues if any code on the * vmalloc fault handling path is instrumented. */ static void kcov_fault_in_area(struct kcov *kcov) -- cgit v1.2.3 From d71b90e5ba83b32b4e3980f8c07ba2012ad9378a Mon Sep 17 00:00:00 2001 From: Fushuai Wang Date: Sun, 15 Jun 2025 11:09:30 +0800 Subject: exit: fix misleading comment in forget_original_parent() The commit 482a3767e508 ("exit: reparent: call forget_original_parent() under tasklist_lock") moved the comment from exit_notify() to forget_original_parent(). However, the forget_original_parent() only handles (A), while (B) is handled in kill_orphaned_pgrp(). So remove the unrelated part. 
Link: https://lkml.kernel.org/r/20250615030930.58051-1-wangfushuai@baidu.com Signed-off-by: Fushuai Wang Acked-by: Oleg Nesterov Cc: Andrii Nakryiko Cc: Christian Brauner Cc: Mateusz Guzik Cc: Michal Hocko Cc: Pasha Tatashin Cc: wangfushuai Signed-off-by: Andrew Morton --- kernel/exit.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index bb184a67ac73..f03caf17b214 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -692,12 +692,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, } /* - * This does two things: - * - * A. Make init inherit all the child processes - * B. Check to see if any process groups have become orphaned - * as a result of our exiting, and if they have any stopped - * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) + * Make init inherit all the child processes */ static void forget_original_parent(struct task_struct *father, struct list_head *dead) -- cgit v1.2.3 From aa644c405291a419e92b112e2279c01c410e9a26 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 14 May 2025 12:18:09 +0200 Subject: uprobes: revert ref_ctr_offset in uprobe_unregister error path There's an error path that could lead to an inactive uprobe: 1) uprobe_register succeeds - updates instruction to int3 and changes ref_ctr from 0 to 1 2) uprobe_unregister fails - int3 stays in place, but ref_ctr is changed to 0 (it's not restored to 1 in the fail path) uprobe is leaked 3) another uprobe_register comes and re-uses the leaked uprobe and succeeds - but int3 is already in place, so ref_ctr update is skipped and it stays 0 - uprobe CAN NOT be triggered now 4) uprobe_unregister fails because ref_ctr value is unexpected Fix this by reverting the updated ref_ctr value back to 1 in step 2), which is the case when uprobe_unregister fails (int3 stays in place), but we have already updated the ref_ctr. The new scenario will go as follows: 1) uprobe_register succeeds - updates instruction to int3 and changes ref_ctr from 0 to 1 2) uprobe_unregister fails - int3 stays in place and ref_ctr is reverted to 1.. uprobe is leaked 3) another uprobe_register comes and re-uses the leaked uprobe and succeeds - but int3 is already in place, so ref_ctr update is skipped and it stays 1 - uprobe CAN be triggered now 4) uprobe_unregister succeeds Link: https://lkml.kernel.org/r/20250514101809.2010193-1-jolsa@kernel.org Fixes: 1cc33161a83d ("uprobes: Support SDT markers having reference count (semaphore)") Signed-off-by: Jiri Olsa Acked-by: David Hildenbrand Acked-by: Oleg Nesterov Suggested-by: Oleg Nesterov Cc: Andrii Nakryiko Cc: "Masami Hiramatsu (Google)" Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- kernel/events/uprobes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4c965ba77f9f..84ee7b590861 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -581,8 +581,8 @@ retry: out: /* Revert back reference counter if instruction update failed. */ - if (ret < 0 && is_register && ref_ctr_updated) - update_ref_ctr(uprobe, mm, -1); + if (ret < 0 && ref_ctr_updated) + update_ref_ctr(uprobe, mm, is_register ?
-1 : 1); /* try collapse pmd for compound page */ if (ret > 0) -- cgit v1.2.3 From 64960497ea86a5d09176c296c3616aa7c8668624 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Wed, 18 Jun 2025 15:34:33 +0200 Subject: fork: clean up ifdef logic around stack allocation There is an unneeded OR in the ifdef functions that are used to allocate and free kernel stacks based on direct map or vmap. Adding dynamic stack support would complicate this logic even further. Therefore, clean up by changing the order so OR is no longer needed. Link: https://lkml.kernel.org/r/20250618-fork-fixes-v4-1-2e05a2e1f5fc@linaro.org Signed-off-by: Pasha Tatashin Link: https://lore.kernel.org/20240311164638.2015063-3-pasha.tatashin@soleen.com Signed-off-by: Linus Walleij Cc: Mateusz Guzik Signed-off-by: Andrew Morton --- kernel/fork.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 6616d173307a..bd8c21d64746 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -188,13 +188,7 @@ static inline void free_task_struct(struct task_struct *tsk) kmem_cache_free(task_struct_cachep, tsk); } -/* - * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a - * kmemcache based allocator. - */ -# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) - -# ifdef CONFIG_VMAP_STACK +#ifdef CONFIG_VMAP_STACK /* * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB * flush. Try to minimize the number of calls by caching stacks. @@ -344,7 +338,13 @@ static void free_thread_stack(struct task_struct *tsk) tsk->stack_vm_area = NULL; } -# else /* !CONFIG_VMAP_STACK */ +#else /* !CONFIG_VMAP_STACK */ + +/* + * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a + * kmemcache based allocator. + */ +#if THREAD_SIZE >= PAGE_SIZE static void thread_stack_free_rcu(struct rcu_head *rh) { @@ -376,8 +376,7 @@ static void free_thread_stack(struct task_struct *tsk) tsk->stack = NULL; } -# endif /* CONFIG_VMAP_STACK */ -# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */ +#else /* !(THREAD_SIZE >= PAGE_SIZE) */ static struct kmem_cache *thread_stack_cache; @@ -416,7 +415,8 @@ void thread_stack_cache_init(void) BUG_ON(thread_stack_cache == NULL); } -# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */ +#endif /* THREAD_SIZE >= PAGE_SIZE */ +#endif /* CONFIG_VMAP_STACK */ /* SLAB cache for signal_struct structures (tsk->signal) */ static struct kmem_cache *signal_cachep; -- cgit v1.2.3 From fed307b67c5bbb17b72c54816cd1bce61c23b4d7 Mon Sep 17 00:00:00 2001 From: Jiazi Li Date: Fri, 20 Jun 2025 18:07:56 +0800 Subject: kthread: update comment for __to_kthread With commit 343f4c49f243 ("kthread: Don't allocate kthread_struct for init and umh") and commit 753550eb0ce1 ("fork: Explicitly set PF_KTHREAD"), umh task no longer have struct kthread and PF_KTHREAD flag. Update the comment to describe what the current rules are to detect is something is a kthread. Link: https://lkml.kernel.org/r/20250620100801.23185-1-jqqlijiazi@gmail.com Signed-off-by: Jiazi Li Signed-off-by: mingzhu.wang Suggested-by Eric W . Biederman Reviewed-by: "Eric W. 
Biederman" Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- kernel/kthread.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 85fc068f0083..0e98b228a8ef 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -88,13 +88,12 @@ static inline struct kthread *to_kthread(struct task_struct *k) /* * Variant of to_kthread() that doesn't assume @p is a kthread. * - * Per construction; when: + * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will + * always remain a kthread. For kthreads p->worker_private always + * points to a struct kthread. For tasks that are not kthreads + * p->worker_private is used to point to other things. * - * (p->flags & PF_KTHREAD) && p->worker_private - * - * the task is both a kthread and struct kthread is persistent. However - * PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and - * begin_new_exec()). + * Return NULL for any task that is not a kthread. */ static inline struct kthread *__to_kthread(struct task_struct *p) { -- cgit v1.2.3 From f747cde5e71b1701a107c3a2e223e5b4a6cb4c52 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Wed, 9 Jul 2025 12:31:16 +0000 Subject: PM: sleep: add kernel parameter to disable asynchronous suspend/resume On some platforms, device dependencies are not properly represented by device links, which can cause issues when asynchronous power management is enabled. While it is possible to disable this via sysfs, doing so at runtime can race with the first system suspend event. This patch introduces a kernel command-line parameter, "pm_async", which can be set to "off" to globally disable asynchronous suspend and resume operations from early boot. It effectively provides a way to set the initial value of the existing pm_async sysfs knob at boot time. This offers a robust method to fall back to synchronous (sequential) operation, which can stabilize platforms with problematic dependencies and also serve as a useful debugging tool. The default behavior remains unchanged (asynchronous enabled). To disable it, boot the kernel with the "pm_async=off" parameter. Signed-off-by: Tudor Ambarus Acked-by: Randy Dunlap Link: https://patch.msgid.link/20250709-pm-async-off-v3-1-cb69a6fc8d04@linaro.org Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 3d484630505a..3cf2d7e72567 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -112,6 +113,14 @@ int pm_notifier_call_chain(unsigned long val) /* If set, devices may be suspended and resumed asynchronously. */ int pm_async_enabled = 1; +static int __init pm_async_setup(char *str) +{ + if (!strcmp(str, "off")) + pm_async_enabled = 0; + return 1; +} +__setup("pm_async=", pm_async_setup); + static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { -- cgit v1.2.3 From a6b0465bd2833f3ca9e92d8c9a422b7be6e40fc9 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Mon, 7 Jul 2025 10:20:15 +0200 Subject: irqdomain: Export irq_domain_free_irqs_top() Export irq_domain_free_irqs_top(), making it usable for drivers compiled as modules. Reviewed-by: Michael Kelley Reviewed-by: Thomas Gleixner Signed-off-by: Nam Cao Signed-off-by: David S. 
Miller --- kernel/irq/irqdomain.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index c8b6de09047b..46919e6c9c45 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1561,6 +1561,7 @@ void irq_domain_free_irqs_top(struct irq_domain *domain, unsigned int virq, } irq_domain_free_irqs_common(domain, virq, nr_irqs); } +EXPORT_SYMBOL_GPL(irq_domain_free_irqs_top); static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain, unsigned int irq_base, -- cgit v1.2.3 From 56180dd20c19e5b0fa34822997a9ac66b517e7b3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Jul 2025 13:00:07 +0200 Subject: futex: Use RCU-based per-CPU reference counting instead of rcuref_t The use of rcuref_t for reference counting introduces a performance bottleneck when accessed concurrently by multiple threads during futex operations. Replace rcuref_t with specially crafted per-CPU reference counters. The lifetime logic remains the same. The newly allocated private hash starts in FR_PERCPU state. In this state, each futex operation that requires the private hash uses a per-CPU counter (an unsigned int) for incrementing or decrementing the reference count. When the private hash is about to be replaced, the per-CPU counters are migrated to an atomic_t counter mm_struct::futex_atomic. The migration process: - Waiting for one RCU grace period to ensure all users observe the current private hash. This can be skipped if a grace period elapsed since the private hash was assigned. - futex_private_hash::state is set to FR_ATOMIC, forcing all users to use mm_struct::futex_atomic for reference counting. - After an RCU grace period, all users are guaranteed to be using the atomic counter. The per-CPU counters can now be summed up and added to the atomic_t counter. If the resulting count is zero, the hash can be safely replaced. Otherwise, active users still hold a valid reference. - Once the atomic reference count drops to zero, the next futex operation will switch to the new private hash. call_rcu_hurry() is used to speed up the transition, which otherwise might be delayed with RCU_LAZY. There is nothing wrong with using call_rcu(). The side effects would be that on auto scaling the new hash is used later and the SET_SLOTS prctl() will block longer.
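In outline (a sketch of the scheme described above, mirroring the futex_ref_get() added by this patch rather than new code), the reference fast path selects a backend based on the hash state:

	/* sketch: per-CPU fast path, atomic fallback once migration has started */
	guard(rcu)();
	if (smp_load_acquire(&fph->state) == FR_PERCPU)
		this_cpu_inc(*mm->futex_ref);
	else
		atomic_long_inc_not_zero(&mm->futex_atomic);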
[bigeasy: commit description + mm get/ put_async] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250710110011.384614-3-bigeasy@linutronix.de --- kernel/fork.c | 8 +- kernel/futex/core.c | 243 ++++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 232 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 1ee8eb11f38b..0b885dcbde9a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1046,7 +1046,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_subscriptions_init(mm); init_tlb_flush_pending(mm); - futex_mm_init(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS) mm->pmd_huge_pte = NULL; #endif @@ -1061,6 +1060,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->def_flags = 0; } + if (futex_mm_init(mm)) + goto fail_mm_init; + if (mm_alloc_pgd(mm)) goto fail_nopgd; @@ -1090,6 +1092,8 @@ fail_nocontext: fail_noid: mm_free_pgd(mm); fail_nopgd: + futex_hash_free(mm); +fail_mm_init: free_mm(mm); return NULL; } @@ -1145,7 +1149,7 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); -#ifdef CONFIG_MMU +#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH) static void mmput_async_fn(struct work_struct *work) { struct mm_struct *mm = container_of(work, struct mm_struct, diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 90d53fb0ee9e..1dcb4c8a2585 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -42,7 +42,6 @@ #include #include #include -#include #include #include @@ -65,7 +64,7 @@ static struct { #define futex_queues (__futex_data.queues) struct futex_private_hash { - rcuref_t users; + int state; unsigned int hash_mask; struct rcu_head rcu; void *mm; @@ -129,6 +128,12 @@ static struct futex_hash_bucket * __futex_hash(union futex_key *key, struct futex_private_hash *fph); #ifdef CONFIG_FUTEX_PRIVATE_HASH +static bool futex_ref_get(struct futex_private_hash *fph); +static bool futex_ref_put(struct futex_private_hash *fph); +static bool futex_ref_is_dead(struct futex_private_hash *fph); + +enum { FR_PERCPU = 0, FR_ATOMIC }; + static inline bool futex_key_is_private(union futex_key *key) { /* @@ -142,15 +147,14 @@ bool futex_private_hash_get(struct futex_private_hash *fph) { if (fph->immutable) return true; - return rcuref_get(&fph->users); + return futex_ref_get(fph); } void futex_private_hash_put(struct futex_private_hash *fph) { - /* Ignore return value, last put is verified via rcuref_is_dead() */ if (fph->immutable) return; - if (rcuref_put(&fph->users)) + if (futex_ref_put(fph)) wake_up_var(fph->mm); } @@ -243,14 +247,18 @@ static bool __futex_pivot_hash(struct mm_struct *mm, fph = rcu_dereference_protected(mm->futex_phash, lockdep_is_held(&mm->futex_hash_lock)); if (fph) { - if (!rcuref_is_dead(&fph->users)) { + if (!futex_ref_is_dead(fph)) { mm->futex_phash_new = new; return false; } futex_rehash_private(fph, new); } - rcu_assign_pointer(mm->futex_phash, new); + new->state = FR_PERCPU; + scoped_guard(rcu) { + mm->futex_batches = get_state_synchronize_rcu(); + rcu_assign_pointer(mm->futex_phash, new); + } kvfree_rcu(fph, rcu); return true; } @@ -289,9 +297,7 @@ again: if (!fph) return NULL; - if (fph->immutable) - return fph; - if (rcuref_get(&fph->users)) + if (futex_private_hash_get(fph)) return fph; } futex_pivot_hash(mm); @@ -1527,16 
+1533,219 @@ static void futex_hash_bucket_init(struct futex_hash_bucket *fhb, #define FH_IMMUTABLE 0x02 #ifdef CONFIG_FUTEX_PRIVATE_HASH + +/* + * futex-ref + * + * Heavily inspired by percpu-rwsem/percpu-refcount; not reusing any of that + * code because it just doesn't fit right. + * + * Dual counter, per-cpu / atomic approach like percpu-refcount, except it + * re-initializes the state automatically, such that the fph swizzle is also a + * transition back to per-cpu. + */ + +static void futex_ref_rcu(struct rcu_head *head); + +static void __futex_ref_atomic_begin(struct futex_private_hash *fph) +{ + struct mm_struct *mm = fph->mm; + + /* + * The counter we're about to switch to must have fully switched; + * otherwise it would be impossible for it to have reported success + * from futex_ref_is_dead(). + */ + WARN_ON_ONCE(atomic_long_read(&mm->futex_atomic) != 0); + + /* + * Set the atomic to the bias value such that futex_ref_{get,put}() + * will never observe 0. Will be fixed up in __futex_ref_atomic_end() + * when folding in the percpu count. + */ + atomic_long_set(&mm->futex_atomic, LONG_MAX); + smp_store_release(&fph->state, FR_ATOMIC); + + call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu); +} + +static void __futex_ref_atomic_end(struct futex_private_hash *fph) +{ + struct mm_struct *mm = fph->mm; + unsigned int count = 0; + long ret; + int cpu; + + /* + * Per __futex_ref_atomic_begin() the state of the fph must be ATOMIC + * and per this RCU callback, everybody must now observe this state and + * use the atomic variable. + */ + WARN_ON_ONCE(fph->state != FR_ATOMIC); + + /* + * Therefore the per-cpu counter is now stable, sum and reset. + */ + for_each_possible_cpu(cpu) { + unsigned int *ptr = per_cpu_ptr(mm->futex_ref, cpu); + count += *ptr; + *ptr = 0; + } + + /* + * Re-init for the next cycle. + */ + this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */ + + /* + * Add actual count, subtract bias and initial refcount. + * + * The moment this atomic operation happens, futex_ref_is_dead() can + * become true. + */ + ret = atomic_long_add_return(count - LONG_MAX - 1, &mm->futex_atomic); + if (!ret) + wake_up_var(mm); + + WARN_ON_ONCE(ret < 0); + mmput_async(mm); +} + +static void futex_ref_rcu(struct rcu_head *head) +{ + struct mm_struct *mm = container_of(head, struct mm_struct, futex_rcu); + struct futex_private_hash *fph = rcu_dereference_raw(mm->futex_phash); + + if (fph->state == FR_PERCPU) { + /* + * Per this extra grace-period, everybody must now observe + * fph as the current fph and no previously observed fph's + * are in-flight. + * + * Notably, nobody will now rely on the atomic + * futex_ref_is_dead() state anymore so we can begin the + * migration of the per-cpu counter into the atomic. + */ + __futex_ref_atomic_begin(fph); + return; + } + + __futex_ref_atomic_end(fph); +} + +/* + * Drop the initial refcount and transition to atomics. + */ +static void futex_ref_drop(struct futex_private_hash *fph) +{ + struct mm_struct *mm = fph->mm; + + /* + * Can only transition the current fph; + */ + WARN_ON_ONCE(rcu_dereference_raw(mm->futex_phash) != fph); + /* + * We enqueue at least one RCU callback. Ensure mm stays if the task + * exits before the transition is completed. 
+ */ + mmget(mm); + + /* + * In order to avoid the following scenario: + * + * futex_hash() __futex_pivot_hash() + * guard(rcu); guard(mm->futex_hash_lock); + * fph = mm->futex_phash; + * rcu_assign_pointer(&mm->futex_phash, new); + * futex_hash_allocate() + * futex_ref_drop() + * fph->state = FR_ATOMIC; + * atomic_set(, BIAS); + * + * futex_private_hash_get(fph); // OOPS + * + * Where an old fph (which is FR_ATOMIC) and should fail on + * inc_not_zero, will succeed because a new transition is started and + * the atomic is bias'ed away from 0. + * + * There must be at least one full grace-period between publishing a + * new fph and trying to replace it. + */ + if (poll_state_synchronize_rcu(mm->futex_batches)) { + /* + * There was a grace-period, we can begin now. + */ + __futex_ref_atomic_begin(fph); + return; + } + + call_rcu_hurry(&mm->futex_rcu, futex_ref_rcu); +} + +static bool futex_ref_get(struct futex_private_hash *fph) +{ + struct mm_struct *mm = fph->mm; + + guard(rcu)(); + + if (smp_load_acquire(&fph->state) == FR_PERCPU) { + this_cpu_inc(*mm->futex_ref); + return true; + } + + return atomic_long_inc_not_zero(&mm->futex_atomic); +} + +static bool futex_ref_put(struct futex_private_hash *fph) +{ + struct mm_struct *mm = fph->mm; + + guard(rcu)(); + + if (smp_load_acquire(&fph->state) == FR_PERCPU) { + this_cpu_dec(*mm->futex_ref); + return false; + } + + return atomic_long_dec_and_test(&mm->futex_atomic); +} + +static bool futex_ref_is_dead(struct futex_private_hash *fph) +{ + struct mm_struct *mm = fph->mm; + + guard(rcu)(); + + if (smp_load_acquire(&fph->state) == FR_PERCPU) + return false; + + return atomic_long_read(&mm->futex_atomic) == 0; +} + +int futex_mm_init(struct mm_struct *mm) +{ + mutex_init(&mm->futex_hash_lock); + RCU_INIT_POINTER(mm->futex_phash, NULL); + mm->futex_phash_new = NULL; + /* futex-ref */ + atomic_long_set(&mm->futex_atomic, 0); + mm->futex_batches = get_state_synchronize_rcu(); + mm->futex_ref = alloc_percpu(unsigned int); + if (!mm->futex_ref) + return -ENOMEM; + this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */ + return 0; +} + void futex_hash_free(struct mm_struct *mm) { struct futex_private_hash *fph; + free_percpu(mm->futex_ref); kvfree(mm->futex_phash_new); fph = rcu_dereference_raw(mm->futex_phash); - if (fph) { - WARN_ON_ONCE(rcuref_read(&fph->users) > 1); + if (fph) kvfree(fph); - } } static bool futex_pivot_pending(struct mm_struct *mm) @@ -1549,7 +1758,7 @@ static bool futex_pivot_pending(struct mm_struct *mm) return true; fph = rcu_dereference(mm->futex_phash); - return rcuref_is_dead(&fph->users); + return futex_ref_is_dead(fph); } static bool futex_hash_less(struct futex_private_hash *a, @@ -1598,11 +1807,11 @@ static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) } } - fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); + fph = kvzalloc(struct_size(fph, queues, hash_slots), + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!fph) return -ENOMEM; - rcuref_init(&fph->users, 1); fph->hash_mask = hash_slots ? hash_slots - 1 : 0; fph->custom = custom; fph->immutable = !!(flags & FH_IMMUTABLE); @@ -1645,7 +1854,7 @@ again: * allocated a replacement hash, drop the initial * reference on the existing hash. 
*/ - futex_private_hash_put(cur); } if (new) { -- cgit v1.2.3 From fb3c553da7fa9991f9b1436d91dbb78c7477c86a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 10 Jul 2025 13:00:08 +0200 Subject: futex: Make futex_private_hash_get() static futex_private_hash_get() is not used outside of its compilation unit. Make it static. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250710110011.384614-4-bigeasy@linutronix.de --- kernel/futex/core.c | 2 +- kernel/futex/futex.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 1dcb4c8a2585..1981574a459d 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -143,7 +143,7 @@ static inline bool futex_key_is_private(union futex_key *key) return !(key->both.offset & (FUT_OFF_INODE | FUT_OFF_MMSHARED)); } -bool futex_private_hash_get(struct futex_private_hash *fph) +static bool futex_private_hash_get(struct futex_private_hash *fph) { if (fph->immutable) return true; diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index fcd1617212ee..c74eac572acd 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -228,14 +228,12 @@ extern void futex_hash_get(struct futex_hash_bucket *hb); extern void futex_hash_put(struct futex_hash_bucket *hb); extern struct futex_private_hash *futex_private_hash(void); -extern bool futex_private_hash_get(struct futex_private_hash *fph); extern void futex_private_hash_put(struct futex_private_hash *fph); #else /* !CONFIG_FUTEX_PRIVATE_HASH */ static inline void futex_hash_get(struct futex_hash_bucket *hb) { } static inline void futex_hash_put(struct futex_hash_bucket *hb) { } static inline struct futex_private_hash *futex_private_hash(void) { return NULL; } -static inline bool futex_private_hash_get(void) { return false; } static inline void futex_private_hash_put(struct futex_private_hash *fph) { } #endif -- cgit v1.2.3 From 760e6f7befbab9a84c54457a8ee45313b7b91ee5 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 10 Jul 2025 13:00:09 +0200 Subject: futex: Remove support for IMMUTABLE The FH_FLAG_IMMUTABLE flag was meant to avoid the reference counting on the private hash and so to avoid the performance regression on big machines. With the switch to per-CPU counters this is no longer needed. That flag was never usable on any released kernel. Remove any support for IMMUTABLE while preserving the flags argument and enforcing it to be zero.
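For illustration only (PR_FUTEX_HASH as the top-level prctl() option and the slot count of 16 are assumptions for this example): after this change, a caller resizing the private hash must pass 0 for the flags argument, since any other value is rejected with -EINVAL.

	/* request a private futex hash with 16 slots; flags must now be 0 */
	prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, 16, 0);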
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250710110011.384614-5-bigeasy@linutronix.de --- kernel/futex/core.c | 36 +++--------------------------------- 1 file changed, 3 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 1981574a459d..d9bb5567af0c 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -69,7 +69,6 @@ struct futex_private_hash { struct rcu_head rcu; void *mm; bool custom; - bool immutable; struct futex_hash_bucket queues[]; }; @@ -145,15 +144,11 @@ static inline bool futex_key_is_private(union futex_key *key) static bool futex_private_hash_get(struct futex_private_hash *fph) { - if (fph->immutable) - return true; return futex_ref_get(fph); } void futex_private_hash_put(struct futex_private_hash *fph) { - if (fph->immutable) - return; if (futex_ref_put(fph)) wake_up_var(fph->mm); } @@ -1530,7 +1525,6 @@ static void futex_hash_bucket_init(struct futex_hash_bucket *fhb, } #define FH_CUSTOM 0x01 -#define FH_IMMUTABLE 0x02 #ifdef CONFIG_FUTEX_PRIVATE_HASH @@ -1800,7 +1794,7 @@ static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) */ scoped_guard(rcu) { fph = rcu_dereference(mm->futex_phash); - if (fph && (!fph->hash_mask || fph->immutable)) { + if (fph && !fph->hash_mask) { if (custom) return -EBUSY; return 0; @@ -1814,7 +1808,6 @@ static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) fph->hash_mask = hash_slots ? hash_slots - 1 : 0; fph->custom = custom; - fph->immutable = !!(flags & FH_IMMUTABLE); fph->mm = mm; for (i = 0; i < hash_slots; i++) @@ -1838,7 +1831,7 @@ again: mm->futex_phash_new = NULL; if (fph) { - if (cur && (!cur->hash_mask || cur->immutable)) { + if (cur && !cur->hash_mask) { /* * If two threads simultaneously request the global * hash then the first one performs the switch, @@ -1931,19 +1924,6 @@ static int futex_hash_get_slots(void) return 0; } -static int futex_hash_get_immutable(void) -{ - struct futex_private_hash *fph; - - guard(rcu)(); - fph = rcu_dereference(current->mm->futex_phash); - if (fph && fph->immutable) - return 1; - if (fph && !fph->hash_mask) - return 1; - return 0; -} - #else static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) @@ -1956,10 +1936,6 @@ static int futex_hash_get_slots(void) return 0; } -static int futex_hash_get_immutable(void) -{ - return 0; -} #endif int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4) @@ -1969,10 +1945,8 @@ int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4) switch (arg2) { case PR_FUTEX_HASH_SET_SLOTS: - if (arg4 & ~FH_FLAG_IMMUTABLE) + if (arg4) return -EINVAL; - if (arg4 & FH_FLAG_IMMUTABLE) - flags |= FH_IMMUTABLE; ret = futex_hash_allocate(arg3, flags); break; @@ -1980,10 +1954,6 @@ int futex_hash_prctl(unsigned long arg2, unsigned long arg3, unsigned long arg4) ret = futex_hash_get_slots(); break; - case PR_FUTEX_HASH_GET_IMMUTABLE: - ret = futex_hash_get_immutable(); - break; - default: ret = -EINVAL; break; -- cgit v1.2.3 From 19c24f7ee39af503b9731067b91add627b70ecb6 Mon Sep 17 00:00:00 2001 From: David Kaplan Date: Mon, 7 Jul 2025 13:32:57 -0500 Subject: cpu: Define attack vectors Define 4 new attack vectors that are used for controlling CPU speculation mitigations. These may be individually disabled as part of the mitigations= command line. 
Attack vector controls are combined with global options like 'auto' or 'auto,nosmt' like 'mitigations=auto,no_user_kernel'. The global options come first in the mitigations= string. Cross-thread mitigations can either remain enabled fully, including potentially disabling SMT ('auto,nosmt'), remain enabled except for disabling SMT ('auto'), or entirely disabled through the new 'no_cross_thread' attack vector option. The default settings for these attack vectors are consistent with existing kernel defaults, other than the automatic disabling of VM-based attack vectors if KVM support is not present. Signed-off-by: David Kaplan Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/20250707183316.1349127-3-david.kaplan@amd.com --- kernel/cpu.c | 130 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 119 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index a59e009e0be4..faf0f23fc5d8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -3174,8 +3175,38 @@ void __init boot_cpu_hotplug_init(void) #ifdef CONFIG_CPU_MITIGATIONS /* - * These are used for a global "mitigations=" cmdline option for toggling - * optional CPU mitigations. + * All except the cross-thread attack vector are mitigated by default. + * Cross-thread mitigation often requires disabling SMT which is expensive + * so cross-thread mitigations are only partially enabled by default. + * + * Guest-to-Host and Guest-to-Guest vectors are only needed if KVM support is + * present. + */ +static bool attack_vectors[NR_CPU_ATTACK_VECTORS] __ro_after_init = { + [CPU_MITIGATE_USER_KERNEL] = true, + [CPU_MITIGATE_USER_USER] = true, + [CPU_MITIGATE_GUEST_HOST] = IS_ENABLED(CONFIG_KVM), + [CPU_MITIGATE_GUEST_GUEST] = IS_ENABLED(CONFIG_KVM), +}; + +bool cpu_attack_vector_mitigated(enum cpu_attack_vectors v) +{ + if (v < NR_CPU_ATTACK_VECTORS) + return attack_vectors[v]; + + WARN_ONCE(1, "Invalid attack vector %d\n", v); + return false; +} + +/* + * There are 3 global options, 'off', 'auto', 'auto,nosmt'. These may optionally + * be combined with attack-vector disables which follow them. + * + * Examples: + * mitigations=auto,no_user_kernel,no_user_user,no_cross_thread + * mitigations=auto,nosmt,no_guest_host,no_guest_guest + * + * mitigations=off is equivalent to disabling all attack vectors. 
*/ enum cpu_mitigations { CPU_MITIGATIONS_OFF, @@ -3183,19 +3214,96 @@ enum cpu_mitigations { CPU_MITIGATIONS_AUTO_NOSMT, }; +enum { + NO_USER_KERNEL, + NO_USER_USER, + NO_GUEST_HOST, + NO_GUEST_GUEST, + NO_CROSS_THREAD, + NR_VECTOR_PARAMS, +}; + +enum smt_mitigations smt_mitigations __ro_after_init = SMT_MITIGATIONS_AUTO; static enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO; +static const match_table_t global_mitigations = { + { CPU_MITIGATIONS_AUTO_NOSMT, "auto,nosmt"}, + { CPU_MITIGATIONS_AUTO, "auto"}, + { CPU_MITIGATIONS_OFF, "off"}, +}; + +static const match_table_t vector_mitigations = { + { NO_USER_KERNEL, "no_user_kernel"}, + { NO_USER_USER, "no_user_user"}, + { NO_GUEST_HOST, "no_guest_host"}, + { NO_GUEST_GUEST, "no_guest_guest"}, + { NO_CROSS_THREAD, "no_cross_thread"}, + { NR_VECTOR_PARAMS, NULL}, +}; + +static int __init mitigations_parse_global_opt(char *arg) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(global_mitigations); i++) { + const char *pattern = global_mitigations[i].pattern; + + if (!strncmp(arg, pattern, strlen(pattern))) { + cpu_mitigations = global_mitigations[i].token; + return strlen(pattern); + } + } + + return 0; +} + static int __init mitigations_parse_cmdline(char *arg) { - if (!strcmp(arg, "off")) - cpu_mitigations = CPU_MITIGATIONS_OFF; - else if (!strcmp(arg, "auto")) - cpu_mitigations = CPU_MITIGATIONS_AUTO; - else if (!strcmp(arg, "auto,nosmt")) - cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT; - else - pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n", - arg); + char *s, *p; + int len; + + len = mitigations_parse_global_opt(arg); + + if (cpu_mitigations_off()) { + memset(attack_vectors, 0, sizeof(attack_vectors)); + smt_mitigations = SMT_MITIGATIONS_OFF; + } else if (cpu_mitigations_auto_nosmt()) { + smt_mitigations = SMT_MITIGATIONS_ON; + } + + p = arg + len; + + if (!*p) + return 0; + + /* Attack vector controls may come after the ',' */ + if (*p++ != ',' || !IS_ENABLED(CONFIG_ARCH_HAS_CPU_ATTACK_VECTORS)) { + pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n", arg); + return 0; + } + + while ((s = strsep(&p, ",")) != NULL) { + switch (match_token(s, vector_mitigations, NULL)) { + case NO_USER_KERNEL: + attack_vectors[CPU_MITIGATE_USER_KERNEL] = false; + break; + case NO_USER_USER: + attack_vectors[CPU_MITIGATE_USER_USER] = false; + break; + case NO_GUEST_HOST: + attack_vectors[CPU_MITIGATE_GUEST_HOST] = false; + break; + case NO_GUEST_GUEST: + attack_vectors[CPU_MITIGATE_GUEST_GUEST] = false; + break; + case NO_CROSS_THREAD: + smt_mitigations = SMT_MITIGATIONS_OFF; + break; + default: + pr_crit("Unsupported mitigations options %s\n", s); + return 0; + } + } return 0; } -- cgit v1.2.3 From 8fc3d2d8b5016adf63a3a6d21c189677fa653a4a Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Wed, 9 Jul 2025 15:13:11 -0400 Subject: bpf/arena: add bpf_arena_reserve_pages kfunc Add a new BPF arena kfunc for reserving a range of arena virtual addresses without backing them with pages. This prevents the range from being populated using bpf_arena_alloc_pages(). 
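As a hedged sketch of BPF-program-side usage (the arena map 'arena_map' and the page-aligned address 'addr' are hypothetical, not from the patch), a program could reserve a single page so that later bpf_arena_alloc_pages() calls cannot hand out that range:

	/* reserve one page at addr inside the arena; 0 on success,
	 * -EBUSY if already backed/reserved, -EINVAL if out of range
	 */
	if (bpf_arena_reserve_pages(&arena_map, addr, 1))
		return 0;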
Acked-by: Yonghong Song Signed-off-by: Emil Tsalapatis Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250709191312.29840-2-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/arena.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 0d56cea71602..5b37753799d2 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -550,6 +550,34 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt) } } +/* + * Reserve an arena virtual address range without populating it. This call stops + * bpf_arena_alloc_pages from adding pages to this range. + */ +static int arena_reserve_pages(struct bpf_arena *arena, long uaddr, u32 page_cnt) +{ + long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT; + long pgoff; + int ret; + + if (uaddr & ~PAGE_MASK) + return 0; + + pgoff = compute_pgoff(arena, uaddr); + if (pgoff + page_cnt > page_cnt_max) + return -EINVAL; + + guard(mutex)(&arena->lock); + + /* Cannot guard already allocated pages. */ + ret = is_range_tree_set(&arena->rt, pgoff, page_cnt); + if (ret) + return -EBUSY; + + /* "Allocate" the region to prevent it from being allocated. */ + return range_tree_clear(&arena->rt, pgoff, page_cnt); +} + __bpf_kfunc_start_defs(); __bpf_kfunc void *bpf_arena_alloc_pages(void *p__map, void *addr__ign, u32 page_cnt, @@ -573,11 +601,26 @@ __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt return; arena_free_pages(arena, (long)ptr__ign, page_cnt); } + +__bpf_kfunc int bpf_arena_reserve_pages(void *p__map, void *ptr__ign, u32 page_cnt) +{ + struct bpf_map *map = p__map; + struct bpf_arena *arena = container_of(map, struct bpf_arena, map); + + if (map->map_type != BPF_MAP_TYPE_ARENA) + return -EINVAL; + + if (!page_cnt) + return 0; + + return arena_reserve_pages(arena, (long)ptr__ign, page_cnt); +} __bpf_kfunc_end_defs(); BTF_KFUNCS_START(arena_kfuncs) BTF_ID_FLAGS(func, bpf_arena_alloc_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_RET | KF_ARENA_ARG2) BTF_ID_FLAGS(func, bpf_arena_free_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2) +BTF_ID_FLAGS(func, bpf_arena_reserve_pages, KF_TRUSTED_ARGS | KF_SLEEPABLE | KF_ARENA_ARG2) BTF_KFUNCS_END(arena_kfuncs) static const struct btf_kfunc_id_set common_kfunc_set = { -- cgit v1.2.3 From 6279846b9b2532e1b04559ef8bd0dec049f29383 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Thu, 10 Jul 2025 20:20:53 +0200 Subject: bpf: Forget ranges when refining tnum after JSET Syzbot reported a kernel warning due to a range invariant violation on the following BPF program. 0: call bpf_get_netns_cookie 1: if r0 == 0 goto 2: if r0 & Oxffffffff goto The issue is on the path where we fall through both jumps. That path is unreachable at runtime: after insn 1, we know r0 != 0, but with the sign extension on the jset, we would only fallthrough insn 2 if r0 == 0. Unfortunately, is_branch_taken() isn't currently able to figure this out, so the verifier walks all branches. The verifier then refines the register bounds using the second condition and we end up with inconsistent bounds on this unreachable path: 1: if r0 == 0 goto r0: u64=[0x1, 0xffffffffffffffff] var_off=(0, 0xffffffffffffffff) 2: if r0 & 0xffffffff goto r0 before reg_bounds_sync: u64=[0x1, 0xffffffffffffffff] var_off=(0, 0) r0 after reg_bounds_sync: u64=[0x1, 0] var_off=(0, 0) Improving the range refinement for JSET to cover all cases is tricky. 
We also don't expect many users to rely on JSET given LLVM doesn't generate those instructions. So instead of improving the range refinement for JSETs, Eduard suggested we forget the ranges whenever we're narrowing tnums after a JSET. This patch implements that approach. Reported-by: syzbot+c711ce17dd78e5d4fdcf@syzkaller.appspotmail.com Suggested-by: Eduard Zingerman Acked-by: Yonghong Song Acked-by: Eduard Zingerman Signed-off-by: Paul Chaignon Link: https://lore.kernel.org/r/9d4fd6432a095d281f815770608fdcd16028ce0b.1752171365.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 53007182b46b..e2fcea860755 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -16208,6 +16208,10 @@ static void regs_refine_cond_op(struct bpf_reg_state *reg1, struct bpf_reg_state if (!is_reg_const(reg2, is_jmp32)) break; val = reg_const_value(reg2, is_jmp32); + /* Forget the ranges before narrowing tnums, to avoid invariant + * violations if we're on a dead branch. + */ + __mark_reg_unbounded(reg1); if (is_jmp32) { t = tnum_and(tnum_subreg(reg1->var_off), tnum_const(~val)); reg1->var_off = tnum_with_subreg(reg1->var_off, t); -- cgit v1.2.3 From b725441f02c2b31c04a95d0e9ca5420fa029a767 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Thu, 10 Jul 2025 11:20:32 +0800 Subject: bpf: Add attach_type field to bpf_link Attach_type will be set when a link is created by user. It is better to record attach_type in bpf_link generically and have it available universally for all link types. So add the attach_type field in bpf_link and move the sleepable field to avoid unnecessary gap padding. Signed-off-by: Tao Chen Signed-off-by: Andrii Nakryiko Acked-by: Daniel Borkmann Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20250710032038.888700-2-chen.dylane@linux.dev --- kernel/bpf/bpf_iter.c | 3 ++- kernel/bpf/bpf_struct_ops.c | 5 +++-- kernel/bpf/cgroup.c | 4 ++-- kernel/bpf/net_namespace.c | 2 +- kernel/bpf/syscall.c | 35 ++++++++++++++++++++++------------- kernel/bpf/tcx.c | 3 ++- kernel/bpf/trampoline.c | 10 ++++++---- kernel/trace/bpf_trace.c | 4 ++-- 8 files changed, 40 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 303ab1f42d3a..0cbcae727079 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -552,7 +552,8 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, if (!link) return -ENOMEM; - bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog, + attr->link_create.attach_type); link->tinfo = tinfo; err = bpf_link_prime(&link->link, &link_primer); diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 96113633e391..687a3e9c76f5 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -808,7 +808,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, goto reset_unlock; } bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, - &bpf_struct_ops_link_lops, prog); + &bpf_struct_ops_link_lops, prog, prog->expected_attach_type); *plink++ = &link->link; ksym = kzalloc(sizeof(*ksym), GFP_USER); @@ -1351,7 +1351,8 @@ int bpf_struct_ops_link_create(union bpf_attr *attr) err = -ENOMEM; goto err_out; } - bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL); + 
bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL, + attr->link_create.attach_type); err = bpf_link_prime(&link->link, &link_primer); if (err) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index cd220e861d67..bacdd0ca7419 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -867,7 +867,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, cgrp->bpf.flags[atype] = saved_flags; if (type == BPF_LSM_CGROUP) { - err = bpf_trampoline_link_cgroup_shim(new_prog, atype); + err = bpf_trampoline_link_cgroup_shim(new_prog, atype, type); if (err) goto cleanup; } @@ -1495,7 +1495,7 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) goto out_put_cgroup; } bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops, - prog); + prog, attr->link_create.attach_type); link->cgroup = cgrp; link->type = attr->link_create.attach_type; diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 868cc2c43899..63702c862757 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -501,7 +501,7 @@ int netns_bpf_link_create(const union bpf_attr *attr, struct bpf_prog *prog) goto out_put_net; } bpf_link_init(&net_link->link, BPF_LINK_TYPE_NETNS, - &bpf_netns_link_ops, prog); + &bpf_netns_link_ops, prog, type); net_link->net = net; net_link->type = type; net_link->netns_type = netns_type; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3f36bfe13266..cd7321fe0ba3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3069,7 +3069,7 @@ static int bpf_obj_get(const union bpf_attr *attr) */ void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog, - bool sleepable) + enum bpf_attach_type attach_type, bool sleepable) { WARN_ON(ops->dealloc && ops->dealloc_deferred); atomic64_set(&link->refcnt, 1); @@ -3078,12 +3078,14 @@ void bpf_link_init_sleepable(struct bpf_link *link, enum bpf_link_type type, link->id = 0; link->ops = ops; link->prog = prog; + link->attach_type = attach_type; } void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, - const struct bpf_link_ops *ops, struct bpf_prog *prog) + const struct bpf_link_ops *ops, struct bpf_prog *prog, + enum bpf_attach_type attach_type) { - bpf_link_init_sleepable(link, type, ops, prog, false); + bpf_link_init_sleepable(link, type, ops, prog, attach_type, false); } static void bpf_link_free_id(int id) @@ -3443,7 +3445,8 @@ static const struct bpf_link_ops bpf_tracing_link_lops = { static int bpf_tracing_prog_attach(struct bpf_prog *prog, int tgt_prog_fd, u32 btf_id, - u64 bpf_cookie) + u64 bpf_cookie, + enum bpf_attach_type attach_type) { struct bpf_link_primer link_primer; struct bpf_prog *tgt_prog = NULL; @@ -3511,7 +3514,8 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, goto out_put_prog; } bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING, - &bpf_tracing_link_lops, prog); + &bpf_tracing_link_lops, prog, attach_type); + link->attach_type = prog->expected_attach_type; link->link.cookie = bpf_cookie; @@ -4049,7 +4053,8 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro err = -ENOMEM; goto out_put_file; } - bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog); + bpf_link_init(&link->link, BPF_LINK_TYPE_PERF_EVENT, &bpf_perf_link_lops, prog, + attr->link_create.attach_type); link->perf_file = perf_file; err = bpf_link_prime(&link->link, &link_primer); @@ -4081,7 
+4086,8 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro #endif /* CONFIG_PERF_EVENTS */ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, - const char __user *user_tp_name, u64 cookie) + const char __user *user_tp_name, u64 cookie, + enum bpf_attach_type attach_type) { struct bpf_link_primer link_primer; struct bpf_raw_tp_link *link; @@ -4104,7 +4110,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, tp_name = prog->aux->attach_func_name; break; } - return bpf_tracing_prog_attach(prog, 0, 0, 0); + return bpf_tracing_prog_attach(prog, 0, 0, 0, attach_type); case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: if (strncpy_from_user(buf, user_tp_name, sizeof(buf) - 1) < 0) @@ -4126,7 +4132,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog, goto out_put_btp; } bpf_link_init_sleepable(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT, - &bpf_raw_tp_link_lops, prog, + &bpf_raw_tp_link_lops, prog, attach_type, tracepoint_is_faultable(btp->tp)); link->btp = btp; link->cookie = cookie; @@ -4168,7 +4174,7 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) tp_name = u64_to_user_ptr(attr->raw_tracepoint.name); cookie = attr->raw_tracepoint.cookie; - fd = bpf_raw_tp_link_attach(prog, tp_name, cookie); + fd = bpf_raw_tp_link_attach(prog, tp_name, cookie, prog->expected_attach_type); if (fd < 0) bpf_prog_put(prog); return fd; @@ -5525,7 +5531,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd, attr->link_create.target_btf_id, - attr->link_create.tracing.cookie); + attr->link_create.tracing.cookie, + attr->link_create.attach_type); break; case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_TRACING: @@ -5534,7 +5541,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) goto out; } if (prog->expected_attach_type == BPF_TRACE_RAW_TP) - ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie); + ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie, + attr->link_create.attach_type); else if (prog->expected_attach_type == BPF_TRACE_ITER) ret = bpf_iter_link_attach(attr, uattr, prog); else if (prog->expected_attach_type == BPF_LSM_CGROUP) @@ -5543,7 +5551,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = bpf_tracing_prog_attach(prog, attr->link_create.target_fd, attr->link_create.target_btf_id, - attr->link_create.tracing.cookie); + attr->link_create.tracing.cookie, + attr->link_create.attach_type); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: case BPF_PROG_TYPE_SK_LOOKUP: diff --git a/kernel/bpf/tcx.c b/kernel/bpf/tcx.c index 2e4885e7781f..e6a14f408d94 100644 --- a/kernel/bpf/tcx.c +++ b/kernel/bpf/tcx.c @@ -301,7 +301,8 @@ static int tcx_link_init(struct tcx_link *tcx, struct net_device *dev, struct bpf_prog *prog) { - bpf_link_init(&tcx->link, BPF_LINK_TYPE_TCX, &tcx_link_lops, prog); + bpf_link_init(&tcx->link, BPF_LINK_TYPE_TCX, &tcx_link_lops, prog, + attr->link_create.attach_type); tcx->location = attr->link_create.attach_type; tcx->dev = dev; return bpf_link_prime(&tcx->link, link_primer); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index b1e358c16eeb..0e364614c3a2 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -674,7 +674,8 @@ static const struct bpf_link_ops bpf_shim_tramp_link_lops = { static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog, bpf_func_t bpf_func, - int cgroup_atype) + int cgroup_atype, 
+ enum bpf_attach_type attach_type) { struct bpf_shim_tramp_link *shim_link = NULL; struct bpf_prog *p; @@ -701,7 +702,7 @@ static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog p->expected_attach_type = BPF_LSM_MAC; bpf_prog_inc(p); bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC, - &bpf_shim_tramp_link_lops, p); + &bpf_shim_tramp_link_lops, p, attach_type); bpf_cgroup_atype_get(p->aux->attach_btf_id, cgroup_atype); return shim_link; @@ -726,7 +727,8 @@ static struct bpf_shim_tramp_link *cgroup_shim_find(struct bpf_trampoline *tr, } int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, - int cgroup_atype) + int cgroup_atype, + enum bpf_attach_type attach_type) { struct bpf_shim_tramp_link *shim_link = NULL; struct bpf_attach_target_info tgt_info = {}; @@ -763,7 +765,7 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, /* Allocate and install new shim. */ - shim_link = cgroup_shim_alloc(prog, bpf_func, cgroup_atype); + shim_link = cgroup_shim_alloc(prog, bpf_func, cgroup_atype, attach_type); if (!shim_link) { err = -ENOMEM; goto err; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e7f97a9a8bbd..ffdde840abb8 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2986,7 +2986,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr } bpf_link_init(&link->link, BPF_LINK_TYPE_KPROBE_MULTI, - &bpf_kprobe_multi_link_lops, prog); + &bpf_kprobe_multi_link_lops, prog, attr->link_create.attach_type); err = bpf_link_prime(&link->link, &link_primer); if (err) @@ -3441,7 +3441,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr link->link.flags = flags; bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI, - &bpf_uprobe_multi_link_lops, prog); + &bpf_uprobe_multi_link_lops, prog, attr->link_create.attach_type); for (i = 0; i < cnt; i++) { uprobes[i].uprobe = uprobe_register(d_real_inode(link->path.dentry), -- cgit v1.2.3 From 9b8d543dc2bbf5d3a1e2d60049df94ae2bc68b28 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Thu, 10 Jul 2025 11:20:33 +0800 Subject: bpf: Remove attach_type in bpf_cgroup_link Use attach_type in bpf_link, and remove it in bpf_cgroup_link. 
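All of the link types converted in this series follow the same shape: each per-type link struct embeds the generic struct bpf_link, which now carries attach_type, so generic code reads link->attach_type while type-specific code recovers its own struct with container_of(). A minimal userspace sketch of that pattern (the struct layouts here are illustrative stand-ins, not the kernel definitions):

    #include <stddef.h>
    #include <stdio.h>

    enum bpf_attach_type { BPF_CGROUP_INET_INGRESS = 1 };

    struct bpf_link {
            enum bpf_attach_type attach_type;   /* now stored generically */
    };

    struct bpf_cgroup_link {
            struct bpf_link link;               /* embedded base; no private copy */
            int cgroup_id;
    };

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    int main(void)
    {
            struct bpf_cgroup_link cg = {
                    .link.attach_type = BPF_CGROUP_INET_INGRESS,
                    .cgroup_id = 42,
            };
            struct bpf_link *link = &cg.link;
            /* Generic code reads the shared field; type-specific code
             * downcasts with container_of(). */
            struct bpf_cgroup_link *back =
                    container_of(link, struct bpf_cgroup_link, link);

            printf("attach_type=%d cgroup=%d\n",
                   link->attach_type, back->cgroup_id);
            return 0;
    }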
Signed-off-by: Tao Chen Signed-off-by: Andrii Nakryiko Acked-by: Daniel Borkmann Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20250710032038.888700-3-chen.dylane@linux.dev --- kernel/bpf/cgroup.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index bacdd0ca7419..72c8b50dca0a 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -984,7 +984,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp, struct hlist_head *progs; bool found = false; - atype = bpf_cgroup_atype_find(link->type, new_prog->aux->attach_btf_id); + atype = bpf_cgroup_atype_find(link->link.attach_type, new_prog->aux->attach_btf_id); if (atype < 0) return -EINVAL; @@ -1396,8 +1396,8 @@ static void bpf_cgroup_link_release(struct bpf_link *link) } WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, - cg_link->type, 0)); - if (cg_link->type == BPF_LSM_CGROUP) + link->attach_type, 0)); + if (link->attach_type == BPF_LSM_CGROUP) bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog); cg = cg_link->cgroup; @@ -1439,7 +1439,7 @@ static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link, "cgroup_id:\t%llu\n" "attach_type:\t%d\n", cg_id, - cg_link->type); + link->attach_type); } static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link, @@ -1455,7 +1455,7 @@ static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link, cgroup_unlock(); info->cgroup.cgroup_id = cg_id; - info->cgroup.attach_type = cg_link->type; + info->cgroup.attach_type = link->attach_type; return 0; } @@ -1497,7 +1497,6 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops, prog, attr->link_create.attach_type); link->cgroup = cgrp; - link->type = attr->link_create.attach_type; err = bpf_link_prime(&link->link, &link_primer); if (err) { @@ -1506,7 +1505,7 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) err = cgroup_bpf_attach(cgrp, NULL, NULL, link, - link->type, BPF_F_ALLOW_MULTI | attr->link_create.flags, + link->link.attach_type, BPF_F_ALLOW_MULTI | attr->link_create.flags, attr->link_create.cgroup.relative_fd, attr->link_create.cgroup.expected_revision); if (err) { -- cgit v1.2.3 From 6e816e1c052b453a93aeb8b57ede9acde58c458d Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Thu, 10 Jul 2025 11:20:35 +0800 Subject: bpf: Remove location field in tcx_link Use attach_type in bpf_link to replace the location field, and remove the location field from tcx_link.
Signed-off-by: Tao Chen Signed-off-by: Andrii Nakryiko Acked-by: Daniel Borkmann Acked-by: Jiri Olsa Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20250710032038.888700-5-chen.dylane@linux.dev --- kernel/bpf/tcx.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/tcx.c b/kernel/bpf/tcx.c index e6a14f408d94..efd987ea6872 100644 --- a/kernel/bpf/tcx.c +++ b/kernel/bpf/tcx.c @@ -142,7 +142,7 @@ static int tcx_link_prog_attach(struct bpf_link *link, u32 flags, u32 id_or_fd, u64 revision) { struct tcx_link *tcx = tcx_link(link); - bool created, ingress = tcx->location == BPF_TCX_INGRESS; + bool created, ingress = link->attach_type == BPF_TCX_INGRESS; struct bpf_mprog_entry *entry, *entry_new; struct net_device *dev = tcx->dev; int ret; @@ -169,7 +169,7 @@ static int tcx_link_prog_attach(struct bpf_link *link, u32 flags, u32 id_or_fd, static void tcx_link_release(struct bpf_link *link) { struct tcx_link *tcx = tcx_link(link); - bool ingress = tcx->location == BPF_TCX_INGRESS; + bool ingress = link->attach_type == BPF_TCX_INGRESS; struct bpf_mprog_entry *entry, *entry_new; struct net_device *dev; int ret = 0; @@ -204,7 +204,7 @@ static int tcx_link_update(struct bpf_link *link, struct bpf_prog *nprog, struct bpf_prog *oprog) { struct tcx_link *tcx = tcx_link(link); - bool ingress = tcx->location == BPF_TCX_INGRESS; + bool ingress = link->attach_type == BPF_TCX_INGRESS; struct bpf_mprog_entry *entry, *entry_new; struct net_device *dev; int ret = 0; @@ -260,8 +260,8 @@ static void tcx_link_fdinfo(const struct bpf_link *link, struct seq_file *seq) seq_printf(seq, "ifindex:\t%u\n", ifindex); seq_printf(seq, "attach_type:\t%u (%s)\n", - tcx->location, - tcx->location == BPF_TCX_INGRESS ? "ingress" : "egress"); + link->attach_type, + link->attach_type == BPF_TCX_INGRESS ? "ingress" : "egress"); } static int tcx_link_fill_info(const struct bpf_link *link, @@ -276,7 +276,7 @@ static int tcx_link_fill_info(const struct bpf_link *link, rtnl_unlock(); info->tcx.ifindex = ifindex; - info->tcx.attach_type = tcx->location; + info->tcx.attach_type = link->attach_type; return 0; } @@ -303,7 +303,6 @@ static int tcx_link_init(struct tcx_link *tcx, { bpf_link_init(&tcx->link, BPF_LINK_TYPE_TCX, &tcx_link_lops, prog, attr->link_create.attach_type); - tcx->location = attr->link_create.attach_type; tcx->dev = dev; return bpf_link_prime(&tcx->link, link_primer); } -- cgit v1.2.3 From 2a76a80c7ffc9894b90126af7b17584195b40b7a Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Thu, 10 Jul 2025 11:20:36 +0800 Subject: bpf: Remove attach_type in bpf_netns_link Use attach_type in bpf_link, and remove it in bpf_netns_link. And move netns_type field to the end to fill the byte hole. Signed-off-by: Tao Chen Signed-off-by: Andrii Nakryiko Reviewed-by: Jakub Sitnicki Acked-by: Jiri Olsa Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20250710032038.888700-6-chen.dylane@linux.dev --- kernel/bpf/net_namespace.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 63702c862757..8e88201c98bf 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -11,8 +11,6 @@ struct bpf_netns_link { struct bpf_link link; - enum bpf_attach_type type; - enum netns_bpf_attach_type netns_type; /* We don't hold a ref to net in order to auto-detach the link * when netns is going away. 
Instead we rely on pernet @@ -21,6 +19,7 @@ struct bpf_netns_link { */ struct net *net; struct list_head node; /* node in list of links attached to net */ + enum netns_bpf_attach_type netns_type; }; /* Protects updates to netns_bpf */ @@ -216,7 +215,7 @@ static int bpf_netns_link_fill_info(const struct bpf_link *link, mutex_unlock(&netns_bpf_mutex); info->netns.netns_ino = inum; - info->netns.attach_type = net_link->type; + info->netns.attach_type = link->attach_type; return 0; } @@ -230,7 +229,7 @@ static void bpf_netns_link_show_fdinfo(const struct bpf_link *link, "netns_ino:\t%u\n" "attach_type:\t%u\n", info.netns.netns_ino, - info.netns.attach_type); + link->attach_type); } static const struct bpf_link_ops bpf_netns_link_ops = { @@ -503,7 +502,6 @@ int netns_bpf_link_create(const union bpf_attr *attr, struct bpf_prog *prog) bpf_link_init(&net_link->link, BPF_LINK_TYPE_NETNS, &bpf_netns_link_ops, prog, type); net_link->net = net; - net_link->type = type; net_link->netns_type = netns_type; err = bpf_link_prime(&net_link->link, &link_primer); -- cgit v1.2.3 From 0eeeebdcc5feeec48118f7a3df2ac818e694ccc7 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Thu, 10 Jul 2025 11:20:37 +0800 Subject: bpf: Remove attach_type in bpf_tracing_link Use attach_type in bpf_link, and remove it in bpf_tracing_link. Signed-off-by: Tao Chen Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20250710032038.888700-7-chen.dylane@linux.dev --- kernel/bpf/syscall.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cd7321fe0ba3..1a26d17536be 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3414,7 +3414,7 @@ static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link, "target_obj_id:\t%u\n" "target_btf_id:\t%u\n" "cookie:\t%llu\n", - tr_link->attach_type, + link->attach_type, target_obj_id, target_btf_id, tr_link->link.cookie); @@ -3426,7 +3426,7 @@ static int bpf_tracing_link_fill_link_info(const struct bpf_link *link, struct bpf_tracing_link *tr_link = container_of(link, struct bpf_tracing_link, link.link); - info->tracing.attach_type = tr_link->attach_type; + info->tracing.attach_type = link->attach_type; info->tracing.cookie = tr_link->link.cookie; bpf_trampoline_unpack_key(tr_link->trampoline->key, &info->tracing.target_obj_id, @@ -3516,7 +3516,6 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, bpf_link_init(&link->link.link, BPF_LINK_TYPE_TRACING, &bpf_tracing_link_lops, prog, attach_type); - link->attach_type = prog->expected_attach_type; link->link.cookie = bpf_cookie; mutex_lock(&prog->aux->dst_mutex); -- cgit v1.2.3 From 9a425da913cf4ea23040334301fce87be2dab384 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Fri, 11 Jul 2025 13:20:43 +0200 Subject: panic: Fix up description of vpanic() The description above vpanic() has the wrong function name. Fix it up. 
Cc: John Ogness Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Gabriele Monaco Cc: Josh Poimboeuf Cc: Petr Mladek Link: https://lore.kernel.org/23a7e8add6546b155371b7e0fbb37bb1def13d6e.1752232374.git.namcao@linutronix.de Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/lkml/20250711183802.2d8c124d@canb.auug.org.au/ Acked-by: Peter Zijlstra (Intel) Signed-off-by: Nam Cao Signed-off-by: Steven Rostedt (Google) --- kernel/panic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 6a1823c383d0..2a499facde13 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -307,7 +307,7 @@ static void panic_other_cpus_shutdown(bool crash_kexec) } /** - * panic - halt the system + * vpanic - halt the system * @fmt: The text string to print * @args: Arguments for the format string * -- cgit v1.2.3 From f84a15b90d96f3da99f67fea2e116850d99fb7c4 Mon Sep 17 00:00:00 2001 From: Jinliang Zheng Date: Tue, 10 Jun 2025 21:01:58 +0800 Subject: locking/rwsem: Use OWNER_NONSPINNABLE directly instead of OWNER_SPINNABLE After commit 7d43f1ce9dd0 ("locking/rwsem: Enable time-based spinning on reader-owned rwsem"), OWNER_SPINNABLE contains all possible values except OWNER_NONSPINNABLE, namely OWNER_NULL | OWNER_WRITER | OWNER_READER. Therefore, it is better to use OWNER_NONSPINNABLE directly to determine whether to exit the optimistic spin. Also remove the now-useless OWNER_SPINNABLE definition to simplify the code. Signed-off-by: Jinliang Zheng Acked-by: Waiman Long Signed-off-by: Boqun Feng Link: https://lore.kernel.org/r/20250610130158.4876-1-alexjlzheng@tencent.com --- kernel/locking/rwsem.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 2ddb827e3bea..8572dba95af4 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -727,8 +727,6 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) return ret; } -#define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER) - static inline enum owner_state rwsem_owner_state(struct task_struct *owner, unsigned long flags) { @@ -835,7 +833,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) enum owner_state owner_state; owner_state = rwsem_spin_on_owner(sem); - if (!(owner_state & OWNER_SPINNABLE)) + if (owner_state == OWNER_NONSPINNABLE) break; /* -- cgit v1.2.3 From 6b233784b198e0d6dbfd526341b6ec51ffd30020 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Jun 2025 15:03:46 +0200 Subject: mm, madvise: extract mm code from prctl_set_vma() to mm/madvise.c Setting anon_name is done via madvise_set_anon_name() and behaves a lot like other madvise operations. However, apparently because madvise() lacked a 4th argument while prctl() did not, the userspace entry point has been implemented via prctl(PR_SET_VMA, ...) and handled first by prctl_set_vma(). Currently prctl_set_vma() lives in kernel/sys.c, but setting vma->anon_name is mm-specific code, so extract it to a new set_anon_vma_name() function under mm. mm/madvise.c seems to be the most straightforward place, as that's where madvise_set_anon_name() lives. Stop declaring the latter in mm.h and instead declare set_anon_vma_name().
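The mm/ half of the move is outside the (limited to 'kernel') diffstat, so the new helper never appears below. Reassembled from the code removed from prctl_set_vma(), set_anon_vma_name() in mm/madvise.c presumably looks roughly like this (ANON_VMA_NAME_MAX_LEN and is_valid_name_char() move along with it):

    int set_anon_vma_name(unsigned long addr, unsigned long size,
                          const char __user *uname)
    {
            struct anon_vma_name *anon_name = NULL;
            struct mm_struct *mm = current->mm;
            int error;

            if (uname) {
                    char *name, *pch;

                    name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN);
                    if (IS_ERR(name))
                            return PTR_ERR(name);

                    for (pch = name; *pch != '\0'; pch++) {
                            if (!is_valid_name_char(*pch)) {
                                    kfree(name);
                                    return -EINVAL;
                            }
                    }
                    /* anon_vma has its own copy */
                    anon_name = anon_vma_name_alloc(name);
                    kfree(name);
                    if (!anon_name)
                            return -ENOMEM;
            }

            mmap_write_lock(mm);
            error = madvise_set_anon_name(mm, addr, size, anon_name);
            mmap_write_unlock(mm);
            anon_vma_name_put(anon_name);

            return error;
    }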
Link: https://lkml.kernel.org/r/20250624-anon_name_cleanup-v2-2-600075462a11@suse.cz Signed-off-by: Vlastimil Babka Acked-by: David Hildenbrand Tested-by: Lorenzo Stoakes Reviewed-by: Suren Baghdasaryan Reviewed-by: Lorenzo Stoakes Cc: Colin Cross Cc: Jann Horn Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: SeongJae Park Signed-off-by: Andrew Morton --- kernel/sys.c | 50 +------------------------------------------------- 1 file changed, 1 insertion(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index adc0de0aa364..b153fb345ada 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2343,54 +2343,14 @@ int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned long st #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) -#ifdef CONFIG_ANON_VMA_NAME - -#define ANON_VMA_NAME_MAX_LEN 80 -#define ANON_VMA_NAME_INVALID_CHARS "\\`$[]" - -static inline bool is_valid_name_char(char ch) -{ - /* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */ - return ch > 0x1f && ch < 0x7f && - !strchr(ANON_VMA_NAME_INVALID_CHARS, ch); -} - static int prctl_set_vma(unsigned long opt, unsigned long addr, unsigned long size, unsigned long arg) { - struct mm_struct *mm = current->mm; - const char __user *uname; - struct anon_vma_name *anon_name = NULL; int error; switch (opt) { case PR_SET_VMA_ANON_NAME: - uname = (const char __user *)arg; - if (uname) { - char *name, *pch; - - name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN); - if (IS_ERR(name)) - return PTR_ERR(name); - - for (pch = name; *pch != '\0'; pch++) { - if (!is_valid_name_char(*pch)) { - kfree(name); - return -EINVAL; - } - } - /* anon_vma has its own copy */ - anon_name = anon_vma_name_alloc(name); - kfree(name); - if (!anon_name) - return -ENOMEM; - - } - - mmap_write_lock(mm); - error = madvise_set_anon_name(mm, addr, size, anon_name); - mmap_write_unlock(mm); - anon_vma_name_put(anon_name); + error = set_anon_vma_name(addr, size, (const char __user *)arg); break; default: error = -EINVAL; @@ -2399,14 +2359,6 @@ static int prctl_set_vma(unsigned long opt, unsigned long addr, return error; } -#else /* CONFIG_ANON_VMA_NAME */ -static int prctl_set_vma(unsigned long opt, unsigned long start, - unsigned long size, unsigned long arg) -{ - return -EINVAL; -} -#endif /* CONFIG_ANON_VMA_NAME */ - static inline unsigned long get_current_mdwe(void) { unsigned long ret = 0; -- cgit v1.2.3 From 8e1bf051c524c6e95194cebf64a839285301d735 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Mon, 16 Jun 2025 15:51:51 +0200 Subject: kernel,cpuset: use node-notifier instead of memory-notifier cpuset is only concerned when a numa node changes its memory state, as it needs to know the current numa nodes with memory to keep an updated mems_allowed mask. So stop using the memory notifier and use the new numa node notifier instead.
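The registration helper itself comes from the node-notifier patches earlier in the series, not from this hunk. By analogy with hotplug_memory_notifier() in include/linux/memory.h, it plausibly wraps a notifier_block like this (register_node_notifier() and the exact macro shape are assumptions here, not quoted kernel code):

    #define hotplug_node_notifier(fn, pri)                          \
    ({                                                              \
            static struct notifier_block fn##_node_nb =             \
                    { .notifier_call = fn, .priority = pri };       \
            register_node_notifier(&fn##_node_nb);                  \
    })

The practical effect for cpuset is that the callback now fires on node memory-state transitions rather than on every memory block hotplug event.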
Link: https://lkml.kernel.org/r/20250616135158.450136-9-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: Jonathan Cameron Reviewed-by: Harry Yoo Reviewed-by: Vlastimil Babka Acked-by: David Hildenbrand Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Rakie Kim Signed-off-by: Andrew Morton --- kernel/cgroup/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 3bc4301466f3..f74d04429a29 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -4051,7 +4051,7 @@ void __init cpuset_init_smp(void) cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); top_cpuset.effective_mems = node_states[N_MEMORY]; - hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI); + hotplug_node_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI); cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); BUG_ON(!cpuset_migrate_mm_wq); -- cgit v1.2.3 From 1bc3587a88d291a37dab12d6c14aa7da53304251 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 16 Jun 2025 22:11:11 -0400 Subject: mm/page_alloc: add support for initializing pageblock as isolated MIGRATE_ISOLATE is a standalone bit, so a pageblock cannot be initialized to just MIGRATE_ISOLATE. Add init_pageblock_migratetype() to allow initializing a pageblock with a migratetype and isolated. Link: https://lkml.kernel.org/r/20250617021115.2331563-4-ziy@nvidia.com Signed-off-by: Zi Yan Reviewed-by: Vlastimil Babka Acked-by: David Hildenbrand Cc: Baolin Wang Cc: Brendan Jackman Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Mel Gorman Cc: Michal Hocko Cc: Oscar Salvador Cc: Richard Chang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- kernel/kexec_handover.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 5a21dbe17950..49634cc3fb43 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -1100,8 +1100,8 @@ static void __init kho_release_scratch(void) ulong pfn; for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) - set_pageblock_migratetype(pfn_to_page(pfn), - MIGRATE_CMA); + init_pageblock_migratetype(pfn_to_page(pfn), + MIGRATE_CMA, false); } } -- cgit v1.2.3 From 36569780b0d64de283f9d6c2195fd1a43e221ee8 Mon Sep 17 00:00:00 2001 From: Aruna Ramakrishna Date: Wed, 9 Jul 2025 17:33:28 +0000 Subject: sched: Change nr_uninterruptible type to unsigned long The commit e6fe3f422be1 ("sched: Make multiple runqueue task counters 32-bit") changed nr_uninterruptible to an unsigned int. But the nr_uninterruptible values for each of the CPU runqueues can grow to large numbers, sometimes exceeding INT_MAX. This is valid if, over time, a large number of tasks are migrated off of one CPU after going into an uninterruptible state. Only the sum of all nr_uninterruptible values across all CPUs yields the correct result, as explained in a comment in kernel/sched/loadavg.c. Change the type of nr_uninterruptible back to unsigned long to prevent overflows, and thus the miscalculation of load average.
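A userspace model of the failure mode (the counter values are illustrative; the two casts mirror the old and new code in calc_load_fold_active()):

    #include <stdio.h>

    int main(void)
    {
            /* Three runqueues whose counters have drifted apart via
             * migration; only their sum (7) is meaningful, and two of
             * them are far outside the range of int. */
            long truth[3] = { 2200000000L, 2200000000L, -4399999993L };
            long sum_int = 0, sum_long = 0;

            for (int i = 0; i < 3; i++) {
                    unsigned int u32 = (unsigned int)truth[i];   /* old field type */
                    unsigned long u64 = (unsigned long)truth[i]; /* new field type */

                    sum_int  += (int)u32;   /* old cast */
                    sum_long += (long)u64;  /* new cast */
            }

            /* On the usual two's-complement 64-bit targets this prints
             * -4294967289 (off by exactly 2^32) vs. the correct 7. */
            printf("unsigned int counters:  %ld\n", sum_int);
            printf("unsigned long counters: %ld\n", sum_long);
            return 0;
    }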
Fixes: e6fe3f422be1 ("sched: Make multiple runqueue task counters 32-bit") Signed-off-by: Aruna Ramakrishna Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250709173328.606794-1-aruna.ramakrishna@oracle.com --- kernel/sched/loadavg.c | 2 +- kernel/sched/sched.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index c48900b856a2..52ca8e268cfc 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -80,7 +80,7 @@ long calc_load_fold_active(struct rq *this_rq, long adjust) long nr_active, delta = 0; nr_active = this_rq->nr_running - adjust; - nr_active += (int)this_rq->nr_uninterruptible; + nr_active += (long)this_rq->nr_uninterruptible; if (nr_active != this_rq->calc_load_active) { delta = nr_active - this_rq->calc_load_active; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 475bb5998295..83e3aa917142 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1149,7 +1149,7 @@ struct rq { * one CPU and if it got migrated afterwards it may decrease * it on another CPU. Always updated under the runqueue lock: */ - unsigned int nr_uninterruptible; + unsigned long nr_uninterruptible; union { struct task_struct __rcu *donor; /* Scheduler context */ -- cgit v1.2.3 From 9f239df55546ee1d28f0976130136ffd1cad0fd7 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 27 Jun 2025 13:51:14 +0200 Subject: sched/deadline: Initialize dl_servers after SMP dl-servers are currently initialized too early at boot when CPUs are not fully up (only boot CPU is). This results in miscalculation of per runqueue DEADLINE variables like extra_bw (which needs a stable CPU count). Move initialization of dl-servers later on after SMP has been initialized and CPUs are all online, so that CPU count is stable and DEADLINE variables can be computed correctly. Fixes: d741f297bceaf ("sched/fair: Fair server interface") Reported-by: Marcel Ziswiler Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Tested-by: Marcel Ziswiler # nuc & rock5b Link: https://lore.kernel.org/r/20250627115118.438797-2-juri.lelli@redhat.com --- kernel/sched/core.c | 2 ++ kernel/sched/deadline.c | 48 ++++++++++++++++++++++++++++++------------------ kernel/sched/sched.h | 1 + 3 files changed, 33 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2f8caa9db78d..89b3ed637465 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8371,6 +8371,8 @@ void __init sched_init_smp(void) init_sched_rt_class(); init_sched_dl_class(); + sched_init_dl_servers(); + sched_smp_initialized = true; } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 23668fc60bd3..0d25553f2c17 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -761,6 +761,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); + update_rq_clock(rq); + WARN_ON(is_dl_boosted(dl_se)); WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); @@ -1585,23 +1587,7 @@ void dl_server_start(struct sched_dl_entity *dl_se) { struct rq *rq = dl_se->rq; - /* - * XXX: the apply do not work fine at the init phase for the - * fair server because things are not yet set. We need to improve - * this before getting generic. 
- */ - if (!dl_server(dl_se)) { - u64 runtime = 50 * NSEC_PER_MSEC; - u64 period = 1000 * NSEC_PER_MSEC; - - dl_server_apply_params(dl_se, runtime, period, 1); - - dl_se->dl_server = 1; - dl_se->dl_defer = 1; - setup_new_dl_entity(dl_se); - } - - if (!dl_se->dl_runtime || dl_se->dl_server_active) + if (!dl_server(dl_se) || dl_se->dl_server_active) return; dl_se->dl_server_active = 1; @@ -1612,7 +1598,7 @@ void dl_server_start(struct sched_dl_entity *dl_se) void dl_server_stop(struct sched_dl_entity *dl_se) { - if (!dl_se->dl_runtime) + if (!dl_server(dl_se) || !dl_server_active(dl_se)) return; dequeue_dl_entity(dl_se, DEQUEUE_SLEEP); @@ -1645,6 +1631,32 @@ void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, dl_se->server_pick_task = pick_task; } +void sched_init_dl_servers(void) +{ + int cpu; + struct rq *rq; + struct sched_dl_entity *dl_se; + + for_each_online_cpu(cpu) { + u64 runtime = 50 * NSEC_PER_MSEC; + u64 period = 1000 * NSEC_PER_MSEC; + + rq = cpu_rq(cpu); + + guard(rq_lock_irq)(rq); + + dl_se = &rq->fair_server; + + WARN_ON(dl_server(dl_se)); + + dl_server_apply_params(dl_se, runtime, period, 1); + + dl_se->dl_server = 1; + dl_se->dl_defer = 1; + setup_new_dl_entity(dl_se); + } +} + void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq) { u64 new_bw = dl_se->dl_bw; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 105190b18020..3058fb6246da 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -385,6 +385,7 @@ extern void dl_server_stop(struct sched_dl_entity *dl_se); extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, dl_server_has_tasks_f has_tasks, dl_server_pick_f pick_task); +extern void sched_init_dl_servers(void); extern void dl_server_update_idle_time(struct rq *rq, struct task_struct *p); -- cgit v1.2.3 From fcc9276c4d331cd1fe9319d793e80b02e09727f5 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 27 Jun 2025 13:51:15 +0200 Subject: sched/deadline: Reset extra_bw to max_bw when clearing root domains dl_clear_root_domain() doesn't take into account the fact that per-rq extra_bw variables retain values computed before root domain changes, resulting in broken accounting. Fix it by resetting extra_bw to max_bw before restoring back dl-servers contributions. Fixes: 2ff899e351643 ("sched/deadline: Rebuild root domain accounting after every update") Reported-by: Marcel Ziswiler Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Tested-by: Marcel Ziswiler # nuc & rock5b Link: https://lore.kernel.org/r/20250627115118.438797-3-juri.lelli@redhat.com --- kernel/sched/deadline.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0d25553f2c17..0abffe3860e3 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2924,7 +2924,14 @@ void dl_clear_root_domain(struct root_domain *rd) int i; guard(raw_spinlock_irqsave)(&rd->dl_bw.lock); + + /* + * Reset total_bw to zero and extra_bw to max_bw so that next + * loop will add dl-servers contributions back properly, + */ rd->dl_bw.total_bw = 0; + for_each_cpu(i, rd->span) + cpu_rq(i)->dl.extra_bw = cpu_rq(i)->dl.max_bw; /* * dl_servers are not tasks. 
Since dl_add_task_root_domain ignores -- cgit v1.2.3 From 440989c10f4e32620e9e2717ca52c3ed7ae11048 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 27 Jun 2025 13:51:16 +0200 Subject: sched/deadline: Fix accounting after global limits change A global limits change (sched_rt_handler() logic) currently leaves stale and/or incorrect values in variables related to accounting (e.g. extra_bw). Properly clean up per runqueue variables before implementing the change and rebuild scheduling domains (so that accounting is also properly restored) after such a change is complete. Reported-by: Marcel Ziswiler Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Tested-by: Marcel Ziswiler # nuc & rock5b Link: https://lore.kernel.org/r/20250627115118.438797-4-juri.lelli@redhat.com --- kernel/sched/deadline.c | 4 +++- kernel/sched/rt.c | 6 ++++++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0abffe3860e3..9c7d9528d5fa 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -3183,6 +3183,9 @@ void sched_dl_do_global(void) if (global_rt_runtime() != RUNTIME_INF) new_bw = to_ratio(global_rt_period(), global_rt_runtime()); + for_each_possible_cpu(cpu) + init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl); + for_each_possible_cpu(cpu) { rcu_read_lock_sched(); @@ -3198,7 +3201,6 @@ void sched_dl_do_global(void) raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); - init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl); } } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 15d5855c542c..be6e9bcbe82b 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2886,6 +2886,12 @@ undo: sched_domains_mutex_unlock(); mutex_unlock(&mutex); + /* + * After changing maximum available bandwidth for DEADLINE, we need to + * recompute per root domain and per cpus variables accordingly. + */ + rebuild_sched_domains(); + return ret; } -- cgit v1.2.3 From e075f4360931263f5ec006ea5dadc065e5e98eb8 Mon Sep 17 00:00:00 2001 From: Li Chen Date: Thu, 10 Jul 2025 18:57:07 +0800 Subject: smpboot: introduce SDTL_INIT() helper to tidy sched topology setup Define a small SDTL_INIT(maskfn, flagsfn, name) macro and use it to build the sched_domain_topology_level array. Purely a cleanup; behaviour is unchanged. 
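The SDTL_INIT() macro itself lands in a header outside the kernel/ diffstat below. Judging by the call sites, it presumably expands to a designated-initializer compound literal along these lines (a sketch, assuming the existing .mask/.sd_flags members and the SD_INIT_NAME() helper already used by default_topology[]):

    #define SDTL_INIT(maskfn, flagsfn, dname)               \
            ((struct sched_domain_topology_level) {         \
                    .mask     = maskfn,                     \
                    .sd_flags = flagsfn,                    \
                    SD_INIT_NAME(dname)                     \
            })

Fields the macro does not cover (numa_level, flags) are then assigned separately, as sched_init_numa() does in the diff below.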
Suggested-by: Thomas Gleixner Signed-off-by: Li Chen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Tested-by: K Prateek Nayak Link: https://lore.kernel.org/r/20250710105715.66594-2-me@linux.beauty --- kernel/sched/topology.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 8e06b1d22e91..d01f5a49f2e7 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1737,17 +1737,17 @@ sd_init(struct sched_domain_topology_level *tl, */ static struct sched_domain_topology_level default_topology[] = { #ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, + SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT), #endif #ifdef CONFIG_SCHED_CLUSTER - { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) }, + SDTL_INIT(cpu_clustergroup_mask, cpu_cluster_flags, CLS), #endif #ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, + SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC), #endif - { cpu_cpu_mask, SD_INIT_NAME(PKG) }, + SDTL_INIT(cpu_cpu_mask, NULL, PKG), { NULL, }, }; @@ -2008,23 +2008,15 @@ void sched_init_numa(int offline_node) /* * Add the NUMA identity distance, aka single NODE. */ - tl[i++] = (struct sched_domain_topology_level){ - .mask = sd_numa_mask, - .numa_level = 0, - SD_INIT_NAME(NODE) - }; + tl[i++] = SDTL_INIT(sd_numa_mask, NULL, NODE); /* * .. and append 'j' levels of NUMA goodness. */ for (j = 1; j < nr_levels; i++, j++) { - tl[i] = (struct sched_domain_topology_level){ - .mask = sd_numa_mask, - .sd_flags = cpu_numa_flags, - .flags = SDTL_OVERLAP, - .numa_level = j, - SD_INIT_NAME(NUMA) - }; + tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA); + tl[i].numa_level = j; + tl[i].flags = SDTL_OVERLAP; } sched_domain_topology_saved = sched_domain_topology; -- cgit v1.2.3 From 1eec89a671413ce38df9fe9e70f5130a9eb79a59 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Fri, 11 Jul 2025 11:20:30 +0530 Subject: sched/topology: Remove sched_domain_topology_level::flags Support for overlapping domains added in commit e3589f6c81e4 ("sched: Allow for overlapping sched_domain spans") also allowed forcefully setting SD_OVERLAP for !NUMA domains via FORCE_SD_OVERLAP sched_feat(). Since NUMA domains had to be presumed overlapping to ensure correct behavior, "sched_domain_topology_level::flags" was introduced. NUMA domains added the SDTL_OVERLAP flag, which ensured SD_OVERLAP was always added during build_sched_domains() for these domains, even when FORCE_SD_OVERLAP was off. The condition for adding the SD_OVERLAP flag in the aforementioned commit was:

    if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
            sd->flags |= SD_OVERLAP;

The FORCE_SD_OVERLAP debug feature was removed in commit af85596c74de ("sched/topology: Remove FORCE_SD_OVERLAP"), which left the NUMA domains as the exclusive users of the SDTL_OVERLAP, SD_OVERLAP, and SD_NUMA flags. Get rid of SDTL_OVERLAP and SD_OVERLAP, as they have become redundant, and instead rely on SD_NUMA to detect the only overlapping domain currently supported. Since SDTL_OVERLAP was the only user of "tl->flags", get rid of "sched_domain_topology_level::flags" too.
Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/ba4dbdf8-bc37-493d-b2e0-2efb00ea3e19@amd.com --- kernel/sched/fair.c | 6 +++--- kernel/sched/topology.c | 19 ++++++++++--------- 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 20a845697c1d..b9b4bbbf0af6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9926,9 +9926,9 @@ void update_group_capacity(struct sched_domain *sd, int cpu) min_capacity = ULONG_MAX; max_capacity = 0; - if (child->flags & SD_OVERLAP) { + if (child->flags & SD_NUMA) { /* - * SD_OVERLAP domains cannot assume that child groups + * SD_NUMA domains cannot assume that child groups * span the current group. */ @@ -9941,7 +9941,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } } else { /* - * !SD_OVERLAP domains can assume that child groups + * !SD_NUMA domains can assume that child groups * span the current group. */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index d01f5a49f2e7..977e133bb8a4 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -89,7 +89,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!(sd->flags & SD_OVERLAP) && + if (!(sd->flags & SD_NUMA) && cpumask_intersects(groupmask, sched_group_span(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); @@ -102,7 +102,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, group->sgc->id, cpumask_pr_args(sched_group_span(group))); - if ((sd->flags & SD_OVERLAP) && + if ((sd->flags & SD_NUMA) && !cpumask_equal(group_balance_mask(group), sched_group_span(group))) { printk(KERN_CONT " mask=%*pbl", cpumask_pr_args(group_balance_mask(group))); @@ -1344,7 +1344,7 @@ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) * "sg->asym_prefer_cpu" to "sg->sgc->asym_prefer_cpu" * which is shared by all the overlapping groups. 
*/ - WARN_ON_ONCE(sd->flags & SD_OVERLAP); + WARN_ON_ONCE(sd->flags & SD_NUMA); sg = sd->groups; if (cpu != sg->asym_prefer_cpu) { @@ -2016,7 +2016,6 @@ void sched_init_numa(int offline_node) for (j = 1; j < nr_levels; i++, j++) { tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA); tl[i].numa_level = j; - tl[i].flags = SDTL_OVERLAP; } sched_domain_topology_saved = sched_domain_topology; @@ -2327,7 +2326,7 @@ static void __sdt_free(const struct cpumask *cpu_map) if (sdd->sd) { sd = *per_cpu_ptr(sdd->sd, j); - if (sd && (sd->flags & SD_OVERLAP)) + if (sd && (sd->flags & SD_NUMA)) free_sched_groups(sd->groups, 0); kfree(*per_cpu_ptr(sdd->sd, j)); } @@ -2393,9 +2392,13 @@ static bool topology_span_sane(const struct cpumask *cpu_map) id_seen = sched_domains_tmpmask2; for_each_sd_topology(tl) { + int tl_common_flags = 0; + + if (tl->sd_flags) + tl_common_flags = (*tl->sd_flags)(); /* NUMA levels are allowed to overlap */ - if (tl->flags & SDTL_OVERLAP) + if (tl_common_flags & SD_NUMA) continue; cpumask_clear(covered); @@ -2466,8 +2469,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; - if (tl->flags & SDTL_OVERLAP) - sd->flags |= SD_OVERLAP; if (cpumask_equal(cpu_map, sched_domain_span(sd))) break; } @@ -2480,7 +2481,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att for_each_cpu(i, cpu_map) { for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { sd->span_weight = cpumask_weight(sched_domain_span(sd)); - if (sd->flags & SD_OVERLAP) { + if (sd->flags & SD_NUMA) { if (build_overlap_sched_groups(sd, i)) goto error; } else { -- cgit v1.2.3 From 25c411fce735dda29de26f58d3fce52d4824380c Mon Sep 17 00:00:00 2001 From: John Stultz Date: Sat, 12 Jul 2025 03:33:42 +0000 Subject: sched: Add CONFIG_SCHED_PROXY_EXEC & boot argument to enable/disable Add a CONFIG_SCHED_PROXY_EXEC option, along with a boot argument sched_proxy_exec= that can be used to disable the feature at boot time if CONFIG_SCHED_PROXY_EXEC was enabled. Also uses this option to allow the rq->donor to be different from rq->curr. 
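The sched_proxy_exec() predicate that later patches test is defined in a header outside these hunks; given the static key added below, it presumably reads (sketch):

    #ifdef CONFIG_SCHED_PROXY_EXEC
    DECLARE_STATIC_KEY_TRUE(__sched_proxy_exec);
    static inline bool sched_proxy_exec(void)
    {
            return static_branch_likely(&__sched_proxy_exec);
    }
    #else
    static inline bool sched_proxy_exec(void)
    {
            return false;
    }
    #endif

With a default-true key, the boot argument only has to flip the branch, and in the !CONFIG case the helper folds to constant false so the proxy paths compile away.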
Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Tested-by: K Prateek Nayak Link: https://lkml.kernel.org/r/20250712033407.2383110-2-jstultz@google.com --- kernel/sched/core.c | 29 +++++++++++++++++++++++++++++ kernel/sched/sched.h | 12 ++++++++++++ 2 files changed, 41 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e9c8bda84d80..dd9f5c08b563 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -119,6 +119,35 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +#ifdef CONFIG_SCHED_PROXY_EXEC +DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec); +static int __init setup_proxy_exec(char *str) +{ + bool proxy_enable = true; + + if (*str && kstrtobool(str + 1, &proxy_enable)) { + pr_warn("Unable to parse sched_proxy_exec=\n"); + return 0; + } + + if (proxy_enable) { + pr_info("sched_proxy_exec enabled via boot arg\n"); + static_branch_enable(&__sched_proxy_exec); + } else { + pr_info("sched_proxy_exec disabled via boot arg\n"); + static_branch_disable(&__sched_proxy_exec); + } + return 1; +} +#else +static int __init setup_proxy_exec(char *str) +{ + pr_warn("CONFIG_SCHED_PROXY_EXEC=n, so it cannot be enabled or disabled at boot time\n"); + return 0; +} +#endif +__setup("sched_proxy_exec", setup_proxy_exec); + /* * Debugging: various feature bits * diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ac953fad8c21..e53d0b87f780 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1142,10 +1142,15 @@ struct rq { */ unsigned long nr_uninterruptible; +#ifdef CONFIG_SCHED_PROXY_EXEC + struct task_struct __rcu *donor; /* Scheduling context */ + struct task_struct __rcu *curr; /* Execution context */ +#else union { struct task_struct __rcu *donor; /* Scheduler context */ struct task_struct __rcu *curr; /* Execution context */ }; +#endif struct sched_dl_entity *dl_server; struct task_struct *idle; struct task_struct *stop; @@ -1326,10 +1331,17 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues) +#ifdef CONFIG_SCHED_PROXY_EXEC +static inline void rq_set_donor(struct rq *rq, struct task_struct *t) +{ + rcu_assign_pointer(rq->donor, t); +} +#else static inline void rq_set_donor(struct rq *rq, struct task_struct *t) { /* Do nothing */ } +#endif #ifdef CONFIG_SCHED_CORE static inline struct cpumask *sched_group_span(struct sched_group *sg); -- cgit v1.2.3 From 44e4e0297c3c01987399bb9973f4d22a096a62c2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 12 Jul 2025 03:33:43 +0000 Subject: locking/mutex: Rework task_struct::blocked_on Track the blocked-on relation for mutexes, to allow following this relation at schedule time.

    task
      | blocked-on
      v
    mutex
      | owner
      v
    task

This all will be used for tracking blocked-task/mutex chains with the proxy-execution patch, in a similar fashion to how priority inheritance is done with rt_mutexes. For serialization, blocked-on is only set by the task itself (current). Both setting and clearing (potentially by others) are done while holding the mutex::wait_lock.
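Concretely, one hop down the chain pictured above goes task -> mutex -> owner. An illustrative helper (not kernel code; __mutex_owner() is the existing owner accessor in kernel/locking/mutex.h, and per the serialization rule above the caller must hold mutex::wait_lock for the result to be stable):

    static struct task_struct *blocked_on_owner(struct task_struct *p)
    {
            struct mutex *m = p->blocked_on;    /* set under mutex::wait_lock */

            return m ? __mutex_owner(m) : NULL; /* NULL: p is not blocked */
    }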
[minor changes while rebasing] [jstultz: Fix blocked_on tracking in __mutex_lock_common in error paths] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Juri Lelli Signed-off-by: Connor O'Brien Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Tested-by: K Prateek Nayak Link: https://lkml.kernel.org/r/20250712033407.2383110-3-jstultz@google.com --- kernel/fork.c | 3 +-- kernel/locking/mutex-debug.c | 9 +++++---- kernel/locking/mutex.c | 22 ++++++++++++++++++++++ kernel/locking/ww_mutex.h | 18 ++++++++++++++++-- 4 files changed, 44 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 1ee8eb11f38b..5f87f05aff4a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2123,9 +2123,8 @@ __latent_entropy struct task_struct *copy_process( lockdep_init_task(p); #endif -#ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ -#endif + #ifdef CONFIG_BCACHE p->sequential_io = 0; p->sequential_io_avg = 0; diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 6e6f6071cfa2..758b7a6792b0 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -53,17 +53,18 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, { lockdep_assert_held(&lock->wait_lock); - /* Mark the current thread as blocked on the lock: */ - task->blocked_on = waiter; + /* Current thread can't be already blocked (since it's executing!) */ + DEBUG_LOCKS_WARN_ON(task->blocked_on); } void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct task_struct *task) { + struct mutex *blocked_on = READ_ONCE(task->blocked_on); + DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); DEBUG_LOCKS_WARN_ON(waiter->task != task); - DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter); - task->blocked_on = NULL; + DEBUG_LOCKS_WARN_ON(blocked_on && blocked_on != lock); INIT_LIST_HEAD(&waiter->list); waiter->task = NULL; diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index a39ecccbd106..e2f59863a866 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -644,6 +644,8 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas goto err_early_kill; } + WARN_ON(current->blocked_on); + current->blocked_on = lock; set_current_state(state); trace_contention_begin(lock, LCB_F_MUTEX); for (;;) { @@ -680,6 +682,12 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas first = __mutex_waiter_is_first(lock, &waiter); + /* + * As we likely have been woken up by task + * that has cleared our blocked_on state, re-set + * it to the lock we are trying to aquire. 
+ */ + current->blocked_on = lock; set_current_state(state); /* * Here we order against unlock; we must either see it change @@ -691,8 +699,11 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas if (first) { trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN); + /* clear blocked_on as mutex_optimistic_spin may schedule() */ + current->blocked_on = NULL; if (mutex_optimistic_spin(lock, ww_ctx, &waiter)) break; + current->blocked_on = lock; trace_contention_begin(lock, LCB_F_MUTEX); } @@ -700,6 +711,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas } raw_spin_lock_irqsave(&lock->wait_lock, flags); acquired: + current->blocked_on = NULL; __set_current_state(TASK_RUNNING); if (ww_ctx) { @@ -729,9 +741,11 @@ skip_wait: return 0; err: + current->blocked_on = NULL; __set_current_state(TASK_RUNNING); __mutex_remove_waiter(lock, &waiter); err_early_kill: + WARN_ON(current->blocked_on); trace_contention_end(lock, ret); raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); debug_mutex_free_waiter(&waiter); @@ -942,6 +956,14 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne next = waiter->task; debug_mutex_wake_waiter(lock, waiter); + /* + * Unlock wakeups can be happening in parallel + * (when optimistic spinners steal and release + * the lock), so blocked_on may already be + * cleared here. + */ + WARN_ON(next->blocked_on && next->blocked_on != lock); + next->blocked_on = NULL; wake_q_add(&wake_q, next); } diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 37f025a096c9..45fe05e51db1 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -283,7 +283,15 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter, if (waiter->ww_ctx->acquired > 0 && __ww_ctx_less(waiter->ww_ctx, ww_ctx)) { #ifndef WW_RT debug_mutex_wake_waiter(lock, waiter); + /* + * When waking up the task to die, be sure to clear the + * blocked_on pointer. Otherwise we can see circular + * blocked_on relationships that can't resolve. + */ + WARN_ON(waiter->task->blocked_on && + waiter->task->blocked_on != lock); #endif + waiter->task->blocked_on = NULL; wake_q_add(wake_q, waiter->task); } @@ -331,9 +339,15 @@ static bool __ww_mutex_wound(struct MUTEX *lock, * it's wounded in __ww_mutex_check_kill() or has a * wakeup pending to re-read the wounded state. */ - if (owner != current) + if (owner != current) { + /* + * When waking up the task to wound, be sure to clear the + * blocked_on pointer. Otherwise we can see circular + * blocked_on relationships that can't resolve. + */ + owner->blocked_on = NULL; wake_q_add(wake_q, owner); - + } return true; } -- cgit v1.2.3 From a4f0b6fef4b08e9928449206390133e48ac185a7 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Sat, 12 Jul 2025 03:33:44 +0000 Subject: locking/mutex: Add p->blocked_on wrappers for correctness checks This lets us assert mutex::wait_lock is held whenever we access p->blocked_on, as well as warn us for unexpected state changes. 
[fix conflicts, call in more places] [jstultz: tweaked commit subject, reworked a good bit] Signed-off-by: Valentin Schneider Signed-off-by: Connor O'Brien Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Tested-by: K Prateek Nayak Link: https://lkml.kernel.org/r/20250712033407.2383110-4-jstultz@google.com --- kernel/locking/mutex-debug.c | 4 ++-- kernel/locking/mutex.c | 32 ++++++++++++++------------------ kernel/locking/ww_mutex.h | 8 +++----- 3 files changed, 19 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 758b7a6792b0..949103fd8e9b 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -54,13 +54,13 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, lockdep_assert_held(&lock->wait_lock); /* Current thread can't be already blocked (since it's executing!) */ - DEBUG_LOCKS_WARN_ON(task->blocked_on); + DEBUG_LOCKS_WARN_ON(__get_task_blocked_on(task)); } void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct task_struct *task) { - struct mutex *blocked_on = READ_ONCE(task->blocked_on); + struct mutex *blocked_on = __get_task_blocked_on(task); DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); DEBUG_LOCKS_WARN_ON(waiter->task != task); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index e2f59863a866..80d778fedd60 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -644,8 +644,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas goto err_early_kill; } - WARN_ON(current->blocked_on); - current->blocked_on = lock; + __set_task_blocked_on(current, lock); set_current_state(state); trace_contention_begin(lock, LCB_F_MUTEX); for (;;) { @@ -685,9 +684,9 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas /* * As we likely have been woken up by task * that has cleared our blocked_on state, re-set - * it to the lock we are trying to aquire. + * it to the lock we are trying to acquire. */ - current->blocked_on = lock; + set_task_blocked_on(current, lock); set_current_state(state); /* * Here we order against unlock; we must either see it change @@ -699,11 +698,15 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas if (first) { trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN); - /* clear blocked_on as mutex_optimistic_spin may schedule() */ - current->blocked_on = NULL; + /* + * mutex_optimistic_spin() can call schedule(), so + * clear blocked on so we don't become unselectable + * to run. 
+ */ + clear_task_blocked_on(current, lock); if (mutex_optimistic_spin(lock, ww_ctx, &waiter)) break; - current->blocked_on = lock; + set_task_blocked_on(current, lock); trace_contention_begin(lock, LCB_F_MUTEX); } @@ -711,7 +714,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas } raw_spin_lock_irqsave(&lock->wait_lock, flags); acquired: - current->blocked_on = NULL; + __clear_task_blocked_on(current, lock); __set_current_state(TASK_RUNNING); if (ww_ctx) { @@ -741,11 +744,11 @@ skip_wait: return 0; err: - current->blocked_on = NULL; + __clear_task_blocked_on(current, lock); __set_current_state(TASK_RUNNING); __mutex_remove_waiter(lock, &waiter); err_early_kill: - WARN_ON(current->blocked_on); + WARN_ON(__get_task_blocked_on(current)); trace_contention_end(lock, ret); raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); debug_mutex_free_waiter(&waiter); @@ -956,14 +959,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne next = waiter->task; debug_mutex_wake_waiter(lock, waiter); - /* - * Unlock wakeups can be happening in parallel - * (when optimistic spinners steal and release - * the lock), so blocked_on may already be - * cleared here. - */ - WARN_ON(next->blocked_on && next->blocked_on != lock); - next->blocked_on = NULL; + __clear_task_blocked_on(next, lock); wake_q_add(&wake_q, next); } diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 45fe05e51db1..086fd5487ca7 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -283,15 +283,13 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter, if (waiter->ww_ctx->acquired > 0 && __ww_ctx_less(waiter->ww_ctx, ww_ctx)) { #ifndef WW_RT debug_mutex_wake_waiter(lock, waiter); +#endif /* * When waking up the task to die, be sure to clear the * blocked_on pointer. Otherwise we can see circular * blocked_on relationships that can't resolve. */ - WARN_ON(waiter->task->blocked_on && - waiter->task->blocked_on != lock); -#endif - waiter->task->blocked_on = NULL; + __clear_task_blocked_on(waiter->task, lock); wake_q_add(wake_q, waiter->task); } @@ -345,7 +343,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock, * blocked_on pointer. Otherwise we can see circular * blocked_on relationships that can't resolve. */ - owner->blocked_on = NULL; + __clear_task_blocked_on(owner, lock); wake_q_add(wake_q, owner); } return true; -- cgit v1.2.3 From 865d8cfb1672089e4b628d6899ac5c6e49787150 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Sat, 12 Jul 2025 03:33:45 +0000 Subject: sched: Move update_curr_task logic into update_curr_se Absorb update_curr_task() into update_curr_se(), and in the process simplify update_curr_common(). This will make the next step a bit easier. 
Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Tested-by: K Prateek Nayak Link: https://lkml.kernel.org/r/20250712033407.2383110-5-jstultz@google.com --- kernel/sched/fair.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b9b4bbbf0af6..8334580ed3a3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1164,6 +1164,14 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) curr->exec_start = now; curr->sum_exec_runtime += delta_exec; + if (entity_is_task(curr)) { + struct task_struct *p = task_of(curr); + + trace_sched_stat_runtime(p, delta_exec); + account_group_exec_runtime(p, delta_exec); + cgroup_account_cputime(p, delta_exec); + } + if (schedstat_enabled()) { struct sched_statistics *stats; @@ -1175,26 +1183,14 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) return delta_exec; } -static inline void update_curr_task(struct task_struct *p, s64 delta_exec) -{ - trace_sched_stat_runtime(p, delta_exec); - account_group_exec_runtime(p, delta_exec); - cgroup_account_cputime(p, delta_exec); -} - /* * Used by other classes to account runtime. */ s64 update_curr_common(struct rq *rq) { struct task_struct *donor = rq->donor; - s64 delta_exec; - delta_exec = update_curr_se(rq, &donor->se); - if (likely(delta_exec > 0)) - update_curr_task(donor, delta_exec); - - return delta_exec; + return update_curr_se(rq, &donor->se); } /* @@ -1219,10 +1215,6 @@ static void update_curr(struct cfs_rq *cfs_rq) update_min_vruntime(cfs_rq); if (entity_is_task(curr)) { - struct task_struct *p = task_of(curr); - - update_curr_task(p, delta_exec); - /* * If the fair_server is active, we need to account for the * fair_server time whether or not the task is running on -- cgit v1.2.3 From aa4f74dfd42ba4399f785fb9c460a11bd1756f0a Mon Sep 17 00:00:00 2001 From: John Stultz Date: Sat, 12 Jul 2025 03:33:46 +0000 Subject: sched: Fix runtime accounting w/ split exec & sched contexts Without proxy-exec, we normally charge the "current" task for both its vruntime as well as its sum_exec_runtime. With proxy, however, we have two "current" contexts: the scheduler context and the execution context. We want to charge the execution context rq->curr (ie: proxy/lock holder) execution time to its sum_exec_runtime (so it's clear to userland the rq->curr task *is* running), as well as its thread group. However the rest of the time accounting (such a vruntime and cgroup accounting), we charge against the scheduler context (rq->donor) task, because it is from that task that the time is being "donated". If the donor and curr tasks are the same, then it's the same as without proxy. 
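A condensed restatement of the rule implemented by update_se() in the diff below (illustrative only; schedstats and the non-task branch are elided, and the helpers named are the ones the real code calls):

    static void charge_delta(struct rq *rq, struct task_struct *donor, s64 delta_exec)
    {
            struct task_struct *running = rq->curr;    /* execution context */

            /* The proxy visibly runs: runtime + thread-group accounting */
            running->se.exec_start = rq_clock_task(rq);
            running->se.sum_exec_runtime += delta_exec;
            trace_sched_stat_runtime(running, delta_exec);
            account_group_exec_runtime(running, delta_exec);

            /* The donor "pays": cgroup time (and, in update_curr(), vruntime) */
            cgroup_account_cputime(donor, delta_exec);
    }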
Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Tested-by: K Prateek Nayak Link: https://lkml.kernel.org/r/20250712033407.2383110-6-jstultz@google.com --- kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8334580ed3a3..97176458f2be 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1152,30 +1152,40 @@ void post_init_entity_util_avg(struct task_struct *p) sa->runnable_avg = sa->util_avg; } -static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) +static s64 update_se(struct rq *rq, struct sched_entity *se) { u64 now = rq_clock_task(rq); s64 delta_exec; - delta_exec = now - curr->exec_start; + delta_exec = now - se->exec_start; if (unlikely(delta_exec <= 0)) return delta_exec; - curr->exec_start = now; - curr->sum_exec_runtime += delta_exec; + se->exec_start = now; + if (entity_is_task(se)) { + struct task_struct *donor = task_of(se); + struct task_struct *running = rq->curr; + /* + * If se is a task, we account the time against the running + * task, as w/ proxy-exec they may not be the same. + */ + running->se.exec_start = now; + running->se.sum_exec_runtime += delta_exec; - if (entity_is_task(curr)) { - struct task_struct *p = task_of(curr); + trace_sched_stat_runtime(running, delta_exec); + account_group_exec_runtime(running, delta_exec); - trace_sched_stat_runtime(p, delta_exec); - account_group_exec_runtime(p, delta_exec); - cgroup_account_cputime(p, delta_exec); + /* cgroup time is always accounted against the donor */ + cgroup_account_cputime(donor, delta_exec); + } else { + /* If not task, account the time against donor se */ + se->sum_exec_runtime += delta_exec; } if (schedstat_enabled()) { struct sched_statistics *stats; - stats = __schedstats_from_se(curr); + stats = __schedstats_from_se(se); __schedstat_set(stats->exec_max, max(delta_exec, stats->exec_max)); } @@ -1188,9 +1198,7 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) */ s64 update_curr_common(struct rq *rq) { - struct task_struct *donor = rq->donor; - - return update_curr_se(rq, &donor->se); + return update_se(rq, &rq->donor->se); } /* @@ -1198,6 +1206,12 @@ s64 update_curr_common(struct rq *rq) */ static void update_curr(struct cfs_rq *cfs_rq) { + /* + * Note: cfs_rq->curr corresponds to the task picked to + * run (ie: rq->donor.se) which due to proxy-exec may + * not necessarily be the actual task running + * (rq->curr.se). This is easy to confuse! + */ struct sched_entity *curr = cfs_rq->curr; struct rq *rq = rq_of(cfs_rq); s64 delta_exec; @@ -1206,7 +1220,7 @@ static void update_curr(struct cfs_rq *cfs_rq) if (unlikely(!curr)) return; - delta_exec = update_curr_se(rq, curr); + delta_exec = update_se(rq, curr); if (unlikely(delta_exec <= 0)) return; -- cgit v1.2.3 From be41bde4c3a86de4be5cd3d1ca613e24664e68dc Mon Sep 17 00:00:00 2001 From: John Stultz Date: Sat, 12 Jul 2025 03:33:47 +0000 Subject: sched: Add an initial sketch of the find_proxy_task() function Add a find_proxy_task() function which doesn't do much. When we select a blocked task to run, we will just deactivate it and pick again. The exception being if it has become unblocked after find_proxy_task() was called. This allows us to validate keeping blocked tasks on the runqueue and later deactivating them is working ok, stressing the failure cases for when a proxy isn't found. 
Greatly simplified from patch by: Peter Zijlstra (Intel) Juri Lelli Valentin Schneider Connor O'Brien [jstultz: Split out from larger proxy patch and simplified for review and testing.] Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Tested-by: K Prateek Nayak Link: https://lkml.kernel.org/r/20250712033407.2383110-7-jstultz@google.com --- kernel/sched/core.c | 117 ++++++++++++++++++++++++++++++++++++++++++--- kernel/sched/sched.h | 10 ++++- 2 files changed, 121 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dd9f5c08b563..cb55d4247e65 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6522,11 +6522,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* * Helper function for __schedule() * - * If a task does not have signals pending, deactivate it - * Otherwise marks the task's __state as RUNNING + * Tries to deactivate the task, unless the should_block arg + * is false or if a signal is pending. In the case a signal + * is pending, marks the task's __state as RUNNING (and clears + * blocked_on). */ static bool try_to_block_task(struct rq *rq, struct task_struct *p, - unsigned long *task_state_p) + unsigned long *task_state_p, bool should_block) { unsigned long task_state = *task_state_p; int flags = DEQUEUE_NOCLOCK; @@ -6537,6 +6539,16 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p, return false; } + /* + * We check should_block after signal_pending because we + * will want to wake the task in that case. But if + * should_block is false, it's likely due to the task being + * blocked on a mutex, and we want to keep it on the runqueue + * to be selectable for proxy-execution. + */ + if (!should_block) + return false; + p->sched_contributes_to_load = (task_state & TASK_UNINTERRUPTIBLE) && !(task_state & TASK_NOLOAD) && @@ -6560,6 +6572,88 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p, return true; } +#ifdef CONFIG_SCHED_PROXY_EXEC +static inline void proxy_resched_idle(struct rq *rq) +{ + put_prev_set_next_task(rq, rq->donor, rq->idle); + rq_set_donor(rq, rq->idle); + set_tsk_need_resched(rq->idle); +} + +static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor) +{ + unsigned long state = READ_ONCE(donor->__state); + + /* Don't deactivate if the state has been changed to TASK_RUNNING */ + if (state == TASK_RUNNING) + return false; + /* + * Because we got donor from pick_next_task(), it is *crucial* + * that we call proxy_resched_idle() before we deactivate it. + * As once we deactivate donor, donor->on_rq is set to zero, + * which allows ttwu() to immediately try to wake the task on + * another rq. So we cannot use *any* references to donor + * after that point. So things like cfs_rq->curr or rq->donor + * need to be changed from next *before* we deactivate. + */ + proxy_resched_idle(rq); + return try_to_block_task(rq, donor, &state, true); +} + +static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *donor) +{ + if (!__proxy_deactivate(rq, donor)) { + /* + * XXX: For now, if deactivation failed, set donor + * as unblocked, as we aren't doing proxy-migrations + * yet (more logic will be needed then). + */ + donor->blocked_on = NULL; + } + return NULL; +} + +/* + * Initial simple sketch that just deactivates the blocked task + * chosen by pick_next_task() so we can then pick something that + * isn't blocked.
+ */ +static struct task_struct * +find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) +{ + struct mutex *mutex; + + mutex = donor->blocked_on; + /* Something changed in the chain, so pick again */ + if (!mutex) + return NULL; + /* + * By taking mutex->wait_lock we hold off concurrent mutex_unlock() + * and ensure @owner sticks around. + */ + guard(raw_spinlock)(&mutex->wait_lock); + + /* Check again that donor is blocked with blocked_lock held */ + if (!task_is_blocked(donor) || mutex != __get_task_blocked_on(donor)) { + /* + * Something changed in the blocked_on chain and + * we don't know if only at this level. So, let's + * just bail out completely and let __schedule() + * figure things out (pick_again loop). + */ + return NULL; /* do pick_next_task() again */ + } + return proxy_deactivate(rq, donor); +} +#else /* SCHED_PROXY_EXEC */ +static struct task_struct * +find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) +{ + WARN_ONCE(1, "This should never be called in the !SCHED_PROXY_EXEC case\n"); + return donor; +} +#endif /* SCHED_PROXY_EXEC */ + /* * __schedule() is the main scheduler function. * @@ -6672,12 +6766,25 @@ static void __sched notrace __schedule(int sched_mode) goto picked; } } else if (!preempt && prev_state) { - try_to_block_task(rq, prev, &prev_state); + /* + * We pass task_is_blocked() as the should_block arg + * in order to keep mutex-blocked tasks on the runqueue + * for selection with proxy-exec (without proxy-exec + * task_is_blocked() will always be false). + */ + try_to_block_task(rq, prev, &prev_state, + !task_is_blocked(prev)); switch_count = &prev->nvcsw; } - next = pick_next_task(rq, prev, &rf); +pick_again: + next = pick_next_task(rq, rq->donor, &rf); rq_set_donor(rq, next); + if (unlikely(task_is_blocked(next))) { + next = find_proxy_task(rq, next, &rf); + if (!next) + goto pick_again; + } picked: clear_tsk_need_resched(prev); clear_preempt_need_resched(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e53d0b87f780..d3f33d10c58c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2262,6 +2262,14 @@ static inline int task_current_donor(struct rq *rq, struct task_struct *p) return rq->donor == p; } +static inline bool task_is_blocked(struct task_struct *p) +{ + if (!sched_proxy_exec()) + return false; + + return !!p->blocked_on; +} + static inline int task_on_cpu(struct rq *rq, struct task_struct *p) { return p->on_cpu; } @@ -2459,7 +2467,7 @@ static inline void put_prev_set_next_task(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - WARN_ON_ONCE(rq->curr != prev); + WARN_ON_ONCE(rq->donor != prev); __put_prev_set_next_dl_server(rq, prev, next); -- cgit v1.2.3 From be39617e38e0b1939a6014d77ee6f14707d59b1b Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Sat, 12 Jul 2025 03:33:48 +0000 Subject: sched: Fix proxy/current (push,pull)ability Proxy execution forms atomic pairs of tasks: The waiting donor task (scheduling context) and a proxy (execution context). The donor task, along with the rest of the blocked chain, follows the proxy wrt CPU placement. They can be the same task, in which case push/pull doesn't need any modification. When they are different, however, FIFO1 & FIFO42:

	     ,-> RT42
	     |     | blocked-on
	     |     v
	blocked_donor |   mutex
	     |     | owner
	     |     v
	     `-- RT1

	   RT1             RT42
	  CPU0             CPU1
	   ^                 ^
	   |                 |
	overloaded      !overloaded
	rq prio = 42    rq prio = 0

RT1 is eligible to be pushed to CPU1, but should that happen it will "carry" RT42 along.
Clearly here neither RT1 nor RT42 should be seen as push/pullable. Unfortunately, only the donor task is usually dequeued from the rq, and the proxy'ed execution context (rq->curr) remains on the rq. This can cause RT1 to be selected for migration from logic like the rt pushable_list. Thus, add a dequeue/enqueue cycle on the proxy task before __schedule() returns, which allows the sched class logic to avoid adding the now current task to the pushable_list. Furthermore, tasks becoming blocked on a mutex don't need an explicit dequeue/enqueue cycle to be made (push/pull)able: they have to be running to block on a mutex, thus they will eventually hit put_prev_task(). Signed-off-by: Valentin Schneider Signed-off-by: Connor O'Brien Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Tested-by: K Prateek Nayak Link: https://lkml.kernel.org/r/20250712033407.2383110-8-jstultz@google.com --- kernel/sched/core.c | 25 +++++++++++++++++++++++++ kernel/sched/deadline.c | 7 +++++++ kernel/sched/rt.c | 5 +++++ 3 files changed, 37 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cb55d4247e65..a0b11201a7b4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6654,6 +6654,23 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) } #endif /* SCHED_PROXY_EXEC */ +static inline void proxy_tag_curr(struct rq *rq, struct task_struct *owner) +{ + if (!sched_proxy_exec()) + return; + /* + * pick_next_task() calls set_next_task() on the chosen task + * at some point, which ensures it is not push/pullable. + * However, the chosen/donor task *and* the mutex owner form an + * atomic pair wrt push/pull. + * + * Make sure the owner we run is not pushable. Unfortunately we can + * only deal with that by means of a dequeue/enqueue cycle. :-/ + */ + dequeue_task(rq, owner, DEQUEUE_NOCLOCK | DEQUEUE_SAVE); + enqueue_task(rq, owner, ENQUEUE_NOCLOCK | ENQUEUE_RESTORE); +} + /* * __schedule() is the main scheduler function. * @@ -6798,6 +6815,10 @@ picked: * changes to task_struct made by pick_next_task().
*/ RCU_INIT_POINTER(rq->curr, next); + + if (!task_current_donor(rq, next)) + proxy_tag_curr(rq, next); + /* * The membarrier system call requires each architecture * to have a full memory barrier after updating @@ -6832,6 +6853,10 @@ picked: /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); } else { + /* In case next was already curr but just got blocked_donor */ + if (!task_current_donor(rq, next)) + proxy_tag_curr(rq, next); + rq_unpin_lock(rq, &rf); __balance_callbacks(rq); raw_spin_rq_unlock_irq(rq); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 1af06e48227d..e2d51f4306b3 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2121,6 +2121,9 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) if (dl_server(&p->dl)) return; + if (task_is_blocked(p)) + return; + if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } @@ -2415,6 +2418,10 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_s update_curr_dl(rq); update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); + + if (task_is_blocked(p)) + return; + if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index be6e9bcbe82b..7936d4333731 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1440,6 +1440,9 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) enqueue_rt_entity(rt_se, flags); + if (task_is_blocked(p)) + return; + if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); } @@ -1716,6 +1719,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_s update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1); + if (task_is_blocked(p)) + return; /* * The previous task needs to be made eligible for pushing * if it is still active -- cgit v1.2.3 From 7de9d4f946383f48ec393b6e9ad0c20e49e174e7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 12 Jul 2025 03:33:49 +0000 Subject: sched: Start blocked_on chain processing in find_proxy_task() Start to flesh out the real find_proxy_task() implementation, but avoid the migration cases for now, in those cases just deactivate the donor task and pick again. To ensure the donor task or other blocked tasks in the chain aren't migrated away while we're running the proxy, also tweak the fair class logic to avoid migrating donor or mutex blocked tasks. 
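The core of the resulting chain walk has the following shape (condensed from the find_proxy_task() hunk below; the blocked/migrating/remote-owner fallbacks are elided):

	for (p = donor; task_is_blocked(p); p = owner) {
		mutex = p->blocked_on;
		if (!mutex)
			return NULL;	/* chain changed, pick again */
		guard(raw_spinlock)(&mutex->wait_lock);
		owner = __mutex_owner(mutex);
		if (!owner) {
			__clear_task_blocked_on(p, mutex);
			return p;	/* lock was released, run p */
		}
		/* ... non-runnable or remote owners fall back to
		 * proxy_deactivate() or proxy_resched_idle() ... */
	}
	return owner;			/* runnable owner on this rq */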
[jstultz: This change was split out from the larger proxy patch] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Juri Lelli Signed-off-by: Valentin Schneider Signed-off-by: Connor O'Brien Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Tested-by: K Prateek Nayak Link: https://lkml.kernel.org/r/20250712033407.2383110-9-jstultz@google.com --- kernel/locking/mutex.h | 3 +- kernel/sched/core.c | 146 +++++++++++++++++++++++++++++++++++++++++-------- kernel/sched/fair.c | 12 +++- 3 files changed, 135 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index cbff35b9b7ae..2e8080a9bee3 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -6,7 +6,7 @@ * * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar */ - +#ifndef CONFIG_PREEMPT_RT /* * This is the control structure for tasks blocked on mutex, which resides * on the blocked task's kernel stack: @@ -70,3 +70,4 @@ extern void debug_mutex_init(struct mutex *lock, const char *name, # define debug_mutex_unlock(lock) do { } while (0) # define debug_mutex_init(lock, name, key) do { } while (0) #endif /* !CONFIG_DEBUG_MUTEXES */ +#endif /* CONFIG_PREEMPT_RT */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a0b11201a7b4..f7f576ad9b22 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -96,6 +96,7 @@ #include "../workqueue_internal.h" #include "../../io_uring/io-wq.h" #include "../smpboot.h" +#include "../locking/mutex.h" EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu); EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask); @@ -2933,8 +2934,15 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag struct set_affinity_pending my_pending = { }, *pending = NULL; bool stop_pending, complete = false; - /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { + /* + * Can the task run on the task's current CPU? If so, we're done + * + * We are also done if the task is the current donor, boosting a + * lock-holding proxy (and potentially has been migrated outside its + * current or previous affinity mask) + */ + if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask) || + (task_current_donor(rq, p) && !task_current(rq, p))) { struct task_struct *push_task = NULL; if ((flags & SCA_MIGRATE_ENABLE) && @@ -6573,11 +6581,12 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p, } #ifdef CONFIG_SCHED_PROXY_EXEC -static inline void proxy_resched_idle(struct rq *rq) +static inline struct task_struct *proxy_resched_idle(struct rq *rq) { put_prev_set_next_task(rq, rq->donor, rq->idle); rq_set_donor(rq, rq->idle); set_tsk_need_resched(rq->idle); + return rq->idle; } static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor) @@ -6614,36 +6623,124 @@ static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *d } /* - * Initial simple sketch that just deactivates the blocked task - * chosen by pick_next_task() so we can then pick something that - * isn't blocked. + * Find runnable lock owner to proxy for mutex blocked donor + * + * Follow the blocked-on relation: + * task->blocked_on -> mutex->owner -> task... + * + * Lock order: + * + * p->pi_lock + * rq->lock + * mutex->wait_lock + * + * Returns the task that is going to be used as execution context (the one + * that is actually going to be run on cpu_of(rq)).
*/ static struct task_struct * find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) { + struct task_struct *owner = NULL; + int this_cpu = cpu_of(rq); + struct task_struct *p; struct mutex *mutex; - mutex = donor->blocked_on; - /* Something changed in the chain, so pick again */ - if (!mutex) - return NULL; - /* - * By taking mutex->wait_lock we hold off concurrent mutex_unlock() - * and ensure @owner sticks around. - */ - guard(raw_spinlock)(&mutex->wait_lock); + /* Follow blocked_on chain. */ + for (p = donor; task_is_blocked(p); p = owner) { + mutex = p->blocked_on; + /* Something changed in the chain, so pick again */ + if (!mutex) + return NULL; + /* + * By taking mutex->wait_lock we hold off concurrent mutex_unlock() + * and ensure @owner sticks around. + */ + guard(raw_spinlock)(&mutex->wait_lock); - /* Check again that donor is blocked with blocked_lock held */ - if (!task_is_blocked(donor) || mutex != __get_task_blocked_on(donor)) { + /* Check again that p is blocked with wait_lock held */ + if (mutex != __get_task_blocked_on(p)) { + /* + * Something changed in the blocked_on chain and + * we don't know if only at this level. So, let's + * just bail out completely and let __schedule() + * figure things out (pick_again loop). + */ + return NULL; + } + + owner = __mutex_owner(mutex); + if (!owner) { + __clear_task_blocked_on(p, mutex); + return p; + } + + if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) { + /* XXX Don't handle blocked owners/delayed dequeue yet */ + return proxy_deactivate(rq, donor); + } + + if (task_cpu(owner) != this_cpu) { + /* XXX Don't handle migrations yet */ + return proxy_deactivate(rq, donor); + } + + if (task_on_rq_migrating(owner)) { + /* + * One of the chain of mutex owners is currently migrating to this + * CPU, but has not yet been enqueued because we are holding the + * rq lock. As a simple solution, just schedule rq->idle to give + * the migration a chance to complete. Much like the migrate_task + * case we should end up back in find_proxy_task(), this time + * hopefully with all relevant tasks already enqueued. + */ + return proxy_resched_idle(rq); + } + + /* + * It's possible to race where, after we check owner->on_rq + * but before we check (owner_cpu != this_cpu), the + * task on another cpu was migrated back to this cpu. In + * that case it could slip by our checks. So double-check + * we are still on this cpu and not migrating. If we get + * inconsistent results, try again. + */ + if (!task_on_rq_queued(owner) || task_cpu(owner) != this_cpu) + return NULL; + + if (owner == p) { + /* + * It's possible we interleave with mutex_unlock like: + * + * lock(&rq->lock); + * find_proxy_task() + * mutex_unlock() + * lock(&wait_lock); + * donor(owner) = current->blocked_donor; + * unlock(&wait_lock); + * + * wake_up_q(); + * ... + * ttwu_runnable() + * __task_rq_lock() + * lock(&wait_lock); + * owner == p + * + * Which leaves us to finish the ttwu_runnable() and make it go. + * + * So schedule rq->idle so that ttwu_runnable() can get the rq + * lock and mark owner as running. + */ + return proxy_resched_idle(rq); + } /* - * Something changed in the blocked_on chain and - * we don't know if only at this level. So, let's - * just bail out completely and let __schedule() - * figure things out (pick_again loop). + * OK, now we're absolutely sure @owner is on this + * rq, therefore holding @rq->lock is sufficient to + * guarantee its existence, as per ttwu_remote().
*/ - return NULL; /* do pick_next_task() again */ } - return proxy_deactivate(rq, donor); + + WARN_ON_ONCE(owner && !owner->on_rq); + return owner; } #else /* SCHED_PROXY_EXEC */ static struct task_struct * @@ -6801,10 +6898,13 @@ pick_again: next = find_proxy_task(rq, next, &rf); if (!next) goto pick_again; + if (next == rq->idle) + goto keep_resched; } picked: clear_tsk_need_resched(prev); clear_preempt_need_resched(); +keep_resched: rq->last_seen_need_resched_ns = 0; is_switch = prev != next; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 97176458f2be..b173a059315c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9291,7 +9291,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) throttled_lb_pair, or * 3) cannot be migrated to this CPU due to cpus_ptr, or * 4) running (obviously), or - * 5) are cache-hot on their current CPU. + * 5) are cache-hot on their current CPU, or + * 6) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled) */ if ((p->se.sched_delayed) && (env->migration_type != migrate_load)) return 0; @@ -9313,6 +9314,9 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (kthread_is_per_cpu(p)) return 0; + if (task_is_blocked(p)) + return 0; + if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { int cpu; @@ -9348,7 +9352,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* Record that we found at least one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; - if (task_on_cpu(env->src_rq, p)) { + if (task_on_cpu(env->src_rq, p) || + task_current_donor(env->src_rq, p)) { schedstat_inc(p->stats.nr_failed_migrations_running); return 0; } @@ -9392,6 +9397,9 @@ static void detach_task(struct task_struct *p, struct lb_env *env) schedstat_inc(p->stats.nr_forced_migrations); } + WARN_ON(task_current(env->src_rq, p)); + WARN_ON(task_current_donor(env->src_rq, p)); + deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, env->dst_cpu); } -- cgit v1.2.3 From 9503ca2ccafec51ee9e533d6f3aef14a589fc106 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 12 Jul 2025 16:22:53 -0700 Subject: lib/crypto: sha1: Rename sha1_init() to sha1_init_raw() Rename the existing sha1_init() to sha1_init_raw(), since it conflicts with the upcoming library function. This will later be removed, but this keeps the kernel building for the introduction of the library. 
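A rough sketch of the name collision (the library signature here is an assumption for illustration, not quoted from this series): the existing helper initializes a raw digest array, while the upcoming library function of the same name is expected to take a context object:

	/* existing helper, renamed below: */
	void sha1_init_raw(__u32 *buf);

	/* upcoming library function (assumed shape): */
	void sha1_init(struct sha1_ctx *ctx);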
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250712232329.818226-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c20babbf998f..dae281a1286d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -304,7 +304,7 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) if (!raw) return -ENOMEM; - sha1_init(digest); + sha1_init_raw(digest); memset(ws, 0, sizeof(ws)); /* We need to take out the map fd for the digest calculation -- cgit v1.2.3 From d7c36d6350b5a4b27256eaeeea3b72621a819c9a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 10 Jun 2025 11:29:21 +0200 Subject: locking/lockdep: Avoid struct return in lock_stats() Returning a large structure from the lock_stats() function causes clang to have multiple copies of it on the stack and copy between them, which can end up exceeding the frame size warning limit: kernel/locking/lockdep.c:300:25: error: stack frame size (1464) exceeds limit (1280) in 'lock_stats' [-Werror,-Wframe-larger-than] 300 | struct lock_class_stats lock_stats(struct lock_class *class) Change the calling conventions to directly operate on the caller's copy, which apparently is what gcc does already. Signed-off-by: Arnd Bergmann Signed-off-by: Boqun Feng Link: https://lore.kernel.org/r/20250610092941.2642847-1-arnd@kernel.org --- kernel/locking/lockdep.c | 27 ++++++++++++--------------- kernel/locking/lockdep_proc.c | 2 +- 2 files changed, 13 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index dd2bbf73718b..0c941418a215 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -297,33 +297,30 @@ static inline void lock_time_add(struct lock_time *src, struct lock_time *dst) dst->nr += src->nr; } -struct lock_class_stats lock_stats(struct lock_class *class) +void lock_stats(struct lock_class *class, struct lock_class_stats *stats) { - struct lock_class_stats stats; int cpu, i; - memset(&stats, 0, sizeof(struct lock_class_stats)); + memset(stats, 0, sizeof(struct lock_class_stats)); for_each_possible_cpu(cpu) { struct lock_class_stats *pcs = &per_cpu(cpu_lock_stats, cpu)[class - lock_classes]; - for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) - stats.contention_point[i] += pcs->contention_point[i]; + for (i = 0; i < ARRAY_SIZE(stats->contention_point); i++) + stats->contention_point[i] += pcs->contention_point[i]; - for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++) - stats.contending_point[i] += pcs->contending_point[i]; + for (i = 0; i < ARRAY_SIZE(stats->contending_point); i++) + stats->contending_point[i] += pcs->contending_point[i]; - lock_time_add(&pcs->read_waittime, &stats.read_waittime); - lock_time_add(&pcs->write_waittime, &stats.write_waittime); + lock_time_add(&pcs->read_waittime, &stats->read_waittime); + lock_time_add(&pcs->write_waittime, &stats->write_waittime); - lock_time_add(&pcs->read_holdtime, &stats.read_holdtime); - lock_time_add(&pcs->write_holdtime, &stats.write_holdtime); + lock_time_add(&pcs->read_holdtime, &stats->read_holdtime); + lock_time_add(&pcs->write_holdtime, &stats->write_holdtime); - for (i = 0; i < ARRAY_SIZE(stats.bounces); i++) - stats.bounces[i] += pcs->bounces[i]; + for (i = 0; i < ARRAY_SIZE(stats->bounces); i++) + stats->bounces[i] += pcs->bounces[i]; } - - return stats; } void clear_lock_stats(struct lock_class *class) diff --git a/kernel/locking/lockdep_proc.c 
b/kernel/locking/lockdep_proc.c index b52c07c4707c..1916db9aa46b 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -657,7 +657,7 @@ static int lock_stat_open(struct inode *inode, struct file *file) if (!test_bit(idx, lock_classes_in_use)) continue; iter->class = class; - iter->stats = lock_stats(class); + lock_stats(class, &iter->stats); iter++; } -- cgit v1.2.3 From bd27cfb58c2803923702cd80289b35b7b8108859 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 9 Apr 2025 14:22:58 +0200 Subject: locking/lockdep: Change 'static const' variables to enum values gcc warns about 'static const' variables even in headers when building with -Wunused-const-variable enabled: In file included from kernel/locking/lockdep_proc.c:25: kernel/locking/lockdep_internals.h:69:28: error: 'LOCKF_USED_IN_IRQ_READ' defined but not used [-Werror=unused-const-variable=] 69 | static const unsigned long LOCKF_USED_IN_IRQ_READ = | ^~~~~~~~~~~~~~~~~~~~~~ kernel/locking/lockdep_internals.h:63:28: error: 'LOCKF_ENABLED_IRQ_READ' defined but not used [-Werror=unused-const-variable=] 63 | static const unsigned long LOCKF_ENABLED_IRQ_READ = | ^~~~~~~~~~~~~~~~~~~~~~ kernel/locking/lockdep_internals.h:57:28: error: 'LOCKF_USED_IN_IRQ' defined but not used [-Werror=unused-const-variable=] 57 | static const unsigned long LOCKF_USED_IN_IRQ = | ^~~~~~~~~~~~~~~~~ kernel/locking/lockdep_internals.h:51:28: error: 'LOCKF_ENABLED_IRQ' defined but not used [-Werror=unused-const-variable=] 51 | static const unsigned long LOCKF_ENABLED_IRQ = | ^~~~~~~~~~~~~~~~~ This one is easy to avoid by changing the generated constant definition into an equivalent enum. Tested-by: Andy Shevchenko Signed-off-by: Arnd Bergmann Signed-off-by: Boqun Feng Link: https://lore.kernel.org/r/20250409122314.2848028-6-arnd@kernel.org --- kernel/locking/lockdep_internals.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 82156caf77d1..0e5e6ffe91a3 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -47,29 +47,31 @@ enum { __LOCKF(USED_READ) }; +enum { #define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE | -static const unsigned long LOCKF_ENABLED_IRQ = + LOCKF_ENABLED_IRQ = #include "lockdep_states.h" - 0; + 0, #undef LOCKDEP_STATE #define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE | -static const unsigned long LOCKF_USED_IN_IRQ = + LOCKF_USED_IN_IRQ = #include "lockdep_states.h" - 0; + 0, #undef LOCKDEP_STATE #define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE##_READ | -static const unsigned long LOCKF_ENABLED_IRQ_READ = + LOCKF_ENABLED_IRQ_READ = #include "lockdep_states.h" - 0; + 0, #undef LOCKDEP_STATE #define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE##_READ | -static const unsigned long LOCKF_USED_IN_IRQ_READ = + LOCKF_USED_IN_IRQ_READ = #include "lockdep_states.h" - 0; + 0, #undef LOCKDEP_STATE +}; #define LOCKF_ENABLED_IRQ_ALL (LOCKF_ENABLED_IRQ | LOCKF_ENABLED_IRQ_READ) #define LOCKF_USED_IN_IRQ_ALL (LOCKF_USED_IN_IRQ | LOCKF_USED_IN_IRQ_READ) -- cgit v1.2.3 From 1dfe5ea6dbb3e03073f5426d65394694683b8692 Mon Sep 17 00:00:00 2001 From: Ran Xiaokai Date: Fri, 4 Jul 2025 01:52:18 +0000 Subject: locking/mutex: Remove redundant #ifdefs hung_task_{set,clear}_blocker() is already guarded by CONFIG_DETECT_HUNG_TASK_BLOCKER in hung_task.h, so remove the redundant #ifdef checks.
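This works because the header already provides stubs when the option is off; roughly (a sketch of the include/linux/hung_task.h shape assumed by the changelog, not quoted from this patch):

	#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
	void hung_task_set_blocker(void *lock, unsigned long type);
	void hung_task_clear_blocker(void);
	#else
	static inline void hung_task_set_blocker(void *lock, unsigned long type) { }
	static inline void hung_task_clear_blocker(void) { }
	#endif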
Signed-off-by: Ran Xiaokai Acked-by: Waiman Long Signed-off-by: Boqun Feng Link: https://lore.kernel.org/r/20250704015218.359754-1-ranxiaokai627@163.com --- kernel/locking/mutex.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index a39ecccbd106..d4210dc97b6a 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -191,9 +191,7 @@ static void __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct list_head *list) { -#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER hung_task_set_blocker(lock, BLOCKER_TYPE_MUTEX); -#endif debug_mutex_add_waiter(lock, waiter, current); list_add_tail(&waiter->list, list); @@ -209,9 +207,7 @@ __mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter) __mutex_clear_flag(lock, MUTEX_FLAGS); debug_mutex_remove_waiter(lock, waiter, current); -#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER hung_task_clear_blocker(); -#endif } /* -- cgit v1.2.3 From 7a3cedafccf8e7d038ad4cfec5b38052647ceac5 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 21 Mar 2025 02:30:49 -0700 Subject: lockdep: Speed up lockdep_unregister_key() with expedited RCU synchronization lockdep_unregister_key() is called from critical code paths, including sections where rtnl_lock() is held. For example, when replacing a qdisc in a network device, network egress traffic is disabled while __qdisc_destroy() is called for every network queue. If lockdep is enabled, __qdisc_destroy() calls lockdep_unregister_key(), which gets blocked waiting for synchronize_rcu() to complete. For example, a simple tc command to replace a qdisc could take 13 seconds: # time /usr/sbin/tc qdisc replace dev eth0 root handle 0x1: mq real 0m13.195s user 0m0.001s sys 0m2.746s During this time, network egress is completely frozen while waiting for RCU synchronization. Use synchronize_rcu_expedited() instead to minimize the impact on critical operations like network connectivity changes. This speeds up the tc invocation by 10x when replacing the qdisc for a network card. # time /usr/sbin/tc qdisc replace dev eth0 root handle 0x1: mq real 0m1.789s user 0m0.000s sys 0m1.613s [boqun: Fixed the comment, added more information for the temporary workaround, and added TODO information for hazptr] Reported-by: Erik Lundgren Signed-off-by: Breno Leitao Reviewed-by: Paul E. McKenney Reviewed-by: Eric Dumazet Signed-off-by: Boqun Feng Link: https://lore.kernel.org/r/20250321-lockdep-v1-1-78b732d195fb@debian.org --- kernel/locking/lockdep.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 0c941418a215..2d4c5bab5af8 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -6616,8 +6616,16 @@ void lockdep_unregister_key(struct lock_class_key *key) if (need_callback) call_rcu(&delayed_free.rcu_head, free_zapped_rcu); - /* Wait until is_dynamic_key() has finished accessing k->hash_entry. */ - synchronize_rcu(); + /* + * Wait until is_dynamic_key() has finished accessing k->hash_entry. + * + * Some operations like __qdisc_destroy() will call this in a debug + * kernel, and the network traffic is disabled while waiting, hence + * the delay of the wait matters in debugging cases. Currently use a + * synchronize_rcu_expedited() to speed up the wait at the cost of + * system IPIs. TODO: Replace RCU with hazptr for this.
+ */ + synchronize_rcu_expedited(); } EXPORT_SYMBOL_GPL(lockdep_unregister_key); -- cgit v1.2.3 From e1876fb015c3edcc7f665e5955afad7009644ab6 Mon Sep 17 00:00:00 2001 From: Peng Jiang Date: Fri, 4 Jul 2025 14:38:17 +0800 Subject: kprobes: Add missing kerneldoc for __get_insn_slot Add kerneldoc for the '__get_insn_slot' function to fix W=1 warnings: kernel/kprobes.c:141 function parameter 'c' not described in '__get_insn_slot' Link: https://lore.kernel.org/all/20250704143817707TOCcfTRWsO5OAbQ2eYoU9@zte.com.cn/ Signed-off-by: Peng Jiang Signed-off-by: Masami Hiramatsu (Google) --- kernel/kprobes.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ffe0c3d52306..ab8f9fc1f0d1 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -135,8 +135,12 @@ struct kprobe_insn_cache kprobe_insn_slots = { static int collect_garbage_slots(struct kprobe_insn_cache *c); /** - * __get_insn_slot() - Find a slot on an executable page for an instruction. - * We allocate an executable page if there's no room on existing ones. + * __get_insn_slot - Find a slot on an executable page for an instruction. + * @c: Pointer to kprobe instruction cache + * + * Description: Locates an available slot on existing executable pages, and + * allocates a new executable page if there's no room on the existing ones. + * Return: Pointer to instruction slot on success, NULL on failure. */ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c) { -- cgit v1.2.3 From 75b63ce2c98b41c45ee3ff14c76f27c3238bc6d2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 10 Jul 2025 10:43:26 +0200 Subject: PM: suspend: Drop a misplaced pm_restore_gfp_mask() call The pm_restore_gfp_mask() call added by commit 12ffc3b1513e ("PM: Restrict swap use to later in the suspend sequence") to suspend_devices_and_enter() is done too early because it takes place before calling dpm_resume() in dpm_resume_end() and some swap-backing devices may not be ready at that point. Moreover, dpm_resume_end() called subsequently in the same code path invokes pm_restore_gfp_mask() again and calling it twice in a row is pointless. Drop the misplaced pm_restore_gfp_mask() call from suspend_devices_and_enter() to address this issue. Fixes: 12ffc3b1513e ("PM: Restrict swap use to later in the suspend sequence") Signed-off-by: Rafael J. Wysocki Reviewed-by: Mario Limonciello Link: https://patch.msgid.link/2810409.mvXUDI8C0e@rjwysocki.net [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index bb608b68fb30..632f83b60d16 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -540,7 +540,6 @@ int suspend_devices_and_enter(suspend_state_t state) return error; Recover_platform: - pm_restore_gfp_mask(); platform_recover(state); goto Resume_devices; } -- cgit v1.2.3 From 228b9deded0011482149cdb474e3ad15aeeb4a97 Mon Sep 17 00:00:00 2001 From: Zihuan Zhang Date: Sat, 12 Jul 2025 11:08:24 +0800 Subject: PM: suspend: clean up redundant filesystems_freeze/thaw() handling The recently introduced support for freezing filesystems during system suspend included calls to filesystems_freeze() in both suspend_prepare() and enter_state(), as well as calls to filesystems_thaw() in both suspend_finish() and the Unlock path in enter_state(). These are redundant.
Moreover, calling filesystems_freeze() twice, from both suspend_prepare() and enter_state(), leads to a black screen and makes the system unable to resume in some cases. Address this as follows: - filesystems_freeze() is already called in suspend_prepare(), which is the proper and consistent place to handle pre-suspend operations. The second call in enter_state() is unnecessary, so remove it. - filesystems_thaw() is invoked in suspend_finish(), which covers successful suspend/resume paths. In the failure case, add a call to filesystems_thaw() only when needed, avoiding the duplicate call in the general Unlock path. This change simplifies the suspend code and avoids repeated freeze/thaw calls, while preserving correct ordering and behavior. Fixes: eacfbf74196f ("power: freeze filesystems during suspend/resume") Signed-off-by: Zihuan Zhang Link: https://patch.msgid.link/20250712030824.81474-1-zhangzihuan@kylinos.cn [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 632f83b60d16..b4ca17c2fecf 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -384,6 +384,7 @@ static int suspend_prepare(suspend_state_t state) return 0; dpm_save_failed_step(SUSPEND_FREEZE); + filesystems_thaw(); pm_notifier_call_chain(PM_POST_SUSPEND); Restore: pm_restore_console(); @@ -592,8 +593,6 @@ static int enter_state(suspend_state_t state) ksys_sync_helper(); trace_suspend_resume(TPS("sync_filesystems"), 0, false); } - if (filesystem_freeze_enabled) - filesystems_freeze(); pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]); pm_suspend_clear_flags(); @@ -613,7 +612,6 @@ static int enter_state(suspend_state_t state) pm_pr_dbg("Finishing wakeup.\n"); suspend_finish(); Unlock: - filesystems_thaw(); mutex_unlock(&system_transition_mutex); return error; } -- cgit v1.2.3 From 996afb6efd1a345736f9a888e4d6c7d4f3752aa5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 10 Jul 2025 15:10:41 +0200 Subject: kexec_core: Fix error code path in the KEXEC_JUMP flow If dpm_suspend_start() fails, dpm_resume_end() must be called to recover devices whose suspend callbacks have been called, but this does not happen in the KEXEC_JUMP flow's error path due to a confused goto target label. Address this by using the correct target label in the goto statement in question and drop the Resume_console label that is not used any more. Fixes: 2965faa5e03d ("kexec: split kexec_load syscall from kexec core code") Signed-off-by: Rafael J. Wysocki Acked-by: Baoquan He Reviewed-by: Mario Limonciello Link: https://patch.msgid.link/2396879.ElGaqSPkdT@rjwysocki.net [ rjw: Drop unused label and amend the changelog ] Signed-off-by: Rafael J.
Wysocki --- kernel/kexec_core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 3a9a9f240dbc..554369595298 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1080,7 +1080,7 @@ int kernel_kexec(void) console_suspend_all(); error = dpm_suspend_start(PMSG_FREEZE); if (error) - goto Resume_console; + goto Resume_devices; /* * dpm_suspend_end() must be called after dpm_suspend_start() * to complete the transition, like in the hibernation flows @@ -1135,7 +1135,6 @@ int kernel_kexec(void) dpm_resume_start(PMSG_RESTORE); Resume_devices: dpm_resume_end(PMSG_RESTORE); - Resume_console: pm_restore_gfp_mask(); console_resume_all(); thaw_processes(); -- cgit v1.2.3 From 2096d42d82dc983d9db861bd6585723bd24a0819 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 10 Jul 2025 15:12:20 +0200 Subject: kexec_core: Drop redundant pm_restore_gfp_mask() call Drop the direct pm_restore_gfp_mask() call from the KEXEC_JUMP flow in kernel_kexec() because it is redundant. Namely, dpm_resume_end() called beforehand in the same code path invokes that function and it is sufficient to invoke it once. Signed-off-by: Rafael J. Wysocki Acked-by: Baoquan He Reviewed-by: Mario Limonciello Link: https://patch.msgid.link/1949230.tdWV9SEqCh@rjwysocki.net [ rjw: Rebase after fixing up previous changes ] Signed-off-by: Rafael J. Wysocki --- kernel/kexec_core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 554369595298..351cd7d76dfa 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1135,7 +1135,6 @@ int kernel_kexec(void) dpm_resume_start(PMSG_RESTORE); Resume_devices: dpm_resume_end(PMSG_RESTORE); - pm_restore_gfp_mask(); console_resume_all(); thaw_processes(); Restore_console: -- cgit v1.2.3 From bd116214d53c66dc7f863822af171b20c06b4784 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Tue, 15 Jul 2025 13:53:20 +0200 Subject: blktrace: add zoned block commands to blk_fill_rwbs Add zoned block commands to blk_fill_rwbs: - ZONE APPEND will be decoded as 'ZA' - ZONE RESET will be decoded as 'ZR' - ZONE RESET ALL will be decoded as 'ZRA' - ZONE FINISH will be decoded as 'ZF' - ZONE OPEN will be decoded as 'ZO' - ZONE CLOSE will be decoded as 'ZC' Reviewed-by: Chaitanya Kulkarni Signed-off-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20250715115324.53308-2-johannes.thumshirn@wdc.com Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 3f6a7bdc6edf..47168d2afbf1 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1875,6 +1875,29 @@ void blk_fill_rwbs(char *rwbs, blk_opf_t opf) case REQ_OP_READ: rwbs[i++] = 'R'; break; + case REQ_OP_ZONE_APPEND: + rwbs[i++] = 'Z'; + rwbs[i++] = 'A'; + break; + case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_RESET_ALL: + rwbs[i++] = 'Z'; + rwbs[i++] = 'R'; + if ((opf & REQ_OP_MASK) == REQ_OP_ZONE_RESET_ALL) + rwbs[i++] = 'A'; + break; + case REQ_OP_ZONE_FINISH: + rwbs[i++] = 'Z'; + rwbs[i++] = 'F'; + break; + case REQ_OP_ZONE_OPEN: + rwbs[i++] = 'Z'; + rwbs[i++] = 'O'; + break; + case REQ_OP_ZONE_CLOSE: + rwbs[i++] = 'Z'; + rwbs[i++] = 'C'; + break; default: rwbs[i++] = 'N'; } @@ -1890,6 +1913,8 @@ void blk_fill_rwbs(char *rwbs, blk_opf_t opf) if (opf & REQ_ATOMIC) rwbs[i++] = 'U'; + WARN_ON_ONCE(i >= RWBS_LEN); + rwbs[i] = 
'\0'; } EXPORT_SYMBOL_GPL(blk_fill_rwbs); -- cgit v1.2.3 From 90c09d57caeca94e6f3f87c49e96a91edd40cbfd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 24 Apr 2025 16:49:53 -0700 Subject: rcu: Protect ->defer_qs_iw_pending from data race On kernels built with CONFIG_IRQ_WORK=y, when rcu_read_unlock() is invoked within an interrupts-disabled region of code [1], it will invoke rcu_read_unlock_special(), which uses an irq-work handler to force the system to notice when the RCU read-side critical section actually ends. That end won't happen until interrupts are enabled at the soonest. In some kernels, such as those booted with rcutree.use_softirq=y, the irq-work handler is used unconditionally. The per-CPU rcu_data structure's ->defer_qs_iw_pending field is updated by the irq-work handler and is both read and updated by rcu_read_unlock_special(). This resulted in the following KCSAN splat: ------------------------------------------------------------------------ BUG: KCSAN: data-race in rcu_preempt_deferred_qs_handler / rcu_read_unlock_special read to 0xffff96b95f42d8d8 of 1 bytes by task 90 on cpu 8: rcu_read_unlock_special+0x175/0x260 __rcu_read_unlock+0x92/0xa0 rt_spin_unlock+0x9b/0xc0 __local_bh_enable+0x10d/0x170 __local_bh_enable_ip+0xfb/0x150 rcu_do_batch+0x595/0xc40 rcu_cpu_kthread+0x4e9/0x830 smpboot_thread_fn+0x24d/0x3b0 kthread+0x3bd/0x410 ret_from_fork+0x35/0x40 ret_from_fork_asm+0x1a/0x30 write to 0xffff96b95f42d8d8 of 1 bytes by task 88 on cpu 8: rcu_preempt_deferred_qs_handler+0x1e/0x30 irq_work_single+0xaf/0x160 run_irq_workd+0x91/0xc0 smpboot_thread_fn+0x24d/0x3b0 kthread+0x3bd/0x410 ret_from_fork+0x35/0x40 ret_from_fork_asm+0x1a/0x30 no locks held by irq_work/8/88. irq event stamp: 200272 hardirqs last enabled at (200272): [] finish_task_switch+0x131/0x320 hardirqs last disabled at (200271): [] __schedule+0x129/0xd70 softirqs last enabled at (0): [] copy_process+0x4df/0x1cc0 softirqs last disabled at (0): [<0000000000000000>] 0x0 ------------------------------------------------------------------------ The problem is that irq-work handlers run with interrupts enabled, which means that rcu_preempt_deferred_qs_handler() could be interrupted, and that interrupt handler might contain an RCU read-side critical section, which might invoke rcu_read_unlock_special(). In the strict KCSAN mode of operation used by RCU, this constitutes a data race on the ->defer_qs_iw_pending field. This commit therefore disables interrupts across the portion of the rcu_preempt_deferred_qs_handler() that updates the ->defer_qs_iw_pending field. This suffices because this handler is not a fast path. Signed-off-by: Paul E. 
McKenney Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tree_plugin.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0b0f56f6abc8..a91b2322a0cd 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -624,10 +624,13 @@ notrace void rcu_preempt_deferred_qs(struct task_struct *t) */ static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp) { + unsigned long flags; struct rcu_data *rdp; rdp = container_of(iwp, struct rcu_data, defer_qs_iw); + local_irq_save(flags); rdp->defer_qs_iw_pending = false; + local_irq_restore(flags); } /* -- cgit v1.2.3 From 78370df5c3574cdc5c6de7844481b4dc0ef4f172 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 2 Jul 2025 16:59:36 +0200 Subject: rcu: Enable rcu_normal_wake_from_gp on small systems Automatically enable the rcu_normal_wake_from_gp parameter on systems with a small number of CPUs. The activation threshold is set to 16 CPUs. This helps to reduce the latency of the normal synchronize_rcu() API by waking up GP-waiters earlier and decoupling synchronize_rcu() callers from regular callback handling. A benchmark running 64 parallel jobs (on a system with 64 CPUs) invoking synchronize_rcu() demonstrates a notable latency reduction with the setting enabled. Latency distribution (microseconds):

  0 - 9999 : 1
  10000 - 19999 : 4
  20000 - 29999 : 399
  30000 - 39999 : 3197
  40000 - 49999 : 10428
  50000 - 59999 : 17363
  60000 - 69999 : 15529
  70000 - 79999 : 9287
  80000 - 89999 : 4249
  90000 - 99999 : 1915
  100000 - 109999 : 922
  110000 - 119999 : 390
  120000 - 129999 : 187
  ...

  0 - 9999 : 1
  10000 - 19999 : 234
  20000 - 29999 : 6678
  30000 - 39999 : 33463
  40000 - 49999 : 20669
  50000 - 59999 : 2766
  60000 - 69999 : 183
  ...

Reviewed-by: Joel Fernandes Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tree.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f83bbb408895..8c22db759978 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1632,8 +1632,10 @@ static void rcu_sr_put_wait_head(struct llist_node *node) atomic_set_release(&sr_wn->inuse, 0); } -/* Disabled by default. */ -static int rcu_normal_wake_from_gp; +/* Enable rcu_normal_wake_from_gp automatically on small systems. */ +#define WAKE_FROM_GP_CPU_THRESHOLD 16 + +static int rcu_normal_wake_from_gp = -1; module_param(rcu_normal_wake_from_gp, int, 0644); static struct workqueue_struct *sync_wq; @@ -3250,7 +3252,7 @@ static void synchronize_rcu_normal(void) trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("request")); - if (!READ_ONCE(rcu_normal_wake_from_gp)) { + if (READ_ONCE(rcu_normal_wake_from_gp) < 1) { wait_rcu_gp(call_rcu_hurry); goto trace_complete_out; } @@ -4854,6 +4856,12 @@ void __init rcu_init(void) sync_wq = alloc_workqueue("sync_wq", WQ_MEM_RECLAIM, 0); WARN_ON(!sync_wq); + /* Respect if explicitly disabled via a boot parameter. */ + if (rcu_normal_wake_from_gp < 0) { + if (num_possible_cpus() <= WAKE_FROM_GP_CPU_THRESHOLD) + rcu_normal_wake_from_gp = 1; + } + /* Fill in default value for rcutree.qovld boot parameter. */ /* -After- the rcu_node ->lock fields are initialized!
*/ if (qovld < 0) -- cgit v1.2.3 From b41642c87716bbd09797b1e4ea7d904f06c39b7b Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Tue, 8 Jul 2025 10:22:19 -0400 Subject: rcu: Fix rcu_read_unlock() deadloop due to IRQ work During rcu_read_unlock_special(), if this happens during irq_exit(), we can lock up if an IPI is issued. This is because the IPI itself triggers the irq_exit() path, causing a recursive lockup. This is precisely what Xiongfeng found when invoking a BPF program on the trace_tick_stop() tracepoint, as shown in the trace below. Fix by managing the irq_work state correctly.

  irq_exit()
    __irq_exit_rcu()
      /* in_hardirq() returns false after this */
      preempt_count_sub(HARDIRQ_OFFSET)
      tick_irq_exit()
        tick_nohz_irq_exit()
          tick_nohz_stop_sched_tick()
            trace_tick_stop()
              /* a bpf prog is hooked on this trace point */
              __bpf_trace_tick_stop()
                bpf_trace_run2()
                  rcu_read_unlock_special()
                    /* will send an IPI to itself */
                    irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);

A simple reproducer can also be obtained by doing the following in tick_irq_exit(). It will hang on boot without the patch:

  static inline void tick_irq_exit(void)
  {
  +	rcu_read_lock();
  +	WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, true);
  +	rcu_read_unlock();
  +

Reported-by: Xiongfeng Wang Closes: https://lore.kernel.org/all/9acd5f9f-6732-7701-6880-4b51190aa070@huawei.com/ Tested-by: Qi Xi Signed-off-by: Joel Fernandes Reviewed-by: "Paul E. McKenney" Reported-by: Linux Kernel Functional Testing [neeraj: Apply Frederic's suggested fix for PREEMPT_RT] Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tree.h | 13 ++++++++++++- kernel/rcu/tree_plugin.h | 37 ++++++++++++++++++++++++++----------- 2 files changed, 38 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 3830c19cf2f6..de6ca13a7b5f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -174,6 +174,17 @@ struct rcu_snap_record { unsigned long jiffies; /* Track jiffies value */ }; +/* + * An IRQ work (defer_qs_iw) is used by RCU to get the scheduler's attention, + * to report quiescent states at the soonest possible time. + * The request can be in one of the following states: + * - DEFER_QS_IDLE: An IRQ work is yet to be scheduled. + * - DEFER_QS_PENDING: An IRQ work was scheduled but either not yet run, or it + * ran and we still haven't reported a quiescent state. + */ +#define DEFER_QS_IDLE 0 +#define DEFER_QS_PENDING 1 + /* Per-CPU data for read-copy update. */ struct rcu_data { /* 1) quiescent-state and grace-period handling : */ @@ -192,7 +203,7 @@ struct rcu_data { /* during and after the last grace */ /* period it is aware of. */ struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */ - bool defer_qs_iw_pending; /* Scheduler attention pending? */ + int defer_qs_iw_pending; /* Scheduler attention pending? */ struct work_struct strict_work; /* Schedule readers for strict GPs. */ /* 2) batch handling */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index a91b2322a0cd..b6f44871f774 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -486,13 +486,16 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) struct rcu_node *rnp; union rcu_special special; + rdp = this_cpu_ptr(&rcu_data); + if (rdp->defer_qs_iw_pending == DEFER_QS_PENDING) + rdp->defer_qs_iw_pending = DEFER_QS_IDLE; + /* * If RCU core is waiting for this CPU to exit its critical section, * report the fact that it has exited.
Because irqs are disabled, * t->rcu_read_unlock_special cannot change. */ special = t->rcu_read_unlock_special; - rdp = this_cpu_ptr(&rcu_data); if (!special.s && !rdp->cpu_no_qs.b.exp) { local_irq_restore(flags); return; @@ -629,7 +632,23 @@ static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp) rdp = container_of(iwp, struct rcu_data, defer_qs_iw); local_irq_save(flags); - rdp->defer_qs_iw_pending = false; + + /* + * If the IRQ work handler happens to run in the middle of an RCU read-side + * critical section, it could be ineffective in getting the scheduler's + * attention to report a deferred quiescent state (the whole point of the + * IRQ work). For this reason, requeue the IRQ work. + * + * Basically, we want to avoid the following situation: + * 1. rcu_read_unlock() queues IRQ work (state -> DEFER_QS_PENDING) + * 2. CPU enters new rcu_read_lock() + * 3. IRQ work runs but cannot report QS due to rcu_preempt_depth() > 0 + * 4. rcu_read_unlock() does not re-queue work (state still PENDING) + * 5. Deferred QS reporting does not happen. + */ + if (rcu_preempt_depth() > 0) + WRITE_ONCE(rdp->defer_qs_iw_pending, DEFER_QS_IDLE); + local_irq_restore(flags); } @@ -676,17 +695,13 @@ static void rcu_read_unlock_special(struct task_struct *t) set_tsk_need_resched(current); set_preempt_need_resched(); if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled && - expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) { + expboost && rdp->defer_qs_iw_pending != DEFER_QS_PENDING && + cpu_online(rdp->cpu)) { // Get scheduler to re-evaluate and call hooks. // If !IRQ_WORK, FQS scan will eventually IPI. - if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && - IS_ENABLED(CONFIG_PREEMPT_RT)) - rdp->defer_qs_iw = IRQ_WORK_INIT_HARD( - rcu_preempt_deferred_qs_handler); - else - init_irq_work(&rdp->defer_qs_iw, - rcu_preempt_deferred_qs_handler); - rdp->defer_qs_iw_pending = true; + rdp->defer_qs_iw = + IRQ_WORK_INIT_HARD(rcu_preempt_deferred_qs_handler); + rdp->defer_qs_iw_pending = DEFER_QS_PENDING; irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); } } -- cgit v1.2.3 From cbd5d35e6ddc47b4cde5c96a0d5f00da0f8e881f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Jul 2025 17:23:26 -0700 Subject: torture: Remove support for SRCU-lite Because SRCU-lite is being replaced by SRCU-fast, this commit removes support for SRCU-lite from refscale.c. Both SRCU-lite and SRCU-fast provide faster readers by dropping the smp_mb() call from their lock and unlock primitives, but incur a pair of added RCU grace periods during the SRCU grace period. There is a trivial mapping from the SRCU-lite API to that of SRCU-fast, so there should be no transition issues. [ paulmck: Apply Christoph Hellwig feedback. ] Signed-off-by: "Paul E.
McKenney" Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/refscale.c | 32 +------------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index f11a7c2af778..f4b2cea1cce5 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -246,36 +246,6 @@ static const struct ref_scale_ops srcu_fast_ops = { .name = "srcu-fast" }; -static void srcu_lite_ref_scale_read_section(const int nloops) -{ - int i; - int idx; - - for (i = nloops; i >= 0; i--) { - idx = srcu_read_lock_lite(srcu_ctlp); - srcu_read_unlock_lite(srcu_ctlp, idx); - } -} - -static void srcu_lite_ref_scale_delay_section(const int nloops, const int udl, const int ndl) -{ - int i; - int idx; - - for (i = nloops; i >= 0; i--) { - idx = srcu_read_lock_lite(srcu_ctlp); - un_delay(udl, ndl); - srcu_read_unlock_lite(srcu_ctlp, idx); - } -} - -static const struct ref_scale_ops srcu_lite_ops = { - .init = rcu_sync_scale_init, - .readsection = srcu_lite_ref_scale_read_section, - .delaysection = srcu_lite_ref_scale_delay_section, - .name = "srcu-lite" -}; - #ifdef CONFIG_TASKS_RCU // Definitions for RCU Tasks ref scale testing: Empty read markers. @@ -1193,7 +1163,7 @@ ref_scale_init(void) long i; int firsterr = 0; static const struct ref_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &srcu_fast_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS + &rcu_ops, &srcu_ops, &srcu_fast_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &sched_clock_ops, &clock_ops, &jiffies_ops, &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops, -- cgit v1.2.3 From 941ab0b369c983f7867de54c8579fd7f1676ee3c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Jul 2025 17:23:28 -0700 Subject: rcutorture: Remove support for SRCU-lite Because SRCU-lite is being replaced by SRCU-fast, this commit removes support for SRCU-lite from rcutorture.c Both SRCU-lite and SRCU-fast provide faster readers by dropping the smp_mb() call from their lock and unlock primitives, but incur a pair of added RCU grace periods during the SRCU grace period. There is a trivial mapping from the SRCU-lite API to that of SRCU-fast, so there should be no transition issues. [ paulmck: Apply Christoph Hellwig feedback. ] Signed-off-by: "Paul E. 
McKenney" Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/rcutorture.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 213f23f20a64..7a893d51d02b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -718,11 +718,6 @@ static int srcu_torture_read_lock(void) WARN_ON_ONCE(idx & ~0x1); ret += idx << 1; } - if (reader_flavor & SRCU_READ_FLAVOR_LITE) { - idx = srcu_read_lock_lite(srcu_ctlp); - WARN_ON_ONCE(idx & ~0x1); - ret += idx << 2; - } if (reader_flavor & SRCU_READ_FLAVOR_FAST) { scp = srcu_read_lock_fast(srcu_ctlp); idx = __srcu_ptr_to_ctr(srcu_ctlp, scp); @@ -756,8 +751,6 @@ static void srcu_torture_read_unlock(int idx) WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1))); if (reader_flavor & SRCU_READ_FLAVOR_FAST) srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3)); - if (reader_flavor & SRCU_READ_FLAVOR_LITE) - srcu_read_unlock_lite(srcu_ctlp, (idx & 0x4) >> 2); if (reader_flavor & SRCU_READ_FLAVOR_NMI) srcu_read_unlock_nmisafe(srcu_ctlp, (idx & 0x2) >> 1); if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) -- cgit v1.2.3 From 3aea745a2a82e8397d2f30326f0dcf5f375dd7c8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 3 Jun 2025 13:49:53 -0700 Subject: srcu: Expedite SRCU-fast grace periods Currently, SRCU-fast grace periods use synchronize_rcu() to provide the needed ordering with readers, even given an expedited SRCU-fast grace period, which isn't all that expedited. This commit therefore instead uses synchronize_rcu_expedited() if there is an expedited SRCU-fast grace period in flight. Of course, given an non-expedited SRCU-fast grace period blocked in synchronize_rcu(), a later request for an expedited SRCU-fast grace period will wait for that synchronize_rcu() to return before switching to use of synchronize_rcu_expedited(). If this turns out to be a real problem for a production workload, we can increase the complexity (but likely also degrade the energy efficiency) to speed things up further. Signed-off-by: Paul E. McKenney Cc: Andrii Nakryiko Cc: Alexei Starovoitov Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/srcutree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 48047260697e..c5e8ebc493d5 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -502,6 +502,8 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx) */ if (!did_gp) smp_mb(); /* A */ + else if (srcu_gp_is_expedited(ssp)) + synchronize_rcu_expedited(); /* X */ else synchronize_rcu(); /* X */ -- cgit v1.2.3 From 908a97eba8c8b510996bf5d77d1e3070d59caa6d Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Tue, 15 Jul 2025 16:01:52 -0400 Subject: rcu: Refactor expedited handling check in rcu_read_unlock_special() Extract the complex expedited handling condition in rcu_read_unlock_special() into a separate function rcu_unlock_needs_exp_handling() with detailed comments explaining each condition. This improves code readability. No functional change intended. Reviewed-by: "Paul E. 
McKenney" Signed-off-by: Joel Fernandes Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tree_plugin.h | 83 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 74 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index b6f44871f774..c71d9d10780d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -652,6 +652,75 @@ static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp) local_irq_restore(flags); } +/* + * Check if expedited grace period processing during unlock is needed. + * + * This function determines whether expedited handling is required based on: + * 1. Task blocking an expedited grace period (based on a heuristic, could be + * false-positive, see below.) + * 2. CPU participating in an expedited grace period + * 3. Strict grace period mode requiring expedited handling + * 4. RCU priority deboosting needs when interrupts were disabled + * + * @t: The task being checked + * @rdp: The per-CPU RCU data + * @rnp: The RCU node for this CPU + * @irqs_were_disabled: Whether interrupts were disabled before rcu_read_unlock() + * + * Returns true if expedited processing of the rcu_read_unlock() is needed. + */ +static bool rcu_unlock_needs_exp_handling(struct task_struct *t, + struct rcu_data *rdp, + struct rcu_node *rnp, + bool irqs_were_disabled) +{ + /* + * Check if this task is blocking an expedited grace period. If the + * task was preempted within an RCU read-side critical section and is + * on the expedited grace period blockers list (exp_tasks), we need + * expedited handling to unblock the expedited GP. This is not an exact + * check because 't' might not be on the exp_tasks list at all - its + * just a fast heuristic that can be false-positive sometimes. + */ + if (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) + return true; + + /* + * Check if this CPU is participating in an expedited grace period. + * The expmask bitmap tracks which CPUs need to check in for the + * current expedited GP. If our CPU's bit is set, we need expedited + * handling to help complete the expedited GP. + */ + if (rdp->grpmask & READ_ONCE(rnp->expmask)) + return true; + + /* + * In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, all grace periods + * are treated as short for testing purposes even if that means + * disturbing the system more. Check if either: + * - This CPU has not yet reported a quiescent state, or + * - This task was preempted within an RCU critical section + * In either case, require expedited handling for strict GP mode. + */ + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && + ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node)) + return true; + + /* + * RCU priority boosting case: If a task is subject to RCU priority + * boosting and exits an RCU read-side critical section with interrupts + * disabled, we need expedited handling to ensure timely deboosting. + * Without this, a low-priority task could incorrectly run at high + * real-time priority for an extended period degrading real-time + * responsiveness. This applies to all CONFIG_RCU_BOOST=y kernels, + * not just to PREEMPT_RT. 
+ */ + if (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled && t->rcu_blocked_node) + return true; + + return false; +} + /* * Handle special cases during rcu_read_unlock(), such as needing to * notify RCU core processing or task having blocked during the RCU @@ -671,18 +740,14 @@ static void rcu_read_unlock_special(struct task_struct *t) local_irq_save(flags); irqs_were_disabled = irqs_disabled_flags(flags); if (preempt_bh_were_disabled || irqs_were_disabled) { - bool expboost; // Expedited GP in flight or possible boosting. + bool needs_exp; // Expedited handling needed. struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; - expboost = (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) || - (rdp->grpmask & READ_ONCE(rnp->expmask)) || - (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && - ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node)) || - (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled && - t->rcu_blocked_node); + needs_exp = rcu_unlock_needs_exp_handling(t, rdp, rnp, irqs_were_disabled); + // Need to defer quiescent state until everything is enabled. - if (use_softirq && (in_hardirq() || (expboost && !irqs_were_disabled))) { + if (use_softirq && (in_hardirq() || (needs_exp && !irqs_were_disabled))) { // Using softirq, safe to awaken, and either the // wakeup is free or there is either an expedited // GP in flight or a potential need to deboost. @@ -695,7 +760,7 @@ static void rcu_read_unlock_special(struct task_struct *t) set_tsk_need_resched(current); set_preempt_need_resched(); if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled && - expboost && rdp->defer_qs_iw_pending != DEFER_QS_PENDING && + needs_exp && rdp->defer_qs_iw_pending != DEFER_QS_PENDING && cpu_online(rdp->cpu)) { // Get scheduler to re-evaluate and call hooks. // If !IRQ_WORK, FQS scan will eventually IPI. -- cgit v1.2.3 From 1ed171a3afe81531b3ace96bd151a372dda3ee25 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 15 Jul 2025 20:19:44 -0700 Subject: tracing/probes: Avoid using params uninitialized in parse_btf_arg() After a recent change in clang to strengthen uninitialized warnings [1], it points out that in one of the error paths in parse_btf_arg(), params is used uninitialized: kernel/trace/trace_probe.c:660:19: warning: variable 'params' is uninitialized when used here [-Wuninitialized] 660 | return PTR_ERR(params); | ^~~~~~ Match many other NO_BTF_ENTRY error cases and return -ENOENT, clearing up the warning. 
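In reduced form, the flagged pattern looks like the sketch below. The
names (parse_arg, ctx, param) are hypothetical, not the kernel code
itself; it only illustrates returning a fixed errno rather than
PTR_ERR() of a pointer that was never assigned on the error path:

  #include <errno.h>

  struct param { int id; };
  struct ctx { int nr_params; const struct param *params; };

  long parse_arg(struct ctx *c)
  {
          const struct param *params;	/* declared, not yet assigned */

          if (c->nr_params == 0)
                  return -ENOENT;	/* was: return PTR_ERR(params),
                                         * which read 'params' uninitialized */
          params = c->params;		/* first assignment */
          return params[0].id;		/* normal path uses the real pointer */
  }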
Link: https://lore.kernel.org/all/20250715-trace_probe-fix-const-uninit-warning-v1-1-98960f91dd04@kernel.org/
Cc: stable@vger.kernel.org
Closes: https://github.com/ClangBuiltLinux/linux/issues/2110
Fixes: d157d7694460 ("tracing/probes: Support BTF field access from $retval")
Link: https://github.com/llvm/llvm-project/commit/2464313eef01c5b1edf0eccf57a32cdee01472c7 [1]
Signed-off-by: Nathan Chancellor
Signed-off-by: Masami Hiramatsu (Google)
---
 kernel/trace/trace_probe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 424751cdf31f..40830a3ecd96 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -657,7 +657,7 @@ static int parse_btf_arg(char *varname,
 		ret = query_btf_context(ctx);
 		if (ret < 0 || ctx->nr_params == 0) {
 			trace_probe_log_err(ctx->offset, NO_BTF_ENTRY);
-			return PTR_ERR(params);
+			return -ENOENT;
 		}
 	}
 	params = ctx->params;
--
cgit v1.2.3

From e14fd98c6d66cb76694b12c05768e4f9e8c95664 Mon Sep 17 00:00:00 2001
From: Breno Leitao
Date: Wed, 16 Jul 2025 10:38:48 -0700
Subject: sched/ext: Prevent update_locked_rq() calls with NULL rq

Avoid invoking update_locked_rq() when the runqueue (rq) pointer is NULL
in the SCX_CALL_OP and SCX_CALL_OP_RET macros.

Previously, calling update_locked_rq(NULL) with preemption enabled could
trigger the following warning:

  BUG: using __this_cpu_write() in preemptible [00000000]

This happens because __this_cpu_write() is unsafe to use in preemptible
context.

rq is NULL when an ops is invoked from an unlocked context. In such
cases, we don't need to store any rq, since the value should already be
NULL (unlocked). Ensure that update_locked_rq() is only called when rq
is non-NULL, preventing __this_cpu_write() from being called in
preemptible context.

Suggested-by: Peter Zijlstra
Fixes: 18853ba782bef ("sched_ext: Track currently locked rq")
Signed-off-by: Breno Leitao
Acked-by: Andrea Righi
Signed-off-by: Tejun Heo
Cc: stable@vger.kernel.org # v6.15
---
 kernel/sched/ext.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index b498d867ba21..7dd5cbcb7a06 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1272,7 +1272,8 @@ static inline struct rq *scx_locked_rq(void)
 
 #define SCX_CALL_OP(sch, mask, op, rq, args...) \
 do { \
-	update_locked_rq(rq); \
+	if (rq) \
+		update_locked_rq(rq); \
 	if (mask) { \
 		scx_kf_allow(mask); \
 		(sch)->ops.op(args); \
@@ -1280,14 +1281,16 @@ do { \
 	} else { \
 		(sch)->ops.op(args); \
 	} \
-	update_locked_rq(NULL); \
+	if (rq) \
+		update_locked_rq(NULL); \
 } while (0)
 
 #define SCX_CALL_OP_RET(sch, mask, op, rq, args...) \
 ({ \
 	__typeof__((sch)->ops.op(args)) __ret; \
 \
-	update_locked_rq(rq); \
+	if (rq) \
+		update_locked_rq(rq); \
 	if (mask) { \
 		scx_kf_allow(mask); \
 		__ret = (sch)->ops.op(args); \
@@ -1295,7 +1298,8 @@ do { \
 	} else { \
 		__ret = (sch)->ops.op(args); \
 	} \
-	update_locked_rq(NULL); \
+	if (rq) \
+		update_locked_rq(NULL); \
 	__ret; \
 })
--
cgit v1.2.3

From 1f489662fba823c5063f99cd875516829be0c276 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich
Date: Thu, 10 Jul 2025 12:08:50 +0200
Subject: bpf: Update iterators.lskel-big-endian.h

The last iterators update (commit 515ee52b2224 ("bpf: make preloaded map
iterators to display map elements count")) missed the big-endian
skeleton. Update it by running "make big" with Debian clang version
21.0.0 (++20250706105601+01c97b4953e8-1~exp1~20250706225612.1558).
Signed-off-by: Ilya Leoshkevich Link: https://lore.kernel.org/r/20250710100907.45880-1-iii@linux.ibm.com Signed-off-by: Alexei Starovoitov --- .../preload/iterators/iterators.lskel-big-endian.h | 492 +++++++++++---------- 1 file changed, 255 insertions(+), 237 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h b/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h index ebdc6c0cdb70..49b1d515a847 100644 --- a/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h +++ b/kernel/bpf/preload/iterators/iterators.lskel-big-endian.h @@ -89,10 +89,7 @@ iterators_bpf__load(struct iterators_bpf *skel) { struct bpf_load_and_run_opts opts = {}; int err; - - opts.ctx = (struct bpf_loader_ctx *)skel; - opts.data_sz = 6008; - opts.data = (void *)"\ + static const char opts_data[] __attribute__((__aligned__(8))) = "\ \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ @@ -126,190 +123,196 @@ iterators_bpf__load(struct iterators_bpf *skel) \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xeb\x9f\x01\0\ -\0\0\0\x18\0\0\0\0\0\0\x04\x1c\0\0\x04\x1c\0\0\x05\x18\0\0\0\0\x02\0\0\0\0\0\0\ +\0\0\0\x18\0\0\0\0\0\0\x04\x80\0\0\x04\x80\0\0\x05\x44\0\0\0\0\x02\0\0\0\0\0\0\ \x02\0\0\0\x01\x04\0\0\x02\0\0\0\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\0\0\ \0\x04\0\0\0\x40\0\0\0\0\x02\0\0\0\0\0\0\x08\0\0\0\0\x02\0\0\0\0\0\0\x0d\0\0\0\ \0\x0d\0\0\x01\0\0\0\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\x01\0\0\0\0\0\0\x04\x01\ -\0\0\x20\0\0\0\x24\x0c\0\0\x01\0\0\0\x05\0\0\0\xc2\x04\0\0\x03\0\0\0\x18\0\0\0\ -\xd0\0\0\0\x09\0\0\0\0\0\0\0\xd4\0\0\0\x0b\0\0\0\x40\0\0\0\xdf\0\0\0\x0b\0\0\0\ -\x80\0\0\0\0\x02\0\0\0\0\0\0\x0a\0\0\0\xe7\x07\0\0\0\0\0\0\0\0\0\0\xf0\x08\0\0\ -\0\0\0\0\x0c\0\0\0\xf6\x01\0\0\0\0\0\0\x08\0\0\0\x40\0\0\x01\xb3\x04\0\0\x03\0\ -\0\0\x18\0\0\x01\xbb\0\0\0\x0e\0\0\0\0\0\0\x01\xbe\0\0\0\x11\0\0\0\x20\0\0\x01\ -\xc3\0\0\0\x0e\0\0\0\xa0\0\0\x01\xcf\x08\0\0\0\0\0\0\x0f\0\0\x01\xd5\x01\0\0\0\ -\0\0\0\x04\0\0\0\x20\0\0\x01\xe2\x01\0\0\0\0\0\0\x01\x01\0\0\x08\0\0\0\0\x03\0\ -\0\0\0\0\0\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\x01\xe7\x01\0\0\0\0\0\0\x04\0\0\ -\0\x20\0\0\0\0\x02\0\0\0\0\0\0\x14\0\0\x02\x4b\x04\0\0\x02\0\0\0\x10\0\0\0\x13\ -\0\0\0\x03\0\0\0\0\0\0\x02\x5e\0\0\0\x15\0\0\0\x40\0\0\0\0\x02\0\0\0\0\0\0\x18\ -\0\0\0\0\x0d\0\0\x01\0\0\0\x06\0\0\0\x1c\0\0\0\x13\0\0\x02\x63\x0c\0\0\x01\0\0\ -\0\x16\0\0\x02\xaf\x04\0\0\x01\0\0\0\x08\0\0\x02\xb8\0\0\0\x19\0\0\0\0\0\0\0\0\ -\x02\0\0\0\0\0\0\x1a\0\0\x03\x09\x04\0\0\x06\0\0\0\x38\0\0\x01\xbb\0\0\0\x0e\0\ -\0\0\0\0\0\x01\xbe\0\0\0\x11\0\0\0\x20\0\0\x03\x16\0\0\0\x1b\0\0\0\xc0\0\0\x03\ -\x27\0\0\0\x15\0\0\x01\0\0\0\x03\x30\0\0\0\x1d\0\0\x01\x40\0\0\x03\x3a\0\0\0\ -\x1e\0\0\x01\x80\0\0\0\0\x02\0\0\0\0\0\0\x1c\0\0\0\0\x0a\0\0\0\0\0\0\x10\0\0\0\ -\0\x02\0\0\0\0\0\0\x1f\0\0\0\0\x02\0\0\0\0\0\0\x20\0\0\x03\x84\x04\0\0\x02\0\0\ -\0\x08\0\0\x03\x92\0\0\0\x0e\0\0\0\0\0\0\x03\x9b\0\0\0\x0e\0\0\0\x20\0\0\x03\ -\x3a\x04\0\0\x03\0\0\0\x18\0\0\x03\xa5\0\0\0\x1b\0\0\0\0\0\0\x03\xad\0\0\0\x21\ -\0\0\0\x40\0\0\x03\xb3\0\0\0\x23\0\0\0\x80\0\0\0\0\x02\0\0\0\0\0\0\x22\0\0\0\0\ -\x02\0\0\0\0\0\0\x24\0\0\x03\xb7\x04\0\0\x01\0\0\0\x04\0\0\x03\xc2\0\0\0\x0e\0\ 
-\0\0\0\0\0\x04\x2b\x04\0\0\x01\0\0\0\x04\0\0\x04\x34\0\0\0\x0e\0\0\0\0\0\0\0\0\ -\x03\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\x04\xaa\x0e\0\0\0\0\0\0\ -\x25\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\x04\ -\xbe\x0e\0\0\0\0\0\0\x27\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\x12\ -\0\0\0\x20\0\0\x04\xd4\x0e\0\0\0\0\0\0\x29\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\ -\0\0\x1c\0\0\0\x12\0\0\0\x11\0\0\x04\xe9\x0e\0\0\0\0\0\0\x2b\0\0\0\0\0\0\0\0\ -\x03\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\x05\0\x0e\0\0\0\0\0\0\x2d\ -\0\0\0\x01\0\0\x05\x08\x0f\0\0\x04\0\0\0\x62\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\0\ -\x28\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\0\ -\0\x11\0\0\x05\x10\x0f\0\0\x01\0\0\0\x04\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\x62\x70\ -\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\x74\x61\ -\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\ -\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x30\x3a\ -\x30\0\x2f\x68\x6f\x6d\x65\x2f\x69\x69\x69\x2f\x6c\x69\x6e\x75\x78\x2d\x6b\x65\ -\x72\x6e\x65\x6c\x2d\x74\x6f\x6f\x6c\x63\x68\x61\x69\x6e\x2f\x73\x72\x63\x2f\ -\x6c\x69\x6e\x75\x78\x2f\x6b\x65\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\x70\x72\ -\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\x74\x65\ -\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\x63\x74\ -\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\x63\x74\ -\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\x69\x74\ -\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\x6e\x5f\ -\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\ -\x5f\x75\x36\x34\0\x75\x6e\x73\x69\x67\x6e\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\ -\x6f\x6e\x67\0\x30\x3a\x31\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\ -\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\ -\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\x29\0\x30\x3a\x32\0\x09\x5f\x5f\x75\ -\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\ -\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\x3b\0\x09\x69\x66\x20\x28\x73\ -\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\x29\0\x09\x09\x42\x50\x46\x5f\x53\ -\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\ -\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\ -\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\ -\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\x6d\x61\x78\x5f\x65\x6e\x74\x72\ -\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\x73\x69\x67\x6e\x65\x64\x20\x69\ -\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\x52\x41\x59\x5f\x53\x49\x5a\x45\ -\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\ -\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\ -\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\x2d\x3e\x69\x64\x2c\x20\x6d\x61\ -\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\x70\x2d\x3e\x6d\x61\x78\x5f\x65\ -\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\ -\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\x6f\x67\0\x64\x75\x6d\x70\x5f\ -\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\ -\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x70\x72\x6f\x67\x20\ -\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\ 
-\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\x62\x70\x66\x5f\x70\x72\x6f\x67\0\ -\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\ -\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\ -\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\ -\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\ -\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\x5f\x61\x75\x78\0\x61\x74\x74\x61\ -\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\x65\0\x64\x73\x74\x5f\x70\x72\x6f\ -\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x62\x74\x66\0\x09\x42\x50\x46\x5f\ -\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\ -\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x5c\x6e\x22\x2c\x20\x61\ -\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\x30\x3a\x35\0\x09\x69\x66\x20\x28\ -\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\ -\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\ -\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\ -\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\x65\x6e\0\x09\x74\x79\x70\x65\x73\ -\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\x65\x73\x3b\0\x09\x62\x70\x66\x5f\ -\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\x6b\x65\x72\x6e\x65\x6c\x28\x26\ -\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\x29\x2c\x20\x74\x79\x70\x65\x73\ -\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\x09\x73\x74\x72\x20\x3d\x20\x62\ -\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\x3b\0\x62\x74\x66\x5f\x74\x79\x70\ -\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\x6e\x61\x6d\x65\x5f\x6f\x66\x66\ -\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\x5f\x52\x45\x41\x44\x28\x74\x2c\ -\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\x30\x3a\x32\x3a\x30\0\x09\x69\ -\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3e\x3d\x20\x62\x74\x66\x2d\ -\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\x29\0\x09\x72\x65\x74\x75\x72\ -\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x3b\0\x30\x3a\ -\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\ -\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\ -\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\ -\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\ -\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\x4e\x53\x45\0\x2e\x72\x6f\x64\ -\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\x09\x4c\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\0\x04\0\0\0\x62\0\0\0\ -\x01\0\0\0\x80\0\0\0\0\0\0\0\0\x69\x74\x65\x72\x61\x74\x6f\x72\x2e\x72\x6f\x64\ -\x61\x74\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x2f\0\0\0\0\0\0\0\0\0\0\0\0\x20\ -\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\ -\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\ -\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\ +\0\0\x20\0\0\0\x24\x0c\0\0\x01\0\0\0\x05\0\0\0\xc3\x04\0\0\x03\0\0\0\x18\0\0\0\ +\xd1\0\0\0\x09\0\0\0\0\0\0\0\xd5\0\0\0\x0b\0\0\0\x40\0\0\0\xe0\0\0\0\x0b\0\0\0\ +\x80\0\0\0\0\x02\0\0\0\0\0\0\x0a\0\0\0\xe8\x07\0\0\0\0\0\0\0\0\0\0\xf1\x08\0\0\ +\0\0\0\0\x0c\0\0\0\xf7\x01\0\0\0\0\0\0\x08\0\0\0\x40\0\0\x01\xc1\x04\0\0\x03\0\ +\0\0\x18\0\0\x01\xc9\0\0\0\x0e\0\0\0\0\0\0\x01\xcc\0\0\0\x11\0\0\0\x20\0\0\x01\ +\xd1\0\0\0\x0e\0\0\0\xa0\0\0\x01\xdd\x08\0\0\0\0\0\0\x0f\0\0\x01\xe3\x01\0\0\0\ 
+\0\0\0\x04\0\0\0\x20\0\0\x01\xf0\x01\0\0\0\0\0\0\x01\x01\0\0\x08\0\0\0\0\x03\0\ +\0\0\0\0\0\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\x01\xf5\x01\0\0\0\0\0\0\x04\0\0\ +\0\x20\0\0\0\0\x0d\0\0\x01\0\0\0\x14\0\0\x05\x39\0\0\0\x04\0\0\x02\x3e\x08\0\0\ +\0\0\0\0\x15\0\0\x02\x44\x01\0\0\0\0\0\0\x08\x01\0\0\x40\0\0\x02\x4e\x0c\0\0\ +\x01\0\0\0\x13\0\0\0\0\x02\0\0\0\0\0\0\x18\0\0\x02\x65\x04\0\0\x02\0\0\0\x10\0\ +\0\0\x13\0\0\0\x03\0\0\0\0\0\0\x02\x78\0\0\0\x19\0\0\0\x40\0\0\0\0\x02\0\0\0\0\ +\0\0\x1c\0\0\0\0\x0d\0\0\x01\0\0\0\x06\0\0\0\x1c\0\0\0\x17\0\0\x02\x7d\x0c\0\0\ +\x01\0\0\0\x1a\0\0\x02\xc9\x04\0\0\x01\0\0\0\x08\0\0\x02\xd2\0\0\0\x1d\0\0\0\0\ +\0\0\0\0\x02\0\0\0\0\0\0\x1e\0\0\x03\x23\x04\0\0\x06\0\0\0\x38\0\0\x01\xc9\0\0\ +\0\x0e\0\0\0\0\0\0\x01\xcc\0\0\0\x11\0\0\0\x20\0\0\x03\x30\0\0\0\x1f\0\0\0\xc0\ +\0\0\x03\x41\0\0\0\x19\0\0\x01\0\0\0\x03\x4a\0\0\0\x21\0\0\x01\x40\0\0\x03\x54\ +\0\0\0\x22\0\0\x01\x80\0\0\0\0\x02\0\0\0\0\0\0\x20\0\0\0\0\x0a\0\0\0\0\0\0\x10\ +\0\0\0\0\x02\0\0\0\0\0\0\x23\0\0\0\0\x02\0\0\0\0\0\0\x24\0\0\x03\x9e\x04\0\0\ +\x02\0\0\0\x08\0\0\x03\xac\0\0\0\x0e\0\0\0\0\0\0\x03\xb5\0\0\0\x0e\0\0\0\x20\0\ +\0\x03\x54\x04\0\0\x03\0\0\0\x18\0\0\x03\xbf\0\0\0\x1f\0\0\0\0\0\0\x03\xc7\0\0\ +\0\x25\0\0\0\x40\0\0\x03\xcd\0\0\0\x27\0\0\0\x80\0\0\0\0\x02\0\0\0\0\0\0\x26\0\ +\0\0\0\x02\0\0\0\0\0\0\x28\0\0\x03\xd1\x04\0\0\x01\0\0\0\x04\0\0\x03\xdc\0\0\0\ +\x0e\0\0\0\0\0\0\x04\x45\x04\0\0\x01\0\0\0\x04\0\0\x04\x4e\0\0\0\x0e\0\0\0\0\0\ +\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x20\0\0\0\x12\0\0\0\x30\0\0\x04\xc4\x0e\0\0\0\0\ +\0\0\x29\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x20\0\0\0\x12\0\0\0\x1a\0\0\ +\x04\xd8\x0e\0\0\0\0\0\0\x2b\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\x20\0\0\0\ +\x12\0\0\0\x20\0\0\x04\xee\x0e\0\0\0\0\0\0\x2d\0\0\0\0\0\0\0\0\x03\0\0\0\0\0\0\ +\0\0\0\0\x20\0\0\0\x12\0\0\0\x11\0\0\x05\x03\x0e\0\0\0\0\0\0\x2f\0\0\0\0\0\0\0\ +\0\x03\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\x05\x1a\x0e\0\0\0\0\0\0\ +\x31\0\0\0\x01\0\0\x05\x22\x0f\0\0\x01\0\0\0\x04\0\0\0\x36\0\0\0\0\0\0\0\x04\0\ +\0\x05\x29\x0f\0\0\x04\0\0\0\x7b\0\0\0\x2a\0\0\0\0\0\0\0\x30\0\0\0\x2c\0\0\0\ +\x30\0\0\0\x1a\0\0\0\x2e\0\0\0\x4a\0\0\0\x20\0\0\0\x30\0\0\0\x6a\0\0\0\x11\0\0\ +\x05\x31\x0f\0\0\x01\0\0\0\x04\0\0\0\x32\0\0\0\0\0\0\0\x04\0\0\x05\x39\x0e\0\0\ +\0\0\0\0\x06\0\0\0\x01\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\ +\x5f\x6d\x61\x70\0\x6d\x65\x74\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\ +\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\ +\x66\x5f\x6d\x61\x70\0\x30\x3a\x30\0\x2f\x68\x6f\x6d\x65\x32\x2f\x69\x69\x69\ +\x2f\x6c\x69\x6e\x75\x78\x2d\x6b\x65\x72\x6e\x65\x6c\x2d\x74\x6f\x6f\x6c\x63\ +\x68\x61\x69\x6e\x2f\x73\x72\x63\x2f\x6c\x69\x6e\x75\x78\x2f\x6b\x65\x72\x6e\ +\x65\x6c\x2f\x62\x70\x66\x2f\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\ +\x61\x74\x6f\x72\x73\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\ +\x2e\x63\0\x09\x73\x74\x72\x75\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\ +\x2a\x73\x65\x71\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\ +\x65\x71\x3b\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\ +\x71\0\x73\x65\x73\x73\x69\x6f\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\ +\x73\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\x5f\x75\x36\x34\0\x75\x6e\x73\x69\x67\ +\x6e\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\0\x30\x3a\x31\0\x09\x73\ +\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\ +\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\ 
+\x70\x29\0\x30\x3a\x32\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\ +\x6d\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\ +\x6e\x75\x6d\x3b\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\ +\x20\x30\x29\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\ +\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\ +\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\ +\x65\x73\x20\x20\x63\x75\x72\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\x6e\x22\x29\ +\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\x6d\x61\x78\ +\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\x73\x69\x67\ +\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\x52\x41\x59\ +\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\x5f\x53\x45\ +\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\ +\x25\x2d\x31\x36\x73\x20\x20\x25\x31\x30\x64\x20\x20\x20\x25\x31\x30\x6c\x6c\ +\x64\x5c\x6e\x22\x2c\0\x7d\0\x5f\x5f\x73\x36\x34\0\x6c\x6f\x6e\x67\x20\x6c\x6f\ +\x6e\x67\0\x62\x70\x66\x5f\x6d\x61\x70\x5f\x73\x75\x6d\x5f\x65\x6c\x65\x6d\x5f\ +\x63\x6f\x75\x6e\x74\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\ +\x70\x72\x6f\x67\0\x70\x72\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\ +\x72\x6f\x67\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\ +\x74\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\ +\x67\x20\x3d\x20\x63\x74\x78\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\ +\x21\x70\x72\x6f\x67\x29\0\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\ +\x61\x75\x78\x20\x3d\x20\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\ +\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\ +\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\ +\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\ +\x66\x5f\x70\x72\x6f\x67\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\ +\x6e\x63\x5f\x6e\x61\x6d\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\ +\x63\x5f\x69\x6e\x66\x6f\0\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\ +\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\ +\x31\x36\x73\x20\x25\x73\x20\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\ +\x69\x64\x2c\0\x30\x3a\x34\0\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\ +\x29\0\x62\x70\x66\x5f\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\ +\x5f\x6f\x66\x66\0\x74\x79\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\ +\x73\0\x74\x79\x70\x65\x73\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\ +\x72\0\x73\x74\x72\x5f\x6c\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\ +\x74\x66\x2d\x3e\x74\x79\x70\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\ +\x65\x5f\x72\x65\x61\x64\x5f\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\ +\x69\x7a\x65\x6f\x66\x28\x74\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\ +\x74\x66\x5f\x69\x64\x29\x3b\0\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\ +\x73\x74\x72\x69\x6e\x67\x73\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\ +\x6d\x65\x5f\x6f\x66\x66\0\x09\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\ +\x50\x46\x5f\x43\x4f\x52\x45\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\ +\x65\x5f\x6f\x66\x66\x29\x3b\0\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\ +\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\ 
+\x2e\x73\x74\x72\x5f\x6c\x65\x6e\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\ +\x72\x20\x2b\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\ +\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\ +\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\ +\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\ +\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\ +\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\x4e\x53\x45\0\x2e\x6b\x73\x79\x6d\x73\0\x2e\ +\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\x64\x75\x6d\x6d\x79\ +\x5f\x6b\x73\x79\x6d\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x09\xdc\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\0\x04\0\0\0\x7b\0\0\0\x01\0\0\0\x80\0\0\0\0\ +\0\0\0\0\x69\x74\x65\x72\x61\x74\x6f\x72\x2e\x72\x6f\x64\x61\x74\x61\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\x34\0\0\0\0\0\0\0\0\0\0\0\0\x20\x20\x69\x64\x20\x6e\x61\ +\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\ +\x65\x6e\x74\x72\x69\x65\x73\x20\x20\x63\x75\x72\x5f\x65\x6e\x74\x72\x69\x65\ +\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x20\x25\x31\x30\x64\x20\x20\ +\x20\x25\x31\x30\x6c\x6c\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\ \x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\ \x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0\0\0\0\ \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x21\0\0\0\0\0\0\x79\x62\0\0\ -\0\0\0\0\x79\x71\0\x08\0\0\0\0\x15\x70\0\x1a\0\0\0\0\x79\x12\0\x10\0\0\0\0\x55\ -\x10\0\x08\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\xff\xe8\xbf\x16\0\0\ -\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xb7\x30\0\0\0\0\0\x23\xb7\x50\0\0\ -\0\0\0\0\x85\0\0\0\0\0\0\x7e\x61\x17\0\0\0\0\0\0\x7b\xa1\xff\xe8\0\0\0\0\xb7\ -\x10\0\0\0\0\0\x04\xbf\x27\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\0\x7b\xa2\xff\xf0\0\0\ -\0\0\x61\x17\0\x14\0\0\0\0\x7b\xa1\xff\xf8\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\ -\0\0\xff\xff\xff\xe8\xbf\x16\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\x23\ -\xb7\x30\0\0\0\0\0\x0e\xb7\x50\0\0\0\0\0\x18\x85\0\0\0\0\0\0\x7e\xb7\0\0\0\0\0\ -\0\0\x95\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x9a\0\x01\x3c\ -\x1e\0\0\0\x01\0\0\0\x42\0\0\0\x9a\0\x01\x3c\x24\0\0\0\x02\0\0\0\x42\0\0\x01\ -\x0d\0\x01\x44\x1d\0\0\0\x03\0\0\0\x42\0\0\x01\x2e\0\x01\x4c\x06\0\0\0\x04\0\0\ -\0\x42\0\0\x01\x3d\0\x01\x40\x1d\0\0\0\x05\0\0\0\x42\0\0\x01\x62\0\x01\x58\x06\ -\0\0\0\x07\0\0\0\x42\0\0\x01\x75\0\x01\x5c\x03\0\0\0\x0e\0\0\0\x42\0\0\x01\xfb\ -\0\x01\x64\x02\0\0\0\x1e\0\0\0\x42\0\0\x02\x49\0\x01\x6c\x01\0\0\0\0\0\0\0\x02\ -\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x02\0\ -\0\x01\x09\0\0\0\0\0\0\0\x20\0\0\0\x08\0\0\x01\x39\0\0\0\0\0\0\0\x70\0\0\0\x0d\ -\0\0\0\x3e\0\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\x01\x09\0\0\0\0\0\0\0\xa0\0\0\0\x0d\ -\0\0\x01\x39\0\0\0\0\0\0\0\x1a\0\0\0\x20\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\ -\x6d\x61\x70\0\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\ -\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x09\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\ -\x62\x70\x66\x5f\x6d\x61\x70\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x21\0\0\ -\0\0\0\0\x79\x62\0\0\0\0\0\0\x79\x11\0\x08\0\0\0\0\x15\x10\0\x3b\0\0\0\0\x79\ 
-\x71\0\0\0\0\0\0\x79\x12\0\x10\0\0\0\0\x55\x10\0\x08\0\0\0\0\xbf\x4a\0\0\0\0\0\ -\0\x07\x40\0\0\xff\xff\xff\xd0\xbf\x16\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\x31\xb7\x30\0\0\0\0\0\x20\xb7\x50\0\0\0\0\0\0\x85\0\0\0\0\0\0\x7e\x7b\ -\xa6\xff\xc8\0\0\0\0\x61\x17\0\0\0\0\0\0\x7b\xa1\xff\xd0\0\0\0\0\xb7\x30\0\0\0\ -\0\0\x04\xbf\x97\0\0\0\0\0\0\x0f\x93\0\0\0\0\0\0\x79\x17\0\x28\0\0\0\0\x79\x87\ -\0\x30\0\0\0\0\x15\x80\0\x18\0\0\0\0\xb7\x20\0\0\0\0\0\0\x0f\x12\0\0\0\0\0\0\ -\x61\x11\0\x04\0\0\0\0\x79\x38\0\x08\0\0\0\0\x67\x10\0\0\0\0\0\x03\x0f\x31\0\0\ -\0\0\0\0\x79\x68\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\xf8\ -\xb7\x20\0\0\0\0\0\x08\x85\0\0\0\0\0\0\x71\xb7\x10\0\0\0\0\0\0\x79\x3a\xff\xf8\ +\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x21\0\0\0\0\0\0\x79\x62\0\0\0\ +\0\0\0\x79\x71\0\x08\0\0\0\0\x15\x70\0\x1d\0\0\0\0\x79\x12\0\x10\0\0\0\0\x55\ +\x10\0\x08\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\xff\xe0\xbf\x16\0\0\ +\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xb4\x30\0\0\0\0\0\x30\xb4\x50\0\0\ +\0\0\0\0\x85\0\0\0\0\0\0\x7e\x61\x17\0\0\0\0\0\0\x7b\xa1\xff\xe0\0\0\0\0\xb7\ +\x10\0\0\0\0\0\x04\xbf\x27\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\0\x7b\xa2\xff\xe8\0\0\ +\0\0\x61\x17\0\x14\0\0\0\0\x7b\xa1\xff\xf0\0\0\0\0\xbf\x17\0\0\0\0\0\0\x85\x02\ +\0\0\0\0\0\0\x7b\xa0\xff\xf8\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\ +\xff\xe0\xbf\x16\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\x30\xb4\x30\0\0\ +\0\0\0\x1a\xb4\x50\0\0\0\0\0\x20\x85\0\0\0\0\0\0\x7e\xb4\0\0\0\0\0\0\0\x95\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x9b\0\x01\x44\x1e\0\0\0\ +\x01\0\0\0\x42\0\0\0\x9b\0\x01\x44\x24\0\0\0\x02\0\0\0\x42\0\0\x01\x0e\0\x01\ +\x4c\x1d\0\0\0\x03\0\0\0\x42\0\0\x01\x2f\0\x01\x54\x06\0\0\0\x04\0\0\0\x42\0\0\ +\x01\x3e\0\x01\x48\x1d\0\0\0\x05\0\0\0\x42\0\0\x01\x63\0\x01\x60\x0e\0\0\0\x08\ +\0\0\0\x42\0\0\x01\x76\0\x01\x64\x03\0\0\0\x0e\0\0\0\x42\0\0\x02\x09\0\x01\x6c\ +\x02\0\0\0\x21\0\0\0\x42\0\0\x02\x3c\0\x01\x80\x01\0\0\0\0\0\0\0\x02\0\0\0\x3e\ +\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x02\0\0\x01\x0a\ +\0\0\0\0\0\0\0\x20\0\0\0\x08\0\0\x01\x3a\0\0\0\0\0\0\0\x70\0\0\0\x0d\0\0\0\x3e\ +\0\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\x01\x0a\0\0\0\0\0\0\0\xa0\0\0\0\x0d\0\0\x01\ +\x3a\0\0\0\0\0\0\0\x1a\0\0\0\x23\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\ +\x70\0\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\ +\0\x10\0\0\0\0\0\0\0\0\0\0\0\x09\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\ +\x66\x5f\x6d\x61\x70\0\0\0\0\0\0\0\0\x62\x70\x66\x5f\x6d\x61\x70\x5f\x73\x75\ +\x6d\x5f\x65\x6c\x65\x6d\x5f\x63\x6f\x75\x6e\x74\0\0\x47\x50\x4c\0\0\0\0\0\x79\ +\x21\0\0\0\0\0\0\x79\x62\0\0\0\0\0\0\x79\x11\0\x08\0\0\0\0\x15\x10\0\x3b\0\0\0\ +\0\x79\x71\0\0\0\0\0\0\x79\x12\0\x10\0\0\0\0\x55\x10\0\x08\0\0\0\0\xbf\x4a\0\0\ +\0\0\0\0\x07\x40\0\0\xff\xff\xff\xd0\xbf\x16\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\x4a\xb4\x30\0\0\0\0\0\x20\xb4\x50\0\0\0\0\0\0\x85\0\0\0\0\0\0\x7e\ +\x7b\xa6\xff\xc8\0\0\0\0\x61\x17\0\0\0\0\0\0\x7b\xa1\xff\xd0\0\0\0\0\xb7\x30\0\ +\0\0\0\0\x04\xbf\x97\0\0\0\0\0\0\x0f\x93\0\0\0\0\0\0\x79\x17\0\x28\0\0\0\0\x79\ +\x87\0\x30\0\0\0\0\x15\x80\0\x18\0\0\0\0\xb7\x20\0\0\0\0\0\0\x0f\x12\0\0\0\0\0\ +\0\x61\x11\0\x04\0\0\0\0\x79\x38\0\x08\0\0\0\0\x67\x10\0\0\0\0\0\x03\x0f\x31\0\ +\0\0\0\0\0\x79\x68\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\xf8\ 
+\xb4\x20\0\0\0\0\0\x08\x85\0\0\0\0\0\0\x71\xb7\x10\0\0\0\0\0\0\x79\x3a\xff\xf8\ \0\0\0\0\x0f\x31\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\xf4\ -\xb7\x20\0\0\0\0\0\x04\x85\0\0\0\0\0\0\x71\xb7\x30\0\0\0\0\0\x04\x61\x1a\xff\ -\xf4\0\0\0\0\x61\x28\0\x10\0\0\0\0\x3d\x12\0\x02\0\0\0\0\x0f\x61\0\0\0\0\0\0\ +\xb4\x20\0\0\0\0\0\x04\x85\0\0\0\0\0\0\x71\xb7\x30\0\0\0\0\0\x04\x61\x1a\xff\ +\xf4\0\0\0\0\x61\x28\0\x10\0\0\0\0\x3e\x12\0\x02\0\0\0\0\x0f\x61\0\0\0\0\0\0\ \xbf\x96\0\0\0\0\0\0\x7b\xa9\xff\xd8\0\0\0\0\x79\x17\0\x18\0\0\0\0\x7b\xa1\xff\ \xe0\0\0\0\0\x79\x17\0\x20\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x13\0\0\0\0\0\0\x7b\ \xa1\xff\xe8\0\0\0\0\xbf\x4a\0\0\0\0\0\0\x07\x40\0\0\xff\xff\xff\xd0\x79\x1a\ -\xff\xc8\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\x51\xb7\x30\0\0\0\0\0\x11\ -\xb7\x50\0\0\0\0\0\x20\x85\0\0\0\0\0\0\x7e\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\x17\0\0\0\0\0\0\0\x42\0\0\0\x9a\0\x01\x80\x1e\0\0\0\x01\0\0\0\ -\x42\0\0\0\x9a\0\x01\x80\x24\0\0\0\x02\0\0\0\x42\0\0\x02\x7f\0\x01\x88\x1f\0\0\ -\0\x03\0\0\0\x42\0\0\x02\xa3\0\x01\x94\x06\0\0\0\x04\0\0\0\x42\0\0\x02\xbc\0\ -\x01\xa0\x0e\0\0\0\x05\0\0\0\x42\0\0\x01\x3d\0\x01\x84\x1d\0\0\0\x06\0\0\0\x42\ -\0\0\x01\x62\0\x01\xa4\x06\0\0\0\x08\0\0\0\x42\0\0\x02\xce\0\x01\xa8\x03\0\0\0\ -\x10\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x17\0\0\0\x42\0\0\x03\x79\0\x01\ -\x04\x06\0\0\0\x1a\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x1b\0\0\0\x42\0\0\ -\x03\xca\0\x01\x10\x0f\0\0\0\x1c\0\0\0\x42\0\0\x03\xdf\0\x01\x14\x2d\0\0\0\x1e\ -\0\0\0\x42\0\0\x04\x16\0\x01\x0c\x0d\0\0\0\x20\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\ -\x02\0\0\0\x21\0\0\0\x42\0\0\x03\xdf\0\x01\x14\x02\0\0\0\x24\0\0\0\x42\0\0\x04\ -\x3d\0\x01\x18\x0d\0\0\0\x27\0\0\0\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x28\0\0\ -\0\x42\0\0\x04\x3d\0\x01\x18\x0d\0\0\0\x2b\0\0\0\x42\0\0\x04\x3d\0\x01\x18\x0d\ -\0\0\0\x2c\0\0\0\x42\0\0\x04\x6b\0\x01\x1c\x1b\0\0\0\x2d\0\0\0\x42\0\0\x04\x6b\ -\0\x01\x1c\x06\0\0\0\x2e\0\0\0\x42\0\0\x04\x8e\0\x01\x24\x0d\0\0\0\x30\0\0\0\ -\x42\0\0\x03\x3e\0\x01\xb0\x02\0\0\0\x3f\0\0\0\x42\0\0\x02\x49\0\x01\xc0\x01\0\ -\0\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\ -\x10\0\0\0\x14\0\0\x01\x09\0\0\0\0\0\0\0\x20\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\ -\x28\0\0\0\x08\0\0\x01\x39\0\0\0\0\0\0\0\x80\0\0\0\x1a\0\0\0\x3e\0\0\0\0\0\0\0\ -\x90\0\0\0\x1a\0\0\x01\x09\0\0\0\0\0\0\0\xa8\0\0\0\x1a\0\0\x03\x71\0\0\0\0\0\0\ -\0\xb0\0\0\0\x1a\0\0\x03\x75\0\0\0\0\0\0\0\xc0\0\0\0\x1f\0\0\x03\xa3\0\0\0\0\0\ -\0\0\xd8\0\0\0\x20\0\0\x01\x09\0\0\0\0\0\0\0\xf0\0\0\0\x20\0\0\0\x3e\0\0\0\0\0\ -\0\x01\x18\0\0\0\x24\0\0\0\x3e\0\0\0\0\0\0\x01\x50\0\0\0\x1a\0\0\x01\x09\0\0\0\ -\0\0\0\x01\x60\0\0\0\x20\0\0\x04\x65\0\0\0\0\0\0\x01\x88\0\0\0\x1a\0\0\x01\x39\ -\0\0\0\0\0\0\x01\x98\0\0\0\x1a\0\0\x04\xa6\0\0\0\0\0\0\x01\xa0\0\0\0\x18\0\0\0\ -\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x41\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\ -\x6f\x67\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\ -\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x19\0\0\0\x01\0\0\0\0\0\0\0\x12\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\ -\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0"; - opts.insns_sz = 2216; - opts.insns = (void *)"\ +\xff\xc8\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\0\x6a\xb4\x30\0\0\0\0\0\x11\ +\xb4\x50\0\0\0\0\0\x20\x85\0\0\0\0\0\0\x7e\xb4\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\x1b\0\0\0\0\0\0\0\x42\0\0\0\x9b\0\x01\x94\x1e\0\0\0\x01\0\0\0\ 
+\x42\0\0\0\x9b\0\x01\x94\x24\0\0\0\x02\0\0\0\x42\0\0\x02\x99\0\x01\x9c\x1f\0\0\ +\0\x03\0\0\0\x42\0\0\x02\xbd\0\x01\xa8\x06\0\0\0\x04\0\0\0\x42\0\0\x02\xd6\0\ +\x01\xb4\x0e\0\0\0\x05\0\0\0\x42\0\0\x01\x3e\0\x01\x98\x1d\0\0\0\x06\0\0\0\x42\ +\0\0\x01\x63\0\x01\xb8\x0e\0\0\0\x09\0\0\0\x42\0\0\x02\xe8\0\x01\xbc\x03\0\0\0\ +\x10\0\0\0\x42\0\0\x03\x58\0\x01\xc4\x02\0\0\0\x17\0\0\0\x42\0\0\x03\x93\0\x01\ +\x04\x06\0\0\0\x1a\0\0\0\x42\0\0\x03\x58\0\x01\xc4\x02\0\0\0\x1b\0\0\0\x42\0\0\ +\x03\xe4\0\x01\x10\x0f\0\0\0\x1c\0\0\0\x42\0\0\x03\xf9\0\x01\x14\x2d\0\0\0\x1e\ +\0\0\0\x42\0\0\x04\x30\0\x01\x0c\x0d\0\0\0\x21\0\0\0\x42\0\0\x03\xf9\0\x01\x14\ +\x02\0\0\0\x24\0\0\0\x42\0\0\x04\x57\0\x01\x18\x0d\0\0\0\x2b\0\0\0\x42\0\0\x04\ +\x57\0\x01\x18\x0d\0\0\0\x2c\0\0\0\x42\0\0\x04\x85\0\x01\x1c\x1b\0\0\0\x2d\0\0\ +\0\x42\0\0\x04\x85\0\x01\x1c\x0f\0\0\0\x2e\0\0\0\x42\0\0\x04\xa8\0\x01\x24\x0d\ +\0\0\0\x30\0\0\0\x42\0\0\x03\x58\0\x01\xc4\x02\0\0\0\x3f\0\0\0\x42\0\0\x02\x3c\ +\0\x01\xd4\x01\0\0\0\0\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\ +\x3e\0\0\0\0\0\0\0\x10\0\0\0\x18\0\0\x01\x0a\0\0\0\0\0\0\0\x20\0\0\0\x1c\0\0\0\ +\x3e\0\0\0\0\0\0\0\x28\0\0\0\x08\0\0\x01\x3a\0\0\0\0\0\0\0\x80\0\0\0\x1e\0\0\0\ +\x3e\0\0\0\0\0\0\0\x90\0\0\0\x1e\0\0\x01\x0a\0\0\0\0\0\0\0\xa8\0\0\0\x1e\0\0\ +\x03\x8b\0\0\0\0\0\0\0\xb0\0\0\0\x1e\0\0\x03\x8f\0\0\0\0\0\0\0\xc0\0\0\0\x23\0\ +\0\x03\xbd\0\0\0\0\0\0\0\xd8\0\0\0\x24\0\0\x01\x0a\0\0\0\0\0\0\0\xf0\0\0\0\x24\ +\0\0\0\x3e\0\0\0\0\0\0\x01\x18\0\0\0\x28\0\0\0\x3e\0\0\0\0\0\0\x01\x50\0\0\0\ +\x1e\0\0\x01\x0a\0\0\0\0\0\0\x01\x60\0\0\0\x24\0\0\x04\x7f\0\0\0\0\0\0\x01\x88\ +\0\0\0\x1e\0\0\x01\x3a\0\0\0\0\0\0\x01\x98\0\0\0\x1e\0\0\x04\xc0\0\0\0\0\0\0\ +\x01\xa0\0\0\0\x1c\0\0\0\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x41\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\ +\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\ +\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x16\0\0\0\x01\0\0\0\0\0\ +\0\0\x12\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\x62\x70\x66\x5f\x69\ +\x74\x65\x72\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0"; + static const char opts_insn[] __attribute__((__aligned__(8))) = "\ \xbf\x61\0\0\0\0\0\0\xbf\x1a\0\0\0\0\0\0\x07\x10\0\0\xff\xff\xff\x78\xb7\x20\0\ \0\0\0\0\x88\xb7\x30\0\0\0\0\0\0\x85\0\0\0\0\0\0\x71\x05\0\0\x14\0\0\0\0\x61\ \x1a\xff\x78\0\0\0\0\xd5\x10\0\x01\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x1a\xff\x7c\ @@ -318,72 +321,87 @@ iterators_bpf__load(struct iterators_bpf *skel) \0\0\0\x85\0\0\0\0\0\0\xa8\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\ \0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xbf\x07\0\0\ \0\0\0\0\x95\0\0\0\0\0\0\0\x61\x06\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\ -\0\x0e\x68\x63\x10\0\0\0\0\0\0\x61\x06\0\x0c\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\ -\0\0\0\x0e\x64\x63\x10\0\0\0\0\0\0\x79\x06\0\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\ -\0\0\0\0\0\x0e\x58\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x05\0\ -\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0e\x50\x7b\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\ -\x12\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0e\x50\xb7\x30\0\0\0\0\0\x1c\x85\0\0\0\0\ +\0\x0e\xf8\x63\x10\0\0\0\0\0\0\x61\x06\0\x0c\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\ +\0\0\0\x0e\xf4\x63\x10\0\0\0\0\0\0\x79\x06\0\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\ +\0\0\0\0\0\x0e\xe8\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x05\0\ +\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0e\xe0\x7b\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\ 
+\x12\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0e\xe0\xb7\x30\0\0\0\0\0\x1c\x85\0\0\0\0\ \0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\xd4\0\0\0\0\x63\xa7\xff\x78\0\0\0\0\ -\x61\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0e\xa0\x63\x10\0\0\0\ +\x61\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x30\x63\x10\0\0\0\ \0\0\0\x61\x06\0\x1c\0\0\0\0\x15\0\0\x03\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\ -\0\x0e\x7c\x63\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\ -\0\0\x0e\x70\xb7\x30\0\0\0\0\0\x48\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\ +\0\x0f\x0c\x63\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\0\ +\0\0\x0f\0\xb7\x30\0\0\0\0\0\x48\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\ \x70\xff\xc3\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x63\x17\0\0\0\0\0\0\ \x79\x36\0\x20\0\0\0\0\x15\x30\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\ -\x0e\xb8\xb7\x20\0\0\0\0\0\x62\x61\x06\0\x04\0\0\0\0\x45\0\0\x02\0\0\0\x01\x85\ +\x0f\x48\xb7\x20\0\0\0\0\0\x7b\x61\x06\0\x04\0\0\0\0\x45\0\0\x02\0\0\0\x01\x85\ \0\0\0\0\0\0\x94\x05\0\0\x01\0\0\0\0\x85\0\0\0\0\0\0\x71\x18\x26\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\x61\x02\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x28\x63\ -\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x20\x18\x16\0\0\0\0\0\0\0\ -\0\0\0\0\0\x0f\x30\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0e\xb8\ -\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x38\x7b\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\ -\x02\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x28\xb7\x30\0\0\0\0\0\x20\x85\0\0\0\0\ +\0\0\0\0\0\0\0\x61\x02\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xd0\x63\ +\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xc8\x18\x16\0\0\0\0\0\0\0\ +\0\0\0\0\0\x0f\xd8\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x48\ +\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xe0\x7b\x10\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\ +\x02\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xd0\xb7\x30\0\0\0\0\0\x20\x85\0\0\0\0\ \0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\x9f\0\0\0\0\x18\x26\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\x61\x02\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x48\x63\x10\ -\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\x16\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x48\xb7\ +\0\0\0\0\0\x61\x02\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xf0\x63\x10\ +\0\0\0\0\0\0\xb7\x10\0\0\0\0\0\x16\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xf0\xb7\ \x30\0\0\0\0\0\x04\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\x92\0\0\ -\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x50\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\ -\x11\x70\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\x58\x18\x16\0\ -\0\0\0\0\0\0\0\0\0\0\0\x11\x68\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\ -\0\0\x10\x58\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xb0\x7b\x10\0\0\0\0\0\0\x18\ -\x06\0\0\0\0\0\0\0\0\0\0\0\0\x10\x60\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xc0\ -\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x10\xf0\x18\x16\0\0\0\0\0\ -\0\0\0\0\0\0\0\x11\xe0\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xd8\x7b\x10\0\0\0\0\0\0\x61\x06\0\x08\0\0\ -\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\x78\x63\x10\0\0\0\0\0\0\x61\x06\0\x0c\ -\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\x7c\x63\x10\0\0\0\0\0\0\x79\x06\0\ -\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\x80\x7b\x10\0\0\0\0\0\0\x61\ -\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xa8\x63\x10\0\0\0\0\0\ -\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x11\xf0\xb7\x20\0\0\0\0\0\x11\xb7\x30\0\0\0\ -\0\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\0\0\0\0\0\0\xc5\x70\ 
-\xff\x5c\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x11\x60\x63\x07\0\x6c\0\0\0\0\ -\x77\x70\0\0\0\0\0\x20\x63\x07\0\x70\0\0\0\0\xb7\x10\0\0\0\0\0\x05\x18\x26\0\0\ -\0\0\0\0\0\0\0\0\0\0\x11\x60\xb7\x30\0\0\0\0\0\x8c\x85\0\0\0\0\0\0\xa6\xbf\x70\ -\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x11\xd0\x61\x10\0\0\0\0\0\0\xd5\ -\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xc5\x70\xff\x4a\0\0\ -\0\0\x63\xa7\xff\x80\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x12\x08\x18\x16\0\ -\0\0\0\0\0\0\0\0\0\0\0\x16\xe0\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\ -\0\0\x12\x10\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xd8\x7b\x10\0\0\0\0\0\0\x18\ -\x06\0\0\0\0\0\0\0\0\0\0\0\0\x14\x18\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x20\ -\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x14\x20\x18\x16\0\0\0\0\0\ -\0\0\0\0\0\0\0\x17\x30\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x15\ -\xb0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x50\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x48\x7b\x10\0\0\0\0\ -\0\0\x61\x06\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xe8\x63\x10\0\0\ -\0\0\0\0\x61\x06\0\x0c\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xec\x63\x10\ -\0\0\0\0\0\0\x79\x06\0\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x16\xf0\x7b\ -\x10\0\0\0\0\0\0\x61\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\ -\x18\x63\x10\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x60\xb7\x20\0\0\0\ -\0\0\x12\xb7\x30\0\0\0\0\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\ -\0\0\0\0\0\0\xc5\x70\xff\x13\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x16\xd0\ -\x63\x07\0\x6c\0\0\0\0\x77\x70\0\0\0\0\0\x20\x63\x07\0\x70\0\0\0\0\xb7\x10\0\0\ -\0\0\0\x05\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x16\xd0\xb7\x30\0\0\0\0\0\x8c\x85\0\ -\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x17\x40\x61\ -\x10\0\0\0\0\0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\ -\xc5\x70\xff\x01\0\0\0\0\x63\xa7\xff\x84\0\0\0\0\x61\x1a\xff\x78\0\0\0\0\xd5\ -\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x0a\xff\x80\0\0\ -\0\0\x63\x60\0\x28\0\0\0\0\x61\x0a\xff\x84\0\0\0\0\x63\x60\0\x2c\0\0\0\0\x18\ -\x16\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\0\0\x63\x60\0\x18\0\0\0\0\xb7\ -\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0"; +\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x0f\xf8\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\ +\x12\x30\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\x18\x16\0\0\ +\0\0\0\0\0\0\0\0\0\0\x12\x28\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\ +\0\x11\x18\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x70\x7b\x10\0\0\0\0\0\0\x18\x06\ +\0\0\0\0\0\0\0\0\0\0\0\0\x11\x20\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x80\x7b\ +\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x11\xb0\x18\x16\0\0\0\0\0\0\0\ +\0\0\0\0\0\x12\xa0\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x18\ +\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x98\x7b\x10\0\0\0\0\0\0\x61\x06\0\x08\0\0\0\0\ +\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x38\x63\x10\0\0\0\0\0\0\x61\x06\0\x0c\0\0\ +\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x3c\x63\x10\0\0\0\0\0\0\x79\x06\0\x10\ +\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x40\x7b\x10\0\0\0\0\0\0\x61\x0a\ +\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\x68\x63\x10\0\0\0\0\0\0\ +\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\xb0\xb7\x20\0\0\0\0\0\x11\xb7\x30\0\0\0\0\ +\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\0\0\0\0\0\0\xc5\x70\xff\ +\x5c\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x12\x20\x63\x07\0\x6c\0\0\0\0\x77\ 
+\x70\0\0\0\0\0\x20\x63\x07\0\x70\0\0\0\0\x18\x86\0\0\0\0\0\0\0\0\0\0\0\0\x10\ +\xb8\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x12\xc8\xb7\x20\0\0\0\0\0\x17\xb7\x30\0\0\ +\0\0\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\0\0\0\0\0\0\xc5\x70\ +\xff\x4d\0\0\0\0\x75\x70\0\x03\0\0\0\0\x62\x80\0\x04\0\0\0\0\x6a\x80\0\x02\0\0\ +\0\0\x05\0\0\x0a\0\0\0\0\x63\x87\0\x04\0\0\0\0\xbf\x97\0\0\0\0\0\0\x77\x90\0\0\ +\0\0\0\x20\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\x63\x09\0\0\0\0\0\0\x55\x90\0\ +\x02\0\0\0\0\x6a\x80\0\x02\0\0\0\0\x05\0\0\x01\0\0\0\0\x6a\x80\0\x02\0\0\0\x40\ +\xb7\x10\0\0\0\0\0\x05\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x12\x20\xb7\x30\0\0\0\0\ +\0\x8c\x85\0\0\0\0\0\0\xa6\xbf\x70\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\ +\x01\0\x61\x10\0\0\0\0\0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\ +\0\0\0\xa8\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x12\x90\x61\x10\0\0\0\0\0\0\xd5\x10\ +\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xc5\x70\xff\x2c\0\0\0\0\ +\x63\xa7\xff\x80\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x12\xe0\x18\x16\0\0\0\ +\0\0\0\0\0\0\0\0\0\x17\x88\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\ +\x12\xe8\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x80\x7b\x10\0\0\0\0\0\0\x18\x06\0\ +\0\0\0\0\0\0\0\0\0\0\0\x14\xf0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\xc8\x7b\x10\ +\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x14\xf8\x18\x16\0\0\0\0\0\0\0\0\0\ +\0\0\0\x17\xd8\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x16\x58\x18\ +\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\xf8\x7b\x10\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\xf0\x7b\x10\0\0\0\0\0\0\x61\ +\x06\0\x08\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x90\x63\x10\0\0\0\0\0\0\ +\x61\x06\0\x0c\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x94\x63\x10\0\0\0\0\ +\0\0\x79\x06\0\x10\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\x98\x7b\x10\0\0\ +\0\0\0\0\x61\x0a\xff\x78\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x17\xc0\x63\ +\x10\0\0\0\0\0\0\x18\x16\0\0\0\0\0\0\0\0\0\0\0\0\x18\x08\xb7\x20\0\0\0\0\0\x12\ +\xb7\x30\0\0\0\0\0\x0c\xb7\x40\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa7\xbf\x70\0\0\0\0\ +\0\0\xc5\x70\xfe\xf5\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x17\x78\x63\x07\0\ +\x6c\0\0\0\0\x77\x70\0\0\0\0\0\x20\x63\x07\0\x70\0\0\0\0\xb7\x10\0\0\0\0\0\x05\ +\x18\x26\0\0\0\0\0\0\0\0\0\0\0\0\x17\x78\xb7\x30\0\0\0\0\0\x8c\x85\0\0\0\0\0\0\ +\xa6\xbf\x70\0\0\0\0\0\0\x18\x06\0\0\0\0\0\0\0\0\0\0\0\0\x17\xe8\x61\x10\0\0\0\ +\0\0\0\xd5\x10\0\x02\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\xc5\x70\ +\xfe\xe3\0\0\0\0\x63\xa7\xff\x84\0\0\0\0\x61\x1a\xff\x78\0\0\0\0\xd5\x10\0\x02\ +\0\0\0\0\xbf\x91\0\0\0\0\0\0\x85\0\0\0\0\0\0\xa8\x61\x0a\xff\x80\0\0\0\0\x63\ +\x60\0\x28\0\0\0\0\x61\x0a\xff\x84\0\0\0\0\x63\x60\0\x2c\0\0\0\0\x18\x16\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\0\0\x63\x60\0\x18\0\0\0\0\xb7\0\0\0\0\0\ +\0\0\x95\0\0\0\0\0\0\0"; + + opts.ctx = (struct bpf_loader_ctx *)skel; + opts.data_sz = sizeof(opts_data) - 1; + opts.data = (void *)opts_data; + opts.insns_sz = sizeof(opts_insn) - 1; + opts.insns = (void *)opts_insn; + err = bpf_load_and_run(&opts); if (err < 0) return err; -- cgit v1.2.3 From 62ef449b8d8e312ee06279da797702cdb19a9920 Mon Sep 17 00:00:00 2001 From: Feng Yang Date: Thu, 10 Jul 2025 13:54:19 +0800 Subject: bpf: Clean up individual BTF_ID code Use BTF_ID_LIST_SINGLE(a, b, c) instead of BTF_ID_LIST(a) BTF_ID(b, c) Signed-off-by: Feng Yang Link: https://lore.kernel.org/r/20250710055419.70544-1-yangfeng59949@163.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 3 +-- kernel/bpf/link_iter.c | 3 
+--
 kernel/bpf/prog_iter.c | 3 +--
 kernel/kallsyms.c | 3 +--
 kernel/trace/bpf_trace.c | 3 +--
 5 files changed, 5 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2dd13eea7b0e..0aff814cb53a 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6200,8 +6200,7 @@ int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_ty
 	return kctx_type_id;
 }
 
-BTF_ID_LIST(bpf_ctx_convert_btf_id)
-BTF_ID(struct, bpf_ctx_convert)
+BTF_ID_LIST_SINGLE(bpf_ctx_convert_btf_id, struct, bpf_ctx_convert)
 
 static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name,
 				  void *data, unsigned int data_size)
diff --git a/kernel/bpf/link_iter.c b/kernel/bpf/link_iter.c
index fec8005a121c..8158e9c1af7b 100644
--- a/kernel/bpf/link_iter.c
+++ b/kernel/bpf/link_iter.c
@@ -78,8 +78,7 @@ static const struct seq_operations bpf_link_seq_ops = {
 	.show = bpf_link_seq_show,
 };
 
-BTF_ID_LIST(btf_bpf_link_id)
-BTF_ID(struct, bpf_link)
+BTF_ID_LIST_SINGLE(btf_bpf_link_id, struct, bpf_link)
 
 static const struct bpf_iter_seq_info bpf_link_seq_info = {
 	.seq_ops = &bpf_link_seq_ops,
diff --git a/kernel/bpf/prog_iter.c b/kernel/bpf/prog_iter.c
index 53a73c841c13..85d8fcb56fb7 100644
--- a/kernel/bpf/prog_iter.c
+++ b/kernel/bpf/prog_iter.c
@@ -78,8 +78,7 @@ static const struct seq_operations bpf_prog_seq_ops = {
 	.show = bpf_prog_seq_show,
 };
 
-BTF_ID_LIST(btf_bpf_prog_id)
-BTF_ID(struct, bpf_prog)
+BTF_ID_LIST_SINGLE(btf_bpf_prog_id, struct, bpf_prog)
 
 static const struct bpf_iter_seq_info bpf_prog_seq_info = {
 	.seq_ops = &bpf_prog_seq_ops,
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 4198f30aac3c..1e7635864124 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -829,8 +829,7 @@ static struct bpf_iter_reg ksym_iter_reg_info = {
 	.seq_info = &ksym_iter_seq_info,
 };
 
-BTF_ID_LIST(btf_ksym_iter_id)
-BTF_ID(struct, kallsym_iter)
+BTF_ID_LIST_SINGLE(btf_ksym_iter_id, struct, kallsym_iter)
 
 static int __init bpf_ksym_iter_register(void)
 {
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ffdde840abb8..3ae52978cae6 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -781,8 +781,7 @@ BPF_CALL_1(bpf_task_pt_regs, struct task_struct *, task)
 	return (unsigned long) task_pt_regs(task);
 }
 
-BTF_ID_LIST(bpf_task_pt_regs_ids)
-BTF_ID(struct, pt_regs)
+BTF_ID_LIST_SINGLE(bpf_task_pt_regs_ids, struct, pt_regs)
 
 const struct bpf_func_proto bpf_task_pt_regs_proto = {
 	.func = bpf_task_pt_regs,
--
cgit v1.2.3

From 19d18fdfc79217c86802271c9ce5b4ed174628cc Mon Sep 17 00:00:00 2001
From: Tao Chen
Date: Wed, 16 Jul 2025 21:46:53 +0800
Subject: bpf: Add struct bpf_token_info

Commit 35f96de04127 ("bpf: Introduce BPF token object") added the BPF
token as a new kind of BPF kernel object, and BPF_OBJ_GET_INFO_BY_FD is
already used to get BPF object info, so we can also get token info with
this command.

One usage scenario: when a program fails to load with a token because of
a permission failure, we can report what the BPF token allows through
this API for debugging.
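As a cross-check of the intended usage, here is a hedged userspace
sketch of querying a token fd through this command. The wrapper name
token_query_info is made up, and the sketch assumes uapi headers that
already carry struct bpf_token_info and a token fd obtained via
BPF_TOKEN_CREATE; the fields come from the patch below:

  #include <stdint.h>
  #include <string.h>
  #include <unistd.h>
  #include <sys/syscall.h>
  #include <linux/bpf.h>

  /* Illustrative only: fetch bpf_token_info for a BPF token fd. */
  int token_query_info(int token_fd, struct bpf_token_info *info)
  {
          union bpf_attr attr;

          memset(&attr, 0, sizeof(attr));
          attr.info.bpf_fd = token_fd;
          attr.info.info_len = sizeof(*info);
          attr.info.info = (uint64_t)(uintptr_t)info;

          /* On success the kernel fills info->allowed_cmds,
           * allowed_maps, allowed_progs and allowed_attachs. */
          return syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr));
  }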
Acked-by: Andrii Nakryiko Signed-off-by: Tao Chen Link: https://lore.kernel.org/r/20250716134654.1162635-1-chen.dylane@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 18 ++++++++++++++++++ kernel/bpf/token.c | 25 ++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1a26d17536be..e63039817af3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -5239,6 +5239,21 @@ static int bpf_link_get_info_by_fd(struct file *file, } +static int token_get_info_by_fd(struct file *file, + struct bpf_token *token, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_token_info __user *uinfo = u64_to_user_ptr(attr->info.info); + u32 info_len = attr->info.info_len; + int err; + + err = bpf_check_uarg_tail_zero(USER_BPFPTR(uinfo), sizeof(*uinfo), info_len); + if (err) + return err; + return bpf_token_get_info_by_fd(token, attr, uattr); +} + #define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, @@ -5262,6 +5277,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, else if (fd_file(f)->f_op == &bpf_link_fops || fd_file(f)->f_op == &bpf_link_fops_poll) return bpf_link_get_info_by_fd(fd_file(f), fd_file(f)->private_data, attr, uattr); + else if (fd_file(f)->f_op == &bpf_token_fops) + return token_get_info_by_fd(fd_file(f), fd_file(f)->private_data, + attr, uattr); return -EINVAL; } diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c index 26057aa13503..0bbe412f854e 100644 --- a/kernel/bpf/token.c +++ b/kernel/bpf/token.c @@ -103,7 +103,7 @@ static void bpf_token_show_fdinfo(struct seq_file *m, struct file *filp) static const struct inode_operations bpf_token_iops = { }; -static const struct file_operations bpf_token_fops = { +const struct file_operations bpf_token_fops = { .release = bpf_token_release, .show_fdinfo = bpf_token_show_fdinfo, }; @@ -210,6 +210,29 @@ out_file: return err; } +int bpf_token_get_info_by_fd(struct bpf_token *token, + const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_token_info __user *uinfo = u64_to_user_ptr(attr->info.info); + struct bpf_token_info info; + u32 info_len = attr->info.info_len; + + info_len = min_t(u32, info_len, sizeof(info)); + memset(&info, 0, sizeof(info)); + + info.allowed_cmds = token->allowed_cmds; + info.allowed_maps = token->allowed_maps; + info.allowed_progs = token->allowed_progs; + info.allowed_attachs = token->allowed_attachs; + + if (copy_to_user(uinfo, &info, info_len) || + put_user(info_len, &uattr->info.info_len)) + return -EFAULT; + + return 0; +} + struct bpf_token *bpf_token_get_from_fd(u32 ufd) { CLASS(fd, f)(ufd); -- cgit v1.2.3 From dfe25fbaedfc2a07281ed5ff0442270217d25b31 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 4 Jul 2025 11:08:04 -0700 Subject: cgroup: llist: avoid memory tears for llist_node Before the commit 36df6e3dbd7e ("cgroup: make css_rstat_updated nmi safe"), the struct llist_node is expected to be private to the one inserting the node to the lockless list or the one removing the node from the lockless list. After the mentioned commit, the llist_node in the rstat code is per-cpu shared between the stacked contexts i.e. process, softirq, hardirq & nmi. It is possible the compiler may tear the loads or stores of llist_node. Let's avoid that. KCSAN reported the following race: Reported by Kernel Concurrency Sanitizer on: CPU: 60 UID: 0 PID: 5425 ... 
6.16.0-rc3-next-20250626 #1 NONE Tainted: [E]=UNSIGNED_MODULE Hardware name: ... ================================================================== ================================================================== BUG: KCSAN: data-race in css_rstat_flush / css_rstat_updated write to 0xffffe8fffe1c85f0 of 8 bytes by task 1061 on cpu 1: css_rstat_flush+0x1b8/0xeb0 __mem_cgroup_flush_stats+0x184/0x190 flush_memcg_stats_dwork+0x22/0x50 process_one_work+0x335/0x630 worker_thread+0x5f1/0x8a0 kthread+0x197/0x340 ret_from_fork+0xd3/0x110 ret_from_fork_asm+0x11/0x20 read to 0xffffe8fffe1c85f0 of 8 bytes by task 3551 on cpu 15: css_rstat_updated+0x81/0x180 mod_memcg_lruvec_state+0x113/0x2d0 __mod_lruvec_state+0x3d/0x50 lru_add+0x21e/0x3f0 folio_batch_move_lru+0x80/0x1b0 __folio_batch_add_and_move+0xd7/0x160 folio_add_lru_vma+0x42/0x50 do_anonymous_page+0x892/0xe90 __handle_mm_fault+0xfaa/0x1520 handle_mm_fault+0xdc/0x350 do_user_addr_fault+0x1dc/0x650 exc_page_fault+0x5c/0x110 asm_exc_page_fault+0x22/0x30 value changed: 0xffffe8fffe18e0d0 -> 0xffffe8fffe1c85f0 $ ./scripts/faddr2line vmlinux css_rstat_flush+0x1b8/0xeb0 css_rstat_flush+0x1b8/0xeb0: init_llist_node at include/linux/llist.h:86 (inlined by) llist_del_first_init at include/linux/llist.h:308 (inlined by) css_process_update_tree at kernel/cgroup/rstat.c:148 (inlined by) css_rstat_updated_list at kernel/cgroup/rstat.c:258 (inlined by) css_rstat_flush at kernel/cgroup/rstat.c:389 $ ./scripts/faddr2line vmlinux css_rstat_updated+0x81/0x180 css_rstat_updated+0x81/0x180: css_rstat_updated at kernel/cgroup/rstat.c:90 (discriminator 1) These are expected race and a simple READ_ONCE/WRITE_ONCE resolves these reports. However let's add comments to explain the race and the need for memory barriers if stronger guarantees are needed. More specifically the rstat updater and the flusher can race and cause a scenario where the stats updater skips adding the css to the lockless list but the flusher might not see those updates done by the skipped updater. This is benign race and the subsequent flusher will flush those stats and at the moment there aren't any rstat users which are not fine with this kind of race. However some future user might want more stricter guarantee, so let's add appropriate comments to ease the job of future users. Signed-off-by: Shakeel Butt Reviewed-by: Paul E. McKenney Fixes: 36df6e3dbd7e ("cgroup: make css_rstat_updated nmi safe") Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index c8a48cf83878..981e2f77ad4e 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -60,6 +60,12 @@ static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu) * Atomically inserts the css in the ss's llist for the given cpu. This is * reentrant safe i.e. safe against softirq, hardirq and nmi. The ss's llist * will be processed at the flush time to create the update tree. + * + * NOTE: if the user needs the guarantee that the updater either add itself in + * the lockless list or the concurrent flusher flushes its updated stats, a + * memory barrier is needed before the call to css_rstat_updated() i.e. a + * barrier after updating the per-cpu stats and before calling + * css_rstat_updated(). 
*/ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) { @@ -86,7 +92,12 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) return; rstatc = css_rstat_cpu(css, cpu); - /* If already on list return. */ + /* + * If already on list return. This check is racy and smp_mb() is needed + * to pair it with the smp_mb() in css_process_update_tree() if the + * guarantee that the updated stats are visible to concurrent flusher is + * needed. + */ if (llist_on_list(&rstatc->lnode)) return; @@ -148,6 +159,21 @@ static void css_process_update_tree(struct cgroup_subsys *ss, int cpu) while ((lnode = llist_del_first_init(lhead))) { struct css_rstat_cpu *rstatc; + /* + * smp_mb() is needed here (more specifically in between + * init_llist_node() and per-cpu stats flushing) if the + * guarantee is required by a rstat user where etiher the + * updater should add itself on the lockless list or the + * flusher flush the stats updated by the updater who have + * observed that they are already on the list. The + * corresponding barrier pair for this one should be before + * css_rstat_updated() by the user. + * + * For now, there aren't any such user, so not adding the + * barrier here but if such a use-case arise, please add + * smp_mb() here. + */ + rstatc = container_of(lnode, struct css_rstat_cpu, lnode); __css_process_update_tree(rstatc->owner, cpu); } -- cgit v1.2.3 From 9beb8c5e77dc10e3889ff5f967eeffba78617a88 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Thu, 17 Jul 2025 08:55:49 +0000 Subject: sched,freezer: Remove unnecessary warning in __thaw_task Commit cff5f49d433f ("cgroup_freezer: cgroup_freezing: Check if not frozen") modified the cgroup_freezing() logic to verify that the FROZEN flag is not set, affecting the return value of the freezing() function, in order to address a warning in __thaw_task. A race condition exists that may allow tasks to escape being frozen. The following scenario demonstrates this issue: CPU 0 (get_signal path) CPU 1 (freezer.state reader) try_to_freeze read freezer.state __refrigerator freezer_read update_if_frozen WRITE_ONCE(current->__state, TASK_FROZEN); ... /* Task is now marked frozen */ /* frozen(task) == true */ /* Assuming other tasks are frozen */ freezer->state |= CGROUP_FROZEN; /* freezing(current) returns false */ /* because cgroup is frozen (not freezing) */ break out __set_current_state(TASK_RUNNING); /* Bug: Task resumes running when it should remain frozen */ The existing !frozen(p) check in __thaw_task makes the WARN_ON_ONCE(freezing(p)) warning redundant. Removing this warning enables reverting commit cff5f49d433f ("cgroup_freezer: cgroup_freezing: Check if not frozen") to resolve the issue. This patch removes the warning from __thaw_task. A subsequent patch will revert commit cff5f49d433f ("cgroup_freezer: cgroup_freezing: Check if not frozen") to complete the fix. 
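For readers less familiar with the scope-based cleanup helpers, here is a rough equivalent of the guard(spinlock_irqsave)-based __thaw_task() introduced in the diff below, written with explicit lock/unlock calls. Illustrative only, not part of the patch.

void __thaw_task(struct task_struct *p)
{
	unsigned long flags;

	/* guard(spinlock_irqsave)(&freezer_lock) expands to roughly this. */
	spin_lock_irqsave(&freezer_lock, flags);
	if (frozen(p) && !task_call_func(p, __restore_freezer_state, NULL))
		wake_up_state(p, TASK_FROZEN);
	spin_unlock_irqrestore(&freezer_lock, flags);
}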
Reported-by: Zhong Jiawei Signed-off-by: Chen Ridong Signed-off-by: Tejun Heo --- kernel/freezer.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index 8d530d0949ff..6a96149aede9 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -201,18 +201,9 @@ static int __restore_freezer_state(struct task_struct *p, void *arg) void __thaw_task(struct task_struct *p) { - unsigned long flags; - - spin_lock_irqsave(&freezer_lock, flags); - if (WARN_ON_ONCE(freezing(p))) - goto unlock; - - if (!frozen(p) || task_call_func(p, __restore_freezer_state, NULL)) - goto unlock; - - wake_up_state(p, TASK_FROZEN); -unlock: - spin_unlock_irqrestore(&freezer_lock, flags); + guard(spinlock_irqsave)(&freezer_lock); + if (frozen(p) && !task_call_func(p, __restore_freezer_state, NULL)) + wake_up_state(p, TASK_FROZEN); } /** -- cgit v1.2.3 From 14a67b42cb6f3ab66f41603c062c5056d32ea7dd Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Thu, 17 Jul 2025 08:55:50 +0000 Subject: Revert "cgroup_freezer: cgroup_freezing: Check if not frozen" This reverts commit cff5f49d433fcd0063c8be7dd08fa5bf190c6c37. Commit cff5f49d433f ("cgroup_freezer: cgroup_freezing: Check if not frozen") modified the cgroup_freezing() logic to verify that the FROZEN flag is not set, affecting the return value of the freezing() function, in order to address a warning in __thaw_task. A race condition exists that may allow tasks to escape being frozen. The following scenario demonstrates this issue: CPU 0 (get_signal path) CPU 1 (freezer.state reader) try_to_freeze read freezer.state __refrigerator freezer_read update_if_frozen WRITE_ONCE(current->__state, TASK_FROZEN); ... /* Task is now marked frozen */ /* frozen(task) == true */ /* Assuming other tasks are frozen */ freezer->state |= CGROUP_FROZEN; /* freezing(current) returns false */ /* because cgroup is frozen (not freezing) */ break out __set_current_state(TASK_RUNNING); /* Bug: Task resumes running when it should remain frozen */ The existing !frozen(p) check in __thaw_task makes the WARN_ON_ONCE(freezing(p)) warning redundant. Removing this warning enables reverting the commit cff5f49d433f ("cgroup_freezer: cgroup_freezing: Check if not frozen") to resolve the issue. The warning has been removed in the previous patch. This patch revert the commit cff5f49d433f ("cgroup_freezer: cgroup_freezing: Check if not frozen") to complete the fix. Fixes: cff5f49d433f ("cgroup_freezer: cgroup_freezing: Check if not frozen") Reported-by: Zhong Jiawei Signed-off-by: Chen Ridong Signed-off-by: Tejun Heo --- kernel/cgroup/legacy_freezer.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 507b8f19a262..dd9417425d92 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -66,15 +66,9 @@ static struct freezer *parent_freezer(struct freezer *freezer) bool cgroup_freezing(struct task_struct *task) { bool ret; - unsigned int state; rcu_read_lock(); - /* Check if the cgroup is still FREEZING, but not FROZEN. The extra - * !FROZEN check is required, because the FREEZING bit is not cleared - * when the state FROZEN is reached. 
- */ - state = task_freezer(task)->state; - ret = (state & CGROUP_FREEZING) && !(state & CGROUP_FROZEN); + ret = task_freezer(task)->state & CGROUP_FREEZING; rcu_read_unlock(); return ret; -- cgit v1.2.3 From df316ab3d4440177e322a2847969d356c29b0eef Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 9 Jul 2025 15:19:03 +0200 Subject: workqueue: Use atomic_try_cmpxchg_relaxed() in tryinc_node_nr_active() Use try_cmpxchg() family of locking primitives instead of cmpxchg(*ptr, old, new) == old. The x86 CMPXCHG instruction returns success in the ZF flag, so this change saves a compare after CMPXCHG (and related move instruction in front of CMPXCHG). Also, try_cmpxchg() implicitly assigns old *ptr value to "old" when CMPXCHG fails. There is no need to re-read the value in the loop. The generated assembly improves from: 3f7: 44 8b 0a mov (%rdx),%r9d 3fa: eb 12 jmp 40e <...> 3fc: 8d 79 01 lea 0x1(%rcx),%edi 3ff: 89 c8 mov %ecx,%eax 401: f0 0f b1 7a 04 lock cmpxchg %edi,0x4(%rdx) 406: 39 c1 cmp %eax,%ecx 408: 0f 84 83 00 00 00 je 491 <...> 40e: 8b 4a 04 mov 0x4(%rdx),%ecx 411: 41 39 c9 cmp %ecx,%r9d 414: 7f e6 jg 3fc <...> to: 256b: 45 8b 08 mov (%r8),%r9d 256e: 41 8b 40 04 mov 0x4(%r8),%eax 2572: 41 39 c1 cmp %eax,%r9d 2575: 7e 10 jle 2587 <...> 2577: 8d 78 01 lea 0x1(%rax),%edi 257a: f0 41 0f b1 78 04 lock cmpxchg %edi,0x4(%r8) 2580: 75 f0 jne 2572 <...> No functional change intended. Signed-off-by: Uros Bizjak Cc: Tejun Heo Cc: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 12c3080332bd..ad83662a0533 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1690,17 +1690,14 @@ static void __pwq_activate_work(struct pool_workqueue *pwq, static bool tryinc_node_nr_active(struct wq_node_nr_active *nna) { int max = READ_ONCE(nna->max); + int old = atomic_read(&nna->nr); - while (true) { - int old, tmp; - - old = atomic_read(&nna->nr); + do { if (old >= max) return false; - tmp = atomic_cmpxchg_relaxed(&nna->nr, old, old + 1); - if (tmp == old) - return true; - } + } while (!atomic_try_cmpxchg_relaxed(&nna->nr, &old, old + 1)); + + return true; } /** -- cgit v1.2.3 From ae96bba1ca0000ebb3f3ced64c9367e2a223d69e Mon Sep 17 00:00:00 2001 From: Christian Loehle Date: Tue, 8 Jul 2025 17:12:51 +0100 Subject: sched_ext: Fix scx_bpf_reenqueue_local() reference The comment mentions bpf_scx_reenqueue_local(), but the function is provided for the BPF program implementing scx, as such the naming convention is scx_bpf_reenqueue_local(), fix the comment. Signed-off-by: Christian Loehle Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 512474eabea6..b083ca426e89 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -911,7 +911,7 @@ enum scx_enq_flags { /* * The task being enqueued was previously enqueued on the current CPU's * %SCX_DSQ_LOCAL, but was removed from it in a call to the - * bpf_scx_reenqueue_local() kfunc. If bpf_scx_reenqueue_local() was + * scx_bpf_reenqueue_local() kfunc. 
If scx_bpf_reenqueue_local() was * invoked in a ->cpu_release() callback, and the task is again * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the * task will not be scheduled on the CPU until at least the next invocation -- cgit v1.2.3 From 06efc9fe0b8deeb83b47fd7c5451fe1a60c8a761 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sat, 5 Jul 2025 07:43:51 +0200 Subject: sched_ext: idle: Handle migration-disabled tasks in idle selection When SCX_OPS_ENQ_MIGRATION_DISABLED is enabled, migration-disabled tasks are also routed to ops.enqueue(). A scheduler may attempt to dispatch such tasks directly to an idle CPU using the default idle selection policy via scx_bpf_select_cpu_and() or scx_bpf_select_cpu_dfl(). This scenario must be properly handled by the built-in idle policy to avoid returning an idle CPU where the target task isn't allowed to run. Otherwise, it can lead to errors such as: EXIT: runtime error (SCX_DSQ_LOCAL[_ON] cannot move migration disabled Chrome_ChildIOT[291646] from CPU 3 to 14) Prevent this by explicitly handling migration-disabled tasks in the built-in idle selection logic, maintaining their CPU affinity. Fixes: a730e3f7a48bc ("sched_ext: idle: Consolidate default idle CPU selection kfuncs") Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext_idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 6d29d3cbc670..001fb88a8481 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -903,7 +903,7 @@ s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags, * selection optimizations and simply check whether the previously * used CPU is idle and within the allowed cpumask. */ - if (p->nr_cpus_allowed == 1) { + if (p->nr_cpus_allowed == 1 || is_migration_disabled(p)) { if (cpumask_test_cpu(prev_cpu, allowed ?: p->cpus_ptr) && scx_idle_test_and_clear_cpu(prev_cpu)) cpu = prev_cpu; -- cgit v1.2.3 From f633c1a236df95ab927f1919b3be2619c8cc6733 Mon Sep 17 00:00:00 2001 From: Darshan Rathod Date: Wed, 16 Jul 2025 12:42:16 +0000 Subject: PM: hibernate: Fix up white space that does not follow coding style Fix up white space usage that does not follow the kernel coding style rules in several places in snapshot.c. Signed-off-by: Darshan Rathod Link: https://patch.msgid.link/20250716124216.64329-1-darshanrathod475@gmail.com [ rjw: New subject and changelog ] Signed-off-by: Rafael J. 
Wysocki --- kernel/power/snapshot.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 2af36cfe35cd..501df0676a61 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1536,7 +1536,7 @@ static unsigned long copy_data_pages(struct memory_bitmap *copy_bm, memory_bm_position_reset(orig_bm); memory_bm_position_reset(copy_bm); copy_pfn = memory_bm_next_pfn(copy_bm); - for(;;) { + for (;;) { pfn = memory_bm_next_pfn(orig_bm); if (unlikely(pfn == BM_END_OF_MAP)) break; @@ -2161,13 +2161,13 @@ static const char *check_image_kernel(struct swsusp_info *info) { if (info->version_code != LINUX_VERSION_CODE) return "kernel version"; - if (strcmp(info->uts.sysname,init_utsname()->sysname)) + if (strcmp(info->uts.sysname, init_utsname()->sysname)) return "system type"; - if (strcmp(info->uts.release,init_utsname()->release)) + if (strcmp(info->uts.release, init_utsname()->release)) return "kernel release"; - if (strcmp(info->uts.version,init_utsname()->version)) + if (strcmp(info->uts.version, init_utsname()->version)) return "version"; - if (strcmp(info->uts.machine,init_utsname()->machine)) + if (strcmp(info->uts.machine, init_utsname()->machine)) return "machine"; return NULL; } @@ -2361,7 +2361,7 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm, struct memory_bitmap *zero_bm) { unsigned long decoded_pfn; - bool zero; + bool zero; int j; for (j = 0; j < PAGE_SIZE / sizeof(long); j++) { -- cgit v1.2.3 From 2e2713ae1a05eea7dda2f3b6988827196e33b25a Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Thu, 17 Jul 2025 17:49:49 +0100 Subject: btf: Fix virt_to_phys() on arm64 when mmapping BTF Breno Leitao reports that arm64 emits the following warning with CONFIG_DEBUG_VIRTUAL: [ 58.896157] virt_to_phys used for non-linear address: 000000009fea9737 (__start_BTF+0x0/0x685530) [ 23.988669] WARNING: CPU: 25 PID: 1442 at arch/arm64/mm/physaddr.c:15 __virt_to_phys (arch/arm64/mm/physaddr.c:?) ... [ 24.075371] Tainted: [E]=UNSIGNED_MODULE, [N]=TEST [ 24.080276] Hardware name: Quanta S7GM 20S7GCU0010/S7G MB (CG1), BIOS 3D22 07/03/2024 [ 24.088295] pstate: 63400009 (nZCv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--) [ 24.098440] pc : __virt_to_phys (arch/arm64/mm/physaddr.c:?) [ 24.105398] lr : __virt_to_phys (arch/arm64/mm/physaddr.c:?) ... [ 24.197257] Call trace: [ 24.199761] __virt_to_phys (arch/arm64/mm/physaddr.c:?) (P) [ 24.206883] btf_sysfs_vmlinux_mmap (kernel/bpf/sysfs_btf.c:27) [ 24.214264] sysfs_kf_bin_mmap (fs/sysfs/file.c:179) [ 24.218536] kernfs_fop_mmap (fs/kernfs/file.c:462) [ 24.222461] mmap_region (./include/linux/fs.h:? mm/internal.h:167 mm/vma.c:2405 mm/vma.c:2467 mm/vma.c:2622 mm/vma.c:2692) It seems that the memory layout on arm64 maps the kernel image in vmalloc space which is different than x86. This makes virt_to_phys emit the warning. Fix this by translating the address using __pa_symbol as suggested by Breno instead. 
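As a general illustration of the rule this fix applies (a sketch, not taken from the patch): virt_to_phys()/__pa() is only valid for linear-map addresses, while symbols in the kernel image should be translated with __pa_symbol(), which is exactly the distinction that matters on arm64.

	/* Linear-map address, e.g. from kmalloc(): virt_to_phys() is valid. */
	void *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
	phys_addr_t pa_buf = virt_to_phys(buf);

	/*
	 * Kernel image symbol: on arm64 the image can live outside the
	 * linear map (in vmalloc space), so use __pa_symbol() instead.
	 */
	phys_addr_t pa_btf = __pa_symbol(__start_BTF);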
Reported-by: Breno Leitao Closes: https://lore.kernel.org/bpf/g2gqhkunbu43awrofzqb4cs4sxkxg2i4eud6p4qziwrdh67q4g@mtw3d3aqfgmb/ Signed-off-by: Lorenz Bauer Tested-by: Breno Leitao Fixes: a539e2a6d51d ("btf: Allow mmap of vmlinux btf") Link: https://lore.kernel.org/r/20250717-vmlinux-mmap-pa-symbol-v1-1-970be6681158@isovalent.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/sysfs_btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 941d0d2427e3..8e61dc555415 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -21,7 +21,7 @@ static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj, { unsigned long pages = PAGE_ALIGN(attr->size) >> PAGE_SHIFT; size_t vm_size = vma->vm_end - vma->vm_start; - phys_addr_t addr = virt_to_phys(__start_BTF); + phys_addr_t addr = __pa_symbol(__start_BTF); unsigned long pfn = addr >> PAGE_SHIFT; if (attr->private != __start_BTF || !PAGE_ALIGNED(addr)) -- cgit v1.2.3 From cf4fc66746e344181f41604066659073dbb8aaf0 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Tue, 15 Jul 2025 16:01:51 -0400 Subject: smp: Document preemption and stop_machine() mutual exclusion Recently while revising RCU's cpu online checks, there was some discussion around how IPIs synchronize with hotplug. Add comments explaining how preemption disable creates mutual exclusion with CPU hotplug's stop_machine mechanism. The key insight is that stop_machine() atomically updates CPU masks and flushes IPIs with interrupts disabled, and cannot proceed while any CPU (including the IPI sender) has preemption disabled. [ Apply peterz feedback. ] Cc: Andrea Righi Cc: Paul E. McKenney Cc: Frederic Weisbecker Cc: Peter Zijlstra (Intel) Cc: rcu@vger.kernel.org Acked-by: Paul E. McKenney Co-developed-by: Frederic Weisbecker Signed-off-by: Joel Fernandes Signed-off-by: Paul E. McKenney --- kernel/smp.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 974f3a3962e8..23d51a8e582d 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -86,13 +86,15 @@ int smpcfd_dead_cpu(unsigned int cpu) int smpcfd_dying_cpu(unsigned int cpu) { /* - * The IPIs for the smp-call-function callbacks queued by other - * CPUs might arrive late, either due to hardware latencies or - * because this CPU disabled interrupts (inside stop-machine) - * before the IPIs were sent. So flush out any pending callbacks - * explicitly (without waiting for the IPIs to arrive), to - * ensure that the outgoing CPU doesn't go offline with work - * still pending. + * The IPIs for the smp-call-function callbacks queued by other CPUs + * might arrive late, either due to hardware latencies or because this + * CPU disabled interrupts (inside stop-machine) before the IPIs were + * sent. So flush out any pending callbacks explicitly (without waiting + * for the IPIs to arrive), to ensure that the outgoing CPU doesn't go + * offline with work still pending. + * + * This runs with interrupts disabled inside the stopper task invoked by + * stop_machine(), ensuring mutually exclusive CPU offlining and IPI flush. 
*/ __flush_smp_call_function_queue(false); irq_work_run(); @@ -418,6 +420,10 @@ void __smp_call_single_queue(int cpu, struct llist_node *node) */ static int generic_exec_single(int cpu, call_single_data_t *csd) { + /* + * Preemption already disabled here so stopper cannot run on this CPU, + * ensuring mutually exclusive CPU offlining and last IPI flush. + */ if (cpu == smp_processor_id()) { smp_call_func_t func = csd->func; void *info = csd->info; @@ -638,8 +644,10 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, int err; /* - * prevent preemption and reschedule on another processor, - * as well as CPU removal + * Prevent preemption and reschedule on another CPU, as well as CPU + * removal. This prevents stopper from running on this CPU, thus + * providing mutual exclusion of the below cpu_online() check and + * IPI sending ensuring IPI are not missed by CPU going offline. */ this_cpu = get_cpu(); -- cgit v1.2.3 From 7f71195c15dcf5f34c4c7f056603659374e3a525 Mon Sep 17 00:00:00 2001 From: Dishank Jogi Date: Wed, 16 Jul 2025 15:05:25 +0530 Subject: fork: reorder function qualifiers for copy_clone_args_from_user Change the order of function qualifiers from 'noinline static' to 'static noinline' in copy_clone_args_from_user for consistency with kernel coding style. No functional change intended. The goal is to improve readability and maintain consistent ordering of qualifiers across the codebase. Signed-off-by: Dishank Jogi Reviewed-by: Liam R. Howlett Reviewed-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Link: https://lore.kernel.org/r/20250716093525.449994-1-dishank.jogi@siqol.com Signed-off-by: Kees Cook --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 1ee8eb11f38b..574ff0d983db 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2743,7 +2743,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, } #endif -noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, +static noinline int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args __user *uargs, size_t usize) { -- cgit v1.2.3 From 463d46044f04013306a4893242f65788b8a16b2e Mon Sep 17 00:00:00 2001 From: Tze-nan Wu Date: Thu, 17 Jul 2025 13:53:38 +0800 Subject: rcu: Fix delayed execution of hurry callbacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We observed a regression in our customer’s environment after enabling CONFIG_LAZY_RCU. In the Android Update Engine scenario, where ioctl() is used heavily, we found that callbacks queued via call_rcu_hurry (such as percpu_ref_switch_to_atomic_rcu) can sometimes be delayed by up to 5 seconds before execution. This occurs because the new grace period does not start immediately after the previous one completes. The root cause is that the wake_nocb_gp_defer() function now checks "rdp->nocb_defer_wakeup" instead of "rdp_gp->nocb_defer_wakeup". On CPUs that are not rcuog, "rdp->nocb_defer_wakeup" may always be RCU_NOCB_WAKE_NOT. This can cause "rdp_gp->nocb_defer_wakeup" to be downgraded and the "rdp_gp->nocb_timer" to be postponed by up to 10 seconds, delaying the execution of hurry RCU callbacks. The trace log of one scenario we encountered is as follow: // previous GP ends at this point rcu_preempt [000] d..1. 137.240210: rcu_grace_period: rcu_preempt 8369 end rcu_preempt [000] ..... 
137.240212: rcu_grace_period: rcu_preempt 8372 reqwait // call_rcu_hurry enqueues "percpu_ref_switch_to_atomic_rcu", the callback waited on by UpdateEngine update_engine [002] d..1. 137.301593: __call_rcu_common: wyy: unlikely p_ref = 00000000********. lazy = 0 // FirstQ on cpu 2 rdp_gp->nocb_timer is set to fire after 1 jiffy (4ms) // and the rdp_gp->nocb_defer_wakeup is set to RCU_NOCB_WAKE update_engine [002] d..2. 137.301595: rcu_nocb_wake: rcu_preempt 2 FirstQ on cpu2 with rdp_gp (cpu0). // FirstBQ event on cpu2 during the 1 jiffy, make the timer postpond 10 seconds later. // also, the rdp_gp->nocb_defer_wakeup is overwrite to RCU_NOCB_WAKE_LAZY update_engine [002] d..1. 137.301601: rcu_nocb_wake: rcu_preempt 2 WakeEmptyIsDeferred ... ... ... // before the 10 seconds timeout, cpu0 received another call_rcu_hurry // reset the timer to jiffies+1 and set the waketype = RCU_NOCB_WAKE. kworker/u32:0 [000] d..2. 142.557564: rcu_nocb_wake: rcu_preempt 0 FirstQ kworker/u32:0 [000] d..1. 142.557576: rcu_nocb_wake: rcu_preempt 0 WakeEmptyIsDeferred kworker/u32:0 [000] d..1. 142.558296: rcu_nocb_wake: rcu_preempt 0 WakeNot kworker/u32:0 [000] d..1. 142.558562: rcu_nocb_wake: rcu_preempt 0 WakeNot // idle(do_nocb_deferred_wakeup) wake rcuog due to waketype == RCU_NOCB_WAKE [000] d..1. 142.558786: rcu_nocb_wake: rcu_preempt 0 DoWake [000] dN.1. 142.558839: rcu_nocb_wake: rcu_preempt 0 DeferredWake rcuog/0 [000] ..... 142.558871: rcu_nocb_wake: rcu_preempt 0 EndSleep rcuog/0 [000] ..... 142.558877: rcu_nocb_wake: rcu_preempt 0 Check // finally rcuog request a new GP at this point (5 seconds after the FirstQ event) rcuog/0 [000] d..2. 142.558886: rcu_grace_period: rcu_preempt 8372 newreq rcu_preempt [001] d..1. 142.559458: rcu_grace_period: rcu_preempt 8373 start ... rcu_preempt [000] d..1. 142.564258: rcu_grace_period: rcu_preempt 8373 end rcuop/2 [000] D..1. 142.566337: rcu_batch_start: rcu_preempt CBs=219 bl=10 // the hurry CB is invoked at this point rcuop/2 [000] b.... 142.566352: blk_queue_usage_counter_release: wyy: wakeup. p_ref = 00000000********. This patch changes the condition to check "rdp_gp->nocb_defer_wakeup" in the lazy path. This prevents an already scheduled "rdp_gp->nocb_timer" from being postponed and avoids overwriting "rdp_gp->nocb_defer_wakeup" when it is not RCU_NOCB_WAKE_NOT. Fixes: 3cb278e73be5 ("rcu: Make call_rcu() lazy to save power") Co-developed-by: Cheng-jui Wang Signed-off-by: Cheng-jui Wang Co-developed-by: Lorry.Luo@mediatek.com Signed-off-by: Lorry.Luo@mediatek.com Tested-by: weiyangyang@vivo.com Signed-off-by: weiyangyang@vivo.com Signed-off-by: Tze-nan Wu Reviewed-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tree_nocb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 08eb9b0e2fab..e6cd56603cad 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -276,7 +276,7 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, * callback storms, no need to wake up too early. 
*/ if (waketype == RCU_NOCB_WAKE_LAZY && - rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) { + rdp_gp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) { mod_timer(&rdp_gp->nocb_timer, jiffies + rcu_get_jiffies_lazy_flush()); WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); } else if (waketype == RCU_NOCB_WAKE_BYPASS) { -- cgit v1.2.3 From 9b7fc3f14576c268f62fe0b882fac5e61239b659 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 1 Jul 2025 10:58:04 +0200 Subject: vdso: Introduce aux_clock_resolution_ns() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the constant resolution to a shared header, so the vDSO can use it and return it without going through a syscall. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250701-vdso-auxclock-v1-10-df7d9f87b9b8@linutronix.de --- kernel/time/timekeeping.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c6fe89bded02..cbcf090bb4be 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -26,6 +26,8 @@ #include #include +#include + #include "tick-internal.h" #include "ntp_internal.h" #include "timekeeping_internal.h" @@ -2876,8 +2878,8 @@ static int aux_get_res(clockid_t id, struct timespec64 *tp) if (!clockid_aux_valid(id)) return -ENODEV; - tp->tv_sec = 0; - tp->tv_nsec = 1; + tp->tv_sec = aux_clock_resolution_ns() / NSEC_PER_SEC; + tp->tv_nsec = aux_clock_resolution_ns() % NSEC_PER_SEC; return 0; } -- cgit v1.2.3 From 380b84e168e57c54d0a9e053a5558fddc43f0c1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 1 Jul 2025 10:58:05 +0200 Subject: vdso/vsyscall: Update auxiliary clock data in the datapage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expose the auxiliary clock data so it can be read from the vDSO. Architectures not using the generic vDSO time framework, namely SPARC64, are not supported. 
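For context, a hedged sketch of the consumer side this series is building toward: with the auxiliary clock data in the vDSO datapage, a clock_gettime()/clock_getres() call on an auxiliary clock id can be satisfied without entering the kernel. CLOCK_AUX is used as the clockid base elsewhere in timekeeping.c; its userspace-visible definition and numeric value are assumptions here, not part of this patch.

#include <stdio.h>
#include <time.h>

#ifndef CLOCK_AUX
#define CLOCK_AUX 14	/* placeholder only; the real value comes from the uapi headers */
#endif

int main(void)
{
	struct timespec ts, res;

	/* First auxiliary clock; CLOCK_AUX + n would select the n-th one. */
	if (clock_gettime(CLOCK_AUX, &ts) == 0)
		printf("aux0 time: %lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);

	if (clock_getres(CLOCK_AUX, &res) == 0)
		printf("aux0 resolution: %ld ns\n", res.tv_nsec);

	return 0;
}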
Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250701-vdso-auxclock-v1-11-df7d9f87b9b8@linutronix.de --- kernel/time/namespace.c | 5 +++++ kernel/time/timekeeping.c | 12 ++++++++++++ kernel/time/vsyscall.c | 40 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+) (limited to 'kernel') diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index e3642278df43..667452768ed3 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -242,6 +242,11 @@ static void timens_set_vvar_page(struct task_struct *task, for (i = 0; i < CS_BASES; i++) timens_setup_vdso_clock_data(&vc[i], ns); + if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) { + for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++) + timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns); + } + out: mutex_unlock(&offset_lock); } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index cbcf090bb4be..243fe25e680a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -66,11 +66,21 @@ static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts) { return ktime_get_aux_ts64(CLOCK_AUX + tkid - TIMEKEEPER_AUX_FIRST, ts); } + +static inline bool tk_is_aux(const struct timekeeper *tk) +{ + return tk->id >= TIMEKEEPER_AUX_FIRST && tk->id <= TIMEKEEPER_AUX_LAST; +} #else static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts) { return false; } + +static inline bool tk_is_aux(const struct timekeeper *tk) +{ + return false; +} #endif /* flag for if timekeeping is suspended */ @@ -719,6 +729,8 @@ static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); + } else if (tk_is_aux(tk)) { + vdso_time_update_aux(tk); } if (action & TK_CLOCK_WAS_SET) diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index df6bada2d58e..8ba8b0d8a387 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -136,6 +136,46 @@ void update_vsyscall_tz(void) __arch_sync_vdso_time_data(vdata); } +#ifdef CONFIG_POSIX_AUX_CLOCKS +void vdso_time_update_aux(struct timekeeper *tk) +{ + struct vdso_time_data *vdata = vdso_k_time_data; + struct vdso_timestamp *vdso_ts; + struct vdso_clock *vc; + s32 clock_mode; + u64 nsec; + + vc = &vdata->aux_clock_data[tk->id - TIMEKEEPER_AUX_FIRST]; + vdso_ts = &vc->basetime[VDSO_BASE_AUX]; + clock_mode = tk->tkr_mono.clock->vdso_clock_mode; + if (!tk->clock_valid) + clock_mode = VDSO_CLOCKMODE_NONE; + + /* copy vsyscall data */ + vdso_write_begin_clock(vc); + + vc->clock_mode = clock_mode; + + if (clock_mode != VDSO_CLOCKMODE_NONE) { + fill_clock_configuration(vc, &tk->tkr_mono); + + vdso_ts->sec = tk->xtime_sec; + + nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsec += tk->offs_aux; + vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &nsec); + nsec = nsec << tk->tkr_mono.shift; + vdso_ts->nsec = nsec; + } + + __arch_update_vdso_clock(vc); + + vdso_write_end_clock(vc); + + __arch_sync_vdso_time_data(vdata); +} +#endif + /** * vdso_update_begin - Start of a VDSO update section * -- cgit v1.2.3 From 85a3bce695b361d85fc528e6fbb33e4c8089c806 Mon Sep 17 00:00:00 2001 From: Tomas Glozar Date: Wed, 16 Jul 2025 16:36:01 +0200 Subject: tracing/osnoise: Fix crash in timerlat_dump_stack() We have observed kernel panics when using timerlat with stack saving, with the following dmesg output: memcpy: detected buffer overflow: 88 byte write of 
buffer size 0 WARNING: CPU: 2 PID: 8153 at lib/string_helpers.c:1032 __fortify_report+0x55/0xa0 CPU: 2 UID: 0 PID: 8153 Comm: timerlatu/2 Kdump: loaded Not tainted 6.15.3-200.fc42.x86_64 #1 PREEMPT(lazy) Call Trace: ? trace_buffer_lock_reserve+0x2a/0x60 __fortify_panic+0xd/0xf __timerlat_dump_stack.cold+0xd/0xd timerlat_dump_stack.part.0+0x47/0x80 timerlat_fd_read+0x36d/0x390 vfs_read+0xe2/0x390 ? syscall_exit_to_user_mode+0x1d5/0x210 ksys_read+0x73/0xe0 do_syscall_64+0x7b/0x160 ? exc_page_fault+0x7e/0x1a0 entry_SYSCALL_64_after_hwframe+0x76/0x7e __timerlat_dump_stack() constructs the ftrace stack entry like this: struct stack_entry *entry; ... memcpy(&entry->caller, fstack->calls, size); entry->size = fstack->nr_entries; Since commit e7186af7fb26 ("tracing: Add back FORTIFY_SOURCE logic to kernel_stack event structure"), struct stack_entry marks its caller field with __counted_by(size). At the time of the memcpy, entry->size contains garbage from the ringbuffer, which under some circumstances is zero, triggering a kernel panic by buffer overflow. Populate the size field before the memcpy so that the out-of-bounds check knows the correct size. This is analogous to __ftrace_trace_stack(). Cc: stable@vger.kernel.org Cc: John Kacur Cc: Luis Goncalves Cc: Attila Fazekas Link: https://lore.kernel.org/20250716143601.7313-1-tglozar@redhat.com Fixes: e7186af7fb26 ("tracing: Add back FORTIFY_SOURCE logic to kernel_stack event structure") Signed-off-by: Tomas Glozar Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 6819b93309ce..fd259da0aa64 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -637,8 +637,8 @@ __timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, u entry = ring_buffer_event_data(event); - memcpy(&entry->caller, fstack->calls, size); entry->size = fstack->nr_entries; + memcpy(&entry->caller, fstack->calls, size); trace_buffer_unlock_commit_nostack(buffer, event); } -- cgit v1.2.3 From 646faf36d7271c597497ca547a59912fcab49be9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Fri, 18 Jul 2025 11:18:54 +0200 Subject: cgroup: Add compatibility option for content of /proc/cgroups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit /proc/cgroups lists only v1 controllers by default, however, this is only enforced since the commit af000ce85293b ("cgroup: Do not report unavailable v1 controllers in /proc/cgroups") and there is software in the wild that uses content of /proc/cgroups to decide on availability of v2 (sic) controllers. Add a boottime param that can bring back the previous behavior for setups where the check in the software cannot be changed and it causes e.g. unintended OOMs. Also, this patch takes out cgrp_v1_visible from cgroup1_subsys_absent() guard since it's only important to check which hierarchy (v1 vs v2) the subsys is attached to. This has no effect on the printed message but the code is cleaner since cgrp_v1_visible is really about mounted hierarchies, not the content of /proc/cgroups. 
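Usage illustration (not part of the patch text): the compatibility behaviour is selected on the kernel command line and parsed with kstrtobool(), as shown in the diff below, so for example:

	cgroup_v1_proc=1

boots the system with /proc/cgroups listing all controllers again, as it did before commit af000ce85293b.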
Link: https://lore.kernel.org/r/b26b60b7d0d2a5ecfd2f3c45f95f32922ed24686.camel@decadent.org.uk Fixes: af000ce85293b ("cgroup: Do not report unavailable v1 controllers in /proc/cgroups") Fixes: a0ab1453226d8 ("cgroup: Print message when /proc/cgroups is read on v2-only system") Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index fa24c032ed6f..2a4a387f867a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -32,6 +32,9 @@ static u16 cgroup_no_v1_mask; /* disable named v1 mounts */ static bool cgroup_no_v1_named; +/* Show unavailable controllers in /proc/cgroups */ +static bool proc_show_all; + /* * pidlist destructions need to be flushed on cgroup destruction. Use a * separate workqueue as flush domain. @@ -683,10 +686,11 @@ int proc_cgroupstats_show(struct seq_file *m, void *v) */ for_each_subsys(ss, i) { - if (cgroup1_subsys_absent(ss)) - continue; cgrp_v1_visible |= ss->root != &cgrp_dfl_root; + if (!proc_show_all && cgroup1_subsys_absent(ss)) + continue; + seq_printf(m, "%s\t%d\t%d\t%d\n", ss->legacy_name, ss->root->hierarchy_id, atomic_read(&ss->root->nr_cgrps), @@ -1359,3 +1363,9 @@ static int __init cgroup_no_v1(char *str) return 1; } __setup("cgroup_no_v1=", cgroup_no_v1); + +static int __init cgroup_v1_proc(char *str) +{ + return (kstrtobool(str, &proc_show_all) == 0); +} +__setup("cgroup_v1_proc=", cgroup_v1_proc); -- cgit v1.2.3 From b5e8acc14dcb314a9b61ff19dcd9fdd0d88f70df Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 18 Jul 2025 22:31:58 -0400 Subject: tracing: Add down_write(trace_event_sem) when adding trace event MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a module is loaded, it adds trace events defined by the module. It may also need to modify the modules trace printk formats to replace enum names with their values. If two modules are loaded at the same time, the adding of the event to the ftrace_events list can corrupt the walking of the list in the code that is modifying the printk format strings and crash the kernel. The addition of the event should take the trace_event_sem for write while it adds the new event. Also add a lockdep_assert_held() on that semaphore in __trace_add_event_dirs() as it iterates the list. 
Cc: stable@vger.kernel.org Cc: Mathieu Desnoyers Acked-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/20250718223158.799bfc0c@batman.local.home Reported-by: Fusheng Huang(黄富生) Closes: https://lore.kernel.org/all/20250717105007.46ccd18f@batman.local.home/ Fixes: 110bf2b764eb6 ("tracing: add protection around module events unload") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 120531268abf..d01e5c910ce1 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3136,7 +3136,10 @@ __register_event(struct trace_event_call *call, struct module *mod) if (ret < 0) return ret; + down_write(&trace_event_sem); list_add(&call->list, &ftrace_events); + up_write(&trace_event_sem); + if (call->flags & TRACE_EVENT_FL_DYNAMIC) atomic_set(&call->refcnt, 0); else @@ -3750,6 +3753,8 @@ __trace_add_event_dirs(struct trace_array *tr) struct trace_event_call *call; int ret; + lockdep_assert_held(&trace_event_sem); + list_for_each_entry(call, &ftrace_events, list) { ret = __trace_add_new_event(call, tr); if (ret < 0) -- cgit v1.2.3 From 35c18f2933c596b4fd6a98baee36f3137d133a5f Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Thu, 12 Jun 2025 12:13:21 +0200 Subject: Add a new optional ",cma" suffix to the crashkernel= command line option Patch series "kdump: crashkernel reservation from CMA", v5. This series implements a way to reserve additional crash kernel memory using CMA. Currently, all the memory for the crash kernel is not usable by the 1st (production) kernel. It is also unmapped so that it can't be corrupted by the fault that will eventually trigger the crash. This makes sense for the memory actually used by the kexec-loaded crash kernel image and initrd and the data prepared during the load (vmcoreinfo, ...). However, the reserved space needs to be much larger than that to provide enough run-time memory for the crash kernel and the kdump userspace. Estimating the amount of memory to reserve is difficult. Being too careful makes kdump likely to end in OOM, being too generous takes even more memory from the production system. Also, the reservation only allows reserving a single contiguous block (or two with the "low" suffix). I've seen systems where this fails because the physical memory is fragmented. By reserving additional crashkernel memory from CMA, the main crashkernel reservation can be just large enough to fit the kernel and initrd image, minimizing the memory taken away from the production system. Most of the run-time memory for the crash kernel will be memory previously available to userspace in the production system. As this memory is no longer wasted, the reservation can be done with a generous margin, making kdump more reliable. Kernel memory that we need to preserve for dumping is normally not allocated from CMA, unless it is explicitly allocated as movable. Currently this is only the case for memory ballooning and zswap. Such movable memory will be missing from the vmcore. User data is typically not dumped by makedumpfile. When dumping of user data is intended this new CMA reservation cannot be used. There are five patches in this series: The first adds a new ",cma" suffix to the recenly introduced generic crashkernel parsing code. parse_crashkernel() takes one more argument to store the cma reservation size. The second patch implements reserve_crashkernel_cma() which performs the reservation. 
If the requested size is not available in a single range, multiple smaller ranges will be reserved. The third patch updates Documentation/, explicitly mentioning the potential DMA corruption of the CMA-reserved memory. The fourth patch adds a short delay before booting the kdump kernel, allowing pending DMA transfers to finish. The fifth patch enables the functionality for x86 as a proof of concept. There are just three things every arch needs to do: - call reserve_crashkernel_cma() - include the CMA-reserved ranges in the physical memory map - exclude the CMA-reserved ranges from the memory available through /proc/vmcore by excluding them from the vmcoreinfo PT_LOAD ranges. Adding other architectures is easy and I can do that as soon as this series is merged. With this series applied, specifying crashkernel=100M craskhernel=1G,cma on the command line will make a standard crashkernel reservation of 100M, where kexec will load the kernel and initrd. An additional 1G will be reserved from CMA, still usable by the production system. The crash kernel will have 1.1G memory available. The 100M can be reliably predicted based on the size of the kernel and initrd. The new cma suffix is completely optional. When no crashkernel=size,cma is specified, everything works as before. This patch (of 5): Add a new cma_size parameter to parse_crashkernel(). When not NULL, call __parse_crashkernel to parse the CMA reservation size from "crashkernel=size,cma" and store it in cma_size. Set cma_size to NULL in all calls to parse_crashkernel(). Link: https://lkml.kernel.org/r/aEqnxxfLZMllMC8I@dwarf.suse.cz Link: https://lkml.kernel.org/r/aEqoQckgoTQNULnh@dwarf.suse.cz Signed-off-by: Jiri Bohac Cc: Baoquan He Cc: Dave Young Cc: Donald Dutile Cc: Michal Hocko Cc: Philipp Rudo Cc: Pingfan Liu Cc: Tao Liu Cc: Vivek Goyal Cc: David Hildenbrand Signed-off-by: Andrew Morton --- kernel/crash_reserve.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index acb6bf42e30d..86ae1365d04e 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -172,17 +172,19 @@ static int __init parse_crashkernel_simple(char *cmdline, #define SUFFIX_HIGH 0 #define SUFFIX_LOW 1 -#define SUFFIX_NULL 2 +#define SUFFIX_CMA 2 +#define SUFFIX_NULL 3 static __initdata char *suffix_tbl[] = { [SUFFIX_HIGH] = ",high", [SUFFIX_LOW] = ",low", + [SUFFIX_CMA] = ",cma", [SUFFIX_NULL] = NULL, }; /* * That function parses "suffix" crashkernel command lines like * - * crashkernel=size,[high|low] + * crashkernel=size,[high|low|cma] * * It returns 0 on success and -EINVAL on failure. 
*/ @@ -298,9 +300,11 @@ int __init parse_crashkernel(char *cmdline, unsigned long long *crash_size, unsigned long long *crash_base, unsigned long long *low_size, + unsigned long long *cma_size, bool *high) { int ret; + unsigned long long __always_unused cma_base; /* crashkernel=X[@offset] */ ret = __parse_crashkernel(cmdline, system_ram, crash_size, @@ -331,6 +335,14 @@ int __init parse_crashkernel(char *cmdline, *high = true; } + + /* + * optional CMA reservation + * cma_base is ignored + */ + if (cma_size) + __parse_crashkernel(cmdline, 0, cma_size, + &cma_base, suffix_tbl[SUFFIX_CMA]); #endif if (!*crash_size) ret = -EINVAL; -- cgit v1.2.3 From ab475510e0422bb5672d465f9d0f523d72fdb7f1 Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Thu, 12 Jun 2025 12:16:39 +0200 Subject: kdump: implement reserve_crashkernel_cma reserve_crashkernel_cma() reserves CMA ranges for the crash kernel. If allocating the requested size fails, try to reserve in smaller blocks. Store the reserved ranges in the crashk_cma_ranges array and the number of ranges in crashk_cma_cnt. Link: https://lkml.kernel.org/r/aEqpBwOy_ekm0gw9@dwarf.suse.cz Signed-off-by: Jiri Bohac Cc: Baoquan He Cc: Dave Young Cc: David Hildenbrand Cc: Donald Dutile Cc: Michal Hocko Cc: Philipp Rudo Cc: Pingfan Liu Cc: Tao Liu Cc: Vivek Goyal Signed-off-by: Andrew Morton --- kernel/crash_reserve.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'kernel') diff --git a/kernel/crash_reserve.c b/kernel/crash_reserve.c index 86ae1365d04e..87bf4d41eabb 100644 --- a/kernel/crash_reserve.c +++ b/kernel/crash_reserve.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include @@ -469,6 +471,56 @@ retry: #endif } +struct range crashk_cma_ranges[CRASHKERNEL_CMA_RANGES_MAX]; +#ifdef CRASHKERNEL_CMA +int crashk_cma_cnt; +void __init reserve_crashkernel_cma(unsigned long long cma_size) +{ + unsigned long long request_size = roundup(cma_size, PAGE_SIZE); + unsigned long long reserved_size = 0; + + if (!cma_size) + return; + + while (cma_size > reserved_size && + crashk_cma_cnt < CRASHKERNEL_CMA_RANGES_MAX) { + + struct cma *res; + + if (cma_declare_contiguous(0, request_size, 0, 0, 0, false, + "crashkernel", &res)) { + /* reservation failed, try half-sized blocks */ + if (request_size <= PAGE_SIZE) + break; + + request_size = roundup(request_size / 2, PAGE_SIZE); + continue; + } + + crashk_cma_ranges[crashk_cma_cnt].start = cma_get_base(res); + crashk_cma_ranges[crashk_cma_cnt].end = + crashk_cma_ranges[crashk_cma_cnt].start + + cma_get_size(res) - 1; + ++crashk_cma_cnt; + reserved_size += request_size; + } + + if (cma_size > reserved_size) + pr_warn("crashkernel CMA reservation failed: %lld MB requested, %lld MB reserved in %d ranges\n", + cma_size >> 20, reserved_size >> 20, crashk_cma_cnt); + else + pr_info("crashkernel CMA reserved: %lld MB in %d ranges\n", + reserved_size >> 20, crashk_cma_cnt); +} + +#else /* CRASHKERNEL_CMA */ +void __init reserve_crashkernel_cma(unsigned long long cma_size) +{ + if (cma_size) + pr_warn("crashkernel CMA reservation not supported\n"); +} +#endif + #ifndef HAVE_ARCH_ADD_CRASH_RES_TO_IOMEM_EARLY static __init int insert_crashkernel_resources(void) { -- cgit v1.2.3 From e1280f3071f11abc1bacd84937ecf077dce449f3 Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Thu, 12 Jun 2025 12:18:40 +0200 Subject: kdump: wait for DMA to finish when using CMA When re-using the CMA area for kdump there is a risk of pending DMA into pinned user pages in the CMA area. 
Pages residing in CMA areas can usually not get long-term pinned and are instead migrated away from the CMA area, so long-term pinning is typically not a concern. (BUGs in the kernel might still lead to long-term pinning of such pages if everything goes wrong.) Pages pinned without FOLL_LONGTERM remain in the CMA and may possibly be the source or destination of a pending DMA transfer. Although there is no clear specification how long a page may be pinned without FOLL_LONGTERM, pinning without the flag shows an intent of the caller to only use the memory for short-lived DMA transfers, not a transfer initiated by a device asynchronously at a random time in the future. Add a delay of CMA_DMA_TIMEOUT_SEC seconds before starting the kdump kernel, giving such short-lived DMA transfers time to finish before the CMA memory is re-used by the kdump kernel. Set CMA_DMA_TIMEOUT_SEC to 10 seconds - chosen arbitrarily as both a huge margin for a DMA transfer, yet not increasing the kdump time too significantly. Link: https://lkml.kernel.org/r/aEqpgDIBndZ5LXSo@dwarf.suse.cz Signed-off-by: Jiri Bohac Acked-by: David Hildenbrand Cc: Baoquan He Cc: Dave Young Cc: Donald Dutile Cc: Michal Hocko Cc: Philipp Rudo Cc: Pingfan Liu Cc: Tao Liu Cc: Vivek Goyal Signed-off-by: Andrew Morton --- kernel/crash_core.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 335b8425dd4b..a4ef79591eb2 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -33,6 +34,11 @@ /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; +/* time to wait for possible DMA to finish before starting the kdump kernel + * when a CMA reservation is used + */ +#define CMA_DMA_TIMEOUT_SEC 10 + #ifdef CONFIG_CRASH_DUMP int kimage_crash_copy_vmcoreinfo(struct kimage *image) @@ -97,6 +103,14 @@ int kexec_crash_loaded(void) } EXPORT_SYMBOL_GPL(kexec_crash_loaded); +static void crash_cma_clear_pending_dma(void) +{ + if (!crashk_cma_cnt) + return; + + mdelay(CMA_DMA_TIMEOUT_SEC * 1000); +} + /* * No panic_cpu check version of crash_kexec(). This function is called * only when panic_cpu holds the current CPU number; this is the only CPU @@ -119,6 +133,7 @@ void __noclone __crash_kexec(struct pt_regs *regs) crash_setup_regs(&fixed_regs, regs); crash_save_vmcoreinfo(); machine_crash_shutdown(&fixed_regs); + crash_cma_clear_pending_dma(); machine_kexec(kexec_crash_image); } kexec_unlock(); -- cgit v1.2.3 From 261743b0135d1d578cab407ba0cf226df30b43d8 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 3 Jul 2025 10:10:00 +0800 Subject: panic: clean up code for console replay Patch series "generalize panic_print's dump function to be used by other kernel parts", v3. When working on kernel stability issues, panic, task-hung and software/hardware lockup are frequently met. And to debug them, user may need lots of system information at that time, like task call stacks, lock info, memory info etc. panic case already has panic_print_sys_info() for this purpose, and has a 'panic_print' bitmask to control what kinds of information is needed, which is also helpful to debug other task-hung and lockup cases. So this patchset extracts the function out to a new file 'lib/sys_info.c', and makes it available for other cases which also need to dump system info for debugging. 
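A hedged sketch of what the extracted helper plausibly looks like; lib/sys_info.c itself is outside the kernel/ diffs shown here, so the body below simply mirrors the dispatch that the later patches remove from panic.c, using the SYS_INFO_* flag names that appear elsewhere in the series.

/* Illustrative only: assumed shape of sys_info() in lib/sys_info.c. */
void sys_info(unsigned long si_mask)
{
	if (si_mask & SYS_INFO_TASKS)
		show_state();
	if (si_mask & SYS_INFO_MEM)
		show_mem();
	if (si_mask & SYS_INFO_TIMERS)
		sysrq_timer_list_show();
	if (si_mask & SYS_INFO_LOCKS)
		debug_show_all_locks();
	if (si_mask & SYS_INFO_FTRACE)
		ftrace_dump(DUMP_ALL);
	if (si_mask & SYS_INFO_BLOCKED_TASKS)
		show_state_filter(TASK_UNINTERRUPTIBLE);
}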
Also as suggested by Petr Mladek, add 'panic_sys_info=' interface to take human readable string like "tasks,mem,locks,timers,ftrace,....", and eventually obsolete the current 'panic_print' bitmap interface. In RFC and V1 version, hung_task and SW/HW watchdog modules are enabled with the new sys_info dump interface. In v2, they are kept out for better review of current change, and will be posted later. Locally these have been used in our bug chasing for stability issues and was proven helpful. Many thanks to Petr Mladek for great suggestions on both the code and architectures! This patch (of 5): Currently the panic_print_sys_info() was called twice with different parameters to handle console replay case, which is kind of confusing. Add panic_console_replay() explicitly and rename 'PANIC_PRINT_ALL_PRINTK_MSG' to 'PANIC_CONSOLE_REPLAY', to make the code straightforward. The related kernel document is also updated. Link: https://lkml.kernel.org/r/20250703021004.42328-1-feng.tang@linux.alibaba.com Link: https://lkml.kernel.org/r/20250703021004.42328-2-feng.tang@linux.alibaba.com Signed-off-by: Feng Tang Suggested-by: Petr Mladek Reviewed-by: Petr Mladek Cc: John Ogness Cc: Jonathan Corbet Cc: Lance Yang Cc: "Paul E . McKenney" Cc: Steven Rostedt Cc: Nathan Chancellor Signed-off-by: Andrew Morton --- kernel/panic.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index b0b9a8bf4560..9b6c5dc28a65 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -74,7 +74,7 @@ EXPORT_SYMBOL_GPL(panic_timeout); #define PANIC_PRINT_TIMER_INFO 0x00000004 #define PANIC_PRINT_LOCK_INFO 0x00000008 #define PANIC_PRINT_FTRACE_INFO 0x00000010 -#define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020 +#define PANIC_CONSOLE_REPLAY 0x00000020 #define PANIC_PRINT_ALL_CPU_BT 0x00000040 #define PANIC_PRINT_BLOCKED_TASKS 0x00000080 unsigned long panic_print; @@ -238,14 +238,14 @@ void nmi_panic(struct pt_regs *regs, const char *msg) } EXPORT_SYMBOL(nmi_panic); -static void panic_print_sys_info(bool console_flush) +static void panic_console_replay(void) { - if (console_flush) { - if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG) - console_flush_on_panic(CONSOLE_REPLAY_ALL); - return; - } + if (panic_print & PANIC_CONSOLE_REPLAY) + console_flush_on_panic(CONSOLE_REPLAY_ALL); +} +static void panic_print_sys_info(void) +{ if (panic_print & PANIC_PRINT_TASK_INFO) show_state(); @@ -410,7 +410,7 @@ void panic(const char *fmt, ...) */ atomic_notifier_call_chain(&panic_notifier_list, 0, buf); - panic_print_sys_info(false); + panic_print_sys_info(); kmsg_dump_desc(KMSG_DUMP_PANIC, buf); @@ -439,7 +439,7 @@ void panic(const char *fmt, ...) debug_locks_off(); console_flush_on_panic(CONSOLE_FLUSH_PENDING); - panic_print_sys_info(true); + panic_console_replay(); if (!panic_blink) panic_blink = no_blink; -- cgit v1.2.3 From b76e89e50fc3693b7b8a443ed906320d8ccb93fd Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 3 Jul 2025 10:10:01 +0800 Subject: panic: generalize panic_print's function to show sys info 'panic_print' was introduced to help debugging kernel panic by dumping different kinds of system information like tasks' call stack, memory, ftrace buffer, etc. Actually this function could also be used to help debugging other cases like task-hung, soft/hard lockup, etc. where user may need the snapshot of system info at that time. 
Extract system info dump function related code from panic.c to separate file sys_info.[ch], for wider usage by other kernel parts for debugging. Also modify the macro names about singulars/plurals. Link: https://lkml.kernel.org/r/20250703021004.42328-3-feng.tang@linux.alibaba.com Signed-off-by: Feng Tang Suggested-by: Petr Mladek Cc: John Ogness Cc: Jonathan Corbet Cc: Lance Yang Cc: "Paul E . McKenney" Cc: Steven Rostedt Cc: Nathan Chancellor Signed-off-by: Andrew Morton --- kernel/panic.c | 36 ++++-------------------------------- 1 file changed, 4 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 9b6c5dc28a65..cbb0681177b3 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -69,14 +70,6 @@ bool panic_triggering_all_cpu_backtrace; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); -#define PANIC_PRINT_TASK_INFO 0x00000001 -#define PANIC_PRINT_MEM_INFO 0x00000002 -#define PANIC_PRINT_TIMER_INFO 0x00000004 -#define PANIC_PRINT_LOCK_INFO 0x00000008 -#define PANIC_PRINT_FTRACE_INFO 0x00000010 -#define PANIC_CONSOLE_REPLAY 0x00000020 -#define PANIC_PRINT_ALL_CPU_BT 0x00000040 -#define PANIC_PRINT_BLOCKED_TASKS 0x00000080 unsigned long panic_print; ATOMIC_NOTIFIER_HEAD(panic_notifier_list); @@ -240,31 +233,10 @@ EXPORT_SYMBOL(nmi_panic); static void panic_console_replay(void) { - if (panic_print & PANIC_CONSOLE_REPLAY) + if (panic_print & SYS_INFO_PANIC_CONSOLE_REPLAY) console_flush_on_panic(CONSOLE_REPLAY_ALL); } -static void panic_print_sys_info(void) -{ - if (panic_print & PANIC_PRINT_TASK_INFO) - show_state(); - - if (panic_print & PANIC_PRINT_MEM_INFO) - show_mem(); - - if (panic_print & PANIC_PRINT_TIMER_INFO) - sysrq_timer_list_show(); - - if (panic_print & PANIC_PRINT_LOCK_INFO) - debug_show_all_locks(); - - if (panic_print & PANIC_PRINT_FTRACE_INFO) - ftrace_dump(DUMP_ALL); - - if (panic_print & PANIC_PRINT_BLOCKED_TASKS) - show_state_filter(TASK_UNINTERRUPTIBLE); -} - void check_panic_on_warn(const char *origin) { unsigned int limit; @@ -285,7 +257,7 @@ void check_panic_on_warn(const char *origin) */ static void panic_other_cpus_shutdown(bool crash_kexec) { - if (panic_print & PANIC_PRINT_ALL_CPU_BT) { + if (panic_print & SYS_INFO_ALL_CPU_BT) { /* Temporary allow non-panic CPUs to write their backtraces. */ panic_triggering_all_cpu_backtrace = true; trigger_all_cpu_backtrace(); @@ -410,7 +382,7 @@ void panic(const char *fmt, ...) */ atomic_notifier_call_chain(&panic_notifier_list, 0, buf); - panic_print_sys_info(); + sys_info(panic_print); kmsg_dump_desc(KMSG_DUMP_PANIC, buf); -- cgit v1.2.3 From d747755917bf8ae08f490c3fe7d8e321afab8127 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 3 Jul 2025 10:10:02 +0800 Subject: panic: add 'panic_sys_info' sysctl to take human readable string parameter Bitmap definition for 'panic_print' is hard to remember and decode. Add 'panic_sys_info='sysctl to take human readable string like "tasks,mem,timers,locks,ftrace,..." and translate it into bitmap. 
The detailed mapping is: SYS_INFO_TASKS "tasks" SYS_INFO_MEM "mem" SYS_INFO_TIMERS "timers" SYS_INFO_LOCKS "locks" SYS_INFO_FTRACE "ftrace" SYS_INFO_ALL_CPU_BT "all_bt" SYS_INFO_BLOCKED_TASKS "blocked_tasks" [nathan@kernel.org: add __maybe_unused to sys_info_avail] Link: https://lkml.kernel.org/r/20250708-fix-clang-sys_info_avail-warning-v1-1-60d239eacd64@kernel.org Link: https://lkml.kernel.org/r/20250703021004.42328-4-feng.tang@linux.alibaba.com Signed-off-by: Feng Tang Suggested-by: Petr Mladek Cc: John Ogness Cc: Jonathan Corbet Cc: Lance Yang Cc: "Paul E . McKenney" Cc: Steven Rostedt Cc: Nathan Chancellor Cc: Andy Shevchenko Signed-off-by: Andrew Morton --- kernel/panic.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index cbb0681177b3..d7aa427dc23c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -126,6 +126,13 @@ static const struct ctl_table kern_panic_table[] = { .mode = 0644, .proc_handler = proc_douintvec, }, + { + .procname = "panic_sys_info", + .data = &panic_print, + .maxlen = sizeof(panic_print), + .mode = 0644, + .proc_handler = sysctl_sys_info_handler, + }, }; static __init int kernel_panic_sysctls_init(void) -- cgit v1.2.3 From 9743d12d0c63968320ece31e2e48723f3235be6d Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 3 Jul 2025 10:10:03 +0800 Subject: panic: add 'panic_sys_info=' setup option for kernel cmdline 'panic_sys_info=' sysctl interface is already added for runtime setting. Add counterpart kernel cmdline option for boottime setting. Link: https://lkml.kernel.org/r/20250703021004.42328-5-feng.tang@linux.alibaba.com Signed-off-by: Feng Tang Suggested-by: Petr Mladek Cc: John Ogness Cc: Jonathan Corbet Cc: Lance Yang Cc: "Paul E . McKenney" Cc: Steven Rostedt Cc: Nathan Chancellor Signed-off-by: Andrew Morton --- kernel/panic.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index d7aa427dc23c..d9d4fcd5e318 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -143,6 +143,15 @@ static __init int kernel_panic_sysctls_init(void) late_initcall(kernel_panic_sysctls_init); #endif +/* The format is "panic_sys_info=tasks,mem,locks,ftrace,..." */ +static int __init setup_panic_sys_info(char *buf) +{ + /* There is no risk of race in kernel boot phase */ + panic_print = sys_info_parse_param(buf); + return 1; +} +__setup("panic_sys_info=", setup_panic_sys_info); + static atomic_t warn_count = ATOMIC_INIT(0); #ifdef CONFIG_SYSFS -- cgit v1.2.3 From ee13240cd78b68430eb50af4721b3f18dd08af29 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 3 Jul 2025 10:10:04 +0800 Subject: panic: add note that panic_print sysctl interface is deprecated Add a dedicated core parameter 'panic_console_replay' for controlling console replay, and add note that 'panic_print' sysctl interface will be obsoleted by 'panic_sys_info' and 'panic_console_replay'. When it happens, the SYS_INFO_PANIC_CONSOLE_REPLAY can be removed as well. Link: https://lkml.kernel.org/r/20250703021004.42328-6-feng.tang@linux.alibaba.com Signed-off-by: Feng Tang Suggested-by: Petr Mladek Cc: John Ogness Cc: Jonathan Corbet Cc: Lance Yang Cc: "Paul E . 
McKenney" Cc: Steven Rostedt Cc: Nathan Chancellor Signed-off-by: Andrew Morton --- kernel/panic.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index d9d4fcd5e318..bb16f254cd02 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -64,6 +64,7 @@ int panic_on_warn __read_mostly; unsigned long panic_on_taint; bool panic_on_taint_nousertaint = false; static unsigned int warn_limit __read_mostly; +static bool panic_console_replay; bool panic_triggering_all_cpu_backtrace; @@ -77,6 +78,13 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); #ifdef CONFIG_SYSCTL +static int sysctl_panic_print_handler(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + pr_info_once("Kernel: 'panic_print' sysctl interface will be obsoleted by both 'panic_sys_info' and 'panic_console_replay'\n"); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); +} + static const struct ctl_table kern_panic_table[] = { #ifdef CONFIG_SMP { @@ -108,7 +116,7 @@ static const struct ctl_table kern_panic_table[] = { .data = &panic_print, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_doulongvec_minmax, + .proc_handler = sysctl_panic_print_handler, }, { .procname = "panic_on_warn", @@ -247,12 +255,6 @@ void nmi_panic(struct pt_regs *regs, const char *msg) } EXPORT_SYMBOL(nmi_panic); -static void panic_console_replay(void) -{ - if (panic_print & SYS_INFO_PANIC_CONSOLE_REPLAY) - console_flush_on_panic(CONSOLE_REPLAY_ALL); -} - void check_panic_on_warn(const char *origin) { unsigned int limit; @@ -427,7 +429,9 @@ void panic(const char *fmt, ...) debug_locks_off(); console_flush_on_panic(CONSOLE_FLUSH_PENDING); - panic_console_replay(); + if ((panic_print & SYS_INFO_PANIC_CONSOLE_REPLAY) || + panic_console_replay) + console_flush_on_panic(CONSOLE_REPLAY_ALL); if (!panic_blink) panic_blink = no_blink; @@ -869,6 +873,7 @@ core_param(panic_print, panic_print, ulong, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); core_param(panic_on_warn, panic_on_warn, int, 0644); core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644); +core_param(panic_console_replay, panic_console_replay, bool, 0644); static int __init oops_setup(char *s) { -- cgit v1.2.3 From ae2da51def76020fa16f53cd3446c00cafe41008 Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Fri, 27 Jun 2025 15:29:22 +0800 Subject: locking/rwsem: make owner helpers globally available Patch series "extend hung task blocker tracking to rwsems". Inspired by mutex blocker tracking[1], and having already extended it to semaphores, let's now add support for reader-writer semaphores (rwsems). The approach is simple: when a task enters TASK_UNINTERRUPTIBLE while waiting for an rwsem, we just call hung_task_set_blocker(). The hung task detector can then query the rwsem's owner to identify the lock holder. Tracking works reliably for writers, as there can only be a single writer holding the lock, and its task struct is stored in the owner field. The main challenge lies with readers. The owner field points to only one of many concurrent readers, so we might lose track of the blocker if that specific reader unlocks, even while others remain. This is not a significant issue, however. In practice, long-lasting lock contention is almost always caused by a writer. 
Therefore, reliably tracking the writer is the primary goal of this patch series ;) With this change, the hung task detector can now show blocker task's info like below: [Fri Jun 27 15:21:34 2025] INFO: task cat:28631 blocked for more than 122 seconds. [Fri Jun 27 15:21:34 2025] Tainted: G S 6.16.0-rc3 #8 [Fri Jun 27 15:21:34 2025] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [Fri Jun 27 15:21:34 2025] task:cat state:D stack:0 pid:28631 tgid:28631 ppid:28501 task_flags:0x400000 flags:0x00004000 [Fri Jun 27 15:21:34 2025] Call Trace: [Fri Jun 27 15:21:34 2025] [Fri Jun 27 15:21:34 2025] __schedule+0x7c7/0x1930 [Fri Jun 27 15:21:34 2025] ? __pfx___schedule+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? policy_nodemask+0x215/0x340 [Fri Jun 27 15:21:34 2025] ? _raw_spin_lock_irq+0x8a/0xe0 [Fri Jun 27 15:21:34 2025] ? __pfx__raw_spin_lock_irq+0x10/0x10 [Fri Jun 27 15:21:34 2025] schedule+0x6a/0x180 [Fri Jun 27 15:21:34 2025] schedule_preempt_disabled+0x15/0x30 [Fri Jun 27 15:21:34 2025] rwsem_down_read_slowpath+0x55e/0xe10 [Fri Jun 27 15:21:34 2025] ? __pfx_rwsem_down_read_slowpath+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? __pfx___might_resched+0x10/0x10 [Fri Jun 27 15:21:34 2025] down_read+0xc9/0x230 [Fri Jun 27 15:21:34 2025] ? __pfx_down_read+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? __debugfs_file_get+0x14d/0x700 [Fri Jun 27 15:21:34 2025] ? __pfx___debugfs_file_get+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? handle_pte_fault+0x52a/0x710 [Fri Jun 27 15:21:34 2025] ? selinux_file_permission+0x3a9/0x590 [Fri Jun 27 15:21:34 2025] read_dummy_rwsem_read+0x4a/0x90 [Fri Jun 27 15:21:34 2025] full_proxy_read+0xff/0x1c0 [Fri Jun 27 15:21:34 2025] ? rw_verify_area+0x6d/0x410 [Fri Jun 27 15:21:34 2025] vfs_read+0x177/0xa50 [Fri Jun 27 15:21:34 2025] ? __pfx_vfs_read+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? fdget_pos+0x1cf/0x4c0 [Fri Jun 27 15:21:34 2025] ksys_read+0xfc/0x1d0 [Fri Jun 27 15:21:34 2025] ? __pfx_ksys_read+0x10/0x10 [Fri Jun 27 15:21:34 2025] do_syscall_64+0x66/0x2d0 [Fri Jun 27 15:21:34 2025] entry_SYSCALL_64_after_hwframe+0x76/0x7e [Fri Jun 27 15:21:34 2025] RIP: 0033:0x7f3f8faefb40 [Fri Jun 27 15:21:34 2025] RSP: 002b:00007ffdeda5ab98 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [Fri Jun 27 15:21:34 2025] RAX: ffffffffffffffda RBX: 0000000000010000 RCX: 00007f3f8faefb40 [Fri Jun 27 15:21:34 2025] RDX: 0000000000010000 RSI: 00000000010fa000 RDI: 0000000000000003 [Fri Jun 27 15:21:34 2025] RBP: 00000000010fa000 R08: 0000000000000000 R09: 0000000000010fff [Fri Jun 27 15:21:34 2025] R10: 00007ffdeda59fe0 R11: 0000000000000246 R12: 00000000010fa000 [Fri Jun 27 15:21:34 2025] R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000fff [Fri Jun 27 15:21:34 2025] [Fri Jun 27 15:21:34 2025] INFO: task cat:28631 blocked on an rw-semaphore likely owned by task cat:28630 [Fri Jun 27 15:21:34 2025] task:cat state:S stack:0 pid:28630 tgid:28630 ppid:28501 task_flags:0x400000 flags:0x00004000 [Fri Jun 27 15:21:34 2025] Call Trace: [Fri Jun 27 15:21:34 2025] [Fri Jun 27 15:21:34 2025] __schedule+0x7c7/0x1930 [Fri Jun 27 15:21:34 2025] ? __pfx___schedule+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? __mod_timer+0x304/0xa80 [Fri Jun 27 15:21:34 2025] schedule+0x6a/0x180 [Fri Jun 27 15:21:34 2025] schedule_timeout+0xfb/0x230 [Fri Jun 27 15:21:34 2025] ? __pfx_schedule_timeout+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? __pfx_process_timeout+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? 
down_write+0xc4/0x140 [Fri Jun 27 15:21:34 2025] msleep_interruptible+0xbe/0x150 [Fri Jun 27 15:21:34 2025] read_dummy_rwsem_write+0x54/0x90 [Fri Jun 27 15:21:34 2025] full_proxy_read+0xff/0x1c0 [Fri Jun 27 15:21:34 2025] ? rw_verify_area+0x6d/0x410 [Fri Jun 27 15:21:34 2025] vfs_read+0x177/0xa50 [Fri Jun 27 15:21:34 2025] ? __pfx_vfs_read+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? fdget_pos+0x1cf/0x4c0 [Fri Jun 27 15:21:34 2025] ksys_read+0xfc/0x1d0 [Fri Jun 27 15:21:34 2025] ? __pfx_ksys_read+0x10/0x10 [Fri Jun 27 15:21:34 2025] do_syscall_64+0x66/0x2d0 [Fri Jun 27 15:21:34 2025] entry_SYSCALL_64_after_hwframe+0x76/0x7e [Fri Jun 27 15:21:34 2025] RIP: 0033:0x7f8f288efb40 [Fri Jun 27 15:21:34 2025] RSP: 002b:00007ffffb631038 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [Fri Jun 27 15:21:34 2025] RAX: ffffffffffffffda RBX: 0000000000010000 RCX: 00007f8f288efb40 [Fri Jun 27 15:21:34 2025] RDX: 0000000000010000 RSI: 000000002a4b5000 RDI: 0000000000000003 [Fri Jun 27 15:21:34 2025] RBP: 000000002a4b5000 R08: 0000000000000000 R09: 0000000000010fff [Fri Jun 27 15:21:34 2025] R10: 00007ffffb630460 R11: 0000000000000246 R12: 000000002a4b5000 [Fri Jun 27 15:21:34 2025] R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000fff [Fri Jun 27 15:21:34 2025] This patch (of 3): In preparation for extending blocker tracking to support rwsems, make the rwsem_owner() and is_rwsem_reader_owned() helpers globally available for determining if the blocker is a writer or one of the readers. Additionally, a stale owner pointer in a reader-owned rwsem can lead to false positives in blocker tracking when CONFIG_DETECT_HUNG_TASK_BLOCKER is enabled. To mitigate this, clear the owner field on the reader unlock path, similar to what CONFIG_DEBUG_RWSEMS does. A NULL owner is better than a stale one for diagnostics. Link: https://lkml.kernel.org/r/20250627072924.36567-1-lance.yang@linux.dev Link: https://lkml.kernel.org/r/20250627072924.36567-2-lance.yang@linux.dev Link: https://lore.kernel.org/all/174046694331.2194069.15472952050240807469.stgit@mhiramat.tok.corp.google.com/ [1] Signed-off-by: Lance Yang Reviewed-by: Masami Hiramatsu (Google) Cc: Anna Schumaker Cc: Boqun Feng Cc: Ingo Molnar Cc: Joel Granados Cc: John Stultz Cc: Kent Overstreet Cc: Mingzhe Yang Cc: Peter Zijlstra Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Tomasz Figa Cc: Waiman Long Cc: Will Deacon Cc: Yongliang Gao Cc: Zi Li Signed-off-by: Andrew Morton --- kernel/locking/rwsem.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 2ddb827e3bea..a310eb9896de 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -181,11 +181,11 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) __rwsem_set_reader_owned(sem, current); } -#ifdef CONFIG_DEBUG_RWSEMS +#if defined(CONFIG_DEBUG_RWSEMS) || defined(CONFIG_DETECT_HUNG_TASK_BLOCKER) /* * Return just the real task structure pointer of the owner */ -static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) +struct task_struct *rwsem_owner(struct rw_semaphore *sem) { return (struct task_struct *) (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK); @@ -194,7 +194,7 @@ static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) /* * Return true if the rwsem is owned by a reader. 
*/ -static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) +bool is_rwsem_reader_owned(struct rw_semaphore *sem) { /* * Check the count to see if it is write-locked. @@ -207,10 +207,10 @@ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) } /* - * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there - * is a task pointer in owner of a reader-owned rwsem, it will be the - * real owner or one of the real owners. The only exception is when the - * unlock is done by up_read_non_owner(). + * With CONFIG_DEBUG_RWSEMS or CONFIG_DETECT_HUNG_TASK_BLOCKER configured, + * it will make sure that the owner field of a reader-owned rwsem either + * points to a real reader-owner(s) or gets cleared. The only exception is + * when the unlock is done by up_read_non_owner(). */ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) { -- cgit v1.2.3 From 77da18de55ac6417e48905bec8b3c66f023b15a9 Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Fri, 27 Jun 2025 15:29:23 +0800 Subject: hung_task: extend hung task blocker tracking to rwsems Inspired by mutex blocker tracking[1], and having already extended it to semaphores, let's now add support for reader-writer semaphores (rwsems). The approach is simple: when a task enters TASK_UNINTERRUPTIBLE while waiting for an rwsem, we just call hung_task_set_blocker(). The hung task detector can then query the rwsem's owner to identify the lock holder. Tracking works reliably for writers, as there can only be a single writer holding the lock, and its task struct is stored in the owner field. The main challenge lies with readers. The owner field points to only one of many concurrent readers, so we might lose track of the blocker if that specific reader unlocks, even while others remain. This is not a significant issue, however. In practice, long-lasting lock contention is almost always caused by a writer. Therefore, reliably tracking the writer is the primary goal of this patch series ;) With this change, the hung task detector can now show blocker task's info like below: [Fri Jun 27 15:21:34 2025] INFO: task cat:28631 blocked for more than 122 seconds. [Fri Jun 27 15:21:34 2025] Tainted: G S 6.16.0-rc3 #8 [Fri Jun 27 15:21:34 2025] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [Fri Jun 27 15:21:34 2025] task:cat state:D stack:0 pid:28631 tgid:28631 ppid:28501 task_flags:0x400000 flags:0x00004000 [Fri Jun 27 15:21:34 2025] Call Trace: [Fri Jun 27 15:21:34 2025] [Fri Jun 27 15:21:34 2025] __schedule+0x7c7/0x1930 [Fri Jun 27 15:21:34 2025] ? __pfx___schedule+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? policy_nodemask+0x215/0x340 [Fri Jun 27 15:21:34 2025] ? _raw_spin_lock_irq+0x8a/0xe0 [Fri Jun 27 15:21:34 2025] ? __pfx__raw_spin_lock_irq+0x10/0x10 [Fri Jun 27 15:21:34 2025] schedule+0x6a/0x180 [Fri Jun 27 15:21:34 2025] schedule_preempt_disabled+0x15/0x30 [Fri Jun 27 15:21:34 2025] rwsem_down_read_slowpath+0x55e/0xe10 [Fri Jun 27 15:21:34 2025] ? __pfx_rwsem_down_read_slowpath+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? __pfx___might_resched+0x10/0x10 [Fri Jun 27 15:21:34 2025] down_read+0xc9/0x230 [Fri Jun 27 15:21:34 2025] ? __pfx_down_read+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? __debugfs_file_get+0x14d/0x700 [Fri Jun 27 15:21:34 2025] ? __pfx___debugfs_file_get+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? handle_pte_fault+0x52a/0x710 [Fri Jun 27 15:21:34 2025] ? 
selinux_file_permission+0x3a9/0x590 [Fri Jun 27 15:21:34 2025] read_dummy_rwsem_read+0x4a/0x90 [Fri Jun 27 15:21:34 2025] full_proxy_read+0xff/0x1c0 [Fri Jun 27 15:21:34 2025] ? rw_verify_area+0x6d/0x410 [Fri Jun 27 15:21:34 2025] vfs_read+0x177/0xa50 [Fri Jun 27 15:21:34 2025] ? __pfx_vfs_read+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? fdget_pos+0x1cf/0x4c0 [Fri Jun 27 15:21:34 2025] ksys_read+0xfc/0x1d0 [Fri Jun 27 15:21:34 2025] ? __pfx_ksys_read+0x10/0x10 [Fri Jun 27 15:21:34 2025] do_syscall_64+0x66/0x2d0 [Fri Jun 27 15:21:34 2025] entry_SYSCALL_64_after_hwframe+0x76/0x7e [Fri Jun 27 15:21:34 2025] RIP: 0033:0x7f3f8faefb40 [Fri Jun 27 15:21:34 2025] RSP: 002b:00007ffdeda5ab98 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [Fri Jun 27 15:21:34 2025] RAX: ffffffffffffffda RBX: 0000000000010000 RCX: 00007f3f8faefb40 [Fri Jun 27 15:21:34 2025] RDX: 0000000000010000 RSI: 00000000010fa000 RDI: 0000000000000003 [Fri Jun 27 15:21:34 2025] RBP: 00000000010fa000 R08: 0000000000000000 R09: 0000000000010fff [Fri Jun 27 15:21:34 2025] R10: 00007ffdeda59fe0 R11: 0000000000000246 R12: 00000000010fa000 [Fri Jun 27 15:21:34 2025] R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000fff [Fri Jun 27 15:21:34 2025] [Fri Jun 27 15:21:34 2025] INFO: task cat:28631 blocked on an rw-semaphore likely owned by task cat:28630 [Fri Jun 27 15:21:34 2025] task:cat state:S stack:0 pid:28630 tgid:28630 ppid:28501 task_flags:0x400000 flags:0x00004000 [Fri Jun 27 15:21:34 2025] Call Trace: [Fri Jun 27 15:21:34 2025] [Fri Jun 27 15:21:34 2025] __schedule+0x7c7/0x1930 [Fri Jun 27 15:21:34 2025] ? __pfx___schedule+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? __mod_timer+0x304/0xa80 [Fri Jun 27 15:21:34 2025] schedule+0x6a/0x180 [Fri Jun 27 15:21:34 2025] schedule_timeout+0xfb/0x230 [Fri Jun 27 15:21:34 2025] ? __pfx_schedule_timeout+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? __pfx_process_timeout+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? down_write+0xc4/0x140 [Fri Jun 27 15:21:34 2025] msleep_interruptible+0xbe/0x150 [Fri Jun 27 15:21:34 2025] read_dummy_rwsem_write+0x54/0x90 [Fri Jun 27 15:21:34 2025] full_proxy_read+0xff/0x1c0 [Fri Jun 27 15:21:34 2025] ? rw_verify_area+0x6d/0x410 [Fri Jun 27 15:21:34 2025] vfs_read+0x177/0xa50 [Fri Jun 27 15:21:34 2025] ? __pfx_vfs_read+0x10/0x10 [Fri Jun 27 15:21:34 2025] ? fdget_pos+0x1cf/0x4c0 [Fri Jun 27 15:21:34 2025] ksys_read+0xfc/0x1d0 [Fri Jun 27 15:21:34 2025] ? 
__pfx_ksys_read+0x10/0x10 [Fri Jun 27 15:21:34 2025] do_syscall_64+0x66/0x2d0 [Fri Jun 27 15:21:34 2025] entry_SYSCALL_64_after_hwframe+0x76/0x7e [Fri Jun 27 15:21:34 2025] RIP: 0033:0x7f8f288efb40 [Fri Jun 27 15:21:34 2025] RSP: 002b:00007ffffb631038 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [Fri Jun 27 15:21:34 2025] RAX: ffffffffffffffda RBX: 0000000000010000 RCX: 00007f8f288efb40 [Fri Jun 27 15:21:34 2025] RDX: 0000000000010000 RSI: 000000002a4b5000 RDI: 0000000000000003 [Fri Jun 27 15:21:34 2025] RBP: 000000002a4b5000 R08: 0000000000000000 R09: 0000000000010fff [Fri Jun 27 15:21:34 2025] R10: 00007ffffb630460 R11: 0000000000000246 R12: 000000002a4b5000 [Fri Jun 27 15:21:34 2025] R13: 0000000000000003 R14: 0000000000000000 R15: 0000000000000fff [Fri Jun 27 15:21:34 2025] [1] https://lore.kernel.org/all/174046694331.2194069.15472952050240807469.stgit@mhiramat.tok.corp.google.com/ Link: https://lkml.kernel.org/r/20250627072924.36567-3-lance.yang@linux.dev Signed-off-by: Lance Yang Suggested-by: Masami Hiramatsu (Google) Reviewed-by: Masami Hiramatsu (Google) Cc: Anna Schumaker Cc: Boqun Feng Cc: Ingo Molnar Cc: Joel Granados Cc: John Stultz Cc: Kent Overstreet Cc: Mingzhe Yang Cc: Peter Zijlstra Cc: Sergey Senozhatsky Cc: Steven Rostedt Cc: Tomasz Figa Cc: Waiman Long Cc: Will Deacon Cc: Yongliang Gao Cc: Zi Li Signed-off-by: Andrew Morton --- kernel/hung_task.c | 29 +++++++++++++++++++++++++---- kernel/locking/rwsem.c | 17 ++++++++++++++++- 2 files changed, 41 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index d2432df2b905..8708a1205f82 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -100,6 +101,7 @@ static void debug_show_blocker(struct task_struct *task) { struct task_struct *g, *t; unsigned long owner, blocker, blocker_type; + const char *rwsem_blocked_by, *rwsem_blocked_as; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held"); @@ -111,12 +113,20 @@ static void debug_show_blocker(struct task_struct *task) switch (blocker_type) { case BLOCKER_TYPE_MUTEX: - owner = mutex_get_owner( - (struct mutex *)hung_task_blocker_to_lock(blocker)); + owner = mutex_get_owner(hung_task_blocker_to_lock(blocker)); break; case BLOCKER_TYPE_SEM: - owner = sem_last_holder( - (struct semaphore *)hung_task_blocker_to_lock(blocker)); + owner = sem_last_holder(hung_task_blocker_to_lock(blocker)); + break; + case BLOCKER_TYPE_RWSEM_READER: + case BLOCKER_TYPE_RWSEM_WRITER: + owner = (unsigned long)rwsem_owner( + hung_task_blocker_to_lock(blocker)); + rwsem_blocked_as = (blocker_type == BLOCKER_TYPE_RWSEM_READER) ? + "reader" : "writer"; + rwsem_blocked_by = is_rwsem_reader_owned( + hung_task_blocker_to_lock(blocker)) ? 
+ "reader" : "writer"; break; default: WARN_ON_ONCE(1); @@ -134,6 +144,11 @@ static void debug_show_blocker(struct task_struct *task) pr_err("INFO: task %s:%d is blocked on a semaphore, but the last holder is not found.\n", task->comm, task->pid); break; + case BLOCKER_TYPE_RWSEM_READER: + case BLOCKER_TYPE_RWSEM_WRITER: + pr_err("INFO: task %s:%d is blocked on an rw-semaphore, but the owner is not found.\n", + task->comm, task->pid); + break; } return; } @@ -152,6 +167,12 @@ static void debug_show_blocker(struct task_struct *task) pr_err("INFO: task %s:%d blocked on a semaphore likely last held by task %s:%d\n", task->comm, task->pid, t->comm, t->pid); break; + case BLOCKER_TYPE_RWSEM_READER: + case BLOCKER_TYPE_RWSEM_WRITER: + pr_err("INFO: task %s:%d <%s> blocked on an rw-semaphore likely owned by task %s:%d <%s>\n", + task->comm, task->pid, rwsem_blocked_as, t->comm, + t->pid, rwsem_blocked_by); + break; } sched_show_task(t); return; diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index a310eb9896de..92c6332da401 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #ifndef CONFIG_PREEMPT_RT @@ -1065,10 +1066,13 @@ queue: wake_up_q(&wake_q); trace_contention_begin(sem, LCB_F_READ); + set_current_state(state); + + if (state == TASK_UNINTERRUPTIBLE) + hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_READER); /* wait to be given the lock */ for (;;) { - set_current_state(state); if (!smp_load_acquire(&waiter.task)) { /* Matches rwsem_mark_wake()'s smp_store_release(). */ break; @@ -1083,8 +1087,12 @@ queue: } schedule_preempt_disabled(); lockevent_inc(rwsem_sleep_reader); + set_current_state(state); } + if (state == TASK_UNINTERRUPTIBLE) + hung_task_clear_blocker(); + __set_current_state(TASK_RUNNING); lockevent_inc(rwsem_rlock); trace_contention_end(sem, 0); @@ -1146,6 +1154,9 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) set_current_state(state); trace_contention_begin(sem, LCB_F_WRITE); + if (state == TASK_UNINTERRUPTIBLE) + hung_task_set_blocker(sem, BLOCKER_TYPE_RWSEM_WRITER); + for (;;) { if (rwsem_try_write_lock(sem, &waiter)) { /* rwsem_try_write_lock() implies ACQUIRE on success */ @@ -1179,6 +1190,10 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) trylock_again: raw_spin_lock_irq(&sem->wait_lock); } + + if (state == TASK_UNINTERRUPTIBLE) + hung_task_clear_blocker(); + __set_current_state(TASK_RUNNING); raw_spin_unlock_irq(&sem->wait_lock); lockevent_inc(rwsem_wlock); -- cgit v1.2.3 From ee4a2e08c10188f02fe3fb36b6beddc3c2fdb287 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 8 Jul 2025 19:27:41 +1000 Subject: entry: Add arch_in_rcu_eqs() All architectures have an interruptible RCU extended quiescent state (EQS) as part of their idle sequences, where interrupts can occur without RCU watching. Entry code must account for this and wake RCU as necessary; the common entry code deals with this in irqentry_enter() by treating any interrupt from an idle thread as potentially having occurred within an EQS and waking RCU for the duration of the interrupt via rcu_irq_enter() .. rcu_irq_exit(). Some architectures may have other interruptible EQSs which require similar treatment. For example, on s390 it is necessary to enable interrupts around guest entry in the middle of a period where core KVM code has entered an EQS. 
So that architectures can wake RCU in these cases, this patch adds a new arch_in_rcu_eqs() hook to the common entry code which is checked in addition to the existing is_idle_task() check, with RCU woken if either returns true. A default implementation is provided which always returns false, which suffices for most architectures. As no architectures currently implement arch_in_rcu_eqs(), there should be no functional change as a result of this patch alone. A subsequent patch will add an s390 implementation to fix a latent bug with missing RCU wakeups. [ajd@linux.ibm.com: rebase, fix commit message] Signed-off-by: Mark Rutland Cc: Andy Lutomirski Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Sven Schnelle Cc: Thomas Gleixner Cc: Claudio Imbrenda Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Janosch Frank Reviewed-by: Christian Borntraeger Signed-off-by: Andrew Donnellan Acked-by: Peter Zijlstra (Intel) Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20250708092742.104309-2-ajd@linux.ibm.com Signed-off-by: Janosch Frank Message-ID: <20250708092742.104309-2-ajd@linux.ibm.com> --- kernel/entry/common.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/entry/common.c b/kernel/entry/common.c index a8dd1f27417c..eb52d38e8099 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -220,7 +220,8 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * TINY_RCU does not support EQS, so let the compiler eliminate * this part when enabled. */ - if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) { + if (!IS_ENABLED(CONFIG_TINY_RCU) && + (is_idle_task(current) || arch_in_rcu_eqs())) { /* * If RCU is not watching then the same careful * sequence vs. lockdep and tracing is required -- cgit v1.2.3 From 647fe16b46999258ce1aec41f4bdeabb4f0cc8e7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Jun 2025 10:53:11 -0400 Subject: PM: cpufreq: powernv/tracing: Move powernv_throttle trace event As the trace event powernv_throttle is only used by the powernv code, move it to a separate include file and have that code directly enable it. Trace events can take up around 5K of memory when they are defined, regardless of whether they are used or not. It wastes memory to have them defined in configurations where the tracepoint is not used. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Madhavan Srinivasan Cc: Michael Ellerman Link: https://lore.kernel.org/20250612145407.906308844@goodmis.org Fixes: 0306e481d479a ("cpufreq: powernv/tracing: Add powernv_throttle tracepoint") Acked-by: Viresh Kumar Acked-by: Rafael J.
Wysocki Signed-off-by: Steven Rostedt (Google) --- kernel/trace/power-traces.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index 21bb161c2316..f2fe33573e54 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -17,5 +17,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_frequency); -EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle); -- cgit v1.2.3 From 95993dc3039e29dabb9a50d074145d4cb757b08b Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 20 Jul 2025 09:47:54 -0700 Subject: bpf: Use ERR_CAST instead of ERR_PTR(PTR_ERR(...)) Intel linux test robot reported a warning that ERR_CAST can be used for error pointer casting instead of more-complicated/rarely-used ERR_PTR(PTR_ERR(...)) style. There is no functionality change, but still let us replace two such instances as it improves consistency and readability. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202507201048.bceHy8zX-lkp@intel.com/ Signed-off-by: Yonghong Song Signed-off-by: Martin KaFai Lau Acked-by: Eduard Zingerman Link: https://patch.msgid.link/20250720164754.3999140-1-yonghong.song@linux.dev --- kernel/bpf/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 72c8b50dca0a..2e1c0eab20c0 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -707,11 +707,11 @@ static struct bpf_prog_list *get_prog_list(struct hlist_head *progs, struct bpf_ if (is_link) { anchor_link = bpf_get_anchor_link(flags, id_or_fd); if (IS_ERR(anchor_link)) - return ERR_PTR(PTR_ERR(anchor_link)); + return ERR_CAST(anchor_link); } else if (is_id || id_or_fd) { anchor_prog = bpf_get_anchor_prog(flags, id_or_fd); if (IS_ERR(anchor_prog)) - return ERR_PTR(PTR_ERR(anchor_prog)); + return ERR_CAST(anchor_prog); } if (!anchor_prog && !anchor_link) { -- cgit v1.2.3 From 57fbad15c2eee77276a541c616589b32976d2b8e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Jul 2025 16:25:06 -0700 Subject: stackleak: Rename STACKLEAK to KSTACK_ERASE In preparation for adding Clang sanitizer coverage stack depth tracking that can support stack depth callbacks: - Add the new top-level CONFIG_KSTACK_ERASE option which will be implemented either with the stackleak GCC plugin, or with the Clang stack depth callback support. - Rename CONFIG_GCC_PLUGIN_STACKLEAK as needed to CONFIG_KSTACK_ERASE, but keep it for anything specific to the GCC plugin itself. - Rename all exposed "STACKLEAK" names and files to "KSTACK_ERASE" (named for what it does rather than what it protects against), but leave as many of the internals alone as possible to avoid even more churn. While here, also split "prev_lowest_stack" into CONFIG_KSTACK_ERASE_METRICS, since that's the only place it is referenced from. 
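For orientation, the renames visible in the diff below boil down to the following old -> new mapping (summarized purely from the hunks that follow; nothing here goes beyond them):

	CONFIG_GCC_PLUGIN_STACKLEAK (build glue)  ->  CONFIG_KSTACK_ERASE
	DISABLE_STACKLEAK_PLUGIN                  ->  DISABLE_KSTACK_ERASE
	CONFIG_STACKLEAK_RUNTIME_DISABLE          ->  CONFIG_KSTACK_ERASE_RUNTIME_DISABLE
	CONFIG_STACKLEAK_METRICS                  ->  CONFIG_KSTACK_ERASE_METRICS
	CONFIG_STACKLEAK_TRACK_MIN_SIZE           ->  CONFIG_KSTACK_ERASE_TRACK_MIN_SIZE
	STACKLEAK_POISON                          ->  KSTACK_ERASE_POISON
	STACKLEAK_SEARCH_DEPTH                    ->  KSTACK_ERASE_SEARCH_DEPTH
	kernel/stackleak.c                        ->  kernel/kstack_erase.c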
Suggested-by: Ingo Molnar Link: https://lore.kernel.org/r/20250717232519.2984886-1-kees@kernel.org Signed-off-by: Kees Cook --- kernel/Makefile | 10 +-- kernel/fork.c | 2 +- kernel/kstack_erase.c | 177 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/stackleak.c | 177 -------------------------------------------------- 4 files changed, 183 insertions(+), 183 deletions(-) create mode 100644 kernel/kstack_erase.c delete mode 100644 kernel/stackleak.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 32e80dd626af..e4f01f1d4d0c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -139,11 +139,11 @@ obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o obj-$(CONFIG_RESOURCE_KUNIT_TEST) += resource_kunit.o obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o -CFLAGS_stackleak.o += $(DISABLE_STACKLEAK_PLUGIN) -obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o -KASAN_SANITIZE_stackleak.o := n -KCSAN_SANITIZE_stackleak.o := n -KCOV_INSTRUMENT_stackleak.o := n +CFLAGS_kstack_erase.o += $(DISABLE_KSTACK_ERASE) +obj-$(CONFIG_KSTACK_ERASE) += kstack_erase.o +KASAN_SANITIZE_kstack_erase.o := n +KCSAN_SANITIZE_kstack_erase.o := n +KCOV_INSTRUMENT_kstack_erase.o := n obj-$(CONFIG_SCF_TORTURE_TEST) += scftorture.o diff --git a/kernel/fork.c b/kernel/fork.c index 1ee8eb11f38b..1ec66911f6f6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -93,7 +93,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/kstack_erase.c b/kernel/kstack_erase.c new file mode 100644 index 000000000000..201b846f8345 --- /dev/null +++ b/kernel/kstack_erase.c @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This code fills the used part of the kernel stack with a poison value + * before returning to userspace. It's part of the STACKLEAK feature + * ported from grsecurity/PaX. + * + * Author: Alexander Popov + * + * KSTACK_ERASE reduces the information which kernel stack leak bugs can + * reveal and blocks some uninitialized stack variable attacks. 
+ */ + +#include +#include + +#ifdef CONFIG_KSTACK_ERASE_RUNTIME_DISABLE +#include +#include +#include +#include + +static DEFINE_STATIC_KEY_FALSE(stack_erasing_bypass); + +#ifdef CONFIG_SYSCTL +static int stack_erasing_sysctl(const struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret = 0; + int state = !static_branch_unlikely(&stack_erasing_bypass); + int prev_state = state; + struct ctl_table table_copy = *table; + + table_copy.data = &state; + ret = proc_dointvec_minmax(&table_copy, write, buffer, lenp, ppos); + state = !!state; + if (ret || !write || state == prev_state) + return ret; + + if (state) + static_branch_disable(&stack_erasing_bypass); + else + static_branch_enable(&stack_erasing_bypass); + + pr_warn("stackleak: kernel stack erasing is %s\n", + str_enabled_disabled(state)); + return ret; +} +static const struct ctl_table stackleak_sysctls[] = { + { + .procname = "stack_erasing", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = stack_erasing_sysctl, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +}; + +static int __init stackleak_sysctls_init(void) +{ + register_sysctl_init("kernel", stackleak_sysctls); + return 0; +} +late_initcall(stackleak_sysctls_init); +#endif /* CONFIG_SYSCTL */ + +#define skip_erasing() static_branch_unlikely(&stack_erasing_bypass) +#else +#define skip_erasing() false +#endif /* CONFIG_KSTACK_ERASE_RUNTIME_DISABLE */ + +#ifndef __stackleak_poison +static __always_inline void __stackleak_poison(unsigned long erase_low, + unsigned long erase_high, + unsigned long poison) +{ + while (erase_low < erase_high) { + *(unsigned long *)erase_low = poison; + erase_low += sizeof(unsigned long); + } +} +#endif + +static __always_inline void __stackleak_erase(bool on_task_stack) +{ + const unsigned long task_stack_low = stackleak_task_low_bound(current); + const unsigned long task_stack_high = stackleak_task_high_bound(current); + unsigned long erase_low, erase_high; + + erase_low = stackleak_find_top_of_poison(task_stack_low, + current->lowest_stack); + +#ifdef CONFIG_KSTACK_ERASE_METRICS + current->prev_lowest_stack = erase_low; +#endif + + /* + * Write poison to the task's stack between 'erase_low' and + * 'erase_high'. + * + * If we're running on a different stack (e.g. an entry trampoline + * stack) we can erase everything below the pt_regs at the top of the + * task stack. + * + * If we're running on the task stack itself, we must not clobber any + * stack used by this function and its caller. We assume that this + * function has a fixed-size stack frame, and the current stack pointer + * doesn't change while we write poison. + */ + if (on_task_stack) + erase_high = current_stack_pointer; + else + erase_high = task_stack_high; + + __stackleak_poison(erase_low, erase_high, KSTACK_ERASE_POISON); + + /* Reset the 'lowest_stack' value for the next syscall */ + current->lowest_stack = task_stack_high; +} + +/* + * Erase and poison the portion of the task stack used since the last erase. + * Can be called from the task stack or an entry stack when the task stack is + * no longer in use. + */ +asmlinkage void noinstr stackleak_erase(void) +{ + if (skip_erasing()) + return; + + __stackleak_erase(on_thread_stack()); +} + +/* + * Erase and poison the portion of the task stack used since the last erase. + * Can only be called from the task stack. 
+ */ +asmlinkage void noinstr stackleak_erase_on_task_stack(void) +{ + if (skip_erasing()) + return; + + __stackleak_erase(true); +} + +/* + * Erase and poison the portion of the task stack used since the last erase. + * Can only be called from a stack other than the task stack. + */ +asmlinkage void noinstr stackleak_erase_off_task_stack(void) +{ + if (skip_erasing()) + return; + + __stackleak_erase(false); +} + +void __used __no_caller_saved_registers noinstr stackleak_track_stack(void) +{ + unsigned long sp = current_stack_pointer; + + /* + * Having CONFIG_KSTACK_ERASE_TRACK_MIN_SIZE larger than + * KSTACK_ERASE_SEARCH_DEPTH makes the poison search in + * stackleak_erase() unreliable. Let's prevent that. + */ + BUILD_BUG_ON(CONFIG_KSTACK_ERASE_TRACK_MIN_SIZE > KSTACK_ERASE_SEARCH_DEPTH); + + /* 'lowest_stack' should be aligned on the register width boundary */ + sp = ALIGN(sp, sizeof(unsigned long)); + if (sp < current->lowest_stack && + sp >= stackleak_task_low_bound(current)) { + current->lowest_stack = sp; + } +} +EXPORT_SYMBOL(stackleak_track_stack); diff --git a/kernel/stackleak.c b/kernel/stackleak.c deleted file mode 100644 index bb65321761b4..000000000000 --- a/kernel/stackleak.c +++ /dev/null @@ -1,177 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * This code fills the used part of the kernel stack with a poison value - * before returning to userspace. It's part of the STACKLEAK feature - * ported from grsecurity/PaX. - * - * Author: Alexander Popov - * - * STACKLEAK reduces the information which kernel stack leak bugs can - * reveal and blocks some uninitialized stack variable attacks. - */ - -#include -#include - -#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE -#include -#include -#include -#include - -static DEFINE_STATIC_KEY_FALSE(stack_erasing_bypass); - -#ifdef CONFIG_SYSCTL -static int stack_erasing_sysctl(const struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int ret = 0; - int state = !static_branch_unlikely(&stack_erasing_bypass); - int prev_state = state; - struct ctl_table table_copy = *table; - - table_copy.data = &state; - ret = proc_dointvec_minmax(&table_copy, write, buffer, lenp, ppos); - state = !!state; - if (ret || !write || state == prev_state) - return ret; - - if (state) - static_branch_disable(&stack_erasing_bypass); - else - static_branch_enable(&stack_erasing_bypass); - - pr_warn("stackleak: kernel stack erasing is %s\n", - str_enabled_disabled(state)); - return ret; -} -static const struct ctl_table stackleak_sysctls[] = { - { - .procname = "stack_erasing", - .data = NULL, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = stack_erasing_sysctl, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -}; - -static int __init stackleak_sysctls_init(void) -{ - register_sysctl_init("kernel", stackleak_sysctls); - return 0; -} -late_initcall(stackleak_sysctls_init); -#endif /* CONFIG_SYSCTL */ - -#define skip_erasing() static_branch_unlikely(&stack_erasing_bypass) -#else -#define skip_erasing() false -#endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */ - -#ifndef __stackleak_poison -static __always_inline void __stackleak_poison(unsigned long erase_low, - unsigned long erase_high, - unsigned long poison) -{ - while (erase_low < erase_high) { - *(unsigned long *)erase_low = poison; - erase_low += sizeof(unsigned long); - } -} -#endif - -static __always_inline void __stackleak_erase(bool on_task_stack) -{ - const unsigned long task_stack_low = stackleak_task_low_bound(current); - const unsigned long task_stack_high = 
stackleak_task_high_bound(current); - unsigned long erase_low, erase_high; - - erase_low = stackleak_find_top_of_poison(task_stack_low, - current->lowest_stack); - -#ifdef CONFIG_STACKLEAK_METRICS - current->prev_lowest_stack = erase_low; -#endif - - /* - * Write poison to the task's stack between 'erase_low' and - * 'erase_high'. - * - * If we're running on a different stack (e.g. an entry trampoline - * stack) we can erase everything below the pt_regs at the top of the - * task stack. - * - * If we're running on the task stack itself, we must not clobber any - * stack used by this function and its caller. We assume that this - * function has a fixed-size stack frame, and the current stack pointer - * doesn't change while we write poison. - */ - if (on_task_stack) - erase_high = current_stack_pointer; - else - erase_high = task_stack_high; - - __stackleak_poison(erase_low, erase_high, STACKLEAK_POISON); - - /* Reset the 'lowest_stack' value for the next syscall */ - current->lowest_stack = task_stack_high; -} - -/* - * Erase and poison the portion of the task stack used since the last erase. - * Can be called from the task stack or an entry stack when the task stack is - * no longer in use. - */ -asmlinkage void noinstr stackleak_erase(void) -{ - if (skip_erasing()) - return; - - __stackleak_erase(on_thread_stack()); -} - -/* - * Erase and poison the portion of the task stack used since the last erase. - * Can only be called from the task stack. - */ -asmlinkage void noinstr stackleak_erase_on_task_stack(void) -{ - if (skip_erasing()) - return; - - __stackleak_erase(true); -} - -/* - * Erase and poison the portion of the task stack used since the last erase. - * Can only be called from a stack other than the task stack. - */ -asmlinkage void noinstr stackleak_erase_off_task_stack(void) -{ - if (skip_erasing()) - return; - - __stackleak_erase(false); -} - -void __used __no_caller_saved_registers noinstr stackleak_track_stack(void) -{ - unsigned long sp = current_stack_pointer; - - /* - * Having CONFIG_STACKLEAK_TRACK_MIN_SIZE larger than - * STACKLEAK_SEARCH_DEPTH makes the poison search in - * stackleak_erase() unreliable. Let's prevent that. - */ - BUILD_BUG_ON(CONFIG_STACKLEAK_TRACK_MIN_SIZE > STACKLEAK_SEARCH_DEPTH); - - /* 'lowest_stack' should be aligned on the register width boundary */ - sp = ALIGN(sp, sizeof(unsigned long)); - if (sp < current->lowest_stack && - sp >= stackleak_task_low_bound(current)) { - current->lowest_stack = sp; - } -} -EXPORT_SYMBOL(stackleak_track_stack); -- cgit v1.2.3 From 9ea1e8d28add49ab3c1ecfa43f08d92ee23f3e33 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Jul 2025 16:25:07 -0700 Subject: stackleak: Rename stackleak_track_stack to __sanitizer_cov_stack_depth The Clang stack depth tracking implementation has a fixed name for the stack depth tracking callback, "__sanitizer_cov_stack_depth", so rename the GCC plugin function to match since the plugin has no external dependencies on naming. 
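To make the fixed-name requirement concrete, here is a rough, hand-written sketch of what an instrumented function effectively does once either the GCC plugin or Clang's stack-depth instrumentation is active. It is illustrative only: the call is emitted by the compiler for functions whose frame exceeds CONFIG_KSTACK_ERASE_TRACK_MIN_SIZE and is never written by hand, and instrumented_helper() is a made-up example function.

	#include <linux/string.h>
	#include <linux/compiler.h>

	void __sanitizer_cov_stack_depth(void);	/* fixed-name callback, defined in kernel/kstack_erase.c above */

	static void instrumented_helper(void)
	{
		char scratch[512];		/* frame large enough to be tracked */

		__sanitizer_cov_stack_depth();	/* records the stack pointer in current->lowest_stack if lower */

		memset(scratch, 0, sizeof(scratch));
		barrier_data(scratch);		/* keep the buffer from being optimized away */
	}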
Link: https://lore.kernel.org/r/20250717232519.2984886-2-kees@kernel.org Signed-off-by: Kees Cook --- kernel/kstack_erase.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kstack_erase.c b/kernel/kstack_erase.c index 201b846f8345..e49bb88b4f0a 100644 --- a/kernel/kstack_erase.c +++ b/kernel/kstack_erase.c @@ -156,7 +156,7 @@ asmlinkage void noinstr stackleak_erase_off_task_stack(void) __stackleak_erase(false); } -void __used __no_caller_saved_registers noinstr stackleak_track_stack(void) +void __used __no_caller_saved_registers noinstr __sanitizer_cov_stack_depth(void) { unsigned long sp = current_stack_pointer; @@ -174,4 +174,4 @@ void __used __no_caller_saved_registers noinstr stackleak_track_stack(void) current->lowest_stack = sp; } } -EXPORT_SYMBOL(stackleak_track_stack); +EXPORT_SYMBOL(__sanitizer_cov_stack_depth); -- cgit v1.2.3 From 4c56d9f7e75eb2a137584f708fa262d7e8c8a2d8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Jul 2025 16:25:17 -0700 Subject: configs/hardening: Enable CONFIG_KSTACK_ERASE Since we can wipe the stack with both Clang and the GCC plugin, enable this for the "hardening.config" for wider testing. Link: https://lore.kernel.org/r/20250717232519.2984886-12-kees@kernel.org Signed-off-by: Kees Cook --- kernel/configs/hardening.config | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config index dd7c32fb5ac1..d24c2772d04d 100644 --- a/kernel/configs/hardening.config +++ b/kernel/configs/hardening.config @@ -63,6 +63,9 @@ CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y # Initialize all stack variables to zero on function entry. CONFIG_INIT_STACK_ALL_ZERO=y +# Wipe kernel stack after syscall completion to reduce stale data lifetime. +CONFIG_KSTACK_ERASE=y + # Wipe RAM at reboot via EFI. For more details, see: # https://trustedcomputinggroup.org/resource/pc-client-work-group-platform-reset-attack-mitigation-specification/ # https://bugzilla.redhat.com/show_bug.cgi?id=1532058 -- cgit v1.2.3 From 437641a72d0a675242ae3e649a30b4c51b3ad450 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 17 Jul 2025 16:25:18 -0700 Subject: configs/hardening: Enable CONFIG_INIT_ON_FREE_DEFAULT_ON To reduce stale data lifetimes, enable CONFIG_INIT_ON_FREE_DEFAULT_ON as well. This matches the addition of CONFIG_KSTACK_ERASE=y, which does the same for stack memory. Link: https://lore.kernel.org/r/20250717232519.2984886-13-kees@kernel.org Signed-off-by: Kees Cook --- kernel/configs/hardening.config | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config index d24c2772d04d..64caaf997fc0 100644 --- a/kernel/configs/hardening.config +++ b/kernel/configs/hardening.config @@ -60,6 +60,9 @@ CONFIG_LIST_HARDENED=y # Initialize all heap variables to zero on allocation. CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y +# Initialize all heap variables to zero on free to reduce stale data lifetime. +CONFIG_INIT_ON_FREE_DEFAULT_ON=y + # Initialize all stack variables to zero on function entry. CONFIG_INIT_STACK_ALL_ZERO=y -- cgit v1.2.3 From 30a7806adab5f6b971cf07439ed6a3fac3fd80cf Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Tue, 15 Jul 2025 16:01:53 -0400 Subject: rcu: Document GP init vs hotplug-scan ordering requirements Add detailed comments explaining the critical ordering constraints during RCU grace period initialization, based on discussions with Frederic. Reviewed-by: "Paul E.
McKenney" Co-developed-by: Frederic Weisbecker Signed-off-by: Frederic Weisbecker Signed-off-by: Joel Fernandes Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tree.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8c22db759978..384af6b5cf9c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1838,6 +1838,14 @@ static noinline_for_stack bool rcu_gp_init(void) start_new_poll = rcu_sr_normal_gp_init(); /* Record GP times before starting GP, hence rcu_seq_start(). */ old_gp_seq = rcu_state.gp_seq; + /* + * Critical ordering: rcu_seq_start() must happen BEFORE the CPU hotplug + * scan below. Otherwise we risk a race where a newly onlining CPU could + * be missed by the current grace period, potentially leading to + * use-after-free errors. For a detailed explanation of this race, see + * Documentation/RCU/Design/Requirements/Requirements.rst in the + * "Hotplug CPU" section. + */ rcu_seq_start(&rcu_state.gp_seq); /* Ensure that rcu_seq_done_exact() guardband doesn't give false positives. */ WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && -- cgit v1.2.3 From 186779c036468038b0d077ec5333a51512f867e5 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Tue, 15 Jul 2025 16:01:54 -0400 Subject: rcu: Document separation of rcu_state and rnp's gp_seq The details of this are subtle and was discussed recently. Add a quick-quiz about this and refer to it from the code, for more clarity. Reviewed-by: "Paul E. McKenney" Signed-off-by: Joel Fernandes Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tree.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 384af6b5cf9c..b206db119546 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1845,6 +1845,10 @@ static noinline_for_stack bool rcu_gp_init(void) * use-after-free errors. For a detailed explanation of this race, see * Documentation/RCU/Design/Requirements/Requirements.rst in the * "Hotplug CPU" section. + * + * Also note that the root rnp's gp_seq is kept separate from, and lags, + * the rcu_state's gp_seq, for a reason. See the Quick-Quiz on + * Single-node systems for more details (in Data-Structures.rst). */ rcu_seq_start(&rcu_state.gp_seq); /* Ensure that rcu_seq_done_exact() guardband doesn't give false positives. */ -- cgit v1.2.3 From 5d71c2b53f1790c2ca09d03848839c610653d278 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Tue, 15 Jul 2025 16:01:55 -0400 Subject: rcu: Document concurrent quiescent state reporting for offline CPUs The synchronization of CPU offlining with GP initialization is confusing to put it mildly (rightfully so as the issue it deals with is complex). Recent discussions brought up a question -- what prevents the rcu_implicit_dyntick_qs() from warning about QS reports for offline CPUs (missing QS reports for offline CPUs causing indefinite hangs). QS reporting for now-offline CPUs should only happen from: - gp_init() - rcutree_cpu_report_dead() Add some documentation on this and refer to it from comments in the code explaining how QS reporting is not missed when these functions are concurrently running. I referred heavily to this post [1] about the need for the ofl_lock. [1] https://lore.kernel.org/all/20180924164443.GF4222@linux.ibm.com/ [ Applied paulmck feedback on moving documentation to Requirements.rst ] Link: https://lore.kernel.org/all/01b4d228-9416-43f8-a62e-124b92e8741a@paulmck-laptop/ Co-developed-by: "Paul E. McKenney" Signed-off-by: "Paul E. 
McKenney" Reviewed-by: Frederic Weisbecker Signed-off-by: Joel Fernandes Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tree.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b206db119546..dd282a984fec 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1886,6 +1886,10 @@ static noinline_for_stack bool rcu_gp_init(void) /* Exclude CPU hotplug operations. */ rcu_for_each_leaf_node(rnp) { local_irq_disable(); + /* + * Serialize with CPU offline. See Requirements.rst > Hotplug CPU > + * Concurrent Quiescent State Reporting for Offline CPUs. + */ arch_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && @@ -1960,7 +1964,12 @@ static noinline_for_stack bool rcu_gp_init(void) trace_rcu_grace_period_init(rcu_state.name, rnp->gp_seq, rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); - /* Quiescent states for tasks on any now-offline CPUs. */ + /* + * Quiescent states for tasks on any now-offline CPUs. Since we + * released the ofl and rnp lock before this loop, CPUs might + * have gone offline and we have to report QS on their behalf. + * See Requirements.rst > Hotplug CPU > Concurrent QS Reporting. + */ mask = rnp->qsmask & ~rnp->qsmaskinitnext; rnp->rcu_gp_init_mask = mask; if ((mask || rnp->wait_blkd_tasks) && rcu_is_leaf_node(rnp)) @@ -4386,6 +4395,13 @@ void rcutree_report_cpu_dead(void) /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; + + /* + * Hold the ofl_lock and rnp lock to avoid races between CPU going + * offline and doing a QS report (as below), versus rcu_gp_init(). + * See Requirements.rst > Hotplug CPU > Concurrent QS Reporting section + * for more details. + */ arch_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq); @@ -4396,6 +4412,7 @@ void rcutree_report_cpu_dead(void) rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); raw_spin_lock_irqsave_rcu_node(rnp, flags); } + /* Clear from ->qsmaskinitnext to mark offline. */ WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); arch_spin_unlock(&rcu_state.ofl_lock); -- cgit v1.2.3 From 67c632b4a7fbd6b76a08b86f4950f0f84de93439 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20Bl=C3=B6chl?= Date: Sun, 20 Jul 2025 15:54:51 +0200 Subject: timekeeping: Zero initialize system_counterval when querying time from phc drivers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most drivers only populate the fields cycles and cs_id of system_counterval in their get_time_fn() callback for get_device_system_crosststamp(), unless they explicitly provide nanosecond values. When the use_nsecs field was added to struct system_counterval, most drivers did not care. Clock sources other than CSID_GENERIC could then get converted in convert_base_to_cs() based on an uninitialized use_nsecs field, which usually results in -EINVAL during the following range check. Pass in a fully zero initialized system_counterval_t to cure that. 
Fixes: 6b2e29977518 ("timekeeping: Provide infrastructure for converting to/from a base clock") Signed-off-by: Markus Blöchl Signed-off-by: Thomas Gleixner Acked-by: John Stultz Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20250720-timekeeping_uninit_crossts-v2-1-f513c885b7c2@blochl.de --- kernel/time/timekeeping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index a009c91f7b05..83c65f3afcca 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1256,7 +1256,7 @@ int get_device_system_crosststamp(int (*get_time_fn) struct system_time_snapshot *history_begin, struct system_device_crosststamp *xtstamp) { - struct system_counterval_t system_counterval; + struct system_counterval_t system_counterval = {}; struct timekeeper *tk = &tk_core.timekeeper; u64 cycles, now, interval_start; unsigned int clock_was_set_seq = 0; -- cgit v1.2.3 From 46958a7bac2d32fda43fd7cd1858aa414640fbd1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Jul 2025 20:54:06 +0200 Subject: genirq: Remove pointless local variable The variable is only used at one place, which can simply take the constant as function argument. Signed-off-by: Thomas Gleixner Tested-by: Liangyan Link: https://lore.kernel.org/all/20250718185311.884314473@linutronix.de --- kernel/irq/chip.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2b274007e8ba..5bb26fc5368b 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -466,13 +466,11 @@ static bool irq_check_poll(struct irq_desc *desc) static bool irq_can_handle_pm(struct irq_desc *desc) { - unsigned int mask = IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED; - /* * If the interrupt is not in progress and is not an armed * wakeup interrupt, proceed. */ - if (!irqd_has_set(&desc->irq_data, mask)) + if (!irqd_has_set(&desc->irq_data, IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED)) return true; /* -- cgit v1.2.3 From 4e879dedd571128ed5aa4d5989ec0a1938804d20 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Jul 2025 20:54:08 +0200 Subject: genirq: Move irq_wait_for_poll() to call site Move it to the call site so that the waiting for the INPROGRESS flag can be reused by an upcoming mitigation for a potential live lock in the edge type handler. No functional change. 
Signed-off-by: Thomas Gleixner Tested-by: Liangyan Reviewed-by: Jiri Slaby Link: https://lore.kernel.org/all/20250718185311.948555026@linutronix.de --- kernel/irq/chip.c | 33 ++++++++++++++++++++++++--------- kernel/irq/internals.h | 2 +- kernel/irq/spurious.c | 37 +------------------------------------ 3 files changed, 26 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 5bb26fc5368b..290244ca28dd 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -457,11 +457,21 @@ void unmask_threaded_irq(struct irq_desc *desc) unmask_irq(desc); } -static bool irq_check_poll(struct irq_desc *desc) -{ - if (!(desc->istate & IRQS_POLL_INPROGRESS)) - return false; - return irq_wait_for_poll(desc); +/* Busy wait until INPROGRESS is cleared */ +static bool irq_wait_on_inprogress(struct irq_desc *desc) +{ + if (IS_ENABLED(CONFIG_SMP)) { + do { + raw_spin_unlock(&desc->lock); + while (irqd_irq_inprogress(&desc->irq_data)) + cpu_relax(); + raw_spin_lock(&desc->lock); + } while (irqd_irq_inprogress(&desc->irq_data)); + + /* Might have been disabled in meantime */ + return !irqd_irq_disabled(&desc->irq_data) && desc->action; + } + return false; } static bool irq_can_handle_pm(struct irq_desc *desc) @@ -481,10 +491,15 @@ static bool irq_can_handle_pm(struct irq_desc *desc) if (irq_pm_check_wakeup(desc)) return false; - /* - * Handle a potential concurrent poll on a different core. - */ - return irq_check_poll(desc); + /* Check whether the interrupt is polled on another CPU */ + if (unlikely(desc->istate & IRQS_POLL_INPROGRESS)) { + if (WARN_ONCE(irq_poll_cpu == smp_processor_id(), + "irq poll in progress on cpu %d for irq %d\n", + smp_processor_id(), desc->irq_data.irq)) + return false; + return irq_wait_on_inprogress(desc); + } + return false; } static inline bool irq_can_handle_actions(struct irq_desc *desc) diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index aebfe225c9a6..82b0d67022c4 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -20,6 +20,7 @@ #define istate core_internal_state__do_not_mess_with_it extern bool noirqdebug; +extern int irq_poll_cpu; extern struct irqaction chained_action; @@ -112,7 +113,6 @@ irqreturn_t handle_irq_event(struct irq_desc *desc); int check_irq_resend(struct irq_desc *desc, bool inject); void clear_irq_resend(struct irq_desc *desc); void irq_resend_init(struct irq_desc *desc); -bool irq_wait_for_poll(struct irq_desc *desc); void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action); void wake_threads_waitq(struct irq_desc *desc); diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 8f26982e7300..73280ccb74b0 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -19,44 +19,9 @@ static int irqfixup __read_mostly; #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) static void poll_spurious_irqs(struct timer_list *unused); static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs); -static int irq_poll_cpu; +int irq_poll_cpu; static atomic_t irq_poll_active; -/* - * We wait here for a poller to finish. - * - * If the poll runs on this CPU, then we yell loudly and return - * false. That will leave the interrupt line disabled in the worst - * case, but it should never happen. - * - * We wait until the poller is done and then recheck disabled and - * action (about to be disabled). Only if it's still active, we return - * true and let the handler run. 
- */ -bool irq_wait_for_poll(struct irq_desc *desc) -{ - lockdep_assert_held(&desc->lock); - - if (WARN_ONCE(irq_poll_cpu == smp_processor_id(), - "irq poll in progress on cpu %d for irq %d\n", - smp_processor_id(), desc->irq_data.irq)) - return false; - -#ifdef CONFIG_SMP - do { - raw_spin_unlock(&desc->lock); - while (irqd_irq_inprogress(&desc->irq_data)) - cpu_relax(); - raw_spin_lock(&desc->lock); - } while (irqd_irq_inprogress(&desc->irq_data)); - /* Might have been disabled in meantime */ - return !irqd_irq_disabled(&desc->irq_data) && desc->action; -#else - return false; -#endif -} - - /* * Recovery handler for misrouted interrupts. */ -- cgit v1.2.3 From c609045abc778689ce42e8f5827a84179ace52c5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Jul 2025 20:54:10 +0200 Subject: genirq: Split up irq_pm_check_wakeup() Let the calling code check for the IRQD_WAKEUP_ARMED flag to prepare for a live lock mitigation in the edge type handler. No functional change. Signed-off-by: Thomas Gleixner Tested-by: Liangyan Link: https://lore.kernel.org/all/20250718185312.012392426@linutronix.de --- kernel/irq/chip.c | 4 +++- kernel/irq/internals.h | 4 ++-- kernel/irq/pm.c | 16 ++++++---------- 3 files changed, 11 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 290244ca28dd..11ecf6c78c01 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -488,8 +488,10 @@ static bool irq_can_handle_pm(struct irq_desc *desc) * and suspended, disable it and notify the pm core about the * event. */ - if (irq_pm_check_wakeup(desc)) + if (unlikely(irqd_has_set(irqd, IRQD_WAKEUP_ARMED))) { + irq_pm_handle_wakeup(desc); return false; + } /* Check whether the interrupt is polled on another CPU */ if (unlikely(desc->istate & IRQS_POLL_INPROGRESS)) { diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 82b0d67022c4..0164ca48da59 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -277,11 +277,11 @@ static inline bool irq_is_nmi(struct irq_desc *desc) } #ifdef CONFIG_PM_SLEEP -bool irq_pm_check_wakeup(struct irq_desc *desc); +void irq_pm_handle_wakeup(struct irq_desc *desc); void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action); void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action); #else -static inline bool irq_pm_check_wakeup(struct irq_desc *desc) { return false; } +static inline void irq_pm_handle_wakeup(struct irq_desc *desc) { } static inline void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { } static inline void diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 445912d51033..f7394729cedc 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -13,17 +13,13 @@ #include "internals.h" -bool irq_pm_check_wakeup(struct irq_desc *desc) +void irq_pm_handle_wakeup(struct irq_desc *desc) { - if (irqd_is_wakeup_armed(&desc->irq_data)) { - irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); - desc->istate |= IRQS_SUSPENDED | IRQS_PENDING; - desc->depth++; - irq_disable(desc); - pm_system_irq_wakeup(irq_desc_get_irq(desc)); - return true; - } - return false; + irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); + desc->istate |= IRQS_SUSPENDED | IRQS_PENDING; + desc->depth++; + irq_disable(desc); + pm_system_irq_wakeup(irq_desc_get_irq(desc)); } /* -- cgit v1.2.3 From 8d39d6ec4db5da9899993092227584a97c203fd3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Jul 2025 20:54:12 +0200 Subject: genirq: Prevent migration live lock in handle_edge_irq() Yicong
reported and Liangyan debugged a live lock in handle_edge_irq() related to interrupt migration. If the interrupt affinity is moved to a new target CPU and the interrupt is currently handled on the previous target CPU for edge type interrupts the handler might get stuck on the previous target: CPU 0 (previous target) CPU 1 (new target) handle_edge_irq() repeat: handle_event() handle_edge_irq() if (INPROGESS) { set(PENDING); mask(); return; } if (PENDING) { clear(PENDING); unmask(); goto repeat; } The migration in software never completes and CPU0 continues to handle the pending events forever. This happens when the device raises interrupts with a high rate and always before handle_event() completes and before the CPU0 handler can clear INPROGRESS so that CPU1 sets the PENDING flag over and over. This has been observed in virtual machines. Prevent this by checking whether the CPU which observes the INPROGRESS flag is the new affinity target. If that's the case, do not set the PENDING flag and wait for the INPROGRESS flag to be cleared instead, so that the new interrupt is handled on the new target CPU and the previous CPU is released from the action. This is restricted to the edge type handler and only utilized on systems, which use single CPU targets for interrupt affinity. Reported-by: Yicong Shen Reported-by: Liangyan Signed-off-by: Thomas Gleixner Tested-by: Liangyan Reviewed-by: Jiri Slaby Link: https://lore.kernel.org/all/20250701163558.2588435-1-liangyan.peng@bytedance.com Link: https://lore.kernel.org/all/20250718185312.076515034@linutronix.de --- kernel/irq/chip.c | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 11ecf6c78c01..624106e886ad 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -476,11 +476,14 @@ static bool irq_wait_on_inprogress(struct irq_desc *desc) static bool irq_can_handle_pm(struct irq_desc *desc) { + struct irq_data *irqd = &desc->irq_data; + const struct cpumask *aff; + /* * If the interrupt is not in progress and is not an armed * wakeup interrupt, proceed. */ - if (!irqd_has_set(&desc->irq_data, IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED)) + if (!irqd_has_set(irqd, IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED)) return true; /* @@ -501,7 +504,41 @@ static bool irq_can_handle_pm(struct irq_desc *desc) return false; return irq_wait_on_inprogress(desc); } - return false; + + /* The below works only for single target interrupts */ + if (!IS_ENABLED(CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK) || + !irqd_is_single_target(irqd) || desc->handle_irq != handle_edge_irq) + return false; + + /* + * If the interrupt affinity was moved to this CPU and the + * interrupt is currently handled on the previous target CPU, then + * busy wait for INPROGRESS to be cleared. Otherwise for edge type + * interrupts the handler might get stuck on the previous target: + * + * CPU 0 CPU 1 (new target) + * handle_edge_irq() + * repeat: + * handle_event() handle_edge_irq() + * if (INPROGESS) { + * set(PENDING); + * mask(); + * return; + * } + * if (PENDING) { + * clear(PENDING); + * unmask(); + * goto repeat; + * } + * + * This happens when the device raises interrupts with a high rate + * and always before handle_event() completes and the CPU0 handler + * can clear INPROGRESS. This has been observed in virtual machines. 
+ */ + aff = irq_data_get_effective_affinity_mask(irqd); + if (cpumask_first(aff) != smp_processor_id()) + return false; + return irq_wait_on_inprogress(desc); } static inline bool irq_can_handle_actions(struct irq_desc *desc) -- cgit v1.2.3 From 2c9e7f857400ffecf16c49bc6d98ac43d4129fef Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 8 Jul 2025 18:33:52 +0100 Subject: genirq: Teach handle_simple_irq() to resend an in-progress interrupt It appears that the defect outlined in 9c15eeb5362c4 ("genirq: Allow fasteoi handler to resend interrupts on concurrent handling") also affects some other less stellar MSI controllers, this time using the handle_simple_irq() flow. Teach this flow about irqd_needs_resend_when_in_progress(). Given the invasive nature of this workaround, only this flow is updated. Signed-off-by: Marc Zyngier Signed-off-by: Lorenzo Pieralisi Signed-off-by: Bjorn Helgaas Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20250708173404.1278635-2-maz@kernel.org --- kernel/irq/chip.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b0e0a7332993..e3948e31e654 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -551,7 +551,13 @@ void handle_simple_irq(struct irq_desc *desc) { guard(raw_spinlock)(&desc->lock); - if (!irq_can_handle(desc)) + if (!irq_can_handle_pm(desc)) { + if (irqd_needs_resend_when_in_progress(&desc->irq_data)) + desc->istate |= IRQS_PENDING; + return; + } + + if (!irq_can_handle_actions(desc)) return; kstat_incr_irqs_this_cpu(desc); -- cgit v1.2.3 From 119a5d573622ae90ba730d18acfae9bb75d77b9a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 30 Jun 2025 18:04:40 -0400 Subject: ring-buffer: Remove ring_buffer_read_prepare_sync() When the ring buffer was first introduced, reading the non-consuming "trace" file required disabling the writing of the ring buffer. To make sure the writing was fully disabled before iterating the buffer with a non-consuming read, it would set the disable flag of the buffer and then call an RCU synchronization to make sure all the buffers were synchronized. The function ring_buffer_read_start() originally would initialize the iterator and call an RCU synchronization, but this was for each individual per CPU buffer where this would get called many times on a machine with many CPUs before the trace file could be read. The commit 72c9ddfd4c5bf ("ring-buffer: Make non-consuming read less expensive with lots of cpus.") separated ring_buffer_read_start into ring_buffer_read_prepare(), ring_buffer_read_sync() and then ring_buffer_read_start() to allow each of the per CPU buffers to be prepared, call the read_buffer_read_sync() once, and then the ring_buffer_read_start() for each of the CPUs which made things much faster. The commit 1039221cc278 ("ring-buffer: Do not disable recording when there is an iterator") removed the requirement of disabling the recording of the ring buffer in order to iterate it, but it did not remove the synchronization that was happening that was required to wait for all the buffers to have no more writers. It's now OK for the buffers to have writers and no synchronization is needed. Remove the synchronization and put back the interface for the ring buffer iterator back before commit 72c9ddfd4c5bf was applied. 
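For callers the change collapses the old three-step dance into a single call. A sketch of the usage before and after, for one CPU (error handling omitted; the iterator allocation can still fail):

	/* Before: prepare each iterator, synchronize once, then start each */
	iter = ring_buffer_read_prepare(buffer, cpu, GFP_KERNEL);
	ring_buffer_read_prepare_sync();		/* one synchronize_rcu() */
	ring_buffer_read_start(iter);
	/* ... iterate ... */
	ring_buffer_read_finish(iter);

	/* After: allocate and start in one step, no RCU synchronization */
	iter = ring_buffer_read_start(buffer, cpu, GFP_KERNEL);
	/* ... iterate ... */
	ring_buffer_read_finish(iter);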
Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250630180440.3eabb514@batman.local.home Reported-by: David Howells Fixes: 1039221cc278 ("ring-buffer: Do not disable recording when there is an iterator") Tested-by: David Howells Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 63 ++++++++-------------------------------------- kernel/trace/trace.c | 14 +++-------- kernel/trace/trace_kdb.c | 8 +++--- 3 files changed, 17 insertions(+), 68 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a99ed4716de9..903d9db75e12 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5943,24 +5943,20 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts, EXPORT_SYMBOL_GPL(ring_buffer_consume); /** - * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer + * ring_buffer_read_start - start a non consuming read of the buffer * @buffer: The ring buffer to read from * @cpu: The cpu buffer to iterate over * @flags: gfp flags to use for memory allocation * - * This performs the initial preparations necessary to iterate - * through the buffer. Memory is allocated, buffer resizing - * is disabled, and the iterator pointer is returned to the caller. - * - * After a sequence of ring_buffer_read_prepare calls, the user is - * expected to make at least one call to ring_buffer_read_prepare_sync. - * Afterwards, ring_buffer_read_start is invoked to get things going - * for real. + * This creates an iterator to allow non-consuming iteration through + * the buffer. If the buffer is disabled for writing, it will produce + * the same information each time, but if the buffer is still writing + * then the first hit of a write will cause the iteration to stop. * - * This overall must be paired with ring_buffer_read_finish. + * Must be paired with ring_buffer_read_finish. */ struct ring_buffer_iter * -ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags) +ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_iter *iter; @@ -5986,51 +5982,12 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags) atomic_inc(&cpu_buffer->resize_disabled); - return iter; -} -EXPORT_SYMBOL_GPL(ring_buffer_read_prepare); - -/** - * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls - * - * All previously invoked ring_buffer_read_prepare calls to prepare - * iterators will be synchronized. Afterwards, read_buffer_read_start - * calls on those iterators are allowed. - */ -void -ring_buffer_read_prepare_sync(void) -{ - synchronize_rcu(); -} -EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync); - -/** - * ring_buffer_read_start - start a non consuming read of the buffer - * @iter: The iterator returned by ring_buffer_read_prepare - * - * This finalizes the startup of an iteration through the buffer. - * The iterator comes from a call to ring_buffer_read_prepare and - * an intervening ring_buffer_read_prepare_sync must have been - * performed. - * - * Must be paired with ring_buffer_read_finish. 
- */ -void -ring_buffer_read_start(struct ring_buffer_iter *iter) -{ - struct ring_buffer_per_cpu *cpu_buffer; - unsigned long flags; - - if (!iter) - return; - - cpu_buffer = iter->cpu_buffer; - - raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock); arch_spin_lock(&cpu_buffer->lock); rb_iter_reset(iter); arch_spin_unlock(&cpu_buffer->lock); - raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + return iter; } EXPORT_SYMBOL_GPL(ring_buffer_read_start); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 95ae7c4e5835..7996f26c3f46 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4735,21 +4735,15 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->array_buffer->buffer, - cpu, GFP_KERNEL); - } - ring_buffer_read_prepare_sync(); - for_each_tracing_cpu(cpu) { - ring_buffer_read_start(iter->buffer_iter[cpu]); + ring_buffer_read_start(iter->array_buffer->buffer, + cpu, GFP_KERNEL); tracing_iter_reset(iter, cpu); } } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->array_buffer->buffer, - cpu, GFP_KERNEL); - ring_buffer_read_prepare_sync(); - ring_buffer_read_start(iter->buffer_iter[cpu]); + ring_buffer_read_start(iter->array_buffer->buffer, + cpu, GFP_KERNEL); tracing_iter_reset(iter, cpu); } diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index d7b135de958a..896ff78b8349 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -43,17 +43,15 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file) if (cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter.buffer_iter[cpu] = - ring_buffer_read_prepare(iter.array_buffer->buffer, - cpu, GFP_ATOMIC); - ring_buffer_read_start(iter.buffer_iter[cpu]); + ring_buffer_read_start(iter.array_buffer->buffer, + cpu, GFP_ATOMIC); tracing_iter_reset(&iter, cpu); } } else { iter.cpu_file = cpu_file; iter.buffer_iter[cpu_file] = - ring_buffer_read_prepare(iter.array_buffer->buffer, + ring_buffer_read_start(iter.array_buffer->buffer, cpu_file, GFP_ATOMIC); - ring_buffer_read_start(iter.buffer_iter[cpu_file]); tracing_iter_reset(&iter, cpu_file); } -- cgit v1.2.3 From 218d372ce8d56c08f08d028e4c7845bc121e719f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 1 Jul 2025 19:44:51 -0400 Subject: fgraph: Keep track of when fgraph_ops are registered or not Add a warning if unregister_ftrace_graph() is called without ever registering it, or if register_ftrace_graph() is called twice. 
This can detect errors when they happen and not later when there's a side effect: Link: https://lore.kernel.org/all/20250617120830.24fbdd62@gandalf.local.home/ Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Mark Rutland Link: https://lore.kernel.org/20250701194451.22e34724@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- kernel/trace/fgraph.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index c5b207992fb4..f4d200f0c610 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -1325,6 +1325,10 @@ int register_ftrace_graph(struct fgraph_ops *gops) int ret = 0; int i = -1; + if (WARN_ONCE(gops->ops.flags & FTRACE_OPS_FL_GRAPH, + "function graph ops registered again")) + return -EBUSY; + guard(mutex)(&ftrace_lock); if (!fgraph_stack_cachep) { @@ -1401,17 +1405,21 @@ void unregister_ftrace_graph(struct fgraph_ops *gops) { int command = 0; + if (WARN_ONCE(!(gops->ops.flags & FTRACE_OPS_FL_GRAPH), + "function graph ops unregistered without registering")) + return; + guard(mutex)(&ftrace_lock); if (unlikely(!ftrace_graph_active)) - return; + goto out; if (unlikely(gops->idx < 0 || gops->idx >= FGRAPH_ARRAY_SIZE || fgraph_array[gops->idx] != gops)) - return; + goto out; if (fgraph_lru_release_index(gops->idx) < 0) - return; + goto out; fgraph_array[gops->idx] = &fgraph_stub; @@ -1434,4 +1442,6 @@ void unregister_ftrace_graph(struct fgraph_ops *gops) unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); } gops->saved_func = NULL; + out: + gops->ops.flags &= ~FTRACE_OPS_FL_GRAPH; } -- cgit v1.2.3 From 9b4d5d330fcd40bbc38a1e6ed3d617e674d651fa Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Jul 2025 11:52:22 -0400 Subject: ftrace: Make DYNAMIC_FTRACE always enabled for architectures that support it ftrace has two flavors: 1) static: Where every function always calls the ftrace trampoline 2) dynamic: Where each function has nops that can be changed on demand to jump to the ftrace trampoline when needed. The static flavor has very high performance overhead and was only created to make it easier for architectures to implement the dynamic flavor. An architecture developer can first implement the static ftrace to make sure the trampolines work before working on the more complicated dynamic aspect of ftrace. Once the architecture can support dynamic ftrace, there's no reason to continue to support the static flavor. In fact, the static flavor tends to bitrot and bugs start to appear in them. Remove the prompt to pick DYNAMIC_FTRACE and simply enable it if the architecture supports it. 
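Looking back at the fgraph warning patch above, the check turns a mispaired register/unregister into an immediate, attributable splat instead of a later side effect. A sketch of the pairing it enforces (callback setup elided for brevity):

	static struct fgraph_ops my_gops = {
		/* .entryfunc and .retfunc set up as usual */
	};

	register_ftrace_graph(&my_gops);	/* ok, sets FTRACE_OPS_FL_GRAPH */
	register_ftrace_graph(&my_gops);	/* WARNs once, returns -EBUSY */

	unregister_ftrace_graph(&my_gops);	/* ok, clears the flag */
	unregister_ftrace_graph(&my_gops);	/* WARNs once, returns early */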
Link: https://lore.kernel.org/all/f7e12c6d-892e-4ca3-9ef0-fbb524d04a48@ghiti.fr/ Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Mark Rutland Cc: Linus Torvalds Cc: Alexandre Ghiti Cc: ChenMiao Link: https://lore.kernel.org/20250703115222.2d7c8cd5@batman.local.home Signed-off-by: Steven Rostedt (Google) --- kernel/trace/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a3f35c7d83b6..28afc6941e7a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -275,7 +275,7 @@ config FUNCTION_TRACE_ARGS funcgraph-args (for the function graph tracer) config DYNAMIC_FTRACE - bool "enable/disable function tracing dynamically" + bool depends on FUNCTION_TRACER depends on HAVE_DYNAMIC_FTRACE default y -- cgit v1.2.3 From c897c1e5b19dd4fc32e84fa1ab2065c2507be3a7 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 26 Jun 2025 17:19:40 +0200 Subject: tracing: Remove pointless memory barriers Memory barriers are useful to ensure memory accesses from one CPU appear in the original order as seen by other CPUs. Some smp_rmb() and smp_wmb() are used, but they are not ordering multiple memory accesses. Remove them. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Gabriele Monaco Link: https://lore.kernel.org/20250626151940.1756398-1-namcao@linutronix.de Signed-off-by: Nam Cao Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/rv.c | 6 ------ kernel/trace/trace.c | 7 ------- 2 files changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index e4077500a91d..c04a49da4328 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -675,8 +675,6 @@ static bool __read_mostly monitoring_on; */ bool rv_monitoring_on(void) { - /* Ensures that concurrent monitors read consistent monitoring_on */ - smp_rmb(); return READ_ONCE(monitoring_on); } @@ -696,8 +694,6 @@ static ssize_t monitoring_on_read_data(struct file *filp, char __user *user_buf, static void turn_monitoring_off(void) { WRITE_ONCE(monitoring_on, false); - /* Ensures that concurrent monitors read consistent monitoring_on */ - smp_wmb(); } static void reset_all_monitors(void) @@ -713,8 +709,6 @@ static void reset_all_monitors(void) static void turn_monitoring_on(void) { WRITE_ONCE(monitoring_on, true); - /* Ensures that concurrent monitors read consistent monitoring_on */ - smp_wmb(); } static void turn_monitoring_on_with_reset(void) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 95ae7c4e5835..0dff4298fc0e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -936,7 +936,6 @@ int tracing_is_enabled(void) * return the mirror variable of the state of the ring buffer. * It's a little racy, but we don't really care. */ - smp_rmb(); return !global_trace.buffer_disabled; } @@ -1107,8 +1106,6 @@ void tracer_tracing_on(struct trace_array *tr) * important to be fast than accurate. */ tr->buffer_disabled = 0; - /* Make the flag seen by readers */ - smp_wmb(); } /** @@ -1640,8 +1637,6 @@ void tracer_tracing_off(struct trace_array *tr) * important to be fast than accurate. 
*/ tr->buffer_disabled = 1; - /* Make the flag seen by readers */ - smp_wmb(); } /** @@ -2710,8 +2705,6 @@ void trace_buffered_event_enable(void) static void enable_trace_buffered_event(void *data) { - /* Probably not needed, but do it anyway */ - smp_rmb(); this_cpu_dec(trace_buffered_event_cnt); } -- cgit v1.2.3 From 07c3f391bcb217b6949b49785ccb5fee02be21fe Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 2 Jul 2025 14:36:57 -0400 Subject: tracing: Remove EVENT_FILE_FL_SOFT_MODE flag When soft disabling of trace events was first created, it needed to have a way to know if a file had a user that was using it with soft disabled (for triggers that need to enable or disable events from a context that can not really enable or disable the event, it would set SOFT_DISABLED to state it is disabled). The flag SOFT_MODE was used to denote that an event had a user that would enable or disable it via the SOFT_DISABLED flag. Commit 1cf4c0732db3c ("tracing: Modify soft-mode only if there's no other referrer") fixed a bug where if two users were using the SOFT_DISABLED flag the accounting would get messed up as the SOFT_MODE flag could only handle one user. That commit added the sm_ref counter which kept track of how many users were using the event in "soft mode". This made the SOFT_MODE flag redundant as it should only be set if the sm_ref counter is non zero. Remove the SOFT_MODE flag and just use the sm_ref counter to know the event is in soft mode or not. This makes the code a bit simpler. Link: https://lore.kernel.org/all/20250702111908.03759998@batman.local.home/ Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Gabriele Paoloni Link: https://lore.kernel.org/20250702143657.18dd1882@batman.local.home Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 120531268abf..0980f4def360 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -768,6 +768,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, { struct trace_event_call *call = file->event_call; struct trace_array *tr = file->tr; + bool soft_mode = atomic_read(&file->sm_ref) != 0; int ret = 0; int disable; @@ -782,7 +783,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, * is set we do not want the event to be enabled before we * clear the bit. * - * When soft_disable is not set but the SOFT_MODE flag is, + * When soft_disable is not set but the soft_mode is, * we do nothing. Do not disable the tracepoint, otherwise * "soft enable"s (clearing the SOFT_DISABLED bit) wont work. 
*/ @@ -790,11 +791,11 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, if (atomic_dec_return(&file->sm_ref) > 0) break; disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED; - clear_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags); + soft_mode = false; /* Disable use of trace_buffered_event */ trace_buffered_event_disable(); } else - disable = !(file->flags & EVENT_FILE_FL_SOFT_MODE); + disable = !soft_mode; if (disable && (file->flags & EVENT_FILE_FL_ENABLED)) { clear_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags); @@ -812,8 +813,8 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, WARN_ON_ONCE(ret); } - /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */ - if (file->flags & EVENT_FILE_FL_SOFT_MODE) + /* If in soft mode, just set the SOFT_DISABLE_BIT, else clear it */ + if (soft_mode) set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); else clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); @@ -823,7 +824,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, * When soft_disable is set and enable is set, we want to * register the tracepoint for the event, but leave the event * as is. That means, if the event was already enabled, we do - * nothing (but set SOFT_MODE). If the event is disabled, we + * nothing (but set soft_mode). If the event is disabled, we * set SOFT_DISABLED before enabling the event tracepoint, so * it still seems to be disabled. */ @@ -832,7 +833,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, else { if (atomic_inc_return(&file->sm_ref) > 1) break; - set_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags); + soft_mode = true; /* Enable use of trace_buffered_event */ trace_buffered_event_enable(); } @@ -840,7 +841,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, if (!(file->flags & EVENT_FILE_FL_ENABLED)) { bool cmd = false, tgid = false; - /* Keep the event disabled, when going to SOFT_MODE. */ + /* Keep the event disabled, when going to soft mode. */ if (soft_disable) set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags); @@ -1792,8 +1793,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, !(flags & EVENT_FILE_FL_SOFT_DISABLED)) strcpy(buf, "1"); - if (flags & EVENT_FILE_FL_SOFT_DISABLED || - flags & EVENT_FILE_FL_SOFT_MODE) + if (atomic_read(&file->sm_ref) != 0) strcat(buf, "*"); strcat(buf, "\n"); @@ -3584,7 +3584,7 @@ static int probe_remove_event_call(struct trace_event_call *call) continue; /* * We can't rely on ftrace_event_enable_disable(enable => 0) - * we are going to do, EVENT_FILE_FL_SOFT_MODE can suppress + * we are going to do, soft mode can suppress * TRACE_REG_UNREGISTER. */ if (file->flags & EVENT_FILE_FL_ENABLED) @@ -3997,7 +3997,7 @@ static int free_probe_data(void *data) edata->ref--; if (!edata->ref) { - /* Remove the SOFT_MODE flag */ + /* Remove soft mode */ __ftrace_event_enable_disable(edata->file, 0, 1); trace_event_put_ref(edata->file->event_call); kfree(edata); -- cgit v1.2.3 From 4d6d0a6263babf7c43faa55de4fa3c6637dec624 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 4 Jul 2025 10:48:38 -0400 Subject: tracing: Remove redundant config HAVE_FTRACE_MCOUNT_RECORD Ftrace is tightly coupled with architecture specific code because it requires the use of trampolines written in assembly. This means that when a new feature or optimization is made, it must be done for all architectures. 
To simplify the approach, CONFIG_HAVE_FTRACE_* configs are added to denote which architecture has the new enhancement so that other architectures can still function until they too have been updated. The CONFIG_HAVE_FTRACE_MCOUNT was added to help simplify the DYNAMIC_FTRACE work, but now every architecture that implements DYNAMIC_FTRACE also has HAVE_FTRACE_MCOUNT set too, making it redundant with the HAVE_DYNAMIC_FTRACE. Remove the HAVE_FTRACE_MCOUNT config and use DYNAMIC_FTRACE directly where applicable. Link: https://lore.kernel.org/all/20250703154916.48e3ada7@gandalf.local.home/ Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Mark Rutland Cc: Linus Torvalds Link: https://lore.kernel.org/20250704104838.27a18690@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- kernel/module/main.c | 2 +- kernel/trace/Kconfig | 18 ++++-------------- kernel/trace/ftrace.c | 4 ---- 3 files changed, 5 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/module/main.c b/kernel/module/main.c index 413ac6ea3702..58d36f8cef0d 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -2639,7 +2639,7 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->trace_bprintk_fmt_start), &mod->num_trace_bprintk_fmt); #endif -#ifdef CONFIG_FTRACE_MCOUNT_RECORD +#ifdef CONFIG_DYNAMIC_FTRACE /* sechdrs[0].sh_size is always zero */ mod->ftrace_callsites = section_objs(info, FTRACE_CALLSITE_SECTION, sizeof(*mod->ftrace_callsites), diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 28afc6941e7a..9f2b1661a8ac 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -74,11 +74,6 @@ config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE If the architecture generates __patchable_function_entries sections but does not want them included in the ftrace locations. -config HAVE_FTRACE_MCOUNT_RECORD - bool - help - See Documentation/trace/ftrace-design.rst - config HAVE_SYSCALL_TRACEPOINTS bool help @@ -803,27 +798,22 @@ config BPF_KPROBE_OVERRIDE Allows BPF to override the execution of a probed function and set a different return value. This is used for error injection. 
-config FTRACE_MCOUNT_RECORD - def_bool y - depends on DYNAMIC_FTRACE - depends on HAVE_FTRACE_MCOUNT_RECORD - config FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY bool - depends on FTRACE_MCOUNT_RECORD + depends on DYNAMIC_FTRACE config FTRACE_MCOUNT_USE_CC def_bool y depends on $(cc-option,-mrecord-mcount) depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY - depends on FTRACE_MCOUNT_RECORD + depends on DYNAMIC_FTRACE config FTRACE_MCOUNT_USE_OBJTOOL def_bool y depends on HAVE_OBJTOOL_MCOUNT depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY depends on !FTRACE_MCOUNT_USE_CC - depends on FTRACE_MCOUNT_RECORD + depends on DYNAMIC_FTRACE select OBJTOOL config FTRACE_MCOUNT_USE_RECORDMCOUNT @@ -831,7 +821,7 @@ config FTRACE_MCOUNT_USE_RECORDMCOUNT depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY depends on !FTRACE_MCOUNT_USE_CC depends on !FTRACE_MCOUNT_USE_OBJTOOL - depends on FTRACE_MCOUNT_RECORD + depends on DYNAMIC_FTRACE config TRACING_MAP bool diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4203fad56b6c..00b76d450a89 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1042,10 +1042,6 @@ static struct ftrace_ops *removed_ops; */ static bool update_all_ops; -#ifndef CONFIG_FTRACE_MCOUNT_RECORD -# error Dynamic ftrace depends on MCOUNT_RECORD -#endif - struct ftrace_func_probe { struct ftrace_probe_ops *probe_ops; struct ftrace_ops ops; -- cgit v1.2.3 From 502ffa43994de8f038101e0920e8e87d9756c4d8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Jul 2025 09:56:28 -0400 Subject: tracing: Fix comment in trace_module_remove_events() Fix typo "allocade" -> "allocated". Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250710095628.42ed6b06@batman.local.home Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 0980f4def360..6c0783fc4c2c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3695,7 +3695,7 @@ static void trace_module_remove_events(struct module *mod) if (call->module == mod) __trace_remove_event_call(call); } - /* Check for any strings allocade for this module */ + /* Check for any strings allocated for this module */ list_for_each_entry_safe(modstr, m, &module_strings, next) { if (modstr->module != mod) continue; -- cgit v1.2.3 From 9872916ad1a1a5e7d089e05166c85dbd65e5b0e8 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 22 Jul 2025 20:19:17 +0200 Subject: kcsan: test: Initialize dummy variable Newer compiler versions rightfully point out: kernel/kcsan/kcsan_test.c:591:41: error: variable 'dummy' is uninitialized when passed as a const pointer argument here [-Werror,-Wuninitialized-const-pointer] 591 | KCSAN_EXPECT_READ_BARRIER(atomic_read(&dummy), false); | ^~~~~ 1 error generated. Although this particular test does not care about the value stored in the dummy atomic variable, let's silence the warning. 
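The fix is purely about giving the variable a defined initial value before it is handed to atomic_read(), which takes a const pointer. Reduced to its essence, outside the test harness:

	atomic_t bad;				/* no store ever happens */
	(void)atomic_read(&bad);		/* const pointer to uninitialized
						   memory: clang now warns */

	atomic_t good = ATOMIC_INIT(0);		/* defined value, no warning */
	(void)atomic_read(&good);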
Link: https://lkml.kernel.org/r/CA+G9fYu8JY=k-r0hnBRSkQQrFJ1Bz+ShdXNwC1TNeMt0eXaxeA@mail.gmail.com Fixes: 8bc32b348178 ("kcsan: test: Add test cases for memory barrier instrumentation") Reported-by: Linux Kernel Functional Testing Reviewed-by: Alexander Potapenko Signed-off-by: Marco Elver --- kernel/kcsan/kcsan_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index c2871180edcc..49ab81faaed9 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -533,7 +533,7 @@ static void test_barrier_nothreads(struct kunit *test) struct kcsan_scoped_access *reorder_access = NULL; #endif arch_spinlock_t arch_spinlock = __ARCH_SPIN_LOCK_UNLOCKED; - atomic_t dummy; + atomic_t dummy = ATOMIC_INIT(0); KCSAN_TEST_REQUIRES(test, reorder_access != NULL); KCSAN_TEST_REQUIRES(test, IS_ENABLED(CONFIG_SMP)); -- cgit v1.2.3 From d0d05f602c1504fb868ed4a560d1465d88a3c5e5 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Tue, 29 Apr 2025 14:30:00 +0200 Subject: module: Move modprobe_path and modules_disabled ctl_tables into the module subsys Move module sysctl (modprobe_path and modules_disabled) out of sysctl.c and into the modules subsystem. Make modules_disabled static as it no longer needs to be exported. Remove module.h from the includes in sysctl as it no longer uses any module exported variables. This is part of a greater effort to move ctl tables into their respective subsystems which will reduce the merge conflicts in kernel/sysctl.c. Reviewed-by: Luis Chamberlain Reviewed-by: Petr Pavlu Signed-off-by: Joel Granados --- kernel/module/internal.h | 3 +++ kernel/module/main.c | 30 +++++++++++++++++++++++++++++- kernel/sysctl.c | 20 -------------------- 3 files changed, 32 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/module/internal.h b/kernel/module/internal.h index 8d74b0a21c82..51ddd8866ef3 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -58,6 +58,9 @@ extern const struct kernel_symbol __stop___ksymtab_gpl[]; extern const u32 __start___kcrctab[]; extern const u32 __start___kcrctab_gpl[]; +#define KMOD_PATH_LEN 256 +extern char modprobe_path[]; + struct load_info { const char *name; /* pointer to module in temporary copy, freed at end of load_module() */ diff --git a/kernel/module/main.c b/kernel/module/main.c index 413ac6ea3702..c11d9a125001 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -126,9 +126,37 @@ static void mod_update_bounds(struct module *mod) } /* Block module loading/unloading? */ -int modules_disabled; +static int modules_disabled; core_param(nomodule, modules_disabled, bint, 0); +static const struct ctl_table module_sysctl_table[] = { + { + .procname = "modprobe", + .data = &modprobe_path, + .maxlen = KMOD_PATH_LEN, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "modules_disabled", + .data = &modules_disabled, + .maxlen = sizeof(int), + .mode = 0644, + /* only handle a transition from default "0" to "1" */ + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE, + }, +}; + +static int __init init_module_sysctl(void) +{ + register_sysctl_init("kernel", module_sysctl_table); + return 0; +} + +subsys_initcall(init_module_sysctl); + /* Waiting for a module to finish initializing? 
*/ static DECLARE_WAIT_QUEUE_HEAD(module_wq); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9b4f0cff76ea..473133d9651e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -19,7 +19,6 @@ * Removed it and replaced it with older style, 03/23/00, Bill Wendling */ -#include #include #include #include @@ -1616,25 +1615,6 @@ static const struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_MODULES - { - .procname = "modprobe", - .data = &modprobe_path, - .maxlen = KMOD_PATH_LEN, - .mode = 0644, - .proc_handler = proc_dostring, - }, - { - .procname = "modules_disabled", - .data = &modules_disabled, - .maxlen = sizeof(int), - .mode = 0644, - /* only handle a transition from default "0" to "1" */ - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_ONE, - }, -#endif #ifdef CONFIG_UEVENT_HELPER { .procname = "hotplug", -- cgit v1.2.3 From f1b4f23a52c272f6c1e205e8ec243f563323c5aa Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Tue, 29 Apr 2025 15:12:17 +0200 Subject: locking/rtmutex: Move max_lock_depth into rtmutex.c Move the max_lock_depth sysctl table element into rtmutex_api.c. Removed the rtmutex.h include from sysctl.c. Chose to move into rtmutex_api.c to avoid multiple registrations every time rtmutex.c is included in other files. This is part of a greater effort to move ctl tables into their respective subsystems which will reduce the merge conflicts in kernel/sysctl.c. Signed-off-by: Joel Granados --- kernel/locking/rtmutex_api.c | 18 ++++++++++++++++++ kernel/sysctl.c | 12 ------------ 2 files changed, 18 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index 2d933528a0fa..bafd5af98eae 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -13,6 +13,24 @@ */ int max_lock_depth = 1024; +static const struct ctl_table rtmutex_sysctl_table[] = { + { + .procname = "max_lock_depth", + .data = &max_lock_depth, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +}; + +static int __init init_rtmutex_sysctl(void) +{ + register_sysctl_init("kernel", rtmutex_sysctl_table); + return 0; +} + +subsys_initcall(init_rtmutex_sysctl); + /* * Debug aware fast / slowpath lock,trylock,unlock * diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 473133d9651e..a22f35013da0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -59,9 +59,6 @@ #include #include #endif -#ifdef CONFIG_RT_MUTEXES -#include -#endif /* shared constants to be used in various sysctls */ const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 }; @@ -1709,15 +1706,6 @@ static const struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_RT_MUTEXES - { - .procname = "max_lock_depth", - .data = &max_lock_depth, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_TREE_RCU { .procname = "panic_on_rcu_stall", -- cgit v1.2.3 From fff6703fc843569d7a2f78ca08e7a69a9be22b0f Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Wed, 30 Apr 2025 14:07:33 +0200 Subject: rcu: Move rcu_stall related sysctls into rcu/tree_stall.h Move sysctl_panic_on_rcu_stall and sysctl_max_rcu_stall_to_panic into the kernel/rcu subdirectory. Make these static in tree_stall.h and removed them as extern from panic.h as their scope is now confined into one file. 
This is part of a greater effort to move ctl tables into their respective subsystems which will reduce the merge conflicts in kernel/sysctl.c. Reviewed-by: Luis Chamberlain Reviewed-by: Joel Fernandes Reviewed-by: Kees Cook Signed-off-by: Joel Granados --- kernel/rcu/tree_stall.h | 33 +++++++++++++++++++++++++++++++-- kernel/sysctl.c | 20 -------------------- 2 files changed, 31 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 486c00536207..69482c2f0771 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -17,8 +17,37 @@ // Controlling CPU stall warnings, including delay calculation. /* panic() on RCU Stall sysctl. */ -int sysctl_panic_on_rcu_stall __read_mostly; -int sysctl_max_rcu_stall_to_panic __read_mostly; +static int sysctl_panic_on_rcu_stall __read_mostly; +static int sysctl_max_rcu_stall_to_panic __read_mostly; + +static const struct ctl_table rcu_stall_sysctl_table[] = { + { + .procname = "panic_on_rcu_stall", + .data = &sysctl_panic_on_rcu_stall, + .maxlen = sizeof(sysctl_panic_on_rcu_stall), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "max_rcu_stall_to_panic", + .data = &sysctl_max_rcu_stall_to_panic, + .maxlen = sizeof(sysctl_max_rcu_stall_to_panic), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, + }, +}; + +static int __init init_rcu_stall_sysctl(void) +{ + register_sysctl_init("kernel", rcu_stall_sysctl_table); + return 0; +} + +subsys_initcall(init_rcu_stall_sysctl); #ifdef CONFIG_SYSFS diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a22f35013da0..fd76f0e1d490 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1706,26 +1706,6 @@ static const struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_TREE_RCU - { - .procname = "panic_on_rcu_stall", - .data = &sysctl_panic_on_rcu_stall, - .maxlen = sizeof(sysctl_panic_on_rcu_stall), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "max_rcu_stall_to_panic", - .data = &sysctl_max_rcu_stall_to_panic, - .maxlen = sizeof(sysctl_max_rcu_stall_to_panic), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_INT_MAX, - }, -#endif }; int __init sysctl_init_bases(void) -- cgit v1.2.3 From 851911aa7210ca27f007bd79553172e2e3ba8723 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Wed, 30 Apr 2025 14:42:13 +0200 Subject: mm: move randomize_va_space into memory.c Move the randomize_va_space variable together with all its sysctl table elements into memory.c. Register it to the "kernel" directory by adding it to the subsys initialization calls This is part of a greater effort to move ctl tables into their respective subsystems which will reduce the merge conflicts in kernel/sysctl.c. 
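The mechanics are identical in each of these moves: a const ctl_table placed next to the data it controls, registered under "kernel" from an initcall in the owning file. A minimal template of the pattern (hypothetical example_knob, not one of the knobs moved here):

	static int example_knob;

	static const struct ctl_table example_sysctl_table[] = {
		{
			.procname	= "example_knob",
			.data		= &example_knob,
			.maxlen		= sizeof(int),
			.mode		= 0644,
			.proc_handler	= proc_dointvec,
		},
	};

	static int __init init_example_sysctl(void)
	{
		register_sysctl_init("kernel", example_sysctl_table);
		return 0;
	}
	subsys_initcall(init_example_sysctl);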
Reviewed-by: Luis Chamberlain Reviewed-by: Kees Cook Signed-off-by: Joel Granados --- kernel/sysctl.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index fd76f0e1d490..adc2d3ea1278 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1688,15 +1688,6 @@ static const struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#if defined(CONFIG_MMU) - { - .procname = "randomize_va_space", - .data = &randomize_va_space, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN { .procname = "ignore-unaligned-usertrap", -- cgit v1.2.3 From 9e2f403dd8c2b07aff012e72c1fe5455538d72d2 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Fri, 2 May 2025 15:32:17 +0200 Subject: parisc/power: Move soft-power into power.c Move the soft-power ctl table into parisc/power.c. As a consequence the pwrsw_enabled var is made static. This is part of a greater effort to move ctl tables into their respective subsystems which will reduce the merge conflicts in kernel/sysctl.c. Reviewed-by: Luis Chamberlain Reviewed-by: Kees Cook Signed-off-by: Joel Granados --- kernel/sysctl.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index adc2d3ea1278..718140251972 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1594,15 +1594,6 @@ static const struct ctl_table kern_table[] = { .extra2 = SYSCTL_ONE, }, #endif -#ifdef CONFIG_PARISC - { - .procname = "soft-power", - .data = &pwrsw_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW { .procname = "unaligned-trap", -- cgit v1.2.3 From 8e5f04b0d58c734c69a0b6e26317561919299638 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Fri, 2 May 2025 22:27:38 +0200 Subject: fork: mv threads-max into kernel/fork.c make sysctl_max_threads static as it no longer needs to be exported into sysctl.c. This is part of a greater effort to move ctl tables into their respective subsystems which will reduce the merge conflicts in kernel/sysctl.c. 
Reviewed-by: Luis Chamberlain Reviewed-by: Kees Cook Signed-off-by: Joel Granados --- kernel/fork.c | 20 +++++++++++++++++++- kernel/sysctl.c | 7 ------- 2 files changed, 19 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 1ee8eb11f38b..dea8e7740ad2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -3216,7 +3216,7 @@ int unshare_files(void) return 0; } -int sysctl_max_threads(const struct ctl_table *table, int write, +static int sysctl_max_threads(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; @@ -3238,3 +3238,21 @@ int sysctl_max_threads(const struct ctl_table *table, int write, return 0; } + +static const struct ctl_table fork_sysctl_table[] = { + { + .procname = "threads-max", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_max_threads, + }, +}; + +static int __init init_fork_sysctl(void) +{ + register_sysctl_init("kernel", fork_sysctl_table); + return 0; +} + +subsys_initcall(init_fork_sysctl); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 718140251972..febf328054aa 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1630,13 +1630,6 @@ static const struct ctl_table kern_table[] = { .proc_handler = proc_do_cad_pid, }, #endif - { - .procname = "threads-max", - .data = NULL, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sysctl_max_threads, - }, { .procname = "overflowuid", .data = &overflowuid, -- cgit v1.2.3 From 79ac8df97408b97175c01b6bff5ce0a97f35b439 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Fri, 2 May 2025 22:47:06 +0200 Subject: Input: sysrq: mv sysrq into drivers/tty/sysrq.c Move both sysrq ctl_table and supported sysrq_sysctl_handler helper function into drivers/tty/sysrq.c. Replaced the __do_proc_dointvec in helper function with do_proc_dointvec_minmax as the former is local to kernel/sysctl.c. Here we use the minmax version of do_proc_dointvec because do_proc_dointvec is static and calling do_proc_dointvec_minmax with a NULL min and max is the same as calling do_proc_dointvec. This is part of a greater effort to move ctl tables into their respective subsystems which will reduce the merge conflicts in kernel/sysctl.c. 
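The destination code lands in drivers/tty/sysrq.c and is therefore outside this 'kernel'-filtered view. Going by the description above, the moved handler plausibly ends up along these lines (a sketch under that assumption, not the verbatim result; proc_dointvec_minmax with NULL extra1/extra2 degenerates to plain proc_dointvec):

	static int sysrq_sysctl_handler(const struct ctl_table *table, int write,
					void *buffer, size_t *lenp, loff_t *ppos)
	{
		int tmp, ret;
		struct ctl_table t = *table;	/* table is const, work on a copy */

		tmp = sysrq_mask();
		t.data = &tmp;

		ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
		if (ret || !write)
			return ret;

		sysrq_toggle_support(tmp);
		return 0;
	}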
Reviewed-by: Kees Cook Acked-by: Greg Kroah-Hartman Signed-off-by: Joel Granados --- kernel/sysctl.c | 30 ------------------------------ 1 file changed, 30 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index febf328054aa..ebcc7d75acd9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include @@ -964,26 +963,6 @@ int proc_dou8vec_minmax(const struct ctl_table *table, int write, } EXPORT_SYMBOL_GPL(proc_dou8vec_minmax); -#ifdef CONFIG_MAGIC_SYSRQ -static int sysrq_sysctl_handler(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - int tmp, ret; - - tmp = sysrq_mask(); - - ret = __do_proc_dointvec(&tmp, table, write, buffer, - lenp, ppos, NULL, NULL); - if (ret || !write) - return ret; - - if (write) - sysrq_toggle_support(tmp); - - return 0; -} -#endif - static int __do_proc_doulongvec_minmax(void *data, const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos, @@ -1612,15 +1591,6 @@ static const struct ctl_table kern_table[] = { .proc_handler = proc_dostring, }, #endif -#ifdef CONFIG_MAGIC_SYSRQ - { - .procname = "sysrq", - .data = NULL, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = sysrq_sysctl_handler, - }, -#endif #ifdef CONFIG_PROC_SYSCTL { .procname = "cad_pid", -- cgit v1.2.3 From 942b296a6c35da6593eeeb126dce71d4e506f314 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Mon, 5 May 2025 21:20:07 +0200 Subject: sysctl: Move tainted ctl_table into kernel/panic.c Move the ctl_table with the "tainted" proc_name into kernel/panic.c. With it moves the proc_tainted helper function. This is part of a greater effort to move ctl tables into their respective subsystems which will reduce the merge conflicts in kernel/sysctl.c. Reviewed-by: Luis Chamberlain Reviewed-by: Kees Cook Signed-off-by: Joel Granados --- kernel/panic.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 49 ------------------------------------------------- 2 files changed, 50 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index b0b9a8bf4560..39e5b1ddf1a8 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -84,6 +84,50 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); #ifdef CONFIG_SYSCTL + +/* + * Taint values can only be increased + * This means we can safely use a temporary. + */ +static int proc_taint(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + unsigned long tmptaint = get_taint(); + int err; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &tmptaint; + err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + + if (write) { + int i; + + /* + * If we are relying on panic_on_taint not producing + * false positives due to userspace input, bail out + * before setting the requested taint flags. + */ + if (panic_on_taint_nousertaint && (tmptaint & panic_on_taint)) + return -EINVAL; + + /* + * Poor man's atomic or. 
Not worth adding a primitive + * to everyone's atomic.h for this + */ + for (i = 0; i < TAINT_FLAGS_COUNT; i++) + if ((1UL << i) & tmptaint) + add_taint(i, LOCKDEP_STILL_OK); + } + + return err; +} + static const struct ctl_table kern_panic_table[] = { #ifdef CONFIG_SMP { @@ -96,6 +140,12 @@ static const struct ctl_table kern_panic_table[] = { .extra2 = SYSCTL_ONE, }, #endif + { + .procname = "tainted", + .maxlen = sizeof(long), + .mode = 0644, + .proc_handler = proc_taint, + }, { .procname = "panic", .data = &panic_timeout, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ebcc7d75acd9..9d8db9cef111 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -731,49 +731,6 @@ int proc_douintvec(const struct ctl_table *table, int write, void *buffer, do_proc_douintvec_conv, NULL); } -/* - * Taint values can only be increased - * This means we can safely use a temporary. - */ -static int proc_taint(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table t; - unsigned long tmptaint = get_taint(); - int err; - - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - t = *table; - t.data = &tmptaint; - err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); - if (err < 0) - return err; - - if (write) { - int i; - - /* - * If we are relying on panic_on_taint not producing - * false positives due to userspace input, bail out - * before setting the requested taint flags. - */ - if (panic_on_taint_nousertaint && (tmptaint & panic_on_taint)) - return -EINVAL; - - /* - * Poor man's atomic or. Not worth adding a primitive - * to everyone's atomic.h for this - */ - for (i = 0; i < TAINT_FLAGS_COUNT; i++) - if ((1UL << i) & tmptaint) - add_taint(i, LOCKDEP_STILL_OK); - } - - return err; -} - /** * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure * @min: pointer to minimum allowable value @@ -1557,12 +1514,6 @@ int proc_do_static_key(const struct ctl_table *table, int write, static const struct ctl_table kern_table[] = { #ifdef CONFIG_PROC_SYSCTL - { - .procname = "tainted", - .maxlen = sizeof(long), - .mode = 0644, - .proc_handler = proc_taint, - }, { .procname = "sysctl_writes_strict", .data = &sysctl_writes_strict, -- cgit v1.2.3 From e054bcbe7e7af2baad3752f1a4916a7fffc0457e Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Mon, 5 May 2025 21:47:47 +0200 Subject: sysctl: move cad_pid into kernel/pid.c Move cad_pid as well as supporting function proc_do_cad_pid into kernel/pic.c. Replaced call to __do_proc_dointvec with proc_dointvec inside proc_do_cad_pid which requires the copy of the ctl_table to handle the temp value. This is part of a greater effort to move ctl tables into their respective subsystems which will reduce the merge conflicts in kernel/sysctl.c. 
Reviewed-by: Luis Chamberlain Reviewed-by: Kees Cook Signed-off-by: Joel Granados --- kernel/pid.c | 31 +++++++++++++++++++++++++++++++ kernel/sysctl.c | 31 ------------------------------- 2 files changed, 31 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 8317bcbc7cf7..9ea0db2cdc29 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -713,6 +713,29 @@ static struct ctl_table_root pid_table_root = { .set_ownership = pid_table_root_set_ownership, }; +static int proc_do_cad_pid(const struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + struct pid *new_pid; + pid_t tmp_pid; + int r; + struct ctl_table tmp_table = *table; + + tmp_pid = pid_vnr(cad_pid); + tmp_table.data = &tmp_pid; + + r = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); + if (r || !write) + return r; + + new_pid = find_get_pid(tmp_pid); + if (!new_pid) + return -ESRCH; + + put_pid(xchg(&cad_pid, new_pid)); + return 0; +} + static const struct ctl_table pid_table[] = { { .procname = "pid_max", @@ -723,6 +746,14 @@ static const struct ctl_table pid_table[] = { .extra1 = &pid_max_min, .extra2 = &pid_max_max, }, +#ifdef CONFIG_PROC_SYSCTL + { + .procname = "cad_pid", + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_do_cad_pid, + }, +#endif }; #endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9d8db9cef111..d5bebdd02cd4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1224,28 +1224,6 @@ int proc_dointvec_ms_jiffies(const struct ctl_table *table, int write, void *buf do_proc_dointvec_ms_jiffies_conv, NULL); } -static int proc_do_cad_pid(const struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos) -{ - struct pid *new_pid; - pid_t tmp; - int r; - - tmp = pid_vnr(cad_pid); - - r = __do_proc_dointvec(&tmp, table, write, buffer, - lenp, ppos, NULL, NULL); - if (r || !write) - return r; - - new_pid = find_get_pid(tmp); - if (!new_pid) - return -ESRCH; - - put_pid(xchg(&cad_pid, new_pid)); - return 0; -} - /** * proc_do_large_bitmap - read/write from/to a large bitmap * @table: the sysctl table @@ -1541,15 +1519,6 @@ static const struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dostring, }, -#endif -#ifdef CONFIG_PROC_SYSCTL - { - .procname = "cad_pid", - .data = NULL, - .maxlen = sizeof (int), - .mode = 0600, - .proc_handler = proc_do_cad_pid, - }, #endif { .procname = "overflowuid", -- cgit v1.2.3 From 5a477e934152d0b32201000444d7a5e8358c9480 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Thu, 8 May 2025 21:35:27 +0200 Subject: sysctl: Move sysctl_panic_on_stackoverflow to kernel/panic.c This is part of a greater effort to move ctl tables into their respective subsystems which will reduce the merge conflicts in kernel/sysctl.c. 
Reviewed-by: Kees Cook Signed-off-by: Joel Granados --- kernel/panic.c | 10 ++++++++++ kernel/sysctl.c | 10 ---------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 39e5b1ddf1a8..64e58835086d 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -183,6 +183,16 @@ static const struct ctl_table kern_panic_table[] = { .mode = 0644, .proc_handler = proc_douintvec, }, +#if (defined(CONFIG_X86_32) || defined(CONFIG_PARISC)) && \ + defined(CONFIG_DEBUG_STACKOVERFLOW) + { + .procname = "panic_on_stackoverflow", + .data = &sysctl_panic_on_stackoverflow, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif }; static __init int kernel_panic_sysctls_init(void) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d5bebdd02cd4..446d77ec44f5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1552,16 +1552,6 @@ static const struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = proc_dointvec, }, -#if (defined(CONFIG_X86_32) || defined(CONFIG_PARISC)) && \ - defined(CONFIG_DEBUG_STACKOVERFLOW) - { - .procname = "panic_on_stackoverflow", - .data = &sysctl_panic_on_stackoverflow, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN { .procname = "ignore-unaligned-usertrap", -- cgit v1.2.3 From ad0800b1d49ade38bd25409c9d66da0446977c87 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Mon, 5 May 2025 13:42:27 +0200 Subject: sysctl: Remove (very) old file changelog These comments are older than 2003 and therefore do not bare any relevance on the current state of the sysctl.c file. Remove them as they confuse more than clarify. Reviewed-by: Luis Chamberlain Reviewed-by: Kees Cook Signed-off-by: Joel Granados --- kernel/sysctl.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 446d77ec44f5..dee9a818a9bb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1,22 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* * sysctl.c: General linux system control interface - * - * Begun 24 March 1995, Stephen Tweedie - * Added /proc support, Dec 1995 - * Added bdflush entry and intvec min/max checking, 2/23/96, Tom Dyas. - * Added hooks for /proc/sys/net (minor, minor patch), 96/4/1, Mike Shaver. - * Added kernel/java-{interpreter,appletviewer}, 96/5/10, Mike Shaver. - * Dynamic registration fixes, Stephen Tweedie. - * Added kswapd-interval, ctrl-alt-del, printk stuff, 1/8/97, Chris Horn. - * Made sysctl support optional via CONFIG_SYSCTL, 1/10/97, Chris - * Horn. - * Added proc_doulongvec_ms_jiffies_minmax, 09/08/99, Carlos H. Bauer. - * Added proc_doulongvec_minmax, 09/08/99, Carlos H. Bauer. - * Changed linked lists to use list.h instead of lists.h, 02/24/00, Bill - * Wendling. - * The list_for_each() macro wasn't appropriate for the sysctl loop. - * Removed it and replaced it with older style, 03/23/00, Bill Wendling */ #include -- cgit v1.2.3 From 6519dba9af439722b3fd938dec939792cc0ecf8e Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Mon, 5 May 2025 10:53:28 +0200 Subject: sysctl: Remove superfluous includes from kernel/sysctl.c Remove the following headers from the include list in sysctl.c. * These are removed as the related variables are no longer there. 
====================  ==============================
Include               Related Var
====================  ==============================
linux/kmod.h          usermodehelper
asm/nmi.h             nmi_watchdog_enabled
asm/io.h              io_delay_type
linux/pid.h           pid_max_{,min,max}
linux/sched/sysctl.h  sysctl_{sched_*,numa_*,timer_*}
linux/mount.h         sysctl_mount_max
linux/reboot.h        poweroff_cmd
linux/ratelimit.h     {,printk_}ratelimit_state
linux/printk.h        kptr_restrict
linux/security.h      CONFIG_SECURITY_CAPABILITIES
linux/net.h           net_table
linux/key.h           key_sysctls
linux/nvs_fs.h        acpi_video_flags
linux/acpi.h          acpi_video_flags
linux/fs.h            proc_nr_files

* These are no longer needed as intermediate includes

==============
Include
==============
linux/filter.h
linux/binfmts.h

Reviewed-by: Kees Cook Reviewed-by: Luis Chamberlain Signed-off-by: Joel Granados --- kernel/sysctl.c | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index dee9a818a9bb..0716c7df7243 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -5,44 +5,24 @@ #include #include -#include #include -#include #include -#include -#include #include #include #include -#include #include #include -#include #include -#include #include #include #include -#include -#include -#include -#include #include -#include -#include -#include -#include #include "../lib/kstrtox.h" #include #include -#ifdef CONFIG_X86 -#include -#include -#endif - /* shared constants to be used in various sysctls */ const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 }; EXPORT_SYMBOL(sysctl_vals); -- cgit v1.2.3 From 88eddb0502d45680efef870ea470a9e8955c5c8b Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Fri, 27 Jun 2025 09:29:56 +0200 Subject: uevent: mv uevent_helper into kobject_uevent.c Move the uevent_helper ctl table into lib/kobject_uevent.c. Place the registration early in the initcall order with postcore_initcall. This is part of a greater effort to move ctl tables into their respective subsystems which will reduce the merge conflicts in kernel/sysctl.c. Signed-off-by: Joel Granados --- kernel/sysctl.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0716c7df7243..2df63b69edf6 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1474,15 +1474,6 @@ static const struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, -#endif -#ifdef CONFIG_UEVENT_HELPER - { - .procname = "hotplug", - .data = &uevent_helper, - .maxlen = UEVENT_HELPER_PATH_LEN, - .mode = 0644, - .proc_handler = proc_dostring, - }, #endif { .procname = "overflowuid", -- cgit v1.2.3 From 25ebbce1f188aa2d3e83fcfcf24da8610362564b Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Fri, 27 Jun 2025 10:00:51 +0200 Subject: kernel/sys.c: Move overflow{uid,gid} sysctl into kernel/sys.c Move the ctl_table elements for overflowuid and overflowgid into kernel/sys.c. Create a register function that keeps them under "kernel" and run it after core with postcore_initcall. This is part of a greater effort to move ctl tables into their respective subsystems which will reduce the merge conflicts in kernel/sysctl.c.
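The move is invisible from userspace: both knobs keep their paths under /proc/sys/kernel/. A quick way to confirm that after the change (an illustrative userspace sketch, not part of the patch):

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/sys/kernel/overflowuid", "r");
		int uid;

		if (f && fscanf(f, "%d", &uid) == 1)
			printf("overflowuid = %d\n", uid);	/* typically 65534 */
		if (f)
			fclose(f);
		return 0;
	}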
Signed-off-by: Joel Granados --- kernel/sys.c | 30 ++++++++++++++++++++++++++++++ kernel/sysctl.c | 18 ------------------ 2 files changed, 30 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index adc0de0aa364..bbeee62f9abc 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -181,6 +181,36 @@ int fs_overflowgid = DEFAULT_FS_OVERFLOWGID; EXPORT_SYMBOL(fs_overflowuid); EXPORT_SYMBOL(fs_overflowgid); +static const struct ctl_table overflow_sysctl_table[] = { + { + .procname = "overflowuid", + .data = &overflowuid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_MAXOLDUID, + }, + { + .procname = "overflowgid", + .data = &overflowgid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_MAXOLDUID, + }, +}; + +static int __init init_overflow_sysctl(void) +{ + register_sysctl_init("kernel", overflow_sysctl_table); + return 0; +} + +postcore_initcall(init_overflow_sysctl); + + /* * Returns true if current's euid is same as p's uid or euid, * or has CAP_SYS_NICE to p's user_ns. diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2df63b69edf6..21b70443aea7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1475,24 +1475,6 @@ static const struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif - { - .procname = "overflowuid", - .data = &overflowuid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_MAXOLDUID, - }, - { - .procname = "overflowgid", - .data = &overflowgid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_MAXOLDUID, - }, { .procname = "ngroups_max", .data = (void *)&ngroups_max, -- cgit v1.2.3 From 73184c8e4ff447b866dac13fc4f1a4079c78a69d Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Fri, 27 Jun 2025 10:12:40 +0200 Subject: sysctl: rename kern_table -> sysctl_subsys_table Renamed sysctl table from kern_table to sysctl_subsys_table and grouped the two arch specific ctls to the end of the array. This is part of a greater effort to move ctl tables into their respective subsystems which will reduce the merge conflicts in kernel/sysctl.c. Signed-off-by: Joel Granados --- kernel/sys.c | 1 - kernel/sysctl.c | 22 +++++++++++----------- 2 files changed, 11 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index bbeee62f9abc..18a037cc6f61 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -210,7 +210,6 @@ static int __init init_overflow_sysctl(void) postcore_initcall(init_overflow_sysctl); - /* * Returns true if current's euid is same as p's uid or euid, * or has CAP_SYS_NICE to p's user_ns. 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 21b70443aea7..cb6196e3fa99 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1454,7 +1454,7 @@ int proc_do_static_key(const struct ctl_table *table, int write, return ret; } -static const struct ctl_table kern_table[] = { +static const struct ctl_table sysctl_subsys_table[] = { #ifdef CONFIG_PROC_SYSCTL { .procname = "sysctl_writes_strict", @@ -1465,15 +1465,6 @@ static const struct ctl_table kern_table[] = { .extra1 = SYSCTL_NEG_ONE, .extra2 = SYSCTL_ONE, }, -#endif -#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW - { - .procname = "unaligned-trap", - .data = &unaligned_enabled, - .maxlen = sizeof (int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #endif { .procname = "ngroups_max", @@ -1489,6 +1480,15 @@ static const struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW + { + .procname = "unaligned-trap", + .data = &unaligned_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif #ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN { .procname = "ignore-unaligned-usertrap", @@ -1502,7 +1502,7 @@ static const struct ctl_table kern_table[] = { int __init sysctl_init_bases(void) { - register_sysctl_init("kernel", kern_table); + register_sysctl_init("kernel", sysctl_subsys_table); return 0; } -- cgit v1.2.3 From 9ba817fb7c6afd3c86a6d4c3b822924b87ef0348 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 22 Jul 2025 17:08:06 -0400 Subject: tracing: Deprecate auto-mounting tracefs in debugfs In January 2015, tracefs was created to allow access to the tracing infrastructure without needing to compile in debugfs. When tracefs is configured, the directory /sys/kernel/tracing will exist and tooling is expected to use that path to access the tracing infrastructure. To allow backward compatibility, when debugfs is mounted, it would automount tracefs in its "tracing" directory so that tooling that had hard coded /sys/kernel/debug/tracing would still work. It has been over 10 years since the new interface was introduced, and all tooling should now be using it. Start the process of deprecating the old path so that it doesn't need to be maintained anymore. A new config is added to allow distributions to disable automounting of tracefs on debugfs. If /sys/kernel/debug/tracing is accessed, a pr_warn() will trigger stating: "NOTICE: Automounting of tracing to debugfs is deprecated and will be removed in 2030" Expect to remove this feature in 5 years (2030). 
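Tooling that still hard codes the old path can probe for the new interface and only fall back while the automount lasts, e.g. (an illustrative userspace sketch, not part of this patch):

	#include <stdio.h>
	#include <unistd.h>

	static const char *tracefs_dir(void)
	{
		if (access("/sys/kernel/tracing/trace", F_OK) == 0)
			return "/sys/kernel/tracing";	/* preferred since 2015 */
		return "/sys/kernel/debug/tracing";	/* deprecated automount */
	}

	int main(void)
	{
		printf("using %s\n", tracefs_dir());
		return 0;
	}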
Cc: Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Mark Rutland Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: Namhyung Kim Cc: Linus Torvalds Cc: Andrew Morton Cc: Greg Kroah-Hartman Cc: Al Viro Cc: Christian Brauner Cc: Jan Kara Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Ian Rogers Link: https://lore.kernel.org/20250722170806.40c068c6@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- kernel/trace/Kconfig | 13 +++++++++++++ kernel/trace/trace.c | 14 ++++++++++---- 2 files changed, 23 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a3f35c7d83b6..93e8e7fc11c0 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -199,6 +199,19 @@ menuconfig FTRACE if FTRACE +config TRACEFS_AUTOMOUNT_DEPRECATED + bool "Automount tracefs on debugfs [DEPRECATED]" + depends on TRACING + default y + help + The tracing interface was moved from /sys/kernel/debug/tracing + to /sys/kernel/tracing in 2015, but the tracing file system + was still automounted in /sys/kernel/debug for backward + compatibility with tooling. + + The new interface has been around for more than 10 years and + the old debug mount will soon be removed. + config BOOTTIME_TRACING bool "Boot-time Tracing support" depends on TRACING diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0dff4298fc0e..06ab5b7a8711 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6296,7 +6296,7 @@ static bool tracer_options_updated; static void add_tracer_options(struct trace_array *tr, struct tracer *t) { /* Only enable if the directory has been created already. */ - if (!tr->dir) + if (!tr->dir && !(tr->flags & TRACE_ARRAY_FL_GLOBAL)) return; /* Only create trace option files after update_tracer_options finish */ @@ -8977,13 +8977,13 @@ static inline __init int register_snapshot_cmd(void) { return 0; } static struct dentry *tracing_get_dentry(struct trace_array *tr) { - if (WARN_ON(!tr->dir)) - return ERR_PTR(-ENODEV); - /* Top directory uses NULL as the parent */ if (tr->flags & TRACE_ARRAY_FL_GLOBAL) return NULL; + if (WARN_ON(!tr->dir)) + return ERR_PTR(-ENODEV); + /* All sub buffers have a descriptor */ return tr->dir; } @@ -10249,6 +10249,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) ftrace_init_tracefs(tr, d_tracer); } +#ifdef CONFIG_TRACEFS_AUTOMOUNT_DEPRECATED static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) { struct vfsmount *mnt; @@ -10270,6 +10271,8 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) if (IS_ERR(fc)) return ERR_CAST(fc); + pr_warn("NOTICE: Automounting of tracing to debugfs is deprecated and will be removed in 2030\n"); + ret = vfs_parse_fs_string(fc, "source", "tracefs", strlen("tracefs")); if (!ret) @@ -10280,6 +10283,7 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore) put_fs_context(fc); return mnt; } +#endif /** * tracing_init_dentry - initialize top level trace array @@ -10304,6 +10308,7 @@ int tracing_init_dentry(void) if (WARN_ON(!tracefs_initialized())) return -ENODEV; +#ifdef CONFIG_TRACEFS_AUTOMOUNT_DEPRECATED /* * As there may still be users that expect the tracing * files to exist in debugfs/tracing, we must automount @@ -10312,6 +10317,7 @@ int tracing_init_dentry(void) */ tr->dir = debugfs_create_automount("tracing", NULL, trace_automount, NULL); +#endif return 0; } -- cgit v1.2.3 From 2f02a61d84c629235b8f9684dd0a67e33a2a3d81 Mon Sep 17 00:00:00 2001 
From: "Masami Hiramatsu (Google)" Date: Wed, 23 Jul 2025 10:30:47 +0900 Subject: tracing: probes: Sort #include alphabetically Sort the #include directives in trace_probe* files alphabetically for easier maintenance and avoid double includes. This also groups headers as linux-generic, asm-generic, and local headers. Link: https://lore.kernel.org/all/175323424678.57270.11975372127870059007.stgit@devnote2/ Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_eprobe.c | 4 ++-- kernel/trace/trace_fprobe.c | 3 ++- kernel/trace/trace_kprobe.c | 8 ++++---- kernel/trace/trace_probe.c | 2 +- kernel/trace/trace_probe.h | 17 +++++++++-------- kernel/trace/trace_uprobe.c | 12 ++++++------ 6 files changed, 24 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 916555f0de81..23e06712bead 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -9,14 +9,14 @@ * Copyright (C) 2021, VMware Inc, Tzvetomir Stoyanov tz.stoyanov@gmail.com> * */ +#include #include #include -#include #include "trace_dynevent.h" #include "trace_probe.h" -#include "trace_probe_tmpl.h" #include "trace_probe_kernel.h" +#include "trace_probe_tmpl.h" #define EPROBE_EVENT_SYSTEM "eprobes" diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index dbf9d413125a..add08ffb04d7 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -4,7 +4,6 @@ * Copyright (C) 2022 Google LLC. */ #define pr_fmt(fmt) "trace_fprobe: " fmt -#include #include #include @@ -15,6 +14,8 @@ #include #include +#include + #include "trace_dynevent.h" #include "trace_probe.h" #include "trace_probe_kernel.h" diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 3e5c47b6d7b2..cac128a5f7e0 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -9,19 +9,19 @@ #include #include -#include +#include #include -#include #include -#include +#include +#include #include /* for COMMAND_LINE_SIZE */ #include "trace_dynevent.h" #include "trace_kprobe_selftest.h" #include "trace_probe.h" -#include "trace_probe_tmpl.h" #include "trace_probe_kernel.h" +#include "trace_probe_tmpl.h" #define KPROBE_EVENT_SYSTEM "kprobes" #define KRETPROBE_MAXACTIVE_MAX 4096 diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index abfab8957a6c..9d26d901c9e5 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -13,8 +13,8 @@ #include #include -#include "trace_btf.h" +#include "trace_btf.h" #include "trace_probe.h" #undef C diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 854e5668f5ee..719604855279 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -10,20 +10,21 @@ * Author: Srikar Dronamraju */ +#include +#include +#include +#include +#include +#include #include #include #include -#include -#include #include -#include -#include -#include #include -#include +#include +#include #include -#include -#include + #include #include "trace.h" diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index f95a2c3d5b1b..3cc3404b09f0 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -8,16 +8,16 @@ #define pr_fmt(fmt) "trace_uprobe: " fmt #include -#include #include +#include #include -#include -#include #include -#include -#include -#include #include +#include +#include +#include +#include +#include #include "trace_dynevent.h" #include "trace_probe.h" -- cgit v1.2.3 From 
43beb5e89bc8c0b753553964dd0654e2d1aa23f9 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Wed, 23 Jul 2025 10:30:56 +0900 Subject: tracing: probe: Allocate traceprobe_parse_context from heap Instead of allocating traceprobe_parse_context on stack, allocate it dynamically from heap (slab). This avoids potential stack overflows, a real concern in the kernel where stack space is limited. Link: https://lore.kernel.org/all/175323425650.57270.280750740753792504.stgit@devnote2/ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202506240416.nZIhDXoO-lkp@intel.com/ Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) --- kernel/trace/trace_eprobe.c | 14 ++++++++------ kernel/trace/trace_fprobe.c | 13 ++++++++----- kernel/trace/trace_kprobe.c | 10 +++++++--- kernel/trace/trace_probe.h | 9 +++++++++ kernel/trace/trace_uprobe.c | 13 ++++++++----- 5 files changed, 40 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 23e06712bead..7ba3a18be4c5 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -797,18 +797,20 @@ find_and_get_event(const char *system, const char *event_name) static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[], int i) { - struct traceprobe_parse_context ctx = { - .event = ep->event, - .flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT, - }; + struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL; int ret; - ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], &ctx); + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + ctx->event = ep->event; + ctx->flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT; + + ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], ctx); /* Handle symbols "@" */ if (!ret) ret = traceprobe_update_arg(&ep->tp.args[i]); - traceprobe_finish_parse(&ctx); return ret; } diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index add08ffb04d7..610f8d53be8a 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -1384,14 +1384,17 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], static int trace_fprobe_create_cb(int argc, const char *argv[]) { - struct traceprobe_parse_context ctx = { - .flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE, - }; + struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL; int ret; + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE, + trace_probe_log_init("trace_fprobe", argc, argv); - ret = trace_fprobe_create_internal(argc, argv, &ctx); - traceprobe_finish_parse(&ctx); + ret = trace_fprobe_create_internal(argc, argv, ctx); trace_probe_log_clear(); return ret; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index cac128a5f7e0..d14b33e205f7 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1065,14 +1065,18 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], static int trace_kprobe_create_cb(int argc, const char *argv[]) { - struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL }; + struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL; int ret; + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + ctx->flags = TPARG_FL_KERNEL; + trace_probe_log_init("trace_kprobe", argc, argv); - ret = 
trace_kprobe_create_internal(argc, argv, &ctx); + ret = trace_kprobe_create_internal(argc, argv, ctx); - traceprobe_finish_parse(&ctx); trace_probe_log_clear(); return ret; } diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 719604855279..842383fbc03b 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -439,6 +440,14 @@ extern void traceprobe_free_probe_arg(struct probe_arg *arg); * this MUST be called for clean up the context and return a resource. */ void traceprobe_finish_parse(struct traceprobe_parse_context *ctx); +static inline void traceprobe_free_parse_ctx(struct traceprobe_parse_context *ctx) +{ + traceprobe_finish_parse(ctx); + kfree(ctx); +} + +DEFINE_FREE(traceprobe_parse_context, struct traceprobe_parse_context *, + if (_T) traceprobe_free_parse_ctx(_T)) extern int traceprobe_split_symbol_offset(char *symbol, long *offset); int traceprobe_parse_event_name(const char **pevent, const char **pgroup, diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 3cc3404b09f0..872dce092e46 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -695,13 +695,16 @@ static int __trace_uprobe_create(int argc, const char **argv) /* parse arguments */ for (i = 0; i < argc; i++) { - struct traceprobe_parse_context ctx = { - .flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER, - }; + struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) + = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) { + ret = -ENOMEM; + goto error; + } + ctx->flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER; trace_probe_log_set_index(i + 2); - ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], &ctx); - traceprobe_finish_parse(&ctx); + ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], ctx); if (ret) goto error; } -- cgit v1.2.3 From d643eaa7082dc3d2438c9ba3ab856eb1c8ad9ffc Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Wed, 23 Jul 2025 10:31:06 +0900 Subject: tracing: fprobe-event: Allocate string buffers from heap Allocate temporary string buffers for fprobe-event from heap instead of stack. This fixes the stack frame exceed limit error. Link: https://lore.kernel.org/all/175323426643.57270.6657152008331160704.stgit@devnote2/ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202506240416.nZIhDXoO-lkp@intel.com/ Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) --- kernel/trace/trace_fprobe.c | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index 610f8d53be8a..9d14a910fbf7 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -1235,18 +1235,18 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], * FETCHARG:TYPE : use TYPE instead of unsigned long. 
*/ struct trace_fprobe *tf __free(free_trace_fprobe) = NULL; - struct module *mod __free(module_put) = NULL; - int i, new_argc = 0, ret = 0; - bool is_return = false; - char *symbol __free(kfree) = NULL; const char *event = NULL, *group = FPROBE_EVENT_SYSTEM; + struct module *mod __free(module_put) = NULL; const char **new_argv __free(kfree) = NULL; - char buf[MAX_EVENT_NAME_LEN]; - char gbuf[MAX_EVENT_NAME_LEN]; - char sbuf[KSYM_NAME_LEN]; - char abuf[MAX_BTF_ARGS_LEN]; + char *symbol __free(kfree) = NULL; + char *ebuf __free(kfree) = NULL; + char *gbuf __free(kfree) = NULL; + char *sbuf __free(kfree) = NULL; + char *abuf __free(kfree) = NULL; char *dbuf __free(kfree) = NULL; + int i, new_argc = 0, ret = 0; bool is_tracepoint = false; + bool is_return = false; if ((argv[0][0] != 'f' && argv[0][0] != 't') || argc < 2) return -ECANCELED; @@ -1274,6 +1274,9 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], trace_probe_log_set_index(0); if (event) { + gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!gbuf) + return -ENOMEM; ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) @@ -1281,15 +1284,18 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], } if (!event) { + ebuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!ebuf) + return -ENOMEM; /* Make a new event name */ if (is_tracepoint) - snprintf(buf, MAX_EVENT_NAME_LEN, "%s%s", + snprintf(ebuf, MAX_EVENT_NAME_LEN, "%s%s", isdigit(*symbol) ? "_" : "", symbol); else - snprintf(buf, MAX_EVENT_NAME_LEN, "%s__%s", symbol, + snprintf(ebuf, MAX_EVENT_NAME_LEN, "%s__%s", symbol, is_return ? "exit" : "entry"); - sanitize_event_name(buf); - event = buf; + sanitize_event_name(ebuf); + event = ebuf; } if (is_return) @@ -1305,13 +1311,20 @@ static int trace_fprobe_create_internal(int argc, const char *argv[], ctx->flags |= TPARG_FL_TPOINT; mod = NULL; tpoint = find_tracepoint(symbol, &mod); - if (tpoint) + if (tpoint) { + sbuf = kmalloc(KSYM_NAME_LEN, GFP_KERNEL); + if (!sbuf) + return -ENOMEM; ctx->funcname = kallsyms_lookup((unsigned long)tpoint->probestub, NULL, NULL, NULL, sbuf); + } } if (!ctx->funcname) ctx->funcname = symbol; + abuf = kmalloc(MAX_BTF_ARGS_LEN, GFP_KERNEL); + if (!abuf) + return -ENOMEM; argc -= 2; argv += 2; new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc, abuf, MAX_BTF_ARGS_LEN, ctx); -- cgit v1.2.3 From 33b4e38baa03b1556bec29dd80e58504fe3de1f2 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Wed, 23 Jul 2025 10:31:16 +0900 Subject: tracing: kprobe-event: Allocate string buffers from heap Allocate temporary string buffers for parsing kprobe-events from heap instead of stack. Link: https://lore.kernel.org/all/175323427627.57270.5105357260879695051.stgit@devnote2/ Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) --- kernel/trace/trace_kprobe.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d14b33e205f7..ccae62d4fb91 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -861,20 +861,20 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], * FETCHARG:TYPE : use TYPE instead of unsigned long. 
*/ struct trace_kprobe *tk __free(free_trace_kprobe) = NULL; + const char *event = NULL, *group = KPROBE_EVENT_SYSTEM; + const char **new_argv __free(kfree) = NULL; int i, len, new_argc = 0, ret = 0; - bool is_return = false; char *symbol __free(kfree) = NULL; - char *tmp = NULL; - const char **new_argv __free(kfree) = NULL; - const char *event = NULL, *group = KPROBE_EVENT_SYSTEM; + char *ebuf __free(kfree) = NULL; + char *gbuf __free(kfree) = NULL; + char *abuf __free(kfree) = NULL; + char *dbuf __free(kfree) = NULL; enum probe_print_type ptype; + bool is_return = false; int maxactive = 0; - long offset = 0; void *addr = NULL; - char buf[MAX_EVENT_NAME_LEN]; - char gbuf[MAX_EVENT_NAME_LEN]; - char abuf[MAX_BTF_ARGS_LEN]; - char *dbuf __free(kfree) = NULL; + char *tmp = NULL; + long offset = 0; switch (argv[0][0]) { case 'r': @@ -893,6 +893,8 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], event++; if (isdigit(argv[0][1])) { + char *buf __free(kfree) = NULL; + if (!is_return) { trace_probe_log_err(1, BAD_MAXACT_TYPE); return -EINVAL; @@ -905,7 +907,7 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], trace_probe_log_err(1, BAD_MAXACT); return -EINVAL; } - memcpy(buf, &argv[0][1], len); + buf = kmemdup(&argv[0][1], len + 1, GFP_KERNEL); buf[len] = '\0'; ret = kstrtouint(buf, 0, &maxactive); if (ret || !maxactive) { @@ -973,6 +975,9 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], trace_probe_log_set_index(0); if (event) { + gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!gbuf) + return -ENOMEM; ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) @@ -981,16 +986,22 @@ static int trace_kprobe_create_internal(int argc, const char *argv[], if (!event) { /* Make a new event name */ + ebuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!ebuf) + return -ENOMEM; if (symbol) - snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld", + snprintf(ebuf, MAX_EVENT_NAME_LEN, "%c_%s_%ld", is_return ? 'r' : 'p', symbol, offset); else - snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p", + snprintf(ebuf, MAX_EVENT_NAME_LEN, "%c_0x%p", is_return ? 'r' : 'p', addr); - sanitize_event_name(buf); - event = buf; + sanitize_event_name(ebuf); + event = ebuf; } + abuf = kmalloc(MAX_BTF_ARGS_LEN, GFP_KERNEL); + if (!abuf) + return -ENOMEM; argc -= 2; argv += 2; ctx->funcname = symbol; new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc, -- cgit v1.2.3 From 4c6edb43ead62776d4ae58cc1a121700b546db0f Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Wed, 23 Jul 2025 10:31:26 +0900 Subject: tracing: eprobe-event: Allocate string buffers from heap Allocate temporary string buffers for parsing eprobe-events from heap instead of stack. 
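All of these conversions lean on the scope-based cleanup helpers, which is why none of the new error paths need an explicit kfree(). The shape of the pattern, as a minimal sketch (parse_something() is a stand-in, not a real function):

	#include <linux/cleanup.h>
	#include <linux/slab.h>

	static int parse_something(void)
	{
		char *buf __free(kfree) = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL);

		if (!buf)
			return -ENOMEM;
		/* parse into buf; every return from here on frees it automatically */
		return 0;
	}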
Link: https://lore.kernel.org/all/175323428599.57270.988038042425748956.stgit@devnote2/ Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) --- kernel/trace/trace_eprobe.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 7ba3a18be4c5..f16a0ab0b001 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -9,6 +9,7 @@ * Copyright (C) 2021, VMware Inc, Tzvetomir Stoyanov tz.stoyanov@gmail.com> * */ +#include #include #include #include @@ -871,10 +872,10 @@ static int __trace_eprobe_create(int argc, const char *argv[]) const char *event = NULL, *group = EPROBE_EVENT_SYSTEM; const char *sys_event = NULL, *sys_name = NULL; struct trace_event_call *event_call; + char *buf1 __free(kfree) = NULL; + char *buf2 __free(kfree) = NULL; + char *gbuf __free(kfree) = NULL; struct trace_eprobe *ep = NULL; - char buf1[MAX_EVENT_NAME_LEN]; - char buf2[MAX_EVENT_NAME_LEN]; - char gbuf[MAX_EVENT_NAME_LEN]; int ret = 0, filter_idx = 0; int i, filter_cnt; @@ -885,6 +886,9 @@ static int __trace_eprobe_create(int argc, const char *argv[]) event = strchr(&argv[0][1], ':'); if (event) { + gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!gbuf) + goto mem_error; event++; ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); @@ -894,6 +898,11 @@ static int __trace_eprobe_create(int argc, const char *argv[]) trace_probe_log_set_index(1); sys_event = argv[1]; + + buf2 = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!buf2) + goto mem_error; + ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, 0); if (ret || !sys_event || !sys_name) { trace_probe_log_err(0, NO_EVENT_INFO); @@ -901,7 +910,9 @@ static int __trace_eprobe_create(int argc, const char *argv[]) } if (!event) { - strscpy(buf1, sys_event, MAX_EVENT_NAME_LEN); + buf1 = kstrdup(sys_event, GFP_KERNEL); + if (!buf1) + goto mem_error; event = buf1; } @@ -974,6 +985,9 @@ static int __trace_eprobe_create(int argc, const char *argv[]) trace_probe_log_clear(); return ret; +mem_error: + ret = -ENOMEM; + goto error; parse_error: ret = -EINVAL; error: -- cgit v1.2.3 From 97e8230f89a3570785a66b1f5eec86a2e4324bf9 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Wed, 23 Jul 2025 10:31:36 +0900 Subject: tracing: uprobe-event: Allocate string buffers from heap Allocate temporary string buffers for parsing uprobe-events from heap instead of stack. 
Link: https://lore.kernel.org/all/175323429593.57270.12369235525923902341.stgit@devnote2/ Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) --- kernel/trace/trace_uprobe.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 872dce092e46..8b0bcc0d8f41 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -8,6 +8,7 @@ #define pr_fmt(fmt) "trace_uprobe: " fmt #include +#include #include #include #include @@ -19,6 +20,7 @@ #include #include +#include "trace.h" #include "trace_dynevent.h" #include "trace_probe.h" #include "trace_probe_tmpl.h" @@ -537,15 +539,15 @@ static int register_trace_uprobe(struct trace_uprobe *tu) */ static int __trace_uprobe_create(int argc, const char **argv) { - struct trace_uprobe *tu; const char *event = NULL, *group = UPROBE_EVENT_SYSTEM; char *arg, *filename, *rctr, *rctr_end, *tmp; - char buf[MAX_EVENT_NAME_LEN]; - char gbuf[MAX_EVENT_NAME_LEN]; - enum probe_print_type ptype; - struct path path; unsigned long offset, ref_ctr_offset; + char *gbuf __free(kfree) = NULL; + char *buf __free(kfree) = NULL; + enum probe_print_type ptype; + struct trace_uprobe *tu; bool is_return = false; + struct path path; int i, ret; ref_ctr_offset = 0; @@ -653,6 +655,10 @@ static int __trace_uprobe_create(int argc, const char **argv) /* setup a probe */ trace_probe_log_set_index(0); if (event) { + gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!gbuf) + goto fail_mem; + ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) @@ -664,15 +670,16 @@ static int __trace_uprobe_create(int argc, const char **argv) char *ptr; tail = kstrdup(kbasename(filename), GFP_KERNEL); - if (!tail) { - ret = -ENOMEM; - goto fail_address_parse; - } + if (!tail) + goto fail_mem; ptr = strpbrk(tail, ".-_"); if (ptr) *ptr = '\0'; + buf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL); + if (!buf) + goto fail_mem; snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_0x%lx", 'p', tail, offset); event = buf; kfree(tail); @@ -724,6 +731,9 @@ out: trace_probe_log_clear(); return ret; +fail_mem: + ret = -ENOMEM; + fail_address_parse: trace_probe_log_clear(); path_put(&path); -- cgit v1.2.3 From 558d5f3cd250b5c20386200d573b0518a8f5fe64 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Wed, 23 Jul 2025 10:31:45 +0900 Subject: tracing: probes: Add a kerneldoc for traceprobe_parse_event_name() Since traceprobe_parse_event_name() is a bit complicated, add a kerneldoc for explaining the behavior. Link: https://lore.kernel.org/all/175323430565.57270.2602609519355112748.stgit@devnote2/ Suggested-by: Steven Rostedt Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) --- kernel/trace/trace_probe.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 9d26d901c9e5..4fb3d98e38ea 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -247,7 +247,25 @@ int traceprobe_split_symbol_offset(char *symbol, long *offset) return 0; } -/* @buf must has MAX_EVENT_NAME_LEN size */ +/** + * traceprobe_parse_event_name() - Parse a string into group and event names + * @pevent: A pointer to the string to be parsed. + * @pgroup: A pointer to the group name. + * @buf: A buffer to store the parsed group name. 
+ * @offset: The offset of the string in the original user command, for logging. + * + * This parses a string with the format `[GROUP/][EVENT]` or `[GROUP.][EVENT]` + * (either GROUP or EVENT or both must be specified). + * Since the parsed group name is stored in @buf, the caller must ensure @buf + * is at least MAX_EVENT_NAME_LEN bytes. + * + * Return: 0 on success, or -EINVAL on failure. + * + * If success, *@pevent is updated to point to the event name part of the + * original string, or NULL if there is no event name. + * Also, *@pgroup is updated to point to the parsed group which is stored + * in @buf, or NULL if there is no group name. + */ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, char *buf, int offset) { -- cgit v1.2.3 From 9f0cb91767f582df6b17c1e2f22f684c36962295 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 22 Jul 2025 10:37:14 -0400 Subject: tracing: arm: arm64: Hide trace events ipi_raise, ipi_entry and ipi_exit The ipi tracepoints are mostly generic, but the tracepoints ipi_raise, ipi_entry and ipi_exit are only used by arm and arm64. This means these trace events are wasting memory in all the other architectures that do not use them. Add CONFIG_HAVE_EXTRA_IPI_TRACEPOINTS and have arm and arm64 select it to enable these trace events. The config makes it easy if other architectures decide to trace these as well. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Will Deacon Cc: Russell King Cc: Mark Rutland Cc: Thomas Gleixner Cc: Valentin Schneider Cc: Nicolas Pitre Cc: Peter Zijlstra Link: https://lore.kernel.org/20250722103714.64eba013@gandalf.local.home Signed-off-by: Steven Rostedt (Google) Acked-by: Catalin Marinas --- kernel/trace/Kconfig | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a3f35c7d83b6..35448f7233fe 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -53,6 +53,12 @@ config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS config HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS bool +config HAVE_EXTRA_IPI_TRACEPOINTS + bool + help + For architectures that use ipi_raise, ipi_entry and ipi_exit + tracepoints. + config HAVE_DYNAMIC_FTRACE_WITH_ARGS bool help -- cgit v1.2.3 From e09299225d5ba3916c91ef70565f7d2187e4cca0 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Tue, 22 Jul 2025 16:32:32 +0200 Subject: bpf: Reject narrower access to pointer ctx fields The following BPF program, simplified from a syzkaller repro, causes a kernel warning: r0 = *(u8 *)(r1 + 169); exit; With pointer field sk being at offset 168 in __sk_buff. This access is detected as a narrower read in bpf_skb_is_valid_access because it doesn't match offsetof(struct __sk_buff, sk). It is therefore allowed and later proceeds to bpf_convert_ctx_access. Note that for the "is_narrower_load" case in the convert_ctx_accesses(), the insn->off is aligned, so the cnt may not be 0 because it matches the offsetof(struct __sk_buff, sk) in the bpf_convert_ctx_access. However, the target_size stays 0 and the verifier errors with a kernel warning: verifier bug: error during ctx access conversion(1) This patch fixes that to return a proper "invalid bpf_context access off=X size=Y" error on the load instruction. The same issue affects multiple other fields in context structures that allow narrow access. Some other non-affected fields (for sk_msg, sk_lookup, and sockopt) were also changed to use bpf_ctx_range_ptr for consistency. 
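The conversion catches the bad load because bpf_ctx_range_ptr() expands to a GCC case range covering every byte of the 8-byte field, so a misaligned narrow read such as off=169 still lands on the case label and is then rejected by the size check. Roughly (a sketch of the idea; see the macro definitions in include/linux/filter.h):

	/* bpf_ctx_range_ptr(T, F): offsetof(T, F) ... offsetof(T, F) + sizeof(__u64) - 1 */
	switch (off) {
	case bpf_ctx_range_ptr(struct bpf_sockopt, sk):
		if (size != sizeof(__u64))
			return false;	/* narrow access within the field: rejected */
		info->reg_type = PTR_TO_SOCKET;
		break;
	}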
Note this syzkaller crash was reported in the "Closes" link below, which used to be about a different bug, fixed in commit fce7bd8e385a ("bpf/verifier: Handle BPF_LOAD_ACQ instructions in insn_def_regno()"). Because syzbot somehow confused the two bugs, the new crash and repro didn't get reported to the mailing list. Fixes: f96da09473b52 ("bpf: simplify narrower ctx access") Fixes: 0df1a55afa832 ("bpf: Warn on internal verifier errors") Reported-by: syzbot+0ef84a7bdf5301d4cbec@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=0ef84a7bdf5301d4cbec Signed-off-by: Paul Chaignon Signed-off-by: Martin KaFai Lau Acked-by: Eduard Zingerman Link: https://patch.msgid.link/3b8dcee67ff4296903351a974ddd9c4dca768b64.1753194596.git.paul.chaignon@gmail.com --- kernel/bpf/cgroup.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index f4885514f007..deb88fade249 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -2440,22 +2440,22 @@ static bool cg_sockopt_is_valid_access(int off, int size, } switch (off) { - case offsetof(struct bpf_sockopt, sk): + case bpf_ctx_range_ptr(struct bpf_sockopt, sk): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_SOCKET; break; - case offsetof(struct bpf_sockopt, optval): + case bpf_ctx_range_ptr(struct bpf_sockopt, optval): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_PACKET; break; - case offsetof(struct bpf_sockopt, optval_end): + case bpf_ctx_range_ptr(struct bpf_sockopt, optval_end): if (size != sizeof(__u64)) return false; info->reg_type = PTR_TO_PACKET_END; break; - case offsetof(struct bpf_sockopt, retval): + case bpf_ctx_range(struct bpf_sockopt, retval): if (size != size_default) return false; return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT; -- cgit v1.2.3 From dabd3e7dcc5881d5d3d6dc60c6f14780fee9a9bd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 23 Jul 2025 12:42:02 -0400 Subject: tracing: Have eprobes handle arrays eprobes are dynamic events that can read other events using their fields to create new events. Currently it doesn't work with arrays. When the new event field is attached to the old event field, it looks at the size of the field to determine what type of field the new field should be. For 1 byte fields it's a char, for 2 bytes, it's a short and for 4 bytes it's an integer. For all other sizes it just defaults to "long". This also reads the contents of the field for such cases. For arrays that are bigger than the size of long, return the value of the address of the content itself. This will allow eprobes to read other values in the array of the old event. This is useful when raw_syscalls is enabled but the syscall events are not. The syscall events are created from the raw_syscalls as they have an array of "args" that holds the 6 long words passed to the syscall entry point. To read the value of "filename" from sys_openat, the eprobe could attach to the raw_syscall and read the second value. It can then even be passed to a synthetic event and converted back to another eprobe to get the value of "filename" after it has been read by the kernel during the system call: [ Create an eprobe called "sys" and attach it to sys_enter. Read the id of the system call and the second argument ] # echo 'e:sys raw_syscalls.sys_enter nr=$id:u32 arg2=+8($args):u64' >> /sys/kernel/tracing/dynamic_events [ Create a synthetic event "path" that will hold the address of the sys_openat filename. 
This is on a 64bit machine, so make it 64 bits ] # echo 's:path u64 file;' >> /sys/kernel/tracing/dynamic_events [ Add a histogram to the eprobe/sys which triggers if the "nr" field is 257 (sys_openat), and save the filename in the "file" variable. ] # echo 'hist:keys=common_pid:file=arg2 if nr == 257' > /sys/kernel/tracing/events/eprobes/sys/trigger [ Attach a histogram to sys_exit event that triggers the "path" synthetic event and records the "filename" that was passed from the sys eprobe. ] # echo 'hist:keys=common_pid:f=$file:onmatch(eprobes.sys).trace(path,$f)' >> /sys/kernel/tracing/events/raw_syscalls/sys_exit/trigger [ Create another eprobe that dereferences the "file" field as a user space string and displays it. ] # echo 'e:open synthetic.path file=+0($file):ustring' >> /sys/kernel/tracing/dynamic_events # echo 1 > /sys/kernel/tracing/events/eprobes/open/enable # cat trace_pipe cat-1142 [003] ...5. 799.521912: open: (synthetic.path) file="/etc/ld.so.cache" cat-1142 [003] ...5. 799.521934: open: (synthetic.path) file="/etc/ld.so.cache" cat-1142 [003] ...5. 799.522065: open: (synthetic.path) file="/etc/ld.so.cache" cat-1142 [003] ...5. 799.522080: open: (synthetic.path) file="/etc/ld.so.cache" cat-1142 [003] ...5. 799.522296: open: (synthetic.path) file="/lib/x86_64-linux-gnu/libc.so.6" cat-1142 [003] ...5. 799.522319: open: (synthetic.path) file="/lib/x86_64-linux-gnu/libc.so.6" less-1143 [005] ...5. 799.522327: open: (synthetic.path) file="/etc/ld.so.cache" cat-1142 [003] ...5. 799.522333: open: (synthetic.path) file="/lib/x86_64-linux-gnu/libc.so.6" cat-1142 [003] ...5. 799.522348: open: (synthetic.path) file="/lib/x86_64-linux-gnu/libc.so.6" less-1143 [005] ...5. 799.522349: open: (synthetic.path) file="/etc/ld.so.cache" cat-1142 [003] ...5. 799.522363: open: (synthetic.path) file="/lib/x86_64-linux-gnu/libc.so.6" less-1143 [005] ...5. 799.522477: open: (synthetic.path) file="/etc/ld.so.cache" cat-1142 [003] ...5. 799.522489: open: (synthetic.path) file="/lib/x86_64-linux-gnu/libc.so.6" less-1143 [005] ...5. 799.522492: open: (synthetic.path) file="/etc/ld.so.cache" less-1143 [005] ...5. 799.522720: open: (synthetic.path) file="/lib/x86_64-linux-gnu/libtinfo.so.6" less-1143 [005] ...5. 799.522744: open: (synthetic.path) file="/lib/x86_64-linux-gnu/libtinfo.so.6" less-1143 [005] ...5. 799.522759: open: (synthetic.path) file="/lib/x86_64-linux-gnu/libtinfo.so.6" cat-1142 [003] ...5.
799.522850: open: (synthetic.path) file="/lib/x86_64-linux-gnu/libc.so.6" Link: https://lore.kernel.org/all/20250723124202.4f7475be@batman.local.home/ Signed-off-by: Steven Rostedt (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_eprobe.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index f16a0ab0b001..a1d402124836 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -344,10 +344,15 @@ get_event_field(struct fetch_insn *code, void *rec) val = *(unsigned int *)addr; break; default: - if (field->is_signed) - val = *(long *)addr; - else - val = *(unsigned long *)addr; + if (field->size == sizeof(long)) { + if (field->is_signed) + val = *(long *)addr; + else + val = *(unsigned long *)addr; + break; + } + /* This is an array, point to the addr itself */ + val = (unsigned long)addr; break; } return val; -- cgit v1.2.3 From 9efcf590827cd88cbb68b0f93b20c5f5add905f4 Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Wed, 23 Jul 2025 18:12:38 +0200 Subject: tools/dot2c: Fix generated files going over 100 column limit The dot2c.py script generates all states in a single line. This breaks the 100 column limit when the state machines are non-trivial. Change dot2c.py to generate the states in separate lines in case the generated line is going to be too long. Also adapt existing monitors with line length over the limit. Cc: Masami Hiramatsu Cc: Tomas Glozar Cc: Juri Lelli Cc: Clark Williams Cc: John Kacur Link: https://lore.kernel.org/20250723161240.194860-4-gmonaco@redhat.com Suggested-by: Nam Cao Signed-off-by: Gabriele Monaco Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/monitors/snep/snep.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/rv/monitors/snep/snep.h b/kernel/trace/rv/monitors/snep/snep.h index 6d16b9ad931e..4cd9abb77b7b 100644 --- a/kernel/trace/rv/monitors/snep/snep.h +++ b/kernel/trace/rv/monitors/snep/snep.h @@ -41,8 +41,18 @@ static const struct automaton_snep automaton_snep = { "schedule_exit" }, .function = { - { non_scheduling_context_snep, non_scheduling_context_snep, scheduling_contex_snep, INVALID_STATE }, - { INVALID_STATE, INVALID_STATE, INVALID_STATE, non_scheduling_context_snep }, + { + non_scheduling_context_snep, + non_scheduling_context_snep, + scheduling_contex_snep, + INVALID_STATE + }, + { + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + non_scheduling_context_snep + }, }, .initial_state = non_scheduling_context_snep, .final_states = { 1, 0 }, -- cgit v1.2.3 From 560473f2e2d77e153cb12ce1ef53c2abb6a5f0ca Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Wed, 23 Jul 2025 18:12:39 +0200 Subject: verification/rvgen: Organise Kconfig entries for nested monitors The current behaviour of rvgen when running with the -a option is to append the necessary lines at the end of the configuration for Kconfig, Makefile and tracepoints. 
This is not always the desired behaviour for nested monitors: while tracepoints are not affected by nesting and the Makefile's only requirement is that the parent monitor is built before its children, in the Kconfig it is better to have children defined right after their parent, otherwise the result has wrong indentation:

[*] foo_parent monitor
[*]   foo_child1 monitor
[*]   foo_child2 monitor
[*] bar_parent monitor
[*]   bar_child1 monitor
[*]   bar_child2 monitor
[*] foo_child3 monitor
[*] foo_child4 monitor

Adapt rvgen to look for a different marker for nested monitors in the Kconfig file and append the line right after the last sibling, instead of the last monitor. Also add the marker when creating a new parent monitor. Cc: Masami Hiramatsu Cc: Tomas Glozar Cc: Juri Lelli Cc: Clark Williams Cc: John Kacur Link: https://lore.kernel.org/20250723161240.194860-5-gmonaco@redhat.com Reviewed-by: Nam Cao Signed-off-by: Gabriele Monaco Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/Kconfig | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index c11bf7e61ebf..26017378f79b 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -43,6 +43,7 @@ config RV_PER_TASK_MONITORS source "kernel/trace/rv/monitors/wip/Kconfig" source "kernel/trace/rv/monitors/wwnr/Kconfig" + source "kernel/trace/rv/monitors/sched/Kconfig" source "kernel/trace/rv/monitors/tss/Kconfig" source "kernel/trace/rv/monitors/sco/Kconfig" @@ -50,9 +51,13 @@ source "kernel/trace/rv/monitors/snroc/Kconfig" source "kernel/trace/rv/monitors/scpd/Kconfig" source "kernel/trace/rv/monitors/snep/Kconfig" source "kernel/trace/rv/monitors/sncid/Kconfig" +# Add new sched monitors here + source "kernel/trace/rv/monitors/rtapp/Kconfig" source "kernel/trace/rv/monitors/pagefault/Kconfig" source "kernel/trace/rv/monitors/sleep/Kconfig" +# Add new rtapp monitors here + # Add new monitors here config RV_REACTORS -- cgit v1.2.3 From 58d5f0d437a8e036a65eeddfb7df2b5d6107f1ef Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Wed, 23 Jul 2025 18:12:40 +0200 Subject: rv: Return init error when registering monitors Monitors generated with dot2k have their registration function (the one called during monitor initialisation) always return 0, even if the registration failed on the RV side. This can hide potential errors. Return the value returned by the RV register function.
Cc: Masami Hiramatsu Cc: Tomas Glozar Cc: Juri Lelli Cc: Clark Williams Cc: John Kacur Link: https://lore.kernel.org/20250723161240.194860-6-gmonaco@redhat.com Reviewed-by: Nam Cao Signed-off-by: Gabriele Monaco Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/monitors/sched/sched.c | 3 +-- kernel/trace/rv/monitors/sco/sco.c | 3 +-- kernel/trace/rv/monitors/scpd/scpd.c | 3 +-- kernel/trace/rv/monitors/sncid/sncid.c | 3 +-- kernel/trace/rv/monitors/snep/snep.c | 3 +-- kernel/trace/rv/monitors/snroc/snroc.c | 3 +-- kernel/trace/rv/monitors/tss/tss.c | 3 +-- kernel/trace/rv/monitors/wip/wip.c | 3 +-- kernel/trace/rv/monitors/wwnr/wwnr.c | 3 +-- 9 files changed, 9 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/rv/monitors/sched/sched.c b/kernel/trace/rv/monitors/sched/sched.c index 905e03c3c934..d04db4b543f9 100644 --- a/kernel/trace/rv/monitors/sched/sched.c +++ b/kernel/trace/rv/monitors/sched/sched.c @@ -21,8 +21,7 @@ struct rv_monitor rv_sched = { static int __init register_sched(void) { - rv_register_monitor(&rv_sched, NULL); - return 0; + return rv_register_monitor(&rv_sched, NULL); } static void __exit unregister_sched(void) diff --git a/kernel/trace/rv/monitors/sco/sco.c b/kernel/trace/rv/monitors/sco/sco.c index 4cff59220bfc..66f4639d46ac 100644 --- a/kernel/trace/rv/monitors/sco/sco.c +++ b/kernel/trace/rv/monitors/sco/sco.c @@ -71,8 +71,7 @@ static struct rv_monitor rv_sco = { static int __init register_sco(void) { - rv_register_monitor(&rv_sco, &rv_sched); - return 0; + return rv_register_monitor(&rv_sco, &rv_sched); } static void __exit unregister_sco(void) diff --git a/kernel/trace/rv/monitors/scpd/scpd.c b/kernel/trace/rv/monitors/scpd/scpd.c index cbdd6a5f8d7f..299703cd72b0 100644 --- a/kernel/trace/rv/monitors/scpd/scpd.c +++ b/kernel/trace/rv/monitors/scpd/scpd.c @@ -79,8 +79,7 @@ static struct rv_monitor rv_scpd = { static int __init register_scpd(void) { - rv_register_monitor(&rv_scpd, &rv_sched); - return 0; + return rv_register_monitor(&rv_scpd, &rv_sched); } static void __exit unregister_scpd(void) diff --git a/kernel/trace/rv/monitors/sncid/sncid.c b/kernel/trace/rv/monitors/sncid/sncid.c index f5037cd6214c..3e1ee715a0fb 100644 --- a/kernel/trace/rv/monitors/sncid/sncid.c +++ b/kernel/trace/rv/monitors/sncid/sncid.c @@ -79,8 +79,7 @@ static struct rv_monitor rv_sncid = { static int __init register_sncid(void) { - rv_register_monitor(&rv_sncid, &rv_sched); - return 0; + return rv_register_monitor(&rv_sncid, &rv_sched); } static void __exit unregister_sncid(void) diff --git a/kernel/trace/rv/monitors/snep/snep.c b/kernel/trace/rv/monitors/snep/snep.c index 0076ba6d7ea4..2adc3108d60c 100644 --- a/kernel/trace/rv/monitors/snep/snep.c +++ b/kernel/trace/rv/monitors/snep/snep.c @@ -79,8 +79,7 @@ static struct rv_monitor rv_snep = { static int __init register_snep(void) { - rv_register_monitor(&rv_snep, &rv_sched); - return 0; + return rv_register_monitor(&rv_snep, &rv_sched); } static void __exit unregister_snep(void) diff --git a/kernel/trace/rv/monitors/snroc/snroc.c b/kernel/trace/rv/monitors/snroc/snroc.c index bb1f60d55296..540e686e699f 100644 --- a/kernel/trace/rv/monitors/snroc/snroc.c +++ b/kernel/trace/rv/monitors/snroc/snroc.c @@ -68,8 +68,7 @@ static struct rv_monitor rv_snroc = { static int __init register_snroc(void) { - rv_register_monitor(&rv_snroc, &rv_sched); - return 0; + return rv_register_monitor(&rv_snroc, &rv_sched); } static void __exit unregister_snroc(void) diff --git a/kernel/trace/rv/monitors/tss/tss.c 
b/kernel/trace/rv/monitors/tss/tss.c index 542787e6524f..0452fcd9edcf 100644 --- a/kernel/trace/rv/monitors/tss/tss.c +++ b/kernel/trace/rv/monitors/tss/tss.c @@ -74,8 +74,7 @@ static struct rv_monitor rv_tss = { static int __init register_tss(void) { - rv_register_monitor(&rv_tss, &rv_sched); - return 0; + return rv_register_monitor(&rv_tss, &rv_sched); } static void __exit unregister_tss(void) diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c index ed758fec8608..4b4e99615a11 100644 --- a/kernel/trace/rv/monitors/wip/wip.c +++ b/kernel/trace/rv/monitors/wip/wip.c @@ -71,8 +71,7 @@ static struct rv_monitor rv_wip = { static int __init register_wip(void) { - rv_register_monitor(&rv_wip, NULL); - return 0; + return rv_register_monitor(&rv_wip, NULL); } static void __exit unregister_wip(void) diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c index 172f31c4b0f3..4145bea2729e 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.c +++ b/kernel/trace/rv/monitors/wwnr/wwnr.c @@ -70,8 +70,7 @@ static struct rv_monitor rv_wwnr = { static int __init register_wwnr(void) { - rv_register_monitor(&rv_wwnr, NULL); - return 0; + return rv_register_monitor(&rv_wwnr, NULL); } static void __exit unregister_wwnr(void) -- cgit v1.2.3 From 8245d47cfaba8a38337a447230b4d01f9946f5e1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 23 Jul 2025 22:50:26 -0700 Subject: x86: Handle KCOV __init vs inline mismatches GCC appears to have kind of fragile inlining heuristics, in the sense that it can change whether or not it inlines something based on optimizations. It looks like the kcov instrumentation being added (or in this case, removed) from a function changes the optimization results, and some functions marked "inline" are _not_ inlined. In that case, we end up with __init code calling a function not marked __init, and we get the build warnings I'm trying to eliminate in the coming patch that adds __no_sanitize_coverage to __init functions: WARNING: modpost: vmlinux: section mismatch in reference: xbc_exit+0x8 (section: .text.unlikely) -> _xbc_exit (section: .init.text) WARNING: modpost: vmlinux: section mismatch in reference: real_mode_size_needed+0x15 (section: .text.unlikely) -> real_mode_blob_end (section: .init.data) WARNING: modpost: vmlinux: section mismatch in reference: __set_percpu_decrypted+0x16 (section: .text.unlikely) -> early_set_memory_decrypted (section: .init.text) WARNING: modpost: vmlinux: section mismatch in reference: memblock_alloc_from+0x26 (section: .text.unlikely) -> memblock_alloc_try_nid (section: .init.text) WARNING: modpost: vmlinux: section mismatch in reference: acpi_arch_set_root_pointer+0xc (section: .text.unlikely) -> x86_init (section: .init.data) WARNING: modpost: vmlinux: section mismatch in reference: acpi_arch_get_root_pointer+0x8 (section: .text.unlikely) -> x86_init (section: .init.data) WARNING: modpost: vmlinux: section mismatch in reference: efi_config_table_is_usable+0x16 (section: .text.unlikely) -> xen_efi_config_table_is_usable (section: .init.text) This problem is somewhat fragile (though using either __always_inline or __init will deterministically solve it), but we've tripped over this before with GCC and the solution has usually been to just use __always_inline and move on. For x86 this means forcing several functions to be inline with __always_inline. 
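The failure mode, reduced to a sketch (the functions are illustrative, not taken from the patch):

	static void __init setup_thing(void)
	{
		/* lives in .init.text, discarded once boot finishes */
	}

	/*
	 * "inline" is only a hint: if the compiler emits this out of line, it
	 * lands in regular .text and keeps a reference into discarded
	 * .init.text, which is exactly the modpost mismatch quoted above.
	 */
	static inline void helper(void)
	{
		setup_thing();
	}

	/*
	 * __always_inline removes the choice: the body is folded into its
	 * (__init-only) callers, so no cross-section reference survives.
	 */
	static __always_inline void helper_fixed(void)
	{
		setup_thing();
	}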
Link: https://lore.kernel.org/r/20250724055029.3623499-2-kees@kernel.org Signed-off-by: Kees Cook --- kernel/kexec_handover.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 69b953551677..f3f6bfe43d47 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -305,8 +305,8 @@ err_free: return -ENOMEM; } -static void deserialize_bitmap(unsigned int order, - struct khoser_mem_bitmap_ptr *elm) +static void __init deserialize_bitmap(unsigned int order, + struct khoser_mem_bitmap_ptr *elm) { struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap); unsigned long bit; -- cgit v1.2.3 From 91a229bb7ba86b2592c3f18c54b7b2c5e6fe0f95 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sat, 19 Jul 2025 20:26:04 +0900 Subject: resource: fix false warning in __request_region() A warning is raised when __request_region() detects a conflict with a resource whose resource.desc is IORES_DESC_DEVICE_PRIVATE_MEMORY. But this warning is only valid for iomem_resources. The hmem device resource uses resource.desc as the numa node id, which can cause spurious warnings. This warning appeared on a machine with multiple cxl memory expanders. One of the NUMA node ids is 6, which is the same as the value of IORES_DESC_DEVICE_PRIVATE_MEMORY. In this environment it was just a spurious warning, but when I saw the warning I suspected a real problem so it's better to fix it. This change fixes this by restricting the warning to only iomem_resource. This also adds a missing newline to the warning message. Link: https://lkml.kernel.org/r/20250719112604.25500-1-akinobu.mita@gmail.com Fixes: 7dab174e2e27 ("dax/hmem: Move hmem device registration to dax_hmem.ko") Signed-off-by: Akinobu Mita Reviewed-by: Dan Williams Cc: Signed-off-by: Andrew Morton --- kernel/resource.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 8d3e6ed0bdc1..f9bb5481501a 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1279,8 +1279,9 @@ static int __request_region_locked(struct resource *res, struct resource *parent * become unavailable to other users. Conflicts are * not expected. Warn to aid debugging if encountered. */ - if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) { - pr_warn("Unaddressable device %s %pR conflicts with %pR", + if (parent == &iomem_resource && + conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) { + pr_warn("Unaddressable device %s %pR conflicts with %pR\n", conflict->name, conflict, res); } if (conflict != parent) { -- cgit v1.2.3 From 8c4e53a1a09374c232fe96232426226b2824f473 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 23 Jul 2025 15:41:45 -0400 Subject: tracing: Call trace_ftrace_test_filter() for the event The trace event filter bootup self test tests a bunch of filter logic against the ftrace_test_filter event, but does not actually call the event. Work is being done to cause a warning if an event is defined but not used. To quiet the warning, call the trace event under an if statement where it is disabled so it doesn't get optimized out.
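For reference, the guarded call stays essentially free: every TRACE_EVENT() generates a trace_<event>() wrapper behind a static branch, roughly like this (simplified from the tracepoint machinery, not the literal expansion):

	static inline void trace_ftrace_test_filter(/* proto */)
	{
		if (static_branch_unlikely(&__tracepoint_ftrace_test_filter.key))
			/* iterate and invoke attached probes */;
	}

so calling it while the event is disabled executes only a patched-out branch, yet the reference still counts as a use of the event.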
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Arnd Bergmann Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Nick Desaulniers Cc: Catalin Marinas Cc: Linus Torvalds Link: https://lore.kernel.org/20250723194212.274458858@kernel.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_filter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 3885aadc434d..e4581e10782b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -2900,6 +2900,10 @@ static __init int ftrace_test_event_filter(void) if (i == DATA_CNT) printk(KERN_CONT "OK\n"); + /* Need to call ftrace_test_filter to prevent a warning */ + if (!trace_ftrace_test_filter_enabled()) + trace_ftrace_test_filter(1, 2, 3, 4, 5, 6, 7, 8); + return 0; } -- cgit v1.2.3 From 71c52411c51bf4f0869c572294ce8123b26528d5 Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Wed, 23 Jul 2025 01:30:29 +0000 Subject: net: Create separate gro_flush_normal function Move multiple copies of same code snippet doing `gro_flush` and `gro_normal_list` into separate helper function. Signed-off-by: Samiullah Khawaja Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250723013031.2911384-2-skhawaja@google.com Signed-off-by: Jakub Kicinski --- kernel/bpf/cpumap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 67e8a2fc1a99..b2b7b8ec2c2a 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -282,8 +282,7 @@ static void cpu_map_gro_flush(struct bpf_cpu_map_entry *rcpu, bool empty) * This is equivalent to how NAPI decides whether to perform a full * flush. */ - gro_flush(&rcpu->gro, !empty && HZ >= 1000); - gro_normal_list(&rcpu->gro); + gro_flush_normal(&rcpu->gro, !empty && HZ >= 1000); } static int cpu_map_kthread_run(void *data) -- cgit v1.2.3 From b0c08dd5348dfd8098bdb815bb04e7c59dfaea79 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 24 Jul 2025 19:33:26 +0200 Subject: rv: Remove unused field in struct rv_monitor_def rv_monitor_def::task_monitor is not used. Delete it. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/502d94f2696435690a2b1fdbe80a9e56c96fcabf.1753378331.git.namcao@linutronix.de Reviewed-by: Gabriele Monaco Signed-off-by: Nam Cao Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/rv.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h index 98fca0a1adbc..873364094402 100644 --- a/kernel/trace/rv/rv.h +++ b/kernel/trace/rv/rv.h @@ -41,7 +41,6 @@ struct rv_monitor_def { struct rv_reactor_def *rdef; bool reacting; #endif - bool task_monitor; }; struct dentry *get_monitors_root(void); -- cgit v1.2.3 From 24cbfe18d55a6866bc2e27fda74306f4a1b5cb01 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 24 Jul 2025 19:33:27 +0200 Subject: rv: Merge struct rv_monitor_def into struct rv_monitor Each struct rv_monitor has a unique struct rv_monitor_def associated with it. struct rv_monitor is statically allocated, while struct rv_monitor_def is dynamically allocated. This makes the code more complicated than it should be: - Lookup is required to get the associated rv_monitor_def from rv_monitor - Dynamic memory allocation is required for rv_monitor_def. This is harder to get right compared to static memory. 
For instance, there is an existing mistake: rv_unregister_monitor() does not free the memory allocated by rv_register_monitor(). This is fortunately not a real memory leak problem, as rv_unregister_monitor() is never called. Simplify and merge rv_monitor_def into rv_monitor. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/194449c00f87945c207aab4c96920c75796a4f53.1753378331.git.namcao@linutronix.de Reviewed-by: Gabriele Monaco Signed-off-by: Nam Cao Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/rv.c | 211 +++++++++++++++++++----------------------- kernel/trace/rv/rv.h | 27 ++---- kernel/trace/rv/rv_reactors.c | 62 ++++++------- 3 files changed, 132 insertions(+), 168 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 108429d16ec1..6c0be2fdc52d 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -210,9 +210,9 @@ void rv_put_task_monitor_slot(int slot) * Monitors with a parent are nested, * Monitors without a parent could be standalone or containers. */ -bool rv_is_nested_monitor(struct rv_monitor_def *mdef) +bool rv_is_nested_monitor(struct rv_monitor *mon) { - return mdef->parent != NULL; + return mon->parent != NULL; } /* @@ -223,16 +223,16 @@ bool rv_is_nested_monitor(struct rv_monitor_def *mdef) * for enable()/disable(). Use this condition to find empty containers. * Keep both conditions in case we have some non-compliant containers. */ -bool rv_is_container_monitor(struct rv_monitor_def *mdef) +bool rv_is_container_monitor(struct rv_monitor *mon) { - struct rv_monitor_def *next; + struct rv_monitor *next; - if (list_is_last(&mdef->list, &rv_monitors_list)) + if (list_is_last(&mon->list, &rv_monitors_list)) return false; - next = list_next_entry(mdef, list); + next = list_next_entry(mon, list); - return next->parent == mdef->monitor || !mdef->monitor->enable; + return next->parent == mon || !mon->enable; } /* @@ -241,10 +241,10 @@ bool rv_is_container_monitor(struct rv_monitor_def *mdef) static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf, size_t count, loff_t *ppos) { - struct rv_monitor_def *mdef = filp->private_data; + struct rv_monitor *mon = filp->private_data; const char *buff; - buff = mdef->monitor->enabled ? "1\n" : "0\n"; + buff = mon->enabled ? "1\n" : "0\n"; return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff)+1); } @@ -252,14 +252,14 @@ static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf /* * __rv_disable_monitor - disabled an enabled monitor */ -static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync) +static int __rv_disable_monitor(struct rv_monitor *mon, bool sync) { lockdep_assert_held(&rv_interface_lock); - if (mdef->monitor->enabled) { - mdef->monitor->enabled = 0; - if (mdef->monitor->disable) - mdef->monitor->disable(); + if (mon->enabled) { + mon->enabled = 0; + if (mon->disable) + mon->disable(); /* * Wait for the execution of all events to finish. 
@@ -273,90 +273,90 @@ static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync) return 0; } -static void rv_disable_single(struct rv_monitor_def *mdef) +static void rv_disable_single(struct rv_monitor *mon) { - __rv_disable_monitor(mdef, true); + __rv_disable_monitor(mon, true); } -static int rv_enable_single(struct rv_monitor_def *mdef) +static int rv_enable_single(struct rv_monitor *mon) { int retval; lockdep_assert_held(&rv_interface_lock); - if (mdef->monitor->enabled) + if (mon->enabled) return 0; - retval = mdef->monitor->enable(); + retval = mon->enable(); if (!retval) - mdef->monitor->enabled = 1; + mon->enabled = 1; return retval; } -static void rv_disable_container(struct rv_monitor_def *mdef) +static void rv_disable_container(struct rv_monitor *mon) { - struct rv_monitor_def *p = mdef; + struct rv_monitor *p = mon; int enabled = 0; list_for_each_entry_continue(p, &rv_monitors_list, list) { - if (p->parent != mdef->monitor) + if (p->parent != mon) break; enabled += __rv_disable_monitor(p, false); } if (enabled) tracepoint_synchronize_unregister(); - mdef->monitor->enabled = 0; + mon->enabled = 0; } -static int rv_enable_container(struct rv_monitor_def *mdef) +static int rv_enable_container(struct rv_monitor *mon) { - struct rv_monitor_def *p = mdef; + struct rv_monitor *p = mon; int retval = 0; list_for_each_entry_continue(p, &rv_monitors_list, list) { - if (retval || p->parent != mdef->monitor) + if (retval || p->parent != mon) break; retval = rv_enable_single(p); } if (retval) - rv_disable_container(mdef); + rv_disable_container(mon); else - mdef->monitor->enabled = 1; + mon->enabled = 1; return retval; } /** * rv_disable_monitor - disable a given runtime monitor - * @mdef: Pointer to the monitor definition structure. + * @mon: Pointer to the monitor definition structure. * * Returns 0 on success. */ -int rv_disable_monitor(struct rv_monitor_def *mdef) +int rv_disable_monitor(struct rv_monitor *mon) { - if (rv_is_container_monitor(mdef)) - rv_disable_container(mdef); + if (rv_is_container_monitor(mon)) + rv_disable_container(mon); else - rv_disable_single(mdef); + rv_disable_single(mon); return 0; } /** * rv_enable_monitor - enable a given runtime monitor - * @mdef: Pointer to the monitor definition structure. + * @mon: Pointer to the monitor definition structure. * * Returns 0 on success, error otherwise. 
*/ -int rv_enable_monitor(struct rv_monitor_def *mdef) +int rv_enable_monitor(struct rv_monitor *mon) { int retval; - if (rv_is_container_monitor(mdef)) - retval = rv_enable_container(mdef); + if (rv_is_container_monitor(mon)) + retval = rv_enable_container(mon); else - retval = rv_enable_single(mdef); + retval = rv_enable_single(mon); return retval; } @@ -367,7 +367,7 @@ int rv_enable_monitor(struct rv_monitor_def *mdef) static ssize_t monitor_enable_write_data(struct file *filp, const char __user *user_buf, size_t count, loff_t *ppos) { - struct rv_monitor_def *mdef = filp->private_data; + struct rv_monitor *mon = filp->private_data; int retval; bool val; @@ -378,9 +378,9 @@ static ssize_t monitor_enable_write_data(struct file *filp, const char __user *u mutex_lock(&rv_interface_lock); if (val) - retval = rv_enable_monitor(mdef); + retval = rv_enable_monitor(mon); else - retval = rv_disable_monitor(mdef); + retval = rv_disable_monitor(mon); mutex_unlock(&rv_interface_lock); @@ -399,12 +399,12 @@ static const struct file_operations interface_enable_fops = { static ssize_t monitor_desc_read_data(struct file *filp, char __user *user_buf, size_t count, loff_t *ppos) { - struct rv_monitor_def *mdef = filp->private_data; + struct rv_monitor *mon = filp->private_data; char buff[256]; memset(buff, 0, sizeof(buff)); - snprintf(buff, sizeof(buff), "%s\n", mdef->monitor->description); + snprintf(buff, sizeof(buff), "%s\n", mon->description); return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff) + 1); } @@ -419,37 +419,37 @@ static const struct file_operations interface_desc_fops = { * the monitor dir, where the specific options of the monitor * are exposed. */ -static int create_monitor_dir(struct rv_monitor_def *mdef, struct rv_monitor_def *parent) +static int create_monitor_dir(struct rv_monitor *mon, struct rv_monitor *parent) { struct dentry *root = parent ? 
parent->root_d : get_monitors_root(); - const char *name = mdef->monitor->name; + const char *name = mon->name; struct dentry *tmp; int retval; - mdef->root_d = rv_create_dir(name, root); - if (!mdef->root_d) + mon->root_d = rv_create_dir(name, root); + if (!mon->root_d) return -ENOMEM; - tmp = rv_create_file("enable", RV_MODE_WRITE, mdef->root_d, mdef, &interface_enable_fops); + tmp = rv_create_file("enable", RV_MODE_WRITE, mon->root_d, mon, &interface_enable_fops); if (!tmp) { retval = -ENOMEM; goto out_remove_root; } - tmp = rv_create_file("desc", RV_MODE_READ, mdef->root_d, mdef, &interface_desc_fops); + tmp = rv_create_file("desc", RV_MODE_READ, mon->root_d, mon, &interface_desc_fops); if (!tmp) { retval = -ENOMEM; goto out_remove_root; } - retval = reactor_populate_monitor(mdef); + retval = reactor_populate_monitor(mon); if (retval) goto out_remove_root; return 0; out_remove_root: - rv_remove(mdef->root_d); + rv_remove(mon->root_d); return retval; } @@ -458,13 +458,12 @@ out_remove_root: */ static int monitors_show(struct seq_file *m, void *p) { - struct rv_monitor_def *mon_def = p; + struct rv_monitor *mon = p; - if (mon_def->parent) - seq_printf(m, "%s:%s\n", mon_def->parent->name, - mon_def->monitor->name); + if (mon->parent) + seq_printf(m, "%s:%s\n", mon->parent->name, mon->name); else - seq_printf(m, "%s\n", mon_def->monitor->name); + seq_printf(m, "%s\n", mon->name); return 0; } @@ -496,13 +495,13 @@ static void *available_monitors_next(struct seq_file *m, void *p, loff_t *pos) */ static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos) { - struct rv_monitor_def *m_def = p; + struct rv_monitor *mon = p; (*pos)++; - list_for_each_entry_continue(m_def, &rv_monitors_list, list) { - if (m_def->monitor->enabled) - return m_def; + list_for_each_entry_continue(mon, &rv_monitors_list, list) { + if (mon->enabled) + return mon; } return NULL; @@ -510,7 +509,7 @@ static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos) static void *enabled_monitors_start(struct seq_file *m, loff_t *pos) { - struct rv_monitor_def *m_def; + struct rv_monitor *mon; loff_t l; mutex_lock(&rv_interface_lock); @@ -518,15 +517,15 @@ static void *enabled_monitors_start(struct seq_file *m, loff_t *pos) if (list_empty(&rv_monitors_list)) return NULL; - m_def = list_entry(&rv_monitors_list, struct rv_monitor_def, list); + mon = list_entry(&rv_monitors_list, struct rv_monitor, list); for (l = 0; l <= *pos; ) { - m_def = enabled_monitors_next(m, m_def, &l); - if (!m_def) + mon = enabled_monitors_next(m, mon, &l); + if (!mon) break; } - return m_def; + return mon; } /* @@ -566,13 +565,13 @@ static const struct file_operations available_monitors_ops = { */ static void disable_all_monitors(void) { - struct rv_monitor_def *mdef; + struct rv_monitor *mon; int enabled = 0; mutex_lock(&rv_interface_lock); - list_for_each_entry(mdef, &rv_monitors_list, list) - enabled += __rv_disable_monitor(mdef, false); + list_for_each_entry(mon, &rv_monitors_list, list) + enabled += __rv_disable_monitor(mon, false); if (enabled) { /* @@ -598,7 +597,7 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user size_t count, loff_t *ppos) { char buff[MAX_RV_MONITOR_NAME_SIZE + 2]; - struct rv_monitor_def *mdef; + struct rv_monitor *mon; int retval = -EINVAL; bool enable = true; char *ptr, *tmp; @@ -633,17 +632,17 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user if (tmp) ptr = tmp+1; - list_for_each_entry(mdef, &rv_monitors_list, list) { - if 
(strcmp(ptr, mdef->monitor->name) != 0) + list_for_each_entry(mon, &rv_monitors_list, list) { + if (strcmp(ptr, mon->name) != 0) continue; /* * Monitor found! */ if (enable) - retval = rv_enable_monitor(mdef); + retval = rv_enable_monitor(mon); else - retval = rv_disable_monitor(mdef); + retval = rv_disable_monitor(mon); if (!retval) retval = count; @@ -702,11 +701,11 @@ static void turn_monitoring_off(void) static void reset_all_monitors(void) { - struct rv_monitor_def *mdef; + struct rv_monitor *mon; - list_for_each_entry(mdef, &rv_monitors_list, list) { - if (mdef->monitor->enabled && mdef->monitor->reset) - mdef->monitor->reset(); + list_for_each_entry(mon, &rv_monitors_list, list) { + if (mon->enabled && mon->reset) + mon->reset(); } } @@ -768,10 +767,10 @@ static const struct file_operations monitoring_on_fops = { .read = monitoring_on_read_data, }; -static void destroy_monitor_dir(struct rv_monitor_def *mdef) +static void destroy_monitor_dir(struct rv_monitor *mon) { - reactor_cleanup_monitor(mdef); - rv_remove(mdef->root_d); + reactor_cleanup_monitor(mon); + rv_remove(mon->root_d); } /** @@ -783,7 +782,7 @@ static void destroy_monitor_dir(struct rv_monitor_def *mdef) */ int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent) { - struct rv_monitor_def *r, *p = NULL; + struct rv_monitor *r; int retval = 0; if (strlen(monitor->name) >= MAX_RV_MONITOR_NAME_SIZE) { @@ -795,49 +794,31 @@ int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent) mutex_lock(&rv_interface_lock); list_for_each_entry(r, &rv_monitors_list, list) { - if (strcmp(monitor->name, r->monitor->name) == 0) { + if (strcmp(monitor->name, r->name) == 0) { pr_info("Monitor %s is already registered\n", monitor->name); retval = -EEXIST; goto out_unlock; } } - if (parent) { - list_for_each_entry(r, &rv_monitors_list, list) { - if (strcmp(parent->name, r->monitor->name) == 0) { - p = r; - break; - } - } - } - - if (p && rv_is_nested_monitor(p)) { + if (parent && rv_is_nested_monitor(parent)) { pr_info("Parent monitor %s is already nested, cannot nest further\n", parent->name); retval = -EINVAL; goto out_unlock; } - r = kzalloc(sizeof(struct rv_monitor_def), GFP_KERNEL); - if (!r) { - retval = -ENOMEM; - goto out_unlock; - } - - r->monitor = monitor; - r->parent = parent; + monitor->parent = parent; - retval = create_monitor_dir(r, p); - if (retval) { - kfree(r); - goto out_unlock; - } + retval = create_monitor_dir(monitor, parent); + if (retval) + return retval; /* keep children close to the parent for easier visualisation */ - if (p) - list_add(&r->list, &p->list); + if (parent) + list_add(&monitor->list, &parent->list); else - list_add_tail(&r->list, &rv_monitors_list); + list_add_tail(&monitor->list, &rv_monitors_list); out_unlock: mutex_unlock(&rv_interface_lock); @@ -852,17 +833,11 @@ out_unlock: */ int rv_unregister_monitor(struct rv_monitor *monitor) { - struct rv_monitor_def *ptr, *next; - mutex_lock(&rv_interface_lock); - list_for_each_entry_safe(ptr, next, &rv_monitors_list, list) { - if (strcmp(monitor->name, ptr->monitor->name) == 0) { - rv_disable_monitor(ptr); - list_del(&ptr->list); - destroy_monitor_dir(ptr); - } - } + rv_disable_monitor(monitor); + list_del(&monitor->list); + destroy_monitor_dir(monitor); mutex_unlock(&rv_interface_lock); return 0; diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h index 873364094402..f039ec1c9156 100644 --- a/kernel/trace/rv/rv.h +++ b/kernel/trace/rv/rv.h @@ -32,34 +32,23 @@ struct rv_reactor_def { }; #endif -struct 
rv_monitor_def { - struct list_head list; - struct rv_monitor *monitor; - struct rv_monitor *parent; - struct dentry *root_d; -#ifdef CONFIG_RV_REACTORS - struct rv_reactor_def *rdef; - bool reacting; -#endif -}; - struct dentry *get_monitors_root(void); -int rv_disable_monitor(struct rv_monitor_def *mdef); -int rv_enable_monitor(struct rv_monitor_def *mdef); -bool rv_is_container_monitor(struct rv_monitor_def *mdef); -bool rv_is_nested_monitor(struct rv_monitor_def *mdef); +int rv_disable_monitor(struct rv_monitor *mon); +int rv_enable_monitor(struct rv_monitor *mon); +bool rv_is_container_monitor(struct rv_monitor *mon); +bool rv_is_nested_monitor(struct rv_monitor *mon); #ifdef CONFIG_RV_REACTORS -int reactor_populate_monitor(struct rv_monitor_def *mdef); -void reactor_cleanup_monitor(struct rv_monitor_def *mdef); +int reactor_populate_monitor(struct rv_monitor *mon); +void reactor_cleanup_monitor(struct rv_monitor *mon); int init_rv_reactors(struct dentry *root_dir); #else -static inline int reactor_populate_monitor(struct rv_monitor_def *mdef) +static inline int reactor_populate_monitor(struct rv_monitor *mon) { return 0; } -static inline void reactor_cleanup_monitor(struct rv_monitor_def *mdef) +static inline void reactor_cleanup_monitor(struct rv_monitor *mon) { return; } diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c index 740603670dd1..7cc620a1be1a 100644 --- a/kernel/trace/rv/rv_reactors.c +++ b/kernel/trace/rv/rv_reactors.c @@ -138,10 +138,10 @@ static const struct file_operations available_reactors_ops = { */ static int monitor_reactor_show(struct seq_file *m, void *p) { - struct rv_monitor_def *mdef = m->private; + struct rv_monitor *mon = m->private; struct rv_reactor_def *rdef = p; - if (mdef->rdef == rdef) + if (mon->rdef == rdef) seq_printf(m, "[%s]\n", rdef->reactor->name); else seq_printf(m, "%s\n", rdef->reactor->name); @@ -158,41 +158,41 @@ static const struct seq_operations monitor_reactors_seq_ops = { .show = monitor_reactor_show }; -static void monitor_swap_reactors_single(struct rv_monitor_def *mdef, +static void monitor_swap_reactors_single(struct rv_monitor *mon, struct rv_reactor_def *rdef, bool reacting, bool nested) { bool monitor_enabled; /* nothing to do */ - if (mdef->rdef == rdef) + if (mon->rdef == rdef) return; - monitor_enabled = mdef->monitor->enabled; + monitor_enabled = mon->enabled; if (monitor_enabled) - rv_disable_monitor(mdef); + rv_disable_monitor(mon); /* swap reactor's usage */ - mdef->rdef->counter--; + mon->rdef->counter--; rdef->counter++; - mdef->rdef = rdef; - mdef->reacting = reacting; - mdef->monitor->react = rdef->reactor->react; + mon->rdef = rdef; + mon->reacting = reacting; + mon->react = rdef->reactor->react; /* enable only once if iterating through a container */ if (monitor_enabled && !nested) - rv_enable_monitor(mdef); + rv_enable_monitor(mon); } -static void monitor_swap_reactors(struct rv_monitor_def *mdef, +static void monitor_swap_reactors(struct rv_monitor *mon, struct rv_reactor_def *rdef, bool reacting) { - struct rv_monitor_def *p = mdef; + struct rv_monitor *p = mon; - if (rv_is_container_monitor(mdef)) + if (rv_is_container_monitor(mon)) list_for_each_entry_continue(p, &rv_monitors_list, list) { - if (p->parent != mdef->monitor) + if (p->parent != mon) break; monitor_swap_reactors_single(p, rdef, reacting, true); } @@ -202,7 +202,7 @@ static void monitor_swap_reactors(struct rv_monitor_def *mdef, * All nested monitors are enabled also if they were off, we may refine * this logic in the 
future. */ - monitor_swap_reactors_single(mdef, rdef, reacting, false); + monitor_swap_reactors_single(mon, rdef, reacting, false); } static ssize_t @@ -210,7 +210,7 @@ monitor_reactors_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { char buff[MAX_RV_REACTOR_NAME_SIZE + 2]; - struct rv_monitor_def *mdef; + struct rv_monitor *mon; struct rv_reactor_def *rdef; struct seq_file *seq_f; int retval = -EINVAL; @@ -237,7 +237,7 @@ monitor_reactors_write(struct file *file, const char __user *user_buf, * See monitor_reactors_open() */ seq_f = file->private_data; - mdef = seq_f->private; + mon = seq_f->private; mutex_lock(&rv_interface_lock); @@ -252,7 +252,7 @@ monitor_reactors_write(struct file *file, const char __user *user_buf, else enable = true; - monitor_swap_reactors(mdef, rdef, enable); + monitor_swap_reactors(mon, rdef, enable); retval = count; break; @@ -268,7 +268,7 @@ monitor_reactors_write(struct file *file, const char __user *user_buf, */ static int monitor_reactors_open(struct inode *inode, struct file *file) { - struct rv_monitor_def *mdef = inode->i_private; + struct rv_monitor *mon = inode->i_private; struct seq_file *seq_f; int ret; @@ -284,7 +284,7 @@ static int monitor_reactors_open(struct inode *inode, struct file *file) /* * Copy the create file "private" data to the seq_file private data. */ - seq_f->private = mdef; + seq_f->private = mon; return 0; }; @@ -454,37 +454,37 @@ static const struct file_operations reacting_on_fops = { /** * reactor_populate_monitor - creates per monitor reactors file - * @mdef: monitor's definition. + * @mon: The monitor. * * Returns 0 if successful, error otherwise. */ -int reactor_populate_monitor(struct rv_monitor_def *mdef) +int reactor_populate_monitor(struct rv_monitor *mon) { struct dentry *tmp; - tmp = rv_create_file("reactors", RV_MODE_WRITE, mdef->root_d, mdef, &monitor_reactors_ops); + tmp = rv_create_file("reactors", RV_MODE_WRITE, mon->root_d, mon, &monitor_reactors_ops); if (!tmp) return -ENOMEM; /* * Configure as the rv_nop reactor. */ - mdef->rdef = get_reactor_rdef_by_name("nop"); - mdef->rdef->counter++; - mdef->reacting = false; + mon->rdef = get_reactor_rdef_by_name("nop"); + mon->rdef->counter++; + mon->reacting = false; return 0; } /** * reactor_cleanup_monitor - cleanup a monitor reference - * @mdef: monitor's definition. + * @mon: the monitor. */ -void reactor_cleanup_monitor(struct rv_monitor_def *mdef) +void reactor_cleanup_monitor(struct rv_monitor *mon) { lockdep_assert_held(&rv_interface_lock); - mdef->rdef->counter--; - WARN_ON_ONCE(mdef->rdef->counter < 0); + mon->rdef->counter--; + WARN_ON_ONCE(mon->rdef->counter < 0); } /* -- cgit v1.2.3 From 3d3c376118b5f7ed7723c2b4fd7a0a1c1893d63e Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 24 Jul 2025 19:33:28 +0200 Subject: rv: Merge struct rv_reactor_def into struct rv_reactor Each struct rv_reactor has a unique struct rv_reactor_def associated with it. struct rv_reactor is statically allocated, while struct rv_reactor_def is dynamically allocated. This makes the code more complicated than it should be: - Lookup is required to get the associated rv_reactor_def from rv_reactor - Dynamic memory allocation is required for rv_reactor_def. This is harder to get right compared to static memory. For instance, there is an existing mistake: rv_unregister_reactor() does not free the memory allocated by rv_register_reactor(). This is fortunately not a real memory leak problem as rv_unregister_reactor() is never called. 
Simplify and merge rv_reactor_def into rv_reactor. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/71cb91c86cd40df5b8c492b788787f2a73c3eaa3.1753378331.git.namcao@linutronix.de Reviewed-by: Gabriele Monaco Signed-off-by: Nam Cao Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/rv.h | 9 ----- kernel/trace/rv/rv_reactors.c | 92 ++++++++++++++++++------------------------- 2 files changed, 39 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h index f039ec1c9156..8c38f9dd41bc 100644 --- a/kernel/trace/rv/rv.h +++ b/kernel/trace/rv/rv.h @@ -23,15 +23,6 @@ struct rv_interface { extern struct mutex rv_interface_lock; extern struct list_head rv_monitors_list; -#ifdef CONFIG_RV_REACTORS -struct rv_reactor_def { - struct list_head list; - struct rv_reactor *reactor; - /* protected by the monitor interface lock */ - int counter; -}; -#endif - struct dentry *get_monitors_root(void); int rv_disable_monitor(struct rv_monitor *mon); int rv_enable_monitor(struct rv_monitor *mon); diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c index 7cc620a1be1a..2c7909e6d0e7 100644 --- a/kernel/trace/rv/rv_reactors.c +++ b/kernel/trace/rv/rv_reactors.c @@ -70,12 +70,12 @@ */ static LIST_HEAD(rv_reactors_list); -static struct rv_reactor_def *get_reactor_rdef_by_name(char *name) +static struct rv_reactor *get_reactor_rdef_by_name(char *name) { - struct rv_reactor_def *r; + struct rv_reactor *r; list_for_each_entry(r, &rv_reactors_list, list) { - if (strcmp(name, r->reactor->name) == 0) + if (strcmp(name, r->name) == 0) return r; } return NULL; @@ -86,9 +86,9 @@ static struct rv_reactor_def *get_reactor_rdef_by_name(char *name) */ static int reactors_show(struct seq_file *m, void *p) { - struct rv_reactor_def *rea_def = p; + struct rv_reactor *reactor = p; - seq_printf(m, "%s\n", rea_def->reactor->name); + seq_printf(m, "%s\n", reactor->name); return 0; } @@ -139,12 +139,12 @@ static const struct file_operations available_reactors_ops = { static int monitor_reactor_show(struct seq_file *m, void *p) { struct rv_monitor *mon = m->private; - struct rv_reactor_def *rdef = p; + struct rv_reactor *reactor = p; - if (mon->rdef == rdef) - seq_printf(m, "[%s]\n", rdef->reactor->name); + if (mon->reactor == reactor) + seq_printf(m, "[%s]\n", reactor->name); else - seq_printf(m, "%s\n", rdef->reactor->name); + seq_printf(m, "%s\n", reactor->name); return 0; } @@ -159,13 +159,13 @@ static const struct seq_operations monitor_reactors_seq_ops = { }; static void monitor_swap_reactors_single(struct rv_monitor *mon, - struct rv_reactor_def *rdef, + struct rv_reactor *reactor, bool reacting, bool nested) { bool monitor_enabled; /* nothing to do */ - if (mon->rdef == rdef) + if (mon->reactor == reactor) return; monitor_enabled = mon->enabled; @@ -173,12 +173,12 @@ static void monitor_swap_reactors_single(struct rv_monitor *mon, rv_disable_monitor(mon); /* swap reactor's usage */ - mon->rdef->counter--; - rdef->counter++; + mon->reactor->counter--; + reactor->counter++; - mon->rdef = rdef; + mon->reactor = reactor; mon->reacting = reacting; - mon->react = rdef->reactor->react; + mon->react = reactor->react; /* enable only once if iterating through a container */ if (monitor_enabled && !nested) @@ -186,7 +186,7 @@ static void monitor_swap_reactors_single(struct rv_monitor *mon, } static void monitor_swap_reactors(struct rv_monitor *mon, - struct rv_reactor_def *rdef, bool reacting) + struct rv_reactor *reactor, bool reacting) { 
struct rv_monitor *p = mon; @@ -194,7 +194,7 @@ static void monitor_swap_reactors(struct rv_monitor *mon, list_for_each_entry_continue(p, &rv_monitors_list, list) { if (p->parent != mon) break; - monitor_swap_reactors_single(p, rdef, reacting, true); + monitor_swap_reactors_single(p, reactor, reacting, true); } /* * This call enables and disables the monitor if they were active. @@ -202,7 +202,7 @@ static void monitor_swap_reactors(struct rv_monitor *mon, * All nested monitors are enabled also if they were off, we may refine * this logic in the future. */ - monitor_swap_reactors_single(mon, rdef, reacting, false); + monitor_swap_reactors_single(mon, reactor, reacting, false); } static ssize_t @@ -211,7 +211,7 @@ monitor_reactors_write(struct file *file, const char __user *user_buf, { char buff[MAX_RV_REACTOR_NAME_SIZE + 2]; struct rv_monitor *mon; - struct rv_reactor_def *rdef; + struct rv_reactor *reactor; struct seq_file *seq_f; int retval = -EINVAL; bool enable; @@ -243,16 +243,16 @@ monitor_reactors_write(struct file *file, const char __user *user_buf, retval = -EINVAL; - list_for_each_entry(rdef, &rv_reactors_list, list) { - if (strcmp(ptr, rdef->reactor->name) != 0) + list_for_each_entry(reactor, &rv_reactors_list, list) { + if (strcmp(ptr, reactor->name) != 0) continue; - if (rdef == get_reactor_rdef_by_name("nop")) + if (strcmp(reactor->name, "nop")) enable = false; else enable = true; - monitor_swap_reactors(mon, rdef, enable); + monitor_swap_reactors(mon, reactor, enable); retval = count; break; @@ -299,23 +299,16 @@ static const struct file_operations monitor_reactors_ops = { static int __rv_register_reactor(struct rv_reactor *reactor) { - struct rv_reactor_def *r; + struct rv_reactor *r; list_for_each_entry(r, &rv_reactors_list, list) { - if (strcmp(reactor->name, r->reactor->name) == 0) { + if (strcmp(reactor->name, r->name) == 0) { pr_info("Reactor %s is already registered\n", reactor->name); return -EINVAL; } } - r = kzalloc(sizeof(struct rv_reactor_def), GFP_KERNEL); - if (!r) - return -ENOMEM; - - r->reactor = reactor; - r->counter = 0; - - list_add_tail(&r->list, &rv_reactors_list); + list_add_tail(&reactor->list, &rv_reactors_list); return 0; } @@ -350,26 +343,19 @@ int rv_register_reactor(struct rv_reactor *reactor) */ int rv_unregister_reactor(struct rv_reactor *reactor) { - struct rv_reactor_def *ptr, *next; int ret = 0; mutex_lock(&rv_interface_lock); - list_for_each_entry_safe(ptr, next, &rv_reactors_list, list) { - if (strcmp(reactor->name, ptr->reactor->name) == 0) { - - if (!ptr->counter) { - list_del(&ptr->list); - } else { - printk(KERN_WARNING - "rv: the rv_reactor %s is in use by %d monitor(s)\n", - ptr->reactor->name, ptr->counter); - printk(KERN_WARNING "rv: the rv_reactor %s cannot be removed\n", - ptr->reactor->name); - ret = -EBUSY; - break; - } - } + if (!reactor->counter) { + list_del(&reactor->list); + } else { + printk(KERN_WARNING + "rv: the rv_reactor %s is in use by %d monitor(s)\n", + reactor->name, reactor->counter); + printk(KERN_WARNING "rv: the rv_reactor %s cannot be removed\n", + reactor->name); + ret = -EBUSY; } mutex_unlock(&rv_interface_lock); @@ -469,8 +455,8 @@ int reactor_populate_monitor(struct rv_monitor *mon) /* * Configure as the rv_nop reactor. 
*/ - mon->rdef = get_reactor_rdef_by_name("nop"); - mon->rdef->counter++; + mon->reactor = get_reactor_rdef_by_name("nop"); + mon->reactor->counter++; mon->reacting = false; return 0; @@ -483,8 +469,8 @@ int reactor_populate_monitor(struct rv_monitor *mon) void reactor_cleanup_monitor(struct rv_monitor *mon) { lockdep_assert_held(&rv_interface_lock); - mon->rdef->counter--; - WARN_ON_ONCE(mon->rdef->counter < 0); + mon->reactor->counter--; + WARN_ON_ONCE(mon->reactor->counter < 0); } /* -- cgit v1.2.3 From 3d3800b4f7f4b1472a0ec2cffd535c05603f8f60 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 24 Jul 2025 19:33:29 +0200 Subject: rv: Remove rv_reactor's reference counter rv_reactor has a reference counter to ensure it is not removed while monitors are still using it. However, this is futile, as __exit functions are not expected to fail and will proceed normally despite rv_unregister_reactor() returning an error. At the moment, reactors do not support being built as modules, therefore they are never removed and the reference counters are not necessary. If we support building RV reactors as modules in the future, kernel module's centralized facilities such as try_module_get(), module_put() or MODULE_SOFTDEP should be used instead of this custom implementation. Remove this reference counter. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/bb946398436a5e17fb0f5b842ef3313c02291852.1753378331.git.namcao@linutronix.de Reviewed-by: Gabriele Monaco Signed-off-by: Nam Cao Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/rv.c | 1 - kernel/trace/rv/rv.h | 6 ------ kernel/trace/rv/rv_reactors.c | 33 ++------------------------------- 3 files changed, 2 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 6c0be2fdc52d..6c8498743b98 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -769,7 +769,6 @@ static const struct file_operations monitoring_on_fops = { static void destroy_monitor_dir(struct rv_monitor *mon) { - reactor_cleanup_monitor(mon); rv_remove(mon->root_d); } diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h index 8c38f9dd41bc..1485a70c1bf4 100644 --- a/kernel/trace/rv/rv.h +++ b/kernel/trace/rv/rv.h @@ -31,7 +31,6 @@ bool rv_is_nested_monitor(struct rv_monitor *mon); #ifdef CONFIG_RV_REACTORS int reactor_populate_monitor(struct rv_monitor *mon); -void reactor_cleanup_monitor(struct rv_monitor *mon); int init_rv_reactors(struct dentry *root_dir); #else static inline int reactor_populate_monitor(struct rv_monitor *mon) @@ -39,11 +38,6 @@ static inline int reactor_populate_monitor(struct rv_monitor *mon) return 0; } -static inline void reactor_cleanup_monitor(struct rv_monitor *mon) -{ - return; -} - static inline int init_rv_reactors(struct dentry *root_dir) { return 0; diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c index 2c7909e6d0e7..a8e849e6cd85 100644 --- a/kernel/trace/rv/rv_reactors.c +++ b/kernel/trace/rv/rv_reactors.c @@ -172,10 +172,6 @@ static void monitor_swap_reactors_single(struct rv_monitor *mon, if (monitor_enabled) rv_disable_monitor(mon); - /* swap reactor's usage */ - mon->reactor->counter--; - reactor->counter++; - mon->reactor = reactor; mon->reacting = reacting; mon->react = reactor->react; @@ -343,23 +339,10 @@ int rv_register_reactor(struct rv_reactor *reactor) */ int rv_unregister_reactor(struct rv_reactor *reactor) { - int ret = 0; - mutex_lock(&rv_interface_lock); - - if (!reactor->counter) { - list_del(&reactor->list); - } 
else { - printk(KERN_WARNING - "rv: the rv_reactor %s is in use by %d monitor(s)\n", - reactor->name, reactor->counter); - printk(KERN_WARNING "rv: the rv_reactor %s cannot be removed\n", - reactor->name); - ret = -EBUSY; - } - + list_del(&reactor->list); mutex_unlock(&rv_interface_lock); - return ret; + return 0; } /* @@ -456,23 +439,11 @@ int reactor_populate_monitor(struct rv_monitor *mon) * Configure as the rv_nop reactor. */ mon->reactor = get_reactor_rdef_by_name("nop"); - mon->reactor->counter++; mon->reacting = false; return 0; } -/** - * reactor_cleanup_monitor - cleanup a monitor reference - * @mon: the monitor. - */ -void reactor_cleanup_monitor(struct rv_monitor *mon) -{ - lockdep_assert_held(&rv_interface_lock); - mon->reactor->counter--; - WARN_ON_ONCE(mon->reactor->counter < 0); -} - /* * Nop reactor register */ -- cgit v1.2.3 From b8a7fba39cd49eab343bfe561d85bb5dc57541af Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Thu, 24 Jul 2025 19:33:30 +0200 Subject: rv: Remove struct rv_monitor::reacting The field 'reacting' in struct rv_monitor is set but never used. Delete it. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/a6c16f845d2f1a09c4d0934ab83f3cb14478a71d.1753378331.git.namcao@linutronix.de Reviewed-by: Gabriele Monaco Signed-off-by: Nam Cao Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/rv_reactors.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c index a8e849e6cd85..106f2c4740f2 100644 --- a/kernel/trace/rv/rv_reactors.c +++ b/kernel/trace/rv/rv_reactors.c @@ -160,7 +160,7 @@ static const struct seq_operations monitor_reactors_seq_ops = { static void monitor_swap_reactors_single(struct rv_monitor *mon, struct rv_reactor *reactor, - bool reacting, bool nested) + bool nested) { bool monitor_enabled; @@ -173,7 +173,6 @@ static void monitor_swap_reactors_single(struct rv_monitor *mon, rv_disable_monitor(mon); mon->reactor = reactor; - mon->reacting = reacting; mon->react = reactor->react; /* enable only once if iterating through a container */ @@ -181,8 +180,7 @@ static void monitor_swap_reactors_single(struct rv_monitor *mon, rv_enable_monitor(mon); } -static void monitor_swap_reactors(struct rv_monitor *mon, - struct rv_reactor *reactor, bool reacting) +static void monitor_swap_reactors(struct rv_monitor *mon, struct rv_reactor *reactor) { struct rv_monitor *p = mon; @@ -190,7 +188,7 @@ static void monitor_swap_reactors(struct rv_monitor *mon, list_for_each_entry_continue(p, &rv_monitors_list, list) { if (p->parent != mon) break; - monitor_swap_reactors_single(p, reactor, reacting, true); + monitor_swap_reactors_single(p, reactor, true); } /* * This call enables and disables the monitor if they were active. @@ -198,7 +196,7 @@ static void monitor_swap_reactors(struct rv_monitor *mon, * All nested monitors are enabled also if they were off, we may refine * this logic in the future. 
*/ - monitor_swap_reactors_single(mon, reactor, reacting, false); + monitor_swap_reactors_single(mon, reactor, false); } static ssize_t @@ -210,7 +208,6 @@ monitor_reactors_write(struct file *file, const char __user *user_buf, struct rv_reactor *reactor; struct seq_file *seq_f; int retval = -EINVAL; - bool enable; char *ptr; int len; @@ -243,12 +240,7 @@ monitor_reactors_write(struct file *file, const char __user *user_buf, if (strcmp(ptr, reactor->name) != 0) continue; - if (strcmp(reactor->name, "nop")) - enable = false; - else - enable = true; - - monitor_swap_reactors(mon, reactor, enable); + monitor_swap_reactors(mon, reactor); retval = count; break; @@ -439,7 +431,6 @@ int reactor_populate_monitor(struct rv_monitor *mon) * Configure as the rv_nop reactor. */ mon->reactor = get_reactor_rdef_by_name("nop"); - mon->reacting = false; return 0; } -- cgit v1.2.3 From 2b03164eee20eac7ce0fe3aa4fbda7efc1e5427a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 21 Jul 2025 11:04:41 +0200 Subject: bpf/preload: Don't select USERMODE_DRIVER MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The usermode driver framework is not used anymore by the BPF preload code. Fixes: cb80ddc67152 ("bpf: Convert bpf_preload.ko to use light skeleton.") Signed-off-by: Thomas Weißschuh Signed-off-by: Daniel Borkmann Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/bpf/20250721-remove-usermode-driver-v1-1-0d0083334382@linutronix.de --- kernel/bpf/preload/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig index c9d45c9d6918..f9b11d01c3b5 100644 --- a/kernel/bpf/preload/Kconfig +++ b/kernel/bpf/preload/Kconfig @@ -10,7 +10,6 @@ menuconfig BPF_PRELOAD # The dependency on !COMPILE_TEST prevents it from being enabled # in allmodconfig or allyesconfig configurations depends on !COMPILE_TEST - select USERMODE_DRIVER help This builds kernel module with several embedded BPF programs that are pinned into BPF FS mount point as human readable files that are -- cgit v1.2.3 From b7b3500bd4eef2c3b5124ed195f26eb048407d9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 21 Jul 2025 11:04:42 +0200 Subject: umd: Remove usermode driver framework MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The code is unused since 98e20e5e13d2 ("bpfilter: remove bpfilter"), therefore remove it. Signed-off-by: Thomas Weißschuh Signed-off-by: Daniel Borkmann Reviewed-by: Christoph Hellwig Reviewed-by: Christian Brauner Acked-by: "Eric W. 
Biederman" Link: https://lore.kernel.org/bpf/20250721-remove-usermode-driver-v1-2-0d0083334382@linutronix.de --- kernel/Makefile | 1 - kernel/bpf/preload/Kconfig | 4 - kernel/usermode_driver.c | 191 --------------------------------------------- 3 files changed, 196 deletions(-) delete mode 100644 kernel/usermode_driver.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 32e80dd626af..4332de7ffdee 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -12,7 +12,6 @@ obj-y = fork.o exec_domain.o panic.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o smpboot.o ucount.o regset.o ksyms_common.o -obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o obj-$(CONFIG_MULTIUSER) += groups.o obj-$(CONFIG_VHOST_TASK) += vhost_task.o diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig index f9b11d01c3b5..aef7b0bc96d6 100644 --- a/kernel/bpf/preload/Kconfig +++ b/kernel/bpf/preload/Kconfig @@ -1,8 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only -config USERMODE_DRIVER - bool - default n - menuconfig BPF_PRELOAD bool "Preload BPF file system with kernel specific program and map iterators" depends on BPF diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c deleted file mode 100644 index 8303f4c7ca71..000000000000 --- a/kernel/usermode_driver.c +++ /dev/null @@ -1,191 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * umd - User mode driver support - */ -#include -#include -#include -#include -#include -#include - -static struct vfsmount *blob_to_mnt(const void *data, size_t len, const char *name) -{ - struct file_system_type *type; - struct vfsmount *mnt; - struct file *file; - ssize_t written; - loff_t pos = 0; - - type = get_fs_type("tmpfs"); - if (!type) - return ERR_PTR(-ENODEV); - - mnt = kern_mount(type); - put_filesystem(type); - if (IS_ERR(mnt)) - return mnt; - - file = file_open_root_mnt(mnt, name, O_CREAT | O_WRONLY, 0700); - if (IS_ERR(file)) { - kern_unmount(mnt); - return ERR_CAST(file); - } - - written = kernel_write(file, data, len, &pos); - if (written != len) { - int err = written; - if (err >= 0) - err = -ENOMEM; - filp_close(file, NULL); - kern_unmount(mnt); - return ERR_PTR(err); - } - - fput(file); - - /* Flush delayed fput so exec can open the file read-only */ - flush_delayed_fput(); - task_work_run(); - return mnt; -} - -/** - * umd_load_blob - Remember a blob of bytes for fork_usermode_driver - * @info: information about usermode driver - * @data: a blob of bytes that can be executed as a file - * @len: The lentgh of the blob - * - */ -int umd_load_blob(struct umd_info *info, const void *data, size_t len) -{ - struct vfsmount *mnt; - - if (WARN_ON_ONCE(info->wd.dentry || info->wd.mnt)) - return -EBUSY; - - mnt = blob_to_mnt(data, len, info->driver_name); - if (IS_ERR(mnt)) - return PTR_ERR(mnt); - - info->wd.mnt = mnt; - info->wd.dentry = mnt->mnt_root; - return 0; -} -EXPORT_SYMBOL_GPL(umd_load_blob); - -/** - * umd_unload_blob - Disassociate @info from a previously loaded blob - * @info: information about usermode driver - * - */ -int umd_unload_blob(struct umd_info *info) -{ - if (WARN_ON_ONCE(!info->wd.mnt || - !info->wd.dentry || - info->wd.mnt->mnt_root != info->wd.dentry)) - return -EINVAL; - - kern_unmount(info->wd.mnt); - info->wd.mnt = NULL; - info->wd.dentry = NULL; - return 0; -} -EXPORT_SYMBOL_GPL(umd_unload_blob); - -static int umd_setup(struct subprocess_info *info, struct cred *new) -{ - struct umd_info *umd_info = info->data; - struct file *from_umh[2]; - struct file *to_umh[2]; - int err; - - /* 
create pipe to send data to umh */ - err = create_pipe_files(to_umh, 0); - if (err) - return err; - err = replace_fd(0, to_umh[0], 0); - fput(to_umh[0]); - if (err < 0) { - fput(to_umh[1]); - return err; - } - - /* create pipe to receive data from umh */ - err = create_pipe_files(from_umh, 0); - if (err) { - fput(to_umh[1]); - replace_fd(0, NULL, 0); - return err; - } - err = replace_fd(1, from_umh[1], 0); - fput(from_umh[1]); - if (err < 0) { - fput(to_umh[1]); - replace_fd(0, NULL, 0); - fput(from_umh[0]); - return err; - } - - set_fs_pwd(current->fs, &umd_info->wd); - umd_info->pipe_to_umh = to_umh[1]; - umd_info->pipe_from_umh = from_umh[0]; - umd_info->tgid = get_pid(task_tgid(current)); - return 0; -} - -static void umd_cleanup(struct subprocess_info *info) -{ - struct umd_info *umd_info = info->data; - - /* cleanup if umh_setup() was successful but exec failed */ - if (info->retval) - umd_cleanup_helper(umd_info); -} - -/** - * umd_cleanup_helper - release the resources which were allocated in umd_setup - * @info: information about usermode driver - */ -void umd_cleanup_helper(struct umd_info *info) -{ - fput(info->pipe_to_umh); - fput(info->pipe_from_umh); - put_pid(info->tgid); - info->tgid = NULL; -} -EXPORT_SYMBOL_GPL(umd_cleanup_helper); - -/** - * fork_usermode_driver - fork a usermode driver - * @info: information about usermode driver (shouldn't be NULL) - * - * Returns either negative error or zero which indicates success in - * executing a usermode driver. In such case 'struct umd_info *info' - * is populated with two pipes and a tgid of the process. The caller is - * responsible for health check of the user process, killing it via - * tgid, and closing the pipes when user process is no longer needed. - */ -int fork_usermode_driver(struct umd_info *info) -{ - struct subprocess_info *sub_info; - const char *argv[] = { info->driver_name, NULL }; - int err; - - if (WARN_ON_ONCE(info->tgid)) - return -EBUSY; - - err = -ENOMEM; - sub_info = call_usermodehelper_setup(info->driver_name, - (char **)argv, NULL, GFP_KERNEL, - umd_setup, umd_cleanup, info); - if (!sub_info) - goto out; - - err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); -out: - return err; -} -EXPORT_SYMBOL_GPL(fork_usermode_driver); - - -- cgit v1.2.3 From 3ba58312e65665e5b9097c7969a51fa49914d85d Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 24 Jul 2025 12:02:53 +0000 Subject: bpf: Move bpf_jit_get_prog_name() to core.c bpf_jit_get_prog_name() will be used by all JITs when enabling support for private stack. This function is currently implemented in the x86 JIT. Move the function to core.c so that other JITs can easily use it in their implementation of private stack. 
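A rough usage sketch (hypothetical helper, not part of this patch): with the function available from core.c, an architecture JIT could, for instance, name the offending program when reporting a private-stack problem.

	/* Hypothetical arch-side helper, for illustration only. */
	static void report_priv_stack_overflow(struct bpf_prog *prog)
	{
		pr_err("BPF program %s overflowed its private stack\n",
		       bpf_jit_get_prog_name(prog));
	}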
Signed-off-by: Puranjay Mohan Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20250724120257.7299-2-puranjay@kernel.org --- kernel/bpf/core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 61613785bdd0..29c0225c14aa 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1297,6 +1297,13 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog, return 0; } +const char *bpf_jit_get_prog_name(struct bpf_prog *prog) +{ + if (prog->aux->ksym.prog) + return prog->aux->ksym.name; + return prog->aux->name; +} + static int bpf_jit_blind_insn(const struct bpf_insn *from, const struct bpf_insn *aux, struct bpf_insn *to_buff, -- cgit v1.2.3 From 6676fd3c99b016b9102c584e8b6dfcfad4fa059e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 26 Jul 2025 02:35:49 -0700 Subject: kstack_erase: Add -mgeneral-regs-only to silence Clang warnings Once CONFIG_KSTACK_ERASE is enabled with Clang on i386, the build warns: kernel/kstack_erase.c:168:2: warning: function with attribute 'no_caller_saved_registers' should only call a function with attribute 'no_caller_saved_registers' or be compiled with '-mgeneral-regs-only' [-Wexcessive-regsave] Add -mgeneral-regs-only for the kstack_erase handler, to make Clang feel better (it is effectively a no-op flag for the kernel). No binary changes encountered. Build & boot tested with Clang 21 on x86_64, and i386. Build tested with GCC 14.2.0 on x86_64, i386, arm64, and arm. Reported-by: Nathan Chancellor Closes: https://lore.kernel.org/all/20250726004313.GA3650901@ax162 Tested-by: Nathan Chancellor Signed-off-by: Kees Cook --- kernel/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index e4f01f1d4d0c..0ee9afd8b7cf 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -140,6 +140,7 @@ obj-$(CONFIG_RESOURCE_KUNIT_TEST) += resource_kunit.o obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o CFLAGS_kstack_erase.o += $(DISABLE_KSTACK_ERASE) +CFLAGS_kstack_erase.o += $(call cc-option,-mgeneral-regs-only) obj-$(CONFIG_KSTACK_ERASE) += kstack_erase.o KASAN_SANITIZE_kstack_erase.o := n KCSAN_SANITIZE_kstack_erase.o := n -- cgit v1.2.3 From 5345e64760d37524d38ddfa7471f42ec64b0f289 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Thu, 24 Jul 2025 19:42:52 +0200 Subject: bpf: Simplify bounds refinement from s32 During the bounds refinement, we improve the precision of various ranges by looking at other ranges. Among others, we improve the following in this order (other things happen between 1 and 2): 1. Improve u32 from s32 in __reg32_deduce_bounds. 2. Improve s/u64 from u32 in __reg_deduce_mixed_bounds. 3. Improve s/u64 from s32 in __reg_deduce_mixed_bounds. In particular, if the s32 range forms a valid u32 range, we will use it to improve the u32 range in __reg32_deduce_bounds. In __reg_deduce_mixed_bounds, under the same condition, we will use the s32 range to improve the s/u64 ranges. If at (1) we were able to learn from s32 to improve u32, we'll then be able to use that in (2) to improve s/u64. Hence, as (3) happens under the same precondition as (1), it won't improve s/u64 ranges further than (1)+(2) did. Thus, we can get rid of (3). In addition to the extensive suite of selftests for bounds refinement, this patch was also tested with the Agni formal verification tool [1]. 
Additionally, Eduard mentioned: The argument appears to be as follows: Under precondition `(u32)reg->s32_min <= (u32)reg->s32_max` __reg32_deduce_bounds produces: reg->u32_min = max_t(u32, reg->s32_min, reg->u32_min); reg->u32_max = min_t(u32, reg->s32_max, reg->u32_max); And then first part of __reg_deduce_mixed_bounds assigns: a. reg->umin umax= (reg->umin & ~0xffffffffULL) | max_t(u32, reg->s32_min, reg->u32_min); b. reg->umax umin= (reg->umax & ~0xffffffffULL) | min_t(u32, reg->s32_max, reg->u32_max); And then second part of __reg_deduce_mixed_bounds assigns: c. reg->umin umax= (reg->umin & ~0xffffffffULL) | (u32)reg->s32_min; d. reg->umax umin= (reg->umax & ~0xffffffffULL) | (u32)reg->s32_max; But assignment (c) is a noop because: max_t(u32, reg->s32_min, reg->u32_min) >= (u32)reg->s32_min Hence RHS(a) >= RHS(c) and umin= does nothing. Also assignment (d) is a noop because: min_t(u32, reg->s32_max, reg->u32_max) <= (u32)reg->s32_max Hence RHS(b) <= RHS(d) and umin= does nothing. Plus the same reasoning for the part dealing with reg->s{min,max}_value: e. reg->smin_value smax= (reg->smin_value & ~0xffffffffULL) | max_t(u32, reg->s32_min_value, reg->u32_min_value); f. reg->smax_value smin= (reg->smax_value & ~0xffffffffULL) | min_t(u32, reg->s32_max_value, reg->u32_max_value); vs g. reg->smin_value smax= (reg->smin_value & ~0xffffffffULL) | (u32)reg->s32_min_value; h. reg->smax_value smin= (reg->smax_value & ~0xffffffffULL) | (u32)reg->s32_max_value; RHS(e) >= RHS(g) and RHS(f) <= RHS(h), hence smax=,smin= do nothing. This appears to be correct. Also, Shung-Hsi: Beside going through the reasoning, I also played with CBMC a bit to double check that as far as a single run of __reg_deduce_bounds() is concerned (and that the register state matches certain handwavy expectations), the change indeed still preserve the original behavior. Signed-off-by: Paul Chaignon Signed-off-by: Daniel Borkmann Reviewed-by: Shung-Hsi Yu Acked-by: Eduard Zingerman Link: https://github.com/bpfverif/agni [1] Link: https://lore.kernel.org/bpf/aIJwnFnFyUjNsCNa@mail.gmail.com --- kernel/bpf/verifier.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e2fcea860755..d218516c3b33 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2554,20 +2554,6 @@ static void __reg_deduce_mixed_bounds(struct bpf_reg_state *reg) reg->smin_value = max_t(s64, reg->smin_value, new_smin); reg->smax_value = min_t(s64, reg->smax_value, new_smax); - /* if s32 can be treated as valid u32 range, we can use it as well */ - if ((u32)reg->s32_min_value <= (u32)reg->s32_max_value) { - /* s32 -> u64 tightening */ - new_umin = (reg->umin_value & ~0xffffffffULL) | (u32)reg->s32_min_value; - new_umax = (reg->umax_value & ~0xffffffffULL) | (u32)reg->s32_max_value; - reg->umin_value = max_t(u64, reg->umin_value, new_umin); - reg->umax_value = min_t(u64, reg->umax_value, new_umax); - /* s32 -> s64 tightening */ - new_smin = (reg->smin_value & ~0xffffffffULL) | (u32)reg->s32_min_value; - new_smax = (reg->smax_value & ~0xffffffffULL) | (u32)reg->s32_max_value; - reg->smin_value = max_t(s64, reg->smin_value, new_smin); - reg->smax_value = min_t(s64, reg->smax_value, new_smax); - } - /* Here we would like to handle a special case after sign extending load, * when upper bits for a 64-bit range are all 1s or all 0s. 
* -- cgit v1.2.3 From e82aea50fe0600da176b2e50a6213f6057b719f9 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Sun, 27 Jul 2025 19:31:12 +0200 Subject: rv: Fix wrong type cast in monitors_show() Argument 'p' of monitors_show() is not a pointer to struct rv_monitor, it is actually a pointer to the list_head inside struct rv_monitor. Therefore it is wrong to cast 'p' to struct rv_monitor *. This wrong type cast has been there since the beginning. But it still worked because the list_head was the first field in struct rv_monitor_def. This is no longer true since commit 24cbfe18d55a ("rv: Merge struct rv_monitor_def into struct rv_monitor") moved the list_head, and this wrong type cast became a functional problem. Properly use container_of() instead. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/35e49e97696007919ceacf73796487a2e15a3d02.1753625621.git.namcao@linutronix.de Fixes: 24cbfe18d55a ("rv: Merge struct rv_monitor_def into struct rv_monitor") Signed-off-by: Nam Cao Reviewed-by: Gabriele Monaco Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/rv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c index 6c8498743b98..bd7d56dbf6c2 100644 --- a/kernel/trace/rv/rv.c +++ b/kernel/trace/rv/rv.c @@ -458,7 +458,7 @@ out_remove_root: */ static int monitors_show(struct seq_file *m, void *p) { - struct rv_monitor *mon = p; + struct rv_monitor *mon = container_of(p, struct rv_monitor, list); if (mon->parent) seq_printf(m, "%s:%s\n", mon->parent->name, mon->name); -- cgit v1.2.3 From 3cfb9c1a7db091f9030826b897d07ddce1591e7f Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Sun, 27 Jul 2025 19:31:13 +0200 Subject: rv: Fix wrong type cast in reactors_show() and monitor_reactor_show() Argument 'p' of reactors_show() and monitor_reactor_show() is not a pointer to struct rv_reactor, it is actually a pointer to the list_head inside struct rv_reactor. Therefore it's wrong to cast 'p' to struct rv_reactor *. This wrong type cast has been there since the beginning. But it still worked because the list_head was the first field in struct rv_reactor_def. This is no longer true since commit 3d3c376118b5 ("rv: Merge struct rv_reactor_def into struct rv_reactor") moved the list_head, and this wrong type cast became a functional problem. Properly use container_of() instead. 
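In outline (struct layout shown purely for illustration): the seq_file iterator hands monitors_show() a pointer to the list_head embedded in struct rv_monitor, so a plain cast was only ever correct by accident, while that member sat at offset zero.

	/* Layout is illustrative only. */
	struct rv_monitor {
		const char *name;
		struct list_head list;	/* no longer the first member */
		/* ... */
	};

	struct rv_monitor *mon;

	/* Wrong: only happened to work while 'list' sat at offset 0. */
	mon = (struct rv_monitor *)p;

	/* Right: recovers the containing structure wherever 'list' lives. */
	mon = container_of(p, struct rv_monitor, list);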
Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Gabriele Monaco Link: https://lore.kernel.org/b4febbd6844311209e4c8768b65d508b81bd8c9b.1753625621.git.namcao@linutronix.de Fixes: 3d3c376118b5 ("rv: Merge struct rv_reactor_def into struct rv_reactor") Signed-off-by: Nam Cao Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/rv_reactors.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c index 106f2c4740f2..d32859fec238 100644 --- a/kernel/trace/rv/rv_reactors.c +++ b/kernel/trace/rv/rv_reactors.c @@ -86,7 +86,7 @@ static struct rv_reactor *get_reactor_rdef_by_name(char *name) */ static int reactors_show(struct seq_file *m, void *p) { - struct rv_reactor *reactor = p; + struct rv_reactor *reactor = container_of(p, struct rv_reactor, list); seq_printf(m, "%s\n", reactor->name); return 0; @@ -139,7 +139,7 @@ static const struct file_operations available_reactors_ops = { static int monitor_reactor_show(struct seq_file *m, void *p) { struct rv_monitor *mon = m->private; - struct rv_reactor *reactor = p; + struct rv_reactor *reactor = container_of(p, struct rv_reactor, list); if (mon->reactor == reactor) seq_printf(m, "[%s]\n", reactor->name); -- cgit v1.2.3 From 00bf8d0c6c9be0c481fc45a3f7d87c7f8812f229 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Mon, 28 Jul 2025 11:50:53 +0200 Subject: bpf: Improve bounds when s64 crosses sign boundary __reg64_deduce_bounds currently improves the s64 range using the u64 range and vice versa, but only if it doesn't cross the sign boundary. This patch improves __reg64_deduce_bounds to cover the case where the s64 range crosses the sign boundary but overlaps with the u64 range on only one end. In that case, we can improve both ranges. Consider the following example, with the s64 range crossing the sign boundary: 0 U64_MAX | [xxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxx] | |----------------------------|----------------------------| |xxxxx s64 range xxxxxxxxx] [xxxxxxx| 0 S64_MAX S64_MIN -1 The u64 range overlaps only with positive portion of the s64 range. We can thus derive the following new s64 and u64 ranges. 0 U64_MAX | [xxxxxx u64 range xxxxx] | |----------------------------|----------------------------| | [xxxxxx s64 range xxxxx] | 0 S64_MAX S64_MIN -1 The same logic can probably apply to the s32/u32 ranges, but this patch doesn't implement that change. In addition to the selftests, the __reg64_deduce_bounds change was also tested with Agni, the formal verification tool for the range analysis [1]. Link: https://github.com/bpfverif/agni [1] Acked-by: Eduard Zingerman Acked-by: Shung-Hsi Yu Signed-off-by: Paul Chaignon Link: https://lore.kernel.org/r/933bd9ce1f36ded5559f92fdc09e5dbc823fa245.1753695655.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d218516c3b33..251e06dc07eb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2523,6 +2523,58 @@ static void __reg64_deduce_bounds(struct bpf_reg_state *reg) if ((u64)reg->smin_value <= (u64)reg->smax_value) { reg->umin_value = max_t(u64, reg->smin_value, reg->umin_value); reg->umax_value = min_t(u64, reg->smax_value, reg->umax_value); + } else { + /* If the s64 range crosses the sign boundary, then it's split + * between the beginning and end of the U64 domain. 
In that + * case, we can derive new bounds if the u64 range overlaps + * with only one end of the s64 range. + * + * In the following example, the u64 range overlaps only with + * positive portion of the s64 range. + * + * 0 U64_MAX + * | [xxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxx] | + * |----------------------------|----------------------------| + * |xxxxx s64 range xxxxxxxxx] [xxxxxxx| + * 0 S64_MAX S64_MIN -1 + * + * We can thus derive the following new s64 and u64 ranges. + * + * 0 U64_MAX + * | [xxxxxx u64 range xxxxx] | + * |----------------------------|----------------------------| + * | [xxxxxx s64 range xxxxx] | + * 0 S64_MAX S64_MIN -1 + * + * If they overlap in two places, we can't derive anything + * because reg_state can't represent two ranges per numeric + * domain. + * + * 0 U64_MAX + * | [xxxxxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxxxxx] | + * |----------------------------|----------------------------| + * |xxxxx s64 range xxxxxxxxx] [xxxxxxxxxx| + * 0 S64_MAX S64_MIN -1 + * + * The first condition below corresponds to the first diagram + * above. + */ + if (reg->umax_value < (u64)reg->smin_value) { + reg->smin_value = (s64)reg->umin_value; + reg->umax_value = min_t(u64, reg->umax_value, reg->smax_value); + } else if ((u64)reg->smax_value < reg->umin_value) { + /* This second condition considers the case where the u64 range + * overlaps with the negative portion of the s64 range: + * + * 0 U64_MAX + * | [xxxxxxxxxxxxxx u64 range xxxxxxxxxxxxxx] | + * |----------------------------|----------------------------| + * |xxxxxxxxx] [xxxxxxxxxxxx s64 range | + * 0 S64_MAX S64_MIN -1 + */ + reg->smax_value = (s64)reg->umax_value; + reg->umin_value = max_t(u64, reg->umin_value, reg->smin_value); + } } } -- cgit v1.2.3 From 5dbb19b16ac498b0b7f3a8a85f9d25d6d8af397d Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Mon, 28 Jul 2025 11:52:00 +0200 Subject: bpf: Add third round of bounds deduction Commit d7f008738171 ("bpf: try harder to deduce register bounds from different numeric domains") added a second call to __reg_deduce_bounds in reg_bounds_sync because a single call wasn't enough to converge to a fixed point in terms of register bounds. With patch "bpf: Improve bounds when s64 crosses sign boundary" from this series, Eduard noticed that calling __reg_deduce_bounds twice isn't enough anymore to converge. The first selftest added in "selftests/bpf: Test cross-sign 64bits range refinement" highlights the need for a third call to __reg_deduce_bounds. 
After instruction 7, reg_bounds_sync performs the following bounds deduction: reg_bounds_sync entry: scalar(smin=-655,smax=0xeffffeee,smin32=-783,smax32=-146) __update_reg_bounds: scalar(smin=-655,smax=0xeffffeee,smin32=-783,smax32=-146) __reg_deduce_bounds: __reg32_deduce_bounds: scalar(smin=-655,smax=0xeffffeee,smin32=-783,smax32=-146,umin32=0xfffffcf1,umax32=0xffffff6e) __reg64_deduce_bounds: scalar(smin=-655,smax=0xeffffeee,smin32=-783,smax32=-146,umin32=0xfffffcf1,umax32=0xffffff6e) __reg_deduce_mixed_bounds: scalar(smin=-655,smax=0xeffffeee,umin=umin32=0xfffffcf1,umax=0xffffffffffffff6e,smin32=-783,smax32=-146,umax32=0xffffff6e) __reg_deduce_bounds: __reg32_deduce_bounds: scalar(smin=-655,smax=0xeffffeee,umin=umin32=0xfffffcf1,umax=0xffffffffffffff6e,smin32=-783,smax32=-146,umax32=0xffffff6e) __reg64_deduce_bounds: scalar(smin=-655,smax=smax32=-146,umin=0xfffffffffffffd71,umax=0xffffffffffffff6e,smin32=-783,umin32=0xfffffcf1,umax32=0xffffff6e) __reg_deduce_mixed_bounds: scalar(smin=-655,smax=smax32=-146,umin=0xfffffffffffffd71,umax=0xffffffffffffff6e,smin32=-783,umin32=0xfffffcf1,umax32=0xffffff6e) __reg_bound_offset: scalar(smin=-655,smax=smax32=-146,umin=0xfffffffffffffd71,umax=0xffffffffffffff6e,smin32=-783,umin32=0xfffffcf1,umax32=0xffffff6e,var_off=(0xfffffffffffffc00; 0x3ff)) __update_reg_bounds: scalar(smin=-655,smax=smax32=-146,umin=0xfffffffffffffd71,umax=0xffffffffffffff6e,smin32=-783,umin32=0xfffffcf1,umax32=0xffffff6e,var_off=(0xfffffffffffffc00; 0x3ff)) In particular, notice how: 1. In the first call to __reg_deduce_bounds, __reg32_deduce_bounds learns new u32 bounds. 2. __reg64_deduce_bounds is unable to improve bounds at this point. 3. __reg_deduce_mixed_bounds derives new u64 bounds from the u32 bounds. 4. In the second call to __reg_deduce_bounds, __reg64_deduce_bounds improves the smax and umin bounds thanks to patch "bpf: Improve bounds when s64 crosses sign boundary" from this series. 5. Subsequent functions are unable to improve the ranges further (only tnums). Yet, a better smin32 bound could be learned from the smin bound. __reg32_deduce_bounds is able to improve smin32 from smin, but for that we need a third call to __reg_deduce_bounds. As discussed in [1], there may be a better way to organize the deduction rules to learn the same information with less calls to the same functions. Such an optimization requires further analysis and is orthogonal to the present patchset. Link: https://lore.kernel.org/bpf/aIKtSK9LjQXB8FLY@mail.gmail.com/ [1] Acked-by: Eduard Zingerman Co-developed-by: Eduard Zingerman Signed-off-by: Eduard Zingerman Signed-off-by: Paul Chaignon Link: https://lore.kernel.org/r/79619d3b42e5525e0e174ed534b75879a5ba15de.1753695655.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 251e06dc07eb..72e3f2b03349 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2672,6 +2672,7 @@ static void reg_bounds_sync(struct bpf_reg_state *reg) /* We might have learned something about the sign bit. */ __reg_deduce_bounds(reg); __reg_deduce_bounds(reg); + __reg_deduce_bounds(reg); /* We might have learned some bits from the bounds. 
*/ __reg_bound_offset(reg); /* Intersecting with the old var_off might have improved our bounds -- cgit v1.2.3 From 5b4c54ac49af7f486806d79e3233fc8a9363961c Mon Sep 17 00:00:00 2001 From: Suchit Karunakaran Date: Sun, 27 Jul 2025 13:47:54 +0530 Subject: bpf: Fix various typos in verifier.c comments This patch fixes several minor typos in comments within the BPF verifier. No changes in functionality. Signed-off-by: Suchit Karunakaran Link: https://lore.kernel.org/r/20250727081754.15986-1-suchitkarunakaran@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 72e3f2b03349..e50d3d43be67 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4557,7 +4557,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * . if (scalar cond K|scalar) * . helper_call(.., scalar, ...) where ARG_CONST is expected * backtrack through the verifier states and mark all registers and - * stack slots with spilled constants that these scalar regisers + * stack slots with spilled constants that these scalar registers * should be precise. * . during state pruning two registers (or spilled stack slots) * are equivalent if both are not precise. @@ -18489,7 +18489,7 @@ static void clean_verifier_state(struct bpf_verifier_env *env, /* the parentage chains form a tree. * the verifier states are added to state lists at given insn and * pushed into state stack for future exploration. - * when the verifier reaches bpf_exit insn some of the verifer states + * when the verifier reaches bpf_exit insn some of the verifier states * stored in the state lists have their final liveness state already, * but a lot of states will get revised from liveness point of view when * the verifier explores other branches. @@ -19205,7 +19205,7 @@ static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx) * terminology) calls specially: as opposed to bounded BPF loops, it *expects* * states to match, which otherwise would look like an infinite loop. So while * iter_next() calls are taken care of, we still need to be careful and - * prevent erroneous and too eager declaration of "ininite loop", when + * prevent erroneous and too eager declaration of "infinite loop", when * iterators are involved. * * Here's a situation in pseudo-BPF assembly form: @@ -19247,7 +19247,7 @@ static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx) * * This approach allows to keep infinite loop heuristic even in the face of * active iterator. E.g., C snippet below is and will be detected as - * inifintely looping: + * infinitely looping: * * struct bpf_iter_num it; * int *p, x; @@ -24488,7 +24488,7 @@ static int compute_scc(struct bpf_verifier_env *env) * if pre[i] == 0: * recur(i) * - * Below implementation replaces explicit recusion with array 'dfs'. + * Below implementation replaces explicit recursion with array 'dfs'. */ for (i = 0; i < insn_cnt; i++) { if (pre[i]) -- cgit v1.2.3 From 7b70ac4cad2b20eaf415276bbaa0d9df9abb428c Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Mon, 28 Jul 2025 15:50:14 +0200 Subject: rv: Remove trailing whitespace from tracepoint string RV event tracepoints print a line with the format: "event_xyz: S0 x event -> S1 " "event_xyz: S1 x event -> S0 (final)" While printing an event leading to a non-final state, the line has a trailing white space (visible above before the closing "). 
Adapt the format string not to print the trailing whitespace if we are not printing "(final)". Cc: Masami Hiramatsu Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Tomas Glozar Cc: Juri Lelli Cc: Clark Williams Cc: John Kacur Link: https://lore.kernel.org/20250728135022.255578-3-gmonaco@redhat.com Reviewed-by: Nam Cao Signed-off-by: Gabriele Monaco Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/rv_trace.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h index b6f310498466..17ba07329b67 100644 --- a/kernel/trace/rv/rv_trace.h +++ b/kernel/trace/rv/rv_trace.h @@ -29,11 +29,11 @@ DECLARE_EVENT_CLASS(event_da_monitor, __entry->final_state = final_state; ), - TP_printk("%s x %s -> %s %s", + TP_printk("%s x %s -> %s%s", __entry->state, __entry->event, __entry->next_state, - __entry->final_state ? "(final)" : "") + __entry->final_state ? " (final)" : "") ); DECLARE_EVENT_CLASS(error_da_monitor, @@ -90,12 +90,12 @@ DECLARE_EVENT_CLASS(event_da_monitor_id, __entry->final_state = final_state; ), - TP_printk("%d: %s x %s -> %s %s", + TP_printk("%d: %s x %s -> %s%s", __entry->id, __entry->state, __entry->event, __entry->next_state, - __entry->final_state ? "(final)" : "") + __entry->final_state ? " (final)" : "") ); DECLARE_EVENT_CLASS(error_da_monitor_id, -- cgit v1.2.3 From 7f904ff6e58d398c4336f3c19c42b338324451f7 Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Mon, 28 Jul 2025 15:50:15 +0200 Subject: rv: Use strings in da monitors tracepoints Using DA monitors tracepoints with KASAN enabled triggers the following warning: BUG: KASAN: global-out-of-bounds in do_trace_event_raw_event_event_da_monitor+0xd6/0x1a0 Read of size 32 at addr ffffffffaada8980 by task ... Call Trace: [...] do_trace_event_raw_event_event_da_monitor+0xd6/0x1a0 ? __pfx_do_trace_event_raw_event_event_da_monitor+0x10/0x10 ? trace_event_sncid+0x83/0x200 trace_event_sncid+0x163/0x200 [...] The buggy address belongs to the variable: automaton_snep+0x4e0/0x5e0 This is caused by the tracepoints reading 32 bytes __array instead of __string from the automata definition. Such strings are literals and reading 32 bytes ends up in out of bound memory accesses (e.g. the next automaton's data in this case). The error is harmless as, while printing the string, we stop at the null terminator, but it should still be fixed. Use the __string facilities while defining the tracepoints to avoid reading out of bound memory. 
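The harmless-but-real out-of-bounds read is easy to reproduce in userspace; a minimal sketch (names illustrative; compiled with -fsanitize=address, the first memcpy() aborts with a global-buffer-overflow report analogous to the KASAN splat above):

#include <stdio.h>
#include <string.h>

#define MAX_DA_NAME_LEN 32

int main(void)
{
	const char *state = "can_sched";	/* 10-byte string literal */
	char buf[MAX_DA_NAME_LEN];

	/* Old __array/memcpy pattern: always copies 32 bytes, reading
	 * well past the end of the short literal. */
	memcpy(buf, state, MAX_DA_NAME_LEN);
	printf("%s\n", buf);	/* still prints fine: stops at the NUL */

	/* What __string effectively does: copy only strlen() + 1 bytes. */
	memcpy(buf, state, strlen(state) + 1);
	printf("%s\n", buf);
	return 0;
}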
Cc: Masami Hiramatsu Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Tomas Glozar Cc: Juri Lelli Cc: Clark Williams Cc: John Kacur Link: https://lore.kernel.org/20250728135022.255578-4-gmonaco@redhat.com Fixes: 792575348ff7 ("rv/include: Add deterministic automata monitor definition via C macros") Reviewed-by: Nam Cao Signed-off-by: Gabriele Monaco Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/rv_trace.h | 76 +++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h index 17ba07329b67..d38e0d3abdfd 100644 --- a/kernel/trace/rv/rv_trace.h +++ b/kernel/trace/rv/rv_trace.h @@ -16,23 +16,23 @@ DECLARE_EVENT_CLASS(event_da_monitor, TP_ARGS(state, event, next_state, final_state), TP_STRUCT__entry( - __array( char, state, MAX_DA_NAME_LEN ) - __array( char, event, MAX_DA_NAME_LEN ) - __array( char, next_state, MAX_DA_NAME_LEN ) - __field( bool, final_state ) + __string( state, state ) + __string( event, event ) + __string( next_state, next_state ) + __field( bool, final_state ) ), TP_fast_assign( - memcpy(__entry->state, state, MAX_DA_NAME_LEN); - memcpy(__entry->event, event, MAX_DA_NAME_LEN); - memcpy(__entry->next_state, next_state, MAX_DA_NAME_LEN); - __entry->final_state = final_state; + __assign_str(state); + __assign_str(event); + __assign_str(next_state); + __entry->final_state = final_state; ), TP_printk("%s x %s -> %s%s", - __entry->state, - __entry->event, - __entry->next_state, + __get_str(state), + __get_str(event), + __get_str(next_state), __entry->final_state ? " (final)" : "") ); @@ -43,18 +43,18 @@ DECLARE_EVENT_CLASS(error_da_monitor, TP_ARGS(state, event), TP_STRUCT__entry( - __array( char, state, MAX_DA_NAME_LEN ) - __array( char, event, MAX_DA_NAME_LEN ) + __string( state, state ) + __string( event, event ) ), TP_fast_assign( - memcpy(__entry->state, state, MAX_DA_NAME_LEN); - memcpy(__entry->event, event, MAX_DA_NAME_LEN); + __assign_str(state); + __assign_str(event); ), TP_printk("event %s not expected in the state %s", - __entry->event, - __entry->state) + __get_str(event), + __get_str(state)) ); #include @@ -75,26 +75,26 @@ DECLARE_EVENT_CLASS(event_da_monitor_id, TP_ARGS(id, state, event, next_state, final_state), TP_STRUCT__entry( - __field( int, id ) - __array( char, state, MAX_DA_NAME_LEN ) - __array( char, event, MAX_DA_NAME_LEN ) - __array( char, next_state, MAX_DA_NAME_LEN ) - __field( bool, final_state ) + __field( int, id ) + __string( state, state ) + __string( event, event ) + __string( next_state, next_state ) + __field( bool, final_state ) ), TP_fast_assign( - memcpy(__entry->state, state, MAX_DA_NAME_LEN); - memcpy(__entry->event, event, MAX_DA_NAME_LEN); - memcpy(__entry->next_state, next_state, MAX_DA_NAME_LEN); - __entry->id = id; - __entry->final_state = final_state; + __assign_str(state); + __assign_str(event); + __assign_str(next_state); + __entry->id = id; + __entry->final_state = final_state; ), TP_printk("%d: %s x %s -> %s%s", __entry->id, - __entry->state, - __entry->event, - __entry->next_state, + __get_str(state), + __get_str(event), + __get_str(next_state), __entry->final_state ? 
" (final)" : "") ); @@ -105,21 +105,21 @@ DECLARE_EVENT_CLASS(error_da_monitor_id, TP_ARGS(id, state, event), TP_STRUCT__entry( - __field( int, id ) - __array( char, state, MAX_DA_NAME_LEN ) - __array( char, event, MAX_DA_NAME_LEN ) + __field( int, id ) + __string( state, state ) + __string( event, event ) ), TP_fast_assign( - memcpy(__entry->state, state, MAX_DA_NAME_LEN); - memcpy(__entry->event, event, MAX_DA_NAME_LEN); - __entry->id = id; + __assign_str(state); + __assign_str(event); + __entry->id = id; ), TP_printk("%d: event %s not expected in the state %s", __entry->id, - __entry->event, - __entry->state) + __get_str(event), + __get_str(state)) ); #include -- cgit v1.2.3 From 79de661707a4a2dc695fd3e00529a14b4f5ec50d Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Mon, 28 Jul 2025 15:50:16 +0200 Subject: rv: Adjust monitor dependencies RV monitors relying on the preemptirqs tracepoints are set as dependent on PREEMPT_TRACER and IRQSOFF_TRACER. In fact, those configurations do enable the tracepoints but are not the minimal configurations enabling them, which are TRACE_PREEMPT_TOGGLE and TRACE_IRQFLAGS (not selectable manually). Set TRACE_PREEMPT_TOGGLE and TRACE_IRQFLAGS as dependencies for monitors. Cc: Masami Hiramatsu Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Tomas Glozar Cc: Juri Lelli Cc: Clark Williams Cc: John Kacur Link: https://lore.kernel.org/20250728135022.255578-5-gmonaco@redhat.com Fixes: fbe6c09b7eb4 ("rv: Add scpd, snep and sncid per-cpu monitors") Acked-by: Nam Cao Signed-off-by: Gabriele Monaco Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/monitors/scpd/Kconfig | 2 +- kernel/trace/rv/monitors/sncid/Kconfig | 2 +- kernel/trace/rv/monitors/snep/Kconfig | 2 +- kernel/trace/rv/monitors/wip/Kconfig | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/rv/monitors/scpd/Kconfig b/kernel/trace/rv/monitors/scpd/Kconfig index b9114fbf680f..682d0416188b 100644 --- a/kernel/trace/rv/monitors/scpd/Kconfig +++ b/kernel/trace/rv/monitors/scpd/Kconfig @@ -2,7 +2,7 @@ # config RV_MON_SCPD depends on RV - depends on PREEMPT_TRACER + depends on TRACE_PREEMPT_TOGGLE depends on RV_MON_SCHED default y select DA_MON_EVENTS_IMPLICIT diff --git a/kernel/trace/rv/monitors/sncid/Kconfig b/kernel/trace/rv/monitors/sncid/Kconfig index 76bcfef4fd10..3a5639feaaaf 100644 --- a/kernel/trace/rv/monitors/sncid/Kconfig +++ b/kernel/trace/rv/monitors/sncid/Kconfig @@ -2,7 +2,7 @@ # config RV_MON_SNCID depends on RV - depends on IRQSOFF_TRACER + depends on TRACE_IRQFLAGS depends on RV_MON_SCHED default y select DA_MON_EVENTS_IMPLICIT diff --git a/kernel/trace/rv/monitors/snep/Kconfig b/kernel/trace/rv/monitors/snep/Kconfig index 77527f971232..7dd54f434ff7 100644 --- a/kernel/trace/rv/monitors/snep/Kconfig +++ b/kernel/trace/rv/monitors/snep/Kconfig @@ -2,7 +2,7 @@ # config RV_MON_SNEP depends on RV - depends on PREEMPT_TRACER + depends on TRACE_PREEMPT_TOGGLE depends on RV_MON_SCHED default y select DA_MON_EVENTS_IMPLICIT diff --git a/kernel/trace/rv/monitors/wip/Kconfig b/kernel/trace/rv/monitors/wip/Kconfig index e464b9294865..87a26195792b 100644 --- a/kernel/trace/rv/monitors/wip/Kconfig +++ b/kernel/trace/rv/monitors/wip/Kconfig @@ -2,7 +2,7 @@ # config RV_MON_WIP depends on RV - depends on PREEMPT_TRACER + depends on TRACE_PREEMPT_TOGGLE select DA_MON_EVENTS_IMPLICIT bool "wip monitor" help -- cgit v1.2.3 From 9d475d80c93735f0f336b34a8e2c22beea6145ab Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Mon, 28 Jul 2025 15:50:17 +0200 
Subject: rv: Retry when da monitor detects race conditions DA monitor can be accessed from multiple cores simultaneously, this is likely, for instance when dealing with per-task monitors reacting on events that do not always occur on the CPU where the task is running. This can cause race conditions where two events change the next state and we see inconsistent values. E.g.: [62] event_srs: 27: sleepable x sched_wakeup -> running (final) [63] event_srs: 27: sleepable x sched_set_state_sleepable -> sleepable [63] error_srs: 27: event sched_switch_suspend not expected in the state running In this case the monitor fails because the event on CPU 62 wins against the one on CPU 63, although the correct state should have been sleepable, since the task get suspended. Detect if the current state was modified by using try_cmpxchg while storing the next value. If it was, try again reading the current state. After a maximum number of failed retries, react by calling a special tracepoint, print on the console and reset the monitor. Remove the functions da_monitor_curr_state() and da_monitor_set_state() as they only hide the underlying implementation in this case. Monitors where this type of condition can occur must be able to account for racing events in any possible order, as we cannot know the winner. Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Tomas Glozar Cc: Juri Lelli Cc: Clark Williams Cc: John Kacur Cc: Peter Zijlstra Link: https://lore.kernel.org/20250728135022.255578-6-gmonaco@redhat.com Signed-off-by: Gabriele Monaco Reviewed-by: Nam Cao Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/Kconfig | 5 +++++ kernel/trace/rv/rv_trace.h | 24 ++++++++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index 26017378f79b..34164eb4ec91 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -3,12 +3,17 @@ config RV_MON_EVENTS bool +config RV_MON_MAINTENANCE_EVENTS + bool + config DA_MON_EVENTS_IMPLICIT select RV_MON_EVENTS + select RV_MON_MAINTENANCE_EVENTS bool config DA_MON_EVENTS_ID select RV_MON_EVENTS + select RV_MON_MAINTENANCE_EVENTS bool config LTL_MON_EVENTS_ID diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h index d38e0d3abdfd..3af46cd185b3 100644 --- a/kernel/trace/rv/rv_trace.h +++ b/kernel/trace/rv/rv_trace.h @@ -176,6 +176,30 @@ DECLARE_EVENT_CLASS(error_ltl_monitor_id, #include // Add new monitors based on CONFIG_LTL_MON_EVENTS_ID here #endif /* CONFIG_LTL_MON_EVENTS_ID */ + +#ifdef CONFIG_RV_MON_MAINTENANCE_EVENTS +/* Tracepoint useful for monitors development, currenly only used in DA */ +TRACE_EVENT(rv_retries_error, + + TP_PROTO(char *name, char *event), + + TP_ARGS(name, event), + + TP_STRUCT__entry( + __string( name, name ) + __string( event, event ) + ), + + TP_fast_assign( + __assign_str(name); + __assign_str(event); + ), + + TP_printk(__stringify(MAX_DA_RETRY_RACING_EVENTS) + " retries reached for event %s, resetting monitor %s", + __get_str(event), __get_str(name)) +); +#endif /* CONFIG_RV_MON_MAINTENANCE_EVENTS */ #endif /* _TRACE_RV_H */ /* This part must be outside protection */ -- cgit v1.2.3 From adcc3bfa8806761ac21aa271f78454113ec6936e Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Mon, 28 Jul 2025 15:50:18 +0200 Subject: sched: Adapt sched tracepoints for RV task model Add the following tracepoint: * sched_set_need_resched(tsk, cpu, tif) Called when a task is set the need resched [lazy] flag Remove the unused ip parameter from sched_entry and 
sched_exit and alter sched_entry to have a value of preempt consistent with the one used in sched_switch. Also adapt all monitors using sched_{entry,exit} to avoid breaking build. These tracepoints are useful to describe the Linux task model and are adapted from the patches by Daniel Bristot de Oliveira (https://bristot.me/linux-task-model/). Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Masami Hiramatsu Cc: Nam Cao Cc: Tomas Glozar Cc: Juri Lelli Cc: Clark Williams Cc: John Kacur Link: https://lore.kernel.org/20250728135022.255578-7-gmonaco@redhat.com Signed-off-by: Gabriele Monaco Signed-off-by: Steven Rostedt (Google) --- kernel/sched/core.c | 13 ++++++++++--- kernel/trace/rv/monitors/sco/sco.c | 4 ++-- kernel/trace/rv/monitors/scpd/scpd.c | 4 ++-- kernel/trace/rv/monitors/sncid/sncid.c | 4 ++-- kernel/trace/rv/monitors/snep/snep.c | 4 ++-- kernel/trace/rv/monitors/tss/tss.c | 4 ++-- 6 files changed, 20 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ec68fc686bd7..b485e0639616 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1110,6 +1110,7 @@ static void __resched_curr(struct rq *rq, int tif) cpu = cpu_of(rq); + trace_sched_set_need_resched_tp(curr, cpu, tif); if (cpu == smp_processor_id()) { set_ti_thread_flag(cti, tif); if (tif == TIF_NEED_RESCHED) @@ -1125,6 +1126,11 @@ static void __resched_curr(struct rq *rq, int tif) } } +void __trace_set_need_resched(struct task_struct *curr, int tif) +{ + trace_sched_set_need_resched_tp(curr, smp_processor_id(), tif); +} + void resched_curr(struct rq *rq) { __resched_curr(rq, TIF_NEED_RESCHED); @@ -5329,7 +5335,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) * switched the context for the first time. It is returning from * schedule for the first time in this path. 
*/ - trace_sched_exit_tp(true, CALLER_ADDR0); + trace_sched_exit_tp(true); preempt_enable(); if (current->set_child_tid) @@ -6678,7 +6684,8 @@ static void __sched notrace __schedule(int sched_mode) struct rq *rq; int cpu; - trace_sched_entry_tp(preempt, CALLER_ADDR0); + /* Trace preemptions consistently with task switches */ + trace_sched_entry_tp(sched_mode == SM_PREEMPT); cpu = smp_processor_id(); rq = cpu_rq(cpu); @@ -6793,7 +6800,7 @@ picked: __balance_callbacks(rq); raw_spin_rq_unlock_irq(rq); } - trace_sched_exit_tp(is_switch, CALLER_ADDR0); + trace_sched_exit_tp(is_switch); } void __noreturn do_task_dead(void) diff --git a/kernel/trace/rv/monitors/sco/sco.c b/kernel/trace/rv/monitors/sco/sco.c index 66f4639d46ac..04c36405e2e3 100644 --- a/kernel/trace/rv/monitors/sco/sco.c +++ b/kernel/trace/rv/monitors/sco/sco.c @@ -24,12 +24,12 @@ static void handle_sched_set_state(void *data, struct task_struct *tsk, int stat da_handle_start_event_sco(sched_set_state_sco); } -static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +static void handle_schedule_entry(void *data, bool preempt) { da_handle_event_sco(schedule_entry_sco); } -static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +static void handle_schedule_exit(void *data, bool is_switch) { da_handle_start_event_sco(schedule_exit_sco); } diff --git a/kernel/trace/rv/monitors/scpd/scpd.c b/kernel/trace/rv/monitors/scpd/scpd.c index 299703cd72b0..1e351ba52fee 100644 --- a/kernel/trace/rv/monitors/scpd/scpd.c +++ b/kernel/trace/rv/monitors/scpd/scpd.c @@ -30,12 +30,12 @@ static void handle_preempt_enable(void *data, unsigned long ip, unsigned long pa da_handle_start_event_scpd(preempt_enable_scpd); } -static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +static void handle_schedule_entry(void *data, bool preempt) { da_handle_event_scpd(schedule_entry_scpd); } -static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +static void handle_schedule_exit(void *data, bool is_switch) { da_handle_event_scpd(schedule_exit_scpd); } diff --git a/kernel/trace/rv/monitors/sncid/sncid.c b/kernel/trace/rv/monitors/sncid/sncid.c index 3e1ee715a0fb..c8491f426365 100644 --- a/kernel/trace/rv/monitors/sncid/sncid.c +++ b/kernel/trace/rv/monitors/sncid/sncid.c @@ -30,12 +30,12 @@ static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent da_handle_start_event_sncid(irq_enable_sncid); } -static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +static void handle_schedule_entry(void *data, bool preempt) { da_handle_start_event_sncid(schedule_entry_sncid); } -static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +static void handle_schedule_exit(void *data, bool is_switch) { da_handle_start_event_sncid(schedule_exit_sncid); } diff --git a/kernel/trace/rv/monitors/snep/snep.c b/kernel/trace/rv/monitors/snep/snep.c index 2adc3108d60c..558950f524a5 100644 --- a/kernel/trace/rv/monitors/snep/snep.c +++ b/kernel/trace/rv/monitors/snep/snep.c @@ -30,12 +30,12 @@ static void handle_preempt_enable(void *data, unsigned long ip, unsigned long pa da_handle_start_event_snep(preempt_enable_snep); } -static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +static void handle_schedule_entry(void *data, bool preempt) { da_handle_event_snep(schedule_entry_snep); } -static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +static void handle_schedule_exit(void *data, bool 
is_switch) { da_handle_start_event_snep(schedule_exit_snep); } diff --git a/kernel/trace/rv/monitors/tss/tss.c b/kernel/trace/rv/monitors/tss/tss.c index 0452fcd9edcf..95ebd15131f5 100644 --- a/kernel/trace/rv/monitors/tss/tss.c +++ b/kernel/trace/rv/monitors/tss/tss.c @@ -27,12 +27,12 @@ static void handle_sched_switch(void *data, bool preempt, da_handle_event_tss(sched_switch_tss); } -static void handle_schedule_entry(void *data, bool preempt, unsigned long ip) +static void handle_schedule_entry(void *data, bool preempt) { da_handle_event_tss(schedule_entry_tss); } -static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip) +static void handle_schedule_exit(void *data, bool is_switch) { da_handle_start_event_tss(schedule_exit_tss); } -- cgit v1.2.3 From d0096c2f9cfcb4ce385698491604610fcc1a53b3 Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Mon, 28 Jul 2025 15:50:19 +0200 Subject: rv: Replace tss and sncid monitors with more complete sts The tss monitor currently guarantees task switches can happen only while scheduling, whereas the sncid monitor enforces scheduling occurs with interrupt disabled. Replace the monitors with a more comprehensive specification which implies both but also ensures that: * each scheduler call disable interrupts to switch * each task switch happens with interrupts disabled Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Masami Hiramatsu Cc: Nam Cao Cc: Tomas Glozar Cc: Juri Lelli Cc: Clark Williams Cc: John Kacur Cc: Peter Zijlstra Link: https://lore.kernel.org/20250728135022.255578-8-gmonaco@redhat.com Signed-off-by: Gabriele Monaco Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/Kconfig | 3 +- kernel/trace/rv/Makefile | 3 +- kernel/trace/rv/monitors/sncid/Kconfig | 15 --- kernel/trace/rv/monitors/sncid/sncid.c | 95 ---------------- kernel/trace/rv/monitors/sncid/sncid.h | 49 --------- kernel/trace/rv/monitors/sncid/sncid_trace.h | 15 --- kernel/trace/rv/monitors/sts/Kconfig | 19 ++++ kernel/trace/rv/monitors/sts/sts.c | 156 +++++++++++++++++++++++++++ kernel/trace/rv/monitors/sts/sts.h | 117 ++++++++++++++++++++ kernel/trace/rv/monitors/sts/sts_trace.h | 15 +++ kernel/trace/rv/monitors/tss/Kconfig | 14 --- kernel/trace/rv/monitors/tss/tss.c | 90 ---------------- kernel/trace/rv/monitors/tss/tss.h | 47 -------- kernel/trace/rv/monitors/tss/tss_trace.h | 15 --- kernel/trace/rv/rv_trace.h | 3 +- 15 files changed, 310 insertions(+), 346 deletions(-) delete mode 100644 kernel/trace/rv/monitors/sncid/Kconfig delete mode 100644 kernel/trace/rv/monitors/sncid/sncid.c delete mode 100644 kernel/trace/rv/monitors/sncid/sncid.h delete mode 100644 kernel/trace/rv/monitors/sncid/sncid_trace.h create mode 100644 kernel/trace/rv/monitors/sts/Kconfig create mode 100644 kernel/trace/rv/monitors/sts/sts.c create mode 100644 kernel/trace/rv/monitors/sts/sts.h create mode 100644 kernel/trace/rv/monitors/sts/sts_trace.h delete mode 100644 kernel/trace/rv/monitors/tss/Kconfig delete mode 100644 kernel/trace/rv/monitors/tss/tss.c delete mode 100644 kernel/trace/rv/monitors/tss/tss.h delete mode 100644 kernel/trace/rv/monitors/tss/tss_trace.h (limited to 'kernel') diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index 34164eb4ec91..b688b24081c8 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -50,12 +50,11 @@ source "kernel/trace/rv/monitors/wip/Kconfig" source "kernel/trace/rv/monitors/wwnr/Kconfig" source "kernel/trace/rv/monitors/sched/Kconfig" -source "kernel/trace/rv/monitors/tss/Kconfig" source 
"kernel/trace/rv/monitors/sco/Kconfig" source "kernel/trace/rv/monitors/snroc/Kconfig" source "kernel/trace/rv/monitors/scpd/Kconfig" source "kernel/trace/rv/monitors/snep/Kconfig" -source "kernel/trace/rv/monitors/sncid/Kconfig" +source "kernel/trace/rv/monitors/sts/Kconfig" # Add new sched monitors here source "kernel/trace/rv/monitors/rtapp/Kconfig" diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index 13ec2944c665..1939d3d7621c 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -6,15 +6,14 @@ obj-$(CONFIG_RV) += rv.o obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o obj-$(CONFIG_RV_MON_SCHED) += monitors/sched/sched.o -obj-$(CONFIG_RV_MON_TSS) += monitors/tss/tss.o obj-$(CONFIG_RV_MON_SCO) += monitors/sco/sco.o obj-$(CONFIG_RV_MON_SNROC) += monitors/snroc/snroc.o obj-$(CONFIG_RV_MON_SCPD) += monitors/scpd/scpd.o obj-$(CONFIG_RV_MON_SNEP) += monitors/snep/snep.o -obj-$(CONFIG_RV_MON_SNCID) += monitors/sncid/sncid.o obj-$(CONFIG_RV_MON_RTAPP) += monitors/rtapp/rtapp.o obj-$(CONFIG_RV_MON_PAGEFAULT) += monitors/pagefault/pagefault.o obj-$(CONFIG_RV_MON_SLEEP) += monitors/sleep/sleep.o +obj-$(CONFIG_RV_MON_STS) += monitors/sts/sts.o # Add new monitors here obj-$(CONFIG_RV_REACTORS) += rv_reactors.o obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o diff --git a/kernel/trace/rv/monitors/sncid/Kconfig b/kernel/trace/rv/monitors/sncid/Kconfig deleted file mode 100644 index 3a5639feaaaf..000000000000 --- a/kernel/trace/rv/monitors/sncid/Kconfig +++ /dev/null @@ -1,15 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# -config RV_MON_SNCID - depends on RV - depends on TRACE_IRQFLAGS - depends on RV_MON_SCHED - default y - select DA_MON_EVENTS_IMPLICIT - bool "sncid monitor" - help - Monitor to ensure schedule is not called with interrupt disabled. - This monitor is part of the sched monitors collection. 
- - For further information, see: - Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/sncid/sncid.c b/kernel/trace/rv/monitors/sncid/sncid.c deleted file mode 100644 index c8491f426365..000000000000 --- a/kernel/trace/rv/monitors/sncid/sncid.c +++ /dev/null @@ -1,95 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include -#include - -#define MODULE_NAME "sncid" - -#include -#include -#include -#include - -#include "sncid.h" - -static struct rv_monitor rv_sncid; -DECLARE_DA_MON_PER_CPU(sncid, unsigned char); - -static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip) -{ - da_handle_event_sncid(irq_disable_sncid); -} - -static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip) -{ - da_handle_start_event_sncid(irq_enable_sncid); -} - -static void handle_schedule_entry(void *data, bool preempt) -{ - da_handle_start_event_sncid(schedule_entry_sncid); -} - -static void handle_schedule_exit(void *data, bool is_switch) -{ - da_handle_start_event_sncid(schedule_exit_sncid); -} - -static int enable_sncid(void) -{ - int retval; - - retval = da_monitor_init_sncid(); - if (retval) - return retval; - - rv_attach_trace_probe("sncid", irq_disable, handle_irq_disable); - rv_attach_trace_probe("sncid", irq_enable, handle_irq_enable); - rv_attach_trace_probe("sncid", sched_entry_tp, handle_schedule_entry); - rv_attach_trace_probe("sncid", sched_exit_tp, handle_schedule_exit); - - return 0; -} - -static void disable_sncid(void) -{ - rv_sncid.enabled = 0; - - rv_detach_trace_probe("sncid", irq_disable, handle_irq_disable); - rv_detach_trace_probe("sncid", irq_enable, handle_irq_enable); - rv_detach_trace_probe("sncid", sched_entry_tp, handle_schedule_entry); - rv_detach_trace_probe("sncid", sched_exit_tp, handle_schedule_exit); - - da_monitor_destroy_sncid(); -} - -static struct rv_monitor rv_sncid = { - .name = "sncid", - .description = "schedule not called with interrupt disabled.", - .enable = enable_sncid, - .disable = disable_sncid, - .reset = da_monitor_reset_all_sncid, - .enabled = 0, -}; - -static int __init register_sncid(void) -{ - return rv_register_monitor(&rv_sncid, &rv_sched); -} - -static void __exit unregister_sncid(void) -{ - rv_unregister_monitor(&rv_sncid); -} - -module_init(register_sncid); -module_exit(unregister_sncid); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Gabriele Monaco "); -MODULE_DESCRIPTION("sncid: schedule not called with interrupt disabled."); diff --git a/kernel/trace/rv/monitors/sncid/sncid.h b/kernel/trace/rv/monitors/sncid/sncid.h deleted file mode 100644 index 21304725142b..000000000000 --- a/kernel/trace/rv/monitors/sncid/sncid.h +++ /dev/null @@ -1,49 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Automatically generated C representation of sncid automaton - * For further information about this format, see kernel documentation: - * Documentation/trace/rv/deterministic_automata.rst - */ - -enum states_sncid { - can_sched_sncid = 0, - cant_sched_sncid, - state_max_sncid -}; - -#define INVALID_STATE state_max_sncid - -enum events_sncid { - irq_disable_sncid = 0, - irq_enable_sncid, - schedule_entry_sncid, - schedule_exit_sncid, - event_max_sncid -}; - -struct automaton_sncid { - char *state_names[state_max_sncid]; - char *event_names[event_max_sncid]; - unsigned char function[state_max_sncid][event_max_sncid]; - unsigned char initial_state; - bool final_states[state_max_sncid]; -}; - -static const struct automaton_sncid 
automaton_sncid = { - .state_names = { - "can_sched", - "cant_sched" - }, - .event_names = { - "irq_disable", - "irq_enable", - "schedule_entry", - "schedule_exit" - }, - .function = { - { cant_sched_sncid, INVALID_STATE, can_sched_sncid, can_sched_sncid }, - { INVALID_STATE, can_sched_sncid, INVALID_STATE, INVALID_STATE }, - }, - .initial_state = can_sched_sncid, - .final_states = { 1, 0 }, -}; diff --git a/kernel/trace/rv/monitors/sncid/sncid_trace.h b/kernel/trace/rv/monitors/sncid/sncid_trace.h deleted file mode 100644 index 3ce42a57671d..000000000000 --- a/kernel/trace/rv/monitors/sncid/sncid_trace.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -/* - * Snippet to be included in rv_trace.h - */ - -#ifdef CONFIG_RV_MON_SNCID -DEFINE_EVENT(event_da_monitor, event_sncid, - TP_PROTO(char *state, char *event, char *next_state, bool final_state), - TP_ARGS(state, event, next_state, final_state)); - -DEFINE_EVENT(error_da_monitor, error_sncid, - TP_PROTO(char *state, char *event), - TP_ARGS(state, event)); -#endif /* CONFIG_RV_MON_SNCID */ diff --git a/kernel/trace/rv/monitors/sts/Kconfig b/kernel/trace/rv/monitors/sts/Kconfig new file mode 100644 index 000000000000..7d1ff0f6fc91 --- /dev/null +++ b/kernel/trace/rv/monitors/sts/Kconfig @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_STS + depends on RV + depends on TRACE_IRQFLAGS + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_IMPLICIT + bool "sts monitor" + help + Monitor to ensure relationships between scheduler and task switches + * the scheduler is called and returns with interrupts disabled + * each call to the scheduler has up to one switch + * switches only happen inside the scheduler + * each call to the scheduler disables interrupts to switch + This monitor is part of the sched monitors collection. 
+ + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/sts/sts.c b/kernel/trace/rv/monitors/sts/sts.c new file mode 100644 index 000000000000..c4a9cd67c1d2 --- /dev/null +++ b/kernel/trace/rv/monitors/sts/sts.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include + +#define MODULE_NAME "sts" + +#include +#include +#include +#include +#include + +#include "sts.h" + +static struct rv_monitor rv_sts; +DECLARE_DA_MON_PER_CPU(sts, unsigned char); + +#ifdef CONFIG_X86_LOCAL_APIC +#include + +static void handle_vector_irq_entry(void *data, int vector) +{ + da_handle_event_sts(irq_entry_sts); +} + +static void attach_vector_irq(void) +{ + rv_attach_trace_probe("sts", local_timer_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_IRQ_WORK)) + rv_attach_trace_probe("sts", irq_work_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_SMP)) { + rv_attach_trace_probe("sts", reschedule_entry, handle_vector_irq_entry); + rv_attach_trace_probe("sts", call_function_entry, handle_vector_irq_entry); + rv_attach_trace_probe("sts", call_function_single_entry, handle_vector_irq_entry); + } +} + +static void detach_vector_irq(void) +{ + rv_detach_trace_probe("sts", local_timer_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_IRQ_WORK)) + rv_detach_trace_probe("sts", irq_work_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_SMP)) { + rv_detach_trace_probe("sts", reschedule_entry, handle_vector_irq_entry); + rv_detach_trace_probe("sts", call_function_entry, handle_vector_irq_entry); + rv_detach_trace_probe("sts", call_function_single_entry, handle_vector_irq_entry); + } +} + +#else +/* We assume irq_entry tracepoints are sufficient on other architectures */ +static void attach_vector_irq(void) { } +static void detach_vector_irq(void) { } +#endif + +static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_sts(irq_disable_sts); +} + +static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip) +{ + da_handle_event_sts(irq_enable_sts); +} + +static void handle_irq_entry(void *data, int irq, struct irqaction *action) +{ + da_handle_event_sts(irq_entry_sts); +} + +static void handle_sched_switch(void *data, bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) +{ + da_handle_event_sts(sched_switch_sts); +} + +static void handle_schedule_entry(void *data, bool preempt) +{ + da_handle_event_sts(schedule_entry_sts); +} + +static void handle_schedule_exit(void *data, bool is_switch) +{ + da_handle_start_event_sts(schedule_exit_sts); +} + +static int enable_sts(void) +{ + int retval; + + retval = da_monitor_init_sts(); + if (retval) + return retval; + + rv_attach_trace_probe("sts", irq_disable, handle_irq_disable); + rv_attach_trace_probe("sts", irq_enable, handle_irq_enable); + rv_attach_trace_probe("sts", irq_handler_entry, handle_irq_entry); + rv_attach_trace_probe("sts", sched_switch, handle_sched_switch); + rv_attach_trace_probe("sts", sched_entry_tp, handle_schedule_entry); + rv_attach_trace_probe("sts", sched_exit_tp, handle_schedule_exit); + attach_vector_irq(); + + return 0; +} + +static void disable_sts(void) +{ + rv_sts.enabled = 0; + + rv_detach_trace_probe("sts", irq_disable, handle_irq_disable); + rv_detach_trace_probe("sts", irq_enable, handle_irq_enable); + rv_detach_trace_probe("sts", irq_handler_entry, handle_irq_entry); + 
rv_detach_trace_probe("sts", sched_switch, handle_sched_switch); + rv_detach_trace_probe("sts", sched_entry_tp, handle_schedule_entry); + rv_detach_trace_probe("sts", sched_exit_tp, handle_schedule_exit); + detach_vector_irq(); + + da_monitor_destroy_sts(); +} + +/* + * This is the monitor register section. + */ +static struct rv_monitor rv_sts = { + .name = "sts", + .description = "schedule implies task switch.", + .enable = enable_sts, + .disable = disable_sts, + .reset = da_monitor_reset_all_sts, + .enabled = 0, +}; + +static int __init register_sts(void) +{ + return rv_register_monitor(&rv_sts, &rv_sched); +} + +static void __exit unregister_sts(void) +{ + rv_unregister_monitor(&rv_sts); +} + +module_init(register_sts); +module_exit(unregister_sts); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco "); +MODULE_DESCRIPTION("sts: schedule implies task switch."); diff --git a/kernel/trace/rv/monitors/sts/sts.h b/kernel/trace/rv/monitors/sts/sts.h new file mode 100644 index 000000000000..3368b6599a00 --- /dev/null +++ b/kernel/trace/rv/monitors/sts/sts.h @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of sts automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_sts { + can_sched_sts = 0, + cant_sched_sts, + disable_to_switch_sts, + enable_to_exit_sts, + in_irq_sts, + scheduling_sts, + switching_sts, + state_max_sts +}; + +#define INVALID_STATE state_max_sts + +enum events_sts { + irq_disable_sts = 0, + irq_enable_sts, + irq_entry_sts, + sched_switch_sts, + schedule_entry_sts, + schedule_exit_sts, + event_max_sts +}; + +struct automaton_sts { + char *state_names[state_max_sts]; + char *event_names[event_max_sts]; + unsigned char function[state_max_sts][event_max_sts]; + unsigned char initial_state; + bool final_states[state_max_sts]; +}; + +static const struct automaton_sts automaton_sts = { + .state_names = { + "can_sched", + "cant_sched", + "disable_to_switch", + "enable_to_exit", + "in_irq", + "scheduling", + "switching" + }, + .event_names = { + "irq_disable", + "irq_enable", + "irq_entry", + "sched_switch", + "schedule_entry", + "schedule_exit" + }, + .function = { + { + cant_sched_sts, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + scheduling_sts, + INVALID_STATE + }, + { + INVALID_STATE, + can_sched_sts, + cant_sched_sts, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE + }, + { + INVALID_STATE, + enable_to_exit_sts, + in_irq_sts, + switching_sts, + INVALID_STATE, + INVALID_STATE + }, + { + enable_to_exit_sts, + enable_to_exit_sts, + enable_to_exit_sts, + INVALID_STATE, + INVALID_STATE, + can_sched_sts + }, + { + INVALID_STATE, + scheduling_sts, + in_irq_sts, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE + }, + { + disable_to_switch_sts, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE + }, + { + INVALID_STATE, + enable_to_exit_sts, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE + }, + }, + .initial_state = can_sched_sts, + .final_states = { 1, 0, 0, 0, 0, 0, 0 }, +}; diff --git a/kernel/trace/rv/monitors/sts/sts_trace.h b/kernel/trace/rv/monitors/sts/sts_trace.h new file mode 100644 index 000000000000..d78beb58d5b3 --- /dev/null +++ b/kernel/trace/rv/monitors/sts/sts_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_STS +DEFINE_EVENT(event_da_monitor, event_sts, 
+ TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_sts, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_STS */ diff --git a/kernel/trace/rv/monitors/tss/Kconfig b/kernel/trace/rv/monitors/tss/Kconfig deleted file mode 100644 index 479f86f52e60..000000000000 --- a/kernel/trace/rv/monitors/tss/Kconfig +++ /dev/null @@ -1,14 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# -config RV_MON_TSS - depends on RV - depends on RV_MON_SCHED - default y - select DA_MON_EVENTS_IMPLICIT - bool "tss monitor" - help - Monitor to ensure sched_switch happens only in scheduling context. - This monitor is part of the sched monitors collection. - - For further information, see: - Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/tss/tss.c b/kernel/trace/rv/monitors/tss/tss.c deleted file mode 100644 index 95ebd15131f5..000000000000 --- a/kernel/trace/rv/monitors/tss/tss.c +++ /dev/null @@ -1,90 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include -#include - -#define MODULE_NAME "tss" - -#include -#include -#include - -#include "tss.h" - -static struct rv_monitor rv_tss; -DECLARE_DA_MON_PER_CPU(tss, unsigned char); - -static void handle_sched_switch(void *data, bool preempt, - struct task_struct *prev, - struct task_struct *next, - unsigned int prev_state) -{ - da_handle_event_tss(sched_switch_tss); -} - -static void handle_schedule_entry(void *data, bool preempt) -{ - da_handle_event_tss(schedule_entry_tss); -} - -static void handle_schedule_exit(void *data, bool is_switch) -{ - da_handle_start_event_tss(schedule_exit_tss); -} - -static int enable_tss(void) -{ - int retval; - - retval = da_monitor_init_tss(); - if (retval) - return retval; - - rv_attach_trace_probe("tss", sched_switch, handle_sched_switch); - rv_attach_trace_probe("tss", sched_entry_tp, handle_schedule_entry); - rv_attach_trace_probe("tss", sched_exit_tp, handle_schedule_exit); - - return 0; -} - -static void disable_tss(void) -{ - rv_tss.enabled = 0; - - rv_detach_trace_probe("tss", sched_switch, handle_sched_switch); - rv_detach_trace_probe("tss", sched_entry_tp, handle_schedule_entry); - rv_detach_trace_probe("tss", sched_exit_tp, handle_schedule_exit); - - da_monitor_destroy_tss(); -} - -static struct rv_monitor rv_tss = { - .name = "tss", - .description = "task switch while scheduling.", - .enable = enable_tss, - .disable = disable_tss, - .reset = da_monitor_reset_all_tss, - .enabled = 0, -}; - -static int __init register_tss(void) -{ - return rv_register_monitor(&rv_tss, &rv_sched); -} - -static void __exit unregister_tss(void) -{ - rv_unregister_monitor(&rv_tss); -} - -module_init(register_tss); -module_exit(unregister_tss); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Gabriele Monaco "); -MODULE_DESCRIPTION("tss: task switch while scheduling."); diff --git a/kernel/trace/rv/monitors/tss/tss.h b/kernel/trace/rv/monitors/tss/tss.h deleted file mode 100644 index f0a36fda1b87..000000000000 --- a/kernel/trace/rv/monitors/tss/tss.h +++ /dev/null @@ -1,47 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Automatically generated C representation of tss automaton - * For further information about this format, see kernel documentation: - * Documentation/trace/rv/deterministic_automata.rst - */ - -enum states_tss { - thread_tss = 0, - sched_tss, - state_max_tss -}; - -#define INVALID_STATE 
state_max_tss - -enum events_tss { - sched_switch_tss = 0, - schedule_entry_tss, - schedule_exit_tss, - event_max_tss -}; - -struct automaton_tss { - char *state_names[state_max_tss]; - char *event_names[event_max_tss]; - unsigned char function[state_max_tss][event_max_tss]; - unsigned char initial_state; - bool final_states[state_max_tss]; -}; - -static const struct automaton_tss automaton_tss = { - .state_names = { - "thread", - "sched" - }, - .event_names = { - "sched_switch", - "schedule_entry", - "schedule_exit" - }, - .function = { - { INVALID_STATE, sched_tss, INVALID_STATE }, - { sched_tss, INVALID_STATE, thread_tss }, - }, - .initial_state = thread_tss, - .final_states = { 1, 0 }, -}; diff --git a/kernel/trace/rv/monitors/tss/tss_trace.h b/kernel/trace/rv/monitors/tss/tss_trace.h deleted file mode 100644 index 4619dbb50cc0..000000000000 --- a/kernel/trace/rv/monitors/tss/tss_trace.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -/* - * Snippet to be included in rv_trace.h - */ - -#ifdef CONFIG_RV_MON_TSS -DEFINE_EVENT(event_da_monitor, event_tss, - TP_PROTO(char *state, char *event, char *next_state, bool final_state), - TP_ARGS(state, event, next_state, final_state)); - -DEFINE_EVENT(error_da_monitor, error_tss, - TP_PROTO(char *state, char *event), - TP_ARGS(state, event)); -#endif /* CONFIG_RV_MON_TSS */ diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h index 3af46cd185b3..97b2f7e07f27 100644 --- a/kernel/trace/rv/rv_trace.h +++ b/kernel/trace/rv/rv_trace.h @@ -58,11 +58,10 @@ DECLARE_EVENT_CLASS(error_da_monitor, ); #include -#include #include #include #include -#include +#include // Add new monitors based on CONFIG_DA_MON_EVENTS_IMPLICIT here #endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */ -- cgit v1.2.3 From e8440a88e56bb3aa24c384eec6de8bef1184bed2 Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Mon, 28 Jul 2025 15:50:20 +0200 Subject: rv: Add nrp and sssw per-task monitors Add 2 per-task monitors as part of the sched model: * nrp: need-resched preempts Monitor to ensure preemption requires need resched. * sssw: set state sleep and wakeup Monitor to ensure sched_set_state to sleepable leads to sleeping and sleeping tasks require wakeup. 
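The per-task monitors added below use the same generated-automaton format as the tables above: a function[state][event] matrix in which state_max marks an event that is not expected in the current state. A toy, self-contained model of how such a table drives event handling (mirroring the removed tss automaton; not the kernel's da_monitor machinery):

#include <stdio.h>
#include <stdbool.h>

enum states { thread, sched, state_max };
enum events { sched_switch, schedule_entry, schedule_exit, event_max };

/* next_state = function[current_state][event]; state_max == reject. */
static const unsigned char function[state_max][event_max] = {
	/* thread */ { state_max, sched,     state_max },
	/* sched  */ { sched,     state_max, thread    },
};

static enum states cur = thread;	/* initial_state */

static bool da_handle_event(enum events ev)
{
	unsigned char next = function[cur][ev];

	if (next == state_max)
		return false;	/* event not expected in this state */
	cur = next;
	return true;
}

int main(void)
{
	/* Accepted run: enter the scheduler, switch, leave. */
	printf("%d\n", da_handle_event(schedule_entry));	/* 1 */
	printf("%d\n", da_handle_event(sched_switch));		/* 1 */
	printf("%d\n", da_handle_event(schedule_exit));		/* 1 */
	/* Rejected: a task switch outside the scheduler. */
	printf("%d\n", da_handle_event(sched_switch));		/* 0 */
	return 0;
}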
Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Masami Hiramatsu Cc: Tomas Glozar Cc: Juri Lelli Cc: Clark Williams Cc: John Kacur Cc: Peter Zijlstra Link: https://lore.kernel.org/20250728135022.255578-9-gmonaco@redhat.com Signed-off-by: Gabriele Monaco Acked-by: Nam Cao Tested-by: Nam Cao Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/Kconfig | 2 + kernel/trace/rv/Makefile | 2 + kernel/trace/rv/monitors/nrp/Kconfig | 16 ++++ kernel/trace/rv/monitors/nrp/nrp.c | 138 +++++++++++++++++++++++++++++ kernel/trace/rv/monitors/nrp/nrp.h | 75 ++++++++++++++++ kernel/trace/rv/monitors/nrp/nrp_trace.h | 15 ++++ kernel/trace/rv/monitors/sched/Kconfig | 1 + kernel/trace/rv/monitors/sssw/Kconfig | 15 ++++ kernel/trace/rv/monitors/sssw/sssw.c | 116 ++++++++++++++++++++++++ kernel/trace/rv/monitors/sssw/sssw.h | 105 ++++++++++++++++++++++ kernel/trace/rv/monitors/sssw/sssw_trace.h | 15 ++++ kernel/trace/rv/rv_trace.h | 2 + 12 files changed, 502 insertions(+) create mode 100644 kernel/trace/rv/monitors/nrp/Kconfig create mode 100644 kernel/trace/rv/monitors/nrp/nrp.c create mode 100644 kernel/trace/rv/monitors/nrp/nrp.h create mode 100644 kernel/trace/rv/monitors/nrp/nrp_trace.h create mode 100644 kernel/trace/rv/monitors/sssw/Kconfig create mode 100644 kernel/trace/rv/monitors/sssw/sssw.c create mode 100644 kernel/trace/rv/monitors/sssw/sssw.h create mode 100644 kernel/trace/rv/monitors/sssw/sssw_trace.h (limited to 'kernel') diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index b688b24081c8..59d0db898d4a 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -55,6 +55,8 @@ source "kernel/trace/rv/monitors/snroc/Kconfig" source "kernel/trace/rv/monitors/scpd/Kconfig" source "kernel/trace/rv/monitors/snep/Kconfig" source "kernel/trace/rv/monitors/sts/Kconfig" +source "kernel/trace/rv/monitors/nrp/Kconfig" +source "kernel/trace/rv/monitors/sssw/Kconfig" # Add new sched monitors here source "kernel/trace/rv/monitors/rtapp/Kconfig" diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index 1939d3d7621c..2afac88539d3 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -14,6 +14,8 @@ obj-$(CONFIG_RV_MON_RTAPP) += monitors/rtapp/rtapp.o obj-$(CONFIG_RV_MON_PAGEFAULT) += monitors/pagefault/pagefault.o obj-$(CONFIG_RV_MON_SLEEP) += monitors/sleep/sleep.o obj-$(CONFIG_RV_MON_STS) += monitors/sts/sts.o +obj-$(CONFIG_RV_MON_NRP) += monitors/nrp/nrp.o +obj-$(CONFIG_RV_MON_SSSW) += monitors/sssw/sssw.o # Add new monitors here obj-$(CONFIG_RV_REACTORS) += rv_reactors.o obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o diff --git a/kernel/trace/rv/monitors/nrp/Kconfig b/kernel/trace/rv/monitors/nrp/Kconfig new file mode 100644 index 000000000000..f5ec08f65535 --- /dev/null +++ b/kernel/trace/rv/monitors/nrp/Kconfig @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_NRP + depends on RV + depends on RV_MON_SCHED + default y if !ARM64 + select DA_MON_EVENTS_ID + bool "nrp monitor" + help + Monitor to ensure preemption requires need resched. + This monitor is part of the sched monitors collection. + + This monitor is unstable on arm64, say N unless you are testing it. 
+ + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/nrp/nrp.c b/kernel/trace/rv/monitors/nrp/nrp.c new file mode 100644 index 000000000000..5a83b7171432 --- /dev/null +++ b/kernel/trace/rv/monitors/nrp/nrp.c @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include + +#define MODULE_NAME "nrp" + +#include +#include +#include +#include + +#include "nrp.h" + +static struct rv_monitor rv_nrp; +DECLARE_DA_MON_PER_TASK(nrp, unsigned char); + +#ifdef CONFIG_X86_LOCAL_APIC +#include + +static void handle_vector_irq_entry(void *data, int vector) +{ + da_handle_event_nrp(current, irq_entry_nrp); +} + +static void attach_vector_irq(void) +{ + rv_attach_trace_probe("nrp", local_timer_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_IRQ_WORK)) + rv_attach_trace_probe("nrp", irq_work_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_SMP)) { + rv_attach_trace_probe("nrp", reschedule_entry, handle_vector_irq_entry); + rv_attach_trace_probe("nrp", call_function_entry, handle_vector_irq_entry); + rv_attach_trace_probe("nrp", call_function_single_entry, handle_vector_irq_entry); + } +} + +static void detach_vector_irq(void) +{ + rv_detach_trace_probe("nrp", local_timer_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_IRQ_WORK)) + rv_detach_trace_probe("nrp", irq_work_entry, handle_vector_irq_entry); + if (IS_ENABLED(CONFIG_SMP)) { + rv_detach_trace_probe("nrp", reschedule_entry, handle_vector_irq_entry); + rv_detach_trace_probe("nrp", call_function_entry, handle_vector_irq_entry); + rv_detach_trace_probe("nrp", call_function_single_entry, handle_vector_irq_entry); + } +} + +#else +/* We assume irq_entry tracepoints are sufficient on other architectures */ +static void attach_vector_irq(void) { } +static void detach_vector_irq(void) { } +#endif + +static void handle_irq_entry(void *data, int irq, struct irqaction *action) +{ + da_handle_event_nrp(current, irq_entry_nrp); +} + +static void handle_sched_need_resched(void *data, struct task_struct *tsk, + int cpu, int tif) +{ + /* + * Although need_resched leads to both the rescheduling and preempt_irq + * states, it is safer to start the monitor always in preempt_irq, + * which may not mirror the system state but makes the monitor simpler, + */ + if (tif == TIF_NEED_RESCHED) + da_handle_start_event_nrp(tsk, sched_need_resched_nrp); +} + +static void handle_schedule_entry(void *data, bool preempt) +{ + if (preempt) + da_handle_event_nrp(current, schedule_entry_preempt_nrp); + else + da_handle_event_nrp(current, schedule_entry_nrp); +} + +static int enable_nrp(void) +{ + int retval; + + retval = da_monitor_init_nrp(); + if (retval) + return retval; + + rv_attach_trace_probe("nrp", irq_handler_entry, handle_irq_entry); + rv_attach_trace_probe("nrp", sched_set_need_resched_tp, handle_sched_need_resched); + rv_attach_trace_probe("nrp", sched_entry_tp, handle_schedule_entry); + attach_vector_irq(); + + return 0; +} + +static void disable_nrp(void) +{ + rv_nrp.enabled = 0; + + rv_detach_trace_probe("nrp", irq_handler_entry, handle_irq_entry); + rv_detach_trace_probe("nrp", sched_set_need_resched_tp, handle_sched_need_resched); + rv_detach_trace_probe("nrp", sched_entry_tp, handle_schedule_entry); + detach_vector_irq(); + + da_monitor_destroy_nrp(); +} + +static struct rv_monitor rv_nrp = { + .name = "nrp", + .description = "need resched preempts.", + .enable = enable_nrp, + .disable = disable_nrp, + .reset 
= da_monitor_reset_all_nrp, + .enabled = 0, +}; + +static int __init register_nrp(void) +{ + return rv_register_monitor(&rv_nrp, &rv_sched); +} + +static void __exit unregister_nrp(void) +{ + rv_unregister_monitor(&rv_nrp); +} + +module_init(register_nrp); +module_exit(unregister_nrp); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco "); +MODULE_DESCRIPTION("nrp: need resched preempts."); diff --git a/kernel/trace/rv/monitors/nrp/nrp.h b/kernel/trace/rv/monitors/nrp/nrp.h new file mode 100644 index 000000000000..c9f12207cbf6 --- /dev/null +++ b/kernel/trace/rv/monitors/nrp/nrp.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of nrp automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_nrp { + preempt_irq_nrp = 0, + any_thread_running_nrp, + nested_preempt_nrp, + rescheduling_nrp, + state_max_nrp +}; + +#define INVALID_STATE state_max_nrp + +enum events_nrp { + irq_entry_nrp = 0, + sched_need_resched_nrp, + schedule_entry_nrp, + schedule_entry_preempt_nrp, + event_max_nrp +}; + +struct automaton_nrp { + char *state_names[state_max_nrp]; + char *event_names[event_max_nrp]; + unsigned char function[state_max_nrp][event_max_nrp]; + unsigned char initial_state; + bool final_states[state_max_nrp]; +}; + +static const struct automaton_nrp automaton_nrp = { + .state_names = { + "preempt_irq", + "any_thread_running", + "nested_preempt", + "rescheduling" + }, + .event_names = { + "irq_entry", + "sched_need_resched", + "schedule_entry", + "schedule_entry_preempt" + }, + .function = { + { + preempt_irq_nrp, + preempt_irq_nrp, + nested_preempt_nrp, + nested_preempt_nrp + }, + { + any_thread_running_nrp, + rescheduling_nrp, + any_thread_running_nrp, + INVALID_STATE + }, + { + nested_preempt_nrp, + preempt_irq_nrp, + any_thread_running_nrp, + any_thread_running_nrp + }, + { + preempt_irq_nrp, + rescheduling_nrp, + any_thread_running_nrp, + any_thread_running_nrp + }, + }, + .initial_state = preempt_irq_nrp, + .final_states = { 0, 1, 0, 0 }, +}; diff --git a/kernel/trace/rv/monitors/nrp/nrp_trace.h b/kernel/trace/rv/monitors/nrp/nrp_trace.h new file mode 100644 index 000000000000..2e13497de3b6 --- /dev/null +++ b/kernel/trace/rv/monitors/nrp/nrp_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_NRP +DEFINE_EVENT(event_da_monitor_id, event_nrp, + TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), + TP_ARGS(id, state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor_id, error_nrp, + TP_PROTO(int id, char *state, char *event), + TP_ARGS(id, state, event)); +#endif /* CONFIG_RV_MON_NRP */ diff --git a/kernel/trace/rv/monitors/sched/Kconfig b/kernel/trace/rv/monitors/sched/Kconfig index ae3eb410abd7..aa16456da864 100644 --- a/kernel/trace/rv/monitors/sched/Kconfig +++ b/kernel/trace/rv/monitors/sched/Kconfig @@ -2,6 +2,7 @@ # config RV_MON_SCHED depends on RV + depends on RV_PER_TASK_MONITORS >= 3 bool "sched monitor" help Collection of monitors to check the scheduler behaves according to specifications. 
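For orientation, tables like automaton_nrp above are consumed by the glue that
DECLARE_DA_MON_PER_TASK() generates: each traced event indexes
function[state][event] to find the next state. A minimal sketch of that
transition step (illustrative only; the real logic lives behind the DA monitor
macros in include/rv/da_monitor.h and also drives reactors and per-task
storage):

/*
 * Illustrative DA transition step over the generated nrp table.
 * INVALID_STATE means the event is not allowed in the current state,
 * i.e. the specification was violated.
 */
static int nrp_automaton_step(unsigned char *state, enum events_nrp event)
{
	unsigned char next = automaton_nrp.function[*state][event];

	if (next == INVALID_STATE)
		return -1;	/* report the violation */

	*state = next;
	return 0;
}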
diff --git a/kernel/trace/rv/monitors/sssw/Kconfig b/kernel/trace/rv/monitors/sssw/Kconfig new file mode 100644 index 000000000000..23b7eeb38bbf --- /dev/null +++ b/kernel/trace/rv/monitors/sssw/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_SSSW + depends on RV + depends on RV_MON_SCHED + default y + select DA_MON_EVENTS_ID + bool "sssw monitor" + help + Monitor to ensure sched_set_state to sleepable leads to sleeping and + sleeping tasks require wakeup. + This monitor is part of the sched monitors collection. + + For further information, see: + Documentation/trace/rv/monitor_sched.rst diff --git a/kernel/trace/rv/monitors/sssw/sssw.c b/kernel/trace/rv/monitors/sssw/sssw.c new file mode 100644 index 000000000000..84b8d890d9d4 --- /dev/null +++ b/kernel/trace/rv/monitors/sssw/sssw.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include + +#define MODULE_NAME "sssw" + +#include +#include +#include +#include + +#include "sssw.h" + +static struct rv_monitor rv_sssw; +DECLARE_DA_MON_PER_TASK(sssw, unsigned char); + +static void handle_sched_set_state(void *data, struct task_struct *tsk, int state) +{ + if (state == TASK_RUNNING) + da_handle_start_event_sssw(tsk, sched_set_state_runnable_sssw); + else + da_handle_event_sssw(tsk, sched_set_state_sleepable_sssw); +} + +static void handle_sched_switch(void *data, bool preempt, + struct task_struct *prev, + struct task_struct *next, + unsigned int prev_state) +{ + if (preempt) + da_handle_event_sssw(prev, sched_switch_preempt_sssw); + else if (prev_state == TASK_RUNNING) + da_handle_event_sssw(prev, sched_switch_yield_sssw); + else if (prev_state == TASK_RTLOCK_WAIT) + /* special case of sleeping task with racy conditions */ + da_handle_event_sssw(prev, sched_switch_blocking_sssw); + else + da_handle_event_sssw(prev, sched_switch_suspend_sssw); + da_handle_event_sssw(next, sched_switch_in_sssw); +} + +static void handle_sched_wakeup(void *data, struct task_struct *p) +{ + /* + * Wakeup can also lead to signal_wakeup although the system is + * actually runnable. The monitor can safely start with this event. 
+ */ + da_handle_start_event_sssw(p, sched_wakeup_sssw); +} + +static void handle_signal_deliver(void *data, int sig, + struct kernel_siginfo *info, + struct k_sigaction *ka) +{ + da_handle_event_sssw(current, signal_deliver_sssw); +} + +static int enable_sssw(void) +{ + int retval; + + retval = da_monitor_init_sssw(); + if (retval) + return retval; + + rv_attach_trace_probe("sssw", sched_set_state_tp, handle_sched_set_state); + rv_attach_trace_probe("sssw", sched_switch, handle_sched_switch); + rv_attach_trace_probe("sssw", sched_wakeup, handle_sched_wakeup); + rv_attach_trace_probe("sssw", signal_deliver, handle_signal_deliver); + + return 0; +} + +static void disable_sssw(void) +{ + rv_sssw.enabled = 0; + + rv_detach_trace_probe("sssw", sched_set_state_tp, handle_sched_set_state); + rv_detach_trace_probe("sssw", sched_switch, handle_sched_switch); + rv_detach_trace_probe("sssw", sched_wakeup, handle_sched_wakeup); + rv_detach_trace_probe("sssw", signal_deliver, handle_signal_deliver); + + da_monitor_destroy_sssw(); +} + +static struct rv_monitor rv_sssw = { + .name = "sssw", + .description = "set state sleep and wakeup.", + .enable = enable_sssw, + .disable = disable_sssw, + .reset = da_monitor_reset_all_sssw, + .enabled = 0, +}; + +static int __init register_sssw(void) +{ + return rv_register_monitor(&rv_sssw, &rv_sched); +} + +static void __exit unregister_sssw(void) +{ + rv_unregister_monitor(&rv_sssw); +} + +module_init(register_sssw); +module_exit(unregister_sssw); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco "); +MODULE_DESCRIPTION("sssw: set state sleep and wakeup."); diff --git a/kernel/trace/rv/monitors/sssw/sssw.h b/kernel/trace/rv/monitors/sssw/sssw.h new file mode 100644 index 000000000000..243d54050c94 --- /dev/null +++ b/kernel/trace/rv/monitors/sssw/sssw.h @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of sssw automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_sssw { + runnable_sssw = 0, + signal_wakeup_sssw, + sleepable_sssw, + sleeping_sssw, + state_max_sssw +}; + +#define INVALID_STATE state_max_sssw + +enum events_sssw { + sched_set_state_runnable_sssw = 0, + sched_set_state_sleepable_sssw, + sched_switch_blocking_sssw, + sched_switch_in_sssw, + sched_switch_preempt_sssw, + sched_switch_suspend_sssw, + sched_switch_yield_sssw, + sched_wakeup_sssw, + signal_deliver_sssw, + event_max_sssw +}; + +struct automaton_sssw { + char *state_names[state_max_sssw]; + char *event_names[event_max_sssw]; + unsigned char function[state_max_sssw][event_max_sssw]; + unsigned char initial_state; + bool final_states[state_max_sssw]; +}; + +static const struct automaton_sssw automaton_sssw = { + .state_names = { + "runnable", + "signal_wakeup", + "sleepable", + "sleeping" + }, + .event_names = { + "sched_set_state_runnable", + "sched_set_state_sleepable", + "sched_switch_blocking", + "sched_switch_in", + "sched_switch_preempt", + "sched_switch_suspend", + "sched_switch_yield", + "sched_wakeup", + "signal_deliver" + }, + .function = { + { + runnable_sssw, + sleepable_sssw, + sleeping_sssw, + runnable_sssw, + runnable_sssw, + INVALID_STATE, + runnable_sssw, + runnable_sssw, + runnable_sssw + }, + { + INVALID_STATE, + sleepable_sssw, + INVALID_STATE, + signal_wakeup_sssw, + signal_wakeup_sssw, + INVALID_STATE, + signal_wakeup_sssw, + signal_wakeup_sssw, + runnable_sssw + }, + { + runnable_sssw, + sleepable_sssw, + 
sleeping_sssw, + sleepable_sssw, + sleepable_sssw, + sleeping_sssw, + signal_wakeup_sssw, + runnable_sssw, + sleepable_sssw + }, + { + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + INVALID_STATE, + runnable_sssw, + INVALID_STATE + }, + }, + .initial_state = runnable_sssw, + .final_states = { 1, 0, 0, 0 }, +}; diff --git a/kernel/trace/rv/monitors/sssw/sssw_trace.h b/kernel/trace/rv/monitors/sssw/sssw_trace.h new file mode 100644 index 000000000000..6c03cfc6960b --- /dev/null +++ b/kernel/trace/rv/monitors/sssw/sssw_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_SSSW +DEFINE_EVENT(event_da_monitor_id, event_sssw, + TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state), + TP_ARGS(id, state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor_id, error_sssw, + TP_PROTO(int id, char *state, char *event), + TP_ARGS(id, state, event)); +#endif /* CONFIG_RV_MON_SSSW */ diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h index 97b2f7e07f27..e4e78d23b7b0 100644 --- a/kernel/trace/rv/rv_trace.h +++ b/kernel/trace/rv/rv_trace.h @@ -123,6 +123,8 @@ DECLARE_EVENT_CLASS(error_da_monitor_id, #include #include +#include +#include // Add new monitors based on CONFIG_DA_MON_EVENTS_ID here #endif /* CONFIG_DA_MON_EVENTS_ID */ -- cgit v1.2.3 From 614384533dfe99293a7ff1bce3d4389adadbb759 Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Mon, 28 Jul 2025 15:50:21 +0200 Subject: rv: Add opid per-cpu monitor Add a per-cpu monitor as part of the sched model: * opid: operations with preemption and irq disabled Monitor to ensure wakeup and need_resched occur with irq and preemption disabled or in irq handlers. 
Cc: Jonathan Corbet Cc: Masami Hiramatsu Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Tomas Glozar Cc: Juri Lelli Cc: Clark Williams Cc: John Kacur Link: https://lore.kernel.org/20250728135022.255578-10-gmonaco@redhat.com Signed-off-by: Gabriele Monaco Acked-by: Nam Cao Tested-by: Nam Cao Signed-off-by: Steven Rostedt (Google) --- kernel/trace/rv/Kconfig | 1 + kernel/trace/rv/Makefile | 1 + kernel/trace/rv/monitors/opid/Kconfig | 19 ++++ kernel/trace/rv/monitors/opid/opid.c | 168 +++++++++++++++++++++++++++++ kernel/trace/rv/monitors/opid/opid.h | 104 ++++++++++++++++++ kernel/trace/rv/monitors/opid/opid_trace.h | 15 +++ kernel/trace/rv/rv_trace.h | 1 + 7 files changed, 309 insertions(+) create mode 100644 kernel/trace/rv/monitors/opid/Kconfig create mode 100644 kernel/trace/rv/monitors/opid/opid.c create mode 100644 kernel/trace/rv/monitors/opid/opid.h create mode 100644 kernel/trace/rv/monitors/opid/opid_trace.h (limited to 'kernel') diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig index 59d0db898d4a..5b4be87ba59d 100644 --- a/kernel/trace/rv/Kconfig +++ b/kernel/trace/rv/Kconfig @@ -57,6 +57,7 @@ source "kernel/trace/rv/monitors/snep/Kconfig" source "kernel/trace/rv/monitors/sts/Kconfig" source "kernel/trace/rv/monitors/nrp/Kconfig" source "kernel/trace/rv/monitors/sssw/Kconfig" +source "kernel/trace/rv/monitors/opid/Kconfig" # Add new sched monitors here source "kernel/trace/rv/monitors/rtapp/Kconfig" diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile index 2afac88539d3..750e4ad6fa0f 100644 --- a/kernel/trace/rv/Makefile +++ b/kernel/trace/rv/Makefile @@ -16,6 +16,7 @@ obj-$(CONFIG_RV_MON_SLEEP) += monitors/sleep/sleep.o obj-$(CONFIG_RV_MON_STS) += monitors/sts/sts.o obj-$(CONFIG_RV_MON_NRP) += monitors/nrp/nrp.o obj-$(CONFIG_RV_MON_SSSW) += monitors/sssw/sssw.o +obj-$(CONFIG_RV_MON_OPID) += monitors/opid/opid.o # Add new monitors here obj-$(CONFIG_RV_REACTORS) += rv_reactors.o obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o diff --git a/kernel/trace/rv/monitors/opid/Kconfig b/kernel/trace/rv/monitors/opid/Kconfig new file mode 100644 index 000000000000..561d32da572b --- /dev/null +++ b/kernel/trace/rv/monitors/opid/Kconfig @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +config RV_MON_OPID + depends on RV + depends on TRACE_IRQFLAGS + depends on TRACE_PREEMPT_TOGGLE + depends on RV_MON_SCHED + default y if PREEMPT_RT + select DA_MON_EVENTS_IMPLICIT + bool "opid monitor" + help + Monitor to ensure operations like wakeup and need resched occur with + interrupts and preemption disabled or during IRQs, where preemption + may not be disabled explicitly. + + This monitor is unstable on !PREEMPT_RT, say N unless you are testing it. 
+
+	  For further information, see:
+	    Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/opid/opid.c b/kernel/trace/rv/monitors/opid/opid.c
new file mode 100644
index 000000000000..50d64e7fb8c4
--- /dev/null
+++ b/kernel/trace/rv/monitors/opid/opid.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define MODULE_NAME "opid"
+
+#include
+#include
+#include
+#include
+#include
+
+#include "opid.h"
+
+static struct rv_monitor rv_opid;
+DECLARE_DA_MON_PER_CPU(opid, unsigned char);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+#include
+
+static void handle_vector_irq_entry(void *data, int vector)
+{
+	da_handle_event_opid(irq_entry_opid);
+}
+
+static void attach_vector_irq(void)
+{
+	rv_attach_trace_probe("opid", local_timer_entry, handle_vector_irq_entry);
+	if (IS_ENABLED(CONFIG_IRQ_WORK))
+		rv_attach_trace_probe("opid", irq_work_entry, handle_vector_irq_entry);
+	if (IS_ENABLED(CONFIG_SMP)) {
+		rv_attach_trace_probe("opid", reschedule_entry, handle_vector_irq_entry);
+		rv_attach_trace_probe("opid", call_function_entry, handle_vector_irq_entry);
+		rv_attach_trace_probe("opid", call_function_single_entry, handle_vector_irq_entry);
+	}
+}
+
+static void detach_vector_irq(void)
+{
+	rv_detach_trace_probe("opid", local_timer_entry, handle_vector_irq_entry);
+	if (IS_ENABLED(CONFIG_IRQ_WORK))
+		rv_detach_trace_probe("opid", irq_work_entry, handle_vector_irq_entry);
+	if (IS_ENABLED(CONFIG_SMP)) {
+		rv_detach_trace_probe("opid", reschedule_entry, handle_vector_irq_entry);
+		rv_detach_trace_probe("opid", call_function_entry, handle_vector_irq_entry);
+		rv_detach_trace_probe("opid", call_function_single_entry, handle_vector_irq_entry);
+	}
+}
+
+#else
+/* We assume irq_entry tracepoints are sufficient on other architectures */
+static void attach_vector_irq(void) { }
+static void detach_vector_irq(void) { }
+#endif
+
+static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+	da_handle_event_opid(irq_disable_opid);
+}
+
+static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+	da_handle_event_opid(irq_enable_opid);
+}
+
+static void handle_irq_entry(void *data, int irq, struct irqaction *action)
+{
+	da_handle_event_opid(irq_entry_opid);
+}
+
+static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+	da_handle_event_opid(preempt_disable_opid);
+}
+
+static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+	da_handle_event_opid(preempt_enable_opid);
+}
+
+static void handle_sched_need_resched(void *data, struct task_struct *tsk, int cpu, int tif)
+{
+	/* The monitor's initial state is not in_irq */
+	if (this_cpu_read(hardirq_context))
+		da_handle_event_opid(sched_need_resched_opid);
+	else
+		da_handle_start_event_opid(sched_need_resched_opid);
+}
+
+static void handle_sched_waking(void *data, struct task_struct *p)
+{
+	/* The monitor's initial state is not in_irq */
+	if (this_cpu_read(hardirq_context))
+		da_handle_event_opid(sched_waking_opid);
+	else
+		da_handle_start_event_opid(sched_waking_opid);
+}
+
+static int enable_opid(void)
+{
+	int retval;
+
+	retval = da_monitor_init_opid();
+	if (retval)
+		return retval;
+
+	rv_attach_trace_probe("opid", irq_disable, handle_irq_disable);
+	rv_attach_trace_probe("opid", irq_enable, handle_irq_enable);
+	rv_attach_trace_probe("opid", irq_handler_entry, handle_irq_entry);
+	rv_attach_trace_probe("opid",
preempt_disable, handle_preempt_disable); + rv_attach_trace_probe("opid", preempt_enable, handle_preempt_enable); + rv_attach_trace_probe("opid", sched_set_need_resched_tp, handle_sched_need_resched); + rv_attach_trace_probe("opid", sched_waking, handle_sched_waking); + attach_vector_irq(); + + return 0; +} + +static void disable_opid(void) +{ + rv_opid.enabled = 0; + + rv_detach_trace_probe("opid", irq_disable, handle_irq_disable); + rv_detach_trace_probe("opid", irq_enable, handle_irq_enable); + rv_detach_trace_probe("opid", irq_handler_entry, handle_irq_entry); + rv_detach_trace_probe("opid", preempt_disable, handle_preempt_disable); + rv_detach_trace_probe("opid", preempt_enable, handle_preempt_enable); + rv_detach_trace_probe("opid", sched_set_need_resched_tp, handle_sched_need_resched); + rv_detach_trace_probe("opid", sched_waking, handle_sched_waking); + detach_vector_irq(); + + da_monitor_destroy_opid(); +} + +/* + * This is the monitor register section. + */ +static struct rv_monitor rv_opid = { + .name = "opid", + .description = "operations with preemption and irq disabled.", + .enable = enable_opid, + .disable = disable_opid, + .reset = da_monitor_reset_all_opid, + .enabled = 0, +}; + +static int __init register_opid(void) +{ + return rv_register_monitor(&rv_opid, &rv_sched); +} + +static void __exit unregister_opid(void) +{ + rv_unregister_monitor(&rv_opid); +} + +module_init(register_opid); +module_exit(unregister_opid); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Gabriele Monaco "); +MODULE_DESCRIPTION("opid: operations with preemption and irq disabled."); diff --git a/kernel/trace/rv/monitors/opid/opid.h b/kernel/trace/rv/monitors/opid/opid.h new file mode 100644 index 000000000000..b4b8c2ff7f64 --- /dev/null +++ b/kernel/trace/rv/monitors/opid/opid.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Automatically generated C representation of opid automaton + * For further information about this format, see kernel documentation: + * Documentation/trace/rv/deterministic_automata.rst + */ + +enum states_opid { + disabled_opid = 0, + enabled_opid, + in_irq_opid, + irq_disabled_opid, + preempt_disabled_opid, + state_max_opid +}; + +#define INVALID_STATE state_max_opid + +enum events_opid { + irq_disable_opid = 0, + irq_enable_opid, + irq_entry_opid, + preempt_disable_opid, + preempt_enable_opid, + sched_need_resched_opid, + sched_waking_opid, + event_max_opid +}; + +struct automaton_opid { + char *state_names[state_max_opid]; + char *event_names[event_max_opid]; + unsigned char function[state_max_opid][event_max_opid]; + unsigned char initial_state; + bool final_states[state_max_opid]; +}; + +static const struct automaton_opid automaton_opid = { + .state_names = { + "disabled", + "enabled", + "in_irq", + "irq_disabled", + "preempt_disabled" + }, + .event_names = { + "irq_disable", + "irq_enable", + "irq_entry", + "preempt_disable", + "preempt_enable", + "sched_need_resched", + "sched_waking" + }, + .function = { + { + INVALID_STATE, + preempt_disabled_opid, + disabled_opid, + INVALID_STATE, + irq_disabled_opid, + disabled_opid, + disabled_opid + }, + { + irq_disabled_opid, + INVALID_STATE, + INVALID_STATE, + preempt_disabled_opid, + enabled_opid, + INVALID_STATE, + INVALID_STATE + }, + { + INVALID_STATE, + enabled_opid, + in_irq_opid, + INVALID_STATE, + INVALID_STATE, + in_irq_opid, + in_irq_opid + }, + { + INVALID_STATE, + enabled_opid, + in_irq_opid, + disabled_opid, + INVALID_STATE, + irq_disabled_opid, + INVALID_STATE + }, + { + disabled_opid, + INVALID_STATE, 
+ INVALID_STATE, + INVALID_STATE, + enabled_opid, + INVALID_STATE, + INVALID_STATE + }, + }, + .initial_state = disabled_opid, + .final_states = { 0, 1, 0, 0, 0 }, +}; diff --git a/kernel/trace/rv/monitors/opid/opid_trace.h b/kernel/trace/rv/monitors/opid/opid_trace.h new file mode 100644 index 000000000000..3df6ff955c30 --- /dev/null +++ b/kernel/trace/rv/monitors/opid/opid_trace.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Snippet to be included in rv_trace.h + */ + +#ifdef CONFIG_RV_MON_OPID +DEFINE_EVENT(event_da_monitor, event_opid, + TP_PROTO(char *state, char *event, char *next_state, bool final_state), + TP_ARGS(state, event, next_state, final_state)); + +DEFINE_EVENT(error_da_monitor, error_opid, + TP_PROTO(char *state, char *event), + TP_ARGS(state, event)); +#endif /* CONFIG_RV_MON_OPID */ diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h index e4e78d23b7b0..4a6faddac614 100644 --- a/kernel/trace/rv/rv_trace.h +++ b/kernel/trace/rv/rv_trace.h @@ -62,6 +62,7 @@ DECLARE_EVENT_CLASS(error_da_monitor, #include #include #include +#include // Add new monitors based on CONFIG_DA_MON_EVENTS_IMPLICIT here #endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */ -- cgit v1.2.3 From 133c302a0c60bca1f0a2f5f85ef11e7f5e8f1331 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Mon, 28 Jul 2025 11:13:12 +0900 Subject: tracing: trace_fprobe: Fix typo of the semicolon Fix a typo that uses ',' instead of ';' for line delimiter. Link: https://lore.kernel.org/linux-trace-kernel/175366879192.487099.5714468217360139639.stgit@mhiramat.tok.corp.google.com/ Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_fprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index 9d14a910fbf7..b36ade43d4b3 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -1404,7 +1404,7 @@ static int trace_fprobe_create_cb(int argc, const char *argv[]) if (!ctx) return -ENOMEM; - ctx->flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE, + ctx->flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE; trace_probe_log_init("trace_fprobe", argc, argv); ret = trace_fprobe_create_internal(argc, argv, ctx); -- cgit v1.2.3 From a5a6b29a700fda1dd766cc42dde2cbba9b19f470 Mon Sep 17 00:00:00 2001 From: KaFai Wan Date: Thu, 24 Jul 2025 23:14:51 +0800 Subject: bpf: Show precise rejected function when attaching fexit/fmod_ret to __noreturn functions With this change, we know the precise rejected function name when attaching fexit/fmod_ret to __noreturn functions from log. $ ./fexit libbpf: prog 'fexit': BPF program load failed: -EINVAL libbpf: prog 'fexit': -- BEGIN PROG LOAD LOG -- Attaching fexit/fmod_ret to __noreturn function 'do_exit' is rejected. 
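For context, a libbpf skeleton that now produces the quoted message could be
as small as the following (hypothetical sketch: do_exit is the target from the
log above, and the empty program body is purely illustrative):

// Hypothetical fexit program the verifier rejects: do_exit() is
// __noreturn, so a return probe on it could never fire.
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("fexit/do_exit")
int BPF_PROG(handle_do_exit)
{
	return 0;
}

char LICENSE[] SEC("license") = "GPL";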
Suggested-by: Leon Hwang
Signed-off-by: KaFai Wan
Acked-by: Yafang Shao
Acked-by: Yonghong Song
Link: https://lore.kernel.org/r/20250724151454.499040-2-kafai.wan@linux.dev
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e50d3d43be67..4bf7392fd2c3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -23985,7 +23985,8 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 	} else if ((prog->expected_attach_type == BPF_TRACE_FEXIT ||
 		    prog->expected_attach_type == BPF_MODIFY_RETURN) &&
 		   btf_id_set_contains(&noreturn_deny, btf_id)) {
-		verbose(env, "Attaching fexit/fmod_ret to __noreturn functions is rejected.\n");
+		verbose(env, "Attaching fexit/fmod_ret to __noreturn function '%s' is rejected.\n",
+			tgt_info.tgt_name);
 		return -EINVAL;
 	}
-- cgit v1.2.3

From 863aab3d4dcdfffa5cf0e0795c526dadca65be7a Mon Sep 17 00:00:00 2001
From: KaFai Wan
Date: Thu, 24 Jul 2025 23:14:52 +0800
Subject: bpf: Add log for attaching tracing programs to functions in deny list

Show the rejected function name when attaching tracing programs to
functions in deny list. With this change, we know why tracing programs
can't attach to functions like __rcu_read_lock() from the log.

$ ./fentry
libbpf: prog '__rcu_read_lock': BPF program load failed: -EINVAL
libbpf: prog '__rcu_read_lock': -- BEGIN PROG LOAD LOG --
Attaching tracing programs to function '__rcu_read_lock' is rejected.

Suggested-by: Leon Hwang
Signed-off-by: KaFai Wan
Acked-by: Yafang Shao
Acked-by: Yonghong Song
Link: https://lore.kernel.org/r/20250724151454.499040-3-kafai.wan@linux.dev
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c | 2 ++
 1 file changed, 2 insertions(+)
(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4bf7392fd2c3..399f03e62508 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -23981,6 +23981,8 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
 			return ret;
 	} else if (prog->type == BPF_PROG_TYPE_TRACING &&
 		   btf_id_set_contains(&btf_id_deny, btf_id)) {
+		verbose(env, "Attaching tracing programs to function '%s' is rejected.\n",
+			tgt_info.tgt_name);
 		return -EINVAL;
 	} else if ((prog->expected_attach_type == BPF_TRACE_FEXIT ||
 		    prog->expected_attach_type == BPF_MODIFY_RETURN) &&
-- cgit v1.2.3

From a3e892ab0fc287389176eabdcd74234508f6e52d Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)"
Date: Tue, 29 Jul 2025 08:47:03 +0900
Subject: tracing: fprobe: Fix infinite recursion using preempt_*_notrace()

Since preempt_count_add/del() are traceable functions, it is not allowed
to use preempt_disable/enable() in ftrace handlers. Without this fix,
probing on `preempt_count_add%return` will cause an infinite recursion
of fprobes.

To fix this problem, use preempt_disable/enable_notrace() in
fprobe_return().
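The same constraint applies to anything reachable from an ftrace/fprobe
handler: a helper that is itself traceable can recurse back into the handler
that called it. A hedged sketch of the general idiom (not the fprobe_return()
body from the diff below, just the pattern):

/*
 * Pattern for handler-context code: the _notrace variants skip the
 * instrumentation in preempt_count_add/del(), so a probe placed on
 * those functions cannot re-enter this handler.
 */
static void handler_critical_section(void)
{
	preempt_disable_notrace();
	/* ... work that must stay on this CPU ... */
	preempt_enable_notrace();
}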
Link: https://lore.kernel.org/all/175374642359.1471729.1054175011228386560.stgit@mhiramat.tok.corp.google.com/ Fixes: 4346ba160409 ("fprobe: Rewrite fprobe on function-graph tracer") Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/fprobe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index ba7ff14f5339..f9b3aa9afb17 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -352,7 +352,7 @@ static void fprobe_return(struct ftrace_graph_ret *trace, size_words = SIZE_IN_LONG(size); ret_ip = ftrace_regs_get_instruction_pointer(fregs); - preempt_disable(); + preempt_disable_notrace(); curr = 0; while (size_words > curr) { @@ -368,7 +368,7 @@ static void fprobe_return(struct ftrace_graph_ret *trace, } curr += size; } - preempt_enable(); + preempt_enable_notrace(); } NOKPROBE_SYMBOL(fprobe_return); -- cgit v1.2.3 From 1a967e92bf47cf5170336b88d748117c700edc47 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 29 Jul 2025 14:10:35 +0900 Subject: tracing: Remove "__attribute__()" from the type field of event format With CONFIG_DEBUG_INFO_BTF=y and PAHOLE_HAS_BTF_TAG=y, `__user` is converted to `__attribute__((btf_type_tag("user")))`. In this case, some syscall events have it for __user data, like below; /sys/kernel/tracing # cat events/syscalls/sys_enter_openat/format name: sys_enter_openat ID: 720 format: field:unsigned short common_type; offset:0; size:2; signed:0; field:unsigned char common_flags; offset:2; size:1; signed:0; field:unsigned char common_preempt_count; offset:3; size:1; signed:0; field:int common_pid; offset:4; size:4; signed:1; field:int __syscall_nr; offset:8; size:4; signed:1; field:int dfd; offset:16; size:8; signed:0; field:const char __attribute__((btf_type_tag("user"))) * filename; offset:24; size:8; signed:0; field:int flags; offset:32; size:8; signed:0; field:umode_t mode; offset:40; size:8; signed:0; Then the trace event filter fails to set the string acceptable flag (FILTER_PTR_STRING) to the field and rejects setting string filter; # echo 'filename.ustring ~ "*ftracetest-dir.wbx24v*"' \ >> events/syscalls/sys_enter_openat/filter sh: write error: Invalid argument # cat error_log [ 723.743637] event filter parse error: error: Expecting numeric field Command: filename.ustring ~ "*ftracetest-dir.wbx24v*" Since this __attribute__ makes format parsing complicated and not needed, remove the __attribute__(.*) from the type string. 
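Concretely, the sanitizer introduced below reduces the tagged type from the
format dump above to a plain C type:

/* before: */ const char __attribute__((btf_type_tag("user"))) * filename;
/* after:  */ const char * filename;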
Cc: Mathieu Desnoyers Link: https://lore.kernel.org/175376583493.1688759.12333973498014733551.stgit@mhiramat.tok.corp.google.com Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 28 ++++++---- kernel/trace/trace.h | 4 +- kernel/trace/trace_events.c | 128 ++++++++++++++++++++++++++++++++++++-------- 3 files changed, 127 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 06ab5b7a8711..945a8ecf2c62 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5930,17 +5930,27 @@ static inline void trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, int len) { } #endif /* !CONFIG_TRACE_EVAL_MAP_FILE */ -static void trace_insert_eval_map(struct module *mod, - struct trace_eval_map **start, int len) +static void +trace_event_update_with_eval_map(struct module *mod, + struct trace_eval_map **start, + int len) { struct trace_eval_map **map; - if (len <= 0) - return; + /* Always run sanitizer only if btf_type_tag attr exists. */ + if (len <= 0) { + if (!(IS_ENABLED(CONFIG_DEBUG_INFO_BTF) && + IS_ENABLED(CONFIG_PAHOLE_HAS_BTF_TAG) && + __has_attribute(btf_type_tag))) + return; + } map = start; - trace_event_eval_update(map, len); + trace_event_update_all(map, len); + + if (len <= 0) + return; trace_insert_eval_map_file(mod, start, len); } @@ -10334,7 +10344,7 @@ static void __init eval_map_work_func(struct work_struct *work) int len; len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps; - trace_insert_eval_map(NULL, __start_ftrace_eval_maps, len); + trace_event_update_with_eval_map(NULL, __start_ftrace_eval_maps, len); } static int __init trace_eval_init(void) @@ -10387,9 +10397,6 @@ bool module_exists(const char *module) static void trace_module_add_evals(struct module *mod) { - if (!mod->num_trace_evals) - return; - /* * Modules with bad taint do not have events created, do * not bother with enums either. @@ -10397,7 +10404,8 @@ static void trace_module_add_evals(struct module *mod) if (trace_module_has_bad_taint(mod)) return; - trace_insert_eval_map(mod, mod->trace_evals, mod->num_trace_evals); + /* Even if no trace_evals, this need to sanitize field types. 
*/ + trace_event_update_with_eval_map(mod, mod->trace_evals, mod->num_trace_evals); } #ifdef CONFIG_TRACE_EVAL_MAP_FILE diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index bd084953a98b..1dbf1d3cf2f1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -2125,13 +2125,13 @@ static inline const char *get_syscall_name(int syscall) #ifdef CONFIG_EVENT_TRACING void trace_event_init(void); -void trace_event_eval_update(struct trace_eval_map **map, int len); +void trace_event_update_all(struct trace_eval_map **map, int len); /* Used from boot time tracer */ extern int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set); extern int trigger_process_regex(struct trace_event_file *file, char *buff); #else static inline void __init trace_event_init(void) { } -static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { } +static inline void trace_event_update_all(struct trace_eval_map **map, int len) { } #endif #ifdef CONFIG_TRACER_SNAPSHOT diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 6c0783fc4c2c..05447b958a1a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3264,43 +3264,120 @@ static void add_str_to_module(struct module *module, char *str) list_add(&modstr->next, &module_strings); } +#define ATTRIBUTE_STR "__attribute__(" +#define ATTRIBUTE_STR_LEN (sizeof(ATTRIBUTE_STR) - 1) + +/* Remove all __attribute__() from @type. Return allocated string or @type. */ +static char *sanitize_field_type(const char *type) +{ + char *attr, *tmp, *next, *ret = (char *)type; + int depth; + + next = (char *)type; + while ((attr = strstr(next, ATTRIBUTE_STR))) { + /* Retry if "__attribute__(" is a part of another word. */ + if (attr != next && !isspace(attr[-1])) { + next = attr + ATTRIBUTE_STR_LEN; + continue; + } + + if (ret == type) { + ret = kstrdup(type, GFP_KERNEL); + if (WARN_ON_ONCE(!ret)) + return NULL; + attr = ret + (attr - type); + } + + /* the ATTRIBUTE_STR already has the first '(' */ + depth = 1; + next = attr + ATTRIBUTE_STR_LEN; + do { + tmp = strpbrk(next, "()"); + /* There is unbalanced parentheses */ + if (WARN_ON_ONCE(!tmp)) { + kfree(ret); + return (char *)type; + } + + if (*tmp == '(') + depth++; + else + depth--; + next = tmp + 1; + } while (depth > 0); + next = skip_spaces(next); + strcpy(attr, next); + next = attr; + } + return ret; +} + +static char *find_replacable_eval(const char *type, const char *eval_string, + int len) +{ + char *ptr; + + if (!eval_string) + return NULL; + + ptr = strchr(type, '['); + if (!ptr) + return NULL; + ptr++; + + if (!isalpha(*ptr) && *ptr != '_') + return NULL; + + if (strncmp(eval_string, ptr, len) != 0) + return NULL; + + return ptr; +} + static void update_event_fields(struct trace_event_call *call, struct trace_eval_map *map) { struct ftrace_event_field *field; + const char *eval_string = NULL; struct list_head *head; + int len = 0; char *ptr; char *str; - int len = strlen(map->eval_string); /* Dynamic events should never have field maps */ - if (WARN_ON_ONCE(call->flags & TRACE_EVENT_FL_DYNAMIC)) + if (call->flags & TRACE_EVENT_FL_DYNAMIC) return; + if (map) { + eval_string = map->eval_string; + len = strlen(map->eval_string); + } + head = trace_get_fields(call); list_for_each_entry(field, head, link) { - ptr = strchr(field->type, '['); - if (!ptr) - continue; - ptr++; - - if (!isalpha(*ptr) && *ptr != '_') - continue; + str = sanitize_field_type(field->type); + if (!str) + return; - if (strncmp(map->eval_string, ptr, len) != 0) - 
continue; + ptr = find_replacable_eval(str, eval_string, len); + if (ptr) { + if (str == field->type) { + str = kstrdup(field->type, GFP_KERNEL); + if (WARN_ON_ONCE(!str)) + return; + ptr = str + (ptr - field->type); + } - str = kstrdup(field->type, GFP_KERNEL); - if (WARN_ON_ONCE(!str)) - return; - ptr = str + (ptr - field->type); - ptr = eval_replace(ptr, map, len); - /* enum/sizeof string smaller than value */ - if (WARN_ON_ONCE(!ptr)) { - kfree(str); - continue; + ptr = eval_replace(ptr, map, len); + /* enum/sizeof string smaller than value */ + if (WARN_ON_ONCE(!ptr)) { + kfree(str); + continue; + } } + if (str == field->type) + continue; /* * If the event is part of a module, then we need to free the string * when the module is removed. Otherwise, it will stay allocated @@ -3310,14 +3387,18 @@ static void update_event_fields(struct trace_event_call *call, add_str_to_module(call->module, str); field->type = str; + if (field->filter_type == FILTER_OTHER) + field->filter_type = filter_assign_type(field->type); } } -void trace_event_eval_update(struct trace_eval_map **map, int len) +/* Update all events for replacing eval and sanitizing */ +void trace_event_update_all(struct trace_eval_map **map, int len) { struct trace_event_call *call, *p; const char *last_system = NULL; bool first = false; + bool updated; int last_i; int i; @@ -3330,6 +3411,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) last_system = call->class->system; } + updated = false; /* * Since calls are grouped by systems, the likelihood that the * next call in the iteration belongs to the same system as the @@ -3349,8 +3431,12 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) } update_event_printk(call, map[i]); update_event_fields(call, map[i]); + updated = true; } } + /* If not updated yet, update field for sanitizing. */ + if (!updated) + update_event_fields(call, NULL); cond_resched(); } up_write(&trace_event_sem); -- cgit v1.2.3 From 71753c6ed2bf2aee5be26c1bc06a94c9e3713ade Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 29 Jul 2025 14:23:05 -0400 Subject: unwind_user: Add user space unwinding API with frame pointer support Introduce a generic API for unwinding user stacks. In order to expand user space unwinding to be able to handle more complex scenarios, such as deferred unwinding and reading user space information, create a generic interface that all architectures can use that support the various unwinding methods. This is an alternative method for handling user space stack traces from the simple stack_trace_save_user() API. This does not replace that interface, but this interface will be used to expand the functionality of user space stack walking. None of the structures introduced will be exposed to user space tooling. Support for frame pointer unwinding is added. For an architecture to support frame pointer unwinding it needs to enable CONFIG_HAVE_UNWIND_USER_FP and define ARCH_INIT_USER_FP_FRAME. By encoding the frame offsets in struct unwind_user_frame, much of this code can also be reused for future unwinder implementations like sframe. Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Namhyung Kim Cc: Thomas Gleixner Cc: Andrii Nakryiko Cc: Indu Bhagat Cc: "Jose E. 
Marchesi" Cc: Beau Belgrave Cc: Linus Torvalds Cc: Andrew Morton Cc: Jens Axboe Cc: Florian Weimer Cc: Sam James Link: https://lore.kernel.org/20250729182404.975790139@kernel.org Reviewed-by: Jens Remus Signed-off-by: Josh Poimboeuf Co-developed-by: Mathieu Desnoyers Link: https://lore.kernel.org/all/20250710164301.3094-2-mathieu.desnoyers@efficios.com/ Signed-off-by: Mathieu Desnoyers Co-developed-by: Steven Rostedt (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/Makefile | 1 + kernel/unwind/Makefile | 1 + kernel/unwind/user.c | 128 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 130 insertions(+) create mode 100644 kernel/unwind/Makefile create mode 100644 kernel/unwind/user.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 32e80dd626af..541186050251 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -55,6 +55,7 @@ obj-y += rcu/ obj-y += livepatch/ obj-y += dma/ obj-y += entry/ +obj-y += unwind/ obj-$(CONFIG_MODULES) += module/ obj-$(CONFIG_KCMP) += kcmp.o diff --git a/kernel/unwind/Makefile b/kernel/unwind/Makefile new file mode 100644 index 000000000000..349ce3677526 --- /dev/null +++ b/kernel/unwind/Makefile @@ -0,0 +1 @@ + obj-$(CONFIG_UNWIND_USER) += user.o diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c new file mode 100644 index 000000000000..97a8415e3216 --- /dev/null +++ b/kernel/unwind/user.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0 +/* +* Generic interfaces for unwinding user space +*/ +#include +#include +#include +#include +#include + +static const struct unwind_user_frame fp_frame = { + ARCH_INIT_USER_FP_FRAME +}; + +#define for_each_user_frame(state) \ + for (unwind_user_start(state); !(state)->done; unwind_user_next(state)) + +static int unwind_user_next_fp(struct unwind_user_state *state) +{ + const struct unwind_user_frame *frame = &fp_frame; + unsigned long cfa, fp, ra; + unsigned int shift; + + if (frame->use_fp) { + if (state->fp < state->sp) + return -EINVAL; + cfa = state->fp; + } else { + cfa = state->sp; + } + + /* Get the Canonical Frame Address (CFA) */ + cfa += frame->cfa_off; + + /* stack going in wrong direction? */ + if (cfa <= state->sp) + return -EINVAL; + + /* Make sure that the address is word aligned */ + shift = sizeof(long) == 4 ? 2 : 3; + if (cfa & ((1 << shift) - 1)) + return -EINVAL; + + /* Find the Return Address (RA) */ + if (get_user(ra, (unsigned long *)(cfa + frame->ra_off))) + return -EINVAL; + + if (frame->fp_off && get_user(fp, (unsigned long __user *)(cfa + frame->fp_off))) + return -EINVAL; + + state->ip = ra; + state->sp = cfa; + if (frame->fp_off) + state->fp = fp; + return 0; +} + +static int unwind_user_next(struct unwind_user_state *state) +{ + unsigned long iter_mask = state->available_types; + unsigned int bit; + + if (state->done) + return -EINVAL; + + for_each_set_bit(bit, &iter_mask, NR_UNWIND_USER_TYPE_BITS) { + enum unwind_user_type type = BIT(bit); + + state->current_type = type; + switch (type) { + case UNWIND_USER_TYPE_FP: + if (!unwind_user_next_fp(state)) + return 0; + continue; + default: + WARN_ONCE(1, "Undefined unwind bit %d", bit); + break; + } + break; + } + + /* No successful unwind method. 
*/ + state->current_type = UNWIND_USER_TYPE_NONE; + state->done = true; + return -EINVAL; +} + +static int unwind_user_start(struct unwind_user_state *state) +{ + struct pt_regs *regs = task_pt_regs(current); + + memset(state, 0, sizeof(*state)); + + if ((current->flags & PF_KTHREAD) || !user_mode(regs)) { + state->done = true; + return -EINVAL; + } + + if (IS_ENABLED(CONFIG_HAVE_UNWIND_USER_FP)) + state->available_types |= UNWIND_USER_TYPE_FP; + + state->ip = instruction_pointer(regs); + state->sp = user_stack_pointer(regs); + state->fp = frame_pointer(regs); + + return 0; +} + +int unwind_user(struct unwind_stacktrace *trace, unsigned int max_entries) +{ + struct unwind_user_state state; + + trace->nr = 0; + + if (!max_entries) + return -EINVAL; + + if (current->flags & PF_KTHREAD) + return 0; + + for_each_user_frame(&state) { + trace->entries[trace->nr++] = state.ip; + if (trace->nr >= max_entries) + break; + } + + return 0; +} -- cgit v1.2.3 From 5e32d0f15cc5c843a4115c4644d984d42524c794 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 29 Jul 2025 14:23:06 -0400 Subject: unwind_user/deferred: Add unwind_user_faultable() Add a new API to retrieve a user space callstack called unwind_user_faultable(). The difference between this user space stack tracer from the current user space stack tracer is that this must be called from faultable context as it may use routines to access user space data that needs to be faulted in. It can be safely called from entering or exiting a system call as the code can still be faulted in there. This code is based on work by Josh Poimboeuf's deferred unwinding code: Link: https://lore.kernel.org/all/6052e8487746603bdb29b65f4033e739092d9925.1737511963.git.jpoimboe@kernel.org/ Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Namhyung Kim Cc: Thomas Gleixner Cc: Andrii Nakryiko Cc: Indu Bhagat Cc: "Jose E. Marchesi" Cc: Beau Belgrave Cc: Linus Torvalds Cc: Andrew Morton Cc: Jens Axboe Cc: Florian Weimer Cc: Sam James Link: https://lore.kernel.org/20250729182405.147896868@kernel.org Reviewed-by: Jens Remus Signed-off-by: Steven Rostedt (Google) --- kernel/fork.c | 4 ++++ kernel/unwind/Makefile | 2 +- kernel/unwind/deferred.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 kernel/unwind/deferred.c (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 1ee8eb11f38b..3341d50c61f2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -105,6 +105,7 @@ #include #include #include +#include #include #include @@ -732,6 +733,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(refcount_read(&tsk->usage)); WARN_ON(tsk == current); + unwind_task_free(tsk); sched_ext_free(tsk); io_uring_free(tsk); cgroup_free(tsk); @@ -2135,6 +2137,8 @@ __latent_entropy struct task_struct *copy_process( p->bpf_ctx = NULL; #endif + unwind_task_init(p); + /* Perform scheduler related setup. Assign this task to a CPU. 
 */
	retval = sched_fork(clone_flags, p);
	if (retval)
diff --git a/kernel/unwind/Makefile b/kernel/unwind/Makefile
index 349ce3677526..eae37bea54fd 100644
--- a/kernel/unwind/Makefile
+++ b/kernel/unwind/Makefile
@@ -1 +1 @@
- obj-$(CONFIG_UNWIND_USER) += user.o
+ obj-$(CONFIG_UNWIND_USER) += user.o deferred.o
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
new file mode 100644
index 000000000000..a0badbeb3cc1
--- /dev/null
+++ b/kernel/unwind/deferred.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Deferred user space unwinding
+ */
+#include
+#include
+#include
+#include
+
+#define UNWIND_MAX_ENTRIES	512
+
+/**
+ * unwind_user_faultable - Produce a user stacktrace in faultable context
+ * @trace: The descriptor that will store the user stacktrace
+ *
+ * This must be called in a known faultable context (usually when entering
+ * or exiting user space). Depending on the available implementations
+ * the @trace will be loaded with the addresses of the user space stacktrace
+ * if it can be found.
+ *
+ * Return: 0 on success and negative on error
+ *         On success @trace will contain the user space stacktrace
+ */
+int unwind_user_faultable(struct unwind_stacktrace *trace)
+{
+	struct unwind_task_info *info = &current->unwind_info;
+
+	/* Should always be called from faultable context */
+	might_fault();
+
+	if (current->flags & PF_EXITING)
+		return -EINVAL;
+
+	if (!info->entries) {
+		info->entries = kmalloc_array(UNWIND_MAX_ENTRIES, sizeof(long),
+					      GFP_KERNEL);
+		if (!info->entries)
+			return -ENOMEM;
+	}
+
+	trace->nr = 0;
+	trace->entries = info->entries;
+	unwind_user(trace, UNWIND_MAX_ENTRIES);
+
+	return 0;
+}
+
+void unwind_task_init(struct task_struct *task)
+{
+	struct unwind_task_info *info = &task->unwind_info;
+
+	memset(info, 0, sizeof(*info));
+}
+
+void unwind_task_free(struct task_struct *task)
+{
+	struct unwind_task_info *info = &task->unwind_info;
+
+	kfree(info->entries);
+}
-- cgit v1.2.3

From 6443cdf567a900e03afe1d66fb8bcc7dad0835d0 Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Mon, 14 Jul 2025 17:08:58 +0100
Subject: ring-buffer: Make the const read-only 'type' static

Don't populate the read-only 'type' on the stack at run time, instead
make it static.

Cc: Masami Hiramatsu
Cc: Mathieu Desnoyers
Link: https://lore.kernel.org/20250714160858.1234719-1-colin.i.king@gmail.com
Signed-off-by: Colin Ian King
Signed-off-by: Steven Rostedt (Google)
---
 kernel/trace/ring_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 903d9db75e12..5176e0270f07 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4216,7 +4216,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
 
 static const char *show_irq_str(int bits)
 {
-	const char *type[] = {
+	static const char * type[] = {
 		".",	// 0
 		"s",	// 1
 		"h",	// 2
-- cgit v1.2.3

From 0dd1274a053f9ede97e3f3269b5012372567e521 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Wed, 30 Jul 2025 10:07:54 -0400
Subject: tracing: Have eprobes have their own config option

Eprobes were added in 5.15 and were selected whenever any of the other
probe events were selected. If kprobe events were enabled (which they are
by default if kprobes are enabled) it would enable eprobe events as well.
The same for uprobes and fprobes.

Have eprobes have their own config option that gets enabled by default
if tracing is enabled.
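As a usage sketch (the event-probe syntax below follows the kernel's
dynamic-events documentation, but the exact fetch-arg spelling should be
double-checked there before relying on it), an eprobe attaches on top of an
existing event and can reinterpret its fields:

# Hypothetical example: turn the filename pointer of sys_enter_openat
# into a string field on a derived event.
echo 'e:myprobes/open_file syscalls.sys_enter_openat file=+0($filename):ustring' \
	>> /sys/kernel/tracing/dynamic_events
echo 1 > /sys/kernel/tracing/events/myprobes/open_file/enable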
Link: https://lore.kernel.org/all/20250729102636.b7cce553e7cc263722b12365@kernel.org/ Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Namhyung Kim Cc: Jonathan Corbet Cc: Randy Dunlap Link: https://lore.kernel.org/20250730140945.360286733@kernel.org Suggested-by: Masami Hiramatsu (Google) Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/Kconfig | 14 ++++++++++++++ kernel/trace/Makefile | 2 +- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 93e8e7fc11c0..f80298e6aa16 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -792,6 +792,20 @@ config UPROBE_EVENTS This option is required if you plan to use perf-probe subcommand of perf tools on user space applications. +config EPROBE_EVENTS + bool "Enable event-based dynamic events" + depends on TRACING + depends on HAVE_REGS_AND_STACK_ACCESS_API + select PROBE_EVENTS + select DYNAMIC_EVENTS + default y + help + Eprobes are dynamic events that can be placed on other existing + events. It can be used to limit what fields are recorded in + an event or even dereference a field of an event. It can + convert the type of an event field. For example, turn an + address into a string. + config BPF_EVENTS depends on BPF_SYSCALL depends on (KPROBE_EVENTS || UPROBE_EVENTS) && PERF_EVENTS diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 057cd975d014..dcb4e02afc5f 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -82,7 +82,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o -obj-$(CONFIG_PROBE_EVENTS) += trace_eprobe.o +obj-$(CONFIG_EPROBE_EVENTS) += trace_eprobe.o obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o -- cgit v1.2.3 From 199d9ffb31650f948dd342ade1c1b920e157630f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Fri, 11 Jul 2025 15:31:36 +0200 Subject: module: move 'struct module_use' to internal.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The struct was moved to the public header file in commit c8e21ced08b3 ("module: fix kdb's illicit use of struct module_use."). Back then the structure was used outside of the module core. Nowadays this is not true anymore, so the structure can be made internal. 
Signed-off-by: Thomas Weißschuh Reviewed-by: Daniel Gomez Reviewed-by: Petr Pavlu Link: https://lore.kernel.org/r/20250711-kunit-ifdef-modules-v2-1-39443decb1f8@linutronix.de Signed-off-by: Daniel Gomez --- kernel/module/internal.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/module/internal.h b/kernel/module/internal.h index 51ddd8866ef3..618202578b42 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -112,6 +112,13 @@ struct find_symbol_arg { enum mod_license license; }; +/* modules using other modules */ +struct module_use { + struct list_head source_list; + struct list_head target_list; + struct module *source, *target; +}; + int mod_verify_sig(const void *mod, struct load_info *info); int try_to_force_load(struct module *mod, const char *reason); bool find_symbol(struct find_symbol_arg *fsa); -- cgit v1.2.3 From a6323bd4e611567913e23df5b58f2d4e4da06789 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Mon, 30 Jun 2025 16:32:32 +0200 Subject: module: Prevent silent truncation of module name in delete_module(2) Passing a module name longer than MODULE_NAME_LEN to the delete_module syscall results in its silent truncation. This really isn't much of a problem in practice, but it could theoretically lead to the removal of an incorrect module. It is more sensible to return ENAMETOOLONG or ENOENT in such a case. Update the syscall to return ENOENT, as documented in the delete_module(2) man page to mean "No module by that name exists." This is appropriate because a module with a name longer than MODULE_NAME_LEN cannot be loaded in the first place. Signed-off-by: Petr Pavlu Reviewed-by: Daniel Gomez Link: https://lore.kernel.org/r/20250630143535.267745-2-petr.pavlu@suse.com Signed-off-by: Daniel Gomez --- kernel/module/main.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module/main.c b/kernel/module/main.c index 81f9df8859dc..120e51550a88 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -779,14 +779,16 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, struct module *mod; char name[MODULE_NAME_LEN]; char buf[MODULE_FLAGS_BUF_SIZE]; - int ret, forced = 0; + int ret, len, forced = 0; if (!capable(CAP_SYS_MODULE) || modules_disabled) return -EPERM; - if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0) - return -EFAULT; - name[MODULE_NAME_LEN-1] = '\0'; + len = strncpy_from_user(name, name_user, MODULE_NAME_LEN); + if (len == 0 || len == MODULE_NAME_LEN) + return -ENOENT; + if (len < 0) + return len; audit_log_kern_module(name); -- cgit v1.2.3 From 6c171b2ccfe677ca97fc5334f853807959f26589 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Mon, 30 Jun 2025 16:32:33 +0200 Subject: module: Remove unnecessary +1 from last_unloaded_module::name size The variable last_unloaded_module::name tracks the name of the last unloaded module. It is a string copy of module::name, which is MODULE_NAME_LEN bytes in size and includes the NUL terminator. Therefore, the size of last_unloaded_module::name can also be just MODULE_NAME_LEN, without the need for an extra byte. 
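Returning to the delete_module(2) change above: the strncpy_from_user()
return value cleanly separates the three cases of interest. Restated with
comments (the same logic as in that diff, annotated here for clarity):

	len = strncpy_from_user(name, name_user, MODULE_NAME_LEN);
	if (len == 0 || len == MODULE_NAME_LEN)
		return -ENOENT;	/* empty name, or no NUL within the buffer:
				 * no loadable module can have such a name */
	if (len < 0)
		return len;	/* -EFAULT from the failed user access */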
Fixes: e14af7eeb47e ("debug: track and print last unloaded module in the oops trace")
Signed-off-by: Petr Pavlu
Reviewed-by: Daniel Gomez
Link: https://lore.kernel.org/r/20250630143535.267745-3-petr.pavlu@suse.com
Signed-off-by: Daniel Gomez
---
 kernel/module/main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'kernel')

diff --git a/kernel/module/main.c b/kernel/module/main.c
index 120e51550a88..7f8bb51aedd4 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -608,7 +608,7 @@ MODINFO_ATTR(version);
 MODINFO_ATTR(srcversion);
 
 static struct {
-	char name[MODULE_NAME_LEN + 1];
+	char name[MODULE_NAME_LEN];
 	char taints[MODULE_FLAGS_BUF_SIZE];
 } last_unloaded_module;
-- cgit v1.2.3

From a7c54b2b41dd1f6ec780e7fbfb13f70c64c9731d Mon Sep 17 00:00:00 2001
From: Petr Pavlu
Date: Mon, 30 Jun 2025 16:32:35 +0200
Subject: tracing: Replace MAX_PARAM_PREFIX_LEN with MODULE_NAME_LEN

Use the MODULE_NAME_LEN definition in module_exists() to obtain the
maximum size of a module name, instead of using MAX_PARAM_PREFIX_LEN.
The values are the same but MODULE_NAME_LEN is more appropriate in this
context. MAX_PARAM_PREFIX_LEN was added in commit 730b69d22525 ("module:
check kernel param length at compile time, not runtime") only to break a
circular dependency between module.h and moduleparam.h, and should mostly
be limited to use in moduleparam.h.

Signed-off-by: Petr Pavlu
Cc: Steven Rostedt
Cc: Masami Hiramatsu
Reviewed-by: Daniel Gomez
Acked-by: Steven Rostedt (Google)
Link: https://lore.kernel.org/r/20250630143535.267745-5-petr.pavlu@suse.com
Signed-off-by: Daniel Gomez
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7996f26c3f46..3112ac128145 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -10367,7 +10367,7 @@ bool module_exists(const char *module)
 {
 	/* All modules have the symbol __this_module */
 	static const char this_mod[] = "__this_module";
-	char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2];
+	char modname[MODULE_NAME_LEN + sizeof(this_mod) + 2];
 	unsigned long val;
 	int n;
-- cgit v1.2.3

From b9c73524106e1c0c857006fb9ff2e5a510dc4021 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf
Date: Tue, 29 Jul 2025 14:23:07 -0400
Subject: unwind_user/deferred: Add unwind cache

Cache the results of the unwind to ensure the unwind is only performed
once, even when called by multiple tracers. The cache nr_entries gets
cleared every time the task exits the kernel. When a stacktrace is
requested, nr_entries gets set to the number of entries in the
stacktrace. If another stacktrace is requested and nr_entries is not
zero, then the cache already holds the stacktrace that would be
retrieved, so it is not processed again and the cached entries are
given to the caller.

Cc: Masami Hiramatsu
Cc: Mathieu Desnoyers
Cc: Peter Zijlstra
Cc: Ingo Molnar
Cc: Jiri Olsa
Cc: Arnaldo Carvalho de Melo
Cc: Namhyung Kim
Cc: Thomas Gleixner
Cc: Andrii Nakryiko
Cc: Indu Bhagat
Cc: "Jose E. Marchesi"
Cc: Beau Belgrave
Cc: Linus Torvalds
Cc: Andrew Morton
Cc: Jens Axboe
Cc: Florian Weimer
Cc: Sam James
Link: https://lore.kernel.org/20250729182405.319691167@kernel.org
Reviewed-by: Jens Remus
Reviewed-By: Indu Bhagat
Co-developed-by: Steven Rostedt (Google)
Signed-off-by: Josh Poimboeuf
Signed-off-by: Steven Rostedt (Google)
---
 kernel/unwind/deferred.c | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)
(limited to 'kernel')

diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
index a0badbeb3cc1..96368a5aa522 100644
--- a/kernel/unwind/deferred.c
+++ b/kernel/unwind/deferred.c
@@ -4,10 +4,13 @@
  */
 #include
 #include
+#include
 #include
 #include
 
-#define UNWIND_MAX_ENTRIES	512
+/* Make the cache fit in a 4K page */
+#define UNWIND_MAX_ENTRIES					\
+	((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long))
 
 /**
  * unwind_user_faultable - Produce a user stacktrace in faultable context
@@ -24,6 +27,7 @@ int unwind_user_faultable(struct unwind_stacktrace *trace)
 {
 	struct unwind_task_info *info = &current->unwind_info;
+	struct unwind_cache *cache;
 
 	/* Should always be called from faultable context */
 	might_fault();
@@ -31,17 +35,30 @@ int unwind_user_faultable(struct unwind_stacktrace *trace)
 	if (current->flags & PF_EXITING)
 		return -EINVAL;
 
-	if (!info->entries) {
-		info->entries = kmalloc_array(UNWIND_MAX_ENTRIES, sizeof(long),
-					      GFP_KERNEL);
-		if (!info->entries)
+	if (!info->cache) {
+		info->cache = kzalloc(struct_size(cache, entries, UNWIND_MAX_ENTRIES),
+				      GFP_KERNEL);
+		if (!info->cache)
 			return -ENOMEM;
 	}
 
+	cache = info->cache;
+	trace->entries = cache->entries;
+
+	if (cache->nr_entries) {
+		/*
+		 * The user stack has already been previously unwound in this
+		 * entry context. Skip the unwind and use the cache.
+		 */
+		trace->nr = cache->nr_entries;
+		return 0;
+	}
+
 	trace->nr = 0;
-	trace->entries = info->entries;
 	unwind_user(trace, UNWIND_MAX_ENTRIES);
 
+	cache->nr_entries = trace->nr;
+
 	return 0;
 }
 
@@ -56,5 +73,5 @@ void unwind_task_free(struct task_struct *task)
 {
 	struct unwind_task_info *info = &task->unwind_info;
 
-	kfree(info->entries);
+	kfree(info->cache);
 }
-- cgit v1.2.3

From 2dffa355f6c279e7d2e574abf9446c41a631c9e5 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf
Date: Tue, 29 Jul 2025 14:23:08 -0400
Subject: unwind_user/deferred: Add deferred unwinding interface

Add an interface for scheduling task work to unwind the user space stack
before returning to user space. This solves several problems for its
callers:

  - Ensure the unwind happens in task context even if the caller may be
    running in interrupt context.

  - Avoid duplicate unwinds, whether called multiple times by the same
    caller or by different callers.

  - Create a "context cookie" which allows trace post-processing to
    correlate kernel unwinds/traces with the user unwind.

A concept of a "cookie" is created to detect when the stacktrace is the
same. A cookie is generated the first time a user space stacktrace is
requested after the task enters the kernel. As the stacktrace is saved
on the task_struct while the task is in the kernel, if another request
comes in and the cookie is still the same, it will use the saved
stacktrace and not have to regenerate one.

The cookie is passed to the caller on request, and when the stacktrace is
generated upon returning to user space, it calls the requester's callback
with the cookie as well as the stacktrace.

The cookie is cleared when it goes back to user space.
Note, this currently adds another conditional to the unwind_reset_info() path that is always called when returning to user space, but future changes will put this back to a single conditional. A global list, protected by a global mutex, holds the tracers that register with the unwind infrastructure. The number of registered tracers will be limited in future changes. Each perf program or ftrace instance will register its own descriptor to use for deferred unwind stack traces. Note, in the function unwind_deferred_task_work() that gets called when returning to user space, it uses a global mutex for synchronization which will cause a big bottleneck. This will be replaced by SRCU, but that change adds some complex synchronization that deserves its own commit. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Namhyung Kim Cc: Thomas Gleixner Cc: Andrii Nakryiko Cc: Indu Bhagat Cc: "Jose E. Marchesi" Cc: Beau Belgrave Cc: Jens Remus Cc: Linus Torvalds Cc: Andrew Morton Cc: Jens Axboe Cc: Florian Weimer Cc: Sam James Link: https://lore.kernel.org/20250729182405.488066537@kernel.org Co-developed-by: Steven Rostedt (Google) Signed-off-by: Josh Poimboeuf Signed-off-by: Steven Rostedt (Google) --- kernel/unwind/deferred.c | 156 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 155 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c index 96368a5aa522..2cbae2ada309 100644 --- a/kernel/unwind/deferred.c +++ b/kernel/unwind/deferred.c @@ -2,16 +2,63 @@ /* * Deferred user space unwinding */ +#include +#include +#include +#include #include #include #include #include -#include +#include /* Make the cache fit in a 4K page */ #define UNWIND_MAX_ENTRIES \ ((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long)) +/* Guards adding to and reading the list of callbacks */ +static DEFINE_MUTEX(callback_mutex); +static LIST_HEAD(callbacks); + +/* + * This is a unique percpu identifier for a given task entry context. + * Conceptually, it's incremented every time the CPU enters the kernel from + * user space, so that each "entry context" on the CPU gets a unique ID. In + * reality, as an optimization, it's only incremented on demand for the first + * deferred unwind request after a given entry-from-user. + * + * It's combined with the CPU id to make a systemwide-unique "context cookie". + */ +static DEFINE_PER_CPU(u32, unwind_ctx_ctr); + +/* + * The context cookie is a unique identifier that is assigned to a user + * space stacktrace. As the user space stacktrace remains the same while + * the task is in the kernel, the cookie is an identifier for the stacktrace. + * Although it is possible for the stacktrace to get another cookie if another + * request is made after the cookie was cleared and before reentering user + * space.
+ */ +static u64 get_cookie(struct unwind_task_info *info) +{ + u32 cnt = 1; + u32 old = 0; + + if (info->id.cpu) + return info->id.id; + + /* LSB is always set to ensure 0 is an invalid value */ + cnt |= __this_cpu_read(unwind_ctx_ctr) + 2; + if (try_cmpxchg(&info->id.cnt, &old, cnt)) { + /* Update the per cpu counter */ + __this_cpu_write(unwind_ctx_ctr, cnt); + } + /* Interrupts are disabled, the CPU will always be same */ + info->id.cpu = smp_processor_id() + 1; /* Must be non zero */ + + return info->id.id; +} + /** * unwind_user_faultable - Produce a user stacktrace in faultable context * @trace: The descriptor that will store the user stacktrace @@ -62,11 +109,117 @@ int unwind_user_faultable(struct unwind_stacktrace *trace) return 0; } +static void unwind_deferred_task_work(struct callback_head *head) +{ + struct unwind_task_info *info = container_of(head, struct unwind_task_info, work); + struct unwind_stacktrace trace; + struct unwind_work *work; + u64 cookie; + + if (WARN_ON_ONCE(!info->pending)) + return; + + /* Allow work to come in again */ + WRITE_ONCE(info->pending, 0); + + /* + * From here on out, the callback must always be called, even if it's + * just an empty trace. + */ + trace.nr = 0; + trace.entries = NULL; + + unwind_user_faultable(&trace); + + cookie = info->id.id; + + guard(mutex)(&callback_mutex); + list_for_each_entry(work, &callbacks, list) { + work->func(work, &trace, cookie); + } +} + +/** + * unwind_deferred_request - Request a user stacktrace on task kernel exit + * @work: Unwind descriptor requesting the trace + * @cookie: The cookie of the first request made for this task + * + * Schedule a user space unwind to be done in task work before exiting the + * kernel. + * + * The returned @cookie output is the generated cookie of the very first + * request for a user space stacktrace for this task since it entered the + * kernel. It can be from a request by any caller of this infrastructure. + * Its value will also be passed to the callback function. It can be + * used to stitch kernel and user stack traces together in post-processing. + * + * It's valid to call this function multiple times for the same @work within + * the same task entry context. Each call will return the same cookie + * while the task hasn't left the kernel. If the callback is not pending + * because it has already been previously called for the same entry context, + * it will be called again with the same stack trace and cookie. + * + * Return: 1 if the the callback was already queued. + * 0 if the callback successfully was queued. + * Negative if there's an error. + * @cookie holds the cookie of the first request by any user + */ +int unwind_deferred_request(struct unwind_work *work, u64 *cookie) +{ + struct unwind_task_info *info = ¤t->unwind_info; + int ret; + + *cookie = 0; + + if (WARN_ON_ONCE(in_nmi())) + return -EINVAL; + + if ((current->flags & (PF_KTHREAD | PF_EXITING)) || + !user_mode(task_pt_regs(current))) + return -EINVAL; + + guard(irqsave)(); + + *cookie = get_cookie(info); + + /* callback already pending? */ + if (info->pending) + return 1; + + /* The work has been claimed, now schedule it. 
*/ + ret = task_work_add(current, &info->work, TWA_RESUME); + if (WARN_ON_ONCE(ret)) + return ret; + + info->pending = 1; + return 0; +} + +void unwind_deferred_cancel(struct unwind_work *work) +{ + if (!work) + return; + + guard(mutex)(&callback_mutex); + list_del(&work->list); +} + +int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) +{ + memset(work, 0, sizeof(*work)); + + guard(mutex)(&callback_mutex); + list_add(&work->list, &callbacks); + work->func = func; + return 0; +} + void unwind_task_init(struct task_struct *task) { struct unwind_task_info *info = &task->unwind_info; memset(info, 0, sizeof(*info)); + init_task_work(&info->work, unwind_deferred_task_work); } void unwind_task_free(struct task_struct *task) @@ -74,4 +227,5 @@ void unwind_task_free(struct task_struct *task) struct unwind_task_info *info = &task->unwind_info; kfree(info->cache); + task_work_cancel(task, &info->work); } -- cgit v1.2.3 From 055c7060e7ca71bb86da616158fc74254730ae2a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 29 Jul 2025 14:23:09 -0400 Subject: unwind_user/deferred: Make unwind deferral requests NMI-safe Make unwind_deferred_request() NMI-safe so tracers in NMI context can call it and safely request a user space stacktrace when the task exits. Note, this is only allowed for architectures that implement a safe cmpxchg. If an architecture requests a deferred stack trace from NMI context that does not support a safe NMI cmpxchg, it will get an -EINVAL and trigger a warning. For those architectures, they would need another method (perhaps an irqwork), to request a deferred user space stack trace. That can be dealt with later if one of theses architectures require this feature. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Josh Poimboeuf Cc: Ingo Molnar Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Namhyung Kim Cc: Thomas Gleixner Cc: Andrii Nakryiko Cc: Indu Bhagat Cc: "Jose E. Marchesi" Cc: Beau Belgrave Cc: Jens Remus Cc: Linus Torvalds Cc: Andrew Morton Cc: Jens Axboe Cc: Florian Weimer Cc: Sam James Link: https://lore.kernel.org/20250729182405.657072238@kernel.org Suggested-by: Peter Zijlstra Signed-off-by: Steven Rostedt (Google) --- kernel/unwind/deferred.c | 52 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c index 2cbae2ada309..c5ac087d2396 100644 --- a/kernel/unwind/deferred.c +++ b/kernel/unwind/deferred.c @@ -12,6 +12,31 @@ #include #include +/* + * For requesting a deferred user space stack trace from NMI context + * the architecture must support a safe cmpxchg in NMI context. + * For those architectures that do not have that, then it cannot ask + * for a deferred user space stack trace from an NMI context. If it + * does, then it will get -EINVAL. 
+ */ +#if defined(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) +# define CAN_USE_IN_NMI 1 +static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt) +{ + u32 old = 0; + + return try_cmpxchg(&info->id.cnt, &old, cnt); +} +#else +# define CAN_USE_IN_NMI 0 +/* When NMIs are not allowed, this always succeeds */ +static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt) +{ + info->id.cnt = cnt; + return true; +} +#endif + /* Make the cache fit in a 4K page */ #define UNWIND_MAX_ENTRIES \ ((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long)) @@ -42,14 +67,13 @@ static DEFINE_PER_CPU(u32, unwind_ctx_ctr); static u64 get_cookie(struct unwind_task_info *info) { u32 cnt = 1; - u32 old = 0; if (info->id.cpu) return info->id.id; /* LSB is always set to ensure 0 is an invalid value */ cnt |= __this_cpu_read(unwind_ctx_ctr) + 2; - if (try_cmpxchg(&info->id.cnt, &old, cnt)) { + if (try_assign_cnt(info, cnt)) { /* Update the per cpu counter */ __this_cpu_write(unwind_ctx_ctr, cnt); } @@ -167,31 +191,43 @@ static void unwind_deferred_task_work(struct callback_head *head) int unwind_deferred_request(struct unwind_work *work, u64 *cookie) { struct unwind_task_info *info = ¤t->unwind_info; + long pending; int ret; *cookie = 0; - if (WARN_ON_ONCE(in_nmi())) - return -EINVAL; - if ((current->flags & (PF_KTHREAD | PF_EXITING)) || !user_mode(task_pt_regs(current))) return -EINVAL; + /* + * NMI requires having safe cmpxchg operations. + * Trigger a warning to make it obvious that an architecture + * is using this in NMI when it should not be. + */ + if (WARN_ON_ONCE(!CAN_USE_IN_NMI && in_nmi())) + return -EINVAL; + guard(irqsave)(); *cookie = get_cookie(info); /* callback already pending? */ - if (info->pending) + pending = READ_ONCE(info->pending); + if (pending) + return 1; + + /* Claim the work unless an NMI just now swooped in to do so. */ + if (!try_cmpxchg(&info->pending, &pending, 1)) return 1; /* The work has been claimed, now schedule it. */ ret = task_work_add(current, &info->work, TWA_RESUME); - if (WARN_ON_ONCE(ret)) + if (WARN_ON_ONCE(ret)) { + WRITE_ONCE(info->pending, 0); return ret; + } - info->pending = 1; return 0; } -- cgit v1.2.3 From be3d526a5b34109cecf3bc23b96f0081ad600a5b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 29 Jul 2025 14:23:10 -0400 Subject: unwind deferred: Use bitmask to determine which callbacks to call In order to know which registered callback requested a stacktrace for when the task goes back to user space, add a bitmask to keep track of all registered tracers. The bitmask is the size of long, which means that on a 32 bit machine, it can have at most 32 registered tracers, and on 64 bit, it can have at most 64 registered tracers. This should not be an issue as there should not be more than 10 (unless BPF can abuse this?). When a tracer registers with unwind_deferred_init() it will get a bit number assigned to it. When a tracer requests a stacktrace, it will have its bit set within the task_struct. When the task returns back to user space, it will call the callbacks for all the registered tracers where their bits are set in the task's mask. When a tracer is removed by the unwind_deferred_cancel() all current tasks will clear the associated bit, just in case another tracer gets registered immediately afterward and then gets their callback called unexpectedly. 
To prevent live locks that could happen when an event occurring between the task_work and the task's return to user space triggers another deferred unwind, have the unwind_mask get cleared on exit to user space and not after the callback is made. Move the pending bit from a value on the task_struct to bit zero of the unwind_mask (saves space on the task_struct). This will allow modifying the pending bit along with the work bits atomically. Instead of clearing a work's bit after its callback is called, it is delayed until exit. If the work is requested again, the task_work is not queued again and the request will be notified that the task has already been called by returning a positive number (the same as if it was already pending). The pending bit is cleared before calling the callback functions but the current work bits remain. If one of the called works registers again, it will not trigger a task_work if its bit is still present in the task's unwind_mask. If a new work requests a deferred unwind, then it will set both the pending bit and its own bit. Note this will also cause any work that was previously queued and had its callback already executed to be executed again. Future work will remove these spurious callbacks. The use of atomic_long bit operations was suggested by Peter Zijlstra: Link: https://lore.kernel.org/all/20250715102912.GQ1613200@noisy.programming.kicks-ass.net/ The unwind_mask could not be converted to atomic_long_t due to atomic_long not having all the bit operations needed by unwind_mask. Instead it follows other use cases in the kernel and just typecasts the unwind_mask to atomic_long_t when using the two atomic_long functions. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Namhyung Kim Cc: Thomas Gleixner Cc: Andrii Nakryiko Cc: Indu Bhagat Cc: "Jose E. Marchesi" Cc: Beau Belgrave Cc: Jens Remus Cc: Linus Torvalds Cc: Andrew Morton Cc: Jens Axboe Cc: Florian Weimer Cc: Sam James Link: https://lore.kernel.org/20250729182405.822789300@kernel.org Signed-off-by: Steven Rostedt (Google) --- kernel/unwind/deferred.c | 87 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 68 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c index c5ac087d2396..e19f02ef416d 100644 --- a/kernel/unwind/deferred.c +++ b/kernel/unwind/deferred.c @@ -45,6 +45,16 @@ static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt) static DEFINE_MUTEX(callback_mutex); static LIST_HEAD(callbacks); +#define RESERVED_BITS (UNWIND_PENDING) + +/* Zero'd bits are available for assigning callback users */ +static unsigned long unwind_mask = RESERVED_BITS; + +static inline bool unwind_pending(struct unwind_task_info *info) +{ + return test_bit(UNWIND_PENDING_BIT, &info->unwind_mask); +} + /* * This is a unique percpu identifier for a given task entry context.
* Conceptually, it's incremented every time the CPU enters the kernel from @@ -138,14 +148,15 @@ static void unwind_deferred_task_work(struct callback_head *head) struct unwind_task_info *info = container_of(head, struct unwind_task_info, work); struct unwind_stacktrace trace; struct unwind_work *work; + unsigned long bits; u64 cookie; - if (WARN_ON_ONCE(!info->pending)) + if (WARN_ON_ONCE(!unwind_pending(info))) return; - /* Allow work to come in again */ - WRITE_ONCE(info->pending, 0); - + /* Clear pending bit but make sure to have the current bits */ + bits = atomic_long_fetch_andnot(UNWIND_PENDING, + (atomic_long_t *)&info->unwind_mask); /* * From here on out, the callback must always be called, even if it's * just an empty trace. @@ -159,7 +170,8 @@ static void unwind_deferred_task_work(struct callback_head *head) guard(mutex)(&callback_mutex); list_for_each_entry(work, &callbacks, list) { - work->func(work, &trace, cookie); + if (test_bit(work->bit, &bits)) + work->func(work, &trace, cookie); } } @@ -183,15 +195,16 @@ static void unwind_deferred_task_work(struct callback_head *head) * because it has already been previously called for the same entry context, * it will be called again with the same stack trace and cookie. * - * Return: 1 if the the callback was already queued. - * 0 if the callback successfully was queued. + * Return: 0 if the callback successfully was queued. + * 1 if the callback is pending or was already executed. * Negative if there's an error. * @cookie holds the cookie of the first request by any user */ int unwind_deferred_request(struct unwind_work *work, u64 *cookie) { struct unwind_task_info *info = ¤t->unwind_info; - long pending; + unsigned long old, bits; + unsigned long bit = BIT(work->bit); int ret; *cookie = 0; @@ -212,32 +225,59 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie) *cookie = get_cookie(info); - /* callback already pending? */ - pending = READ_ONCE(info->pending); - if (pending) - return 1; + old = READ_ONCE(info->unwind_mask); - /* Claim the work unless an NMI just now swooped in to do so. */ - if (!try_cmpxchg(&info->pending, &pending, 1)) + /* Is this already queued or executed */ + if (old & bit) return 1; + /* + * This work's bit hasn't been set yet. Now set it with the PENDING + * bit and fetch the current value of unwind_mask. If ether the + * work's bit or PENDING was already set, then this is already queued + * to have a callback. + */ + bits = UNWIND_PENDING | bit; + old = atomic_long_fetch_or(bits, (atomic_long_t *)&info->unwind_mask); + if (old & bits) { + /* + * If the work's bit was set, whatever set it had better + * have also set pending and queued a callback. + */ + WARN_ON_ONCE(!(old & UNWIND_PENDING)); + return old & bit; + } + /* The work has been claimed, now schedule it. 
*/ ret = task_work_add(current, &info->work, TWA_RESUME); - if (WARN_ON_ONCE(ret)) { - WRITE_ONCE(info->pending, 0); - return ret; - } - return 0; + if (WARN_ON_ONCE(ret)) + WRITE_ONCE(info->unwind_mask, 0); + + return ret; } void unwind_deferred_cancel(struct unwind_work *work) { + struct task_struct *g, *t; + if (!work) return; + /* No work should be using a reserved bit */ + if (WARN_ON_ONCE(BIT(work->bit) & RESERVED_BITS)) + return; + guard(mutex)(&callback_mutex); list_del(&work->list); + + __clear_bit(work->bit, &unwind_mask); + + guard(rcu)(); + /* Clear this bit from all threads */ + for_each_process_thread(g, t) { + clear_bit(work->bit, &t->unwind_info.unwind_mask); + } } int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) @@ -245,6 +285,14 @@ int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) memset(work, 0, sizeof(*work)); guard(mutex)(&callback_mutex); + + /* See if there's a bit in the mask available */ + if (unwind_mask == ~0UL) + return -EBUSY; + + work->bit = ffz(unwind_mask); + __set_bit(work->bit, &unwind_mask); + list_add(&work->list, &callbacks); work->func = func; return 0; @@ -256,6 +304,7 @@ void unwind_task_init(struct task_struct *task) memset(info, 0, sizeof(*info)); init_task_work(&info->work, unwind_deferred_task_work); + info->unwind_mask = 0; } void unwind_task_free(struct task_struct *task) -- cgit v1.2.3 From 4c75133e745aa95636c9ccbab1603ed363dabcd4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 29 Jul 2025 14:23:11 -0400 Subject: unwind deferred: Add unwind_completed mask to stop spurious callbacks If there's more than one tracer registered with the unwind deferred infrastructure, it is currently possible that one tracer could cause extra callbacks to happen for another tracer if the former requests a deferred stacktrace after the latter's callback was executed and before the task went back to user space. Here's an example of how this could occur: [Task enters kernel] tracer 1 request -> add cookie to its buffer tracer 1 request -> add cookie to its buffer <..> [ task work executes ] tracer 1 callback -> add trace + cookie to its buffer [tracer 2 requests and triggers the task work again] [ task work executes again ] tracer 1 callback -> add trace + cookie to its buffer tracer 2 callback -> add trace + cookie to its buffer [Task exits back to user space] This is because the bit for tracer 1 gets set in the task's unwind_mask when it did its request and does not get cleared until the task returns back to user space. But if another tracer were to request another deferred stacktrace, then the next task work will execute all tracers' callbacks that have their bits set in the task's unwind_mask. To fix this issue, add another mask called unwind_completed and place it into the task's info->cache structure. The cache structure is allocated on the first occurrence of a deferred stacktrace and this unwind_completed mask is not needed until then. It's better to have it in the cache than to permanently waste space in the task_struct. After a tracer's callback is executed, its bit gets set in this unwind_completed mask. When the task_work runs, it ANDs the task's unwind_mask with the inverse of unwind_completed, which eliminates any work that already had its callback executed since the task entered the kernel. When the task leaves the kernel, it will reset this unwind_completed mask just like it resets the other values as it enters user space.
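To make the mask arithmetic concrete, assume two registered tracers that were handed bits 1 and 2 (bit 0 being UNWIND_PENDING); the bit numbers are only an example, but the fields and operations are the ones used by this patch:

        /* Tracer A (bit 1) already had its callback run in this entry context, */
        /* so info->cache->unwind_completed == BIT(1).                           */
        /* Tracer B (bit 2) then requests a trace; in the next task_work:        */
        bits = atomic_long_fetch_andnot(UNWIND_PENDING,
                                        (atomic_long_t *)&info->unwind_mask);
        /* bits == BIT(1) | BIT(2) */
        bits &= ~(info->cache->unwind_completed);
        /* bits == BIT(2): only tracer B's callback runs, and BIT(2) is then    */
        /* added to unwind_completed so it cannot fire again before user exit.  */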
Link: https://lore.kernel.org/all/20250716142609.47f0e4a5@batman.local.home/ Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Namhyung Kim Cc: Thomas Gleixner Cc: Andrii Nakryiko Cc: Indu Bhagat Cc: "Jose E. Marchesi" Cc: Beau Belgrave Cc: Jens Remus Cc: Linus Torvalds Cc: Andrew Morton Cc: Jens Axboe Cc: Florian Weimer Cc: Sam James Link: https://lore.kernel.org/20250729182405.989222722@kernel.org Signed-off-by: Steven Rostedt (Google) --- kernel/unwind/deferred.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c index e19f02ef416d..a3d26014a2e6 100644 --- a/kernel/unwind/deferred.c +++ b/kernel/unwind/deferred.c @@ -166,12 +166,18 @@ static void unwind_deferred_task_work(struct callback_head *head) unwind_user_faultable(&trace); + if (info->cache) + bits &= ~(info->cache->unwind_completed); + cookie = info->id.id; guard(mutex)(&callback_mutex); list_for_each_entry(work, &callbacks, list) { - if (test_bit(work->bit, &bits)) + if (test_bit(work->bit, &bits)) { work->func(work, &trace, cookie); + if (info->cache) + info->cache->unwind_completed |= BIT(work->bit); + } } } @@ -260,23 +266,28 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie) void unwind_deferred_cancel(struct unwind_work *work) { struct task_struct *g, *t; + int bit; if (!work) return; + bit = work->bit; + /* No work should be using a reserved bit */ - if (WARN_ON_ONCE(BIT(work->bit) & RESERVED_BITS)) + if (WARN_ON_ONCE(BIT(bit) & RESERVED_BITS)) return; guard(mutex)(&callback_mutex); list_del(&work->list); - __clear_bit(work->bit, &unwind_mask); + __clear_bit(bit, &unwind_mask); guard(rcu)(); /* Clear this bit from all threads */ for_each_process_thread(g, t) { - clear_bit(work->bit, &t->unwind_info.unwind_mask); + clear_bit(bit, &t->unwind_info.unwind_mask); + if (t->unwind_info.cache) + clear_bit(bit, &t->unwind_info.cache->unwind_completed); } } -- cgit v1.2.3 From 858fa8a3b083e862114bb6483b9fb50b3e2bc4c3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 29 Jul 2025 14:23:12 -0400 Subject: unwind: Add USED bit to only have one conditional on way back to user space On the way back to user space, the function unwind_reset_info() is called unconditionally (but always inlined). It currently has two conditionals. One that checks the unwind_mask which is set whenever a deferred trace is called and is used to know that the mask needs to be cleared. The other checks if the cache has been allocated, and if so, it resets the nr_entries so that the unwinder knows it needs to do the work to get a new user space stack trace again (it only does it once per entering the kernel). Use one of the bits in the unwind mask as a "USED" bit that gets set whenever a trace is created. This will make it possible to only check the unwind_mask in the unwind_reset_info() to know if it needs to do work or not and eliminates a conditional that happens every time the task goes back to user space. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Namhyung Kim Cc: Thomas Gleixner Cc: Andrii Nakryiko Cc: Indu Bhagat Cc: "Jose E. 
Marchesi" Cc: Beau Belgrave Cc: Jens Remus Cc: Linus Torvalds Cc: Andrew Morton Cc: Jens Axboe Cc: Florian Weimer Cc: Sam James Link: https://lore.kernel.org/20250729182406.155422551@kernel.org Signed-off-by: Steven Rostedt (Google) --- kernel/unwind/deferred.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c index a3d26014a2e6..2311b725d691 100644 --- a/kernel/unwind/deferred.c +++ b/kernel/unwind/deferred.c @@ -45,7 +45,7 @@ static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt) static DEFINE_MUTEX(callback_mutex); static LIST_HEAD(callbacks); -#define RESERVED_BITS (UNWIND_PENDING) +#define RESERVED_BITS (UNWIND_PENDING | UNWIND_USED) /* Zero'd bits are available for assigning callback users */ static unsigned long unwind_mask = RESERVED_BITS; @@ -140,6 +140,9 @@ int unwind_user_faultable(struct unwind_stacktrace *trace) cache->nr_entries = trace->nr; + /* Clear nr_entries on way back to user space */ + set_bit(UNWIND_USED_BIT, &info->unwind_mask); + return 0; } -- cgit v1.2.3 From 357eda2d745054eb737397368bc9b0f84814b0a5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 29 Jul 2025 14:23:13 -0400 Subject: unwind deferred: Use SRCU unwind_deferred_task_work() Instead of using the callback_mutex to protect the link list of callbacks in unwind_deferred_task_work(), use SRCU instead. This gets called every time a task exits that has to record a stack trace that was requested. This can happen for many tasks on several CPUs at the same time. A mutex is a bottleneck and can cause a bit of contention and slow down performance. As the callbacks themselves are allowed to sleep, regular RCU cannot be used to protect the list. Instead use SRCU, as that still allows the callbacks to sleep and the list can be read without needing to hold the callback_mutex. Link: https://lore.kernel.org/all/ca9bd83a-6c80-4ee0-a83c-224b9d60b755@efficios.com/ Cc: "Paul E. McKenney" Cc: Masami Hiramatsu Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Namhyung Kim Cc: Thomas Gleixner Cc: Andrii Nakryiko Cc: Indu Bhagat Cc: "Jose E. 
Marchesi" Cc: Beau Belgrave Cc: Jens Remus Cc: Linus Torvalds Cc: Andrew Morton Cc: Jens Axboe Cc: Florian Weimer Cc: Sam James Link: https://lore.kernel.org/20250729182406.331548065@kernel.org Suggested-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (Google) --- kernel/unwind/deferred.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c index 2311b725d691..a5ef1c1f915e 100644 --- a/kernel/unwind/deferred.c +++ b/kernel/unwind/deferred.c @@ -41,7 +41,7 @@ static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt) #define UNWIND_MAX_ENTRIES \ ((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long)) -/* Guards adding to and reading the list of callbacks */ +/* Guards adding to or removing from the list of callbacks */ static DEFINE_MUTEX(callback_mutex); static LIST_HEAD(callbacks); @@ -49,6 +49,7 @@ static LIST_HEAD(callbacks); /* Zero'd bits are available for assigning callback users */ static unsigned long unwind_mask = RESERVED_BITS; +DEFINE_STATIC_SRCU(unwind_srcu); static inline bool unwind_pending(struct unwind_task_info *info) { @@ -174,8 +175,9 @@ static void unwind_deferred_task_work(struct callback_head *head) cookie = info->id.id; - guard(mutex)(&callback_mutex); - list_for_each_entry(work, &callbacks, list) { + guard(srcu)(&unwind_srcu); + list_for_each_entry_srcu(work, &callbacks, list, + srcu_read_lock_held(&unwind_srcu)) { if (test_bit(work->bit, &bits)) { work->func(work, &trace, cookie); if (info->cache) @@ -213,7 +215,7 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie) { struct unwind_task_info *info = ¤t->unwind_info; unsigned long old, bits; - unsigned long bit = BIT(work->bit); + unsigned long bit; int ret; *cookie = 0; @@ -230,6 +232,14 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie) if (WARN_ON_ONCE(!CAN_USE_IN_NMI && in_nmi())) return -EINVAL; + /* Do not allow cancelled works to request again */ + bit = READ_ONCE(work->bit); + if (WARN_ON_ONCE(bit < 0)) + return -EINVAL; + + /* Only need the mask now */ + bit = BIT(bit); + guard(irqsave)(); *cookie = get_cookie(info); @@ -281,10 +291,15 @@ void unwind_deferred_cancel(struct unwind_work *work) return; guard(mutex)(&callback_mutex); - list_del(&work->list); + list_del_rcu(&work->list); + + /* Do not allow any more requests and prevent callbacks */ + work->bit = -1; __clear_bit(bit, &unwind_mask); + synchronize_srcu(&unwind_srcu); + guard(rcu)(); /* Clear this bit from all threads */ for_each_process_thread(g, t) { @@ -307,7 +322,7 @@ int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) work->bit = ffz(unwind_mask); __set_bit(work->bit, &unwind_mask); - list_add(&work->list, &callbacks); + list_add_rcu(&work->list, &callbacks); work->func = func; return 0; } -- cgit v1.2.3 From b3b9cb11aa034cfa9eb880bb9bb3d5aaf732e479 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 29 Jul 2025 14:23:14 -0400 Subject: unwind: Finish up unwind when a task exits On do_exit() when a task is exiting, if a unwind is requested and the deferred user stacktrace is deferred via the task_work, the task_work callback is called after exit_mm() is called in do_exit(). This means that the user stack trace will not be retrieved and an empty stack is created. Instead, add a function unwind_deferred_task_exit() and call it just before exit_mm() so that the unwinder can call the requested callbacks with the user space stack. 
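Abridged, the resulting ordering in do_exit() is (only the relevant calls are shown; see the hunk below):

        tsk->exit_code = code;
        taskstats_exit(tsk, group_dead);
        unwind_deferred_task_exit(tsk);         /* run pending unwind callbacks now */
        trace_sched_process_exit(tsk, group_dead);
        /* ... */
        exit_mm();                              /* the user stack is unreachable after this */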
Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Jiri Olsa Cc: Arnaldo Carvalho de Melo Cc: Namhyung Kim Cc: Thomas Gleixner Cc: Andrii Nakryiko Cc: Indu Bhagat Cc: "Jose E. Marchesi" Cc: Beau Belgrave Cc: Jens Remus Cc: Linus Torvalds Cc: Andrew Morton Cc: Jens Axboe Cc: Florian Weimer Cc: Sam James Link: https://lore.kernel.org/20250729182406.504259474@kernel.org Signed-off-by: Steven Rostedt (Google) --- kernel/exit.c | 2 ++ kernel/unwind/deferred.c | 23 ++++++++++++++++++++--- 2 files changed, 22 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index bb184a67ac73..1d8c8ac33c4f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -68,6 +68,7 @@ #include #include #include +#include #include #include @@ -938,6 +939,7 @@ void __noreturn do_exit(long code) tsk->exit_code = code; taskstats_exit(tsk, group_dead); + unwind_deferred_task_exit(tsk); trace_sched_process_exit(tsk, group_dead); /* diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c index a5ef1c1f915e..dc6040aae3ee 100644 --- a/kernel/unwind/deferred.c +++ b/kernel/unwind/deferred.c @@ -114,7 +114,7 @@ int unwind_user_faultable(struct unwind_stacktrace *trace) /* Should always be called from faultable context */ might_fault(); - if (current->flags & PF_EXITING) + if (!current->mm) return -EINVAL; if (!info->cache) { @@ -147,9 +147,9 @@ int unwind_user_faultable(struct unwind_stacktrace *trace) return 0; } -static void unwind_deferred_task_work(struct callback_head *head) +static void process_unwind_deferred(struct task_struct *task) { - struct unwind_task_info *info = container_of(head, struct unwind_task_info, work); + struct unwind_task_info *info = &task->unwind_info; struct unwind_stacktrace trace; struct unwind_work *work; unsigned long bits; @@ -186,6 +186,23 @@ static void unwind_deferred_task_work(struct callback_head *head) } } +static void unwind_deferred_task_work(struct callback_head *head) +{ + process_unwind_deferred(current); +} + +void unwind_deferred_task_exit(struct task_struct *task) +{ + struct unwind_task_info *info = ¤t->unwind_info; + + if (!unwind_pending(info)) + return; + + process_unwind_deferred(task); + + task_work_cancel(task, &info->work); +} + /** * unwind_deferred_request - Request a user stacktrace on task kernel exit * @work: Unwind descriptor requesting the trace -- cgit v1.2.3 From 8557c8628cf3cf8ebd3b32601ccdde550bbf6c54 Mon Sep 17 00:00:00 2001 From: "Yury Norov [NVIDIA]" Date: Thu, 19 Jun 2025 14:26:25 -0400 Subject: clocksource: Improve randomness in clocksource_verify_choose_cpus() The current algorithm of picking a random CPU works OK for dense online cpumask, but if cpumask is non-dense, the distribution of picked CPUs is skewed. For example, on 8-CPU board with CPUs 4-7 offlined, the probability of selecting CPU 0 is 5/8. Accordingly, cpus 1, 2 and 3 are chosen with probability 1/8 each. The proper algorithm should pick each online CPU with probability 1/4. Switch it to cpumask_random(), which has better statistical characteristics. 
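The skew comes from the wrap-around fixup in the removed code; with cpu_online_mask = 0-3 and nr_cpu_ids = 8 as in the example above, it behaves as follows:

        cpu = get_random_u32_below(nr_cpu_ids);         /* uniform over 0..7 */
        cpu = cpumask_next(cpu - 1, cpu_online_mask);   /* smallest online CPU >= cpu */
        if (cpu >= nr_cpu_ids)                          /* any pick of 4..7 overflows... */
                cpu = cpumask_first(cpu_online_mask);   /* ...and collapses onto CPU 0 */

        /* P(CPU 0) = 5/8, P(CPU 1) = P(CPU 2) = P(CPU 3) = 1/8 */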
CC: Andrew Morton Acked-by: John Stultz Reviewed-by: Thomas Gleixner Signed-off-by: "Yury Norov [NVIDIA]" --- kernel/time/clocksource.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 6a8bc7da9062..4b005b2f3ef5 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -342,10 +342,7 @@ static void clocksource_verify_choose_cpus(void) * CPUs that are currently online. */ for (i = 1; i < n; i++) { - cpu = get_random_u32_below(nr_cpu_ids); - cpu = cpumask_next(cpu - 1, cpu_online_mask); - if (cpu >= nr_cpu_ids) - cpu = cpumask_first(cpu_online_mask); + cpu = cpumask_random(cpu_online_mask); if (!WARN_ON_ONCE(cpu >= nr_cpu_ids)) cpumask_set_cpu(cpu, &cpus_chosen); } -- cgit v1.2.3 From f49a4af3fabce56aaef125f0d959b81afe194afb Mon Sep 17 00:00:00 2001 From: "Yury Norov [NVIDIA]" Date: Wed, 4 Jun 2025 20:12:52 -0400 Subject: watchdog: fix opencoded cpumask_next_wrap() in watchdog_next_cpu() The dedicated helper is more verbose and efficient comparing to cpumask_next() followed by cpumask_first(). Signed-off-by: "Yury Norov [NVIDIA]" Reviewed-by: Douglas Anderson --- kernel/watchdog_buddy.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog_buddy.c b/kernel/watchdog_buddy.c index 34dbfe091f4b..ee754d767c21 100644 --- a/kernel/watchdog_buddy.c +++ b/kernel/watchdog_buddy.c @@ -12,10 +12,7 @@ static unsigned int watchdog_next_cpu(unsigned int cpu) { unsigned int next_cpu; - next_cpu = cpumask_next(cpu, &watchdog_cpus); - if (next_cpu >= nr_cpu_ids) - next_cpu = cpumask_first(&watchdog_cpus); - + next_cpu = cpumask_next_wrap(cpu, &watchdog_cpus); if (next_cpu == cpu) return nr_cpu_ids; -- cgit v1.2.3 From 12df58ad294253ac1d8df0c9bb9cf726397a671d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 31 Jul 2025 01:47:30 +0200 Subject: bpf: Add cookie object to bpf maps Add a cookie to BPF maps to uniquely identify BPF maps for the timespan when the node is up. This is different to comparing a pointer or BPF map id which could get rolled over and reused. Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20250730234733.530041-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e63039817af3..7a814e98d5f5 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -53,6 +54,7 @@ #define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY) DEFINE_PER_CPU(int, bpf_prog_active); +DEFINE_COOKIE(bpf_map_cookie); static DEFINE_IDR(prog_idr); static DEFINE_SPINLOCK(prog_idr_lock); static DEFINE_IDR(map_idr); @@ -1487,6 +1489,10 @@ static int map_create(union bpf_attr *attr, bool kernel) if (err < 0) goto free_map; + preempt_disable(); + map->cookie = gen_cookie_next(&bpf_map_cookie); + preempt_enable(); + atomic64_set(&map->refcnt, 1); atomic64_set(&map->usercnt, 1); mutex_init(&map->freeze_mutex); -- cgit v1.2.3 From fd1c98f0ef5cbcec842209776505d9e70d8fcd53 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 31 Jul 2025 01:47:31 +0200 Subject: bpf: Move bpf map owner out of common struct Given this is only relevant for BPF tail call maps, it is adding up space and penalizing other map types. We also need to extend this with further objects to track / compare to. 
Therefore, lets move this out into a separate structure and dynamically allocate it only for BPF tail call maps. Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20250730234733.530041-2-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 35 ++++++++++++++++++----------------- kernel/bpf/syscall.c | 13 +++++++------ 2 files changed, 25 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 09dde5b00d0c..6e5b3a67e87f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2377,28 +2377,29 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp) { enum bpf_prog_type prog_type = resolve_prog_type(fp); - bool ret; struct bpf_prog_aux *aux = fp->aux; + bool ret = false; if (fp->kprobe_override) - return false; + return ret; - spin_lock(&map->owner.lock); - if (!map->owner.type) { - /* There's no owner yet where we could check for - * compatibility. - */ - map->owner.type = prog_type; - map->owner.jited = fp->jited; - map->owner.xdp_has_frags = aux->xdp_has_frags; - map->owner.attach_func_proto = aux->attach_func_proto; + spin_lock(&map->owner_lock); + /* There's no owner yet where we could check for compatibility. */ + if (!map->owner) { + map->owner = bpf_map_owner_alloc(map); + if (!map->owner) + goto err; + map->owner->type = prog_type; + map->owner->jited = fp->jited; + map->owner->xdp_has_frags = aux->xdp_has_frags; + map->owner->attach_func_proto = aux->attach_func_proto; ret = true; } else { - ret = map->owner.type == prog_type && - map->owner.jited == fp->jited && - map->owner.xdp_has_frags == aux->xdp_has_frags; + ret = map->owner->type == prog_type && + map->owner->jited == fp->jited && + map->owner->xdp_has_frags == aux->xdp_has_frags; if (ret && - map->owner.attach_func_proto != aux->attach_func_proto) { + map->owner->attach_func_proto != aux->attach_func_proto) { switch (prog_type) { case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_LSM: @@ -2411,8 +2412,8 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map, } } } - spin_unlock(&map->owner.lock); - +err: + spin_unlock(&map->owner_lock); return ret; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7a814e98d5f5..0fbfa8532c39 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -887,6 +887,7 @@ static void bpf_map_free_deferred(struct work_struct *work) security_bpf_map_free(map); bpf_map_release_memcg(map); + bpf_map_owner_free(map); bpf_map_free(map); } @@ -981,12 +982,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) struct bpf_map *map = filp->private_data; u32 type = 0, jited = 0; - if (map_type_contains_progs(map)) { - spin_lock(&map->owner.lock); - type = map->owner.type; - jited = map->owner.jited; - spin_unlock(&map->owner.lock); + spin_lock(&map->owner_lock); + if (map->owner) { + type = map->owner->type; + jited = map->owner->jited; } + spin_unlock(&map->owner_lock); seq_printf(m, "map_type:\t%u\n" @@ -1496,7 +1497,7 @@ static int map_create(union bpf_attr *attr, bool kernel) atomic64_set(&map->refcnt, 1); atomic64_set(&map->usercnt, 1); mutex_init(&map->freeze_mutex); - spin_lock_init(&map->owner.lock); + spin_lock_init(&map->owner_lock); if (attr->btf_key_type_id || attr->btf_value_type_id || /* Even the map's value is a kernel's struct, -- cgit v1.2.3 From abad3d0bad72a52137e0c350c59542d75ae4f513 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 31 Jul 2025 01:47:33 +0200 Subject: bpf: Fix oob access in cgroup local storage 
Lonial reported that an out-of-bounds access in cgroup local storage can be crafted via tail calls. Given two programs, each utilizing a cgroup local storage with a different value size, and one program doing a tail call into the other, the verifier will validate each of the individual programs just fine. However, in the runtime context the bpf_cg_run_ctx holds a bpf_prog_array_item which contains the BPF program as well as any cgroup local storage flavor the program uses. Helpers such as bpf_get_local_storage() pick this up from the runtime context: ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); storage = ctx->prog_item->cgroup_storage[stype]; if (stype == BPF_CGROUP_STORAGE_SHARED) ptr = &READ_ONCE(storage->buf)->data[0]; else ptr = this_cpu_ptr(storage->percpu_buf); For the second program which was called from the originally attached one, this means bpf_get_local_storage() will pick up the former program's map, not its own. With mismatching sizes, this can result in an unintended out-of-bounds access. To fix this issue, we need to extend bpf_map_owner with an array of storage_cookie[] to match on i) the exact maps from the original program if the second program was using bpf_get_local_storage(), or ii) allow the tail call combination if the second program was not using any of the cgroup local storage maps. Fixes: 7d9c3427894f ("bpf: Make cgroup storages shared between programs on the same cgroup") Reported-by: Lonial Con Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/r/20250730234733.530041-4-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6e5b3a67e87f..5d1650af899d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2378,7 +2378,9 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map, { enum bpf_prog_type prog_type = resolve_prog_type(fp); struct bpf_prog_aux *aux = fp->aux; + enum bpf_cgroup_storage_type i; bool ret = false; + u64 cookie; if (fp->kprobe_override) return ret; @@ -2393,11 +2395,24 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map, map->owner->jited = fp->jited; map->owner->xdp_has_frags = aux->xdp_has_frags; map->owner->attach_func_proto = aux->attach_func_proto; + for_each_cgroup_storage_type(i) { + map->owner->storage_cookie[i] = + aux->cgroup_storage[i] ? + aux->cgroup_storage[i]->cookie : 0; + } ret = true; } else { ret = map->owner->type == prog_type && map->owner->jited == fp->jited && map->owner->xdp_has_frags == aux->xdp_has_frags; + for_each_cgroup_storage_type(i) { + if (!ret) + break; + cookie = aux->cgroup_storage[i] ? + aux->cgroup_storage[i]->cookie : 0; + ret = map->owner->storage_cookie[i] == cookie || + !cookie; + } if (ret && map->owner->attach_func_proto != aux->attach_func_proto) { switch (prog_type) { -- cgit v1.2.3 From f1befc82addda926c8301436123d041bf3249505 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Fri, 1 Aug 2025 00:10:07 +0000 Subject: cfi: Move BPF CFI types and helpers to generic code Instead of duplicating the same code for each architecture, move the CFI type hash variables for BPF function types and related helper functions to generic CFI code, and allow architectures to override the function definitions if needed.
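As an aside on the construct used below, typeof(*(fptr_type)0) names the function type that a function pointer type points to, so the extern declaration ends up with the same prototype as the pointer; a standalone illustration with a made-up callback type (not kernel code):

        typedef int (*callback_t)(void *ctx, int val);

        /* equivalent to: extern int never_defined(void *ctx, int val); */
        extern typeof(*(callback_t)0) never_defined;

Per the comment added in this patch, DEFINE_CFI_TYPE() then defines a type hash variable from such a declaration; the declared function itself is never defined.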
Signed-off-by: Sami Tolvanen Link: https://lore.kernel.org/r/20250801001004.1859976-7-samitolvanen@google.com Signed-off-by: Alexei Starovoitov --- kernel/cfi.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/cfi.c b/kernel/cfi.c index 422fa4f958ae..4dad04ead06c 100644 --- a/kernel/cfi.c +++ b/kernel/cfi.c @@ -5,6 +5,8 @@ * Copyright (C) 2022 Google LLC */ +#include +#include #include bool cfi_warn __ro_after_init = IS_ENABLED(CONFIG_CFI_PERMISSIVE); @@ -27,6 +29,19 @@ enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, return BUG_TRAP_TYPE_BUG; } +/* + * Declare two non-existent functions with types that match bpf_func_t and + * bpf_callback_t pointers, and use DEFINE_CFI_TYPE to define type hash + * variables for each function type. The cfi_bpf_* variables are used by + * arch-specific BPF JIT implementations to ensure indirectly callable JIT + * code has matching CFI type hashes. + */ +extern typeof(*(bpf_func_t)0) __bpf_prog_runX; +DEFINE_CFI_TYPE(cfi_bpf_hash, __bpf_prog_runX); + +extern typeof(*(bpf_callback_t)0) __bpf_callback_fn; +DEFINE_CFI_TYPE(cfi_bpf_subprog_hash, __bpf_callback_fn); + #ifdef CONFIG_ARCH_USES_CFI_TRAPS static inline unsigned long trap_address(s32 *p) { -- cgit v1.2.3 From 32d89a405adc204d82bea6ae2ba27a62d35568b4 Mon Sep 17 00:00:00 2001 From: Pei Xiao Date: Wed, 4 Jun 2025 14:55:48 +0800 Subject: vhost: Use ERR_CAST inlined function instead of ERR_PTR(PTR_ERR(...)) cocci warning: ./kernel/vhost_task.c:148:9-16: WARNING: ERR_CAST can be used with tsk Use ERR_CAST inlined function instead of ERR_PTR(PTR_ERR(...)). Signed-off-by: Pei Xiao Message-Id: <1a8499a5da53e4f72cf21aca044ae4b26db8b2ad.1749020055.git.xiaopei01@kylinos.cn> Signed-off-by: Michael S. Tsirkin --- kernel/vhost_task.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c index 2f844c279a3e..bc738fa90c1d 100644 --- a/kernel/vhost_task.c +++ b/kernel/vhost_task.c @@ -145,7 +145,7 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *), tsk = copy_process(NULL, 0, NUMA_NO_NODE, &args); if (IS_ERR(tsk)) { kfree(vtsk); - return ERR_PTR(PTR_ERR(tsk)); + return ERR_CAST(tsk); } vtsk->task = tsk; -- cgit v1.2.3 From f914876eec9e72ae94b5cee81a9dc7935c255b2f Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Fri, 1 Aug 2025 11:49:15 +0200 Subject: bpf: Improve ctx access verifier error message We've already had two "error during ctx access conversion" warnings triggered by syzkaller. Let's improve the error message by dumping the cnt variable so that we can more easily differentiate between the different error cases. 
Signed-off-by: Paul Chaignon Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/cc94316c30dd76fae4a75a664b61a2dbfe68e205.1754039605.git.paul.chaignon@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 399f03e62508..0806295945e4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21445,7 +21445,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) &target_size); if (cnt == 0 || cnt >= INSN_BUF_SIZE || (ctx_field_size && !target_size)) { - verifier_bug(env, "error during ctx access conversion"); + verifier_bug(env, "error during ctx access conversion (%d)", cnt); return -EFAULT; } -- cgit v1.2.3 From c89504a703fb779052213add0e8ed642f4a4f1c8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 1 Aug 2025 16:37:23 -0400 Subject: tracing: Remove unneeded goto out logic Several places in the trace.c file there's a goto out where the out is simply a return. There's no reason to jump to the out label if it's not doing any more logic but simply returning from the function. Replace the goto outs with a return and remove the out labels. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250801203857.538726745@kernel.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 38 +++++++++++++++----------------------- 1 file changed, 15 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 945a8ecf2c62..0ec9cab9a812 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1841,7 +1841,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, ret = get_user(ch, ubuf++); if (ret) - goto out; + return ret; read++; cnt--; @@ -1855,7 +1855,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, while (cnt && isspace(ch)) { ret = get_user(ch, ubuf++); if (ret) - goto out; + return ret; read++; cnt--; } @@ -1865,8 +1865,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, /* only spaces were written */ if (isspace(ch) || !ch) { *ppos += read; - ret = read; - goto out; + return read; } } @@ -1874,13 +1873,12 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, while (cnt && !isspace(ch) && ch) { if (parser->idx < parser->size - 1) parser->buffer[parser->idx++] = ch; - else { - ret = -EINVAL; - goto out; - } + else + return -EINVAL; + ret = get_user(ch, ubuf++); if (ret) - goto out; + return ret; read++; cnt--; } @@ -1895,15 +1893,11 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, /* Make sure the parsed string always terminates with '\0'. */ parser->buffer[parser->idx] = 0; } else { - ret = -EINVAL; - goto out; + return -EINVAL; } *ppos += read; - ret = read; - -out: - return ret; + return read; } /* TODO add a seq_buf_to_buffer() */ @@ -2405,10 +2399,10 @@ int __init register_tracer(struct tracer *type) mutex_unlock(&trace_types_lock); if (ret || !default_bootup_tracer) - goto out_unlock; + return ret; if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE)) - goto out_unlock; + return 0; printk(KERN_INFO "Starting tracer '%s'\n", type->name); /* Do we want this tracer to start on bootup? */ @@ -2420,8 +2414,7 @@ int __init register_tracer(struct tracer *type) /* disable other selftests, since this will break it. 
*/ disable_tracing_selftest("running a tracer"); - out_unlock: - return ret; + return 0; } static void tracing_reset_cpu(struct array_buffer *buf, int cpu) @@ -8963,12 +8956,12 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, out_reg: ret = tracing_arm_snapshot(tr); if (ret < 0) - goto out; + return ret; ret = register_ftrace_function_probe(glob, tr, ops, count); if (ret < 0) tracing_disarm_snapshot(tr); - out: + return ret < 0 ? ret : 0; } @@ -11070,7 +11063,7 @@ __init static int tracer_alloc_buffers(void) BUILD_BUG_ON(TRACE_ITER_LAST_BIT > TRACE_FLAGS_MAX_SIZE); if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) - goto out; + return -ENOMEM; if (!alloc_cpumask_var(&global_trace.tracing_cpumask, GFP_KERNEL)) goto out_free_buffer_mask; @@ -11188,7 +11181,6 @@ out_free_cpumask: free_cpumask_var(global_trace.tracing_cpumask); out_free_buffer_mask: free_cpumask_var(tracing_buffer_mask); -out: return ret; } -- cgit v1.2.3 From 788fa4b47cdcd9b3d8c2d02ac0b3cd2540305f18 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 1 Aug 2025 16:37:24 -0400 Subject: tracing: Add guard(ring_buffer_nest) Some calls to the tracing ring buffer can happen when the ring buffer is already being written to by the same context (for example, a trace_printk() in between a ring_buffer_lock_reserve() and a ring_buffer_unlock_commit()). In order to not trigger the recursion detection, these functions use ring_buffer_nest_start() and ring_buffer_nest_end(). Create a guard() for these functions so that their use cases can be simplified and not need to use goto for the release. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250801203857.710501021@kernel.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 69 ++++++++++++++++----------------------- kernel/trace/trace_events_synth.c | 6 ++-- 2 files changed, 31 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0ec9cab9a812..332487179e1d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1160,13 +1160,11 @@ int __trace_array_puts(struct trace_array *tr, unsigned long ip, trace_ctx = tracing_gen_ctx(); buffer = tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); + guard(ring_buffer_nest)(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, trace_ctx); - if (!event) { - size = 0; - goto out; - } + if (!event) + return 0; entry = ring_buffer_event_data(event); entry->ip = ip; @@ -1182,8 +1180,6 @@ int __trace_array_puts(struct trace_array *tr, unsigned long ip, __buffer_unlock_commit(buffer, event); ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); - out: - ring_buffer_nest_end(buffer); return size; } EXPORT_SYMBOL_GPL(__trace_array_puts); @@ -1213,7 +1209,6 @@ int __trace_bputs(unsigned long ip, const char *str) struct bputs_entry *entry; unsigned int trace_ctx; int size = sizeof(struct bputs_entry); - int ret = 0; if (!printk_binsafe(tr)) return __trace_puts(ip, str, strlen(str)); @@ -1227,11 +1222,11 @@ int __trace_bputs(unsigned long ip, const char *str) trace_ctx = tracing_gen_ctx(); buffer = tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); + guard(ring_buffer_nest)(buffer); event = __trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, trace_ctx); if (!event) - goto out; + return 0; entry = ring_buffer_event_data(event); entry->ip = ip; @@ -1240,10 +1235,7 @@ int __trace_bputs(unsigned long ip, const char *str) 
__buffer_unlock_commit(buffer, event); ftrace_trace_stack(tr, buffer, trace_ctx, 4, NULL); - ret = 1; - out: - ring_buffer_nest_end(buffer); - return ret; + return 1; } EXPORT_SYMBOL_GPL(__trace_bputs); @@ -3397,21 +3389,19 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) size = sizeof(*entry) + sizeof(u32) * len; buffer = tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); - event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, - trace_ctx); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->ip = ip; - entry->fmt = fmt; - - memcpy(entry->buf, tbuffer, sizeof(u32) * len); - __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); + scoped_guard(ring_buffer_nest, buffer) { + event = __trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, + trace_ctx); + if (!event) + goto out_put; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->fmt = fmt; -out: - ring_buffer_nest_end(buffer); + memcpy(entry->buf, tbuffer, sizeof(u32) * len); + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(tr, buffer, trace_ctx, 6, NULL); + } out_put: put_trace_buf(); @@ -3452,20 +3442,19 @@ int __trace_array_vprintk(struct trace_buffer *buffer, len = vscnprintf(tbuffer, TRACE_BUF_SIZE, fmt, args); size = sizeof(*entry) + len + 1; - ring_buffer_nest_start(buffer); - event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, - trace_ctx); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->ip = ip; - - memcpy(&entry->buf, tbuffer, len + 1); - __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL); + scoped_guard(ring_buffer_nest, buffer) { + event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, + trace_ctx); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + entry->ip = ip; + memcpy(&entry->buf, tbuffer, len + 1); + __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(printk_trace, buffer, trace_ctx, 6, NULL); + } out: - ring_buffer_nest_end(buffer); put_trace_buf(); out_nobuffer: diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 33cfbd4ed76d..f24ee61f8884 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -536,12 +536,12 @@ static notrace void trace_event_raw_event_synth(void *__data, * is being performed within another event. */ buffer = trace_file->tr->array_buffer.buffer; - ring_buffer_nest_start(buffer); + guard(ring_buffer_nest)(buffer); entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry) + fields_size); if (!entry) - goto out; + return; for (i = 0, n_u64 = 0; i < event->n_fields; i++) { val_idx = var_ref_idx[i]; @@ -584,8 +584,6 @@ static notrace void trace_event_raw_event_synth(void *__data, } trace_event_buffer_commit(&fbuffer); -out: - ring_buffer_nest_end(buffer); } static void free_synth_event_print_fmt(struct trace_event_call *call) -- cgit v1.2.3 From debe57fbe12cb16881b2db1f1787eb9673a8b8b0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 1 Aug 2025 16:37:25 -0400 Subject: tracing: Add guard() around locks and mutexes in trace.c There's several locations in trace.c that can be simplified by using guards around raw_spin_lock_irqsave, mutexes and preempt disabling. 
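As a reminder of the pattern (a sketch; do_something() is a stand-in, not a real function), the conversion trades explicit unlock-on-every-exit-path code for scope-based cleanup:

        /* before */
        mutex_lock(&trace_types_lock);
        ret = do_something(tr);
        mutex_unlock(&trace_types_lock);
        return ret;

        /* after: the lock is dropped automatically when the guard goes out of scope */
        guard(mutex)(&trace_types_lock);
        return do_something(tr);

Where only part of a function needs the lock held, scoped_guard(mutex, &trace_types_lock) { ... } is used instead, as in several hunks below.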
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250801203857.879085376@kernel.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 144 ++++++++++++++++----------------------------------- 1 file changed, 46 insertions(+), 98 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 332487179e1d..4299e89ed04e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -432,15 +432,13 @@ static void ftrace_exports(struct ring_buffer_event *event, int flag) { struct trace_export *export; - preempt_disable_notrace(); + guard(preempt_notrace)(); export = rcu_dereference_raw_check(ftrace_exports_list); while (export) { trace_process_export(export, event, flag); export = rcu_dereference_raw_check(export->next); } - - preempt_enable_notrace(); } static inline void @@ -497,27 +495,18 @@ int register_ftrace_export(struct trace_export *export) if (WARN_ON_ONCE(!export->write)) return -1; - mutex_lock(&ftrace_export_lock); + guard(mutex)(&ftrace_export_lock); add_ftrace_export(&ftrace_exports_list, export); - mutex_unlock(&ftrace_export_lock); - return 0; } EXPORT_SYMBOL_GPL(register_ftrace_export); int unregister_ftrace_export(struct trace_export *export) { - int ret; - - mutex_lock(&ftrace_export_lock); - - ret = rm_ftrace_export(&ftrace_exports_list, export); - - mutex_unlock(&ftrace_export_lock); - - return ret; + guard(mutex)(&ftrace_export_lock); + return rm_ftrace_export(&ftrace_exports_list, export); } EXPORT_SYMBOL_GPL(unregister_ftrace_export); @@ -640,9 +629,8 @@ void trace_array_put(struct trace_array *this_tr) if (!this_tr) return; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); __trace_array_put(this_tr); - mutex_unlock(&trace_types_lock); } EXPORT_SYMBOL_GPL(trace_array_put); @@ -1424,13 +1412,8 @@ static int tracing_arm_snapshot_locked(struct trace_array *tr) int tracing_arm_snapshot(struct trace_array *tr) { - int ret; - - mutex_lock(&trace_types_lock); - ret = tracing_arm_snapshot_locked(tr); - mutex_unlock(&trace_types_lock); - - return ret; + guard(mutex)(&trace_types_lock); + return tracing_arm_snapshot_locked(tr); } void tracing_disarm_snapshot(struct trace_array *tr) @@ -2483,9 +2466,8 @@ void tracing_reset_all_online_cpus_unlocked(void) void tracing_reset_all_online_cpus(void) { - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); tracing_reset_all_online_cpus_unlocked(); - mutex_unlock(&trace_types_lock); } int is_tracing_stopped(void) @@ -2496,18 +2478,17 @@ int is_tracing_stopped(void) static void tracing_start_tr(struct trace_array *tr) { struct trace_buffer *buffer; - unsigned long flags; if (tracing_disabled) return; - raw_spin_lock_irqsave(&tr->start_lock, flags); + guard(raw_spinlock_irqsave)(&tr->start_lock); if (--tr->stop_count) { if (WARN_ON_ONCE(tr->stop_count < 0)) { /* Someone screwed up their debugging */ tr->stop_count = 0; } - goto out; + return; } /* Prevent the buffers from switching */ @@ -2524,9 +2505,6 @@ static void tracing_start_tr(struct trace_array *tr) #endif arch_spin_unlock(&tr->max_lock); - - out: - raw_spin_unlock_irqrestore(&tr->start_lock, flags); } /** @@ -2544,11 +2522,10 @@ void tracing_start(void) static void tracing_stop_tr(struct trace_array *tr) { struct trace_buffer *buffer; - unsigned long flags; - raw_spin_lock_irqsave(&tr->start_lock, flags); + guard(raw_spinlock_irqsave)(&tr->start_lock); if (tr->stop_count++) - goto out; + return; /* Prevent the buffers from switching */ 
arch_spin_lock(&tr->max_lock); @@ -2564,9 +2541,6 @@ static void tracing_stop_tr(struct trace_array *tr) #endif arch_spin_unlock(&tr->max_lock); - - out: - raw_spin_unlock_irqrestore(&tr->start_lock, flags); } /** @@ -2679,12 +2653,12 @@ void trace_buffered_event_enable(void) per_cpu(trace_buffered_event, cpu) = event; - preempt_disable(); - if (cpu == smp_processor_id() && - __this_cpu_read(trace_buffered_event) != - per_cpu(trace_buffered_event, cpu)) - WARN_ON_ONCE(1); - preempt_enable(); + scoped_guard(preempt,) { + if (cpu == smp_processor_id() && + __this_cpu_read(trace_buffered_event) != + per_cpu(trace_buffered_event, cpu)) + WARN_ON_ONCE(1); + } } } @@ -3029,7 +3003,7 @@ static void __ftrace_trace_stack(struct trace_array *tr, skip++; #endif - preempt_disable_notrace(); + guard(preempt_notrace)(); stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1; @@ -3087,8 +3061,6 @@ static void __ftrace_trace_stack(struct trace_array *tr, /* Again, don't let gcc optimize things here */ barrier(); __this_cpu_dec(ftrace_stack_reserve); - preempt_enable_notrace(); - } static inline void ftrace_trace_stack(struct trace_array *tr, @@ -3171,9 +3143,9 @@ ftrace_trace_userstack(struct trace_array *tr, * prevent recursion, since the user stack tracing may * trigger other kernel events. */ - preempt_disable(); + guard(preempt)(); if (__this_cpu_read(user_stack_count)) - goto out; + return; __this_cpu_inc(user_stack_count); @@ -3191,8 +3163,6 @@ ftrace_trace_userstack(struct trace_array *tr, out_drop_count: __this_cpu_dec(user_stack_count); - out: - preempt_enable(); } #else /* CONFIG_USER_STACKTRACE_SUPPORT */ static void ftrace_trace_userstack(struct trace_array *tr, @@ -3374,7 +3344,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) pause_graph_tracing(); trace_ctx = tracing_gen_ctx(); - preempt_disable_notrace(); + guard(preempt_notrace)(); tbuffer = get_trace_buf(); if (!tbuffer) { @@ -3406,7 +3376,6 @@ out_put: put_trace_buf(); out_nobuffer: - preempt_enable_notrace(); unpause_graph_tracing(); return len; @@ -3430,7 +3399,7 @@ int __trace_array_vprintk(struct trace_buffer *buffer, pause_graph_tracing(); trace_ctx = tracing_gen_ctx(); - preempt_disable_notrace(); + guard(preempt_notrace)(); tbuffer = get_trace_buf(); @@ -3458,7 +3427,6 @@ out: put_trace_buf(); out_nobuffer: - preempt_enable_notrace(); unpause_graph_tracing(); return len; @@ -4788,20 +4756,16 @@ int tracing_open_file_tr(struct inode *inode, struct file *filp) if (ret) return ret; - mutex_lock(&event_mutex); + guard(mutex)(&event_mutex); /* Fail if the file is marked for removal */ if (file->flags & EVENT_FILE_FL_FREED) { trace_array_put(file->tr); - ret = -ENODEV; + return -ENODEV; } else { event_file_get(file); } - mutex_unlock(&event_mutex); - if (ret) - return ret; - filp->private_data = inode->i_private; return 0; @@ -5945,9 +5909,9 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, char buf[MAX_TRACER_SIZE+2]; int r; - mutex_lock(&trace_types_lock); - r = sprintf(buf, "%s\n", tr->current_trace->name); - mutex_unlock(&trace_types_lock); + scoped_guard(mutex, &trace_types_lock) { + r = sprintf(buf, "%s\n", tr->current_trace->name); + } return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } @@ -6249,15 +6213,13 @@ int tracing_update_buffers(struct trace_array *tr) { int ret = 0; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); update_last_data(tr); if (!tr->ring_buffer_expanded) ret = __tracing_resize_ring_buffer(tr, trace_buf_size, RING_BUFFER_ALL_CPUS); - 
mutex_unlock(&trace_types_lock); - return ret; } @@ -6554,7 +6516,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) if (ret) return ret; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); cpu = tracing_get_cpu(inode); ret = open_pipe_on_cpu(tr, cpu); if (ret) @@ -6598,7 +6560,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) tr->trace_ref++; - mutex_unlock(&trace_types_lock); return ret; fail: @@ -6607,7 +6568,6 @@ fail_alloc_iter: close_pipe_on_cpu(tr, cpu); fail_pipe_on_cpu: __trace_array_put(tr); - mutex_unlock(&trace_types_lock); return ret; } @@ -6616,14 +6576,13 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) struct trace_iterator *iter = file->private_data; struct trace_array *tr = inode->i_private; - mutex_lock(&trace_types_lock); + scoped_guard(mutex, &trace_types_lock) { + tr->trace_ref--; - tr->trace_ref--; - - if (iter->trace->pipe_close) - iter->trace->pipe_close(iter); - close_pipe_on_cpu(tr, iter->cpu_file); - mutex_unlock(&trace_types_lock); + if (iter->trace->pipe_close) + iter->trace->pipe_close(iter); + close_pipe_on_cpu(tr, iter->cpu_file); + } free_trace_iter_content(iter); kfree(iter); @@ -7426,7 +7385,7 @@ int tracing_set_clock(struct trace_array *tr, const char *clockstr) if (i == ARRAY_SIZE(trace_clocks)) return -EINVAL; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); tr->clock_id = i; @@ -7450,8 +7409,6 @@ int tracing_set_clock(struct trace_array *tr, const char *clockstr) tscratch->clock_id = i; } - mutex_unlock(&trace_types_lock); - return 0; } @@ -7503,15 +7460,13 @@ static int tracing_time_stamp_mode_show(struct seq_file *m, void *v) { struct trace_array *tr = m->private; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); if (ring_buffer_time_stamp_abs(tr->array_buffer.buffer)) seq_puts(m, "delta [absolute]\n"); else seq_puts(m, "[delta] absolute\n"); - mutex_unlock(&trace_types_lock); - return 0; } @@ -8099,14 +8054,14 @@ static void clear_tracing_err_log(struct trace_array *tr) { struct tracing_log_err *err, *next; - mutex_lock(&tracing_err_log_lock); + guard(mutex)(&tracing_err_log_lock); + list_for_each_entry_safe(err, next, &tr->err_log, list) { list_del(&err->list); free_tracing_log_err(err); } tr->n_err_log_entries = 0; - mutex_unlock(&tracing_err_log_lock); } static void *tracing_err_log_seq_start(struct seq_file *m, loff_t *pos) @@ -8377,7 +8332,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) struct ftrace_buffer_info *info = file->private_data; struct trace_iterator *iter = &info->iter; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); iter->tr->trace_ref--; @@ -8388,8 +8343,6 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) info->spare_cpu, info->spare); kvfree(info); - mutex_unlock(&trace_types_lock); - return 0; } @@ -8597,14 +8550,13 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned * An ioctl call with cmd 0 to the ring buffer file will wake up all * waiters */ - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); /* Make sure the waiters see the new wait_index */ (void)atomic_fetch_inc_release(&iter->wait_index); ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); - mutex_unlock(&trace_types_lock); return 0; } @@ -9094,10 +9046,9 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, return -EINVAL; if (!!(topt->flags->val & topt->opt->bit) != val) { - 
mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); ret = __set_tracer_option(topt->tr, topt->flags, topt->opt, !val); - mutex_unlock(&trace_types_lock); if (ret) return ret; } @@ -9406,7 +9357,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf, return ret; if (buffer) { - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); if (!!val == tracer_tracing_is_on(tr)) { val = 0; /* do nothing */ } else if (val) { @@ -9420,7 +9371,6 @@ rb_simple_write(struct file *filp, const char __user *ubuf, /* Wake up any waiters */ ring_buffer_wake_waiters(buffer, RING_BUFFER_ALL_CPUS); } - mutex_unlock(&trace_types_lock); } (*ppos)++; @@ -9804,10 +9754,9 @@ static void __update_tracer_options(struct trace_array *tr) static void update_tracer_options(struct trace_array *tr) { - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); tracer_options_updated = true; __update_tracer_options(tr); - mutex_unlock(&trace_types_lock); } /* Must have trace_types_lock held */ @@ -9829,11 +9778,10 @@ struct trace_array *trace_array_find_get(const char *instance) { struct trace_array *tr; - mutex_lock(&trace_types_lock); + guard(mutex)(&trace_types_lock); tr = trace_array_find(instance); if (tr) tr->ref++; - mutex_unlock(&trace_types_lock); return tr; } -- cgit v1.2.3 From 12d5189615862a9eb06d4aa7c8a990bcde2ebb01 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 1 Aug 2025 16:37:26 -0400 Subject: tracing: Use __free(kfree) in trace.c to remove gotos There's a couple of locations that have goto out in trace.c for the only purpose of freeing a variable that was allocated. These can be replaced with __free(kfree). Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250801203858.040892777@kernel.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 36 ++++++++++++------------------------ 1 file changed, 12 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4299e89ed04e..d0b1964648c1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5042,7 +5042,7 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { struct trace_array *tr = file_inode(filp)->i_private; - char *mask_str; + char *mask_str __free(kfree) = NULL; int len; len = snprintf(NULL, 0, "%*pb\n", @@ -5053,16 +5053,10 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf, len = snprintf(mask_str, len, "%*pb\n", cpumask_pr_args(tr->tracing_cpumask)); - if (len >= count) { - count = -EINVAL; - goto out_err; - } - count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len); - -out_err: - kfree(mask_str); + if (len >= count) + return -EINVAL; - return count; + return simple_read_from_buffer(ubuf, count, ppos, mask_str, len); } int tracing_set_cpumask(struct trace_array *tr, @@ -10739,7 +10733,8 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, size_t count, loff_t *ppos, int (*createfn)(const char *)) { - char *kbuf, *buf, *tmp; + char *kbuf __free(kfree) = NULL; + char *buf, *tmp; int ret = 0; size_t done = 0; size_t size; @@ -10754,10 +10749,9 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, if (size >= WRITE_BUFSIZE) size = WRITE_BUFSIZE - 1; - if (copy_from_user(kbuf, buffer + done, size)) { - ret = -EFAULT; - goto out; - } + if (copy_from_user(kbuf, buffer + done, size)) + return -EFAULT; + kbuf[size] = '\0'; buf = kbuf; do { @@ -10773,8 +10767,7 @@ 
ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */ pr_warn("Line length is too long: Should be less than %d\n", WRITE_BUFSIZE - 2); - ret = -EINVAL; - goto out; + return -EINVAL; } } done += size; @@ -10787,17 +10780,12 @@ ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, ret = createfn(buf); if (ret) - goto out; + return ret; buf += size; } while (done < count); } - ret = done; - -out: - kfree(kbuf); - - return ret; + return done; } #ifdef CONFIG_TRACER_MAX_TRACE -- cgit v1.2.3 From db5f0c3e3e60939bb2ecc2dbdea4e6f32252620b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 1 Aug 2025 16:37:27 -0400 Subject: ring-buffer: Convert ring_buffer_write() to use guard(preempt_notrace) The function ring_buffer_write() has a goto out to only do a preempt_enable_notrace(). This can be replaced by a guard. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250801203858.205479143@kernel.org Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 00fc38d70e86..9d7bf17fbfba 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4714,26 +4714,26 @@ int ring_buffer_write(struct trace_buffer *buffer, int ret = -EBUSY; int cpu; - preempt_disable_notrace(); + guard(preempt_notrace)(); if (atomic_read(&buffer->record_disabled)) - goto out; + return -EBUSY; cpu = raw_smp_processor_id(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) - goto out; + return -EBUSY; cpu_buffer = buffer->buffers[cpu]; if (atomic_read(&cpu_buffer->record_disabled)) - goto out; + return -EBUSY; if (length > buffer->max_data_size) - goto out; + return -EBUSY; if (unlikely(trace_recursive_lock(cpu_buffer))) - goto out; + return -EBUSY; event = rb_reserve_next_event(buffer, cpu_buffer, length); if (!event) @@ -4751,10 +4751,6 @@ int ring_buffer_write(struct trace_buffer *buffer, out_unlock: trace_recursive_unlock(cpu_buffer); - - out: - preempt_enable_notrace(); - return ret; } EXPORT_SYMBOL_GPL(ring_buffer_write); -- cgit v1.2.3 From 3ca824369b71d4b441e1fdcdee8e66bcb05510a9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 1 Aug 2025 16:56:01 -0400 Subject: tracing: Have unsigned int function args displayed as hexadecimal Most function arguments that are passed in as unsigned int or unsigned long are better displayed as hexadecimal than normal integer. 
For example, the functions: static void __create_object(unsigned long ptr, size_t size, int min_count, gfp_t gfp, unsigned int objflags); static bool stack_access_ok(struct unwind_state *state, unsigned long _addr, size_t len); void __local_bh_disable_ip(unsigned long ip, unsigned int cnt); Show up in the trace as: __create_object(ptr=-131387050520576, size=4096, min_count=1, gfp=3264, objflags=0) <-kmem_cache_alloc_noprof stack_access_ok(state=0xffffc9000233fc98, _addr=-60473102566256, len=8) <-unwind_next_frame __local_bh_disable_ip(ip=-2127311112, cnt=256) <-handle_softirqs Instead, by displaying unsigned as hexadecimal, they look more like this: __create_object(ptr=0xffff8881028d2080, size=0x280, min_count=1, gfp=0x82820, objflags=0x0) <-kmem_cache_alloc_node_noprof stack_access_ok(state=0xffffc90000003938, _addr=0xffffc90000003930, len=0x8) <-unwind_next_frame __local_bh_disable_ip(ip=0xffffffff8133cef8, cnt=0x100) <-handle_softirqs This is much easier to understand, as most unsigned longs are usually just pointers. Even the "unsigned int cnt" in __local_bh_disable_ip() looks better as hexadecimal, since a lot of flags are passed as unsigned. Changes since v2: https://lore.kernel.org/20250801111453.01502861@gandalf.local.home - Use btf_int_encoding() instead of open coding it (Martin KaFai Lau) Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Douglas Raillard Cc: Martin KaFai Lau Link: https://lore.kernel.org/20250801165601.7770d65c@gandalf.local.home Acked-by: Yonghong Song Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_output.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 0b3db02030a7..97db0b0ccf3e 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -701,6 +701,7 @@ void print_function_args(struct trace_seq *s, unsigned long *args, struct btf *btf; s32 tid, nr = 0; int a, p, x; + u16 encode; trace_seq_printf(s, "("); @@ -744,7 +745,12 @@ trace_seq_printf(s, "0x%lx", arg); break; case BTF_KIND_INT: - trace_seq_printf(s, "%ld", arg); + encode = btf_int_encoding(t); + /* Print unsigned ints as hex */ + if (encode & BTF_INT_SIGNED) + trace_seq_printf(s, "%ld", arg); + else + trace_seq_printf(s, "0x%lx", arg); break; case BTF_KIND_ENUM: trace_seq_printf(s, "%ld", arg); -- cgit v1.2.3 From 83e6384374bac8a9da3411fae7f24376a7dbd2a3 Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Tue, 22 Jul 2025 09:18:18 -0700 Subject: smp: Fix spelling in on_each_cpu_cond_mask()'s doc-comment "boolean" is spelt as "blooean". Fix that. Signed-off-by: Roman Kisel Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250722161818.6139-1-romank@linux.microsoft.com --- kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 4649fa4872ff..56f83aa58ec8 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -1018,7 +1018,7 @@ void __init smp_init(void) * @cond_func: A callback function that is passed a cpu id and * the info parameter. The function is called * with preemption disabled. The function should - * return a blooean value indicating whether to IPI + * return a boolean value indicating whether to IPI the specified CPU. * @func: The function to run on all applicable CPUs. * This must be fast and non-blocking.
-- cgit v1.2.3 From e703b7e247503b8bf87b62c02a4392749b09eca8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Jul 2025 21:44:55 +0200 Subject: futex: Move futex cleanup to __mmdrop() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Futex hash allocations are done in mm_init() and the cleanup happens in __mmput(). That works most of the time, but there are mm instances which are instantiated via mm_alloc() and freed via mmdrop(), which causes the futex hash to be leaked. Move the cleanup to __mmdrop(). Fixes: 56180dd20c19 ("futex: Use RCU-based per-CPU reference counting instead of rcuref_t") Reported-by: André Draszik Signed-off-by: Thomas Gleixner Tested-by: André Draszik Link: https://lore.kernel.org/all/87ldo5ihu0.ffs@tglx Closes: https://lore.kernel.org/all/0c8cc83bb73abf080faf584f319008b67d0931db.camel@linaro.org --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index f82b77eef7fe..1b0535ee5ffa 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -686,6 +686,7 @@ void __mmdrop(struct mm_struct *mm) mm_pasid_drop(mm); mm_destroy_cid(mm); percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS); + futex_hash_free(mm); free_mm(mm); } @@ -1133,7 +1134,6 @@ static inline void __mmput(struct mm_struct *mm) if (mm->binfmt) module_put(mm->binfmt->module); lru_gen_del_mm(mm); - futex_hash_free(mm); mmdrop(mm); } -- cgit v1.2.3 From 1b30d44417278196a90c79244bb43e8428586345 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 1 Aug 2025 16:23:30 -0700 Subject: bpf: Fix memory leak of bpf_scc_info objects env->scc_info array contains references to bpf_scc_info objects allocated lazily in verifier.c:scc_visit_alloc(). env->scc_cnt was supposed to track env->scc_info array size in order to free referenced objects in verifier.c:free_states(). Fix initialization of env->scc_cnt that was omitted in verifier.c:compute_scc(). 
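For illustration only, a condensed sketch of the lazy-allocation pattern involved (simplified, not the verifier code verbatim): the free loop runs zero times when the count was never initialized, and it must also skip slots that were never allocated — the fix in the diff below addresses both points.

/* Sketch: scc_info[] entries are allocated on demand, so freeing
 * depends on scc_cnt being set and on tolerating NULL slots.
 */
static void free_scc_info_sketch(struct bpf_verifier_env *env)
{
	u32 i;

	for (i = 0; i < env->scc_cnt; i++) {	/* 0 iterations if scc_cnt was never set */
		if (!env->scc_info[i])		/* slot never lazily allocated */
			continue;
		kvfree(env->scc_info[i]);
	}
	kvfree(env->scc_info);
}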
To reproduce the bug: - build with CONFIG_DEBUG_KMEMLEAK - boot and load a bpf program with loops, e.g.: ./veristat -q pyperf180.bpf.o - initiate memleak scan and check results: echo scan > /sys/kernel/debug/kmemleak cat /sys/kernel/debug/kmemleak Fixes: c9e31900b54c ("bpf: propagate read/precision marks over state graph backedges") Reported-by: Jens Axboe Closes: https://lore.kernel.org/bpf/CAADnVQKXUWg9uRCPD5ebRXwN4dmBCRUFFM7kN=GxymYz3zU25A@mail.gmail.com/T/ Suggested-by: Alexei Starovoitov Tested-by: Jens Axboe Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250801232330.1800436-1-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0806295945e4..c4f69a9e9af6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -23114,6 +23114,8 @@ static void free_states(struct bpf_verifier_env *env) for (i = 0; i < env->scc_cnt; ++i) { info = env->scc_info[i]; + if (!info) + continue; for (j = 0; j < info->num_visits; j++) free_backedges(&info->visits[j]); kvfree(info); @@ -24554,6 +24556,7 @@ dfs_continue: err = -ENOMEM; goto exit; } + env->scc_cnt = next_scc_id; exit: kvfree(stack); kvfree(pre); -- cgit v1.2.3 From 07d24902977e4704fab8472981e73a0ad6dfa1fd Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 10 Jun 2025 08:53:27 +0000 Subject: kexec: enable CMA based contiguous allocation When booting a new kernel with kexec_file, the kernel picks a target location that the kernel should live at, then allocates random pages, checks whether any of those pages magically happens to coincide with a target address range and if so, uses them for that range. For every page allocated this way, it then creates a page list that the relocation code - code that executes while all CPUs are off and we are just about to jump into the new kernel - copies to their final memory location. We cannot put them there beforehand, because chances are pretty good that at least some page in the target range is already in use by the currently running Linux environment. Copying is happening from a single CPU at RAM rate, which takes around 4-50 ms per 100 MiB. All of this is inefficient and error-prone. To successfully kexec, we need to quiesce all devices of the outgoing kernel so they don't scribble over the new kernel's memory. We have seen cases where that does not happen properly (*cough* GIC *cough*) and hence the new kernel was corrupted. This started a month-long journey to root-cause failing kexecs to eventually see memory corruption, because the new kernel was corrupted severely enough that it could not emit output to tell us about the fact that it was corrupted. By allocating memory for the next kernel from a memory range that is guaranteed to be free of such scribbling, we can boot the next kernel up to a point where it is at least able to detect corruption and maybe even stop it before it becomes severe. This increases the chance for successful kexecs. Since kexec got introduced, Linux has gained the CMA framework which can perform physically contiguous memory mappings, while keeping that memory available for movable memory when it is not needed for contiguous allocations. The default CMA allocator is for DMA allocations. This patch adds logic to the kexec file loader to attempt to place the target payload at a location allocated from CMA. If successful, it uses that memory range directly instead of creating copy instructions during the hot phase.
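As a sanity check on the copy-cost figure quoted above: assuming a single-threaded memcpy bandwidth somewhere between 2 GiB/s and 25 GiB/s (an assumed range, not a number from the patch), copying 100 MiB takes roughly 100/25600 s ≈ 4 ms at the fast end and 100/2048 s ≈ 50 ms at the slow end, which is where the 4-50 ms per 100 MiB estimate lands.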
To ensure that there is a safety net in case anything goes wrong with the CMA allocation, it also adds a flag for user space to force disable CMA allocations. Using CMA allocations has two advantages: 1) Faster by 4-50 ms per 100 MiB. There is no more need to copy in the hot phase. 2) More robust. Even if by accident some page is still in use for DMA, the new kernel image will be safe from that access because it resides in a memory region that is considered allocated in the old kernel and has a chance to reinitialize that component. Link: https://lkml.kernel.org/r/20250610085327.51817-1-graf@amazon.com Signed-off-by: Alexander Graf Acked-by: Baoquan He Reviewed-by: Pasha Tatashin Cc: Zhongkun He Signed-off-by: Andrew Morton --- kernel/kexec.c | 2 +- kernel/kexec_core.c | 100 ++++++++++++++++++++++++++++++++++++++++++++---- kernel/kexec_file.c | 51 +++++++++++++++++++++++- kernel/kexec_internal.h | 2 +- 4 files changed, 144 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index a6b3f96bb50c..28008e3d462e 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -152,7 +152,7 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, goto out; for (i = 0; i < nr_segments; i++) { - ret = kimage_load_segment(image, &image->segment[i]); + ret = kimage_load_segment(image, i); if (ret) goto out; } diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 3a9a9f240dbc..e390c0df6d55 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -553,6 +554,24 @@ static void kimage_free_entry(kimage_entry_t entry) kimage_free_pages(page); } +static void kimage_free_cma(struct kimage *image) +{ + unsigned long i; + + for (i = 0; i < image->nr_segments; i++) { + struct page *cma = image->segment_cma[i]; + u32 nr_pages = image->segment[i].memsz >> PAGE_SHIFT; + + if (!cma) + continue; + + arch_kexec_pre_free_pages(page_address(cma), nr_pages); + dma_release_from_contiguous(NULL, cma, nr_pages); + image->segment_cma[i] = NULL; + } + +} + void kimage_free(struct kimage *image) { kimage_entry_t *ptr, entry; @@ -591,6 +610,9 @@ void kimage_free(struct kimage *image) /* Free the kexec control pages... */ kimage_free_page_list(&image->control_pages); + /* Free CMA allocations */ + kimage_free_cma(image); + /* * Free up any temporary buffers allocated. This might hit if * error occurred much later after buffer allocation. 
@@ -716,9 +738,69 @@ static struct page *kimage_alloc_page(struct kimage *image, return page; } -static int kimage_load_normal_segment(struct kimage *image, - struct kexec_segment *segment) +static int kimage_load_cma_segment(struct kimage *image, int idx) +{ + struct kexec_segment *segment = &image->segment[idx]; + struct page *cma = image->segment_cma[idx]; + char *ptr = page_address(cma); + unsigned long maddr; + size_t ubytes, mbytes; + int result = 0; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; + + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; + ubytes = segment->bufsz; + mbytes = segment->memsz; + maddr = segment->mem; + + /* Then copy from source buffer to the CMA one */ + while (mbytes) { + size_t uchunk, mchunk; + + ptr += maddr & ~PAGE_MASK; + mchunk = min_t(size_t, mbytes, + PAGE_SIZE - (maddr & ~PAGE_MASK)); + uchunk = min(ubytes, mchunk); + + if (uchunk) { + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); + ubytes -= uchunk; + if (image->file_mode) + kbuf += uchunk; + else + buf += uchunk; + } + + if (result) { + result = -EFAULT; + goto out; + } + + ptr += mchunk; + maddr += mchunk; + mbytes -= mchunk; + + cond_resched(); + } + + /* Clear any remainder */ + memset(ptr, 0, mbytes); + +out: + return result; +} + +static int kimage_load_normal_segment(struct kimage *image, int idx) { + struct kexec_segment *segment = &image->segment[idx]; unsigned long maddr; size_t ubytes, mbytes; int result; @@ -733,6 +815,9 @@ static int kimage_load_normal_segment(struct kimage *image, mbytes = segment->memsz; maddr = segment->mem; + if (image->segment_cma[idx]) + return kimage_load_cma_segment(image, idx); + result = kimage_set_destination(image, maddr); if (result < 0) goto out; @@ -787,13 +872,13 @@ out: } #ifdef CONFIG_CRASH_DUMP -static int kimage_load_crash_segment(struct kimage *image, - struct kexec_segment *segment) +static int kimage_load_crash_segment(struct kimage *image, int idx) { /* For crash dumps kernels we simply copy the data from * user space to it's destination. * We do things a page at a time for the sake of kmap. 
*/ + struct kexec_segment *segment = &image->segment[idx]; unsigned long maddr; size_t ubytes, mbytes; int result; @@ -858,18 +943,17 @@ out: } #endif -int kimage_load_segment(struct kimage *image, - struct kexec_segment *segment) +int kimage_load_segment(struct kimage *image, int idx) { int result = -ENOMEM; switch (image->type) { case KEXEC_TYPE_DEFAULT: - result = kimage_load_normal_segment(image, segment); + result = kimage_load_normal_segment(image, idx); break; #ifdef CONFIG_CRASH_DUMP case KEXEC_TYPE_CRASH: - result = kimage_load_crash_segment(image, segment); + result = kimage_load_crash_segment(image, idx); break; #endif } diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 69fe76fd9233..41271eee0f99 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "kexec_internal.h" #ifdef CONFIG_KEXEC_SIG @@ -253,6 +254,8 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, ret = 0; } + image->no_cma = !!(flags & KEXEC_FILE_NO_CMA); + if (cmdline_len) { image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len); if (IS_ERR(image->cmdline_buf)) { @@ -434,7 +437,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, i, ksegment->buf, ksegment->bufsz, ksegment->mem, ksegment->memsz); - ret = kimage_load_segment(image, &image->segment[i]); + ret = kimage_load_segment(image, i); if (ret) goto out; } @@ -663,6 +666,43 @@ static int kexec_walk_resources(struct kexec_buf *kbuf, return walk_system_ram_res(0, ULONG_MAX, kbuf, func); } +static int kexec_alloc_contig(struct kexec_buf *kbuf) +{ + size_t nr_pages = kbuf->memsz >> PAGE_SHIFT; + unsigned long mem; + struct page *p; + + /* User space disabled CMA allocations, bail out. */ + if (kbuf->image->no_cma) + return -EPERM; + + /* Skip CMA logic for crash kernel */ + if (kbuf->image->type == KEXEC_TYPE_CRASH) + return -EPERM; + + p = dma_alloc_from_contiguous(NULL, nr_pages, get_order(kbuf->buf_align), true); + if (!p) + return -ENOMEM; + + pr_debug("allocated %zu DMA pages at 0x%lx", nr_pages, page_to_boot_pfn(p)); + + mem = page_to_boot_pfn(p) << PAGE_SHIFT; + + if (kimage_is_destination_range(kbuf->image, mem, mem + kbuf->memsz)) { + /* Our region is already in use by a statically defined one. Bail out. */ + pr_debug("CMA overlaps existing mem: 0x%lx+0x%lx\n", mem, kbuf->memsz); + dma_release_from_contiguous(NULL, p, nr_pages); + return -EBUSY; + } + + kbuf->mem = page_to_boot_pfn(p) << PAGE_SHIFT; + kbuf->cma = p; + + arch_kexec_post_alloc_pages(page_address(p), (int)nr_pages, 0); + + return 0; +} + /** * kexec_locate_mem_hole - find free memory for the purgatory or the next kernel * @kbuf: Parameters for the memory search. @@ -687,6 +727,13 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) if (ret <= 0) return ret; + /* + * Try to find a free physically contiguous block of memory first. With that, we + * can avoid any copying at kexec time. + */ + if (!kexec_alloc_contig(kbuf)) + return 0; + if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) ret = kexec_walk_resources(kbuf, locate_mem_hole_callback); else @@ -732,6 +779,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf) /* Ensure minimum alignment needed for segments. 
*/ kbuf->memsz = ALIGN(kbuf->memsz, PAGE_SIZE); kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE); + kbuf->cma = NULL; /* Walk the RAM ranges and allocate a suitable range for the buffer */ ret = arch_kexec_locate_mem_hole(kbuf); @@ -744,6 +792,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf) ksegment->bufsz = kbuf->bufsz; ksegment->mem = kbuf->mem; ksegment->memsz = kbuf->memsz; + kbuf->image->segment_cma[kbuf->image->nr_segments] = kbuf->cma; kbuf->image->nr_segments++; return 0; } diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 30a733a55a67..228bb88c018b 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -10,7 +10,7 @@ struct kimage *do_kimage_alloc_init(void); int sanity_check_segment_list(struct kimage *image); void kimage_free_page_list(struct list_head *list); void kimage_free(struct kimage *image); -int kimage_load_segment(struct kimage *image, struct kexec_segment *segment); +int kimage_load_segment(struct kimage *image, int idx); void kimage_terminate(struct kimage *image); int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end); -- cgit v1.2.3 From f8cd9193b62e92ad25def5370ca8ea2bc7585381 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 21 Jul 2025 19:45:57 +0200 Subject: ucount: fix atomic_long_inc_below() argument type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The type of u argument of atomic_long_inc_below() should be long to avoid unwanted truncation to int. The patch fixes the wrong argument type of an internal function to prevent unwanted argument truncation. It fixes an internal locking primitive; it should not have any direct effect on userspace. Mark said : AFAICT there's no problem in practice because atomic_long_inc_below() : is only used by inc_ucount(), and it looks like the value is : constrained between 0 and INT_MAX. : : In inc_ucount() the limit value is taken from : user_namespace::ucount_max[], and AFAICT that's only written by : sysctls, to the table setup by setup_userns_sysctls(), where : UCOUNT_ENTRY() limits the value between 0 and INT_MAX. : : This is certainly a cleanup, but there might be no functional issue in : practice as above. Link: https://lkml.kernel.org/r/20250721174610.28361-1-ubizjak@gmail.com Fixes: f9c82a4ea89c ("Increase size of ucounts to atomic_long_t") Signed-off-by: Uros Bizjak Reviewed-by: "Eric W. Biederman" Cc: Sebastian Andrzej Siewior Cc: "Paul E. McKenney" Cc: Alexey Gladkov Cc: Roman Gushchin Cc: MengEn Sun Cc: "Thomas Weißschuh" Cc: Mark Rutland Signed-off-by: Andrew Morton --- kernel/ucount.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ucount.c b/kernel/ucount.c index 8686e329b8f2..f629db485a07 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -199,7 +199,7 @@ void put_ucounts(struct ucounts *ucounts) } } -static inline bool atomic_long_inc_below(atomic_long_t *v, int u) +static inline bool atomic_long_inc_below(atomic_long_t *v, long u) { long c, old; c = atomic_long_read(v); -- cgit v1.2.3 From 58b4fba81a2e400a47ddbe7c1dc0a2bc038313b7 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 21 Jul 2025 19:45:58 +0200 Subject: ucount: use atomic_long_try_cmpxchg() in atomic_long_inc_below() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use atomic_long_try_cmpxchg() instead of atomic_long_cmpxchg (*ptr, old, new) == old in atomic_long_inc_below(). 
x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, atomic_long_try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails, enabling further code simplifications. No functional change intended. Link: https://lkml.kernel.org/r/20250721174610.28361-2-ubizjak@gmail.com Signed-off-by: Uros Bizjak Reviewed-by: Alexey Gladkov Cc: Sebastian Andrzej Siewior Cc: "Paul E. McKenney" Cc: Alexey Gladkov Cc: Roman Gushchin Cc: MengEn Sun Cc: "Thomas Weißschuh" Signed-off-by: Andrew Morton --- kernel/ucount.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/ucount.c b/kernel/ucount.c index f629db485a07..586af49fc03e 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -201,16 +201,14 @@ void put_ucounts(struct ucounts *ucounts) static inline bool atomic_long_inc_below(atomic_long_t *v, long u) { - long c, old; - c = atomic_long_read(v); - for (;;) { + long c = atomic_long_read(v); + + do { if (unlikely(c >= u)) return false; - old = atomic_long_cmpxchg(v, c, c+1); - if (likely(old == c)) - return true; - c = old; - } + } while (!atomic_long_try_cmpxchg(v, &c, c+1)); + + return true; } struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, -- cgit v1.2.3 From 881388f34338197f4ea3adf4d08dc6374c3420c8 Mon Sep 17 00:00:00 2001 From: Xuanye Liu Date: Wed, 23 Jul 2025 18:09:00 +0800 Subject: mm: add process info to bad rss-counter warning Enhance the debugging information in check_mm() by including the process name and PID when reporting bad rss-counter states. This helps identify which process is associated with the memory accounting issue. Link: https://lkml.kernel.org/r/20250723100901.1909683-1-liuqiye2025@163.com Signed-off-by: Xuanye Liu Acked-by: SeongJae Park Cc: Ben Segall Cc: David Hildenbrand Cc: Dietmar Eggemann Cc: Ingo Molnar Cc: Juri Lelli Cc: Kees Cook Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Valentin Schneider Cc: Vincent Guittot Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- kernel/fork.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 1ee8eb11f38b..f799d128b968 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -585,9 +585,12 @@ static void check_mm(struct mm_struct *mm) for (i = 0; i < NR_MM_COUNTERS; i++) { long x = percpu_counter_sum(&mm->rss_stat[i]); - if (unlikely(x)) - pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", - mm, resident_page_types[i], x); + if (unlikely(x)) { + pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld Comm:%s Pid:%d\n", + mm, resident_page_types[i], x, + current->comm, + task_pid_nr(current)); + } } if (mm_pgtables_bytes(mm)) -- cgit v1.2.3 From 838955f64ae7582f009a3538889bb9244f37ab26 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 13 Jul 2025 10:17:24 +0300 Subject: execmem: introduce execmem_alloc_rw() Some callers of execmem_alloc() require the memory to be temporarily writable even when it is allocated from ROX cache. These callers use execemem_make_temp_rw() right after the call to execmem_alloc(). Wrap this sequence in execmem_alloc_rw() API. 
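For illustration, the new wrapper plausibly looks like the sketch below, reconstructed from the open-coded sequence it replaces in the module loader diff that follows (the real implementation is not part of this diff and may differ in detail):

void *execmem_alloc_rw(enum execmem_type type, size_t size)
{
	void *ptr = execmem_alloc(type, size);

	if (!ptr)
		return NULL;

	/* ROX cache memory must be made temporarily writable */
	if (execmem_is_rox(type) && execmem_make_temp_rw(ptr, size)) {
		execmem_free(ptr);
		return NULL;
	}

	return ptr;
}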
Link: https://lkml.kernel.org/r/20250713071730.4117334-3-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Daniel Gomez Reviewed-by: Petr Pavlu Acked-by: Peter Zijlstra (Intel) Cc: Masami Hiramatsu (Google) Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- kernel/module/main.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/module/main.c b/kernel/module/main.c index 413ac6ea3702..d009326ef7bb 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1292,20 +1292,11 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type) else execmem_type = EXECMEM_MODULE_TEXT; - ptr = execmem_alloc(execmem_type, size); + ptr = execmem_alloc_rw(execmem_type, size); if (!ptr) return -ENOMEM; - if (execmem_is_rox(execmem_type)) { - int err = execmem_make_temp_rw(ptr, size); - - if (err) { - execmem_free(ptr); - return -ENOMEM; - } - - mod->mem[type].is_rox = true; - } + mod->mem[type].is_rox = execmem_is_rox(execmem_type); /* * The pointer to these blocks of memory are stored on the module -- cgit v1.2.3 From 99b773d720aeea1ef2170dce5fcfa80649e26b78 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Jul 2025 15:11:14 -0400 Subject: sched/psi: Fix psi_seq initialization With the seqcount moved out of the group into a global psi_seq, re-initializing the seqcount on group creation is causing seqcount corruption. Fixes: 570c8efd5eb7 ("sched/psi: Optimize psi_group_change() cpu_clock() usage") Reported-by: Chris Mason Suggested-by: Beata Michalska Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Linus Torvalds --- kernel/sched/psi.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 2024c1d36402..59fdb7ebbf22 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -176,7 +176,7 @@ struct psi_group psi_system = { .pcpu = &system_group_pcpu, }; -static DEFINE_PER_CPU(seqcount_t, psi_seq); +static DEFINE_PER_CPU(seqcount_t, psi_seq) = SEQCNT_ZERO(psi_seq); static inline void psi_write_begin(int cpu) { @@ -204,11 +204,7 @@ static void poll_timer_fn(struct timer_list *t); static void group_init(struct psi_group *group) { - int cpu; - group->enabled = true; - for_each_possible_cpu(cpu) - seqcount_init(per_cpu_ptr(&psi_seq, cpu)); group->avg_last_update = sched_clock(); group->avg_next_update = group->avg_last_update + psi_period; mutex_init(&group->avgs_lock); -- cgit v1.2.3 From 54473e0ef849f44e5ee43e6d6746c27030c3825b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Aug 2025 22:22:09 +0200 Subject: perf/core: Preserve AUX buffer allocation failure result A recent overhaul sets the return value to 0 unconditionally after the allocations, which causes reference count leaks and corrupts the user->vm accounting. Preserve the AUX buffer allocation failure return value, so that the subsequent code works correctly. 
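For illustration, the shape of the bug in miniature (a self-contained sketch, not the perf code itself): an unconditional assignment placed after the if/else discards whatever error the AUX branch produced.

/* Sketch of the broken control flow the patch below repairs. */
static int demo_mmap(int use_aux)
{
	int ret;

	if (!use_aux)
		ret = 0;		/* ring buffer path succeeded */
	else
		ret = -ENOMEM;		/* stand-in for rb_alloc_aux() failing */

	ret = 0;			/* bug: clobbers the AUX error */
	return ret;			/* caller never observes the failure */
}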
Fixes: 0983593f32c4 ("perf/core: Lift event->mmap_mutex in perf_mmap()") Signed-off-by: Thomas Gleixner Reviewed-by: Lorenzo Stoakes Cc: stable@vger.kernel.org --- kernel/events/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 22fdf0c187cd..c05262e15b7d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7115,6 +7115,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) perf_event_update_time(event); perf_event_init_userpage(event); perf_event_update_userpage(event); + ret = 0; } else { ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, event->attr.aux_watermark, flags); @@ -7122,8 +7123,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) rb->aux_mmap_locked = extra; } - ret = 0; - unlock: if (!ret) { atomic_long_add(user_extra, &user->locked_vm); -- cgit v1.2.3 From 5468c0fbccbb9d156522c50832244a8b722374fb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 2 Aug 2025 12:39:39 +0200 Subject: perf/core: Don't leak AUX buffer refcount on allocation failure Failure of the AUX buffer allocation leaks the reference count. Set the reference count to 1 only when the allocation succeeds. Fixes: 45bfb2e50471 ("perf: Add AUX area to ring buffer for raw data streams") Signed-off-by: Thomas Gleixner Reviewed-by: Lorenzo Stoakes Cc: stable@vger.kernel.org --- kernel/events/core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index c05262e15b7d..e89e77228591 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7051,8 +7051,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) ret = 0; goto unlock; } - - atomic_set(&rb->aux_mmap_count, 1); } user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); @@ -7119,8 +7117,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) } else { ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, event->attr.aux_watermark, flags); - if (!ret) + if (!ret) { + atomic_set(&rb->aux_mmap_count, 1); rb->aux_mmap_locked = extra; + } } unlock: @@ -7130,6 +7130,7 @@ unlock: atomic_inc(&event->mmap_count); } else if (rb) { + /* AUX allocation failed */ atomic_dec(&rb->mmap_count); } aux_unlock: -- cgit v1.2.3 From 07091aade394f690e7b655578140ef84d0e8d7b0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 2 Aug 2025 12:49:48 +0200 Subject: perf/core: Exit early on perf_mmap() fail When perf_mmap() fails to allocate a buffer, it still invokes the event_mapped() callback of the related event. On X86 this might increase the perf_rdpmc_allowed reference counter. But nothing undoes this as perf_mmap_close() is never called in this case, which causes another reference count leak. Return early on failure to prevent that. Fixes: 1e0fb9ec679c ("perf: Add pmu callbacks to track event mapping and unmapping") Signed-off-by: Thomas Gleixner Reviewed-by: Lorenzo Stoakes Cc: stable@vger.kernel.org --- kernel/events/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index e89e77228591..a2e3591175c6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7138,6 +7138,9 @@ aux_unlock: mutex_unlock(aux_mutex); mutex_unlock(&event->mmap_mutex); + if (ret) + return ret; + /* * Since pinned accounting is per vm we cannot allow fork() to copy our * vma. 
@@ -7145,8 +7148,7 @@ aux_unlock: vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &perf_mmap_vmops; - if (!ret) - ret = map_range(rb, vma); + ret = map_range(rb, vma); mapped = get_mapped(event, event_mapped); if (mapped) -- cgit v1.2.3 From f74b9f4ba63ffdf597aaaa6cad7e284cb8e04820 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 2 Aug 2025 12:48:55 +0200 Subject: perf/core: Handle buffer mapping fail correctly in perf_mmap() After successful allocation of a buffer or a successful attachment to an existing buffer perf_mmap() tries to map the buffer read only into the page table. If that fails, the already set up page table entries are zapped, but the other perf specific side effects of that failure are not handled. The calling code just cleans up the VMA and does not invoke perf_mmap_close(). This leaks reference counts, corrupts user->vm accounting and also results in an unbalanced invocation of event::event_mapped(). Cure this by moving the event::event_mapped() invocation before the map_range() call so that on map_range() failure perf_mmap_close() can be invoked without causing an unbalanced event::event_unmapped() call. perf_mmap_close() undoes the reference counts and eventually frees buffers. Fixes: b709eb872e19 ("perf: map pages in advance") Signed-off-by: Thomas Gleixner Reviewed-by: Lorenzo Stoakes Cc: stable@vger.kernel.org --- kernel/events/core.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index a2e3591175c6..4563bd864bbc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7148,12 +7148,20 @@ aux_unlock: vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &perf_mmap_vmops; - ret = map_range(rb, vma); - mapped = get_mapped(event, event_mapped); if (mapped) mapped(event, vma->vm_mm); + /* + * Try to map it into the page table. On fail, invoke + * perf_mmap_close() to undo the above, as the callsite expects + * full cleanup in this case and therefore does not invoke + * vmops::close(). + */ + ret = map_range(rb, vma); + if (ret) + perf_mmap_close(vma); + return ret; } -- cgit v1.2.3 From b024d7b56c77191cde544f838debb7f8451cd0d6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Jul 2025 23:01:21 +0200 Subject: perf/core: Prevent VMA split of buffer mappings The perf mmap code is careful about mmap()'ing the user page with the ringbuffer and additionally the auxiliary buffer, when the event supports it. Once the first mapping is established, subsequent mappings have to use the same offset and the same size in both cases. The reference counting for the ringbuffer and the auxiliary buffer depends on this being correct. Though perf does not prevent a related mapping from being split via mmap(2), munmap(2) or mremap(2). A split of a VMA results in perf_mmap_open() calls, which take reference counts, but then the subsequent perf_mmap_close() calls no longer fulfill the offset and size checks. This leads to reference count leaks. As perf already has the requirement for subsequent mappings to match the initial mapping, the obvious consequence is that VMA splits, caused by resizing of a mapping or partial unmapping, have to be prevented. Implement the vm_operations_struct::may_split() callback and return unconditionally -EINVAL. That ensures that the mapping offsets and sizes cannot be changed after the fact.
Remapping to a different fixed address with the same size is still possible as it takes the references for the new mapping and drops those of the old mapping. Fixes: 45bfb2e50471 ("perf: Add AUX area to ring buffer for raw data streams") Reported-by: zdi-disclosures@trendmicro.com # ZDI-CAN-27504 Signed-off-by: Thomas Gleixner Reviewed-by: Lorenzo Stoakes Acked-by: Arnaldo Carvalho de Melo Acked-by: Vlastimil Babka Cc: stable@vger.kernel.org --- kernel/events/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 4563bd864bbc..8060c2857bb2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6842,10 +6842,20 @@ static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf) return vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS; } +static int perf_mmap_may_split(struct vm_area_struct *vma, unsigned long addr) +{ + /* + * Forbid splitting perf mappings to prevent refcount leaks due to + * the resulting non-matching offsets and sizes. See open()/close(). + */ + return -EINVAL; +} + static const struct vm_operations_struct perf_mmap_vmops = { .open = perf_mmap_open, .close = perf_mmap_close, /* non mergeable */ .pfn_mkwrite = perf_mmap_pfn_mkwrite, + .may_split = perf_mmap_may_split, }; static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma) -- cgit v1.2.3 From 73d210e9faf85c36d5c9d2e38cb42c2d9837ee51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20G=C3=B3rny?= Date: Tue, 29 Jul 2025 15:24:55 +0200 Subject: kheaders: make it possible to override TAR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 86cdd2fdc4e3 ("kheaders: make headers archive reproducible") introduced a number of options specific to GNU tar to the `tar` invocation in `gen_kheaders.sh` script. This causes the script to fail to work on systems where `tar` is not GNU tar. This can occur e.g. on recent Gentoo Linux installations that support using bsdtar from libarchive instead. Add a `TAR` make variable to make it possible to override the tar executable used, e.g. by specifying: make TAR=gtar Link: https://bugs.gentoo.org/884061 Reported-by: Sam James Tested-by: Sam James Co-developed-by: Masahiro Yamada Signed-off-by: Michał Górny Signed-off-by: Sam James Signed-off-by: Masahiro Yamada --- kernel/gen_kheaders.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index c64e5a00a3d9..896a503dfb29 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -30,8 +30,8 @@ rm -rf "${tmpdir}" mkdir "${tmpdir}" # shellcheck disable=SC2154 # srctree is passed as an env variable -sed "s:^${srctree}/::" "${srclist}" | tar -c -f - -C "${srctree}" -T - | tar -xf - -C "${tmpdir}" -tar -c -f - -T "${objlist}" | tar -xf - -C "${tmpdir}" +sed "s:^${srctree}/::" "${srclist}" | ${TAR} -c -f - -C "${srctree}" -T - | ${TAR} -xf - -C "${tmpdir}" +${TAR} -c -f - -T "${objlist}" | ${TAR} -xf - -C "${tmpdir}" # Remove comments except SDPX lines # Use a temporary file to store directory contents to prevent find/xargs from @@ -43,7 +43,7 @@ xargs -0 -P8 -n1 \ rm -f "${tmpdir}.contents.txt" # Create archive and try to normalize metadata for reproducibility. -tar "${timestamp:+--mtime=$timestamp}" \ +${TAR} "${timestamp:+--mtime=$timestamp}" \ --owner=0 --group=0 --sort=name --numeric-owner --mode=u=rw,go=r,a+X \ -I "${XZ}" -cf "${tarfile}" -C "${tmpdir}/" . 
> /dev/null -- cgit v1.2.3 From 5b65258229117995eb6c4bd74995e15fb5f2cfe3 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Tue, 5 Aug 2025 11:32:20 -0700 Subject: genirq/test: Resolve irq lock inversion warnings irq_shutdown_and_deactivate() is normally called with the descriptor lock held, and interrupts disabled. Nested a few levels down, it grabs the global irq_resend_lock. Lockdep rightfully complains when interrupts are not disabled: CPU0 CPU1 ---- ---- lock(irq_resend_lock); local_irq_disable(); lock(&irq_desc_lock_class); lock(irq_resend_lock); lock(&irq_desc_lock_class); ... _raw_spin_lock+0x2b/0x40 clear_irq_resend+0x14/0x70 irq_shutdown_and_deactivate+0x29/0x80 irq_shutdown_depth_test+0x1ce/0x600 kunit_try_run_case+0x90/0x120 Grab the descriptor lock and disable interrupts, to resolve the problem. Fixes: 66067c3c8a1e ("genirq: Add kunit tests for depth counts") Reported-by: Guenter Roeck Signed-off-by: Brian Norris Signed-off-by: Thomas Gleixner Tested-by: Guenter Roeck Link: https://lore.kernel.org/all/aJJONEIoIiTSDMqc@google.com Closes: https://lore.kernel.org/lkml/31a761e4-8f81-40cf-aaf5-d220ba11911c@roeck-us.net/ --- kernel/irq/irq_test.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irq_test.c b/kernel/irq/irq_test.c index 5161b56a12f9..a75abebed7f2 100644 --- a/kernel/irq/irq_test.c +++ b/kernel/irq/irq_test.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: LGPL-2.1+ +#include #include #include #include @@ -134,7 +135,8 @@ static void irq_shutdown_depth_test(struct kunit *test) disable_irq(virq); KUNIT_EXPECT_EQ(test, desc->depth, 1); - irq_shutdown_and_deactivate(desc); + scoped_guard(raw_spinlock_irqsave, &desc->lock) + irq_shutdown_and_deactivate(desc); KUNIT_EXPECT_FALSE(test, irqd_is_activated(data)); KUNIT_EXPECT_FALSE(test, irqd_is_started(data)); -- cgit v1.2.3 From da274853fe7dbc7124e2dd84dad802be52a09321 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 29 Jul 2025 15:12:32 -0400 Subject: cpu: Remove obsolete comment from takedown_cpu() takedown_cpu() has a comment about "all preempt/rcu users must observe !cpu_active()" which is kind of meaningless in this function. This comment was originally introduced by commit 6acce3ef8452 ("sched: Remove get_online_cpus() usage") when _cpu_down() was setting cpu_active_mask and synchronize_rcu()/synchronize_sched() were added after that. Later commit 40190a78f85f ("sched/hotplug: Convert cpu_[in]active notifiers to state machine") added a new CPUHP_AP_ACTIVE hotplug state to set/clear cpu_active_mask. The following commit b2454caa8977 ("sched/hotplug: Move sync_rcu to be with set_cpu_active(false)") move the synchronize_*() calls to sched_cpu_deactivate() associated with the new hotplug state, but left the comment behind. Remove this comment as it is no longer relevant in takedown_cpu(). Signed-off-by: Waiman Long Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250729191232.664931-1-longman@redhat.com --- kernel/cpu.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index faf0f23fc5d8..db9f6c539b28 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1309,9 +1309,6 @@ static int takedown_cpu(unsigned int cpu) */ irq_lock_sparse(); - /* - * So now all preempt/rcu users must observe !cpu_active(). 
- */ err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu)); if (err) { /* CPU refused to die */ -- cgit v1.2.3 From 65f97cc81b0adc5f49cf6cff5d874be0058e3f41 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 6 Aug 2025 13:24:28 -0400 Subject: cgroup/cpuset: Use static_branch_enable_cpuslocked() on cpusets_insane_config_key The following lockdep splat was observed:

[ 812.359086] ============================================
[ 812.359089] WARNING: possible recursive locking detected
[ 812.359097] --------------------------------------------
[ 812.359100] runtest.sh/30042 is trying to acquire lock:
[ 812.359105] ffffffffa7f27420 (cpu_hotplug_lock){++++}-{0:0}, at: static_key_enable+0xe/0x20
[ 812.359131]
[ 812.359131] but task is already holding lock:
[ 812.359134] ffffffffa7f27420 (cpu_hotplug_lock){++++}-{0:0}, at: cpuset_write_resmask+0x98/0xa70
  :
[ 812.359267] Call Trace:
[ 812.359272]
[ 812.359367] cpus_read_lock+0x3c/0xe0
[ 812.359382] static_key_enable+0xe/0x20
[ 812.359389] check_insane_mems_config.part.0+0x11/0x30
[ 812.359398] cpuset_write_resmask+0x9f2/0xa70
[ 812.359411] cgroup_file_write+0x1c7/0x660
[ 812.359467] kernfs_fop_write_iter+0x358/0x530
[ 812.359479] vfs_write+0xabe/0x1250
[ 812.359529] ksys_write+0xf9/0x1d0
[ 812.359558] do_syscall_64+0x5f/0xe0

Since commit d74b27d63a8b ("cgroup/cpuset: Change cpuset_rwsem and hotplug lock order"), the ordering of the cpu hotplug lock and cpuset_mutex had been reversed. That patch correctly used the cpuslocked version of the static branch API to enable cpusets_pre_enable_key and cpusets_enabled_key, but it didn't do the same for cpusets_insane_config_key. The cpusets_insane_config_key can be enabled in check_insane_mems_config(), which is called from update_nodemask() or cpuset_hotplug_update_tasks() with both the cpu hotplug lock and cpuset_mutex held. Deadlock can happen with a pending hotplug event that tries to acquire the cpu hotplug write lock, which will block further cpus_read_lock() attempts from check_insane_mems_config(). Fix that by switching to static_branch_enable_cpuslocked(). Fixes: d74b27d63a8b ("cgroup/cpuset: Change cpuset_rwsem and hotplug lock order") Signed-off-by: Waiman Long Reviewed-by: Juri Lelli Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index f74d04429a29..bf149246e001 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -280,7 +280,7 @@ static inline void check_insane_mems_config(nodemask_t *nodes) { if (!cpusets_insane_config() && movable_only_nodes(nodes)) { - static_branch_enable(&cpusets_insane_config_key); + static_branch_enable_cpuslocked(&cpusets_insane_config_key); pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n" "Cpuset allocations might fail even with a lot of memory available.\n", nodemask_pr_args(nodes)); -- cgit v1.2.3 From 150e298ae0ccbecff2357a72fbabd80f8849ea6e Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 6 Aug 2025 13:24:29 -0400 Subject: cgroup/cpuset: Fix a partition error with CPU hotplug It was found during testing that an invalid leaf partition with an empty effective exclusive CPU list can become a valid empty partition with no CPU after an offline/online operation of an unrelated CPU. An empty partition root is allowed in the special case that it has no task in its cgroup and has distributed out all its CPUs to its child partitions.
That is certainly not the case here. The problem is in the cpumask_subset() test in the hotplug case (update with no new mask) of update_parent_effective_cpumask(), as it also returns true if the effective exclusive CPU list is empty. Fix that by adding a cpumask_empty() test to root out this exception case. Also add a cpumask_empty() test in cpuset_hotplug_update_tasks() to avoid calling update_parent_effective_cpumask() for this special case. Fixes: 0c7f293efc87 ("cgroup/cpuset: Add cpuset.cpus.exclusive.effective for v2") Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index bf149246e001..d993e058a663 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1843,7 +1843,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if (is_partition_valid(cs)) adding = cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); - } else if (is_partition_invalid(cs) && + } else if (is_partition_invalid(cs) && !cpumask_empty(xcpus) && cpumask_subset(xcpus, parent->effective_xcpus)) { struct cgroup_subsys_state *css; struct cpuset *child; @@ -3870,9 +3870,10 @@ retry: partcmd = partcmd_invalidate; /* * On the other hand, an invalid partition root may be transitioned - * back to a regular one. + * back to a regular one with a non-empty effective xcpus. */ - else if (is_partition_valid(parent) && is_partition_invalid(cs)) + else if (is_partition_valid(parent) && is_partition_invalid(cs) && + !cpumask_empty(cs->effective_xcpus)) partcmd = partcmd_update; if (partcmd >= 0) { -- cgit v1.2.3 From 87eba5bc5ab1d99e31c9d3b2c386187da94a5ab1 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 6 Aug 2025 13:24:30 -0400 Subject: cgroup/cpuset: Remove the unnecessary css_get/put() in cpuset_partition_write() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The css_get/put() calls in cpuset_partition_write() are unnecessary, as an active reference of the kernfs node will be taken, which will prevent its removal and guarantee the existence of the css. Only the online check is needed. Signed-off-by: Waiman Long Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index d993e058a663..27adb04df675 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3358,14 +3358,12 @@ static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf, else return -EINVAL; - css_get(&cs->css); cpus_read_lock(); mutex_lock(&cpuset_mutex); if (is_cpuset_online(cs)) retval = update_prstate(cs, val); mutex_unlock(&cpuset_mutex); cpus_read_unlock(); - css_put(&cs->css); return retval ?: nbytes; } -- cgit v1.2.3 From eea51c6e3f6675b795f6439eaa960eb2948d6905 Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Wed, 6 Aug 2025 17:33:50 -0700 Subject: cgroup: avoid null de-ref in css_rstat_exit() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit css_rstat_exit() may be called asynchronously in scenarios where preceding calls to css_rstat_init() have not completed. One such example is this sequence below:

	css_create(...) {
		...
		init_and_link_css(css, ...);

		err = percpu_ref_init(...);
		if (err)
			goto err_free_css;

		err = cgroup_idr_alloc(...);
		if (err)
			goto err_free_css;

		err = css_rstat_init(css, ...);
		if (err)
			goto err_free_css;

		...

	err_free_css:
		INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
		queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
		return ERR_PTR(err);
	}

If any of the three goto jumps are taken, async cleanup will begin and css_rstat_exit() will be invoked on an uninitialized css->rstat_cpu. Avoid accessing the uninitialized field by returning early in css_rstat_exit() if this is the case. Signed-off-by: JP Kobryn Suggested-by: Michal Koutný Fixes: 5da3bfa029d68 ("cgroup: use separate rstat trees for each subsystem") Cc: stable@vger.kernel.org # v6.16 Reported-by: syzbot+8d052e8b99e40bc625ed@syzkaller.appspotmail.com Acked-by: Shakeel Butt Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 981e2f77ad4e..a198e40c799b 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -479,6 +479,9 @@ void css_rstat_exit(struct cgroup_subsys_state *css) if (!css_uses_rstat(css)) return; + if (!css->rstat_cpu) + return; + css_rstat_flush(css); /* sanity check */ -- cgit v1.2.3 From 61399e0c5410567ef60cb1cda34cca42903842e3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Aug 2025 19:03:22 +0200 Subject: rcu: Fix racy re-initialization of irq_work causing hangs RCU re-initializes the deferred QS irq work every time before attempting to queue it. However, there are situations where an attempt is made to queue the irq work even though it is already queued. In that case, re-initializing messes up the irq work queue that is about to be handled. The chances for that to happen are higher when the architecture doesn't support self-IPIs and all irq work is then lazy, such as with the following sequence:

1) rcu_read_unlock() is called when IRQs are disabled and there is a grace period involving blocked tasks on the node. The irq work is then initialized and queued.

2) The related tasks are unblocked and the CPU quiescent state is reported. rdp->defer_qs_iw_pending is reset to DEFER_QS_IDLE, allowing the irq work to be requeued in the future (note the previous one hasn't fired yet).

3) A new grace period starts and the node has blocked tasks.

4) rcu_read_unlock() is called when IRQs are disabled again. The irq work is re-initialized (but it's queued! and its node is cleared) and requeued, which means it's requeued to itself.

5) The irq work finally fires with the tick. But since it was requeued to itself, it loops and hangs.

Fix this by initializing the irq work only once, before the CPU boots.
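To make the hazard concrete, here is a minimal sketch of the re-initialization bug (illustrative only; demo_handler and demo are made-up names, not code from this patch):

	#include <linux/irq_work.h>

	static void demo_handler(struct irq_work *work) { }

	static struct irq_work demo = IRQ_WORK_INIT_HARD(demo_handler);

	static void demo_requeue_bug(void)
	{
		irq_work_queue(&demo);	/* queued: demo.node is linked on a CPU's lazy list */

		/*
		 * If the handler has not fired yet, this assignment wipes
		 * demo.node while it is still linked on that list...
		 */
		demo = IRQ_WORK_INIT_HARD(demo_handler);

		/*
		 * ...so this requeue can link the node back to itself, and
		 * the handler loops forever once it finally runs.
		 */
		irq_work_queue(&demo);
	}

Initializing the work once, at CPU bring-up, removes this window entirely.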
Fixes: b41642c87716 ("rcu: Fix rcu_read_unlock() deadloop due to IRQ work") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202508071303.c1134cce-lkp@intel.com Signed-off-by: Frederic Weisbecker Reviewed-by: Joel Fernandes Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tree.c | 2 ++ kernel/rcu/tree.h | 1 + kernel/rcu/tree_plugin.h | 8 ++++++-- 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 174ee243b349..8eff357b0436 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4262,6 +4262,8 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->rcu_iw_gp_seq = rdp->gp_seq - 1; trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + rcu_preempt_deferred_qs_init(rdp); rcu_spawn_rnp_kthreads(rnp); rcu_spawn_cpu_nocb_kthread(cpu); ASSERT_EXCLUSIVE_WRITER(rcu_state.n_online_cpus); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index de6ca13a7b5f..b8bbe7960cda 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -488,6 +488,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); static void rcu_flavor_sched_clock_irq(int user); static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); +static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static bool rcu_is_callbacks_kthread(struct rcu_data *rdp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fc14adf15cbb..4cd170b2d655 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -763,8 +763,6 @@ static void rcu_read_unlock_special(struct task_struct *t) cpu_online(rdp->cpu)) { // Get scheduler to re-evaluate and call hooks. // If !IRQ_WORK, FQS scan will eventually IPI. - rdp->defer_qs_iw = - IRQ_WORK_INIT_HARD(rcu_preempt_deferred_qs_handler); rdp->defer_qs_iw_pending = DEFER_QS_PENDING; irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); } @@ -904,6 +902,10 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) } } +static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp) +{ + rdp->defer_qs_iw = IRQ_WORK_INIT_HARD(rcu_preempt_deferred_qs_handler); +} #else /* #ifdef CONFIG_PREEMPT_RCU */ /* @@ -1103,6 +1105,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks)); } +static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp) { } + #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ /* -- cgit v1.2.3 From 2c223f7239f376a90d71903ec474ba887cf21d94 Mon Sep 17 00:00:00 2001 From: Oreoluwa Babatunde Date: Wed, 6 Aug 2025 10:24:21 -0700 Subject: of: reserved_mem: Restructure call site for dma_contiguous_early_fixup() Restructure the call site for dma_contiguous_early_fixup() to where the reserved_mem nodes are being parsed from the DT so that dma_mmu_remap[] is populated before dma_contiguous_remap() is called. 
Fixes: 8a6e02d0c00e ("of: reserved_mem: Restructure how the reserved memory regions are processed") Signed-off-by: Oreoluwa Babatunde Tested-by: William Zhang Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20250806172421.2748302-1-oreoluwa.babatunde@oss.qualcomm.com --- kernel/dma/contiguous.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 67af8a55185d..d9b9dcba6ff7 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -483,8 +483,6 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) pr_err("Reserved memory: unable to setup CMA region\n"); return err; } - /* Architecture specific contiguous memory fixup. */ - dma_contiguous_early_fixup(rmem->base, rmem->size); if (default_cma) dma_contiguous_default_area = cma; -- cgit v1.2.3 From dfb36e4a8db0cd56f92d4cb445f54e85a9b40897 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 11 Aug 2025 10:11:47 -0400 Subject: futex: Use user_write_access_begin/_end() in futex_put_value() Commit cec199c5e39b ("futex: Implement FUTEX2_NUMA") introduced the futex_put_value() helper to write a value to the given user address. However, it uses user_read_access_begin() before the write. For architectures that differentiate between read and write accesses, like PowerPC, futex_put_value() fails with -EFAULT. Fix that by using the user_write_access_begin/user_write_access_end() pair instead. Fixes: cec199c5e39b ("futex: Implement FUTEX2_NUMA") Signed-off-by: Waiman Long Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20250811141147.322261-1-longman@redhat.com --- kernel/futex/futex.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index c74eac572acd..2cd57096c38e 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -319,13 +319,13 @@ static __always_inline int futex_put_value(u32 val, u32 __user *to) { if (can_do_masked_user_access()) to = masked_user_access_begin(to); - else if (!user_read_access_begin(to, sizeof(*to))) + else if (!user_write_access_begin(to, sizeof(*to))) return -EFAULT; unsafe_put_user(val, to, Efault); - user_read_access_end(); + user_write_access_end(); return 0; Efault: - user_read_access_end(); + user_write_access_end(); return -EFAULT; } -- cgit v1.2.3 From ddf7233fcab6c247379d0928d46cc316ee122229 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Tue, 5 Aug 2025 10:59:11 +0200 Subject: sched/ext: Fix invalid task state transitions on class switch When enabling a sched_ext scheduler, we may trigger invalid task state transitions, resulting in warnings like the following (which can be easily reproduced by running the hotplug selftest in a loop):

sched_ext: Invalid task state transition 0 -> 3 for fish[770]
WARNING: CPU: 18 PID: 787 at kernel/sched/ext.c:3862 scx_set_task_state+0x7c/0xc0
...
RIP: 0010:scx_set_task_state+0x7c/0xc0
...
Call Trace:
 scx_enable_task+0x11f/0x2e0
 switching_to_scx+0x24/0x110
 scx_enable.isra.0+0xd14/0x13d0
 bpf_struct_ops_link_create+0x136/0x1a0
 __sys_bpf+0x1edd/0x2c30
 __x64_sys_bpf+0x21/0x30
 do_syscall_64+0xbb/0x370
 entry_SYSCALL_64_after_hwframe+0x77/0x7f

This happens because we skip initialization for tasks that are already dead (with their usage counter set to zero), but we don't exclude them during the scheduling class transition phase. Fix this by also skipping dead tasks during class switching, preventing invalid task state transitions.
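The fix (see the diff below) brackets the per-task class switch with a reference attempt so dead tasks are skipped. In pattern form (a sketch; switch_one_task is an illustrative name):

	#include <linux/sched/task.h>

	static void switch_one_task(struct task_struct *p)
	{
		/* refcount_inc_not_zero() on p->usage: fails for dead tasks */
		if (!tryget_task_struct(p))
			return;		/* already dead: skip the class switch */

		/* ... dequeue/requeue p and switch its scheduling class ... */

		put_task_struct(p);	/* drop the reference taken above */
	}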
Fixes: a8532fac7b5d2 ("sched_ext: TASK_DEAD tasks must be switched into SCX on ops_enable") Cc: stable@vger.kernel.org # v6.12+ Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7dedc9a16281..4ae32ef179dd 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5749,6 +5749,9 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) __setscheduler_class(p->policy, p->prio); struct sched_enq_and_set_ctx ctx; + if (!tryget_task_struct(p)) + continue; + if (old_class != new_class && p->se.sched_delayed) dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); @@ -5761,6 +5764,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) sched_enq_and_set_task(&ctx); check_class_changed(task_rq(p), p, old_class, p->prio); + put_task_struct(p); } scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); -- cgit v1.2.3 From c0a23bbc98e93704a1f4fb5e7e7bb2d7c0fb6eb3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Jul 2025 14:26:11 +0200 Subject: ipvs: Fix estimator kthreads preferred affinity The estimator kthreads' affinity is defined by sysctl-overwritten preferences and applied through a plain call to the scheduler's affinity API. However, since the introduction of managed kthreads preferred affinity, such a practice shortcuts the kthreads core code, which eventually overwrites the target with the default unbound affinity. Fix this by using the appropriate kthread API. Fixes: d1a89197589c ("kthread: Default affine kthread to its preferred NUMA node") Signed-off-by: Frederic Weisbecker Acked-by: Julian Anastasov Signed-off-by: Florian Westphal --- kernel/kthread.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 0e98b228a8ef..31b072e8d427 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -893,6 +893,7 @@ out: return ret; } +EXPORT_SYMBOL_GPL(kthread_affine_preferred); /* * Re-affine kthreads according to their preferences -- cgit v1.2.3 From 21924af67d69d7c9fdaf845be69043cfe75196a1 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 5 Aug 2025 00:10:02 +0000 Subject: locking: Fix __clear_task_blocked_on() warning from __ww_mutex_wound() path The __clear_task_blocked_on() helper added a number of sanity checks ensuring we hold the mutex wait lock and that the task whose blocked_on pointer we are clearing (if set) matches the mutex. However, there is an edge case in the _ww_mutex_wound() logic where we need to clear the blocked_on pointer for the task that owns the mutex, not the task that is waiting on the mutex. For this case the sanity checks aren't valid, so handle this by allowing a NULL lock to skip the additional checks. K Prateek Nayak and Maarten Lankhorst also pointed out that in this case, where we don't hold the owner's mutex wait_lock, we need to be a bit more careful, using READ_ONCE/WRITE_ONCE in both the __clear_task_blocked_on() and __set_task_blocked_on() implementations to avoid accidentally tripping WARN_ONs if two instances race. So do that here as well. This issue was easier to miss, I realized, as the test-ww_mutex driver only exercises the wait-die class of ww_mutexes. I've sent a patch[1] to address this so the logic will be easier to test.
[1]: https://lore.kernel.org/lkml/20250801023358.562525-2-jstultz@google.com/ Fixes: a4f0b6fef4b0 ("locking/mutex: Add p->blocked_on wrappers for correctness checks") Closes: https://lore.kernel.org/lkml/68894443.a00a0220.26d0e1.0015.GAE@google.com/ Reported-by: syzbot+602c4720aed62576cd79@syzkaller.appspotmail.com Reported-by: Maarten Lankhorst Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Acked-by: Maarten Lankhorst Tested-by: K Prateek Nayak Link: https://lore.kernel.org/r/20250805001026.2247040-1-jstultz@google.com --- kernel/locking/ww_mutex.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 086fd5487ca7..31a785afee6c 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -342,8 +342,12 @@ static bool __ww_mutex_wound(struct MUTEX *lock, * When waking up the task to wound, be sure to clear the * blocked_on pointer. Otherwise we can see circular * blocked_on relationships that can't resolve. + * + * NOTE: We pass NULL here instead of lock, because we + * are waking the mutex owner, who may be currently + * blocked on a different mutex. */ - __clear_task_blocked_on(owner, lock); + __clear_task_blocked_on(owner, NULL); wake_q_add(wake_q, owner); } return true; -- cgit v1.2.3 From 89a2d212bdb4bc29bed8e7077abe054b801137ea Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Mon, 11 Aug 2025 13:17:59 -0500 Subject: dma/pool: Ensure DMA_DIRECT_REMAP allocations are decrypted When CONFIG_DMA_DIRECT_REMAP is enabled, atomic pool pages are remapped via dma_common_contiguous_remap() using the supplied pgprot. Currently, the mapping uses pgprot_dmacoherent(PAGE_KERNEL), which leaves the memory encrypted on systems with memory encryption enabled (e.g., ARM CCA Realms). This can cause the DMA layer to fail or crash when accessing the memory, as the underlying physical pages are not configured as expected. Fix this by requesting a decrypted mapping in the vmap() call: pgprot_decrypted(pgprot_dmacoherent(PAGE_KERNEL)) This ensures that atomic pool memory is consistently mapped unencrypted. Cc: stable@vger.kernel.org Signed-off-by: Shanker Donthineni Reviewed-by: Catalin Marinas Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20250811181759.998805-1-sdonthineni@nvidia.com --- kernel/dma/pool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 7b04f7575796..ee45dee33d49 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -102,8 +102,8 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size, #ifdef CONFIG_DMA_DIRECT_REMAP addr = dma_common_contiguous_remap(page, pool_size, - pgprot_dmacoherent(PAGE_KERNEL), - __builtin_return_address(0)); + pgprot_decrypted(pgprot_dmacoherent(PAGE_KERNEL)), + __builtin_return_address(0)); if (!addr) goto free_page; #else -- cgit v1.2.3 From 2b986b9e917bc88f81aa1ed386af63b26c983f1d Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 Aug 2025 20:24:37 +0200 Subject: bpf, cpumap: Disable page_pool direct xdp_return need larger scope When running an XDP bpf_prog on a remote CPU in the cpumap code, we must disable the direct return optimization that xdp_return can perform for mem_type page_pool. This optimization assumes code is still executing under RX-NAPI of the original receiving CPU, which isn't true on this remote CPU.
The cpumap code already disabled this via the helpers xdp_set_return_frame_no_direct() and xdp_clear_return_frame_no_direct(), but the scope didn't include xdp_do_flush(). When doing XDP_REDIRECT towards e.g. devmap, this causes the function bq_xmit_all() to run with the direct return optimization enabled. This can lead to hard-to-find bugs. The issue only happens when bq_xmit_all() cannot ndo_xdp_xmit all frames and then frees them via xdp_return_frame_rx_napi(). Fix by expanding the scope to include xdp_do_flush(). This was found by Dragos Tatulea. Fixes: 11941f8a8536 ("bpf: cpumap: Implement generic cpumap") Reported-by: Dragos Tatulea Reported-by: Chris Arges Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Tested-by: Chris Arges Link: https://patch.msgid.link/175519587755.3008742.1088294435150406835.stgit@firesoul --- kernel/bpf/cpumap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index b2b7b8ec2c2a..c46360b27871 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -186,7 +186,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, struct xdp_buff xdp; int i, nframes = 0; - xdp_set_return_frame_no_direct(); xdp.rxq = &rxq; for (i = 0; i < n; i++) { @@ -231,7 +230,6 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, } } - xdp_clear_return_frame_no_direct(); stats->pass += nframes; return nframes; @@ -255,6 +253,7 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, rcu_read_lock(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); + xdp_set_return_frame_no_direct(); ret->xdp_n = cpu_map_bpf_prog_run_xdp(rcpu, frames, ret->xdp_n, stats); if (unlikely(ret->skb_n)) @@ -264,6 +263,7 @@ static void cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, if (stats->redirect) xdp_do_flush(); + xdp_clear_return_frame_no_direct(); bpf_net_ctx_clear(bpf_net_ctx); rcu_read_unlock(); -- cgit v1.2.3 From e4414b01c1cd9887bbde92f946c1ba94e40d6d64 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 14 Aug 2025 22:06:55 +0200 Subject: bpf: Check the helper function is valid in get_helper_proto The kernel test robot reported a verifier bug [1] where the helper func pointer could be NULL due to a disabled config option. As Alexei suggested, we can check for that in get_helper_proto() directly. Mark the tail_call helper func with BPF_PTR_POISON, because it is unused by design. [1] https://lore.kernel.org/oe-lkp/202507160818.68358831-lkp@intel.com Reported-by: kernel test robot Reported-by: syzbot+a9ed3d9132939852d0df@syzkaller.appspotmail.com Suggested-by: Alexei Starovoitov Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Acked-by: Paul Chaignon Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20250814200655.945632-1-jolsa@kernel.org Closes: https://lore.kernel.org/oe-lkp/202507160818.68358831-lkp@intel.com --- kernel/bpf/core.c | 5 ++++- kernel/bpf/verifier.c | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5d1650af899d..f8ac77d08ca7 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -3024,7 +3024,10 @@ EXPORT_SYMBOL_GPL(bpf_event_output); /* Always built-in helper functions.
*/ const struct bpf_func_proto bpf_tail_call_proto = { - .func = NULL, + /* func is unused for tail_call, we set it to pass the + * get_helper_proto check + */ + .func = BPF_PTR_POISON, .gpl_only = false, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_CTX, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c4f69a9e9af6..c89e2b1bc644 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11354,7 +11354,7 @@ static int get_helper_proto(struct bpf_verifier_env *env, int func_id, return -EINVAL; *ptr = env->ops->get_func_proto(func_id, env->prog); - return *ptr ? 0 : -EINVAL; + return *ptr && (*ptr)->func ? 0 : -EINVAL; } static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn, -- cgit v1.2.3 From b64fdd422a85025b5e91ead794db9d3ef970e369 Mon Sep 17 00:00:00 2001 From: Yunseong Kim Date: Tue, 12 Aug 2025 18:10:47 +0000 Subject: perf: Avoid undefined behavior from stopping/starting inactive events Calling pmu->start()/stop() on perf events in PERF_EVENT_STATE_OFF can leave event->hw.idx at -1. When PMU drivers later attempt to use this negative index as a shift exponent in bitwise operations, it leads to UBSAN shift-out-of-bounds reports. The issue is a logical flaw in how event groups handle throttling when some members are intentionally disabled. The following is based on the analysis and the reproducer provided by Mark Rutland (the issue reproduces on both arm64 and x86-64). The scenario unfolds as follows:

1. A group leader event is configured with a very aggressive sampling period (e.g., sample_period = 1). This causes frequent interrupts and triggers the throttling mechanism.

2. A child event in the same group is created in a disabled state (.disabled = 1). This event remains in PERF_EVENT_STATE_OFF. Since it hasn't been scheduled onto the PMU, its event->hw.idx remains initialized at -1.

3. When throttling occurs, perf_event_throttle_group() and later perf_event_unthrottle_group() iterate through all siblings, including the disabled child event.

4. perf_event_throttle()/unthrottle() are called on this inactive child event, which then calls event->pmu->start()/stop().

5. The PMU driver receives the event with hw.idx == -1 and attempts to use it as a shift exponent, e.g. in macros like PMCNTENSET(idx), leading to the UBSAN report.

The throttling mechanism attempts to start/stop events that are not actively scheduled on the hardware. Move the state check into perf_event_throttle()/perf_event_unthrottle() so that inactive events are skipped entirely. This ensures only active events with a valid hw.idx are processed, preventing undefined behavior and silencing UBSAN warnings. The corrected check ensures the event is in PERF_EVENT_STATE_ACTIVE before proceeding with PMU operations.
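For illustration, a hedged userspace sketch of such a group (the actual syzkaller reproducer is not shown here; open_group() and the event choices are assumptions):

	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int open_group(void)
	{
		struct perf_event_attr lead = {
			.type = PERF_TYPE_HARDWARE,
			.size = sizeof(lead),
			.config = PERF_COUNT_HW_CPU_CYCLES,
			.sample_period = 1,	/* aggressive: throttles almost immediately */
		};
		struct perf_event_attr sib = lead;
		int lfd, sfd;

		sib.sample_period = 0;
		sib.disabled = 1;	/* stays in PERF_EVENT_STATE_OFF, hw.idx == -1 */

		lfd = syscall(__NR_perf_event_open, &lead, 0, -1, -1, 0);
		if (lfd < 0)
			return -1;
		sfd = syscall(__NR_perf_event_open, &sib, 0, -1, lfd, 0);
		return sfd < 0 ? -1 : lfd;
	}

Once the leader throttles, the group-wide throttle/unthrottle walk reaches the disabled sibling and, before this fix, called pmu->start()/stop() on it.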
The problem can be reproduced with the syzkaller reproducer. Fixes: 9734e25fbf5a ("perf: Fix the throttle logic for a group") Signed-off-by: Yunseong Kim Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20250812181046.292382-2-ysk@kzalloc.com --- kernel/events/core.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 8060c2857bb2..872122e074e5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2665,6 +2665,9 @@ static void perf_log_itrace_start(struct perf_event *event); static void perf_event_unthrottle(struct perf_event *event, bool start) { + if (event->state != PERF_EVENT_STATE_ACTIVE) + return; + event->hw.interrupts = 0; if (start) event->pmu->start(event, 0); @@ -2674,6 +2677,9 @@ static void perf_event_unthrottle(struct perf_event *event, bool start) static void perf_event_throttle(struct perf_event *event) { + if (event->state != PERF_EVENT_STATE_ACTIVE) + return; + event->hw.interrupts = MAX_INTERRUPTS; event->pmu->stop(event, 0); if (event == event->group_leader) -- cgit v1.2.3 From 5eb4b9a4cdbb70d70377fe8fb2920b75910e5024 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Wed, 13 Aug 2025 15:21:59 +0200 Subject: params: Replace deprecated strcpy() with strscpy() and memcpy() strcpy() is deprecated; use strscpy() and memcpy() instead. In param_set_copystring(), we can safely use memcpy() because we already know the length of the source string 'val' and that it is guaranteed to be NUL-terminated within the first 'kps->maxlen' bytes. Link: https://github.com/KSPP/linux/issues/88 Signed-off-by: Thorsten Blum Reviewed-by: Daniel Gomez Reviewed-by: Petr Pavlu Link: https://lore.kernel.org/r/20250813132200.184064-2-thorsten.blum@linux.dev Signed-off-by: Daniel Gomez --- kernel/params.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index b92d64161b75..b96cfd693c99 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -513,13 +513,14 @@ EXPORT_SYMBOL(param_array_ops); int param_set_copystring(const char *val, const struct kernel_param *kp) { const struct kparam_string *kps = kp->str; + const size_t len = strnlen(val, kps->maxlen); - if (strnlen(val, kps->maxlen) == kps->maxlen) { + if (len == kps->maxlen) { pr_err("%s: string doesn't fit in %u chars.\n", kp->name, kps->maxlen-1); return -ENOSPC; } - strcpy(kps->string, val); + memcpy(kps->string, val, len + 1); return 0; } EXPORT_SYMBOL(param_set_copystring); @@ -841,7 +842,7 @@ static void __init param_sysfs_builtin(void) dot = strchr(kp->name, '.'); if (!dot) { /* This happens for core_param() */ - strcpy(modname, "kernel"); + strscpy(modname, "kernel"); name_len = 0; } else { name_len = dot - kp->name + 1; -- cgit v1.2.3 From a2c1f82618b0b65f1ef615aa9cfdac8122537d69 Mon Sep 17 00:00:00 2001 From: "Adrian Huang (Lenovo)" Date: Mon, 18 Aug 2025 21:43:10 +0800 Subject: signal: Fix memory leak for PIDFD_SELF* sentinels Commit f08d0c3a7111 ("pidfd: add PIDFD_SELF* sentinels to refer to own thread/process") introduced a leak by acquiring a pid reference through get_task_pid(), which increments pid->count but never drops it with put_pid().
As a result, kmemleak reports unreferenced pid objects after running tools/testing/selftests/pidfd/pidfd_test, for example:

unreferenced object 0xff1100206757a940 (size 160):
  comm "pidfd_test", pid 16965, jiffies 4294853028
  hex dump (first 32 bytes):
    01 00 00 00 00 00 00 00 00 00 00 00 fd 57 50 04  .............WP.
    5e 44 00 00 00 00 00 00 18 de 34 17 01 00 11 ff  ^D........4.....
  backtrace (crc cd8844d4):
    kmem_cache_alloc_noprof+0x2f4/0x3f0
    alloc_pid+0x54/0x3d0
    copy_process+0xd58/0x1740
    kernel_clone+0x99/0x3b0
    __do_sys_clone3+0xbe/0x100
    do_syscall_64+0x7b/0x2c0
    entry_SYSCALL_64_after_hwframe+0x76/0x7e

Fix this by calling put_pid() after do_pidfd_send_signal() returns. Fixes: f08d0c3a7111 ("pidfd: add PIDFD_SELF* sentinels to refer to own thread/process") Signed-off-by: Adrian Huang (Lenovo) Link: https://lore.kernel.org/20250818134310.12273-1-adrianhuang0701@gmail.com Tested-by: Lorenzo Stoakes Reviewed-by: Lorenzo Stoakes Signed-off-by: Christian Brauner --- kernel/signal.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index e2c928de7d2c..fe9190d84f28 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -4067,6 +4067,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, { struct pid *pid; enum pid_type type; + int ret; /* Enforce flags be set to 0 until we add an extension. */ if (flags & ~PIDFD_SEND_SIGNAL_FLAGS) @@ -4108,7 +4109,10 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, } } - return do_pidfd_send_signal(pid, sig, type, info, flags); + ret = do_pidfd_send_signal(pid, sig, type, info, flags); + put_pid(pid); + + return ret; } static int -- cgit v1.2.3 From 63b17b653df30e90f95338083cb44c35d64bcae4 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 8 Aug 2025 20:18:02 +0000 Subject: kho: init new_physxa->phys_bits to fix lockdep Patch series "Several KHO Hotfixes". Three unrelated fixes for Kexec Handover. This patch (of 3): Lockdep shows the following warning:

INFO: trying to register non-static key. The code is fine but needs lockdep annotation, or maybe you didn't initialize this object before use? turning off the locking correctness validator.

[] dump_stack_lvl+0x66/0xa0
[] assign_lock_key+0x10c/0x120
[] register_lock_class+0xf4/0x2f0
[] __lock_acquire+0x7f/0x2c40
[] ? __pfx_hlock_conflict+0x10/0x10
[] ? native_flush_tlb_global+0x8e/0xa0
[] ? __flush_tlb_all+0x4e/0xa0
[] ? __kernel_map_pages+0x112/0x140
[] ? xa_load_or_alloc+0x67/0xe0
[] lock_acquire+0xe6/0x280
[] ? xa_load_or_alloc+0x67/0xe0
[] _raw_spin_lock+0x30/0x40
[] ? xa_load_or_alloc+0x67/0xe0
[] xa_load_or_alloc+0x67/0xe0
[] kho_preserve_folio+0x90/0x100
[] __kho_finalize+0xcf/0x400
[] kho_finalize+0x34/0x70

This is because the xa has its own lock that is not initialized in xa_load_or_alloc.
Modify __kho_preserve_order() to properly call xa_init(&new_physxa->phys_bits). Link: https://lkml.kernel.org/r/20250808201804.772010-2-pasha.tatashin@soleen.com Fixes: fc33e4b44b27 ("kexec: enable KHO support for memory preservation") Signed-off-by: Pasha Tatashin Acked-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Arnd Bergmann Cc: Baoquan He Cc: Changyuan Lyu Cc: Coiby Xu Cc: Dave Vasilevsky Cc: Eric Biggers Cc: Kees Cook Cc: Pratyush Yadav Cc: Signed-off-by: Andrew Morton --- kernel/kexec_handover.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index e49743ae52c5..65145972d6d6 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -144,14 +144,34 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, unsigned int order) { struct kho_mem_phys_bits *bits; - struct kho_mem_phys *physxa; + struct kho_mem_phys *physxa, *new_physxa; const unsigned long pfn_high = pfn >> order; might_sleep(); - physxa = xa_load_or_alloc(&track->orders, order, sizeof(*physxa)); - if (IS_ERR(physxa)) - return PTR_ERR(physxa); + physxa = xa_load(&track->orders, order); + if (!physxa) { + int err; + + new_physxa = kzalloc(sizeof(*physxa), GFP_KERNEL); + if (!new_physxa) + return -ENOMEM; + + xa_init(&new_physxa->phys_bits); + physxa = xa_cmpxchg(&track->orders, order, NULL, new_physxa, + GFP_KERNEL); + + err = xa_err(physxa); + if (err || physxa) { + xa_destroy(&new_physxa->phys_bits); + kfree(new_physxa); + + if (err) + return err; + } else { + physxa = new_physxa; + } + } bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS, sizeof(*bits)); -- cgit v1.2.3 From 8b66ed2c3f42cc462e05704af6b94e6a7bad2f5e Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 8 Aug 2025 20:18:03 +0000 Subject: kho: mm: don't allow deferred struct page with KHO KHO uses struct pages for the preserved memory early in boot. However, with deferred struct page initialization, only a small portion of memory has properly initialized struct pages. This problem was detected where the vmemmap is poisoned and illegal flag combinations are detected. Don't allow the two to be enabled together; later we will have to teach KHO to work properly with the deferred struct page init kernel feature. Link: https://lkml.kernel.org/r/20250808201804.772010-3-pasha.tatashin@soleen.com Fixes: 4e1d010e3bda ("kexec: add config option for KHO") Signed-off-by: Pasha Tatashin Acked-by: Mike Rapoport (Microsoft) Acked-by: Pratyush Yadav Cc: Alexander Graf Cc: Arnd Bergmann Cc: Baoquan He Cc: Changyuan Lyu Cc: Coiby Xu Cc: Dave Vasilevsky Cc: Eric Biggers Cc: Kees Cook Cc: Signed-off-by: Andrew Morton --- kernel/Kconfig.kexec | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 2ee603a98813..1224dd937df0 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -97,6 +97,7 @@ config KEXEC_JUMP config KEXEC_HANDOVER bool "kexec handover" depends on ARCH_SUPPORTS_KEXEC_HANDOVER && ARCH_SUPPORTS_KEXEC_FILE + depends on !DEFERRED_STRUCT_PAGE_INIT select MEMBLOCK_KHO_SCRATCH select KEXEC_FILE select DEBUG_FS -- cgit v1.2.3 From 44958f2025ed3f29fc3e93bb1f6c16121d7847ad Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 8 Aug 2025 20:18:04 +0000 Subject: kho: warn if KHO is disabled due to an error During boot, the scratch area is allocated based on command line parameters or auto-calculated.
However, the scratch area may fail to allocate, and in that case KHO is disabled. Currently, no warning is printed that KHO is disabled, which makes it confusing for the end user to figure out why KHO is not available. Add the missing warning message. Link: https://lkml.kernel.org/r/20250808201804.772010-4-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Acked-by: Mike Rapoport (Microsoft) Acked-by: Pratyush Yadav Cc: Alexander Graf Cc: Arnd Bergmann Cc: Baoquan He Cc: Changyuan Lyu Cc: Coiby Xu Cc: Dave Vasilevsky Cc: Eric Biggers Cc: Kees Cook Cc: Signed-off-by: Andrew Morton --- kernel/kexec_handover.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 65145972d6d6..ecd1ac210dbd 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -564,6 +564,7 @@ err_free_scratch_areas: err_free_scratch_desc: memblock_free(kho_scratch, kho_scratch_cnt * sizeof(*kho_scratch)); err_disable_kho: + pr_warn("Failed to reserve scratch area, disabling kexec handover\n"); kho_enable = false; } -- cgit v1.2.3 From 6a909ea83f226803ea0e718f6e88613df9234d58 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Wed, 13 Aug 2025 04:02:32 +0000 Subject: tracing: Limit access to parser->buffer when trace_get_user failed When the length of the string written to set_ftrace_filter exceeds FTRACE_BUFF_MAX, the following KASAN report is triggered:

BUG: KASAN: slab-out-of-bounds in strsep+0x18c/0x1b0
Read of size 1 at addr ffff0000d00bd5ba by task ash/165

CPU: 1 UID: 0 PID: 165 Comm: ash Not tainted 6.16.0-g6bcdbd62bd56-dirty
Hardware name: linux,dummy-virt (DT)
Call trace:
 show_stack+0x34/0x50 (C)
 dump_stack_lvl+0xa0/0x158
 print_address_description.constprop.0+0x88/0x398
 print_report+0xb0/0x280
 kasan_report+0xa4/0xf0
 __asan_report_load1_noabort+0x20/0x30
 strsep+0x18c/0x1b0
 ftrace_process_regex.isra.0+0x100/0x2d8
 ftrace_regex_release+0x484/0x618
 __fput+0x364/0xa58
 ____fput+0x28/0x40
 task_work_run+0x154/0x278
 do_notify_resume+0x1f0/0x220
 el0_svc+0xec/0xf0
 el0t_64_sync_handler+0xa0/0xe8
 el0t_64_sync+0x1ac/0x1b0

The reason is that trace_get_user() fails when processing a string longer than FTRACE_BUFF_MAX, but does not NUL-terminate parser->buffer. An out-of-bounds access is then triggered in ftrace_regex_release-> ftrace_process_regex->strsep->strpbrk. Solve this by limiting access to parser->buffer when trace_get_user() has failed.
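A userspace analogue of the failure mode (a sketch; parse_word() is an illustrative stand-in for trace_get_user()):

	#include <ctype.h>
	#include <stddef.h>

	/*
	 * Copies one word of src into dst. On overflow it returns an error
	 * but -- like the buggy trace_get_user() -- never writes the
	 * terminating NUL, so a later strsep()/strpbrk() on dst can walk
	 * past the end of the buffer.
	 */
	static int parse_word(char *dst, size_t size, const char *src)
	{
		size_t i = 0;

		while (*src && !isspace((unsigned char)*src)) {
			if (i >= size - 1)
				return -1;	/* dst is left unterminated here */
			dst[i++] = *src++;
		}
		dst[i] = '\0';
		return 0;
	}

The fix below records the failure in the parser so that later readers refuse to touch the unterminated buffer.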
Cc: stable@vger.kernel.org Link: https://lore.kernel.org/20250813040232.1344527-1-pulehui@huaweicloud.com Fixes: 8c9af478c06b ("ftrace: Handle commands when closing set_ftrace_filter file") Signed-off-by: Pu Lehui Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 18 ++++++++++++------ kernel/trace/trace.h | 8 +++++++- 2 files changed, 19 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4283ed4e8f59..8d8935ed416d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1816,7 +1816,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, ret = get_user(ch, ubuf++); if (ret) - return ret; + goto fail; read++; cnt--; @@ -1830,7 +1830,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, while (cnt && isspace(ch)) { ret = get_user(ch, ubuf++); if (ret) - return ret; + goto fail; read++; cnt--; } @@ -1848,12 +1848,14 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, while (cnt && !isspace(ch) && ch) { if (parser->idx < parser->size - 1) parser->buffer[parser->idx++] = ch; - else - return -EINVAL; + else { + ret = -EINVAL; + goto fail; + } ret = get_user(ch, ubuf++); if (ret) - return ret; + goto fail; read++; cnt--; } @@ -1868,11 +1870,15 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, /* Make sure the parsed string always terminates with '\0'. */ parser->buffer[parser->idx] = 0; } else { - return -EINVAL; + ret = -EINVAL; + goto fail; } *ppos += read; return read; +fail: + trace_parser_fail(parser); + return ret; } /* TODO add a seq_buf_to_buffer() */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1dbf1d3cf2f1..be6654899cae 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1292,6 +1292,7 @@ bool ftrace_event_is_function(struct trace_event_call *call); */ struct trace_parser { bool cont; + bool fail; char *buffer; unsigned idx; unsigned size; @@ -1299,7 +1300,7 @@ struct trace_parser { static inline bool trace_parser_loaded(struct trace_parser *parser) { - return (parser->idx != 0); + return !parser->fail && parser->idx != 0; } static inline bool trace_parser_cont(struct trace_parser *parser) @@ -1313,6 +1314,11 @@ static inline void trace_parser_clear(struct trace_parser *parser) parser->idx = 0; } +static inline void trace_parser_fail(struct trace_parser *parser) +{ + parser->fail = true; +} + extern int trace_parser_get_init(struct trace_parser *parser, int size); extern void trace_parser_put(struct trace_parser *parser); extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, -- cgit v1.2.3 From cd6e4faba96fe41d6b686e144b96dad5e6f2e771 Mon Sep 17 00:00:00 2001 From: Liao Yuanhong Date: Wed, 13 Aug 2025 17:51:14 +0800 Subject: ring-buffer: Remove redundant semicolons Remove unnecessary semicolons. 
Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250813095114.559530-1-liaoyuanhong@vivo.com Signed-off-by: Liao Yuanhong Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bb71a0dc9d69..43460949ad3f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -7666,7 +7666,7 @@ static __init int test_ringbuffer(void) rb_test_started = true; set_current_state(TASK_INTERRUPTIBLE); - /* Just run for 10 seconds */; + /* Just run for 10 seconds */ schedule_timeout(10 * HZ); kthread_stop(rb_hammer); -- cgit v1.2.3 From edede7a6dcd7435395cf757d053974aaab6ab1c2 Mon Sep 17 00:00:00 2001 From: Ye Weihua Date: Mon, 18 Aug 2025 07:33:32 +0000 Subject: trace/fgraph: Fix the warning caused by missing unregister notifier This warning was triggered during testing on v6.16:

notifier callback ftrace_suspend_notifier_call already registered
WARNING: CPU: 2 PID: 86 at kernel/notifier.c:23 notifier_chain_register+0x44/0xb0
...
Call Trace:
 blocking_notifier_chain_register+0x34/0x60
 register_ftrace_graph+0x330/0x410
 ftrace_profile_write+0x1e9/0x340
 vfs_write+0xf8/0x420
 ? filp_flush+0x8a/0xa0
 ? filp_close+0x1f/0x30
 ? do_dup2+0xaf/0x160
 ksys_write+0x65/0xe0
 do_syscall_64+0xa4/0x260
 entry_SYSCALL_64_after_hwframe+0x77/0x7f

When writing to the function_profile_enabled interface, the notifier was not unregistered after start_graph_tracing() failed, causing a warning the next time function_profile_enabled was written. Fix this by adding unregister_pm_notifier() in the error path. Link: https://lore.kernel.org/20250818073332.3890629-1-yeweihua4@huawei.com Fixes: 4a2b8dda3f870 ("tracing/function-graph-tracer: fix a regression while suspend to disk") Acked-by: Masami Hiramatsu (Google) Signed-off-by: Ye Weihua Signed-off-by: Steven Rostedt (Google) --- kernel/trace/fgraph.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index f4d200f0c610..2a42c1036ea8 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -1397,6 +1397,7 @@ error: ftrace_graph_active--; gops->saved_func = NULL; fgraph_lru_release_index(i); + unregister_pm_notifier(&ftrace_suspend_notifier); } return ret; } -- cgit v1.2.3 From ec879e1a0be8007aa232ffedcf6a6445dfc1a3d7 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sat, 16 Aug 2025 23:10:51 +0900 Subject: tracing: fprobe-event: Sanitize wildcard for fprobe event name The fprobe event accepts wildcards for the target functions, but unless the user specifies an event name, it creates an event whose name contains the wildcard:

/sys/kernel/tracing # echo 'f mutex*' >> dynamic_events
/sys/kernel/tracing # cat dynamic_events
f:fprobes/mutex*__entry mutex*
/sys/kernel/tracing # ls events/fprobes/
enable  filter  mutex*__entry

To fix this, replace the wildcard ('*') with an underscore.
Link: https://lore.kernel.org/all/175535345114.282990.12294108192847938710.stgit@devnote2/ Fixes: 334e5519c375 ("tracing/probes: Add fprobe events for tracing function entry and exit.") Signed-off-by: Masami Hiramatsu (Google) Cc: stable@vger.kernel.org --- kernel/trace/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1dbf1d3cf2f1..5a6688832da8 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -2204,7 +2204,7 @@ static inline bool is_good_system_name(const char *name) static inline void sanitize_event_name(char *name) { while (*name++ != '\0') - if (*name == ':' || *name == '.') + if (*name == ':' || *name == '.' || *name == '*') *name = '_'; } -- cgit v1.2.3 From e3d01979e4bff5c87eb4054a22e7568bb679b1fe Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 Aug 2025 19:55:22 -0400 Subject: fgraph: Copy args in intermediate storage with entry The output of the function graph tracer has two ways to display its entries. One is for leaf functions with no events recorded within them, and the other is for functions with events recorded inside them. As function graph has an entry and an exit event, to simplify the output of leaf functions it combines the two, whereas non-leaf functions are kept separate:

 2)               |  invoke_rcu_core() {
 2)               |    raise_softirq() {
 2)   0.391 us    |      __raise_softirq_irqoff();
 2)   1.191 us    |    }
 2)   2.086 us    |  }

The __raise_softirq_irqoff() function above is really two events that were merged into one. Otherwise it would have looked like:

 2)               |  invoke_rcu_core() {
 2)               |    raise_softirq() {
 2)               |      __raise_softirq_irqoff() {
 2)   0.391 us    |      }
 2)   1.191 us    |    }
 2)   2.086 us    |  }

In order to do this merge, the reading of the trace output file needs to look at the next event before printing. But since the pointer to the event is on the ring buffer, it needs to save the entry event before it looks at the next event, as the entry goes out of focus as soon as a new event is read from the ring buffer. After it reads the next event, it will print the entry event with either the '{' (non-leaf) or ';' and timestamps (leaf). The iterator used to read the trace file has storage for this event. The problem happens when the function graph tracer has arguments attached to the entry event, as the entry now has a variable-length "args" field. This field only gets set when the funcargs option is used. But the args are not recorded in this temp data, and garbage could be printed. The entry field is copied via:

	data->ent = *curr;

where "curr" points to the current entry event. But this method only saves the fixed-size fields of the structure. Add a helper structure to the iterator data that adds the max args size to the data storage in the iterator. Then simply copy the entire entry into this storage (with size protection).
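Why the plain assignment loses data, in a compact kernel-style (GNU C) sketch with illustrative names:

	#include <string.h>

	struct entry {
		int type;
		unsigned long args[];		/* variable-length tail */
	};

	struct entry_storage {
		struct entry ent;
		unsigned long args[12];		/* room reserved for the largest tail */
	};

	static void save_entry(struct entry_storage *dst,
			       const struct entry *src, int src_size)
	{
		int size = src_size < (int)sizeof(*dst) ? src_size : (int)sizeof(*dst);

		/*
		 * dst->ent = *src would copy only sizeof(struct entry) and
		 * silently drop whatever lives in the args[] tail.
		 */
		memcpy(dst, src, size);
	}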
Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Mark Rutland Link: https://lore.kernel.org/20250820195522.51d4a268@gandalf.local.home Reported-by: Sasha Levin Tested-by: Sasha Levin Closes: https://lore.kernel.org/all/aJaxRVKverIjF4a6@lappy/ Fixes: ff5c9c576e75 ("ftrace: Add support for function argument to graph tracer") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_functions_graph.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 66e1a527cf1a..a7f4b9a47a71 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -27,14 +27,21 @@ struct fgraph_cpu_data { unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; }; +struct fgraph_ent_args { + struct ftrace_graph_ent_entry ent; + /* Force the sizeof of args[] to have FTRACE_REGS_MAX_ARGS entries */ + unsigned long args[FTRACE_REGS_MAX_ARGS]; +}; + struct fgraph_data { struct fgraph_cpu_data __percpu *cpu_data; /* Place to preserve last processed entry. */ union { - struct ftrace_graph_ent_entry ent; + struct fgraph_ent_args ent; + /* TODO allow retaddr to have args */ struct fgraph_retaddr_ent_entry rent; - } ent; + }; struct ftrace_graph_ret_entry ret; int failed; int cpu; @@ -627,10 +634,13 @@ get_return_for_leaf(struct trace_iterator *iter, * Save current and next entries for later reference * if the output fails. */ - if (unlikely(curr->ent.type == TRACE_GRAPH_RETADDR_ENT)) - data->ent.rent = *(struct fgraph_retaddr_ent_entry *)curr; - else - data->ent.ent = *curr; + if (unlikely(curr->ent.type == TRACE_GRAPH_RETADDR_ENT)) { + data->rent = *(struct fgraph_retaddr_ent_entry *)curr; + } else { + int size = min((int)sizeof(data->ent), (int)iter->ent_size); + + memcpy(&data->ent, curr, size); + } /* * If the next event is not a return type, then * we only care about what type it is. Otherwise we can -- cgit v1.2.3 From 4013aef2ced9b756a410f50d12df9ebe6a883e4a Mon Sep 17 00:00:00 2001 From: Tengda Wu Date: Fri, 22 Aug 2025 03:33:43 +0000 Subject: ftrace: Fix potential warning in trace_printk_seq during ftrace_dump When calling ftrace_dump_one() concurrently with reading trace_pipe, a WARN_ON_ONCE() in trace_printk_seq() can be triggered due to a race condition. The issue occurs because:

CPU0 (ftrace_dump)                    CPU1 (reader)
echo z > /proc/sysrq-trigger
!trace_empty(&iter)
trace_iterator_reset(&iter) <- len = size = 0
                                      cat /sys/kernel/tracing/trace_pipe
trace_find_next_entry_inc(&iter)
  __find_next_entry
    ring_buffer_empty_cpu <- all empty
  return NULL
trace_printk_seq(&iter.seq)
  WARN_ON_ONCE(s->seq.len >= s->seq.size)

In the window between trace_empty() and trace_find_next_entry_inc() during ftrace_dump, the ring buffer data was consumed by other readers. This caused trace_find_next_entry_inc to return NULL, failing to populate `iter.seq`. At this point, due to the prior trace_iterator_reset, both `iter.seq.len` and `iter.seq.size` were set to 0. Since they are equal, the WARN_ON_ONCE condition is triggered. Move the trace_printk_seq() call in ftrace_dump_one() into the if block that checks that the return value of trace_find_next_entry_inc() is non-NULL, ensuring 'iter.seq' is properly populated before subsequent operations.
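Condensed, the fixed loop only prints the seq buffer on iterations where the lookup actually produced an entry (a sketch of the flow, not the verbatim code):

	while (!trace_empty(&iter)) {
		/* other trace_pipe readers may drain the buffer at any point */
		if (trace_find_next_entry_inc(&iter)) {
			ret = print_trace_line(&iter);
			if (ret != TRACE_TYPE_NO_CONSUME)
				trace_consume(&iter);

			trace_printk_seq(&iter.seq);	/* iter.seq is known to be populated */
		}
		touch_nmi_watchdog();
	}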
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Ingo Molnar Link: https://lore.kernel.org/20250822033343.3000289-1-wutengda@huaweicloud.com Fixes: d769041f8653 ("ring_buffer: implement new locking") Signed-off-by: Tengda Wu Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8d8935ed416d..1b7db732c0b1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -10638,10 +10638,10 @@ static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_m ret = print_trace_line(&iter); if (ret != TRACE_TYPE_NO_CONSUME) trace_consume(&iter); + + trace_printk_seq(&iter.seq); } touch_nmi_watchdog(); - - trace_printk_seq(&iter.seq); } if (!cnt) -- cgit v1.2.3 From bfb336cf97df7b37b2b2edec0f69773e06d11955 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 22 Aug 2025 18:36:06 -0400 Subject: ftrace: Also allocate and copy hash for reading of filter files Currently the reader of set_ftrace_filter and set_ftrace_notrace just adds the pointer to the global tracer hash to its iterator. Unlike the writer that allocates a copy of the hash, the reader keeps the pointer to the filter hashes. This is problematic because this pointer is static across function calls that release the locks that can update the global tracer hashes. This can cause UAF and similar bugs. Allocate and copy the hash for reading the filter files like it is done for the writers. This not only fixes UAF bugs, but also makes the code a bit simpler as it doesn't have to differentiate when to free the iterator's hash between writers and readers. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Nathan Chancellor Cc: Linus Torvalds Link: https://lore.kernel.org/20250822183606.12962cc3@batman.local.home Fixes: c20489dad156 ("ftrace: Assign iter->hash to filter or notrace hashes on seq read") Closes: https://lore.kernel.org/all/20250813023044.2121943-1-wutengda@huaweicloud.com/ Closes: https://lore.kernel.org/all/20250822192437.GA458494@ax162/ Reported-by: Tengda Wu Tested-by: Tengda Wu Tested-by: Nathan Chancellor Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 00b76d450a89..a69067367c29 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4661,13 +4661,17 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, } else { iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash); } + } else { + if (hash) + iter->hash = alloc_and_copy_ftrace_hash(hash->size_bits, hash); + else + iter->hash = EMPTY_HASH; + } - if (!iter->hash) { - trace_parser_put(&iter->parser); - goto out_unlock; - } - } else - iter->hash = hash; + if (!iter->hash) { + trace_parser_put(&iter->parser); + goto out_unlock; + } ret = 0; @@ -6543,9 +6547,6 @@ int ftrace_regex_release(struct inode *inode, struct file *file) ftrace_hash_move_and_update_ops(iter->ops, orig_hash, iter->hash, filter_hash); mutex_unlock(&ftrace_lock); - } else { - /* For read only, the hash is the ops hash */ - iter->hash = NULL; } mutex_unlock(&iter->ops->func_hash->regex_lock); -- cgit v1.2.3 From 4717432dfd99bbd015b6782adca216c6f9340038 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Sat, 9 Aug 2025 21:04:19 +0800 Subject: sched/deadline: Fix dl_server_stopped() Commit cccb45d7c429 ("sched/deadline: 
Less agressive dl_server handling") introduces dl_server_stopped(). But it is obvious that dl_server_stopped() should return true if dl_se->dl_server_active is 0. Fixes: cccb45d7c429 ("sched/deadline: Less agressive dl_server handling") Signed-off-by: Huacai Chen Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250809130419.1980742-1-chenhuacai@loongson.cn --- kernel/sched/deadline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index e2d51f4306b3..bb813afe5b08 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1611,7 +1611,7 @@ void dl_server_stop(struct sched_dl_entity *dl_se) static bool dl_server_stopped(struct sched_dl_entity *dl_se) { if (!dl_se->dl_server_active) - return false; + return true; if (dl_se->dl_server_idle) { dl_server_stop(dl_se); -- cgit v1.2.3 From bb4700adc3abec34c0a38b64f66258e4e233fc16 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 21 Jul 2025 15:01:42 +0200 Subject: sched/deadline: Always stop dl-server before changing parameters Commit cccb45d7c4295 ("sched/deadline: Less agressive dl_server handling") reduced dl-server overhead by delaying disabling servers only after there are no fair task around for a whole period, which means that deadline entities are not dequeued right away on a server stop event. However, the delay opens up a window in which a request for changing server parameters can break per-runqueue running_bw tracking, as reported by Yuri. Close the problematic window by unconditionally calling dl_server_stop() before applying the new parameters (ensuring deadline entities go through an actual dequeue). Fixes: cccb45d7c4295 ("sched/deadline: Less agressive dl_server handling") Reported-by: Yuri Andriaccio Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Link: https://lore.kernel.org/r/20250721-upstream-fix-dlserver-lessaggressive-b4-v1-1-4ebc10c87e40@redhat.com --- kernel/sched/debug.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 3f06ab84d53f..02e16b70a790 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -376,10 +376,8 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu return -EINVAL; } - if (rq->cfs.h_nr_queued) { - update_rq_clock(rq); - dl_server_stop(&rq->fair_server); - } + update_rq_clock(rq); + dl_server_stop(&rq->fair_server); retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0); if (retval) -- cgit v1.2.3 From 421fc59cf58c64f898cafbbbbda0bc705837e7df Mon Sep 17 00:00:00 2001 From: kuyo chang Date: Sun, 15 Jun 2025 21:10:56 +0800 Subject: sched/deadline: Fix RT task potential starvation when expiry time passed [Symptom] The fair server mechanism, which is intended to prevent fair starvation when higher-priority tasks monopolize the CPU. Specifically, RT tasks on the runqueue may not be scheduled as expected. [Analysis] The log "sched: DL replenish lagged too much" triggered. 
By memory dump of dl_server: curr = 0xFFFFFF80D6A0AC00 ( dl_server = 0xFFFFFF83CD5B1470( dl_runtime = 0x02FAF080, dl_deadline = 0x3B9ACA00, dl_period = 0x3B9ACA00, dl_bw = 0xCCCC, dl_density = 0xCCCC, runtime = 0x02FAF080, deadline = 0x0000082031EB0E80, flags = 0x0, dl_throttled = 0x0, dl_yielded = 0x0, dl_non_contending = 0x0, dl_overrun = 0x0, dl_server = 0x1, dl_server_active = 0x1, dl_defer = 0x1, dl_defer_armed = 0x0, dl_defer_running = 0x1, dl_timer = ( node = ( expires = 0x000008199756E700), _softexpires = 0x000008199756E700, function = 0xFFFFFFDB9AF44D30 = dl_task_timer, base = 0xFFFFFF83CD5A12C0, state = 0x0, is_rel = 0x0, is_soft = 0x0, clock_update_flags = 0x4, clock = 0x000008204A496900, - The timer expiration time (rq->curr->dl_server->dl_timer->expires) is already in the past, indicating the timer has expired. - The timer state (rq->curr->dl_server->dl_timer->state) is 0. [Suspected Root Cause] The relevant code flow in the throttle path of update_curr_dl_se() as follows: dequeue_dl_entity(dl_se, 0); // the DL entity is dequeued if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) { if (dl_server(dl_se)) // timer registration fails enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH);//enqueue immediately ... } The failure of `start_dl_timer` is caused by attempting to register a timer with an expiration time that is already in the past. When this situation persists, the code repeatedly re-enqueues the DL entity without properly replenishing or restarting the timer, resulting in RT task may not be scheduled as expected. [Proposed Solution]: Instead of immediately re-enqueuing the DL entity on timer registration failure, this change ensures the DL entity is properly replenished and the timer is restarted, preventing RT potential starvation. Fixes: 63ba8422f876 ("sched/deadline: Introduce deadline servers") Signed-off-by: kuyo chang Signed-off-by: Peter Zijlstra (Intel) Closes: https://lore.kernel.org/CAMuHMdXn4z1pioTtBGMfQM0jsLviqS2jwysaWXpoLxWYoGa82w@mail.gmail.com Tested-by: Geert Uytterhoeven Tested-by: Jiri Slaby Tested-by: Diederik de Haas Link: https://lkml.kernel.org/r/20250615131129.954975-1-kuyo.chang@mediatek.com --- kernel/sched/deadline.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index bb813afe5b08..88c3bd64a8a0 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1496,10 +1496,12 @@ throttle: } if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) { - if (dl_server(dl_se)) - enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); - else + if (dl_server(dl_se)) { + replenish_dl_new_period(dl_se, rq); + start_dl_timer(dl_se); + } else { enqueue_task_dl(rq, dl_task_of(dl_se), ENQUEUE_REPLENISH); + } } if (!is_leftmost(dl_se, &rq->dl)) -- cgit v1.2.3 From 52d15521eb75f9b521744db675bee61025d2fa52 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Fri, 27 Jun 2025 11:54:20 +0800 Subject: sched/deadline: Don't count nr_running for dl_server proxy tasks On CPU offline the kernel stalled with below call trace: INFO: task kworker/0:1:11 blocked for more than 120 seconds. cpuhp hold the cpu hotplug lock endless and stalled vmstat_shepherd. This is because we count nr_running twice on cpuhp enqueuing and failed the wait condition of cpuhp: enqueue_task_fair() // pick cpuhp from idle, rq->nr_running = 0 dl_server_start() [...] 
add_nr_running() // rq->nr_running = 1 add_nr_running() // rq->nr_running = 2 [switch to cpuhp, waiting on balance_hotplug_wait()] rcuwait_wait_event(rq->nr_running == 1 && ...) // failed, rq->nr_running=2 schedule() // wait again It doesn't make sense to count the dl_server towards runnable tasks, since it runs other tasks. Fixes: 63ba8422f876 ("sched/deadline: Introduce deadline servers") Signed-off-by: Yicong Yang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250627035420.37712-1-yangyicong@huawei.com --- kernel/sched/deadline.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 88c3bd64a8a0..f25301267e47 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1851,7 +1851,9 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) u64 deadline = dl_se->deadline; dl_rq->dl_nr_running++; - add_nr_running(rq_of_dl_rq(dl_rq), 1); + + if (!dl_server(dl_se)) + add_nr_running(rq_of_dl_rq(dl_rq), 1); inc_dl_deadline(dl_rq, deadline); } @@ -1861,7 +1863,9 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; - sub_nr_running(rq_of_dl_rq(dl_rq), 1); + + if (!dl_server(dl_se)) + sub_nr_running(rq_of_dl_rq(dl_rq), 1); dec_dl_deadline(dl_rq, dl_se->deadline); } -- cgit v1.2.3 From d9b05321e21e4b218de4ce8a590bf375f58b6346 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 22 Aug 2025 16:12:38 +0200 Subject: futex: Move futex_hash_free() back to __mmput() To avoid a memory leak via mm_alloc() + mmdrop() the futex cleanup code has been moved to __mmdrop(). This resulted in a warnings if the futex hash table has been allocated via vmalloc() the mmdrop() was invoked from atomic context. The free path must stay in __mmput() to ensure it is invoked from preemptible context. In order to avoid the memory leak, delay the allocation of mm_struct::mm->futex_ref to futex_hash_allocate(). This works because neither the per-CPU counter nor the private hash has been allocated and therefore - futex_private_hash() callers (such as exit_pi_state_list()) don't acquire reference if there is no private hash yet. There is also no reference put. - Regular callers (futex_hash()) fallback to global hash. No reference counting here. The futex_ref member can be allocated in futex_hash_allocate() before the private hash itself is allocated. This happens either while the first thread is created or on request. In both cases the process has just a single thread so there can be either futex operation in progress or the request to create a private hash. Move futex_hash_free() back to __mmput(); Move the allocation of mm_struct::futex_ref to futex_hash_allocate(). 
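As a rough sketch of the contexts involved (call chain simplified):

	mmput(mm)                  /* process context, may sleep          */
	  __mmput(mm)
	    futex_hash_free(mm)    /* vfree()-backed hash is freed here   */
	    mmdrop(mm)
	      __mmdrop(mm)         /* can also be reached directly from
	                            * atomic context, so it must not free
	                            * a vmalloc()'ed hash                 */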
[ bp: Fold a follow-up fix to prevent a use-after-free: https://lore.kernel.org/r/20250830213806.sEKuuGSm@linutronix.de ] Fixes: e703b7e247503 ("futex: Move futex cleanup to __mmdrop()") Closes: https://lore.kernel.org/all/20250821102721.6deae493@kernel.org/ Reported-by: Jakub Kicinski Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov (AMD) Link: https://lkml.kernel.org/r/20250822141238.PfnkTjFb@linutronix.de --- kernel/fork.c | 2 +- kernel/futex/core.c | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index af673856499d..c4ada32598bd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -689,7 +689,6 @@ void __mmdrop(struct mm_struct *mm) mm_pasid_drop(mm); mm_destroy_cid(mm); percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS); - futex_hash_free(mm); free_mm(mm); } @@ -1138,6 +1137,7 @@ static inline void __mmput(struct mm_struct *mm) if (mm->binfmt) module_put(mm->binfmt->module); lru_gen_del_mm(mm); + futex_hash_free(mm); mmdrop(mm); } diff --git a/kernel/futex/core.c b/kernel/futex/core.c index d9bb5567af0c..125804fbb5cb 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -1722,12 +1722,9 @@ int futex_mm_init(struct mm_struct *mm) RCU_INIT_POINTER(mm->futex_phash, NULL); mm->futex_phash_new = NULL; /* futex-ref */ + mm->futex_ref = NULL; atomic_long_set(&mm->futex_atomic, 0); mm->futex_batches = get_state_synchronize_rcu(); - mm->futex_ref = alloc_percpu(unsigned int); - if (!mm->futex_ref) - return -ENOMEM; - this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */ return 0; } @@ -1801,6 +1798,17 @@ static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) } } + if (!mm->futex_ref) { + /* + * This will always be allocated by the first thread and + * therefore requires no locking. + */ + mm->futex_ref = alloc_percpu(unsigned int); + if (!mm->futex_ref) + return -ENOMEM; + this_cpu_inc(*mm->futex_ref); /* 0 -> 1 */ + } + fph = kvzalloc(struct_size(fph, queues, hash_slots), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!fph) -- cgit v1.2.3 From 7e2368a21741e2db542330b32aa6fdd8908e7cff Mon Sep 17 00:00:00 2001 From: Baochen Qiang Date: Thu, 28 Aug 2025 16:17:33 +0800 Subject: dma-debug: don't enforce dma mapping check on noncoherent allocations As discussed in [1], there is no need to enforce dma mapping check on noncoherent allocations, a simple test on the returned CPU address is good enough. Add a new pair of debug helpers and use them for noncoherent alloc/free to fix this issue. 
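For context, the allocation pair that the new debug type tracks looks roughly like this in a driver (hypothetical device and size, for illustration only):

	struct page *page;
	dma_addr_t dma;

	page = dma_alloc_pages(dev, SZ_4K, &dma, DMA_BIDIRECTIONAL, GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	/* CPU accesses go through page_address(page), the device uses dma */

	dma_free_pages(dev, SZ_4K, page, dma, DMA_BIDIRECTIONAL);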
Fixes: efa70f2fdc84 ("dma-mapping: add a new dma_alloc_pages API") Link: https://lore.kernel.org/all/ff6c1fe6-820f-4e58-8395-df06aa91706c@oss.qualcomm.com # 1 Signed-off-by: Baochen Qiang Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20250828-dma-debug-fix-noncoherent-dma-check-v1-1-76e9be0dd7fc@oss.qualcomm.com --- kernel/dma/debug.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++- kernel/dma/debug.h | 20 ++++++++++++++++++++ kernel/dma/mapping.c | 4 ++-- 3 files changed, 69 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index e43c6de2bce4..b82399437db0 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -39,6 +39,7 @@ enum { dma_debug_sg, dma_debug_coherent, dma_debug_resource, + dma_debug_noncoherent, }; enum map_err_types { @@ -141,6 +142,7 @@ static const char *type2name[] = { [dma_debug_sg] = "scatter-gather", [dma_debug_coherent] = "coherent", [dma_debug_resource] = "resource", + [dma_debug_noncoherent] = "noncoherent", }; static const char *dir2name[] = { @@ -993,7 +995,8 @@ static void check_unmap(struct dma_debug_entry *ref) "[mapped as %s] [unmapped as %s]\n", ref->dev_addr, ref->size, type2name[entry->type], type2name[ref->type]); - } else if (entry->type == dma_debug_coherent && + } else if ((entry->type == dma_debug_coherent || + entry->type == dma_debug_noncoherent) && ref->paddr != entry->paddr) { err_printk(ref->dev, entry, "device driver frees " "DMA memory with different CPU address " @@ -1581,6 +1584,49 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, } } +void debug_dma_alloc_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr, + unsigned long attrs) +{ + struct dma_debug_entry *entry; + + if (unlikely(dma_debug_disabled())) + return; + + entry = dma_entry_alloc(); + if (!entry) + return; + + entry->type = dma_debug_noncoherent; + entry->dev = dev; + entry->paddr = page_to_phys(page); + entry->size = size; + entry->dev_addr = dma_addr; + entry->direction = direction; + + add_dma_entry(entry, attrs); +} + +void debug_dma_free_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr) +{ + struct dma_debug_entry ref = { + .type = dma_debug_noncoherent, + .dev = dev, + .paddr = page_to_phys(page), + .dev_addr = dma_addr, + .size = size, + .direction = direction, + }; + + if (unlikely(dma_debug_disabled())) + return; + + check_unmap(&ref); +} + static int __init dma_debug_driver_setup(char *str) { int i; diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h index f525197d3cae..48757ca13f31 100644 --- a/kernel/dma/debug.h +++ b/kernel/dma/debug.h @@ -54,6 +54,13 @@ extern void debug_dma_sync_sg_for_cpu(struct device *dev, extern void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, int direction); +extern void debug_dma_alloc_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr, + unsigned long attrs); +extern void debug_dma_free_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr); #else /* CONFIG_DMA_API_DEBUG */ static inline void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, size_t size, @@ -126,5 +133,18 @@ static inline void debug_dma_sync_sg_for_device(struct device *dev, int nelems, int direction) { } + +static inline void debug_dma_alloc_pages(struct device *dev, struct page *page, + size_t size, int direction, + 
dma_addr_t dma_addr, + unsigned long attrs) +{ +} + +static inline void debug_dma_free_pages(struct device *dev, struct page *page, + size_t size, int direction, + dma_addr_t dma_addr) +{ +} #endif /* CONFIG_DMA_API_DEBUG */ #endif /* _KERNEL_DMA_DEBUG_H */ diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 107e4a4d251d..56de28a3b179 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -712,7 +712,7 @@ struct page *dma_alloc_pages(struct device *dev, size_t size, if (page) { trace_dma_alloc_pages(dev, page_to_virt(page), *dma_handle, size, dir, gfp, 0); - debug_dma_map_page(dev, page, 0, size, dir, *dma_handle, 0); + debug_dma_alloc_pages(dev, page, size, dir, *dma_handle, 0); } else { trace_dma_alloc_pages(dev, NULL, 0, size, dir, gfp, 0); } @@ -738,7 +738,7 @@ void dma_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_handle, enum dma_data_direction dir) { trace_dma_free_pages(dev, page_to_virt(page), dma_handle, size, dir, 0); - debug_dma_unmap_page(dev, dma_handle, size, dir); + debug_dma_free_pages(dev, page, size, dir, dma_handle); __dma_free_pages(dev, size, page, dma_handle, dir); } EXPORT_SYMBOL_GPL(dma_free_pages); -- cgit v1.2.3 From 81ac63321eb936b1a1f7045b37674661f8ffb4a5 Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Tue, 5 Aug 2025 10:36:29 +0800 Subject: trace: Remove redundant __GFP_NOWARN Commit 16f5dfbc851b ("gfp: include __GFP_NOWARN in GFP_NOWAIT") made GFP_NOWAIT implicitly include __GFP_NOWARN. Therefore, explicit __GFP_NOWARN combined with GFP_NOWAIT (e.g., `GFP_NOWAIT | __GFP_NOWARN`) is now redundant. Let's clean up these redundant flags across subsystems. No functional changes. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250805023630.335719-1-rongqianfeng@vivo.com Signed-off-by: Qianfeng Rong Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index af42aaa3d172..2ab283fd3032 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -496,7 +496,7 @@ static bool user_event_enabler_queue_fault(struct user_event_mm *mm, { struct user_event_enabler_fault *fault; - fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT | __GFP_NOWARN); + fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT); if (!fault) return false; -- cgit v1.2.3 From 3d62ab32df065e4a7797204a918f6489ddb8a237 Mon Sep 17 00:00:00 2001 From: Luo Gengkun Date: Tue, 19 Aug 2025 10:51:52 +0000 Subject: tracing: Fix tracing_marker may trigger page fault during preempt_disable Both tracing_mark_write and tracing_mark_raw_write call __copy_from_user_inatomic during preempt_disable. But in some case, __copy_from_user_inatomic may trigger page fault, and will call schedule() subtly. And if a task is migrated to other cpu, the following warning will be trigger: if (RB_WARN_ON(cpu_buffer, !local_read(&cpu_buffer->committing))) An example can illustrate this issue: process flow CPU --------------------------------------------------------------------- tracing_mark_raw_write(): cpu:0 ... ring_buffer_lock_reserve(): cpu:0 ... cpu = raw_smp_processor_id() cpu:0 cpu_buffer = buffer->buffers[cpu] cpu:0 ... ... __copy_from_user_inatomic(): cpu:0 ... # page fault do_mem_abort(): cpu:0 ... # Call schedule schedule() cpu:0 ... # the task schedule to cpu1 __buffer_unlock_commit(): cpu:1 ... ring_buffer_unlock_commit(): cpu:1 ... 
cpu = raw_smp_processor_id() cpu:1 cpu_buffer = buffer->buffers[cpu] cpu:1 As shown above, the process will acquire cpuid twice and the return values are not the same. To fix this problem using copy_from_user_nofault instead of __copy_from_user_inatomic, as the former performs 'access_ok' before copying. Link: https://lore.kernel.org/20250819105152.2766363-1-luogengkun@huaweicloud.com Fixes: 656c7f0d2d2b ("tracing: Replace kmap with copy_from_user() in trace_marker writing") Signed-off-by: Luo Gengkun Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1b7db732c0b1..2f1ae6c0ee81 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7209,7 +7209,7 @@ static ssize_t write_marker_to_buffer(struct trace_array *tr, const char __user entry = ring_buffer_event_data(event); entry->ip = ip; - len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt); + len = copy_from_user_nofault(&entry->buf, ubuf, cnt); if (len) { memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); cnt = FAULTED_SIZE; @@ -7306,7 +7306,7 @@ static ssize_t write_raw_marker_to_buffer(struct trace_array *tr, entry = ring_buffer_event_data(event); - len = __copy_from_user_inatomic(&entry->id, ubuf, cnt); + len = copy_from_user_nofault(&entry->id, ubuf, cnt); if (len) { entry->id = -1; memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); -- cgit v1.2.3 From 18dbcbfabfffc4a5d3ea10290c5ad27f22b0d240 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 11 Aug 2025 11:26:44 -0700 Subject: perf: Fix the POLL_HUP delivery breakage The event_limit can be set by the PERF_EVENT_IOC_REFRESH to limit the number of events. When the event_limit reaches 0, the POLL_HUP signal should be sent. But it's missed. The corresponding counter should be stopped when the event_limit reaches 0. It was implemented in the ARCH-specific code. However, since the commit 9734e25fbf5a ("perf: Fix the throttle logic for a group"), all the ARCH-specific code has been moved to the generic code. The code to handle the event_limit was lost. Add the event->pmu->stop(event, 0); back. Fixes: 9734e25fbf5a ("perf: Fix the throttle logic for a group") Closes: https://lore.kernel.org/lkml/aICYAqM5EQUlTqtX@li-2b55cdcc-350b-11b2-a85c-a78bff51fc11.ibm.com/ Reported-by: Sumanth Korikkar Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Tested-by: Sumanth Korikkar Link: https://lkml.kernel.org/r/20250811182644.1305952-1-kan.liang@linux.intel.com --- kernel/events/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 872122e074e5..820127536e62 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10330,6 +10330,7 @@ static int __perf_event_overflow(struct perf_event *event, ret = 1; event->pending_kill = POLL_HUP; perf_event_disable_inatomic(event); + event->pmu->stop(event, 0); } if (event->attr.sigtrap) { -- cgit v1.2.3 From 762af5a2aa0ad18da1316666dae30d369268d44c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 25 Aug 2025 15:26:35 +0200 Subject: vdso/vsyscall: Avoid slow division loop in auxiliary clock update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The call to __iter_div_u64_rem() in vdso_time_update_aux() is a wrapper around subtraction. 
It cannot be used to divide large numbers, as that introduces long, computationally expensive delays. A regular u64 division is also not possible in the timekeeper update path as it can be too slow. Instead of splitting the ktime_t offset into into second and subsecond components during the timekeeper update fast-path, do it together with the adjustment of tk->offs_aux in the slow-path. Equivalent to the handling of offs_boot and monotonic_to_boot. Reuse the storage of monotonic_to_boot for the new field, as it is not used by auxiliary timekeepers. Fixes: 380b84e168e5 ("vdso/vsyscall: Update auxiliary clock data in the datapage") Reported-by: Miroslav Lichvar Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250825-vdso-auxclock-division-v1-1-a1d32a16a313@linutronix.de Closes: https://lore.kernel.org/lkml/aKwsNNWsHJg8IKzj@localhost/ --- kernel/time/timekeeping.c | 10 ++++++++-- kernel/time/vsyscall.c | 4 ++-- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 059fa8b79be6..b6974fce800c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -83,6 +83,12 @@ static inline bool tk_is_aux(const struct timekeeper *tk) } #endif +static inline void tk_update_aux_offs(struct timekeeper *tk, ktime_t offs) +{ + tk->offs_aux = offs; + tk->monotonic_to_aux = ktime_to_timespec64(offs); +} + /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -1506,7 +1512,7 @@ static int __timekeeping_inject_offset(struct tk_data *tkd, const struct timespe timekeeping_restore_shadow(tkd); return -EINVAL; } - tks->offs_aux = offs; + tk_update_aux_offs(tks, offs); } timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL); @@ -2937,7 +2943,7 @@ static int aux_clock_set(const clockid_t id, const struct timespec64 *tnew) * xtime ("realtime") is not applicable for auxiliary clocks and * kept in sync with "monotonic". */ - aux_tks->offs_aux = ktime_sub(timespec64_to_ktime(*tnew), tnow); + tk_update_aux_offs(aux_tks, ktime_sub(timespec64_to_ktime(*tnew), tnow)); timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); return 0; diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 8ba8b0d8a387..aa59919b8f2c 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -159,10 +159,10 @@ void vdso_time_update_aux(struct timekeeper *tk) if (clock_mode != VDSO_CLOCKMODE_NONE) { fill_clock_configuration(vc, &tk->tkr_mono); - vdso_ts->sec = tk->xtime_sec; + vdso_ts->sec = tk->xtime_sec + tk->monotonic_to_aux.tv_sec; nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; - nsec += tk->offs_aux; + nsec += tk->monotonic_to_aux.tv_nsec; vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &nsec); nsec = nsec << tk->tkr_mono.shift; vdso_ts->nsec = nsec; -- cgit v1.2.3 From 5ebf512f335053a42482ebff91e46c6dc156bf8c Mon Sep 17 00:00:00 2001 From: Christian Loehle Date: Wed, 3 Sep 2025 16:48:32 +0100 Subject: sched: Fix sched_numa_find_nth_cpu() if mask offline sched_numa_find_nth_cpu() uses a bsearch to look for the 'closest' CPU in sched_domains_numa_masks and given cpus mask. However they might not intersect if all CPUs in the cpus mask are offline. bsearch will return NULL in that case, bail out instead of dereferencing a bogus pointer. The previous behaviour lead to this bug when using maxcpus=4 on an rk3399 (LLLLbb) (i.e. 
booting with all big CPUs offline): [ 1.422922] Unable to handle kernel paging request at virtual address ffffff8000000000 [ 1.423635] Mem abort info: [ 1.423889] ESR = 0x0000000096000006 [ 1.424227] EC = 0x25: DABT (current EL), IL = 32 bits [ 1.424715] SET = 0, FnV = 0 [ 1.424995] EA = 0, S1PTW = 0 [ 1.425279] FSC = 0x06: level 2 translation fault [ 1.425735] Data abort info: [ 1.425998] ISV = 0, ISS = 0x00000006, ISS2 = 0x00000000 [ 1.426499] CM = 0, WnR = 0, TnD = 0, TagAccess = 0 [ 1.426952] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [ 1.427428] swapper pgtable: 4k pages, 39-bit VAs, pgdp=0000000004a9f000 [ 1.428038] [ffffff8000000000] pgd=18000000f7fff403, p4d=18000000f7fff403, pud=18000000f7fff403, pmd=0000000000000000 [ 1.429014] Internal error: Oops: 0000000096000006 [#1] SMP [ 1.429525] Modules linked in: [ 1.429813] CPU: 3 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.17.0-rc4-dirty #343 PREEMPT [ 1.430559] Hardware name: Pine64 RockPro64 v2.1 (DT) [ 1.431012] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 1.431634] pc : sched_numa_find_nth_cpu+0x2a0/0x488 [ 1.432094] lr : sched_numa_find_nth_cpu+0x284/0x488 [ 1.432543] sp : ffffffc084e1b960 [ 1.432843] x29: ffffffc084e1b960 x28: ffffff80078a8800 x27: ffffffc0846eb1d0 [ 1.433495] x26: 0000000000000000 x25: 0000000000000000 x24: 0000000000000000 [ 1.434144] x23: 0000000000000000 x22: fffffffffff7f093 x21: ffffffc081de6378 [ 1.434792] x20: 0000000000000000 x19: 0000000ffff7f093 x18: 00000000ffffffff [ 1.435441] x17: 3030303866666666 x16: 66663d736b73616d x15: ffffffc104e1b5b7 [ 1.436091] x14: 0000000000000000 x13: ffffffc084712860 x12: 0000000000000372 [ 1.436739] x11: 0000000000000126 x10: ffffffc08476a860 x9 : ffffffc084712860 [ 1.437389] x8 : 00000000ffffefff x7 : ffffffc08476a860 x6 : 0000000000000000 [ 1.438036] x5 : 000000000000bff4 x4 : 0000000000000000 x3 : 0000000000000000 [ 1.438683] x2 : 0000000000000000 x1 : ffffffc0846eb000 x0 : ffffff8000407b68 [ 1.439332] Call trace: [ 1.439559] sched_numa_find_nth_cpu+0x2a0/0x488 (P) [ 1.440016] smp_call_function_any+0xc8/0xd0 [ 1.440416] armv8_pmu_init+0x58/0x27c [ 1.440770] armv8_cortex_a72_pmu_init+0x20/0x2c [ 1.441199] arm_pmu_device_probe+0x1e4/0x5e8 [ 1.441603] armv8_pmu_device_probe+0x1c/0x28 [ 1.442007] platform_probe+0x5c/0xac [ 1.442347] really_probe+0xbc/0x298 [ 1.442683] __driver_probe_device+0x78/0x12c [ 1.443087] driver_probe_device+0xdc/0x160 [ 1.443475] __driver_attach+0x94/0x19c [ 1.443833] bus_for_each_dev+0x74/0xd4 [ 1.444190] driver_attach+0x24/0x30 [ 1.444525] bus_add_driver+0xe4/0x208 [ 1.444874] driver_register+0x60/0x128 [ 1.445233] __platform_driver_register+0x24/0x30 [ 1.445662] armv8_pmu_driver_init+0x28/0x4c [ 1.446059] do_one_initcall+0x44/0x25c [ 1.446416] kernel_init_freeable+0x1dc/0x3bc [ 1.446820] kernel_init+0x20/0x1d8 [ 1.447151] ret_from_fork+0x10/0x20 [ 1.447493] Code: 90022e21 f000e5f5 910de2b5 2a1703e2 (f8767803) [ 1.448040] ---[ end trace 0000000000000000 ]--- [ 1.448483] note: swapper/0[1] exited with preempt_count 1 [ 1.449047] Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b [ 1.449741] SMP: stopping secondary CPUs [ 1.450105] Kernel Offset: disabled [ 1.450419] CPU features: 0x000000,00080000,20002001,0400421b [ 1.450935] Memory Limit: none [ 1.451217] ---[ end Kernel panic - not syncing: Attempted to kill init! 
exitcode=0x0000000b ]--- Yury: with the fix, the function returns cpu == nr_cpu_ids, and later in smp_call_function_any -> smp_call_function_single -> generic_exec_single we test the cpu for '>= nr_cpu_ids' and return -ENXIO. So everything is handled correctly. Fixes: cd7f55359c90 ("sched: add sched_numa_find_nth_cpu()") Cc: stable@vger.kernel.org Signed-off-by: Christian Loehle Signed-off-by: Yury Norov (NVIDIA) --- kernel/sched/topology.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 977e133bb8a4..6e2f54169e66 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2201,6 +2201,8 @@ int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node) goto unlock; hop_masks = bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), hop_cmp); + if (!hop_masks) + goto unlock; hop = hop_masks - k.masks; ret = hop ? -- cgit v1.2.3 From 4540f1d23e7f387880ce46d11b5cd3f27248bf8d Mon Sep 17 00:00:00 2001 From: Stanislav Fort Date: Tue, 2 Sep 2025 14:00:49 +0300 Subject: audit: fix out-of-bounds read in audit_compare_dname_path() When a watch on dir=/ is combined with an fsnotify event for a single-character name directly under / (e.g., creating /a), an out-of-bounds read can occur in audit_compare_dname_path(). The helper parent_len() returns 1 for "/". In audit_compare_dname_path(), when parentlen equals the full path length (1), the code sets p = path + 1 and pathlen = 1 - 1 = 0. The subsequent loop then dereferences p[pathlen - 1] (i.e., p[-1]), causing an out-of-bounds read. Fix this by adding a pathlen > 0 check to the while loop condition to prevent the out-of-bounds access. Cc: stable@vger.kernel.org Fixes: e92eebb0d611 ("audit: fix suffixed '/' filename matching") Reported-by: Stanislav Fort Suggested-by: Linus Torvalds Signed-off-by: Stanislav Fort [PM: subject tweak, sign-off email fixes] Signed-off-by: Paul Moore --- kernel/auditfilter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index e3f42018ed46..f7708fe2c457 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1326,7 +1326,7 @@ int audit_compare_dname_path(const struct qstr *dname, const char *path, int par /* handle trailing slashes */ pathlen -= parentlen; - while (p[pathlen - 1] == '/') + while (pathlen > 0 && p[pathlen - 1] == '/') pathlen--; if (pathlen != dlen) -- cgit v1.2.3 From ab1396af7595e7d49a3850481b24d7fe7cbdfd31 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 5 Sep 2025 22:06:18 -0700 Subject: trace/fgraph: Fix error handling Commit edede7a6dcd7 ("trace/fgraph: Fix the warning caused by missing unregister notifier") added a call to unregister the PM notifier if register_ftrace_graph() failed. It does so unconditionally. However, the PM notifier is only registered with the first call to register_ftrace_graph(). If the first registration was successful and a subsequent registration failed, the notifier is now unregistered even if ftrace graphs are still registered. Fix the problem by only unregistering the PM notifier during error handling if there are no active fgraph registrations. 
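The broken sequence, roughly:

	register_ftrace_graph(A)   /* ftrace_graph_active 0 -> 1,
	                            * PM notifier registered            */
	register_ftrace_graph(B)   /* fails; the error path always ran
	                            * unregister_pm_notifier()          */
	                           /* A is still active, notifier gone  */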
Fixes: edede7a6dcd7 ("trace/fgraph: Fix the warning caused by missing unregister notifier") Closes: https://lore.kernel.org/all/63b0ba5a-a928-438e-84f9-93028dd72e54@roeck-us.net/ Cc: Ye Weihua Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250906050618.2634078-1-linux@roeck-us.net Signed-off-by: Guenter Roeck Signed-off-by: Steven Rostedt (Google) --- kernel/trace/fgraph.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 2a42c1036ea8..1e3b32b1e82c 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -1397,7 +1397,8 @@ error: ftrace_graph_active--; gops->saved_func = NULL; fgraph_lru_release_index(i); - unregister_pm_notifier(&ftrace_suspend_notifier); + if (!ftrace_graph_active) + unregister_pm_notifier(&ftrace_suspend_notifier); } return ret; } -- cgit v1.2.3 From c1628c00c4351dd0727ef7f670694f68d9e663d8 Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Sat, 6 Sep 2025 11:56:10 +0800 Subject: tracing/osnoise: Fix null-ptr-deref in bitmap_parselist() A crash was observed with the following output: BUG: kernel NULL pointer dereference, address: 0000000000000010 Oops: Oops: 0000 [#1] SMP NOPTI CPU: 2 UID: 0 PID: 92 Comm: osnoise_cpus Not tainted 6.17.0-rc4-00201-gd69eb204c255 #138 PREEMPT(voluntary) RIP: 0010:bitmap_parselist+0x53/0x3e0 Call Trace: osnoise_cpus_write+0x7a/0x190 vfs_write+0xf8/0x410 ? do_sys_openat2+0x88/0xd0 ksys_write+0x60/0xd0 do_syscall_64+0xa4/0x260 entry_SYSCALL_64_after_hwframe+0x77/0x7f This issue can be reproduced by below code: fd=open("/sys/kernel/debug/tracing/osnoise/cpus", O_WRONLY); write(fd, "0-2", 0); When user pass 'count=0' to osnoise_cpus_write(), kmalloc() will return ZERO_SIZE_PTR (16) and cpulist_parse() treat it as a normal value, which trigger the null pointer dereference. Add check for the parameter 'count'. 
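Sketch of the zero-length hazard (variable names illustrative, assuming the usual kmalloc() ZERO_SIZE_PTR behaviour):

	buf = kmalloc(count, GFP_KERNEL);     /* count == 0: returns ZERO_SIZE_PTR (16), not NULL */
	...
	err = cpulist_parse(buf, new_mask);   /* bitmap_parselist() dereferences the bogus pointer */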
Cc: Cc: Cc: Link: https://lore.kernel.org/20250906035610.3880282-1-wangliang74@huawei.com Fixes: 17f89102fe23 ("tracing/osnoise: Allow arbitrarily long CPU string") Signed-off-by: Wang Liang Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_osnoise.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index fd259da0aa64..337bc0eb5d71 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -2322,6 +2322,9 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count, int running, err; char *buf __free(kfree) = NULL; + if (count < 1) + return 0; + buf = kmalloc(count, GFP_KERNEL); if (!buf) return -ENOMEM; -- cgit v1.2.3 From cd4453c5e983cf1fd5757e9acb915adb1e4602b6 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Mon, 8 Sep 2025 02:46:58 +0000 Subject: tracing: Silence warning when chunk allocation fails in trace_pid_write Syzkaller trigger a fault injection warning: WARNING: CPU: 1 PID: 12326 at tracepoint_add_func+0xbfc/0xeb0 Modules linked in: CPU: 1 UID: 0 PID: 12326 Comm: syz.6.10325 Tainted: G U 6.14.0-rc5-syzkaller #0 Tainted: [U]=USER Hardware name: Google Compute Engine/Google Compute Engine RIP: 0010:tracepoint_add_func+0xbfc/0xeb0 kernel/tracepoint.c:294 Code: 09 fe ff 90 0f 0b 90 0f b6 74 24 43 31 ff 41 bc ea ff ff ff RSP: 0018:ffffc9000414fb48 EFLAGS: 00010283 RAX: 00000000000012a1 RBX: ffffffff8e240ae0 RCX: ffffc90014b78000 RDX: 0000000000080000 RSI: ffffffff81bbd78b RDI: 0000000000000001 RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000001 R12: ffffffffffffffef R13: 0000000000000000 R14: dffffc0000000000 R15: ffffffff81c264f0 FS: 00007f27217f66c0(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b2e80dff8 CR3: 00000000268f8000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: tracepoint_probe_register_prio+0xc0/0x110 kernel/tracepoint.c:464 register_trace_prio_sched_switch include/trace/events/sched.h:222 [inline] register_pid_events kernel/trace/trace_events.c:2354 [inline] event_pid_write.isra.0+0x439/0x7a0 kernel/trace/trace_events.c:2425 vfs_write+0x24c/0x1150 fs/read_write.c:677 ksys_write+0x12b/0x250 fs/read_write.c:731 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x250 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f We can reproduce the warning by following the steps below: 1. echo 8 >> set_event_notrace_pid. Let tr->filtered_pids owns one pid and register sched_switch tracepoint. 2. echo ' ' >> set_event_pid, and perform fault injection during chunk allocation of trace_pid_list_alloc. Let pid_list with no pid and assign to tr->filtered_pids. 3. echo ' ' >> set_event_pid. Let pid_list is NULL and assign to tr->filtered_pids. 4. echo 9 >> set_event_pid, will trigger the double register sched_switch tracepoint warning. The reason is that syzkaller injects a fault into the chunk allocation in trace_pid_list_alloc, causing a failure in trace_pid_list_set, which may trigger double register of the same tracepoint. This only occurs when the system is about to crash, but to suppress this warning, let's add failure handling logic to trace_pid_list_set. 
Link: https://lore.kernel.org/20250908024658.2390398-1-pulehui@huaweicloud.com Fixes: 8d6e90983ade ("tracing: Create a sparse bitmask for pid filtering") Reported-by: syzbot+161412ccaeff20ce4dde@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/67cb890e.050a0220.d8275.022e.GAE@google.com Signed-off-by: Pu Lehui Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2f1ae6c0ee81..b3c94fbaf002 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -834,7 +834,10 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, /* copy the current bits to the new max */ ret = trace_pid_list_first(filtered_pids, &pid); while (!ret) { - trace_pid_list_set(pid_list, pid); + ret = trace_pid_list_set(pid_list, pid); + if (ret < 0) + goto out; + ret = trace_pid_list_next(filtered_pids, pid + 1, &pid); nr_pids++; } @@ -871,6 +874,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, trace_parser_clear(&parser); ret = 0; } + out: trace_parser_put(&parser); if (ret < 0) { -- cgit v1.2.3 From e895f8e29119c8c966ea794af9e9100b10becb88 Mon Sep 17 00:00:00 2001 From: Xiongfeng Wang Date: Tue, 5 Aug 2025 16:10:25 +0800 Subject: hrtimers: Unconditionally update target CPU base after offline timer migration When testing softirq based hrtimers on an ARM32 board, with high resolution mode and NOHZ inactive, softirq based hrtimers fail to expire after being moved away from an offline CPU: CPU0 CPU1 hrtimer_start(..., HRTIMER_MODE_SOFT); cpu_down(CPU1) ... hrtimers_cpu_dying() // Migrate timers to CPU0 smp_call_function_single(CPU0, returgger_next_event); retrigger_next_event() if (!highres && !nohz) return; As retrigger_next_event() is a NOOP when both high resolution timers and NOHZ are inactive CPU0's hrtimer_cpu_base::softirq_expires_next is not updated and the migrated softirq timers never expire unless there is a softirq based hrtimer queued on CPU0 later. Fix this by removing the hrtimer_hres_active() and tick_nohz_active() check in retrigger_next_event(), which enforces a full update of the CPU base. As this is not a fast path the extra cost does not matter. [ tglx: Massaged change log ] Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier") Co-developed-by: Frederic Weisbecker Signed-off-by: Frederic Weisbecker Signed-off-by: Xiongfeng Wang Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250805081025.54235-1-wangxiongfeng2@huawei.com --- kernel/time/hrtimer.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 30899a8cc52c..e8c479329282 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -787,10 +787,10 @@ static void retrigger_next_event(void *arg) * of the next expiring timer is enough. The return from the SMP * function call will take care of the reprogramming in case the * CPU was in a NOHZ idle sleep. + * + * In periodic low resolution mode, the next softirq expiration + * must also be updated. */ - if (!hrtimer_hres_active(base) && !tick_nohz_active) - return; - raw_spin_lock(&base->lock); hrtimer_update_base(base); if (hrtimer_hres_active(base)) @@ -2295,11 +2295,6 @@ int hrtimers_cpu_dying(unsigned int dying_cpu) &new_base->clock_base[i]); } - /* - * The migration might have changed the first expiring softirq - * timer on this CPU. Update it. 
- */ - __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT); /* Tell the other CPU to retrigger the next event */ smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); -- cgit v1.2.3 From f9bb6ffa7f5ad0f8ee0f53fc4a10655872ee4a14 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 29 Aug 2025 16:36:56 +0200 Subject: bpf: Fix out-of-bounds dynptr write in bpf_crypto_crypt Stanislav reported that in bpf_crypto_crypt() the destination dynptr's size is not validated to be at least as large as the source dynptr's size before calling into the crypto backend with 'len = src_len'. This can result in an OOB write when the destination is smaller than the source. Concretely, in mentioned function, psrc and pdst are both linear buffers fetched from each dynptr: psrc = __bpf_dynptr_data(src, src_len); [...] pdst = __bpf_dynptr_data_rw(dst, dst_len); [...] err = decrypt ? ctx->type->decrypt(ctx->tfm, psrc, pdst, src_len, piv) : ctx->type->encrypt(ctx->tfm, psrc, pdst, src_len, piv); The crypto backend expects pdst to be large enough with a src_len length that can be written. Add an additional src_len > dst_len check and bail out if it's the case. Note that these kfuncs are accessible under root privileges only. Fixes: 3e1c6f35409f ("bpf: make common crypto API for TC/XDP programs") Reported-by: Stanislav Fort Signed-off-by: Daniel Borkmann Cc: Vadim Fedorenko Reviewed-by: Vadim Fedorenko Link: https://lore.kernel.org/r/20250829143657.318524-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- kernel/bpf/crypto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c index 94854cd9c4cc..83c4d9943084 100644 --- a/kernel/bpf/crypto.c +++ b/kernel/bpf/crypto.c @@ -278,7 +278,7 @@ static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx, siv_len = siv ? __bpf_dynptr_size(siv) : 0; src_len = __bpf_dynptr_size(src); dst_len = __bpf_dynptr_size(dst); - if (!src_len || !dst_len) + if (!src_len || !dst_len || src_len > dst_len) return -EINVAL; if (siv_len != ctx->siv_len) -- cgit v1.2.3 From 7edfc024708258d75f65fadffd7e5f6ac46810b6 Mon Sep 17 00:00:00 2001 From: Rong Tao Date: Sat, 30 Aug 2025 00:31:58 +0800 Subject: bpf: Fix bpf_strnstr() to handle suffix match cases better bpf_strnstr() should not treat the ending '\0' of s2 as a matching character if the parameter 'len' equal to s2 string length, for example: 1. bpf_strnstr("openat", "open", 4) = -ENOENT 2. bpf_strnstr("openat", "open", 5) = 0 This patch makes (1) return 0, fix just the `len == strlen(s2)` case. And fix a more general case when s2 is a suffix of the first len characters of s1. 
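The intended semantics can be written as a plain-C reference sketch (not the kfunc itself, which additionally bounds the scan by XATTR_SIZE_MAX and guards every byte access with __get_kernel_nofault()):

	/* Return the offset of s2 within the first len chars of s1, or -ENOENT. */
	static int strnstr_ref(const char *s1, const char *s2, size_t len)
	{
		size_t n = strlen(s2);
		size_t i;

		for (i = 0; i + n <= len; i++) {
			if (!strncmp(s1 + i, s2, n))
				return i;
			if (s1[i] == '\0')
				break;
		}
		return -ENOENT;
	}

With this, strnstr_ref("openat", "open", 4) == 0 and strnstr_ref("openat", "at", 6) == 4, matching the suffix cases above.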
Fixes: e91370550f1f ("bpf: Add kfuncs for read-only string operations") Signed-off-by: Rong Tao Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/tencent_17DC57B9D16BC443837021BEACE84B7C1507@qq.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6b4877e85a68..b9b0c5fe33f6 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3664,10 +3664,17 @@ __bpf_kfunc int bpf_strnstr(const char *s1__ign, const char *s2__ign, size_t len guard(pagefault)(); for (i = 0; i < XATTR_SIZE_MAX; i++) { - for (j = 0; i + j < len && j < XATTR_SIZE_MAX; j++) { + for (j = 0; i + j <= len && j < XATTR_SIZE_MAX; j++) { __get_kernel_nofault(&c2, s2__ign + j, char, err_out); if (c2 == '\0') return i; + /* + * We allow reading an extra byte from s2 (note the + * `i + j <= len` above) to cover the case when s2 is + * a suffix of the first len chars of s1. + */ + if (i + j == len) + break; __get_kernel_nofault(&c1, s1__ign + j, char, err_out); if (c1 == '\0') return -ENOENT; -- cgit v1.2.3 From 0d80e7f951be1bdd08d328fd87694be0d6e8aaa8 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 9 Sep 2025 18:49:59 +0000 Subject: rqspinlock: Choose trylock fallback for NMI waiters Currently, out of all 3 types of waiters in the rqspinlock slow path (i.e., pending bit waiter, wait queue head waiter, and wait queue non-head waiter), only the pending bit waiter and wait queue head waiters apply deadlock checks and a timeout on their waiting loop. The assumption here was that the wait queue head's forward progress would be sufficient to identify cases where the lock owner or pending bit waiter is stuck, and non-head waiters relying on the head waiter would prove to be sufficient for their own forward progress. However, the head waiter itself can be preempted by a non-head waiter for the same lock (AA) or a different lock (ABBA) in a manner that impedes its forward progress. In such a case, non-head waiters not performing deadlock and timeout checks becomes insufficient, and the system can enter a state of lockup. This is typically not a concern with non-NMI lock acquisitions, as lock holders which in run in different contexts (IRQ, non-IRQ) use "irqsave" variants of the lock APIs, which naturally excludes such lock holders from preempting one another on the same CPU. It might seem likely that a similar case may occur for rqspinlock when programs are attached to contention tracepoints (begin, end), however, these tracepoints either precede the enqueue into the wait queue, or succeed it, therefore cannot be used to preempt a head waiter's waiting loop. We must still be careful against nested kprobe and fentry programs that may attach to the middle of the head's waiting loop to stall forward progress and invoke another rqspinlock acquisition that proceeds as a non-head waiter. To this end, drop CC_FLAGS_FTRACE from the rqspinlock.o object file. For now, this issue is resolved by falling back to a repeated trylock on the lock word from NMI context, while performing the deadlock checks to break out early in case forward progress is impossible, and use the timeout as a final fallback. A more involved fix to terminate the queue when such a condition occurs will be made as a follow up. 
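A minimal illustration of the AA case being worked around (flow only; lock API and slow-path details omitted):

	rqspinlock slow path (task)     /* becomes wait-queue head, spins       */
	  <NMI>
	    rqspinlock slow path        /* nested AA attempt: previously queued
	                                 * as a non-head waiter with no deadlock
	                                 * or timeout checks, spinning on the
	                                 * head it just interrupted -> lockup   */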
A selftest to stress this aspect of nested NMI/non-NMI locking attempts will be added in a subsequent patch to the bpf-next tree when this fix lands and trees are synchronized. Reported-by: Josef Bacik Fixes: 164c246571e9 ("rqspinlock: Protect waiters in queue from stalls") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250909184959.3509085-1-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/Makefile | 1 + kernel/bpf/rqspinlock.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 269c04a24664..f6cf8c2af5f7 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -62,3 +62,4 @@ CFLAGS_REMOVE_bpf_lru_list.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_queue_stack_maps.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_lpm_trie.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_ringbuf.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_rqspinlock.o = $(CC_FLAGS_FTRACE) diff --git a/kernel/bpf/rqspinlock.c b/kernel/bpf/rqspinlock.c index 5ab354d55d82..a00561b1d3e5 100644 --- a/kernel/bpf/rqspinlock.c +++ b/kernel/bpf/rqspinlock.c @@ -471,7 +471,7 @@ queue: * any MCS node. This is not the most elegant solution, but is * simple enough. */ - if (unlikely(idx >= _Q_MAX_NODES)) { + if (unlikely(idx >= _Q_MAX_NODES || in_nmi())) { lockevent_inc(lock_no_node); RES_RESET_TIMEOUT(ts, RES_DEF_TIMEOUT); while (!queued_spin_trylock(lock)) { -- cgit v1.2.3 From df0cb5cb50bd54d3cd4d0d83417ceec6a66404aa Mon Sep 17 00:00:00 2001 From: KaFai Wan Date: Tue, 9 Sep 2025 22:46:14 +0800 Subject: bpf: Allow fall back to interpreter for programs with stack size <= 512 OpenWRT users reported regression on ARMv6 devices after updating to latest HEAD, where tcpdump filter: tcpdump "not ether host 3c37121a2b3c and not ether host 184ecbca2a3a \ and not ether host 14130b4d3f47 and not ether host f0f61cf440b7 \ and not ether host a84b4dedf471 and not ether host d022be17e1d7 \ and not ether host 5c497967208b and not ether host 706655784d5b" fails with warning: "Kernel filter failed: No error information" when using config: # CONFIG_BPF_JIT_ALWAYS_ON is not set CONFIG_BPF_JIT_DEFAULT_ON=y The issue arises because commits: 1. "bpf: Fix array bounds error with may_goto" changed default runtime to __bpf_prog_ret0_warn when jit_requested = 1 2. "bpf: Avoid __bpf_prog_ret0_warn when jit fails" returns error when jit_requested = 1 but jit fails This change restores interpreter fallback capability for BPF programs with stack size <= 512 bytes when jit fails. Reported-by: Felix Fietkau Closes: https://lore.kernel.org/bpf/2e267b4b-0540-45d8-9310-e127bf95fc63@nbd.name/ Fixes: 6ebc5030e0c5 ("bpf: Fix array bounds error with may_goto") Signed-off-by: KaFai Wan Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250909144614.2991253-1-kafai.wan@linux.dev Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f8ac77d08ca7..e4568d44e827 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2366,8 +2366,7 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx, const struct bpf_insn *insn) { /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON - * is not working properly, or interpreter is being used when - * prog->jit_requested is not 0, so warn about it! + * is not working properly, so warn about it! 
*/ WARN_ON_ONCE(1); return 0; @@ -2468,8 +2467,9 @@ out: return ret; } -static void bpf_prog_select_func(struct bpf_prog *fp) +static bool bpf_prog_select_interpreter(struct bpf_prog *fp) { + bool select_interpreter = false; #ifndef CONFIG_BPF_JIT_ALWAYS_ON u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); u32 idx = (round_up(stack_depth, 32) / 32) - 1; @@ -2478,15 +2478,16 @@ static void bpf_prog_select_func(struct bpf_prog *fp) * But for non-JITed programs, we don't need bpf_func, so no bounds * check needed. */ - if (!fp->jit_requested && - !WARN_ON_ONCE(idx >= ARRAY_SIZE(interpreters))) { + if (idx < ARRAY_SIZE(interpreters)) { fp->bpf_func = interpreters[idx]; + select_interpreter = true; } else { fp->bpf_func = __bpf_prog_ret0_warn; } #else fp->bpf_func = __bpf_prog_ret0_warn; #endif + return select_interpreter; } /** @@ -2505,7 +2506,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) /* In case of BPF to BPF calls, verifier did all the prep * work with regards to JITing, etc. */ - bool jit_needed = fp->jit_requested; + bool jit_needed = false; if (fp->bpf_func) goto finalize; @@ -2514,7 +2515,8 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) bpf_prog_has_kfunc_call(fp)) jit_needed = true; - bpf_prog_select_func(fp); + if (!bpf_prog_select_interpreter(fp)) + jit_needed = true; /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during -- cgit v1.2.3 From 6d78b4473cdb08b74662355a9e8510bde09c511e Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Tue, 9 Sep 2025 09:52:20 +0000 Subject: bpf: Tell memcg to use allow_spinning=false path in bpf_timer_init() Currently, calling bpf_map_kmalloc_node() from __bpf_async_init() can cause various locking issues; see the following stack trace (edited for style) as one example: ... [10.011566] do_raw_spin_lock.cold [10.011570] try_to_wake_up (5) double-acquiring the same [10.011575] kick_pool rq_lock, causing a hardlockup [10.011579] __queue_work [10.011582] queue_work_on [10.011585] kernfs_notify [10.011589] cgroup_file_notify [10.011593] try_charge_memcg (4) memcg accounting raises an [10.011597] obj_cgroup_charge_pages MEMCG_MAX event [10.011599] obj_cgroup_charge_account [10.011600] __memcg_slab_post_alloc_hook [10.011603] __kmalloc_node_noprof ... [10.011611] bpf_map_kmalloc_node [10.011612] __bpf_async_init [10.011615] bpf_timer_init (3) BPF calls bpf_timer_init() [10.011617] bpf_prog_xxxxxxxxxxxxxxxx_fcg_runnable [10.011619] bpf__sched_ext_ops_runnable [10.011620] enqueue_task_scx (2) BPF runs with rq_lock held [10.011622] enqueue_task [10.011626] ttwu_do_activate [10.011629] sched_ttwu_pending (1) grabs rq_lock ... The above was reproduced on bpf-next (b338cf849ec8) by modifying ./tools/sched_ext/scx_flatcg.bpf.c to call bpf_timer_init() during ops.runnable(), and hacking the memcg accounting code a bit to make a bpf_timer_init() call more likely to raise an MEMCG_MAX event. We have also run into other similar variants (both internally and on bpf-next), including double-acquiring cgroup_file_kn_lock, the same worker_pool::lock, etc. As suggested by Shakeel, fix this by using __GFP_HIGH instead of GFP_ATOMIC in __bpf_async_init(), so that e.g. if try_charge_memcg() raises an MEMCG_MAX event, we call __memcg_memory_event() with @allow_spinning=false and avoid calling cgroup_file_notify() there. 
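In gfp terms the change keeps the atomic watermark access but drops the reclaim bits; with the standard definitions this works out to:

	GFP_ATOMIC    == __GFP_HIGH | __GFP_KSWAPD_RECLAIM
	__GFP_RECLAIM == __GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM
	so __GFP_HIGH == GFP_ATOMIC & ~__GFP_RECLAIM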
Depends on mm patch "memcg: skip cgroup_file_notify if spinning is not allowed": https://lore.kernel.org/bpf/20250905201606.66198-1-shakeel.butt@linux.dev/ v0 approach s/bpf_map_kmalloc_node/bpf_mem_alloc/ https://lore.kernel.org/bpf/20250905061919.439648-1-yepeilin@google.com/ v1 approach: https://lore.kernel.org/bpf/20250905234547.862249-1-yepeilin@google.com/ Fixes: b00628b1c7d5 ("bpf: Introduce bpf timers.") Suggested-by: Shakeel Butt Signed-off-by: Peilin Ye Link: https://lore.kernel.org/r/20250909095222.2121438-1-yepeilin@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index b9b0c5fe33f6..8af62cb243d9 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1274,8 +1274,11 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u goto out; } - /* allocate hrtimer via map_kmalloc to use memcg accounting */ - cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node); + /* Allocate via bpf_map_kmalloc_node() for memcg accounting. Until + * kmalloc_nolock() is available, avoid locking issues by using + * __GFP_HIGH (GFP_ATOMIC & ~__GFP_RECLAIM). + */ + cb = bpf_map_kmalloc_node(map, size, __GFP_HIGH, map->numa_node); if (!cb) { ret = -ENOMEM; goto out; -- cgit v1.2.3 From e0423541477dfb684fbc6e6b5386054bc650f264 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 5 Sep 2025 15:44:45 +0200 Subject: PM: EM: Add function for registering a PD without capacity update The intel_pstate driver manages CPU capacity changes itself and it does not need an update of the capacity of all CPUs in the system to be carried out after registering a PD. Moreover, in some configurations (for instance, an SMT-capable hybrid x86 system booted with nosmt in the kernel command line) the em_check_capacity_update() call at the end of em_dev_register_perf_domain() always fails and reschedules itself to run once again in 1 s, so effectively it runs in vain every 1 s forever. To address this, introduce a new variant of em_dev_register_perf_domain(), called em_dev_register_pd_no_update(), that does not invoke em_check_capacity_update(), and make intel_pstate use it instead of the original. Fixes: 7b010f9b9061 ("cpufreq: intel_pstate: EAS support for hybrid platforms") Closes: https://lore.kernel.org/linux-pm/40212796-734c-4140-8a85-854f72b8144d@panix.com/ Reported-by: Kenneth R. Crudup Tested-by: Kenneth R. Crudup Cc: 6.16+ # 6.16+ Signed-off-by: Rafael J. 
From e0423541477dfb684fbc6e6b5386054bc650f264 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki"
Date: Fri, 5 Sep 2025 15:44:45 +0200
Subject: PM: EM: Add function for registering a PD without capacity update

The intel_pstate driver manages CPU capacity changes itself and it does
not need an update of the capacity of all CPUs in the system to be
carried out after registering a PD.

Moreover, in some configurations (for instance, an SMT-capable hybrid
x86 system booted with nosmt in the kernel command line) the
em_check_capacity_update() call at the end of
em_dev_register_perf_domain() always fails and reschedules itself to
run once again in 1 s, so effectively it runs in vain every 1 s
forever.

To address this, introduce a new variant of
em_dev_register_perf_domain(), called em_dev_register_pd_no_update(),
that does not invoke em_check_capacity_update(), and make intel_pstate
use it instead of the original.

Fixes: 7b010f9b9061 ("cpufreq: intel_pstate: EAS support for hybrid platforms")
Closes: https://lore.kernel.org/linux-pm/40212796-734c-4140-8a85-854f72b8144d@panix.com/
Reported-by: Kenneth R. Crudup
Tested-by: Kenneth R. Crudup
Cc: 6.16+ # 6.16+
Signed-off-by: Rafael J. Wysocki
---
 kernel/power/energy_model.c | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index ea7995a25780..8df55397414a 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -552,6 +552,30 @@ EXPORT_SYMBOL_GPL(em_cpu_get);
 int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 				const struct em_data_callback *cb,
 				const cpumask_t *cpus, bool microwatts)
+{
+	int ret = em_dev_register_pd_no_update(dev, nr_states, cb, cpus, microwatts);
+
+	if (_is_cpu_device(dev))
+		em_check_capacity_update();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(em_dev_register_perf_domain);
+
+/**
+ * em_dev_register_pd_no_update() - Register a perf domain for a device
+ * @dev		: Device to register the PD for
+ * @nr_states	: Number of performance states in the new PD
+ * @cb		: Callback functions for populating the energy model
+ * @cpus	: CPUs to include in the new PD (mandatory if @dev is a CPU device)
+ * @microwatts	: Whether or not the power values in the EM will be in uW
+ *
+ * Like em_dev_register_perf_domain(), but does not trigger a CPU capacity
+ * update after registering the PD, even if @dev is a CPU device.
+ */
+int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states,
+				 const struct em_data_callback *cb,
+				 const cpumask_t *cpus, bool microwatts)
 {
 	struct em_perf_table *em_table;
 	unsigned long cap, prev_cap = 0;
@@ -636,12 +660,9 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
 unlock:
 	mutex_unlock(&em_pd_mutex);
 
-	if (_is_cpu_device(dev))
-		em_check_capacity_update();
-
 	return ret;
 }
-EXPORT_SYMBOL_GPL(em_dev_register_perf_domain);
+EXPORT_SYMBOL_GPL(em_dev_register_pd_no_update);
 
 /**
  * em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device
--
cgit v1.2.3
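
A hypothetical caller sketch (not part of the patch) of how a driver that,
like intel_pstate, manages CPU capacity on its own would use the new entry
point; my_register_pd() and the passed-in callback are assumptions, only
em_dev_register_pd_no_update() comes from the change above:

static int my_register_pd(struct device *cpu_dev, unsigned int nr_states,
			  const struct em_data_callback *my_em_cb,
			  const cpumask_t *cpus)
{
	/* Same arguments as em_dev_register_perf_domain(), but no CPU
	 * capacity update is scheduled after the PD is registered.
	 */
	return em_dev_register_pd_no_update(cpu_dev, nr_states, my_em_cb,
					    cpus, true);
}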
From 449c9c02537a146ac97ef962327a221e21c9cab3 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki"
Date: Wed, 10 Sep 2025 11:41:59 +0200
Subject: PM: hibernate: Restrict GFP mask in hibernation_snapshot()

Commit 12ffc3b1513e ("PM: Restrict swap use to later in the suspend
sequence") incorrectly removed a pm_restrict_gfp_mask() call from
hibernation_snapshot(), so memory allocations involving swap are no
longer prevented from being carried out in this code path, which may
lead to serious breakage.

The symptoms of such breakage have become visible after adding a
shrink_shmem_memory() call to hibernation_snapshot() in commit
2640e819474f ("PM: hibernate: shrink shmem pages after
dev_pm_ops.prepare()"), which made this problem much more likely to
manifest itself.

However, since commit 2640e819474f was initially present in the DRM
tree that did not include commit 12ffc3b1513e, the symptoms of this
issue were not visible until merge commit 260f6f4fda93 ("Merge tag
'drm-next-2025-07-30' of https://gitlab.freedesktop.org/drm/kernel")
that exposed it through an entirely reasonable merge conflict
resolution.

Fixes: 12ffc3b1513e ("PM: Restrict swap use to later in the suspend sequence")
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220555
Reported-by: Todd Brandt
Tested-by: Todd Brandt
Cc: 6.16+ # 6.16+
Signed-off-by: Rafael J. Wysocki
Reviewed-by: Mario Limonciello (AMD)
---
 kernel/power/hibernate.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 1f1f30cca573..2f66ab453823 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -449,6 +449,7 @@ int hibernation_snapshot(int platform_mode)
 	shrink_shmem_memory();
 
 	console_suspend_all();
+	pm_restrict_gfp_mask();
 	error = dpm_suspend(PMSG_FREEZE);
--
cgit v1.2.3

From e25ddfb388c8b7e5f20e3bf38d627fb485003781 Mon Sep 17 00:00:00 2001
From: Leon Hwang
Date: Wed, 10 Sep 2025 20:57:39 +0800
Subject: bpf: Reject bpf_timer for PREEMPT_RT

When CONFIG_PREEMPT_RT is enabled, the kernel warns when running the
timer selftests with './test_progs -t timer':

BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48

To avoid this warning, make the verifier reject bpf_timer when
PREEMPT_RT is enabled.

Signed-off-by: Leon Hwang
Link: https://lore.kernel.org/r/20250910125740.52172-2-leon.hwang@linux.dev
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/verifier.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c89e2b1bc644..9fb1f957a093 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -8547,6 +8547,10 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
 		verifier_bug(env, "Two map pointers in a timer helper");
 		return -EFAULT;
 	}
+	if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+		verbose(env, "bpf_timer cannot be used for PREEMPT_RT.\n");
+		return -EOPNOTSUPP;
+	}
 	meta->map_uid = reg->map_uid;
 	meta->map_ptr = map;
 	return 0;
--
cgit v1.2.3
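
For context, a minimal BPF-side sketch (an assumption, not taken from the
patch or the selftests) of the construct that now fails to load on
PREEMPT_RT kernels: any program that embeds a struct bpf_timer in a map
value and calls bpf_timer_init() on it is rejected by process_timer_func()
with -EOPNOTSUPP and the message above.

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

#define CLOCK_MONOTONIC 1

struct map_elem {
	struct bpf_timer timer;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, int);
	__type(value, struct map_elem);
} timer_map SEC(".maps");

SEC("fentry/bpf_fentry_test1")
int BPF_PROG(arm_timer)
{
	int key = 0;
	struct map_elem *val;

	val = bpf_map_lookup_elem(&timer_map, &key);
	if (!val)
		return 0;
	/* On PREEMPT_RT kernels this is the call the verifier now refuses. */
	bpf_timer_init(&val->timer, &timer_map, CLOCK_MONOTONIC);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";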