From 1440648c0feed03cfd51c7dba92a77feb34bf27b Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Thu, 31 Jul 2025 07:11:54 +0900 Subject: hung_task: dump blocker task if it is not hung Dump the lock blocker task if it is not hung because if the blocker task is also hung, it should be dumped by the detector. This will de-duplicate the same stackdumps if the blocker task is also blocked by another task (and hung). Link: https://lkml.kernel.org/r/175391351423.688839.11917911323784986774.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Suggested-by: Sergey Senozhatsky Tested-by: Sergey Senozhatsky Acked-by: Lance Yang Signed-off-by: Andrew Morton --- kernel/hung_task.c | 78 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 41 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 8708a1205f82..b2c1f14b8129 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -95,9 +95,41 @@ static struct notifier_block panic_block = { .notifier_call = hung_task_panic, }; +static bool task_is_hung(struct task_struct *t, unsigned long timeout) +{ + unsigned long switch_count = t->nvcsw + t->nivcsw; + unsigned int state = READ_ONCE(t->__state); + + /* + * skip the TASK_KILLABLE tasks -- these can be killed + * skip the TASK_IDLE tasks -- those are genuinely idle + * skip the TASK_FROZEN task -- it reasonably stops scheduling by freezer + */ + if (!(state & TASK_UNINTERRUPTIBLE) || + (state & (TASK_WAKEKILL | TASK_NOLOAD | TASK_FROZEN))) + return false; + + /* + * When a freshly created task is scheduled once, changes its state to + * TASK_UNINTERRUPTIBLE without having ever been switched out once, it + * musn't be checked. 
+ */ + if (unlikely(!switch_count)) + return false; + + if (switch_count != t->last_switch_count) { + t->last_switch_count = switch_count; + t->last_switch_time = jiffies; + return false; + } + if (time_is_after_jiffies(t->last_switch_time + timeout * HZ)) + return false; + + return true; +} #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER -static void debug_show_blocker(struct task_struct *task) +static void debug_show_blocker(struct task_struct *task, unsigned long timeout) { struct task_struct *g, *t; unsigned long owner, blocker, blocker_type; @@ -174,41 +206,21 @@ static void debug_show_blocker(struct task_struct *task) t->pid, rwsem_blocked_by); break; } - sched_show_task(t); + /* Avoid duplicated task dump, skip if the task is also hung. */ + if (!task_is_hung(t, timeout)) + sched_show_task(t); return; } } #else -static inline void debug_show_blocker(struct task_struct *task) +static inline void debug_show_blocker(struct task_struct *task, unsigned long timeout) { } #endif static void check_hung_task(struct task_struct *t, unsigned long timeout) { - unsigned long switch_count = t->nvcsw + t->nivcsw; - - /* - * Ensure the task is not frozen. - * Also, skip vfork and any other user process that freezer should skip. - */ - if (unlikely(READ_ONCE(t->__state) & TASK_FROZEN)) - return; - - /* - * When a freshly created task is scheduled once, changes its state to - * TASK_UNINTERRUPTIBLE without having ever been switched out once, it - * musn't be checked. 
- */ - if (unlikely(!switch_count)) - return; - - if (switch_count != t->last_switch_count) { - t->last_switch_count = switch_count; - t->last_switch_time = jiffies; - return; - } - if (time_is_after_jiffies(t->last_switch_time + timeout * HZ)) + if (!task_is_hung(t, timeout)) return; /* @@ -243,7 +255,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); - debug_show_blocker(t); + debug_show_blocker(t, timeout); hung_task_show_lock = true; if (sysctl_hung_task_all_cpu_backtrace) @@ -299,7 +311,6 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) hung_task_show_lock = false; rcu_read_lock(); for_each_process_thread(g, t) { - unsigned int state; if (!max_count--) goto unlock; @@ -308,15 +319,8 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) goto unlock; last_break = jiffies; } - /* - * skip the TASK_KILLABLE tasks -- these can be killed - * skip the TASK_IDLE tasks -- those are genuinely idle - */ - state = READ_ONCE(t->__state); - if ((state & TASK_UNINTERRUPTIBLE) && - !(state & TASK_WAKEKILL) && - !(state & TASK_NOLOAD)) - check_hung_task(t, timeout); + + check_hung_task(t, timeout); } unlock: rcu_read_unlock(); -- cgit v1.2.3 From f367474b5884edbc42661e7fecf784cb131dd25d Mon Sep 17 00:00:00 2001 From: Brian Mak Date: Tue, 5 Aug 2025 14:15:27 -0700 Subject: x86/kexec: carry forward the boot DTB on kexec Currently, the kexec_file_load syscall on x86 does not support passing a device tree blob to the new kernel. Some embedded x86 systems use device trees. On these systems, failing to pass a device tree to the new kernel causes a boot failure. To add support for this, we copy the behavior of ARM64 and PowerPC and copy the current boot's device tree blob for use in the new kernel. 
We do this on x86 by passing the device tree blob as a setup_data entry in accordance with the x86 boot protocol. This behavior is gated behind the KEXEC_FILE_FORCE_DTB flag. Link: https://lkml.kernel.org/r/20250805211527.122367-3-makb@juniper.net Signed-off-by: Brian Mak Cc: Alexander Graf Cc: Baoquan He Cc: Borislav Petkov Cc: Dave Young Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Rob Herring Cc: Saravana Kannan Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- kernel/kexec_file.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 91d46502a817..eb62a9794242 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -255,6 +255,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, } image->no_cma = !!(flags & KEXEC_FILE_NO_CMA); + image->force_dtb = flags & KEXEC_FILE_FORCE_DTB; if (cmdline_len) { image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len); -- cgit v1.2.3 From c2fe368b6eb24af72708890b04e9a773c8465703 Mon Sep 17 00:00:00 2001 From: Soham Bagchi Date: Mon, 28 Jul 2025 12:43:17 -0600 Subject: kcov: use write memory barrier after memcpy() in kcov_move_area() KCOV Remote uses two separate memory buffers, one private to the kernel space (kcov_remote_areas) and the second one shared between user and kernel space (kcov->area). After every pair of kcov_remote_start() and kcov_remote_stop(), the coverage data collected in the kcov_remote_areas is copied to kcov->area so the user can read the collected coverage data. This memcpy() is located in kcov_move_area(). The load/store pattern on the kernel-side [1] is: ``` /* dst_area === kcov->area, dst_area[0] is where the count is stored */ dst_len = READ_ONCE(*(unsigned long *)dst_area); ... memcpy(dst_entries, src_entries, ...); ... 
WRITE_ONCE(*(unsigned long *)dst_area, dst_len + entries_moved); ``` And for the user [2]: ``` /* cover is equivalent to kcov->area */ n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED); ``` Without a write-memory barrier, the atomic load for the user can potentially read fresh values of the count stored at cover[0], but continue to read stale coverage data from the buffer itself. Hence, we recommend adding a write-memory barrier between the memcpy() and the WRITE_ONCE() in kcov_move_area(). Link: https://lkml.kernel.org/r/20250728184318.1839137-1-soham.bagchi@utah.edu Link: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/kernel/kcov.c?h=master#n978 [1] Link: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/Documentation/dev-tools/kcov.rst#n364 [2] Signed-off-by: Soham Bagchi Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Dmitriy Vyukov Cc: Jonathan Corbet Cc: Thomas Gleinxer Signed-off-by: Andrew Morton --- kernel/kcov.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 1d85597057e1..6563141f5de9 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -978,6 +978,15 @@ static void kcov_move_area(enum kcov_mode mode, void *dst_area, memcpy(dst_entries, src_entries, bytes_to_move); entries_moved = bytes_to_move >> entry_size_log; + /* + * A write memory barrier is required here, to ensure + * that the writes from the memcpy() are visible before + * the count is updated. Without this, it is possible for + * a user to observe a new count value but stale + * coverage data. 
+ */ + smp_wmb(); + switch (mode) { case KCOV_MODE_TRACE_PC: WRITE_ONCE(*(unsigned long *)dst_area, dst_len + entries_moved); -- cgit v1.2.3 From 41f88ddfd453fe894678e1f6909b9fb9e08e8c3d Mon Sep 17 00:00:00 2001 From: ZhenguoYao Date: Tue, 12 Aug 2025 15:41:32 +0800 Subject: watchdog/softlockup: fix wrong output when watchdog_thresh < 3 When watchdog_thresh is below 3, sample_period will be less than 1 second. So the following output will print when softlockup: CPU#3 Utilization every 0s during lockup Fix this by changing time unit from seconds to milliseconds. Link: https://lkml.kernel.org/r/20250812074132.27810-1-yaozhenguo@jd.com Signed-off-by: ZhenguoYao Cc: Bitao Hu Cc: Li Huafei Cc: Max Kellermann Cc: Thomas Gleinxer Signed-off-by: Andrew Morton --- kernel/watchdog.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 80b56c002c7f..9c7134f7d2c4 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -455,17 +455,17 @@ static void print_cpustat(void) { int i, group; u8 tail = __this_cpu_read(cpustat_tail); - u64 sample_period_second = sample_period; + u64 sample_period_msecond = sample_period; - do_div(sample_period_second, NSEC_PER_SEC); + do_div(sample_period_msecond, NSEC_PER_MSEC); /* * Outputting the "watchdog" prefix on every line is redundant and not * concise, and the original alarm information is sufficient for * positioning in logs, hence here printk() is used instead of pr_crit(). 
*/ - printk(KERN_CRIT "CPU#%d Utilization every %llus during lockup:\n", - smp_processor_id(), sample_period_second); + printk(KERN_CRIT "CPU#%d Utilization every %llums during lockup:\n", + smp_processor_id(), sample_period_msecond); for (i = 0; i < NUM_SAMPLE_PERIODS; i++) { group = (tail + i) % NUM_SAMPLE_PERIODS; -- cgit v1.2.3 From 95f091274f3db39493e8b5c44671b9f1e02c0c25 Mon Sep 17 00:00:00 2001 From: ZhenguoYao Date: Tue, 12 Aug 2025 16:25:10 +0800 Subject: watchdog/softlockup: fix incorrect CPU utilization output during softlockup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since we use 16-bit precision, the raw data will undergo integer division, which may sometimes result in data loss. This can lead to slightly inaccurate CPU utilization calculations. Under normal circumstances, this isn't an issue. However, when CPU utilization reaches 100%, the calculated result might exceed 100%. For example, with raw data like the following: sample_period 400000134 new_stat 83648414036 old_stat 83247417494 sample_period=400000134/2^24=23 new_stat=83648414036/2^24=4985 old_stat=83247417494/2^24=4961 util=105% Below log will output: CPU#3 Utilization every 0s during lockup: #1: 0% system, 0% softirq, 105% hardirq, 0% idle #2: 0% system, 0% softirq, 105% hardirq, 0% idle #3: 0% system, 0% softirq, 100% hardirq, 0% idle #4: 0% system, 0% softirq, 105% hardirq, 0% idle #5: 0% system, 0% softirq, 105% hardirq, 0% idle To avoid confusion, we enforce a 100% display cap when calculations exceed this threshold. We also round to the nearest multiple of 16.8 milliseconds to improve the accuracy. 
[yaozhenguo1@gmail.com: make get_16bit_precision() more accurate, fix comment layout] Link: https://lkml.kernel.org/r/20250818081438.40540-1-yaozhenguo@jd.com Link: https://lkml.kernel.org/r/20250812082510.32291-1-yaozhenguo@jd.com Signed-off-by: ZhenguoYao Cc: Bitao Hu Cc: Li Huafei Cc: Max Kellermann Cc: Thomas Gleinxer Signed-off-by: Andrew Morton --- kernel/watchdog.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 9c7134f7d2c4..5413aa85e8a4 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -425,7 +425,11 @@ static DEFINE_PER_CPU(u8, cpustat_tail); */ static u16 get_16bit_precision(u64 data_ns) { - return data_ns >> 24LL; /* 2^24ns ~= 16.8ms */ + /* + * 2^24ns ~= 16.8ms + * Round to the nearest multiple of 16.8 milliseconds. + */ + return (data_ns + (1 << 23)) >> 24LL; } static void update_cpustat(void) @@ -444,6 +448,14 @@ static void update_cpustat(void) old_stat = __this_cpu_read(cpustat_old[i]); new_stat = get_16bit_precision(cpustat[tracked_stats[i]]); util = DIV_ROUND_UP(100 * (new_stat - old_stat), sample_period_16); + /* + * Since we use 16-bit precision, the raw data will undergo + * integer division, which may sometimes result in data loss, + * and then result might exceed 100%. To avoid confusion, + * we enforce a 100% display cap when calculations exceed this threshold. + */ + if (util > 100) + util = 100; __this_cpu_write(cpustat_util[tail][i], util); __this_cpu_write(cpustat_old[i], new_stat); } -- cgit v1.2.3 From b32730e68d326bef5c081c4b7cdd275c45b1902b Mon Sep 17 00:00:00 2001 From: Tio Zhang Date: Wed, 20 Aug 2025 18:18:46 +0800 Subject: fork: remove #ifdef CONFIG_LOCKDEP in copy_process() lockdep_init_task() is defined as an empty when CONFIG_LOCKDEP is not set. So the #ifdef here is redundant, remove it. 
Link: https://lkml.kernel.org/r/20250820101826.GA2484@didi-ThinkCentre-M930t-N000 Signed-off-by: Tio Zhang Cc: Kees Cook Signed-off-by: Andrew Morton --- kernel/fork.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index af673856499d..e06cfaa85a84 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2124,9 +2124,7 @@ __latent_entropy struct task_struct *copy_process( p->pagefault_disabled = 0; -#ifdef CONFIG_LOCKDEP lockdep_init_task(p); -#endif p->blocked_on = NULL; /* not blocked yet */ -- cgit v1.2.3 From f7071db2fe3d20991a35043b32012e1b37d32cc0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 20 Aug 2025 18:39:46 +0200 Subject: fork: kill the pointless lower_32_bits() in create_io_thread(), kernel_thread(), and user_mode_thread() Unlike sys_clone(), these helpers have only in-kernel users, which should pass the correct "flags" argument. lower_32_bits(flags) just adds unnecessary confusion and doesn't allow the use of CLONE_ flags which don't fit into 32 bits. create_io_thread() looks especially confusing because: - "flags" is a compile-time constant, so lower_32_bits() simply has no effect - .exit_signal = (lower_32_bits(flags) & CSIGNAL) is harmless but doesn't look right; copy_process(CLONE_THREAD) will ignore this argument anyway. None of these helpers actually need CLONE_UNTRACED or "& ~CSIGNAL", but their presence does not add any confusion and improves code clarity. 
Link: https://lkml.kernel.org/r/20250820163946.GA18549@redhat.com Signed-off-by: Oleg Nesterov Reviewed-by: Jens Axboe Cc: Christian Brauner Cc: Kees Cook Signed-off-by: Andrew Morton --- kernel/fork.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index e06cfaa85a84..a8674ba2b33b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2537,11 +2537,9 @@ struct task_struct * __init fork_idle(int cpu) struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) { unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| - CLONE_IO; + CLONE_IO|CLONE_VM|CLONE_UNTRACED; struct kernel_clone_args args = { - .flags = ((lower_32_bits(flags) | CLONE_VM | - CLONE_UNTRACED) & ~CSIGNAL), - .exit_signal = (lower_32_bits(flags) & CSIGNAL), + .flags = flags, .fn = fn, .fn_arg = arg, .io_thread = 1, @@ -2653,9 +2651,8 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, unsigned long flags) { struct kernel_clone_args args = { - .flags = ((lower_32_bits(flags) | CLONE_VM | - CLONE_UNTRACED) & ~CSIGNAL), - .exit_signal = (lower_32_bits(flags) & CSIGNAL), + .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), + .exit_signal = (flags & CSIGNAL), .fn = fn, .fn_arg = arg, .name = name, @@ -2671,9 +2668,8 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags) { struct kernel_clone_args args = { - .flags = ((lower_32_bits(flags) | CLONE_VM | - CLONE_UNTRACED) & ~CSIGNAL), - .exit_signal = (lower_32_bits(flags) & CSIGNAL), + .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), + .exit_signal = (flags & CSIGNAL), .fn = fn, .fn_arg = arg, }; -- cgit v1.2.3 From 13818f7b8c85c89aa97a430f8116490c1b833470 Mon Sep 17 00:00:00 2001 From: Liao Yuanhong Date: Mon, 25 Aug 2025 20:33:05 +0800 Subject: kexec_core: remove redundant 0 value initialization The kimage struct is 
already zeroed by kzalloc(). It's redundant to initialize image->head to 0. Link: https://lkml.kernel.org/r/20250825123307.306634-1-liaoyuanhong@vivo.com Signed-off-by: Liao Yuanhong Signed-off-by: Andrew Morton --- kernel/kexec_core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 31203f0bacaf..fa00b239c5d9 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -233,7 +233,6 @@ struct kimage *do_kimage_alloc_init(void) if (!image) return NULL; - image->head = 0; image->entry = &image->head; image->last_entry = &image->head; image->control_page = ~0; /* By default this does not apply */ -- cgit v1.2.3 From 2683df6539cbc3f0eeeba11154bc0cbf042a5cee Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Mon, 25 Aug 2025 10:57:00 +0800 Subject: panic: add note that 'panic_print' parameter is deprecated Just like for the 'panic_print' sysctl interface, add a similar note for setup of the kernel cmdline parameter and the parameter under /sys/module/kernel/. Also add the __core_param_cb() macro, which makes it possible to add special get/set operations for a kernel parameter. Link: https://lkml.kernel.org/r/20250825025701.81921-4-feng.tang@linux.alibaba.com Signed-off-by: Feng Tang Suggested-by: Petr Mladek Reviewed-by: Petr Mladek Cc: Askar Safin Cc: John Ogness Cc: Jonathan Corbet Cc: Lance Yang Cc: "Paul E . 
McKenney" Cc: Steven Rostedt Signed-off-by: Andrew Morton --- kernel/panic.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 72fcbb5a071b..12a10e17ab4a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -937,12 +937,29 @@ EXPORT_SYMBOL(__stack_chk_fail); #endif core_param(panic, panic_timeout, int, 0644); -core_param(panic_print, panic_print, ulong, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); core_param(panic_on_warn, panic_on_warn, int, 0644); core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644); core_param(panic_console_replay, panic_console_replay, bool, 0644); +static int panic_print_set(const char *val, const struct kernel_param *kp) +{ + pr_info_once("Kernel: 'panic_print' parameter will be obsoleted by both 'panic_sys_info' and 'panic_console_replay'\n"); + return param_set_ulong(val, kp); +} + +static int panic_print_get(char *val, const struct kernel_param *kp) +{ + pr_info_once("Kernel: 'panic_print' parameter will be obsoleted by both 'panic_sys_info' and 'panic_console_replay'\n"); + return param_get_ulong(val, kp); +} + +static const struct kernel_param_ops panic_print_ops = { + .set = panic_print_set, + .get = panic_print_get, +}; +__core_param_cb(panic_print, &panic_print_ops, &panic_print, 0644); + static int __init oops_setup(char *s) { if (!s) -- cgit v1.2.3 From e40d2014b2ccaf0f1a49ba0d0cfb59ac2a36cc6e Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 25 Aug 2025 10:57:01 +0800 Subject: panic: clean up message about deprecated 'panic_print' parameter Remove duplication of the message about deprecated 'panic_print' parameter. Also make the wording more direct. Make it clear that the new parameters already exist and should be used instead. 
Link: https://lkml.kernel.org/r/20250825025701.81921-5-feng.tang@linux.alibaba.com Signed-off-by: Petr Mladek Signed-off-by: Feng Tang Reviewed-by: Lance Yang Tested-by: Lance Yang Reviewed-by: Feng Tang Cc: Askar Safin Cc: John Ogness Cc: Jonathan Corbet Cc: "Paul E . McKenney" Cc: Steven Rostedt Signed-off-by: Andrew Morton --- kernel/panic.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 12a10e17ab4a..24bca263f896 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -77,6 +77,11 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); +static void panic_print_deprecated(void) +{ + pr_info_once("Kernel: The 'panic_print' parameter is now deprecated. Please use 'panic_sys_info' and 'panic_console_replay' instead.\n"); +} + #ifdef CONFIG_SYSCTL /* @@ -125,7 +130,7 @@ static int proc_taint(const struct ctl_table *table, int write, static int sysctl_panic_print_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - pr_info_once("Kernel: 'panic_print' sysctl interface will be obsoleted by both 'panic_sys_info' and 'panic_console_replay'\n"); + panic_print_deprecated(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } @@ -944,13 +949,13 @@ core_param(panic_console_replay, panic_console_replay, bool, 0644); static int panic_print_set(const char *val, const struct kernel_param *kp) { - pr_info_once("Kernel: 'panic_print' parameter will be obsoleted by both 'panic_sys_info' and 'panic_console_replay'\n"); + panic_print_deprecated(); return param_set_ulong(val, kp); } static int panic_print_get(char *val, const struct kernel_param *kp) { - pr_info_once("Kernel: 'panic_print' parameter will be obsoleted by both 'panic_sys_info' and 'panic_console_replay'\n"); + panic_print_deprecated(); return param_get_ulong(val, kp); } -- cgit v1.2.3 From d0d9c7235548f1d772f1e48c9d5742c65d81c705 Mon Sep 17 00:00:00 
2001 From: Jinchao Wang Date: Mon, 25 Aug 2025 10:29:29 +0800 Subject: panic: introduce helper functions for panic state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "panic: introduce panic status function family", v2. This series introduces a family of helper functions to manage panic state and updates existing code to use them. Before this series, panic state helpers were scattered and inconsistent. For example, panic_in_progress() was defined in printk/printk.c, not in panic.c or panic.h. As a result, developers had to look in unexpected places to understand or re-use panic state logic. Other checks were open-coded, duplicating logic across panic, crash, and watchdog paths. The new helpers centralize the functionality in panic.c/panic.h: - panic_try_start() - panic_reset() - panic_in_progress() - panic_on_this_cpu() - panic_on_other_cpu() Patches 1–8 add the helpers and convert panic/crash and printk/nbcon code to use them. Patch 9 fixes a bug in the watchdog subsystem by skipping checks when a panic is in progress, avoiding interference with the panic CPU. Together, this makes panic state handling simpler, more discoverable, and more robust. This patch (of 9): This patch introduces four new helper functions to abstract the management of the panic_cpu variable. These functions will be used in subsequent patches to refactor existing code. The direct use of panic_cpu can be error-prone and ambiguous, as it requires manual checks to determine which CPU is handling the panic. The new helpers clarify intent: panic_try_start(): Atomically sets the current CPU as the panicking CPU. panic_reset(): Resets panic_cpu to PANIC_CPU_INVALID. panic_in_progress(): Checks if a panic has been triggered. panic_on_this_cpu(): Returns true if the current CPU is the panic originator. panic_on_other_cpu(): Returns true if a panic is on another CPU. 
This change lays the groundwork for improved code readability and robustness in the panic handling subsystem. Link: https://lkml.kernel.org/r/20250825022947.1596226-1-wangjinchao600@gmail.com Link: https://lkml.kernel.org/r/20250825022947.1596226-2-wangjinchao600@gmail.com Signed-off-by: Jinchao Wang Cc: Anna Schumaker Cc: Baoquan He Cc: "Darrick J. Wong" Cc: Dave Young Cc: Doug Anderson Cc: "Guilherme G. Piccoli" Cc: Helge Deller Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joanthan Cameron Cc: Joel Granados Cc: John Ogness Cc: Kees Cook Cc: Li Huafei Cc: "Luck, Tony" Cc: Luo Gengkun Cc: Max Kellermann Cc: Nam Cao Cc: oushixiong Cc: Petr Mladek Cc: Qianqiang Liu Cc: Sergey Senozhatsky Cc: Sohil Mehta Cc: Steven Rostedt Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Zimemrmann Cc: Thorsten Blum Cc: Ville Syrjala Cc: Vivek Goyal Cc: Yicong Yang Cc: Yunhui Cui Cc: Yury Norov (NVIDIA) b Signed-off-by: Andrew Morton --- kernel/panic.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/printk/printk.c | 5 ----- 2 files changed, 53 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 24bca263f896..010a1bfc4843 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -299,6 +299,59 @@ void __weak crash_smp_send_stop(void) atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); +bool panic_try_start(void) +{ + int old_cpu, this_cpu; + + /* + * Only one CPU is allowed to execute the crash_kexec() code as with + * panic(). Otherwise parallel calls of panic() and crash_kexec() + * may stop each other. To exclude them, we use panic_cpu here too. 
+ */ + old_cpu = PANIC_CPU_INVALID; + this_cpu = raw_smp_processor_id(); + + return atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu); +} +EXPORT_SYMBOL(panic_try_start); + +void panic_reset(void) +{ + atomic_set(&panic_cpu, PANIC_CPU_INVALID); +} +EXPORT_SYMBOL(panic_reset); + +bool panic_in_progress(void) +{ + return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); +} +EXPORT_SYMBOL(panic_in_progress); + +/* Return true if a panic is in progress on the current CPU. */ +bool panic_on_this_cpu(void) +{ + /* + * We can use raw_smp_processor_id() here because it is impossible for + * the task to be migrated to the panic_cpu, or away from it. If + * panic_cpu has already been set, and we're not currently executing on + * that CPU, then we never will be. + */ + return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id()); +} +EXPORT_SYMBOL(panic_on_this_cpu); + +/* + * Return true if a panic is in progress on a remote CPU. + * + * On true, the local CPU should immediately release any printing resources + * that may be needed by the panic CPU. + */ +bool panic_on_other_cpu(void) +{ + return (panic_in_progress() && !this_cpu_in_panic()); +} +EXPORT_SYMBOL(panic_on_other_cpu); + /* * A variant of panic() called from NMI context. We return if we've already * panicked on this CPU. If another CPU already panicked, loop in diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 0efbcdda9aab..5fe35f377b79 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -345,11 +345,6 @@ static void __up_console_sem(unsigned long ip) } #define up_console_sem() __up_console_sem(_RET_IP_) -static bool panic_in_progress(void) -{ - return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID); -} - /* Return true if a panic is in progress on the current CPU. 
*/ bool this_cpu_in_panic(void) { -- cgit v1.2.3 From 33effbcaf110a68b49b7ab4f8720858e7598c216 Mon Sep 17 00:00:00 2001 From: Jinchao Wang Date: Mon, 25 Aug 2025 10:29:31 +0800 Subject: crash_core: use panic_try_start() in crash_kexec() crash_kexec() had its own code to exclude parallel execution by setting panic_cpu. This is already handled by panic_try_start(). Switch to panic_try_start() to remove the duplication and keep the logic consistent. Link: https://lkml.kernel.org/r/20250825022947.1596226-4-wangjinchao600@gmail.com Signed-off-by: Jinchao Wang Cc: Anna Schumaker Cc: Baoquan He Cc: "Darrick J. Wong" Cc: Dave Young Cc: Doug Anderson Cc: "Guilherme G. Piccoli" Cc: Helge Deller Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joanthan Cameron Cc: Joel Granados Cc: John Ogness Cc: Kees Cook Cc: Li Huafei Cc: "Luck, Tony" Cc: Luo Gengkun Cc: Max Kellermann Cc: Nam Cao Cc: oushixiong Cc: Petr Mladek Cc: Qianqiang Liu Cc: Sergey Senozhatsky Cc: Sohil Mehta Cc: Steven Rostedt Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Zimemrmann Cc: Thorsten Blum Cc: Ville Syrjala Cc: Vivek Goyal Cc: Yicong Yang Cc: Yunhui Cui Cc: Yury Norov (NVIDIA) Signed-off-by: Andrew Morton --- kernel/crash_core.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index a4ef79591eb2..bb38bbaf3a26 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -4,6 +4,7 @@ * Copyright (C) 2002-2004 Eric Biederman */ +#include "linux/panic.h" #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include @@ -143,17 +144,7 @@ STACK_FRAME_NON_STANDARD(__crash_kexec); __bpf_kfunc void crash_kexec(struct pt_regs *regs) { - int old_cpu, this_cpu; - - /* - * Only one CPU is allowed to execute the crash_kexec() code as with - * panic(). Otherwise parallel calls of panic() and crash_kexec() - * may stop each other. To exclude them, we use panic_cpu here too. 
- */ - old_cpu = PANIC_CPU_INVALID; - this_cpu = raw_smp_processor_id(); - - if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { + if (panic_try_start()) { /* This is the 1st CPU which comes here, so go ahead. */ __crash_kexec(regs); @@ -161,7 +152,7 @@ __bpf_kfunc void crash_kexec(struct pt_regs *regs) * Reset panic_cpu to allow another panic()/crash_kexec() * call. */ - atomic_set(&panic_cpu, PANIC_CPU_INVALID); + panic_reset(); } } -- cgit v1.2.3 From 6b69c7ef96f1afc7b426195087f7488f1510c2a4 Mon Sep 17 00:00:00 2001 From: Jinchao Wang Date: Mon, 25 Aug 2025 10:29:32 +0800 Subject: panic: use panic_try_start() in nmi_panic() nmi_panic() duplicated the logic to claim panic_cpu with atomic_try_cmpxchg. This is already wrapped in panic_try_start(). Replace the open-coded logic with panic_try_start(), and use panic_on_other_cpu() for the fallback path. This removes duplication and keeps panic handling code consistent. Link: https://lkml.kernel.org/r/20250825022947.1596226-5-wangjinchao600@gmail.com Signed-off-by: Jinchao Wang Cc: Anna Schumaker Cc: Baoquan He Cc: "Darrick J. Wong" Cc: Dave Young Cc: Doug Anderson Cc: "Guilherme G. 
Piccoli" Cc: Helge Deller Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joanthan Cameron Cc: Joel Granados Cc: John Ogness Cc: Kees Cook Cc: Li Huafei Cc: "Luck, Tony" Cc: Luo Gengkun Cc: Max Kellermann Cc: Nam Cao Cc: oushixiong Cc: Petr Mladek Cc: Qianqiang Liu Cc: Sergey Senozhatsky Cc: Sohil Mehta Cc: Steven Rostedt Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Zimemrmann Cc: Thorsten Blum Cc: Ville Syrjala Cc: Vivek Goyal Cc: Yicong Yang Cc: Yunhui Cui Cc: Yury Norov (NVIDIA) Signed-off-by: Andrew Morton --- kernel/panic.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 010a1bfc4843..f7ecb36cf2b3 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -360,15 +360,9 @@ EXPORT_SYMBOL(panic_on_other_cpu); */ void nmi_panic(struct pt_regs *regs, const char *msg) { - int old_cpu, this_cpu; - - old_cpu = PANIC_CPU_INVALID; - this_cpu = raw_smp_processor_id(); - - /* atomic_try_cmpxchg updates old_cpu on failure */ - if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) + if (panic_try_start()) panic("%s", msg); - else if (old_cpu != this_cpu) + else if (panic_on_other_cpu()) nmi_panic_self_stop(regs); } EXPORT_SYMBOL(nmi_panic); -- cgit v1.2.3 From 6f313b558562161c181734c1d23b25bf71e574b9 Mon Sep 17 00:00:00 2001 From: Jinchao Wang Date: Mon, 25 Aug 2025 10:29:33 +0800 Subject: panic: use panic_try_start() in vpanic() vpanic() had open-coded logic to claim panic_cpu with atomic_try_cmpxchg. This is already handled by panic_try_start(). Switch to panic_try_start() and use panic_on_other_cpu() for the fallback path. This removes duplicate code and makes panic handling consistent across functions. Link: https://lkml.kernel.org/r/20250825022947.1596226-6-wangjinchao600@gmail.com Signed-off-by: Jinchao Wang Cc: Anna Schumaker Cc: Baoquan He Cc: "Darrick J. Wong" Cc: Dave Young Cc: Doug Anderson Cc: "Guilherme G. 
Piccoli" Cc: Helge Deller Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joanthan Cameron Cc: Joel Granados Cc: John Ogness Cc: Kees Cook Cc: Li Huafei Cc: "Luck, Tony" Cc: Luo Gengkun Cc: Max Kellermann Cc: Nam Cao Cc: oushixiong Cc: Petr Mladek Cc: Qianqiang Liu Cc: Sergey Senozhatsky Cc: Sohil Mehta Cc: Steven Rostedt Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Zimemrmann Cc: Thorsten Blum Cc: Ville Syrjala Cc: Vivek Goyal Cc: Yicong Yang Cc: Yunhui Cui Cc: Yury Norov (NVIDIA) Signed-off-by: Andrew Morton --- kernel/panic.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index f7ecb36cf2b3..c4ef86fc643f 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -420,7 +420,6 @@ void vpanic(const char *fmt, va_list args) static char buf[1024]; long i, i_next = 0, len; int state = 0; - int old_cpu, this_cpu; bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; if (panic_on_warn) { @@ -457,13 +456,10 @@ void vpanic(const char *fmt, va_list args) * `old_cpu == this_cpu' means we came from nmi_panic() which sets * panic_cpu to this CPU. In this case, this is also the 1st CPU. */ - old_cpu = PANIC_CPU_INVALID; - this_cpu = raw_smp_processor_id(); - /* atomic_try_cmpxchg updates old_cpu on failure */ - if (atomic_try_cmpxchg(&panic_cpu, &old_cpu, this_cpu)) { + if (panic_try_start()) { /* go ahead */ - } else if (old_cpu != this_cpu) + } else if (panic_on_other_cpu()) panic_smp_self_stop(); console_verbose(); -- cgit v1.2.3 From 2325e8eadf7cd2a086855809ffcd054336369d47 Mon Sep 17 00:00:00 2001 From: Jinchao Wang Date: Mon, 25 Aug 2025 10:29:34 +0800 Subject: printk/nbcon: use panic_on_this_cpu() helper nbcon_context_try_acquire() compared panic_cpu directly with smp_processor_id(). This open-coded check is now provided by panic_on_this_cpu(). Switch to panic_on_this_cpu() to simplify the code and improve readability. 
Link: https://lkml.kernel.org/r/20250825022947.1596226-7-wangjinchao600@gmail.com Signed-off-by: Jinchao Wang Cc: Anna Schumaker Cc: Baoquan He Cc: "Darrick J. Wong" Cc: Dave Young Cc: Doug Anderson Cc: "Guilherme G. Piccoli" Cc: Helge Deller Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joanthan Cameron Cc: Joel Granados Cc: John Ogness Cc: Kees Cook Cc: Li Huafei Cc: "Luck, Tony" Cc: Luo Gengkun Cc: Max Kellermann Cc: Nam Cao Cc: oushixiong Cc: Petr Mladek Cc: Qianqiang Liu Cc: Sergey Senozhatsky Cc: Sohil Mehta Cc: Steven Rostedt Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Zimemrmann Cc: Thorsten Blum Cc: Ville Syrjala Cc: Vivek Goyal Cc: Yicong Yang Cc: Yunhui Cui Cc: Yury Norov (NVIDIA) Signed-off-by: Andrew Morton --- kernel/printk/nbcon.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index 646801813415..7490865e2f44 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -2,6 +2,7 @@ // Copyright (C) 2022 Linutronix GmbH, John Ogness // Copyright (C) 2022 Intel, Thomas Gleixner +#include "linux/panic.h" #include #include #include @@ -589,7 +590,6 @@ static struct printk_buffers panic_nbcon_pbufs; */ static bool nbcon_context_try_acquire(struct nbcon_context *ctxt, bool is_reacquire) { - unsigned int cpu = smp_processor_id(); struct console *con = ctxt->console; struct nbcon_state cur; int err; @@ -614,7 +614,7 @@ out: /* Acquire succeeded. */ /* Assign the appropriate buffer for this context. */ - if (atomic_read(&panic_cpu) == cpu) + if (panic_on_this_cpu()) ctxt->pbufs = &panic_nbcon_pbufs; else ctxt->pbufs = con->pbufs; -- cgit v1.2.3 From c6be36e2997662f423edfa3979a63935873ff648 Mon Sep 17 00:00:00 2001 From: Jinchao Wang Date: Mon, 25 Aug 2025 10:29:35 +0800 Subject: panic/printk: replace this_cpu_in_panic() with panic_on_this_cpu() The helper this_cpu_in_panic() duplicated logic already provided by panic_on_this_cpu(). 
Remove this_cpu_in_panic() and switch all users to panic_on_this_cpu(). This simplifies the code and avoids having two helpers for the same check. Link: https://lkml.kernel.org/r/20250825022947.1596226-8-wangjinchao600@gmail.com Signed-off-by: Jinchao Wang Cc: Anna Schumaker Cc: Baoquan He Cc: "Darrick J. Wong" Cc: Dave Young Cc: Doug Anderson Cc: "Guilherme G. Piccoli" Cc: Helge Deller Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joanthan Cameron Cc: Joel Granados Cc: John Ogness Cc: Kees Cook Cc: Li Huafei Cc: "Luck, Tony" Cc: Luo Gengkun Cc: Max Kellermann Cc: Nam Cao Cc: oushixiong Cc: Petr Mladek Cc: Qianqiang Liu Cc: Sergey Senozhatsky Cc: Sohil Mehta Cc: Steven Rostedt Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Zimemrmann Cc: Thorsten Blum Cc: Ville Syrjala Cc: Vivek Goyal Cc: Yicong Yang Cc: Yunhui Cui Cc: Yury Norov (NVIDIA) Signed-off-by: Andrew Morton --- kernel/panic.c | 2 +- kernel/printk/nbcon.c | 2 +- kernel/printk/printk.c | 15 ++------------- kernel/printk/printk_ringbuffer.c | 2 +- 4 files changed, 5 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index c4ef86fc643f..a8b1bf60e09f 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -348,7 +348,7 @@ EXPORT_SYMBOL(panic_on_this_cpu); */ bool panic_on_other_cpu(void) { - return (panic_in_progress() && !this_cpu_in_panic()); + return (panic_in_progress() && !panic_on_this_cpu()); } EXPORT_SYMBOL(panic_on_other_cpu); diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index 7490865e2f44..c6d1a4a747e9 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -1394,7 +1394,7 @@ enum nbcon_prio nbcon_get_default_prio(void) { unsigned int *cpu_emergency_nesting; - if (this_cpu_in_panic()) + if (panic_on_this_cpu()) return NBCON_PRIO_PANIC; cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5fe35f377b79..faa8b1f0585b 100644 --- a/kernel/printk/printk.c +++ 
b/kernel/printk/printk.c @@ -17,6 +17,7 @@ * 01Mar01 Andrew Morton */ +#include "linux/panic.h" #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include @@ -345,18 +346,6 @@ static void __up_console_sem(unsigned long ip) } #define up_console_sem() __up_console_sem(_RET_IP_) -/* Return true if a panic is in progress on the current CPU. */ -bool this_cpu_in_panic(void) -{ - /* - * We can use raw_smp_processor_id() here because it is impossible for - * the task to be migrated to the panic_cpu, or away from it. If - * panic_cpu has already been set, and we're not currently executing on - * that CPU, then we never will be. - */ - return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id()); -} - /* * Return true if a panic is in progress on a remote CPU. * @@ -365,7 +354,7 @@ bool this_cpu_in_panic(void) */ bool other_cpu_in_panic(void) { - return (panic_in_progress() && !this_cpu_in_panic()); + return (panic_in_progress() && !panic_on_this_cpu()); } /* diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index d9fb053cff67..e2a1b2d34d2b 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -2143,7 +2143,7 @@ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, * But it would have the sequence number returned * by "prb_next_reserve_seq() - 1". */ - if (this_cpu_in_panic() && + if (panic_on_this_cpu() && (!debug_non_panic_cpus || legacy_allow_panic_sync) && ((*seq + 1) < prb_next_reserve_seq(rb))) { (*seq)++; -- cgit v1.2.3 From d4a36db5639db032a434aef968f9188a600139ec Mon Sep 17 00:00:00 2001 From: Jinchao Wang Date: Mon, 25 Aug 2025 10:29:36 +0800 Subject: panic/printk: replace other_cpu_in_panic() with panic_on_other_cpu() The helper other_cpu_in_panic() duplicated logic already provided by panic_on_other_cpu(). Remove other_cpu_in_panic() and update all users to call panic_on_other_cpu() instead. This removes redundant code and makes panic handling consistent. 
Link: https://lkml.kernel.org/r/20250825022947.1596226-9-wangjinchao600@gmail.com Signed-off-by: Jinchao Wang Cc: Anna Schumaker Cc: Baoquan He Cc: "Darrick J. Wong" Cc: Dave Young Cc: Doug Anderson Cc: "Guilherme G. Piccoli" Cc: Helge Deller Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joanthan Cameron Cc: Joel Granados Cc: John Ogness Cc: Kees Cook Cc: Li Huafei Cc: "Luck, Tony" Cc: Luo Gengkun Cc: Max Kellermann Cc: Nam Cao Cc: oushixiong Cc: Petr Mladek Cc: Qianqiang Liu Cc: Sergey Senozhatsky Cc: Sohil Mehta Cc: Steven Rostedt Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Zimemrmann Cc: Thorsten Blum Cc: Ville Syrjala Cc: Vivek Goyal Cc: Yicong Yang Cc: Yunhui Cui Cc: Yury Norov (NVIDIA) Signed-off-by: Andrew Morton --- kernel/printk/internal.h | 1 - kernel/printk/nbcon.c | 8 ++++---- kernel/printk/printk.c | 19 ++++--------------- 3 files changed, 8 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index ef282001f200..f72bbfa266d6 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -332,7 +332,6 @@ struct printk_message { unsigned long dropped; }; -bool other_cpu_in_panic(void); bool printk_get_next_message(struct printk_message *pmsg, u64 seq, bool is_extended, bool may_supress); diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index c6d1a4a747e9..171480135830 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -255,7 +255,7 @@ static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt, * opportunity to perform any necessary cleanup if they were * interrupted by the panic CPU while printing. */ - if (other_cpu_in_panic() && + if (panic_on_other_cpu() && (!is_reacquire || cur->unsafe_takeover)) { return -EPERM; } @@ -310,7 +310,7 @@ static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio) * Event #2 implies the new context is PANIC. * Event #3 occurs when panic() has flushed the console. 
* Event #4 occurs when a non-panic CPU reacquires. - * Event #5 is not possible due to the other_cpu_in_panic() check + * Event #5 is not possible due to the panic_on_other_cpu() check * in nbcon_context_try_acquire_handover(). */ @@ -349,7 +349,7 @@ static int nbcon_context_try_acquire_requested(struct nbcon_context *ctxt, struct nbcon_state new; /* Note that the caller must still remove the request! */ - if (other_cpu_in_panic()) + if (panic_on_other_cpu()) return -EPERM; /* @@ -447,7 +447,7 @@ static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt, * nbcon_waiter_matches(). In particular, the assumption that * lower priorities are ignored during panic. */ - if (other_cpu_in_panic()) + if (panic_on_other_cpu()) return -EPERM; /* Handover is not possible on the same CPU. */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index faa8b1f0585b..236f03937107 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -346,17 +346,6 @@ static void __up_console_sem(unsigned long ip) } #define up_console_sem() __up_console_sem(_RET_IP_) -/* - * Return true if a panic is in progress on a remote CPU. - * - * On true, the local CPU should immediately release any printing resources - * that may be needed by the panic CPU. - */ -bool other_cpu_in_panic(void) -{ - return (panic_in_progress() && !panic_on_this_cpu()); -} - /* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's @@ -2391,7 +2380,7 @@ asmlinkage int vprintk_emit(int facility, int level, * non-panic CPUs are generating any messages, they will be * silently dropped. */ - if (other_cpu_in_panic() && + if (panic_on_other_cpu() && !debug_non_panic_cpus && !panic_triggering_all_cpu_backtrace) return 0; @@ -2827,7 +2816,7 @@ void console_lock(void) might_sleep(); /* On panic, the console_lock must be left to the panic cpu. 
*/ - while (other_cpu_in_panic()) + while (panic_on_other_cpu()) msleep(1000); down_console_sem(); @@ -2847,7 +2836,7 @@ EXPORT_SYMBOL(console_lock); int console_trylock(void) { /* On panic, the console_lock must be left to the panic cpu. */ - if (other_cpu_in_panic()) + if (panic_on_other_cpu()) return 0; if (down_trylock_console_sem()) return 0; @@ -3227,7 +3216,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove any_progress = true; /* Allow panic_cpu to take over the consoles safely. */ - if (other_cpu_in_panic()) + if (panic_on_other_cpu()) goto abandon; if (do_cond_resched) -- cgit v1.2.3 From 3d5f4f15b778d6da9760d54455cc256ecf924c0a Mon Sep 17 00:00:00 2001 From: Jinchao Wang Date: Mon, 25 Aug 2025 10:29:37 +0800 Subject: watchdog: skip checks when panic is in progress This issue was found when an EFI pstore was configured for kdump logging with the NMI hard lockup detector enabled. The efi-pstore write operation was slow, and with a large number of logs, the pstore dump callback within kmsg_dump() took a long time. This delay triggered the NMI watchdog, leading to a nested panic. The call flow demonstrates how the secondary panic caused an emergency_restart() to be triggered before the initial pstore operation could finish, leading to a failure to dump the logs: real panic() { kmsg_dump() { ... pstore_dump() { start_dump(); ... // long time operation triggers NMI watchdog nmi panic() { ... emergency_restart(); // pstore unfinished } ... finish_dump(); // never reached } } } Both watchdog_buddy_check_hardlockup() and watchdog_overflow_callback() may trigger during a panic. This can lead to recursive panic handling. Add panic_in_progress() checks so watchdog activity is skipped once a panic has begun. This prevents recursive panic and keeps the panic path more reliable. 
Link: https://lkml.kernel.org/r/20250825022947.1596226-10-wangjinchao600@gmail.com Signed-off-by: Jinchao Wang Reviewed-by: Yury Norov (NVIDIA) Cc: Anna Schumaker Cc: Baoquan He Cc: "Darrick J. Wong" Cc: Dave Young Cc: Doug Anderson Cc: "Guilherme G. Piccoli" Cc: Helge Deller Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joanthan Cameron Cc: Joel Granados Cc: John Ogness Cc: Kees Cook Cc: Li Huafei Cc: "Luck, Tony" Cc: Luo Gengkun Cc: Max Kellermann Cc: Nam Cao Cc: oushixiong Cc: Petr Mladek Cc: Qianqiang Liu Cc: Sergey Senozhatsky Cc: Sohil Mehta Cc: Steven Rostedt Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Zimemrmann Cc: Thorsten Blum Cc: Ville Syrjala Cc: Vivek Goyal Cc: Yicong Yang Cc: Yunhui Cui Signed-off-by: Andrew Morton --- kernel/watchdog.c | 6 ++++++ kernel/watchdog_perf.c | 4 ++++ 2 files changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 5413aa85e8a4..5b62d1002783 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -752,6 +752,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) if (!watchdog_enabled) return HRTIMER_NORESTART; + /* + * skip the buddy check if a panic is in progress + */ + if (panic_in_progress()) + return HRTIMER_NORESTART; + watchdog_hardlockup_kick(); /* kick the softlockup detector */ diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c index 9c58f5b4381d..d3ca70e3c256 100644 --- a/kernel/watchdog_perf.c +++ b/kernel/watchdog_perf.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt) "NMI watchdog: " fmt +#include #include #include #include @@ -108,6 +109,9 @@ static void watchdog_overflow_callback(struct perf_event *event, /* Ensure the watchdog never gets throttled */ event->hw.interrupts = 0; + if (panic_in_progress()) + return; + if (!watchdog_check_timestamp()) return; -- cgit v1.2.3 From 652ab7c8fab36bd803d2947a3abf26155faa5dc5 Mon Sep 17 00:00:00 2001 From: Jinchao Wang Date: Fri, 29 Aug 2025 13:13:02 +0800 Subject: panic: use angle-bracket include for
panic.h Replace quoted includes of panic.h with `#include <linux/panic.h>` for consistency across the kernel. Link: https://lkml.kernel.org/r/20250829051312.33773-1-wangjinchao600@gmail.com Signed-off-by: Jinchao Wang Reviewed-by: John Ogness Reviewed-by: Petr Mladek Cc: Qianqiang Liu Cc: Sergey Senozhatsky Cc: Steven Rostedt Signed-off-by: Andrew Morton --- kernel/crash_core.c | 2 +- kernel/printk/nbcon.c | 2 +- kernel/printk/printk.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index bb38bbaf3a26..a5e8523dd6eb 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -4,7 +4,6 @@ * Copyright (C) 2002-2004 Eric Biederman */ -#include "linux/panic.h" #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include @@ -23,6 +22,7 @@ #include #include #include +#include <linux/panic.h> #include #include diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index 171480135830..558ef3177976 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -2,7 +2,6 @@ // Copyright (C) 2022 Linutronix GmbH, John Ogness // Copyright (C) 2022 Intel, Thomas Gleixner -#include "linux/panic.h" #include #include #include @@ -13,6 +12,7 @@ #include #include #include +#include <linux/panic.h> #include #include #include diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 236f03937107..5aee9ffb16b9 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -17,7 +17,6 @@ * 01Mar01 Andrew Morton */ -#include "linux/panic.h" #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include @@ -49,6 +48,7 @@ #include #include #include +#include <linux/panic.h> #include #include -- cgit v1.2.3 From 37aa782df94d16277b45b9a62b748cd62b4bccb9 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 3 Sep 2025 12:04:18 +0200 Subject: panic: remove redundant panic-cpu backtrace Backtraces from all CPUs are printed during panic() when SYS_INFO_ALL_CPU_BT is set.
It shows the backtrace for the panic-CPU even when it has already been explicitly printed before. Do not change the legacy code which prints the backtrace in various contexts, for example, as part of Oops report, right after panic message. It will always be visible in the crash dump. Instead, remember when the backtrace was printed, and skip it when dumping the optional backtraces on all CPUs. [akpm@linux-foundation.org: make panic_this_cpu_backtrace_printed static] Closes: https://lore.kernel.org/oe-kbuild-all/202509050048.FMpVvh1u-lkp@intel.com/ [pmladek@suse.com: Handle situations when the backtrace was not printed for the panic CPU] Link: https://lkml.kernel.org/r/20250903100418.410026-1-pmladek@suse.com Signed-off-by: Sergey Senozhatsky Link: https://lore.kernel.org/r/20250731030314.3818040-1-senozhatsky@chromium.org Signed-off-by: Petr Mladek Tested-by: Feng Tang Reviewed-by: John Ogness Signed-off-by: Andrew Morton --- kernel/panic.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index a8b1bf60e09f..ebd81c259fa9 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -67,6 +67,7 @@ static unsigned int warn_limit __read_mostly; static bool panic_console_replay; bool panic_triggering_all_cpu_backtrace; +static bool panic_this_cpu_backtrace_printed; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); @@ -380,6 +381,19 @@ void check_panic_on_warn(const char *origin) origin, limit); } +static void panic_trigger_all_cpu_backtrace(void) +{ + /* Temporary allow non-panic CPUs to write their backtraces. 
*/ + panic_triggering_all_cpu_backtrace = true; + + if (panic_this_cpu_backtrace_printed) + trigger_allbutcpu_cpu_backtrace(raw_smp_processor_id()); + else + trigger_all_cpu_backtrace(); + + panic_triggering_all_cpu_backtrace = false; +} + /* * Helper that triggers the NMI backtrace (if set in panic_print) * and then performs the secondary CPUs shutdown - we cannot have @@ -387,12 +401,8 @@ void check_panic_on_warn(const char *origin) */ static void panic_other_cpus_shutdown(bool crash_kexec) { - if (panic_print & SYS_INFO_ALL_CPU_BT) { - /* Temporary allow non-panic CPUs to write their backtraces. */ - panic_triggering_all_cpu_backtrace = true; - trigger_all_cpu_backtrace(); - panic_triggering_all_cpu_backtrace = false; - } + if (panic_print & SYS_INFO_ALL_CPU_BT) + panic_trigger_all_cpu_backtrace(); /* * Note that smp_send_stop() is the usual SMP shutdown function, @@ -470,13 +480,15 @@ void vpanic(const char *fmt, va_list args) buf[len - 1] = '\0'; pr_emerg("Kernel panic - not syncing: %s\n", buf); -#ifdef CONFIG_DEBUG_BUGVERBOSE /* * Avoid nested stack-dumping if a panic occurs during oops processing */ - if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) + if (test_taint(TAINT_DIE) || oops_in_progress > 1) { + panic_this_cpu_backtrace_printed = true; + } else if (IS_ENABLED(CONFIG_DEBUG_BUGVERBOSE)) { dump_stack(); -#endif + panic_this_cpu_backtrace_printed = true; + } /* * If kgdb is enabled, give it a chance to run before we stop all -- cgit v1.2.3 From 913e65a2fe1a16fa253c4a016e2306b2cf9ffef8 Mon Sep 17 00:00:00 2001 From: Coiby Xu Date: Thu, 4 Sep 2025 17:38:53 +0800 Subject: crash: add KUnit tests for crash_exclude_mem_range crash_exclude_mem_range seems to be a simple function but there have been multiple attempts to fix it, - commit a2e9a95d2190 ("kexec: Improve & fix crash_exclude_mem_range() to handle overlapping ranges") - commit 6dff31597264 ("crash_core: fix and simplify the logic of crash_exclude_mem_range()") So add a set of unit tests to 
verify the correctness of the current implementation. Should we change the function in the future, the unit tests can also help prevent any regression. For example, we may make the function smarter by allocating extra crash_mem ranges on demand, so that there is no need for the caller to foresee any memory range split or address -ENOMEM failure. The testing strategy is to verify the correctness of the base case. The base case is that there is one to-be-excluded range A and one existing range B. Then we can exhaust all possibilities of the position of A relative to B. For example, here are two combinations, Case: A is completely inside B (causes split) Original: [----B----] Exclude: {--A--} Result: [B1] .. [B2] Case: A overlaps B's left part Original: [----B----] Exclude: {---A---} Result: [..B..] In theory we can prove the correctness by induction, - Base case: crash_exclude_mem_range is correct in the case where n=1 (n is the number of existing ranges). - Inductive step: If crash_exclude_mem_range is correct for n=k existing ranges, then it is also correct for n=k+1 ranges. But for the sake of simplicity, simply use unit tests to cover the base case together with two regression tests. Note most of the exclude_single_range_test() code is generated by Google Gemini with some small tweaks. The function specification, function body and the exhaustive test strategy are presented as prompts. [akpm@linux-foundation.org: export crash_exclude_mem_range() to modules, for kernel/crash_core_test.c] Link: https://lkml.kernel.org/r/20250904093855.1180154-2-coxu@redhat.com Signed-off-by: Coiby Xu Assisted-by: Google Gemini Cc: Baoquan He Cc: Borislav Betkov Cc: Dave Young Cc: fuqiang wang Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleinxer Cc: Vivek Goyal Signed-off-by: Andrew Morton --- kernel/Kconfig.kexec | 11 ++ kernel/Makefile | 1 + kernel/crash_core.c | 15 +++ kernel/crash_core_test.c | 343 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 370 insertions(+) create mode 100644 kernel/crash_core_test.c (limited to 'kernel') diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 1224dd937df0..422270d64820 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -148,6 +148,17 @@ config CRASH_DM_CRYPT_CONFIGS CRASH_DM_CRYPT cannot directly select CONFIGFS_FS, because that is required to be built-in. +config CRASH_DUMP_KUNIT_TEST + tristate "Unit Tests for kernel crash dumps" if !KUNIT_ALL_TESTS + depends on CRASH_DUMP && KUNIT + default KUNIT_ALL_TESTS + help + This option builds KUnit unit tests for kernel crash dumps. The unit + tests will be used to verify the correctness of covered functions and + also prevent any regression. + + If unsure, say N. 
+ config CRASH_HOTPLUG bool "Update the crash elfcorehdr on system configuration changes" default y diff --git a/kernel/Makefile b/kernel/Makefile index c60623448235..216a7dfc3a68 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -78,6 +78,7 @@ obj-$(CONFIG_CRASH_RESERVE) += crash_reserve.o obj-$(CONFIG_KEXEC_CORE) += kexec_core.o obj-$(CONFIG_CRASH_DUMP) += crash_core.o obj-$(CONFIG_CRASH_DM_CRYPT) += crash_dump_dm_crypt.o +obj-$(CONFIG_CRASH_DUMP_KUNIT_TEST) += crash_core_test.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o diff --git a/kernel/crash_core.c b/kernel/crash_core.c index a5e8523dd6eb..3b1c43382eec 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -265,6 +265,20 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int need_kernel_map, return 0; } +/** + * crash_exclude_mem_range - exclude a mem range from existing ranges + * @mem: mem->ranges contains an array of ranges sorted in ascending order + * @mstart: the start of to-be-excluded range + * @mend: the end of to-be-excluded range + * + * If you are unsure if a range split will happen, to avoid function call + * failure because of -ENOMEM, always make sure + * mem->max_nr_ranges == mem->nr_ranges + 1 + * before calling the function each time.
+ * + * returns 0 if a memory range is excluded successfully + * return -ENOMEM if mem->ranges doesn't have space to hold split ranges + */ int crash_exclude_mem_range(struct crash_mem *mem, unsigned long long mstart, unsigned long long mend) { @@ -324,6 +338,7 @@ int crash_exclude_mem_range(struct crash_mem *mem, return 0; } +EXPORT_SYMBOL_GPL(crash_exclude_mem_range); ssize_t crash_get_memory_size(void) { diff --git a/kernel/crash_core_test.c b/kernel/crash_core_test.c new file mode 100644 index 000000000000..8aadf6801530 --- /dev/null +++ b/kernel/crash_core_test.c @@ -0,0 +1,343 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include // For struct crash_mem and struct range if defined there + +// Helper to create and initialize crash_mem +static struct crash_mem *create_crash_mem(struct kunit *test, unsigned int max_ranges, + unsigned int nr_initial_ranges, + const struct range *initial_ranges) +{ + struct crash_mem *mem; + size_t alloc_size; + + // Check if max_ranges can even hold initial_ranges + if (max_ranges < nr_initial_ranges) { + kunit_err(test, "max_ranges (%u) < nr_initial_ranges (%u)\n", + max_ranges, nr_initial_ranges); + return NULL; + } + + alloc_size = sizeof(struct crash_mem) + (size_t)max_ranges * sizeof(struct range); + mem = kunit_kzalloc(test, alloc_size, GFP_KERNEL); + if (!mem) { + kunit_err(test, "Failed to allocate crash_mem\n"); + return NULL; + } + + mem->max_nr_ranges = max_ranges; + mem->nr_ranges = nr_initial_ranges; + if (initial_ranges && nr_initial_ranges > 0) { + memcpy(mem->ranges, initial_ranges, + nr_initial_ranges * sizeof(struct range)); + } + + return mem; +} + +// Helper to compare ranges for assertions +static void assert_ranges_equal(struct kunit *test, + const struct range *actual_ranges, + unsigned int actual_nr_ranges, + const struct range *expected_ranges, + unsigned int expected_nr_ranges, + const char *case_name) +{ + unsigned int i; + + KUNIT_ASSERT_EQ_MSG(test, expected_nr_ranges, actual_nr_ranges, + 
"%s: Number of ranges mismatch.", case_name); + + for (i = 0; i < expected_nr_ranges; i++) { + KUNIT_ASSERT_EQ_MSG(test, expected_ranges[i].start, actual_ranges[i].start, + "%s: Range %u start mismatch.", case_name, i); + KUNIT_ASSERT_EQ_MSG(test, expected_ranges[i].end, actual_ranges[i].end, + "%s: Range %u end mismatch.", case_name, i); + } +} + +// Structure for test parameters +struct exclude_test_param { + const char *description; + unsigned long long exclude_start; + unsigned long long exclude_end; + unsigned int initial_max_ranges; + const struct range *initial_ranges; + unsigned int initial_nr_ranges; + const struct range *expected_ranges; + unsigned int expected_nr_ranges; + int expected_ret; +}; + +static void run_exclude_test_case(struct kunit *test, const struct exclude_test_param *params) +{ + struct crash_mem *mem; + int ret; + + kunit_info(test, "%s", params->description); + + mem = create_crash_mem(test, params->initial_max_ranges, + params->initial_nr_ranges, params->initial_ranges); + if (!mem) + return; // Error already logged by create_crash_mem or kunit_kzalloc + + ret = crash_exclude_mem_range(mem, params->exclude_start, params->exclude_end); + + KUNIT_ASSERT_EQ_MSG(test, params->expected_ret, ret, + "%s: Return value mismatch.", params->description); + + if (params->expected_ret == 0) { + assert_ranges_equal(test, mem->ranges, mem->nr_ranges, + params->expected_ranges, params->expected_nr_ranges, + params->description); + } else { + // If an error is expected, nr_ranges might still be relevant to check + // depending on the exact point of failure. For ENOMEM on split, + // nr_ranges shouldn't have changed. + KUNIT_ASSERT_EQ_MSG(test, params->initial_nr_ranges, + mem->nr_ranges, + "%s: Number of ranges mismatch on error.", + params->description); + } +} + +/* + * Test Strategy 1: One to-be-excluded range A and one existing range B. + * + * Exhaust all possibilities of the position of A regarding B. 
+ */ + +static const struct range single_range_b = { .start = 100, .end = 199 }; + +static const struct exclude_test_param exclude_single_range_test_data[] = { + { + .description = "1.1: A is left of B, no overlap", + .exclude_start = 10, .exclude_end = 50, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.2: A's right boundary touches B's left boundary", + .exclude_start = 10, .exclude_end = 99, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.3: A overlaps B's left part", + .exclude_start = 50, .exclude_end = 149, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){{ .start = 150, .end = 199 }}, + .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.4: A is completely inside B", + .exclude_start = 120, .exclude_end = 179, + .initial_max_ranges = 2, // Needs space for split + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){ + { .start = 100, .end = 119 }, + { .start = 180, .end = 199 } + }, + .expected_nr_ranges = 2, + .expected_ret = 0, + }, + { + .description = "1.5: A overlaps B's right part", + .exclude_start = 150, .exclude_end = 249, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){{ .start = 100, .end = 149 }}, + .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.6: A's left boundary touches B's right boundary", + .exclude_start = 200, .exclude_end = 250, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 
1, + .expected_ret = 0, + }, + { + .description = "1.7: A is right of B, no overlap", + .exclude_start = 250, .exclude_end = 300, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.8: A completely covers B and extends beyond", + .exclude_start = 50, .exclude_end = 250, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = NULL, .expected_nr_ranges = 0, + .expected_ret = 0, + }, + { + .description = "1.9: A covers B and extends to the left", + .exclude_start = 50, .exclude_end = 199, // A ends exactly where B ends + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = NULL, .expected_nr_ranges = 0, + .expected_ret = 0, + }, + { + .description = "1.10: A covers B and extends to the right", + .exclude_start = 100, .exclude_end = 250, // A starts exactly where B starts + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = NULL, .expected_nr_ranges = 0, + .expected_ret = 0, + }, + { + .description = "1.11: A is identical to B", + .exclude_start = 100, .exclude_end = 199, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = NULL, .expected_nr_ranges = 0, + .expected_ret = 0, + }, + { + .description = "1.12: A is a point, left of B, no overlap", + .exclude_start = 10, .exclude_end = 10, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.13: A is a point, at start of B", + .exclude_start = 100, .exclude_end = 100, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){{ .start = 101, 
.end = 199 }}, + .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.14: A is a point, in middle of B (causes split)", + .exclude_start = 150, .exclude_end = 150, + .initial_max_ranges = 2, // Needs space for split + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){ + { .start = 100, .end = 149 }, + { .start = 151, .end = 199 } + }, + .expected_nr_ranges = 2, + .expected_ret = 0, + }, + { + .description = "1.15: A is a point, at end of B", + .exclude_start = 199, .exclude_end = 199, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = (const struct range[]){{ .start = 100, .end = 198 }}, + .expected_nr_ranges = 1, + .expected_ret = 0, + }, + { + .description = "1.16: A is a point, right of B, no overlap", + .exclude_start = 250, .exclude_end = 250, + .initial_max_ranges = 1, + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = &single_range_b, .expected_nr_ranges = 1, + .expected_ret = 0, + }, + // ENOMEM case for single range split + { + .description = "1.17: A completely inside B (split), no space (ENOMEM)", + .exclude_start = 120, .exclude_end = 179, + .initial_max_ranges = 1, // Not enough for split + .initial_ranges = &single_range_b, .initial_nr_ranges = 1, + .expected_ranges = NULL, // Not checked on error by assert_ranges_equal for content + .expected_nr_ranges = 1, // Should remain unchanged + .expected_ret = -ENOMEM, + }, +}; + + +static void exclude_single_range_test(struct kunit *test) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(exclude_single_range_test_data); i++) { + kunit_log(KERN_INFO, test, "Running: %s", exclude_single_range_test_data[i].description); + run_exclude_test_case(test, &exclude_single_range_test_data[i]); + // KUnit will stop on first KUNIT_ASSERT failure within run_exclude_test_case + } +} + +/* + * Test Strategy 2: Regression test. 
+ */ + +static const struct exclude_test_param exclude_range_regression_test_data[] = { + // Test data from commit a2e9a95d2190 + { + .description = "2.1: exclude low 1M", + .exclude_start = 0, .exclude_end = (1 << 20) - 1, + .initial_max_ranges = 3, + .initial_ranges = (const struct range[]){ + { .start = 0, .end = 0x3efff }, + { .start = 0x3f000, .end = 0x3ffff }, + { .start = 0x40000, .end = 0x9ffff } + }, + .initial_nr_ranges = 3, + .expected_nr_ranges = 0, + .expected_ret = 0, + }, + // Test data from https://lore.kernel.org/all/ZXrY7QbXAlxydsSC@MiWiFi-R3L-srv/T/#u + { + .description = "2.2: when range out of bound", + .exclude_start = 100, .exclude_end = 200, + .initial_max_ranges = 3, + .initial_ranges = (const struct range[]){ + { .start = 1, .end = 299 }, + { .start = 401, .end = 1000 }, + { .start = 1001, .end = 2000 } + }, + .initial_nr_ranges = 3, + .expected_ranges = NULL, // Not checked on error by assert_ranges_equal for content + .expected_nr_ranges = 3, // Should remain unchanged + .expected_ret = -ENOMEM + }, + +}; + + +static void exclude_range_regression_test(struct kunit *test) +{ + size_t i; + + for (i = 0; i < ARRAY_SIZE(exclude_range_regression_test_data); i++) { + kunit_log(KERN_INFO, test, "Running: %s", exclude_range_regression_test_data[i].description); + run_exclude_test_case(test, &exclude_range_regression_test_data[i]); + // KUnit will stop on first KUNIT_ASSERT failure within run_exclude_test_case + } +} + +/* + * KUnit Test Suite + */ +static struct kunit_case crash_exclude_mem_range_test_cases[] = { + KUNIT_CASE(exclude_single_range_test), + KUNIT_CASE(exclude_range_regression_test), + {} +}; + +static struct kunit_suite crash_exclude_mem_range_suite = { + .name = "crash_exclude_mem_range_tests", + .test_cases = crash_exclude_mem_range_test_cases, + // .init and .exit can be NULL if not needed globally for the suite +}; + +kunit_test_suite(crash_exclude_mem_range_suite); + +MODULE_DESCRIPTION("crash dump KUnit test suite"); 
+MODULE_LICENSE("GPL"); -- cgit v1.2.3 From d6d5116391857fc78fad9aa42317b36e4ce17b58 Mon Sep 17 00:00:00 2001 From: Evangelos Petrongonas Date: Thu, 21 Aug 2025 17:58:59 +0000 Subject: kexec: introduce is_kho_boot() Patch series "efi: Fix EFI boot with kexec handover (KHO)", v3. This patch series fixes a kernel panic that occurs when booting with both EFI and KHO (Kexec HandOver) enabled. The issue arises because EFI's `reserve_regions()` clears all memory regions with `memblock_remove(0, PHYS_ADDR_MAX)` before rebuilding them from EFI data. This destroys KHO scratch regions that were set up early during device tree scanning, causing a panic as the kernel has no valid memory regions for early allocations. The first patch introduces `is_kho_boot()` to allow early boot components to reliably detect if the kernel was booted via KHO-enabled kexec. The existing `kho_is_enabled()` only checks the command line and doesn't verify if an actual KHO FDT was passed. The second patch modifies EFI's `reserve_regions()` to selectively remove only non-KHO memory regions when KHO is active, preserving the critical scratch regions while still allowing EFI to rebuild its memory map. This patch (of 3): During early initialisation, after a kexec, other components, like EFI need to know if a KHO enabled kexec is performed. The `kho_is_enabled` function is not enough as in the early stages, it only reflects whether the cmdline has KHO enabled, not if an actual KHO FDT exists. Extend the KHO API with `is_kho_boot()` to provide a way for components to check if a KHO enabled kexec is performed. 
Link: https://lkml.kernel.org/r/cover.1755721529.git.epetron@amazon.de Link: https://lkml.kernel.org/r/7dc6674a76bf6e68cca0222ccff32427699cc02e.1755721529.git.epetron@amazon.de Signed-off-by: Evangelos Petrongonas Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Ard Biesheuvel Cc: Baoquan He Cc: Changyuan Lyu Signed-off-by: Andrew Morton --- kernel/kexec_handover.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index ecd1ac210dbd..49a39aee6a8e 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -951,6 +951,26 @@ static const void *kho_get_fdt(void) return kho_in.fdt_phys ? phys_to_virt(kho_in.fdt_phys) : NULL; } +/** + * is_kho_boot - check if current kernel was booted via KHO-enabled + * kexec + * + * This function checks if the current kernel was loaded through a kexec + * operation with KHO enabled, by verifying that a valid KHO FDT + * was passed. + * + * Note: This function returns reliable results only after + * kho_populate() has been called during early boot. Before that, + * it may return false even if KHO data is present. + * + * Return: true if booted via KHO-enabled kexec, false otherwise + */ +bool is_kho_boot(void) +{ + return !!kho_get_fdt(); +} +EXPORT_SYMBOL_GPL(is_kho_boot); + /** * kho_retrieve_subtree - retrieve a preserved sub FDT by its name. * @name: the name of the sub FDT passed to kho_add_subtree(). -- cgit v1.2.3 From a15f37a40145c986cdf289a4b88390f35efdecc4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 15 Sep 2025 14:09:17 +0200 Subject: kernel/sys.c: fix the racy usage of task_lock(tsk->group_leader) in sys_prlimit64() paths The usage of task_lock(tsk->group_leader) in sys_prlimit64()->do_prlimit() path is very broken. sys_prlimit64() does get_task_struct(tsk) but this only protects task_struct itself. 
If tsk != current and tsk is not a leader, this process can exit/exec and task_lock(tsk->group_leader) may use the already freed task_struct. Another problem is that sys_prlimit64() can race with mt-exec which changes ->group_leader. In this case do_prlimit() may take the wrong lock, or (worse) ->group_leader may change between task_lock() and task_unlock(). Change sys_prlimit64() to take tasklist_lock when necessary. This is not nice, but I don't see a better fix for -stable. Link: https://lkml.kernel.org/r/20250915120917.GA27702@redhat.com Fixes: 18c91bb2d872 ("prlimit: do not grab the tasklist_lock") Signed-off-by: Oleg Nesterov Cc: Christian Brauner Cc: Jiri Slaby Cc: Mateusz Guzik Cc: Signed-off-by: Andrew Morton --- kernel/sys.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 1e28b40053ce..36d66ff41611 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1734,6 +1734,7 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, struct rlimit old, new; struct task_struct *tsk; unsigned int checkflags = 0; + bool need_tasklist; int ret; if (old_rlim) @@ -1760,8 +1761,25 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, get_task_struct(tsk); rcu_read_unlock(); - ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, - old_rlim ? &old : NULL); + need_tasklist = !same_thread_group(tsk, current); + if (need_tasklist) { + /* + * Ensure we can't race with group exit or de_thread(), + * so tsk->group_leader can't be freed or changed until + * read_unlock(tasklist_lock) below. + */ + read_lock(&tasklist_lock); + if (!pid_alive(tsk)) + ret = -ESRCH; + } + + if (!ret) { + ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, + old_rlim ? 
&old : NULL); + } + + if (need_tasklist) + read_unlock(&tasklist_lock); if (!ret && old_rlim) { rlim_to_rlim64(&old, &old64); -- cgit v1.2.3 From f322a97aeb2a05b6b1ee17629145eb02e1a4c6a0 Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Thu, 18 Sep 2025 19:06:15 +0200 Subject: kho: only fill kimage if KHO is finalized kho_fill_kimage() only checks for KHO being enabled before filling in the FDT to the image. KHO being enabled does not mean that the kernel has data to hand over. That happens when KHO is finalized. When a kexec is done with KHO enabled but not finalized, the FDT page is allocated but not initialized. FDT initialization happens after finalize. This means the KHO segment is filled in but the FDT contains garbage data. This leads to the below error messages in the next kernel: [ 0.000000] KHO: setup: handover FDT (0x10116b000) is invalid: -9 [ 0.000000] KHO: disabling KHO revival: -22 There is no problem in practice, and the next kernel boots and works fine. But this still leads to misleading error messages and garbage being handed over. Only fill in KHO segment when KHO is finalized. When KHO is not enabled, the debugfs interface is not created and there is no way to finalize it anyway. So the check for kho_enable is not needed, and kho_out.finalized alone is enough. 
Link: https://lkml.kernel.org/r/20250918170617.91413-1-pratyush@kernel.org Fixes: 3bdecc3c93f9 ("kexec: add KHO support to kexec file loads") Signed-off-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Baoquan He Cc: Changyuan Lyu Cc: Jason Gunthorpe Cc: Pasha Tatashin Cc: Signed-off-by: Andrew Morton --- kernel/kexec_handover.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 49a39aee6a8e..e238ec6b470b 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -1253,7 +1253,7 @@ int kho_fill_kimage(struct kimage *image) int err = 0; struct kexec_buf scratch; - if (!kho_enable) + if (!kho_out.finalized) return 0; image->kho.fdt = page_to_phys(kho_out.ser.fdt); -- cgit v1.2.3 From 634cdfd6b394cf4a5bfaeacf3b325998c752df45 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Sat, 13 Sep 2025 18:28:49 -0400 Subject: kernel: prevent prctl(PR_SET_PDEATHSIG) from racing with parent process exit If a process calls prctl(PR_SET_PDEATHSIG) at the same time that the parent process exits, the child will write to me->pdeath_sig at the same time the parent is reading it. Since there is no synchronization, this is a data race. Worse, it is possible that a subsequent call to getppid() can continue to return the previous parent process ID without the parent death signal being delivered. 
This happens in the following scenario:

parent                                        child

forget_original_parent()                      prctl(PR_SET_PDEATHSIG, SIGKILL)
                                                sys_prctl()
                                                  me->pdeath_signal = SIGKILL;
                                              getppid();
  RCU_INIT_POINTER(t->real_parent, reaper);
  if (t->pdeath_signal) /* reads stale me->pdeath_signal */
    group_send_sig_info(t->pdeath_signal, ...);

And in the following:

parent                                        child

forget_original_parent()
  RCU_INIT_POINTER(t->real_parent, reaper);
  /* also no barrier */
  if (t->pdeath_signal) /* reads stale me->pdeath_signal */
    group_send_sig_info(t->pdeath_signal, ...);
                                              prctl(PR_SET_PDEATHSIG, SIGKILL)
                                                sys_prctl()
                                                  me->pdeath_signal = SIGKILL;
                                              getppid(); /* reads old ppid() */

As a result, the following pattern is racy:

    pid_t parent_pid = getpid();
    pid_t child_pid = fork();
    if (child_pid == -1) {
        /* handle error... */
        return;
    }
    if (child_pid == 0) {
        if (prctl(PR_SET_PDEATHSIG, SIGKILL) != 0) {
            /* handle error */
            _exit(126);
        }
        if (getppid() != parent_pid) {
            /* parent died already */
            raise(SIGKILL);
        }
        /* keep going in child */
    }
    /* keep going in parent */

If the parent is killed at exactly the wrong time, the child process can (wrongly) stay running. I didn't manage to reproduce this in my testing, but I'm pretty sure the race is real. KCSAN is probably the best way to spot the race. Fix the bug by holding tasklist_lock for reading whenever pdeath_signal is being written to. This prevents races on me->pdeath_signal, and the locking and unlocking of the rwlock provide the needed memory barriers. If prctl(PR_SET_PDEATHSIG) happens before the parent exits, the signal will be sent. If it happens afterwards, a subsequent getppid() will return the new value. 
Link: https://lkml.kernel.org/r/20250913-fix-prctl-pdeathsig-race-v1-1-44e2eb426fe9@gmail.com Signed-off-by: Demi Marie Obenour Cc: Oleg Nesterov Cc: Mateusz Guzik Signed-off-by: Andrew Morton --- kernel/sys.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 36d66ff41611..bd25f39a6b57 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2488,7 +2488,17 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = -EINVAL; break; } + /* + * Ensure that either: + * + * 1. Subsequent getppid() calls reflect the parent process having died. + * 2. forget_original_parent() will send the new me->pdeath_signal. + * + * Also prevent the read of me->pdeath_signal from being a data race. + */ + read_lock(&tasklist_lock); me->pdeath_signal = arg2; + read_unlock(&tasklist_lock); break; case PR_GET_PDEATHSIG: error = put_user(me->pdeath_signal, (int __user *)arg2); -- cgit v1.2.3 From 1daf37592a050da046a03f78b20abb2a91f6d934 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 24 Sep 2025 11:43:04 +0200 Subject: panic: remove CONFIG_PANIC_ON_OOPS_VALUE There's really no need for this since it's 0 or 1 when CONFIG_PANIC_ON_OOPS is disabled/enabled, so just use IS_ENABLED() instead. The extra symbol goes back to the original code adding it in commit 2a01bb3885c9 ("panic: Make panic_on_oops configurable"). 
Link: https://lkml.kernel.org/r/20250924094303.18521-2-johannes@sipsolutions.net Signed-off-by: Johannes Berg Signed-off-by: Andrew Morton --- kernel/panic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index ebd81c259fa9..24cc3eec1805 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -53,7 +53,7 @@ static unsigned int __read_mostly sysctl_oops_all_cpu_backtrace; #define sysctl_oops_all_cpu_backtrace 0 #endif /* CONFIG_SMP */ -int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; +int panic_on_oops = IS_ENABLED(CONFIG_PANIC_ON_OOPS); static unsigned long tainted_mask = IS_ENABLED(CONFIG_RANDSTRUCT) ? (1 << TAINT_RANDSTRUCT) : 0; static int pause_on_oops; -- cgit v1.2.3 From 94b3f02fb33f56c896d855ccbac270766d1aa48b Mon Sep 17 00:00:00 2001 From: Sahil Chandna Date: Fri, 26 Sep 2025 13:20:53 +0530 Subject: kallsyms: use kmalloc_array() instead of kmalloc() Replace kmalloc(sizeof(*stat) * 2, GFP_KERNEL) with kmalloc_array(2, sizeof(*stat), GFP_KERNEL) to prevent potential overflow, as recommended in Documentation/process/deprecated.rst. Link: https://lkml.kernel.org/r/20250926075053.25615-1-chandna.linuxkernel@gmail.com Signed-off-by: Sahil Chandna Cc: Shuah Khan Cc: David Hunter Signed-off-by: Andrew Morton --- kernel/kallsyms_selftest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c index cf4af5728307..2b082a7e24a2 100644 --- a/kernel/kallsyms_selftest.c +++ b/kernel/kallsyms_selftest.c @@ -264,7 +264,7 @@ static int test_kallsyms_basic_function(void) char namebuf[KSYM_NAME_LEN]; struct test_stat *stat, *stat2; - stat = kmalloc(sizeof(*stat) * 2, GFP_KERNEL); + stat = kmalloc_array(2, sizeof(*stat), GFP_KERNEL); if (!stat) return -ENOMEM; stat2 = stat + 1; -- cgit v1.2.3