From 61399e0c5410567ef60cb1cda34cca42903842e3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Aug 2025 19:03:22 +0200 Subject: rcu: Fix racy re-initialization of irq_work causing hangs RCU re-initializes the deferred QS irq work everytime before attempting to queue it. However there are situations where the irq work is attempted to be queued even though it is already queued. In that case re-initializing messes-up with the irq work queue that is about to be handled. The chances for that to happen are higher when the architecture doesn't support self-IPIs and irq work are then all lazy, such as with the following sequence: 1) rcu_read_unlock() is called when IRQs are disabled and there is a grace period involving blocked tasks on the node. The irq work is then initialized and queued. 2) The related tasks are unblocked and the CPU quiescent state is reported. rdp->defer_qs_iw_pending is reset to DEFER_QS_IDLE, allowing the irq work to be requeued in the future (note the previous one hasn't fired yet). 3) A new grace period starts and the node has blocked tasks. 4) rcu_read_unlock() is called when IRQs are disabled again. The irq work is re-initialized (but it's queued! and its node is cleared) and requeued. Which means it's requeued to itself. 5) The irq work finally fires with the tick. But since it was requeued to itself, it loops and hangs. Fix this with initializing the irq work only once before the CPU boots. Fixes: b41642c87716 ("rcu: Fix rcu_read_unlock() deadloop due to IRQ work") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202508071303.c1134cce-lkp@intel.com Signed-off-by: Frederic Weisbecker Reviewed-by: Joel Fernandes Signed-off-by: Neeraj Upadhyay (AMD) --- kernel/rcu/tree.c | 2 ++ kernel/rcu/tree.h | 1 + kernel/rcu/tree_plugin.h | 8 ++++++-- 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 174ee243b349..8eff357b0436 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4262,6 +4262,8 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->rcu_iw_gp_seq = rdp->gp_seq - 1; trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + rcu_preempt_deferred_qs_init(rdp); rcu_spawn_rnp_kthreads(rnp); rcu_spawn_cpu_nocb_kthread(cpu); ASSERT_EXCLUSIVE_WRITER(rcu_state.n_online_cpus); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index de6ca13a7b5f..b8bbe7960cda 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -488,6 +488,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); static void rcu_flavor_sched_clock_irq(int user); static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); +static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static bool rcu_is_callbacks_kthread(struct rcu_data *rdp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fc14adf15cbb..4cd170b2d655 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -763,8 +763,6 @@ static void rcu_read_unlock_special(struct task_struct *t) cpu_online(rdp->cpu)) { // Get scheduler to re-evaluate and call hooks. // If !IRQ_WORK, FQS scan will eventually IPI. - rdp->defer_qs_iw = - IRQ_WORK_INIT_HARD(rcu_preempt_deferred_qs_handler); rdp->defer_qs_iw_pending = DEFER_QS_PENDING; irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); } @@ -904,6 +902,10 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) } } +static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp) +{ + rdp->defer_qs_iw = IRQ_WORK_INIT_HARD(rcu_preempt_deferred_qs_handler); +} #else /* #ifdef CONFIG_PREEMPT_RCU */ /* @@ -1103,6 +1105,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks)); } +static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp) { } + #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ /* -- cgit v1.2.3 From dfb36e4a8db0cd56f92d4cb445f54e85a9b40897 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 11 Aug 2025 10:11:47 -0400 Subject: futex: Use user_write_access_begin/_end() in futex_put_value() Commit cec199c5e39b ("futex: Implement FUTEX2_NUMA") introduced the futex_put_value() helper to write a value to the given user address. However, it uses user_read_access_begin() before the write. For architectures that differentiate between read and write accesses, like PowerPC, futex_put_value() fails with -EFAULT. Fix that by using the user_write_access_begin/user_write_access_end() pair instead. Fixes: cec199c5e39b ("futex: Implement FUTEX2_NUMA") Signed-off-by: Waiman Long Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20250811141147.322261-1-longman@redhat.com --- kernel/futex/futex.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index c74eac572acd..2cd57096c38e 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -319,13 +319,13 @@ static __always_inline int futex_put_value(u32 val, u32 __user *to) { if (can_do_masked_user_access()) to = masked_user_access_begin(to); - else if (!user_read_access_begin(to, sizeof(*to))) + else if (!user_write_access_begin(to, sizeof(*to))) return -EFAULT; unsafe_put_user(val, to, Efault); - user_read_access_end(); + user_write_access_end(); return 0; Efault: - user_read_access_end(); + user_write_access_end(); return -EFAULT; } -- cgit v1.2.3 From c0a23bbc98e93704a1f4fb5e7e7bb2d7c0fb6eb3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 29 Jul 2025 14:26:11 +0200 Subject: ipvs: Fix estimator kthreads preferred affinity The estimator kthreads' affinity are defined by sysctl overwritten preferences and applied through a plain call to the scheduler's affinity API. However since the introduction of managed kthreads preferred affinity, such a practice shortcuts the kthreads core code which eventually overwrites the target to the default unbound affinity. Fix this with using the appropriate kthread's API. Fixes: d1a89197589c ("kthread: Default affine kthread to its preferred NUMA node") Signed-off-by: Frederic Weisbecker Acked-by: Julian Anastasov Signed-off-by: Florian Westphal --- kernel/kthread.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 0e98b228a8ef..31b072e8d427 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -893,6 +893,7 @@ out: return ret; } +EXPORT_SYMBOL_GPL(kthread_affine_preferred); /* * Re-affine kthreads according to their preferences -- cgit v1.2.3 From 21924af67d69d7c9fdaf845be69043cfe75196a1 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 5 Aug 2025 00:10:02 +0000 Subject: locking: Fix __clear_task_blocked_on() warning from __ww_mutex_wound() path The __clear_task_blocked_on() helper added a number of sanity checks ensuring we hold the mutex wait lock and that the task we are clearing blocked_on pointer (if set) matches the mutex. However, there is an edge case in the _ww_mutex_wound() logic where we need to clear the blocked_on pointer for the task that owns the mutex, not the task that is waiting on the mutex. For this case the sanity checks aren't valid, so handle this by allowing a NULL lock to skip the additional checks. K Prateek Nayak and Maarten Lankhorst also pointed out that in this case where we don't hold the owner's mutex wait_lock, we need to be a bit more careful using READ_ONCE/WRITE_ONCE in both the __clear_task_blocked_on() and __set_task_blocked_on() implementations to avoid accidentally tripping WARN_ONs if two instances race. So do that here as well. This issue was easier to miss, I realized, as the test-ww_mutex driver only exercises the wait-die class of ww_mutexes. I've sent a patch[1] to address this so the logic will be easier to test. [1]: https://lore.kernel.org/lkml/20250801023358.562525-2-jstultz@google.com/ Fixes: a4f0b6fef4b0 ("locking/mutex: Add p->blocked_on wrappers for correctness checks") Closes: https://lore.kernel.org/lkml/68894443.a00a0220.26d0e1.0015.GAE@google.com/ Reported-by: syzbot+602c4720aed62576cd79@syzkaller.appspotmail.com Reported-by: Maarten Lankhorst Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Acked-by: Maarten Lankhorst Tested-by: K Prateek Nayak Link: https://lore.kernel.org/r/20250805001026.2247040-1-jstultz@google.com --- kernel/locking/ww_mutex.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 086fd5487ca7..31a785afee6c 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -342,8 +342,12 @@ static bool __ww_mutex_wound(struct MUTEX *lock, * When waking up the task to wound, be sure to clear the * blocked_on pointer. Otherwise we can see circular * blocked_on relationships that can't resolve. + * + * NOTE: We pass NULL here instead of lock, because we + * are waking the mutex owner, who may be currently + * blocked on a different mutex. */ - __clear_task_blocked_on(owner, lock); + __clear_task_blocked_on(owner, NULL); wake_q_add(wake_q, owner); } return true; -- cgit v1.2.3