From f8619c300f49c5831d344d35df93d3af447efc97 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 6 Mar 2023 20:48:13 -0800 Subject: locktorture: Add long_hold to adjust lock-hold delays This commit adds a long_hold module parameter to allow testing diagnostics for excessive lock-hold times. Also adjust torture_param() invocations for longer line length while in the area. Signed-off-by: Paul E. McKenney Reviewed-by: Joel Fernandes (Google) --- kernel/locking/locktorture.c | 51 +++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 29 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 153ddc4c47ef..949d3deae506 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -33,24 +33,19 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney "); -torture_param(int, nwriters_stress, -1, - "Number of write-locking stress-test threads"); -torture_param(int, nreaders_stress, -1, - "Number of read-locking stress-test threads"); +torture_param(int, nwriters_stress, -1, "Number of write-locking stress-test threads"); +torture_param(int, nreaders_stress, -1, "Number of read-locking stress-test threads"); +torture_param(int, long_hold, 100, "Do occasional long hold of lock (ms), 0=disable"); torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); -torture_param(int, onoff_interval, 0, - "Time between CPU hotplugs (s), 0=disable"); -torture_param(int, shuffle_interval, 3, - "Number of jiffies between shuffles, 0=disable"); +torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable"); +torture_param(int, shuffle_interval, 3, "Number of jiffies between shuffles, 0=disable"); torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable."); -torture_param(int, stat_interval, 60, - "Number of seconds between stats printk()s"); +torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); torture_param(int, rt_boost, 2, - "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types."); + "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types."); torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens."); -torture_param(int, verbose, 1, - "Enable verbose debugging printk()s"); +torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)"); /* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" errors */ #define MAX_NESTED_LOCKS 8 @@ -120,7 +115,7 @@ static int torture_lock_busted_write_lock(int tid __maybe_unused) static void torture_lock_busted_write_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = 100; + const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; /* We want a long delay occasionally to force massive contention. */ if (!(torture_random(trsp) % @@ -198,16 +193,18 @@ __acquires(torture_spinlock) static void torture_spin_lock_write_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_ms = 100; + const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; + unsigned long j; /* We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. */ - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) + if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * longdelay_ms))) { + j = jiffies; mdelay(longdelay_ms); - if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2 * shortdelay_us))) + pr_alert("%s: delay = %lu jiffies.\n", __func__, jiffies - j); + } + if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 200 * shortdelay_us))) udelay(shortdelay_us); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ @@ -322,7 +319,7 @@ __acquires(torture_rwlock) static void torture_rwlock_write_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_ms = 100; + const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; /* We want a short delay mostly to emulate likely code, and * we want a long delay occasionally to force massive contention. @@ -455,14 +452,12 @@ __acquires(torture_mutex) static void torture_mutex_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = 100; + const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; /* We want a long delay occasionally to force massive contention. */ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * longdelay_ms))) mdelay(longdelay_ms * 5); - else - mdelay(longdelay_ms / 5); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ } @@ -630,7 +625,7 @@ __acquires(torture_rtmutex) static void torture_rtmutex_delay(struct torture_random_state *trsp) { const unsigned long shortdelay_us = 2; - const unsigned long longdelay_ms = 100; + const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; /* * We want a short delay mostly to emulate likely code, and @@ -640,7 +635,7 @@ static void torture_rtmutex_delay(struct torture_random_state *trsp) (cxt.nrealwriters_stress * 2000 * longdelay_ms))) mdelay(longdelay_ms); if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2 * shortdelay_us))) + (cxt.nrealwriters_stress * 200 * shortdelay_us))) udelay(shortdelay_us); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ @@ -695,14 +690,12 @@ __acquires(torture_rwsem) static void torture_rwsem_write_delay(struct torture_random_state *trsp) { - const unsigned long longdelay_ms = 100; + const unsigned long longdelay_ms = long_hold ? long_hold : ULONG_MAX; /* We want a long delay occasionally to force massive contention. */ if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * longdelay_ms))) mdelay(longdelay_ms * 10); - else - mdelay(longdelay_ms / 10); if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) torture_preempt_schedule(); /* Allow test to be preempted. */ } @@ -848,8 +841,8 @@ static int lock_torture_writer(void *arg) lwsp->n_lock_acquired++; } - cxt.cur_ops->write_delay(&rand); if (!skip_main_lock) { + cxt.cur_ops->write_delay(&rand); lock_is_write_held = false; WRITE_ONCE(last_lock_release, jiffies); cxt.cur_ops->writeunlock(tid); -- cgit v1.2.3 From eb1cfd09f788e39948a82be8063e54e40dd018d9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 9 May 2023 15:58:46 -0400 Subject: lockdep: Add lock_set_cmp_fn() annotation This implements a new interface to lockdep, lock_set_cmp_fn(), for defining a custom ordering when taking multiple locks of the same class. This is an alternative to subclasses, but can not fully replace them since subclasses allow lock hierarchies with other clasees inter-twined, while this relies on pure class nesting. Specifically, if A is our nesting class then: A/0 <- B <- A/1 Would be a valid lock order with subclasses (each subclass really is a full class from the validation PoV) but not with this annotation, which requires all nesting to be consecutive. Example output: | ============================================ | WARNING: possible recursive locking detected | 6.2.0-rc8-00003-g7d81e591ca6a-dirty #15 Not tainted | -------------------------------------------- | kworker/14:3/938 is trying to acquire lock: | ffff8880143218c8 (&b->lock l=0 0:2803368){++++}-{3:3}, at: bch_btree_node_get.part.0+0x81/0x2b0 | | but task is already holding lock: | ffff8880143de8c8 (&b->lock l=1 1048575:9223372036854775807){++++}-{3:3}, at: __bch_btree_map_nodes+0xea/0x1e0 | and the lock comparison function returns 1: | | other info that might help us debug this: | Possible unsafe locking scenario: | | CPU0 | ---- | lock(&b->lock l=1 1048575:9223372036854775807); | lock(&b->lock l=0 0:2803368); | | *** DEADLOCK *** | | May be due to missing lock nesting notation | | 3 locks held by kworker/14:3/938: | #0: ffff888005ea9d38 ((wq_completion)bcache){+.+.}-{0:0}, at: process_one_work+0x1ec/0x530 | #1: ffff8880098c3e70 ((work_completion)(&cl->work)#3){+.+.}-{0:0}, at: process_one_work+0x1ec/0x530 | #2: ffff8880143de8c8 (&b->lock l=1 1048575:9223372036854775807){++++}-{3:3}, at: __bch_btree_map_nodes+0xea/0x1e0 [peterz: extended changelog] Signed-off-by: Kent Overstreet Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20230509195847.1745548-1-kent.overstreet@linux.dev --- kernel/locking/lockdep.c | 118 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 87 insertions(+), 31 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index dcd1d5bfc1e0..3e8950f3beab 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -709,7 +709,7 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]) usage[i] = '\0'; } -static void __print_lock_name(struct lock_class *class) +static void __print_lock_name(struct held_lock *hlock, struct lock_class *class) { char str[KSYM_NAME_LEN]; const char *name; @@ -724,17 +724,19 @@ static void __print_lock_name(struct lock_class *class) printk(KERN_CONT "#%d", class->name_version); if (class->subclass) printk(KERN_CONT "/%d", class->subclass); + if (hlock && class->print_fn) + class->print_fn(hlock->instance); } } -static void print_lock_name(struct lock_class *class) +static void print_lock_name(struct held_lock *hlock, struct lock_class *class) { char usage[LOCK_USAGE_CHARS]; get_usage_chars(class, usage); printk(KERN_CONT " ("); - __print_lock_name(class); + __print_lock_name(hlock, class); printk(KERN_CONT "){%s}-{%d:%d}", usage, class->wait_type_outer ?: class->wait_type_inner, class->wait_type_inner); @@ -772,7 +774,7 @@ static void print_lock(struct held_lock *hlock) } printk(KERN_CONT "%px", hlock->instance); - print_lock_name(lock); + print_lock_name(hlock, lock); printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip); } @@ -1868,7 +1870,7 @@ print_circular_bug_entry(struct lock_list *target, int depth) if (debug_locks_silent) return; printk("\n-> #%u", depth); - print_lock_name(target->class); + print_lock_name(NULL, target->class); printk(KERN_CONT ":\n"); print_lock_trace(target->trace, 6); } @@ -1899,11 +1901,11 @@ print_circular_lock_scenario(struct held_lock *src, */ if (parent != source) { printk("Chain exists of:\n "); - __print_lock_name(source); + __print_lock_name(src, source); printk(KERN_CONT " --> "); - __print_lock_name(parent); + __print_lock_name(NULL, parent); printk(KERN_CONT " --> "); - __print_lock_name(target); + __print_lock_name(tgt, target); printk(KERN_CONT "\n\n"); } @@ -1914,13 +1916,13 @@ print_circular_lock_scenario(struct held_lock *src, printk(" rlock("); else printk(" lock("); - __print_lock_name(target); + __print_lock_name(tgt, target); printk(KERN_CONT ");\n"); printk(" lock("); - __print_lock_name(parent); + __print_lock_name(NULL, parent); printk(KERN_CONT ");\n"); printk(" lock("); - __print_lock_name(target); + __print_lock_name(tgt, target); printk(KERN_CONT ");\n"); if (src_read != 0) printk(" rlock("); @@ -1928,7 +1930,7 @@ print_circular_lock_scenario(struct held_lock *src, printk(" sync("); else printk(" lock("); - __print_lock_name(source); + __print_lock_name(src, source); printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); } @@ -2154,6 +2156,8 @@ check_path(struct held_lock *target, struct lock_list *src_entry, return ret; } +static void print_deadlock_bug(struct task_struct *, struct held_lock *, struct held_lock *); + /* * Prove that the dependency graph starting at can not * lead to . If it can, there is a circle when adding @@ -2185,7 +2189,10 @@ check_noncircular(struct held_lock *src, struct held_lock *target, *trace = save_trace(); } - print_circular_bug(&src_entry, target_entry, src, target); + if (src->class_idx == target->class_idx) + print_deadlock_bug(current, src, target); + else + print_circular_bug(&src_entry, target_entry, src, target); } return ret; @@ -2341,7 +2348,7 @@ static void print_lock_class_header(struct lock_class *class, int depth) int bit; printk("%*s->", depth, ""); - print_lock_name(class); + print_lock_name(NULL, class); #ifdef CONFIG_DEBUG_LOCKDEP printk(KERN_CONT " ops: %lu", debug_class_ops_read(class)); #endif @@ -2523,11 +2530,11 @@ print_irq_lock_scenario(struct lock_list *safe_entry, */ if (middle_class != unsafe_class) { printk("Chain exists of:\n "); - __print_lock_name(safe_class); + __print_lock_name(NULL, safe_class); printk(KERN_CONT " --> "); - __print_lock_name(middle_class); + __print_lock_name(NULL, middle_class); printk(KERN_CONT " --> "); - __print_lock_name(unsafe_class); + __print_lock_name(NULL, unsafe_class); printk(KERN_CONT "\n\n"); } @@ -2535,18 +2542,18 @@ print_irq_lock_scenario(struct lock_list *safe_entry, printk(" CPU0 CPU1\n"); printk(" ---- ----\n"); printk(" lock("); - __print_lock_name(unsafe_class); + __print_lock_name(NULL, unsafe_class); printk(KERN_CONT ");\n"); printk(" local_irq_disable();\n"); printk(" lock("); - __print_lock_name(safe_class); + __print_lock_name(NULL, safe_class); printk(KERN_CONT ");\n"); printk(" lock("); - __print_lock_name(middle_class); + __print_lock_name(NULL, middle_class); printk(KERN_CONT ");\n"); printk(" \n"); printk(" lock("); - __print_lock_name(safe_class); + __print_lock_name(NULL, safe_class); printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); } @@ -2583,20 +2590,20 @@ print_bad_irq_dependency(struct task_struct *curr, pr_warn("\nand this task is already holding:\n"); print_lock(prev); pr_warn("which would create a new lock dependency:\n"); - print_lock_name(hlock_class(prev)); + print_lock_name(prev, hlock_class(prev)); pr_cont(" ->"); - print_lock_name(hlock_class(next)); + print_lock_name(next, hlock_class(next)); pr_cont("\n"); pr_warn("\nbut this new dependency connects a %s-irq-safe lock:\n", irqclass); - print_lock_name(backwards_entry->class); + print_lock_name(NULL, backwards_entry->class); pr_warn("\n... which became %s-irq-safe at:\n", irqclass); print_lock_trace(backwards_entry->class->usage_traces[bit1], 1); pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass); - print_lock_name(forwards_entry->class); + print_lock_name(NULL, forwards_entry->class); pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass); pr_warn("..."); @@ -2966,10 +2973,10 @@ print_deadlock_scenario(struct held_lock *nxt, struct held_lock *prv) printk(" CPU0\n"); printk(" ----\n"); printk(" lock("); - __print_lock_name(prev); + __print_lock_name(prv, prev); printk(KERN_CONT ");\n"); printk(" lock("); - __print_lock_name(next); + __print_lock_name(nxt, next); printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); printk(" May be due to missing lock nesting notation\n\n"); @@ -2979,6 +2986,8 @@ static void print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, struct held_lock *next) { + struct lock_class *class = hlock_class(prev); + if (!debug_locks_off_graph_unlock() || debug_locks_silent) return; @@ -2993,6 +3002,11 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, pr_warn("\nbut task is already holding lock:\n"); print_lock(prev); + if (class->cmp_fn) { + pr_warn("and the lock comparison function returns %i:\n", + class->cmp_fn(prev->instance, next->instance)); + } + pr_warn("\nother info that might help us debug this:\n"); print_deadlock_scenario(next, prev); lockdep_print_held_locks(curr); @@ -3014,6 +3028,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, static int check_deadlock(struct task_struct *curr, struct held_lock *next) { + struct lock_class *class; struct held_lock *prev; struct held_lock *nest = NULL; int i; @@ -3034,6 +3049,12 @@ check_deadlock(struct task_struct *curr, struct held_lock *next) if ((next->read == 2) && prev->read) continue; + class = hlock_class(prev); + + if (class->cmp_fn && + class->cmp_fn(prev->instance, next->instance) < 0) + continue; + /* * We're holding the nest_lock, which serializes this lock's * nesting behaviour. @@ -3095,6 +3116,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, return 2; } + if (prev->class_idx == next->class_idx) { + struct lock_class *class = hlock_class(prev); + + if (class->cmp_fn && + class->cmp_fn(prev->instance, next->instance) < 0) + return 2; + } + /* * Prove that the new -> dependency would not * create a circular dependency in the graph. (We do this by @@ -3571,7 +3600,7 @@ static void print_chain_keys_chain(struct lock_chain *chain) hlock_id = chain_hlocks[chain->base + i]; chain_key = print_chain_key_iteration(hlock_id, chain_key); - print_lock_name(lock_classes + chain_hlock_class_idx(hlock_id)); + print_lock_name(NULL, lock_classes + chain_hlock_class_idx(hlock_id)); printk("\n"); } } @@ -3928,11 +3957,11 @@ static void print_usage_bug_scenario(struct held_lock *lock) printk(" CPU0\n"); printk(" ----\n"); printk(" lock("); - __print_lock_name(class); + __print_lock_name(lock, class); printk(KERN_CONT ");\n"); printk(" \n"); printk(" lock("); - __print_lock_name(class); + __print_lock_name(lock, class); printk(KERN_CONT ");\n"); printk("\n *** DEADLOCK ***\n\n"); } @@ -4018,7 +4047,7 @@ print_irq_inversion_bug(struct task_struct *curr, pr_warn("but this lock took another, %s-unsafe lock in the past:\n", irqclass); else pr_warn("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); - print_lock_name(other->class); + print_lock_name(NULL, other->class); pr_warn("\n\nand interrupts could create inverse lock ordering between them.\n\n"); pr_warn("\nother info that might help us debug this:\n"); @@ -4882,6 +4911,33 @@ EXPORT_SYMBOL_GPL(lockdep_init_map_type); struct lock_class_key __lockdep_no_validate__; EXPORT_SYMBOL_GPL(__lockdep_no_validate__); +#ifdef CONFIG_PROVE_LOCKING +void lockdep_set_lock_cmp_fn(struct lockdep_map *lock, lock_cmp_fn cmp_fn, + lock_print_fn print_fn) +{ + struct lock_class *class = lock->class_cache[0]; + unsigned long flags; + + raw_local_irq_save(flags); + lockdep_recursion_inc(); + + if (!class) + class = register_lock_class(lock, 0, 0); + + if (class) { + WARN_ON(class->cmp_fn && class->cmp_fn != cmp_fn); + WARN_ON(class->print_fn && class->print_fn != print_fn); + + class->cmp_fn = cmp_fn; + class->print_fn = print_fn; + } + + lockdep_recursion_finish(); + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lockdep_set_lock_cmp_fn); +#endif + static void print_lock_nested_lock_not_held(struct task_struct *curr, struct held_lock *hlock) -- cgit v1.2.3 From ff7138813ac4a20628ce4b40952c69faba835fbb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 17 May 2023 15:10:54 +0200 Subject: locking: add lockevent_read() prototype lockevent_read() has a __weak definition and the only caller in kernel/locking/lock_events.c, plus a strong definition in qspinlock_stat.h that overrides it, but no other declaration. This causes a W=1 warning: kernel/locking/lock_events.c:61:16: error: no previous prototype for 'lockevent_read' [-Werror=missing-prototypes] Add shared prototype to avoid the warnings. Link: https://lkml.kernel.org/r/20230517131102.934196-7-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Boqun Feng Cc: Catalin Marinas Cc: Christoph Lameter Cc: Dennis Zhou Cc: Eric Paris Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Paul Moore Cc: Pavel Machek Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Russell King Cc: Tejun Heo Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/locking/lock_events.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/locking') diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h index 8c7e7d25f09c..a6016b91803d 100644 --- a/kernel/locking/lock_events.h +++ b/kernel/locking/lock_events.h @@ -57,4 +57,8 @@ static inline void __lockevent_add(enum lock_events event, int inc) #define lockevent_cond_inc(ev, c) #endif /* CONFIG_LOCK_EVENT_COUNTS */ + +ssize_t lockevent_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos); + #endif /* __LOCKING_LOCK_EVENTS_H */ -- cgit v1.2.3 From f7853c34241807bb97673a5e97719123be39a09e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 7 Jul 2023 16:19:09 +0200 Subject: locking/rtmutex: Fix task->pi_waiters integrity Henry reported that rt_mutex_adjust_prio_check() has an ordering problem and puts the lie to the comment in [7]. Sharing the sort key between lock->waiters and owner->pi_waiters *does* create problems, since unlike what the comment claims, holding [L] is insufficient. Notably, consider: A / \ M1 M2 | | B C That is, task A owns both M1 and M2, B and C block on them. In this case a concurrent chain walk (B & C) will modify their resp. sort keys in [7] while holding M1->wait_lock and M2->wait_lock. So holding [L] is meaningless, they're different Ls. This then gives rise to a race condition between [7] and [11], where the requeue of pi_waiters will observe an inconsistent tree order. B C (holds M1->wait_lock, (holds M2->wait_lock, holds B->pi_lock) holds A->pi_lock) [7] waiter_update_prio(); ... [8] raw_spin_unlock(B->pi_lock); ... [10] raw_spin_lock(A->pi_lock); [11] rt_mutex_enqueue_pi(); // observes inconsistent A->pi_waiters // tree order Fixing this means either extending the range of the owner lock from [10-13] to [6-13], with the immediate problem that this means [6-8] hold both blocked and owner locks, or duplicating the sort key. Since the locking in chain walk is horrible enough without having to consider pi_lock nesting rules, duplicate the sort key instead. By giving each tree their own sort key, the above race becomes harmless, if C sees B at the old location, then B will correct things (if they need correcting) when it walks up the chain and reaches A. Fixes: fb00aca47440 ("rtmutex: Turn the plist into an rb-tree") Reported-by: Henry Wu Signed-off-by: Peter Zijlstra (Intel) Acked-by: Thomas Gleixner Tested-by: Henry Wu Link: https://lkml.kernel.org/r/20230707161052.GF2883469%40hirez.programming.kicks-ass.net --- kernel/locking/rtmutex.c | 170 +++++++++++++++++++++++++++------------- kernel/locking/rtmutex_api.c | 2 +- kernel/locking/rtmutex_common.h | 47 ++++++++--- kernel/locking/ww_mutex.h | 12 +-- 4 files changed, 155 insertions(+), 76 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 728f434de2bb..21db0df0eb00 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -333,21 +333,43 @@ static __always_inline int __waiter_prio(struct task_struct *task) return prio; } +/* + * Update the waiter->tree copy of the sort keys. + */ static __always_inline void waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task) { - waiter->prio = __waiter_prio(task); - waiter->deadline = task->dl.deadline; + lockdep_assert_held(&waiter->lock->wait_lock); + lockdep_assert(RB_EMPTY_NODE(&waiter->tree.entry)); + + waiter->tree.prio = __waiter_prio(task); + waiter->tree.deadline = task->dl.deadline; +} + +/* + * Update the waiter->pi_tree copy of the sort keys (from the tree copy). + */ +static __always_inline void +waiter_clone_prio(struct rt_mutex_waiter *waiter, struct task_struct *task) +{ + lockdep_assert_held(&waiter->lock->wait_lock); + lockdep_assert_held(&task->pi_lock); + lockdep_assert(RB_EMPTY_NODE(&waiter->pi_tree.entry)); + + waiter->pi_tree.prio = waiter->tree.prio; + waiter->pi_tree.deadline = waiter->tree.deadline; } /* - * Only use with rt_mutex_waiter_{less,equal}() + * Only use with rt_waiter_node_{less,equal}() */ +#define task_to_waiter_node(p) \ + &(struct rt_waiter_node){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline } #define task_to_waiter(p) \ - &(struct rt_mutex_waiter){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline } + &(struct rt_mutex_waiter){ .tree = *task_to_waiter_node(p) } -static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left, - struct rt_mutex_waiter *right) +static __always_inline int rt_waiter_node_less(struct rt_waiter_node *left, + struct rt_waiter_node *right) { if (left->prio < right->prio) return 1; @@ -364,8 +386,8 @@ static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left, return 0; } -static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left, - struct rt_mutex_waiter *right) +static __always_inline int rt_waiter_node_equal(struct rt_waiter_node *left, + struct rt_waiter_node *right) { if (left->prio != right->prio) return 0; @@ -385,7 +407,7 @@ static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left, static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter, struct rt_mutex_waiter *top_waiter) { - if (rt_mutex_waiter_less(waiter, top_waiter)) + if (rt_waiter_node_less(&waiter->tree, &top_waiter->tree)) return true; #ifdef RT_MUTEX_BUILD_SPINLOCKS @@ -393,30 +415,30 @@ static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter, * Note that RT tasks are excluded from same priority (lateral) * steals to prevent the introduction of an unbounded latency. */ - if (rt_prio(waiter->prio) || dl_prio(waiter->prio)) + if (rt_prio(waiter->tree.prio) || dl_prio(waiter->tree.prio)) return false; - return rt_mutex_waiter_equal(waiter, top_waiter); + return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree); #else return false; #endif } #define __node_2_waiter(node) \ - rb_entry((node), struct rt_mutex_waiter, tree_entry) + rb_entry((node), struct rt_mutex_waiter, tree.entry) static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_node *b) { struct rt_mutex_waiter *aw = __node_2_waiter(a); struct rt_mutex_waiter *bw = __node_2_waiter(b); - if (rt_mutex_waiter_less(aw, bw)) + if (rt_waiter_node_less(&aw->tree, &bw->tree)) return 1; if (!build_ww_mutex()) return 0; - if (rt_mutex_waiter_less(bw, aw)) + if (rt_waiter_node_less(&bw->tree, &aw->tree)) return 0; /* NOTE: relies on waiter->ww_ctx being set before insertion */ @@ -434,48 +456,58 @@ static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_nod static __always_inline void rt_mutex_enqueue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) { - rb_add_cached(&waiter->tree_entry, &lock->waiters, __waiter_less); + lockdep_assert_held(&lock->wait_lock); + + rb_add_cached(&waiter->tree.entry, &lock->waiters, __waiter_less); } static __always_inline void rt_mutex_dequeue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) { - if (RB_EMPTY_NODE(&waiter->tree_entry)) + lockdep_assert_held(&lock->wait_lock); + + if (RB_EMPTY_NODE(&waiter->tree.entry)) return; - rb_erase_cached(&waiter->tree_entry, &lock->waiters); - RB_CLEAR_NODE(&waiter->tree_entry); + rb_erase_cached(&waiter->tree.entry, &lock->waiters); + RB_CLEAR_NODE(&waiter->tree.entry); } -#define __node_2_pi_waiter(node) \ - rb_entry((node), struct rt_mutex_waiter, pi_tree_entry) +#define __node_2_rt_node(node) \ + rb_entry((node), struct rt_waiter_node, entry) -static __always_inline bool -__pi_waiter_less(struct rb_node *a, const struct rb_node *b) +static __always_inline bool __pi_waiter_less(struct rb_node *a, const struct rb_node *b) { - return rt_mutex_waiter_less(__node_2_pi_waiter(a), __node_2_pi_waiter(b)); + return rt_waiter_node_less(__node_2_rt_node(a), __node_2_rt_node(b)); } static __always_inline void rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) { - rb_add_cached(&waiter->pi_tree_entry, &task->pi_waiters, __pi_waiter_less); + lockdep_assert_held(&task->pi_lock); + + rb_add_cached(&waiter->pi_tree.entry, &task->pi_waiters, __pi_waiter_less); } static __always_inline void rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) { - if (RB_EMPTY_NODE(&waiter->pi_tree_entry)) + lockdep_assert_held(&task->pi_lock); + + if (RB_EMPTY_NODE(&waiter->pi_tree.entry)) return; - rb_erase_cached(&waiter->pi_tree_entry, &task->pi_waiters); - RB_CLEAR_NODE(&waiter->pi_tree_entry); + rb_erase_cached(&waiter->pi_tree.entry, &task->pi_waiters); + RB_CLEAR_NODE(&waiter->pi_tree.entry); } -static __always_inline void rt_mutex_adjust_prio(struct task_struct *p) +static __always_inline void rt_mutex_adjust_prio(struct rt_mutex_base *lock, + struct task_struct *p) { struct task_struct *pi_task = NULL; + lockdep_assert_held(&lock->wait_lock); + lockdep_assert(rt_mutex_owner(lock) == p); lockdep_assert_held(&p->pi_lock); if (task_has_pi_waiters(p)) @@ -571,9 +603,14 @@ static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_st * Chain walk basics and protection scope * * [R] refcount on task - * [P] task->pi_lock held + * [Pn] task->pi_lock held * [L] rtmutex->wait_lock held * + * Normal locking order: + * + * rtmutex->wait_lock + * task->pi_lock + * * Step Description Protected by * function arguments: * @task [R] @@ -588,27 +625,32 @@ static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_st * again: * loop_sanity_check(); * retry: - * [1] lock(task->pi_lock); [R] acquire [P] - * [2] waiter = task->pi_blocked_on; [P] - * [3] check_exit_conditions_1(); [P] - * [4] lock = waiter->lock; [P] - * [5] if (!try_lock(lock->wait_lock)) { [P] try to acquire [L] - * unlock(task->pi_lock); release [P] + * [1] lock(task->pi_lock); [R] acquire [P1] + * [2] waiter = task->pi_blocked_on; [P1] + * [3] check_exit_conditions_1(); [P1] + * [4] lock = waiter->lock; [P1] + * [5] if (!try_lock(lock->wait_lock)) { [P1] try to acquire [L] + * unlock(task->pi_lock); release [P1] * goto retry; * } - * [6] check_exit_conditions_2(); [P] + [L] - * [7] requeue_lock_waiter(lock, waiter); [P] + [L] - * [8] unlock(task->pi_lock); release [P] + * [6] check_exit_conditions_2(); [P1] + [L] + * [7] requeue_lock_waiter(lock, waiter); [P1] + [L] + * [8] unlock(task->pi_lock); release [P1] * put_task_struct(task); release [R] * [9] check_exit_conditions_3(); [L] * [10] task = owner(lock); [L] * get_task_struct(task); [L] acquire [R] - * lock(task->pi_lock); [L] acquire [P] - * [11] requeue_pi_waiter(tsk, waiters(lock));[P] + [L] - * [12] check_exit_conditions_4(); [P] + [L] - * [13] unlock(task->pi_lock); release [P] + * lock(task->pi_lock); [L] acquire [P2] + * [11] requeue_pi_waiter(tsk, waiters(lock));[P2] + [L] + * [12] check_exit_conditions_4(); [P2] + [L] + * [13] unlock(task->pi_lock); release [P2] * unlock(lock->wait_lock); release [L] * goto again; + * + * Where P1 is the blocking task and P2 is the lock owner; going up one step + * the owner becomes the next blocked task etc.. + * +* */ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, enum rtmutex_chainwalk chwalk, @@ -756,7 +798,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, * enabled we continue, but stop the requeueing in the chain * walk. */ - if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { + if (rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) { if (!detect_deadlock) goto out_unlock_pi; else @@ -764,13 +806,18 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, } /* - * [4] Get the next lock + * [4] Get the next lock; per holding task->pi_lock we can't unblock + * and guarantee @lock's existence. */ lock = waiter->lock; /* * [5] We need to trylock here as we are holding task->pi_lock, * which is the reverse lock order versus the other rtmutex * operations. + * + * Per the above, holding task->pi_lock guarantees lock exists, so + * inverting this lock order is infeasible from a life-time + * perspective. */ if (!raw_spin_trylock(&lock->wait_lock)) { raw_spin_unlock_irq(&task->pi_lock); @@ -874,17 +921,18 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, * or * * DL CBS enforcement advancing the effective deadline. - * - * Even though pi_waiters also uses these fields, and that tree is only - * updated in [11], we can do this here, since we hold [L], which - * serializes all pi_waiters access and rb_erase() does not care about - * the values of the node being removed. */ waiter_update_prio(waiter, task); rt_mutex_enqueue(lock, waiter); - /* [8] Release the task */ + /* + * [8] Release the (blocking) task in preparation for + * taking the owner task in [10]. + * + * Since we hold lock->waiter_lock, task cannot unblock, even if we + * release task->pi_lock. + */ raw_spin_unlock(&task->pi_lock); put_task_struct(task); @@ -908,7 +956,12 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, return 0; } - /* [10] Grab the next task, i.e. the owner of @lock */ + /* + * [10] Grab the next task, i.e. the owner of @lock + * + * Per holding lock->wait_lock and checking for !owner above, there + * must be an owner and it cannot go away. + */ task = get_task_struct(rt_mutex_owner(lock)); raw_spin_lock(&task->pi_lock); @@ -921,8 +974,9 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, * and adjust the priority of the owner. */ rt_mutex_dequeue_pi(task, prerequeue_top_waiter); + waiter_clone_prio(waiter, task); rt_mutex_enqueue_pi(task, waiter); - rt_mutex_adjust_prio(task); + rt_mutex_adjust_prio(lock, task); } else if (prerequeue_top_waiter == waiter) { /* @@ -937,8 +991,9 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, */ rt_mutex_dequeue_pi(task, waiter); waiter = rt_mutex_top_waiter(lock); + waiter_clone_prio(waiter, task); rt_mutex_enqueue_pi(task, waiter); - rt_mutex_adjust_prio(task); + rt_mutex_adjust_prio(lock, task); } else { /* * Nothing changed. No need to do any priority @@ -1154,6 +1209,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, waiter->task = task; waiter->lock = lock; waiter_update_prio(waiter, task); + waiter_clone_prio(waiter, task); /* Get the top priority waiter on the lock */ if (rt_mutex_has_waiters(lock)) @@ -1187,7 +1243,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, rt_mutex_dequeue_pi(owner, top_waiter); rt_mutex_enqueue_pi(owner, waiter); - rt_mutex_adjust_prio(owner); + rt_mutex_adjust_prio(lock, owner); if (owner->pi_blocked_on) chain_walk = 1; } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) { @@ -1234,6 +1290,8 @@ static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh, { struct rt_mutex_waiter *waiter; + lockdep_assert_held(&lock->wait_lock); + raw_spin_lock(¤t->pi_lock); waiter = rt_mutex_top_waiter(lock); @@ -1246,7 +1304,7 @@ static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh, * task unblocks. */ rt_mutex_dequeue_pi(current, waiter); - rt_mutex_adjust_prio(current); + rt_mutex_adjust_prio(lock, current); /* * As we are waking up the top waiter, and the waiter stays @@ -1482,7 +1540,7 @@ static void __sched remove_waiter(struct rt_mutex_base *lock, if (rt_mutex_has_waiters(lock)) rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock)); - rt_mutex_adjust_prio(owner); + rt_mutex_adjust_prio(lock, owner); /* Store the lock on which owner is blocked or NULL */ next_lock = task_blocked_on_lock(owner); diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index cb9fdff76a8a..a6974d044593 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -459,7 +459,7 @@ void __sched rt_mutex_adjust_pi(struct task_struct *task) raw_spin_lock_irqsave(&task->pi_lock, flags); waiter = task->pi_blocked_on; - if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { + if (!waiter || rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) { raw_spin_unlock_irqrestore(&task->pi_lock, flags); return; } diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index c47e8361bfb5..1162e07cdaea 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -17,27 +17,44 @@ #include #include + +/* + * This is a helper for the struct rt_mutex_waiter below. A waiter goes in two + * separate trees and they need their own copy of the sort keys because of + * different locking requirements. + * + * @entry: rbtree node to enqueue into the waiters tree + * @prio: Priority of the waiter + * @deadline: Deadline of the waiter if applicable + * + * See rt_waiter_node_less() and waiter_*_prio(). + */ +struct rt_waiter_node { + struct rb_node entry; + int prio; + u64 deadline; +}; + /* * This is the control structure for tasks blocked on a rt_mutex, * which is allocated on the kernel stack on of the blocked task. * - * @tree_entry: pi node to enqueue into the mutex waiters tree - * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree + * @tree: node to enqueue into the mutex waiters tree + * @pi_tree: node to enqueue into the mutex owner waiters tree * @task: task reference to the blocked task * @lock: Pointer to the rt_mutex on which the waiter blocks * @wake_state: Wakeup state to use (TASK_NORMAL or TASK_RTLOCK_WAIT) - * @prio: Priority of the waiter - * @deadline: Deadline of the waiter if applicable * @ww_ctx: WW context pointer + * + * @tree is ordered by @lock->wait_lock + * @pi_tree is ordered by rt_mutex_owner(@lock)->pi_lock */ struct rt_mutex_waiter { - struct rb_node tree_entry; - struct rb_node pi_tree_entry; + struct rt_waiter_node tree; + struct rt_waiter_node pi_tree; struct task_struct *task; struct rt_mutex_base *lock; unsigned int wake_state; - int prio; - u64 deadline; struct ww_acquire_ctx *ww_ctx; }; @@ -105,7 +122,7 @@ static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock, { struct rb_node *leftmost = rb_first_cached(&lock->waiters); - return rb_entry(leftmost, struct rt_mutex_waiter, tree_entry) == waiter; + return rb_entry(leftmost, struct rt_mutex_waiter, tree.entry) == waiter; } static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock) @@ -113,8 +130,10 @@ static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base * struct rb_node *leftmost = rb_first_cached(&lock->waiters); struct rt_mutex_waiter *w = NULL; + lockdep_assert_held(&lock->wait_lock); + if (leftmost) { - w = rb_entry(leftmost, struct rt_mutex_waiter, tree_entry); + w = rb_entry(leftmost, struct rt_mutex_waiter, tree.entry); BUG_ON(w->lock != lock); } return w; @@ -127,8 +146,10 @@ static inline int task_has_pi_waiters(struct task_struct *p) static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p) { + lockdep_assert_held(&p->pi_lock); + return rb_entry(p->pi_waiters.rb_leftmost, struct rt_mutex_waiter, - pi_tree_entry); + pi_tree.entry); } #define RT_MUTEX_HAS_WAITERS 1UL @@ -190,8 +211,8 @@ static inline void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) static inline void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) { debug_rt_mutex_init_waiter(waiter); - RB_CLEAR_NODE(&waiter->pi_tree_entry); - RB_CLEAR_NODE(&waiter->tree_entry); + RB_CLEAR_NODE(&waiter->pi_tree.entry); + RB_CLEAR_NODE(&waiter->tree.entry); waiter->wake_state = TASK_NORMAL; waiter->task = NULL; } diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 56f139201f24..3ad2cc4823e5 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -96,25 +96,25 @@ __ww_waiter_first(struct rt_mutex *lock) struct rb_node *n = rb_first(&lock->rtmutex.waiters.rb_root); if (!n) return NULL; - return rb_entry(n, struct rt_mutex_waiter, tree_entry); + return rb_entry(n, struct rt_mutex_waiter, tree.entry); } static inline struct rt_mutex_waiter * __ww_waiter_next(struct rt_mutex *lock, struct rt_mutex_waiter *w) { - struct rb_node *n = rb_next(&w->tree_entry); + struct rb_node *n = rb_next(&w->tree.entry); if (!n) return NULL; - return rb_entry(n, struct rt_mutex_waiter, tree_entry); + return rb_entry(n, struct rt_mutex_waiter, tree.entry); } static inline struct rt_mutex_waiter * __ww_waiter_prev(struct rt_mutex *lock, struct rt_mutex_waiter *w) { - struct rb_node *n = rb_prev(&w->tree_entry); + struct rb_node *n = rb_prev(&w->tree.entry); if (!n) return NULL; - return rb_entry(n, struct rt_mutex_waiter, tree_entry); + return rb_entry(n, struct rt_mutex_waiter, tree.entry); } static inline struct rt_mutex_waiter * @@ -123,7 +123,7 @@ __ww_waiter_last(struct rt_mutex *lock) struct rb_node *n = rb_last(&lock->rtmutex.waiters.rb_root); if (!n) return NULL; - return rb_entry(n, struct rt_mutex_waiter, tree_entry); + return rb_entry(n, struct rt_mutex_waiter, tree.entry); } static inline void -- cgit v1.2.3