From aff7385b5a16bca6b8d9243f01a9ea5a5b411e1d Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 21 Jan 2014 15:35:53 -0800 Subject: locking/mutexes/mcs: Correct barrier usage This patch corrects the way memory barriers are used in the MCS lock with the smp_load_acquire and smp_store_release functions. The previous barriers could leak critical sections if the MCS lock is used by itself. It is not a problem when the MCS lock is embedded in a mutex, but it will be an issue when the MCS lock is used elsewhere. The patch removes the incorrect barriers and puts in correct barriers with the pair of functions smp_load_acquire and smp_store_release. Suggested-by: Michel Lespinasse Reviewed-by: Paul E. McKenney Signed-off-by: Waiman Long Signed-off-by: Jason Low Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1390347353.3138.62.camel@schen9-DESK Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 4dd6e4c219de..fbbd2eda867e 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -136,9 +136,12 @@ void mspin_lock(struct mspin_node **lock, struct mspin_node *node) return; } ACCESS_ONCE(prev->next) = node; - smp_wmb(); - /* Wait until the lock holder passes the lock down */ - while (!ACCESS_ONCE(node->locked)) + /* + * Wait until the lock holder passes the lock down. + * Using smp_load_acquire() provides a memory barrier that + * ensures subsequent operations happen after the lock is acquired. + */ + while (!(smp_load_acquire(&node->locked))) arch_mutex_cpu_relax(); } @@ -156,8 +159,13 @@ static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node) while (!(next = ACCESS_ONCE(node->next))) arch_mutex_cpu_relax(); } - ACCESS_ONCE(next->locked) = 1; - smp_wmb(); + /* + * Pass lock to next waiter. + * smp_store_release() provides a memory barrier to ensure + * all operations in the critical section has been completed + * before unlocking. + */ + smp_store_release(&next->locked, 1); } /* -- cgit v1.2.3 From e72246748ff006ab928bc774e276e6ef5542f9c5 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Tue, 21 Jan 2014 15:36:00 -0800 Subject: locking/mutexes/mcs: Restructure the MCS lock defines and locking code into its own file We will need the MCS lock code for doing optimistic spinning for rwsem and queued rwlock. Extracting the MCS code from mutex.c and putting it into its own file allows us to reuse this code easily. We also inline the mcs_spin_lock and mcs_spin_unlock functions for better efficiency. Note that the smp_load_acquire/smp_store_release pair used in mcs_lock and mcs_unlock is not sufficient to form a full memory barrier across cpus for many architectures (except x86). For applications that absolutely need a full barrier across multiple cpus with the mcs_unlock and mcs_lock pair, smp_mb__after_unlock_lock() should be used after mcs_lock. Reviewed-by: Paul E. 
McKenney Signed-off-by: Tim Chen Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1390347360.3138.63.camel@schen9-DESK Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 68 ++++++-------------------------------------------- 1 file changed, 7 insertions(+), 61 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index fbbd2eda867e..45fe1b5293d6 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -25,6 +25,7 @@ #include #include #include +#include /* * In the DEBUG case we are using the "NULL fastpath" for mutexes, @@ -52,7 +53,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) INIT_LIST_HEAD(&lock->wait_list); mutex_clear_owner(lock); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER - lock->spin_mlock = NULL; + lock->mcs_lock = NULL; #endif debug_mutex_init(lock, name, key); @@ -111,62 +112,7 @@ EXPORT_SYMBOL(mutex_lock); * more or less simultaneously, the spinners need to acquire a MCS lock * first before spinning on the owner field. * - * We don't inline mspin_lock() so that perf can correctly account for the - * time spent in this lock function. */ -struct mspin_node { - struct mspin_node *next ; - int locked; /* 1 if lock acquired */ -}; -#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock)) - -static noinline -void mspin_lock(struct mspin_node **lock, struct mspin_node *node) -{ - struct mspin_node *prev; - - /* Init node */ - node->locked = 0; - node->next = NULL; - - prev = xchg(lock, node); - if (likely(prev == NULL)) { - /* Lock acquired */ - node->locked = 1; - return; - } - ACCESS_ONCE(prev->next) = node; - /* - * Wait until the lock holder passes the lock down. - * Using smp_load_acquire() provides a memory barrier that - * ensures subsequent operations happen after the lock is acquired. - */ - while (!(smp_load_acquire(&node->locked))) - arch_mutex_cpu_relax(); -} - -static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node) -{ - struct mspin_node *next = ACCESS_ONCE(node->next); - - if (likely(!next)) { - /* - * Release the lock by setting it to NULL - */ - if (cmpxchg(lock, node, NULL) == node) - return; - /* Wait until the next pointer is set */ - while (!(next = ACCESS_ONCE(node->next))) - arch_mutex_cpu_relax(); - } - /* - * Pass lock to next waiter. - * smp_store_release() provides a memory barrier to ensure - * all operations in the critical section has been completed - * before unlocking. - */ - smp_store_release(&next->locked, 1); -} /* * Mutex spinning code migrated from kernel/sched/core.c @@ -456,7 +402,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, for (;;) { struct task_struct *owner; - struct mspin_node node; + struct mcs_spinlock node; if (use_ww_ctx && ww_ctx->acquired > 0) { struct ww_mutex *ww; @@ -478,10 +424,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * If there's an owner, wait for it to either * release the lock or go to sleep. 
*/ - mspin_lock(MLOCK(lock), &node); + mcs_spin_lock(&lock->mcs_lock, &node); owner = ACCESS_ONCE(lock->owner); if (owner && !mutex_spin_on_owner(lock, owner)) { - mspin_unlock(MLOCK(lock), &node); + mcs_spin_unlock(&lock->mcs_lock, &node); goto slowpath; } @@ -496,11 +442,11 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } mutex_set_owner(lock); - mspin_unlock(MLOCK(lock), &node); + mcs_spin_unlock(&lock->mcs_lock, &node); preempt_enable(); return 0; } - mspin_unlock(MLOCK(lock), &node); + mcs_spin_unlock(&lock->mcs_lock, &node); /* * When there's no owner, we might have preempted between the -- cgit v1.2.3 From fb9edbe98493fcd9df66de926ae9157cbe0e4dcd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 20 Jan 2014 19:20:06 +0100 Subject: lockdep: Make held_lock->check and "int check" argument bool The "int check" argument of lock_acquire() and held_lock->check are misleading. This is actually a boolean: 2 means "true", everything else is "false". And there is no need to pass 1 or 0 to lock_acquire() depending on CONFIG_PROVE_LOCKING, __lock_acquire() checks prove_locking at the start and clears "check" if !CONFIG_PROVE_LOCKING. Note: probably we can simply kill this member/arg. The only explicit user of check => 0 is rcu_lock_acquire(), perhaps we can change it to use lock_acquire(trylock =>, read => 2). __lockdep_no_validate means check => 0 implicitly, but we can change validate_chain() to check hlock->instance->key instead. Not to mention it would be nice to get rid of lockdep_set_novalidate_class(). Signed-off-by: Oleg Nesterov Cc: Dave Jones Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Paul McKenney Cc: Steven Rostedt Cc: Alan Stern Cc: Sasha Levin Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140120182006.GA26495@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index eb8a54783fa0..8c85a0da5a38 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2098,7 +2098,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, * (If lookup_chain_cache() returns with 1 it acquires * graph_lock for us) */ - if (!hlock->trylock && (hlock->check == 2) && + if (!hlock->trylock && hlock->check && lookup_chain_cache(curr, hlock, chain_key)) { /* * Check whether last held lock: @@ -3055,9 +3055,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int class_idx; u64 chain_key; - if (!prove_locking) - check = 1; - if (unlikely(!debug_locks)) return 0; @@ -3069,8 +3066,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; - if (lock->key == &__lockdep_no_validate__) - check = 1; + if (!prove_locking || lock->key == &__lockdep_no_validate__) + check = 0; if (subclass < NR_LOCKDEP_CACHING_CLASSES) class = lock->class_cache[subclass]; @@ -3138,7 +3135,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->holdtime_stamp = lockstat_clock(); #endif - if (check == 2 && !mark_irqflags(curr, hlock)) + if (check && !mark_irqflags(curr, hlock)) return 0; /* mark it as used: */ -- cgit v1.2.3 From 1b5ff816cab708ba44c7d7b56b613516269eb577 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 20 Jan 2014 19:20:10 +0100 Subject: lockdep: Don't create the wrong dependency on hlock->check == 0 Test-case: 
DEFINE_MUTEX(m1); DEFINE_MUTEX(m2); DEFINE_MUTEX(mx); void lockdep_should_complain(void) { lockdep_set_novalidate_class(&mx); // m1 -> mx -> m2 mutex_lock(&m1); mutex_lock(&mx); mutex_lock(&m2); mutex_unlock(&m2); mutex_unlock(&mx); mutex_unlock(&m1); // m2 -> m1 ; should trigger the warning mutex_lock(&m2); mutex_lock(&m1); mutex_unlock(&m1); mutex_unlock(&m2); } this doesn't trigger any warning, lockdep can't detect the trivial deadlock. This is because lock(&mx) correctly avoids m1 -> mx dependency, it skips validate_chain() due to mx->check == 0. But lock(&m2) wrongly adds mx -> m2 and thus m1 -> m2 is not created. rcu_lock_acquire()->lock_acquire(check => 0) is fine due to read == 2, so currently only __lockdep_no_validate__ can trigger this problem. Signed-off-by: Oleg Nesterov Cc: Dave Jones Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Paul McKenney Cc: Steven Rostedt Cc: Alan Stern Cc: Sasha Levin Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140120182010.GA26498@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 8c85a0da5a38..f7eba92cb97c 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1936,12 +1936,12 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) for (;;) { int distance = curr->lockdep_depth - depth + 1; - hlock = curr->held_locks + depth-1; + hlock = curr->held_locks + depth - 1; /* * Only non-recursive-read entries get new dependencies * added: */ - if (hlock->read != 2) { + if (hlock->read != 2 && hlock->check) { if (!check_prev_add(curr, hlock, next, distance, trylock_loop)) return 0; -- cgit v1.2.3 From 34d0ed5ea7a72d5961552fb1758a94f0d3f8f3dc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 20 Jan 2014 19:20:13 +0100 Subject: lockdep: Change mark_held_locks() to check hlock->check instead of lockdep_no_validate The __lockdep_no_validate check in mark_held_locks() adds the subtle and (afaics) unnecessary difference between no-validate and check==0. And this looks even more inconsistent because __lock_acquire() skips mark_irqflags()->mark_lock() if !check. Change mark_held_locks() to check hlock->check instead. Signed-off-by: Oleg Nesterov Cc: Dave Jones Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Paul McKenney Cc: Steven Rostedt Cc: Alan Stern Cc: Sasha Levin Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140120182013.GA26505@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index f7eba92cb97c..bf0c6b0dd9c5 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2517,7 +2517,7 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) BUG_ON(usage_bit >= LOCK_USAGE_STATES); - if (hlock_class(hlock)->key == __lockdep_no_validate__.subkeys) + if (!hlock->check) continue; if (!mark_lock(curr, hlock, usage_bit)) -- cgit v1.2.3 From c9122da1e2d29bd6a1475a0d1ce2aa6ac6ea25fa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Feb 2014 13:32:16 +0100 Subject: locking: Move mcs_spinlock.h into kernel/locking/ The mcs_spinlock code is not meant (or suitable) as a generic locking primitive, therefore take it away from the normal includes and place it in kernel/locking/. 
This way the locking primitives implemented there can use it as part of their implementation but we do not risk it getting used inappropriately. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-byirmpamgr7h25m5kyavwpzx@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/mcs_spinlock.h | 114 ++++++++++++++++++++++++++++++++++++++++++ kernel/locking/mutex.c | 2 +- 2 files changed, 115 insertions(+), 1 deletion(-) create mode 100644 kernel/locking/mcs_spinlock.h (limited to 'kernel/locking') diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h new file mode 100644 index 000000000000..f2a5c6360083 --- /dev/null +++ b/kernel/locking/mcs_spinlock.h @@ -0,0 +1,114 @@ +/* + * MCS lock defines + * + * This file contains the main data structure and API definitions of MCS lock. + * + * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock + * with the desirable properties of being fair, and with each cpu trying + * to acquire the lock spinning on a local variable. + * It avoids expensive cache bouncings that common test-and-set spin-lock + * implementations incur. + */ +#ifndef __LINUX_MCS_SPINLOCK_H +#define __LINUX_MCS_SPINLOCK_H +
+#include +
+struct mcs_spinlock { + struct mcs_spinlock *next; + int locked; /* 1 if lock acquired */ +}; + +#ifndef arch_mcs_spin_lock_contended +/* + * Using smp_load_acquire() provides a memory barrier that ensures + * subsequent operations happen after the lock is acquired. + */ +#define arch_mcs_spin_lock_contended(l) \ +do { \ + while (!(smp_load_acquire(l))) \ + arch_mutex_cpu_relax(); \ +} while (0) +#endif + +#ifndef arch_mcs_spin_unlock_contended +/* + * smp_store_release() provides a memory barrier to ensure all + * operations in the critical section has been completed before + * unlocking. + */ +#define arch_mcs_spin_unlock_contended(l) \ + smp_store_release((l), 1) +#endif + +/* + * Note: the smp_load_acquire/smp_store_release pair is not + * sufficient to form a full memory barrier across + * cpus for many architectures (except x86) for mcs_unlock and mcs_lock. + * For applications that need a full barrier across multiple cpus + * with mcs_unlock and mcs_lock pair, smp_mb__after_unlock_lock() should be + * used after mcs_lock. + */ + +/* + * In order to acquire the lock, the caller should declare a local node and + * pass a reference of the node to this function in addition to the lock. + * If the lock has already been acquired, then this will proceed to spin + * on this node->locked until the previous lock holder sets the node->locked + * in mcs_spin_unlock(). + * + * We don't inline mcs_spin_lock() so that perf can correctly account for the + * time spent in this lock function. + */ +static inline +void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) +{ + struct mcs_spinlock *prev; + + /* Init node */ + node->locked = 0; + node->next = NULL; + + prev = xchg(lock, node); + if (likely(prev == NULL)) { + /* + * Lock acquired, don't need to set node->locked to 1. Threads + * only spin on its own node->locked value for lock acquisition. + * However, since this thread can immediately acquire the lock + * and does not proceed to spin on its own node->locked, this + * value won't be used. If a debug mode is needed to + * audit lock status, then set node->locked value here. + */ + return; + } + ACCESS_ONCE(prev->next) = node; + + /* Wait until the lock holder passes the lock down. */ + arch_mcs_spin_lock_contended(&node->locked); +} + +/* + * Releases the lock. 
The caller should pass in the corresponding node that + * was used to acquire the lock. + */ +static inline +void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) +{ + struct mcs_spinlock *next = ACCESS_ONCE(node->next); + + if (likely(!next)) { + /* + * Release the lock by setting it to NULL + */ + if (likely(cmpxchg(lock, node, NULL) == node)) + return; + /* Wait until the next pointer is set */ + while (!(next = ACCESS_ONCE(node->next))) + arch_mutex_cpu_relax(); + } + + /* Pass lock to next waiter. */ + arch_mcs_spin_unlock_contended(&next->locked); +} + +#endif /* __LINUX_MCS_SPINLOCK_H */ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 45fe1b5293d6..4f408be39a07 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include "mcs_spinlock.h" /* * In the DEBUG case we are using the "NULL fastpath" for mutexes, -- cgit v1.2.3 From 46af29e479cc0c1c63633007993af5292c2c3e75 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Tue, 28 Jan 2014 11:13:12 -0800 Subject: locking/mutexes: Return false if task need_resched() in mutex_can_spin_on_owner() The mutex_can_spin_on_owner() function should also return false if the task needs to be rescheduled, so that we avoid entering the MCS queue when a reschedule is pending. Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra Cc: Waiman.Long@hp.com Cc: torvalds@linux-foundation.org Cc: tglx@linutronix.de Cc: riel@redhat.com Cc: akpm@linux-foundation.org Cc: davidlohr@hp.com Cc: hpa@zytor.com Cc: andi@firstfloor.org Cc: aswin@hp.com Cc: scott.norton@hp.com Cc: chegu_vinod@hp.com Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1390936396-3962-2-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/locking') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 4f408be39a07..e6d646b18d6c 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -166,6 +166,9 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) struct task_struct *owner; int retval = 1; + if (need_resched()) + return 0; + rcu_read_lock(); owner = ACCESS_ONCE(lock->owner); if (owner) -- cgit v1.2.3 From 47667fa1502e4d759df87e9cc7fbc0f202483361 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Tue, 28 Jan 2014 11:13:13 -0800 Subject: locking/mutexes: Modify the way optimistic spinners are queued The mutex->spin_mlock was introduced in order to ensure that only 1 thread spins for lock acquisition at a time to reduce cache line contention. When lock->owner is NULL and the lock->count is still not 1, the spinner(s) will continually release and obtain the lock->spin_mlock. This can generate quite a bit of overhead/contention, and also might just delay the spinner from getting the lock. This patch modifies the way optimistic spinners are queued by queuing before entering the optimistic spinning loop as opposed to acquiring before every call to mutex_spin_on_owner(). So in situations where the spinner requires a few extra spins before obtaining the lock, there will only be 1 spinner trying to get the lock, and it will avoid the overhead of unnecessarily unlocking and locking the spin_mlock. 
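For readers following the MCS changes outside the kernel tree, the queueing discipline behind the spin_mlock/mcs_spinlock used above can be sketched with portable C11 atomics. This is a minimal userspace sketch, not the kernel implementation: the mcs_node type, the mcs_lock()/mcs_unlock() names and the sched_yield() stand-in for arch_mutex_cpu_relax() are illustrative assumptions.

#include <sched.h>
#include <stdatomic.h>
#include <stddef.h>

struct mcs_node {
	_Atomic(struct mcs_node *) next;
	atomic_int locked;		/* 1 once the lock is handed to us */
};

static void mcs_lock(_Atomic(struct mcs_node *) *lock, struct mcs_node *node)
{
	struct mcs_node *prev;

	atomic_store_explicit(&node->next, NULL, memory_order_relaxed);
	atomic_store_explicit(&node->locked, 0, memory_order_relaxed);

	/* atomically append ourselves to the tail of the queue */
	prev = atomic_exchange_explicit(lock, node, memory_order_acq_rel);
	if (prev == NULL)
		return;			/* queue was empty: lock acquired */

	atomic_store_explicit(&prev->next, node, memory_order_release);

	/* spin on our own node until the predecessor passes the lock down */
	while (!atomic_load_explicit(&node->locked, memory_order_acquire))
		sched_yield();		/* stand-in for arch_mutex_cpu_relax() */
}

static void mcs_unlock(_Atomic(struct mcs_node *) *lock, struct mcs_node *node)
{
	struct mcs_node *next = atomic_load_explicit(&node->next, memory_order_acquire);

	if (next == NULL) {
		struct mcs_node *expected = node;

		/* no known successor: try to swing the tail back to empty */
		if (atomic_compare_exchange_strong(lock, &expected, NULL))
			return;
		/* a successor is mid-enqueue: wait for it to link itself in */
		while (!(next = atomic_load_explicit(&node->next, memory_order_acquire)))
			sched_yield();
	}
	/* the release store publishes our critical section to the successor */
	atomic_store_explicit(&next->locked, 1, memory_order_release);
}

The acquire load while spinning and the release store when handing the lock over play the same roles as the smp_load_acquire()/smp_store_release() pair introduced at the top of this series; as the mcs_spinlock.h comment notes, that pair does not form a full memory barrier across CPUs.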
Signed-off-by: Jason Low Cc: tglx@linutronix.de Cc: riel@redhat.com Cc: akpm@linux-foundation.org Cc: davidlohr@hp.com Cc: hpa@zytor.com Cc: andi@firstfloor.org Cc: aswin@hp.com Cc: scott.norton@hp.com Cc: chegu_vinod@hp.com Cc: Waiman.Long@hp.com Cc: paulmck@linux.vnet.ibm.com Cc: torvalds@linux-foundation.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1390936396-3962-3-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index e6d646b18d6c..82dad2ccd40b 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -403,9 +403,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if (!mutex_can_spin_on_owner(lock)) goto slowpath; + mcs_spin_lock(&lock->mcs_lock, &node); for (;;) { struct task_struct *owner; - struct mcs_spinlock node; if (use_ww_ctx && ww_ctx->acquired > 0) { struct ww_mutex *ww; @@ -420,19 +420,16 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * performed the optimistic spinning cannot be done. */ if (ACCESS_ONCE(ww->ctx)) - goto slowpath; + break; } /* * If there's an owner, wait for it to either * release the lock or go to sleep. */ - mcs_spin_lock(&lock->mcs_lock, &node); owner = ACCESS_ONCE(lock->owner); - if (owner && !mutex_spin_on_owner(lock, owner)) { - mcs_spin_unlock(&lock->mcs_lock, &node); - goto slowpath; - } + if (owner && !mutex_spin_on_owner(lock, owner)) + break; if ((atomic_read(&lock->count) == 1) && (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { @@ -449,7 +446,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } mutex_set_owner(lock); - mcs_spin_unlock(&lock->mcs_lock, &node); preempt_enable(); return 0; } - mcs_spin_unlock(&lock->mcs_lock, &node); /* * When there's no owner, we might have preempted between the @@ -458,7 +454,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * the owner complete. */ if (!owner && (need_resched() || rt_task(task))) - goto slowpath; + break; /* * The cpu_relax() call is a compiler barrier which forces @@ -468,6 +464,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, */ arch_mutex_cpu_relax(); } + mcs_spin_unlock(&lock->mcs_lock, &node); slowpath: #endif spin_lock_mutex(&lock->wait_lock, flags); -- cgit v1.2.3 From 1d8fe7dc8078b23e060ec62ccb4cdc1ac3c41bf8 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Tue, 28 Jan 2014 11:13:14 -0800 Subject: locking/mutexes: Unlock the mutex without the wait_lock When running workloads that have high contention in mutexes on an 8-socket machine, mutex spinners would often spin for a long time with no lock owner. The main reason this is occurring is in __mutex_unlock_common_slowpath(): if __mutex_slowpath_needs_to_unlock(), then the owner needs to acquire the mutex->wait_lock before releasing the mutex (setting lock->count to 1). When the wait_lock is contended, this delays the mutex from being released. We should be able to release the mutex without holding the wait_lock. 
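The reordering done by the patch below can be pictured with a simplified userspace analogue. The toy_mutex type, its fields and the omitted wake-up step are illustrative assumptions, not the kernel's API; the only point is the order of the two operations.

#include <pthread.h>
#include <stdatomic.h>

struct toy_mutex {
	atomic_int count;		/* 1: unlocked, 0 or negative: locked */
	pthread_mutex_t wait_lock;	/* protects the (omitted) waiter list */
};

static void toy_mutex_unlock_slowpath(struct toy_mutex *m)
{
	/*
	 * Release the lock word first: a spinner polling m->count can take
	 * the mutex immediately, even if wait_lock below is contended.
	 */
	atomic_store_explicit(&m->count, 1, memory_order_release);

	/* Waiter bookkeeping no longer delays the release above. */
	pthread_mutex_lock(&m->wait_lock);
	/* ... pick the first waiter off the list and wake it here ... */
	pthread_mutex_unlock(&m->wait_lock);
}

Before the change, the store that releases the lock word sat after the wait_lock acquisition, so a contended wait_lock held up every optimistic spinner.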
Signed-off-by: Jason Low Cc: chegu_vinod@hp.com Cc: paulmck@linux.vnet.ibm.com Cc: Waiman.Long@hp.com Cc: torvalds@linux-foundation.org Cc: tglx@linutronix.de Cc: riel@redhat.com Cc: akpm@linux-foundation.org Cc: davidlohr@hp.com Cc: hpa@zytor.com Cc: andi@firstfloor.org Cc: aswin@hp.com Cc: scott.norton@hp.com Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1390936396-3962-4-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 82dad2ccd40b..dc3d6f2bbe2a 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -671,10 +671,6 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) struct mutex *lock = container_of(lock_count, struct mutex, count); unsigned long flags; - spin_lock_mutex(&lock->wait_lock, flags); - mutex_release(&lock->dep_map, nested, _RET_IP_); - debug_mutex_unlock(lock); - /* * some architectures leave the lock unlocked in the fastpath failure * case, others need to leave it locked. In the later case we have to @@ -683,6 +679,10 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) if (__mutex_slowpath_needs_to_unlock()) atomic_set(&lock->count, 1); + spin_lock_mutex(&lock->wait_lock, flags); + mutex_release(&lock->dep_map, nested, _RET_IP_); + debug_mutex_unlock(lock); + if (!list_empty(&lock->wait_list)) { /* get the first entry from the wait-list: */ struct mutex_waiter *waiter = -- cgit v1.2.3 From fb0527bd5ea99bfeb2dd91e3c1433ecf745d6b99 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 Jan 2014 12:51:42 +0100 Subject: locking/mutexes: Introduce cancelable MCS lock for adaptive spinning Since we want a task waiting for a mutex_lock() to go to sleep and reschedule on need_resched() we must be able to abort the mcs_spin_lock() around the adaptive spin. Therefore implement a cancelable mcs lock. Signed-off-by: Peter Zijlstra Cc: chegu_vinod@hp.com Cc: paulmck@linux.vnet.ibm.com Cc: Waiman.Long@hp.com Cc: torvalds@linux-foundation.org Cc: tglx@linutronix.de Cc: riel@redhat.com Cc: akpm@linux-foundation.org Cc: davidlohr@hp.com Cc: hpa@zytor.com Cc: andi@firstfloor.org Cc: aswin@hp.com Cc: scott.norton@hp.com Cc: Jason Low Link: http://lkml.kernel.org/n/tip-62hcl5wxydmjzd182zhvk89m@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/Makefile | 2 +- kernel/locking/mcs_spinlock.c | 178 ++++++++++++++++++++++++++++++++++++++++++ kernel/locking/mcs_spinlock.h | 15 ++++ kernel/locking/mutex.c | 10 ++- 4 files changed, 200 insertions(+), 5 deletions(-) create mode 100644 kernel/locking/mcs_spinlock.c (limited to 'kernel/locking') diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index baab8e5e7f66..2a9ee96ecf00 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,5 +1,5 @@ -obj-y += mutex.o semaphore.o rwsem.o lglock.o +obj-y += mutex.o semaphore.o rwsem.o lglock.o mcs_spinlock.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = -pg diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c new file mode 100644 index 000000000000..838dc9e00669 --- /dev/null +++ b/kernel/locking/mcs_spinlock.c @@ -0,0 +1,178 @@ + +#include +#include +#include +#include "mcs_spinlock.h" + +#ifdef CONFIG_SMP + +/* + * An MCS like lock especially tailored for optimistic spinning for sleeping + * lock implementations (mutex, rwsem, etc). 
+ * + * Using a single mcs node per CPU is safe because sleeping locks should not be + * called from interrupt context and we have preemption disabled while + * spinning. + */ +static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node); + +/* + * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. + * Can return NULL in case we were the last queued and we updated @lock instead. + */ +static inline struct optimistic_spin_queue * +osq_wait_next(struct optimistic_spin_queue **lock, + struct optimistic_spin_queue *node, + struct optimistic_spin_queue *prev) +{ + struct optimistic_spin_queue *next = NULL; + + for (;;) { + if (*lock == node && cmpxchg(lock, node, prev) == node) { + /* + * We were the last queued, we moved @lock back. @prev + * will now observe @lock and will complete its + * unlock()/unqueue(). + */ + break; + } + + /* + * We must xchg() the @node->next value, because if we were to + * leave it in, a concurrent unlock()/unqueue() from + * @node->next might complete Step-A and think its @prev is + * still valid. + * + * If the concurrent unlock()/unqueue() wins the race, we'll + * wait for either @lock to point to us, through its Step-B, or + * wait for a new @node->next from its Step-C. + */ + if (node->next) { + next = xchg(&node->next, NULL); + if (next) + break; + } + + arch_mutex_cpu_relax(); + } + + return next; +} + +bool osq_lock(struct optimistic_spin_queue **lock) +{ + struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); + struct optimistic_spin_queue *prev, *next; + + node->locked = 0; + node->next = NULL; + + node->prev = prev = xchg(lock, node); + if (likely(prev == NULL)) + return true; + + ACCESS_ONCE(prev->next) = node; + + /* + * Normally @prev is untouchable after the above store; because at that + * moment unlock can proceed and wipe the node element from stack. + * + * However, since our nodes are static per-cpu storage, we're + * guaranteed their existence -- this allows us to apply + * cmpxchg in an attempt to undo our queueing. + */ + + while (!smp_load_acquire(&node->locked)) { + /* + * If we need to reschedule bail... so we can block. + */ + if (need_resched()) + goto unqueue; + + arch_mutex_cpu_relax(); + } + return true; + +unqueue: + /* + * Step - A -- stabilize @prev + * + * Undo our @prev->next assignment; this will make @prev's + * unlock()/unqueue() wait for a next pointer since @lock points to us + * (or later). + */ + + for (;;) { + if (prev->next == node && + cmpxchg(&prev->next, node, NULL) == node) + break; + + /* + * We can only fail the cmpxchg() racing against an unlock(), + * in which case we should observe @node->locked becomming + * true. + */ + if (smp_load_acquire(&node->locked)) + return true; + + arch_mutex_cpu_relax(); + + /* + * Or we race against a concurrent unqueue()'s step-B, in which + * case its step-C will write us a new @node->prev pointer. + */ + prev = ACCESS_ONCE(node->prev); + } + + /* + * Step - B -- stabilize @next + * + * Similar to unlock(), wait for @node->next or move @lock from @node + * back to @prev. + */ + + next = osq_wait_next(lock, node, prev); + if (!next) + return false; + + /* + * Step - C -- unlink + * + * @prev is stable because its still waiting for a new @prev->next + * pointer, @next is stable because our @node->next pointer is NULL and + * it will wait in Step-A. 
+ */ + ACCESS_ONCE(next->prev) = prev; + ACCESS_ONCE(prev->next) = next; + + return false; +} + +void osq_unlock(struct optimistic_spin_queue **lock) +{ + struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); + struct optimistic_spin_queue *next; + + /* + * Fast path for the uncontended case. + */ + if (likely(cmpxchg(lock, node, NULL) == node)) + return; + + /* + * Second most likely case. + */ + next = xchg(&node->next, NULL); + if (next) { + ACCESS_ONCE(next->locked) = 1; + return; + } + + next = osq_wait_next(lock, node, NULL); + if (next) + ACCESS_ONCE(next->locked) = 1; +} + +#endif + diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index f2a5c6360083..a2dbac4aca6b 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -111,4 +111,19 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) arch_mcs_spin_unlock_contended(&next->locked); } +/* + * Cancellable version of the MCS lock above. + * + * Intended for adaptive spinning of sleeping locks: + * mutex_lock()/rwsem_down_{read,write}() etc. + */ + +struct optimistic_spin_queue { + struct optimistic_spin_queue *next, *prev; + int locked; /* 1 if lock acquired */ +}; + +extern bool osq_lock(struct optimistic_spin_queue **lock); +extern void osq_unlock(struct optimistic_spin_queue **lock); + #endif /* __LINUX_MCS_SPINLOCK_H */ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index dc3d6f2bbe2a..2670b84067d6 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -53,7 +53,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) INIT_LIST_HEAD(&lock->wait_list); mutex_clear_owner(lock); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER - lock->mcs_lock = NULL; + lock->osq = NULL; #endif debug_mutex_init(lock, name, key); @@ -403,7 +403,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if (!mutex_can_spin_on_owner(lock)) goto slowpath; - mcs_spin_lock(&lock->mcs_lock, &node); + if (!osq_lock(&lock->osq)) + goto slowpath; + for (;;) { struct task_struct *owner; @@ -442,7 +444,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } mutex_set_owner(lock); - mcs_spin_unlock(&lock->mcs_lock, &node); + osq_unlock(&lock->osq); preempt_enable(); return 0; } @@ -464,7 +466,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, */ arch_mutex_cpu_relax(); } - mcs_spin_unlock(&lock->mcs_lock, &node); + osq_unlock(&lock->osq); slowpath: #endif spin_lock_mutex(&lock->wait_lock, flags); -- cgit v1.2.3 From 34c6bc2c919a55e5ad4e698510a2f35ee13ab900 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Feb 2014 16:21:09 +0100 Subject: locking/mutexes: Add extra reschedule point Add in an extra reschedule in an attempt to avoid getting rescheduled the moment we've acquired the lock. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-zah5eyn9gu7qlgwh9r6n2anc@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel/locking') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 2670b84067d6..02c61a9c8906 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -468,6 +468,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } osq_unlock(&lock->osq); slowpath: + /* + * If we fell out of the spin path because of need_resched(), + * reschedule now, before we try-lock the mutex. 
This avoids getting + * scheduled out right after we obtained the mutex. + */ + if (need_resched()) + schedule_preempt_disabled(); #endif spin_lock_mutex(&lock->wait_lock, flags); -- cgit v1.2.3 From 6f008e72cd111a119b5d8de8c5438d892aae99eb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 12 Mar 2014 13:24:42 +0100 Subject: locking/mutex: Fix debug checks OK, so commit: 1d8fe7dc8078 ("locking/mutexes: Unlock the mutex without the wait_lock") generates this boot warning when CONFIG_DEBUG_MUTEXES=y: WARNING: CPU: 0 PID: 139 at /usr/src/linux-2.6/kernel/locking/mutex-debug.c:82 debug_mutex_unlock+0x155/0x180() DEBUG_LOCKS_WARN_ON(lock->owner != current) And that makes sense, because as soon as we release the lock a new owner can come in... One would think that !__mutex_slowpath_needs_to_unlock() implementations suffer the same, but for DEBUG we fall back to mutex-null.h which has an unconditional 1 for that. The mutex debug code requires the mutex to be unlocked after doing the debug checks, otherwise it can find inconsistent state. Reported-by: Ingo Molnar Signed-off-by: Peter Zijlstra Cc: jason.low2@hp.com Link: http://lkml.kernel.org/r/20140312122442.GB27965@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/locking/mutex-debug.c | 6 ++++++ kernel/locking/mutex.c | 7 +++++++ 2 files changed, 13 insertions(+) (limited to 'kernel/locking') diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index faf6f5b53e77..e1191c996c59 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -83,6 +83,12 @@ void debug_mutex_unlock(struct mutex *lock) DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); mutex_clear_owner(lock); + + /* + * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug + * mutexes so that we can do it here after we've verified state. + */ + atomic_set(&lock->count, 1); } void debug_mutex_init(struct mutex *lock, const char *name, diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 02c61a9c8906..14fe72cc8ce7 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -34,6 +34,13 @@ #ifdef CONFIG_DEBUG_MUTEXES # include "mutex-debug.h" # include +/* + * Must be 0 for the debug case so we do not do the unlock outside of the + * wait_lock region. debug_mutex_unlock() will do the actual unlock in this + * case. + */ +# undef __mutex_slowpath_needs_to_unlock +# define __mutex_slowpath_needs_to_unlock() 0 #else # include "mutex.h" # include -- cgit v1.2.3
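Taken together, the series leaves the optimistic-spinning portion of __mutex_lock_common() in roughly the following shape. This is a condensed reader's sketch assembled from the hunks above (ww_mutex handling, lock statistics and the surrounding preempt accounting are omitted, and it is not meant to build on its own):

	if (!mutex_can_spin_on_owner(lock))	/* also bails when need_resched() is set */
		goto slowpath;

	if (!osq_lock(&lock->osq))		/* queue once on the cancelable MCS (osq) lock */
		goto slowpath;

	for (;;) {
		struct task_struct *owner = ACCESS_ONCE(lock->owner);

		/* the owner stopped running or we should reschedule: stop spinning */
		if (owner && !mutex_spin_on_owner(lock, owner))
			break;

		if (atomic_read(&lock->count) == 1 &&
		    atomic_cmpxchg(&lock->count, 1, 0) == 1) {
			/* acquired: mutex_set_owner(), osq_unlock(), return 0 */
		}

		if (!owner && (need_resched() || rt_task(task)))
			break;

		arch_mutex_cpu_relax();
	}
	osq_unlock(&lock->osq);
slowpath:
	/* reschedule before the try-lock, so we are not preempted right after it */
	if (need_resched())
		schedule_preempt_disabled();

The net effect is that a contending task queues on the cancelable MCS lock once, spins only while there is a running owner, and bails out to the sleeping slowpath as soon as need_resched() is set, rescheduling before it takes the wait_lock.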