From 133e89ef5ef338e1358b16246521ba17d935c396 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 13 May 2016 11:56:26 -0700 Subject: locking/rwsem: Enable lockless waiter wakeup(s) As wake_qs gain users, we can teach rwsems about them such that waiters can be awoken without the wait_lock. This is for both readers and writer, the former being the most ideal candidate as we can batch the wakeups shortening the critical region that much more -- ie writer task blocking a bunch of tasks waiting to service page-faults (mmap_sem readers). In general applying wake_qs to rwsem (xadd) is not difficult as the wait_lock is intended to be released soon _anyways_, with the exception of when a writer slowpath will proactively wakeup any queued readers if it sees that the lock is owned by a reader, in which we simply do the wakeups with the lock held (see comment in __rwsem_down_write_failed_common()). Similar to other locking primitives, delaying the waiter being awoken does allow, at least in theory, the lock to be stolen in the case of writers, however no harm was seen in this (in fact lock stealing tends to be a _good_ thing in most workloads), and this is a tiny window anyways. Some page-fault (pft) and mmap_sem intensive benchmarks show some pretty constant reduction in systime (by up to ~8 and ~10%) on a 2-socket, 12 core AMD box. In addition, on an 8-core Westmere doing page allocations (page_test) aim9: 4.6-rc6 4.6-rc6 rwsemv2 Min page_test 378167.89 ( 0.00%) 382613.33 ( 1.18%) Min exec_test 499.00 ( 0.00%) 502.67 ( 0.74%) Min fork_test 3395.47 ( 0.00%) 3537.64 ( 4.19%) Hmean page_test 395433.06 ( 0.00%) 414693.68 ( 4.87%) Hmean exec_test 499.67 ( 0.00%) 505.30 ( 1.13%) Hmean fork_test 3504.22 ( 0.00%) 3594.95 ( 2.59%) Stddev page_test 17426.57 ( 0.00%) 26649.92 (-52.93%) Stddev exec_test 0.47 ( 0.00%) 1.41 (-199.05%) Stddev fork_test 63.74 ( 0.00%) 32.59 ( 48.86%) Max page_test 429873.33 ( 0.00%) 456960.00 ( 6.30%) Max exec_test 500.33 ( 0.00%) 507.66 ( 1.47%) Max fork_test 3653.33 ( 0.00%) 3650.90 ( -0.07%) 4.6-rc6 4.6-rc6 rwsemv2 User 1.12 0.04 System 0.23 0.04 Elapsed 727.27 721.98 Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman.Long@hpe.com Cc: dave@stgolabs.net Cc: jason.low2@hp.com Cc: peter@hurleysoftware.com Link: http://lkml.kernel.org/r/1463165787-25937-2-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 58 ++++++++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 16 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 09e30c6225e5..80b05ac0f015 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -114,12 +114,16 @@ enum rwsem_wake_type { * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) * - there must be someone on the queue - * - the spinlock must be held by the caller + * - the wait_lock must be held by the caller + * - tasks are marked for wakeup, the caller must later invoke wake_up_q() + * to actually wakeup the blocked task(s) and drop the reference count, + * preferably when the wait_lock is released * - woken process blocks are discarded from the list after having task zeroed - * - writers are only woken if downgrading is false + * - writers are only marked woken if downgrading is false */ static struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) +__rwsem_mark_wake(struct rw_semaphore *sem, + enum rwsem_wake_type wake_type, struct wake_q_head *wake_q) { struct rwsem_waiter *waiter; struct task_struct *tsk; @@ -128,13 +132,16 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - if (wake_type == RWSEM_WAKE_ANY) - /* Wake writer at the front of the queue, but do not - * grant it the lock yet as we want other writers - * to be able to steal it. Readers, on the other hand, - * will block as they will notice the queued writer. + if (wake_type == RWSEM_WAKE_ANY) { + /* + * Mark writer at the front of the queue for wakeup. + * Until the task is actually later awoken later by + * the caller, other writers are able to steal it. + * Readers, on the other hand, will block as they + * will notice the queued writer. */ - wake_up_process(waiter->task); + wake_q_add(wake_q, waiter->task); + } goto out; } @@ -196,7 +203,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) */ smp_mb(); waiter->task = NULL; - wake_up_process(tsk); + wake_q_add(wake_q, tsk); put_task_struct(tsk); } while (--loop); @@ -216,6 +223,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; + WAKE_Q(wake_q); /* set up my own style of waitqueue */ waiter.task = tsk; @@ -238,9 +246,10 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) if (count == RWSEM_WAITING_BIAS || (count > RWSEM_WAITING_BIAS && adjustment != -RWSEM_ACTIVE_READ_BIAS)) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); + sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); /* wait to be given the lock */ while (true) { @@ -440,6 +449,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) bool waiting = true; /* any queued threads before us */ struct rwsem_waiter waiter; struct rw_semaphore *ret = sem; + WAKE_Q(wake_q); /* undo write bias from down_write operation, stop active locking */ count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); @@ -472,8 +482,19 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) * no active writers, the lock must be read owned; so we try to * wake any read locks that were queued ahead of us. */ - if (count > RWSEM_WAITING_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); + if (count > RWSEM_WAITING_BIAS) { + WAKE_Q(wake_q); + + sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); + /* + * The wakeup is normally called _after_ the wait_lock + * is released, but given that we are proactively waking + * readers we can deal with the wake_q overhead as it is + * similar to releasing and taking the wait_lock again + * for attempting rwsem_try_write_lock(). + */ + wake_up_q(&wake_q); + } } else count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); @@ -509,8 +530,9 @@ out_nolock: if (list_empty(&sem->wait_list)) rwsem_atomic_update(-RWSEM_WAITING_BIAS, sem); else - __rwsem_do_wake(sem, RWSEM_WAKE_ANY); + __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); return ERR_PTR(-EINTR); } @@ -537,6 +559,7 @@ __visible struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) { unsigned long flags; + WAKE_Q(wake_q); /* * If a spinner is present, it is not necessary to do the wakeup. @@ -573,9 +596,10 @@ locked: /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); + sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + wake_up_q(&wake_q); return sem; } @@ -590,14 +614,16 @@ __visible struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) { unsigned long flags; + WAKE_Q(wake_q); raw_spin_lock_irqsave(&sem->wait_lock, flags); /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); + sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + wake_up_q(&wake_q); return sem; } -- cgit v1.2.3 From e38513905eeaae59056eac2c9ac55a43b1fc41b2 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 13 May 2016 11:56:27 -0700 Subject: locking/rwsem: Rework zeroing reader waiter->task Readers that are awoken will expect a nil ->task indicating that a wakeup has occurred. Because of the way readers are implemented, there's a small chance that the waiter will never block in the slowpath (rwsem_down_read_failed), and therefore requires some form of reference counting to avoid the following scenario: rwsem_down_read_failed() rwsem_wake() get_task_struct(); spin_lock_irq(&wait_lock); list_add_tail(&waiter.list) spin_unlock_irq(&wait_lock); raw_spin_lock_irqsave(&wait_lock) __rwsem_do_wake() while (1) { set_task_state(TASK_UNINTERRUPTIBLE); waiter->task = NULL if (!waiter.task) // true break; schedule() // never reached __set_task_state(TASK_RUNNING); do_exit(); wake_up_process(tsk); // boom ... and therefore race with do_exit() when the caller returns. There is also a mismatch between the smp_mb() and its documentation, in that the serialization is done between reading the task and the nil store. Furthermore, in addition to having the overlapping of loads and stores to waiter->task guaranteed to be ordered within that CPU, both wake_up_process() originally and now wake_q_add() already imply barriers upon successful calls, which serves the comment. Now, as an alternative to perhaps inverting the checks in the blocker side (which has its own penalty in that schedule is unavoidable), with lockless wakeups this situation is naturally addressed and we can just use the refcount held by wake_q_add(), instead doing so explicitly. Of course, we must guarantee that the nil store is done as the _last_ operation in that the task must already be marked for deletion to not fall into the race above. Spurious wakeups are also handled transparently in that the task's reference is only removed when wake_up_q() is actually called _after_ the nil store. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman.Long@hpe.com Cc: dave@stgolabs.net Cc: jason.low2@hp.com Cc: peter@hurleysoftware.com Link: http://lkml.kernel.org/r/1463165787-25937-3-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 80b05ac0f015..fcbf75ac3dcb 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -194,17 +194,15 @@ __rwsem_mark_wake(struct rw_semaphore *sem, waiter = list_entry(next, struct rwsem_waiter, list); next = waiter->list.next; tsk = waiter->task; + + wake_q_add(wake_q, tsk); /* - * Make sure we do not wakeup the next reader before - * setting the nil condition to grant the next reader; - * otherwise we could miss the wakeup on the other - * side and end up sleeping again. See the pairing - * in rwsem_down_read_failed(). + * Ensure that the last operation is setting the reader + * waiter to nil such that rwsem_down_read_failed() cannot + * race with do_exit() by always holding a reference count + * to the task to wakeup. */ - smp_mb(); - waiter->task = NULL; - wake_q_add(wake_q, tsk); - put_task_struct(tsk); + smp_store_release(&waiter->task, NULL); } while (--loop); sem->wait_list.next = next; @@ -228,7 +226,6 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* set up my own style of waitqueue */ waiter.task = tsk; waiter.type = RWSEM_WAITING_FOR_READ; - get_task_struct(tsk); raw_spin_lock_irq(&sem->wait_lock); if (list_empty(&sem->wait_list)) -- cgit v1.2.3 From c0fcb6c2d332041256dc55d8a1ec3c0a2d0befb8 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Mon, 16 May 2016 17:38:00 -0700 Subject: locking/rwsem: Optimize write lock by reducing operations in slowpath When acquiring the rwsem write lock in the slowpath, we first try to set count to RWSEM_WAITING_BIAS. When that is successful, we then atomically add the RWSEM_WAITING_BIAS in cases where there are other tasks on the wait list. This causes write lock operations to often issue multiple atomic operations. We can instead make the list_is_singular() check first, and then set the count accordingly, so that we issue at most 1 atomic operation when acquiring the write lock and reduce unnecessary cacheline contention. Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Fenghua Yu Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Jason Low Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Matt Turner Cc: Paul E. McKenney Cc: Peter Hurley Cc: Peter Zijlstra Cc: Richard Henderson Cc: Terry Rudd Cc: Thomas Gleixner Cc: Tim Chen Cc: Tony Luck Link: http://lkml.kernel.org/r/1463445486-16078-2-git-send-email-jason.low2@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index fcbf75ac3dcb..b957da7fcb19 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -261,17 +261,28 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) } EXPORT_SYMBOL(rwsem_down_read_failed); +/* + * This function must be called with the sem->wait_lock held to prevent + * race conditions between checking the rwsem wait list and setting the + * sem->count accordingly. + */ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) { /* - * Try acquiring the write lock. Check count first in order - * to reduce unnecessary expensive cmpxchg() operations. + * Avoid trying to acquire write lock if count isn't RWSEM_WAITING_BIAS. */ - if (count == RWSEM_WAITING_BIAS && - cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, - RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { - if (!list_is_singular(&sem->wait_list)) - rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); + if (count != RWSEM_WAITING_BIAS) + return false; + + /* + * Acquire the lock by trying to set it to ACTIVE_WRITE_BIAS. If there + * are other tasks on the wait list, we need to add on WAITING_BIAS. + */ + count = list_is_singular(&sem->wait_list) ? + RWSEM_ACTIVE_WRITE_BIAS : + RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS; + + if (cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count) == RWSEM_WAITING_BIAS) { rwsem_set_owner(sem); return true; } -- cgit v1.2.3 From 6e2814745c67ab422b86262b05e6f23a56f28aa3 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Fri, 20 May 2016 15:19:36 -0700 Subject: locking/mutex: Set and clear owner using WRITE_ONCE() The mutex owner can get read and written to locklessly. Use WRITE_ONCE when setting and clearing the owner field in order to avoid optimizations such as store tearing. This avoids situations where the owner field gets written to with multiple stores and another thread could concurrently read and use a partially written owner value. Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Acked-by: Davidlohr Bueso Acked-by: Waiman Long Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Terry Rudd Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1463782776.2479.9.camel@j-VirtualBox Signed-off-by: Ingo Molnar --- kernel/locking/mutex-debug.h | 4 ++-- kernel/locking/mutex.h | 10 ++++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index 0799fd3e4cfa..372e6530180d 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h @@ -29,12 +29,12 @@ extern void debug_mutex_init(struct mutex *lock, const char *name, static inline void mutex_set_owner(struct mutex *lock) { - lock->owner = current; + WRITE_ONCE(lock->owner, current); } static inline void mutex_clear_owner(struct mutex *lock) { - lock->owner = NULL; + WRITE_ONCE(lock->owner, NULL); } #define spin_lock_mutex(lock, flags) \ diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 5cda397607f2..12f96199441c 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -17,14 +17,20 @@ __list_del((waiter)->list.prev, (waiter)->list.next) #ifdef CONFIG_MUTEX_SPIN_ON_OWNER +/* + * The mutex owner can get read and written to locklessly. + * We should use WRITE_ONCE when writing the owner value to + * avoid store tearing, otherwise, a thread could potentially + * read a partially written and incomplete owner value. + */ static inline void mutex_set_owner(struct mutex *lock) { - lock->owner = current; + WRITE_ONCE(lock->owner, current); } static inline void mutex_clear_owner(struct mutex *lock) { - lock->owner = NULL; + WRITE_ONCE(lock->owner, NULL); } #else static inline void mutex_set_owner(struct mutex *lock) -- cgit v1.2.3 From dfaaf3fa01d65cf6e2072965bb0b7aaa7285344f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 May 2016 18:31:33 +0200 Subject: locking/lockdep: Use __jhash_mix() for iterate_chain_key() Use __jhash_mix() to mix the class_idx into the class_key. This function provides better mixing than the previously used, home grown mix function. Leave hashing to the professionals :-) Suggested-by: George Spelvin Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 81f1a7107c0e..589d763a49b3 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -46,6 +46,7 @@ #include #include #include +#include #include @@ -309,10 +310,14 @@ static struct hlist_head chainhash_table[CHAINHASH_SIZE]; * It's a 64-bit hash, because it's important for the keys to be * unique. */ -#define iterate_chain_key(key1, key2) \ - (((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \ - ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \ - (key2)) +static inline u64 iterate_chain_key(u64 key, u32 idx) +{ + u32 k0 = key, k1 = key >> 32; + + __jhash_mix(idx, k0, k1); /* Macro that modifies arguments! */ + + return k0 | (u64)k1 << 32; +} void lockdep_off(void) { -- cgit v1.2.3 From a461d58792d4a46b8b10ae50973ec9b2763b694e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 27 May 2016 15:47:18 +0200 Subject: locking/rtmutex: Only warn once on a trylock from bad context One warning should be enough to get one motivated to fix this. It is possible that this happens more than once and that starts flooding the output. Later the prints will be suppressed so we only get half of it. Depending on the console system used it might not be helpful. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1464356838-1755-1-git-send-email-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 3e746607abe5..1ec0f48962b3 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1478,7 +1478,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); */ int __sched rt_mutex_trylock(struct rt_mutex *lock) { - if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq())) + if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq())) return 0; return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); -- cgit v1.2.3 From 8d53fa19041ae65c484d81d75179b4a577e6d8e4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Jun 2016 09:12:30 +0200 Subject: locking/qspinlock: Clarify xchg_tail() ordering While going over the code I noticed that xchg_tail() is a RELEASE but had no obvious pairing commented. It pairs with a somewhat unique address dependency through decode_tail(). So the store-release of xchg_tail() is paired by the address dependency of the load of xchg_tail followed by the dereference from the pointer computed from that load. The @old -> @prev transformation itself is pure, and therefore does not depend on external state, so that is immaterial wrt. ordering. Signed-off-by: Peter Zijlstra (Intel) Cc: Boqun Feng Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Pan Xinhui Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 5fc8c311b8fe..ee7deb08d43d 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -90,7 +90,7 @@ static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]); * therefore increment the cpu number by one. */ -static inline u32 encode_tail(int cpu, int idx) +static inline __pure u32 encode_tail(int cpu, int idx) { u32 tail; @@ -103,7 +103,7 @@ static inline u32 encode_tail(int cpu, int idx) return tail; } -static inline struct mcs_spinlock *decode_tail(u32 tail) +static inline __pure struct mcs_spinlock *decode_tail(u32 tail) { int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; @@ -455,6 +455,8 @@ queue: * pending stuff. * * p,*,* -> n,*,* + * + * RELEASE, such that the stores to @node must be complete. */ old = xchg_tail(lock, tail); next = NULL; @@ -465,6 +467,15 @@ queue: */ if (old & _Q_TAIL_MASK) { prev = decode_tail(old); + /* + * The above xchg_tail() is also a load of @lock which generates, + * through decode_tail(), a pointer. + * + * The address dependency matches the RELEASE of xchg_tail() + * such that the access to @prev must happen after. + */ + smp_read_barrier_depends(); + WRITE_ONCE(prev->next, node); pv_wait_node(node, prev); -- cgit v1.2.3 From 055ce0fd1b86c204430cbc0887165599d6e15090 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Jun 2016 10:36:53 +0200 Subject: locking/qspinlock: Add comments I figured we need to document the spin_is_locked() and spin_unlock_wait() constraints somwehere. Ideally 'someone' would rewrite Documentation/atomic_ops.txt and we could find a place in there. But currently that document is stale to the point of hardly being useful. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Boqun Feng Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Pan Xinhui Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) (limited to 'kernel/locking') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index ee7deb08d43d..2f9153b183c9 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -267,6 +267,63 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath #endif +/* + * Various notes on spin_is_locked() and spin_unlock_wait(), which are + * 'interesting' functions: + * + * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE + * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64, + * PPC). Also qspinlock has a similar issue per construction, the setting of + * the locked byte can be unordered acquiring the lock proper. + * + * This gets to be 'interesting' in the following cases, where the /should/s + * end up false because of this issue. + * + * + * CASE 1: + * + * So the spin_is_locked() correctness issue comes from something like: + * + * CPU0 CPU1 + * + * global_lock(); local_lock(i) + * spin_lock(&G) spin_lock(&L[i]) + * for (i) if (!spin_is_locked(&G)) { + * spin_unlock_wait(&L[i]); smp_acquire__after_ctrl_dep(); + * return; + * } + * // deal with fail + * + * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such + * that there is exclusion between the two critical sections. + * + * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from + * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i]) + * /should/ be constrained by the ACQUIRE from spin_lock(&G). + * + * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB. + * + * + * CASE 2: + * + * For spin_unlock_wait() there is a second correctness issue, namely: + * + * CPU0 CPU1 + * + * flag = set; + * smp_mb(); spin_lock(&l) + * spin_unlock_wait(&l); if (!flag) + * // add to lockless list + * spin_unlock(&l); + * // iterate lockless list + * + * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0 + * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE + * semantics etc..) + * + * Where flag /should/ be ordered against the locked store of l. + */ + /* * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before * issuing an _unordered_ store to set _Q_LOCKED_VAL. -- cgit v1.2.3 From 8ee62b1870be8e630158701632a533d0378e15b8 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Fri, 3 Jun 2016 22:26:02 -0700 Subject: locking/rwsem: Convert sem->count to 'atomic_long_t' Convert the rwsem count variable to an atomic_long_t since we use it as an atomic variable. This also allows us to remove the rwsem_atomic_{add,update}() "abstraction" which would now be an unnecesary level of indirection. In follow up patches, we also remove the rwsem_atomic_{add,update}() definitions across the various architectures. Suggested-by: Peter Zijlstra Signed-off-by: Jason Low [ Build warning fixes on various architectures. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Fenghua Yu Cc: Heiko Carstens Cc: Jason Low Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Paul E. McKenney Cc: Peter Hurley Cc: Terry Rudd Cc: Thomas Gleixner Cc: Tim Chen Cc: Tony Luck Cc: Waiman Long Link: http://lkml.kernel.org/r/1465017963-4839-2-git-send-email-jason.low2@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index b957da7fcb19..63b40a5c62ec 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -80,7 +80,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, debug_check_no_locks_freed((void *)sem, sizeof(*sem)); lockdep_init_map(&sem->dep_map, name, key, 0); #endif - sem->count = RWSEM_UNLOCKED_VALUE; + atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); #ifdef CONFIG_RWSEM_SPIN_ON_OWNER @@ -153,10 +153,11 @@ __rwsem_mark_wake(struct rw_semaphore *sem, if (wake_type != RWSEM_WAKE_READ_OWNED) { adjustment = RWSEM_ACTIVE_READ_BIAS; try_reader_grant: - oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; + oldcount = atomic_long_add_return(adjustment, &sem->count) - adjustment; + if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { /* A writer stole the lock. Undo our reader grant. */ - if (rwsem_atomic_update(-adjustment, sem) & + if (atomic_long_sub_return(adjustment, &sem->count) & RWSEM_ACTIVE_MASK) goto out; /* Last active locker left. Retry waking readers. */ @@ -186,7 +187,7 @@ __rwsem_mark_wake(struct rw_semaphore *sem, adjustment -= RWSEM_WAITING_BIAS; if (adjustment) - rwsem_atomic_add(adjustment, sem); + atomic_long_add(adjustment, &sem->count); next = sem->wait_list.next; loop = woken; @@ -233,7 +234,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) list_add_tail(&waiter.list, &sem->wait_list); /* we're now waiting on the lock, but no longer actively locking */ - count = rwsem_atomic_update(adjustment, sem); + count = atomic_long_add_return(adjustment, &sem->count); /* If there are no active locks, wake the front queued process(es). * @@ -282,7 +283,8 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) RWSEM_ACTIVE_WRITE_BIAS : RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS; - if (cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count) == RWSEM_WAITING_BIAS) { + if (atomic_long_cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count) + == RWSEM_WAITING_BIAS) { rwsem_set_owner(sem); return true; } @@ -296,13 +298,13 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) */ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) { - long old, count = READ_ONCE(sem->count); + long old, count = atomic_long_read(&sem->count); while (true) { if (!(count == 0 || count == RWSEM_WAITING_BIAS)) return false; - old = cmpxchg_acquire(&sem->count, count, + old = atomic_long_cmpxchg_acquire(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); if (old == count) { rwsem_set_owner(sem); @@ -324,7 +326,7 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) rcu_read_lock(); owner = READ_ONCE(sem->owner); if (!owner) { - long count = READ_ONCE(sem->count); + long count = atomic_long_read(&sem->count); /* * If sem->owner is not set, yet we have just recently entered the * slowpath with the lock being active, then there is a possibility @@ -375,7 +377,7 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) * held by readers. Check the counter to verify the * state. */ - count = READ_ONCE(sem->count); + count = atomic_long_read(&sem->count); return (count == 0 || count == RWSEM_WAITING_BIAS); } @@ -460,7 +462,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) WAKE_Q(wake_q); /* undo write bias from down_write operation, stop active locking */ - count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); + count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count); /* do optimistic spinning and steal lock if possible */ if (rwsem_optimistic_spin(sem)) @@ -483,7 +485,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) /* we're now waiting on the lock, but no longer actively locking */ if (waiting) { - count = READ_ONCE(sem->count); + count = atomic_long_read(&sem->count); /* * If there were already threads queued before us and there are @@ -505,7 +507,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) } } else - count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); + count = atomic_long_add_return(RWSEM_WAITING_BIAS, &sem->count); /* wait until we successfully acquire the lock */ set_current_state(state); @@ -521,7 +523,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) schedule(); set_current_state(state); - } while ((count = sem->count) & RWSEM_ACTIVE_MASK); + } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK); raw_spin_lock_irq(&sem->wait_lock); } @@ -536,7 +538,7 @@ out_nolock: raw_spin_lock_irq(&sem->wait_lock); list_del(&waiter.list); if (list_empty(&sem->wait_list)) - rwsem_atomic_update(-RWSEM_WAITING_BIAS, sem); + atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); else __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); -- cgit v1.2.3 From 19c5d690e41697fcdd19379ab9d10d8d37818414 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 17 May 2016 21:26:19 -0400 Subject: locking/rwsem: Add reader-owned state to the owner field Currently, it is not possible to determine for sure if a reader owns a rwsem by looking at the content of the rwsem data structure. This patch adds a new state RWSEM_READER_OWNED to the owner field to indicate that readers currently own the lock. This enables us to address the following 2 issues in the rwsem optimistic spinning code: 1) rwsem_can_spin_on_owner() will disallow optimistic spinning if the owner field is NULL which can mean either the readers own the lock or the owning writer hasn't set the owner field yet. In the latter case, we miss the chance to do optimistic spinning. 2) While a writer is waiting in the OSQ and a reader takes the lock, the writer will continue to spin when out of the OSQ in the main rwsem_optimistic_spin() loop as the owner field is NULL wasting CPU cycles if some of readers are sleeping. Adding the new state will allow optimistic spinning to go forward as long as the owner field is not RWSEM_READER_OWNED and the owner is running, if set, but stop immediately when that state has been reached. On a 4-socket Haswell machine running on a 4.6-rc1 based kernel, the fio test with multithreaded randrw and randwrite tests on the same file on a XFS partition on top of a NVDIMM were run, the aggregated bandwidths before and after the patch were as follows: Test BW before patch BW after patch % change ---- --------------- -------------- -------- randrw 988 MB/s 1192 MB/s +21% randwrite 1513 MB/s 1623 MB/s +7.3% The perf profile of the rwsem_down_write_failed() function in randrw before and after the patch were: 19.95% 5.88% fio [kernel.vmlinux] [k] rwsem_down_write_failed 14.20% 1.52% fio [kernel.vmlinux] [k] rwsem_down_write_failed The actual CPU cycles spend in rwsem_down_write_failed() dropped from 5.88% to 1.52% after the patch. The xfstests was also run and no regression was observed. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Acked-by: Jason Low Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Dave Chinner Cc: Douglas Hatch Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Hurley Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1463534783-38814-2-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 41 ++++++++++++++++++++++------------------- kernel/locking/rwsem.c | 8 ++++++-- kernel/locking/rwsem.h | 41 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 21 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 63b40a5c62ec..6b0d0605910e 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -163,6 +163,12 @@ __rwsem_mark_wake(struct rw_semaphore *sem, /* Last active locker left. Retry waking readers. */ goto try_reader_grant; } + /* + * It is not really necessary to set it to reader-owned here, + * but it gives the spinners an early indication that the + * readers now have the lock. + */ + rwsem_set_reader_owned(sem); } /* Grant an infinite number of read locks to the readers at the front @@ -325,16 +331,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) rcu_read_lock(); owner = READ_ONCE(sem->owner); - if (!owner) { - long count = atomic_long_read(&sem->count); + if (!rwsem_owner_is_writer(owner)) { /* - * If sem->owner is not set, yet we have just recently entered the - * slowpath with the lock being active, then there is a possibility - * reader(s) may have the lock. To be safe, bail spinning in these - * situations. + * Don't spin if the rwsem is readers owned. */ - if (count & RWSEM_ACTIVE_MASK) - ret = false; + ret = !rwsem_owner_is_reader(owner); goto done; } @@ -347,8 +348,6 @@ done: static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) { - long count; - rcu_read_lock(); while (sem->owner == owner) { /* @@ -369,16 +368,11 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) } rcu_read_unlock(); - if (READ_ONCE(sem->owner)) - return true; /* new owner, continue spinning */ - /* - * When the owner is not set, the lock could be free or - * held by readers. Check the counter to verify the - * state. + * If there is a new owner or the owner is not set, we continue + * spinning. */ - count = atomic_long_read(&sem->count); - return (count == 0 || count == RWSEM_WAITING_BIAS); + return !rwsem_owner_is_reader(READ_ONCE(sem->owner)); } static bool rwsem_optimistic_spin(struct rw_semaphore *sem) @@ -397,7 +391,16 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) while (true) { owner = READ_ONCE(sem->owner); - if (owner && !rwsem_spin_on_owner(sem, owner)) + /* + * Don't spin if + * 1) the owner is a reader as we we can't determine if the + * reader is actively running or not. + * 2) The rwsem_spin_on_owner() returns false which means + * the owner isn't running. + */ + if (rwsem_owner_is_reader(owner) || + (rwsem_owner_is_writer(owner) && + !rwsem_spin_on_owner(sem, owner))) break; /* wait_lock will be acquired if write_lock is obtained */ diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 2e853ad93a3a..45ba475d4be3 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -22,6 +22,7 @@ void __sched down_read(struct rw_semaphore *sem) rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_read_trylock, __down_read); + rwsem_set_reader_owned(sem); } EXPORT_SYMBOL(down_read); @@ -33,8 +34,10 @@ int down_read_trylock(struct rw_semaphore *sem) { int ret = __down_read_trylock(sem); - if (ret == 1) + if (ret == 1) { rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); + rwsem_set_reader_owned(sem); + } return ret; } @@ -124,7 +127,7 @@ void downgrade_write(struct rw_semaphore *sem) * lockdep: a downgraded write will live on as a write * dependency. */ - rwsem_clear_owner(sem); + rwsem_set_reader_owned(sem); __downgrade_write(sem); } @@ -138,6 +141,7 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_read_trylock, __down_read); + rwsem_set_reader_owned(sem); } EXPORT_SYMBOL(down_read_nested); diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 870ed9a5b426..8f43ba234787 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -1,3 +1,20 @@ +/* + * The owner field of the rw_semaphore structure will be set to + * RWSEM_READ_OWNED when a reader grabs the lock. A writer will clear + * the owner field when it unlocks. A reader, on the other hand, will + * not touch the owner field when it unlocks. + * + * In essence, the owner field now has the following 3 states: + * 1) 0 + * - lock is free or the owner hasn't set the field yet + * 2) RWSEM_READER_OWNED + * - lock is currently or previously owned by readers (lock is free + * or not set by owner yet) + * 3) Other non-zero value + * - a writer owns the lock + */ +#define RWSEM_READER_OWNED ((struct task_struct *)1UL) + #ifdef CONFIG_RWSEM_SPIN_ON_OWNER static inline void rwsem_set_owner(struct rw_semaphore *sem) { @@ -9,6 +26,26 @@ static inline void rwsem_clear_owner(struct rw_semaphore *sem) sem->owner = NULL; } +static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) +{ + /* + * We check the owner value first to make sure that we will only + * do a write to the rwsem cacheline when it is really necessary + * to minimize cacheline contention. + */ + if (sem->owner != RWSEM_READER_OWNED) + sem->owner = RWSEM_READER_OWNED; +} + +static inline bool rwsem_owner_is_writer(struct task_struct *owner) +{ + return owner && owner != RWSEM_READER_OWNED; +} + +static inline bool rwsem_owner_is_reader(struct task_struct *owner) +{ + return owner == RWSEM_READER_OWNED; +} #else static inline void rwsem_set_owner(struct rw_semaphore *sem) { @@ -17,4 +54,8 @@ static inline void rwsem_set_owner(struct rw_semaphore *sem) static inline void rwsem_clear_owner(struct rw_semaphore *sem) { } + +static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) +{ +} #endif -- cgit v1.2.3 From fb6a44f33be542fd81575ff93a4e8118d6a58592 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 17 May 2016 21:26:20 -0400 Subject: locking/rwsem: Protect all writes to owner by WRITE_ONCE() Without using WRITE_ONCE(), the compiler can potentially break a write into multiple smaller ones (store tearing). So a read from the same data by another task concurrently may return a partial result. This can result in a kernel crash if the data is a memory address that is being dereferenced. This patch changes all write to rwsem->owner to use WRITE_ONCE() to make sure that store tearing will not happen. READ_ONCE() may not be needed for rwsem->owner as long as the value is only used for comparison and not dereferencing. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Dave Chinner Cc: Davidlohr Bueso Cc: Douglas Hatch Cc: Jason Low Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Hurley Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1463534783-38814-3-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 8f43ba234787..a699f4048ba1 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -16,14 +16,21 @@ #define RWSEM_READER_OWNED ((struct task_struct *)1UL) #ifdef CONFIG_RWSEM_SPIN_ON_OWNER +/* + * All writes to owner are protected by WRITE_ONCE() to make sure that + * store tearing can't happen as optimistic spinners may read and use + * the owner value concurrently without lock. Read from owner, however, + * may not need READ_ONCE() as long as the pointer value is only used + * for comparison and isn't being dereferenced. + */ static inline void rwsem_set_owner(struct rw_semaphore *sem) { - sem->owner = current; + WRITE_ONCE(sem->owner, current); } static inline void rwsem_clear_owner(struct rw_semaphore *sem) { - sem->owner = NULL; + WRITE_ONCE(sem->owner, NULL); } static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) @@ -34,7 +41,7 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) * to minimize cacheline contention. */ if (sem->owner != RWSEM_READER_OWNED) - sem->owner = RWSEM_READER_OWNED; + WRITE_ONCE(sem->owner, RWSEM_READER_OWNED); } static inline bool rwsem_owner_is_writer(struct task_struct *owner) -- cgit v1.2.3 From bf7b4c472db44413251bcef79ca1f6bf1ec81475 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 17 May 2016 21:26:22 -0400 Subject: locking/rwsem: Improve reader wakeup code In __rwsem_do_wake(), the reader wakeup code will assume a writer has stolen the lock if the active reader/writer count is not 0. However, this is not as reliable an indicator as the original "< RWSEM_WAITING_BIAS" check. If another reader is present, the code will still break out and exit even if the writer is gone. This patch changes it to check the same "< RWSEM_WAITING_BIAS" condition to reduce the chance of false positive. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Peter Hurley Cc: Andrew Morton Cc: Dave Chinner Cc: Davidlohr Bueso Cc: Douglas Hatch Cc: Jason Low Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1463534783-38814-5-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 6b0d0605910e..4f1daf5a472d 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -156,9 +156,14 @@ __rwsem_mark_wake(struct rw_semaphore *sem, oldcount = atomic_long_add_return(adjustment, &sem->count) - adjustment; if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { - /* A writer stole the lock. Undo our reader grant. */ - if (atomic_long_sub_return(adjustment, &sem->count) & - RWSEM_ACTIVE_MASK) + /* + * If the count is still less than RWSEM_WAITING_BIAS + * after removing the adjustment, it is assumed that + * a writer has stolen the lock. We have to undo our + * reader grant. + */ + if (atomic_long_add_return(-adjustment, &sem->count) < + RWSEM_WAITING_BIAS) goto out; /* Last active locker left. Retry waking readers. */ goto try_reader_grant; -- cgit v1.2.3 From ddd0fa73c2b71c35de4fe7ae60a5f1a6cddc2cf0 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 17 May 2016 21:26:23 -0400 Subject: locking/rwsem: Streamline the rwsem_optimistic_spin() code This patch moves the owner loading and checking code entirely inside of rwsem_spin_on_owner() to simplify the logic of rwsem_optimistic_spin() loop. Suggested-by: Peter Hurley Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Peter Hurley Cc: Andrew Morton Cc: Dave Chinner Cc: Davidlohr Bueso Cc: Douglas Hatch Cc: Jason Low Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1463534783-38814-6-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 4f1daf5a472d..2031281bb940 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -350,9 +350,16 @@ done: return ret; } -static noinline -bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) +/* + * Return true only if we can still spin on the owner field of the rwsem. + */ +static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) { + struct task_struct *owner = READ_ONCE(sem->owner); + + if (!rwsem_owner_is_writer(owner)) + goto out; + rcu_read_lock(); while (sem->owner == owner) { /* @@ -372,7 +379,7 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) cpu_relax_lowlatency(); } rcu_read_unlock(); - +out: /* * If there is a new owner or the owner is not set, we continue * spinning. @@ -382,7 +389,6 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) static bool rwsem_optimistic_spin(struct rw_semaphore *sem) { - struct task_struct *owner; bool taken = false; preempt_disable(); @@ -394,21 +400,17 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) if (!osq_lock(&sem->osq)) goto done; - while (true) { - owner = READ_ONCE(sem->owner); + /* + * Optimistically spin on the owner field and attempt to acquire the + * lock whenever the owner changes. Spinning will be stopped when: + * 1) the owning writer isn't running; or + * 2) readers own the lock as we can't determine if they are + * actively running or not. + */ + while (rwsem_spin_on_owner(sem)) { /* - * Don't spin if - * 1) the owner is a reader as we we can't determine if the - * reader is actively running or not. - * 2) The rwsem_spin_on_owner() returns false which means - * the owner isn't running. + * Try to acquire the lock */ - if (rwsem_owner_is_reader(owner) || - (rwsem_owner_is_writer(owner) && - !rwsem_spin_on_owner(sem, owner))) - break; - - /* wait_lock will be acquired if write_lock is obtained */ if (rwsem_try_write_lock_unqueued(sem)) { taken = true; break; @@ -420,7 +422,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) * we're an RT task that will live-lock because we won't let * the owner complete. */ - if (!owner && (need_resched() || rt_task(current))) + if (!sem->owner && (need_resched() || rt_task(current))) break; /* -- cgit v1.2.3 From 1f03e8d2919270bd6ef64f39a45ce8df8a9f012a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Apr 2016 10:57:12 +0200 Subject: locking/barriers: Replace smp_cond_acquire() with smp_cond_load_acquire() This new form allows using hardware assisted waiting. Some hardware (ARM64 and x86) allow monitoring an address for changes, so by providing a pointer we can use this to replace the cpu_relax() with hardware optimized methods in the future. Requested-by: Will Deacon Suggested-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 2f9153b183c9..1b8dda90ebfa 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -475,7 +475,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) * sequentiality; this is because not all clear_pending_set_locked() * implementations imply full barriers. */ - smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK)); + smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK)); /* * take ownership and clear the pending bit. @@ -562,7 +562,7 @@ queue: * * The PV pv_wait_head_or_lock function, if active, will acquire * the lock and return a non-zero value. So we have to skip the - * smp_cond_acquire() call. As the next PV queue head hasn't been + * smp_cond_load_acquire() call. As the next PV queue head hasn't been * designated yet, there is no way for the locked value to become * _Q_SLOW_VAL. So both the set_locked() and the * atomic_cmpxchg_relaxed() calls will be safe. @@ -573,7 +573,7 @@ queue: if ((val = pv_wait_head_or_lock(lock, node))) goto locked; - smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK)); + val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK)); locked: /* @@ -593,9 +593,9 @@ locked: break; } /* - * The smp_cond_acquire() call above has provided the necessary - * acquire semantics required for locking. At most two - * iterations of this loop may be ran. + * The smp_cond_load_acquire() call above has provided the + * necessary acquire semantics required for locking. At most + * two iterations of this loop may be ran. */ old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL); if (old == val) -- cgit v1.2.3 From 33ac279677dcc2441cb93d8cb9cf7a74df62814d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 May 2016 13:17:12 +0200 Subject: locking/barriers: Introduce smp_acquire__after_ctrl_dep() Introduce smp_acquire__after_ctrl_dep(), this construct is not uncommon, but the lack of this barrier is. Use it to better express smp_rmb() uses in WRITE_ONCE(), the IPC semaphore code and the qspinlock code. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/locking') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 1b8dda90ebfa..730655533440 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -379,7 +379,7 @@ void queued_spin_unlock_wait(struct qspinlock *lock) cpu_relax(); done: - smp_rmb(); /* CTRL + RMB -> ACQUIRE */ + smp_acquire__after_ctrl_dep(); } EXPORT_SYMBOL(queued_spin_unlock_wait); #endif -- cgit v1.2.3 From e37837fb62f95a81bdcefa86ceea043df84937d7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 18 Apr 2016 01:01:27 +0200 Subject: locking/atomic: Remove the deprecated atomic_{set,clear}_mask() functions These functions have been deprecated for a while and there is only the one user left, convert and kill. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Boqun Feng Cc: Davidlohr Bueso Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock_paravirt.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 21ede57f68b3..37649e69056c 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -112,12 +112,12 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock) #else /* _Q_PENDING_BITS == 8 */ static __always_inline void set_pending(struct qspinlock *lock) { - atomic_set_mask(_Q_PENDING_VAL, &lock->val); + atomic_or(_Q_PENDING_VAL, &lock->val); } static __always_inline void clear_pending(struct qspinlock *lock) { - atomic_clear_mask(_Q_PENDING_VAL, &lock->val); + atomic_andnot(_Q_PENDING_VAL, &lock->val); } static __always_inline int trylock_clear_pending(struct qspinlock *lock) -- cgit v1.2.3 From f9852b74bec0117b888da39d070c323ea1cb7f4c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 18 Apr 2016 01:27:03 +0200 Subject: locking/atomic, arch/qrwlock: Employ atomic_fetch_add_acquire() The only reason for the current code is to make GCC emit only the "LOCK XADD" instruction on x86 (and not do a pointless extra ADD on the result), do so nicer. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/qrwlock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/locking') diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index fec082338668..19248ddf37ce 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -93,7 +93,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) * that accesses can't leak upwards out of our subsequent critical * section in the case that the lock is currently held for write. */ - cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS; + cnts = atomic_fetch_add_acquire(_QR_BIAS, &lock->cnts); rspin_until_writer_unlock(lock, cnts); /* -- cgit v1.2.3 From 86a3b5f34fc1fb307abef4fde76bebd3edce0324 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 18 May 2016 12:42:21 +0200 Subject: locking/atomic, arch/rwsem: Employ atomic_long_fetch_add() Now that we have fetch_add() we can stop using add_return() - val. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Jason Low Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: linux-arch@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 2031281bb940..447e08de1fab 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -153,7 +153,7 @@ __rwsem_mark_wake(struct rw_semaphore *sem, if (wake_type != RWSEM_WAKE_READ_OWNED) { adjustment = RWSEM_ACTIVE_READ_BIAS; try_reader_grant: - oldcount = atomic_long_add_return(adjustment, &sem->count) - adjustment; + oldcount = atomic_long_fetch_add(adjustment, &sem->count); if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { /* -- cgit v1.2.3 From 0dceeaf599e6d9b8bd908ba4bd3dfee84aa26be2 Mon Sep 17 00:00:00 2001 From: Pan Xinhui Date: Tue, 14 Jun 2016 14:37:27 +0800 Subject: locking/qspinlock: Use __this_cpu_dec() instead of full-blown this_cpu_dec() queued_spin_lock_slowpath() should not worry about another queued_spin_lock_slowpath() running in interrupt context and changing node->count by accident, because node->count keeps the same value every time we enter/leave queued_spin_lock_slowpath(). On some architectures this_cpu_dec() will save/restore irq flags, which has high overhead. Use the much cheaper __this_cpu_dec() instead. Signed-off-by: Pan Xinhui Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman.Long@hpe.com Link: http://lkml.kernel.org/r/1465886247-3773-1-git-send-email-xinhui.pan@linux.vnet.ibm.com [ Rewrote changelog. ] Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/locking') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 730655533440..b2caec7315af 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -619,7 +619,7 @@ release: /* * release the node */ - this_cpu_dec(mcs_nodes[0].count); + __this_cpu_dec(mcs_nodes[0].count); } EXPORT_SYMBOL(queued_spin_lock_slowpath); -- cgit v1.2.3