From 64a5e3cb308028dba0676daae0a7821d770036fa Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 14 Jul 2016 14:26:11 +0200
Subject: locking/qspinlock: Improve readability

Restructure pv_queued_spin_steal_lock() as I found it hard to read.

Signed-off-by: Peter Zijlstra (Intel)
Cc: Andrew Morton
Cc: Linus Torvalds
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: Waiman Long
---
 kernel/locking/qspinlock_paravirt.h | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel/locking')

diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 8a99abf58080..429c3dc2a5f3 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -70,11 +70,14 @@ struct pv_node {
 static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
 {
 	struct __qspinlock *l = (void *)lock;
-	int ret = !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
-		  (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
 
-	qstat_inc(qstat_pv_lock_stealing, ret);
-	return ret;
+	if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
+	    (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
+		qstat_inc(qstat_pv_lock_stealing, true);
+		return true;
+	}
+
+	return false;
 }
 
 /*
@@ -257,7 +260,6 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
 
 static inline bool pv_wait_early(struct pv_node *prev, int loop)
 {
-
 	if ((loop & PV_PREV_CHECK_MASK) != 0)
 		return false;
 
--
cgit v1.2.3

From 08be8f63c40c030b5cf95b4368e314e563a86301 Mon Sep 17 00:00:00 2001
From: Waiman Long
Date: Tue, 31 May 2016 12:53:47 -0400
Subject: locking/pvstat: Separate wait_again and spurious wakeup stats

Currently there is overlap between the pvqspinlock wait_again and
spurious_wakeup stat counters. Because of lock stealing, it is no longer
possible to accurately determine whether a spurious wakeup has happened in
the queue head. As they track both the queue node and queue head status,
it is also hard to tell how many of those come from the queue head and how
many from the queue node.

This patch changes the accounting rules so that spurious wakeup is only
tracked in the queue node. The wait_again count, however, is only tracked
in the queue head when the vCPU fails to acquire the lock after a vCPU
kick. This should give a much better indication of the wait-kick dynamics
in the queue node and the queue head.

Signed-off-by: Waiman Long
Signed-off-by: Peter Zijlstra (Intel)
Cc: Andrew Morton
Cc: Boqun Feng
Cc: Douglas Hatch
Cc: Linus Torvalds
Cc: Pan Xinhui
Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1464713631-1066-2-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock_paravirt.h | 12 +++--------- kernel/locking/qspinlock_stat.h | 4 ++-- 2 files changed, 5 insertions(+), 11 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 429c3dc2a5f3..3acf16d79cf4 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -288,12 +288,10 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) { struct pv_node *pn = (struct pv_node *)node; struct pv_node *pp = (struct pv_node *)prev; - int waitcnt = 0; int loop; bool wait_early; - /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */ - for (;; waitcnt++) { + for (;;) { for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) { if (READ_ONCE(node->locked)) return; @@ -317,7 +315,6 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) if (!READ_ONCE(node->locked)) { qstat_inc(qstat_pv_wait_node, true); - qstat_inc(qstat_pv_wait_again, waitcnt); qstat_inc(qstat_pv_wait_early, wait_early); pv_wait(&pn->state, vcpu_halted); } @@ -458,12 +455,9 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) pv_wait(&l->locked, _Q_SLOW_VAL); /* - * The unlocker should have freed the lock before kicking the - * CPU. So if the lock is still not free, it is a spurious - * wakeup or another vCPU has stolen the lock. The current - * vCPU should spin again. + * Because of lock stealing, the queue head vCPU may not be + * able to acquire the lock before it has to wait again. */ - qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked)); } /* diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index b9d031516254..eb0a599fcf58 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -24,8 +24,8 @@ * pv_latency_wake - average latency (ns) from vCPU kick to wakeup * pv_lock_slowpath - # of locking operations via the slowpath * pv_lock_stealing - # of lock stealing operations - * pv_spurious_wakeup - # of spurious wakeups - * pv_wait_again - # of vCPU wait's that happened after a vCPU kick + * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs + * pv_wait_again - # of wait's after a queue head vCPU kick * pv_wait_early - # of early vCPU wait's * pv_wait_head - # of vCPU wait's at the queue head * pv_wait_node - # of vCPU wait's at a non-head queue node -- cgit v1.2.3 From 80127a39681bd68c959f0953f84a830cbd7c3b1c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Jul 2016 20:08:46 +0200 Subject: locking/percpu-rwsem: Optimize readers and reduce global impact Currently the percpu-rwsem switches to (global) atomic ops while a writer is waiting; which could be quite a while and slows down releasing the readers. This patch cures this problem by ordering the reader-state vs reader-count (see the comments in __percpu_down_read() and percpu_down_write()). This changes a global atomic op into a full memory barrier, which doesn't have the global cacheline contention. This also enables using the percpu-rwsem with rcu_sync disabled in order to bias the implementation differently, reducing the writer latency by adding some cost to readers. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Oleg Nesterov Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Paul McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org [ Fixed modular build. ] Signed-off-by: Ingo Molnar --- kernel/locking/percpu-rwsem.c | 228 ++++++++++++++++++++++++------------------ 1 file changed, 131 insertions(+), 97 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index bec0b647f9cc..ce182599cf2e 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -8,152 +8,186 @@ #include #include -int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, +int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, const char *name, struct lock_class_key *rwsem_key) { - brw->fast_read_ctr = alloc_percpu(int); - if (unlikely(!brw->fast_read_ctr)) + sem->read_count = alloc_percpu(int); + if (unlikely(!sem->read_count)) return -ENOMEM; /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ - __init_rwsem(&brw->rw_sem, name, rwsem_key); - rcu_sync_init(&brw->rss, RCU_SCHED_SYNC); - atomic_set(&brw->slow_read_ctr, 0); - init_waitqueue_head(&brw->write_waitq); + rcu_sync_init(&sem->rss, RCU_SCHED_SYNC); + __init_rwsem(&sem->rw_sem, name, rwsem_key); + init_waitqueue_head(&sem->writer); + sem->readers_block = 0; return 0; } EXPORT_SYMBOL_GPL(__percpu_init_rwsem); -void percpu_free_rwsem(struct percpu_rw_semaphore *brw) +void percpu_free_rwsem(struct percpu_rw_semaphore *sem) { /* * XXX: temporary kludge. The error path in alloc_super() * assumes that percpu_free_rwsem() is safe after kzalloc(). */ - if (!brw->fast_read_ctr) + if (!sem->read_count) return; - rcu_sync_dtor(&brw->rss); - free_percpu(brw->fast_read_ctr); - brw->fast_read_ctr = NULL; /* catch use after free bugs */ + rcu_sync_dtor(&sem->rss); + free_percpu(sem->read_count); + sem->read_count = NULL; /* catch use after free bugs */ } EXPORT_SYMBOL_GPL(percpu_free_rwsem); -/* - * This is the fast-path for down_read/up_read. If it succeeds we rely - * on the barriers provided by rcu_sync_enter/exit; see the comments in - * percpu_down_write() and percpu_up_write(). - * - * If this helper fails the callers rely on the normal rw_semaphore and - * atomic_dec_and_test(), so in this case we have the necessary barriers. - */ -static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) +int __percpu_down_read(struct percpu_rw_semaphore *sem, int try) { - bool success; + /* + * Due to having preemption disabled the decrement happens on + * the same CPU as the increment, avoiding the + * increment-on-one-CPU-and-decrement-on-another problem. + * + * If the reader misses the writer's assignment of readers_block, then + * the writer is guaranteed to see the reader's increment. + * + * Conversely, any readers that increment their sem->read_count after + * the writer looks are guaranteed to see the readers_block value, + * which in turn means that they are guaranteed to immediately + * decrement their sem->read_count, so that it doesn't matter that the + * writer missed them. + */ - preempt_disable(); - success = rcu_sync_is_idle(&brw->rss); - if (likely(success)) - __this_cpu_add(*brw->fast_read_ctr, val); - preempt_enable(); + smp_mb(); /* A matches D */ - return success; -} + /* + * If !readers_block the critical section starts here, matched by the + * release in percpu_up_write(). 
+ */ + if (likely(!smp_load_acquire(&sem->readers_block))) + return 1; -/* - * Like the normal down_read() this is not recursive, the writer can - * come after the first percpu_down_read() and create the deadlock. - * - * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep, - * percpu_up_read() does rwsem_release(). This pairs with the usage - * of ->rw_sem in percpu_down/up_write(). - */ -void percpu_down_read(struct percpu_rw_semaphore *brw) -{ - might_sleep(); - rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); + /* + * Per the above comment; we still have preemption disabled and + * will thus decrement on the same CPU as we incremented. + */ + __percpu_up_read(sem); - if (likely(update_fast_ctr(brw, +1))) - return; + if (try) + return 0; - /* Avoid rwsem_acquire_read() and rwsem_release() */ - __down_read(&brw->rw_sem); - atomic_inc(&brw->slow_read_ctr); - __up_read(&brw->rw_sem); -} -EXPORT_SYMBOL_GPL(percpu_down_read); + /* + * We either call schedule() in the wait, or we'll fall through + * and reschedule on the preempt_enable() in percpu_down_read(). + */ + preempt_enable_no_resched(); -int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) -{ - if (unlikely(!update_fast_ctr(brw, +1))) { - if (!__down_read_trylock(&brw->rw_sem)) - return 0; - atomic_inc(&brw->slow_read_ctr); - __up_read(&brw->rw_sem); - } - - rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_); + /* + * Avoid lockdep for the down/up_read() we already have them. + */ + __down_read(&sem->rw_sem); + this_cpu_inc(*sem->read_count); + __up_read(&sem->rw_sem); + + preempt_disable(); return 1; } +EXPORT_SYMBOL_GPL(__percpu_down_read); -void percpu_up_read(struct percpu_rw_semaphore *brw) +void __percpu_up_read(struct percpu_rw_semaphore *sem) { - rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); - - if (likely(update_fast_ctr(brw, -1))) - return; + smp_mb(); /* B matches C */ + /* + * In other words, if they see our decrement (presumably to aggregate + * zero, as that is the only time it matters) they will also see our + * critical section. + */ + __this_cpu_dec(*sem->read_count); - /* false-positive is possible but harmless */ - if (atomic_dec_and_test(&brw->slow_read_ctr)) - wake_up_all(&brw->write_waitq); + /* Prod writer to recheck readers_active */ + wake_up(&sem->writer); } -EXPORT_SYMBOL_GPL(percpu_up_read); +EXPORT_SYMBOL_GPL(__percpu_up_read); + +#define per_cpu_sum(var) \ +({ \ + typeof(var) __sum = 0; \ + int cpu; \ + compiletime_assert_atomic_type(__sum); \ + for_each_possible_cpu(cpu) \ + __sum += per_cpu(var, cpu); \ + __sum; \ +}) -static int clear_fast_ctr(struct percpu_rw_semaphore *brw) +/* + * Return true if the modular sum of the sem->read_count per-CPU variable is + * zero. If this sum is zero, then it is stable due to the fact that if any + * newly arriving readers increment a given counter, they will immediately + * decrement that same counter. + */ +static bool readers_active_check(struct percpu_rw_semaphore *sem) { - unsigned int sum = 0; - int cpu; + if (per_cpu_sum(*sem->read_count) != 0) + return false; + + /* + * If we observed the decrement; ensure we see the entire critical + * section. + */ - for_each_possible_cpu(cpu) { - sum += per_cpu(*brw->fast_read_ctr, cpu); - per_cpu(*brw->fast_read_ctr, cpu) = 0; - } + smp_mb(); /* C matches B */ - return sum; + return true; } -void percpu_down_write(struct percpu_rw_semaphore *brw) +void percpu_down_write(struct percpu_rw_semaphore *sem) { + /* Notify readers to take the slow path. 
*/ + rcu_sync_enter(&sem->rss); + + down_write(&sem->rw_sem); + /* - * Make rcu_sync_is_idle() == F and thus disable the fast-path in - * percpu_down_read() and percpu_up_read(), and wait for gp pass. - * - * The latter synchronises us with the preceding readers which used - * the fast-past, so we can not miss the result of __this_cpu_add() - * or anything else inside their criticial sections. + * Notify new readers to block; up until now, and thus throughout the + * longish rcu_sync_enter() above, new readers could still come in. */ - rcu_sync_enter(&brw->rss); + WRITE_ONCE(sem->readers_block, 1); - /* exclude other writers, and block the new readers completely */ - down_write(&brw->rw_sem); + smp_mb(); /* D matches A */ - /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */ - atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr); + /* + * If they don't see our writer of readers_block, then we are + * guaranteed to see their sem->read_count increment, and therefore + * will wait for them. + */ - /* wait for all readers to complete their percpu_up_read() */ - wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); + /* Wait for all now active readers to complete. */ + wait_event(sem->writer, readers_active_check(sem)); } EXPORT_SYMBOL_GPL(percpu_down_write); -void percpu_up_write(struct percpu_rw_semaphore *brw) +void percpu_up_write(struct percpu_rw_semaphore *sem) { - /* release the lock, but the readers can't use the fast-path */ - up_write(&brw->rw_sem); /* - * Enable the fast-path in percpu_down_read() and percpu_up_read() - * but only after another gp pass; this adds the necessary barrier - * to ensure the reader can't miss the changes done by us. + * Signal the writer is done, no fast path yet. + * + * One reason that we cannot just immediately flip to readers_fast is + * that new readers might fail to see the results of this writer's + * critical section. + * + * Therefore we force it through the slow path which guarantees an + * acquire and thereby guarantees the critical section's consistency. + */ + smp_store_release(&sem->readers_block, 0); + + /* + * Release the write lock, this will allow readers back in the game. + */ + up_write(&sem->rw_sem); + + /* + * Once this completes (at least one RCU-sched grace period hence) the + * reader fast path will be available again. Safe to use outside the + * exclusive write lock because its counting. */ - rcu_sync_exit(&brw->rss); + rcu_sync_exit(&sem->rss); } EXPORT_SYMBOL_GPL(percpu_up_write); -- cgit v1.2.3 From 84b23f9b58687a11ced66cc4be9b0219e8ecab84 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 5 Aug 2016 01:04:43 -0700 Subject: locking/rwsem: Return void in __rwsem_mark_wake() We currently return a rw_semaphore structure, which is the same lock we passed to the function's argument in the first place. While there are several functions that choose this return value, the callers use it, for example, for things like ERR_PTR. This is not the case for __rwsem_mark_wake(), and in addition this function is really about the lock waiters (which we know there are at this point), so its somewhat odd to be returning the sem structure. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman.Long@hp.com Cc: dave@stgolabs.net Cc: jason.low2@hpe.com Cc: wanpeng.li@hotmail.com Link: http://lkml.kernel.org/r/1470384285-32163-2-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 447e08de1fab..b03623172277 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -121,16 +121,17 @@ enum rwsem_wake_type { * - woken process blocks are discarded from the list after having task zeroed * - writers are only marked woken if downgrading is false */ -static struct rw_semaphore * -__rwsem_mark_wake(struct rw_semaphore *sem, - enum rwsem_wake_type wake_type, struct wake_q_head *wake_q) +static void __rwsem_mark_wake(struct rw_semaphore *sem, + enum rwsem_wake_type wake_type, + struct wake_q_head *wake_q) { struct rwsem_waiter *waiter; struct task_struct *tsk; struct list_head *next; - long oldcount, woken, loop, adjustment; + long loop, oldcount, woken = 0, adjustment = 0; waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); + if (waiter->type == RWSEM_WAITING_FOR_WRITE) { if (wake_type == RWSEM_WAKE_ANY) { /* @@ -142,19 +143,19 @@ __rwsem_mark_wake(struct rw_semaphore *sem, */ wake_q_add(wake_q, waiter->task); } - goto out; + + return; } - /* Writers might steal the lock before we grant it to the next reader. + /* + * Writers might steal the lock before we grant it to the next reader. * We prefer to do the first reader grant before counting readers * so we can bail out early if a writer stole the lock. */ - adjustment = 0; if (wake_type != RWSEM_WAKE_READ_OWNED) { adjustment = RWSEM_ACTIVE_READ_BIAS; try_reader_grant: oldcount = atomic_long_fetch_add(adjustment, &sem->count); - if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { /* * If the count is still less than RWSEM_WAITING_BIAS @@ -164,7 +165,8 @@ __rwsem_mark_wake(struct rw_semaphore *sem, */ if (atomic_long_add_return(-adjustment, &sem->count) < RWSEM_WAITING_BIAS) - goto out; + return; + /* Last active locker left. Retry waking readers. */ goto try_reader_grant; } @@ -176,11 +178,11 @@ __rwsem_mark_wake(struct rw_semaphore *sem, rwsem_set_reader_owned(sem); } - /* Grant an infinite number of read locks to the readers at the front + /* + * Grant an infinite number of read locks to the readers at the front * of the queue. Note we increment the 'active part' of the count by * the number of readers before waking any processes up. 
*/ - woken = 0; do { woken++; @@ -219,9 +221,6 @@ __rwsem_mark_wake(struct rw_semaphore *sem, sem->wait_list.next = next; next->prev = &sem->wait_list; - - out: - return sem; } /* @@ -255,7 +254,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) if (count == RWSEM_WAITING_BIAS || (count > RWSEM_WAITING_BIAS && adjustment != -RWSEM_ACTIVE_READ_BIAS)) - sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); wake_up_q(&wake_q); @@ -505,7 +504,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) if (count > RWSEM_WAITING_BIAS) { WAKE_Q(wake_q); - sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); + __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); /* * The wakeup is normally called _after_ the wait_lock * is released, but given that we are proactively waking @@ -616,7 +615,7 @@ locked: /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) - sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); wake_up_q(&wake_q); @@ -640,7 +639,7 @@ struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) - sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); + __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); wake_up_q(&wake_q); -- cgit v1.2.3 From c2867bbaf5d8f1534cae15175a389c5cbf58fec1 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 5 Aug 2016 01:04:44 -0700 Subject: locking/rwsem: Remove a few useless comments Our rwsem code (xadd, at least) is rather well documented, but there are a few really annoying comments in there that serve no purpose and we shouldn't bother with them. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. 
McKenney
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: Waiman.Long@hp.com
Cc: dave@stgolabs.net
Cc: jason.low2@hpe.com
Cc: wanpeng.li@hotmail.com
Link: http://lkml.kernel.org/r/1470384285-32163-3-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar
---
 kernel/locking/rwsem-xadd.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel/locking')

diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index b03623172277..e02fe3289b5a 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -234,7 +234,6 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
 	struct task_struct *tsk = current;
 	WAKE_Q(wake_q);
 
-	/* set up my own style of waitqueue */
 	waiter.task = tsk;
 	waiter.type = RWSEM_WAITING_FOR_READ;
 
@@ -613,7 +612,6 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
 	raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
 locked:
-	/* do nothing if list empty */
 	if (!list_empty(&sem->wait_list))
 		__rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
 
@@ -637,7 +635,6 @@ struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
 
 	raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
-	/* do nothing if list empty */
 	if (!list_empty(&sem->wait_list))
 		__rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
 
--
cgit v1.2.3

From 70800c3c0cc525baa38fd0fe4660f2c27f1bfeeb Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso
Date: Fri, 5 Aug 2016 01:04:45 -0700
Subject: locking/rwsem: Scan the wait_list for readers only once

When wanting to wake up readers, __rwsem_mark_wake() currently iterates
the wait_list twice while looking to wake up the first N queued
reader-tasks. While this can be quite inefficient, it was there such that
an awoken reader would be first and foremost acknowledged by the lock
counter.

Keeping the same logic, we can further benefit from the use of wake_qs and
entirely avoid the first wait_list iteration that sets the counter, as
wake_up_process() isn't going to occur right away, and therefore we
maintain the counter->list order of going about things.

Other than saving cycles with O(n) "scanning", this change also nicely
cleans up a good chunk of __rwsem_mark_wake(), making it both visually
cleaner and less tedious to read.

For example, the following improvements were seen on some will-it-scale
microbenchmarks, on a 48-core Haswell:

                                        v4.7                  v4.7-rwsem-v1
 Hmean    signal1-processes-8      5792691.42 (  0.00%)   5771971.04 ( -0.36%)
 Hmean    signal1-processes-12     6081199.96 (  0.00%)   6072174.38 ( -0.15%)
 Hmean    signal1-processes-21     3071137.71 (  0.00%)   3041336.72 ( -0.97%)
 Hmean    signal1-processes-48     3712039.98 (  0.00%)   3708113.59 ( -0.11%)
 Hmean    signal1-processes-79     4464573.45 (  0.00%)   4682798.66 (  4.89%)
 Hmean    signal1-processes-110    4486842.01 (  0.00%)   4633781.71 (  3.27%)
 Hmean    signal1-processes-141    4611816.83 (  0.00%)   4692725.38 (  1.75%)
 Hmean    signal1-processes-172    4638157.05 (  0.00%)   4714387.86 (  1.64%)
 Hmean    signal1-processes-203    4465077.80 (  0.00%)   4690348.07 (  5.05%)
 Hmean    signal1-processes-224    4410433.74 (  0.00%)   4687534.43 (  6.28%)
 Stddev   signal1-processes-8         6360.47 (  0.00%)      8455.31 ( 32.94%)
 Stddev   signal1-processes-12        4004.98 (  0.00%)      9156.13 (128.62%)
 Stddev   signal1-processes-21        3273.14 (  0.00%)      5016.80 ( 53.27%)
 Stddev   signal1-processes-48       28420.25 (  0.00%)     26576.22 ( -6.49%)
 Stddev   signal1-processes-79       22038.34 (  0.00%)     18992.70 (-13.82%)
 Stddev   signal1-processes-110      23226.93 (  0.00%)     17245.79 (-25.75%)
 Stddev   signal1-processes-141       6358.98 (  0.00%)      7636.14 ( 20.08%)
 Stddev   signal1-processes-172       9523.70 (  0.00%)      4824.75 (-49.34%)
 Stddev   signal1-processes-203      13915.33 (  0.00%)      9326.33 (-32.98%)
 Stddev   signal1-processes-224      15573.94 (  0.00%)     10613.82 (-31.85%)

Other runs that saw improvements include context_switch and pipe; and as
expected, this is particularly highlighted on larger thread counts as it
becomes more expensive to walk the list twice.

No change in wakeup ordering or semantics.

Signed-off-by: Davidlohr Bueso
Signed-off-by: Peter Zijlstra (Intel)
Cc: Andrew Morton
Cc: Linus Torvalds
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: Waiman.Long@hp.com
Cc: dave@stgolabs.net
Cc: jason.low2@hpe.com
Cc: wanpeng.li@hotmail.com
Link: http://lkml.kernel.org/r/1470384285-32163-4-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar
---
 kernel/locking/rwsem-xadd.c | 58 ++++++++++++++++++++-------------------------
 1 file changed, 26 insertions(+), 32 deletions(-)

(limited to 'kernel/locking')

diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index e02fe3289b5a..2337b4bb2366 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -125,12 +125,14 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
 			      enum rwsem_wake_type wake_type,
 			      struct wake_q_head *wake_q)
 {
-	struct rwsem_waiter *waiter;
-	struct task_struct *tsk;
-	struct list_head *next;
-	long loop, oldcount, woken = 0, adjustment = 0;
+	struct rwsem_waiter *waiter, *tmp;
+	long oldcount, woken = 0, adjustment = 0;
 
-	waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
+	/*
+	 * Take a peek at the queue head waiter such that we can determine
+	 * the wakeup(s) to perform.
+	 */
+	waiter = list_first_entry(&sem->wait_list, struct rwsem_waiter, list);
 
 	if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
 		if (wake_type == RWSEM_WAKE_ANY) {
@@ -180,36 +182,21 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
 
 	/*
 	 * Grant an infinite number of read locks to the readers at the front
-	 * of the queue. Note we increment the 'active part' of the count by
-	 * the number of readers before waking any processes up.
+	 * of the queue. We know that woken will be at least 1 as we accounted
+	 * for above.
Note we increment the 'active part' of the count by the + * number of readers before waking any processes up. */ - do { - woken++; + list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) { + struct task_struct *tsk; - if (waiter->list.next == &sem->wait_list) + if (waiter->type == RWSEM_WAITING_FOR_WRITE) break; - waiter = list_entry(waiter->list.next, - struct rwsem_waiter, list); - - } while (waiter->type != RWSEM_WAITING_FOR_WRITE); - - adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; - if (waiter->type != RWSEM_WAITING_FOR_WRITE) - /* hit end of list above */ - adjustment -= RWSEM_WAITING_BIAS; - - if (adjustment) - atomic_long_add(adjustment, &sem->count); - - next = sem->wait_list.next; - loop = woken; - do { - waiter = list_entry(next, struct rwsem_waiter, list); - next = waiter->list.next; + woken++; tsk = waiter->task; wake_q_add(wake_q, tsk); + list_del(&waiter->list); /* * Ensure that the last operation is setting the reader * waiter to nil such that rwsem_down_read_failed() cannot @@ -217,10 +204,16 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, * to the task to wakeup. */ smp_store_release(&waiter->task, NULL); - } while (--loop); + } - sem->wait_list.next = next; - next->prev = &sem->wait_list; + adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; + if (list_empty(&sem->wait_list)) { + /* hit end of list above */ + adjustment -= RWSEM_WAITING_BIAS; + } + + if (adjustment) + atomic_long_add(adjustment, &sem->count); } /* @@ -245,7 +238,8 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* we're now waiting on the lock, but no longer actively locking */ count = atomic_long_add_return(adjustment, &sem->count); - /* If there are no active locks, wake the front queued process(es). + /* + * If there are no active locks, wake the front queued process(es). * * If there are no writers and we are first in the queue, * wake our own waiter to join the existing active readers ! -- cgit v1.2.3 From b193049375b04df3ada8c3347b7083db95918bc3 Mon Sep 17 00:00:00 2001 From: Pan Xinhui Date: Mon, 19 Sep 2016 05:23:52 -0400 Subject: locking/pv-qspinlock: Use cmpxchg_release() in __pv_queued_spin_unlock() cmpxchg_release() is more lighweight than cmpxchg() on some archs(e.g. PPC), moreover, in __pv_queued_spin_unlock() we only needs a RELEASE in the fast path(pairing with *_try_lock() or *_lock()). And the slow path has smp_store_release too. So it's safe to use cmpxchg_release here. Suggested-by: Boqun Feng Signed-off-by: Pan Xinhui Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: benh@kernel.crashing.org Cc: linuxppc-dev@lists.ozlabs.org Cc: mpe@ellerman.id.au Cc: paulmck@linux.vnet.ibm.com Cc: paulus@samba.org Cc: virtualization@lists.linux-foundation.org Cc: waiman.long@hpe.com Link: http://lkml.kernel.org/r/1474277037-15200-2-git-send-email-xinhui.pan@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock_paravirt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/locking') diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 3acf16d79cf4..e3b5520005db 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -540,7 +540,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) * unhash. Otherwise it would be possible to have multiple @lock * entries, which would be BAD. 
*/ - locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); + locked = cmpxchg_release(&l->locked, _Q_LOCKED_VAL, 0); if (likely(locked == _Q_LOCKED_VAL)) return; -- cgit v1.2.3 From e6253970413d99f416f7de8bd516e5f1834d8216 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 21 Nov 2015 19:11:48 +0100 Subject: stop_machine: Remove stop_cpus_lock and lg_double_lock/unlock() stop_two_cpus() and stop_cpus() use stop_cpus_lock to avoid the deadlock, we need to ensure that the stopper functions can't be queued "backwards" from one another. This doesn't look nice; if we use lglock then we do not really need stopper->lock, cpu_stop_queue_work() could use lg_local_lock() under local_irq_save(). OTOH it would be even better to avoid lglock in stop_machine.c and remove lg_double_lock(). This patch adds "bool stop_cpus_in_progress" set/cleared by queue_stop_cpus_work(), and changes cpu_stop_queue_two_works() to busy wait until it is cleared. queue_stop_cpus_work() sets stop_cpus_in_progress = T lockless, but after it queues a work on CPU1 it must be visible to stop_two_cpus(CPU1, CPU2) which checks it under the same lock. And since stop_two_cpus() holds the 2nd lock too, queue_stop_cpus_work() can not clear stop_cpus_in_progress if it is also going to queue a work on CPU2, it needs to take that 2nd lock to do this. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Tejun Heo Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20151121181148.GA433@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/lglock.c | 22 ---------------------- 1 file changed, 22 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c index 951cfcd10b4a..86ae2aebf004 100644 --- a/kernel/locking/lglock.c +++ b/kernel/locking/lglock.c @@ -60,28 +60,6 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu) } EXPORT_SYMBOL(lg_local_unlock_cpu); -void lg_double_lock(struct lglock *lg, int cpu1, int cpu2) -{ - BUG_ON(cpu1 == cpu2); - - /* lock in cpu order, just like lg_global_lock */ - if (cpu2 < cpu1) - swap(cpu1, cpu2); - - preempt_disable(); - lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); - arch_spin_lock(per_cpu_ptr(lg->lock, cpu1)); - arch_spin_lock(per_cpu_ptr(lg->lock, cpu2)); -} - -void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2) -{ - lock_release(&lg->lock_dep_map, 1, _RET_IP_); - arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1)); - arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2)); - preempt_enable(); -} - void lg_global_lock(struct lglock *lg) { int i; -- cgit v1.2.3 From d32cdbfb0ba319e44f75437afde868f7cafdc467 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Nov 2015 18:36:16 +0100 Subject: locking/lglock: Remove lglock implementation It is now unused, remove it before someone else thinks its a good idea to use this. 
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/Makefile | 1 - kernel/locking/lglock.c | 89 ------------------------------------------------- 2 files changed, 90 deletions(-) delete mode 100644 kernel/locking/lglock.c (limited to 'kernel/locking') diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 31322a4275cd..6f88e352cd4f 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -18,7 +18,6 @@ obj-$(CONFIG_LOCKDEP) += lockdep_proc.o endif obj-$(CONFIG_SMP) += spinlock.o obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o -obj-$(CONFIG_SMP) += lglock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex.o diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c deleted file mode 100644 index 86ae2aebf004..000000000000 --- a/kernel/locking/lglock.c +++ /dev/null @@ -1,89 +0,0 @@ -/* See include/linux/lglock.h for description */ -#include -#include -#include -#include - -/* - * Note there is no uninit, so lglocks cannot be defined in - * modules (but it's fine to use them from there) - * Could be added though, just undo lg_lock_init - */ - -void lg_lock_init(struct lglock *lg, char *name) -{ - LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0); -} -EXPORT_SYMBOL(lg_lock_init); - -void lg_local_lock(struct lglock *lg) -{ - arch_spinlock_t *lock; - - preempt_disable(); - lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); - lock = this_cpu_ptr(lg->lock); - arch_spin_lock(lock); -} -EXPORT_SYMBOL(lg_local_lock); - -void lg_local_unlock(struct lglock *lg) -{ - arch_spinlock_t *lock; - - lock_release(&lg->lock_dep_map, 1, _RET_IP_); - lock = this_cpu_ptr(lg->lock); - arch_spin_unlock(lock); - preempt_enable(); -} -EXPORT_SYMBOL(lg_local_unlock); - -void lg_local_lock_cpu(struct lglock *lg, int cpu) -{ - arch_spinlock_t *lock; - - preempt_disable(); - lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); - lock = per_cpu_ptr(lg->lock, cpu); - arch_spin_lock(lock); -} -EXPORT_SYMBOL(lg_local_lock_cpu); - -void lg_local_unlock_cpu(struct lglock *lg, int cpu) -{ - arch_spinlock_t *lock; - - lock_release(&lg->lock_dep_map, 1, _RET_IP_); - lock = per_cpu_ptr(lg->lock, cpu); - arch_spin_unlock(lock); - preempt_enable(); -} -EXPORT_SYMBOL(lg_local_unlock_cpu); - -void lg_global_lock(struct lglock *lg) -{ - int i; - - preempt_disable(); - lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); - for_each_possible_cpu(i) { - arch_spinlock_t *lock; - lock = per_cpu_ptr(lg->lock, i); - arch_spin_lock(lock); - } -} -EXPORT_SYMBOL(lg_global_lock); - -void lg_global_unlock(struct lglock *lg) -{ - int i; - - lock_release(&lg->lock_dep_map, 1, _RET_IP_); - for_each_possible_cpu(i) { - arch_spinlock_t *lock; - lock = per_cpu_ptr(lg->lock, i); - arch_spin_unlock(lock); - } - preempt_enable(); -} -EXPORT_SYMBOL(lg_global_unlock); -- cgit v1.2.3
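
A few illustrative notes on the series above follow. Each is a minimal,
stand-alone user-space sketch, not kernel code; all type, variable and
constant names in them are invented for the sketch unless they also appear
in the patches.

First, the restructuring in "locking/qspinlock: Improve readability". The
sketch below mirrors only the control-flow shape of the patched
pv_queued_spin_steal_lock() using C11 atomics; the lock layout and the
LOCKED_* constants are stand-ins, not the kernel's _Q_* values.
Compile with: cc -std=c11 steal.c

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define LOCKED_PENDING_MASK	0x3u	/* stand-in for _Q_LOCKED_PENDING_MASK */
#define LOCKED_VAL		0x1u	/* stand-in for _Q_LOCKED_VAL */

struct sketch_lock {
	atomic_uint val;		/* bit 0 = locked, bit 1 = pending */
};

static atomic_ulong steal_count;	/* stands in for qstat_pv_lock_stealing */

static bool sketch_steal_lock(struct sketch_lock *lock)
{
	unsigned int expected = 0;

	/*
	 * Same structure as the patched function: attempt the cmpxchg only
	 * when the lock looks free, and bump the statistic only on the
	 * success path, so the reader sees one clear "steal succeeded"
	 * branch instead of a reused boolean expression.
	 */
	if (!(atomic_load(&lock->val) & LOCKED_PENDING_MASK) &&
	    atomic_compare_exchange_strong(&lock->val, &expected, LOCKED_VAL)) {
		atomic_fetch_add(&steal_count, 1);
		return true;
	}

	return false;
}

int main(void)
{
	struct sketch_lock l = { .val = 0 };

	printf("stolen: %d, steals so far: %lu\n",
	       sketch_steal_lock(&l), atomic_load(&steal_count));
	return 0;
}

The patch is purely about readability; the generated behaviour is the same
either way.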
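
Second, the percpu-rwsem patch ("locking/percpu-rwsem: Optimize readers and
reduce global impact") rests on the A/B/C/D barrier pairing spelled out in
its comments. Below is a condensed user-space model of that argument with
C11 fences, under assumed names (read_count[], readers_block, a fixed slot
array in place of per-CPU data). It deliberately simplifies: the kernel
additionally gates the fast path with rcu_sync_is_idle(), and the writer
sleeps on a waitqueue rather than spinning.
Compile with: cc -std=c11 pcrw_model.c

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <sched.h>

#define NSLOTS 4

static atomic_int read_count[NSLOTS];	/* models the per-CPU sem->read_count */
static atomic_int readers_block;	/* models sem->readers_block */

static bool reader_enter(int slot)
{
	atomic_fetch_add_explicit(&read_count[slot], 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* "A matches D" */

	if (!atomic_load_explicit(&readers_block, memory_order_acquire))
		return true;		/* read-side critical section may begin */

	/* A writer is around: undo the increment and take the slow path. */
	atomic_fetch_sub_explicit(&read_count[slot], 1, memory_order_relaxed);
	return false;
}

static void reader_exit(int slot)
{
	atomic_thread_fence(memory_order_seq_cst);	/* "B matches C" */
	atomic_fetch_sub_explicit(&read_count[slot], 1, memory_order_relaxed);
}

static void writer_wait_for_readers(void)
{
	atomic_store_explicit(&readers_block, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* "D matches A" */

	for (;;) {			/* models readers_active_check() */
		int sum = 0;

		for (int i = 0; i < NSLOTS; i++)
			sum += atomic_load_explicit(&read_count[i],
						    memory_order_relaxed);
		if (sum == 0)
			break;
		sched_yield();		/* the kernel sleeps here instead */
	}
	atomic_thread_fence(memory_order_acquire);	/* "C matches B" */
}

int main(void)
{
	if (reader_enter(0)) {
		/* ... read-side critical section ... */
		reader_exit(0);
	}
	writer_wait_for_readers();	/* returns: the counters sum to zero */
	printf("writer saw no active readers\n");
	return 0;
}

Either the writer's sum sees the reader's increment, or the reader sees
readers_block and backs out; that is the whole correctness argument, with
no global atomic on the reader fast path.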
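
Third, the single-pass wakeup walk from "locking/rwsem: Scan the wait_list
for readers only once". The sketch below uses its own toy list and waiter
types; only the shape matches the patch: readers at the front are moved
onto a deferred wake queue in one traversal, the walk stops at the first
writer, and the count adjustment is applied once afterwards.

#include <stdio.h>

enum waiter_type { WAITING_FOR_READ, WAITING_FOR_WRITE };

struct waiter {
	enum waiter_type type;
	int id;
	struct waiter *next;
};

/* Pop queued readers until the queue is empty or a writer is at the head. */
static int mark_wake_readers(struct waiter **queue, struct waiter **wake_q)
{
	int woken = 0;

	while (*queue && (*queue)->type == WAITING_FOR_READ) {
		struct waiter *w = *queue;

		*queue = w->next;	/* unlink, like list_del() in the patch */
		w->next = *wake_q;	/* defer the wakeup, like wake_q_add() */
		*wake_q = w;
		woken++;
	}

	/*
	 * The caller would now apply a single count adjustment covering all
	 * woken readers, as the patch does with one atomic_long_add().
	 */
	return woken;
}

int main(void)
{
	struct waiter w3 = { WAITING_FOR_WRITE, 3, NULL };
	struct waiter w2 = { WAITING_FOR_READ,  2, &w3 };
	struct waiter w1 = { WAITING_FOR_READ,  1, &w2 };
	struct waiter *queue = &w1, *wake_q = NULL;

	printf("woke %d readers; head of queue is now waiter %d\n",
	       mark_wake_readers(&queue, &wake_q), queue->id);
	return 0;
}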
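
Fourth, the ordering relaxation in "locking/pv-qspinlock: Use
cmpxchg_release() in __pv_queued_spin_unlock()". In C11 terms, the change
is roughly the difference between a sequentially consistent
compare-exchange and the release-ordered one below: the unlock path only
has to publish the critical section, not order later accesses as well. The
lock variable here is a plain sketch, not the kernel's qspinlock.

#include <stdatomic.h>
#include <stdio.h>

#define LOCKED_VAL 1u			/* stand-in for _Q_LOCKED_VAL */

static _Atomic unsigned char locked = LOCKED_VAL;

static unsigned int unlock_release(void)
{
	unsigned char old = LOCKED_VAL;

	/* Success ordering: release only; failure ordering: relaxed. */
	atomic_compare_exchange_strong_explicit(&locked, &old, 0,
						memory_order_release,
						memory_order_relaxed);
	return old;	/* the caller checks this, as the kernel code does */
}

int main(void)
{
	printf("previous value: %u\n", unlock_release());
	return 0;
}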
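
Finally, the stop_machine patch ("stop_machine: Remove stop_cpus_lock and
lg_double_lock/unlock()") only shows the kernel/locking side here; the
stop_machine.c changes it describes are not in this diff. The sketch below
is therefore only a rough user-space model, under assumed names
(cpu_lock[], stop_in_progress), of the scheme the changelog describes:
per-CPU locks always taken in ascending CPU order, plus a flag that the
two-CPU path busy-waits on while a multi-CPU queuing pass is in flight.
Compile with: cc -std=c11 -pthread two_cpu.c

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NR_CPUS 4

static pthread_mutex_t cpu_lock[NR_CPUS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};
static atomic_bool stop_in_progress;	/* models stop_cpus_in_progress */

static void queue_two_works(int cpu1, int cpu2)
{
	if (cpu2 < cpu1) {	/* lock in CPU order, as stop_two_cpus() does */
		int tmp = cpu1; cpu1 = cpu2; cpu2 = tmp;
	}

	/* Wait until no multi-CPU queuing pass is in flight. */
	while (atomic_load_explicit(&stop_in_progress, memory_order_acquire))
		;		/* cpu_relax() in the kernel */

	pthread_mutex_lock(&cpu_lock[cpu1]);
	pthread_mutex_lock(&cpu_lock[cpu2]);
	/* ... queue one work item per CPU ... */
	pthread_mutex_unlock(&cpu_lock[cpu2]);
	pthread_mutex_unlock(&cpu_lock[cpu1]);
}

int main(void)
{
	queue_two_works(3, 1);
	printf("queued works on CPUs 1 and 3\n");
	return 0;
}

With this in place the lglock machinery has no remaining users, which is
what allows the final patch above to delete kernel/locking/lglock.c
entirely.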