From f7d71f2052555ae57b47322f2c2f6c29ff2438ae Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 19 Jun 2015 11:50:00 -0400 Subject: locking/qrwlock: Rename functions to queued_*() To sync up with the naming convention used in qspinlock, all the qrwlock functions were renamed to start with "queued" instead of "queue". Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Arnd Bergmann Cc: Douglas Hatch Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Cc: Will Deacon Link: http://lkml.kernel.org/r/1434729002-57724-2-git-send-email-Waiman.Long@hp.com Signed-off-by: Ingo Molnar --- kernel/locking/qrwlock.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index 6c5da483966b..49057d413b6e 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -60,10 +60,10 @@ rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) } /** - * queue_read_lock_slowpath - acquire read lock of a queue rwlock + * queued_read_lock_slowpath - acquire read lock of a queue rwlock * @lock: Pointer to queue rwlock structure */ -void queue_read_lock_slowpath(struct qrwlock *lock) +void queued_read_lock_slowpath(struct qrwlock *lock) { u32 cnts; @@ -104,13 +104,13 @@ void queue_read_lock_slowpath(struct qrwlock *lock) */ arch_spin_unlock(&lock->lock); } -EXPORT_SYMBOL(queue_read_lock_slowpath); +EXPORT_SYMBOL(queued_read_lock_slowpath); /** - * queue_write_lock_slowpath - acquire write lock of a queue rwlock + * queued_write_lock_slowpath - acquire write lock of a queue rwlock * @lock : Pointer to queue rwlock structure */ -void queue_write_lock_slowpath(struct qrwlock *lock) +void queued_write_lock_slowpath(struct qrwlock *lock) { u32 cnts; @@ -149,4 +149,4 @@ void queue_write_lock_slowpath(struct qrwlock *lock) unlock: arch_spin_unlock(&lock->lock); } -EXPORT_SYMBOL(queue_write_lock_slowpath); +EXPORT_SYMBOL(queued_write_lock_slowpath); -- cgit v1.2.3 From 0e06e5be70d392aa842c1455ec2d0baf62aeed48 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 19 Jun 2015 11:50:01 -0400 Subject: locking/qrwlock: Better optimization for interrupt context readers The qrwlock is fair in process context, but becomes unfair in interrupt context to support use cases like the tasklist_lock. The current code doesn't document clearly what happens in interrupt context. rspin_until_writer_unlock() will only spin if the writer has already taken the lock. If the writer is still in the waiting state, the increment in the reader count will cause the writer to remain in the waiting state, and the new interrupt-context reader will get the lock and return immediately. The current code, however, does an additional read of the lock value which is not necessary, as that information is already available in the fast path. This may sometimes cause an additional cacheline transfer when the lock is highly contended. This patch passes the lock value read in the fast path to the slow path to eliminate the additional read. It also documents the behaviour for interrupt context readers more clearly.
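For illustration, here is a minimal user-space C11 sketch of the idea just described: the fast path already reads the lock word when it tries to add a reader, so it can hand that value straight to the slow path instead of re-loading a potentially contended cacheline. All names and constants below are invented for the sketch (they are not the kernel's qrwlock API), and the queueing logic is omitted; only the value-passing pattern is shown.

/* Hypothetical user-space sketch -- not include/asm-generic/qrwlock.h. */
#include <stdatomic.h>
#include <stdint.h>

#define QW_LOCKED 0x000000ffU           /* assumed: writer-holds-lock bits */
#define QR_BIAS   (1U << 9)             /* assumed: one reader's count increment */

struct rwlock_sketch { _Atomic uint32_t cnts; };

/* Slow path: reuse @cnts from the fast path; re-read only while a writer holds the lock. */
static void read_lock_slowpath(struct rwlock_sketch *l, uint32_t cnts)
{
        while ((cnts & QW_LOCKED) != 0)
                cnts = atomic_load_explicit(&l->cnts, memory_order_acquire);
}

static void read_lock(struct rwlock_sketch *l)
{
        uint32_t cnts = atomic_fetch_add_explicit(&l->cnts, QR_BIAS,
                                                  memory_order_acquire);
        if ((cnts & QW_LOCKED) == 0)
                return;                 /* fast path: no writer holds the lock */

        read_lock_slowpath(l, cnts);    /* no second load of l->cnts needed here */
}

int main(void)
{
        struct rwlock_sketch l = { 0 };

        read_lock(&l);                  /* no writer: acquired via the fast path */
        return 0;
}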
Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Will Deacon Cc: Arnd Bergmann Cc: Douglas Hatch Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1434729002-57724-3-git-send-email-Waiman.Long@hp.com Signed-off-by: Ingo Molnar --- kernel/locking/qrwlock.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index 49057d413b6e..d9c36c5f5711 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -62,20 +62,21 @@ rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) /** * queued_read_lock_slowpath - acquire read lock of a queue rwlock * @lock: Pointer to queue rwlock structure + * @cnts: Current qrwlock lock value */ -void queued_read_lock_slowpath(struct qrwlock *lock) +void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) { - u32 cnts; - /* * Readers come here when they cannot get the lock without waiting */ if (unlikely(in_interrupt())) { /* - * Readers in interrupt context will spin until the lock is - * available without waiting in the queue. + * Readers in interrupt context will get the lock immediately + * if the writer is just waiting (not holding the lock yet). + * The rspin_until_writer_unlock() function returns immediately + * in this case. Otherwise, they will spin until the lock + * is available without waiting in the queue. */ - cnts = smp_load_acquire((u32 *)&lock->cnts); rspin_until_writer_unlock(lock, cnts); return; } -- cgit v1.2.3 From 1b0b7c1762679a2f8bc359da95649249dfcf4195 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 1 Jul 2015 13:29:48 -0700 Subject: rtmutex: Delete scriptable tester No one uses this anymore, and this is not the first time the idea of replacing it with a (now possible) userspace-side tester has come up. Lock stealing logic was removed long ago, back when the lock started being granted to the highest-priority waiter. Signed-off-by: Davidlohr Bueso Cc: Darren Hart Cc: Steven Rostedt Cc: Mike Galbraith Cc: Paul E.
McKenney Cc: Sebastian Andrzej Siewior Cc: Davidlohr Bueso Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1435782588-4177-2-git-send-email-dave@stgolabs.net Signed-off-by: Thomas Gleixner --- kernel/locking/Makefile | 1 - kernel/locking/rtmutex-tester.c | 420 ---------------------------------------- kernel/locking/rtmutex.c | 2 +- kernel/locking/rtmutex_common.h | 22 --- 4 files changed, 1 insertion(+), 444 deletions(-) delete mode 100644 kernel/locking/rtmutex-tester.c (limited to 'kernel/locking') diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 7dd5c9918e4c..36942047ffc0 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -20,7 +20,6 @@ obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o -obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o diff --git a/kernel/locking/rtmutex-tester.c b/kernel/locking/rtmutex-tester.c deleted file mode 100644 index 1d96dd0d93c1..000000000000 --- a/kernel/locking/rtmutex-tester.c +++ /dev/null @@ -1,420 +0,0 @@ -/* - * RT-Mutex-tester: scriptable tester for rt mutexes - * - * started by Thomas Gleixner: - * - * Copyright (C) 2006, Timesys Corp., Thomas Gleixner - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "rtmutex.h" - -#define MAX_RT_TEST_THREADS 8 -#define MAX_RT_TEST_MUTEXES 8 - -static spinlock_t rttest_lock; -static atomic_t rttest_event; - -struct test_thread_data { - int opcode; - int opdata; - int mutexes[MAX_RT_TEST_MUTEXES]; - int event; - struct device dev; -}; - -static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; -static struct task_struct *threads[MAX_RT_TEST_THREADS]; -static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES]; - -enum test_opcodes { - RTTEST_NOP = 0, - RTTEST_SCHEDOT, /* 1 Sched other, data = nice */ - RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */ - RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */ - RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */ - RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */ - RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ - RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ - RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ - /* 9, 10 - reserved for BKL commemoration */ - RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */ - RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ - RTTEST_RESET = 99, /* 99 Reset all pending operations */ -}; - -static int handle_op(struct test_thread_data *td, int lockwakeup) -{ - int i, id, ret = -EINVAL; - - switch(td->opcode) { - - case RTTEST_NOP: - return 0; - - case RTTEST_LOCKCONT: - td->mutexes[td->opdata] = 1; - td->event = atomic_add_return(1, &rttest_event); - return 0; - - case RTTEST_RESET: - for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) { - if (td->mutexes[i] == 4) { - rt_mutex_unlock(&mutexes[i]); - td->mutexes[i] = 0; - } - } - return 0; - - case RTTEST_RESETEVENT: - atomic_set(&rttest_event, 0); - return 0; - - default: - if (lockwakeup) - return ret; - } - - switch(td->opcode) { - - case RTTEST_LOCK: - case RTTEST_LOCKNOWAIT: - id = td->opdata; - if (id < 0 || id >= MAX_RT_TEST_MUTEXES) - return ret; - - td->mutexes[id] = 1; - td->event = 
atomic_add_return(1, &rttest_event); - rt_mutex_lock(&mutexes[id]); - td->event = atomic_add_return(1, &rttest_event); - td->mutexes[id] = 4; - return 0; - - case RTTEST_LOCKINT: - case RTTEST_LOCKINTNOWAIT: - id = td->opdata; - if (id < 0 || id >= MAX_RT_TEST_MUTEXES) - return ret; - - td->mutexes[id] = 1; - td->event = atomic_add_return(1, &rttest_event); - ret = rt_mutex_lock_interruptible(&mutexes[id], 0); - td->event = atomic_add_return(1, &rttest_event); - td->mutexes[id] = ret ? 0 : 4; - return ret ? -EINTR : 0; - - case RTTEST_UNLOCK: - id = td->opdata; - if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4) - return ret; - - td->event = atomic_add_return(1, &rttest_event); - rt_mutex_unlock(&mutexes[id]); - td->event = atomic_add_return(1, &rttest_event); - td->mutexes[id] = 0; - return 0; - - default: - break; - } - return ret; -} - -/* - * Schedule replacement for rtsem_down(). Only called for threads with - * PF_MUTEX_TESTER set. - * - * This allows us to have finegrained control over the event flow. - * - */ -void schedule_rt_mutex_test(struct rt_mutex *mutex) -{ - int tid, op, dat; - struct test_thread_data *td; - - /* We have to lookup the task */ - for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) { - if (threads[tid] == current) - break; - } - - BUG_ON(tid == MAX_RT_TEST_THREADS); - - td = &thread_data[tid]; - - op = td->opcode; - dat = td->opdata; - - switch (op) { - case RTTEST_LOCK: - case RTTEST_LOCKINT: - case RTTEST_LOCKNOWAIT: - case RTTEST_LOCKINTNOWAIT: - if (mutex != &mutexes[dat]) - break; - - if (td->mutexes[dat] != 1) - break; - - td->mutexes[dat] = 2; - td->event = atomic_add_return(1, &rttest_event); - break; - - default: - break; - } - - schedule(); - - - switch (op) { - case RTTEST_LOCK: - case RTTEST_LOCKINT: - if (mutex != &mutexes[dat]) - return; - - if (td->mutexes[dat] != 2) - return; - - td->mutexes[dat] = 3; - td->event = atomic_add_return(1, &rttest_event); - break; - - case RTTEST_LOCKNOWAIT: - case RTTEST_LOCKINTNOWAIT: - if (mutex != &mutexes[dat]) - return; - - if (td->mutexes[dat] != 2) - return; - - td->mutexes[dat] = 1; - td->event = atomic_add_return(1, &rttest_event); - return; - - default: - return; - } - - td->opcode = 0; - - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - - if (td->opcode > 0) { - int ret; - - set_current_state(TASK_RUNNING); - ret = handle_op(td, 1); - set_current_state(TASK_INTERRUPTIBLE); - if (td->opcode == RTTEST_LOCKCONT) - break; - td->opcode = ret; - } - - /* Wait for the next command to be executed */ - schedule(); - } - - /* Restore previous command and data */ - td->opcode = op; - td->opdata = dat; -} - -static int test_func(void *data) -{ - struct test_thread_data *td = data; - int ret; - - current->flags |= PF_MUTEX_TESTER; - set_freezable(); - allow_signal(SIGHUP); - - for(;;) { - - set_current_state(TASK_INTERRUPTIBLE); - - if (td->opcode > 0) { - set_current_state(TASK_RUNNING); - ret = handle_op(td, 0); - set_current_state(TASK_INTERRUPTIBLE); - td->opcode = ret; - } - - /* Wait for the next command to be executed */ - schedule(); - try_to_freeze(); - - if (signal_pending(current)) - flush_signals(current); - - if(kthread_should_stop()) - break; - } - return 0; -} - -/** - * sysfs_test_command - interface for test commands - * @dev: thread reference - * @buf: command for actual step - * @count: length of buffer - * - * command syntax: - * - * opcode:data - */ -static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct 
sched_param schedpar; - struct test_thread_data *td; - char cmdbuf[32]; - int op, dat, tid, ret; - - td = container_of(dev, struct test_thread_data, dev); - tid = td->dev.id; - - /* strings from sysfs write are not 0 terminated! */ - if (count >= sizeof(cmdbuf)) - return -EINVAL; - - /* strip of \n: */ - if (buf[count-1] == '\n') - count--; - if (count < 1) - return -EINVAL; - - memcpy(cmdbuf, buf, count); - cmdbuf[count] = 0; - - if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2) - return -EINVAL; - - switch (op) { - case RTTEST_SCHEDOT: - schedpar.sched_priority = 0; - ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar); - if (ret) - return ret; - set_user_nice(current, 0); - break; - - case RTTEST_SCHEDRT: - schedpar.sched_priority = dat; - ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar); - if (ret) - return ret; - break; - - case RTTEST_SIGNAL: - send_sig(SIGHUP, threads[tid], 0); - break; - - default: - if (td->opcode > 0) - return -EBUSY; - td->opdata = dat; - td->opcode = op; - wake_up_process(threads[tid]); - } - - return count; -} - -/** - * sysfs_test_status - sysfs interface for rt tester - * @dev: thread to query - * @buf: char buffer to be filled with thread status info - */ -static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct test_thread_data *td; - struct task_struct *tsk; - char *curr = buf; - int i; - - td = container_of(dev, struct test_thread_data, dev); - tsk = threads[td->dev.id]; - - spin_lock(&rttest_lock); - - curr += sprintf(curr, - "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:", - td->opcode, td->event, tsk->state, - (MAX_RT_PRIO - 1) - tsk->prio, - (MAX_RT_PRIO - 1) - tsk->normal_prio, - tsk->pi_blocked_on); - - for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) - curr += sprintf(curr, "%d", td->mutexes[i]); - - spin_unlock(&rttest_lock); - - curr += sprintf(curr, ", T: %p, R: %p\n", tsk, - mutexes[td->dev.id].owner); - - return curr - buf; -} - -static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL); -static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command); - -static struct bus_type rttest_subsys = { - .name = "rttest", - .dev_name = "rttest", -}; - -static int init_test_thread(int id) -{ - thread_data[id].dev.bus = &rttest_subsys; - thread_data[id].dev.id = id; - - threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); - if (IS_ERR(threads[id])) - return PTR_ERR(threads[id]); - - return device_register(&thread_data[id].dev); -} - -static int init_rttest(void) -{ - int ret, i; - - spin_lock_init(&rttest_lock); - - for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) - rt_mutex_init(&mutexes[i]); - - ret = subsys_system_register(&rttest_subsys, NULL); - if (ret) - return ret; - - for (i = 0; i < MAX_RT_TEST_THREADS; i++) { - ret = init_test_thread(i); - if (ret) - break; - ret = device_create_file(&thread_data[i].dev, &dev_attr_status); - if (ret) - break; - ret = device_create_file(&thread_data[i].dev, &dev_attr_command); - if (ret) - break; - } - - printk("Initializing RT-Tester: %s\n", ret ? 
"Failed" : "OK" ); - - return ret; -} - -device_initcall(init_rttest); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 5674b073473c..7781d801212f 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1120,7 +1120,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, debug_rt_mutex_print_deadlock(waiter); - schedule_rt_mutex(lock); + schedule(); raw_spin_lock(&lock->wait_lock); set_current_state(state); diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 7844f8f0e639..4f5f83c7d2d3 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -14,28 +14,6 @@ #include -/* - * The rtmutex in kernel tester is independent of rtmutex debugging. We - * call schedule_rt_mutex_test() instead of schedule() for the tasks which - * belong to the tester. That way we can delay the wakeup path of those - * threads to provoke lock stealing and testing of complex boosting scenarios. - */ -#ifdef CONFIG_RT_MUTEX_TESTER - -extern void schedule_rt_mutex_test(struct rt_mutex *lock); - -#define schedule_rt_mutex(_lock) \ - do { \ - if (!(current->flags & PF_MUTEX_TESTER)) \ - schedule(); \ - else \ - schedule_rt_mutex_test(_lock); \ - } while (0) - -#else -# define schedule_rt_mutex(_lock) schedule() -#endif - /* * This is the control structure for tasks blocked on a rt_mutex, * which is allocated on the kernel stack on of the blocked task. -- cgit v1.2.3 From 0b792bf519e68108d577fcec815ab50913787012 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jul 2015 12:13:43 +0200 Subject: locking: Clean up pvqspinlock warning - Rename the on-stack variable to match the datastructure variable, - place the cmpxchg back under the comment that explains it, - clean up the WARN() statement to avoid superfluous conditionals and line-breaks. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman Long Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock_paravirt.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index df19ae4debd0..489a87884337 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -287,20 +287,21 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) { struct __qspinlock *l = (void *)lock; struct pv_node *node; - u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); + u8 locked; /* * We must not unlock if SLOW, because in that case we must first * unhash. Otherwise it would be possible to have multiple @lock * entries, which would be BAD. 
*/ - if (likely(lockval == _Q_LOCKED_VAL)) + locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); + if (likely(locked == _Q_LOCKED_VAL)) return; - if (unlikely(lockval != _Q_SLOW_VAL)) { - if (debug_locks_silent) - return; - WARN(1, "pvqspinlock: lock %p has corrupted value 0x%x!\n", lock, atomic_read(&lock->val)); + if (unlikely(locked != _Q_SLOW_VAL)) { + WARN(!debug_locks_silent, + "pvqspinlock: lock 0x%lx has corrupted value 0x%x!\n", + (unsigned long)lock, atomic_read(&lock->val)); return; } -- cgit v1.2.3 From 3b3fdf10a8add87ef0050138d51bfee9ab4983df Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 13 Jul 2015 16:58:30 +0100 Subject: locking/pvqspinlock: Order pv_unhash() after cmpxchg() on unlock slowpath When we unlock in __pv_queued_spin_unlock(), a failed cmpxchg() on the lock value indicates that we need to take the slow path and unhash the corresponding node blocked on the lock. Since a failed cmpxchg() does not provide any memory-ordering guarantees, it is possible that the node data could be read before the cmpxchg() on weakly-ordered architectures and therefore return a stale value, leading to hash corruption and/or a BUG(). This patch adds an smp_rmb() following the failed cmpxchg operation, so that the unhashing is ordered after the lock has been checked. Reported-by: Peter Zijlstra Signed-off-by: Will Deacon [ Added more comments] Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Paul McKenney Cc: Steve Capper Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150713155830.GL2632@arm.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock_paravirt.h | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 489a87884337..ab8b1bb8caa4 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -244,13 +244,17 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) if (!lp) { /* ONCE */ lp = pv_hash(lock, pn); /* - * lp must be set before setting _Q_SLOW_VAL + * We must hash before setting _Q_SLOW_VAL, such that + * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock() + * we'll be sure to be able to observe our hash entry. * - * [S] lp = lock [RmW] l = l->locked = 0 - * MB MB - * [S] l->locked = _Q_SLOW_VAL [L] lp + * [S] pn->state + * [S] [Rmw] l->locked == _Q_SLOW_VAL + * MB RMB + * [RmW] l->locked = _Q_SLOW_VAL [L] + * [L] pn->state * - * Matches the cmpxchg() in __pv_queued_spin_unlock(). + * Matches the smp_rmb() in __pv_queued_spin_unlock(). */ if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) { /* @@ -305,6 +309,15 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) return; } + /* + * A failed cmpxchg doesn't provide any memory-ordering guarantees, + * so we need a barrier to order the read of the node data in + * pv_unhash *after* we've read the lock being _Q_SLOW_VAL. + * + * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL. + */ + smp_rmb(); + /* * Since the above failed to release, this must be the SLOW path. * Therefore start by looking up the blocked node and unhashing it.
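To make the ordering issue concrete, the following is a rough user-space C11 analogue of the unlock slow path: a failed compare-exchange carries no ordering by itself, so an acquire fence (standing in for the kernel's smp_rmb()) must separate the observation of the slow-path value from the subsequent read of the node that the waiter published. The structure and names below are invented for this sketch and are not the kernel's pv-qspinlock code.

/* Illustrative sketch only; the real code lives in kernel/locking/qspinlock_paravirt.h. */
#include <stdatomic.h>
#include <stddef.h>

#define LOCKED_VAL 1
#define SLOW_VAL   3

struct pv_node_sketch { int cpu; };

static _Atomic int locked = LOCKED_VAL;
static _Atomic(struct pv_node_sketch *) hash_slot;     /* published by the waiter */

static void unlock_sketch(void)
{
        int expected = LOCKED_VAL;
        struct pv_node_sketch *node;

        /* Fast path: uncontended unlock. Note the *failure* ordering is relaxed. */
        if (atomic_compare_exchange_strong_explicit(&locked, &expected, 0,
                                                    memory_order_release,
                                                    memory_order_relaxed))
                return;

        /*
         * expected now holds the value we observed (SLOW_VAL when a waiter
         * hashed itself). Order the read of the hash entry after that
         * observation; this fence plays the role of the patch's smp_rmb().
         */
        atomic_thread_fence(memory_order_acquire);

        node = atomic_load_explicit(&hash_slot, memory_order_relaxed);
        if (node && expected == SLOW_VAL) {
                /* unhash, release the lock for real, then kick node->cpu */
                atomic_store_explicit(&locked, 0, memory_order_release);
        }
}

int main(void) { unlock_sketch(); return 0; }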
-- cgit v1.2.3 From ffffeaf318bd8da036eb8eb784b025a9f829201b Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 9 Jul 2015 12:32:22 -0400 Subject: locking/qrwlock: Reduce reader/writer to reader lock transfer latency Currently, a reader will check first to make sure that the writer mode byte is cleared before incrementing the reader count. That waiting is not really necessary. It increases the latency of the reader/writer to reader transition and reduces reader performance. This patch eliminates that waiting. It also has the side effect of reducing the chance of writer lock stealing and improving the fairness of the lock. Using a locking microbenchmark, a 10-thread 5M locking loop of mostly readers (RW ratio = 10,000:1) has the following performance numbers on a Haswell-EX box: Kernel Locking Rate (Kops/s) ------ --------------------- 4.1.1 15,063,081 4.1.1+patch 17,241,552 (+14.4%) Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Arnd Bergmann Cc: Douglas Hatch Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Cc: Will Deacon Link: http://lkml.kernel.org/r/1436459543-29126-2-git-send-email-Waiman.Long@hp.com Signed-off-by: Ingo Molnar --- kernel/locking/qrwlock.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index d9c36c5f5711..6a7a3b8d5ac9 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -88,15 +88,11 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) arch_spin_lock(&lock->lock); /* - * At the head of the wait queue now, wait until the writer state - * goes to 0 and then try to increment the reader count and get - * the lock. It is possible that an incoming writer may steal the - * lock in the interim, so it is necessary to check the writer byte - * to make sure that the write lock isn't taken. + * At the head of the wait queue now, increment the reader count + * and wait until the writer, if it has the lock, has gone away. + * At ths stage, it is not possible for a writer to remain in the + * waiting state (_QW_WAITING). So there won't be any deadlock. */ - while (atomic_read(&lock->cnts) & _QW_WMASK) - cpu_relax_lowlatency(); - cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS; rspin_until_writer_unlock(lock, cnts); -- cgit v1.2.3 From 75d2270280686bff21b9ba66c7f3dd379c887981 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sat, 11 Jul 2015 16:36:52 -0400 Subject: locking/pvqspinlock: Only kick CPU at unlock time For an over-committed guest with more vCPUs than physical CPUs available, it is possible that a vCPU may be kicked twice before getting the lock - once before it becomes queue head and once again before it gets the lock. All this CPU kicking and halting (VMEXIT) can be expensive and slow down system performance. This patch adds a new vCPU state (vcpu_hashed) which enables the code to delay CPU kicking until unlock time. Once this state is set, the new lock holder will set _Q_SLOW_VAL and fill in the hash table on behalf of the halted queue head vCPU. The original vcpu_halted state will be used by pv_wait_node() only to differentiate other queue nodes from the queue head. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Douglas Hatch Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Scott J Norton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1436647018-49734-2-git-send-email-Waiman.Long@hp.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 6 ++-- kernel/locking/qspinlock_paravirt.h | 66 +++++++++++++++++++++++++++---------- 2 files changed, 51 insertions(+), 21 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 38c49202d532..337c8818541d 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -239,8 +239,8 @@ static __always_inline void set_locked(struct qspinlock *lock) static __always_inline void __pv_init_node(struct mcs_spinlock *node) { } static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { } -static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { } - +static __always_inline void __pv_kick_node(struct qspinlock *lock, + struct mcs_spinlock *node) { } static __always_inline void __pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) { } @@ -440,7 +440,7 @@ queue: cpu_relax(); arch_mcs_spin_unlock_contended(&next->locked); - pv_kick_node(next); + pv_kick_node(lock, next); release: /* diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index ab8b1bb8caa4..c8e6e9a596f5 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -22,9 +22,14 @@ #define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET) +/* + * Queue node uses: vcpu_running & vcpu_halted. + * Queue head uses: vcpu_running & vcpu_hashed. + */ enum vcpu_state { vcpu_running = 0, - vcpu_halted, + vcpu_halted, /* Used only in pv_wait_node */ + vcpu_hashed, /* = pv_hash'ed + vcpu_halted */ }; struct pv_node { @@ -153,7 +158,8 @@ static void pv_init_node(struct mcs_spinlock *node) /* * Wait for node->locked to become true, halt the vcpu after a short spin. - * pv_kick_node() is used to wake the vcpu again. + * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its + * behalf. */ static void pv_wait_node(struct mcs_spinlock *node) { @@ -172,9 +178,9 @@ static void pv_wait_node(struct mcs_spinlock *node) * * [S] pn->state = vcpu_halted [S] next->locked = 1 * MB MB - * [L] pn->locked [RmW] pn->state = vcpu_running + * [L] pn->locked [RmW] pn->state = vcpu_hashed * - * Matches the xchg() from pv_kick_node(). + * Matches the cmpxchg() from pv_kick_node(). */ smp_store_mb(pn->state, vcpu_halted); @@ -182,9 +188,10 @@ static void pv_wait_node(struct mcs_spinlock *node) pv_wait(&pn->state, vcpu_halted); /* - * Reset the vCPU state to avoid unncessary CPU kicking + * If pv_kick_node() changed us to vcpu_hashed, retain that value + * so that pv_wait_head() knows to not also try to hash this lock. */ - WRITE_ONCE(pn->state, vcpu_running); + cmpxchg(&pn->state, vcpu_halted, vcpu_running); /* * If the locked flag is still not set after wakeup, it is a @@ -194,6 +201,7 @@ static void pv_wait_node(struct mcs_spinlock *node) * MCS lock will be released soon. */ } + /* * By now our node->locked should be 1 and our caller will not actually * spin-wait for it. We do however rely on our caller to do a @@ -202,24 +210,35 @@ static void pv_wait_node(struct mcs_spinlock *node) } /* - * Called after setting next->locked = 1, used to wake those stuck in - * pv_wait_node(). + * Called after setting next->locked = 1 when we're the lock owner. 
+ * + * Instead of waking the waiters stuck in pv_wait_node() advance their state such + * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle. */ -static void pv_kick_node(struct mcs_spinlock *node) +static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; + struct __qspinlock *l = (void *)lock; /* - * Note that because node->locked is already set, this actual - * mcs_spinlock entry could be re-used already. + * If the vCPU is indeed halted, advance its state to match that of + * pv_wait_node(). If OTOH this fails, the vCPU was running and will + * observe its next->locked value and advance itself. * - * This should be fine however, kicking people for no reason is - * harmless. + * Matches with smp_store_mb() and cmpxchg() in pv_wait_node() + */ + if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted) + return; + + /* + * Put the lock into the hash table and set the _Q_SLOW_VAL. * - * See the comment in pv_wait_node(). + * As this is the same vCPU that will check the _Q_SLOW_VAL value and + * the hash table later on at unlock time, no atomic instruction is + * needed. */ - if (xchg(&pn->state, vcpu_running) == vcpu_halted) - pv_kick(pn->cpu); + WRITE_ONCE(l->locked, _Q_SLOW_VAL); + (void)pv_hash(lock, pn); } /* @@ -233,6 +252,13 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) struct qspinlock **lp = NULL; int loop; + /* + * If pv_kick_node() already advanced our state, we don't need to + * insert ourselves into the hash table anymore. + */ + if (READ_ONCE(pn->state) == vcpu_hashed) + lp = (struct qspinlock **)1; + for (;;) { for (loop = SPIN_THRESHOLD; loop; loop--) { if (!READ_ONCE(l->locked)) @@ -240,9 +266,10 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) cpu_relax(); } - WRITE_ONCE(pn->state, vcpu_halted); if (!lp) { /* ONCE */ + WRITE_ONCE(pn->state, vcpu_hashed); lp = pv_hash(lock, pn); + /* * We must hash before setting _Q_SLOW_VAL, such that * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock() @@ -333,8 +360,11 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) /* * At this point the memory pointed at by lock can be freed/reused, * however we can still use the pv_node to kick the CPU. + * The other vCPU may not really be halted, but kicking an active + * vCPU is harmless other than the additional latency in completing + * the unlock. */ - if (READ_ONCE(node->state) == vcpu_halted) + if (READ_ONCE(node->state) == vcpu_hashed) pv_kick(node->cpu); } /* -- cgit v1.2.3 From 77e430e3e45662b696dc49aa53ea0f7ac63f2574 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 6 Aug 2015 17:54:42 +0100 Subject: locking/qrwlock: Make use of _{acquire|release|relaxed}() atomics The qrwlock implementation is slightly heavy in its use of memory barriers, mainly through the use of _cmpxchg() and _return() atomics, which imply full barrier semantics. This patch modifies the qrwlock code to use the more relaxed atomic routines so that we can reduce the unnecessary barrier overhead on weakly-ordered architectures. 
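As a rough illustration of the relaxation being applied, the C11 fragment below mirrors the two writer-side steps: actually acquiring the lock needs ACQUIRE ordering so the critical section cannot leak upwards, while merely registering as the waiting writer grants nothing and can use a relaxed compare-exchange. This is a user-space sketch with invented names and simplified constants, not the kernel's atomic_*()/cmpxchg_relaxed() API.

/* Sketch only; constants and layout are simplified relative to qrwlock. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define QW_LOCKED  0x0ffU               /* assumed: writer-holds-lock value */
#define QW_WAITING 0x100U               /* assumed: writer-is-waiting flag  */

static bool try_lock_direct(_Atomic uint32_t *cnts)
{
        uint32_t expected = 0;

        /* Taking the lock: ACQUIRE so the critical section can't move above this. */
        return atomic_compare_exchange_strong_explicit(cnts, &expected, QW_LOCKED,
                                                       memory_order_acquire,
                                                       memory_order_relaxed);
}

static bool mark_writer_waiting(_Atomic uint32_t *cnts)
{
        uint32_t expected = 0;

        /* Only records that a writer is queued; it grants nothing, so relaxed is enough. */
        return atomic_compare_exchange_strong_explicit(cnts, &expected, QW_WAITING,
                                                       memory_order_relaxed,
                                                       memory_order_relaxed);
}

int main(void)
{
        _Atomic uint32_t a = 0, b = 0;

        try_lock_direct(&a);            /* acquire-ordered lock acquisition */
        mark_writer_waiting(&b);        /* relaxed bookkeeping on another lock */
        return 0;
}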
Signed-off-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Waiman.Long@hp.com Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1438880084-18856-7-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- kernel/locking/qrwlock.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index 6a7a3b8d5ac9..f17a3e3b3550 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -55,7 +55,7 @@ rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) { while ((cnts & _QW_WMASK) == _QW_LOCKED) { cpu_relax_lowlatency(); - cnts = smp_load_acquire((u32 *)&lock->cnts); + cnts = atomic_read_acquire(&lock->cnts); } } @@ -74,8 +74,9 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) * Readers in interrupt context will get the lock immediately * if the writer is just waiting (not holding the lock yet). * The rspin_until_writer_unlock() function returns immediately - * in this case. Otherwise, they will spin until the lock - * is available without waiting in the queue. + * in this case. Otherwise, they will spin (with ACQUIRE + * semantics) until the lock is available without waiting in + * the queue. */ rspin_until_writer_unlock(lock, cnts); return; @@ -88,12 +89,11 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) arch_spin_lock(&lock->lock); /* - * At the head of the wait queue now, increment the reader count - * and wait until the writer, if it has the lock, has gone away. - * At ths stage, it is not possible for a writer to remain in the - * waiting state (_QW_WAITING). So there won't be any deadlock. + * The ACQUIRE semantics of the following spinning code ensure + * that accesses can't leak upwards out of our subsequent critical + * section in the case that the lock is currently held for write. */ - cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS; + cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS; rspin_until_writer_unlock(lock, cnts); /* @@ -116,7 +116,7 @@ void queued_write_lock_slowpath(struct qrwlock *lock) /* Try to acquire the lock directly if no reader is present */ if (!atomic_read(&lock->cnts) && - (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0)) + (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0)) goto unlock; /* @@ -127,7 +127,7 @@ void queued_write_lock_slowpath(struct qrwlock *lock) struct __qrwlock *l = (struct __qrwlock *)lock; if (!READ_ONCE(l->wmode) && - (cmpxchg(&l->wmode, 0, _QW_WAITING) == 0)) + (cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0)) break; cpu_relax_lowlatency(); @@ -137,8 +137,8 @@ void queued_write_lock_slowpath(struct qrwlock *lock) for (;;) { cnts = atomic_read(&lock->cnts); if ((cnts == _QW_WAITING) && - (atomic_cmpxchg(&lock->cnts, _QW_WAITING, - _QW_LOCKED) == _QW_WAITING)) + (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING, + _QW_LOCKED) == _QW_WAITING)) break; cpu_relax_lowlatency(); -- cgit v1.2.3
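Finally, a short sketch of the state handoff introduced by the "locking/pvqspinlock: Only kick CPU at unlock time" patch above, since its changelog is easier to follow with the transition spelled out. The enum values mirror that patch; everything else (types, the placement of the _Q_SLOW_VAL stand-in, the omitted pv_hash() call) is invented for this user-space C11 illustration.

/* Sketch of pv_kick_node()'s new behaviour; not the kernel implementation. */
#include <stdatomic.h>

enum vcpu_state { vcpu_running = 0, vcpu_halted, vcpu_hashed };

#define SLOW_VAL 3                      /* stand-in for _Q_SLOW_VAL */

struct pv_node_sketch { _Atomic int state; int cpu; };

/* Lock owner, right after setting next->locked: advance a halted queue head
 * instead of kicking it, deferring the expensive wakeup to unlock time. */
static void kick_node_sketch(_Atomic int *locked, struct pv_node_sketch *pn)
{
        int expected = vcpu_halted;

        /* Only act if the waiter really halted in pv_wait_node(). */
        if (!atomic_compare_exchange_strong_explicit(&pn->state, &expected,
                                                     vcpu_hashed,
                                                     memory_order_acq_rel,
                                                     memory_order_relaxed))
                return;                 /* still running; it will see ->locked itself */

        /* Publish the slow-unlock marker on the halted waiter's behalf.
         * (The real code also hashes the lock here via pv_hash().) */
        atomic_store_explicit(locked, SLOW_VAL, memory_order_release);
}

int main(void)
{
        _Atomic int locked = 1;
        struct pv_node_sketch pn = { vcpu_halted, 0 };

        kick_node_sketch(&locked, &pn); /* waiter advanced to vcpu_hashed */
        return 0;
}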