diff options
-rw-r--r-- | Documentation/RCU/stallwarn.txt | 7 | ||||
-rw-r--r-- | Documentation/RCU/torture.txt | 39 | ||||
-rw-r--r-- | Documentation/RCU/whatisRCU.txt | 6 | ||||
-rw-r--r-- | Documentation/locking/locktorture.txt | 3 | ||||
-rw-r--r-- | Documentation/memory-barriers.txt | 12 | ||||
-rw-r--r-- | include/linux/percpu-rwsem.h | 3 | ||||
-rw-r--r-- | include/linux/rcu_sync.h | 86 | ||||
-rw-r--r-- | kernel/locking/locktorture.c | 164 | ||||
-rw-r--r-- | kernel/locking/percpu-rwsem.c | 90 | ||||
-rw-r--r-- | kernel/rcu/Makefile | 2 | ||||
-rw-r--r-- | kernel/rcu/rcutorture.c | 10 | ||||
-rw-r--r-- | kernel/rcu/sync.c | 223 | ||||
-rw-r--r-- | kernel/torture.c | 1 | ||||
-rwxr-xr-x | tools/testing/selftests/rcutorture/bin/kvm.sh | 6 | ||||
-rw-r--r-- | tools/testing/selftests/rcutorture/configs/lock/CFLIST | 4 | ||||
-rw-r--r-- | tools/testing/selftests/rcutorture/configs/lock/LOCK05 | 6 | ||||
-rw-r--r-- | tools/testing/selftests/rcutorture/configs/lock/LOCK05.boot | 1 | ||||
-rw-r--r-- | tools/testing/selftests/rcutorture/configs/lock/LOCK06 | 6 | ||||
-rw-r--r-- | tools/testing/selftests/rcutorture/configs/lock/LOCK06.boot | 1 |
19 files changed, 568 insertions, 102 deletions
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index efb9454875ab..0f7fb4298e7e 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -205,6 +205,13 @@ o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the behavior, you might need to replace some of the cond_resched() calls with calls to cond_resched_rcu_qs(). +o Booting Linux using a console connection that is too slow to + keep up with the boot-time console-message rate. For example, + a 115Kbaud serial console can be -way- too slow to keep up + with boot-time message rates, and will frequently result in + RCU CPU stall warning messages. Especially if you have added + debug printk()s. + o Anything that prevents RCU's grace-period kthreads from running. This can result in the "All QSes seen" console-log message. This message will include information on when the kthread last diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt index dac02a6219b1..118e7c176ce7 100644 --- a/Documentation/RCU/torture.txt +++ b/Documentation/RCU/torture.txt @@ -166,40 +166,27 @@ test_no_idle_hz Whether or not to test the ability of RCU to operate in torture_type The type of RCU to test, with string values as follows: - "rcu": rcu_read_lock(), rcu_read_unlock() and call_rcu(). - - "rcu_sync": rcu_read_lock(), rcu_read_unlock(), and - synchronize_rcu(). - - "rcu_expedited": rcu_read_lock(), rcu_read_unlock(), and - synchronize_rcu_expedited(). + "rcu": rcu_read_lock(), rcu_read_unlock() and call_rcu(), + along with expedited, synchronous, and polling + variants. "rcu_bh": rcu_read_lock_bh(), rcu_read_unlock_bh(), and - call_rcu_bh(). - - "rcu_bh_sync": rcu_read_lock_bh(), rcu_read_unlock_bh(), - and synchronize_rcu_bh(). + call_rcu_bh(), along with expedited and synchronous + variants. - "rcu_bh_expedited": rcu_read_lock_bh(), rcu_read_unlock_bh(), - and synchronize_rcu_bh_expedited(). + "rcu_busted": This tests an intentionally incorrect version + of RCU in order to help test rcutorture itself. "srcu": srcu_read_lock(), srcu_read_unlock() and - call_srcu(). - - "srcu_sync": srcu_read_lock(), srcu_read_unlock() and - synchronize_srcu(). - - "srcu_expedited": srcu_read_lock(), srcu_read_unlock() and - synchronize_srcu_expedited(). + call_srcu(), along with expedited and + synchronous variants. "sched": preempt_disable(), preempt_enable(), and - call_rcu_sched(). - - "sched_sync": preempt_disable(), preempt_enable(), and - synchronize_sched(). + call_rcu_sched(), along with expedited, + synchronous, and polling variants. - "sched_expedited": preempt_disable(), preempt_enable(), and - synchronize_sched_expedited(). + "tasks": voluntary context switch and call_rcu_tasks(), + along with expedited and synchronous variants. Defaults to "rcu". diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index adc2184009c5..dc49c6712b17 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -364,7 +364,7 @@ uses of RCU may be found in listRCU.txt, arrayRCU.txt, and NMI-RCU.txt. }; DEFINE_SPINLOCK(foo_mutex); - struct foo *gbl_foo; + struct foo __rcu *gbl_foo; /* * Create a new struct foo that is the same as the one currently @@ -386,7 +386,7 @@ uses of RCU may be found in listRCU.txt, arrayRCU.txt, and NMI-RCU.txt. new_fp = kmalloc(sizeof(*new_fp), GFP_KERNEL); spin_lock(&foo_mutex); - old_fp = gbl_foo; + old_fp = rcu_dereference_protected(gbl_foo, lockdep_is_held(&foo_mutex)); *new_fp = *old_fp; new_fp->a = new_a; rcu_assign_pointer(gbl_foo, new_fp); @@ -487,7 +487,7 @@ The foo_update_a() function might then be written as follows: new_fp = kmalloc(sizeof(*new_fp), GFP_KERNEL); spin_lock(&foo_mutex); - old_fp = gbl_foo; + old_fp = rcu_dereference_protected(gbl_foo, lockdep_is_held(&foo_mutex)); *new_fp = *old_fp; new_fp->a = new_a; rcu_assign_pointer(gbl_foo, new_fp); diff --git a/Documentation/locking/locktorture.txt b/Documentation/locking/locktorture.txt index 619f2bb136a5..a2ef3a929bf1 100644 --- a/Documentation/locking/locktorture.txt +++ b/Documentation/locking/locktorture.txt @@ -52,6 +52,9 @@ torture_type Type of lock to torture. By default, only spinlocks will o "mutex_lock": mutex_lock() and mutex_unlock() pairs. + o "rtmutex_lock": rtmutex_lock() and rtmutex_unlock() + pairs. Kernel must have CONFIG_RT_MUTEX=y. + o "rwsem_lock": read/write down() and up() semaphore pairs. torture_runnable Start locktorture at boot time in the case where the diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index 2ba8461b0631..8e7cf9ad3db1 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -1710,6 +1710,17 @@ There are some more advanced barrier functions: operations" subsection for information on where to use these. + (*) lockless_dereference(); + This can be thought of as a pointer-fetch wrapper around the + smp_read_barrier_depends() data-dependency barrier. + + This is also similar to rcu_dereference(), but in cases where + object lifetime is handled by some mechanism other than RCU, for + example, when the objects removed only when the system goes down. + In addition, lockless_dereference() is used in some data structures + that can be used both with and without RCU. + + (*) dma_wmb(); (*) dma_rmb(); @@ -1789,7 +1800,6 @@ The Linux kernel has a number of locking constructs: (*) mutexes (*) semaphores (*) R/W semaphores - (*) RCU In all cases there are variants on "ACQUIRE" operations and "RELEASE" operations for each construct. These operations all imply certain barriers: diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 834c4e52cb2d..c2fa3ecb0dce 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -5,11 +5,12 @@ #include <linux/rwsem.h> #include <linux/percpu.h> #include <linux/wait.h> +#include <linux/rcu_sync.h> #include <linux/lockdep.h> struct percpu_rw_semaphore { + struct rcu_sync rss; unsigned int __percpu *fast_read_ctr; - atomic_t write_ctr; struct rw_semaphore rw_sem; atomic_t slow_read_ctr; wait_queue_head_t write_waitq; diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h new file mode 100644 index 000000000000..a63a33e6196e --- /dev/null +++ b/include/linux/rcu_sync.h @@ -0,0 +1,86 @@ +/* + * RCU-based infrastructure for lightweight reader-writer locking + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (c) 2015, Red Hat, Inc. + * + * Author: Oleg Nesterov <oleg@redhat.com> + */ + +#ifndef _LINUX_RCU_SYNC_H_ +#define _LINUX_RCU_SYNC_H_ + +#include <linux/wait.h> +#include <linux/rcupdate.h> + +enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC }; + +/* Structure to mediate between updaters and fastpath-using readers. */ +struct rcu_sync { + int gp_state; + int gp_count; + wait_queue_head_t gp_wait; + + int cb_state; + struct rcu_head cb_head; + + enum rcu_sync_type gp_type; +}; + +extern void rcu_sync_lockdep_assert(struct rcu_sync *); + +/** + * rcu_sync_is_idle() - Are readers permitted to use their fastpaths? + * @rsp: Pointer to rcu_sync structure to use for synchronization + * + * Returns true if readers are permitted to use their fastpaths. + * Must be invoked within an RCU read-side critical section whose + * flavor matches that of the rcu_sync struture. + */ +static inline bool rcu_sync_is_idle(struct rcu_sync *rsp) +{ +#ifdef CONFIG_PROVE_RCU + rcu_sync_lockdep_assert(rsp); +#endif + return !rsp->gp_state; /* GP_IDLE */ +} + +extern void rcu_sync_init(struct rcu_sync *, enum rcu_sync_type); +extern void rcu_sync_enter(struct rcu_sync *); +extern void rcu_sync_exit(struct rcu_sync *); +extern void rcu_sync_dtor(struct rcu_sync *); + +#define __RCU_SYNC_INITIALIZER(name, type) { \ + .gp_state = 0, \ + .gp_count = 0, \ + .gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait), \ + .cb_state = 0, \ + .gp_type = type, \ + } + +#define __DEFINE_RCU_SYNC(name, type) \ + struct rcu_sync_struct name = __RCU_SYNC_INITIALIZER(name, type) + +#define DEFINE_RCU_SYNC(name) \ + __DEFINE_RCU_SYNC(name, RCU_SYNC) + +#define DEFINE_RCU_SCHED_SYNC(name) \ + __DEFINE_RCU_SYNC(name, RCU_SCHED_SYNC) + +#define DEFINE_RCU_BH_SYNC(name) \ + __DEFINE_RCU_SYNC(name, RCU_BH_SYNC) + +#endif /* _LINUX_RCU_SYNC_H_ */ diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 32244186f1f2..8ef1919d63b2 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -17,12 +17,14 @@ * * Copyright (C) IBM Corporation, 2014 * - * Author: Paul E. McKenney <paulmck@us.ibm.com> + * Authors: Paul E. McKenney <paulmck@us.ibm.com> + * Davidlohr Bueso <dave@stgolabs.net> * Based on kernel/rcu/torture.c. */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/kthread.h> +#include <linux/sched/rt.h> #include <linux/spinlock.h> #include <linux/rwlock.h> #include <linux/mutex.h> @@ -34,6 +36,7 @@ #include <linux/moduleparam.h> #include <linux/delay.h> #include <linux/slab.h> +#include <linux/percpu-rwsem.h> #include <linux/torture.h> MODULE_LICENSE("GPL"); @@ -91,11 +94,13 @@ struct lock_torture_ops { void (*init)(void); int (*writelock)(void); void (*write_delay)(struct torture_random_state *trsp); + void (*task_boost)(struct torture_random_state *trsp); void (*writeunlock)(void); int (*readlock)(void); void (*read_delay)(struct torture_random_state *trsp); void (*readunlock)(void); - unsigned long flags; + + unsigned long flags; /* for irq spinlocks */ const char *name; }; @@ -139,9 +144,15 @@ static void torture_lock_busted_write_unlock(void) /* BUGGY, do not use in real life!!! */ } +static void torture_boost_dummy(struct torture_random_state *trsp) +{ + /* Only rtmutexes care about priority */ +} + static struct lock_torture_ops lock_busted_ops = { .writelock = torture_lock_busted_write_lock, .write_delay = torture_lock_busted_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_lock_busted_write_unlock, .readlock = NULL, .read_delay = NULL, @@ -185,6 +196,7 @@ static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock) static struct lock_torture_ops spin_lock_ops = { .writelock = torture_spin_lock_write_lock, .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_spin_lock_write_unlock, .readlock = NULL, .read_delay = NULL, @@ -211,6 +223,7 @@ __releases(torture_spinlock) static struct lock_torture_ops spin_lock_irq_ops = { .writelock = torture_spin_lock_write_lock_irq, .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_lock_spin_write_unlock_irq, .readlock = NULL, .read_delay = NULL, @@ -275,6 +288,7 @@ static void torture_rwlock_read_unlock(void) __releases(torture_rwlock) static struct lock_torture_ops rw_lock_ops = { .writelock = torture_rwlock_write_lock, .write_delay = torture_rwlock_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_rwlock_write_unlock, .readlock = torture_rwlock_read_lock, .read_delay = torture_rwlock_read_delay, @@ -315,6 +329,7 @@ __releases(torture_rwlock) static struct lock_torture_ops rw_lock_irq_ops = { .writelock = torture_rwlock_write_lock_irq, .write_delay = torture_rwlock_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_rwlock_write_unlock_irq, .readlock = torture_rwlock_read_lock_irq, .read_delay = torture_rwlock_read_delay, @@ -354,6 +369,7 @@ static void torture_mutex_unlock(void) __releases(torture_mutex) static struct lock_torture_ops mutex_lock_ops = { .writelock = torture_mutex_lock, .write_delay = torture_mutex_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_mutex_unlock, .readlock = NULL, .read_delay = NULL, @@ -361,6 +377,90 @@ static struct lock_torture_ops mutex_lock_ops = { .name = "mutex_lock" }; +#ifdef CONFIG_RT_MUTEXES +static DEFINE_RT_MUTEX(torture_rtmutex); + +static int torture_rtmutex_lock(void) __acquires(torture_rtmutex) +{ + rt_mutex_lock(&torture_rtmutex); + return 0; +} + +static void torture_rtmutex_boost(struct torture_random_state *trsp) +{ + int policy; + struct sched_param param; + const unsigned int factor = 50000; /* yes, quite arbitrary */ + + if (!rt_task(current)) { + /* + * (1) Boost priority once every ~50k operations. When the + * task tries to take the lock, the rtmutex it will account + * for the new priority, and do any corresponding pi-dance. + */ + if (!(torture_random(trsp) % + (cxt.nrealwriters_stress * factor))) { + policy = SCHED_FIFO; + param.sched_priority = MAX_RT_PRIO - 1; + } else /* common case, do nothing */ + return; + } else { + /* + * The task will remain boosted for another ~500k operations, + * then restored back to its original prio, and so forth. + * + * When @trsp is nil, we want to force-reset the task for + * stopping the kthread. + */ + if (!trsp || !(torture_random(trsp) % + (cxt.nrealwriters_stress * factor * 2))) { + policy = SCHED_NORMAL; + param.sched_priority = 0; + } else /* common case, do nothing */ + return; + } + + sched_setscheduler_nocheck(current, policy, ¶m); +} + +static void torture_rtmutex_delay(struct torture_random_state *trsp) +{ + const unsigned long shortdelay_us = 2; + const unsigned long longdelay_ms = 100; + + /* + * We want a short delay mostly to emulate likely code, and + * we want a long delay occasionally to force massive contention. + */ + if (!(torture_random(trsp) % + (cxt.nrealwriters_stress * 2000 * longdelay_ms))) + mdelay(longdelay_ms); + if (!(torture_random(trsp) % + (cxt.nrealwriters_stress * 2 * shortdelay_us))) + udelay(shortdelay_us); +#ifdef CONFIG_PREEMPT + if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) + preempt_schedule(); /* Allow test to be preempted. */ +#endif +} + +static void torture_rtmutex_unlock(void) __releases(torture_rtmutex) +{ + rt_mutex_unlock(&torture_rtmutex); +} + +static struct lock_torture_ops rtmutex_lock_ops = { + .writelock = torture_rtmutex_lock, + .write_delay = torture_rtmutex_delay, + .task_boost = torture_rtmutex_boost, + .writeunlock = torture_rtmutex_unlock, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "rtmutex_lock" +}; +#endif + static DECLARE_RWSEM(torture_rwsem); static int torture_rwsem_down_write(void) __acquires(torture_rwsem) { @@ -419,6 +519,7 @@ static void torture_rwsem_up_read(void) __releases(torture_rwsem) static struct lock_torture_ops rwsem_lock_ops = { .writelock = torture_rwsem_down_write, .write_delay = torture_rwsem_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_rwsem_up_write, .readlock = torture_rwsem_down_read, .read_delay = torture_rwsem_read_delay, @@ -426,6 +527,48 @@ static struct lock_torture_ops rwsem_lock_ops = { .name = "rwsem_lock" }; +#include <linux/percpu-rwsem.h> +static struct percpu_rw_semaphore pcpu_rwsem; + +void torture_percpu_rwsem_init(void) +{ + BUG_ON(percpu_init_rwsem(&pcpu_rwsem)); +} + +static int torture_percpu_rwsem_down_write(void) __acquires(pcpu_rwsem) +{ + percpu_down_write(&pcpu_rwsem); + return 0; +} + +static void torture_percpu_rwsem_up_write(void) __releases(pcpu_rwsem) +{ + percpu_up_write(&pcpu_rwsem); +} + +static int torture_percpu_rwsem_down_read(void) __acquires(pcpu_rwsem) +{ + percpu_down_read(&pcpu_rwsem); + return 0; +} + +static void torture_percpu_rwsem_up_read(void) __releases(pcpu_rwsem) +{ + percpu_up_read(&pcpu_rwsem); +} + +static struct lock_torture_ops percpu_rwsem_lock_ops = { + .init = torture_percpu_rwsem_init, + .writelock = torture_percpu_rwsem_down_write, + .write_delay = torture_rwsem_write_delay, + .task_boost = torture_boost_dummy, + .writeunlock = torture_percpu_rwsem_up_write, + .readlock = torture_percpu_rwsem_down_read, + .read_delay = torture_rwsem_read_delay, + .readunlock = torture_percpu_rwsem_up_read, + .name = "percpu_rwsem_lock" +}; + /* * Lock torture writer kthread. Repeatedly acquires and releases * the lock, checking for duplicate acquisitions. @@ -442,6 +585,7 @@ static int lock_torture_writer(void *arg) if ((torture_random(&rand) & 0xfffff) == 0) schedule_timeout_uninterruptible(1); + cxt.cur_ops->task_boost(&rand); cxt.cur_ops->writelock(); if (WARN_ON_ONCE(lock_is_write_held)) lwsp->n_lock_fail++; @@ -456,6 +600,8 @@ static int lock_torture_writer(void *arg) stutter_wait("lock_torture_writer"); } while (!torture_must_stop()); + + cxt.cur_ops->task_boost(NULL); /* reset prio */ torture_kthread_stopping("lock_torture_writer"); return 0; } @@ -642,7 +788,11 @@ static int __init lock_torture_init(void) &spin_lock_ops, &spin_lock_irq_ops, &rw_lock_ops, &rw_lock_irq_ops, &mutex_lock_ops, +#ifdef CONFIG_RT_MUTEXES + &rtmutex_lock_ops, +#endif &rwsem_lock_ops, + &percpu_rwsem_lock_ops, }; if (!torture_init_begin(torture_type, verbose, &torture_runnable)) @@ -661,11 +811,11 @@ static int __init lock_torture_init(void) for (i = 0; i < ARRAY_SIZE(torture_ops); i++) pr_alert(" %s", torture_ops[i]->name); pr_alert("\n"); - torture_init_end(); - return -EINVAL; + firsterr = -EINVAL; + goto unwind; } if (cxt.cur_ops->init) - cxt.cur_ops->init(); /* no "goto unwind" prior to this point!!! */ + cxt.cur_ops->init(); if (nwriters_stress >= 0) cxt.nrealwriters_stress = nwriters_stress; @@ -676,6 +826,10 @@ static int __init lock_torture_init(void) if (strncmp(torture_type, "mutex", 5) == 0) cxt.debug_lock = true; #endif +#ifdef CONFIG_DEBUG_RT_MUTEXES + if (strncmp(torture_type, "rtmutex", 7) == 0) + cxt.debug_lock = true; +#endif #ifdef CONFIG_DEBUG_SPINLOCK if ((strncmp(torture_type, "spin", 4) == 0) || (strncmp(torture_type, "rw_lock", 7) == 0)) diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index f32567254867..f231e0bb311c 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -17,50 +17,43 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ __init_rwsem(&brw->rw_sem, name, rwsem_key); - atomic_set(&brw->write_ctr, 0); + rcu_sync_init(&brw->rss, RCU_SCHED_SYNC); atomic_set(&brw->slow_read_ctr, 0); init_waitqueue_head(&brw->write_waitq); return 0; } +EXPORT_SYMBOL_GPL(__percpu_init_rwsem); void percpu_free_rwsem(struct percpu_rw_semaphore *brw) { + /* + * XXX: temporary kludge. The error path in alloc_super() + * assumes that percpu_free_rwsem() is safe after kzalloc(). + */ + if (!brw->fast_read_ctr) + return; + + rcu_sync_dtor(&brw->rss); free_percpu(brw->fast_read_ctr); brw->fast_read_ctr = NULL; /* catch use after free bugs */ } /* - * This is the fast-path for down_read/up_read, it only needs to ensure - * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the - * fast per-cpu counter. The writer uses synchronize_sched_expedited() to - * serialize with the preempt-disabled section below. - * - * The nontrivial part is that we should guarantee acquire/release semantics - * in case when - * - * R_W: down_write() comes after up_read(), the writer should see all - * changes done by the reader - * or - * W_R: down_read() comes after up_write(), the reader should see all - * changes done by the writer + * This is the fast-path for down_read/up_read. If it succeeds we rely + * on the barriers provided by rcu_sync_enter/exit; see the comments in + * percpu_down_write() and percpu_up_write(). * * If this helper fails the callers rely on the normal rw_semaphore and * atomic_dec_and_test(), so in this case we have the necessary barriers. - * - * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or - * __this_cpu_add() below can be reordered with any LOAD/STORE done by the - * reader inside the critical section. See the comments in down_write and - * up_write below. */ static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) { - bool success = false; + bool success; preempt_disable(); - if (likely(!atomic_read(&brw->write_ctr))) { + success = rcu_sync_is_idle(&brw->rss); + if (likely(success)) __this_cpu_add(*brw->fast_read_ctr, val); - success = true; - } preempt_enable(); return success; @@ -77,16 +70,17 @@ static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) void percpu_down_read(struct percpu_rw_semaphore *brw) { might_sleep(); - if (likely(update_fast_ctr(brw, +1))) { - rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); + rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); + + if (likely(update_fast_ctr(brw, +1))) return; - } - down_read(&brw->rw_sem); + /* Avoid rwsem_acquire_read() and rwsem_release() */ + __down_read(&brw->rw_sem); atomic_inc(&brw->slow_read_ctr); - /* avoid up_read()->rwsem_release() */ __up_read(&brw->rw_sem); } +EXPORT_SYMBOL_GPL(percpu_down_read); int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) { @@ -112,6 +106,7 @@ void percpu_up_read(struct percpu_rw_semaphore *brw) if (atomic_dec_and_test(&brw->slow_read_ctr)) wake_up_all(&brw->write_waitq); } +EXPORT_SYMBOL_GPL(percpu_up_read); static int clear_fast_ctr(struct percpu_rw_semaphore *brw) { @@ -126,33 +121,17 @@ static int clear_fast_ctr(struct percpu_rw_semaphore *brw) return sum; } -/* - * A writer increments ->write_ctr to force the readers to switch to the - * slow mode, note the atomic_read() check in update_fast_ctr(). - * - * After that the readers can only inc/dec the slow ->slow_read_ctr counter, - * ->fast_read_ctr is stable. Once the writer moves its sum into the slow - * counter it represents the number of active readers. - * - * Finally the writer takes ->rw_sem for writing and blocks the new readers, - * then waits until the slow counter becomes zero. - */ void percpu_down_write(struct percpu_rw_semaphore *brw) { - /* tell update_fast_ctr() there is a pending writer */ - atomic_inc(&brw->write_ctr); /* - * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read - * so that update_fast_ctr() can't succeed. - * - * 2. Ensures we see the result of every previous this_cpu_add() in - * update_fast_ctr(). + * Make rcu_sync_is_idle() == F and thus disable the fast-path in + * percpu_down_read() and percpu_up_read(), and wait for gp pass. * - * 3. Ensures that if any reader has exited its critical section via - * fast-path, it executes a full memory barrier before we return. - * See R_W case in the comment above update_fast_ctr(). + * The latter synchronises us with the preceding readers which used + * the fast-past, so we can not miss the result of __this_cpu_add() + * or anything else inside their criticial sections. */ - synchronize_sched_expedited(); + rcu_sync_enter(&brw->rss); /* exclude other writers, and block the new readers completely */ down_write(&brw->rw_sem); @@ -163,16 +142,17 @@ void percpu_down_write(struct percpu_rw_semaphore *brw) /* wait for all readers to complete their percpu_up_read() */ wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); } +EXPORT_SYMBOL_GPL(percpu_down_write); void percpu_up_write(struct percpu_rw_semaphore *brw) { /* release the lock, but the readers can't use the fast-path */ up_write(&brw->rw_sem); /* - * Insert the barrier before the next fast-path in down_read, - * see W_R case in the comment above update_fast_ctr(). + * Enable the fast-path in percpu_down_read() and percpu_up_read() + * but only after another gp pass; this adds the necessary barrier + * to ensure the reader can't miss the changes done by us. */ - synchronize_sched_expedited(); - /* the last writer unblocks update_fast_ctr() */ - atomic_dec(&brw->write_ctr); + rcu_sync_exit(&brw->rss); } +EXPORT_SYMBOL_GPL(percpu_up_write); diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 50a808424b06..61a16569ffbf 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -1,4 +1,4 @@ -obj-y += update.o +obj-y += update.o sync.o obj-$(CONFIG_SRCU) += srcu.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_TREE_RCU) += tree.o diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f9ec6cbe77d3..d89328e260df 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -695,7 +695,7 @@ static bool __maybe_unused torturing_tasks(void) #define RCUTORTURE_TASKS_OPS -static bool torturing_tasks(void) +static bool __maybe_unused torturing_tasks(void) { return false; } @@ -768,7 +768,6 @@ static int rcu_torture_boost(void *arg) } call_rcu_time = jiffies; } - cond_resched_rcu_qs(); stutter_wait("rcu_torture_boost"); if (torture_must_stop()) goto checkwait; @@ -1208,7 +1207,6 @@ rcu_torture_reader(void *arg) __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); cur_ops->readunlock(idx); - cond_resched_rcu_qs(); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); if (irqreader && cur_ops->irq_capable) { @@ -1742,15 +1740,15 @@ rcu_torture_init(void) for (i = 0; i < ARRAY_SIZE(torture_ops); i++) pr_alert(" %s", torture_ops[i]->name); pr_alert("\n"); - torture_init_end(); - return -EINVAL; + firsterr = -EINVAL; + goto unwind; } if (cur_ops->fqs == NULL && fqs_duration != 0) { pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); fqs_duration = 0; } if (cur_ops->init) - cur_ops->init(); /* no "goto unwind" prior to this point!!! */ + cur_ops->init(); if (nreaders >= 0) { nrealreaders = nreaders; diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c new file mode 100644 index 000000000000..be922c9f3d37 --- /dev/null +++ b/kernel/rcu/sync.c @@ -0,0 +1,223 @@ +/* + * RCU-based infrastructure for lightweight reader-writer locking + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (c) 2015, Red Hat, Inc. + * + * Author: Oleg Nesterov <oleg@redhat.com> + */ + +#include <linux/rcu_sync.h> +#include <linux/sched.h> + +#ifdef CONFIG_PROVE_RCU +#define __INIT_HELD(func) .held = func, +#else +#define __INIT_HELD(func) +#endif + +static const struct { + void (*sync)(void); + void (*call)(struct rcu_head *, void (*)(struct rcu_head *)); + void (*wait)(void); +#ifdef CONFIG_PROVE_RCU + int (*held)(void); +#endif +} gp_ops[] = { + [RCU_SYNC] = { + .sync = synchronize_rcu, + .call = call_rcu, + .wait = rcu_barrier, + __INIT_HELD(rcu_read_lock_held) + }, + [RCU_SCHED_SYNC] = { + .sync = synchronize_sched, + .call = call_rcu_sched, + .wait = rcu_barrier_sched, + __INIT_HELD(rcu_read_lock_sched_held) + }, + [RCU_BH_SYNC] = { + .sync = synchronize_rcu_bh, + .call = call_rcu_bh, + .wait = rcu_barrier_bh, + __INIT_HELD(rcu_read_lock_bh_held) + }, +}; + +enum { GP_IDLE = 0, GP_PENDING, GP_PASSED }; +enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY }; + +#define rss_lock gp_wait.lock + +#ifdef CONFIG_PROVE_RCU +void rcu_sync_lockdep_assert(struct rcu_sync *rsp) +{ + RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(), + "suspicious rcu_sync_is_idle() usage"); +} +#endif + +/** + * rcu_sync_init() - Initialize an rcu_sync structure + * @rsp: Pointer to rcu_sync structure to be initialized + * @type: Flavor of RCU with which to synchronize rcu_sync structure + */ +void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) +{ + memset(rsp, 0, sizeof(*rsp)); + init_waitqueue_head(&rsp->gp_wait); + rsp->gp_type = type; +} + +/** + * rcu_sync_enter() - Force readers onto slowpath + * @rsp: Pointer to rcu_sync structure to use for synchronization + * + * This function is used by updaters who need readers to make use of + * a slowpath during the update. After this function returns, all + * subsequent calls to rcu_sync_is_idle() will return false, which + * tells readers to stay off their fastpaths. A later call to + * rcu_sync_exit() re-enables reader slowpaths. + * + * When called in isolation, rcu_sync_enter() must wait for a grace + * period, however, closely spaced calls to rcu_sync_enter() can + * optimize away the grace-period wait via a state machine implemented + * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func(). + */ +void rcu_sync_enter(struct rcu_sync *rsp) +{ + bool need_wait, need_sync; + + spin_lock_irq(&rsp->rss_lock); + need_wait = rsp->gp_count++; + need_sync = rsp->gp_state == GP_IDLE; + if (need_sync) + rsp->gp_state = GP_PENDING; + spin_unlock_irq(&rsp->rss_lock); + + BUG_ON(need_wait && need_sync); + + if (need_sync) { + gp_ops[rsp->gp_type].sync(); + rsp->gp_state = GP_PASSED; + wake_up_all(&rsp->gp_wait); + } else if (need_wait) { + wait_event(rsp->gp_wait, rsp->gp_state == GP_PASSED); + } else { + /* + * Possible when there's a pending CB from a rcu_sync_exit(). + * Nobody has yet been allowed the 'fast' path and thus we can + * avoid doing any sync(). The callback will get 'dropped'. + */ + BUG_ON(rsp->gp_state != GP_PASSED); + } +} + +/** + * rcu_sync_func() - Callback function managing reader access to fastpath + * @rsp: Pointer to rcu_sync structure to use for synchronization + * + * This function is passed to one of the call_rcu() functions by + * rcu_sync_exit(), so that it is invoked after a grace period following the + * that invocation of rcu_sync_exit(). It takes action based on events that + * have taken place in the meantime, so that closely spaced rcu_sync_enter() + * and rcu_sync_exit() pairs need not wait for a grace period. + * + * If another rcu_sync_enter() is invoked before the grace period + * ended, reset state to allow the next rcu_sync_exit() to let the + * readers back onto their fastpaths (after a grace period). If both + * another rcu_sync_enter() and its matching rcu_sync_exit() are invoked + * before the grace period ended, re-invoke call_rcu() on behalf of that + * rcu_sync_exit(). Otherwise, set all state back to idle so that readers + * can again use their fastpaths. + */ +static void rcu_sync_func(struct rcu_head *rcu) +{ + struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head); + unsigned long flags; + + BUG_ON(rsp->gp_state != GP_PASSED); + BUG_ON(rsp->cb_state == CB_IDLE); + + spin_lock_irqsave(&rsp->rss_lock, flags); + if (rsp->gp_count) { + /* + * A new rcu_sync_begin() has happened; drop the callback. + */ + rsp->cb_state = CB_IDLE; + } else if (rsp->cb_state == CB_REPLAY) { + /* + * A new rcu_sync_exit() has happened; requeue the callback + * to catch a later GP. + */ + rsp->cb_state = CB_PENDING; + gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); + } else { + /* + * We're at least a GP after rcu_sync_exit(); eveybody will now + * have observed the write side critical section. Let 'em rip!. + */ + rsp->cb_state = CB_IDLE; + rsp->gp_state = GP_IDLE; + } + spin_unlock_irqrestore(&rsp->rss_lock, flags); +} + +/** + * rcu_sync_exit() - Allow readers back onto fast patch after grace period + * @rsp: Pointer to rcu_sync structure to use for synchronization + * + * This function is used by updaters who have completed, and can therefore + * now allow readers to make use of their fastpaths after a grace period + * has elapsed. After this grace period has completed, all subsequent + * calls to rcu_sync_is_idle() will return true, which tells readers that + * they can once again use their fastpaths. + */ +void rcu_sync_exit(struct rcu_sync *rsp) +{ + spin_lock_irq(&rsp->rss_lock); + if (!--rsp->gp_count) { + if (rsp->cb_state == CB_IDLE) { + rsp->cb_state = CB_PENDING; + gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); + } else if (rsp->cb_state == CB_PENDING) { + rsp->cb_state = CB_REPLAY; + } + } + spin_unlock_irq(&rsp->rss_lock); +} + +/** + * rcu_sync_dtor() - Clean up an rcu_sync structure + * @rsp: Pointer to rcu_sync structure to be cleaned up + */ +void rcu_sync_dtor(struct rcu_sync *rsp) +{ + int cb_state; + + BUG_ON(rsp->gp_count); + + spin_lock_irq(&rsp->rss_lock); + if (rsp->cb_state == CB_REPLAY) + rsp->cb_state = CB_PENDING; + cb_state = rsp->cb_state; + spin_unlock_irq(&rsp->rss_lock); + + if (cb_state != CB_IDLE) { + gp_ops[rsp->gp_type].wait(); + BUG_ON(rsp->cb_state != CB_IDLE); + } +} diff --git a/kernel/torture.c b/kernel/torture.c index 3e4840633d3e..44aa462d033f 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -523,6 +523,7 @@ static int stutter; */ void stutter_wait(const char *title) { + cond_resched_rcu_qs(); while (READ_ONCE(stutter_pause_test) || (torture_runnable && !READ_ONCE(*torture_runnable))) { if (stutter_pause_test) diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index fbe2dbff1e21..f6483609ebc2 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -75,7 +75,7 @@ usage () { while test $# -gt 0 do case "$1" in - --bootargs) + --bootargs|--bootarg) checkarg --bootargs "(list of kernel boot arguments)" "$#" "$2" '.*' '^--' TORTURE_BOOTARGS="$2" shift @@ -88,7 +88,7 @@ do --buildonly) TORTURE_BUILDONLY=1 ;; - --configs) + --configs|--config) checkarg --configs "(list of config files)" "$#" "$2" '^[^/]*$' '^--' configs="$2" shift @@ -134,7 +134,7 @@ do --no-initrd) TORTURE_INITRD=""; export TORTURE_INITRD ;; - --qemu-args) + --qemu-args|--qemu-arg) checkarg --qemu-args "-qemu args" $# "$2" '^-' '^error' TORTURE_QEMU_ARG="$2" shift diff --git a/tools/testing/selftests/rcutorture/configs/lock/CFLIST b/tools/testing/selftests/rcutorture/configs/lock/CFLIST index 6910b7370761..b9611c523723 100644 --- a/tools/testing/selftests/rcutorture/configs/lock/CFLIST +++ b/tools/testing/selftests/rcutorture/configs/lock/CFLIST @@ -1,4 +1,6 @@ LOCK01 LOCK02 LOCK03 -LOCK04
\ No newline at end of file +LOCK04 +LOCK05 +LOCK06 diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK05 b/tools/testing/selftests/rcutorture/configs/lock/LOCK05 new file mode 100644 index 000000000000..1d1da1477fc3 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK05 @@ -0,0 +1,6 @@ +CONFIG_SMP=y +CONFIG_NR_CPUS=4 +CONFIG_HOTPLUG_CPU=y +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK05.boot b/tools/testing/selftests/rcutorture/configs/lock/LOCK05.boot new file mode 100644 index 000000000000..8ac37307c987 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK05.boot @@ -0,0 +1 @@ +locktorture.torture_type=rtmutex_lock diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK06 b/tools/testing/selftests/rcutorture/configs/lock/LOCK06 new file mode 100644 index 000000000000..1d1da1477fc3 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK06 @@ -0,0 +1,6 @@ +CONFIG_SMP=y +CONFIG_NR_CPUS=4 +CONFIG_HOTPLUG_CPU=y +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK06.boot b/tools/testing/selftests/rcutorture/configs/lock/LOCK06.boot new file mode 100644 index 000000000000..f92219cd4ad9 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK06.boot @@ -0,0 +1 @@ +locktorture.torture_type=percpu_rwsem_lock |