19 files changed, 568 insertions, 102 deletions
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index efb9454875ab..0f7fb4298e7e 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -205,6 +205,13 @@ o	For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the
 	behavior, you might need to replace some of the cond_resched()
 	calls with calls to cond_resched_rcu_qs().
 
+o	Booting Linux using a console connection that is too slow to
+	keep up with the boot-time console-message rate.  For example,
+	a 115Kbaud serial console can be -way- too slow to keep up
+	with boot-time message rates, and will frequently result in
+	RCU CPU stall warning messages.  Especially if you have added
+	debug printk()s.
+
 o	Anything that prevents RCU's grace-period kthreads from running.
 	This can result in the "All QSes seen" console-log message.
 	This message will include information on when the kthread last
diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt
index dac02a6219b1..118e7c176ce7 100644
--- a/Documentation/RCU/torture.txt
+++ b/Documentation/RCU/torture.txt
@@ -166,40 +166,27 @@ test_no_idle_hz	Whether or not to test the ability of RCU to operate in
 
 torture_type	The type of RCU to test, with string values as follows:
 
-		"rcu":  rcu_read_lock(), rcu_read_unlock() and call_rcu().
-
-		"rcu_sync":  rcu_read_lock(), rcu_read_unlock(), and
-			synchronize_rcu().
-
-		"rcu_expedited": rcu_read_lock(), rcu_read_unlock(), and
-			synchronize_rcu_expedited().
+		"rcu":  rcu_read_lock(), rcu_read_unlock() and call_rcu(),
+			along with expedited, synchronous, and polling
+			variants.
 
 		"rcu_bh": rcu_read_lock_bh(), rcu_read_unlock_bh(), and
-			call_rcu_bh().
-
-		"rcu_bh_sync": rcu_read_lock_bh(), rcu_read_unlock_bh(),
-			and synchronize_rcu_bh().
+			call_rcu_bh(), along with expedited and synchronous
+			variants.
 
-		"rcu_bh_expedited": rcu_read_lock_bh(), rcu_read_unlock_bh(),
-			and synchronize_rcu_bh_expedited().
+		"rcu_busted": This tests an intentionally incorrect version
+			of RCU in order to help test rcutorture itself.
 
 		"srcu": srcu_read_lock(), srcu_read_unlock() and
-			call_srcu().
-
-		"srcu_sync": srcu_read_lock(), srcu_read_unlock() and
-			synchronize_srcu().
-
-		"srcu_expedited": srcu_read_lock(), srcu_read_unlock() and
-			synchronize_srcu_expedited().
+			call_srcu(), along with expedited and
+			synchronous variants.
 
 		"sched": preempt_disable(), preempt_enable(), and
-			call_rcu_sched().
-
-		"sched_sync": preempt_disable(), preempt_enable(), and
-			synchronize_sched().
+			call_rcu_sched(), along with expedited,
+			synchronous, and polling variants.
 
-		"sched_expedited": preempt_disable(), preempt_enable(), and
-			synchronize_sched_expedited().
+		"tasks": voluntary context switch and call_rcu_tasks(),
+			along with expedited and synchronous variants.
 
 		Defaults to "rcu".
 
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index adc2184009c5..dc49c6712b17 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -364,7 +364,7 @@ uses of RCU may be found in listRCU.txt, arrayRCU.txt, and NMI-RCU.txt.
 	};
 	DEFINE_SPINLOCK(foo_mutex);
 
-	struct foo *gbl_foo;
+	struct foo __rcu *gbl_foo;
 
 	/*
 	 * Create a new struct foo that is the same as the one currently
@@ -386,7 +386,7 @@ uses of RCU may be found in listRCU.txt, arrayRCU.txt, and NMI-RCU.txt.
 
 		new_fp = kmalloc(sizeof(*new_fp), GFP_KERNEL);
 		spin_lock(&foo_mutex);
-		old_fp = gbl_foo;
+		old_fp = rcu_dereference_protected(gbl_foo, lockdep_is_held(&foo_mutex));
 		*new_fp = *old_fp;
 		new_fp->a = new_a;
 		rcu_assign_pointer(gbl_foo, new_fp);
@@ -487,7 +487,7 @@ The foo_update_a() function might then be written as follows:
 
 		new_fp = kmalloc(sizeof(*new_fp), GFP_KERNEL);
 		spin_lock(&foo_mutex);
-		old_fp = gbl_foo;
+		old_fp = rcu_dereference_protected(gbl_foo, lockdep_is_held(&foo_mutex));
 		*new_fp = *old_fp;
 		new_fp->a = new_a;
 		rcu_assign_pointer(gbl_foo, new_fp);
diff --git a/Documentation/locking/locktorture.txt b/Documentation/locking/locktorture.txt
index 619f2bb136a5..a2ef3a929bf1 100644
--- a/Documentation/locking/locktorture.txt
+++ b/Documentation/locking/locktorture.txt
@@ -52,6 +52,9 @@ torture_type	  Type of lock to torture. By default, only spinlocks will
 
 		     o "mutex_lock": mutex_lock() and mutex_unlock() pairs.
 
+		     o "rtmutex_lock": rtmutex_lock() and rtmutex_unlock()
+				       pairs. Kernel must have CONFIG_RT_MUTEX=y.
+
 		     o "rwsem_lock": read/write down() and up() semaphore pairs.
 
 torture_runnable  Start locktorture at boot time in the case where the
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 2ba8461b0631..8e7cf9ad3db1 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -1710,6 +1710,17 @@ There are some more advanced barrier functions:
      operations" subsection for information on where to use these.
 
 
+ (*) lockless_dereference();
+     This can be thought of as a pointer-fetch wrapper around the
+     smp_read_barrier_depends() data-dependency barrier.
+
+     This is also similar to rcu_dereference(), but in cases where
+     object lifetime is handled by some mechanism other than RCU, for
+     example, when the objects removed only when the system goes down.
+     In addition, lockless_dereference() is used in some data structures
+     that can be used both with and without RCU.
+
+
  (*) dma_wmb();
  (*) dma_rmb();
 
@@ -1789,7 +1800,6 @@ The Linux kernel has a number of locking constructs:
  (*) mutexes
  (*) semaphores
  (*) R/W semaphores
- (*) RCU
 
 In all cases there are variants on "ACQUIRE" operations and "RELEASE" operations
 for each construct.  These operations all imply certain barriers:
diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index 834c4e52cb2d..c2fa3ecb0dce 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -5,11 +5,12 @@
 #include <linux/rwsem.h>
 #include <linux/percpu.h>
 #include <linux/wait.h>
+#include <linux/rcu_sync.h>
 #include <linux/lockdep.h>
 
 struct percpu_rw_semaphore {
+	struct rcu_sync		rss;
 	unsigned int __percpu	*fast_read_ctr;
-	atomic_t		write_ctr;
 	struct rw_semaphore	rw_sem;
 	atomic_t		slow_read_ctr;
 	wait_queue_head_t	write_waitq;
diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h
new file mode 100644
index 000000000000..a63a33e6196e
--- /dev/null
+++ b/include/linux/rcu_sync.h
@@ -0,0 +1,86 @@
+/*
+ * RCU-based infrastructure for lightweight reader-writer locking
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright (c) 2015, Red Hat, Inc.
+ *
+ * Author: Oleg Nesterov <oleg@redhat.com>
+ */
+
+#ifndef _LINUX_RCU_SYNC_H_
+#define _LINUX_RCU_SYNC_H_
+
+#include <linux/wait.h>
+#include <linux/rcupdate.h>
+
+enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC };
+
+/* Structure to mediate between updaters and fastpath-using readers.  */
+struct rcu_sync {
+	int			gp_state;
+	int			gp_count;
+	wait_queue_head_t	gp_wait;
+
+	int			cb_state;
+	struct rcu_head		cb_head;
+
+	enum rcu_sync_type	gp_type;
+};
+
+extern void rcu_sync_lockdep_assert(struct rcu_sync *);
+
+/**
+ * rcu_sync_is_idle() - Are readers permitted to use their fastpaths?
+ * @rsp: Pointer to rcu_sync structure to use for synchronization
+ *
+ * Returns true if readers are permitted to use their fastpaths.
+ * Must be invoked within an RCU read-side critical section whose
+ * flavor matches that of the rcu_sync struture.
+ */
+static inline bool rcu_sync_is_idle(struct rcu_sync *rsp)
+{
+#ifdef CONFIG_PROVE_RCU
+	rcu_sync_lockdep_assert(rsp);
+#endif
+	return !rsp->gp_state; /* GP_IDLE */
+}
+
+extern void rcu_sync_init(struct rcu_sync *, enum rcu_sync_type);
+extern void rcu_sync_enter(struct rcu_sync *);
+extern void rcu_sync_exit(struct rcu_sync *);
+extern void rcu_sync_dtor(struct rcu_sync *);
+
+#define __RCU_SYNC_INITIALIZER(name, type) {				\
+		.gp_state = 0,						\
+		.gp_count = 0,						\
+		.gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait),	\
+		.cb_state = 0,						\
+		.gp_type = type,					\
+	}
+
+#define	__DEFINE_RCU_SYNC(name, type)	\
+	struct rcu_sync_struct name = __RCU_SYNC_INITIALIZER(name, type)
+
+#define DEFINE_RCU_SYNC(name)		\
+	__DEFINE_RCU_SYNC(name, RCU_SYNC)
+
+#define DEFINE_RCU_SCHED_SYNC(name)	\
+	__DEFINE_RCU_SYNC(name, RCU_SCHED_SYNC)
+
+#define DEFINE_RCU_BH_SYNC(name)	\
+	__DEFINE_RCU_SYNC(name, RCU_BH_SYNC)
+
+#endif /* _LINUX_RCU_SYNC_H_ */
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 32244186f1f2..8ef1919d63b2 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -17,12 +17,14 @@
  *
  * Copyright (C) IBM Corporation, 2014
  *
- * Author: Paul E. McKenney <paulmck@us.ibm.com>
+ * Authors: Paul E. McKenney <paulmck@us.ibm.com>
+ *          Davidlohr Bueso <dave@stgolabs.net>
  *	Based on kernel/rcu/torture.c.
  */
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/kthread.h>
+#include <linux/sched/rt.h>
 #include <linux/spinlock.h>
 #include <linux/rwlock.h>
 #include <linux/mutex.h>
@@ -34,6 +36,7 @@
 #include <linux/moduleparam.h>
 #include <linux/delay.h>
 #include <linux/slab.h>
+#include <linux/percpu-rwsem.h>
 #include <linux/torture.h>
 
 MODULE_LICENSE("GPL");
@@ -91,11 +94,13 @@ struct lock_torture_ops {
 	void (*init)(void);
 	int (*writelock)(void);
 	void (*write_delay)(struct torture_random_state *trsp);
+	void (*task_boost)(struct torture_random_state *trsp);
 	void (*writeunlock)(void);
 	int (*readlock)(void);
 	void (*read_delay)(struct torture_random_state *trsp);
 	void (*readunlock)(void);
-	unsigned long flags;
+
+	unsigned long flags; /* for irq spinlocks */
 	const char *name;
 };
 
@@ -139,9 +144,15 @@ static void torture_lock_busted_write_unlock(void)
 	  /* BUGGY, do not use in real life!!! */
 }
 
+static void torture_boost_dummy(struct torture_random_state *trsp)
+{
+	/* Only rtmutexes care about priority */
+}
+
 static struct lock_torture_ops lock_busted_ops = {
 	.writelock	= torture_lock_busted_write_lock,
 	.write_delay	= torture_lock_busted_write_delay,
+	.task_boost     = torture_boost_dummy,
 	.writeunlock	= torture_lock_busted_write_unlock,
 	.readlock       = NULL,
 	.read_delay     = NULL,
@@ -185,6 +196,7 @@ static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock)
 static struct lock_torture_ops spin_lock_ops = {
 	.writelock	= torture_spin_lock_write_lock,
 	.write_delay	= torture_spin_lock_write_delay,
+	.task_boost     = torture_boost_dummy,
 	.writeunlock	= torture_spin_lock_write_unlock,
 	.readlock       = NULL,
 	.read_delay     = NULL,
@@ -211,6 +223,7 @@ __releases(torture_spinlock)
 static struct lock_torture_ops spin_lock_irq_ops = {
 	.writelock	= torture_spin_lock_write_lock_irq,
 	.write_delay	= torture_spin_lock_write_delay,
+	.task_boost     = torture_boost_dummy,
 	.writeunlock	= torture_lock_spin_write_unlock_irq,
 	.readlock       = NULL,
 	.read_delay     = NULL,
@@ -275,6 +288,7 @@ static void torture_rwlock_read_unlock(void) __releases(torture_rwlock)
 static struct lock_torture_ops rw_lock_ops = {
 	.writelock	= torture_rwlock_write_lock,
 	.write_delay	= torture_rwlock_write_delay,
+	.task_boost     = torture_boost_dummy,
 	.writeunlock	= torture_rwlock_write_unlock,
 	.readlock       = torture_rwlock_read_lock,
 	.read_delay     = torture_rwlock_read_delay,
@@ -315,6 +329,7 @@ __releases(torture_rwlock)
 static struct lock_torture_ops rw_lock_irq_ops = {
 	.writelock	= torture_rwlock_write_lock_irq,
 	.write_delay	= torture_rwlock_write_delay,
+	.task_boost     = torture_boost_dummy,
 	.writeunlock	= torture_rwlock_write_unlock_irq,
 	.readlock       = torture_rwlock_read_lock_irq,
 	.read_delay     = torture_rwlock_read_delay,
@@ -354,6 +369,7 @@ static void torture_mutex_unlock(void) __releases(torture_mutex)
 static struct lock_torture_ops mutex_lock_ops = {
 	.writelock	= torture_mutex_lock,
 	.write_delay	= torture_mutex_delay,
+	.task_boost     = torture_boost_dummy,
 	.writeunlock	= torture_mutex_unlock,
 	.readlock       = NULL,
 	.read_delay     = NULL,
@@ -361,6 +377,90 @@ static struct lock_torture_ops mutex_lock_ops = {
 	.name		= "mutex_lock"
 };
 
+#ifdef CONFIG_RT_MUTEXES
+static DEFINE_RT_MUTEX(torture_rtmutex);
+
+static int torture_rtmutex_lock(void) __acquires(torture_rtmutex)
+{
+	rt_mutex_lock(&torture_rtmutex);
+	return 0;
+}
+
+static void torture_rtmutex_boost(struct torture_random_state *trsp)
+{
+	int policy;
+	struct sched_param param;
+	const unsigned int factor = 50000; /* yes, quite arbitrary */
+
+	if (!rt_task(current)) {
+		/*
+		 * (1) Boost priority once every ~50k operations. When the
+		 * task tries to take the lock, the rtmutex it will account
+		 * for the new priority, and do any corresponding pi-dance.
+		 */
+		if (!(torture_random(trsp) %
+		      (cxt.nrealwriters_stress * factor))) {
+			policy = SCHED_FIFO;
+			param.sched_priority = MAX_RT_PRIO - 1;
+		} else /* common case, do nothing */
+			return;
+	} else {
+		/*
+		 * The task will remain boosted for another ~500k operations,
+		 * then restored back to its original prio, and so forth.
+		 *
+		 * When @trsp is nil, we want to force-reset the task for
+		 * stopping the kthread.
+		 */
+		if (!trsp || !(torture_random(trsp) %
+			       (cxt.nrealwriters_stress * factor * 2))) {
+			policy = SCHED_NORMAL;
+			param.sched_priority = 0;
+		} else /* common case, do nothing */
+			return;
+	}
+
+	sched_setscheduler_nocheck(current, policy, &param);
+}
+
+static void torture_rtmutex_delay(struct torture_random_state *trsp)
+{
+	const unsigned long shortdelay_us = 2;
+	const unsigned long longdelay_ms = 100;
+
+	/*
+	 * We want a short delay mostly to emulate likely code, and
+	 * we want a long delay occasionally to force massive contention.
+	 */
+	if (!(torture_random(trsp) %
+	      (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+		mdelay(longdelay_ms);
+	if (!(torture_random(trsp) %
+	      (cxt.nrealwriters_stress * 2 * shortdelay_us)))
+		udelay(shortdelay_us);
+#ifdef CONFIG_PREEMPT
+	if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
+		preempt_schedule();  /* Allow test to be preempted. */
+#endif
+}
+
+static void torture_rtmutex_unlock(void) __releases(torture_rtmutex)
+{
+	rt_mutex_unlock(&torture_rtmutex);
+}
+
+static struct lock_torture_ops rtmutex_lock_ops = {
+	.writelock	= torture_rtmutex_lock,
+	.write_delay	= torture_rtmutex_delay,
+	.task_boost     = torture_rtmutex_boost,
+	.writeunlock	= torture_rtmutex_unlock,
+	.readlock       = NULL,
+	.read_delay     = NULL,
+	.readunlock     = NULL,
+	.name		= "rtmutex_lock"
+};
+#endif
+
 static DECLARE_RWSEM(torture_rwsem);
 static int torture_rwsem_down_write(void) __acquires(torture_rwsem)
 {
@@ -419,6 +519,7 @@ static void torture_rwsem_up_read(void) __releases(torture_rwsem)
 static struct lock_torture_ops rwsem_lock_ops = {
 	.writelock	= torture_rwsem_down_write,
 	.write_delay	= torture_rwsem_write_delay,
+	.task_boost     = torture_boost_dummy,
 	.writeunlock	= torture_rwsem_up_write,
 	.readlock       = torture_rwsem_down_read,
 	.read_delay     = torture_rwsem_read_delay,
@@ -426,6 +527,48 @@ static struct lock_torture_ops rwsem_lock_ops = {
 	.name		= "rwsem_lock"
 };
 
+#include <linux/percpu-rwsem.h>
+static struct percpu_rw_semaphore pcpu_rwsem;
+
+void torture_percpu_rwsem_init(void)
+{
+	BUG_ON(percpu_init_rwsem(&pcpu_rwsem));
+}
+
+static int torture_percpu_rwsem_down_write(void) __acquires(pcpu_rwsem)
+{
+	percpu_down_write(&pcpu_rwsem);
+	return 0;
+}
+
+static void torture_percpu_rwsem_up_write(void) __releases(pcpu_rwsem)
+{
+	percpu_up_write(&pcpu_rwsem);
+}
+
+static int torture_percpu_rwsem_down_read(void) __acquires(pcpu_rwsem)
+{
+	percpu_down_read(&pcpu_rwsem);
+	return 0;
+}
+
+static void torture_percpu_rwsem_up_read(void) __releases(pcpu_rwsem)
+{
+	percpu_up_read(&pcpu_rwsem);
+}
+
+static struct lock_torture_ops percpu_rwsem_lock_ops = {
+	.init		= torture_percpu_rwsem_init,
+	.writelock	= torture_percpu_rwsem_down_write,
+	.write_delay	= torture_rwsem_write_delay,
+	.task_boost     = torture_boost_dummy,
+	.writeunlock	= torture_percpu_rwsem_up_write,
+	.readlock       = torture_percpu_rwsem_down_read,
+	.read_delay     = torture_rwsem_read_delay,
+	.readunlock     = torture_percpu_rwsem_up_read,
+	.name		= "percpu_rwsem_lock"
+};
+
 /*
  * Lock torture writer kthread.  Repeatedly acquires and releases
  * the lock, checking for duplicate acquisitions.
@@ -442,6 +585,7 @@ static int lock_torture_writer(void *arg)
 		if ((torture_random(&rand) & 0xfffff) == 0)
 			schedule_timeout_uninterruptible(1);
 
+		cxt.cur_ops->task_boost(&rand);
 		cxt.cur_ops->writelock();
 		if (WARN_ON_ONCE(lock_is_write_held))
 			lwsp->n_lock_fail++;
@@ -456,6 +600,8 @@ static int lock_torture_writer(void *arg)
 
 		stutter_wait("lock_torture_writer");
 	} while (!torture_must_stop());
+
+	cxt.cur_ops->task_boost(NULL); /* reset prio */
 	torture_kthread_stopping("lock_torture_writer");
 	return 0;
 }
@@ -642,7 +788,11 @@ static int __init lock_torture_init(void)
 		&spin_lock_ops, &spin_lock_irq_ops,
 		&rw_lock_ops, &rw_lock_irq_ops,
 		&mutex_lock_ops,
+#ifdef CONFIG_RT_MUTEXES
+		&rtmutex_lock_ops,
+#endif
 		&rwsem_lock_ops,
+		&percpu_rwsem_lock_ops,
 	};
 
 	if (!torture_init_begin(torture_type, verbose, &torture_runnable))
@@ -661,11 +811,11 @@ static int __init lock_torture_init(void)
 		for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
 			pr_alert(" %s", torture_ops[i]->name);
 		pr_alert("\n");
-		torture_init_end();
-		return -EINVAL;
+		firsterr = -EINVAL;
+		goto unwind;
 	}
 	if (cxt.cur_ops->init)
-		cxt.cur_ops->init(); /* no "goto unwind" prior to this point!!! */
+		cxt.cur_ops->init();
 
 	if (nwriters_stress >= 0)
 		cxt.nrealwriters_stress = nwriters_stress;
@@ -676,6 +826,10 @@ static int __init lock_torture_init(void)
 	if (strncmp(torture_type, "mutex", 5) == 0)
 		cxt.debug_lock = true;
 #endif
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+	if (strncmp(torture_type, "rtmutex", 7) == 0)
+		cxt.debug_lock = true;
+#endif
 #ifdef CONFIG_DEBUG_SPINLOCK
 	if ((strncmp(torture_type, "spin", 4) == 0) ||
 	    (strncmp(torture_type, "rw_lock", 7) == 0))
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index f32567254867..f231e0bb311c 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -17,50 +17,43 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *brw,
 
 	/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
 	__init_rwsem(&brw->rw_sem, name, rwsem_key);
-	atomic_set(&brw->write_ctr, 0);
+	rcu_sync_init(&brw->rss, RCU_SCHED_SYNC);
 	atomic_set(&brw->slow_read_ctr, 0);
 	init_waitqueue_head(&brw->write_waitq);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
 
 void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
 {
+	/*
+	 * XXX: temporary kludge. The error path in alloc_super()
+	 * assumes that percpu_free_rwsem() is safe after kzalloc().
+	 */
+	if (!brw->fast_read_ctr)
+		return;
+
+	rcu_sync_dtor(&brw->rss);
 	free_percpu(brw->fast_read_ctr);
 	brw->fast_read_ctr = NULL; /* catch use after free bugs */
 }
 
 /*
- * This is the fast-path for down_read/up_read, it only needs to ensure
- * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the
- * fast per-cpu counter. The writer uses synchronize_sched_expedited() to
- * serialize with the preempt-disabled section below.
- *
- * The nontrivial part is that we should guarantee acquire/release semantics
- * in case when
- *
- *	R_W: down_write() comes after up_read(), the writer should see all
- *	     changes done by the reader
- * or
- *	W_R: down_read() comes after up_write(), the reader should see all
- *	     changes done by the writer
+ * This is the fast-path for down_read/up_read. If it succeeds we rely
+ * on the barriers provided by rcu_sync_enter/exit; see the comments in
+ * percpu_down_write() and percpu_up_write().
  *
  * If this helper fails the callers rely on the normal rw_semaphore and
  * atomic_dec_and_test(), so in this case we have the necessary barriers.
- *
- * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or
- * __this_cpu_add() below can be reordered with any LOAD/STORE done by the
- * reader inside the critical section. See the comments in down_write and
- * up_write below.
  */
 static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
 {
-	bool success = false;
+	bool success;
 
 	preempt_disable();
-	if (likely(!atomic_read(&brw->write_ctr))) {
+	success = rcu_sync_is_idle(&brw->rss);
+	if (likely(success))
 		__this_cpu_add(*brw->fast_read_ctr, val);
-		success = true;
-	}
 	preempt_enable();
 
 	return success;
@@ -77,16 +70,17 @@ static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
 void percpu_down_read(struct percpu_rw_semaphore *brw)
 {
 	might_sleep();
-	if (likely(update_fast_ctr(brw, +1))) {
-		rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
+	rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
+
+	if (likely(update_fast_ctr(brw, +1)))
 		return;
-	}
 
-	down_read(&brw->rw_sem);
+	/* Avoid rwsem_acquire_read() and rwsem_release() */
+	__down_read(&brw->rw_sem);
 	atomic_inc(&brw->slow_read_ctr);
-	/* avoid up_read()->rwsem_release() */
 	__up_read(&brw->rw_sem);
 }
+EXPORT_SYMBOL_GPL(percpu_down_read);
 
 int percpu_down_read_trylock(struct percpu_rw_semaphore *brw)
 {
@@ -112,6 +106,7 @@ void percpu_up_read(struct percpu_rw_semaphore *brw)
 	if (atomic_dec_and_test(&brw->slow_read_ctr))
 		wake_up_all(&brw->write_waitq);
 }
+EXPORT_SYMBOL_GPL(percpu_up_read);
 
 static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
 {
@@ -126,33 +121,17 @@ static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
 	return sum;
 }
 
-/*
- * A writer increments ->write_ctr to force the readers to switch to the
- * slow mode, note the atomic_read() check in update_fast_ctr().
- *
- * After that the readers can only inc/dec the slow ->slow_read_ctr counter,
- * ->fast_read_ctr is stable. Once the writer moves its sum into the slow
- * counter it represents the number of active readers.
- *
- * Finally the writer takes ->rw_sem for writing and blocks the new readers,
- * then waits until the slow counter becomes zero.
- */
 void percpu_down_write(struct percpu_rw_semaphore *brw)
 {
-	/* tell update_fast_ctr() there is a pending writer */
-	atomic_inc(&brw->write_ctr);
 	/*
-	 * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read
-	 *    so that update_fast_ctr() can't succeed.
-	 *
-	 * 2. Ensures we see the result of every previous this_cpu_add() in
-	 *    update_fast_ctr().
+	 * Make rcu_sync_is_idle() == F and thus disable the fast-path in
+	 * percpu_down_read() and percpu_up_read(), and wait for gp pass.
 	 *
-	 * 3. Ensures that if any reader has exited its critical section via
-	 *    fast-path, it executes a full memory barrier before we return.
-	 *    See R_W case in the comment above update_fast_ctr().
+	 * The latter synchronises us with the preceding readers which used
+	 * the fast-past, so we can not miss the result of __this_cpu_add()
+	 * or anything else inside their criticial sections.
 	 */
-	synchronize_sched_expedited();
+	rcu_sync_enter(&brw->rss);
 
 	/* exclude other writers, and block the new readers completely */
 	down_write(&brw->rw_sem);
@@ -163,16 +142,17 @@ void percpu_down_write(struct percpu_rw_semaphore *brw)
 	/* wait for all readers to complete their percpu_up_read() */
 	wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr));
 }
+EXPORT_SYMBOL_GPL(percpu_down_write);
 
 void percpu_up_write(struct percpu_rw_semaphore *brw)
 {
 	/* release the lock, but the readers can't use the fast-path */
 	up_write(&brw->rw_sem);
 	/*
-	 * Insert the barrier before the next fast-path in down_read,
-	 * see W_R case in the comment above update_fast_ctr().
+	 * Enable the fast-path in percpu_down_read() and percpu_up_read()
+	 * but only after another gp pass; this adds the necessary barrier
+	 * to ensure the reader can't miss the changes done by us.
 	 */
-	synchronize_sched_expedited();
-	/* the last writer unblocks update_fast_ctr() */
-	atomic_dec(&brw->write_ctr);
+	rcu_sync_exit(&brw->rss);
 }
+EXPORT_SYMBOL_GPL(percpu_up_write);
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 50a808424b06..61a16569ffbf 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,4 +1,4 @@
-obj-y += update.o
+obj-y += update.o sync.o
 obj-$(CONFIG_SRCU) += srcu.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_TREE_RCU) += tree.o
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index f9ec6cbe77d3..d89328e260df 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -695,7 +695,7 @@ static bool __maybe_unused torturing_tasks(void)
 
 #define RCUTORTURE_TASKS_OPS
 
-static bool torturing_tasks(void)
+static bool __maybe_unused torturing_tasks(void)
 {
 	return false;
 }
@@ -768,7 +768,6 @@ static int rcu_torture_boost(void *arg)
 				}
 				call_rcu_time = jiffies;
 			}
-			cond_resched_rcu_qs();
 			stutter_wait("rcu_torture_boost");
 			if (torture_must_stop())
 				goto checkwait;
@@ -1208,7 +1207,6 @@ rcu_torture_reader(void *arg)
 		__this_cpu_inc(rcu_torture_batch[completed]);
 		preempt_enable();
 		cur_ops->readunlock(idx);
-		cond_resched_rcu_qs();
 		stutter_wait("rcu_torture_reader");
 	} while (!torture_must_stop());
 	if (irqreader && cur_ops->irq_capable) {
@@ -1742,15 +1740,15 @@ rcu_torture_init(void)
 		for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
 			pr_alert(" %s", torture_ops[i]->name);
 		pr_alert("\n");
-		torture_init_end();
-		return -EINVAL;
+		firsterr = -EINVAL;
+		goto unwind;
 	}
 	if (cur_ops->fqs == NULL && fqs_duration != 0) {
 		pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n");
 		fqs_duration = 0;
 	}
 	if (cur_ops->init)
-		cur_ops->init(); /* no "goto unwind" prior to this point!!! */
+		cur_ops->init();
 
 	if (nreaders >= 0) {
 		nrealreaders = nreaders;
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
new file mode 100644
index 000000000000..be922c9f3d37
--- /dev/null
+++ b/kernel/rcu/sync.c
@@ -0,0 +1,223 @@
+/*
+ * RCU-based infrastructure for lightweight reader-writer locking
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright (c) 2015, Red Hat, Inc.
+ *
+ * Author: Oleg Nesterov <oleg@redhat.com>
+ */
+
+#include <linux/rcu_sync.h>
+#include <linux/sched.h>
+
+#ifdef CONFIG_PROVE_RCU
+#define __INIT_HELD(func)	.held = func,
+#else
+#define __INIT_HELD(func)
+#endif
+
+static const struct {
+	void (*sync)(void);
+	void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
+	void (*wait)(void);
+#ifdef CONFIG_PROVE_RCU
+	int  (*held)(void);
+#endif
+} gp_ops[] = {
+	[RCU_SYNC] = {
+		.sync = synchronize_rcu,
+		.call = call_rcu,
+		.wait = rcu_barrier,
+		__INIT_HELD(rcu_read_lock_held)
+	},
+	[RCU_SCHED_SYNC] = {
+		.sync = synchronize_sched,
+		.call = call_rcu_sched,
+		.wait = rcu_barrier_sched,
+		__INIT_HELD(rcu_read_lock_sched_held)
+	},
+	[RCU_BH_SYNC] = {
+		.sync = synchronize_rcu_bh,
+		.call = call_rcu_bh,
+		.wait = rcu_barrier_bh,
+		__INIT_HELD(rcu_read_lock_bh_held)
+	},
+};
+
+enum { GP_IDLE = 0, GP_PENDING, GP_PASSED };
+enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY };
+
+#define	rss_lock	gp_wait.lock
+
+#ifdef CONFIG_PROVE_RCU
+void rcu_sync_lockdep_assert(struct rcu_sync *rsp)
+{
+	RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(),
+			 "suspicious rcu_sync_is_idle() usage");
+}
+#endif
+
+/**
+ * rcu_sync_init() - Initialize an rcu_sync structure
+ * @rsp: Pointer to rcu_sync structure to be initialized
+ * @type: Flavor of RCU with which to synchronize rcu_sync structure
+ */
+void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type)
+{
+	memset(rsp, 0, sizeof(*rsp));
+	init_waitqueue_head(&rsp->gp_wait);
+	rsp->gp_type = type;
+}
+
+/**
+ * rcu_sync_enter() - Force readers onto slowpath
+ * @rsp: Pointer to rcu_sync structure to use for synchronization
+ *
+ * This function is used by updaters who need readers to make use of
+ * a slowpath during the update.  After this function returns, all
+ * subsequent calls to rcu_sync_is_idle() will return false, which
+ * tells readers to stay off their fastpaths.  A later call to
+ * rcu_sync_exit() re-enables reader slowpaths.
+ *
+ * When called in isolation, rcu_sync_enter() must wait for a grace
+ * period, however, closely spaced calls to rcu_sync_enter() can
+ * optimize away the grace-period wait via a state machine implemented
+ * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func().
+ */
+void rcu_sync_enter(struct rcu_sync *rsp)
+{
+	bool need_wait, need_sync;
+
+	spin_lock_irq(&rsp->rss_lock);
+	need_wait = rsp->gp_count++;
+	need_sync = rsp->gp_state == GP_IDLE;
+	if (need_sync)
+		rsp->gp_state = GP_PENDING;
+	spin_unlock_irq(&rsp->rss_lock);
+
+	BUG_ON(need_wait && need_sync);
+
+	if (need_sync) {
+		gp_ops[rsp->gp_type].sync();
+		rsp->gp_state = GP_PASSED;
+		wake_up_all(&rsp->gp_wait);
+	} else if (need_wait) {
+		wait_event(rsp->gp_wait, rsp->gp_state == GP_PASSED);
+	} else {
+		/*
+		 * Possible when there's a pending CB from a rcu_sync_exit().
+		 * Nobody has yet been allowed the 'fast' path and thus we can
+		 * avoid doing any sync(). The callback will get 'dropped'.
+		 */
+		BUG_ON(rsp->gp_state != GP_PASSED);
+	}
+}
+
+/**
+ * rcu_sync_func() - Callback function managing reader access to fastpath
+ * @rsp: Pointer to rcu_sync structure to use for synchronization
+ *
+ * This function is passed to one of the call_rcu() functions by
+ * rcu_sync_exit(), so that it is invoked after a grace period following the
+ * that invocation of rcu_sync_exit().  It takes action based on events that
+ * have taken place in the meantime, so that closely spaced rcu_sync_enter()
+ * and rcu_sync_exit() pairs need not wait for a grace period.
+ *
+ * If another rcu_sync_enter() is invoked before the grace period
+ * ended, reset state to allow the next rcu_sync_exit() to let the
+ * readers back onto their fastpaths (after a grace period).  If both
+ * another rcu_sync_enter() and its matching rcu_sync_exit() are invoked
+ * before the grace period ended, re-invoke call_rcu() on behalf of that
+ * rcu_sync_exit().  Otherwise, set all state back to idle so that readers
+ * can again use their fastpaths.
+ */
+static void rcu_sync_func(struct rcu_head *rcu)
+{
+	struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head);
+	unsigned long flags;
+
+	BUG_ON(rsp->gp_state != GP_PASSED);
+	BUG_ON(rsp->cb_state == CB_IDLE);
+
+	spin_lock_irqsave(&rsp->rss_lock, flags);
+	if (rsp->gp_count) {
+		/*
+		 * A new rcu_sync_begin() has happened; drop the callback.
+		 */
+		rsp->cb_state = CB_IDLE;
+	} else if (rsp->cb_state == CB_REPLAY) {
+		/*
+		 * A new rcu_sync_exit() has happened; requeue the callback
+		 * to catch a later GP.
+		 */
+		rsp->cb_state = CB_PENDING;
+		gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func);
+	} else {
+		/*
+		 * We're at least a GP after rcu_sync_exit(); eveybody will now
+		 * have observed the write side critical section. Let 'em rip!.
+		 */
+		rsp->cb_state = CB_IDLE;
+		rsp->gp_state = GP_IDLE;
+	}
+	spin_unlock_irqrestore(&rsp->rss_lock, flags);
+}
+
+/**
+ * rcu_sync_exit() - Allow readers back onto fast patch after grace period
+ * @rsp: Pointer to rcu_sync structure to use for synchronization
+ *
+ * This function is used by updaters who have completed, and can therefore
+ * now allow readers to make use of their fastpaths after a grace period
+ * has elapsed.  After this grace period has completed, all subsequent
+ * calls to rcu_sync_is_idle() will return true, which tells readers that
+ * they can once again use their fastpaths.
+ */
+void rcu_sync_exit(struct rcu_sync *rsp)
+{
+	spin_lock_irq(&rsp->rss_lock);
+	if (!--rsp->gp_count) {
+		if (rsp->cb_state == CB_IDLE) {
+			rsp->cb_state = CB_PENDING;
+			gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func);
+		} else if (rsp->cb_state == CB_PENDING) {
+			rsp->cb_state = CB_REPLAY;
+		}
+	}
+	spin_unlock_irq(&rsp->rss_lock);
+}
+
+/**
+ * rcu_sync_dtor() - Clean up an rcu_sync structure
+ * @rsp: Pointer to rcu_sync structure to be cleaned up
+ */
+void rcu_sync_dtor(struct rcu_sync *rsp)
+{
+	int cb_state;
+
+	BUG_ON(rsp->gp_count);
+
+	spin_lock_irq(&rsp->rss_lock);
+	if (rsp->cb_state == CB_REPLAY)
+		rsp->cb_state = CB_PENDING;
+	cb_state = rsp->cb_state;
+	spin_unlock_irq(&rsp->rss_lock);
+
+	if (cb_state != CB_IDLE) {
+		gp_ops[rsp->gp_type].wait();
+		BUG_ON(rsp->cb_state != CB_IDLE);
+	}
+}
diff --git a/kernel/torture.c b/kernel/torture.c
index 3e4840633d3e..44aa462d033f 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -523,6 +523,7 @@ static int stutter;
  */
 void stutter_wait(const char *title)
 {
+	cond_resched_rcu_qs();
 	while (READ_ONCE(stutter_pause_test) ||
 	       (torture_runnable && !READ_ONCE(*torture_runnable))) {
 		if (stutter_pause_test)
diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index fbe2dbff1e21..f6483609ebc2 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -75,7 +75,7 @@ usage () {
 while test $# -gt 0
 do
 	case "$1" in
-	--bootargs)
+	--bootargs|--bootarg)
 		checkarg --bootargs "(list of kernel boot arguments)" "$#" "$2" '.*' '^--'
 		TORTURE_BOOTARGS="$2"
 		shift
@@ -88,7 +88,7 @@ do
 	--buildonly)
 		TORTURE_BUILDONLY=1
 		;;
-	--configs)
+	--configs|--config)
 		checkarg --configs "(list of config files)" "$#" "$2" '^[^/]*$' '^--'
 		configs="$2"
 		shift
@@ -134,7 +134,7 @@ do
 	--no-initrd)
 		TORTURE_INITRD=""; export TORTURE_INITRD
 		;;
-	--qemu-args)
+	--qemu-args|--qemu-arg)
 		checkarg --qemu-args "-qemu args" $# "$2" '^-' '^error'
 		TORTURE_QEMU_ARG="$2"
 		shift
diff --git a/tools/testing/selftests/rcutorture/configs/lock/CFLIST b/tools/testing/selftests/rcutorture/configs/lock/CFLIST
index 6910b7370761..b9611c523723 100644
--- a/tools/testing/selftests/rcutorture/configs/lock/CFLIST
+++ b/tools/testing/selftests/rcutorture/configs/lock/CFLIST
@@ -1,4 +1,6 @@
 LOCK01
 LOCK02
 LOCK03
-LOCK04
-\ No newline at end of file
+LOCK04
+LOCK05
+LOCK06
diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK05 b/tools/testing/selftests/rcutorture/configs/lock/LOCK05
new file mode 100644
index 000000000000..1d1da1477fc3
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK05
@@ -0,0 +1,6 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=4
+CONFIG_HOTPLUG_CPU=y
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK05.boot b/tools/testing/selftests/rcutorture/configs/lock/LOCK05.boot
new file mode 100644
index 000000000000..8ac37307c987
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK05.boot
@@ -0,0 +1 @@
+locktorture.torture_type=rtmutex_lock
diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK06 b/tools/testing/selftests/rcutorture/configs/lock/LOCK06
new file mode 100644
index 000000000000..1d1da1477fc3
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK06
@@ -0,0 +1,6 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=4
+CONFIG_HOTPLUG_CPU=y
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK06.boot b/tools/testing/selftests/rcutorture/configs/lock/LOCK06.boot
new file mode 100644
index 000000000000..f92219cd4ad9
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK06.boot
@@ -0,0 +1 @@
+locktorture.torture_type=percpu_rwsem_lock