diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-11-03 15:40:38 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-11-03 15:40:38 -0800 |
commit | 281422869942c19f05a08d4017c633d08d390938 (patch) | |
tree | 78e1fb9155e3e82697f96ce04e7b40e8f7c5147c /kernel | |
parent | f5a8160c1e055c0fd8d16a5b3ac97c638365b0db (diff) | |
parent | b33e18f61bd18227a456016a77b1a968f5bc1d65 (diff) |
Merge branch 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull RCU changes from Ingo Molnar:
"The main changes in this cycle were:
- Improvements to expedited grace periods (Paul E McKenney)
- Performance improvements to and locktorture tests for percpu-rwsem
(Oleg Nesterov, Paul E McKenney)
- Torture-test changes (Paul E McKenney, Davidlohr Bueso)
- Documentation updates (Paul E McKenney)
- Miscellaneous fixes (Paul E McKenney, Boqun Feng, Oleg Nesterov,
Patrick Marlier)"
* 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (54 commits)
fs/writeback, rcu: Don't use list_entry_rcu() for pointer offsetting in bdi_split_work_to_wbs()
rcu: Better hotplug handling for synchronize_sched_expedited()
rcu: Enable stall warnings for synchronize_rcu_expedited()
rcu: Add tasks to expedited stall-warning messages
rcu: Add online/offline info to expedited stall warning message
rcu: Consolidate expedited CPU selection
rcu: Prepare for consolidating expedited CPU selection
cpu: Remove try_get_online_cpus()
rcu: Stop excluding CPU hotplug in synchronize_sched_expedited()
rcu: Stop silencing lockdep false positive for expedited grace periods
rcu: Switch synchronize_sched_expedited() to IPI
locktorture: Fix module unwind when bad torture_type specified
torture: Forgive non-plural arguments
rcutorture: Fix unused-function warning for torturing_tasks()
rcutorture: Fix module unwind when bad torture_type specified
rcu_sync: Cleanup the CONFIG_PROVE_RCU checks
locking/percpu-rwsem: Clean up the lockdep annotations in percpu_down_read()
locking/percpu-rwsem: Fix the comments outdated by rcu_sync
locking/percpu-rwsem: Make use of the rcu_sync infrastructure
locking/percpu-rwsem: Make percpu_free_rwsem() after kzalloc() safe
...
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cpu.c | 13 | ||||
-rw-r--r-- | kernel/exit.c | 2 | ||||
-rw-r--r-- | kernel/locking/locktorture.c | 164 | ||||
-rw-r--r-- | kernel/locking/percpu-rwsem.c | 90 | ||||
-rw-r--r-- | kernel/rcu/Makefile | 2 | ||||
-rw-r--r-- | kernel/rcu/rcutorture.c | 16 | ||||
-rw-r--r-- | kernel/rcu/srcu.c | 4 | ||||
-rw-r--r-- | kernel/rcu/sync.c | 223 | ||||
-rw-r--r-- | kernel/rcu/tiny.c | 8 | ||||
-rw-r--r-- | kernel/rcu/tree.c | 512 | ||||
-rw-r--r-- | kernel/rcu/tree.h | 69 | ||||
-rw-r--r-- | kernel/rcu/tree_plugin.h | 437 | ||||
-rw-r--r-- | kernel/rcu/tree_trace.c | 10 | ||||
-rw-r--r-- | kernel/rcu/update.c | 2 | ||||
-rw-r--r-- | kernel/sched/core.c | 1 | ||||
-rw-r--r-- | kernel/torture.c | 1 |
16 files changed, 1111 insertions, 443 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c index 82cf9dff4295..14a9cdf8abe9 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -102,19 +102,6 @@ void get_online_cpus(void) } EXPORT_SYMBOL_GPL(get_online_cpus); -bool try_get_online_cpus(void) -{ - if (cpu_hotplug.active_writer == current) - return true; - if (!mutex_trylock(&cpu_hotplug.lock)) - return false; - cpuhp_lock_acquire_tryread(); - atomic_inc(&cpu_hotplug.refcount); - mutex_unlock(&cpu_hotplug.lock); - return true; -} -EXPORT_SYMBOL_GPL(try_get_online_cpus); - void put_online_cpus(void) { int refcount; diff --git a/kernel/exit.c b/kernel/exit.c index ea95ee1b5ef7..0e93b63bbc59 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -761,7 +761,9 @@ void do_exit(long code) */ flush_ptrace_hw_breakpoint(tsk); + TASKS_RCU(preempt_disable()); TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu)); + TASKS_RCU(preempt_enable()); exit_notify(tsk, group_dead); proc_exit_connector(tsk); #ifdef CONFIG_NUMA diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 32244186f1f2..8ef1919d63b2 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -17,12 +17,14 @@ * * Copyright (C) IBM Corporation, 2014 * - * Author: Paul E. McKenney <paulmck@us.ibm.com> + * Authors: Paul E. McKenney <paulmck@us.ibm.com> + * Davidlohr Bueso <dave@stgolabs.net> * Based on kernel/rcu/torture.c. */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/kthread.h> +#include <linux/sched/rt.h> #include <linux/spinlock.h> #include <linux/rwlock.h> #include <linux/mutex.h> @@ -34,6 +36,7 @@ #include <linux/moduleparam.h> #include <linux/delay.h> #include <linux/slab.h> +#include <linux/percpu-rwsem.h> #include <linux/torture.h> MODULE_LICENSE("GPL"); @@ -91,11 +94,13 @@ struct lock_torture_ops { void (*init)(void); int (*writelock)(void); void (*write_delay)(struct torture_random_state *trsp); + void (*task_boost)(struct torture_random_state *trsp); void (*writeunlock)(void); int (*readlock)(void); void (*read_delay)(struct torture_random_state *trsp); void (*readunlock)(void); - unsigned long flags; + + unsigned long flags; /* for irq spinlocks */ const char *name; }; @@ -139,9 +144,15 @@ static void torture_lock_busted_write_unlock(void) /* BUGGY, do not use in real life!!! */ } +static void torture_boost_dummy(struct torture_random_state *trsp) +{ + /* Only rtmutexes care about priority */ +} + static struct lock_torture_ops lock_busted_ops = { .writelock = torture_lock_busted_write_lock, .write_delay = torture_lock_busted_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_lock_busted_write_unlock, .readlock = NULL, .read_delay = NULL, @@ -185,6 +196,7 @@ static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock) static struct lock_torture_ops spin_lock_ops = { .writelock = torture_spin_lock_write_lock, .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_spin_lock_write_unlock, .readlock = NULL, .read_delay = NULL, @@ -211,6 +223,7 @@ __releases(torture_spinlock) static struct lock_torture_ops spin_lock_irq_ops = { .writelock = torture_spin_lock_write_lock_irq, .write_delay = torture_spin_lock_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_lock_spin_write_unlock_irq, .readlock = NULL, .read_delay = NULL, @@ -275,6 +288,7 @@ static void torture_rwlock_read_unlock(void) __releases(torture_rwlock) static struct lock_torture_ops rw_lock_ops = { .writelock = torture_rwlock_write_lock, .write_delay = torture_rwlock_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_rwlock_write_unlock, .readlock = torture_rwlock_read_lock, .read_delay = torture_rwlock_read_delay, @@ -315,6 +329,7 @@ __releases(torture_rwlock) static struct lock_torture_ops rw_lock_irq_ops = { .writelock = torture_rwlock_write_lock_irq, .write_delay = torture_rwlock_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_rwlock_write_unlock_irq, .readlock = torture_rwlock_read_lock_irq, .read_delay = torture_rwlock_read_delay, @@ -354,6 +369,7 @@ static void torture_mutex_unlock(void) __releases(torture_mutex) static struct lock_torture_ops mutex_lock_ops = { .writelock = torture_mutex_lock, .write_delay = torture_mutex_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_mutex_unlock, .readlock = NULL, .read_delay = NULL, @@ -361,6 +377,90 @@ static struct lock_torture_ops mutex_lock_ops = { .name = "mutex_lock" }; +#ifdef CONFIG_RT_MUTEXES +static DEFINE_RT_MUTEX(torture_rtmutex); + +static int torture_rtmutex_lock(void) __acquires(torture_rtmutex) +{ + rt_mutex_lock(&torture_rtmutex); + return 0; +} + +static void torture_rtmutex_boost(struct torture_random_state *trsp) +{ + int policy; + struct sched_param param; + const unsigned int factor = 50000; /* yes, quite arbitrary */ + + if (!rt_task(current)) { + /* + * (1) Boost priority once every ~50k operations. When the + * task tries to take the lock, the rtmutex it will account + * for the new priority, and do any corresponding pi-dance. + */ + if (!(torture_random(trsp) % + (cxt.nrealwriters_stress * factor))) { + policy = SCHED_FIFO; + param.sched_priority = MAX_RT_PRIO - 1; + } else /* common case, do nothing */ + return; + } else { + /* + * The task will remain boosted for another ~500k operations, + * then restored back to its original prio, and so forth. + * + * When @trsp is nil, we want to force-reset the task for + * stopping the kthread. + */ + if (!trsp || !(torture_random(trsp) % + (cxt.nrealwriters_stress * factor * 2))) { + policy = SCHED_NORMAL; + param.sched_priority = 0; + } else /* common case, do nothing */ + return; + } + + sched_setscheduler_nocheck(current, policy, ¶m); +} + +static void torture_rtmutex_delay(struct torture_random_state *trsp) +{ + const unsigned long shortdelay_us = 2; + const unsigned long longdelay_ms = 100; + + /* + * We want a short delay mostly to emulate likely code, and + * we want a long delay occasionally to force massive contention. + */ + if (!(torture_random(trsp) % + (cxt.nrealwriters_stress * 2000 * longdelay_ms))) + mdelay(longdelay_ms); + if (!(torture_random(trsp) % + (cxt.nrealwriters_stress * 2 * shortdelay_us))) + udelay(shortdelay_us); +#ifdef CONFIG_PREEMPT + if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) + preempt_schedule(); /* Allow test to be preempted. */ +#endif +} + +static void torture_rtmutex_unlock(void) __releases(torture_rtmutex) +{ + rt_mutex_unlock(&torture_rtmutex); +} + +static struct lock_torture_ops rtmutex_lock_ops = { + .writelock = torture_rtmutex_lock, + .write_delay = torture_rtmutex_delay, + .task_boost = torture_rtmutex_boost, + .writeunlock = torture_rtmutex_unlock, + .readlock = NULL, + .read_delay = NULL, + .readunlock = NULL, + .name = "rtmutex_lock" +}; +#endif + static DECLARE_RWSEM(torture_rwsem); static int torture_rwsem_down_write(void) __acquires(torture_rwsem) { @@ -419,6 +519,7 @@ static void torture_rwsem_up_read(void) __releases(torture_rwsem) static struct lock_torture_ops rwsem_lock_ops = { .writelock = torture_rwsem_down_write, .write_delay = torture_rwsem_write_delay, + .task_boost = torture_boost_dummy, .writeunlock = torture_rwsem_up_write, .readlock = torture_rwsem_down_read, .read_delay = torture_rwsem_read_delay, @@ -426,6 +527,48 @@ static struct lock_torture_ops rwsem_lock_ops = { .name = "rwsem_lock" }; +#include <linux/percpu-rwsem.h> +static struct percpu_rw_semaphore pcpu_rwsem; + +void torture_percpu_rwsem_init(void) +{ + BUG_ON(percpu_init_rwsem(&pcpu_rwsem)); +} + +static int torture_percpu_rwsem_down_write(void) __acquires(pcpu_rwsem) +{ + percpu_down_write(&pcpu_rwsem); + return 0; +} + +static void torture_percpu_rwsem_up_write(void) __releases(pcpu_rwsem) +{ + percpu_up_write(&pcpu_rwsem); +} + +static int torture_percpu_rwsem_down_read(void) __acquires(pcpu_rwsem) +{ + percpu_down_read(&pcpu_rwsem); + return 0; +} + +static void torture_percpu_rwsem_up_read(void) __releases(pcpu_rwsem) +{ + percpu_up_read(&pcpu_rwsem); +} + +static struct lock_torture_ops percpu_rwsem_lock_ops = { + .init = torture_percpu_rwsem_init, + .writelock = torture_percpu_rwsem_down_write, + .write_delay = torture_rwsem_write_delay, + .task_boost = torture_boost_dummy, + .writeunlock = torture_percpu_rwsem_up_write, + .readlock = torture_percpu_rwsem_down_read, + .read_delay = torture_rwsem_read_delay, + .readunlock = torture_percpu_rwsem_up_read, + .name = "percpu_rwsem_lock" +}; + /* * Lock torture writer kthread. Repeatedly acquires and releases * the lock, checking for duplicate acquisitions. @@ -442,6 +585,7 @@ static int lock_torture_writer(void *arg) if ((torture_random(&rand) & 0xfffff) == 0) schedule_timeout_uninterruptible(1); + cxt.cur_ops->task_boost(&rand); cxt.cur_ops->writelock(); if (WARN_ON_ONCE(lock_is_write_held)) lwsp->n_lock_fail++; @@ -456,6 +600,8 @@ static int lock_torture_writer(void *arg) stutter_wait("lock_torture_writer"); } while (!torture_must_stop()); + + cxt.cur_ops->task_boost(NULL); /* reset prio */ torture_kthread_stopping("lock_torture_writer"); return 0; } @@ -642,7 +788,11 @@ static int __init lock_torture_init(void) &spin_lock_ops, &spin_lock_irq_ops, &rw_lock_ops, &rw_lock_irq_ops, &mutex_lock_ops, +#ifdef CONFIG_RT_MUTEXES + &rtmutex_lock_ops, +#endif &rwsem_lock_ops, + &percpu_rwsem_lock_ops, }; if (!torture_init_begin(torture_type, verbose, &torture_runnable)) @@ -661,11 +811,11 @@ static int __init lock_torture_init(void) for (i = 0; i < ARRAY_SIZE(torture_ops); i++) pr_alert(" %s", torture_ops[i]->name); pr_alert("\n"); - torture_init_end(); - return -EINVAL; + firsterr = -EINVAL; + goto unwind; } if (cxt.cur_ops->init) - cxt.cur_ops->init(); /* no "goto unwind" prior to this point!!! */ + cxt.cur_ops->init(); if (nwriters_stress >= 0) cxt.nrealwriters_stress = nwriters_stress; @@ -676,6 +826,10 @@ static int __init lock_torture_init(void) if (strncmp(torture_type, "mutex", 5) == 0) cxt.debug_lock = true; #endif +#ifdef CONFIG_DEBUG_RT_MUTEXES + if (strncmp(torture_type, "rtmutex", 7) == 0) + cxt.debug_lock = true; +#endif #ifdef CONFIG_DEBUG_SPINLOCK if ((strncmp(torture_type, "spin", 4) == 0) || (strncmp(torture_type, "rw_lock", 7) == 0)) diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index f32567254867..f231e0bb311c 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -17,50 +17,43 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *brw, /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ __init_rwsem(&brw->rw_sem, name, rwsem_key); - atomic_set(&brw->write_ctr, 0); + rcu_sync_init(&brw->rss, RCU_SCHED_SYNC); atomic_set(&brw->slow_read_ctr, 0); init_waitqueue_head(&brw->write_waitq); return 0; } +EXPORT_SYMBOL_GPL(__percpu_init_rwsem); void percpu_free_rwsem(struct percpu_rw_semaphore *brw) { + /* + * XXX: temporary kludge. The error path in alloc_super() + * assumes that percpu_free_rwsem() is safe after kzalloc(). + */ + if (!brw->fast_read_ctr) + return; + + rcu_sync_dtor(&brw->rss); free_percpu(brw->fast_read_ctr); brw->fast_read_ctr = NULL; /* catch use after free bugs */ } /* - * This is the fast-path for down_read/up_read, it only needs to ensure - * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the - * fast per-cpu counter. The writer uses synchronize_sched_expedited() to - * serialize with the preempt-disabled section below. - * - * The nontrivial part is that we should guarantee acquire/release semantics - * in case when - * - * R_W: down_write() comes after up_read(), the writer should see all - * changes done by the reader - * or - * W_R: down_read() comes after up_write(), the reader should see all - * changes done by the writer + * This is the fast-path for down_read/up_read. If it succeeds we rely + * on the barriers provided by rcu_sync_enter/exit; see the comments in + * percpu_down_write() and percpu_up_write(). * * If this helper fails the callers rely on the normal rw_semaphore and * atomic_dec_and_test(), so in this case we have the necessary barriers. - * - * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or - * __this_cpu_add() below can be reordered with any LOAD/STORE done by the - * reader inside the critical section. See the comments in down_write and - * up_write below. */ static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) { - bool success = false; + bool success; preempt_disable(); - if (likely(!atomic_read(&brw->write_ctr))) { + success = rcu_sync_is_idle(&brw->rss); + if (likely(success)) __this_cpu_add(*brw->fast_read_ctr, val); - success = true; - } preempt_enable(); return success; @@ -77,16 +70,17 @@ static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val) void percpu_down_read(struct percpu_rw_semaphore *brw) { might_sleep(); - if (likely(update_fast_ctr(brw, +1))) { - rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); + rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_); + + if (likely(update_fast_ctr(brw, +1))) return; - } - down_read(&brw->rw_sem); + /* Avoid rwsem_acquire_read() and rwsem_release() */ + __down_read(&brw->rw_sem); atomic_inc(&brw->slow_read_ctr); - /* avoid up_read()->rwsem_release() */ __up_read(&brw->rw_sem); } +EXPORT_SYMBOL_GPL(percpu_down_read); int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) { @@ -112,6 +106,7 @@ void percpu_up_read(struct percpu_rw_semaphore *brw) if (atomic_dec_and_test(&brw->slow_read_ctr)) wake_up_all(&brw->write_waitq); } +EXPORT_SYMBOL_GPL(percpu_up_read); static int clear_fast_ctr(struct percpu_rw_semaphore *brw) { @@ -126,33 +121,17 @@ static int clear_fast_ctr(struct percpu_rw_semaphore *brw) return sum; } -/* - * A writer increments ->write_ctr to force the readers to switch to the - * slow mode, note the atomic_read() check in update_fast_ctr(). - * - * After that the readers can only inc/dec the slow ->slow_read_ctr counter, - * ->fast_read_ctr is stable. Once the writer moves its sum into the slow - * counter it represents the number of active readers. - * - * Finally the writer takes ->rw_sem for writing and blocks the new readers, - * then waits until the slow counter becomes zero. - */ void percpu_down_write(struct percpu_rw_semaphore *brw) { - /* tell update_fast_ctr() there is a pending writer */ - atomic_inc(&brw->write_ctr); /* - * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read - * so that update_fast_ctr() can't succeed. - * - * 2. Ensures we see the result of every previous this_cpu_add() in - * update_fast_ctr(). + * Make rcu_sync_is_idle() == F and thus disable the fast-path in + * percpu_down_read() and percpu_up_read(), and wait for gp pass. * - * 3. Ensures that if any reader has exited its critical section via - * fast-path, it executes a full memory barrier before we return. - * See R_W case in the comment above update_fast_ctr(). + * The latter synchronises us with the preceding readers which used + * the fast-past, so we can not miss the result of __this_cpu_add() + * or anything else inside their criticial sections. */ - synchronize_sched_expedited(); + rcu_sync_enter(&brw->rss); /* exclude other writers, and block the new readers completely */ down_write(&brw->rw_sem); @@ -163,16 +142,17 @@ void percpu_down_write(struct percpu_rw_semaphore *brw) /* wait for all readers to complete their percpu_up_read() */ wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr)); } +EXPORT_SYMBOL_GPL(percpu_down_write); void percpu_up_write(struct percpu_rw_semaphore *brw) { /* release the lock, but the readers can't use the fast-path */ up_write(&brw->rw_sem); /* - * Insert the barrier before the next fast-path in down_read, - * see W_R case in the comment above update_fast_ctr(). + * Enable the fast-path in percpu_down_read() and percpu_up_read() + * but only after another gp pass; this adds the necessary barrier + * to ensure the reader can't miss the changes done by us. */ - synchronize_sched_expedited(); - /* the last writer unblocks update_fast_ctr() */ - atomic_dec(&brw->write_ctr); + rcu_sync_exit(&brw->rss); } +EXPORT_SYMBOL_GPL(percpu_up_write); diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 50a808424b06..61a16569ffbf 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -1,4 +1,4 @@ -obj-y += update.o +obj-y += update.o sync.o obj-$(CONFIG_SRCU) += srcu.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_TREE_RCU) += tree.o diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 77192953dee5..d89328e260df 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -252,7 +252,7 @@ struct rcu_torture_ops { void (*exp_sync)(void); unsigned long (*get_state)(void); void (*cond_sync)(unsigned long oldstate); - void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); + call_rcu_func_t call; void (*cb_barrier)(void); void (*fqs)(void); void (*stats)(void); @@ -448,7 +448,7 @@ static void synchronize_rcu_busted(void) } static void -call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +call_rcu_busted(struct rcu_head *head, rcu_callback_t func) { /* This is a deliberate bug for testing purposes only! */ func(head); @@ -523,7 +523,7 @@ static void srcu_torture_synchronize(void) } static void srcu_torture_call(struct rcu_head *head, - void (*func)(struct rcu_head *head)) + rcu_callback_t func) { call_srcu(srcu_ctlp, head, func); } @@ -695,7 +695,7 @@ static bool __maybe_unused torturing_tasks(void) #define RCUTORTURE_TASKS_OPS -static bool torturing_tasks(void) +static bool __maybe_unused torturing_tasks(void) { return false; } @@ -768,7 +768,6 @@ static int rcu_torture_boost(void *arg) } call_rcu_time = jiffies; } - cond_resched_rcu_qs(); stutter_wait("rcu_torture_boost"); if (torture_must_stop()) goto checkwait; @@ -1208,7 +1207,6 @@ rcu_torture_reader(void *arg) __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); cur_ops->readunlock(idx); - cond_resched_rcu_qs(); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); if (irqreader && cur_ops->irq_capable) { @@ -1742,15 +1740,15 @@ rcu_torture_init(void) for (i = 0; i < ARRAY_SIZE(torture_ops); i++) pr_alert(" %s", torture_ops[i]->name); pr_alert("\n"); - torture_init_end(); - return -EINVAL; + firsterr = -EINVAL; + goto unwind; } if (cur_ops->fqs == NULL && fqs_duration != 0) { pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); fqs_duration = 0; } if (cur_ops->init) - cur_ops->init(); /* no "goto unwind" prior to this point!!! */ + cur_ops->init(); if (nreaders >= 0) { nrealreaders = nreaders; diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index d3fcb2ec8536..a63a1ea5a41b 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -298,11 +298,9 @@ int __srcu_read_lock(struct srcu_struct *sp) int idx; idx = READ_ONCE(sp->completed) & 0x1; - preempt_disable(); __this_cpu_inc(sp->per_cpu_ref->c[idx]); smp_mb(); /* B */ /* Avoid leaking the critical section. */ __this_cpu_inc(sp->per_cpu_ref->seq[idx]); - preempt_enable(); return idx; } EXPORT_SYMBOL_GPL(__srcu_read_lock); @@ -387,7 +385,7 @@ static void srcu_flip(struct srcu_struct *sp) * srcu_struct structure. */ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, - void (*func)(struct rcu_head *head)) + rcu_callback_t func) { unsigned long flags; diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c new file mode 100644 index 000000000000..be922c9f3d37 --- /dev/null +++ b/kernel/rcu/sync.c @@ -0,0 +1,223 @@ +/* + * RCU-based infrastructure for lightweight reader-writer locking + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (c) 2015, Red Hat, Inc. + * + * Author: Oleg Nesterov <oleg@redhat.com> + */ + +#include <linux/rcu_sync.h> +#include <linux/sched.h> + +#ifdef CONFIG_PROVE_RCU +#define __INIT_HELD(func) .held = func, +#else +#define __INIT_HELD(func) +#endif + +static const struct { + void (*sync)(void); + void (*call)(struct rcu_head *, void (*)(struct rcu_head *)); + void (*wait)(void); +#ifdef CONFIG_PROVE_RCU + int (*held)(void); +#endif +} gp_ops[] = { + [RCU_SYNC] = { + .sync = synchronize_rcu, + .call = call_rcu, + .wait = rcu_barrier, + __INIT_HELD(rcu_read_lock_held) + }, + [RCU_SCHED_SYNC] = { + .sync = synchronize_sched, + .call = call_rcu_sched, + .wait = rcu_barrier_sched, + __INIT_HELD(rcu_read_lock_sched_held) + }, + [RCU_BH_SYNC] = { + .sync = synchronize_rcu_bh, + .call = call_rcu_bh, + .wait = rcu_barrier_bh, + __INIT_HELD(rcu_read_lock_bh_held) + }, +}; + +enum { GP_IDLE = 0, GP_PENDING, GP_PASSED }; +enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY }; + +#define rss_lock gp_wait.lock + +#ifdef CONFIG_PROVE_RCU +void rcu_sync_lockdep_assert(struct rcu_sync *rsp) +{ + RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(), + "suspicious rcu_sync_is_idle() usage"); +} +#endif + +/** + * rcu_sync_init() - Initialize an rcu_sync structure + * @rsp: Pointer to rcu_sync structure to be initialized + * @type: Flavor of RCU with which to synchronize rcu_sync structure + */ +void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) +{ + memset(rsp, 0, sizeof(*rsp)); + init_waitqueue_head(&rsp->gp_wait); + rsp->gp_type = type; +} + +/** + * rcu_sync_enter() - Force readers onto slowpath + * @rsp: Pointer to rcu_sync structure to use for synchronization + * + * This function is used by updaters who need readers to make use of + * a slowpath during the update. After this function returns, all + * subsequent calls to rcu_sync_is_idle() will return false, which + * tells readers to stay off their fastpaths. A later call to + * rcu_sync_exit() re-enables reader slowpaths. + * + * When called in isolation, rcu_sync_enter() must wait for a grace + * period, however, closely spaced calls to rcu_sync_enter() can + * optimize away the grace-period wait via a state machine implemented + * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func(). + */ +void rcu_sync_enter(struct rcu_sync *rsp) +{ + bool need_wait, need_sync; + + spin_lock_irq(&rsp->rss_lock); + need_wait = rsp->gp_count++; + need_sync = rsp->gp_state == GP_IDLE; + if (need_sync) + rsp->gp_state = GP_PENDING; + spin_unlock_irq(&rsp->rss_lock); + + BUG_ON(need_wait && need_sync); + + if (need_sync) { + gp_ops[rsp->gp_type].sync(); + rsp->gp_state = GP_PASSED; + wake_up_all(&rsp->gp_wait); + } else if (need_wait) { + wait_event(rsp->gp_wait, rsp->gp_state == GP_PASSED); + } else { + /* + * Possible when there's a pending CB from a rcu_sync_exit(). + * Nobody has yet been allowed the 'fast' path and thus we can + * avoid doing any sync(). The callback will get 'dropped'. + */ + BUG_ON(rsp->gp_state != GP_PASSED); + } +} + +/** + * rcu_sync_func() - Callback function managing reader access to fastpath + * @rsp: Pointer to rcu_sync structure to use for synchronization + * + * This function is passed to one of the call_rcu() functions by + * rcu_sync_exit(), so that it is invoked after a grace period following the + * that invocation of rcu_sync_exit(). It takes action based on events that + * have taken place in the meantime, so that closely spaced rcu_sync_enter() + * and rcu_sync_exit() pairs need not wait for a grace period. + * + * If another rcu_sync_enter() is invoked before the grace period + * ended, reset state to allow the next rcu_sync_exit() to let the + * readers back onto their fastpaths (after a grace period). If both + * another rcu_sync_enter() and its matching rcu_sync_exit() are invoked + * before the grace period ended, re-invoke call_rcu() on behalf of that + * rcu_sync_exit(). Otherwise, set all state back to idle so that readers + * can again use their fastpaths. + */ +static void rcu_sync_func(struct rcu_head *rcu) +{ + struct rcu_sync *rsp = container_of(rcu, struct rcu_sync, cb_head); + unsigned long flags; + + BUG_ON(rsp->gp_state != GP_PASSED); + BUG_ON(rsp->cb_state == CB_IDLE); + + spin_lock_irqsave(&rsp->rss_lock, flags); + if (rsp->gp_count) { + /* + * A new rcu_sync_begin() has happened; drop the callback. + */ + rsp->cb_state = CB_IDLE; + } else if (rsp->cb_state == CB_REPLAY) { + /* + * A new rcu_sync_exit() has happened; requeue the callback + * to catch a later GP. + */ + rsp->cb_state = CB_PENDING; + gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); + } else { + /* + * We're at least a GP after rcu_sync_exit(); eveybody will now + * have observed the write side critical section. Let 'em rip!. + */ + rsp->cb_state = CB_IDLE; + rsp->gp_state = GP_IDLE; + } + spin_unlock_irqrestore(&rsp->rss_lock, flags); +} + +/** + * rcu_sync_exit() - Allow readers back onto fast patch after grace period + * @rsp: Pointer to rcu_sync structure to use for synchronization + * + * This function is used by updaters who have completed, and can therefore + * now allow readers to make use of their fastpaths after a grace period + * has elapsed. After this grace period has completed, all subsequent + * calls to rcu_sync_is_idle() will return true, which tells readers that + * they can once again use their fastpaths. + */ +void rcu_sync_exit(struct rcu_sync *rsp) +{ + spin_lock_irq(&rsp->rss_lock); + if (!--rsp->gp_count) { + if (rsp->cb_state == CB_IDLE) { + rsp->cb_state = CB_PENDING; + gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); + } else if (rsp->cb_state == CB_PENDING) { + rsp->cb_state = CB_REPLAY; + } + } + spin_unlock_irq(&rsp->rss_lock); +} + +/** + * rcu_sync_dtor() - Clean up an rcu_sync structure + * @rsp: Pointer to rcu_sync structure to be cleaned up + */ +void rcu_sync_dtor(struct rcu_sync *rsp) +{ + int cb_state; + + BUG_ON(rsp->gp_count); + + spin_lock_irq(&rsp->rss_lock); + if (rsp->cb_state == CB_REPLAY) + rsp->cb_state = CB_PENDING; + cb_state = rsp->cb_state; + spin_unlock_irq(&rsp->rss_lock); + + if (cb_state != CB_IDLE) { + gp_ops[rsp->gp_type].wait(); + BUG_ON(rsp->cb_state != CB_IDLE); + } +} diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index d0471056d0af..944b1b491ed8 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -44,7 +44,7 @@ struct rcu_ctrlblk; static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); static void rcu_process_callbacks(struct softirq_action *unused); static void __call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu), + rcu_callback_t func, struct rcu_ctrlblk *rcp); #include "tiny_plugin.h" @@ -203,7 +203,7 @@ EXPORT_SYMBOL_GPL(synchronize_sched); * Helper function for call_rcu() and call_rcu_bh(). */ static void __call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu), + rcu_callback_t func, struct rcu_ctrlblk *rcp) { unsigned long flags; @@ -229,7 +229,7 @@ static void __call_rcu(struct rcu_head *head, * period. But since we have but one CPU, that would be after any * quiescent state. */ -void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) { __call_rcu(head, func, &rcu_sched_ctrlblk); } @@ -239,7 +239,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched); * Post an RCU bottom-half callback to be invoked after any subsequent * quiescent state. */ -void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu_bh(struct rcu_head *head, rcu_callback_t func) { __call_rcu(head, func, &rcu_bh_ctrlblk); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 775d36cc0050..f07343b54fe5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -71,7 +71,6 @@ MODULE_ALIAS("rcutree"); static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; -static struct lock_class_key rcu_exp_sched_class[RCU_NUM_LVLS]; /* * In order to export the rcu_state name to the tracing tools, it @@ -98,7 +97,7 @@ struct rcu_state sname##_state = { \ .level = { &sname##_state.node[0] }, \ .rda = &sname##_data, \ .call = cr, \ - .fqs_state = RCU_GP_IDLE, \ + .gp_state = RCU_GP_IDLE, \ .gpnum = 0UL - 300UL, \ .completed = 0UL - 300UL, \ .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ @@ -161,6 +160,8 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); +static void rcu_report_exp_rdp(struct rcu_state *rsp, + struct rcu_data *rdp, bool wake); /* rcuc/rcub kthread realtime priority */ #ifdef CONFIG_RCU_KTHREAD_PRIO @@ -245,21 +246,33 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) */ void rcu_sched_qs(void) { - if (!__this_cpu_read(rcu_sched_data.passed_quiesce)) { + unsigned long flags; + + if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_sched"), __this_cpu_read(rcu_sched_data.gpnum), TPS("cpuqs")); - __this_cpu_write(rcu_sched_data.passed_quiesce, 1); + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); + if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) + return; + local_irq_save(flags); + if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) { + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), + true); + } + local_irq_restore(flags); } } void rcu_bh_qs(void) { - if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) { + if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_bh"), __this_cpu_read(rcu_bh_data.gpnum), TPS("cpuqs")); - __this_cpu_write(rcu_bh_data.passed_quiesce, 1); + __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false); } } @@ -337,12 +350,14 @@ static void rcu_momentary_dyntick_idle(void) */ void rcu_note_context_switch(void) { + barrier(); /* Avoid RCU read-side critical sections leaking down. */ trace_rcu_utilization(TPS("Start context switch")); rcu_sched_qs(); rcu_preempt_note_context_switch(); if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) rcu_momentary_dyntick_idle(); trace_rcu_utilization(TPS("End context switch")); + barrier(); /* Avoid RCU read-side critical sections leaking up. */ } EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -353,12 +368,19 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); * RCU flavors in desperate need of a quiescent state, which will normally * be none of them). Either way, do a lightweight quiescent state for * all RCU flavors. + * + * The barrier() calls are redundant in the common case when this is + * called externally, but just in case this is called from within this + * file. + * */ void rcu_all_qs(void) { + barrier(); /* Avoid RCU read-side critical sections leaking down. */ if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) rcu_momentary_dyntick_idle(); this_cpu_inc(rcu_qs_ctr); + barrier(); /* Avoid RCU read-side critical sections leaking up. */ } EXPORT_SYMBOL_GPL(rcu_all_qs); @@ -1744,9 +1766,9 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, */ rdp->gpnum = rnp->gpnum; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); - rdp->passed_quiesce = 0; + rdp->cpu_no_qs.b.norm = true; rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); - rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); + rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); WRITE_ONCE(rdp->gpwrap, false); } @@ -1927,16 +1949,15 @@ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) /* * Do one round of quiescent-state forcing. */ -static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) +static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) { - int fqs_state = fqs_state_in; bool isidle = false; unsigned long maxj; struct rcu_node *rnp = rcu_get_root(rsp); WRITE_ONCE(rsp->gp_activity, jiffies); rsp->n_force_qs++; - if (fqs_state == RCU_SAVE_DYNTICK) { + if (first_time) { /* Collect dyntick-idle snapshots. */ if (is_sysidle_rcu_state(rsp)) { isidle = true; @@ -1945,7 +1966,6 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) force_qs_rnp(rsp, dyntick_save_progress_counter, &isidle, &maxj); rcu_sysidle_report_gp(rsp, isidle, maxj); - fqs_state = RCU_FORCE_QS; } else { /* Handle dyntick-idle and offline CPUs. */ isidle = true; @@ -1959,7 +1979,6 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); raw_spin_unlock_irq(&rnp->lock); } - return fqs_state; } /* @@ -2023,7 +2042,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) /* Declare grace period done. */ WRITE_ONCE(rsp->completed, rsp->gpnum); trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); - rsp->fqs_state = RCU_GP_IDLE; + rsp->gp_state = RCU_GP_IDLE; rdp = this_cpu_ptr(rsp->rda); /* Advance CBs to reduce false positives below. */ needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp; @@ -2041,7 +2060,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) */ static int __noreturn rcu_gp_kthread(void *arg) { - int fqs_state; + bool first_gp_fqs; int gf; unsigned long j; int ret; @@ -2073,7 +2092,7 @@ static int __noreturn rcu_gp_kthread(void *arg) } /* Handle quiescent-state forcing. */ - fqs_state = RCU_SAVE_DYNTICK; + first_gp_fqs = true; j = jiffies_till_first_fqs; if (j > HZ) { j = HZ; @@ -2101,7 +2120,8 @@ static int __noreturn rcu_gp_kthread(void *arg) trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqsstart")); - fqs_state = rcu_gp_fqs(rsp, fqs_state); + rcu_gp_fqs(rsp, first_gp_fqs); + first_gp_fqs = false; trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), TPS("fqsend")); @@ -2337,7 +2357,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - if ((rdp->passed_quiesce == 0 && + if ((rdp->cpu_no_qs.b.norm && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || rdp->gpwrap) { @@ -2348,7 +2368,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * We will instead need a new quiescent state that lies * within the current grace period. */ - rdp->passed_quiesce = 0; /* need qs for new gp. */ + rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); raw_spin_unlock_irqrestore(&rnp->lock, flags); return; @@ -2357,7 +2377,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) if ((rnp->qsmask & mask) == 0) { raw_spin_unlock_irqrestore(&rnp->lock, flags); } else { - rdp->qs_pending = 0; + rdp->core_needs_qs = 0; /* * This GP can't end until cpu checks in, so all of our @@ -2388,14 +2408,14 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Does this CPU still need to do its part for current grace period? * If no, return and let the other CPUs do their part as well. */ - if (!rdp->qs_pending) + if (!rdp->core_needs_qs) return; /* * Was there a quiescent state since the beginning of the grace * period? If no, then exit and wait for the next call. */ - if (!rdp->passed_quiesce && + if (rdp->cpu_no_qs.b.norm && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) return; @@ -3017,7 +3037,7 @@ static void rcu_leak_callback(struct rcu_head *rhp) * is expected to specify a CPU. */ static void -__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), +__call_rcu(struct rcu_head *head, rcu_callback_t func, struct rcu_state *rsp, int cpu, bool lazy) { unsigned long flags; @@ -3088,7 +3108,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), /* * Queue an RCU-sched callback for invocation after a grace period. */ -void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) { __call_rcu(head, func, &rcu_sched_state, -1, 0); } @@ -3097,7 +3117,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched); /* * Queue an RCU callback for invocation after a quicker grace period. */ -void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu_bh(struct rcu_head *head, rcu_callback_t func) { __call_rcu(head, func, &rcu_bh_state, -1, 0); } @@ -3111,7 +3131,7 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); * function may only be called from __kfree_rcu(). */ void kfree_call_rcu(struct rcu_head *head, - void (*func)(struct rcu_head *rcu)) + rcu_callback_t func) { __call_rcu(head, func, rcu_state_p, -1, 1); } @@ -3379,6 +3399,191 @@ static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) return rcu_seq_done(&rsp->expedited_sequence, s); } +/* + * Reset the ->expmaskinit values in the rcu_node tree to reflect any + * recent CPU-online activity. Note that these masks are not cleared + * when CPUs go offline, so they reflect the union of all CPUs that have + * ever been online. This means that this function normally takes its + * no-work-to-do fastpath. + */ +static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) +{ + bool done; + unsigned long flags; + unsigned long mask; + unsigned long oldmask; + int ncpus = READ_ONCE(rsp->ncpus); + struct rcu_node *rnp; + struct rcu_node *rnp_up; + + /* If no new CPUs onlined since last time, nothing to do. */ + if (likely(ncpus == rsp->ncpus_snap)) + return; + rsp->ncpus_snap = ncpus; + + /* + * Each pass through the following loop propagates newly onlined + * CPUs for the current rcu_node structure up the rcu_node tree. + */ + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + if (rnp->expmaskinit == rnp->expmaskinitnext) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + continue; /* No new CPUs, nothing to do. */ + } + + /* Update this node's mask, track old value for propagation. */ + oldmask = rnp->expmaskinit; + rnp->expmaskinit = rnp->expmaskinitnext; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + + /* If was already nonzero, nothing to propagate. */ + if (oldmask) + continue; + + /* Propagate the new CPU up the tree. */ + mask = rnp->grpmask; + rnp_up = rnp->parent; + done = false; + while (rnp_up) { + raw_spin_lock_irqsave(&rnp_up->lock, flags); + smp_mb__after_unlock_lock(); + if (rnp_up->expmaskinit) + done = true; + rnp_up->expmaskinit |= mask; + raw_spin_unlock_irqrestore(&rnp_up->lock, flags); + if (done) + break; + mask = rnp_up->grpmask; + rnp_up = rnp_up->parent; + } + } +} + +/* + * Reset the ->expmask values in the rcu_node tree in preparation for + * a new expedited grace period. + */ +static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_node *rnp; + + sync_exp_reset_tree_hotplug(rsp); + rcu_for_each_node_breadth_first(rsp, rnp) { + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + WARN_ON_ONCE(rnp->expmask); + rnp->expmask = rnp->expmaskinit; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } +} + +/* + * Return non-zero if there is no RCU expedited grace period in progress + * for the specified rcu_node structure, in other words, if all CPUs and + * tasks covered by the specified rcu_node structure have done their bit + * for the current expedited grace period. Works only for preemptible + * RCU -- other RCU implementation use other means. + * + * Caller must hold the root rcu_node's exp_funnel_mutex. + */ +static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) +{ + return rnp->exp_tasks == NULL && + READ_ONCE(rnp->expmask) == 0; +} + +/* + * Report the exit from RCU read-side critical section for the last task + * that queued itself during or before the current expedited preemptible-RCU + * grace period. This event is reported either to the rcu_node structure on + * which the task was queued or to one of that rcu_node structure's ancestors, + * recursively up the tree. (Calm down, calm down, we do the recursion + * iteratively!) + * + * Caller must hold the root rcu_node's exp_funnel_mutex and the + * specified rcu_node structure's ->lock. + */ +static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake, unsigned long flags) + __releases(rnp->lock) +{ + unsigned long mask; + + for (;;) { + if (!sync_rcu_preempt_exp_done(rnp)) { + if (!rnp->expmask) + rcu_initiate_boost(rnp, flags); + else + raw_spin_unlock_irqrestore(&rnp->lock, flags); + break; + } + if (rnp->parent == NULL) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + if (wake) { + smp_mb(); /* EGP done before wake_up(). */ + wake_up(&rsp->expedited_wq); + } + break; + } + mask = rnp->grpmask; + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ + rnp = rnp->parent; + raw_spin_lock(&rnp->lock); /* irqs already disabled */ + smp_mb__after_unlock_lock(); + WARN_ON_ONCE(!(rnp->expmask & mask)); + rnp->expmask &= ~mask; + } +} + +/* + * Report expedited quiescent state for specified node. This is a + * lock-acquisition wrapper function for __rcu_report_exp_rnp(). + * + * Caller must hold the root rcu_node's exp_funnel_mutex. + */ +static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, + struct rcu_node *rnp, bool wake) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + __rcu_report_exp_rnp(rsp, rnp, wake, flags); +} + +/* + * Report expedited quiescent state for multiple CPUs, all covered by the + * specified leaf rcu_node structure. Caller must hold the root + * rcu_node's exp_funnel_mutex. + */ +static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, + unsigned long mask, bool wake) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + if (!(rnp->expmask & mask)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + rnp->expmask &= ~mask; + __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */ +} + +/* + * Report expedited quiescent state for specified rcu_data (CPU). + * Caller must hold the root rcu_node's exp_funnel_mutex. + */ +static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, + bool wake) +{ + rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); +} + /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp, @@ -3455,16 +3660,111 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) } /* Invoked on each online non-idle CPU for expedited quiescent state. */ -static int synchronize_sched_expedited_cpu_stop(void *data) +static void sync_sched_exp_handler(void *data) { - struct rcu_data *rdp = data; - struct rcu_state *rsp = rdp->rsp; + struct rcu_data *rdp; + struct rcu_node *rnp; + struct rcu_state *rsp = data; - /* We are here: If we are last, do the wakeup. */ - rdp->exp_done = true; - if (atomic_dec_and_test(&rsp->expedited_need_qs)) - wake_up(&rsp->expedited_wq); - return 0; + rdp = this_cpu_ptr(rsp->rda); + rnp = rdp->mynode; + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || + __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) + return; + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); + resched_cpu(smp_processor_id()); +} + +/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ +static void sync_sched_exp_online_cleanup(int cpu) +{ + struct rcu_data *rdp; + int ret; + struct rcu_node *rnp; + struct rcu_state *rsp = &rcu_sched_state; + + rdp = per_cpu_ptr(rsp->rda, cpu); + rnp = rdp->mynode; + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) + return; + ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0); + WARN_ON_ONCE(ret); +} + +/* + * Select the nodes that the upcoming expedited grace period needs + * to wait for. + */ +static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, + smp_call_func_t func) +{ + int cpu; + unsigned long flags; + unsigned long mask; + unsigned long mask_ofl_test; + unsigned long mask_ofl_ipi; + int ret; + struct rcu_node *rnp; + + sync_exp_reset_tree(rsp); + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + + /* Each pass checks a CPU for identity, offline, and idle. */ + mask_ofl_test = 0; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + if (raw_smp_processor_id() == cpu || + !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) + mask_ofl_test |= rdp->grpmask; + } + mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; + + /* + * Need to wait for any blocked tasks as well. Note that + * additional blocking tasks will also block the expedited + * GP until such time as the ->expmask bits are cleared. + */ + if (rcu_preempt_has_tasks(rnp)) + rnp->exp_tasks = rnp->blkd_tasks.next; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + + /* IPI the remaining CPUs for expedited quiescent state. */ + mask = 1; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + if (!(mask_ofl_ipi & mask)) + continue; +retry_ipi: + ret = smp_call_function_single(cpu, func, rsp, 0); + if (!ret) { + mask_ofl_ipi &= ~mask; + } else { + /* Failed, raced with offline. */ + raw_spin_lock_irqsave(&rnp->lock, flags); + if (cpu_online(cpu) && + (rnp->expmask & mask)) { + raw_spin_unlock_irqrestore(&rnp->lock, + flags); + schedule_timeout_uninterruptible(1); + if (cpu_online(cpu) && + (rnp->expmask & mask)) + goto retry_ipi; + raw_spin_lock_irqsave(&rnp->lock, + flags); + } + if (!(rnp->expmask & mask)) + mask_ofl_ipi &= ~mask; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } + } + /* Report quiescent states for those that went offline. */ + mask_ofl_test |= mask_ofl_ipi; + if (mask_ofl_test) + rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); + } } static void synchronize_sched_expedited_wait(struct rcu_state *rsp) @@ -3472,7 +3772,9 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) int cpu; unsigned long jiffies_stall; unsigned long jiffies_start; - struct rcu_data *rdp; + unsigned long mask; + struct rcu_node *rnp; + struct rcu_node *rnp_root = rcu_get_root(rsp); int ret; jiffies_stall = rcu_jiffies_till_stall_check(); @@ -3481,33 +3783,43 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) for (;;) { ret = wait_event_interruptible_timeout( rsp->expedited_wq, - !atomic_read(&rsp->expedited_need_qs), + sync_rcu_preempt_exp_done(rnp_root), jiffies_stall); if (ret > 0) return; if (ret < 0) { /* Hit a signal, disable CPU stall warnings. */ wait_event(rsp->expedited_wq, - !atomic_read(&rsp->expedited_need_qs)); + sync_rcu_preempt_exp_done(rnp_root)); return; } - pr_err("INFO: %s detected expedited stalls on CPUs: {", + pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rsp->name); - for_each_online_cpu(cpu) { - rdp = per_cpu_ptr(rsp->rda, cpu); - - if (rdp->exp_done) - continue; - pr_cont(" %d", cpu); + rcu_for_each_leaf_node(rsp, rnp) { + (void)rcu_print_task_exp_stall(rnp); + mask = 1; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + struct rcu_data *rdp; + + if (!(rnp->expmask & mask)) + continue; + rdp = per_cpu_ptr(rsp->rda, cpu); + pr_cont(" %d-%c%c%c", cpu, + "O."[cpu_online(cpu)], + "o."[!!(rdp->grpmask & rnp->expmaskinit)], + "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); + } + mask <<= 1; } pr_cont(" } %lu jiffies s: %lu\n", jiffies - jiffies_start, rsp->expedited_sequence); - for_each_online_cpu(cpu) { - rdp = per_cpu_ptr(rsp->rda, cpu); - - if (rdp->exp_done) - continue; - dump_cpu_task(cpu); + rcu_for_each_leaf_node(rsp, rnp) { + mask = 1; + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { + if (!(rnp->expmask & mask)) + continue; + dump_cpu_task(cpu); + } } jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; } @@ -3531,7 +3843,6 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) */ void synchronize_sched_expedited(void) { - int cpu; unsigned long s; struct rcu_node *rnp; struct rcu_state *rsp = &rcu_sched_state; @@ -3539,48 +3850,16 @@ void synchronize_sched_expedited(void) /* Take a snapshot of the sequence number. */ s = rcu_exp_gp_seq_snap(rsp); - if (!try_get_online_cpus()) { - /* CPU hotplug operation in flight, fall back to normal GP. */ - wait_rcu_gp(call_rcu_sched); - atomic_long_inc(&rsp->expedited_normal); - return; - } - WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); - rnp = exp_funnel_lock(rsp, s); - if (rnp == NULL) { - put_online_cpus(); + if (rnp == NULL) return; /* Someone else did our work for us. */ - } rcu_exp_gp_seq_start(rsp); - - /* Stop each CPU that is online, non-idle, and not us. */ - init_waitqueue_head(&rsp->expedited_wq); - atomic_set(&rsp->expedited_need_qs, 1); /* Extra count avoids race. */ - for_each_online_cpu(cpu) { - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - rdp->exp_done = false; - - /* Skip our CPU and any idle CPUs. */ - if (raw_smp_processor_id() == cpu || - !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) - continue; - atomic_inc(&rsp->expedited_need_qs); - stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop, - rdp, &rdp->exp_stop_work); - } - - /* Remove extra count and, if necessary, wait for CPUs to stop. */ - if (!atomic_dec_and_test(&rsp->expedited_need_qs)) - synchronize_sched_expedited_wait(rsp); + sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); + synchronize_sched_expedited_wait(rsp); rcu_exp_gp_seq_end(rsp); mutex_unlock(&rnp->exp_funnel_mutex); - - put_online_cpus(); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); @@ -3606,11 +3885,11 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && - rdp->qs_pending && !rdp->passed_quiesce && + rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { - rdp->n_rp_qs_pending++; - } else if (rdp->qs_pending && - (rdp->passed_quiesce || + rdp->n_rp_core_needs_qs++; + } else if (rdp->core_needs_qs && + (!rdp->cpu_no_qs.b.norm || rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { rdp->n_rp_report_qs++; return 1; @@ -3868,7 +4147,6 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) static void __init rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) { - static struct lock_class_key rcu_exp_sched_rdp_class; unsigned long flags; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); @@ -3884,10 +4162,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) mutex_init(&rdp->exp_funnel_mutex); rcu_boot_init_nocb_percpu_data(rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (rsp == &rcu_sched_state) - lockdep_set_class_and_name(&rdp->exp_funnel_mutex, - &rcu_exp_sched_rdp_class, - "rcu_data_exp_sched"); } /* @@ -3906,7 +4180,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave(&rnp->lock, flags); - rdp->beenonline = 1; /* We have now been online. */ rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; @@ -3928,11 +4201,15 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) raw_spin_lock(&rnp->lock); /* irqs already disabled. */ smp_mb__after_unlock_lock(); rnp->qsmaskinitnext |= mask; + rnp->expmaskinitnext |= mask; + if (!rdp->beenonline) + WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1); + rdp->beenonline = true; /* We have now been online. */ rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ rdp->completed = rnp->completed; - rdp->passed_quiesce = false; + rdp->cpu_no_qs.b.norm = true; rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); - rdp->qs_pending = false; + rdp->core_needs_qs = false; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -3965,6 +4242,7 @@ int rcu_cpu_notify(struct notifier_block *self, break; case CPU_ONLINE: case CPU_DOWN_FAILED: + sync_sched_exp_online_cleanup(cpu); rcu_boost_kthread_setaffinity(rnp, -1); break; case CPU_DOWN_PREPARE: @@ -3976,6 +4254,12 @@ int rcu_cpu_notify(struct notifier_block *self, rcu_cleanup_dying_cpu(rsp); break; case CPU_DYING_IDLE: + /* QS for any half-done expedited RCU-sched GP. */ + preempt_disable(); + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(rcu_sched_state.rda), true); + preempt_enable(); + for_each_rcu_flavor(rsp) { rcu_cleanup_dying_idle_cpu(cpu, rsp); } @@ -4107,7 +4391,6 @@ static void __init rcu_init_one(struct rcu_state *rsp, static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT; static const char * const exp[] = RCU_EXP_NAME_INIT; - static const char * const exp_sched[] = RCU_EXP_SCHED_NAME_INIT; static u8 fl_mask = 0x1; int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ @@ -4167,18 +4450,13 @@ static void __init rcu_init_one(struct rcu_state *rsp, INIT_LIST_HEAD(&rnp->blkd_tasks); rcu_init_one_nocb(rnp); mutex_init(&rnp->exp_funnel_mutex); - if (rsp == &rcu_sched_state) - lockdep_set_class_and_name( - &rnp->exp_funnel_mutex, - &rcu_exp_sched_class[i], exp_sched[i]); - else - lockdep_set_class_and_name( - &rnp->exp_funnel_mutex, - &rcu_exp_class[i], exp[i]); + lockdep_set_class_and_name(&rnp->exp_funnel_mutex, + &rcu_exp_class[i], exp[i]); } } init_waitqueue_head(&rsp->gp_wq); + init_waitqueue_head(&rsp->expedited_wq); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) @@ -4221,13 +4499,12 @@ static void __init rcu_init_geometry(void) rcu_fanout_leaf, nr_cpu_ids); /* - * The boot-time rcu_fanout_leaf parameter is only permitted - * to increase the leaf-level fanout, not decrease it. Of course, - * the leaf-level fanout cannot exceed the number of bits in - * the rcu_node masks. Complain and fall back to the compile- - * time values if these limits are exceeded. + * The boot-time rcu_fanout_leaf parameter must be at least two + * and cannot exceed the number of bits in the rcu_node masks. + * Complain and fall back to the compile-time values if this + * limit is exceeded. */ - if (rcu_fanout_leaf < RCU_FANOUT_LEAF || + if (rcu_fanout_leaf < 2 || rcu_fanout_leaf > sizeof(unsigned long) * 8) { rcu_fanout_leaf = RCU_FANOUT_LEAF; WARN_ON(1); @@ -4244,10 +4521,13 @@ static void __init rcu_init_geometry(void) /* * The tree must be able to accommodate the configured number of CPUs. - * If this limit is exceeded than we have a serious problem elsewhere. + * If this limit is exceeded, fall back to the compile-time values. */ - if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) - panic("rcu_init_geometry: rcu_capacity[] is too small"); + if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) { + rcu_fanout_leaf = RCU_FANOUT_LEAF; + WARN_ON(1); + return; + } /* Calculate the number of levels in the tree. */ for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) { diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 2e991f8361e4..9fb4e238d4dc 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -70,8 +70,6 @@ # define RCU_NODE_NAME_INIT { "rcu_node_0" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } # define RCU_EXP_NAME_INIT { "rcu_node_exp_0" } -# define RCU_EXP_SCHED_NAME_INIT \ - { "rcu_node_exp_sched_0" } #elif NR_CPUS <= RCU_FANOUT_2 # define RCU_NUM_LVLS 2 # define NUM_RCU_LVL_0 1 @@ -81,8 +79,6 @@ # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" } -# define RCU_EXP_SCHED_NAME_INIT \ - { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1" } #elif NR_CPUS <= RCU_FANOUT_3 # define RCU_NUM_LVLS 3 # define NUM_RCU_LVL_0 1 @@ -93,8 +89,6 @@ # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" } -# define RCU_EXP_SCHED_NAME_INIT \ - { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2" } #elif NR_CPUS <= RCU_FANOUT_4 # define RCU_NUM_LVLS 4 # define NUM_RCU_LVL_0 1 @@ -106,8 +100,6 @@ # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } # define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" } -# define RCU_EXP_SCHED_NAME_INIT \ - { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2", "rcu_node_exp_sched_3" } #else # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ @@ -171,16 +163,21 @@ struct rcu_node { /* an rcu_data structure, otherwise, each */ /* bit corresponds to a child rcu_node */ /* structure. */ - unsigned long expmask; /* Groups that have ->blkd_tasks */ - /* elements that need to drain to allow the */ - /* current expedited grace period to */ - /* complete (only for PREEMPT_RCU). */ unsigned long qsmaskinit; - /* Per-GP initial value for qsmask & expmask. */ + /* Per-GP initial value for qsmask. */ /* Initialized from ->qsmaskinitnext at the */ /* beginning of each grace period. */ unsigned long qsmaskinitnext; /* Online CPUs for next grace period. */ + unsigned long expmask; /* CPUs or groups that need to check in */ + /* to allow the current expedited GP */ + /* to complete. */ + unsigned long expmaskinit; + /* Per-GP initial values for expmask. */ + /* Initialized from ->expmaskinitnext at the */ + /* beginning of each expedited GP. */ + unsigned long expmaskinitnext; + /* Online CPUs for next expedited GP. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. */ int grplo; /* lowest-numbered CPU or group here. */ @@ -281,6 +278,18 @@ struct rcu_node { for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) +/* + * Union to allow "aggregate OR" operation on the need for a quiescent + * state by the normal and expedited grace periods. + */ +union rcu_noqs { + struct { + u8 norm; + u8 exp; + } b; /* Bits. */ + u16 s; /* Set of bits, aggregate OR here. */ +}; + /* Index values for nxttail array in struct rcu_data. */ #define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ #define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ @@ -297,8 +306,8 @@ struct rcu_data { /* is aware of having started. */ unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ /* for rcu_all_qs() invocations. */ - bool passed_quiesce; /* User-mode/idle loop etc. */ - bool qs_pending; /* Core waits for quiesc state. */ + union rcu_noqs cpu_no_qs; /* No QSes yet for this CPU. */ + bool core_needs_qs; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible gpnum/completed wrap. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ @@ -307,9 +316,6 @@ struct rcu_data { /* ticks this CPU has handled */ /* during and after the last grace */ /* period it is aware of. */ - struct cpu_stop_work exp_stop_work; - /* Expedited grace-period control */ - /* for CPU stopping. */ /* 2) batch handling */ /* @@ -363,7 +369,7 @@ struct rcu_data { /* 5) __rcu_pending() statistics. */ unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ - unsigned long n_rp_qs_pending; + unsigned long n_rp_core_needs_qs; unsigned long n_rp_report_qs; unsigned long n_rp_cb_ready; unsigned long n_rp_cpu_needs_gp; @@ -378,7 +384,6 @@ struct rcu_data { struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ struct mutex exp_funnel_mutex; - bool exp_done; /* Expedited QS for this CPU? */ /* 7) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU @@ -412,13 +417,6 @@ struct rcu_data { struct rcu_state *rsp; }; -/* Values for fqs_state field in struct rcu_state. */ -#define RCU_GP_IDLE 0 /* No grace period in progress. */ -#define RCU_GP_INIT 1 /* Grace period being initialized. */ -#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ -#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ -#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK - /* Values for nocb_defer_wakeup field in struct rcu_data. */ #define RCU_NOGP_WAKE_NOT 0 #define RCU_NOGP_WAKE 1 @@ -464,14 +462,13 @@ struct rcu_state { /* shut bogus gcc warning) */ u8 flavor_mask; /* bit in flavor mask. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ - void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ - void (*func)(struct rcu_head *head)); + call_rcu_func_t call; /* call_rcu() flavor. */ + int ncpus; /* # CPUs seen so far. */ /* The following fields are guarded by the root rcu_node's lock. */ - u8 fqs_state ____cacheline_internodealigned_in_smp; - /* Force QS state. */ - u8 boost; /* Subject to priority boost. */ + u8 boost ____cacheline_internodealigned_in_smp; + /* Subject to priority boost. */ unsigned long gpnum; /* Current gp number. */ unsigned long completed; /* # of last completed gp. */ struct task_struct *gp_kthread; /* Task for grace periods. */ @@ -508,6 +505,7 @@ struct rcu_state { atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ wait_queue_head_t expedited_wq; /* Wait for check-ins. */ + int ncpus_snap; /* # CPUs seen last time. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ @@ -538,8 +536,8 @@ struct rcu_state { #define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ -/* Values for rcu_state structure's gp_flags field. */ -#define RCU_GP_WAIT_INIT 0 /* Initial state. */ +/* Values for rcu_state structure's gp_state field. */ +#define RCU_GP_IDLE 0 /* Initial state and no GP in progress. */ #define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */ #define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */ #define RCU_GP_WAIT_FQS 3 /* Wait for force-quiescent-state time. */ @@ -582,9 +580,10 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_print_detail_task_stall(struct rcu_state *rsp); static int rcu_print_task_stall(struct rcu_node *rnp); +static int rcu_print_task_exp_stall(struct rcu_node *rnp); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); static void rcu_preempt_check_callbacks(void); -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); +void call_rcu(struct rcu_head *head, rcu_callback_t func); static void __init __rcu_init_preempt(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index b2bf3963a0ae..630c19772630 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -101,7 +101,6 @@ RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); static struct rcu_state *const rcu_state_p = &rcu_preempt_state; static struct rcu_data __percpu *const rcu_data_p = &rcu_preempt_data; -static int rcu_preempted_readers_exp(struct rcu_node *rnp); static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake); @@ -114,6 +113,147 @@ static void __init rcu_bootup_announce(void) rcu_bootup_announce_oddness(); } +/* Flags for rcu_preempt_ctxt_queue() decision table. */ +#define RCU_GP_TASKS 0x8 +#define RCU_EXP_TASKS 0x4 +#define RCU_GP_BLKD 0x2 +#define RCU_EXP_BLKD 0x1 + +/* + * Queues a task preempted within an RCU-preempt read-side critical + * section into the appropriate location within the ->blkd_tasks list, + * depending on the states of any ongoing normal and expedited grace + * periods. The ->gp_tasks pointer indicates which element the normal + * grace period is waiting on (NULL if none), and the ->exp_tasks pointer + * indicates which element the expedited grace period is waiting on (again, + * NULL if none). If a grace period is waiting on a given element in the + * ->blkd_tasks list, it also waits on all subsequent elements. Thus, + * adding a task to the tail of the list blocks any grace period that is + * already waiting on one of the elements. In contrast, adding a task + * to the head of the list won't block any grace period that is already + * waiting on one of the elements. + * + * This queuing is imprecise, and can sometimes make an ongoing grace + * period wait for a task that is not strictly speaking blocking it. + * Given the choice, we needlessly block a normal grace period rather than + * blocking an expedited grace period. + * + * Note that an endless sequence of expedited grace periods still cannot + * indefinitely postpone a normal grace period. Eventually, all of the + * fixed number of preempted tasks blocking the normal grace period that are + * not also blocking the expedited grace period will resume and complete + * their RCU read-side critical sections. At that point, the ->gp_tasks + * pointer will equal the ->exp_tasks pointer, at which point the end of + * the corresponding expedited grace period will also be the end of the + * normal grace period. + */ +static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, + unsigned long flags) __releases(rnp->lock) +{ + int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) + + (rnp->exp_tasks ? RCU_EXP_TASKS : 0) + + (rnp->qsmask & rdp->grpmask ? RCU_GP_BLKD : 0) + + (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0); + struct task_struct *t = current; + + /* + * Decide where to queue the newly blocked task. In theory, + * this could be an if-statement. In practice, when I tried + * that, it was quite messy. + */ + switch (blkd_state) { + case 0: + case RCU_EXP_TASKS: + case RCU_EXP_TASKS + RCU_GP_BLKD: + case RCU_GP_TASKS: + case RCU_GP_TASKS + RCU_EXP_TASKS: + + /* + * Blocking neither GP, or first task blocking the normal + * GP but not blocking the already-waiting expedited GP. + * Queue at the head of the list to avoid unnecessarily + * blocking the already-waiting GPs. + */ + list_add(&t->rcu_node_entry, &rnp->blkd_tasks); + break; + + case RCU_EXP_BLKD: + case RCU_GP_BLKD: + case RCU_GP_BLKD + RCU_EXP_BLKD: + case RCU_GP_TASKS + RCU_EXP_BLKD: + case RCU_GP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: + case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: + + /* + * First task arriving that blocks either GP, or first task + * arriving that blocks the expedited GP (with the normal + * GP already waiting), or a task arriving that blocks + * both GPs with both GPs already waiting. Queue at the + * tail of the list to avoid any GP waiting on any of the + * already queued tasks that are not blocking it. + */ + list_add_tail(&t->rcu_node_entry, &rnp->blkd_tasks); + break; + + case RCU_EXP_TASKS + RCU_EXP_BLKD: + case RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD: + case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_EXP_BLKD: + + /* + * Second or subsequent task blocking the expedited GP. + * The task either does not block the normal GP, or is the + * first task blocking the normal GP. Queue just after + * the first task blocking the expedited GP. + */ + list_add(&t->rcu_node_entry, rnp->exp_tasks); + break; + + case RCU_GP_TASKS + RCU_GP_BLKD: + case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD: + + /* + * Second or subsequent task blocking the normal GP. + * The task does not block the expedited GP. Queue just + * after the first task blocking the normal GP. + */ + list_add(&t->rcu_node_entry, rnp->gp_tasks); + break; + + default: + + /* Yet another exercise in excessive paranoia. */ + WARN_ON_ONCE(1); + break; + } + + /* + * We have now queued the task. If it was the first one to + * block either grace period, update the ->gp_tasks and/or + * ->exp_tasks pointers, respectively, to reference the newly + * blocked tasks. + */ + if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) + rnp->gp_tasks = &t->rcu_node_entry; + if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) + rnp->exp_tasks = &t->rcu_node_entry; + raw_spin_unlock(&rnp->lock); + + /* + * Report the quiescent state for the expedited GP. This expedited + * GP should not be able to end until we report, so there should be + * no need to check for a subsequent expedited GP. (Though we are + * still in a quiescent state in any case.) + */ + if (blkd_state & RCU_EXP_BLKD && + t->rcu_read_unlock_special.b.exp_need_qs) { + t->rcu_read_unlock_special.b.exp_need_qs = false; + rcu_report_exp_rdp(rdp->rsp, rdp, true); + } else { + WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs); + } + local_irq_restore(flags); +} + /* * Record a preemptible-RCU quiescent state for the specified CPU. Note * that this just means that the task currently running on the CPU is @@ -125,11 +265,11 @@ static void __init rcu_bootup_announce(void) */ static void rcu_preempt_qs(void) { - if (!__this_cpu_read(rcu_data_p->passed_quiesce)) { + if (__this_cpu_read(rcu_data_p->cpu_no_qs.s)) { trace_rcu_grace_period(TPS("rcu_preempt"), __this_cpu_read(rcu_data_p->gpnum), TPS("cpuqs")); - __this_cpu_write(rcu_data_p->passed_quiesce, 1); + __this_cpu_write(rcu_data_p->cpu_no_qs.b.norm, false); barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */ current->rcu_read_unlock_special.b.need_qs = false; } @@ -167,42 +307,18 @@ static void rcu_preempt_note_context_switch(void) t->rcu_blocked_node = rnp; /* - * If this CPU has already checked in, then this task - * will hold up the next grace period rather than the - * current grace period. Queue the task accordingly. - * If the task is queued for the current grace period - * (i.e., this CPU has not yet passed through a quiescent - * state for the current grace period), then as long - * as that task remains queued, the current grace period - * cannot end. Note that there is some uncertainty as - * to exactly when the current grace period started. - * We take a conservative approach, which can result - * in unnecessarily waiting on tasks that started very - * slightly after the current grace period began. C'est - * la vie!!! - * - * But first, note that the current CPU must still be - * on line! + * Verify the CPU's sanity, trace the preemption, and + * then queue the task as required based on the states + * of any ongoing and expedited grace periods. */ WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0); WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); - if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { - list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); - rnp->gp_tasks = &t->rcu_node_entry; - if (IS_ENABLED(CONFIG_RCU_BOOST) && - rnp->boost_tasks != NULL) - rnp->boost_tasks = rnp->gp_tasks; - } else { - list_add(&t->rcu_node_entry, &rnp->blkd_tasks); - if (rnp->qsmask & rdp->grpmask) - rnp->gp_tasks = &t->rcu_node_entry; - } trace_rcu_preempt_task(rdp->rsp->name, t->pid, (rnp->qsmask & rdp->grpmask) ? rnp->gpnum : rnp->gpnum + 1); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + rcu_preempt_ctxt_queue(rnp, rdp, flags); } else if (t->rcu_read_lock_nesting < 0 && t->rcu_read_unlock_special.s) { @@ -272,6 +388,7 @@ void rcu_read_unlock_special(struct task_struct *t) unsigned long flags; struct list_head *np; bool drop_boost_mutex = false; + struct rcu_data *rdp; struct rcu_node *rnp; union rcu_special special; @@ -282,8 +399,8 @@ void rcu_read_unlock_special(struct task_struct *t) local_irq_save(flags); /* - * If RCU core is waiting for this CPU to exit critical section, - * let it know that we have done so. Because irqs are disabled, + * If RCU core is waiting for this CPU to exit its critical section, + * report the fact that it has exited. Because irqs are disabled, * t->rcu_read_unlock_special cannot change. */ special = t->rcu_read_unlock_special; @@ -296,13 +413,32 @@ void rcu_read_unlock_special(struct task_struct *t) } } + /* + * Respond to a request for an expedited grace period, but only if + * we were not preempted, meaning that we were running on the same + * CPU throughout. If we were preempted, the exp_need_qs flag + * would have been cleared at the time of the first preemption, + * and the quiescent state would be reported when we were dequeued. + */ + if (special.b.exp_need_qs) { + WARN_ON_ONCE(special.b.blocked); + t->rcu_read_unlock_special.b.exp_need_qs = false; + rdp = this_cpu_ptr(rcu_state_p->rda); + rcu_report_exp_rdp(rcu_state_p, rdp, true); + if (!t->rcu_read_unlock_special.s) { + local_irq_restore(flags); + return; + } + } + /* Hardware IRQ handlers cannot block, complain if they get here. */ if (in_irq() || in_serving_softirq()) { lockdep_rcu_suspicious(__FILE__, __LINE__, "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n"); - pr_alert("->rcu_read_unlock_special: %#x (b: %d, nq: %d)\n", + pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n", t->rcu_read_unlock_special.s, t->rcu_read_unlock_special.b.blocked, + t->rcu_read_unlock_special.b.exp_need_qs, t->rcu_read_unlock_special.b.need_qs); local_irq_restore(flags); return; @@ -329,7 +465,7 @@ void rcu_read_unlock_special(struct task_struct *t) raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); - empty_exp = !rcu_preempted_readers_exp(rnp); + empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ np = rcu_next_node_entry(t, rnp); list_del_init(&t->rcu_node_entry); @@ -353,7 +489,7 @@ void rcu_read_unlock_special(struct task_struct *t) * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, * so we must take a snapshot of the expedited state. */ - empty_exp_now = !rcu_preempted_readers_exp(rnp); + empty_exp_now = sync_rcu_preempt_exp_done(rnp); if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) { trace_rcu_quiescent_state_report(TPS("preempt_rcu"), rnp->gpnum, @@ -450,6 +586,27 @@ static int rcu_print_task_stall(struct rcu_node *rnp) } /* + * Scan the current list of tasks blocked within RCU read-side critical + * sections, printing out the tid of each that is blocking the current + * expedited grace period. + */ +static int rcu_print_task_exp_stall(struct rcu_node *rnp) +{ + struct task_struct *t; + int ndetected = 0; + + if (!rnp->exp_tasks) + return 0; + t = list_entry(rnp->exp_tasks->prev, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + pr_cont(" P%d", t->pid); + ndetected++; + } + return ndetected; +} + +/* * Check that the list of blocked tasks for the newly completed grace * period is in fact empty. It is a serious bug to complete a grace * period that still has RCU readers blocked! This function must be @@ -483,8 +640,8 @@ static void rcu_preempt_check_callbacks(void) return; } if (t->rcu_read_lock_nesting > 0 && - __this_cpu_read(rcu_data_p->qs_pending) && - !__this_cpu_read(rcu_data_p->passed_quiesce)) + __this_cpu_read(rcu_data_p->core_needs_qs) && + __this_cpu_read(rcu_data_p->cpu_no_qs.b.norm)) t->rcu_read_unlock_special.b.need_qs = true; } @@ -500,7 +657,7 @@ static void rcu_preempt_do_callbacks(void) /* * Queue a preemptible-RCU callback for invocation after a grace period. */ -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu(struct rcu_head *head, rcu_callback_t func) { __call_rcu(head, func, rcu_state_p, -1, 0); } @@ -535,155 +692,41 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); -static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); - -/* - * Return non-zero if there are any tasks in RCU read-side critical - * sections blocking the current preemptible-RCU expedited grace period. - * If there is no preemptible-RCU expedited grace period currently in - * progress, returns zero unconditionally. - */ -static int rcu_preempted_readers_exp(struct rcu_node *rnp) -{ - return rnp->exp_tasks != NULL; -} - -/* - * return non-zero if there is no RCU expedited grace period in progress - * for the specified rcu_node structure, in other words, if all CPUs and - * tasks covered by the specified rcu_node structure have done their bit - * for the current expedited grace period. Works only for preemptible - * RCU -- other RCU implementation use other means. - * - * Caller must hold the root rcu_node's exp_funnel_mutex. - */ -static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) -{ - return !rcu_preempted_readers_exp(rnp) && - READ_ONCE(rnp->expmask) == 0; -} - -/* - * Report the exit from RCU read-side critical section for the last task - * that queued itself during or before the current expedited preemptible-RCU - * grace period. This event is reported either to the rcu_node structure on - * which the task was queued or to one of that rcu_node structure's ancestors, - * recursively up the tree. (Calm down, calm down, we do the recursion - * iteratively!) - * - * Caller must hold the root rcu_node's exp_funnel_mutex. - */ -static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake) -{ - unsigned long flags; - unsigned long mask; - - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); - for (;;) { - if (!sync_rcu_preempt_exp_done(rnp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - break; - } - if (rnp->parent == NULL) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (wake) { - smp_mb(); /* EGP done before wake_up(). */ - wake_up(&sync_rcu_preempt_exp_wq); - } - break; - } - mask = rnp->grpmask; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ - rnp = rnp->parent; - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - smp_mb__after_unlock_lock(); - rnp->expmask &= ~mask; - } -} - /* - * Snapshot the tasks blocking the newly started preemptible-RCU expedited - * grace period for the specified rcu_node structure, phase 1. If there - * are such tasks, set the ->expmask bits up the rcu_node tree and also - * set the ->expmask bits on the leaf rcu_node structures to tell phase 2 - * that work is needed here. - * - * Caller must hold the root rcu_node's exp_funnel_mutex. + * Remote handler for smp_call_function_single(). If there is an + * RCU read-side critical section in effect, request that the + * next rcu_read_unlock() record the quiescent state up the + * ->expmask fields in the rcu_node tree. Otherwise, immediately + * report the quiescent state. */ -static void -sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp) +static void sync_rcu_exp_handler(void *info) { - unsigned long flags; - unsigned long mask; - struct rcu_node *rnp_up; - - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); - WARN_ON_ONCE(rnp->expmask); - WARN_ON_ONCE(rnp->exp_tasks); - if (!rcu_preempt_has_tasks(rnp)) { - /* No blocked tasks, nothing to do. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - /* Call for Phase 2 and propagate ->expmask bits up the tree. */ - rnp->expmask = 1; - rnp_up = rnp; - while (rnp_up->parent) { - mask = rnp_up->grpmask; - rnp_up = rnp_up->parent; - if (rnp_up->expmask & mask) - break; - raw_spin_lock(&rnp_up->lock); /* irqs already off */ - smp_mb__after_unlock_lock(); - rnp_up->expmask |= mask; - raw_spin_unlock(&rnp_up->lock); /* irqs still off */ - } - raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - -/* - * Snapshot the tasks blocking the newly started preemptible-RCU expedited - * grace period for the specified rcu_node structure, phase 2. If the - * leaf rcu_node structure has its ->expmask field set, check for tasks. - * If there are some, clear ->expmask and set ->exp_tasks accordingly, - * then initiate RCU priority boosting. Otherwise, clear ->expmask and - * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits, - * enabling rcu_read_unlock_special() to do the bit-clearing. - * - * Caller must hold the root rcu_node's exp_funnel_mutex. - */ -static void -sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); - if (!rnp->expmask) { - /* Phase 1 didn't do anything, so Phase 2 doesn't either. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - - /* Phase 1 is over. */ - rnp->expmask = 0; + struct rcu_data *rdp; + struct rcu_state *rsp = info; + struct task_struct *t = current; /* - * If there are still blocked tasks, set up ->exp_tasks so that - * rcu_read_unlock_special() will wake us and then boost them. + * Within an RCU read-side critical section, request that the next + * rcu_read_unlock() report. Unless this RCU read-side critical + * section has already blocked, in which case it is already set + * up for the expedited grace period to wait on it. */ - if (rcu_preempt_has_tasks(rnp)) { - rnp->exp_tasks = rnp->blkd_tasks.next; - rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ + if (t->rcu_read_lock_nesting > 0 && + !t->rcu_read_unlock_special.b.blocked) { + t->rcu_read_unlock_special.b.exp_need_qs = true; return; } - /* No longer any blocked tasks, so undo bit setting. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); - rcu_report_exp_rnp(rsp, rnp, false); + /* + * We are either exiting an RCU read-side critical section (negative + * values of t->rcu_read_lock_nesting) or are not in one at all + * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU + * read-side critical section that blocked before this expedited + * grace period started. Either way, we can immediately report + * the quiescent state. + */ + rdp = this_cpu_ptr(rsp->rda); + rcu_report_exp_rdp(rsp, rdp, true); } /** @@ -713,24 +756,12 @@ void synchronize_rcu_expedited(void) rcu_exp_gp_seq_start(rsp); - /* force all RCU readers onto ->blkd_tasks lists. */ - synchronize_sched_expedited(); - - /* - * Snapshot current state of ->blkd_tasks lists into ->expmask. - * Phase 1 sets bits and phase 2 permits rcu_read_unlock_special() - * to start clearing them. Doing this in one phase leads to - * strange races between setting and clearing bits, so just say "no"! - */ - rcu_for_each_leaf_node(rsp, rnp) - sync_rcu_preempt_exp_init1(rsp, rnp); - rcu_for_each_leaf_node(rsp, rnp) - sync_rcu_preempt_exp_init2(rsp, rnp); + /* Initialize the rcu_node tree in preparation for the wait. */ + sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); /* Wait for snapshotted ->blkd_tasks lists to drain. */ rnp = rcu_get_root(rsp); - wait_event(sync_rcu_preempt_exp_wq, - sync_rcu_preempt_exp_done(rnp)); + synchronize_sched_expedited_wait(rsp); /* Clean up and exit. */ rcu_exp_gp_seq_end(rsp); @@ -835,6 +866,16 @@ static int rcu_print_task_stall(struct rcu_node *rnp) } /* + * Because preemptible RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections that are + * blocking the current expedited grace period. + */ +static int rcu_print_task_exp_stall(struct rcu_node *rnp) +{ + return 0; +} + +/* * Because there is no preemptible RCU, there can be no readers blocked, * so there is no need to check for blocked tasks. So check only for * bogus qsmask values. @@ -1702,8 +1743,12 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", - cpu, ticks_value, ticks_title, + pr_err("\t%d-%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", + cpu, + "O."[!!cpu_online(cpu)], + "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], + "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], + ticks_value, ticks_title, atomic_read(&rdtp->dynticks) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 6fc4c5ff3bb5..ef7093cc9b5c 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -117,13 +117,13 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) if (!rdp->beenonline) return; - seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d", + seq_printf(m, "%3d%cc=%ld g=%ld cnq=%d/%d:%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', ulong2long(rdp->completed), ulong2long(rdp->gpnum), - rdp->passed_quiesce, + rdp->cpu_no_qs.b.norm, rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), - rdp->qs_pending); + rdp->core_needs_qs); seq_printf(m, " dt=%d/%llx/%d df=%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, @@ -268,7 +268,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) gpnum = rsp->gpnum; seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ", ulong2long(rsp->completed), ulong2long(gpnum), - rsp->fqs_state, + rsp->gp_state, (long)(rsp->jiffies_force_qs - jiffies), (int)(jiffies & 0xffff)); seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", @@ -361,7 +361,7 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) cpu_is_offline(rdp->cpu) ? '!' : ' ', rdp->n_rcu_pending); seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ", - rdp->n_rp_qs_pending, + rdp->n_rp_core_needs_qs, rdp->n_rp_report_qs, rdp->n_rp_cb_ready, rdp->n_rp_cpu_needs_gp); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 7a0b3bc7c5ed..5f748c5a40f0 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -534,7 +534,7 @@ static void rcu_spawn_tasks_kthread(void); * Post an RCU-tasks callback. First call must be from process context * after the scheduler if fully operational. */ -void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp)) +void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) { unsigned long flags; bool needwake; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bcd214e4b4d6..f7402f7eb448 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4029,6 +4029,7 @@ int sched_setscheduler_nocheck(struct task_struct *p, int policy, { return _sched_setscheduler(p, policy, param, false); } +EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) diff --git a/kernel/torture.c b/kernel/torture.c index 3e4840633d3e..44aa462d033f 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -523,6 +523,7 @@ static int stutter; */ void stutter_wait(const char *title) { + cond_resched_rcu_qs(); while (READ_ONCE(stutter_pause_test) || (torture_runnable && !READ_ONCE(*torture_runnable))) { if (stutter_pause_test) |