From 546a9d8519ed137b2804a3f5a3659003039dd49c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Jun 2014 14:57:10 -0700 Subject: rcu: Export debug_init_rcu_head() and and debug_init_rcu_head() Currently, call_rcu() relies on implicit allocation and initialization for the debug-objects handling of RCU callbacks. If you hammer the kernel hard enough with Sasha's modified version of trinity, you can end up with the sl*b allocators recursing into themselves via this implicit call_rcu() allocation. This commit therefore exports the debug_init_rcu_head() and debug_rcu_head_free() functions, which permits the allocators to allocated and pre-initialize the debug-objects information, so that there no longer any need for call_rcu() to do that initialization, which in turn prevents the recursion into the memory allocators. Reported-by: Sasha Levin Suggested-by: Thomas Gleixner Signed-off-by: Paul E. McKenney Acked-by: Thomas Gleixner Looks-good-to: Christoph Lameter --- include/linux/rcupdate.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 5a75d19aa661..13bbfbde41b9 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -358,9 +358,19 @@ void wait_rcu_gp(call_rcu_func_t crf); * initialization. */ #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD +void init_rcu_head(struct rcu_head *head); +void destroy_rcu_head(struct rcu_head *head); void init_rcu_head_on_stack(struct rcu_head *head); void destroy_rcu_head_on_stack(struct rcu_head *head); #else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +static inline void init_rcu_head(struct rcu_head *head) +{ +} + +static inline void destroy_rcu_head(struct rcu_head *head) +{ +} + static inline void init_rcu_head_on_stack(struct rcu_head *head) { } -- cgit v1.2.3 From 4a81e8328d3791a4f99bf5b436d050f6dc5ffea3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 20 Jun 2014 16:49:01 -0700 Subject: rcu: Reduce overhead of cond_resched() checks for RCU Commit ac1bea85781e (Make cond_resched() report RCU quiescent states) fixed a problem where a CPU looping in the kernel with but one runnable task would give RCU CPU stall warnings, even if the in-kernel loop contained cond_resched() calls. Unfortunately, in so doing, it introduced performance regressions in Anton Blanchard's will-it-scale "open1" test. The problem appears to be not so much the increased cond_resched() path length as an increase in the rate at which grace periods complete, which increased per-update grace-period overhead. This commit takes a different approach to fixing this bug, mainly by moving the RCU-visible quiescent state from cond_resched() to rcu_note_context_switch(), and by further reducing the check to a simple non-zero test of a single per-CPU variable. However, this approach requires that the force-quiescent-state processing send resched IPIs to the offending CPUs. These will be sent only once the grace period has reached an age specified by the boot/sysfs parameter rcutree.jiffies_till_sched_qs, or once the grace period reaches an age halfway to the point at which RCU CPU stall warnings will be emitted, whichever comes first. Reported-by: Dave Hansen Signed-off-by: Paul E. McKenney Cc: Andi Kleen Cc: Christoph Lameter Cc: Mike Galbraith Cc: Eric Dumazet Reviewed-by: Josh Triplett [ paulmck: Made rcu_momentary_dyntick_idle() as suggested by the ktest build robot. Also fixed smp_mb() comment as noted by Oleg Nesterov. ] Merge with e552592e (Reduce overhead of cond_resched() checks for RCU) Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 36 ------------------------------------ 1 file changed, 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 13bbfbde41b9..6a94cc8b1ca0 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -44,7 +44,6 @@ #include #include #include -#include #include extern int rcu_expedited; /* for sysctl */ @@ -299,41 +298,6 @@ static inline void rcu_user_hooks_switch(struct task_struct *prev, bool __rcu_is_watching(void); #endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) */ -/* - * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings. - */ - -#define RCU_COND_RESCHED_LIM 256 /* ms vs. 100s of ms. */ -DECLARE_PER_CPU(int, rcu_cond_resched_count); -void rcu_resched(void); - -/* - * Is it time to report RCU quiescent states? - * - * Note unsynchronized access to rcu_cond_resched_count. Yes, we might - * increment some random CPU's count, and possibly also load the result from - * yet another CPU's count. We might even clobber some other CPU's attempt - * to zero its counter. This is all OK because the goal is not precision, - * but rather reasonable amortization of rcu_note_context_switch() overhead - * and extremely high probability of avoiding RCU CPU stall warnings. - * Note that this function has to be preempted in just the wrong place, - * many thousands of times in a row, for anything bad to happen. - */ -static inline bool rcu_should_resched(void) -{ - return raw_cpu_inc_return(rcu_cond_resched_count) >= - RCU_COND_RESCHED_LIM; -} - -/* - * Report quiscent states to RCU if it is time to do so. - */ -static inline void rcu_cond_resched(void) -{ - if (unlikely(rcu_should_resched())) - rcu_resched(); -} - /* * Infrastructure to implement the synchronize_() primitives in * TREE_RCU and rcu_barrier_() primitives in TINY_RCU. -- cgit v1.2.3 From f27bc4873fa8b75cc1eba7b641eda7375dc72ccf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 4 May 2014 15:38:38 -0700 Subject: rcu: Document deadlock-avoidance information for rcu_read_unlock() Reported-by: Oleg Nesterov Signed-off-by: Paul E. McKenney Reviewed-by: Lai Jiangshan --- include/linux/rcupdate.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 6a94cc8b1ca0..c56ad15204ec 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -858,6 +858,34 @@ static inline void rcu_read_lock(void) /** * rcu_read_unlock() - marks the end of an RCU read-side critical section. * + * In most situations, rcu_read_unlock() is immune from deadlock. + * However, in kernels built with CONFIG_RCU_BOOST, rcu_read_unlock() + * is responsible for deboosting, which it does via rt_mutex_unlock(). + * Unfortunately, this function acquires the scheduler's runqueue and + * priority-inheritance spinlocks. This means that deadlock could result + * if the caller of rcu_read_unlock() already holds one of these locks or + * any lock that is ever acquired while holding them. + * + * That said, RCU readers are never priority boosted unless they were + * preempted. Therefore, one way to avoid deadlock is to make sure + * that preemption never happens within any RCU read-side critical + * section whose outermost rcu_read_unlock() is called with one of + * rt_mutex_unlock()'s locks held. Such preemption can be avoided in + * a number of ways, for example, by invoking preempt_disable() before + * critical section's outermost rcu_read_lock(). + * + * Given that the set of locks acquired by rt_mutex_unlock() might change + * at any time, a somewhat more future-proofed approach is to make sure + * that that preemption never happens within any RCU read-side critical + * section whose outermost rcu_read_unlock() is called with irqs disabled. + * This approach relies on the fact that rt_mutex_unlock() currently only + * acquires irq-disabled locks. + * + * The second of these two approaches is best in most situations, + * however, the first approach can also be useful, at least to those + * developers willing to keep abreast of the set of locks acquired by + * rt_mutex_unlock(). + * * See rcu_read_lock() for more information. */ static inline void rcu_read_unlock(void) -- cgit v1.2.3 From ab74fdfd4e11ec040f21cf87edc14fc9f62cc934 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 4 May 2014 15:41:21 -0700 Subject: rcu: Handle obsolete references to TINY_PREEMPT_RCU Signed-off-by: Paul E. McKenney Reviewed-by: Lai Jiangshan --- include/linux/rcupdate.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index c56ad15204ec..d231aa17b1d7 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -826,15 +826,14 @@ static inline void rcu_preempt_sleep_check(void) * read-side critical section that would block in a !PREEMPT kernel. * But if you want the full story, read on! * - * In non-preemptible RCU implementations (TREE_RCU and TINY_RCU), it - * is illegal to block while in an RCU read-side critical section. In - * preemptible RCU implementations (TREE_PREEMPT_RCU and TINY_PREEMPT_RCU) - * in CONFIG_PREEMPT kernel builds, RCU read-side critical sections may - * be preempted, but explicit blocking is illegal. Finally, in preemptible - * RCU implementations in real-time (with -rt patchset) kernel builds, - * RCU read-side critical sections may be preempted and they may also - * block, but only when acquiring spinlocks that are subject to priority - * inheritance. + * In non-preemptible RCU implementations (TREE_RCU and TINY_RCU), + * it is illegal to block while in an RCU read-side critical section. + * In preemptible RCU implementations (TREE_PREEMPT_RCU) in CONFIG_PREEMPT + * kernel builds, RCU read-side critical sections may be preempted, + * but explicit blocking is illegal. Finally, in preemptible RCU + * implementations in real-time (with -rt patchset) kernel builds, RCU + * read-side critical sections may be preempted and they may also block, but + * only when acquiring spinlocks that are subject to priority inheritance. */ static inline void rcu_read_lock(void) { -- cgit v1.2.3 From abaa93d9e1de2c29297e69ddba8ddd38f15064cf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Jun 2014 13:30:25 -0700 Subject: rcu: Simplify priority boosting by putting rt_mutex in rcu_node RCU priority boosting currently checks for boosting via a pointer in task_struct. However, this is not needed: As Oleg noted, if the rt_mutex is placed in the rcu_node instead of on the booster's stack, the boostee can simply check it see if it owns the lock. This commit makes this change, shrinking task_struct by one pointer and the kernel by thirteen lines. Suggested-by: Oleg Nesterov Signed-off-by: Paul E. McKenney --- include/linux/init_task.h | 9 +-------- include/linux/sched.h | 6 ------ 2 files changed, 1 insertion(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 6df7f9fe0d01..2bb4c4f3531a 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -102,12 +102,6 @@ extern struct group_info init_groups; #define INIT_IDS #endif -#ifdef CONFIG_RCU_BOOST -#define INIT_TASK_RCU_BOOST() \ - .rcu_boost_mutex = NULL, -#else -#define INIT_TASK_RCU_BOOST() -#endif #ifdef CONFIG_TREE_PREEMPT_RCU #define INIT_TASK_RCU_TREE_PREEMPT() \ .rcu_blocked_node = NULL, @@ -119,8 +113,7 @@ extern struct group_info init_groups; .rcu_read_lock_nesting = 0, \ .rcu_read_unlock_special = 0, \ .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), \ - INIT_TASK_RCU_TREE_PREEMPT() \ - INIT_TASK_RCU_BOOST() + INIT_TASK_RCU_TREE_PREEMPT() #else #define INIT_TASK_RCU_PREEMPT(tsk) #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 306f4f0c987a..3cfbc05e66e6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1270,9 +1270,6 @@ struct task_struct { #ifdef CONFIG_TREE_PREEMPT_RCU struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -#ifdef CONFIG_RCU_BOOST - struct rt_mutex *rcu_boost_mutex; -#endif /* #ifdef CONFIG_RCU_BOOST */ #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; @@ -2009,9 +2006,6 @@ static inline void rcu_copy_process(struct task_struct *p) #ifdef CONFIG_TREE_PREEMPT_RCU p->rcu_blocked_node = NULL; #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -#ifdef CONFIG_RCU_BOOST - p->rcu_boost_mutex = NULL; -#endif /* #ifdef CONFIG_RCU_BOOST */ INIT_LIST_HEAD(&p->rcu_node_entry); } -- cgit v1.2.3 From c0f489d2c6fec8994c642c2ec925eb858727dc7b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 Jun 2014 13:46:03 -0700 Subject: rcu: Bind grace-period kthreads to non-NO_HZ_FULL CPUs Binding the grace-period kthreads to the timekeeping CPU resulted in significant performance decreases for some workloads. For more detail, see: https://lkml.org/lkml/2014/6/3/395 for benchmark numbers https://lkml.org/lkml/2014/6/4/218 for CPU statistics It turns out that it is necessary to bind the grace-period kthreads to the timekeeping CPU only when all but CPU 0 is a nohz_full CPU on the one hand or if CONFIG_NO_HZ_FULL_SYSIDLE=y on the other. In other cases, it suffices to bind the grace-period kthreads to the set of non-nohz_full CPUs. This commit therefore creates a tick_nohz_not_full_mask that is the complement of tick_nohz_full_mask, and then binds the grace-period kthread to the set of CPUs indicated by this new mask, which covers the CONFIG_NO_HZ_FULL_SYSIDLE=n case. The CONFIG_NO_HZ_FULL_SYSIDLE=y case still binds the grace-period kthreads to the timekeeping CPU. This commit also includes the tick_nohz_full_enabled() check suggested by Frederic Weisbecker. Reported-by: Jet Chen Signed-off-by: Paul E. McKenney [ paulmck: Created housekeeping_affine() and housekeeping_mask per fweisbec feedback. ] --- include/linux/tick.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tick.h b/include/linux/tick.h index b84773cb9f4c..06cc093ab7ad 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef CONFIG_GENERIC_CLOCKEVENTS @@ -162,6 +163,7 @@ static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } #ifdef CONFIG_NO_HZ_FULL extern bool tick_nohz_full_running; extern cpumask_var_t tick_nohz_full_mask; +extern cpumask_var_t housekeeping_mask; static inline bool tick_nohz_full_enabled(void) { @@ -194,6 +196,24 @@ static inline void tick_nohz_full_kick_all(void) { } static inline void __tick_nohz_task_switch(struct task_struct *tsk) { } #endif +static inline bool is_housekeeping_cpu(int cpu) +{ +#ifdef CONFIG_NO_HZ_FULL + if (tick_nohz_full_enabled()) + return cpumask_test_cpu(cpu, housekeeping_mask); +#endif + return true; +} + +static inline void housekeeping_affine(struct task_struct *t) +{ +#ifdef CONFIG_NO_HZ_FULL + if (tick_nohz_full_enabled()) + set_cpus_allowed_ptr(t, housekeeping_mask); + +#endif +} + static inline void tick_nohz_full_check(void) { if (tick_nohz_full_enabled()) -- cgit v1.2.3