From 9bd721c55c8a886b938a45198aab0ccb52f1f7fa Mon Sep 17 00:00:00 2001 From: Jason Low Date: Fri, 13 Sep 2013 11:26:52 -0700 Subject: sched/balancing: Consider max cost of idle balance per sched domain In this patch, we keep track of the max cost we spend doing idle load balancing for each sched domain. If the avg time the CPU remains idle is less then the time we have already spent on idle balancing + the max cost of idle balancing in the sched domain, then we don't continue to attempt the balance. We also keep a per rq variable, max_idle_balance_cost, which keeps track of the max time spent on newidle load balances throughout all its domains so that we can determine the avg_idle's max value. By using the max, we avoid overrunning the average. This further reduces the chance we attempt balancing when the CPU is not idle for longer than the cost to balance. Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1379096813-3032-3-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + include/linux/topology.h | 3 +++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6682da36b293..be078ff9157f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -810,6 +810,7 @@ struct sched_domain { unsigned int nr_balance_failed; /* initialise to 0 */ u64 last_update; + u64 max_newidle_lb_cost; #ifdef CONFIG_SCHEDSTATS /* load_balance() stats */ diff --git a/include/linux/topology.h b/include/linux/topology.h index d3cf0d6e7712..e2a2c3da2929 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -106,6 +106,7 @@ int arch_update_cpu_topology(void); .last_balance = jiffies, \ .balance_interval = 1, \ .smt_gain = 1178, /* 15% */ \ + .max_newidle_lb_cost = 0, \ } #endif #endif /* CONFIG_SCHED_SMT */ @@ -135,6 +136,7 @@ int arch_update_cpu_topology(void); , \ .last_balance = jiffies, \ .balance_interval = 1, \ + .max_newidle_lb_cost = 0, \ } #endif #endif /* CONFIG_SCHED_MC */ @@ -166,6 +168,7 @@ int arch_update_cpu_topology(void); , \ .last_balance = jiffies, \ .balance_interval = 1, \ + .max_newidle_lb_cost = 0, \ } #endif -- cgit v1.2.3 From f48627e686a69f5215cb0761e731edb3d9859dd9 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Fri, 13 Sep 2013 11:26:53 -0700 Subject: sched/balancing: Periodically decay max cost of idle balance This patch builds on patch 2 and periodically decays that max value to do idle balancing per sched domain by approximately 1% per second. Also decay the rq's max_idle_balance_cost value. 
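Since this log is limited to include/, the kernel/sched side that consumes these new fields is not shown here. The following is a rough, hedged sketch of how newidle balancing is intended to use them; names such as this_rq, sd, sched_clock_cpu() and load_balance() follow the scheduler, but this is a simplification for orientation, not a hunk from these patches.

        struct sched_domain *sd;
        int pulled_task = 0, balance = 1;
        u64 curr_cost = 0;

        if (this_rq->avg_idle < this_rq->max_idle_balance_cost)
                return;         /* expected idle time cannot even cover the worst past cost */

        for_each_domain(this_cpu, sd) {
                u64 t0, domain_cost;

                if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
                        break;  /* the remaining idle time would be overrun */

                t0 = sched_clock_cpu(this_cpu);
                pulled_task = load_balance(this_cpu, this_rq, sd,
                                           CPU_NEWLY_IDLE, &balance);
                domain_cost = sched_clock_cpu(this_cpu) - t0;

                if (domain_cost > sd->max_newidle_lb_cost)
                        sd->max_newidle_lb_cost = domain_cost;  /* remember the worst case */
                curr_cost += domain_cost;

                if (pulled_task)
                        break;
        }

        if (curr_cost > this_rq->max_idle_balance_cost)
                this_rq->max_idle_balance_cost = curr_cost;

The decay patch then shrinks sd->max_newidle_lb_cost by roughly 1% per second from the periodic balancer, so one unusually slow balance does not suppress newidle balancing forever.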
Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1379096813-3032-4-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 +++ include/linux/topology.h | 3 +++ 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index be078ff9157f..b5344de1658b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -810,7 +810,10 @@ struct sched_domain { unsigned int nr_balance_failed; /* initialise to 0 */ u64 last_update; + + /* idle_balance() stats */ u64 max_newidle_lb_cost; + unsigned long next_decay_max_lb_cost; #ifdef CONFIG_SCHEDSTATS /* load_balance() stats */ diff --git a/include/linux/topology.h b/include/linux/topology.h index e2a2c3da2929..12ae6ce997d6 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -107,6 +107,7 @@ int arch_update_cpu_topology(void); .balance_interval = 1, \ .smt_gain = 1178, /* 15% */ \ .max_newidle_lb_cost = 0, \ + .next_decay_max_lb_cost = jiffies, \ } #endif #endif /* CONFIG_SCHED_SMT */ @@ -137,6 +138,7 @@ int arch_update_cpu_topology(void); .last_balance = jiffies, \ .balance_interval = 1, \ .max_newidle_lb_cost = 0, \ + .next_decay_max_lb_cost = jiffies, \ } #endif #endif /* CONFIG_SCHED_MC */ @@ -169,6 +171,7 @@ int arch_update_cpu_topology(void); .last_balance = jiffies, \ .balance_interval = 1, \ .max_newidle_lb_cost = 0, \ + .next_decay_max_lb_cost = jiffies, \ } #endif -- cgit v1.2.3 From 3150398626466c6cc626732f60bc901d58f40677 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 12 Sep 2013 15:10:31 +0200 Subject: sched: Remove {set,clear}_need_resched Preemption semantics are going to change which mandate a change. All DRM usage sites are already broken and will not be affected (much) by this change. DRM people are aware and will remove the last few stragglers. For now, leave an empty stub that generates a warning, once all users are gone we can remove this. Signed-off-by: Peter Zijlstra Cc: airlied@linux.ie Cc: daniel.vetter@ffwll.ch Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/n/tip-qfc1el2zvhxiyut4ai99ij4n@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/thread_info.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index e7e04736802f..a629e4b23217 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -104,8 +104,19 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag) #define test_thread_flag(flag) \ test_ti_thread_flag(current_thread_info(), flag) -#define set_need_resched() set_thread_flag(TIF_NEED_RESCHED) -#define clear_need_resched() clear_thread_flag(TIF_NEED_RESCHED) +static inline __deprecated void set_need_resched(void) +{ + /* + * Use of this function in deprecated. + * + * As of this writing there are only a few users in the DRM tree left + * all of which are wrong and can be removed without causing too much + * grief. + * + * The DRM people are aware and are working on removing the last few + * instances. 
+ */ +} #if defined TIF_RESTORE_SIGMASK && !defined HAVE_SET_RESTORE_SIGMASK /* -- cgit v1.2.3 From ea8117478918a4734586d35ff530721b682425be Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Sep 2013 12:43:13 +0200 Subject: sched, idle: Fix the idle polling state logic Mike reported that commit 7d1a9417 ("x86: Use generic idle loop") regressed several workloads and caused excessive reschedule interrupts. The patch in question failed to notice that the x86 code had an inverted sense of the polling state versus the new generic code (x86: default polling, generic: default !polling). Fix the two prominent x86 mwait based idle drivers and introduce a few new generic polling helpers (fixing the wrong smp_mb__after_clear_bit usage). Also switch the idle routines to using tif_need_resched() which is an immediate TIF_NEED_RESCHED test as opposed to need_resched which will end up being slightly different. Reported-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: lenb@kernel.org Cc: tglx@linutronix.de Link: http://lkml.kernel.org/n/tip-nc03imb0etuefmzybzj7sprf@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 78 +++++++++++++++++++++++++++++++++++++++++---- include/linux/thread_info.h | 2 ++ 2 files changed, 73 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index b5344de1658b..e783ec52295a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2479,34 +2479,98 @@ static inline int tsk_is_polling(struct task_struct *p) { return task_thread_info(p)->status & TS_POLLING; } -static inline void current_set_polling(void) +static inline void __current_set_polling(void) { current_thread_info()->status |= TS_POLLING; } -static inline void current_clr_polling(void) +static inline bool __must_check current_set_polling_and_test(void) +{ + __current_set_polling(); + + /* + * Polling state must be visible before we test NEED_RESCHED, + * paired by resched_task() + */ + smp_mb(); + + return unlikely(tif_need_resched()); +} + +static inline void __current_clr_polling(void) { current_thread_info()->status &= ~TS_POLLING; - smp_mb__after_clear_bit(); +} + +static inline bool __must_check current_clr_polling_and_test(void) +{ + __current_clr_polling(); + + /* + * Polling state must be visible before we test NEED_RESCHED, + * paired by resched_task() + */ + smp_mb(); + + return unlikely(tif_need_resched()); } #elif defined(TIF_POLLING_NRFLAG) static inline int tsk_is_polling(struct task_struct *p) { return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG); } -static inline void current_set_polling(void) + +static inline void __current_set_polling(void) { set_thread_flag(TIF_POLLING_NRFLAG); } -static inline void current_clr_polling(void) +static inline bool __must_check current_set_polling_and_test(void) +{ + __current_set_polling(); + + /* + * Polling state must be visible before we test NEED_RESCHED, + * paired by resched_task() + * + * XXX: assumes set/clear bit are identical barrier wise. 
+ */ + smp_mb__after_clear_bit(); + + return unlikely(tif_need_resched()); +} + +static inline void __current_clr_polling(void) { clear_thread_flag(TIF_POLLING_NRFLAG); } + +static inline bool __must_check current_clr_polling_and_test(void) +{ + __current_clr_polling(); + + /* + * Polling state must be visible before we test NEED_RESCHED, + * paired by resched_task() + */ + smp_mb__after_clear_bit(); + + return unlikely(tif_need_resched()); +} + #else static inline int tsk_is_polling(struct task_struct *p) { return 0; } -static inline void current_set_polling(void) { } -static inline void current_clr_polling(void) { } +static inline void __current_set_polling(void) { } +static inline void __current_clr_polling(void) { } + +static inline bool __must_check current_set_polling_and_test(void) +{ + return unlikely(tif_need_resched()); +} +static inline bool __must_check current_clr_polling_and_test(void) +{ + return unlikely(tif_need_resched()); +} #endif /* diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index a629e4b23217..fddbe2023a5d 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -118,6 +118,8 @@ static inline __deprecated void set_need_resched(void) */ } +#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) + #if defined TIF_RESTORE_SIGMASK && !defined HAVE_SET_RESTORE_SIGMASK /* * An arch can define its own version of set_restore_sigmask() to get the -- cgit v1.2.3 From 4a2b4b222743bb07fedf985b884550f2ca067ea9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 Aug 2013 14:55:24 +0200 Subject: sched: Introduce preempt_count accessor functions Replace the single preempt_count() 'function' that's an lvalue with two proper functions: preempt_count() - returns the preempt_count value as rvalue preempt_count_set() - Allows setting the preempt-count value Also provide preempt_count_ptr() as a convenience wrapper to implement all modifying operations. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-orxrbycjozopqfhb4dxdkdvb@git.kernel.org [ Fixed build failure. ] Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index f5d4723cdb3d..eaac52a8fe6a 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -10,19 +10,32 @@ #include #include +static __always_inline int preempt_count(void) +{ + return current_thread_info()->preempt_count; +} + +static __always_inline int *preempt_count_ptr(void) +{ + return ¤t_thread_info()->preempt_count; +} + +static __always_inline void preempt_count_set(int pc) +{ + *preempt_count_ptr() = pc; +} + #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) extern void add_preempt_count(int val); extern void sub_preempt_count(int val); #else -# define add_preempt_count(val) do { preempt_count() += (val); } while (0) -# define sub_preempt_count(val) do { preempt_count() -= (val); } while (0) +# define add_preempt_count(val) do { *preempt_count_ptr() += (val); } while (0) +# define sub_preempt_count(val) do { *preempt_count_ptr() -= (val); } while (0) #endif #define inc_preempt_count() add_preempt_count(1) #define dec_preempt_count() sub_preempt_count(1) -#define preempt_count() (current_thread_info()->preempt_count) - #ifdef CONFIG_PREEMPT asmlinkage void preempt_schedule(void); @@ -81,9 +94,9 @@ do { \ /* For debugging and tracer internals only! 
*/ #define add_preempt_count_notrace(val) \ - do { preempt_count() += (val); } while (0) + do { *preempt_count_ptr() += (val); } while (0) #define sub_preempt_count_notrace(val) \ - do { preempt_count() -= (val); } while (0) + do { *preempt_count_ptr() -= (val); } while (0) #define inc_preempt_count_notrace() add_preempt_count_notrace(1) #define dec_preempt_count_notrace() sub_preempt_count_notrace(1) -- cgit v1.2.3 From f27dde8deef33c9e58027df11ceab2198601d6a6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 Aug 2013 14:55:31 +0200 Subject: sched: Add NEED_RESCHED to the preempt_count In order to combine the preemption and need_resched test we need to fold the need_resched information into the preempt_count value. Since the NEED_RESCHED flag is set across CPUs this needs to be an atomic operation, however we very much want to avoid making preempt_count atomic, therefore we keep the existing TIF_NEED_RESCHED infrastructure in place but at 3 sites test it and fold its value into preempt_count; namely: - resched_task() when setting TIF_NEED_RESCHED on the current task - scheduler_ipi() when resched_task() sets TIF_NEED_RESCHED on a remote task it follows it up with a reschedule IPI and we can modify the cpu local preempt_count from there. - cpu_idle_loop() for when resched_task() found tsk_is_polling(). We use an inverted bitmask to indicate need_resched so that a 0 means both need_resched and !atomic. Also remove the barrier() in preempt_enable() between preempt_enable_no_resched() and preempt_check_resched() to avoid having to reload the preemption value and allow the compiler to use the flags of the previuos decrement. I couldn't come up with any sane reason for this barrier() to be there as preempt_enable_no_resched() already has a barrier() before doing the decrement. Suggested-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-7a7m5qqbn5pmwnd4wko9u6da@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 47 ++++++++++++++++++++++++++++++++++++++++++----- include/linux/sched.h | 7 +++++-- 2 files changed, 47 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index eaac52a8fe6a..92e341853e4b 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -10,9 +10,19 @@ #include #include +/* + * We use the MSB mostly because its available; see for + * the other bits -- can't include that header due to inclusion hell. + */ +#define PREEMPT_NEED_RESCHED 0x80000000 + +/* + * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users + * that think a non-zero value indicates we cannot preempt. + */ static __always_inline int preempt_count(void) { - return current_thread_info()->preempt_count; + return current_thread_info()->preempt_count & ~PREEMPT_NEED_RESCHED; } static __always_inline int *preempt_count_ptr(void) @@ -20,11 +30,40 @@ static __always_inline int *preempt_count_ptr(void) return ¤t_thread_info()->preempt_count; } +/* + * We now loose PREEMPT_NEED_RESCHED and cause an extra reschedule; however the + * alternative is loosing a reschedule. Better schedule too often -- also this + * should be a very rare operation. + */ static __always_inline void preempt_count_set(int pc) { *preempt_count_ptr() = pc; } +/* + * We fold the NEED_RESCHED bit into the preempt count such that + * preempt_enable() can decrement and test for needing to reschedule with a + * single instruction. 
+ * + * We invert the actual bit, so that when the decrement hits 0 we know we both + * need to resched (the bit is cleared) and can resched (no preempt count). + */ + +static __always_inline void set_preempt_need_resched(void) +{ + *preempt_count_ptr() &= ~PREEMPT_NEED_RESCHED; +} + +static __always_inline void clear_preempt_need_resched(void) +{ + *preempt_count_ptr() |= PREEMPT_NEED_RESCHED; +} + +static __always_inline bool test_preempt_need_resched(void) +{ + return !(*preempt_count_ptr() & PREEMPT_NEED_RESCHED); +} + #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) extern void add_preempt_count(int val); extern void sub_preempt_count(int val); @@ -42,7 +81,7 @@ asmlinkage void preempt_schedule(void); #define preempt_check_resched() \ do { \ - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ + if (unlikely(!*preempt_count_ptr())) \ preempt_schedule(); \ } while (0) @@ -52,7 +91,7 @@ void preempt_schedule_context(void); #define preempt_check_resched_context() \ do { \ - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ + if (unlikely(!*preempt_count_ptr())) \ preempt_schedule_context(); \ } while (0) #else @@ -88,7 +127,6 @@ do { \ #define preempt_enable() \ do { \ preempt_enable_no_resched(); \ - barrier(); \ preempt_check_resched(); \ } while (0) @@ -116,7 +154,6 @@ do { \ #define preempt_enable_notrace() \ do { \ preempt_enable_no_resched_notrace(); \ - barrier(); \ preempt_check_resched_context(); \ } while (0) diff --git a/include/linux/sched.h b/include/linux/sched.h index e783ec52295a..9fa151fb968e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -22,6 +22,7 @@ struct sched_param { #include #include #include +#include #include #include @@ -434,7 +435,9 @@ struct task_cputime { * We include PREEMPT_ACTIVE to avoid cond_resched() from working * before the scheduler is active -- see should_resched(). */ -#define INIT_PREEMPT_COUNT (1 + PREEMPT_ACTIVE) +#define INIT_PREEMPT_COUNT (1 + PREEMPT_ACTIVE + PREEMPT_NEED_RESCHED) +#define PREEMPT_ENABLED (PREEMPT_NEED_RESCHED) +#define PREEMPT_DISABLED (1 + PREEMPT_NEED_RESCHED) /** * struct thread_group_cputimer - thread group interval timer counts @@ -2408,7 +2411,7 @@ static inline int signal_pending_state(long state, struct task_struct *p) static inline int need_resched(void) { - return unlikely(test_thread_flag(TIF_NEED_RESCHED)); + return unlikely(test_preempt_need_resched()); } /* -- cgit v1.2.3 From a787870924dbd6f321661e06d4ec1c7a408c9ccf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 Aug 2013 14:55:40 +0200 Subject: sched, arch: Create asm/preempt.h In order to prepare to per-arch implementations of preempt_count move the required bits into an asm-generic header and use this for all archs. 
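Before the asm-generic split below, it helps to spell out the encoding the previous patch just introduced. The table and the quoted expansion are an informal restatement of the hunks above, not additional code from the patch.

        /*
         * PREEMPT_NEED_RESCHED (0x80000000) is stored inverted:
         *
         *      raw preempt_count word    meaning
         *      0x80000001                preempt disabled once, no resched pending
         *      0x00000001                preempt disabled once, resched pending
         *      0x80000000                preemptible, no resched pending (PREEMPT_ENABLED)
         *      0x00000000                preemptible and resched pending
         *
         * so after preempt_enable_no_resched() drops the count, the check is a
         * single test of the whole word against zero, as in the hunk above:
         */
        #define preempt_check_resched() \
        do { \
                if (unlikely(!*preempt_count_ptr())) \
                        preempt_schedule(); \
        } while (0)

Only when the word reaches exactly zero are both facts true at once: no nesting left and a fold of TIF_NEED_RESCHED has arrived. Later patches in this series turn the decrement plus this test into a single decrement-and-branch.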
Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-h5j0c1r3e3fk015m30h8f1zx@git.kernel.org Signed-off-by: Ingo Molnar --- include/asm-generic/preempt.h | 54 +++++++++++++++++++++++++++++++++++++++++++ include/linux/preempt.h | 49 +-------------------------------------- 2 files changed, 55 insertions(+), 48 deletions(-) create mode 100644 include/asm-generic/preempt.h (limited to 'include') diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h new file mode 100644 index 000000000000..a1fc6590a743 --- /dev/null +++ b/include/asm-generic/preempt.h @@ -0,0 +1,54 @@ +#ifndef __ASM_PREEMPT_H +#define __ASM_PREEMPT_H + +#include + +/* + * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users + * that think a non-zero value indicates we cannot preempt. + */ +static __always_inline int preempt_count(void) +{ + return current_thread_info()->preempt_count & ~PREEMPT_NEED_RESCHED; +} + +static __always_inline int *preempt_count_ptr(void) +{ + return ¤t_thread_info()->preempt_count; +} + +/* + * We now loose PREEMPT_NEED_RESCHED and cause an extra reschedule; however the + * alternative is loosing a reschedule. Better schedule too often -- also this + * should be a very rare operation. + */ +static __always_inline void preempt_count_set(int pc) +{ + *preempt_count_ptr() = pc; +} + +/* + * We fold the NEED_RESCHED bit into the preempt count such that + * preempt_enable() can decrement and test for needing to reschedule with a + * single instruction. + * + * We invert the actual bit, so that when the decrement hits 0 we know we both + * need to resched (the bit is cleared) and can resched (no preempt count). + */ + +static __always_inline void set_preempt_need_resched(void) +{ + *preempt_count_ptr() &= ~PREEMPT_NEED_RESCHED; +} + +static __always_inline void clear_preempt_need_resched(void) +{ + *preempt_count_ptr() |= PREEMPT_NEED_RESCHED; +} + +static __always_inline bool test_preempt_need_resched(void) +{ + return !(*preempt_count_ptr() & PREEMPT_NEED_RESCHED); +} + +#endif /* __ASM_PREEMPT_H */ diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 92e341853e4b..df8e245e8729 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -6,7 +6,6 @@ * preempt_count (used for kernel preemption, interrupt count, etc.) */ -#include #include #include @@ -16,53 +15,7 @@ */ #define PREEMPT_NEED_RESCHED 0x80000000 -/* - * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users - * that think a non-zero value indicates we cannot preempt. - */ -static __always_inline int preempt_count(void) -{ - return current_thread_info()->preempt_count & ~PREEMPT_NEED_RESCHED; -} - -static __always_inline int *preempt_count_ptr(void) -{ - return ¤t_thread_info()->preempt_count; -} - -/* - * We now loose PREEMPT_NEED_RESCHED and cause an extra reschedule; however the - * alternative is loosing a reschedule. Better schedule too often -- also this - * should be a very rare operation. - */ -static __always_inline void preempt_count_set(int pc) -{ - *preempt_count_ptr() = pc; -} - -/* - * We fold the NEED_RESCHED bit into the preempt count such that - * preempt_enable() can decrement and test for needing to reschedule with a - * single instruction. - * - * We invert the actual bit, so that when the decrement hits 0 we know we both - * need to resched (the bit is cleared) and can resched (no preempt count). 
- */ - -static __always_inline void set_preempt_need_resched(void) -{ - *preempt_count_ptr() &= ~PREEMPT_NEED_RESCHED; -} - -static __always_inline void clear_preempt_need_resched(void) -{ - *preempt_count_ptr() |= PREEMPT_NEED_RESCHED; -} - -static __always_inline bool test_preempt_need_resched(void) -{ - return !(*preempt_count_ptr() & PREEMPT_NEED_RESCHED); -} +#include #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) extern void add_preempt_count(int val); -- cgit v1.2.3 From 01028747559ac6c6f642a7bbd2875cc4f66b2feb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 Aug 2013 14:55:46 +0200 Subject: sched: Create more preempt_count accessors We need a few special preempt_count accessors: - task_preempt_count() for when we're interested in the preemption count of another (non-running) task. - init_task_preempt_count() for properly initializing the preemption count. - init_idle_preempt_count() a special case of the above for the idle threads. With these no generic code ever touches thread_info::preempt_count anymore and architectures could choose to remove it. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-jf5swrio8l78j37d06fzmo4r@git.kernel.org Signed-off-by: Ingo Molnar --- include/asm-generic/preempt.h | 14 ++++++++++++++ include/trace/events/sched.h | 2 +- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h index a1fc6590a743..8100b1ec1715 100644 --- a/include/asm-generic/preempt.h +++ b/include/asm-generic/preempt.h @@ -27,6 +27,20 @@ static __always_inline void preempt_count_set(int pc) *preempt_count_ptr() = pc; } +/* + * must be macros to avoid header recursion hell + */ +#define task_preempt_count(p) \ + (task_thread_info(p)->preempt_count & ~PREEMPT_NEED_RESCHED) + +#define init_task_preempt_count(p) do { \ + task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \ +} while (0) + +#define init_idle_preempt_count(p, cpu) do { \ + task_thread_info(p)->preempt_count = PREEMPT_ENABLED; \ +} while (0) + /* * We fold the NEED_RESCHED bit into the preempt count such that * preempt_enable() can decrement and test for needing to reschedule with a diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 2e7d9947a10d..613381bcde40 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -100,7 +100,7 @@ static inline long __trace_sched_switch_state(struct task_struct *p) /* * For all intents and purposes a preempted task is a running task. */ - if (task_thread_info(p)->preempt_count & PREEMPT_ACTIVE) + if (task_preempt_count(p) & PREEMPT_ACTIVE) state = TASK_RUNNING | TASK_STATE_MAX; #endif -- cgit v1.2.3 From bdb43806589096ac4272fe1307e789846ac08d7c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 Sep 2013 12:15:23 +0200 Subject: sched: Extract the basic add/sub preempt_count modifiers Rewrite the preempt_count macros in order to extract the 3 basic preempt_count value modifiers: __preempt_count_add() __preempt_count_sub() and the new: __preempt_count_dec_and_test() And since we're at it anyway, replace the unconventional $op_preempt_count names with the more conventional preempt_count_$op. Since these basic operators are equivalent to the previous _notrace() variants, do away with the _notrace() versions. 
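As a reading aid for the diff that follows, the renames amount to the mapping below; this only summarizes the hunks, it does not introduce new interfaces.

        /*
         *      old                               new
         *      add_preempt_count(val)            preempt_count_add(val)
         *      sub_preempt_count(val)            preempt_count_sub(val)
         *      inc_preempt_count()               preempt_count_inc()
         *      dec_preempt_count()               preempt_count_dec()
         *      add_preempt_count_notrace(val)    __preempt_count_add(val)
         *      sub_preempt_count_notrace(val)    __preempt_count_sub(val)
         *      (new)                             __preempt_count_dec_and_test()
         *      (new)                             should_resched()
         *
         * The underscored variants are the raw, never-traced operations; the
         * plain variants route to the out-of-line debug/tracer versions when
         * CONFIG_DEBUG_PREEMPT or CONFIG_PREEMPT_TRACER is enabled.
         */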
Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-ewbpdbupy9xpsjhg960zwbv8@git.kernel.org Signed-off-by: Ingo Molnar --- include/asm-generic/preempt.h | 35 ++++++++++++++ include/linux/hardirq.h | 8 ++-- include/linux/preempt.h | 106 +++++++++++++++++++----------------------- include/linux/sched.h | 5 -- include/linux/uaccess.h | 8 +--- 5 files changed, 89 insertions(+), 73 deletions(-) (limited to 'include') diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h index 8100b1ec1715..82d958fc3823 100644 --- a/include/asm-generic/preempt.h +++ b/include/asm-generic/preempt.h @@ -65,4 +65,39 @@ static __always_inline bool test_preempt_need_resched(void) return !(*preempt_count_ptr() & PREEMPT_NEED_RESCHED); } +/* + * The various preempt_count add/sub methods + */ + +static __always_inline void __preempt_count_add(int val) +{ + *preempt_count_ptr() += val; +} + +static __always_inline void __preempt_count_sub(int val) +{ + *preempt_count_ptr() -= val; +} + +static __always_inline bool __preempt_count_dec_and_test(void) +{ + return !--*preempt_count_ptr(); +} + +/* + * Returns true when we need to resched -- even if we can not. + */ +static __always_inline bool need_resched(void) +{ + return unlikely(test_preempt_need_resched()); +} + +/* + * Returns true when we need to resched and can (barring IRQ state). + */ +static __always_inline bool should_resched(void) +{ + return unlikely(!*preempt_count_ptr()); +} + #endif /* __ASM_PREEMPT_H */ diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 1e041063b226..d9cf963ac832 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -33,7 +33,7 @@ extern void rcu_nmi_exit(void); #define __irq_enter() \ do { \ account_irq_enter_time(current); \ - add_preempt_count(HARDIRQ_OFFSET); \ + preempt_count_add(HARDIRQ_OFFSET); \ trace_hardirq_enter(); \ } while (0) @@ -49,7 +49,7 @@ extern void irq_enter(void); do { \ trace_hardirq_exit(); \ account_irq_exit_time(current); \ - sub_preempt_count(HARDIRQ_OFFSET); \ + preempt_count_sub(HARDIRQ_OFFSET); \ } while (0) /* @@ -62,7 +62,7 @@ extern void irq_exit(void); lockdep_off(); \ ftrace_nmi_enter(); \ BUG_ON(in_nmi()); \ - add_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \ + preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \ rcu_nmi_enter(); \ trace_hardirq_enter(); \ } while (0) @@ -72,7 +72,7 @@ extern void irq_exit(void); trace_hardirq_exit(); \ rcu_nmi_exit(); \ BUG_ON(!in_nmi()); \ - sub_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET); \ + preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \ ftrace_nmi_exit(); \ lockdep_on(); \ } while (0) diff --git a/include/linux/preempt.h b/include/linux/preempt.h index df8e245e8729..2343d8715299 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -18,97 +18,86 @@ #include #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) - extern void add_preempt_count(int val); - extern void sub_preempt_count(int val); +extern void preempt_count_add(int val); +extern void preempt_count_sub(int val); +#define preempt_count_dec_and_test() ({ preempt_count_sub(1); should_resched(); }) #else -# define add_preempt_count(val) do { *preempt_count_ptr() += (val); } while (0) -# define sub_preempt_count(val) do { *preempt_count_ptr() -= (val); } while (0) +#define preempt_count_add(val) __preempt_count_add(val) +#define preempt_count_sub(val) __preempt_count_sub(val) +#define preempt_count_dec_and_test() __preempt_count_dec_and_test() #endif -#define inc_preempt_count() add_preempt_count(1) -#define 
dec_preempt_count() sub_preempt_count(1) - -#ifdef CONFIG_PREEMPT - -asmlinkage void preempt_schedule(void); - -#define preempt_check_resched() \ -do { \ - if (unlikely(!*preempt_count_ptr())) \ - preempt_schedule(); \ -} while (0) - -#ifdef CONFIG_CONTEXT_TRACKING - -void preempt_schedule_context(void); - -#define preempt_check_resched_context() \ -do { \ - if (unlikely(!*preempt_count_ptr())) \ - preempt_schedule_context(); \ -} while (0) -#else - -#define preempt_check_resched_context() preempt_check_resched() - -#endif /* CONFIG_CONTEXT_TRACKING */ - -#else /* !CONFIG_PREEMPT */ - -#define preempt_check_resched() do { } while (0) -#define preempt_check_resched_context() do { } while (0) - -#endif /* CONFIG_PREEMPT */ +#define __preempt_count_inc() __preempt_count_add(1) +#define __preempt_count_dec() __preempt_count_sub(1) +#define preempt_count_inc() preempt_count_add(1) +#define preempt_count_dec() preempt_count_sub(1) #ifdef CONFIG_PREEMPT_COUNT #define preempt_disable() \ do { \ - inc_preempt_count(); \ + preempt_count_inc(); \ barrier(); \ } while (0) #define sched_preempt_enable_no_resched() \ do { \ barrier(); \ - dec_preempt_count(); \ + preempt_count_dec(); \ } while (0) -#define preempt_enable_no_resched() sched_preempt_enable_no_resched() +#define preempt_enable_no_resched() sched_preempt_enable_no_resched() +#ifdef CONFIG_PREEMPT +asmlinkage void preempt_schedule(void); #define preempt_enable() \ do { \ - preempt_enable_no_resched(); \ - preempt_check_resched(); \ + barrier(); \ + if (unlikely(preempt_count_dec_and_test())) \ + preempt_schedule(); \ } while (0) -/* For debugging and tracer internals only! */ -#define add_preempt_count_notrace(val) \ - do { *preempt_count_ptr() += (val); } while (0) -#define sub_preempt_count_notrace(val) \ - do { *preempt_count_ptr() -= (val); } while (0) -#define inc_preempt_count_notrace() add_preempt_count_notrace(1) -#define dec_preempt_count_notrace() sub_preempt_count_notrace(1) +#define preempt_check_resched() \ +do { \ + if (should_resched()) \ + preempt_schedule(); \ +} while (0) + +#else +#define preempt_enable() preempt_enable_no_resched() +#define preempt_check_resched() do { } while (0) +#endif #define preempt_disable_notrace() \ do { \ - inc_preempt_count_notrace(); \ + __preempt_count_inc(); \ barrier(); \ } while (0) #define preempt_enable_no_resched_notrace() \ do { \ barrier(); \ - dec_preempt_count_notrace(); \ + __preempt_count_dec(); \ } while (0) -/* preempt_check_resched is OK to trace */ +#ifdef CONFIG_PREEMPT + +#ifdef CONFIG_CONTEXT_TRACKING +asmlinkage void preempt_schedule_context(void); +#else +#define preempt_schedule_context() preempt_schedule() +#endif + #define preempt_enable_notrace() \ do { \ - preempt_enable_no_resched_notrace(); \ - preempt_check_resched_context(); \ + barrier(); \ + if (unlikely(__preempt_count_dec_and_test())) \ + preempt_schedule_context(); \ } while (0) +#else +#define preempt_enable_notrace() preempt_enable_no_resched_notrace() +#endif #else /* !CONFIG_PREEMPT_COUNT */ @@ -118,10 +107,11 @@ do { \ * that can cause faults and scheduling migrate into our preempt-protected * region. 
*/ -#define preempt_disable() barrier() +#define preempt_disable() barrier() #define sched_preempt_enable_no_resched() barrier() -#define preempt_enable_no_resched() barrier() -#define preempt_enable() barrier() +#define preempt_enable_no_resched() barrier() +#define preempt_enable() barrier() +#define preempt_check_resched() do { } while (0) #define preempt_disable_notrace() barrier() #define preempt_enable_no_resched_notrace() barrier() diff --git a/include/linux/sched.h b/include/linux/sched.h index 9fa151fb968e..06ac17c7e639 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2409,11 +2409,6 @@ static inline int signal_pending_state(long state, struct task_struct *p) return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); } -static inline int need_resched(void) -{ - return unlikely(test_preempt_need_resched()); -} - /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 5ca0951e1855..9d8cf056e661 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -15,7 +15,7 @@ */ static inline void pagefault_disable(void) { - inc_preempt_count(); + preempt_count_inc(); /* * make sure to have issued the store before a pagefault * can hit. @@ -30,11 +30,7 @@ static inline void pagefault_enable(void) * the pagefault handler again. */ barrier(); - dec_preempt_count(); - /* - * make sure we do.. - */ - barrier(); + preempt_count_dec(); preempt_check_resched(); } -- cgit v1.2.3 From a233f1120c37724938f7201fe2353b2577adaaf9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Sep 2013 19:04:26 +0200 Subject: sched: Prepare for per-cpu preempt_count When using per-cpu preempt_count variables we need to save/restore the preempt_count on context switch (into per task storage; for instance the old thread_info::preempt_count variable) because of PREEMPT_ACTIVE. However, this means that on fork() the preempt_count value of the last context switch gets copied and if we had a PREEMPT_ACTIVE switch right before cloning a child task the child task will now too have PREEMPT_ACTIVE set and start its life with an extra PREEMPT_ACTIVE count. Therefore we need to make init_task_preempt_count() unconditional; this resets whatever preempt_count we inherited from our parent process. Doing so for !per-cpu implementations is harmless. For !PREEMPT_COUNT kernels we need to be careful not to start life with an increased preempt_count. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-4k0b7oy1rcdyzochwiixuwi9@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 06ac17c7e639..b09798b672f3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -428,6 +428,14 @@ struct task_cputime { .sum_exec_runtime = 0, \ } +#define PREEMPT_ENABLED (PREEMPT_NEED_RESCHED) + +#ifdef CONFIG_PREEMPT_COUNT +#define PREEMPT_DISABLED (1 + PREEMPT_ENABLED) +#else +#define PREEMPT_DISABLED PREEMPT_ENABLED +#endif + /* * Disable preemption until the scheduler is running. * Reset by start_kernel()->sched_init()->init_idle(). @@ -435,9 +443,7 @@ struct task_cputime { * We include PREEMPT_ACTIVE to avoid cond_resched() from working * before the scheduler is active -- see should_resched(). 
*/ -#define INIT_PREEMPT_COUNT (1 + PREEMPT_ACTIVE + PREEMPT_NEED_RESCHED) -#define PREEMPT_ENABLED (PREEMPT_NEED_RESCHED) -#define PREEMPT_DISABLED (1 + PREEMPT_NEED_RESCHED) +#define INIT_PREEMPT_COUNT (PREEMPT_DISABLED + PREEMPT_ACTIVE) /** * struct thread_group_cputimer - thread group interval timer counts -- cgit v1.2.3 From 1a338ac32ca630f67df25b4a16436cccc314e997 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 Aug 2013 14:51:00 +0200 Subject: sched, x86: Optimize the preempt_schedule() call Remove the bloat of the C calling convention out of the preempt_enable() sites by creating an ASM wrapper which allows us to do an asm("call ___preempt_schedule") instead. calling.h bits by Andi Kleen Suggested-by: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-tk7xdi1cvvxewixzke8t8le1@git.kernel.org [ Fixed build error. ] Signed-off-by: Ingo Molnar --- include/asm-generic/preempt.h | 10 ++++++++++ include/linux/preempt.h | 13 +++++-------- 2 files changed, 15 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h index 82d958fc3823..5dc14ed3791c 100644 --- a/include/asm-generic/preempt.h +++ b/include/asm-generic/preempt.h @@ -100,4 +100,14 @@ static __always_inline bool should_resched(void) return unlikely(!*preempt_count_ptr()); } +#ifdef CONFIG_PREEMPT +extern asmlinkage void preempt_schedule(void); +#define __preempt_schedule() preempt_schedule() + +#ifdef CONFIG_CONTEXT_TRACKING +extern asmlinkage void preempt_schedule_context(void); +#define __preempt_schedule_context() preempt_schedule_context() +#endif +#endif /* CONFIG_PREEMPT */ + #endif /* __ASM_PREEMPT_H */ diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 2343d8715299..a3d9dc8c2c00 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -50,18 +50,17 @@ do { \ #define preempt_enable_no_resched() sched_preempt_enable_no_resched() #ifdef CONFIG_PREEMPT -asmlinkage void preempt_schedule(void); #define preempt_enable() \ do { \ barrier(); \ if (unlikely(preempt_count_dec_and_test())) \ - preempt_schedule(); \ + __preempt_schedule(); \ } while (0) #define preempt_check_resched() \ do { \ if (should_resched()) \ - preempt_schedule(); \ + __preempt_schedule(); \ } while (0) #else @@ -83,17 +82,15 @@ do { \ #ifdef CONFIG_PREEMPT -#ifdef CONFIG_CONTEXT_TRACKING -asmlinkage void preempt_schedule_context(void); -#else -#define preempt_schedule_context() preempt_schedule() +#ifndef CONFIG_CONTEXT_TRACKING +#define __preempt_schedule_context() __preempt_schedule() #endif #define preempt_enable_notrace() \ do { \ barrier(); \ if (unlikely(__preempt_count_dec_and_test())) \ - preempt_schedule_context(); \ + __preempt_schedule_context(); \ } while (0) #else #define preempt_enable_notrace() preempt_enable_no_resched_notrace() -- cgit v1.2.3 From 75f93fed50c2abadbab6ef546b265f51ca975b27 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 27 Sep 2013 17:30:03 +0200 Subject: sched: Revert need_resched() to look at TIF_NEED_RESCHED Yuanhan reported a serious throughput regression in his pigz benchmark. Using the ftrace patch I found that several idle paths need more TLC before we can switch the generic need_resched() over to preempt_need_resched. The preemption paths benefit most from preempt_need_resched and do indeed use it; all other need_resched() users don't really care that much so reverting need_resched() back to tif_need_resched() is the simple and safe solution. 
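For contrast, the two candidate definitions read as follows; this is a paraphrase of the revert below together with the earlier folding patch, not new code.

        /* What is being reverted (preempt_count view): */
        static __always_inline bool need_resched(void)
        {
                return unlikely(test_preempt_need_resched());
        }

        /* What the revert restores (thread-flag view, see the sched.h hunk below): */
        static __always_inline bool need_resched(void)
        {
                return unlikely(tif_need_resched());
        }

The thread flag is what resched_task() and the IPI paths have always set, so it stays correct even on the idle paths that do not yet maintain the folded PREEMPT_NEED_RESCHED bit; the preemption fast paths, which benefit most, keep using the folded bit.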
Reported-by: Yuanhan Liu Signed-off-by: Peter Zijlstra Cc: Fengguang Wu Cc: Huang Ying Cc: lkp@linux.intel.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20130927153003.GF15690@laptop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/asm-generic/preempt.h | 8 -------- include/linux/sched.h | 5 +++++ 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h index 5dc14ed3791c..ddf2b420ac8f 100644 --- a/include/asm-generic/preempt.h +++ b/include/asm-generic/preempt.h @@ -84,14 +84,6 @@ static __always_inline bool __preempt_count_dec_and_test(void) return !--*preempt_count_ptr(); } -/* - * Returns true when we need to resched -- even if we can not. - */ -static __always_inline bool need_resched(void) -{ - return unlikely(test_preempt_need_resched()); -} - /* * Returns true when we need to resched and can (barring IRQ state). */ diff --git a/include/linux/sched.h b/include/linux/sched.h index b09798b672f3..2ac5285db434 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2577,6 +2577,11 @@ static inline bool __must_check current_clr_polling_and_test(void) } #endif +static __always_inline bool need_resched(void) +{ + return unlikely(tif_need_resched()); +} + /* * Thread group CPU time accounting. */ -- cgit v1.2.3 From 2f2a2b60adf368bacd6acd2116c01e32caf936c4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Oct 2013 11:22:18 +0200 Subject: sched/wait: Make the signal_pending() checks consistent There's two patterns to check signals in the __wait_event*() macros: if (!signal_pending(current)) { schedule(); continue; } ret = -ERESTARTSYS; break; And the more natural: if (signal_pending(current)) { ret = -ERESTARTSYS; break; } schedule(); Change them all into the latter form. 
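A caller-side view of what these macros return may help; the foo_* names below are hypothetical and only illustrate how the -ERESTARTSYS produced by the latter pattern is consumed.

        /* Hypothetical driver read path; foo_* names invented for illustration. */
        static ssize_t foo_read(struct file *file, char __user *buf,
                                size_t len, loff_t *ppos)
        {
                struct foo_dev *dev = file->private_data;
                int ret;

                ret = wait_event_interruptible(dev->readable, foo_data_ready(dev));
                if (ret)
                        return ret;     /* -ERESTARTSYS: a signal arrived while sleeping;
                                           the syscall layer restarts or fails the read */

                return foo_copy_out(dev, buf, len);
        }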
Reviewed-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131002092527.956416254@infradead.org Signed-off-by: Ingo Molnar --- include/linux/tty.h | 13 ++++++------- include/linux/wait.h | 35 ++++++++++++++++------------------- 2 files changed, 22 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/tty.h b/include/linux/tty.h index 64f864651d86..050372979076 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -686,14 +686,13 @@ do { \ prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \ if (condition) \ break; \ - if (!signal_pending(current)) { \ - tty_unlock(tty); \ - schedule(); \ - tty_lock(tty); \ - continue; \ + if (signal_pending(current)) { \ + ret = -ERESTARTSYS; \ + break; \ } \ - ret = -ERESTARTSYS; \ - break; \ + tty_unlock(tty); \ + schedule(); \ + tty_lock(tty); \ } \ finish_wait(&wq, &__wait); \ } while (0) diff --git a/include/linux/wait.h b/include/linux/wait.h index a67fc1635592..ccf0c529fd37 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -261,12 +261,11 @@ do { \ prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \ if (condition) \ break; \ - if (!signal_pending(current)) { \ - schedule(); \ - continue; \ + if (signal_pending(current)) { \ + ret = -ERESTARTSYS; \ + break; \ } \ - ret = -ERESTARTSYS; \ - break; \ + schedule(); \ } \ finish_wait(&wq, &__wait); \ } while (0) @@ -302,14 +301,13 @@ do { \ prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \ if (condition) \ break; \ - if (!signal_pending(current)) { \ - ret = schedule_timeout(ret); \ - if (!ret) \ - break; \ - continue; \ + if (signal_pending(current)) { \ + ret = -ERESTARTSYS; \ + break; \ } \ - ret = -ERESTARTSYS; \ - break; \ + ret = schedule_timeout(ret); \ + if (!ret) \ + break; \ } \ if (!ret && (condition)) \ ret = 1; \ @@ -439,14 +437,13 @@ do { \ finish_wait(&wq, &__wait); \ break; \ } \ - if (!signal_pending(current)) { \ - schedule(); \ - continue; \ - } \ - ret = -ERESTARTSYS; \ - abort_exclusive_wait(&wq, &__wait, \ + if (signal_pending(current)) { \ + ret = -ERESTARTSYS; \ + abort_exclusive_wait(&wq, &__wait, \ TASK_INTERRUPTIBLE, NULL); \ - break; \ + break; \ + } \ + schedule(); \ } \ } while (0) -- cgit v1.2.3 From 2953ef246b058989657e1e77b36b67566ac06f7b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Oct 2013 11:22:19 +0200 Subject: sched/wait: Change timeout logic Commit 4c663cf ("wait: fix false timeouts when using wait_event_timeout()") introduced an additional condition check after a timeout but there's a few issues; - it forgot one site - it put the check after the main loop; not at the actual timeout check. Cure both; by wrapping the condition (as suggested by Oleg), this avoids double evaluation of 'condition' which could be quite big. 
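Restating the new wrapper's behaviour (the macro itself is in the wait.h hunk below), plus a small hypothetical caller for orientation:

        /*
         * ___wait_cond_timeout(condition, ret) evaluates to "stop looping" when:
         *
         *      condition true             -> stop; if ret just hit 0 it is bumped
         *                                    to 1 so callers still see success
         *      condition false, ret == 0  -> stop; genuine timeout, caller sees 0
         *      condition false, ret > 0   -> keep waiting
         *
         * Because the condition is re-checked at the same place the timeout is
         * tested, the old trailing "if (!ret && (condition)) ret = 1;" fixup, and
         * its second evaluation of a possibly expensive condition, is no longer
         * needed.
         */
        long left = wait_event_timeout(dev->wq, foo_ready(dev), HZ);    /* dev, foo_ready() hypothetical */
        if (!left)
                return -ETIMEDOUT;      /* condition still false after ~1 second */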
Reviewed-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131002092528.028892896@infradead.org Signed-off-by: Ingo Molnar --- include/linux/wait.h | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index ccf0c529fd37..b2afd665e4ea 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -179,6 +179,14 @@ wait_queue_head_t *bit_waitqueue(void *, int); #define wake_up_interruptible_sync_poll(x, m) \ __wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *) (m)) +#define ___wait_cond_timeout(condition, ret) \ +({ \ + bool __cond = (condition); \ + if (__cond && !ret) \ + ret = 1; \ + __cond || !ret; \ +}) + #define __wait_event(wq, condition) \ do { \ DEFINE_WAIT(__wait); \ @@ -217,14 +225,10 @@ do { \ \ for (;;) { \ prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE); \ - if (condition) \ + if (___wait_cond_timeout(condition, ret)) \ break; \ ret = schedule_timeout(ret); \ - if (!ret) \ - break; \ } \ - if (!ret && (condition)) \ - ret = 1; \ finish_wait(&wq, &__wait); \ } while (0) @@ -299,18 +303,14 @@ do { \ \ for (;;) { \ prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \ - if (condition) \ + if (___wait_cond_timeout(condition, ret)) \ break; \ if (signal_pending(current)) { \ ret = -ERESTARTSYS; \ break; \ } \ ret = schedule_timeout(ret); \ - if (!ret) \ - break; \ } \ - if (!ret && (condition)) \ - ret = 1; \ finish_wait(&wq, &__wait); \ } while (0) @@ -815,7 +815,7 @@ do { \ \ for (;;) { \ prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \ - if (condition) \ + if (___wait_cond_timeout(condition, ret)) \ break; \ if (signal_pending(current)) { \ ret = -ERESTARTSYS; \ @@ -824,8 +824,6 @@ do { \ spin_unlock_irq(&lock); \ ret = schedule_timeout(ret); \ spin_lock_irq(&lock); \ - if (!ret) \ - break; \ } \ finish_wait(&wq, &__wait); \ } while (0) -- cgit v1.2.3 From bb632bc44970f75b66df102e831a4fc0692e9159 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Oct 2013 11:22:20 +0200 Subject: sched/wait: Change the wait_exclusive control flow Purely a preparatory patch; it changes the control flow to match what will soon be generated by generic code so that that patch can be a unity transform. 
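The reason the abort path jumps over finish_wait() rather than falling through: as abort_exclusive_wait() is written, it already dequeues the waiter and, if an exclusive wakeup was delivered while we were giving up, passes it on to the next waiter, so running finish_wait() afterwards would be redundant. Sketched, the control flow now has the shape the generic helper will generate (compare the hunk below):

        for (;;) {
                prepare_to_wait_exclusive(&wq, &__wait, TASK_INTERRUPTIBLE);
                if (condition)
                        break;                          /* fall through to finish_wait() */
                if (signal_pending(current)) {
                        ret = -ERESTARTSYS;
                        abort_exclusive_wait(&wq, &__wait, TASK_INTERRUPTIBLE, NULL);
                        goto __out;                     /* cleanup already done */
                }
                schedule();
        }
        finish_wait(&wq, &__wait);
        __out: ;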
Reviewed-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131002092528.107994763@infradead.org Signed-off-by: Ingo Molnar --- include/linux/wait.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index b2afd665e4ea..7d7819dafcc5 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -428,23 +428,24 @@ do { \ #define __wait_event_interruptible_exclusive(wq, condition, ret) \ do { \ + __label__ __out; \ DEFINE_WAIT(__wait); \ \ for (;;) { \ prepare_to_wait_exclusive(&wq, &__wait, \ TASK_INTERRUPTIBLE); \ - if (condition) { \ - finish_wait(&wq, &__wait); \ + if (condition) \ break; \ - } \ if (signal_pending(current)) { \ ret = -ERESTARTSYS; \ abort_exclusive_wait(&wq, &__wait, \ TASK_INTERRUPTIBLE, NULL); \ - break; \ + goto __out; \ } \ schedule(); \ } \ + finish_wait(&wq, &__wait); \ +__out: ; \ } while (0) #define wait_event_interruptible_exclusive(wq, condition) \ -- cgit v1.2.3 From 41a1431b178c3b731d6dfc40b987528b333dd93e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Oct 2013 11:22:21 +0200 Subject: sched/wait: Introduce ___wait_event() There's far too much duplication in the __wait_event macros; in order to fix this introduce ___wait_event() a macro with the capability to replace most other macros. With the previous patches changing the various __wait_event*() implementations to be more uniform; we can now collapse the lot without also changing generated code. Reviewed-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131002092528.181897111@infradead.org Signed-off-by: Ingo Molnar --- include/linux/wait.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index 7d7819dafcc5..29d0249e03ab 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -187,6 +187,42 @@ wait_queue_head_t *bit_waitqueue(void *, int); __cond || !ret; \ }) +#define ___wait_signal_pending(state) \ + ((state == TASK_INTERRUPTIBLE && signal_pending(current)) || \ + (state == TASK_KILLABLE && fatal_signal_pending(current))) + +#define ___wait_nop_ret int ret __always_unused + +#define ___wait_event(wq, condition, state, exclusive, ret, cmd) \ +do { \ + __label__ __out; \ + DEFINE_WAIT(__wait); \ + \ + for (;;) { \ + if (exclusive) \ + prepare_to_wait_exclusive(&wq, &__wait, state); \ + else \ + prepare_to_wait(&wq, &__wait, state); \ + \ + if (condition) \ + break; \ + \ + if (___wait_signal_pending(state)) { \ + ret = -ERESTARTSYS; \ + if (exclusive) { \ + abort_exclusive_wait(&wq, &__wait, \ + state, NULL); \ + goto __out; \ + } \ + break; \ + } \ + \ + cmd; \ + } \ + finish_wait(&wq, &__wait); \ +__out: ; \ +} while (0) + #define __wait_event(wq, condition) \ do { \ DEFINE_WAIT(__wait); \ -- cgit v1.2.3 From 854267f4384243b19c03a2942e84f06f2beb0952 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Oct 2013 11:22:22 +0200 Subject: sched/wait: Collapse __wait_event() Reduce macro complexity by using the new ___wait_event() helper. No change in behaviour, identical generated code. 
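A quick manual expansion shows why the generated code is unchanged: with the state fixed at TASK_UNINTERRUPTIBLE and exclusive == 0, the new form reduces to the old loop. This is a sketch of the expansion, not a new macro.

        /* __wait_event(wq, condition) now expands to roughly: */
        do {
                __label__ __out;
                DEFINE_WAIT(__wait);

                for (;;) {
                        prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE);
                        if (condition)
                                break;
                        /*
                         * ___wait_signal_pending(TASK_UNINTERRUPTIBLE) is a
                         * compile-time constant false, so the whole signal and
                         * abort branch is discarded by the compiler.
                         */
                        schedule();
                }
                finish_wait(&wq, &__wait);
        __out: ;
        } while (0)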
Reviewed-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131002092528.254863348@infradead.org Signed-off-by: Ingo Molnar --- include/linux/wait.h | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index 29d0249e03ab..68e3a628e157 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -224,17 +224,8 @@ __out: ; \ } while (0) #define __wait_event(wq, condition) \ -do { \ - DEFINE_WAIT(__wait); \ - \ - for (;;) { \ - prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE); \ - if (condition) \ - break; \ - schedule(); \ - } \ - finish_wait(&wq, &__wait); \ -} while (0) + ___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, \ + ___wait_nop_ret, schedule()) /** * wait_event - sleep until a condition gets true -- cgit v1.2.3 From ddc1994b8217527e1818f690f17597fc9cedf81b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Oct 2013 11:22:23 +0200 Subject: sched/wait: Collapse __wait_event_timeout() Reduce macro complexity by using the new ___wait_event() helper. No change in behaviour, identical generated code. Reviewed-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131002092528.325264677@infradead.org Signed-off-by: Ingo Molnar --- include/linux/wait.h | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index 68e3a628e157..546b94efc82e 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -247,17 +247,9 @@ do { \ } while (0) #define __wait_event_timeout(wq, condition, ret) \ -do { \ - DEFINE_WAIT(__wait); \ - \ - for (;;) { \ - prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE); \ - if (___wait_cond_timeout(condition, ret)) \ - break; \ - ret = schedule_timeout(ret); \ - } \ - finish_wait(&wq, &__wait); \ -} while (0) + ___wait_event(wq, ___wait_cond_timeout(condition, ret), \ + TASK_UNINTERRUPTIBLE, 0, ret, \ + ret = schedule_timeout(ret)) /** * wait_event_timeout - sleep until a condition gets true or a timeout elapses -- cgit v1.2.3 From f13f4c41c9cf9cd61c896e46e4e7ba2687e2af9c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Oct 2013 11:22:24 +0200 Subject: sched/wait: Collapse __wait_event_interruptible() Reduce macro complexity by using the new ___wait_event() helper. No change in behaviour, identical generated code. 
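The interruptible case exercises the signal branch: with state == TASK_INTERRUPTIBLE and exclusive == 0 the helper reduces to the body being deleted below (sketch; DEFINE_WAIT and the unused __out label elided).

        /* __wait_event_interruptible(wq, condition, ret) now expands to roughly: */
        for (;;) {
                prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE);
                if (condition)
                        break;
                if (signal_pending(current)) {          /* ___wait_signal_pending() */
                        ret = -ERESTARTSYS;
                        break;                          /* exclusive == 0: no abort/goto path */
                }
                schedule();
        }
        finish_wait(&wq, &__wait);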
Reviewed-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131002092528.396949919@infradead.org Signed-off-by: Ingo Molnar --- include/linux/wait.h | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index 546b94efc82e..39e4bbd2c735 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -277,21 +277,8 @@ do { \ }) #define __wait_event_interruptible(wq, condition, ret) \ -do { \ - DEFINE_WAIT(__wait); \ - \ - for (;;) { \ - prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \ - if (condition) \ - break; \ - if (signal_pending(current)) { \ - ret = -ERESTARTSYS; \ - break; \ - } \ - schedule(); \ - } \ - finish_wait(&wq, &__wait); \ -} while (0) + ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, ret, \ + schedule()) /** * wait_event_interruptible - sleep until a condition gets true -- cgit v1.2.3 From c2ebb1fb4eddf3d1d66fe31d1e89e83ee211b81c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Oct 2013 11:22:25 +0200 Subject: sched/wait: Collapse __wait_event_interruptible_timeout() Reduce macro complexity by using the new ___wait_event() helper. No change in behaviour, identical generated code. Reviewed-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131002092528.469616907@infradead.org Signed-off-by: Ingo Molnar --- include/linux/wait.h | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index 39e4bbd2c735..a79fb15c1dd4 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -304,21 +304,9 @@ do { \ }) #define __wait_event_interruptible_timeout(wq, condition, ret) \ -do { \ - DEFINE_WAIT(__wait); \ - \ - for (;;) { \ - prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \ - if (___wait_cond_timeout(condition, ret)) \ - break; \ - if (signal_pending(current)) { \ - ret = -ERESTARTSYS; \ - break; \ - } \ - ret = schedule_timeout(ret); \ - } \ - finish_wait(&wq, &__wait); \ -} while (0) + ___wait_event(wq, ___wait_cond_timeout(condition, ret), \ + TASK_INTERRUPTIBLE, 0, ret, \ + ret = schedule_timeout(ret)) /** * wait_event_interruptible_timeout - sleep until a condition gets true or a timeout elapses -- cgit v1.2.3 From 48c2521717b39cb6904941ec2847d9775669207a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Oct 2013 11:22:26 +0200 Subject: sched/wait: Collapse __wait_event_interruptible_exclusive() Reduce macro complexity by using the new ___wait_event() helper. No change in behaviour, identical generated code. 
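From a caller's point of view the exclusive variant still means "one wake_up() releases at most one of us". A hypothetical consumer is sketched below; struct item_pool, struct item and their members are invented for illustration.

        /* Each consumer queues exclusively; a producer's wake_up(&pool->wq)
         * wakes a single consumer instead of the whole herd. */
        static int pool_take(struct item_pool *pool, struct item **out)
        {
                int ret;

                ret = wait_event_interruptible_exclusive(pool->wq,
                                                         !list_empty(&pool->items));
                if (ret)
                        return ret;                     /* interrupted by a signal */

                spin_lock(&pool->lock);
                if (list_empty(&pool->items)) {         /* raced with another consumer */
                        spin_unlock(&pool->lock);
                        return -EAGAIN;
                }
                *out = list_first_entry(&pool->items, struct item, node);
                list_del(&(*out)->node);
                spin_unlock(&pool->lock);
                return 0;
        }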
Reviewed-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131002092528.541716442@infradead.org Signed-off-by: Ingo Molnar --- include/linux/wait.h | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index a79fb15c1dd4..c4ab172daac0 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -421,26 +421,8 @@ do { \ }) #define __wait_event_interruptible_exclusive(wq, condition, ret) \ -do { \ - __label__ __out; \ - DEFINE_WAIT(__wait); \ - \ - for (;;) { \ - prepare_to_wait_exclusive(&wq, &__wait, \ - TASK_INTERRUPTIBLE); \ - if (condition) \ - break; \ - if (signal_pending(current)) { \ - ret = -ERESTARTSYS; \ - abort_exclusive_wait(&wq, &__wait, \ - TASK_INTERRUPTIBLE, NULL); \ - goto __out; \ - } \ - schedule(); \ - } \ - finish_wait(&wq, &__wait); \ -__out: ; \ -} while (0) + ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, ret, \ + schedule()) #define wait_event_interruptible_exclusive(wq, condition) \ ({ \ -- cgit v1.2.3 From 13cb5042a4b80396f77cf5d599d2c002c57b89dc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Oct 2013 11:22:27 +0200 Subject: sched/wait: Collapse __wait_event_lock_irq() Reduce macro complexity by using the new ___wait_event() helper. No change in behaviour, identical generated code. Reviewed-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131002092528.612813379@infradead.org Signed-off-by: Ingo Molnar --- include/linux/wait.h | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index c4ab172daac0..d64918e09e3c 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -624,20 +624,12 @@ do { \ #define __wait_event_lock_irq(wq, condition, lock, cmd) \ -do { \ - DEFINE_WAIT(__wait); \ - \ - for (;;) { \ - prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE); \ - if (condition) \ - break; \ - spin_unlock_irq(&lock); \ - cmd; \ - schedule(); \ - spin_lock_irq(&lock); \ - } \ - finish_wait(&wq, &__wait); \ -} while (0) + ___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, \ + ___wait_nop_ret, \ + spin_unlock_irq(&lock); \ + cmd; \ + schedule(); \ + spin_lock_irq(&lock)) /** * wait_event_lock_irq_cmd - sleep until a condition gets true. The -- cgit v1.2.3 From 8fbd88fa1717601ef91ced49a32f24786b167065 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Oct 2013 11:22:28 +0200 Subject: sched/wait: Collapse __wait_event_interruptible_lock_irq() Reduce macro complexity by using the new ___wait_event() helper. No change in behaviour, identical generated code. 
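Callers of the lock_irq variants hold a spinlock that guards the condition; the macro drops it around the sleep and retakes it before every re-check. A hypothetical user for orientation; dev, its wq, lock and ready members are invented names.

        static int foo_wait_ready(struct foo_dev *dev)
        {
                int err;

                spin_lock_irq(&dev->lock);
                /* dev->lock is dropped across the sleep and retaken before each
                 * re-check of dev->ready; it is held again when the macro returns. */
                err = wait_event_interruptible_lock_irq(dev->wq, dev->ready, dev->lock);
                if (!err)
                        dev->ready = false;             /* consume the event under the lock */
                spin_unlock_irq(&dev->lock);

                return err;                             /* 0 or -ERESTARTSYS */
        }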
Reviewed-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131002092528.686006009@infradead.org Signed-off-by: Ingo Molnar --- include/linux/wait.h | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index d64918e09e3c..a577a85004ae 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -689,26 +689,12 @@ do { \ } while (0) -#define __wait_event_interruptible_lock_irq(wq, condition, \ - lock, ret, cmd) \ -do { \ - DEFINE_WAIT(__wait); \ - \ - for (;;) { \ - prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \ - if (condition) \ - break; \ - if (signal_pending(current)) { \ - ret = -ERESTARTSYS; \ - break; \ - } \ - spin_unlock_irq(&lock); \ - cmd; \ - schedule(); \ - spin_lock_irq(&lock); \ - } \ - finish_wait(&wq, &__wait); \ -} while (0) +#define __wait_event_interruptible_lock_irq(wq, condition, lock, ret, cmd) \ + ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, ret, \ + spin_unlock_irq(&lock); \ + cmd; \ + schedule(); \ + spin_lock_irq(&lock)) /** * wait_event_interruptible_lock_irq_cmd - sleep until a condition gets true. -- cgit v1.2.3 From a1dc6852ac5eecdcd3122ae01703183a3e88e979 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Oct 2013 11:22:29 +0200 Subject: sched/wait: Collapse __wait_event_interruptible_lock_irq_timeout() Reduce macro complexity by using the new ___wait_event() helper. No change in behaviour, identical generated code. Reviewed-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131002092528.759956109@infradead.org Signed-off-by: Ingo Molnar --- include/linux/wait.h | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index a577a85004ae..5d5408b08ba5 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -763,25 +763,12 @@ do { \ __ret; \ }) -#define __wait_event_interruptible_lock_irq_timeout(wq, condition, \ - lock, ret) \ -do { \ - DEFINE_WAIT(__wait); \ - \ - for (;;) { \ - prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \ - if (___wait_cond_timeout(condition, ret)) \ - break; \ - if (signal_pending(current)) { \ - ret = -ERESTARTSYS; \ - break; \ - } \ - spin_unlock_irq(&lock); \ - ret = schedule_timeout(ret); \ - spin_lock_irq(&lock); \ - } \ - finish_wait(&wq, &__wait); \ -} while (0) +#define __wait_event_interruptible_lock_irq_timeout(wq, condition, lock, ret) \ + ___wait_event(wq, ___wait_cond_timeout(condition, ret), \ + TASK_INTERRUPTIBLE, 0, ret, \ + spin_unlock_irq(&lock); \ + ret = schedule_timeout(ret); \ + spin_lock_irq(&lock)); /** * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets true or a timeout elapses. -- cgit v1.2.3 From 0d1e1c8a430450a3ce61a842cec64f9e2a9f3b05 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Oct 2013 11:22:30 +0200 Subject: sched/wait: Collapse __wait_event_interruptible_tty() Reduce macro complexity by using the new ___wait_event() helper. No change in behaviour, identical generated code. 
Reviewed-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131002092528.831085521@infradead.org Signed-off-by: Ingo Molnar --- include/linux/tty.h | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/tty.h b/include/linux/tty.h index 050372979076..6e803291028f 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -679,23 +679,10 @@ static inline void tty_wait_until_sent_from_close(struct tty_struct *tty, }) #define __wait_event_interruptible_tty(tty, wq, condition, ret) \ -do { \ - DEFINE_WAIT(__wait); \ - \ - for (;;) { \ - prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE); \ - if (condition) \ - break; \ - if (signal_pending(current)) { \ - ret = -ERESTARTSYS; \ - break; \ - } \ - tty_unlock(tty); \ - schedule(); \ - tty_lock(tty); \ - } \ - finish_wait(&wq, &__wait); \ -} while (0) + ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, ret, \ + tty_unlock(tty); \ + schedule(); \ + tty_lock(tty)) #ifdef CONFIG_PROC_FS extern void proc_tty_register_driver(struct tty_driver *); -- cgit v1.2.3 From cf7361fd961b6f0510572af6cf8ca3ffba07018b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Oct 2013 11:22:31 +0200 Subject: sched/wait: Collapse __wait_event_killable() Reduce macro complexity by using the new ___wait_event() helper. No change in behaviour, identical generated code. Reviewed-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131002092528.898691966@infradead.org Signed-off-by: Ingo Molnar --- include/linux/wait.h | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index 5d5408b08ba5..ec3683ee0fc2 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -582,22 +582,7 @@ do { \ #define __wait_event_killable(wq, condition, ret) \ -do { \ - DEFINE_WAIT(__wait); \ - \ - for (;;) { \ - prepare_to_wait(&wq, &__wait, TASK_KILLABLE); \ - if (condition) \ - break; \ - if (!fatal_signal_pending(current)) { \ - schedule(); \ - continue; \ - } \ - ret = -ERESTARTSYS; \ - break; \ - } \ - finish_wait(&wq, &__wait); \ -} while (0) + ___wait_event(wq, condition, TASK_KILLABLE, 0, ret, schedule()) /** * wait_event_killable - sleep until a condition gets true -- cgit v1.2.3 From ebdc195f2ec68576876216081035293e37318e86 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Oct 2013 11:22:32 +0200 Subject: sched/wait: Collapse __wait_event_hrtimeout() While not a whole-sale replacement like the others we can still reduce the size of __wait_event_hrtimeout() considerably by noting that the actual core of __wait_event_hrtimeout() is identical to what ___wait_event() generates. 
Reviewed-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131002092528.972793648@infradead.org Signed-off-by: Ingo Molnar --- include/linux/wait.h | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index ec3683ee0fc2..c065e8af9749 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -337,7 +337,6 @@ do { \ #define __wait_event_hrtimeout(wq, condition, timeout, state) \ ({ \ int __ret = 0; \ - DEFINE_WAIT(__wait); \ struct hrtimer_sleeper __t; \ \ hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, \ @@ -348,25 +347,15 @@ do { \ current->timer_slack_ns, \ HRTIMER_MODE_REL); \ \ - for (;;) { \ - prepare_to_wait(&wq, &__wait, state); \ - if (condition) \ - break; \ - if (state == TASK_INTERRUPTIBLE && \ - signal_pending(current)) { \ - __ret = -ERESTARTSYS; \ - break; \ - } \ + ___wait_event(wq, condition, state, 0, __ret, \ if (!__t.task) { \ __ret = -ETIME; \ break; \ } \ - schedule(); \ - } \ + schedule()); \ \ hrtimer_cancel(&__t.timer); \ destroy_hrtimer_on_stack(&__t.timer); \ - finish_wait(&wq, &__wait); \ __ret; \ }) -- cgit v1.2.3 From 35a2af94c7ce7130ca292c68b1d27fcfdb648f6b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Oct 2013 11:22:33 +0200 Subject: sched/wait: Make the __wait_event*() interface more friendly Change all __wait_event*() implementations to match the corresponding wait_event*() signature for convenience. In particular this does away with the weird 'ret' logic. Since there are __wait_event*() users this requires we update them too. Reviewed-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131002092529.042563462@infradead.org Signed-off-by: Ingo Molnar --- include/linux/tty.h | 10 ++--- include/linux/wait.h | 113 +++++++++++++++++++++++++-------------------------- 2 files changed, 60 insertions(+), 63 deletions(-) (limited to 'include') diff --git a/include/linux/tty.h b/include/linux/tty.h index 6e803291028f..633cac77f9f9 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -672,14 +672,14 @@ static inline void tty_wait_until_sent_from_close(struct tty_struct *tty, #define wait_event_interruptible_tty(tty, wq, condition) \ ({ \ int __ret = 0; \ - if (!(condition)) { \ - __wait_event_interruptible_tty(tty, wq, condition, __ret); \ - } \ + if (!(condition)) \ + __ret = __wait_event_interruptible_tty(tty, wq, \ + condition); \ __ret; \ }) -#define __wait_event_interruptible_tty(tty, wq, condition, ret) \ - ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, ret, \ +#define __wait_event_interruptible_tty(tty, wq, condition) \ + ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \ tty_unlock(tty); \ schedule(); \ tty_lock(tty)) diff --git a/include/linux/wait.h b/include/linux/wait.h index c065e8af9749..bd4bd7b479b6 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -179,24 +179,23 @@ wait_queue_head_t *bit_waitqueue(void *, int); #define wake_up_interruptible_sync_poll(x, m) \ __wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *) (m)) -#define ___wait_cond_timeout(condition, ret) \ +#define ___wait_cond_timeout(condition) \ ({ \ bool __cond = (condition); \ - if (__cond && !ret) \ - ret = 1; \ - __cond || !ret; \ + if (__cond && !__ret) \ + __ret = 1; \ + __cond || !__ret; \ }) #define ___wait_signal_pending(state) \ ((state == TASK_INTERRUPTIBLE && signal_pending(current)) || \ (state == TASK_KILLABLE && fatal_signal_pending(current))) -#define ___wait_nop_ret 
int ret __always_unused - #define ___wait_event(wq, condition, state, exclusive, ret, cmd) \ -do { \ +({ \ __label__ __out; \ DEFINE_WAIT(__wait); \ + long __ret = ret; \ \ for (;;) { \ if (exclusive) \ @@ -208,7 +207,7 @@ do { \ break; \ \ if (___wait_signal_pending(state)) { \ - ret = -ERESTARTSYS; \ + __ret = -ERESTARTSYS; \ if (exclusive) { \ abort_exclusive_wait(&wq, &__wait, \ state, NULL); \ @@ -220,12 +219,12 @@ do { \ cmd; \ } \ finish_wait(&wq, &__wait); \ -__out: ; \ -} while (0) +__out: __ret; \ +}) #define __wait_event(wq, condition) \ - ___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, \ - ___wait_nop_ret, schedule()) + (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ + schedule()) /** * wait_event - sleep until a condition gets true @@ -246,10 +245,10 @@ do { \ __wait_event(wq, condition); \ } while (0) -#define __wait_event_timeout(wq, condition, ret) \ - ___wait_event(wq, ___wait_cond_timeout(condition, ret), \ - TASK_UNINTERRUPTIBLE, 0, ret, \ - ret = schedule_timeout(ret)) +#define __wait_event_timeout(wq, condition, timeout) \ + ___wait_event(wq, ___wait_cond_timeout(condition), \ + TASK_UNINTERRUPTIBLE, 0, timeout, \ + __ret = schedule_timeout(__ret)) /** * wait_event_timeout - sleep until a condition gets true or a timeout elapses @@ -272,12 +271,12 @@ do { \ ({ \ long __ret = timeout; \ if (!(condition)) \ - __wait_event_timeout(wq, condition, __ret); \ + __ret = __wait_event_timeout(wq, condition, timeout); \ __ret; \ }) -#define __wait_event_interruptible(wq, condition, ret) \ - ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, ret, \ +#define __wait_event_interruptible(wq, condition) \ + ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \ schedule()) /** @@ -299,14 +298,14 @@ do { \ ({ \ int __ret = 0; \ if (!(condition)) \ - __wait_event_interruptible(wq, condition, __ret); \ + __ret = __wait_event_interruptible(wq, condition); \ __ret; \ }) -#define __wait_event_interruptible_timeout(wq, condition, ret) \ - ___wait_event(wq, ___wait_cond_timeout(condition, ret), \ - TASK_INTERRUPTIBLE, 0, ret, \ - ret = schedule_timeout(ret)) +#define __wait_event_interruptible_timeout(wq, condition, timeout) \ + ___wait_event(wq, ___wait_cond_timeout(condition), \ + TASK_INTERRUPTIBLE, 0, timeout, \ + __ret = schedule_timeout(__ret)) /** * wait_event_interruptible_timeout - sleep until a condition gets true or a timeout elapses @@ -330,7 +329,8 @@ do { \ ({ \ long __ret = timeout; \ if (!(condition)) \ - __wait_event_interruptible_timeout(wq, condition, __ret); \ + __ret = __wait_event_interruptible_timeout(wq, \ + condition, timeout); \ __ret; \ }) @@ -347,7 +347,7 @@ do { \ current->timer_slack_ns, \ HRTIMER_MODE_REL); \ \ - ___wait_event(wq, condition, state, 0, __ret, \ + __ret = ___wait_event(wq, condition, state, 0, 0, \ if (!__t.task) { \ __ret = -ETIME; \ break; \ @@ -409,15 +409,15 @@ do { \ __ret; \ }) -#define __wait_event_interruptible_exclusive(wq, condition, ret) \ - ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, ret, \ +#define __wait_event_interruptible_exclusive(wq, condition) \ + ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \ schedule()) #define wait_event_interruptible_exclusive(wq, condition) \ ({ \ int __ret = 0; \ if (!(condition)) \ - __wait_event_interruptible_exclusive(wq, condition, __ret);\ + __ret = __wait_event_interruptible_exclusive(wq, condition);\ __ret; \ }) @@ -570,8 +570,8 @@ do { \ -#define __wait_event_killable(wq, condition, ret) \ - ___wait_event(wq, condition, TASK_KILLABLE, 0, ret, schedule()) 
+#define __wait_event_killable(wq, condition) \ + ___wait_event(wq, condition, TASK_KILLABLE, 0, 0, schedule()) /** * wait_event_killable - sleep until a condition gets true @@ -592,18 +592,17 @@ do { \ ({ \ int __ret = 0; \ if (!(condition)) \ - __wait_event_killable(wq, condition, __ret); \ + __ret = __wait_event_killable(wq, condition); \ __ret; \ }) #define __wait_event_lock_irq(wq, condition, lock, cmd) \ - ___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, \ - ___wait_nop_ret, \ - spin_unlock_irq(&lock); \ - cmd; \ - schedule(); \ - spin_lock_irq(&lock)) + (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ + spin_unlock_irq(&lock); \ + cmd; \ + schedule(); \ + spin_lock_irq(&lock)) /** * wait_event_lock_irq_cmd - sleep until a condition gets true. The @@ -663,11 +662,11 @@ do { \ } while (0) -#define __wait_event_interruptible_lock_irq(wq, condition, lock, ret, cmd) \ - ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, ret, \ - spin_unlock_irq(&lock); \ - cmd; \ - schedule(); \ +#define __wait_event_interruptible_lock_irq(wq, condition, lock, cmd) \ + ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \ + spin_unlock_irq(&lock); \ + cmd; \ + schedule(); \ spin_lock_irq(&lock)) /** @@ -698,10 +697,9 @@ do { \ #define wait_event_interruptible_lock_irq_cmd(wq, condition, lock, cmd) \ ({ \ int __ret = 0; \ - \ if (!(condition)) \ - __wait_event_interruptible_lock_irq(wq, condition, \ - lock, __ret, cmd); \ + __ret = __wait_event_interruptible_lock_irq(wq, \ + condition, lock, cmd); \ __ret; \ }) @@ -730,18 +728,18 @@ do { \ #define wait_event_interruptible_lock_irq(wq, condition, lock) \ ({ \ int __ret = 0; \ - \ if (!(condition)) \ - __wait_event_interruptible_lock_irq(wq, condition, \ - lock, __ret, ); \ + __ret = __wait_event_interruptible_lock_irq(wq, \ + condition, lock,) \ __ret; \ }) -#define __wait_event_interruptible_lock_irq_timeout(wq, condition, lock, ret) \ - ___wait_event(wq, ___wait_cond_timeout(condition, ret), \ - TASK_INTERRUPTIBLE, 0, ret, \ - spin_unlock_irq(&lock); \ - ret = schedule_timeout(ret); \ +#define __wait_event_interruptible_lock_irq_timeout(wq, condition, \ + lock, timeout) \ + ___wait_event(wq, ___wait_cond_timeout(condition), \ + TASK_INTERRUPTIBLE, 0, ret, \ + spin_unlock_irq(&lock); \ + __ret = schedule_timeout(__ret); \ spin_lock_irq(&lock)); /** @@ -771,11 +769,10 @@ do { \ #define wait_event_interruptible_lock_irq_timeout(wq, condition, lock, \ timeout) \ ({ \ - int __ret = timeout; \ - \ + long __ret = timeout; \ if (!(condition)) \ - __wait_event_interruptible_lock_irq_timeout( \ - wq, condition, lock, __ret); \ + __ret = __wait_event_interruptible_lock_irq_timeout( \ + wq, condition, lock, timeout); \ __ret; \ }) -- cgit v1.2.3 From fb869b6e91a3ac235f237f73305ecf34cdc4969b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 4 Oct 2013 10:24:49 +0200 Subject: sched/wait: Clean up wait.h details a bit Since we are changing wait.h profoundly, use the opportunity to: - add a sentence to explain what this file is about - remove whitespace noise - prettify weird looking line break fixup attempts - standardize type definition and initialization sequences - use consistent style details No code is changed. 
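Since the __wait_event*() interface change a couple of patches above makes every variant return its result directly rather than writing through a 'ret' argument, a short hedged sketch of the user-visible three-way return of the timeout variant (the waitqueue and flag names are illustrative):

    #include <linux/wait.h>
    #include <linux/jiffies.h>
    #include <linux/errno.h>

    static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
    static bool demo_done;

    static int demo_wait_up_to_one_second(void)
    {
            /* > 0: remaining jiffies, 0: timed out, < 0: -ERESTARTSYS. */
            long left = wait_event_interruptible_timeout(demo_wq, demo_done, HZ);

            if (left > 0)
                    return 0;               /* condition became true in time */
            if (left == 0)
                    return -ETIMEDOUT;      /* still false after HZ jiffies */
            return left;                    /* interrupted by a signal */
    }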
Acked-by: Peter Zijlstra Cc: Oleg Nesterov Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-O8dIie5swnctqpupakatvqyq@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/wait.h | 124 +++++++++++++++++++++++++-------------------------- 1 file changed, 60 insertions(+), 64 deletions(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index bd4bd7b479b6..a2726c7dd244 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -1,7 +1,8 @@ #ifndef _LINUX_WAIT_H #define _LINUX_WAIT_H - - +/* + * Linux wait queue related types and methods + */ #include #include #include @@ -13,27 +14,27 @@ typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, v int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key); struct __wait_queue { - unsigned int flags; + unsigned int flags; #define WQ_FLAG_EXCLUSIVE 0x01 - void *private; - wait_queue_func_t func; - struct list_head task_list; + void *private; + wait_queue_func_t func; + struct list_head task_list; }; struct wait_bit_key { - void *flags; - int bit_nr; -#define WAIT_ATOMIC_T_BIT_NR -1 + void *flags; + int bit_nr; +#define WAIT_ATOMIC_T_BIT_NR -1 }; struct wait_bit_queue { - struct wait_bit_key key; - wait_queue_t wait; + struct wait_bit_key key; + wait_queue_t wait; }; struct __wait_queue_head { - spinlock_t lock; - struct list_head task_list; + spinlock_t lock; + struct list_head task_list; }; typedef struct __wait_queue_head wait_queue_head_t; @@ -84,17 +85,17 @@ extern void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p) { - q->flags = 0; - q->private = p; - q->func = default_wake_function; + q->flags = 0; + q->private = p; + q->func = default_wake_function; } -static inline void init_waitqueue_func_entry(wait_queue_t *q, - wait_queue_func_t func) +static inline void +init_waitqueue_func_entry(wait_queue_t *q, wait_queue_func_t func) { - q->flags = 0; - q->private = NULL; - q->func = func; + q->flags = 0; + q->private = NULL; + q->func = func; } static inline int waitqueue_active(wait_queue_head_t *q) @@ -114,8 +115,8 @@ static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new) /* * Used for wake-one threads: */ -static inline void __add_wait_queue_exclusive(wait_queue_head_t *q, - wait_queue_t *wait) +static inline void +__add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) { wait->flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue(q, wait); @@ -127,23 +128,22 @@ static inline void __add_wait_queue_tail(wait_queue_head_t *head, list_add_tail(&new->task_list, &head->task_list); } -static inline void __add_wait_queue_tail_exclusive(wait_queue_head_t *q, - wait_queue_t *wait) +static inline void +__add_wait_queue_tail_exclusive(wait_queue_head_t *q, wait_queue_t *wait) { wait->flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue_tail(q, wait); } -static inline void __remove_wait_queue(wait_queue_head_t *head, - wait_queue_t *old) +static inline void +__remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old) { list_del(&old->task_list); } void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); -void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, - void *key); +void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key); void __wake_up_locked(wait_queue_head_t *q, unsigned 
int mode, int nr); void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); void __wake_up_bit(wait_queue_head_t *, void *, int); @@ -170,21 +170,21 @@ wait_queue_head_t *bit_waitqueue(void *, int); /* * Wakeup macros to be used to report events to the targets. */ -#define wake_up_poll(x, m) \ +#define wake_up_poll(x, m) \ __wake_up(x, TASK_NORMAL, 1, (void *) (m)) -#define wake_up_locked_poll(x, m) \ +#define wake_up_locked_poll(x, m) \ __wake_up_locked_key((x), TASK_NORMAL, (void *) (m)) -#define wake_up_interruptible_poll(x, m) \ +#define wake_up_interruptible_poll(x, m) \ __wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m)) #define wake_up_interruptible_sync_poll(x, m) \ __wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *) (m)) #define ___wait_cond_timeout(condition) \ ({ \ - bool __cond = (condition); \ - if (__cond && !__ret) \ - __ret = 1; \ - __cond || !__ret; \ + bool __cond = (condition); \ + if (__cond && !__ret) \ + __ret = 1; \ + __cond || !__ret; \ }) #define ___wait_signal_pending(state) \ @@ -209,8 +209,8 @@ wait_queue_head_t *bit_waitqueue(void *, int); if (___wait_signal_pending(state)) { \ __ret = -ERESTARTSYS; \ if (exclusive) { \ - abort_exclusive_wait(&wq, &__wait, \ - state, NULL); \ + abort_exclusive_wait(&wq, &__wait, \ + state, NULL); \ goto __out; \ } \ break; \ @@ -222,7 +222,7 @@ wait_queue_head_t *bit_waitqueue(void *, int); __out: __ret; \ }) -#define __wait_event(wq, condition) \ +#define __wait_event(wq, condition) \ (void)___wait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ schedule()) @@ -238,9 +238,9 @@ __out: __ret; \ * wake_up() has to be called after changing any variable that could * change the result of the wait condition. */ -#define wait_event(wq, condition) \ +#define wait_event(wq, condition) \ do { \ - if (condition) \ + if (condition) \ break; \ __wait_event(wq, condition); \ } while (0) @@ -270,7 +270,7 @@ do { \ #define wait_event_timeout(wq, condition, timeout) \ ({ \ long __ret = timeout; \ - if (!(condition)) \ + if (!(condition)) \ __ret = __wait_event_timeout(wq, condition, timeout); \ __ret; \ }) @@ -329,7 +329,7 @@ do { \ ({ \ long __ret = timeout; \ if (!(condition)) \ - __ret = __wait_event_interruptible_timeout(wq, \ + __ret = __wait_event_interruptible_timeout(wq, \ condition, timeout); \ __ret; \ }) @@ -569,7 +569,6 @@ do { \ ? 0 : __wait_event_interruptible_locked(wq, condition, 1, 1)) - #define __wait_event_killable(wq, condition) \ ___wait_event(wq, condition, TASK_KILLABLE, 0, 0, schedule()) @@ -663,7 +662,7 @@ do { \ #define __wait_event_interruptible_lock_irq(wq, condition, lock, cmd) \ - ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \ + ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 0, 0, \ spin_unlock_irq(&lock); \ cmd; \ schedule(); \ @@ -698,7 +697,7 @@ do { \ ({ \ int __ret = 0; \ if (!(condition)) \ - __ret = __wait_event_interruptible_lock_irq(wq, \ + __ret = __wait_event_interruptible_lock_irq(wq, \ condition, lock, cmd); \ __ret; \ }) @@ -734,18 +733,18 @@ do { \ __ret; \ }) -#define __wait_event_interruptible_lock_irq_timeout(wq, condition, \ - lock, timeout) \ +#define __wait_event_interruptible_lock_irq_timeout(wq, condition, \ + lock, timeout) \ ___wait_event(wq, ___wait_cond_timeout(condition), \ - TASK_INTERRUPTIBLE, 0, ret, \ + TASK_INTERRUPTIBLE, 0, ret, \ spin_unlock_irq(&lock); \ __ret = schedule_timeout(__ret); \ spin_lock_irq(&lock)); /** - * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets true or a timeout elapses. 
- * The condition is checked under the lock. This is expected - * to be called with the lock taken. + * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets + * true or a timeout elapses. The condition is checked under + * the lock. This is expected to be called with the lock taken. * @wq: the waitqueue to wait on * @condition: a C expression for the event to wait for * @lock: a locked spinlock_t, which will be released before schedule() @@ -783,11 +782,9 @@ do { \ * We plan to remove these interfaces. */ extern void sleep_on(wait_queue_head_t *q); -extern long sleep_on_timeout(wait_queue_head_t *q, - signed long timeout); +extern long sleep_on_timeout(wait_queue_head_t *q, signed long timeout); extern void interruptible_sleep_on(wait_queue_head_t *q); -extern long interruptible_sleep_on_timeout(wait_queue_head_t *q, - signed long timeout); +extern long interruptible_sleep_on_timeout(wait_queue_head_t *q, signed long timeout); /* * Waitqueues which are removed from the waitqueue_head at wakeup time @@ -795,8 +792,7 @@ extern long interruptible_sleep_on_timeout(wait_queue_head_t *q, void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state); void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state); void finish_wait(wait_queue_head_t *q, wait_queue_t *wait); -void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, - unsigned int mode, void *key); +void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, unsigned int mode, void *key); int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); @@ -842,8 +838,8 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); * One uses wait_on_bit() where one is waiting for the bit to clear, * but has no intention of setting it. */ -static inline int wait_on_bit(void *word, int bit, - int (*action)(void *), unsigned mode) +static inline int +wait_on_bit(void *word, int bit, int (*action)(void *), unsigned mode) { if (!test_bit(bit, word)) return 0; @@ -866,8 +862,8 @@ static inline int wait_on_bit(void *word, int bit, * One uses wait_on_bit_lock() where one is waiting for the bit to * clear with the intention of setting it, and when done, clearing it. */ -static inline int wait_on_bit_lock(void *word, int bit, - int (*action)(void *), unsigned mode) +static inline int +wait_on_bit_lock(void *word, int bit, int (*action)(void *), unsigned mode) { if (!test_and_set_bit(bit, word)) return 0; @@ -891,5 +887,5 @@ int wait_on_atomic_t(atomic_t *val, int (*action)(atomic_t *), unsigned mode) return 0; return out_of_line_wait_on_atomic_t(val, action, mode); } - -#endif + +#endif /* _LINUX_WAIT_H */ -- cgit v1.2.3 From b726b7dfb400c937546fa91cf8523dcb1aa2fc6e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:53 +0100 Subject: Revert "mm: sched: numa: Delay PTE scanning until a task is scheduled on a new node" PTE scanning and NUMA hinting fault handling is expensive so commit 5bca2303 ("mm: sched: numa: Delay PTE scanning until a task is scheduled on a new node") deferred the PTE scan until a task had been scheduled on another node. The problem is that in the purely shared memory case that this may never happen and no NUMA hinting fault information will be captured. 
We are not ruling out the possibility that something better can be done here but for now, this patch needs to be reverted so that we depend entirely on the scan_delay to avoid punishing short-lived processes. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-16-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/mm_types.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d9851eeb6e1d..b7adf1d4310c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -428,20 +428,10 @@ struct mm_struct { /* numa_scan_seq prevents two threads setting pte_numa */ int numa_scan_seq; - - /* - * The first node a task was scheduled on. If a task runs on - * a different node than Make PTE Scan Go Now. - */ - int first_nid; #endif struct uprobes_state uprobes_state; }; -/* first nid will either be a valid NID or one of these values */ -#define NUMA_PTE_SCAN_INIT -1 -#define NUMA_PTE_SCAN_ACTIVE -2 - static inline void mm_init_cpumask(struct mm_struct *mm) { #ifdef CONFIG_CPUMASK_OFFSTACK -- cgit v1.2.3 From 598f0ec0bc996e90a806ee9564af919ea5aad401 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:55 +0100 Subject: sched/numa: Set the scan rate proportional to the memory usage of the task being scanned The NUMA PTE scan rate is controlled with a combination of the numa_balancing_scan_period_min, numa_balancing_scan_period_max and numa_balancing_scan_size. This scan rate is independent of the size of the task and as an aside it is further complicated by the fact that numa_balancing_scan_size controls how many pages are marked pte_numa and not how much virtual memory is scanned. In combination, it is almost impossible to meaningfully tune the min and max scan periods and reasoning about performance is complex when the time to complete a full scan is partially a function of the task's memory size. This patch alters the semantics of the min and max tunables so that they tune the length of time it takes to complete a scan of a task's occupied virtual address space. Conceptually this is a lot easier to understand. There is a "sanity" check to ensure the scan rate is never extremely fast based on the amount of virtual memory that should be scanned in a second. The default of 2.5G seems arbitrary but it is chosen so that the maximum scan rate after the patch roughly matches the maximum scan rate before the patch was applied. On a similar note, numa_scan_period is in milliseconds and not jiffies. Properly placed pages slow the scanning rate but adding 10 jiffies to numa_scan_period means that the rate at which scanning slows depends on HZ, which is confusing. Get rid of the jiffies_to_msec conversion and treat it as ms.
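To make the new semantics concrete, a rough arithmetic sketch under the assumptions spelled out in the changelog; the helper and parameter names are illustrative and this is not the kernel/sched/fair.c implementation:

    /*
     * The min tunable now describes how long one pass over the task's
     * occupied address space should take, so the per-window period shrinks
     * as the task grows, subject to a ~2.5GB/sec sanity floor.
     */
    static unsigned int demo_scan_period_ms(unsigned long task_size_mb,
                                            unsigned long scan_size_mb,
                                            unsigned int full_scan_ms)
    {
            unsigned long nr_windows = (task_size_mb + scan_size_mb - 1) / scan_size_mb;
            unsigned int period = full_scan_ms / (nr_windows ? nr_windows : 1);
            unsigned int floor = (scan_size_mb * 1000) / 2560; /* 2560MB/sec == 2.5G */

            if (period < floor)
                    period = floor;
            return period ? period : 1;
    }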
Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-18-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2ac5285db434..fdcb4c855072 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1339,6 +1339,7 @@ struct task_struct { int numa_scan_seq; int numa_migrate_seq; unsigned int numa_scan_period; + unsigned int numa_scan_period_max; u64 node_stamp; /* migration stamp */ struct callback_head numa_work; #endif /* CONFIG_NUMA_BALANCING */ -- cgit v1.2.3 From f809ca9a554dda49fb264c79e31c722e0b063ff8 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:57 +0100 Subject: sched/numa: Track NUMA hinting faults on per-node basis This patch tracks what nodes numa hinting faults were incurred on. This information is later used to schedule a task on the node storing the pages most frequently faulted by the task. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-20-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index fdcb4c855072..a810e95bca2b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1342,6 +1342,8 @@ struct task_struct { unsigned int numa_scan_period_max; u64 node_stamp; /* migration stamp */ struct callback_head numa_work; + + unsigned long *numa_faults; #endif /* CONFIG_NUMA_BALANCING */ struct rcu_head rcu; -- cgit v1.2.3 From 688b7585d16ab57a17aa4422a3b290b3a55fa679 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:58 +0100 Subject: sched/numa: Select a preferred node with the most numa hinting faults This patch selects a preferred node for a task to run on based on the NUMA hinting faults. This information is later used to migrate tasks towards the node during balancing. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-21-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index a810e95bca2b..b1fc75e7187b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1344,6 +1344,7 @@ struct task_struct { struct callback_head numa_work; unsigned long *numa_faults; + int numa_preferred_nid; #endif /* CONFIG_NUMA_BALANCING */ struct rcu_head rcu; -- cgit v1.2.3 From 745d61476ddb737aad3495fa6d9a8f8c2ee59f86 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:59 +0100 Subject: sched/numa: Update NUMA hinting faults once per scan NUMA hinting fault counts and placement decisions are both recorded in the same array which distorts the samples in an unpredictable fashion. The values linearly accumulate during the scan and then decay creating a sawtooth-like pattern in the per-node counts. It also means that placement decisions are time sensitive. 
At best it means that it is very difficult to state that the buffer holds a decaying average of past faulting behaviour. At worst, it can confuse the load balancer if it sees one node with an artificially high count due to very recent faulting activity and may create a bouncing effect. This patch adds a second array. numa_faults stores the historical data which is used for placement decisions. numa_faults_buffer holds the fault activity during the current scan window. When the scan completes, numa_faults decays and the values from numa_faults_buffer are copied across. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-22-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index b1fc75e7187b..a463bc3ad437 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1343,7 +1343,20 @@ struct task_struct { u64 node_stamp; /* migration stamp */ struct callback_head numa_work; + /* + * Exponential decaying average of faults on a per-node basis. + * Scheduling placement decisions are made based on the these counts. + * The values remain static for the duration of a PTE scan + */ unsigned long *numa_faults; + + /* + * numa_faults_buffer records faults per node during the current + * scan window. When the scan completes, the counts in numa_faults + * decay and these values are copied. + */ + unsigned long *numa_faults_buffer; + int numa_preferred_nid; #endif /* CONFIG_NUMA_BALANCING */ -- cgit v1.2.3 From 3a7053b3224f4a8b0e8184166190076593621617 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:00 +0100 Subject: sched/numa: Favour moving tasks towards the preferred node This patch favours moving tasks towards the NUMA node that recorded a higher number of NUMA faults during active load balancing. Ideally this is self-reinforcing as the longer the task runs on that node, the more faults it should incur causing task_numa_placement to keep the task running on that node. In reality a big weakness is that the node's CPUs can be overloaded and it would be more efficient to queue tasks on an idle node and migrate to the new node. This would require additional smarts in the balancer so for now the balancer will simply prefer to place the task on the preferred node for a number of PTE scans, which is controlled by the numa_balancing_settle_count sysctl. Once the settle_count number of scans has completed, the scheduler is free to place the task on an alternative node if the load is imbalanced. [srikar@linux.vnet.ibm.com: Fixed statistics] Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju [ Tunable and use higher faults instead of preferred.
] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-23-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index a463bc3ad437..aecdc5a18773 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -777,6 +777,7 @@ enum cpu_idle_type { #define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ #define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ +#define SD_NUMA 0x4000 /* cross-node balancing */ extern int __weak arch_sd_sibiling_asym_packing(void); -- cgit v1.2.3 From ac8e895bd260cb8bb19ade6a3abd44e7abe9a01d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:03 +0100 Subject: sched/numa: Add infrastructure for split shared/private accounting of NUMA hinting faults Ideally it would be possible to distinguish between NUMA hinting faults that are private to a task and those that are shared. This patch prepares infrastructure for separately accounting shared and private faults by allocating the necessary buffers and passing in relevant information. For now, all faults are treated as private and detection will be introduced later. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-26-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index aecdc5a18773..d946195eec10 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1445,10 +1445,11 @@ struct task_struct { #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) #ifdef CONFIG_NUMA_BALANCING -extern void task_numa_fault(int node, int pages, bool migrated); +extern void task_numa_fault(int last_node, int node, int pages, bool migrated); extern void set_numabalancing_state(bool enabled); #else -static inline void task_numa_fault(int node, int pages, bool migrated) +static inline void task_numa_fault(int last_node, int node, int pages, + bool migrated) { } static inline void set_numabalancing_state(bool enabled) -- cgit v1.2.3 From 1bc115d87dffd1c43bdc3c9c9d1e3a51c195d18e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:05 +0100 Subject: mm: numa: Scan pages with elevated page_mapcount Currently automatic NUMA balancing is unable to distinguish between false shared versus private pages except by ignoring pages with an elevated page_mapcount entirely. This avoids shared pages bouncing between the nodes whose task is using them but that ignores quite a lot of data. This patch kicks away the training wheels now that the infrastructure for identifying shared/private pages is in place. The ordering is so that the impact of the shared/private detection can be easily measured. Note that the patch does not migrate shared, file-backed pages within vmas marked VM_EXEC as these are generally shared library pages. Migrating such pages is not beneficial as there is an expectation they are read-shared between caches and iTLB and iCache pressure is generally low.
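A hedged sketch of the kind of filter this changelog describes, skipping migration of shared, file-backed pages mapped by executable vmas; the helper name and the exact predicate are assumptions, since the real check lives in mm/, not in these headers:

    #include <linux/mm.h>

    static bool demo_skip_numa_migration(struct page *page, struct vm_area_struct *vma)
    {
            return page_mapcount(page) > 1 &&       /* shared mapping */
                   vma->vm_file &&                  /* file-backed */
                   (vma->vm_flags & VM_EXEC);       /* likely a shared library */
    }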
Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-28-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/migrate.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 8d3c57fdf221..f5096b58b20d 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -90,11 +90,12 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_NUMA_BALANCING -extern int migrate_misplaced_page(struct page *page, int node); -extern int migrate_misplaced_page(struct page *page, int node); +extern int migrate_misplaced_page(struct page *page, + struct vm_area_struct *vma, int node); extern bool migrate_ratelimited(int node); #else -static inline int migrate_misplaced_page(struct page *page, int node) +static inline int migrate_misplaced_page(struct page *page, + struct vm_area_struct *vma, int node) { return -EAGAIN; /* can't migrate now */ } -- cgit v1.2.3 From b795854b1fa70f6aee923ae5df74ff7afeaddcaa Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:07 +0100 Subject: sched/numa: Set preferred NUMA node based on number of private faults Ideally it would be possible to distinguish between NUMA hinting faults that are private to a task and those that are shared. If treated identically there is a risk that shared pages bounce between nodes depending on the order they are referenced by tasks. Ultimately what is desirable is that task private pages remain local to the task while shared pages are interleaved between sharing tasks running on different nodes to give good average performance. This is further complicated by THP as even applications that partition their data may not be partitioning on a huge page boundary. To start with, this patch assumes that multi-threaded or multi-process applications partition their data and that in general the private accesses are more important for cpu->memory locality in the general case. Also, no new infrastructure is required to treat private pages properly but interleaving for shared pages requires additional infrastructure. To detect private accesses the pid of the last accessing task is required but the storage requirements are a high. This patch borrows heavily from Ingo Molnar's patch "numa, mm, sched: Implement last-CPU+PID hash tracking" to encode some bits from the last accessing task in the page flags as well as the node information. Collisions will occur but it is better than just depending on the node information. Node information is then used to determine if a page needs to migrate. The PID information is used to detect private/shared accesses. The preferred NUMA node is selected based on where the maximum number of approximately private faults were measured. Shared faults are not taken into consideration for a few reasons. First, if there are many tasks sharing the page then they'll all move towards the same node. The node will be compute overloaded and then scheduled away later only to bounce back again. Alternatively the shared tasks would just bounce around nodes because the fault information is effectively noise. Either way accounting for shared faults the same as private faults can result in lower performance overall. 
The second reason is based on a hypothetical workload that has a small number of very important, heavily accessed private pages but a large shared array. The shared array would dominate the number of faults and be selected as a preferred node even though it's the wrong decision. The third reason is that multiple threads in a process will race each other to fault the shared page making the fault information unreliable. Signed-off-by: Mel Gorman [ Fix complication error when !NUMA_BALANCING. ] Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-30-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/mm.h | 89 +++++++++++++++++++++++++++++---------- include/linux/mm_types.h | 4 +- include/linux/page-flags-layout.h | 28 +++++++----- 3 files changed, 86 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8b6e55ee8855..bb412ce2a8b5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -581,11 +581,11 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) * sets it, so none of the operations on it need to be atomic. */ -/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NID] | ... | FLAGS | */ +/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NIDPID] | ... | FLAGS | */ #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) -#define LAST_NID_PGOFF (ZONES_PGOFF - LAST_NID_WIDTH) +#define LAST_NIDPID_PGOFF (ZONES_PGOFF - LAST_NIDPID_WIDTH) /* * Define the bit shifts to access each section. For non-existent @@ -595,7 +595,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) #define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) -#define LAST_NID_PGSHIFT (LAST_NID_PGOFF * (LAST_NID_WIDTH != 0)) +#define LAST_NIDPID_PGSHIFT (LAST_NIDPID_PGOFF * (LAST_NIDPID_WIDTH != 0)) /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ #ifdef NODE_NOT_IN_PAGE_FLAGS @@ -617,7 +617,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) #define NODES_MASK ((1UL << NODES_WIDTH) - 1) #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) -#define LAST_NID_MASK ((1UL << LAST_NID_WIDTH) - 1) +#define LAST_NIDPID_MASK ((1UL << LAST_NIDPID_WIDTH) - 1) #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) static inline enum zone_type page_zonenum(const struct page *page) @@ -661,48 +661,93 @@ static inline int page_to_nid(const struct page *page) #endif #ifdef CONFIG_NUMA_BALANCING -#ifdef LAST_NID_NOT_IN_PAGE_FLAGS -static inline int page_nid_xchg_last(struct page *page, int nid) +static inline int nid_pid_to_nidpid(int nid, int pid) { - return xchg(&page->_last_nid, nid); + return ((nid & LAST__NID_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK); } -static inline int page_nid_last(struct page *page) +static inline int nidpid_to_pid(int nidpid) { - return page->_last_nid; + return nidpid & LAST__PID_MASK; } -static inline void page_nid_reset_last(struct page *page) + +static inline int nidpid_to_nid(int nidpid) +{ + return (nidpid >> LAST__PID_SHIFT) & LAST__NID_MASK; +} + +static inline bool nidpid_pid_unset(int nidpid) +{ + return nidpid_to_pid(nidpid) == 
(-1 & LAST__PID_MASK); +} + +static inline bool nidpid_nid_unset(int nidpid) { - page->_last_nid = -1; + return nidpid_to_nid(nidpid) == (-1 & LAST__NID_MASK); +} + +#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS +static inline int page_nidpid_xchg_last(struct page *page, int nid) +{ + return xchg(&page->_last_nidpid, nid); +} + +static inline int page_nidpid_last(struct page *page) +{ + return page->_last_nidpid; +} +static inline void page_nidpid_reset_last(struct page *page) +{ + page->_last_nidpid = -1; } #else -static inline int page_nid_last(struct page *page) +static inline int page_nidpid_last(struct page *page) { - return (page->flags >> LAST_NID_PGSHIFT) & LAST_NID_MASK; + return (page->flags >> LAST_NIDPID_PGSHIFT) & LAST_NIDPID_MASK; } -extern int page_nid_xchg_last(struct page *page, int nid); +extern int page_nidpid_xchg_last(struct page *page, int nidpid); -static inline void page_nid_reset_last(struct page *page) +static inline void page_nidpid_reset_last(struct page *page) { - int nid = (1 << LAST_NID_SHIFT) - 1; + int nidpid = (1 << LAST_NIDPID_SHIFT) - 1; - page->flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); - page->flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; + page->flags &= ~(LAST_NIDPID_MASK << LAST_NIDPID_PGSHIFT); + page->flags |= (nidpid & LAST_NIDPID_MASK) << LAST_NIDPID_PGSHIFT; } -#endif /* LAST_NID_NOT_IN_PAGE_FLAGS */ +#endif /* LAST_NIDPID_NOT_IN_PAGE_FLAGS */ #else -static inline int page_nid_xchg_last(struct page *page, int nid) +static inline int page_nidpid_xchg_last(struct page *page, int nidpid) { return page_to_nid(page); } -static inline int page_nid_last(struct page *page) +static inline int page_nidpid_last(struct page *page) { return page_to_nid(page); } -static inline void page_nid_reset_last(struct page *page) +static inline int nidpid_to_nid(int nidpid) +{ + return -1; +} + +static inline int nidpid_to_pid(int nidpid) +{ + return -1; +} + +static inline int nid_pid_to_nidpid(int nid, int pid) +{ + return -1; +} + +static inline bool nidpid_pid_unset(int nidpid) +{ + return 1; +} + +static inline void page_nidpid_reset_last(struct page *page) { } #endif diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index b7adf1d4310c..38a902a6d1e3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -174,8 +174,8 @@ struct page { void *shadow; #endif -#ifdef LAST_NID_NOT_IN_PAGE_FLAGS - int _last_nid; +#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS + int _last_nidpid; #endif } /* diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index 93506a114034..02bc9184f16b 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -38,10 +38,10 @@ * The last is when there is insufficient space in page->flags and a separate * lookup is necessary. * - * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS | - * " plus space for last_nid: | NODE | ZONE | LAST_NID ... | FLAGS | - * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS | - * " plus space for last_nid: | SECTION | NODE | ZONE | LAST_NID ... | FLAGS | + * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS | + * " plus space for last_nidpid: | NODE | ZONE | LAST_NIDPID ... | FLAGS | + * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS | + * " plus space for last_nidpid: | SECTION | NODE | ZONE | LAST_NIDPID ... | FLAGS | * classic sparse no space for node: | SECTION | ZONE | ... 
| FLAGS | */ #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) @@ -62,15 +62,21 @@ #endif #ifdef CONFIG_NUMA_BALANCING -#define LAST_NID_SHIFT NODES_SHIFT +#define LAST__PID_SHIFT 8 +#define LAST__PID_MASK ((1 << LAST__PID_SHIFT)-1) + +#define LAST__NID_SHIFT NODES_SHIFT +#define LAST__NID_MASK ((1 << LAST__NID_SHIFT)-1) + +#define LAST_NIDPID_SHIFT (LAST__PID_SHIFT+LAST__NID_SHIFT) #else -#define LAST_NID_SHIFT 0 +#define LAST_NIDPID_SHIFT 0 #endif -#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_NID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS -#define LAST_NID_WIDTH LAST_NID_SHIFT +#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_NIDPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS +#define LAST_NIDPID_WIDTH LAST_NIDPID_SHIFT #else -#define LAST_NID_WIDTH 0 +#define LAST_NIDPID_WIDTH 0 #endif /* @@ -81,8 +87,8 @@ #define NODE_NOT_IN_PAGE_FLAGS #endif -#if defined(CONFIG_NUMA_BALANCING) && LAST_NID_WIDTH == 0 -#define LAST_NID_NOT_IN_PAGE_FLAGS +#if defined(CONFIG_NUMA_BALANCING) && LAST_NIDPID_WIDTH == 0 +#define LAST_NIDPID_NOT_IN_PAGE_FLAGS #endif #endif /* _LINUX_PAGE_FLAGS_LAYOUT */ -- cgit v1.2.3 From fc3147245d193bd0f57307859c698fa28a20b0fe Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:09 +0100 Subject: mm: numa: Limit NUMA scanning to migrate-on-fault VMAs There is a 90% regression observed with a large Oracle performance test on a 4 node system. Profiles indicated that the overhead was due to contention on sp_lock when looking up shared memory policies. These policies do not have the appropriate flags to allow them to be automatically balanced so trapping faults on them is pointless. This patch skips VMAs that do not have MPOL_F_MOF set. [riel@redhat.com: Initial patch] Signed-off-by: Mel Gorman Reported-and-tested-by: Joe Mario Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-32-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/mempolicy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index da6716b9e3fe..ea4d2495c646 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -136,6 +136,7 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, struct mempolicy *get_vma_policy(struct task_struct *tsk, struct vm_area_struct *vma, unsigned long addr); +bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma); extern void numa_default_policy(void); extern void numa_policy_init(void); -- cgit v1.2.3 From 6b9a7460b6baf6c77fc3d23d927ddfc3f3f05bf3 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:11 +0100 Subject: sched/numa: Retry migration of tasks to CPU on a preferred node When a preferred node is selected for a tasks there is an attempt to migrate the task to a CPU there. This may fail in which case the task will only migrate if the active load balancer takes action. This may never happen if the conditions are not right. This patch will check at NUMA hinting fault time if another attempt should be made to migrate the task. It will only make an attempt once every five seconds. 
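A minimal sketch of the rate limiting described here, built around the numa_migrate_retry timestamp this patch adds; the helper names are hypothetical and the actual logic lives in kernel/sched/fair.c:

    #include <linux/jiffies.h>
    #include <linux/sched.h>
    #include <linux/topology.h>

    /* Hypothetical stand-in for whatever actually moves the task. */
    static void demo_try_migrate_task_to_node(struct task_struct *p, int nid);

    static void demo_numa_retry_migration(struct task_struct *p, int preferred_nid)
    {
            if (time_before(jiffies, p->numa_migrate_retry))
                    return;                                 /* retried too recently */

            p->numa_migrate_retry = jiffies + 5 * HZ;       /* at most once per 5s */

            if (cpu_to_node(task_cpu(p)) != preferred_nid)
                    demo_try_migrate_task_to_node(p, preferred_nid);
    }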
Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-34-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index d946195eec10..14251a8ff2ea 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1341,6 +1341,7 @@ struct task_struct { int numa_migrate_seq; unsigned int numa_scan_period; unsigned int numa_scan_period_max; + unsigned long numa_migrate_retry; u64 node_stamp; /* migration stamp */ struct callback_head numa_work; -- cgit v1.2.3 From 1be0bd77c5dd7c903f46abf52f9a3650face3c1d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Oct 2013 11:29:15 +0100 Subject: stop_machine: Introduce stop_two_cpus() Introduce stop_two_cpus() in order to allow controlled swapping of two tasks. It repurposes the stop_machine() state machine but only stops the two cpus which we can do with on-stack structures and avoid machine wide synchronization issues. The ordering of CPUs is important to avoid deadlocks. If unordered then two cpus calling stop_two_cpus on each other simultaneously would attempt to queue in the opposite order on each CPU causing an AB-BA style deadlock. By always having the lowest number CPU doing the queueing of works, we can guarantee that works are always queued in the same order, and deadlocks are avoided. Signed-off-by: Peter Zijlstra [ Implemented deadlock avoidance. ] Signed-off-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Mel Gorman Link: http://lkml.kernel.org/r/1381141781-10992-38-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/stop_machine.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 3b5e910d14ca..d2abbdb8c6aa 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -28,6 +28,7 @@ struct cpu_stop_work { }; int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg); +int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg); void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, struct cpu_stop_work *work_buf); int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg); -- cgit v1.2.3 From ac66f5477239ebd3c4e2cbf2f591ef387aa09884 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Oct 2013 11:29:16 +0100 Subject: sched/numa: Introduce migrate_swap() Use the new stop_two_cpus() to implement migrate_swap(), a function that flips two tasks between their respective cpus. I'm fairly sure there's a less crude way than employing the stop_two_cpus() method, but everything I tried either got horribly fragile and/or complex. So keep it simple for now. The notable detail is how we 'migrate' tasks that aren't runnable anymore. We'll make it appear like we migrated them before they went to sleep. The sole difference is the previous cpu in the wakeup path, so we override this. 
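For the deadlock-avoidance ordering described in the stop_two_cpus() changelog above, a hedged caller-side sketch using the exported signature; the callback and wrapper are made-up names:

    #include <linux/stop_machine.h>

    /* Hypothetical callback: runs with both CPUs stopped. */
    static int demo_pair_fn(void *arg)
    {
            /* ... exchange the per-CPU state described by 'arg' here ... */
            return 0;
    }

    static int demo_run_on_cpu_pair(unsigned int cpu_a, unsigned int cpu_b, void *arg)
    {
            /*
             * stop_two_cpus() queues the stopper works lowest-CPU-first
             * internally, so two concurrent callers with the arguments
             * reversed cannot deadlock AB-BA style.
             */
            return stop_two_cpus(cpu_a, cpu_b, demo_pair_fn, arg);
    }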
Signed-off-by: Peter Zijlstra Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Mel Gorman Link: http://lkml.kernel.org/r/1381141781-10992-39-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 14251a8ff2ea..b6619792bb13 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1043,6 +1043,8 @@ struct task_struct { struct task_struct *last_wakee; unsigned long wakee_flips; unsigned long wakee_flip_decay_ts; + + int wake_cpu; #endif int on_rq; -- cgit v1.2.3 From 90572890d202527c366aa9489b32404e88a7c020 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Oct 2013 11:29:20 +0100 Subject: mm: numa: Change page last {nid,pid} into {cpu,pid} Change the per page last fault tracking to use cpu,pid instead of nid,pid. This will allow us to try and lookup the alternate task more easily. Note that even though it is the cpu that is store in the page flags that the mpol_misplaced decision is still based on the node. Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/1381141781-10992-43-git-send-email-mgorman@suse.de [ Fixed build failure on 32-bit systems. ] Signed-off-by: Ingo Molnar --- include/linux/mm.h | 90 ++++++++++++++++++++++----------------- include/linux/mm_types.h | 4 +- include/linux/page-flags-layout.h | 22 +++++----- 3 files changed, 63 insertions(+), 53 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index bb412ce2a8b5..ce464cd4777e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -581,11 +581,11 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) * sets it, so none of the operations on it need to be atomic. */ -/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NIDPID] | ... | FLAGS | */ +/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */ #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) -#define LAST_NIDPID_PGOFF (ZONES_PGOFF - LAST_NIDPID_WIDTH) +#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) /* * Define the bit shifts to access each section. 
For non-existent @@ -595,7 +595,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) #define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) -#define LAST_NIDPID_PGSHIFT (LAST_NIDPID_PGOFF * (LAST_NIDPID_WIDTH != 0)) +#define LAST_CPUPID_PGSHIFT (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0)) /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ #ifdef NODE_NOT_IN_PAGE_FLAGS @@ -617,7 +617,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) #define NODES_MASK ((1UL << NODES_WIDTH) - 1) #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) -#define LAST_NIDPID_MASK ((1UL << LAST_NIDPID_WIDTH) - 1) +#define LAST_CPUPID_MASK ((1UL << LAST_CPUPID_WIDTH) - 1) #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) static inline enum zone_type page_zonenum(const struct page *page) @@ -661,96 +661,106 @@ static inline int page_to_nid(const struct page *page) #endif #ifdef CONFIG_NUMA_BALANCING -static inline int nid_pid_to_nidpid(int nid, int pid) +static inline int cpu_pid_to_cpupid(int cpu, int pid) { - return ((nid & LAST__NID_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK); + return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK); } -static inline int nidpid_to_pid(int nidpid) +static inline int cpupid_to_pid(int cpupid) { - return nidpid & LAST__PID_MASK; + return cpupid & LAST__PID_MASK; } -static inline int nidpid_to_nid(int nidpid) +static inline int cpupid_to_cpu(int cpupid) { - return (nidpid >> LAST__PID_SHIFT) & LAST__NID_MASK; + return (cpupid >> LAST__PID_SHIFT) & LAST__CPU_MASK; } -static inline bool nidpid_pid_unset(int nidpid) +static inline int cpupid_to_nid(int cpupid) { - return nidpid_to_pid(nidpid) == (-1 & LAST__PID_MASK); + return cpu_to_node(cpupid_to_cpu(cpupid)); } -static inline bool nidpid_nid_unset(int nidpid) +static inline bool cpupid_pid_unset(int cpupid) { - return nidpid_to_nid(nidpid) == (-1 & LAST__NID_MASK); + return cpupid_to_pid(cpupid) == (-1 & LAST__PID_MASK); } -#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS -static inline int page_nidpid_xchg_last(struct page *page, int nid) +static inline bool cpupid_cpu_unset(int cpupid) { - return xchg(&page->_last_nidpid, nid); + return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK); } -static inline int page_nidpid_last(struct page *page) +#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS +static inline int page_cpupid_xchg_last(struct page *page, int cpupid) { - return page->_last_nidpid; + return xchg(&page->_last_cpupid, cpupid); } -static inline void page_nidpid_reset_last(struct page *page) + +static inline int page_cpupid_last(struct page *page) +{ + return page->_last_cpupid; +} +static inline void page_cpupid_reset_last(struct page *page) { - page->_last_nidpid = -1; + page->_last_cpupid = -1; } #else -static inline int page_nidpid_last(struct page *page) +static inline int page_cpupid_last(struct page *page) { - return (page->flags >> LAST_NIDPID_PGSHIFT) & LAST_NIDPID_MASK; + return (page->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; } -extern int page_nidpid_xchg_last(struct page *page, int nidpid); +extern int page_cpupid_xchg_last(struct page *page, int cpupid); -static inline void page_nidpid_reset_last(struct page *page) +static inline void page_cpupid_reset_last(struct page *page) { - int nidpid = (1 << LAST_NIDPID_SHIFT) - 1; + int cpupid = (1 << 
LAST_CPUPID_SHIFT) - 1; - page->flags &= ~(LAST_NIDPID_MASK << LAST_NIDPID_PGSHIFT); - page->flags |= (nidpid & LAST_NIDPID_MASK) << LAST_NIDPID_PGSHIFT; + page->flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); + page->flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; } -#endif /* LAST_NIDPID_NOT_IN_PAGE_FLAGS */ -#else -static inline int page_nidpid_xchg_last(struct page *page, int nidpid) +#endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */ +#else /* !CONFIG_NUMA_BALANCING */ +static inline int page_cpupid_xchg_last(struct page *page, int cpupid) { - return page_to_nid(page); + return page_to_nid(page); /* XXX */ } -static inline int page_nidpid_last(struct page *page) +static inline int page_cpupid_last(struct page *page) { - return page_to_nid(page); + return page_to_nid(page); /* XXX */ } -static inline int nidpid_to_nid(int nidpid) +static inline int cpupid_to_nid(int cpupid) { return -1; } -static inline int nidpid_to_pid(int nidpid) +static inline int cpupid_to_pid(int cpupid) { return -1; } -static inline int nid_pid_to_nidpid(int nid, int pid) +static inline int cpupid_to_cpu(int cpupid) { return -1; } -static inline bool nidpid_pid_unset(int nidpid) +static inline int cpu_pid_to_cpupid(int nid, int pid) +{ + return -1; +} + +static inline bool cpupid_pid_unset(int cpupid) { return 1; } -static inline void page_nidpid_reset_last(struct page *page) +static inline void page_cpupid_reset_last(struct page *page) { } -#endif +#endif /* CONFIG_NUMA_BALANCING */ static inline struct zone *page_zone(const struct page *page) { diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 38a902a6d1e3..a30f9ca66557 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -174,8 +174,8 @@ struct page { void *shadow; #endif -#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS - int _last_nidpid; +#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS + int _last_cpupid; #endif } /* diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index 02bc9184f16b..da523661500a 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -39,9 +39,9 @@ * lookup is necessary. * * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS | - * " plus space for last_nidpid: | NODE | ZONE | LAST_NIDPID ... | FLAGS | + * " plus space for last_cpupid: | NODE | ZONE | LAST_CPUPID ... | FLAGS | * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS | - * " plus space for last_nidpid: | SECTION | NODE | ZONE | LAST_NIDPID ... | FLAGS | + * " plus space for last_cpupid: | SECTION | NODE | ZONE | LAST_CPUPID ... | FLAGS | * classic sparse no space for node: | SECTION | ZONE | ... 
| FLAGS | */ #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) @@ -65,18 +65,18 @@ #define LAST__PID_SHIFT 8 #define LAST__PID_MASK ((1 << LAST__PID_SHIFT)-1) -#define LAST__NID_SHIFT NODES_SHIFT -#define LAST__NID_MASK ((1 << LAST__NID_SHIFT)-1) +#define LAST__CPU_SHIFT NR_CPUS_BITS +#define LAST__CPU_MASK ((1 << LAST__CPU_SHIFT)-1) -#define LAST_NIDPID_SHIFT (LAST__PID_SHIFT+LAST__NID_SHIFT) +#define LAST_CPUPID_SHIFT (LAST__PID_SHIFT+LAST__CPU_SHIFT) #else -#define LAST_NIDPID_SHIFT 0 +#define LAST_CPUPID_SHIFT 0 #endif -#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_NIDPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS -#define LAST_NIDPID_WIDTH LAST_NIDPID_SHIFT +#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT+LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS +#define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT #else -#define LAST_NIDPID_WIDTH 0 +#define LAST_CPUPID_WIDTH 0 #endif /* @@ -87,8 +87,8 @@ #define NODE_NOT_IN_PAGE_FLAGS #endif -#if defined(CONFIG_NUMA_BALANCING) && LAST_NIDPID_WIDTH == 0 -#define LAST_NIDPID_NOT_IN_PAGE_FLAGS +#if defined(CONFIG_NUMA_BALANCING) && LAST_CPUPID_WIDTH == 0 +#define LAST_CPUPID_NOT_IN_PAGE_FLAGS #endif #endif /* _LINUX_PAGE_FLAGS_LAYOUT */ -- cgit v1.2.3 From 8c8a743c5087bac9caac8155b8f3b367e75cdd0b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Oct 2013 11:29:21 +0100 Subject: sched/numa: Use {cpu, pid} to create task groups for shared faults While parallel applications tend to align their data on the cache boundary, they tend not to align on the page or THP boundary. Consequently tasks that partition their data can still "false-share" pages presenting a problem for optimal NUMA placement. This patch uses NUMA hinting faults to chain tasks together into numa_groups. As well as storing the NID a task was running on when accessing a page a truncated representation of the faulting PID is stored. If subsequent faults are from different PIDs it is reasonable to assume that those two tasks share a page and are candidates for being grouped together. Note that this patch makes no scheduling decisions based on the grouping information. 
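As a rough usage sketch of the helpers introduced in the previous patch (illustrative only: the function name and the idea of calling it from the NUMA hinting fault path are assumptions, but the cpupid accessors are the ones defined above), detecting a potential sharer could look like this:

static bool numa_fault_saw_new_sharer(struct page *page)
{
	int last_cpupid = page_cpupid_last(page);
	int this_cpupid = cpu_pid_to_cpupid(raw_smp_processor_id(),
					    current->pid);

	/* Record the current {cpu,pid} so the next fault can compare. */
	page_cpupid_xchg_last(page, this_cpupid);

	/*
	 * A different (truncated) pid faulted on this page recently: the two
	 * tasks probably share data and are candidates for one numa_group.
	 */
	return !cpupid_pid_unset(last_cpupid) &&
	       cpupid_to_pid(last_cpupid) != cpupid_to_pid(this_cpupid);
}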
Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/1381141781-10992-44-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/mm.h | 11 +++++++++++ include/linux/sched.h | 3 +++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index ce464cd4777e..81443d557a2e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -691,6 +691,12 @@ static inline bool cpupid_cpu_unset(int cpupid) return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK); } +static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid) +{ + return (task_pid & LAST__PID_MASK) == cpupid_to_pid(cpupid); +} + +#define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid) #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS static inline int page_cpupid_xchg_last(struct page *page, int cpupid) { @@ -760,6 +766,11 @@ static inline bool cpupid_pid_unset(int cpupid) static inline void page_cpupid_reset_last(struct page *page) { } + +static inline bool cpupid_match_pid(struct task_struct *task, int cpupid) +{ + return false; +} #endif /* CONFIG_NUMA_BALANCING */ static inline struct zone *page_zone(const struct page *page) diff --git a/include/linux/sched.h b/include/linux/sched.h index b6619792bb13..f587ded5c148 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1347,6 +1347,9 @@ struct task_struct { u64 node_stamp; /* migration stamp */ struct callback_head numa_work; + struct list_head numa_entry; + struct numa_group *numa_group; + /* * Exponential decaying average of faults on a per-node basis. * Scheduling placement decisions are made based on the these counts. -- cgit v1.2.3 From e29cf08b05dc0b8151d65704d96d525a9e179a6b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:22 +0100 Subject: sched/numa: Report a NUMA task group ID It is desirable to model from userspace how the scheduler groups tasks over time. This patch adds an ID to the numa_group and reports it via /proc/PID/status. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-45-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index f587ded5c148..b0b343b1ba64 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1452,12 +1452,17 @@ struct task_struct { #ifdef CONFIG_NUMA_BALANCING extern void task_numa_fault(int last_node, int node, int pages, bool migrated); +extern pid_t task_numa_group_id(struct task_struct *p); extern void set_numabalancing_state(bool enabled); #else static inline void task_numa_fault(int last_node, int node, int pages, bool migrated) { } +static inline pid_t task_numa_group_id(struct task_struct *p) +{ + return 0; +} static inline void set_numabalancing_state(bool enabled) { } -- cgit v1.2.3 From 6688cc05473b36a0a3d3971e1adf1712919b32eb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Oct 2013 11:29:24 +0100 Subject: mm: numa: Do not group on RO pages And here's a little something to make sure not the whole world ends up in a single group. As while we don't migrate shared executable pages, we do scan/fault on them. And since everybody links to libc, everybody ends up in the same group. 
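A hedged sketch of that rule, using the TNF_* flags added by the diff below; everything else (the function name and the pte/last_cpupid/page_nid/nr_pages/migrated parameters) is assumed context rather than the actual mm/ hunk:

static void numa_hinting_fault_done(pte_t pte, int last_cpupid, int page_nid,
				    int nr_pages, bool migrated)
{
	int flags = 0;

	if (migrated)
		flags |= TNF_MIGRATED;

	/*
	 * Read-only mappings (shared executables such as libc) are still
	 * scanned and faulted on, but must not drive grouping, or every
	 * task that links to libc ends up in one giant group.
	 */
	if (!pte_write(pte))
		flags |= TNF_NO_GROUP;

	task_numa_fault(last_cpupid, page_nid, nr_pages, flags);
}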
Suggested-by: Rik van Riel Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/1381141781-10992-47-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index b0b343b1ba64..ff543851a18a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1450,13 +1450,16 @@ struct task_struct { /* Future-safe accessor for struct task_struct's cpus_allowed. */ #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) +#define TNF_MIGRATED 0x01 +#define TNF_NO_GROUP 0x02 + #ifdef CONFIG_NUMA_BALANCING -extern void task_numa_fault(int last_node, int node, int pages, bool migrated); +extern void task_numa_fault(int last_node, int node, int pages, int flags); extern pid_t task_numa_group_id(struct task_struct *p); extern void set_numabalancing_state(bool enabled); #else static inline void task_numa_fault(int last_node, int node, int pages, - bool migrated) + int flags) { } static inline pid_t task_numa_group_id(struct task_struct *p) -- cgit v1.2.3 From 5e1576ed0e54d419286a8096133029062b6ad456 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 7 Oct 2013 11:29:26 +0100 Subject: sched/numa: Stay on the same node if CLONE_VM A newly spawned thread inside a process should stay on the same NUMA node as its parent. This prevents processes from being "torn" across multiple NUMA nodes every time they spawn a new thread. Signed-off-by: Rik van Riel Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-49-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index ff543851a18a..8563e3dd5c0f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2021,7 +2021,7 @@ extern void wake_up_new_task(struct task_struct *tsk); #else static inline void kick_process(struct task_struct *tsk) { } #endif -extern void sched_fork(struct task_struct *p); +extern void sched_fork(unsigned long clone_flags, struct task_struct *p); extern void sched_dead(struct task_struct *p); extern void proc_caches_init(void); -- cgit v1.2.3 From 83e1d2cd9eabec5164afea295ff06b941ae8e4a9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:27 +0100 Subject: sched/numa: Use group fault statistics in numa placement This patch uses the fraction of faults on a particular node for both task and group, to figure out the best node to place a task. If the task and group statistics disagree on what the preferred node should be then a full rescan will select the node with the best combined weight. 
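A loose sketch of that combined weighting; task_faults_frac() and group_faults_frac() are invented stand-ins for the real per-task and per-group fault statistics, and the actual kernel code differs:

extern unsigned long task_faults_frac(struct task_struct *p, int nid);
extern unsigned long group_faults_frac(struct task_struct *p, int nid);

static int sketch_preferred_node(struct task_struct *p)
{
	int nid, best_nid = NUMA_NO_NODE;
	unsigned long best_weight = 0;

	for_each_online_node(nid) {
		/* Both the task and its numa_group get a say. */
		unsigned long weight = task_faults_frac(p, nid) +
				       group_faults_frac(p, nid);

		if (weight > best_weight) {
			best_weight = weight;
			best_nid = nid;
		}
	}

	return best_nid;
}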
Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-50-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8563e3dd5c0f..724482200b83 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1356,6 +1356,7 @@ struct task_struct { * The values remain static for the duration of a PTE scan */ unsigned long *numa_faults; + unsigned long total_numa_faults; /* * numa_faults_buffer records faults per node during the current -- cgit v1.2.3 From 82727018b0d33d188e9916bcf76f18387484cb04 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 7 Oct 2013 11:29:28 +0100 Subject: sched/numa: Call task_numa_free() from do_execve() It is possible for a task in a numa group to call exec, and have the new (unrelated) executable inherit the numa group association from its former self. This has the potential to break numa grouping, and is trivial to fix. Signed-off-by: Rik van Riel Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-51-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 724482200b83..f6385107c352 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1458,6 +1458,7 @@ struct task_struct { extern void task_numa_fault(int last_node, int node, int pages, int flags); extern pid_t task_numa_group_id(struct task_struct *p); extern void set_numabalancing_state(bool enabled); +extern void task_numa_free(struct task_struct *p); #else static inline void task_numa_fault(int last_node, int node, int pages, int flags) @@ -1470,6 +1471,9 @@ static inline pid_t task_numa_group_id(struct task_struct *p) static inline void set_numabalancing_state(bool enabled) { } +static inline void task_numa_free(struct task_struct *p) +{ +} #endif static inline struct pid *task_pid(struct task_struct *task) -- cgit v1.2.3 From b32e86b4301e345611f0446265f782a229faadf6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 7 Oct 2013 11:29:30 +0100 Subject: sched/numa: Add debugging Signed-off-by: Ingo Molnar Reviewed-by: Rik van Riel Cc: Johannes Weiner Cc: Mel Gorman Cc: Srikar Dronamraju Cc: Andrea Arcangeli Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar Link: http://lkml.kernel.org/r/1381141781-10992-53-git-send-email-mgorman@suse.de --- include/linux/sched.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index f6385107c352..1127a46ac3d2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1366,6 +1366,7 @@ struct task_struct { unsigned long *numa_faults_buffer; int numa_preferred_nid; + unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ struct rcu_head rcu; @@ -2661,6 +2662,11 @@ static inline unsigned int task_cpu(const struct task_struct *p) return task_thread_info(p)->cpu; } +static inline int task_node(const struct task_struct *p) +{ + return cpu_to_node(task_cpu(p)); +} + extern void set_task_cpu(struct task_struct *p, unsigned int cpu); #else -- cgit v1.2.3 From dabe1d992414a6456e60e41f1d1ad8affc6d444d Mon Sep 17 
00:00:00 2001 From: Rik van Riel Date: Mon, 7 Oct 2013 11:29:34 +0100 Subject: sched/numa: Be more careful about joining numa groups Due to the way the pid is truncated, and tasks are moved between CPUs by the scheduler, it is possible for the current task_numa_fault to group together tasks that do not actually share memory. This patch adds a few easy sanity checks to task_numa_fault, joining tasks together if they share the same tsk->mm, or if the fault was on a page with an elevated mapcount, in a shared VMA. Signed-off-by: Rik van Riel Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-57-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 1127a46ac3d2..59f953b2e413 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1454,6 +1454,7 @@ struct task_struct { #define TNF_MIGRATED 0x01 #define TNF_NO_GROUP 0x02 +#define TNF_SHARED 0x04 #ifdef CONFIG_NUMA_BALANCING extern void task_numa_fault(int last_node, int node, int pages, int flags); -- cgit v1.2.3 From 04bb2f9475054298f0c67a89ca92cade42d3fe5e Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 7 Oct 2013 11:29:36 +0100 Subject: sched/numa: Adjust scan rate in task_numa_placement Adjust numa_scan_period in task_numa_placement, depending on how much useful work the numa code can do. The more local faults there are in a given scan window the longer the period (and hence the slower the scan rate) during the next window. If there are excessive shared faults then the scan period will decrease, with the amount of scaling depending on the ratio of shared to private faults. If the preferred node changes then the scan rate is reset to recheck if the task is properly placed. Signed-off-by: Rik van Riel Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-59-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 59f953b2e413..2292f6c1596f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1365,6 +1365,14 @@ struct task_struct { */ unsigned long *numa_faults_buffer; + /* + * numa_faults_locality tracks if faults recorded during the last + * scan window were remote/local. The task scan period is adapted + * based on the locality of the faults with different weights + * depending on whether they were shared or private faults + */ + unsigned long numa_faults_locality[2]; + int numa_preferred_nid; unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ @@ -1455,6 +1463,7 @@ struct task_struct { #define TNF_MIGRATED 0x01 #define TNF_NO_GROUP 0x02 #define TNF_SHARED 0x04 +#define TNF_FAULT_LOCAL 0x08 #ifdef CONFIG_NUMA_BALANCING extern void task_numa_fault(int last_node, int node, int pages, int flags); -- cgit v1.2.3 From 930aa174fcc8b0efaad102fd80f677b92f35eaa2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:37 +0100 Subject: sched/numa: Remove the numa_balancing_scan_period_reset sysctl With scan rate adaptations based on whether the workload has properly converged or not, there should be no need for the scan period reset hammer.
Get rid of it. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-60-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/mm_types.h | 3 --- include/linux/sched/sysctl.h | 1 - 2 files changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index a30f9ca66557..a3198e5aaf4e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -420,9 +420,6 @@ struct mm_struct { */ unsigned long numa_next_scan; - /* numa_next_reset is when the PTE scanner period will be reset */ - unsigned long numa_next_reset; - /* Restart point for scanning and setting pte_numa */ unsigned long numa_scan_offset; diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index bf8086b2506e..10d16c4fbe89 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -47,7 +47,6 @@ extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; extern unsigned int sysctl_numa_balancing_scan_delay; extern unsigned int sysctl_numa_balancing_scan_period_min; extern unsigned int sysctl_numa_balancing_scan_period_max; -extern unsigned int sysctl_numa_balancing_scan_period_reset; extern unsigned int sysctl_numa_balancing_scan_size; extern unsigned int sysctl_numa_balancing_settle_count; -- cgit v1.2.3 From 1e3646ffc64b232cb14a5ef01d7b98997c1b73f9 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 7 Oct 2013 11:29:38 +0100 Subject: mm: numa: Revert temporarily disabling of NUMA migration With the scan rate code working (at least for multi-instance specjbb), the large hammer that is "sched: Do not migrate memory immediately after switching node" can be replaced with something smarter. Revert temporarily migration disabling and all traces of numa_migrate_seq. Signed-off-by: Rik van Riel Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-61-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2292f6c1596f..d24f70ffddee 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1340,7 +1340,6 @@ struct task_struct { #endif #ifdef CONFIG_NUMA_BALANCING int numa_scan_seq; - int numa_migrate_seq; unsigned int numa_scan_period; unsigned int numa_scan_period_max; unsigned long numa_migrate_retry; -- cgit v1.2.3 From de1c9ce6f07fec0381a39a9d0b379ea35aa1167f Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 7 Oct 2013 11:29:39 +0100 Subject: sched/numa: Skip some page migrations after a shared fault Shared faults can lead to lots of unnecessary page migrations, slowing down the system, and causing private faults to hit the per-pgdat migration ratelimit. This patch adds sysctl numa_balancing_migrate_deferred, which specifies how many shared page migrations to skip unconditionally, after each page migration that is skipped because it is a shared fault. This reduces the number of page migrations back and forth in shared fault situations. It also gives a strong preference to the tasks that are already running where most of the memory is, and to moving the other tasks to near the memory. 
Testing this with a much higher scan rate than the default still seems to result in fewer page migrations than before. Memory seems to be somewhat better consolidated than previously, with multi-instance specjbb runs on a 4 node system. Signed-off-by: Rik van Riel Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-62-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index d24f70ffddee..833eed55cf43 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1342,6 +1342,8 @@ struct task_struct { int numa_scan_seq; unsigned int numa_scan_period; unsigned int numa_scan_period_max; + int numa_preferred_nid; + int numa_migrate_deferred; unsigned long numa_migrate_retry; u64 node_stamp; /* migration stamp */ struct callback_head numa_work; @@ -1372,7 +1374,6 @@ struct task_struct { */ unsigned long numa_faults_locality[2]; - int numa_preferred_nid; unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ @@ -1469,6 +1470,8 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags); extern pid_t task_numa_group_id(struct task_struct *p); extern void set_numabalancing_state(bool enabled); extern void task_numa_free(struct task_struct *p); + +extern unsigned int sysctl_numa_balancing_migrate_deferred; #else static inline void task_numa_fault(int last_node, int node, int pages, int flags) -- cgit v1.2.3 From 8922915b38cd8b72f8e5af614b95be71d1d299d4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Oct 2013 20:31:06 +0200 Subject: sched/wait: Add ___wait_cond_timeout() to wait_event*_timeout() too Commit 4c663cfc ("wait: fix false timeouts when using wait_event_timeout()") introduced the additional condition checks after a timeout but only in the "slow" __wait*() paths. wait_event_timeout(wq, CONDITION, 0) still returns 0 if CONDITION is already true and we do not call __wait*(). Now that we have ___wait_cond_timeout() we can use it instead to ensure that __ret will be properly updated. 
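To make the fixed corner case concrete (this is not code from the patch; wq and data_ready are made-up names): a poll-style caller passing a zero timeout used to see a "timeout" even when the condition already held, because the slow path that rechecks the condition was never entered. With ___wait_cond_timeout() the condition is folded back into __ret, so the call below now returns non-zero when data_ready is already true.

static int poll_for_data(void)
{
	/* Zero timeout: do not sleep, just check the condition once. */
	long ret = wait_event_timeout(wq, data_ready, 0);

	if (!ret)	/* before the fix, also hit when data_ready already held */
		return -ETIMEDOUT;

	return 0;
}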
Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131007183106.GA10973@redhat.com Signed-off-by: Ingo Molnar --- include/linux/wait.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index a2726c7dd244..04c0260bda8f 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -270,7 +270,7 @@ do { \ #define wait_event_timeout(wq, condition, timeout) \ ({ \ long __ret = timeout; \ - if (!(condition)) \ + if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_timeout(wq, condition, timeout); \ __ret; \ }) @@ -328,7 +328,7 @@ do { \ #define wait_event_interruptible_timeout(wq, condition, timeout) \ ({ \ long __ret = timeout; \ - if (!(condition)) \ + if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_interruptible_timeout(wq, \ condition, timeout); \ __ret; \ @@ -769,7 +769,7 @@ do { \ timeout) \ ({ \ long __ret = timeout; \ - if (!(condition)) \ + if (!___wait_cond_timeout(condition)) \ __ret = __wait_event_interruptible_lock_irq_timeout( \ wq, condition, lock, timeout); \ __ret; \ -- cgit v1.2.3 From c2d816443ef305aba8eaf0bf368f4d3d87494f06 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Oct 2013 18:18:24 +0200 Subject: sched/wait: Introduce prepare_to_wait_event() Add the new helper, prepare_to_wait_event() which should only be used by ___wait_event(). prepare_to_wait_event() returns -ERESTARTSYS if signal_pending_state() is true, otherwise it does prepare_to_wait/exclusive. This allows to uninline the signal-pending checks in wait_event*() macros. Also, it can initialize wait->private/func. We do not care if they were already initialized, the values are the same. This also shaves a couple of insns from the inlined code. This obviously makes prepare_*() path a little bit slower, but we are likely going to sleep anyway, so I think it makes sense to shrink .text: text data bss dec hex filename =================================================== before: 5126092 2959248 10117120 18202460 115bf5c vmlinux after: 5124618 2955152 10117120 18196890 115a99a vmlinux on my build. 
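For readability, a simplified view of the rewritten wait loop that the diff below implements, with the signal check folded into the new helper; wq, __wait, state, condition, __ret and the schedule() step all come from the ___wait_event() macro context, and exclusive-wait handling is omitted:

for (;;) {
	long __int = prepare_to_wait_event(&wq, &__wait, state);

	if (condition)
		break;

	/* -ERESTARTSYS from a pending signal ends the wait early. */
	if (___wait_is_interruptible(state) && __int) {
		__ret = __int;
		break;
	}

	schedule();	/* the macro's 'cmd' step */
}
finish_wait(&wq, &__wait);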
Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131007161824.GA29757@redhat.com Signed-off-by: Ingo Molnar --- include/linux/wait.h | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index 04c0260bda8f..ec099b03e11b 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -187,27 +187,30 @@ wait_queue_head_t *bit_waitqueue(void *, int); __cond || !__ret; \ }) -#define ___wait_signal_pending(state) \ - ((state == TASK_INTERRUPTIBLE && signal_pending(current)) || \ - (state == TASK_KILLABLE && fatal_signal_pending(current))) +#define ___wait_is_interruptible(state) \ + (!__builtin_constant_p(state) || \ + state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE) \ #define ___wait_event(wq, condition, state, exclusive, ret, cmd) \ ({ \ __label__ __out; \ - DEFINE_WAIT(__wait); \ + wait_queue_t __wait; \ long __ret = ret; \ \ + INIT_LIST_HEAD(&__wait.task_list); \ + if (exclusive) \ + __wait.flags = WQ_FLAG_EXCLUSIVE; \ + else \ + __wait.flags = 0; \ + \ for (;;) { \ - if (exclusive) \ - prepare_to_wait_exclusive(&wq, &__wait, state); \ - else \ - prepare_to_wait(&wq, &__wait, state); \ + long __int = prepare_to_wait_event(&wq, &__wait, state);\ \ if (condition) \ break; \ \ - if (___wait_signal_pending(state)) { \ - __ret = -ERESTARTSYS; \ + if (___wait_is_interruptible(state) && __int) { \ + __ret = __int; \ if (exclusive) { \ abort_exclusive_wait(&wq, &__wait, \ state, NULL); \ @@ -791,6 +794,7 @@ extern long interruptible_sleep_on_timeout(wait_queue_head_t *q, signed long tim */ void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state); void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state); +long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state); void finish_wait(wait_queue_head_t *q, wait_queue_t *wait); void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, unsigned int mode, void *key); int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); -- cgit v1.2.3 From 92ec11809565cf6429c75204e99e0f583b5c9d7c Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 23 Oct 2013 13:40:55 +0200 Subject: sched/wait: Fix build breakage The wait_event_interruptible_lock_irq() macro is missing a semi-colon which causes a build failure in the i915 DRM driver. Signed-off-by: Thierry Reding Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1382528455-29911-1-git-send-email-treding@nvidia.com Signed-off-by: Ingo Molnar --- include/linux/wait.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index ec099b03e11b..3b23afa04d6b 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -732,7 +732,7 @@ do { \ int __ret = 0; \ if (!(condition)) \ __ret = __wait_event_interruptible_lock_irq(wq, \ - condition, lock,) \ + condition, lock,); \ __ret; \ }) -- cgit v1.2.3 From 7d716456a0ee4e9bd63be9234f886d20382ac950 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 31 Oct 2013 12:48:14 +0100 Subject: sched/wait: Fix __wait_event_interruptible_lock_irq_timeout() __wait_event_interruptible_lock_irq_timeout() needs the timeout parameter passed instead of "ret". This magically compiled since the only user has a local ret variable. 
Luckily we got a build warning: CC drivers/s390/scsi/zfcp_qdio.o drivers/s390/scsi/zfcp_qdio.c: In function 'zfcp_qdio_sbal_get': include/linux/wait.h:780:15: warning: 'ret' may be used uninitialized Signed-off-by: Heiko Carstens Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20131031114814.GB5551@osiris Signed-off-by: Ingo Molnar --- include/linux/wait.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index 3b23afa04d6b..61939ba30aa0 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -739,7 +739,7 @@ do { \ #define __wait_event_interruptible_lock_irq_timeout(wq, condition, \ lock, timeout) \ ___wait_event(wq, ___wait_cond_timeout(condition), \ - TASK_INTERRUPTIBLE, 0, ret, \ + TASK_INTERRUPTIBLE, 0, timeout, \ spin_unlock_irq(&lock); \ __ret = schedule_timeout(__ret); \ spin_lock_irq(&lock)); -- cgit v1.2.3 From b8a216269ec0ce2e961d32e6d640d7010b8a818e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 4 Oct 2013 22:06:53 +0200 Subject: sched: Move completion code from core.c to completion.c Completions already have their own header file: linux/completion.h Move the implementation out of kernel/sched/core.c and into its own file: kernel/sched/completion.c. Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-x2y49rmxu5dljt66ai2lcfuw@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/completion.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/completion.h b/include/linux/completion.h index 3cd574d5b19e..22c33e35bcb2 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -5,7 +5,7 @@ * (C) Copyright 2001 Linus Torvalds * * Atomic wait-for-completion handler data structures. - * See kernel/sched/core.c for details. + * See kernel/sched/completion.c for details. */ #include -- cgit v1.2.3
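For context, a minimal usage sketch of the completion API whose implementation is being moved; this is not part of the patch, and setup_done and the two functions are invented for illustration:

static DECLARE_COMPLETION(setup_done);

static int waiter_thread(void *unused)
{
	wait_for_completion(&setup_done);	/* sleep until complete() runs */
	return 0;
}

static void setup_finished(void)
{
	complete(&setup_done);			/* wake one waiter */
}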