From bd425d4bfc7a1a6064dbbadfbac9c7eec0e426ec Mon Sep 17 00:00:00 2001
From: Morten Rasmussen
Date: Wed, 22 Jun 2016 18:03:12 +0100
Subject: sched/core: Fix power to capacity renaming in comment

It seems that this one escaped Nico's renaming of cpu_power to
cpu_capacity a while back.

Signed-off-by: Morten Rasmussen
Signed-off-by: Peter Zijlstra (Intel)
Cc: Linus Torvalds
Cc: Mike Galbraith
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: dietmar.eggemann@arm.com
Cc: linux-kernel@vger.kernel.org
Cc: mgalbraith@suse.de
Cc: vincent.guittot@linaro.org
Cc: yuyang.du@intel.com
Link: http://lkml.kernel.org/r/1466615004-3503-2-git-send-email-morten.rasmussen@arm.com
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 62c68e513e39..f3db596efd2c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1022,7 +1022,7 @@ extern void wake_up_q(struct wake_q_head *head);
 #define SD_BALANCE_FORK		0x0008	/* Balance on fork, clone */
 #define SD_BALANCE_WAKE		0x0010	/* Balance on wakeup */
 #define SD_WAKE_AFFINE		0x0020	/* Wake task to waking CPU */
-#define SD_SHARE_CPUCAPACITY	0x0080	/* Domain members share cpu power */
+#define SD_SHARE_CPUCAPACITY	0x0080	/* Domain members share cpu capacity */
 #define SD_SHARE_POWERDOMAIN	0x0100	/* Domain members share power domain */
 #define SD_SHARE_PKG_RESOURCES	0x0200	/* Domain members share cpu pkg resources */
 #define SD_SERIALIZE		0x0400	/* Only a single load balancing instance */
--
cgit v1.2.3

From d1c6d149cf04d6c7c3c3ebf4b66c82500cbcf6e1 Mon Sep 17 00:00:00 2001
From: Vegard Nossum
Date: Sat, 23 Jul 2016 09:46:39 +0200
Subject: sched/debug: Make the "Preemption disabled at ..." message more useful

This message is currently really useless since it always prints a value
that comes from the printk() we just did, e.g.:

  BUG: sleeping function called from invalid context at mm/slab.h:388
  in_atomic(): 0, irqs_disabled(): 0, pid: 31996, name: trinity-c1
  Preemption disabled at:[] down_trylock+0x13/0x80

  BUG: sleeping function called from invalid context at include/linux/freezer.h:56
  in_atomic(): 0, irqs_disabled(): 0, pid: 31996, name: trinity-c1
  Preemption disabled at:[] console_unlock+0x2f7/0x930

Here, both down_trylock() and console_unlock() are somewhere in the
printk() path.

We should save the value before calling printk() and use the saved
value instead. That immediately reveals the offending callsite:

  BUG: sleeping function called from invalid context at mm/slab.h:388
  in_atomic(): 0, irqs_disabled(): 0, pid: 14971, name: trinity-c2
  Preemption disabled at:[] rhashtable_walk_start+0x46/0x150

Bug report: http://marc.info/?l=linux-netdev&m=146925979821849&w=2

Signed-off-by: Vegard Nossum
Cc: Andrew Morton
Cc: Linus Torvalds
Cc: Mike Galbraith
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Rusty Russell
Cc: Thomas Gleixner
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f3db596efd2c..7f64e89a5873 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3236,6 +3236,15 @@ static inline void cond_resched_rcu(void)
 #endif
 }
 
+static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+	return p->preempt_disable_ip;
+#else
+	return 0;
+#endif
+}
+
 /*
  * Does a critical section need to be broken due to another
  * task waiting?: (technically does not depend on CONFIG_PREEMPT,
--
cgit v1.2.3
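A quick illustration of the intended use (a sketch, not code from this
patch; the function below is hypothetical): the caller must read the
saved IP before issuing any printk(), because the printk() path itself
disables and re-enables preemption and thereby overwrites the recorded
preempt_disable_ip.

  #include <linux/kallsyms.h>
  #include <linux/printk.h>
  #include <linux/sched.h>

  /* Hypothetical caller: capture the saved IP before the first printk(),
   * so the report points at the real offender instead of the printk()
   * path's own preempt_disable() site. */
  static void report_sleeping_bug(void)
  {
  	unsigned long ip = get_preempt_disable_ip(current);	/* save first */

  	pr_err("BUG: sleeping function called from invalid context\n");
  	pr_err("Preemption disabled at:");
  	print_ip_sym(ip);	/* prints the symbolized address */
  }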
From 1f6e6c7cb9bcd58abb5ee11243e0eefe6b36fc8e Mon Sep 17 00:00:00 2001
From: Morten Rasmussen
Date: Mon, 25 Jul 2016 14:34:22 +0100
Subject: sched/core: Introduce SD_ASYM_CPUCAPACITY sched_domain topology flag

Add a topology flag to the sched_domain hierarchy indicating the lowest
domain level where the full range of CPU capacities is represented by
the domain members for asymmetric capacity topologies (e.g. ARM
big.LITTLE).

The flag is intended to indicate that extra care should be taken when
placing tasks on CPUs and this level spans all the different types of
CPUs found in the system (no need to look further up the domain
hierarchy). This information is currently only available through
iterating through the capacities of all the CPUs at parent levels in
the sched_domain hierarchy.

  SD 2      [  0      1      2      3]  SD_ASYM_CPUCAPACITY

  SD 1      [  0      1] [   2      3]  !SD_ASYM_CPUCAPACITY

  CPU:         0      1      2      3
  capacity:  756    756   1024   1024

If the topology in the example above is duplicated to create an eight
CPU example with a third sched_domain level on top (SD 3), this level
should not have the flag set (!SD_ASYM_CPUCAPACITY) as its two groups
would both have all CPU capacities represented within them.

Signed-off-by: Morten Rasmussen
Signed-off-by: Peter Zijlstra (Intel)
Cc: Linus Torvalds
Cc: Mike Galbraith
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: dietmar.eggemann@arm.com
Cc: freedom.tan@mediatek.com
Cc: keita.kobayashi.ym@renesas.com
Cc: mgalbraith@suse.de
Cc: sgurrappadi@nvidia.com
Cc: vincent.guittot@linaro.org
Cc: yuyang.du@intel.com
Link: http://lkml.kernel.org/r/1469453670-2660-6-git-send-email-morten.rasmussen@arm.com
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7f64e89a5873..d75024053e9b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1022,6 +1022,7 @@ extern void wake_up_q(struct wake_q_head *head);
 #define SD_BALANCE_FORK		0x0008	/* Balance on fork, clone */
 #define SD_BALANCE_WAKE		0x0010	/* Balance on wakeup */
 #define SD_WAKE_AFFINE		0x0020	/* Wake task to waking CPU */
+#define SD_ASYM_CPUCAPACITY	0x0040	/* Groups have different max cpu capacities */
 #define SD_SHARE_CPUCAPACITY	0x0080	/* Domain members share cpu capacity */
 #define SD_SHARE_POWERDOMAIN	0x0100	/* Domain members share power domain */
 #define SD_SHARE_PKG_RESOURCES	0x0200	/* Domain members share cpu pkg resources */
--
cgit v1.2.3
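As a sketch of how such a flag is meant to be consumed (illustrative
only, not part of this patch; for_each_domain() is the scheduler-internal
domain iterator), placement code can stop at the first level that has
the flag set instead of comparing CPU capacities at every parent level:

  /* Illustrative sketch: find the lowest level spanning the full range
   * of CPU capacities. Must run under rcu_read_lock(). */
  static struct sched_domain *lowest_asym_level(int cpu)
  {
  	struct sched_domain *sd;

  	for_each_domain(cpu, sd) {	/* walks sd and then its parents */
  		if (sd->flags & SD_ASYM_CPUCAPACITY)
  			return sd;
  	}
  	return NULL;	/* symmetric topology: no such level exists */
  }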
From 9af6528ee9b682df7f29dbee86fbba0b67eab944 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 13 Sep 2016 18:37:29 +0200
Subject: sched/core: Optimize __schedule()

Oleg noted that by making do_exit() use __schedule() for the TASK_DEAD
context switch, we can avoid the TASK_DEAD special case currently in
__schedule() because that avoids the extra preempt_disable() from
schedule().

In order to facilitate this, create a do_task_dead() helper which we
place in the scheduler code, such that it can access __schedule().

Also add some __noreturn annotations to the functions; there's no
coming back from do_exit().

Suggested-by: Oleg Nesterov
Signed-off-by: Peter Zijlstra (Intel)
Cc: Cheng Chao
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: akpm@linux-foundation.org
Cc: chris@chris-wilson.co.uk
Cc: tj@kernel.org
Link: http://lkml.kernel.org/r/20160913163729.GB5012@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar
---
 include/linux/kernel.h | 9 +++------
 include/linux/sched.h  | 2 ++
 2 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index d96a6118d26a..74fd6f05bc5b 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -259,17 +259,14 @@ static inline void might_fault(void) { }
 extern struct atomic_notifier_head panic_notifier_list;
 extern long (*panic_blink)(int state);
 __printf(1, 2)
-void panic(const char *fmt, ...)
-	__noreturn __cold;
+void panic(const char *fmt, ...) __noreturn __cold;
 void nmi_panic(struct pt_regs *regs, const char *msg);
 extern void oops_enter(void);
 extern void oops_exit(void);
 void print_oops_end_marker(void);
 extern int oops_may_print(void);
-void do_exit(long error_code)
-	__noreturn;
-void complete_and_exit(struct completion *, long)
-	__noreturn;
+void do_exit(long error_code) __noreturn;
+void complete_and_exit(struct completion *, long) __noreturn;
 
 /* Internal, do not use. */
 int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res);

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d75024053e9b..f00ee8e90a29 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -448,6 +448,8 @@ static inline void io_schedule(void)
 	io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
 }
 
+void __noreturn do_task_dead(void);
+
 struct nsproxy;
 struct user_namespace;
--
cgit v1.2.3

From 35a773a07926a22bf19d77ee00024522279c4e68 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 19 Sep 2016 12:57:53 +0200
Subject: sched/core: Avoid _cond_resched() for PREEMPT=y

On fully preemptible kernels _cond_resched() is pointless, so avoid
emitting any code for it.

Signed-off-by: Peter Zijlstra (Intel)
Cc: Linus Torvalds
Cc: Mikulas Patocka
Cc: Oleg Nesterov
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f00ee8e90a29..b99fcd1b341e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3209,7 +3209,11 @@ static inline int signal_pending_state(long state, struct task_struct *p)
  * cond_resched_lock() will drop the spinlock before scheduling,
  * cond_resched_softirq() will enable bhs before scheduling.
  */
+#ifndef CONFIG_PREEMPT
 extern int _cond_resched(void);
+#else
+static inline int _cond_resched(void) { return 0; }
+#endif
 
 #define cond_resched() ({			\
 	___might_sleep(__FILE__, __LINE__, 0);	\
--
cgit v1.2.3
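For callers nothing changes; the loop below is a generic illustration
(not from the patch, and process_item()/struct item are hypothetical):
cond_resched() can be sprinkled into long loops as before. On
CONFIG_PREEMPT=y kernels rescheduling already happens at the preemption
points, so the call now compiles down to just the might-sleep debug
check.

  #include <linux/list.h>
  #include <linux/sched.h>

  struct item {			/* hypothetical payload type */
  	struct list_head node;
  };

  /* Generic illustration: keep a long-running kernel loop
   * preemption-friendly on voluntary-preemption kernels; a no-op
   * beyond the debug check on CONFIG_PREEMPT=y after this patch. */
  static void process_all(struct list_head *items)
  {
  	struct item *it;

  	list_for_each_entry(it, items, node) {
  		process_item(it);	/* hypothetical helper */
  		cond_resched();
  	}
  }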
From 38a3e1fc1dac480f3672ab22fc97e1f995c80ed7 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Tue, 6 Sep 2016 16:00:47 +0200
Subject: sched/wait: Fix abort_exclusive_wait(), it should pass TASK_NORMAL to wake_up()

Otherwise this logic only works if mode is "compatible" with another
exclusive waiter. If some wq has both TASK_INTERRUPTIBLE and
TASK_UNINTERRUPTIBLE waiters, abort_exclusive_wait() won't wake an
uninterruptible waiter.

The main user is __wait_on_bit_lock() and currently it is fine but only
because TASK_KILLABLE includes TASK_UNINTERRUPTIBLE and we do not have
lock_page_interruptible() yet.

Just use TASK_NORMAL and remove the "mode" arg from
abort_exclusive_wait(). Yes, this means that (say)
wake_up_interruptible() can wake up the non-interruptible waiter(s),
but I think this is fine.

And in fact I think that abort_exclusive_wait() must die, see the next
change.

Signed-off-by: Oleg Nesterov
Signed-off-by: Peter Zijlstra (Intel)
Cc: Al Viro
Cc: Bart Van Assche
Cc: Johannes Weiner
Cc: Linus Torvalds
Cc: Mike Galbraith
Cc: Neil Brown
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/20160906140047.GA6157@redhat.com
Signed-off-by: Ingo Molnar
---
 include/linux/wait.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index c3ff74d764fa..e4cfd1ed726e 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -281,8 +281,8 @@ wait_queue_head_t *bit_waitqueue(void *, int);
 		if (___wait_is_interruptible(state) && __int) {		\
 			__ret = __int;					\
 			if (exclusive) {				\
-				abort_exclusive_wait(&wq, &__wait,	\
-						     state, NULL);	\
+				abort_exclusive_wait(&wq, &__wait,	\
+						     NULL);		\
 				goto __out;				\
 			}						\
 			break;						\
@@ -989,7 +989,7 @@ void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state);
 void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state);
 long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state);
 void finish_wait(wait_queue_head_t *q, wait_queue_t *wait);
-void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, unsigned int mode, void *key);
+void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, void *key);
 long wait_woken(wait_queue_t *wait, unsigned mode, long timeout);
 int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
 int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
--
cgit v1.2.3
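To make the lost-wakeup risk concrete, here is an illustrative sketch
(not code from the tree): a wakeup with a specific mode only matches
sleepers in a "compatible" state, while TASK_NORMAL, which is
TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, matches both kinds of
exclusive waiter.

  #include <linux/sched.h>
  #include <linux/wait.h>

  /* Illustrative sketch: if an interruptible exclusive waiter aborts
   * and re-wakes the queue with its own mode (TASK_INTERRUPTIBLE), a
   * TASK_UNINTERRUPTIBLE exclusive waiter on the same queue is skipped
   * and its wakeup is lost. Waking with TASK_NORMAL covers both. */
  static void abort_and_wake_any_mode(wait_queue_head_t *q)
  {
  	unsigned long flags;

  	spin_lock_irqsave(&q->lock, flags);
  	/* was: __wake_up_locked(q, mode, 1) with the aborter's own mode */
  	__wake_up_locked(q, TASK_NORMAL, 1);
  	spin_unlock_irqrestore(&q->lock, flags);
  }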
From b1ea06a90f528e516929a4da1d9b8838752bceb9 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Thu, 8 Sep 2016 18:48:15 +0200
Subject: sched/wait: Avoid abort_exclusive_wait() in ___wait_event()

___wait_event() doesn't really need abort_exclusive_wait(); we can
simply change prepare_to_wait_event() to remove the waiter from
q->task_list if it was interrupted.

This simplifies the code/logic, and this way prepare_to_wait_event()
can have more users, see the next change.

Signed-off-by: Oleg Nesterov
Signed-off-by: Peter Zijlstra (Intel)
Cc: Al Viro
Cc: Bart Van Assche
Cc: Johannes Weiner
Cc: Linus Torvalds
Cc: Mike Galbraith
Cc: Neil Brown
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/20160908164815.GA18801@redhat.com
Signed-off-by: Ingo Molnar
--
 include/linux/wait.h |  7 +------
 kernel/sched/wait.c  | 35 +++++++++++++++++++++++++----------
 2 files changed, 26 insertions(+), 16 deletions(-)
---
 include/linux/wait.h | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index e4cfd1ed726e..7261dcbe5afe 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -280,12 +280,7 @@ wait_queue_head_t *bit_waitqueue(void *, int);
 									\
 		if (___wait_is_interruptible(state) && __int) {		\
 			__ret = __int;					\
-			if (exclusive) {				\
-				abort_exclusive_wait(&wq, &__wait,	\
-						     NULL);		\
-				goto __out;				\
-			}						\
-			break;						\
+			goto __out;					\
 		}							\
 									\
 		cmd;							\
--
cgit v1.2.3

From eaf9ef52241b545fe63621266bfc6fd8b06559ff Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Tue, 6 Sep 2016 16:00:53 +0200
Subject: sched/wait: Avoid abort_exclusive_wait() in __wait_on_bit_lock()

__wait_on_bit_lock() doesn't need abort_exclusive_wait() either. Right
now it can't use prepare_to_wait_event() (see the next change), but it
can do the additional finish_wait() if action() fails.

abort_exclusive_wait() no longer has callers, remove it.

Signed-off-by: Oleg Nesterov
Signed-off-by: Peter Zijlstra (Intel)
Cc: Al Viro
Cc: Bart Van Assche
Cc: Johannes Weiner
Cc: Linus Torvalds
Cc: Mike Galbraith
Cc: Neil Brown
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/20160906140053.GA6164@redhat.com
Signed-off-by: Ingo Molnar
---
 include/linux/wait.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 7261dcbe5afe..19c75f9545ce 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -984,7 +984,6 @@ void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state);
 void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state);
 long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state);
 void finish_wait(wait_queue_head_t *q, wait_queue_t *wait);
-void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, void *key);
 long wait_woken(wait_queue_t *wait, unsigned mode, long timeout);
 int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
 int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
--
cgit v1.2.3
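The resulting shape of __wait_on_bit_lock() then looks roughly like the
sketch below. This is a simplification written from the description
above, not the actual kernel/sched/wait.c code, which handles the
memory-ordering subtleties differently: on action() failure the waiter
simply finish_wait()s itself instead of calling abort_exclusive_wait().

  /* Simplified sketch, not the actual implementation: */
  static int wait_on_bit_lock_sketch(wait_queue_head_t *wq,
  				   struct wait_bit_queue *q,
  				   wait_bit_action_f *action, unsigned mode)
  {
  	int ret = 0;

  	for (;;) {
  		prepare_to_wait_exclusive(wq, &q->wait, mode);
  		if (test_bit(q->key.bit_nr, q->key.flags))
  			ret = action(&q->key, mode);	/* may sleep or fail */
  		if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) {
  			ret = 0;	/* got the lock despite a late signal */
  			break;
  		}
  		if (ret)
  			break;		/* interrupted: give up */
  	}
  	finish_wait(wq, &q->wait);	/* the extra finish_wait() on failure */
  	return ret;
  }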
From 0176beaffbe9ed627b6a4dfa61d640f1a848086f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Tue, 6 Sep 2016 16:00:55 +0200
Subject: sched/wait: Introduce init_wait_entry()

The partial initialization of wait_queue_t in prepare_to_wait_event()
looks ugly. This was done to shrink .text, but we can simply add the
new helper which does the full initialization and shrink the compiled
code a bit more.

And this way prepare_to_wait_event() can have more users. In particular
we are ready to remove the signal_pending_state() checks from
wait_bit_action_f helpers and change __wait_on_bit_lock() to use
prepare_to_wait_event().

Signed-off-by: Oleg Nesterov
Signed-off-by: Peter Zijlstra (Intel)
Cc: Al Viro
Cc: Bart Van Assche
Cc: Johannes Weiner
Cc: Linus Torvalds
Cc: Mike Galbraith
Cc: Neil Brown
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Link: http://lkml.kernel.org/r/20160906140055.GA6167@redhat.com
Signed-off-by: Ingo Molnar
---
 include/linux/wait.h | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 19c75f9545ce..2408e8d5c05c 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -248,6 +248,8 @@ wait_queue_head_t *bit_waitqueue(void *, int);
 	(!__builtin_constant_p(state) ||				\
 		state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE)	\
 
+extern void init_wait_entry(wait_queue_t *__wait, int flags);
+
 /*
  * The below macro ___wait_event() has an explicit shadow of the __ret
  * variable when used from the wait_event_*() macros.
@@ -266,12 +268,7 @@ wait_queue_head_t *bit_waitqueue(void *, int);
 	wait_queue_t __wait;						\
 	long __ret = ret;	/* explicit shadow */			\
 									\
-	INIT_LIST_HEAD(&__wait.task_list);				\
-	if (exclusive)							\
-		__wait.flags = WQ_FLAG_EXCLUSIVE;			\
-	else								\
-		__wait.flags = 0;					\
-									\
+	init_wait_entry(&__wait, exclusive ? WQ_FLAG_EXCLUSIVE : 0);	\
 	for (;;) {							\
 		long __int = prepare_to_wait_event(&wq, &__wait, state);\
--
cgit v1.2.3
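For illustration (not from the patch; wait_for_cond() is hypothetical),
an open-coded wait loop built on the new helper might look like this.
With init_wait_entry() doing the full initialization up front,
prepare_to_wait_event() can assume a completely set up entry, and per
the earlier change it returns -ERESTARTSYS and dequeues the waiter
itself when a signal is pending:

  #include <linux/sched.h>
  #include <linux/wait.h>

  /* Sketch: open-coded equivalent of wait_event_interruptible(wq, cond()) */
  static int wait_for_cond(wait_queue_head_t *wq, bool (*cond)(void))
  {
  	wait_queue_t wait;
  	long err = 0;

  	init_wait_entry(&wait, 0);	/* 0: non-exclusive waiter */
  	for (;;) {
  		err = prepare_to_wait_event(wq, &wait, TASK_INTERRUPTIBLE);
  		if (cond())
  			break;
  		if (err)		/* -ERESTARTSYS, already dequeued */
  			break;
  		schedule();
  	}
  	finish_wait(wq, &wait);		/* safe even after -ERESTARTSYS */
  	return err;
  }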
From 24fc7edb92eea05946119cc0258c891c26b3b469 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 9 May 2016 10:37:59 +0200
Subject: sched/core: Introduce 'struct sched_domain_shared'

Since struct sched_domain is strictly per CPU, introduce a structure
that is shared between all 'identical' sched_domains.

Limit to SD_SHARE_PKG_RESOURCES domains for now, as we'll only use it
for shared cache state; if another use comes up later we can easily
relax this.

While the sched_groups are normally shared between CPUs, these are not
natural to use when we need some shared state on a domain level -- since
that would require the domain to have a parent, which is not a given.

Signed-off-by: Peter Zijlstra (Intel)
Cc: Linus Torvalds
Cc: Mike Galbraith
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b99fcd1b341e..8a878b9649a1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1067,6 +1067,10 @@ extern int sched_domain_level_max;
 
 struct sched_group;
 
+struct sched_domain_shared {
+	atomic_t	ref;
+};
+
 struct sched_domain {
 	/* These fields must be setup */
 	struct sched_domain *parent;	/* top domain must be null terminated */
@@ -1135,6 +1139,7 @@ struct sched_domain {
 		void *private;		/* used during construction */
 		struct rcu_head rcu;	/* used during destruction */
 	};
+	struct sched_domain_shared *shared;
 	unsigned int span_weight;
 
 	/*
@@ -1168,6 +1173,7 @@ typedef int (*sched_domain_flags_f)(void);
 
 struct sd_data {
 	struct sched_domain **__percpu sd;
+	struct sched_domain_shared **__percpu sds;
 	struct sched_group **__percpu sg;
 	struct sched_group_capacity **__percpu sgc;
 };
--
cgit v1.2.3

From 0e369d757578b23ac50b893f920aa50fdbc45fb6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 9 May 2016 10:38:01 +0200
Subject: sched/core: Replace sd_busy/nr_busy_cpus with sched_domain_shared

Move the nr_busy_cpus thing from its hacky sd->parent->groups->sgc
location into the much more natural sched_domain_shared location.

Signed-off-by: Peter Zijlstra (Intel)
Cc: Linus Torvalds
Cc: Mike Galbraith
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8a878b9649a1..98888f1a03bc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1069,6 +1069,7 @@ struct sched_group;
 
 struct sched_domain_shared {
 	atomic_t	ref;
+	atomic_t	nr_busy_cpus;
 };
--
cgit v1.2.3
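An illustrative use of the shared structure (a sketch, not code from
these patches; sd_llc_shared is the scheduler-internal per-CPU pointer
this series introduces, and update_llc_busy() is hypothetical): any
member CPU can update per-domain state directly, without the old
sd->parent->groups->sgc indirection.

  /* Sketch: track a CPU going busy/idle in the shared LLC state.
   * Assumes the caller holds rcu_read_lock(). */
  static void update_llc_busy(int cpu, bool busy)
  {
  	struct sched_domain_shared *sds;

  	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
  	if (!sds)
  		return;

  	if (busy)
  		atomic_inc(&sds->nr_busy_cpus);
  	else
  		atomic_dec(&sds->nr_busy_cpus);
  }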
From 10e2f1acd0106c05229f94c70a344ce3a2c8008b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 9 May 2016 10:38:05 +0200
Subject: sched/core: Rewrite and improve select_idle_siblings()

select_idle_siblings() is a known pain point for a number of workloads;
it either does too much or not enough, and sometimes just plain does the
wrong thing.

This rewrite attempts to address a number of issues (but sadly not all).

The current code does an unconditional sched_domain iteration, with the
intent of finding an idle core (on SMT hardware).

The problems which this patch tries to address are:

 - it's pointless to look for idle cores if the machine is really busy;
   at that point you're just wasting cycles.

 - its behaviour is inconsistent between SMT and !SMT hardware in that
   !SMT hardware ends up doing a scan for any idle CPU in the LLC
   domain, while SMT hardware does a scan for idle cores and if that
   fails, falls back to a scan for idle threads on the 'target' core.

The new code replaces the sched_domain scan with 3 explicit scans:

 1) search for an idle core in the LLC
 2) search for an idle CPU in the LLC
 3) search for an idle thread in the 'target' core

where 1 and 3 are conditional on SMT support and 1 and 2 have runtime
heuristics to skip the step.

Step 1) is conditional on sd_llc_shared->has_idle_cores; when a cpu
goes idle and sd_llc_shared->has_idle_cores is false, we scan all SMT
siblings of the CPU going idle. Similarly, we clear
sd_llc_shared->has_idle_cores when we fail to find an idle core.

Step 2) tracks the average cost of the scan and compares this to the
average idle time guestimate for the CPU doing the wakeup. There is a
significant fudge factor involved to deal with the variability of the
averages. Esp. hackbench was sensitive to this.

Step 3) is unconditional; we assume (also per step 1) that scanning all
SMT siblings in a core is 'cheap'.

With this, SMT systems gain step 2, which cures a few benchmarks --
notably one from Facebook.

One 'feature' of the sched_domain iteration, which we preserve in the
new code, is that it would start scanning from the 'target' CPU,
instead of scanning the cpumask in cpu id order. This avoids multiple
CPUs in the LLC scanning for idle to gang up and find the same CPU
quite as much. The down side is that tasks can end up hopping across
the LLC for no apparent reason.

Signed-off-by: Peter Zijlstra (Intel)
Cc: Linus Torvalds
Cc: Mike Galbraith
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 98888f1a03bc..2c30ed860d66 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1070,6 +1070,7 @@ struct sched_group;
 struct sched_domain_shared {
 	atomic_t	ref;
 	atomic_t	nr_busy_cpus;
+	int		has_idle_cores;
 };
 
 struct sched_domain {
@@ -1102,6 +1103,8 @@ struct sched_domain {
 	u64 max_newidle_lb_cost;
 	unsigned long next_decay_max_lb_cost;
 
+	u64 avg_scan_cost;		/* select_idle_sibling */
+
 #ifdef CONFIG_SCHEDSTATS
 	/* load_balance() stats */
 	unsigned int lb_count[CPU_MAX_IDLE_TYPES];
--
cgit v1.2.3
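Structurally, the three-step selection looks roughly like the sketch
below. This is not the actual fair.c code: scan_idle_core(),
scan_idle_cpu() and scan_idle_smt() are hypothetical stand-ins for the
three scans, and the heuristic gating (has_idle_cores, avg_scan_cost)
is assumed to live inside them.

  /* Structural sketch only; must run under rcu_read_lock(). */
  static int select_idle_sibling_sketch(struct task_struct *p, int target)
  {
  	struct sched_domain *sd = rcu_dereference(per_cpu(sd_llc, target));
  	int cpu;

  	if (!sd)
  		return target;

  	cpu = scan_idle_core(sd, target);   /* 1) idle core in the LLC (SMT only) */
  	if (cpu >= 0)
  		return cpu;

  	cpu = scan_idle_cpu(sd, target);    /* 2) idle CPU in the LLC */
  	if (cpu >= 0)
  		return cpu;

  	return scan_idle_smt(target);       /* 3) idle thread in 'target' core,
  					     * falling back to target itself */
  }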
From a458ae2ea616420f74480f0f5ed67ca0f3b5dbf7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 20 Sep 2016 20:29:40 +0200
Subject: sched/core, ia64: Rename set_curr_task()

Rename the ia64-only set_curr_task() function to free up the name.

Signed-off-by: Peter Zijlstra (Intel)
Cc: Linus Torvalds
Cc: Mike Galbraith
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: Tony Luck
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2c30ed860d66..ad51978ff15e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2581,7 +2581,7 @@ static inline bool is_idle_task(const struct task_struct *p)
 	return p->pid == 0;
 }
 
 extern struct task_struct *curr_task(int cpu);
-extern void set_curr_task(int cpu, struct task_struct *p);
+extern void ia64_set_curr_task(int cpu, struct task_struct *p);
 
 void yield(void);
--
cgit v1.2.3

From 68107df5f2cb5dc3785be40162bfe2f19a178bbb Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Mon, 26 Sep 2016 02:29:19 +0200
Subject: u64_stats: Introduce IRQs disabled helpers

Introduce light versions of the u64_stats helpers for contexts where
either preemption or IRQs are disabled.

This way we can make this library usable by the scheduler's irqtime
accounting, which currently implements its own ad-hoc version.

Signed-off-by: Frederic Weisbecker
Cc: Eric Dumazet
Cc: Linus Torvalds
Cc: Mike Galbraith
Cc: Paolo Bonzini
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Thomas Gleixner
Cc: Wanpeng Li
Link: http://lkml.kernel.org/r/1474849761-12678-4-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar
---
 include/linux/u64_stats_sync.h | 45 ++++++++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h
index d3a2bb712af3..650f3dd6b800 100644
--- a/include/linux/u64_stats_sync.h
+++ b/include/linux/u64_stats_sync.h
@@ -103,31 +103,42 @@ static inline void u64_stats_update_end_raw(struct u64_stats_sync *syncp)
 #endif
 }
 
-static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
+static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
 {
 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
 	return read_seqcount_begin(&syncp->seq);
 #else
-#if BITS_PER_LONG==32
-	preempt_disable();
-#endif
 	return 0;
 #endif
 }
 
-static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
+static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
+{
+#if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
+	preempt_disable();
+#endif
+	return __u64_stats_fetch_begin(syncp);
+}
+
+static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
 					 unsigned int start)
 {
 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
 	return read_seqcount_retry(&syncp->seq, start);
 #else
-#if BITS_PER_LONG==32
-	preempt_enable();
-#endif
 	return false;
 #endif
 }
 
+static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
+					 unsigned int start)
+{
+#if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
+	preempt_enable();
+#endif
+	return __u64_stats_fetch_retry(syncp, start);
+}
+
 /*
  * In case irq handlers can update u64 counters, readers can use following helpers
  * - SMP 32bit arches use seqcount protection, irq safe.
@@ -136,27 +147,19 @@ static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
  */
 static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp)
 {
-#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
-	return read_seqcount_begin(&syncp->seq);
-#else
-#if BITS_PER_LONG==32
+#if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
 	local_irq_disable();
 #endif
-	return 0;
-#endif
+	return __u64_stats_fetch_begin(syncp);
 }
 
 static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp,
-					     unsigned int start)
+					     unsigned int start)
 {
-#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
-	return read_seqcount_retry(&syncp->seq, start);
-#else
-#if BITS_PER_LONG==32
+#if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
 	local_irq_enable();
 #endif
-	return false;
-#endif
+	return __u64_stats_fetch_retry(syncp, start);
 }
 
 #endif /* _LINUX_U64_STATS_SYNC_H */
--
cgit v1.2.3
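For context, the usual reader pattern built on this API looks like the
generic sketch below (not from the patch; struct my_stats and
read_packets() are hypothetical). An irq-safe caller retries until it
observes a consistent 64-bit snapshot; code that already runs with IRQs
disabled can now build the same loop directly on the new __ variants.

  #include <linux/u64_stats_sync.h>

  /* Hypothetical counter structure protected by a u64_stats_sync. */
  struct my_stats {
  	u64			packets;
  	struct u64_stats_sync	syncp;
  };

  /* Generic reader sketch: retry until the writer-side sequence is
   * unchanged, yielding a consistent 64-bit value even on 32-bit SMP. */
  static u64 read_packets(struct my_stats *stats)
  {
  	unsigned int start;
  	u64 val;

  	do {
  		start = u64_stats_fetch_begin_irq(&stats->syncp);
  		val = stats->packets;
  	} while (u64_stats_fetch_retry_irq(&stats->syncp, start));

  	return val;
  }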