From 50a323b73069b169385a8ac65633dee837a7d13f Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 8 Jun 2010 21:40:36 +0200
Subject: sched: define and use CPU_PRI_* enums for cpu notifier priorities

Instead of hardcoding priorities 10 and 20 in sched and perf, collect
them into CPU_PRI_* enums.

Signed-off-by: Tejun Heo
Acked-by: Peter Zijlstra
Cc: Rusty Russell
Cc: Paul Mackerras
Cc: Ingo Molnar
Cc: Arnaldo Carvalho de Melo
---
 include/linux/cpu.h        | 9 +++++++++
 include/linux/perf_event.h | 2 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index e287863ac053..2d9073883ea9 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -48,6 +48,15 @@ extern ssize_t arch_cpu_release(const char *, size_t);
 #endif
 struct notifier_block;
 
+/*
+ * CPU notifier priorities.
+ */
+enum {
+	/* migration should happen before other stuff but after perf */
+	CPU_PRI_PERF		= 20,
+	CPU_PRI_MIGRATION	= 10,
+};
+
 #ifdef CONFIG_SMP
 /* Need to know about CPUs going up/down? */
 #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 5d0266d94985..469e03e96fe7 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1068,7 +1068,7 @@ static inline void perf_event_disable(struct perf_event *event)	{ }
 #define perf_cpu_notifier(fn)					\
 do {								\
 	static struct notifier_block fn##_nb __cpuinitdata =	\
-		{ .notifier_call = fn, .priority = 20 };	\
+		{ .notifier_call = fn, .priority = CPU_PRI_PERF }; \
 	fn(&fn##_nb, (unsigned long)CPU_UP_PREPARE,		\
 		(void *)(unsigned long)smp_processor_id());	\
 	fn(&fn##_nb, (unsigned long)CPU_STARTING,		\
--
cgit v1.2.3

From 3a101d0548e925ab16ca6aaa8cf4f767d322ddb0 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 8 Jun 2010 21:40:36 +0200
Subject: sched: adjust when cpu_active and cpuset configurations are updated
 during cpu on/offlining

Currently, when a cpu goes down, cpu_active is cleared before
CPU_DOWN_PREPARE starts and the cpuset configuration is updated from a
default priority cpu notifier.  When a cpu is coming up, cpu_active is
set before CPU_ONLINE but the cpuset configuration is again updated
from the same cpu notifier.

For cpu notifiers, this presents an inconsistent state.  Threads which
a CPU_DOWN_PREPARE notifier expects to be bound to the CPU can be
migrated to other cpus because the cpu is already marked inactive.

Fix it by updating cpu_active in the highest priority cpu notifier and
the cpuset configuration in the second highest when a cpu is coming up.
The down path is updated similarly.  This guarantees that all other cpu
notifiers see a consistent cpu_active and cpuset configuration.

The cpuset_track_online_cpus() notifier is converted to
cpuset_update_active_cpus(), which just updates the configuration and
is now called from the cpuset_cpu_[in]active() notifiers registered
from sched_init_smp().  If cpuset is disabled,
cpuset_update_active_cpus() degenerates into partition_sched_domains(),
making a separate notifier for !CONFIG_CPUSETS unnecessary.

This problem is triggered by cmwq.  During CPU_DOWN_PREPARE, a hotplug
callback creates a kthread and kthread_bind()s it to the target cpu,
and the thread is expected to run on that cpu.

* Ingo's test discovered that the __cpuinit/exit markups were
  incorrect.  Fixed.
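
Illustration only, not part of this patch: the cpuset_cpu_[in]active()
notifiers mentioned above might be wired up roughly as follows.  The callback
bodies, the helper name register_cpuset_cpu_notifiers() and the use of
hotcpu_notifier() are assumptions for the sketch, not code from this series.

#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/notifier.h>
#include <linux/init.h>

/* Runs during CPU_ONLINE/CPU_DOWN_FAILED, i.e. after the scheduler has
 * already marked the cpu active at CPU_PRI_SCHED_ACTIVE. */
static int __cpuinit cpuset_cpu_active(struct notifier_block *nfb,
				       unsigned long action, void *hcpu)
{
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_ONLINE:
	case CPU_DOWN_FAILED:
		cpuset_update_active_cpus();
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}

/* Runs during CPU_DOWN_PREPARE, right after the scheduler has cleared
 * cpu_active at CPU_PRI_SCHED_INACTIVE. */
static int __cpuinit cpuset_cpu_inactive(struct notifier_block *nfb,
					 unsigned long action, void *hcpu)
{
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_DOWN_PREPARE:
		cpuset_update_active_cpus();
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}

/* Hypothetical registration helper, e.g. called from sched_init_smp(). */
static void __init register_cpuset_cpu_notifiers(void)
{
	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
}

With this split, every other notifier on both the up and the down path sees
cpu_active and the cpuset configuration agree with each other.
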
Signed-off-by: Tejun Heo
Acked-by: Peter Zijlstra
Cc: Rusty Russell
Cc: Ingo Molnar
Cc: Paul Menage
---
 include/linux/cpu.h    | 16 ++++++++++++++++
 include/linux/cpuset.h |  6 ++++++
 2 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 2d9073883ea9..de6b1722cdca 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -52,6 +52,22 @@ struct notifier_block;
  * CPU notifier priorities.
  */
 enum {
+	/*
+	 * SCHED_ACTIVE marks a cpu which is coming up active during
+	 * CPU_ONLINE and CPU_DOWN_FAILED and must be the first
+	 * notifier.  CPUSET_ACTIVE adjusts cpuset according to
+	 * cpu_active mask right after SCHED_ACTIVE.  During
+	 * CPU_DOWN_PREPARE, SCHED_INACTIVE and CPUSET_INACTIVE are
+	 * ordered in the similar way.
+	 *
+	 * This ordering guarantees consistent cpu_active mask and
+	 * migration behavior to all cpu notifiers.
+	 */
+	CPU_PRI_SCHED_ACTIVE	= INT_MAX,
+	CPU_PRI_CPUSET_ACTIVE	= INT_MAX - 1,
+	CPU_PRI_SCHED_INACTIVE	= INT_MIN + 1,
+	CPU_PRI_CPUSET_INACTIVE	= INT_MIN,
+
 	/* migration should happen before other stuff but after perf */
 	CPU_PRI_PERF		= 20,
 	CPU_PRI_MIGRATION	= 10,

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 457ed765a116..f20eb8f16025 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -20,6 +20,7 @@ extern int number_of_cpusets;	/* How many cpusets are defined in system? */
 
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
+extern void cpuset_update_active_cpus(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
 extern int cpuset_cpus_allowed_fallback(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
@@ -132,6 +133,11 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 
+static inline void cpuset_update_active_cpus(void)
+{
+	partition_sched_domains(1, NULL, NULL);
+}
+
 static inline void cpuset_cpus_allowed(struct task_struct *p,
 				       struct cpumask *mask)
 {
--
cgit v1.2.3

From 21aa9af03d06cb1d19a3738e5cf12acff984e69b Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 8 Jun 2010 21:40:37 +0200
Subject: sched: add hooks for workqueue

Concurrency managed workqueue needs to know when workers are going to
sleep and waking up.  Using these two hooks, cmwq keeps track of the
current concurrency level, throttles execution of new works if it's
too high, and wakes up another worker from the sleep hook if it
becomes too low.

This patch introduces PF_WQ_WORKER to identify workqueue workers and
adds the following two hooks.

* wq_worker_waking_up(): called when a worker is woken up.

* wq_worker_sleeping(): called when a worker is going to sleep and may
  return a pointer to a local task which should be woken up.  The
  returned task is woken up using try_to_wake_up_local(), a simplified
  ttwu which is called under the rq lock and can only wake up local
  tasks.

Both hooks are currently defined as noops in kernel/workqueue_sched.h.
A later cmwq implementation will replace them with proper
implementations.  These hooks are hard coded as they'll always be
enabled.
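
Illustration only, not part of this patch: based on the description above,
the no-op hooks in kernel/workqueue_sched.h would look roughly like the
sketch below.  The signatures are inferred from the changelog, not copied
from the patch, and the actual header may differ.

#include <linux/sched.h>

/* Called from the scheduler when a worker task is woken up. */
static inline void wq_worker_waking_up(struct task_struct *task,
				       unsigned int cpu)
{
}

/* Called from the scheduler when a worker task is about to sleep; may
 * return another local task that should be woken to keep the per-cpu
 * concurrency level up.  The stub has nothing to wake. */
static inline struct task_struct *wq_worker_sleeping(struct task_struct *task,
						     unsigned int cpu)
{
	return NULL;
}
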
Signed-off-by: Tejun Heo
Acked-by: Peter Zijlstra
Cc: Mike Galbraith
Cc: Ingo Molnar
---
 include/linux/sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f118809c953f..edc3dd168d87 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1696,6 +1696,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 #define PF_EXITING	0x00000004	/* getting shut down */
 #define PF_EXITPIDONE	0x00000008	/* pi exit done on shut down */
 #define PF_VCPU		0x00000010	/* I'm a virtual CPU */
+#define PF_WQ_WORKER	0x00000020	/* I'm a workqueue worker */
 #define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */
 #define PF_MCE_PROCESS	0x00000080	/* process policy on mce errors */
 #define PF_SUPERPRIV	0x00000100	/* used super-user privileges */
--
cgit v1.2.3

From c676329abb2b8359d9a5d734dec0c81779823fd6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 25 May 2010 10:48:51 +0200
Subject: sched_clock: Add local_clock() API and improve documentation

For people who otherwise get to write: cpu_clock(smp_processor_id()),
there is now: local_clock().

Also, as per suggestion from Andrew, provide some documentation on the
various clock interfaces, and minimize the unsigned long long vs u64
mess.

Signed-off-by: Peter Zijlstra
Cc: Andrew Morton
Cc: Linus Torvalds
Cc: Jens Axboe
LKML-Reference: <1275052414.1645.52.camel@laptop>
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index edc3dd168d87..c2d4316a04bb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1791,20 +1791,23 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
 #endif
 
 /*
- * Architectures can set this to 1 if they have specified
- * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
- * but then during bootup it turns out that sched_clock()
- * is reliable after all:
+ * Do not use outside of architecture code which knows its limitations.
+ *
+ * sched_clock() has no promise of monotonicity or bounded drift between
+ * CPUs, use (which you should not) requires disabling IRQs.
+ *
+ * Please use one of the three interfaces below.
  */
-#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-extern int sched_clock_stable;
-#endif
-
-/* ftrace calls sched_clock() directly */
 extern unsigned long long notrace sched_clock(void);
 
+/*
+ * See the comment in kernel/sched_clock.c
+ */
+extern u64 cpu_clock(int cpu);
+extern u64 local_clock(void);
+extern u64 sched_clock_cpu(int cpu);
+
 extern void sched_clock_init(void);
-extern u64 sched_clock_cpu(int cpu);
 
 #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
 static inline void sched_clock_tick(void)
@@ -1819,17 +1822,19 @@ static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
 {
 }
 #else
+/*
+ * Architectures can set this to 1 if they have specified
+ * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
+ * but then during bootup it turns out that sched_clock()
+ * is reliable after all:
+ */
+extern int sched_clock_stable;
+
 extern void sched_clock_tick(void);
 extern void sched_clock_idle_sleep_event(void);
 extern void sched_clock_idle_wakeup_event(u64 delta_ns);
 #endif
 
-/*
- * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
- * clock constructed from sched_clock():
- */
-extern unsigned long long cpu_clock(int cpu);
-
 extern unsigned long long task_sched_runtime(struct task_struct *task);
 extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
 
--
cgit v1.2.3

From 83cd4fe27ad8446619b2e030b171b858501de87d Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi
Date: Fri, 21 May 2010 17:09:41 -0700
Subject: sched: Change nohz idle load balancing logic to push model

In the new push model, all idle CPUs indeed go into nohz mode.  There
is still the concept of an idle load balancer (performing the load
balancing on behalf of all the idle CPUs in the system).  A busy CPU
kicks the nohz balancer when any of the nohz CPUs need idle load
balancing.  The kicked CPU does the idle load balancing on behalf of
all idle CPUs instead of the normal idle balance.

This addresses the two problems below with the current nohz ilb logic:

* The idle load balancer continued to have periodic ticks during idle
  and woke up frequently, even though it did not have any rebalancing
  to do on behalf of any of the idle CPUs.

* On x86, and on CPUs where the APIC timer stops in idle, this periodic
  wakeup can result in a periodic additional interrupt on a CPU doing
  the timer broadcast.

Also, currently we are migrating unpinned timers from an idle CPU to
the CPU doing idle load balancing (when all the CPUs in the system are
idle, there is no idle load balancing CPU and timers get added to the
same idle CPU where the request was made, so the existing optimization
works only on a semi-idle system).  In a semi-idle system, we no longer
have periodic ticks on the idle load balancer CPU.  Using that CPU will
add more delay to the timers than intended (as that CPU's timer base
may not be up to date wrt jiffies etc).  This was causing mysterious
slowdowns during boot etc.

For now, in the semi-idle case, use the nearest busy CPU for migrating
timers from an idle CPU.  This is good for power savings anyway.
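
Illustration only, not part of this patch: roughly how the reworked
interface (shown in the diff below) is meant to be driven.  The wrapper
function names here are hypothetical; only select_nohz_load_balancer() and
get_nohz_timer_target() come from the patch.

#include <linux/sched.h>

/* A CPU that stops its tick and enters nohz idle announces itself, so a
 * busy CPU can later kick it to run idle load balancing for everyone. */
static void example_enter_nohz_idle(void)
{
	select_nohz_load_balancer(1);	/* stop_tick == 1: joining nohz idle */
}

/* On leaving nohz idle the CPU withdraws again. */
static void example_exit_nohz_idle(void)
{
	select_nohz_load_balancer(0);	/* stop_tick == 0: tick restarts */
}

/* Unpinned timers raised on an idle CPU are migrated to the nearest busy
 * CPU rather than to a (possibly tickless) idle load balancer. */
static int example_pick_timer_cpu(void)
{
	return get_nohz_timer_target();
}
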
Signed-off-by: Venkatesh Pallipadi
Signed-off-by: Suresh Siddha
Signed-off-by: Peter Zijlstra
Cc: Thomas Gleixner
LKML-Reference: <1274486981.2840.46.camel@sbs-t61.sc.intel.com>
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c2d4316a04bb..a3e5b1cd0438 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -271,14 +271,11 @@ extern int runqueue_is_locked(int cpu);
 
 extern cpumask_var_t nohz_cpu_mask;
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
-extern int select_nohz_load_balancer(int cpu);
-extern int get_nohz_load_balancer(void);
+extern void select_nohz_load_balancer(int stop_tick);
+extern int get_nohz_timer_target(void);
 extern int nohz_ratelimit(int cpu);
 #else
-static inline int select_nohz_load_balancer(int cpu)
-{
-	return 0;
-}
+static inline void select_nohz_load_balancer(int stop_tick) { }
 
 static inline int nohz_ratelimit(int cpu)
 {
--
cgit v1.2.3

From 9d5efe05eb0c904545a28b19c18b949f23334de0 Mon Sep 17 00:00:00 2001
From: Srivatsa Vaddagiri
Date: Tue, 8 Jun 2010 14:57:02 +1000
Subject: sched: Fix capacity calculations for SMT4

Handle cpu capacity being reported as 0 on cores with a larger number
of hardware threads.  For example, on a Power7 core with 4 hardware
threads, core power is 1177 and thus the power of each hardware thread
is 1177/4 = 294.  This low power can lead to the capacity of each
hardware thread being calculated as 0, which leads to tasks bouncing
within the core madly!

Fix this by reporting the capacity of hardware threads as 1, provided
their power is not scaled down significantly because of frequency
scaling or real-time tasks' usage of the cpu.

Signed-off-by: Srivatsa Vaddagiri
Signed-off-by: Michael Neuling
Signed-off-by: Peter Zijlstra
Cc: Arjan van de Ven
LKML-Reference: <20100608045702.21D03CC895@localhost.localdomain>
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a3e5b1cd0438..c731296e5e93 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -857,7 +857,7 @@ struct sched_group {
	 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
	 * single CPU.
	 */
-	unsigned int cpu_power;
+	unsigned int cpu_power, cpu_power_orig;
 
	/*
	 * The CPUs this group covers.
--
cgit v1.2.3

From 532cb4c401e225b084c14d6bd6a2f8ee561de2f1 Mon Sep 17 00:00:00 2001
From: Michael Neuling
Date: Tue, 8 Jun 2010 14:57:02 +1000
Subject: sched: Add asymmetric group packing option for sibling domain

Check to see if the group is packed in a sched domain.  This is
primarily intended to be used at the sibling level.  Some cores, like
POWER7, prefer to use lower-numbered SMT threads.  In the case of
POWER7, it can move to lower SMT modes only when higher threads are
idle.  When in lower SMT modes, the threads will perform better since
they share fewer core resources.  Hence when we have idle threads, we
want them to be the higher ones.

This adds a hook into f_b_g() called check_asym_packing() to check for
this packing.  This packing function is run on idle threads.  It checks
to see if the busiest CPU in this domain (a core in the P7 case) has a
higher CPU number than the one the packing function is being run on.
If it does, calculate the imbalance and return the higher, busier
thread as the busiest group to f_b_g().
Here we are assuming that a lower CPU number will be equivalent to a
lower SMT thread number.

It also creates a new SD_ASYM_PACKING flag to enable this feature at
any scheduler domain level.

It also creates an arch hook to enable this feature at the sibling
level.  The default function doesn't enable this feature.

Based heavily on a patch from Peter Zijlstra.
Fixes from Srivatsa Vaddagiri.

Signed-off-by: Michael Neuling
Signed-off-by: Srivatsa Vaddagiri
Signed-off-by: Peter Zijlstra
Cc: Arjan van de Ven
Cc: "H. Peter Anvin"
Cc: Thomas Gleixner
LKML-Reference: <20100608045702.2936CCC897@localhost.localdomain>
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h    | 4 +++-
 include/linux/topology.h | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c731296e5e93..ff154e10752b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -801,7 +801,7 @@ enum cpu_idle_type {
 #define SD_POWERSAVINGS_BALANCE	0x0100	/* Balance for power savings */
 #define SD_SHARE_PKG_RESOURCES	0x0200	/* Domain members share cpu pkg resources */
 #define SD_SERIALIZE		0x0400	/* Only a single load balancing instance */
-
+#define SD_ASYM_PACKING		0x0800	/* Place busy groups earlier in the domain */
 #define SD_PREFER_SIBLING	0x1000	/* Prefer to place tasks in a sibling domain */
 
 enum powersavings_balance_level {
@@ -836,6 +836,8 @@ static inline int sd_balance_for_package_power(void)
 	return SD_PREFER_SIBLING;
 }
 
+extern int __weak arch_sd_sibiling_asym_packing(void);
+
 /*
  * Optimise SD flags for power savings:
  * SD_BALANCE_NEWIDLE helps agressive task consolidation and power savings.

diff --git a/include/linux/topology.h b/include/linux/topology.h
index c44df50a05ab..cf57f30d0dcb 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -103,6 +103,7 @@ int arch_update_cpu_topology(void);
 				| 1*SD_SHARE_PKG_RESOURCES		\
 				| 0*SD_SERIALIZE			\
 				| 0*SD_PREFER_SIBLING			\
+				| arch_sd_sibiling_asym_packing()	\
 				,					\
 				.last_balance = jiffies,		\
 				.balance_interval = 1,			\
--
cgit v1.2.3

From 2ec57d448b2e8fcfba539a46701b43f14f037f17 Mon Sep 17 00:00:00 2001
From: Michael Neuling
Date: Tue, 29 Jun 2010 12:02:01 +1000
Subject: sched: Fix spelling of sibling

No logic changes, only spelling.

Signed-off-by: Michael Neuling
Cc: linuxppc-dev@ozlabs.org
Cc: David Howells
Cc: Peter Zijlstra
LKML-Reference: <15249.1277776921@neuling.org>
Signed-off-by: Ingo Molnar
---
 include/linux/topology.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/topology.h b/include/linux/topology.h
index cf57f30d0dcb..b572e432d2f3 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -103,7 +103,7 @@ int arch_update_cpu_topology(void);
 				| 1*SD_SHARE_PKG_RESOURCES		\
 				| 0*SD_SERIALIZE			\
 				| 0*SD_PREFER_SIBLING			\
-				| arch_sd_sibiling_asym_packing()	\
+				| arch_sd_sibling_asym_packing()	\
 				,					\
 				.last_balance = jiffies,		\
 				.balance_interval = 1,			\
--
cgit v1.2.3

From 396e894d289d69bacf5acd983c97cd6e21a14c08 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 9 Jul 2010 15:12:27 +0200
Subject: sched: Revert nohz_ratelimit() for now

Norbert reported that nohz_ratelimit() causes his laptop to burn about
4W (40%) extra.

For now back out the change and see if we can adjust the power
management code to make better decisions.
Reported-by: Norbert Preining
Signed-off-by: Peter Zijlstra
Acked-by: Mike Galbraith
Cc: Arjan van de Ven
LKML-Reference:
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 747fcaedddb7..6e0bb86de990 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -273,17 +273,11 @@ extern cpumask_var_t nohz_cpu_mask;
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
 extern int select_nohz_load_balancer(int cpu);
 extern int get_nohz_load_balancer(void);
-extern int nohz_ratelimit(int cpu);
 #else
 static inline int select_nohz_load_balancer(int cpu)
 {
 	return 0;
 }
-
-static inline int nohz_ratelimit(int cpu)
-{
-	return 0;
-}
 #endif
 
 /*
--
cgit v1.2.3
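
Illustration only, not taken from any of the patches above: an architecture
that prefers to keep lower-numbered SMT threads busy could opt in to the
asymmetric packing feature by overriding the weak arch hook, spelled here as
in the corrected topology.h patch.  The feature-detection predicate is a
placeholder, not a real kernel API.

#include <linux/sched.h>

/* Placeholder for a real hardware/firmware check (e.g. "this core runs
 * faster when only its lower-numbered threads are busy"). */
static bool smt_prefers_lower_threads(void)
{
	return false;
}

/* Overrides the __weak default, which leaves the feature disabled; the
 * returned flag is OR'ed into the sibling-level sched domain flags. */
int arch_sd_sibling_asym_packing(void)
{
	if (smt_prefers_lower_threads())
		return SD_ASYM_PACKING;
	return 0;
}
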