From 7b3c92b85a65c2db1f542265bc98e1f9e3056eba Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 4 Jul 2019 15:13:23 -0700 Subject: sched/core: Convert get_task_struct() to return the task Returning the pointer that was passed in allows us to write slightly more idiomatic code. Convert a few users. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190704221323.24290-1-willy@infradead.org Signed-off-by: Ingo Molnar --- include/linux/sched/task.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 0497091e40c1..3d90ed8f75f0 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -105,7 +105,11 @@ extern void sched_exec(void); #define sched_exec() {} #endif -#define get_task_struct(tsk) do { refcount_inc(&(tsk)->usage); } while(0) +static inline struct task_struct *get_task_struct(struct task_struct *t) +{ + refcount_inc(&t->usage); + return t; +} extern void __put_task_struct(struct task_struct *t); -- cgit v1.2.3 From c22645f4c8f021fb1c5e7189eb1f968132cc0844 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 19 Jul 2019 15:59:53 +0200 Subject: sched/topology: Add partition_sched_domains_locked() Introduce the partition_sched_domains_locked() function by taking the mutex locking code out of the original function. That way the work done by partition_sched_domains_locked() can be reused without dropping the mutex lock. No change of functionality is introduced by this patch. Tested-by: Dietmar Eggemann Signed-off-by: Mathieu Poirier Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bristot@redhat.com Cc: claudio@evidence.eu.com Cc: lizefan@huawei.com Cc: longman@redhat.com Cc: luca.abeni@santannapisa.it Cc: rostedt@goodmis.org Cc: tommaso.cucinotta@santannapisa.it Link: https://lkml.kernel.org/r/20190719140000.31694-2-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- include/linux/sched/topology.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 7863bb62d2ab..f341163fedc9 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -150,6 +150,10 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd) return to_cpumask(sd->span); } +extern void partition_sched_domains_locked(int ndoms_new, + cpumask_var_t doms_new[], + struct sched_domain_attr *dattr_new); + extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new); @@ -194,6 +198,12 @@ extern void set_sched_topology(struct sched_domain_topology_level *tl); struct sched_domain_attr; +static inline void +partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], + struct sched_domain_attr *dattr_new) +{ +} + static inline void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) -- cgit v1.2.3 From f9a25f776d780bfa3279f0b6e5f5cf3224997976 Mon Sep 17 00:00:00 2001 From: Mathieu Poirier Date: Fri, 19 Jul 2019 15:59:55 +0200 Subject: cpusets: Rebuild root domain deadline accounting information When the topology of root domains is modified by CPUset or CPUhotplug operations information about the current deadline bandwidth held in the root domain is lost. This patch addresses the issue by recalculating the lost deadline bandwidth information by circling through the deadline tasks held in CPUsets and adding their current load to the root domain they are associated with. Tested-by: Dietmar Eggemann Signed-off-by: Mathieu Poirier Signed-off-by: Juri Lelli [ Various additional modifications. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bristot@redhat.com Cc: claudio@evidence.eu.com Cc: lizefan@huawei.com Cc: longman@redhat.com Cc: luca.abeni@santannapisa.it Cc: rostedt@goodmis.org Cc: tj@kernel.org Cc: tommaso.cucinotta@santannapisa.it Link: https://lkml.kernel.org/r/20190719140000.31694-4-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- include/linux/cgroup.h | 1 + include/linux/sched.h | 5 +++++ include/linux/sched/deadline.h | 8 ++++++++ 3 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f6b048902d6c..3ba3e6da13a6 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -150,6 +150,7 @@ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp); +void cgroup_enable_task_cg_lists(void); void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it); struct task_struct *css_task_iter_next(struct css_task_iter *it); diff --git a/include/linux/sched.h b/include/linux/sched.h index 9f51932bd543..b94ad92dfbe6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -295,6 +295,11 @@ enum uclamp_id { UCLAMP_CNT }; +#ifdef CONFIG_SMP +extern struct root_domain def_root_domain; +extern struct mutex sched_domains_mutex; +#endif + struct sched_info { #ifdef CONFIG_SCHED_INFO /* Cumulative counters: */ diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index 0cb034331cbb..1aff00b65f3c 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -24,3 +24,11 @@ static inline bool dl_time_before(u64 a, u64 b) { return (s64)(a - b) < 0; } + +#ifdef CONFIG_SMP + +struct root_domain; +extern void dl_add_task_root_domain(struct task_struct *p); +extern void dl_clear_root_domain(struct root_domain *rd); + +#endif /* CONFIG_SMP */ -- cgit v1.2.3 From d74b27d63a8bebe2fe634944e4ebdc7b10db7a39 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 19 Jul 2019 15:59:58 +0200 Subject: cgroup/cpuset: Change cpuset_rwsem and hotplug lock order cpuset_rwsem is going to be acquired from sched_setscheduler() with a following patch. There are however paths (e.g., spawn_ksoftirqd) in which sched_scheduler() is eventually called while holding hotplug lock; this creates a dependecy between hotplug lock (to be always acquired first) and cpuset_rwsem (to be always acquired after hotplug lock). Fix paths which currently take the two locks in the wrong order (after a following patch is applied). Tested-by: Dietmar Eggemann Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bristot@redhat.com Cc: claudio@evidence.eu.com Cc: lizefan@huawei.com Cc: longman@redhat.com Cc: luca.abeni@santannapisa.it Cc: mathieu.poirier@linaro.org Cc: rostedt@goodmis.org Cc: tj@kernel.org Cc: tommaso.cucinotta@santannapisa.it Link: https://lkml.kernel.org/r/20190719140000.31694-7-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- include/linux/cpuset.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 934633a05d20..7f1478c26a33 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -40,14 +40,14 @@ static inline bool cpusets_enabled(void) static inline void cpuset_inc(void) { - static_branch_inc(&cpusets_pre_enable_key); - static_branch_inc(&cpusets_enabled_key); + static_branch_inc_cpuslocked(&cpusets_pre_enable_key); + static_branch_inc_cpuslocked(&cpusets_enabled_key); } static inline void cpuset_dec(void) { - static_branch_dec(&cpusets_enabled_key); - static_branch_dec(&cpusets_pre_enable_key); + static_branch_dec_cpuslocked(&cpusets_enabled_key); + static_branch_dec_cpuslocked(&cpusets_pre_enable_key); } extern int cpuset_init(void); -- cgit v1.2.3 From 710da3c8ea7dfbd327920afd3831d8c82c42789d Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 19 Jul 2019 16:00:00 +0200 Subject: sched/core: Prevent race condition between cpuset and __sched_setscheduler() No synchronisation mechanism exists between the cpuset subsystem and calls to function __sched_setscheduler(). As such, it is possible that new root domains are created on the cpuset side while a deadline acceptance test is carried out in __sched_setscheduler(), leading to a potential oversell of CPU bandwidth. Grab cpuset_rwsem read lock from core scheduler, so to prevent situations such as the one described above from happening. The only exception is normalize_rt_tasks() which needs to work under tasklist_lock and can't therefore grab cpuset_rwsem. We are fine with this, as this function is only called by sysrq and, if that gets triggered, DEADLINE guarantees are already gone out of the window anyway. Tested-by: Dietmar Eggemann Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bristot@redhat.com Cc: claudio@evidence.eu.com Cc: lizefan@huawei.com Cc: longman@redhat.com Cc: luca.abeni@santannapisa.it Cc: mathieu.poirier@linaro.org Cc: rostedt@goodmis.org Cc: tj@kernel.org Cc: tommaso.cucinotta@santannapisa.it Link: https://lkml.kernel.org/r/20190719140000.31694-9-juri.lelli@redhat.com Signed-off-by: Ingo Molnar --- include/linux/cpuset.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 7f1478c26a33..04c20de66afc 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -55,6 +55,8 @@ extern void cpuset_init_smp(void); extern void cpuset_force_rebuild(void); extern void cpuset_update_active_cpus(void); extern void cpuset_wait_for_hotplug(void); +extern void cpuset_read_lock(void); +extern void cpuset_read_unlock(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern void cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); @@ -176,6 +178,9 @@ static inline void cpuset_update_active_cpus(void) static inline void cpuset_wait_for_hotplug(void) { } +static inline void cpuset_read_lock(void) { } +static inline void cpuset_read_unlock(void) { } + static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) { -- cgit v1.2.3 From c1a280b68d4e6b6db4a65aa7865c22d8789ddf09 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Jul 2019 23:19:37 +0200 Subject: sched/preempt: Use CONFIG_PREEMPTION where appropriate CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by CONFIG_PREEMPT_RT. Both PREEMPT and PREEMPT_RT require the same functionality which today depends on CONFIG_PREEMPT. Switch the preemption code, scheduler and init task over to use CONFIG_PREEMPTION. That's the first step towards RT in that area. The more complex changes are coming separately. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20190726212124.117528401@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 6 +++--- include/linux/sched.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index dd92b1a93919..bbb68dba37cc 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -182,7 +182,7 @@ do { \ #define preemptible() (preempt_count() == 0 && !irqs_disabled()) -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION #define preempt_enable() \ do { \ barrier(); \ @@ -203,7 +203,7 @@ do { \ __preempt_schedule(); \ } while (0) -#else /* !CONFIG_PREEMPT */ +#else /* !CONFIG_PREEMPTION */ #define preempt_enable() \ do { \ barrier(); \ @@ -217,7 +217,7 @@ do { \ } while (0) #define preempt_check_resched() do { } while (0) -#endif /* CONFIG_PREEMPT */ +#endif /* CONFIG_PREEMPTION */ #define preempt_disable_notrace() \ do { \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 9f51932bd543..6947516a2d3e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1767,7 +1767,7 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) * value indicates whether a reschedule was done in fact. * cond_resched_lock() will drop the spinlock before scheduling, */ -#ifndef CONFIG_PREEMPT +#ifndef CONFIG_PREEMPTION extern int _cond_resched(void); #else static inline int _cond_resched(void) { return 0; } @@ -1796,12 +1796,12 @@ static inline void cond_resched_rcu(void) /* * Does a critical section need to be broken due to another - * task waiting?: (technically does not depend on CONFIG_PREEMPT, + * task waiting?: (technically does not depend on CONFIG_PREEMPTION, * but a general need for low latency) */ static inline int spin_needbreak(spinlock_t *lock) { -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION return spin_is_contended(lock); #else return 0; -- cgit v1.2.3 From 01b1d88b09824bea1a75b0bac04dcf50d9893875 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Jul 2019 23:19:38 +0200 Subject: rcu: Use CONFIG_PREEMPTION CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by CONFIG_PREEMPT_RT. Both PREEMPT and PREEMPT_RT require the same functionality which today depends on CONFIG_PREEMPT. Switch the conditionals in RCU to use CONFIG_PREEMPTION. That's the first step towards RCU on RT. The further tweaks are work in progress. This neither touches the selftest bits which need a closer look by Paul. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20190726212124.210156346@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/rcupdate.h | 2 +- include/linux/rcutree.h | 2 +- include/linux/torture.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 8f7167478c1d..c4f76a310443 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -578,7 +578,7 @@ do { \ * * In non-preemptible RCU implementations (TREE_RCU and TINY_RCU), * it is illegal to block while in an RCU read-side critical section. - * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPT + * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPTION * kernel builds, RCU read-side critical sections may be preempted, * but explicit blocking is illegal. Finally, in preemptible RCU * implementations in real-time (with -rt patchset) kernel builds, RCU diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 735601ac27d3..18b1ed9864b0 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -53,7 +53,7 @@ void rcu_scheduler_starting(void); extern int rcu_scheduler_active __read_mostly; void rcu_end_inkernel_boot(void); bool rcu_is_watching(void); -#ifndef CONFIG_PREEMPT +#ifndef CONFIG_PREEMPTION void rcu_all_qs(void); #endif diff --git a/include/linux/torture.h b/include/linux/torture.h index a620118385bb..6241f59e2d6f 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -86,7 +86,7 @@ void _torture_stop_kthread(char *m, struct task_struct **tp); #define torture_stop_kthread(n, tp) \ _torture_stop_kthread("Stopping " #n " task", &(tp)) -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION #define torture_preempt_schedule() preempt_schedule() #else #define torture_preempt_schedule() -- cgit v1.2.3 From 27972765bd0410fc2ef5e86a41de17c71440a2dd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 26 Jul 2019 23:19:39 +0200 Subject: locking/spinlocks: Use CONFIG_PREEMPTION CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by CONFIG_PREEMPT_RT. Both PREEMPT and PREEMPT_RT require the same functionality which today depends on CONFIG_PREEMPT. Adjust the comments in the locking code. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Paolo Bonzini Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20190726212124.302995288@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/spinlock.h | 2 +- include/linux/spinlock_api_smp.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index ed7c4d6b8235..031ce8617df8 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -214,7 +214,7 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) /* * Define the various spin_lock methods. Note we define these - * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The + * regardless of whether CONFIG_SMP or CONFIG_PREEMPTION are set. The * various methods are defined as nops in the case they are not * required. */ diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index 42dfab89e740..b762eaba4cdf 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h @@ -96,7 +96,7 @@ static inline int __raw_spin_trylock(raw_spinlock_t *lock) /* * If lockdep is enabled then we use the non-preemption spin-ops - * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are + * even on CONFIG_PREEMPTION, because lockdep assumes that interrupts are * not re-enabled during lock-acquire (which the preempt-spin-ops do): */ #if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) -- cgit v1.2.3 From a55c7454a8c887b226a01d7eed088ccb5374d81e Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Thu, 8 Aug 2019 20:53:01 +0100 Subject: sched/topology: Improve load balancing on AMD EPYC systems SD_BALANCE_{FORK,EXEC} and SD_WAKE_AFFINE are stripped in sd_init() for any sched domains with a NUMA distance greater than 2 hops (RECLAIM_DISTANCE). The idea being that it's expensive to balance across domains that far apart. However, as is rather unfortunately explained in: commit 32e45ff43eaf ("mm: increase RECLAIM_DISTANCE to 30") the value for RECLAIM_DISTANCE is based on node distance tables from 2011-era hardware. Current AMD EPYC machines have the following NUMA node distances: node distances: node 0 1 2 3 4 5 6 7 0: 10 16 16 16 32 32 32 32 1: 16 10 16 16 32 32 32 32 2: 16 16 10 16 32 32 32 32 3: 16 16 16 10 32 32 32 32 4: 32 32 32 32 10 16 16 16 5: 32 32 32 32 16 10 16 16 6: 32 32 32 32 16 16 10 16 7: 32 32 32 32 16 16 16 10 where 2 hops is 32. The result is that the scheduler fails to load balance properly across NUMA nodes on different sockets -- 2 hops apart. For example, pinning 16 busy threads to NUMA nodes 0 (CPUs 0-7) and 4 (CPUs 32-39) like so, $ numactl -C 0-7,32-39 ./spinner 16 causes all threads to fork and remain on node 0 until the active balancer kicks in after a few seconds and forcibly moves some threads to node 4. Override node_reclaim_distance for AMD Zen. Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Suravee.Suthikulpanit@amd.com Cc: Thomas Gleixner Cc: Thomas.Lendacky@amd.com Cc: Tony Luck Link: https://lkml.kernel.org/r/20190808195301.13222-3-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- include/linux/topology.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/topology.h b/include/linux/topology.h index 47a3e3c08036..579522ec446c 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -59,6 +59,20 @@ int arch_update_cpu_topology(void); */ #define RECLAIM_DISTANCE 30 #endif + +/* + * The following tunable allows platforms to override the default node + * reclaim distance (RECLAIM_DISTANCE) if remote memory accesses are + * sufficiently fast that the default value actually hurts + * performance. + * + * AMD EPYC machines use this because even though the 2-hop distance + * is 32 (3.2x slower than a local memory access) performance actually + * *improves* if allowed to reclaim memory and load balance tasks + * between NUMA nodes 2-hops apart. + */ +extern int __read_mostly node_reclaim_distance; + #ifndef PENALTY_FOR_NODE_WITH_CPUS #define PENALTY_FOR_NODE_WITH_CPUS (1) #endif -- cgit v1.2.3