From 66630058e56b26b3a9cf2625e250a8c592dd0207 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Sat, 8 Feb 2020 20:48:29 +0100
Subject: sched/rt: Provide migrate_disable/enable() inlines

Code which solely needs to prevent migration of a task uses
preempt_disable()/enable() pairs. This is the only reliable way to do so
as setting the task affinity to a single CPU can be undone by a
setaffinity operation from a different task/process.

RT provides a separate migrate_disable/enable() mechanism which does not
disable preemption to achieve the semantic requirements of an (almost)
fully preemptible kernel.

As it is unclear from looking at a given code path whether the intention
is to disable preemption or migration, introduce migrate_disable/enable()
inline functions which can be used to annotate code which merely needs to
disable migration. Map them to preempt_disable/enable() for now. The RT
substitution will be provided later.

Code which is annotated that way documents that it has no requirement to
protect against reentrancy of a preempting task. Either this is not
required at all or the call sites are already serialized by other means.

Signed-off-by: Thomas Gleixner
Signed-off-by: Ingo Molnar
Cc: Peter Zijlstra
Cc: Juri Lelli
Cc: Vincent Guittot
Cc: Dietmar Eggemann
Cc: Steven Rostedt
Cc: Ben Segall
Cc: Mel Gorman
Cc: Sebastian Andrzej Siewior
Link: https://lore.kernel.org/r/878slclv1u.fsf@nanos.tec.linutronix.de
---
 include/linux/preempt.h | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index bbb68dba37cc..bc3f1aecaa19 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -322,4 +322,34 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,
 
 #endif
 
+/**
+ * migrate_disable - Prevent migration of the current task
+ *
+ * Maps to preempt_disable() which also disables preemption. Use
+ * migrate_disable() to annotate that the intent is to prevent migration,
+ * but not necessarily preemption.
+ *
+ * Can be invoked nested like preempt_disable() and needs the corresponding
+ * number of migrate_enable() invocations.
+ */
+static __always_inline void migrate_disable(void)
+{
+	preempt_disable();
+}
+
+/**
+ * migrate_enable - Allow migration of the current task
+ *
+ * Counterpart to migrate_disable().
+ *
+ * As migrate_disable() can be invoked nested, only the outermost invocation
+ * reenables migration.
+ *
+ * Currently mapped to preempt_enable().
+ */
+static __always_inline void migrate_enable(void)
+{
+	preempt_enable();
+}
+
 #endif /* __LINUX_PREEMPT_H */
--
cgit v1.2.3
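A minimal usage sketch of the new annotations (not part of the patch; my_stats and my_stats_add() are hypothetical). Only migration has to be prevented while touching the per-CPU data, since reentrancy from a preempting task is already serialized by the lock:

#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct my_stats {
	spinlock_t	lock;	/* assumed to be initialized at boot (omitted) */
	u64		total;
};

static DEFINE_PER_CPU(struct my_stats, my_stats);

static void my_stats_add(u64 delta)
{
	struct my_stats *stats;

	/* Stay on this CPU; preemption may still happen. */
	migrate_disable();
	stats = this_cpu_ptr(&my_stats);
	spin_lock(&stats->lock);
	stats->total += delta;
	spin_unlock(&stats->lock);
	migrate_enable();
}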
From 4e139c7711633365ebb52fbb63905395522a8413 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Fri, 14 Feb 2020 14:39:19 +0100
Subject: sched: Provide cant_migrate()

Some code paths rely on preempt_disable() to prevent migration on a
non-RT enabled kernel. These preempt_disable/enable() pairs are
substituted by migrate_disable/enable() pairs or other forms of RT
specific protection. On RT these protections prevent migration but not
preemption. Obviously a cant_sleep() check in such a section will trigger
on RT because preemption is not disabled.

Provide a cant_migrate() macro which maps to cant_sleep() on a non-RT
kernel and an empty placeholder for RT for now. The placeholder will be
changed to a proper debug check along with the RT specific migration
protection mechanism.

Signed-off-by: Thomas Gleixner
Link: https://lkml.kernel.org/r/20200214161503.070487511@linutronix.de
---
 include/linux/kernel.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 0d9db2a14f44..9b7a8d74a9d6 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -257,6 +257,13 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset);
 
 #define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0)
 
+#ifndef CONFIG_PREEMPT_RT
+# define cant_migrate()		cant_sleep()
+#else
+  /* Placeholder for now */
+# define cant_migrate()		do { } while (0)
+#endif
+
 /**
  * abs - return absolute value of an argument
  * @x: the value. If it is unsigned type, it is converted to signed type first.
--
cgit v1.2.3
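A usage sketch (the per-CPU variable and helper below are hypothetical): a function that requires its caller to have prevented migration can assert that with cant_migrate() rather than cant_sleep(), so the check keeps its meaning once RT substitutes migrate_disable() for preempt_disable() in the callers:

#include <linux/kernel.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned int, my_budget);

/* Caller must have prevented migration, e.g. via migrate_disable(). */
static void my_budget_refill(unsigned int amount)
{
	cant_migrate();
	this_cpu_add(my_budget, amount);
}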
From 0dacee1bfa70e171be3a12a30414c228453048d2 Mon Sep 17 00:00:00 2001
From: Vincent Guittot
Date: Mon, 24 Feb 2020 09:52:17 +0000
Subject: sched/pelt: Remove unused runnable load average

Now that runnable_load_avg is no longer used, we can remove it to make
space for a new signal.

Signed-off-by: Vincent Guittot
Signed-off-by: Mel Gorman
Signed-off-by: Ingo Molnar
Reviewed-by: Dietmar Eggemann
Acked-by: Peter Zijlstra
Cc: Juri Lelli
Cc: Valentin Schneider
Cc: Phil Auld
Cc: Hillf Danton
Link: https://lore.kernel.org/r/20200224095223.13361-8-mgorman@techsingularity.net
---
 include/linux/sched.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 04278493bf15..037eaffabc24 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -357,7 +357,7 @@ struct util_est {
 
 /*
  * The load_avg/util_avg accumulates an infinite geometric series
- * (see __update_load_avg() in kernel/sched/fair.c).
+ * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
 *
 * [load_avg definition]
 *
@@ -401,11 +401,9 @@ struct util_est {
 struct sched_avg {
 	u64			last_update_time;
 	u64			load_sum;
-	u64			runnable_load_sum;
 	u32			util_sum;
 	u32			period_contrib;
 	unsigned long		load_avg;
-	unsigned long		runnable_load_avg;
 	unsigned long		util_avg;
 	struct util_est		util_est;
 } ____cacheline_aligned;
@@ -449,7 +447,6 @@ struct sched_statistics {
 struct sched_entity {
 	/* For load-balancing: */
 	struct load_weight	load;
-	unsigned long		runnable_weight;
 	struct rb_node		run_node;
 	struct list_head	group_node;
 	unsigned int		on_rq;
--
cgit v1.2.3

From 9f68395333ad7f5bfe2f83473fed363d4229f11c Mon Sep 17 00:00:00 2001
From: Vincent Guittot
Date: Mon, 24 Feb 2020 09:52:18 +0000
Subject: sched/pelt: Add a new runnable average signal

Now that runnable_load_avg has been removed, we can replace it with a new
signal that will highlight the runnable pressure on a cfs_rq. This signal
tracks the waiting time of tasks on a rq and can help to better define
the state of rqs.

Currently, only util_avg is used to define the state of a rq: a rq with
more than around 80% utilization and more than one task is considered
overloaded. But the util_avg signal of a rq can become temporarily low
after a task has migrated onto another rq, which can bias the
classification of the rq.

When tasks compete for the same rq, their runnable average signal will be
higher than util_avg as it includes the waiting time, and we can use this
signal to better classify cfs_rqs.

The new runnable_avg will track the runnable time of a task, which simply
adds the waiting time to the running time. The runnable_avg of a cfs_rq
will be the \Sum of its sched_entities' runnable_avg, and the runnable_avg
of a group entity will follow that of its rq, similarly to util_avg.

Signed-off-by: Vincent Guittot
Signed-off-by: Mel Gorman
Signed-off-by: Ingo Molnar
Reviewed-by: Dietmar Eggemann
Acked-by: Peter Zijlstra
Cc: Juri Lelli
Cc: Valentin Schneider
Cc: Phil Auld
Cc: Hillf Danton
Link: https://lore.kernel.org/r/20200224095223.13361-9-mgorman@techsingularity.net
---
 include/linux/sched.h | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 037eaffabc24..2e9199bf947b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -356,28 +356,30 @@ struct util_est {
 } __attribute__((__aligned__(sizeof(u64))));
 
 /*
- * The load_avg/util_avg accumulates an infinite geometric series
+ * The load/runnable/util_avg accumulates an infinite geometric series
 * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
 *
 * [load_avg definition]
 *
 *   load_avg = runnable% * scale_load_down(load)
 *
- * where runnable% is the time ratio that a sched_entity is runnable.
- * For cfs_rq, it is the aggregated load_avg of all runnable and
- * blocked sched_entities.
+ * [runnable_avg definition]
+ *
+ *   runnable_avg = runnable% * SCHED_CAPACITY_SCALE
 *
 * [util_avg definition]
 *
 *   util_avg = running% * SCHED_CAPACITY_SCALE
 *
- * where running% is the time ratio that a sched_entity is running on
- * a CPU. For cfs_rq, it is the aggregated util_avg of all runnable
- * and blocked sched_entities.
+ * where runnable% is the time ratio that a sched_entity is runnable and
+ * running% the time ratio that a sched_entity is running.
+ *
+ * For cfs_rq, they are the aggregated values of all runnable and blocked
+ * sched_entities.
 *
- * load_avg and util_avg don't direcly factor frequency scaling and CPU
- * capacity scaling. The scaling is done through the rq_clock_pelt that
- * is used for computing those signals (see update_rq_clock_pelt())
+ * The load/runnable/util_avg doesn't direcly factor frequency scaling and CPU
+ * capacity scaling. The scaling is done through the rq_clock_pelt that is used
+ * for computing those signals (see update_rq_clock_pelt())
 *
 * N.B., the above ratios (runnable% and running%) themselves are in the
 * range of [0, 1]. To do fixed point arithmetics, we therefore scale them
@@ -401,9 +403,11 @@ struct util_est {
 struct sched_avg {
 	u64			last_update_time;
 	u64			load_sum;
+	u64			runnable_sum;
 	u32			util_sum;
 	u32			period_contrib;
 	unsigned long		load_avg;
+	unsigned long		runnable_avg;
 	unsigned long		util_avg;
 	struct util_est		util_est;
 } ____cacheline_aligned;
@@ -467,6 +471,8 @@ struct sched_entity {
 	struct cfs_rq		*cfs_rq;
 	/* rq "owned" by this entity/group: */
 	struct cfs_rq		*my_q;
+	/* cached value of my_q->h_nr_running */
+	unsigned long		runnable_weight;
 #endif
 
 #ifdef CONFIG_SMP
--
cgit v1.2.3
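A worked example of why the new signal helps (idealized steady state, ignoring PELT decay): two always-runnable tasks pinned to one CPU each run about 50% of the time but are runnable 100% of the time, so per task

  util_avg     ~= 0.5 * SCHED_CAPACITY_SCALE = 512
  runnable_avg ~= 1.0 * SCHED_CAPACITY_SCALE = 1024

and the cfs_rq aggregates to util_avg ~= 1024 but runnable_avg ~= 2048. The contention stays visible in runnable_avg even though util_avg alone saturates at the CPU's capacity.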
From 36a0df85d2e85e1929e8cd607e19243e5a2754e7 Mon Sep 17 00:00:00 2001
From: Thara Gopinath
Date: Fri, 21 Feb 2020 19:52:06 -0500
Subject: sched/topology: Add callback to read per CPU thermal pressure

Introduce the arch_scale_thermal_pressure() callback to retrieve per CPU
thermal pressure.

Signed-off-by: Thara Gopinath
Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Ingo Molnar
Link: https://lkml.kernel.org/r/20200222005213.3873-3-thara.gopinath@linaro.org
---
 include/linux/sched/topology.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index f341163fedc9..af9319e4cfb9 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -225,6 +225,14 @@ unsigned long arch_scale_cpu_capacity(int cpu)
 }
 #endif
 
+#ifndef arch_scale_thermal_pressure
+static __always_inline
+unsigned long arch_scale_thermal_pressure(int cpu)
+{
+	return 0;
+}
+#endif
+
 static inline int task_node(const struct task_struct *p)
 {
 	return cpu_to_node(task_cpu(p));
--
cgit v1.2.3

From ad58cc5cc50ca8423cf630778594bd38252a0a58 Mon Sep 17 00:00:00 2001
From: Thara Gopinath
Date: Fri, 21 Feb 2020 19:52:07 -0500
Subject: drivers/base/arch_topology: Add infrastructure to store and update
 instantaneous thermal pressure

Add architecture specific APIs to update and track thermal pressure on a
per CPU basis. A per CPU variable, thermal_pressure, is introduced to keep
track of instantaneous per CPU thermal pressure. Thermal pressure is the
delta between maximum capacity and capped capacity due to a thermal event.

topology_get_thermal_pressure() can be hooked into the scheduler-specified
arch_scale_thermal_pressure() to retrieve the instantaneous thermal
pressure of a CPU. arch_set_thermal_pressure() can be used to update the
thermal pressure.

Considering that topology_get_thermal_pressure() reads thermal_pressure
and arch_set_thermal_pressure() writes into thermal_pressure, one can
argue for some sort of locking mechanism to avoid a stale value. But
considering that topology_get_thermal_pressure() can be called from a
system critical path like the scheduler tick function, a locking mechanism
is not ideal. This means that it is possible for the thermal_pressure
value used to calculate the average thermal pressure for a CPU to be stale
for up to one tick period.

Signed-off-by: Thara Gopinath
Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Ingo Molnar
Link: https://lkml.kernel.org/r/20200222005213.3873-4-thara.gopinath@linaro.org
---
 include/linux/arch_topology.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index 3015ecbb90b1..88a115e81f27 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -33,6 +33,16 @@ unsigned long topology_get_freq_scale(int cpu)
 	return per_cpu(freq_scale, cpu);
 }
 
+DECLARE_PER_CPU(unsigned long, thermal_pressure);
+
+static inline unsigned long topology_get_thermal_pressure(int cpu)
+{
+	return per_cpu(thermal_pressure, cpu);
+}
+
+void arch_set_thermal_pressure(struct cpumask *cpus,
+			       unsigned long th_pressure);
+
 struct cpu_topology {
 	int thread_id;
 	int core_id;
--
cgit v1.2.3
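A sketch of how the two halves are meant to be wired together. The driver-side helper and its frequency parameters are hypothetical; architectures using the generic arch_topology code would point the scheduler callback at topology_get_thermal_pressure():

#include <linux/arch_topology.h>
#include <linux/cpumask.h>
#include <linux/kernel.h>
#include <linux/sched/topology.h>

/* In the architecture's asm/topology.h: */
#define arch_scale_thermal_pressure	topology_get_thermal_pressure

/* In a cooling driver, whenever the thermal cap changes: */
static void my_update_thermal_pressure(struct cpumask *cpus,
				       unsigned long capped_freq,
				       unsigned long max_freq)
{
	unsigned long max_cap, capped_cap;

	max_cap = arch_scale_cpu_capacity(cpumask_first(cpus));
	capped_cap = mult_frac(max_cap, capped_freq, max_freq);

	/* Thermal pressure = maximum capacity - currently capped capacity. */
	arch_set_thermal_pressure(cpus, max_cap - capped_cap);
}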
From 46a87b3851f0d6eb05e6d83d5c5a30df0eca8f76 Mon Sep 17 00:00:00 2001
From: Paul Turner
Date: Tue, 10 Mar 2020 18:01:13 -0700
Subject: sched/core: Distribute tasks within affinity masks

Currently, when updating the affinity of tasks via either cpusets.cpus or
sched_setaffinity(), tasks not currently running within the newly
specified mask will be arbitrarily assigned to the first CPU within the
mask. This (particularly in the case that we are restricting masks) can
result in many tasks being assigned to the first CPUs of their new masks.

This:

1) Can induce scheduling delays while the load-balancer has a chance to
   spread them between their new CPUs.

2) Can antagonize a poor load-balancer behavior where it has a difficult
   time recognizing that a cross-socket imbalance has been forced by an
   affinity mask.

This change adds a new cpumask interface to allow iterated calls to
distribute within the intersection of the provided masks.

The cases that this mainly affects are:
- modifying cpuset.cpus
- when tasks join a cpuset
- when modifying a task's affinity via sched_setaffinity(2)

Signed-off-by: Paul Turner
Signed-off-by: Josh Don
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Qais Yousef
Tested-by: Qais Yousef
Link: https://lkml.kernel.org/r/20200311010113.136465-1-joshdon@google.com
---
 include/linux/cpumask.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index d5cc88514aee..f0d895d6ac39 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -194,6 +194,11 @@ static inline unsigned int cpumask_local_spread(unsigned int i, int node)
 	return 0;
 }
 
+static inline int cpumask_any_and_distribute(const struct cpumask *src1p,
+					     const struct cpumask *src2p) {
+	return cpumask_next_and(-1, src1p, src2p);
+}
+
 #define for_each_cpu(cpu, mask)			\
 	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
 #define for_each_cpu_not(cpu, mask)		\
@@ -245,6 +250,8 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
 int cpumask_next_and(int n, const struct cpumask *, const struct cpumask *);
 int cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
 unsigned int cpumask_local_spread(unsigned int i, int node);
+int cpumask_any_and_distribute(const struct cpumask *src1p,
+			       const struct cpumask *src2p);
 
 /**
  * for_each_cpu - iterate over every cpu in a mask
--
cgit v1.2.3
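A usage sketch of the new interface (the surrounding helper is illustrative, not the sched/core.c call site): when a batch of tasks has its affinity restricted to new_mask, repeated calls distribute the destinations across the intersection instead of always returning its first CPU:

#include <linux/cpumask.h>

static void pick_dest_cpus(const struct cpumask *new_mask,
			   int *dest, unsigned int nr_tasks)
{
	unsigned int i;

	for (i = 0; i < nr_tasks; i++)
		dest[i] = cpumask_any_and_distribute(cpu_online_mask, new_mask);
}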
From b05e75d611380881e73edc58a20fd8c6bb71720b Mon Sep 17 00:00:00 2001
From: Johannes Weiner
Date: Mon, 16 Mar 2020 15:13:31 -0400
Subject: psi: Fix cpu.pressure for cpu.max and competing cgroups

For simplicity, cpu pressure is defined as having more than one runnable
task on a given CPU. This works on the system-level, but it has
limitations in a cgrouped reality: When cpu.max is in use, it doesn't
capture the time in which a task is not executing on the CPU due to
throttling. Likewise, it doesn't capture the time in which a competing
cgroup is occupying the CPU - meaning it only reflects cgroup-internal
competitive pressure, not outside pressure.

Enable tracking of currently executing tasks, and then change the
definition of cpu pressure in a cgroup from

	NR_RUNNING > 1

to

	NR_RUNNING > ON_CPU

which will capture the effects of cpu.max as well as competition from
outside the cgroup.

After this patch, a cgroup running `stress -c 1` with a cpu.max setting
of 5000 10000 shows ~50% continuous CPU pressure.

Signed-off-by: Johannes Weiner
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lkml.kernel.org/r/20200316191333.115523-2-hannes@cmpxchg.org
---
 include/linux/psi_types.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 07aaf9b82241..4b7258495a04 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -14,13 +14,21 @@ enum psi_task_count {
 	NR_IOWAIT,
 	NR_MEMSTALL,
 	NR_RUNNING,
-	NR_PSI_TASK_COUNTS = 3,
+	/*
+	 * This can't have values other than 0 or 1 and could be
+	 * implemented as a bit flag. But for now we still have room
+	 * in the first cacheline of psi_group_cpu, and this way we
+	 * don't have to special case any state tracking for it.
+	 */
+	NR_ONCPU,
+	NR_PSI_TASK_COUNTS = 4,
 };
 
 /* Task state bitmasks */
 #define TSK_IOWAIT	(1 << NR_IOWAIT)
 #define TSK_MEMSTALL	(1 << NR_MEMSTALL)
 #define TSK_RUNNING	(1 << NR_RUNNING)
+#define TSK_ONCPU	(1 << NR_ONCPU)
 
 /* Resources that workloads could be stalled on */
 enum psi_res {
--
cgit v1.2.3
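A sketch of the new per-CPU condition described above (the helper name is hypothetical and this is not the literal kernel/sched/psi.c test, which operates on the aggregated task counts):

#include <linux/psi_types.h>
#include <linux/types.h>

/* CPU pressure: more runnable tasks than tasks currently executing. */
static bool cpu_is_contended(const unsigned int tasks[NR_PSI_TASK_COUNTS])
{
	return tasks[NR_RUNNING] > tasks[NR_ONCPU];
}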
From 36b238d5717279163859fb6ba0f4360abcafab83 Mon Sep 17 00:00:00 2001
From: Johannes Weiner
Date: Mon, 16 Mar 2020 15:13:32 -0400
Subject: psi: Optimize switching tasks inside shared cgroups

When switching tasks running on a CPU, the psi state of a cgroup
containing both of these tasks does not change. Right now, we don't
exploit that, and can perform many unnecessary state changes in nested
hierarchies, especially when most activity comes from one leaf cgroup.

This patch implements an optimization where we only update cgroups whose
state actually changes during a task switch. These are all cgroups that
contain one task but not the other, up to the first shared ancestor. When
both tasks are in the same group, we don't need to update anything at
all.

We can identify the first shared ancestor by walking the groups of the
incoming task until we see TSK_ONCPU set on the local CPU; that's the
first group that also contains the outgoing task.

The new psi_task_switch() is similar to psi_task_change(). To allow code
reuse, move the task flag maintenance code into a new function and the
poll/avg worker wakeups into the shared psi_group_change().

Suggested-by: Peter Zijlstra
Signed-off-by: Johannes Weiner
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lkml.kernel.org/r/20200316191333.115523-3-hannes@cmpxchg.org
---
 include/linux/psi.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/psi.h b/include/linux/psi.h
index 7b3de7321219..7361023f3fdd 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -17,6 +17,8 @@ extern struct psi_group psi_system;
 void psi_init(void);
 
 void psi_task_change(struct task_struct *task, int clear, int set);
+void psi_task_switch(struct task_struct *prev, struct task_struct *next,
+		     bool sleep);
 
 void psi_memstall_tick(struct task_struct *task, int cpu);
 void psi_memstall_enter(unsigned long *flags);
--
cgit v1.2.3

From 1066d1b6974e095d5a6c472ad9180a957b496cd6 Mon Sep 17 00:00:00 2001
From: Yafang Shao
Date: Mon, 16 Mar 2020 21:28:05 -0400
Subject: psi: Move PF_MEMSTALL out of task->flags

task->flags is a 32-bit field, and 31 of those bits have already been
consumed, so it is hard to introduce another new per-process flag. There
is still enough space in the bit-field section of task_struct, however,
so we can define the memstall state as a single bit in task_struct
instead.

This patch also removes an out-of-date comment pointed out by Matthew.

Suggested-by: Johannes Weiner
Signed-off-by: Yafang Shao
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Johannes Weiner
Link: https://lkml.kernel.org/r/1584408485-1921-1-git-send-email-laoar.shao@gmail.com
---
 include/linux/sched.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2e9199bf947b..09bddd9e69a2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -785,9 +785,12 @@ struct task_struct {
 	unsigned			frozen:1;
 #endif
 #ifdef CONFIG_BLK_CGROUP
-	/* to be used once the psi infrastructure lands upstream. */
 	unsigned			use_memdelay:1;
 #endif
+#ifdef CONFIG_PSI
+	/* Stalled due to lack of memory */
+	unsigned			in_memstall:1;
+#endif
 
 	unsigned long			atomic_flags; /* Flags requiring atomic access. */
 
@@ -1480,7 +1483,6 @@ extern struct pid *cad_pid;
 #define PF_KTHREAD		0x00200000	/* I am a kernel thread */
 #define PF_RANDOMIZE		0x00400000	/* Randomize virtual address space */
 #define PF_SWAPWRITE		0x00800000	/* Allowed to write to swap */
-#define PF_MEMSTALL		0x01000000	/* Stalled due to lack of memory */
 #define PF_UMH			0x02000000	/* I'm an Usermodehelper process */
 #define PF_NO_SETAFFINITY	0x04000000	/* Userland is not allowed to meddle with cpus_mask */
 #define PF_MCE_EARLY		0x08000000	/* Early kill for mce process policy */
--
cgit v1.2.3

From 9c40365a65d62d7c06a95fb331b3442cb02d2fd9 Mon Sep 17 00:00:00 2001
From: Jann Horn
Date: Mon, 2 Mar 2020 12:29:39 +0100
Subject: threads: Update PID limit comment according to futex UAPI change

The futex UAPI changed back in commit 76b81e2b0e22 ("[PATCH] lightweight
robust futexes updates 2"), which landed in v2.6.17: FUTEX_TID_MASK is now
0x3fffffff instead of 0x1fffffff. Update the corresponding comment in
include/linux/threads.h.

The documentation mentions that only the lower 29 bits are available for
TID storage, but as the UAPI header released the bit already via
FUTEX_TID_MASK, this is moot as well. Fix it up.

[ tglx: Fixed up documentation ]

Signed-off-by: Jann Horn
Signed-off-by: Thomas Gleixner
Link: https://lkml.kernel.org/r/20200302112939.8068-1-jannh@google.com
---
 include/linux/threads.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/threads.h b/include/linux/threads.h
index 3086dba525e2..18d5a74bcc3d 100644
--- a/include/linux/threads.h
+++ b/include/linux/threads.h
@@ -29,7 +29,7 @@
 
 /*
  * A maximum of 4 million PIDs should be enough for a while.
- * [NOTE: PID/TIDs are limited to 2^29 ~= 500+ million, see futex.h.]
+ * [NOTE: PID/TIDs are limited to 2^30 ~= 1 billion, see FUTEX_TID_MASK.]
 */
 #define PID_MAX_LIMIT (CONFIG_BASE_SMALL ? PAGE_SIZE * 8 : \
	(sizeof(long) > 4 ? 4 * 1024 * 1024 : PID_MAX_DEFAULT))
--
cgit v1.2.3
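A quick check of the arithmetic in the updated comment, assuming the current UAPI value of FUTEX_TID_MASK (0x3fffffff): the mask leaves 30 bits for the TID, i.e. 2^30 = 1073741824 (~1 billion) possible values.

#include <linux/build_bug.h>
#include <linux/futex.h>

/* 0x3fffffff == 2^30 - 1, hence the "2^30 ~= 1 billion" in the comment. */
static_assert(FUTEX_TID_MASK == (1U << 30) - 1);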