From cfe7ddcbd72dc67ce5749cc6f451a2b0c6aec5b5 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 17 Aug 2020 12:29:47 +0100 Subject: ARM, sched/topology: Remove SD_SHARE_POWERDOMAIN This flag was introduced in 2014 by commit: d77b3ed5c9f8 ("sched: Add a new SD_SHARE_POWERDOMAIN for sched_domain") but AFAIA it was never leveraged by the scheduler. The closest thing I can think of is EAS caring about frequency domains, and it does that by leveraging performance domains. Remove the flag. No change in functionality is expected. Suggested-by: Morten Rasmussen Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Reviewed-by: Dietmar Eggemann Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/20200817113003.20802-2-valentin.schneider@arm.com --- kernel/sched/topology.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 007b0a6b0152..fe0396978fea 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -148,8 +148,7 @@ static int sd_degenerate(struct sched_domain *sd) SD_BALANCE_EXEC | SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY | - SD_SHARE_PKG_RESOURCES | - SD_SHARE_POWERDOMAIN)) { + SD_SHARE_PKG_RESOURCES)) { if (sd->groups != sd->groups->next) return 0; } @@ -180,8 +179,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_ASYM_CPUCAPACITY | SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | - SD_PREFER_SIBLING | - SD_SHARE_POWERDOMAIN); + SD_PREFER_SIBLING); if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } @@ -1292,7 +1290,6 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; * SD_SHARE_CPUCAPACITY - describes SMT topologies * SD_SHARE_PKG_RESOURCES - describes shared caches * SD_NUMA - describes NUMA topologies - * SD_SHARE_POWERDOMAIN - describes shared power domain * * Odd one out, which beside describing the topology has a quirk also * prescribes the desired behaviour that goes along with it: @@ -1303,8 +1300,7 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; (SD_SHARE_CPUCAPACITY | \ SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ - SD_ASYM_PACKING | \ - SD_SHARE_POWERDOMAIN) + SD_ASYM_PACKING) static struct sched_domain * sd_init(struct sched_domain_topology_level *tl, -- cgit v1.2.3 From 65c5e253168dbbb52c20026b0c5b7a2f344b9197 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 17 Aug 2020 12:29:51 +0100 Subject: sched/topology: Verify SD_* flags setup when sched_debug is on Now that we have some description of what we expect the flags layout to be, we can use that to assert at runtime that the actual layout is sane. 
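To make the checked invariant concrete: a minimal userspace sketch (not kernel code; the two flags, their bit positions and the meta-flag values below are invented for the example) of the same metadata-driven parent/child consistency walk:

  #include <stdio.h>

  #define SDF_SHARED_CHILD  0x1   /* flag must also be set in the child domain */
  #define SDF_SHARED_PARENT 0x2   /* flag must also be set in the parent domain */

  static const struct { const char *name; unsigned int meta; } flag_meta[] = {
          { "EX_SHARE_CACHE", SDF_SHARED_CHILD },   /* hypothetical flags */
          { "EX_NUMA_LIKE",   SDF_SHARED_PARENT },
  };

  struct dom { unsigned long flags; struct dom *parent, *child; };

  static void check(const struct dom *d)
  {
          for (unsigned int i = 0; i < 2; i++) {
                  unsigned long bit = 1UL << i;

                  if (!(d->flags & bit))
                          continue;
                  if ((flag_meta[i].meta & SDF_SHARED_CHILD) && d->child &&
                      !(d->child->flags & bit))
                          printf("ERROR: flag %s set here but not in child\n", flag_meta[i].name);
                  if ((flag_meta[i].meta & SDF_SHARED_PARENT) && d->parent &&
                      !(d->parent->flags & bit))
                          printf("ERROR: flag %s set here but not in parent\n", flag_meta[i].name);
          }
  }

  int main(void)
  {
          struct dom child = { .flags = 0x0 };
          struct dom parent = { .flags = 0x1, .child = &child };   /* bit 0 missing in the child */

          child.parent = &parent;
          check(&parent);   /* reports the broken SDF_SHARED_CHILD expectation */
          return 0;
  }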
Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Reviewed-by: Dietmar Eggemann Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/20200817113003.20802-6-valentin.schneider@arm.com --- kernel/sched/topology.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index fe0396978fea..cbdaf084ff32 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -29,6 +29,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct cpumask *groupmask) { struct sched_group *group = sd->groups; + unsigned long flags = sd->flags; + unsigned int idx; cpumask_clear(groupmask); @@ -43,6 +45,21 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); } + for_each_set_bit(idx, &flags, __SD_FLAG_CNT) { + unsigned int flag = BIT(idx); + unsigned int meta_flags = sd_flag_debug[idx].meta_flags; + + if ((meta_flags & SDF_SHARED_CHILD) && sd->child && + !(sd->child->flags & flag)) + printk(KERN_ERR "ERROR: flag %s set here but not in child\n", + sd_flag_debug[idx].name); + + if ((meta_flags & SDF_SHARED_PARENT) && sd->parent && + !(sd->parent->flags & flag)) + printk(KERN_ERR "ERROR: flag %s set here but not in parent\n", + sd_flag_debug[idx].name); + } + printk(KERN_DEBUG "%*s groups:", level + 1, ""); do { if (!group) { -- cgit v1.2.3 From 5b9f8ff7b320a34af3dbcf04edb40d9b04f22f4a Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 17 Aug 2020 12:29:52 +0100 Subject: sched/debug: Output SD flag names rather than their values Decoding the output of /proc/sys/kernel/sched_domain/cpu*/domain*/flags has always been somewhat annoying, as one needs to go fetch the bit -> name mapping from the source code itself. This encoding can be saved in a script somewhere, but that isn't safe from flags being added, removed or even shuffled around. What matters for debugging purposes is to get *which* flags are set in a given domain, their associated value is pretty much meaningless. Make the sd flags debug file output flag names. 
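For contrast, recovering names from the old numeric output needed a hand-maintained table, along the lines of this illustrative userspace sketch (the bit positions below are invented, which is exactly the fragility being removed):

  #include <stdio.h>
  #include <stdlib.h>

  /* hypothetical bit -> name table; has to be kept in sync with the kernel by hand */
  static const char *sd_names[] = {
          "SD_BALANCE_NEWIDLE", "SD_BALANCE_EXEC", "SD_BALANCE_FORK",
          "SD_WAKE_AFFINE", "SD_SHARE_CPUCAPACITY", "SD_SHARE_PKG_RESOURCES",
  };

  int main(int argc, char **argv)
  {
          unsigned long flags = argc > 1 ? strtoul(argv[1], NULL, 0) : 0;

          for (unsigned int i = 0; i < sizeof(sd_names) / sizeof(sd_names[0]); i++)
                  if (flags & (1UL << i))
                          printf("%s ", sd_names[i]);
          printf("\n");
          return 0;
  }

With this patch the flags file prints the names directly, so such decoding helpers (and their stale tables) are no longer needed.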
Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/20200817113003.20802-7-valentin.schneider@arm.com --- kernel/sched/debug.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 36c54265bb2b..0655524700d2 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -245,6 +245,60 @@ set_table_entry(struct ctl_table *entry, entry->proc_handler = proc_handler; } +static int sd_ctl_doflags(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + unsigned long flags = *(unsigned long *)table->data; + size_t data_size = 0; + size_t len = 0; + char *tmp; + int idx; + + if (write) + return 0; + + for_each_set_bit(idx, &flags, __SD_FLAG_CNT) { + char *name = sd_flag_debug[idx].name; + + /* Name plus whitespace */ + data_size += strlen(name) + 1; + } + + if (*ppos > data_size) { + *lenp = 0; + return 0; + } + + tmp = kcalloc(data_size + 1, sizeof(*tmp), GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + for_each_set_bit(idx, &flags, __SD_FLAG_CNT) { + char *name = sd_flag_debug[idx].name; + + len += snprintf(tmp + len, strlen(name) + 2, "%s ", name); + } + + tmp += *ppos; + len -= *ppos; + + if (len > *lenp) + len = *lenp; + if (len) + memcpy(buffer, tmp, len); + if (len < *lenp) { + ((char *)buffer)[len] = '\n'; + len++; + } + + *lenp = len; + *ppos += len; + + kfree(tmp); + + return 0; +} + static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { @@ -258,7 +312,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); - set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, proc_dointvec_minmax); + set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, sd_ctl_doflags); set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); /* &table[8] is terminator */ -- cgit v1.2.3 From 6f349818621dff364fc000909135d0065e9756c9 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 17 Aug 2020 12:29:54 +0100 Subject: sched/topology: Use prebuilt SD flag degeneration mask Leverage SD_DEGENERATE_GROUPS_MASK in sd_degenerate() and sd_parent_degenerate(). 
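The mask is generated from the per-flag metadata rather than written out by hand; here is a standalone sketch of that X-macro pattern, with an invented flag list standing in for the real one in include/linux/sched/sd_flags.h:

  #include <stdio.h>

  #define SDF_NEEDS_GROUPS 0x4                      /* example metaflag */

  /* hypothetical flag list: name, value, metaflags */
  #define EXAMPLE_FLAGS(X)                              \
          X(EX_BALANCE_NEWIDLE, 0x01, SDF_NEEDS_GROUPS) \
          X(EX_WAKE_AFFINE,     0x02, 0)                \
          X(EX_SHARE_CACHE,     0x04, SDF_NEEDS_GROUPS)

  /* keep only the flags whose metadata says "needs more than one group" */
  #define PICK_IF_NEEDS_GROUPS(name, val, meta) \
          ((val) * !!((meta) & SDF_NEEDS_GROUPS)) |
  static const unsigned int EX_DEGENERATE_GROUPS_MASK =
          EXAMPLE_FLAGS(PICK_IF_NEEDS_GROUPS) 0;
  #undef PICK_IF_NEEDS_GROUPS

  int main(void)
  {
          printf("mask = 0x%x\n", EX_DEGENERATE_GROUPS_MASK);   /* 0x01 | 0x04 = 0x05 */
          return 0;
  }

Adding a flag with the right metaflag then updates the degeneration mask automatically, which is the point of the change.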
Suggested-by: Peter Zijlstra Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/20200817113003.20802-9-valentin.schneider@arm.com --- kernel/sched/topology.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index cbdaf084ff32..38e6671c3bd7 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -160,15 +160,9 @@ static int sd_degenerate(struct sched_domain *sd) return 1; /* Following flags need at least 2 groups */ - if (sd->flags & (SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_SHARE_CPUCAPACITY | - SD_ASYM_CPUCAPACITY | - SD_SHARE_PKG_RESOURCES)) { - if (sd->groups != sd->groups->next) - return 0; - } + if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) && + (sd->groups != sd->groups->next)) + return 0; /* Following flags don't use groups */ if (sd->flags & (SD_WAKE_AFFINE)) @@ -190,13 +184,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) /* Flags needing groups don't count if only 1 group in parent */ if (parent->groups == parent->groups->next) { - pflags &= ~(SD_BALANCE_NEWIDLE | - SD_BALANCE_FORK | - SD_BALANCE_EXEC | - SD_ASYM_CPUCAPACITY | - SD_SHARE_CPUCAPACITY | - SD_SHARE_PKG_RESOURCES | - SD_PREFER_SIBLING); + pflags &= ~(SD_DEGENERATE_GROUPS_MASK | SD_PREFER_SIBLING); if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } -- cgit v1.2.3 From ab65afb094c7b2e954e8d56ffbc7df843211c2c8 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 17 Aug 2020 12:29:55 +0100 Subject: sched/topology: Remove SD_SERIALIZE degeneration special case If there is only a single NUMA node in the system, the only NUMA topology level that will be generated will be NODE (identity distance), which doesn't have SD_SERIALIZE. This means we don't need this special case in sd_parent_degenerate(), as having the NODE level "naturally" covers it. Thus, remove it. Suggested-by: Peter Zijlstra Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Reviewed-by: Dietmar Eggemann Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/20200817113003.20802-10-valentin.schneider@arm.com --- kernel/sched/topology.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 38e6671c3bd7..c662c53736e2 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -183,11 +183,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 0; /* Flags needing groups don't count if only 1 group in parent */ - if (parent->groups == parent->groups->next) { + if (parent->groups == parent->groups->next) pflags &= ~(SD_DEGENERATE_GROUPS_MASK | SD_PREFER_SIBLING); - if (nr_node_ids == 1) - pflags &= ~SD_SERIALIZE; - } + if (~cflags & pflags) return 0; -- cgit v1.2.3 From c200191d4c2c1aa2ffb62a984b756ac1f02dc55c Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 17 Aug 2020 12:29:56 +0100 Subject: sched/topology: Propagate SD_ASYM_CPUCAPACITY upwards We currently set this flag *only* on domains whose topology level exactly match the level where we detect asymmetry (as returned by asym_cpu_capacity_level()). This is rather problematic. 
Say there are two clusters in the system, one with a lone big CPU and the other with a mix of big and LITTLE CPUs (as is allowed by DynamIQ): DIE [ ] MC [ ][ ] 0 1 2 3 4 L L B B B asym_cpu_capacity_level() will figure out that the MC level is the one where all CPUs can see a CPU of max capacity, and we will thus set SD_ASYM_CPUCAPACITY at MC level for all CPUs. That lone big CPU will degenerate its MC domain, since it would be alone in there, and will end up with just a DIE domain. Since the flag was only set at MC, this CPU ends up not seeing any SD with the flag set, which is broken. Rather than clearing dflags at every topology level, clear it before entering the topology level loop. This will properly propagate upwards flags that are set starting from a certain level. Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Reviewed-by: Quentin Perret Reviewed-by: Dietmar Eggemann Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/20200817113003.20802-11-valentin.schneider@arm.com --- kernel/sched/topology.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index c662c53736e2..f36ed96f3197 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1988,11 +1988,10 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att /* Set up domains for CPUs specified by the cpu_map: */ for_each_cpu(i, cpu_map) { struct sched_domain_topology_level *tl; + int dflags = 0; sd = NULL; for_each_sd_topology(tl) { - int dflags = 0; - if (tl == tl_asym) { dflags |= SD_ASYM_CPUCAPACITY; has_asym = true; -- cgit v1.2.3 From 3a6712c7685352a5d71eaf459a4fddfc3589f018 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 17 Aug 2020 12:29:57 +0100 Subject: sched/topology: Mark SD_PREFER_SIBLING as SDF_NEEDS_GROUPS SD_PREFER_SIBLING is currently considered in sd_parent_degenerate() but not in sd_degenerate(). It too hinges on load balancing, and thus won't have any effect when set on a domain with a single group. Add it to SD_DEGENERATE_GROUPS_MASK. Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Link: https://lore.kernel.org/r/20200817113003.20802-12-valentin.schneider@arm.com --- kernel/sched/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index f36ed96f3197..c674aaab312c 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -184,7 +184,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) /* Flags needing groups don't count if only 1 group in parent */ if (parent->groups == parent->groups->next) - pflags &= ~(SD_DEGENERATE_GROUPS_MASK | SD_PREFER_SIBLING); + pflags &= ~SD_DEGENERATE_GROUPS_MASK; if (~cflags & pflags) return 0; -- cgit v1.2.3 From ec73240b1627cddfd7cef018c7fa1c32e64a721e Mon Sep 17 00:00:00 2001 From: Josh Don Date: Tue, 4 Aug 2020 12:34:13 -0700 Subject: sched/fair: Ignore cache hotness for SMT migration SMT siblings share caches, so cache hotness should be irrelevant for cross-sibling migration. 
Signed-off-by: Josh Don Proposed-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200804193413.510651-1-joshdon@google.com --- kernel/sched/fair.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1a68a0536add..abdb54e2339f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7402,6 +7402,10 @@ static int task_hot(struct task_struct *p, struct lb_env *env) if (unlikely(task_has_idle_policy(p))) return 0; + /* SMT siblings share cache */ + if (env->sd->flags & SD_SHARE_CPUCAPACITY) + return 0; + /* * Buddy candidates are cache hot: */ -- cgit v1.2.3 From da0777d35f47892f359c3f73ea155870bb595700 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Mon, 10 Aug 2020 09:30:04 +0100 Subject: sched/fair: Fix wrong negative conversion in find_energy_efficient_cpu() In find_energy_efficient_cpu() 'cpu_cap' could be less than 'util'. It might be because of RT, DL (so a higher sched class than CFS), irq or thermal pressure signals, which reduce the capacity value. In such a situation the result of 'cpu_cap - util' might be negative but stored in an unsigned long. Then it might be compared with another unsigned long when uclamp_rq_util_with() reduced the 'util' such that it passes the fits_capacity() check. Prevent this situation and make the arithmetic safer. Fixes: 1d42509e475cd ("sched/fair: Make EAS wakeup placement consider uclamp restrictions") Signed-off-by: Lukasz Luba Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/20200810083004.26420-1-lukasz.luba@arm.com --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index abdb54e2339f..90ebaa4ae16e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6594,7 +6594,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) util = cpu_util_next(cpu, p, cpu); cpu_cap = capacity_of(cpu); - spare_cap = cpu_cap - util; + spare_cap = cpu_cap; + lsub_positive(&spare_cap, util); /* * Skip CPUs that cannot satisfy the capacity request. -- cgit v1.2.3 From 1724b95b92979a8ea8e55a4817d05b3bb7750958 Mon Sep 17 00:00:00 2001 From: Jiang Biao Date: Tue, 11 Aug 2020 19:32:09 +0800 Subject: sched/fair: Simplify the work when reweighting entity The code in reweight_entity() can be simplified. For a sched entity on the rq, the entity accounting can be replaced by cfs_rq instantaneous load updates currently called from within the entity accounting. Even though an entity on the rq can't represent a task in reweight_entity() (a task is always dequeued before calling this function) and so the numa task accounting and the rq->cfs_tasks list management of the entity accounting are never called, the redundant cfs_rq->nr_running decrement/increment will be avoided.
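A stripped-down userspace sketch of that idea (not the kernel implementation; the structure and field names are invented): a pure reweight only needs the aggregate weight sum adjusted, while the counters that full enqueue/dequeue accounting would touch stay untouched.

  #include <stdio.h>

  struct rq_load { unsigned long weight_sum; unsigned int nr_running; };

  static void reweight(struct rq_load *rq, unsigned long *se_weight,
                       unsigned long new_weight, int on_rq)
  {
          if (on_rq)                     /* drop the old contribution... */
                  rq->weight_sum -= *se_weight;
          *se_weight = new_weight;
          if (on_rq)                     /* ...and add the new one back */
                  rq->weight_sum += *se_weight;
          /* nr_running is deliberately untouched: the entity never left the rq */
  }

  int main(void)
  {
          struct rq_load rq = { .weight_sum = 3072, .nr_running = 3 };
          unsigned long w = 1024;

          reweight(&rq, &w, 512, 1);
          printf("weight_sum=%lu nr_running=%u\n", rq.weight_sum, rq.nr_running);
          return 0;
  }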
Signed-off-by: Jiang Biao Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20200811113209.34057-1-benbjiang@tencent.com --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 90ebaa4ae16e..33699db27ed5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3084,7 +3084,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, /* commit outstanding execution time */ if (cfs_rq->curr == se) update_curr(cfs_rq); - account_entity_dequeue(cfs_rq, se); + update_load_sub(&cfs_rq->load, se->load.weight); } dequeue_load_avg(cfs_rq, se); @@ -3100,7 +3100,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, enqueue_load_avg(cfs_rq, se); if (se->on_rq) - account_entity_enqueue(cfs_rq, se); + update_load_add(&cfs_rq->load, se->load.weight); } -- cgit v1.2.3 From c1cecf884ad748f63f9139d5a18ee265ee2f70fb Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 19 Aug 2020 22:00:25 +0200 Subject: sched: Cache task_struct::flags in sched_submit_work() sched_submit_work() is considered to be a hot path. The preempt_disable() instruction is a compiler barrier and forces the compiler to load task_struct::flags for the second comparison. By using a local variable, the compiler can load the value once and keep it in a register for the second comparison. Verified on x86-64 with gcc-10. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200819200025.lqvmyefqnbok5i4f@linutronix.de --- kernel/sched/core.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8471a0f7eb32..c36dc1ae58be 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4551,9 +4551,12 @@ void __noreturn do_task_dead(void) static inline void sched_submit_work(struct task_struct *tsk) { + unsigned int task_flags; + if (!tsk->state) return; + task_flags = tsk->flags; /* * If a worker went to sleep, notify and ask workqueue whether * it wants to wake up a task to maintain concurrency. @@ -4562,9 +4565,9 @@ static inline void sched_submit_work(struct task_struct *tsk) * in the possible wakeup of a kworker and because wq_worker_sleeping() * requires it. */ - if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { + if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) { preempt_disable(); - if (tsk->flags & PF_WQ_WORKER) + if (task_flags & PF_WQ_WORKER) wq_worker_sleeping(tsk); else io_wq_worker_sleeping(tsk); -- cgit v1.2.3 From 8fca9494d4b4d6b57b1398cd473feb308df656db Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 25 Aug 2020 14:32:15 +0100 Subject: sched/topology: Move sd_flag_debug out of linux/sched/topology.h Defining an array in a header imported all over the place clearly is a daft idea, that still didn't stop me from doing it. Leave a declaration of sd_flag_debug in topology.h and move its definition to sched/debug.c. 
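The underlying C pitfall, as a two-file sketch (file and symbol names invented): a static array defined in a header is duplicated into every translation unit that includes it, and warns as unused in most of them; a declaration in the header plus a single out-of-line definition avoids both.

  /* flags.h -- problematic form: every includer gets its own copy */
  struct flag_debug { unsigned int meta; const char *name; };
  static const struct flag_debug flag_debug_dup[] = { { 0x1, "EX_A" }, { 0x2, "EX_B" } };

  /* flags.h -- better: only declare it here ... */
  extern const struct flag_debug flag_debug[2];

  /* flags.c -- ... and define it exactly once, next to its only user */
  const struct flag_debug flag_debug[2] = { { 0x1, "EX_A" }, { 0x2, "EX_B" } };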
Fixes: b6e862f38672 ("sched/topology: Define and assign sched_domain flag metadata") Reported-by: Andy Shevchenko Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200825133216.9163-1-valentin.schneider@arm.com --- kernel/sched/debug.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 0655524700d2..0d7896d2a0b2 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -245,6 +245,12 @@ set_table_entry(struct ctl_table *entry, entry->proc_handler = proc_handler; } +#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name }, +const struct sd_flag_debug sd_flag_debug[] = { +#include +}; +#undef SD_FLAG + static int sd_ctl_doflags(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { -- cgit v1.2.3 From 4fc472f1214ef75e5450f207e23ff13af6eecad4 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 25 Aug 2020 14:32:16 +0100 Subject: sched/topology: Move SD_DEGENERATE_GROUPS_MASK out of linux/sched/topology.h SD_DEGENERATE_GROUPS_MASK is only useful for sched/topology.c, but still gets defined for anyone who imports topology.h, leading to a flurry of unused variable warnings. Move it out of the header and place it next to the SD degeneration functions in sched/topology.c. Fixes: 4ee4ea443a5d ("sched/topology: Introduce SD metaflag for flags needing > 1 groups") Reported-by: Andy Shevchenko Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200825133216.9163-2-valentin.schneider@arm.com --- kernel/sched/topology.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index c674aaab312c..aa1676abc544 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -154,6 +154,13 @@ static inline bool sched_debug(void) } #endif /* CONFIG_SCHED_DEBUG */ +/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */ +#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) | +static const unsigned int SD_DEGENERATE_GROUPS_MASK = +#include +0; +#undef SD_FLAG + static int sd_degenerate(struct sched_domain *sd) { if (cpumask_weight(sched_domain_span(sd)) == 1) -- cgit v1.2.3 From 848785df48835eefebe0c4eb5da7690690b0a8b7 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 8 Sep 2020 19:49:56 +0100 Subject: sched/topology: Move sd_flag_debug out of #ifdef CONFIG_SYSCTL The last sd_flag_debug shuffle inadvertently moved its definition within an #ifdef CONFIG_SYSCTL region. While CONFIG_SYSCTL is indeed required to produce the sched domain ctl interface (which uses sd_flag_debug to output flag names), it isn't required to run any assertion on the sched_domain hierarchy itself. Move the definition of sd_flag_debug to a CONFIG_SCHED_DEBUG region of topology.c. 
Now at long last we have: - sd_flag_debug declared in include/linux/sched/topology.h iff CONFIG_SCHED_DEBUG=y - sd_flag_debug defined in kernel/sched/topology.c, conditioned by: - CONFIG_SCHED_DEBUG, with an explicit #ifdef block - CONFIG_SMP, as a requirement to compile topology.c With this change, all symbols pertaining to SD flag metadata (with the exception of __SD_FLAG_CNT) are now defined exclusively within topology.c Fixes: 8fca9494d4b4 ("sched/topology: Move sd_flag_debug out of linux/sched/topology.h") Reported-by: Randy Dunlap Signed-off-by: Valentin Schneider Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200908184956.23369-1-valentin.schneider@arm.com --- kernel/sched/debug.c | 6 ------ kernel/sched/topology.c | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 0d7896d2a0b2..0655524700d2 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -245,12 +245,6 @@ set_table_entry(struct ctl_table *entry, entry->proc_handler = proc_handler; } -#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name }, -const struct sd_flag_debug sd_flag_debug[] = { -#include -}; -#undef SD_FLAG - static int sd_ctl_doflags(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index aa1676abc544..249bec7b0a4c 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -25,6 +25,12 @@ static inline bool sched_debug(void) return sched_debug_enabled; } +#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name }, +const struct sd_flag_debug sd_flag_debug[] = { +#include +}; +#undef SD_FLAG + static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, struct cpumask *groupmask) { -- cgit v1.2.3 From 8e0e0eda6a13e242239799263cc354796c05434a Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 21 Sep 2020 09:29:59 +0200 Subject: sched/numa: Use runnable_avg to classify node Use runnable_avg to classify numa node state similarly to what is done for normal load balancer. This helps to ensure that numa and normal balancers use the same view of the state of the system. 
Large arm64 system: 2 nodes / 224 CPUs:
hackbench -l (256000/#grp) -g #grp
grp   tip/sched/core        +patchset             improvement
1     14.008(+/- 4.99 %)    13.800(+/- 3.88 %)     1.48 %
4      4.340(+/- 5.35 %)     4.283(+/- 4.85 %)     1.33 %
16     3.357(+/- 0.55 %)     3.359(+/- 0.54 %)    -0.06 %
32     3.050(+/- 0.94 %)     3.039(+/- 1.06 %)     0.38 %
64     2.968(+/- 1.85 %)     3.006(+/- 2.92 %)    -1.27 %
128    3.290(+/-12.61 %)     3.108(+/- 5.97 %)     5.51 %
256    3.235(+/- 3.95 %)     3.188(+/- 2.83 %)     1.45 %
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mel Gorman Link: https://lkml.kernel.org/r/20200921072959.16317-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 33699db27ed5..a15deb210a17 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1504,6 +1504,7 @@ enum numa_type { /* Cached statistics for all CPUs within a node */ struct numa_stats { unsigned long load; + unsigned long runnable; unsigned long util; /* Total compute capacity of CPUs on a node */ unsigned long compute_capacity; @@ -1547,6 +1548,7 @@ struct task_numa_env { }; static unsigned long cpu_load(struct rq *rq); +static unsigned long cpu_runnable(struct rq *rq); static unsigned long cpu_util(int cpu); static inline long adjust_numa_imbalance(int imbalance, int src_nr_running); @@ -1555,11 +1557,13 @@ numa_type numa_classify(unsigned int imbalance_pct, struct numa_stats *ns) { if ((ns->nr_running > ns->weight) && - ((ns->compute_capacity * 100) < (ns->util * imbalance_pct))) + (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) || + ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100)))) return node_overloaded; if ((ns->nr_running < ns->weight) || - ((ns->compute_capacity * 100) > (ns->util * imbalance_pct))) + (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) && + ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100)))) return node_has_spare; return node_fully_busy; @@ -1610,6 +1614,7 @@ static void update_numa_stats(struct task_numa_env *env, struct rq *rq = cpu_rq(cpu); ns->load += cpu_load(rq); + ns->runnable += cpu_runnable(rq); ns->util += cpu_util(cpu); ns->nr_running += rq->cfs.h_nr_running; ns->compute_capacity += capacity_of(cpu); -- cgit v1.2.3 From 46fcc4b00c3cca8adb9b7c9afdd499f64e427135 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Mon, 31 Aug 2020 13:07:19 +0200 Subject: sched/deadline: Fix stale throttling on de-/boosted tasks When a boosted task gets throttled, what normally happens is that it's immediately enqueued again with ENQUEUE_REPLENISH, which replenishes the runtime and clears the dl_throttled flag. There is a special case however: if the throttling happened on sched-out and the task has been deboosted in the meantime, the replenish is skipped as the task will return to its normal scheduling class. This leaves the task with the dl_throttled flag set. Now if the task gets boosted up to the deadline scheduling class again while it is sleeping, it's still in the throttled state. The normal wakeup however will enqueue the task with ENQUEUE_REPLENISH not set, so we don't actually place it on the rq. Thus we end up with a task that is runnable, but not actually on the rq, and neither an immediate replenishment happens nor is the replenishment timer set up, so the task is stuck in forever-throttled limbo. Clear the dl_throttled flag before dropping back to the normal scheduling class to fix this issue.
Signed-off-by: Lucas Stach Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Link: https://lkml.kernel.org/r/20200831110719.2126930-1-l.stach@pengutronix.de --- kernel/sched/deadline.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 3862a28cd05d..c19c1883d695 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1527,12 +1527,15 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) pi_se = &pi_task->dl; } else if (!dl_prio(p->normal_prio)) { /* - * Special case in which we have a !SCHED_DEADLINE task - * that is going to be deboosted, but exceeds its - * runtime while doing so. No point in replenishing - * it, as it's going to return back to its original - * scheduling class after this. + * Special case in which we have a !SCHED_DEADLINE task that is going + * to be deboosted, but exceeds its runtime while doing so. No point in + * replenishing it, as it's going to return back to its original + * scheduling class after this. If it has been throttled, we need to + * clear the flag, otherwise the task may wake up as throttled after + * being boosted again with no means to replenish the runtime and clear + * the throttle. */ + p->dl.dl_throttled = 0; BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH); return; } -- cgit v1.2.3 From 2586af1ac187f6b3a50930a4e33497074e81762d Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Mon, 21 Sep 2020 16:39:49 +0200 Subject: sched/rt: Disable RT_RUNTIME_SHARE by default The RT_RUNTIME_SHARE sched feature enables the sharing of rt_runtime between CPUs, allowing a CPU to run a real-time task up to 100% of the time while leaving more space for non-real-time tasks to run on the CPU that lend rt_runtime. The problem is that a CPU can easily borrow enough rt_runtime to allow a spinning rt-task to run forever, starving per-cpu tasks like kworkers, which are non-real-time by design. This patch disables RT_RUNTIME_SHARE by default, avoiding this problem. The feature will still be present for users that want to enable it, though. Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Tested-by: Wei Wang Link: https://lkml.kernel.org/r/b776ab46817e3db5d8ef79175fa0d71073c051c7.1600697903.git.bristot@redhat.com --- kernel/sched/features.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 7481cd96f391..68d369cba9e4 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -77,7 +77,7 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false) SCHED_FEAT(RT_PUSH_IPI, true) #endif -SCHED_FEAT(RT_RUNTIME_SHARE, true) +SCHED_FEAT(RT_RUNTIME_SHARE, false) SCHED_FEAT(LB_MIN, false) SCHED_FEAT(ATTACH_AGE_LOAD, true) -- cgit v1.2.3 From 51bd5121c4eb25b911f6bc1ab4de5fe865fe0dcb Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 22 Sep 2020 21:24:10 +0800 Subject: sched: Remove unused inline function uclamp_bucket_base_value() There is no caller in tree, so can remove it. 
Signed-off-by: YueHaibing Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Link: https://lkml.kernel.org/r/20200922132410.48440-1-yuehaibing@huawei.com --- kernel/sched/core.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c36dc1ae58be..dd32d859ab40 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -940,11 +940,6 @@ static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) return clamp_value / UCLAMP_BUCKET_DELTA; } -static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value) -{ - return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value); -} - static inline unsigned int uclamp_none(enum uclamp_id clamp_id) { if (clamp_id == UCLAMP_MIN) -- cgit v1.2.3 From df3cb4ea1fb63ff326488efd671ba3c39034255e Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Thu, 24 Sep 2020 14:48:47 +0800 Subject: sched/fair: Fix wrong cpu selecting from isolated domain We've met problems that occasionally tasks with full cpumask (e.g. by putting it into a cpuset or setting to full affinity) were migrated to our isolated cpus in production environment. After some analysis, we found that it is due to the current select_idle_smt() not considering the sched_domain mask. Steps to reproduce on my 31-CPU hyperthreads machine: 1. with boot parameter: "isolcpus=domain,2-31" (thread lists: 0,16 and 1,17) 2. cgcreate -g cpu:test; cgexec -g cpu:test "test_threads" 3. some threads will be migrated to the isolated cpu16~17. Fix it by checking the valid domain mask in select_idle_smt(). Fixes: 10e2f1acd010 ("sched/core: Rewrite and improve select_idle_siblings()) Reported-by: Wetp Zhang Signed-off-by: Xunlei Pang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jiang Biao Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/1600930127-76857-1-git-send-email-xlpang@linux.alibaba.com --- kernel/sched/fair.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a15deb210a17..9613e5d39d8a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6080,7 +6080,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int /* * Scan the local SMT mask for idle CPUs. 
*/ -static int select_idle_smt(struct task_struct *p, int target) +static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) { int cpu; @@ -6088,7 +6088,8 @@ static int select_idle_smt(struct task_struct *p, int target) return -1; for_each_cpu(cpu, cpu_smt_mask(target)) { - if (!cpumask_test_cpu(cpu, p->cpus_ptr)) + if (!cpumask_test_cpu(cpu, p->cpus_ptr) || + !cpumask_test_cpu(cpu, sched_domain_span(sd))) continue; if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) return cpu; @@ -6104,7 +6105,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s return -1; } -static inline int select_idle_smt(struct task_struct *p, int target) +static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) { return -1; } @@ -6279,7 +6280,7 @@ symmetric: if ((unsigned)i < nr_cpumask_bits) return i; - i = select_idle_smt(p, target); + i = select_idle_smt(p, sd, target); if ((unsigned)i < nr_cpumask_bits) return i; -- cgit v1.2.3 From fe7491580d7c56152ea8d9d3124201191617435d Mon Sep 17 00:00:00 2001 From: Xianting Tian Date: Thu, 24 Sep 2020 09:47:55 +0800 Subject: sched/fair: Remove the force parameter of update_tg_load_avg() In the file fair.c, sometims update_tg_load_avg(cfs_rq, 0) is used, sometimes update_tg_load_avg(cfs_rq, false) is used. update_tg_load_avg() has the parameter force, but in current code, it never set 1 or true to it, so remove the force parameter. Signed-off-by: Xianting Tian Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200924014755.36253-1-tian.xianting@h3c.com --- kernel/sched/fair.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9613e5d39d8a..b56276ab2525 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -831,7 +831,7 @@ void init_entity_runnable_average(struct sched_entity *se) void post_init_entity_util_avg(struct task_struct *p) { } -static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) +static void update_tg_load_avg(struct cfs_rq *cfs_rq) { } #endif /* CONFIG_SMP */ @@ -3293,7 +3293,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) /** * update_tg_load_avg - update the tg's load avg * @cfs_rq: the cfs_rq whose avg changed - * @force: update regardless of how small the difference * * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load. * However, because tg->load_avg is a global value there are performance @@ -3305,7 +3304,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) * * Updating tg's load_avg is necessary before update_cfs_share(). 
*/ -static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) +static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) { long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; @@ -3315,7 +3314,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) if (cfs_rq->tg == &root_task_group) return; - if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { + if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { atomic_long_add(delta, &cfs_rq->tg->load_avg); cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; } @@ -3617,7 +3616,7 @@ static inline bool skip_blocked_update(struct sched_entity *se) #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} +static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {} static inline int propagate_entity_load_avg(struct sched_entity *se) { @@ -3805,13 +3804,13 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s * IOW we're enqueueing a task on a new CPU. */ attach_entity_load_avg(cfs_rq, se); - update_tg_load_avg(cfs_rq, 0); + update_tg_load_avg(cfs_rq); } else if (decayed) { cfs_rq_util_change(cfs_rq, 0); if (flags & UPDATE_TG) - update_tg_load_avg(cfs_rq, 0); + update_tg_load_avg(cfs_rq); } } @@ -7898,7 +7897,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) struct sched_entity *se; if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) { - update_tg_load_avg(cfs_rq, 0); + update_tg_load_avg(cfs_rq); if (cfs_rq == &rq->cfs) decayed = true; @@ -10797,7 +10796,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se) /* Catch up with the cfs_rq and remove our load when we leave */ update_load_avg(cfs_rq, se, 0); detach_entity_load_avg(cfs_rq, se); - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq); propagate_entity_cfs_rq(se); } @@ -10816,7 +10815,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se) /* Synchronize entity with its cfs_rq */ update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); attach_entity_load_avg(cfs_rq, se); - update_tg_load_avg(cfs_rq, false); + update_tg_load_avg(cfs_rq); propagate_entity_cfs_rq(se); } -- cgit v1.2.3 From 5a7f555904671c0737819fe4d19bd6143de3f6c0 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 21 Sep 2020 09:24:21 +0200 Subject: sched/fair: Relax constraint on task's load during load balance Some use cases, like 9 always-running tasks on 8 CPUs, can't be balanced, and the load balancer currently migrates the waiting task between the CPUs in an almost random manner. The success of a rq pulling a task depends on the value of nr_balance_failed of its domains and on its ability to be faster than others to detach it. This behavior results in an unfair distribution of the running time between tasks because some CPUs will run most of the time, if not always, the same task whereas others will share their time between several tasks. Instead of using nr_balance_failed as a boolean to relax the condition for detaching a task, the LB will use nr_balance_failed to relax the threshold between the task's load and the imbalance. This mechanism prevents the same rq or domain from always winning the load balance fight.
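A toy calculation (plain userspace C, made-up numbers) of how the relaxed test behaves: each failed balance attempt halves the load value compared against the imbalance, so a waiting task that is initially "too big to move" eventually becomes eligible instead of being skipped until cache_nice_tries is exceeded.

  #include <stdio.h>

  int main(void)
  {
          unsigned long load = 1024, imbalance = 300;

          for (unsigned int nr_balance_failed = 0; nr_balance_failed < 4; nr_balance_failed++) {
                  int skip = (load >> nr_balance_failed) > imbalance;

                  printf("nr_balance_failed=%u effective load=%4lu -> %s\n",
                         nr_balance_failed, load >> nr_balance_failed,
                         skip ? "skip (goto next)" : "detach");
          }
          return 0;
  }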
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Link: https://lkml.kernel.org/r/20200921072424.14813-2-vincent.guittot@linaro.org --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b56276ab2525..5e3add351b89 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7679,8 +7679,8 @@ static int detach_tasks(struct lb_env *env) * scheduler fails to find a good waiting task to * migrate. */ - if (load/2 > env->imbalance && - env->sd->nr_balance_failed <= env->sd->cache_nice_tries) + + if ((load >> env->sd->nr_balance_failed) > env->imbalance) goto next; env->imbalance -= load; -- cgit v1.2.3 From 2208cdaa56c957e20d8e16f28819aeb47851cb1e Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 21 Sep 2020 09:24:22 +0200 Subject: sched/fair: Reduce minimal imbalance threshold The 25% default imbalance threshold for the DIE and NUMA domains is large enough to generate significant unfairness between threads. A typical example is the case of 11 threads running on 2x4 CPUs. The imbalance of 20% between the 2 groups of 4 cores is just low enough to not trigger the load balance between the 2 groups. We will always have the same 6 threads on one group of 4 CPUs and the other 5 threads on the other group of CPUs. With fair time sharing in each group, we end up with +20% running time for the group of 5 threads. Decrease the imbalance threshold for the overloaded case, where we use the load to balance tasks and to ensure fair time sharing. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Acked-by: Hillf Danton Link: https://lkml.kernel.org/r/20200921072424.14813-3-vincent.guittot@linaro.org --- kernel/sched/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 249bec7b0a4c..41df62884cea 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1349,7 +1349,7 @@ sd_init(struct sched_domain_topology_level *tl, .min_interval = sd_weight, .max_interval = 2*sd_weight, .busy_factor = 32, - .imbalance_pct = 125, + .imbalance_pct = 117, .cache_nice_tries = 0, -- cgit v1.2.3 From e4d32e4d5444977d8dc25fa98b3ce0a65544db8c Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 21 Sep 2020 09:24:23 +0200 Subject: sched/fair: Minimize concurrent LBs between domain level Sched domains tend to trigger the load balance loop simultaneously, but the larger domains often need more time to collect statistics. This slowness makes the larger domain try to detach tasks from a rq whose tasks have already migrated somewhere else at a sub-domain level. This is not a real problem for idle LB because the period of the smaller domains will increase while their CPUs are busy, and this will leave time for the higher ones to pull tasks. But this becomes a problem when all CPUs are already busy because all domains stay synced when they trigger their LB. A simple way to minimize simultaneous LB of all domains is to decrement the busy interval by 1 jiffy. Because of the busy_factor, the interval of a larger domain will not be a multiple of the smaller ones anymore.
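Back-of-the-envelope illustration of that last point (hypothetical 4-CPU MC and 16-CPU DIE spans, busy_factor 32, ignoring the ms-to-jiffies conversion): intervals of 128 and 512 line up on every DIE period, while 127 and 511 only coincide every 64897 jiffies.

  #include <stdio.h>

  static unsigned long gcd(unsigned long a, unsigned long b)
  {
          while (b) {
                  unsigned long t = a % b;

                  a = b;
                  b = t;
          }
          return a;
  }

  static unsigned long lcm(unsigned long a, unsigned long b)
  {
          return a / gcd(a, b) * b;
  }

  int main(void)
  {
          unsigned long mc = 4 * 32, die = 16 * 32;   /* hypothetical busy intervals */

          printf("aligned every %lu jiffies\n", lcm(mc, die));             /* 512 */
          printf("after -1: every %lu jiffies\n", lcm(mc - 1, die - 1));   /* 64897 */
          return 0;
  }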
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Link: https://lkml.kernel.org/r/20200921072424.14813-4-vincent.guittot@linaro.org --- kernel/sched/fair.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5e3add351b89..24a5ee63718f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9790,6 +9790,15 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) /* scale ms to jiffies */ interval = msecs_to_jiffies(interval); + + /* + * Reduce likelihood of busy balancing at higher domains racing with + * balancing at lower domains by preventing their balancing periods + * from being multiples of each other. + */ + if (cpu_busy) + interval -= 1; + interval = clamp(interval, 1UL, max_load_balance_interval); return interval; -- cgit v1.2.3 From 6e7499135db724539ca887b3aa64122502875c71 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 21 Sep 2020 09:24:24 +0200 Subject: sched/fair: Reduce busy load balance interval The busy_factor, which increases the load balance interval when a cpu is busy, is set to 32 by default. This value generates some huge LB intervals on a large system like the THX2, made of 2 nodes x 28 cores x 4 threads. For such a system, the interval increases from 112ms to 3584ms at MC level, and from 228ms to 7168ms at NUMA level. Even on smaller systems, a lower busy factor has shown improvement in the fair distribution of the running time, so let's reduce it for all. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Link: https://lkml.kernel.org/r/20200921072424.14813-5-vincent.guittot@linaro.org --- kernel/sched/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 41df62884cea..a3a2417fec54 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1348,7 +1348,7 @@ sd_init(struct sched_domain_topology_level *tl, *sd = (struct sched_domain){ .min_interval = sd_weight, .max_interval = 2*sd_weight, - .busy_factor = 32, + .busy_factor = 16, .imbalance_pct = 117, .cache_nice_tries = 0, -- cgit v1.2.3 From 233e7aca4c8a2c764f556bba9644c36154017e7f Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 21 Sep 2020 23:18:49 +0100 Subject: sched/fair: Use dst group while checking imbalance for NUMA balancer Barry Song noted the following: Something is wrong. In find_busiest_group(), we are checking if src has higher load; however, in task_numa_find_cpu(), we are checking if dst will have higher load after balancing. It seems it is not sensible to check src. It may cause a wrong imbalance value: for example, if dst_running = env->dst_stats.nr_running + 1 results in 3 or above, and src_running = env->src_stats.nr_running - 1 results in 1, the current code treats the imbalance as 0 since src_running is smaller than 2. This is inconsistent with the load balancer. Basically, in find_busiest_group(), the NUMA imbalance is ignored if moving a task "from an almost idle domain" to a "domain with spare capacity". This patch forbids movement "from a misplaced domain" to "an almost idle domain" as that is closer to what the CPU load balancer expects. This patch is not a universal win. The old behaviour was intended to allow a task from an almost idle NUMA node to migrate to its preferred node if the destination had capacity but there are corner cases.
For example, a NAS compute load could be parallelised to use 1/3rd of available CPUs but not all those potential tasks are active at all times allowing this logic to trigger. An obvious example is specjbb 2005 running various numbers of warehouses on a 2 socket box with 80 cpus. specjbb 5.9.0-rc4 5.9.0-rc4 vanilla dstbalance-v1r1 Hmean tput-1 46425.00 ( 0.00%) 43394.00 * -6.53%* Hmean tput-2 98416.00 ( 0.00%) 96031.00 * -2.42%* Hmean tput-3 150184.00 ( 0.00%) 148783.00 * -0.93%* Hmean tput-4 200683.00 ( 0.00%) 197906.00 * -1.38%* Hmean tput-5 236305.00 ( 0.00%) 245549.00 * 3.91%* Hmean tput-6 281559.00 ( 0.00%) 285692.00 * 1.47%* Hmean tput-7 338558.00 ( 0.00%) 334467.00 * -1.21%* Hmean tput-8 340745.00 ( 0.00%) 372501.00 * 9.32%* Hmean tput-9 424343.00 ( 0.00%) 413006.00 * -2.67%* Hmean tput-10 421854.00 ( 0.00%) 434261.00 * 2.94%* Hmean tput-11 493256.00 ( 0.00%) 485330.00 * -1.61%* Hmean tput-12 549573.00 ( 0.00%) 529959.00 * -3.57%* Hmean tput-13 593183.00 ( 0.00%) 555010.00 * -6.44%* Hmean tput-14 588252.00 ( 0.00%) 599166.00 * 1.86%* Hmean tput-15 623065.00 ( 0.00%) 642713.00 * 3.15%* Hmean tput-16 703924.00 ( 0.00%) 660758.00 * -6.13%* Hmean tput-17 666023.00 ( 0.00%) 697675.00 * 4.75%* Hmean tput-18 761502.00 ( 0.00%) 758360.00 * -0.41%* Hmean tput-19 796088.00 ( 0.00%) 798368.00 * 0.29%* Hmean tput-20 733564.00 ( 0.00%) 823086.00 * 12.20%* Hmean tput-21 840980.00 ( 0.00%) 856711.00 * 1.87%* Hmean tput-22 804285.00 ( 0.00%) 872238.00 * 8.45%* Hmean tput-23 795208.00 ( 0.00%) 889374.00 * 11.84%* Hmean tput-24 848619.00 ( 0.00%) 966783.00 * 13.92%* Hmean tput-25 750848.00 ( 0.00%) 903790.00 * 20.37%* Hmean tput-26 780523.00 ( 0.00%) 962254.00 * 23.28%* Hmean tput-27 1042245.00 ( 0.00%) 991544.00 * -4.86%* Hmean tput-28 1090580.00 ( 0.00%) 1035926.00 * -5.01%* Hmean tput-29 999483.00 ( 0.00%) 1082948.00 * 8.35%* Hmean tput-30 1098663.00 ( 0.00%) 1113427.00 * 1.34%* Hmean tput-31 1125671.00 ( 0.00%) 1134175.00 * 0.76%* Hmean tput-32 968167.00 ( 0.00%) 1250286.00 * 29.14%* Hmean tput-33 1077676.00 ( 0.00%) 1060893.00 * -1.56%* Hmean tput-34 1090538.00 ( 0.00%) 1090933.00 * 0.04%* Hmean tput-35 967058.00 ( 0.00%) 1107421.00 * 14.51%* Hmean tput-36 1051745.00 ( 0.00%) 1210663.00 * 15.11%* Hmean tput-37 1019465.00 ( 0.00%) 1351446.00 * 32.56%* Hmean tput-38 1083102.00 ( 0.00%) 1064541.00 * -1.71%* Hmean tput-39 1232990.00 ( 0.00%) 1303623.00 * 5.73%* Hmean tput-40 1175542.00 ( 0.00%) 1340943.00 * 14.07%* Hmean tput-41 1127826.00 ( 0.00%) 1339492.00 * 18.77%* Hmean tput-42 1198313.00 ( 0.00%) 1411023.00 * 17.75%* Hmean tput-43 1163733.00 ( 0.00%) 1228253.00 * 5.54%* Hmean tput-44 1305562.00 ( 0.00%) 1357886.00 * 4.01%* Hmean tput-45 1326752.00 ( 0.00%) 1406061.00 * 5.98%* Hmean tput-46 1339424.00 ( 0.00%) 1418451.00 * 5.90%* Hmean tput-47 1415057.00 ( 0.00%) 1381570.00 * -2.37%* Hmean tput-48 1392003.00 ( 0.00%) 1421167.00 * 2.10%* Hmean tput-49 1408374.00 ( 0.00%) 1418659.00 * 0.73%* Hmean tput-50 1359822.00 ( 0.00%) 1391070.00 * 2.30%* Hmean tput-51 1414246.00 ( 0.00%) 1392679.00 * -1.52%* Hmean tput-52 1432352.00 ( 0.00%) 1354020.00 * -5.47%* Hmean tput-53 1387563.00 ( 0.00%) 1409563.00 * 1.59%* Hmean tput-54 1406420.00 ( 0.00%) 1388711.00 * -1.26%* Hmean tput-55 1438804.00 ( 0.00%) 1387472.00 * -3.57%* Hmean tput-56 1399465.00 ( 0.00%) 1400296.00 * 0.06%* Hmean tput-57 1428132.00 ( 0.00%) 1396399.00 * -2.22%* Hmean tput-58 1432385.00 ( 0.00%) 1386253.00 * -3.22%* Hmean tput-59 1421612.00 ( 0.00%) 1371416.00 * -3.53%* Hmean tput-60 1429423.00 ( 0.00%) 1389412.00 * -2.80%* Hmean tput-61 
1396230.00 ( 0.00%) 1351122.00 * -3.23%* Hmean tput-62 1418396.00 ( 0.00%) 1383098.00 * -2.49%* Hmean tput-63 1409918.00 ( 0.00%) 1374662.00 * -2.50%* Hmean tput-64 1410236.00 ( 0.00%) 1376216.00 * -2.41%* Hmean tput-65 1396405.00 ( 0.00%) 1364418.00 * -2.29%* Hmean tput-66 1395975.00 ( 0.00%) 1357326.00 * -2.77%* Hmean tput-67 1392986.00 ( 0.00%) 1349642.00 * -3.11%* Hmean tput-68 1386541.00 ( 0.00%) 1343261.00 * -3.12%* Hmean tput-69 1374407.00 ( 0.00%) 1342588.00 * -2.32%* Hmean tput-70 1377513.00 ( 0.00%) 1334654.00 * -3.11%* Hmean tput-71 1369319.00 ( 0.00%) 1334952.00 * -2.51%* Hmean tput-72 1354635.00 ( 0.00%) 1329005.00 * -1.89%* Hmean tput-73 1350933.00 ( 0.00%) 1318942.00 * -2.37%* Hmean tput-74 1351714.00 ( 0.00%) 1316347.00 * -2.62%* Hmean tput-75 1352198.00 ( 0.00%) 1309974.00 * -3.12%* Hmean tput-76 1349490.00 ( 0.00%) 1286064.00 * -4.70%* Hmean tput-77 1336131.00 ( 0.00%) 1303684.00 * -2.43%* Hmean tput-78 1308896.00 ( 0.00%) 1271024.00 * -2.89%* Hmean tput-79 1326703.00 ( 0.00%) 1290862.00 * -2.70%* Hmean tput-80 1336199.00 ( 0.00%) 1291629.00 * -3.34%* The performance at the mid-point is better but not universally better. The patch is a mixed bag depending on the workload, machine and overall levels of utilisation. Sometimes it's better (sometimes much better), other times it is worse (sometimes much worse). Given that there isn't a universally good decision in this section and more people seem to prefer the patch then it may be best to keep the LB decisions consistent and revisit imbalance handling when the load balancer code changes settle down. Jirka Hladky added the following observation. Our results are mostly in line with what you see. We observe big gains (20-50%) when the system is loaded to 1/3 of the maximum capacity and mixed results at the full load - some workloads benefit from the patch at the full load, others not, but performance changes at the full load are mostly within the noise of results (+/-5%). Overall, we think this patch is helpful. 
[mgorman@techsingularity.net: Rewrote changelog] Fixes: fb86f5b211 ("sched/numa: Use similar logic to the load balancer for moving between domains with spare capacity") Signed-off-by: Barry Song Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200921221849.GI3179@techsingularity.net --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 24a5ee63718f..fc3410b8b990 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1550,7 +1550,7 @@ struct task_numa_env { static unsigned long cpu_load(struct rq *rq); static unsigned long cpu_runnable(struct rq *rq); static unsigned long cpu_util(int cpu); -static inline long adjust_numa_imbalance(int imbalance, int src_nr_running); +static inline long adjust_numa_imbalance(int imbalance, int nr_running); static inline enum numa_type numa_classify(unsigned int imbalance_pct, @@ -1930,7 +1930,7 @@ static void task_numa_find_cpu(struct task_numa_env *env, src_running = env->src_stats.nr_running - 1; dst_running = env->dst_stats.nr_running + 1; imbalance = max(0, dst_running - src_running); - imbalance = adjust_numa_imbalance(imbalance, src_running); + imbalance = adjust_numa_imbalance(imbalance, dst_running); /* Use idle CPU if there is no imbalance */ if (!imbalance) { @@ -8967,7 +8967,7 @@ next_group: } } -static inline long adjust_numa_imbalance(int imbalance, int src_nr_running) +static inline long adjust_numa_imbalance(int imbalance, int nr_running) { unsigned int imbalance_min; @@ -8976,7 +8976,7 @@ static inline long adjust_numa_imbalance(int imbalance, int src_nr_running) * tasks that remain local when the source domain is almost idle. */ imbalance_min = 2; - if (src_nr_running <= imbalance_min) + if (nr_running <= imbalance_min) return 0; return imbalance; -- cgit v1.2.3 From 2a36ab717e8fe678d98f81c14a0b124712719840 Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Wed, 23 Sep 2020 16:36:16 -0700 Subject: rseq/membarrier: Add MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ This patchset is based on Google-internal RSEQ work done by Paul Turner and Andrew Hunter. When working with per-CPU RSEQ-based memory allocations, it is sometimes important to make sure that a global memory location is no longer accessed from RSEQ critical sections. For example, there can be two per-CPU lists, one is "active" and accessed per-CPU, while another one is inactive and worked on asynchronously "off CPU" (e.g. garbage collection is performed). Then at some point the two lists are swapped, and a fast RCU-like mechanism is required to make sure that the previously active list is no longer accessed. This patch introduces such a mechanism: in short, membarrier() syscall issues an IPI to a CPU, restarting a potentially active RSEQ critical section on the CPU. 
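A hedged userspace usage sketch (assumes a kernel with this series and the matching uapi definitions in <linux/membarrier.h>; error handling abbreviated):

  #include <linux/membarrier.h>
  #include <sys/syscall.h>
  #include <unistd.h>
  #include <stdio.h>

  static long membarrier(int cmd, unsigned int flags, int cpu_id)
  {
          return syscall(__NR_membarrier, cmd, flags, cpu_id);
  }

  int main(void)
  {
          int cpu = 0;   /* CPU whose RSEQ critical section should be restarted */

          if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, 0, 0))
                  perror("register");

          /* ... swap the per-CPU "active" list pointer here ... */

          if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
                         MEMBARRIER_CMD_FLAG_CPU, cpu))
                  perror("expedited rseq");

          /* the old list for that CPU can now be reclaimed safely */
          return 0;
  }

After the second call returns, any RSEQ critical section that was running on the target CPU has been restarted, so it can no longer be publishing into the old structure.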
Signed-off-by: Peter Oskolkov Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mathieu Desnoyers Link: https://lkml.kernel.org/r/20200923233618.2572849-1-posk@google.com --- kernel/sched/membarrier.c | 136 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 106 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 168479a7d61b..e23e74d52db5 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -18,6 +18,14 @@ #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0 #endif +#ifdef CONFIG_RSEQ +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK \ + (MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ \ + | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ_BITMASK) +#else +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK 0 +#endif + #define MEMBARRIER_CMD_BITMASK \ (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ @@ -30,6 +38,11 @@ static void ipi_mb(void *info) smp_mb(); /* IPIs should be serializing but paranoid. */ } +static void ipi_rseq(void *info) +{ + rseq_preempt(current); +} + static void ipi_sync_rq_state(void *info) { struct mm_struct *mm = (struct mm_struct *) info; @@ -129,19 +142,27 @@ static int membarrier_global_expedited(void) return 0; } -static int membarrier_private_expedited(int flags) +static int membarrier_private_expedited(int flags, int cpu_id) { - int cpu; cpumask_var_t tmpmask; struct mm_struct *mm = current->mm; + smp_call_func_t ipi_func = ipi_mb; - if (flags & MEMBARRIER_FLAG_SYNC_CORE) { + if (flags == MEMBARRIER_FLAG_SYNC_CORE) { if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) return -EINVAL; if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) return -EPERM; + } else if (flags == MEMBARRIER_FLAG_RSEQ) { + if (!IS_ENABLED(CONFIG_RSEQ)) + return -EINVAL; + if (!(atomic_read(&mm->membarrier_state) & + MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY)) + return -EPERM; + ipi_func = ipi_rseq; } else { + WARN_ON_ONCE(flags); if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) return -EPERM; @@ -156,35 +177,59 @@ static int membarrier_private_expedited(int flags) */ smp_mb(); /* system call entry is not a mb. */ - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) return -ENOMEM; cpus_read_lock(); - rcu_read_lock(); - for_each_online_cpu(cpu) { + + if (cpu_id >= 0) { struct task_struct *p; - /* - * Skipping the current CPU is OK even through we can be - * migrated at any point. The current CPU, at the point - * where we read raw_smp_processor_id(), is ensured to - * be in program order with respect to the caller - * thread. Therefore, we can skip this CPU from the - * iteration. - */ - if (cpu == raw_smp_processor_id()) - continue; - p = rcu_dereference(cpu_rq(cpu)->curr); - if (p && p->mm == mm) - __cpumask_set_cpu(cpu, tmpmask); + if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id)) + goto out; + if (cpu_id == raw_smp_processor_id()) + goto out; + rcu_read_lock(); + p = rcu_dereference(cpu_rq(cpu_id)->curr); + if (!p || p->mm != mm) { + rcu_read_unlock(); + goto out; + } + rcu_read_unlock(); + } else { + int cpu; + + rcu_read_lock(); + for_each_online_cpu(cpu) { + struct task_struct *p; + + /* + * Skipping the current CPU is OK even through we can be + * migrated at any point. 
The current CPU, at the point + * where we read raw_smp_processor_id(), is ensured to + * be in program order with respect to the caller + * thread. Therefore, we can skip this CPU from the + * iteration. + */ + if (cpu == raw_smp_processor_id()) + continue; + p = rcu_dereference(cpu_rq(cpu)->curr); + if (p && p->mm == mm) + __cpumask_set_cpu(cpu, tmpmask); + } + rcu_read_unlock(); } - rcu_read_unlock(); preempt_disable(); - smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + if (cpu_id >= 0) + smp_call_function_single(cpu_id, ipi_func, NULL, 1); + else + smp_call_function_many(tmpmask, ipi_func, NULL, 1); preempt_enable(); - free_cpumask_var(tmpmask); +out: + if (cpu_id < 0) + free_cpumask_var(tmpmask); cpus_read_unlock(); /* @@ -283,11 +328,18 @@ static int membarrier_register_private_expedited(int flags) set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED, ret; - if (flags & MEMBARRIER_FLAG_SYNC_CORE) { + if (flags == MEMBARRIER_FLAG_SYNC_CORE) { if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) return -EINVAL; ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY; + } else if (flags == MEMBARRIER_FLAG_RSEQ) { + if (!IS_ENABLED(CONFIG_RSEQ)) + return -EINVAL; + ready_state = + MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY; + } else { + WARN_ON_ONCE(flags); } /* @@ -299,6 +351,8 @@ static int membarrier_register_private_expedited(int flags) return 0; if (flags & MEMBARRIER_FLAG_SYNC_CORE) set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE; + if (flags & MEMBARRIER_FLAG_RSEQ) + set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ; atomic_or(set_state, &mm->membarrier_state); ret = sync_runqueues_membarrier_state(mm); if (ret) @@ -310,8 +364,15 @@ static int membarrier_register_private_expedited(int flags) /** * sys_membarrier - issue memory barriers on a set of threads - * @cmd: Takes command values defined in enum membarrier_cmd. - * @flags: Currently needs to be 0. For future extensions. + * @cmd: Takes command values defined in enum membarrier_cmd. + * @flags: Currently needs to be 0 for all commands other than + * MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter + * case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id + * contains the CPU on which to interrupt (= restart) + * the RSEQ critical section. + * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which + * RSEQ CS should be interrupted (@cmd must be + * MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ). * * If this system call is not implemented, -ENOSYS is returned. 
If the * command specified does not exist, not available on the running @@ -337,10 +398,21 @@ static int membarrier_register_private_expedited(int flags) * smp_mb() X O O * sys_membarrier() O O O */ -SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) +SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id) { - if (unlikely(flags)) - return -EINVAL; + switch (cmd) { + case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: + if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU)) + return -EINVAL; + break; + default: + if (unlikely(flags)) + return -EINVAL; + } + + if (!(flags & MEMBARRIER_CMD_FLAG_CPU)) + cpu_id = -1; + switch (cmd) { case MEMBARRIER_CMD_QUERY: { @@ -362,13 +434,17 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED: return membarrier_register_global_expedited(); case MEMBARRIER_CMD_PRIVATE_EXPEDITED: - return membarrier_private_expedited(0); + return membarrier_private_expedited(0, cpu_id); case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: return membarrier_register_private_expedited(0); case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE: - return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE); + return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id); case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE: return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE); + case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: + return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id); + case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ: + return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ); default: return -EINVAL; } -- cgit v1.2.3 From 9abb897345ce1d41257567f571a78137c961c405 Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Wed, 30 Sep 2020 10:35:32 -0700 Subject: sched/fair: Tweak pick_next_entity() Currently, pick_next_entity(...) has the following structure (simplified): [...] if (last_buddy_ok()) result = last_buddy; if (next_buddy_ok()) result = next_buddy; [...] The intended behavior is to prefer next buddy over last buddy; the current code somewhat obfuscates this, and also wastes cycles checking the last buddy when eventually the next buddy is picked up. So this patch refactors two 'ifs' above into [...] if (next_buddy_ok()) result = next_buddy; else if (last_buddy_ok()) result = last_buddy; [...] Signed-off-by: Peter Oskolkov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20200930173532.1069092-1-posk@google.com --- kernel/sched/fair.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fc3410b8b990..cec6cf9b2bb3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4465,17 +4465,17 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) se = second; } - /* - * Prefer last buddy, try to return the CPU to a preempted task. - */ - if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) - se = cfs_rq->last; - - /* - * Someone really wants this to run. If it's not unfair, run it. - */ - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) { + /* + * Someone really wants this to run. If it's not unfair, run it. + */ se = cfs_rq->next; + } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) { + /* + * Prefer last buddy, try to return the CPU to a preempted task. 
+ */ + se = cfs_rq->last; + } clear_buddies(cfs_rq, se); -- cgit v1.2.3 From 51cf18c90ca1b51d1cb4af3064e85fcf8610b5d2 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Fri, 28 Aug 2020 10:00:49 +0100 Subject: sched/debug: Add new tracepoint to track cpu_capacity rq->cpu_capacity is a key element in several scheduler parts, such as EAS task placement and load balancing. Tracking this value enables testing and/or debugging by a toolkit. Signed-off-by: Vincent Donnefort Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1598605249-72651-1-git-send-email-vincent.donnefort@arm.com --- kernel/sched/core.c | 1 + kernel/sched/fair.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dd32d859ab40..3dc415f58bd7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -36,6 +36,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cec6cf9b2bb3..aa4c6227cd6d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8108,6 +8108,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) capacity = 1; cpu_rq(cpu)->cpu_capacity = capacity; + trace_sched_cpu_capacity_tp(cpu_rq(cpu)); + sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = capacity; sdg->sgc->max_capacity = capacity; @@ -11321,6 +11323,18 @@ int sched_trace_rq_cpu(struct rq *rq) } EXPORT_SYMBOL_GPL(sched_trace_rq_cpu); +int sched_trace_rq_cpu_capacity(struct rq *rq) +{ + return rq ? +#ifdef CONFIG_SMP + rq->cpu_capacity +#else + SCHED_CAPACITY_SCALE +#endif + : -1; +} +EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity); + const struct cpumask *sched_trace_rd_span(struct root_domain *rd) { #ifdef CONFIG_SMP -- cgit v1.2.3 From feff2e65efd8d84cf831668e182b2ce73c604bbb Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Wed, 16 Sep 2020 09:06:39 +0200 Subject: sched/deadline: Unthrottle PI boosted threads while enqueuing stress-ng has a test (stress-ng --cyclic) that creates a set of threads under SCHED_DEADLINE with the following parameters: dl_runtime = 10000 (10 us) dl_deadline = 100000 (100 us) dl_period = 100000 (100 us) These parameters are very aggressive. When using a system without HRTICK set, these threads can easily execute longer than the dl_runtime because the throttling happens with 1/HZ resolution. During the main part of the test, the system works just fine because the workload does not try to run over the 10 us. The problem happens at the end of the test, on the exit() path. During exit(), the threads need to do some cleanups that require real-time mutex locks, mainly those related to memory management, resulting in this scenario: Note: locks are rt_mutexes... ------------------------------------------------------------------------ TASK A: TASK B: TASK C: activation activation activation lock(a): OK! lock(b): OK! 
lock(a) -> block (task A owns it) -> self notice/set throttled +--< -> arm replenished timer | switch-out | lock(b) | -> B prio> | -> boost TASK B | unlock(a) switch-out | -> handle lock a to B | -> wakeup(B) | -> B is throttled: | -> do not enqueue | switch-out | | +---------------------> replenishment timer -> TASK B is boosted: -> do not enqueue ------------------------------------------------------------------------ BOOM: TASK B is runnable but !enqueued, holding TASK C: the system crashes with hung task C. This problem is avoided by removing the throttle state from the boosted thread while boosting it (by TASK A in the example above), allowing it to be queued and run boosted. The next replenishment will take care of the runtime overrun, pushing the deadline further away. See the "while (dl_se->runtime <= 0)" on replenish_dl_entity() for more information. Reported-by: Mark Simmons Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juri Lelli Tested-by: Mark Simmons Link: https://lkml.kernel.org/r/5076e003450835ec74e6fa5917d02c4fa41687e6.1600170294.git.bristot@redhat.com --- kernel/sched/deadline.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index c19c1883d695..6d93f4518734 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1525,6 +1525,27 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) */ if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) { pi_se = &pi_task->dl; + /* + * Because of delays in the detection of the overrun of a + * thread's runtime, it might be the case that a thread + * goes to sleep in a rt mutex with negative runtime. As + * a consequence, the thread will be throttled. + * + * While waiting for the mutex, this thread can also be + * boosted via PI, resulting in a thread that is throttled + * and boosted at the same time. + * + * In this case, the boost overrides the throttle. + */ + if (p->dl.dl_throttled) { + /* + * The replenish timer needs to be canceled. No + * problem if it fires concurrently: boosted threads + * are ignored in dl_task_timer(). + */ + hrtimer_try_to_cancel(&p->dl.dl_timer); + p->dl.dl_throttled = 0; + } } else if (!dl_prio(p->normal_prio)) { /* * Special case in which we have a !SCHED_DEADLINE task that is going -- cgit v1.2.3
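For context, the aggressive SCHED_DEADLINE parameters quoted in the stress-ng scenario above can be requested from userspace roughly as in the following sketch. It is illustrative only: struct sched_attr is declared locally because glibc provides no sched_setattr() wrapper, and __NR_sched_setattr is assumed to be provided by the toolchain headers.

	#define _GNU_SOURCE
	#include <linux/types.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <stdio.h>

	#ifndef SCHED_DEADLINE
	#define SCHED_DEADLINE	6
	#endif

	/* Minimal local definition; glibc does not wrap sched_setattr(). */
	struct sched_attr {
		__u32 size;
		__u32 sched_policy;
		__u64 sched_flags;
		__s32 sched_nice;
		__u32 sched_priority;
		__u64 sched_runtime;
		__u64 sched_deadline;
		__u64 sched_period;
	};

	int main(void)
	{
		struct sched_attr attr = {
			.size		= sizeof(attr),
			.sched_policy	= SCHED_DEADLINE,
			.sched_runtime	= 10000,	/* 10 us  */
			.sched_deadline	= 100000,	/* 100 us */
			.sched_period	= 100000,	/* 100 us */
		};

		if (syscall(__NR_sched_setattr, 0, &attr, 0)) {
			perror("sched_setattr");
			return 1;
		}

		/*
		 * Work done here can easily overrun the 10 us runtime: without
		 * HRTICK, throttling is only enforced at 1/HZ resolution.
		 */
		return 0;
	}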