From cadefd3d6cc914d95163ba1eda766bfe7ce1e5b7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 27 Feb 2014 10:40:35 +0100 Subject: sched: Make scale_rt_power() deal with backward clocks Mike reported that, while unlikely, it's entirely possible for scale_rt_power() to see the time go backwards. This yields rather 'interesting' results. So, like all other sites that deal with clocks, make this one ignore backward clock movement too. Reported-by: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140227094035.GZ9987@twins.programming.kicks-ass.net Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7570dd969c28..5e157f157d85 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5564,6 +5564,7 @@ static unsigned long scale_rt_power(int cpu) { struct rq *rq = cpu_rq(cpu); u64 total, available, age_stamp, avg; + s64 delta; /* * Since we're reading these variables without serialization make sure @@ -5572,7 +5573,11 @@ static unsigned long scale_rt_power(int cpu) age_stamp = ACCESS_ONCE(rq->age_stamp); avg = ACCESS_ONCE(rq->rt_avg); - total = sched_avg_period() + (rq_clock(rq) - age_stamp); + delta = rq_clock(rq) - age_stamp; + if (unlikely(delta < 0)) + delta = 0; + + total = sched_avg_period() + delta; if (unlikely(total < avg)) { /* Ensures that power won't end up being negative */ -- cgit v1.2.3 From 46383648b3c769fa74794ae6425ab993fc113bdb Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Sat, 15 Mar 2014 02:15:07 +0400 Subject: sched: Revert commit 4c6c4e38c4e9 ("sched/core: Fix endless loop in pick_next_task()") This reverts commit 4c6c4e38c4e9 ("sched/core: Fix endless loop in pick_next_task()"), which is not necessary after ("sched/rt: Substract number of tasks of throttled queues from rq->nr_running"). 
Signed-off-by: Kirill Tkhai Reviewed-by: Preeti U Murthy [conflict resolution with stop task checking patch] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1394835307.18748.34.camel@HP-250-G1-Notebook-PC Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5e157f157d85..43232b8bacde 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6732,10 +6732,7 @@ static int idle_balance(struct rq *this_rq) out: /* Is there a task of a high priority class? */ - if (this_rq->nr_running != this_rq->cfs.h_nr_running && - ((this_rq->stop && this_rq->stop->on_rq) || - this_rq->dl.dl_nr_running || - (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt)))) + if (this_rq->nr_running != this_rq->cfs.h_nr_running) pulled_task = -1; if (pulled_task) { -- cgit v1.2.3 From 792568ec6a31ca560ca4d528782cbc6cd2cea8b0 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 11 Apr 2014 13:00:27 -0400 Subject: sched/numa: Count pages on active node as local The NUMA code is smart enough to distribute the memory of workloads that span multiple NUMA nodes across those NUMA nodes. However, it still has a pretty high scan rate for such workloads, because any memory that is left on a node other than the node of the CPU that faulted on the memory is counted as non-local, which causes the scan rate to go up. Counting the memory on any node where the task's numa group is actively running as local, allows the scan rate to slow down once the application is settled in. This should reduce the overhead of the automatic NUMA placement code, when a workload spans multiple NUMA nodes. 
Signed-off-by: Rik van Riel Tested-by: Vinod Chegu Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1397235629-16328-2-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5d859ec975c2..f6457b63c95c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1738,6 +1738,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) struct task_struct *p = current; bool migrated = flags & TNF_MIGRATED; int cpu_node = task_node(current); + int local = !!(flags & TNF_FAULT_LOCAL); int priv; if (!numabalancing_enabled) @@ -1786,6 +1787,17 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) task_numa_group(p, last_cpupid, flags, &priv); } + /* + * If a workload spans multiple NUMA nodes, a shared fault that + * occurs wholly within the set of nodes that the workload is + * actively using should be counted as local. This allows the + * scan rate to slow down when a workload has settled down. 
+ */ + if (!priv && !local && p->numa_group && + node_isset(cpu_node, p->numa_group->active_nodes) && + node_isset(mem_node, p->numa_group->active_nodes)) + local = 1; + task_numa_placement(p); /* @@ -1800,7 +1812,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; - p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; + p->numa_faults_locality[local] += pages; } static void reset_ptenuma_scan(struct task_struct *p) -- cgit v1.2.3 From 5085e2a328849bdee6650b32d52c87c3788ab01c Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 11 Apr 2014 13:00:28 -0400 Subject: sched/numa: Retry placement more frequently when misplaced When tasks have not converged on their preferred nodes yet, we want to retry fairly often, to make sure we do not migrate a task's memory to an undesirable location, only to have to move it again later. This patch reduces the interval at which migration is retried, when the task's numa_scan_period is small. Signed-off-by: Rik van Riel Tested-by: Vinod Chegu Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1397235629-16328-3-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f6457b63c95c..ecea8d9f957c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1326,12 +1326,15 @@ static int task_numa_migrate(struct task_struct *p) /* Attempt to migrate a task to a CPU on the preferred node. 
*/ static void numa_migrate_preferred(struct task_struct *p) { + unsigned long interval = HZ; + /* This task has no NUMA fault statistics yet */ if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) return; /* Periodically retry migrating the task to the preferred node */ - p->numa_migrate_retry = jiffies + HZ; + interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); + p->numa_migrate_retry = jiffies + interval; /* Success if task is already running on preferred CPU */ if (task_node(p) == p->numa_preferred_nid) -- cgit v1.2.3 From 68d1b02a58f5d9f584c1fb2923ed60ec68cbbd9b Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 11 Apr 2014 13:00:29 -0400 Subject: sched/numa: Do not set preferred_node on migration to a second choice node Setting the numa_preferred_node for a task in task_numa_migrate does nothing on a 2-node system. Either we migrate to the node that already was our preferred node, or we stay where we were. On a 4-node system, it can slightly decrease overhead, by not calling the NUMA code as much. Since every node tends to be directly connected to every other node, running on the wrong node for a while does not do much damage. However, on an 8 node system, there are far more bad nodes than there are good ones, and pretending that a second choice is actually the preferred node can greatly delay, or even prevent, a workload from converging. The only time we can safely pretend that a second choice node is the preferred node is when the task is part of a workload that spans multiple NUMA nodes. 
Signed-off-by: Rik van Riel Tested-by: Vinod Chegu Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1397235629-16328-4-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ecea8d9f957c..051903f33eec 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1301,7 +1301,16 @@ static int task_numa_migrate(struct task_struct *p) if (env.best_cpu == -1) return -EAGAIN; - sched_setnuma(p, env.dst_nid); + /* + * If the task is part of a workload that spans multiple NUMA nodes, + * and is migrating into one of the workload's active nodes, remember + * this node as the task's preferred numa node, so the workload can + * settle down. + * A task that migrated to a second choice node will be better off + * trying for a better one later. Do not set the preferred node here. + */ + if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) + sched_setnuma(p, env.dst_nid); /* * Reset the scan period if the task is being rescheduled on an -- cgit v1.2.3 From 39a4d9ca77a31503c6317e49742341d0859d5cb2 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Wed, 23 Apr 2014 18:30:35 -0700 Subject: sched/fair: Stop searching for tasks in newidle balance if there are runnable tasks It was found that when running some workloads (such as AIM7) on large systems with many cores, CPUs do not remain idle for long. Thus, tasks can wake/get enqueued while doing idle balancing. In this patch, while traversing the domains in idle balance, in addition to checking for pulled_task, we add an extra check for this_rq->nr_running for determining if we should stop searching for tasks to pull. If there are runnable tasks on this rq, then we will stop traversing the domains. This reduces the chance that idle balance delays a task from running. 
This patch resulted in approximately a 6% performance improvement when running a Java Server workload on an 8 socket machine. Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: daniel.lezcano@linaro.org Cc: alex.shi@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: efault@gmx.de Cc: vincent.guittot@linaro.org Cc: morten.rasmussen@arm.com Cc: aswin@hp.com Cc: chegu_vinod@hp.com Link: http://lkml.kernel.org/r/1398303035-18255-4-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 051903f33eec..28ccf502c63c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6713,7 +6713,6 @@ static int idle_balance(struct rq *this_rq) if (sd->flags & SD_BALANCE_NEWIDLE) { t0 = sched_clock_cpu(this_cpu); - /* If we've pulled tasks over stop searching: */ pulled_task = load_balance(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, &continue_balancing); @@ -6728,7 +6727,12 @@ static int idle_balance(struct rq *this_rq) interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) next_balance = sd->last_balance + interval; - if (pulled_task) + + /* + * Stop searching for tasks to pull if there are + * now runnable tasks on this rq. + */ + if (pulled_task || this_rq->nr_running > 0) break; } rcu_read_unlock(); -- cgit v1.2.3 From 3944a9274ef6cda0cc282daf0739832f661670f7 Mon Sep 17 00:00:00 2001 From: Ben Segall Date: Thu, 15 May 2014 15:59:20 -0700 Subject: sched: Fix exec_start/task_hot on migrated tasks task_hot checks exec_start on any runnable task, but if it has been migrated since it last ran, then exec_start is a clock_task from another cpu. If the old cpu's clock_task was sufficiently far ahead of this cpu's then the task will not be considered for another migration until it has run. 
Instead reset exec_start whenever a task is migrated, since it is presumably no longer hot anyway. Signed-off-by: Ben Segall [ Made it compile. ] Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140515225920.7179.13924.stgit@sword-of-the-dawn.mtv.corp.google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 28ccf502c63c..dd3fa14a2998 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4544,6 +4544,9 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu) atomic_long_add(se->avg.load_avg_contrib, &cfs_rq->removed_load); } + + /* We have migrated, no longer consider this task hot */ + se->exec_start = 0; } #endif /* CONFIG_SMP */ -- cgit v1.2.3 From 8bf21433f38b020c3d8a3805d1d7fb73d7b40c01 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 14 May 2014 11:40:37 -0400 Subject: sched: Call select_idle_sibling() when not affine_sd On smaller systems, the top level sched domain will be an affine domain, and select_idle_sibling is invoked for every SD_WAKE_AFFINE wakeup. This seems to be working well. On larger systems, with the node distance between far away NUMA nodes being > RECLAIM_DISTANCE, select_idle_sibling is only called if the waker and the wakee are on nodes less than RECLAIM_DISTANCE apart. This patch leaves in place the policy of not pulling the task across nodes on such systems, while fixing the issue that select_idle_sibling is not called at all in certain circumstances. The code will look for an idle CPU in the same CPU package as the CPU where the task ran previously. 
Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: morten.rasmussen@arm.com Cc: george.mccollister@gmail.com Cc: ktkhai@parallels.com Cc: Mel Gorman Cc: Mike Galbraith Link: http://lkml.kernel.org/r/20140514114037.2d93266f@annuminas.surriel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dd3fa14a2998..429164d117ea 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4473,10 +4473,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f sd = tmp; } - if (affine_sd) { - if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) - prev_cpu = cpu; + if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync)) + prev_cpu = cpu; + if (sd_flag & SD_BALANCE_WAKE) { new_cpu = select_idle_sibling(p, prev_cpu); goto unlock; } -- cgit v1.2.3 From 52a08ef1f13a11289c9e18cd4cfb4e51c024058b Mon Sep 17 00:00:00 2001 From: Jason Low Date: Thu, 8 May 2014 17:49:22 -0700 Subject: sched: Fix the rq->next_balance logic in rebalance_domains() and idle_balance() Currently, in idle_balance(), we update rq->next_balance when we pull_tasks. However, it is also important to update this in the !pulled_tasks case too. When the CPU is "busy" (the CPU isn't idle), rq->next_balance gets computed using sd->busy_factor (so we increase the balance interval when the CPU is busy). However, when the CPU goes idle, rq->next_balance could still be set to a large value that was computed with the sd->busy_factor. Thus, we need to also update rq->next_balance in idle_balance() in the cases where !pulled_tasks too, so that rq->next_balance gets updated without taking the busy_factor into account when the CPU is about to go idle. This patch makes rq->next_balance get updated independently of whether or not we pulled_task. 
Also, we add logic to ensure that we always traverse at least 1 of the sched domains to get a proper next_balance value for updating rq->next_balance. Additionally, since load_balance() modifies the sd->balance_interval, we need to re-obtain the sched domain's interval after the call to load_balance() in rebalance_domains() before we update rq->next_balance. This patch adds and uses 2 new helper functions, update_next_balance() and get_sd_balance_interval() to update next_balance and obtain the sched domain's balance_interval. Signed-off-by: Jason Low Reviewed-by: Preeti U Murthy Signed-off-by: Peter Zijlstra Cc: daniel.lezcano@linaro.org Cc: alex.shi@linaro.org Cc: efault@gmx.de Cc: vincent.guittot@linaro.org Cc: morten.rasmussen@arm.com Cc: aswin@hp.com Link: http://lkml.kernel.org/r/1399596562.2200.7.camel@j-VirtualBox Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 69 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 46 insertions(+), 23 deletions(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 429164d117ea..26ec6686a00b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6672,17 +6672,44 @@ out: return ld_moved; } +static inline unsigned long +get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) +{ + unsigned long interval = sd->balance_interval; + + if (cpu_busy) + interval *= sd->busy_factor; + + /* scale ms to jiffies */ + interval = msecs_to_jiffies(interval); + interval = clamp(interval, 1UL, max_load_balance_interval); + + return interval; +} + +static inline void +update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) +{ + unsigned long interval, next; + + interval = get_sd_balance_interval(sd, cpu_busy); + next = sd->last_balance + interval; + + if (time_after(*next_balance, next)) + *next_balance = next; +} + /* * idle_balance is called by schedule() if this_cpu is about to become * idle. 
Attempts to pull tasks from other CPUs. */ static int idle_balance(struct rq *this_rq) { + unsigned long next_balance = jiffies + HZ; + int this_cpu = this_rq->cpu; struct sched_domain *sd; int pulled_task = 0; - unsigned long next_balance = jiffies + HZ; u64 curr_cost = 0; - int this_cpu = this_rq->cpu; idle_enter_fair(this_rq); @@ -6692,8 +6719,15 @@ static int idle_balance(struct rq *this_rq) */ this_rq->idle_stamp = rq_clock(this_rq); - if (this_rq->avg_idle < sysctl_sched_migration_cost) + if (this_rq->avg_idle < sysctl_sched_migration_cost) { + rcu_read_lock(); + sd = rcu_dereference_check_sched_domain(this_rq->sd); + if (sd) + update_next_balance(sd, 0, &next_balance); + rcu_read_unlock(); + goto out; + } /* * Drop the rq->lock, but keep IRQ/preempt disabled. @@ -6703,15 +6737,16 @@ static int idle_balance(struct rq *this_rq) update_blocked_averages(this_cpu); rcu_read_lock(); for_each_domain(this_cpu, sd) { - unsigned long interval; int continue_balancing = 1; u64 t0, domain_cost; if (!(sd->flags & SD_LOAD_BALANCE)) continue; - if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) + if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { + update_next_balance(sd, 0, &next_balance); break; + } if (sd->flags & SD_BALANCE_NEWIDLE) { t0 = sched_clock_cpu(this_cpu); @@ -6727,9 +6762,7 @@ static int idle_balance(struct rq *this_rq) curr_cost += domain_cost; } - interval = msecs_to_jiffies(sd->balance_interval); - if (time_after(next_balance, sd->last_balance + interval)) - next_balance = sd->last_balance + interval; + update_next_balance(sd, 0, &next_balance); /* * Stop searching for tasks to pull if there are @@ -6753,15 +6786,11 @@ static int idle_balance(struct rq *this_rq) if (this_rq->cfs.h_nr_running && !pulled_task) pulled_task = 1; - if (pulled_task || time_after(jiffies, this_rq->next_balance)) { - /* - * We are going idle. next_balance may be set based on - * a busy processor. So reset next_balance. 
- */ +out: + /* Move the next balance forward */ + if (time_after(this_rq->next_balance, next_balance)) this_rq->next_balance = next_balance; - } -out: /* Is there a task of a high priority class? */ if (this_rq->nr_running != this_rq->cfs.h_nr_running) pulled_task = -1; @@ -7044,16 +7073,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) break; } - interval = sd->balance_interval; - if (idle != CPU_IDLE) - interval *= sd->busy_factor; - - /* scale ms to jiffies */ - interval = msecs_to_jiffies(interval); - interval = clamp(interval, 1UL, max_load_balance_interval); + interval = get_sd_balance_interval(sd, idle != CPU_IDLE); need_serialize = sd->flags & SD_SERIALIZE; - if (need_serialize) { if (!spin_trylock(&balancing)) goto out; @@ -7069,6 +7091,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; } sd->last_balance = jiffies; + interval = get_sd_balance_interval(sd, idle != CPU_IDLE); } if (need_serialize) spin_unlock(&balancing); -- cgit v1.2.3 From 72465447867b9de6b5cdea5d10f9781585136270 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 9 May 2014 03:00:14 +0400 Subject: sched, nohz: Change rq->nr_running to always use wrappers Sometimes ->nr_running may cross 2 without an interrupt being sent to the rq's cpu. In this case we don't re-enable the timer. It looks like this may be the reason for rare unexpected effects when nohz is enabled. This patch replaces all direct modifications of nr_running with the wrappers and makes add_nr_running() take care of crossing that border. 
Signed-off-by: Kirill Tkhai Acked-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140508225830.2469.97461.stgit@localhost Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 26ec6686a00b..f7cac2ba62ea 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3325,7 +3325,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) } if (!se) - rq->nr_running -= task_delta; + sub_nr_running(rq, task_delta); cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); @@ -3376,7 +3376,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) } if (!se) - rq->nr_running += task_delta; + add_nr_running(rq, task_delta); /* determine whether we need to wake up potentially idle cpu */ if (rq->curr == rq->idle && rq->cfs.nr_running) @@ -3908,7 +3908,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) { update_rq_runnable_avg(rq, rq->nr_running); - inc_nr_running(rq); + add_nr_running(rq, 1); } hrtick_update(rq); } @@ -3968,7 +3968,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } if (!se) { - dec_nr_running(rq); + sub_nr_running(rq, 1); update_rq_runnable_avg(rq, 1); } hrtick_update(rq); -- cgit v1.2.3 From e63da03639cc9e6e83b62e7ef8ffdbb92421416a Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 14 May 2014 13:22:21 -0400 Subject: sched/numa: Allow task switch if load imbalance improves Currently the NUMA balancing code only allows moving tasks between NUMA nodes when the load on both nodes is in balance. This breaks down when the load was imbalanced to begin with. Allow tasks to be moved between NUMA nodes if the imbalance is small, or if the new imbalance is smaller than the original one. 
Suggested-by: Peter Zijlstra Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: mgorman@suse.de Cc: chegu_vinod@hp.com Signed-off-by: Ingo Molnar Link: http://lkml.kernel.org/r/20140514132221.274b3463@annuminas.surriel.com --- kernel/sched/fair.c | 46 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 10 deletions(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f7cac2ba62ea..b899613f2bc6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1095,6 +1095,34 @@ static void task_numa_assign(struct task_numa_env *env, env->best_cpu = env->dst_cpu; } +static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, + long src_load, long dst_load, + struct task_numa_env *env) +{ + long imb, old_imb; + + /* We care about the slope of the imbalance, not the direction. */ + if (dst_load < src_load) + swap(dst_load, src_load); + + /* Is the difference below the threshold? */ + imb = dst_load * 100 - src_load * env->imbalance_pct; + if (imb <= 0) + return false; + + /* + * The imbalance is above the allowed threshold. + * Compare it with the old imbalance. + */ + if (orig_dst_load < orig_src_load) + swap(orig_dst_load, orig_src_load); + + old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; + + /* Would this change make things worse? */ + return (old_imb > imb); +} + /* * This checks if the overall compute and NUMA accesses of the system would * be improved if the source tasks was migrated to the target dst_cpu taking @@ -1107,7 +1135,8 @@ static void task_numa_compare(struct task_numa_env *env, struct rq *src_rq = cpu_rq(env->src_cpu); struct rq *dst_rq = cpu_rq(env->dst_cpu); struct task_struct *cur; - long dst_load, src_load; + long orig_src_load, src_load; + long orig_dst_load, dst_load; long load; long imp = (groupimp > 0) ? 
groupimp : taskimp; @@ -1181,13 +1210,13 @@ static void task_numa_compare(struct task_numa_env *env, * In the overloaded case, try and keep the load balanced. */ balance: - dst_load = env->dst_stats.load; - src_load = env->src_stats.load; + orig_dst_load = env->dst_stats.load; + orig_src_load = env->src_stats.load; /* XXX missing power terms */ load = task_h_load(env->p); - dst_load += load; - src_load -= load; + dst_load = orig_dst_load + load; + src_load = orig_src_load - load; if (cur) { load = task_h_load(cur); @@ -1195,11 +1224,8 @@ balance: src_load += load; } - /* make src_load the smaller */ - if (dst_load < src_load) - swap(dst_load, src_load); - - if (src_load * env->imbalance_pct < dst_load * 100) + if (load_too_imbalanced(orig_src_load, orig_dst_load, + src_load, dst_load, env)) goto unlock; assign: -- cgit v1.2.3 From b1ad065e65f56103db8b97edbd218a271ff5b1bb Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 15 May 2014 13:03:06 -0400 Subject: sched/numa: Update migrate_improves/degrades_locality() Update the migrate_improves/degrades_locality() functions with knowledge of pseudo-interleaving. Do not consider moving tasks around within the set of group's active nodes as improving or degrading locality. Instead, leave the load balancer free to balance the load between a numa_group's active nodes. Also, switch from the group/task_weight functions to the group/task_fault functions. The "weight" functions involve a division, but both calls use the same divisor, so there's no point in doing that from these functions. On a 4 node (x10 core) system, performance of SPECjbb2005 seems unaffected, though the number of migrations with 2 8-warehouse wide instances seems to have almost halved, due to the scheduler running each instance on a single node. 
Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: mgorman@suse.de Cc: chegu_vinod@hp.com Link: http://lkml.kernel.org/r/20140515130306.61aae7db@cuia.bos.redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b899613f2bc6..503f750c2d25 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5123,6 +5123,7 @@ task_hot(struct task_struct *p, u64 now) /* Returns true if the destination node has incurred more faults */ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) { + struct numa_group *numa_group = rcu_dereference(p->numa_group); int src_nid, dst_nid; if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || @@ -5136,21 +5137,29 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) if (src_nid == dst_nid) return false; - /* Always encourage migration to the preferred node. */ - if (dst_nid == p->numa_preferred_nid) - return true; + if (numa_group) { + /* Task is already in the group's interleave set. */ + if (node_isset(src_nid, numa_group->active_nodes)) + return false; + + /* Task is moving into the group's interleave set. */ + if (node_isset(dst_nid, numa_group->active_nodes)) + return true; - /* If both task and group weight improve, this move is a winner. */ - if (task_weight(p, dst_nid) > task_weight(p, src_nid) && - group_weight(p, dst_nid) > group_weight(p, src_nid)) + return group_faults(p, dst_nid) > group_faults(p, src_nid); + } + + /* Encourage migration to the preferred node. 
 */ + if (dst_nid == p->numa_preferred_nid) return true; - return false; + return task_faults(p, dst_nid) > task_faults(p, src_nid); } static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { + struct numa_group *numa_group = rcu_dereference(p->numa_group); int src_nid, dst_nid; if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) @@ -5165,16 +5174,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) if (src_nid == dst_nid) return false; + if (numa_group) { + /* Task is moving within/into the group's interleave set. */ + if (node_isset(dst_nid, numa_group->active_nodes)) + return false; + + /* Task is moving out of the group's interleave set. */ + if (node_isset(src_nid, numa_group->active_nodes)) + return true; + + return group_faults(p, dst_nid) < group_faults(p, src_nid); + } + /* Migrating away from the preferred node is always bad. */ if (src_nid == p->numa_preferred_nid) return true; - /* If either task or group weight get worse, don't do it. */ - if (task_weight(p, dst_nid) < task_weight(p, src_nid) || - group_weight(p, dst_nid) < group_weight(p, src_nid)) - return true; - - return false; + return task_faults(p, dst_nid) < task_faults(p, src_nid); } #else -- cgit v1.2.3 From 096aa33863a5e48de52d2ff30e0801b7487944f4 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 16 May 2014 00:13:32 -0400 Subject: sched/numa: Decay ->wakee_flips instead of zeroing Affine wakeups have the potential to interfere with NUMA placement. If a task wakes up too many other tasks, affine wakeups will get disabled. However, regardless of how many other tasks it wakes up, it gets re-enabled once a second, potentially interfering with NUMA placement of other tasks. By decaying wakee_flips by half instead of zeroing it, we can avoid that problem for some workloads. 
Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: chegu_vinod@hp.com Cc: umgwanakikbuti@gmail.com Link: http://lkml.kernel.org/r/20140516001332.67f91af2@annuminas.surriel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 503f750c2d25..c9617b73bcc0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4065,7 +4065,7 @@ static void record_wakee(struct task_struct *p) * about the loss. */ if (jiffies > current->wakee_flip_decay_ts + HZ) { - current->wakee_flips = 0; + current->wakee_flips >>= 1; current->wakee_flip_decay_ts = jiffies; } -- cgit v1.2.3