From e0f364f4069f76a3613a797c388832822d179076 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 25 Jun 2005 14:57:06 -0700 Subject: [PATCH] sched: cleanup wake_idle New sched-domains code means we don't get spans with offline CPUs in them. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 76080d142e3d..86be13ee5006 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -927,14 +927,14 @@ static int wake_idle(int cpu, task_t *p) for_each_domain(cpu, sd) { if (sd->flags & SD_WAKE_IDLE) { - cpus_and(tmp, sd->span, cpu_online_map); - cpus_and(tmp, tmp, p->cpus_allowed); + cpus_and(tmp, sd->span, p->cpus_allowed); for_each_cpu_mask(i, tmp) { if (idle_cpu(i)) return i; } } - else break; + else + break; } return cpu; } -- cgit v1.2.3 From 8102679447da7fcbcb5226ee0207c3a034bc6d5f Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 25 Jun 2005 14:57:07 -0700 Subject: [PATCH] sched: improve load balancing pinned tasks John Hawkes explained the problem best: A large number of processes that are pinned to a single CPU results in every other CPU's load_balance() seeing this overloaded CPU as "busiest", yet move_tasks() never finds a task to pull-migrate. This condition occurs during module unload, but can also occur as a denial-of-service using sys_sched_setaffinity(). Several hundred CPUs performing this fruitless load_balance() will livelock on the busiest CPU's runqueue lock. A smaller number of CPUs will livelock if the pinned task count gets high. Expanding slightly on John's patch, this one attempts to work out whether the balancing failure has been due to too many tasks pinned on the runqueue. This allows it to be basically invisible to the regular balancing paths (ie. when there are no pinned tasks). We can use this extra knowledge to shut down the balancing faster, and ensure the migration threads don't start running, which is another problem observed in the wild. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 62 ++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 23 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 86be13ee5006..2794c79b9197 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1632,7 +1632,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, */ static inline int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, - struct sched_domain *sd, enum idle_type idle) + struct sched_domain *sd, enum idle_type idle, int *all_pinned) { /* * We do not migrate tasks that are: @@ -1640,10 +1640,12 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU.
*/ - if (task_running(rq, p)) - return 0; if (!cpu_isset(this_cpu, p->cpus_allowed)) return 0; + *all_pinned = 0; + + if (task_running(rq, p)) + return 0; /* * Aggressive migration if: @@ -1656,7 +1658,7 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, return 1; if (task_hot(p, rq->timestamp_last_tick, sd)) - return 0; + return 0; return 1; } @@ -1669,16 +1671,18 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, */ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, unsigned long max_nr_move, struct sched_domain *sd, - enum idle_type idle) + enum idle_type idle, int *all_pinned) { prio_array_t *array, *dst_array; struct list_head *head, *curr; - int idx, pulled = 0; + int idx, pulled = 0, pinned = 0; task_t *tmp; - if (max_nr_move <= 0 || busiest->nr_running <= 1) + if (max_nr_move == 0) goto out; + pinned = 1; + /* * We first consider expired tasks. Those will likely not be * executed in the near future, and they are most likely to @@ -1717,7 +1721,7 @@ skip_queue: curr = curr->prev; - if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { + if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { if (curr != head) goto skip_queue; idx++; @@ -1746,6 +1750,9 @@ out: * inside pull_task(). */ schedstat_add(sd, lb_gained[idle], pulled); + + if (all_pinned) + *all_pinned = pinned; return pulled; } @@ -1917,7 +1924,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, struct sched_group *group; runqueue_t *busiest; unsigned long imbalance; - int nr_moved; + int nr_moved, all_pinned; + int active_balance = 0; spin_lock(&this_rq->lock); schedstat_inc(sd, lb_cnt[idle]); @@ -1956,9 +1964,15 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, */ double_lock_balance(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, idle); + imbalance, sd, idle, + &all_pinned); spin_unlock(&busiest->lock); + + /* All tasks on this runqueue were pinned by CPU affinity */ + if (unlikely(all_pinned)) + goto out_balanced; } + spin_unlock(&this_rq->lock); if (!nr_moved) { @@ -1966,16 +1980,15 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, sd->nr_balance_failed++; if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { - int wake = 0; spin_lock(&busiest->lock); if (!busiest->active_balance) { busiest->active_balance = 1; busiest->push_cpu = this_cpu; - wake = 1; + active_balance = 1; } spin_unlock(&busiest->lock); - if (wake) + if (active_balance) wake_up_process(busiest->migration_thread); /* @@ -1984,18 +1997,21 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, */ sd->nr_balance_failed = sd->cache_nice_tries; } - - /* - * We were unbalanced, but unsuccessful in move_tasks(), - * so bump the balance_interval to lessen the lock contention. - */ - if (sd->balance_interval < sd->max_interval) - sd->balance_interval++; - } else { + } else sd->nr_balance_failed = 0; + if (likely(!active_balance)) { /* We were unbalanced, so reset the balancing interval */ sd->balance_interval = sd->min_interval; + } else { + /* + * If we've begun active balancing, start to back off. This + * case may not be covered by the all_pinned logic if there + * is only 1 task on the busy runqueue (because we don't call + * move_tasks). 
+ */ + if (sd->balance_interval < sd->max_interval) + sd->balance_interval *= 2; } return nr_moved; @@ -2047,7 +2063,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); nr_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, NEWLY_IDLE); + imbalance, sd, NEWLY_IDLE, NULL); if (!nr_moved) schedstat_inc(sd, lb_failed[NEWLY_IDLE]); @@ -2126,7 +2142,7 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) /* move a task from busiest_rq to target_rq */ double_lock_balance(busiest_rq, target_rq); if (move_tasks(target_rq, cpu, busiest_rq, - 1, sd, SCHED_IDLE)) { + 1, sd, SCHED_IDLE, NULL)) { schedstat_inc(sd, alb_pushed); } else { schedstat_inc(sd, alb_failed); -- cgit v1.2.3 From 16cfb1c04c3cbe3759f339d3333e7e1e7d59712a Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 25 Jun 2005 14:57:08 -0700 Subject: [PATCH] sched: reduce active load balancing Fix up active load balancing a bit so it doesn't get called when it shouldn't. Reset the nr_balance_failed counter at more points where we have found conditions to be balanced. This reduces the overly aggressive active balancing seen on some workloads. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 2794c79b9197..03d737791c1a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2021,6 +2021,7 @@ out_balanced: schedstat_inc(sd, lb_balanced[idle]); + sd->nr_balance_failed = 0; /* tune up the balancing interval */ if (sd->balance_interval < sd->max_interval) sd->balance_interval *= 2; @@ -2046,16 +2047,14 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); if (!group) { - schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); - goto out; + goto out_balanced; } busiest = find_busiest_queue(group); if (!busiest || busiest == this_rq) { - schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); - goto out; + goto out_balanced; } /* Attempt to move tasks */ @@ -2066,11 +2065,16 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, imbalance, sd, NEWLY_IDLE, NULL); if (!nr_moved) schedstat_inc(sd, lb_failed[NEWLY_IDLE]); + else + sd->nr_balance_failed = 0; spin_unlock(&busiest->lock); - -out: return nr_moved; + +out_balanced: + schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); + sd->nr_balance_failed = 0; + return 0; } /* -- cgit v1.2.3 From 3950745131e23472fb5ace2ee4a2093e7590ec69 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 25 Jun 2005 14:57:09 -0700 Subject: [PATCH] sched: fix SMT scheduling problems SMT balancing has a couple of problems. First, active_load_balance is too complex - basically it should be a dumb helper for when the periodic balancer has determined there is an imbalance, but gets stuck because the task is running. So rip out all its "smarts", and just make it move one task to the target CPU. Second, the busy CPU's sched-domain tree was being used for active balancing. This means that it may not see that nr_balance_failed has reached a critical level. So use the target CPU's sched-domain tree for this. We can do this because we hold its runqueue lock. Lastly, reset nr_balance_failed to a point where we allow cache hot migration.
This will help ensure active load balancing is successful. Thanks to Suresh Siddha for pointing out these issues. Signed-off-by: Nick Piggin Signed-off-by: Suresh Siddha Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 76 ++++++++++++++++++++++++---------------------------------- 1 file changed, 31 insertions(+), 45 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 03d737791c1a..41e69b5ee652 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1995,7 +1995,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, * We've kicked active balancing, reset the failure * counter. */ - sd->nr_balance_failed = sd->cache_nice_tries; + sd->nr_balance_failed = sd->cache_nice_tries+1; } } else sd->nr_balance_failed = 0; @@ -2106,56 +2106,42 @@ static inline void idle_balance(int this_cpu, runqueue_t *this_rq) static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) { struct sched_domain *sd; - struct sched_group *cpu_group; runqueue_t *target_rq; - cpumask_t visited_cpus; - int cpu; + int target_cpu = busiest_rq->push_cpu; + + if (busiest_rq->nr_running <= 1) + /* no task to move */ + return; + + target_rq = cpu_rq(target_cpu); /* - * Search for suitable CPUs to push tasks to in successively higher - * domains with SD_LOAD_BALANCE set. + * This condition is "impossible", if it occurs + * we need to fix it. Originally reported by + * Bjorn Helgaas on a 128-cpu setup. */ - visited_cpus = CPU_MASK_NONE; - for_each_domain(busiest_cpu, sd) { - if (!(sd->flags & SD_LOAD_BALANCE)) - /* no more domains to search */ - break; + BUG_ON(busiest_rq == target_rq); - schedstat_inc(sd, alb_cnt); + /* move a task from busiest_rq to target_rq */ + double_lock_balance(busiest_rq, target_rq); - cpu_group = sd->groups; - do { - for_each_cpu_mask(cpu, cpu_group->cpumask) { - if (busiest_rq->nr_running <= 1) - /* no more tasks left to move */ - return; - if (cpu_isset(cpu, visited_cpus)) - continue; - cpu_set(cpu, visited_cpus); - if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu) - continue; - - target_rq = cpu_rq(cpu); - /* - * This condition is "impossible", if it occurs - * we need to fix it. Originally reported by - * Bjorn Helgaas on a 128-cpu setup. - */ - BUG_ON(busiest_rq == target_rq); - - /* move a task from busiest_rq to target_rq */ - double_lock_balance(busiest_rq, target_rq); - if (move_tasks(target_rq, cpu, busiest_rq, - 1, sd, SCHED_IDLE, NULL)) { - schedstat_inc(sd, alb_pushed); - } else { - schedstat_inc(sd, alb_failed); - } - spin_unlock(&target_rq->lock); - } - cpu_group = cpu_group->next; - } while (cpu_group != sd->groups); - } + /* Search for an sd spanning us and the target CPU. */ + for_each_domain(target_cpu, sd) + if ((sd->flags & SD_LOAD_BALANCE) && + cpu_isset(busiest_cpu, sd->span)) + break; + + if (unlikely(sd == NULL)) + goto out; + + schedstat_inc(sd, alb_cnt); + + if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) + schedstat_inc(sd, alb_pushed); + else + schedstat_inc(sd, alb_failed); +out: + spin_unlock(&target_rq->lock); } /* -- cgit v1.2.3 From db935dbd43c4290d710304662cc908f733afea06 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 25 Jun 2005 14:57:11 -0700 Subject: [PATCH] sched: add debugging These conditions should now be impossible, and we need to fix them if they happen. 
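The "impossible" condition asserted above (BUG_ON(busiest_rq == target_rq)) can only be reached if the domain lookup fails, and that lookup follows a common sched-domains idiom. A minimal sketch of it for illustration (the helper name find_spanning_domain is hypothetical; the body is condensed from the active_load_balance() rewrite above):

    /*
     * Walk target_cpu's domain hierarchy from the bottom up and return
     * the first domain that participates in load balancing and also
     * spans busiest_cpu. Returns NULL only if the domain setup is broken.
     */
    static struct sched_domain *find_spanning_domain(int target_cpu,
                                                     int busiest_cpu)
    {
            struct sched_domain *sd;

            for_each_domain(target_cpu, sd)
                    if ((sd->flags & SD_LOAD_BALANCE) &&
                        cpu_isset(busiest_cpu, sd->span))
                            return sd;
            return NULL;
    }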
Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 41e69b5ee652..8b035a8b3c30 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1942,15 +1942,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, goto out_balanced; } - /* - * This should be "impossible", but since load - * balancing is inherently racy and statistical, - * it could happen in theory. - */ - if (unlikely(busiest == this_rq)) { - WARN_ON(1); - goto out_balanced; - } + BUG_ON(busiest == this_rq); schedstat_add(sd, lb_imbalance[idle], imbalance); @@ -2052,11 +2044,13 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, } busiest = find_busiest_queue(group); - if (!busiest || busiest == this_rq) { + if (!busiest) { schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); goto out_balanced; } + BUG_ON(busiest == this_rq); + /* Attempt to move tasks */ double_lock_balance(this_rq, busiest); -- cgit v1.2.3 From 99b61ccf0bf0e9a85823d39a5db6a1519caeb13d Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 25 Jun 2005 14:57:12 -0700 Subject: [PATCH] sched: less aggressive idle balancing Remove the special casing for idle CPU balancing. Things like this hurt on SMT, for example, where a single sibling being idle doesn't really warrant an aggressive pull over the NUMA domain. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 8b035a8b3c30..f665de34ed82 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1877,15 +1877,9 @@ nextgroup: /* Get rid of the scaling factor, rounding down as we divide */ *imbalance = *imbalance / SCHED_LOAD_SCALE; - return busiest; out_balanced: - if (busiest && (idle == NEWLY_IDLE || - (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) { - *imbalance = 1; - return busiest; - } *imbalance = 0; return NULL; -- cgit v1.2.3 From 7897986bad8f6cd50d6149345aca7f6480f49464 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 25 Jun 2005 14:57:13 -0700 Subject: [PATCH] sched: balance timers Do CPU load averaging over a number of different intervals. Allow each interval to be chosen by sending a parameter to source_load and target_load. 0 is instantaneous, idx > 0 returns a decaying average with the most recent sample weighted at 2^(idx-1), up to a maximum of 3 (which could easily be increased). So generally a higher number will result in more conservative balancing. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 138 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 74 insertions(+), 64 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index f665de34ed82..b597b07e7911 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -206,7 +206,7 @@ struct runqueue { */ unsigned long nr_running; #ifdef CONFIG_SMP - unsigned long cpu_load; + unsigned long cpu_load[3]; #endif unsigned long long nr_switches; @@ -886,23 +886,27 @@ void kick_process(task_t *p) * We want to under-estimate the load of migration sources, to * balance conservatively.
*/ -static inline unsigned long source_load(int cpu) +static inline unsigned long source_load(int cpu, int type) { runqueue_t *rq = cpu_rq(cpu); unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + if (type == 0) + return load_now; - return min(rq->cpu_load, load_now); + return min(rq->cpu_load[type-1], load_now); } /* * Return a high guess at the load of a migration-target cpu */ -static inline unsigned long target_load(int cpu) +static inline unsigned long target_load(int cpu, int type) { runqueue_t *rq = cpu_rq(cpu); unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + if (type == 0) + return load_now; - return max(rq->cpu_load, load_now); + return max(rq->cpu_load[type-1], load_now); } #endif @@ -967,7 +971,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync) runqueue_t *rq; #ifdef CONFIG_SMP unsigned long load, this_load; - struct sched_domain *sd; + struct sched_domain *sd, *this_sd = NULL; int new_cpu; #endif @@ -986,72 +990,64 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync) if (unlikely(task_running(rq, p))) goto out_activate; -#ifdef CONFIG_SCHEDSTATS + new_cpu = cpu; + schedstat_inc(rq, ttwu_cnt); if (cpu == this_cpu) { schedstat_inc(rq, ttwu_local); - } else { - for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { - schedstat_inc(sd, ttwu_wake_remote); - break; - } + goto out_set_cpu; + } + + for_each_domain(this_cpu, sd) { + if (cpu_isset(cpu, sd->span)) { + schedstat_inc(sd, ttwu_wake_remote); + this_sd = sd; + break; } } -#endif - new_cpu = cpu; - if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) + if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) goto out_set_cpu; - load = source_load(cpu); - this_load = target_load(this_cpu); - /* - * If sync wakeup then subtract the (maximum possible) effect of - * the currently running task from the load of the current CPU: + * Check for affine wakeup and passive balancing possibilities. */ - if (sync) - this_load -= SCHED_LOAD_SCALE; - - /* Don't pull the task off an idle CPU to a busy one */ - if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2) - goto out_set_cpu; + if (this_sd) { + int idx = this_sd->wake_idx; + unsigned int imbalance; - new_cpu = this_cpu; /* Wake to this CPU if we can */ + load = source_load(cpu, idx); + this_load = target_load(this_cpu, idx); - /* - * Scan domains for affine wakeup and passive balancing - * possibilities. - */ - for_each_domain(this_cpu, sd) { - unsigned int imbalance; /* - * Start passive balancing when half the imbalance_pct - * limit is reached. + * If sync wakeup then subtract the (maximum possible) effect of + * the currently running task from the load of the current CPU: */ - imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2; + if (sync) + this_load -= SCHED_LOAD_SCALE; + + /* Don't pull the task off an idle CPU to a busy one */ + if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2) + goto out_set_cpu; - if ((sd->flags & SD_WAKE_AFFINE) && - !task_hot(p, rq->timestamp_last_tick, sd)) { + new_cpu = this_cpu; /* Wake to this CPU if we can */ + + if ((this_sd->flags & SD_WAKE_AFFINE) && + !task_hot(p, rq->timestamp_last_tick, this_sd)) { /* * This domain has SD_WAKE_AFFINE and p is cache cold * in this domain. 
*/ - if (cpu_isset(cpu, sd->span)) { - schedstat_inc(sd, ttwu_move_affine); - goto out_set_cpu; - } - } else if ((sd->flags & SD_WAKE_BALANCE) && + schedstat_inc(this_sd, ttwu_move_affine); + goto out_set_cpu; + } else if ((this_sd->flags & SD_WAKE_BALANCE) && imbalance*this_load <= 100*load) { /* * This domain has SD_WAKE_BALANCE and there is * an imbalance. */ - if (cpu_isset(cpu, sd->span)) { - schedstat_inc(sd, ttwu_move_balance); - goto out_set_cpu; - } + schedstat_inc(this_sd, ttwu_move_balance); + goto out_set_cpu; } } @@ -1509,7 +1505,7 @@ static int find_idlest_cpu(struct task_struct *p, int this_cpu, cpus_and(mask, sd->span, p->cpus_allowed); for_each_cpu_mask(i, mask) { - load = target_load(i); + load = target_load(i, sd->wake_idx); if (load < min_load) { min_cpu = i; @@ -1522,7 +1518,7 @@ static int find_idlest_cpu(struct task_struct *p, int this_cpu, } /* add +1 to account for the new task */ - this_load = source_load(this_cpu) + SCHED_LOAD_SCALE; + this_load = source_load(this_cpu, sd->wake_idx) + SCHED_LOAD_SCALE; /* * Would with the addition of the new task to the @@ -1767,8 +1763,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; + int load_idx; max_load = this_load = total_load = total_pwr = 0; + if (idle == NOT_IDLE) + load_idx = sd->busy_idx; + else if (idle == NEWLY_IDLE) + load_idx = sd->newidle_idx; + else + load_idx = sd->idle_idx; do { unsigned long load; @@ -1783,9 +1786,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, for_each_cpu_mask(i, group->cpumask) { /* Bias balancing toward cpus of our domain */ if (local_group) - load = target_load(i); + load = target_load(i, load_idx); else - load = source_load(i); + load = source_load(i, load_idx); avg_load += load; } @@ -1895,7 +1898,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group) int i; for_each_cpu_mask(i, group->cpumask) { - load = source_load(i); + load = source_load(i, 0); if (load > max_load) { max_load = load; @@ -2150,18 +2153,23 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, unsigned long old_load, this_load; unsigned long j = jiffies + CPU_OFFSET(this_cpu); struct sched_domain *sd; + int i; - /* Update our load */ - old_load = this_rq->cpu_load; this_load = this_rq->nr_running * SCHED_LOAD_SCALE; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. - */ - if (this_load > old_load) - old_load++; - this_rq->cpu_load = (old_load + this_load) / 2; + /* Update our load */ + for (i = 0; i < 3; i++) { + unsigned long new_load = this_load; + int scale = 1 << i; + old_load = this_rq->cpu_load[i]; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. 
+ */ + if (new_load > old_load) + new_load += scale-1; + this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; } for_each_domain(this_cpu, sd) { unsigned long interval; @@ -4921,13 +4929,15 @@ void __init sched_init(void) rq = cpu_rq(i); spin_lock_init(&rq->lock); + rq->nr_running = 0; rq->active = rq->arrays; rq->expired = rq->arrays + 1; rq->best_expired_prio = MAX_PRIO; #ifdef CONFIG_SMP rq->sd = &sched_domain_dummy; - rq->cpu_load = 0; + for (j = 1; j < 3; j++) + rq->cpu_load[j] = 0; rq->active_balance = 0; rq->push_cpu = 0; rq->migration_thread = NULL; -- cgit v1.2.3 From a3f21bce1fefdf92a4d1705e888d390b10f3ac6f Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 25 Jun 2005 14:57:15 -0700 Subject: [PATCH] sched: tweak affine wakeups Do less affine wakeups. We're trying to reduce dbt2-pgsql idle time regressions here... make sure we don't move tasks the wrong way in an imbalance condition. Also, remove the cache coldness requirement from the calculation - this seems to induce sharp cutoff points where behaviour will suddenly change on some workloads if the load creeps slightly over or under some point. It is good for periodic balancing because in that case we otherwise have no other context to determine what task to move. But also make a minor tweak to "wake balancing" - the imbalance tolerance is now set at half the domain's imbalance, so we get the opportunity to do wake balancing before the more random periodic rebalancing gets performed. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 57 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 25 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index b597b07e7911..5ae3568eed0b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1016,38 +1016,45 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync) int idx = this_sd->wake_idx; unsigned int imbalance; + imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; + load = source_load(cpu, idx); this_load = target_load(this_cpu, idx); - /* - * If sync wakeup then subtract the (maximum possible) effect of - * the currently running task from the load of the current CPU: - */ - if (sync) - this_load -= SCHED_LOAD_SCALE; - - /* Don't pull the task off an idle CPU to a busy one */ - if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2) - goto out_set_cpu; - new_cpu = this_cpu; /* Wake to this CPU if we can */ - if ((this_sd->flags & SD_WAKE_AFFINE) && - !task_hot(p, rq->timestamp_last_tick, this_sd)) { - /* - * This domain has SD_WAKE_AFFINE and p is cache cold - * in this domain. - */ - schedstat_inc(this_sd, ttwu_move_affine); - goto out_set_cpu; - } else if ((this_sd->flags & SD_WAKE_BALANCE) && - imbalance*this_load <= 100*load) { + if (this_sd->flags & SD_WAKE_AFFINE) { + unsigned long tl = this_load; /* - * This domain has SD_WAKE_BALANCE and there is - * an imbalance. + * If sync wakeup then subtract the (maximum possible) + * effect of the currently running task from the load + * of the current CPU: */ - schedstat_inc(this_sd, ttwu_move_balance); - goto out_set_cpu; + if (sync) + tl -= SCHED_LOAD_SCALE; + + if ((tl <= load && + tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || + 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { + /* + * This domain has SD_WAKE_AFFINE and + * p is cache cold in this domain, and + * there is no bad imbalance.
+ */ + schedstat_inc(this_sd, ttwu_move_affine); + goto out_set_cpu; + } + } + + /* + * Start passive balancing when half the imbalance_pct + * limit is reached. + */ + if (this_sd->flags & SD_WAKE_BALANCE) { + if (imbalance*this_load <= 100*load) { + schedstat_inc(this_sd, ttwu_move_balance); + goto out_set_cpu; + } } } -- cgit v1.2.3 From cafb20c1f9976a70d633bb1e1c8c24eab00e4e80 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 25 Jun 2005 14:57:17 -0700 Subject: [PATCH] sched: no aggressive idle balancing Remove the very aggressive idle stuff that has recently gone into 2.6 - it is going against the direction we are trying to go. Hopefully we can regain performance through other methods. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 5ae3568eed0b..396724a2519f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -414,22 +414,6 @@ static inline runqueue_t *this_rq_lock(void) return rq; } -#ifdef CONFIG_SCHED_SMT -static int cpu_and_siblings_are_idle(int cpu) -{ - int sib; - for_each_cpu_mask(sib, cpu_sibling_map[cpu]) { - if (idle_cpu(sib)) - continue; - return 0; - } - - return 1; -} -#else -#define cpu_and_siblings_are_idle(A) idle_cpu(A) -#endif - #ifdef CONFIG_SCHEDSTATS /* * Called when a process is dequeued from the active array and given @@ -1652,12 +1636,11 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, /* * Aggressive migration if: - * 1) the [whole] cpu is idle, or + * 1) task is cache cold, or * 2) too many balance attempts have failed. */ - if (cpu_and_siblings_are_idle(this_cpu) || \ - sd->nr_balance_failed > sd->cache_nice_tries) + if (sd->nr_balance_failed > sd->cache_nice_tries) return 1; if (task_hot(p, rq->timestamp_last_tick, sd)) -- cgit v1.2.3 From 147cbb4bbe991452698f0772d8292f22825710ba Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 25 Jun 2005 14:57:19 -0700 Subject: [PATCH] sched: balance on fork Reimplement balance-on-exec to be sched-domains aware, and use this to also do balance-on-fork balancing. Make x86_64 do balance-on-fork over the NUMA domain. The problem with the non-sched-domains-aware balancing became apparent on dual-core, multi-socket Opterons. What we want is for the new tasks to be sent to a different socket, but more often than not, we would first load up our sibling core, or fill two cores of a single remote socket before selecting a new one. This gives large improvements to STREAM on such systems. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 164 ++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 109 insertions(+), 55 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 396724a2519f..7ecc237e2aab 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -893,6 +893,79 @@ static inline unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], load_now); } +/* + * find_idlest_group finds and returns the least busy CPU group within the + * domain.
+ */ +static struct sched_group * +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) +{ + struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; + unsigned long min_load = ULONG_MAX, this_load = 0; + int load_idx = sd->forkexec_idx; + int imbalance = 100 + (sd->imbalance_pct-100)/2; + + do { + unsigned long load, avg_load; + int local_group; + int i; + + local_group = cpu_isset(this_cpu, group->cpumask); + /* XXX: put a cpus allowed check */ + + /* Tally up the load of all CPUs in the group */ + avg_load = 0; + + for_each_cpu_mask(i, group->cpumask) { + /* Bias balancing toward cpus of our domain */ + if (local_group) + load = source_load(i, load_idx); + else + load = target_load(i, load_idx); + + avg_load += load; + } + + /* Adjust by relative CPU power of the group */ + avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + + if (local_group) { + this_load = avg_load; + this = group; + } else if (avg_load < min_load) { + min_load = avg_load; + idlest = group; + } + group = group->next; + } while (group != sd->groups); + + if (!idlest || 100*this_load < imbalance*min_load) + return NULL; + return idlest; +} + +/* + * find_idlest_queue - find the idlest runqueue among the cpus in group. + */ +static int find_idlest_cpu(struct sched_group *group, int this_cpu) +{ + unsigned long load, min_load = ULONG_MAX; + int idlest = -1; + int i; + + for_each_cpu_mask(i, group->cpumask) { + load = source_load(i, 0); + + if (load < min_load || (load == min_load && i == this_cpu)) { + min_load = load; + idlest = i; + } + } + + return idlest; +} + + #endif /* @@ -1107,11 +1180,6 @@ int fastcall wake_up_state(task_t *p, unsigned int state) return try_to_wake_up(p, state, 0); } -#ifdef CONFIG_SMP -static int find_idlest_cpu(struct task_struct *p, int this_cpu, - struct sched_domain *sd); -#endif - /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. @@ -1181,12 +1249,38 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) unsigned long flags; int this_cpu, cpu; runqueue_t *rq, *this_rq; +#ifdef CONFIG_SMP + struct sched_domain *tmp, *sd = NULL; +#endif rq = task_rq_lock(p, &flags); - cpu = task_cpu(p); + BUG_ON(p->state != TASK_RUNNING); this_cpu = smp_processor_id(); + cpu = task_cpu(p); - BUG_ON(p->state != TASK_RUNNING); +#ifdef CONFIG_SMP + for_each_domain(cpu, tmp) + if (tmp->flags & SD_BALANCE_FORK) + sd = tmp; + + if (sd) { + struct sched_group *group; + + cpu = task_cpu(p); + group = find_idlest_group(sd, p, cpu); + if (group) { + int new_cpu; + new_cpu = find_idlest_cpu(group, cpu); + if (new_cpu != -1 && new_cpu != cpu && + cpu_isset(new_cpu, p->cpus_allowed)) { + set_task_cpu(p, new_cpu); + task_rq_unlock(rq, &flags); + rq = task_rq_lock(p, &flags); + cpu = task_cpu(p); + } + } + } +#endif /* * We decrease the sleep average of forking parents @@ -1480,51 +1574,6 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) } } -/* - * find_idlest_cpu - find the least busy runqueue. 
- */ -static int find_idlest_cpu(struct task_struct *p, int this_cpu, - struct sched_domain *sd) -{ - unsigned long load, min_load, this_load; - int i, min_cpu; - cpumask_t mask; - - min_cpu = UINT_MAX; - min_load = ULONG_MAX; - - cpus_and(mask, sd->span, p->cpus_allowed); - - for_each_cpu_mask(i, mask) { - load = target_load(i, sd->wake_idx); - - if (load < min_load) { - min_cpu = i; - min_load = load; - - /* break out early on an idle CPU: */ - if (!min_load) - break; - } - } - - /* add +1 to account for the new task */ - this_load = source_load(this_cpu, sd->wake_idx) + SCHED_LOAD_SCALE; - - /* - * Would with the addition of the new task to the - * current CPU there be an imbalance between this - * CPU and the idlest CPU? - * - * Use half of the balancing threshold - new-context is - * a good opportunity to balance. - */ - if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100) - return min_cpu; - - return this_cpu; -} - /* * If dest_cpu is allowed for this process, migrate the task to it. * This is accomplished by forcing the cpu_allowed mask to only @@ -1578,8 +1627,15 @@ void sched_exec(void) sd = tmp; if (sd) { + struct sched_group *group; schedstat_inc(sd, sbe_attempts); - new_cpu = find_idlest_cpu(current, this_cpu, sd); + group = find_idlest_group(sd, current, this_cpu); + if (!group) + goto out; + new_cpu = find_idlest_cpu(group, this_cpu); + if (new_cpu == -1) + goto out; + if (new_cpu != this_cpu) { schedstat_inc(sd, sbe_pushed); put_cpu(); @@ -1792,12 +1848,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (local_group) { this_load = avg_load; this = group; - goto nextgroup; } else if (avg_load > max_load) { max_load = avg_load; busiest = group; } -nextgroup: group = group->next; } while (group != sd->groups); -- cgit v1.2.3 From 68767a0ae428801649d510d9a65bb71feed44dd1 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 25 Jun 2005 14:57:20 -0700 Subject: [PATCH] sched: schedstats update for balance on fork Add SCHEDSTAT statistics for sched-balance-fork. 
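The statistics added here instrument the two-step placement that balance-on-fork introduced: pick the least loaded group in the domain, then the least loaded CPU within that group. A minimal sketch of the pattern (the wrapper name pick_fork_cpu is hypothetical; locking and schedstat accounting omitted):

    static int pick_fork_cpu(struct sched_domain *sd, task_t *p, int cpu)
    {
            struct sched_group *group = find_idlest_group(sd, p, cpu);
            int new_cpu;

            if (!group)
                    return cpu;     /* domain already balanced */

            new_cpu = find_idlest_cpu(group, cpu);
            if (new_cpu == -1 || new_cpu == cpu ||
                !cpu_isset(new_cpu, p->cpus_allowed))
                    return cpu;     /* no better (allowed) CPU found */

            return new_cpu;
    }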
Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 63 +++++++++++++++++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 27 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 7ecc237e2aab..2711130cd973 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -309,7 +309,7 @@ static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) * bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ -#define SCHEDSTAT_VERSION 11 +#define SCHEDSTAT_VERSION 12 static int show_schedstat(struct seq_file *seq, void *v) { @@ -356,9 +356,10 @@ static int show_schedstat(struct seq_file *seq, void *v) sd->lb_nobusyq[itype], sd->lb_nobusyg[itype]); } - seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n", + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", sd->alb_cnt, sd->alb_failed, sd->alb_pushed, - sd->sbe_pushed, sd->sbe_attempts, + sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, + sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); } #endif @@ -1264,24 +1265,34 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) sd = tmp; if (sd) { + int new_cpu; struct sched_group *group; + schedstat_inc(sd, sbf_cnt); cpu = task_cpu(p); group = find_idlest_group(sd, p, cpu); - if (group) { - int new_cpu; - new_cpu = find_idlest_cpu(group, cpu); - if (new_cpu != -1 && new_cpu != cpu && - cpu_isset(new_cpu, p->cpus_allowed)) { - set_task_cpu(p, new_cpu); - task_rq_unlock(rq, &flags); - rq = task_rq_lock(p, &flags); - cpu = task_cpu(p); - } + if (!group) { + schedstat_inc(sd, sbf_balanced); + goto no_forkbalance; + } + + new_cpu = find_idlest_cpu(group, cpu); + if (new_cpu == -1 || new_cpu == cpu) { + schedstat_inc(sd, sbf_balanced); + goto no_forkbalance; + } + + if (cpu_isset(new_cpu, p->cpus_allowed)) { + schedstat_inc(sd, sbf_pushed); + set_task_cpu(p, new_cpu); + task_rq_unlock(rq, &flags); + rq = task_rq_lock(p, &flags); + cpu = task_cpu(p); } } -#endif +no_forkbalance: +#endif /* * We decrease the sleep average of forking parents * and children as well, to keep max-interactive tasks @@ -1618,30 +1629,28 @@ void sched_exec(void) struct sched_domain *tmp, *sd = NULL; int new_cpu, this_cpu = get_cpu(); - /* Prefer the current CPU if there's only this task running */ - if (this_rq()->nr_running <= 1) - goto out; - for_each_domain(this_cpu, tmp) if (tmp->flags & SD_BALANCE_EXEC) sd = tmp; if (sd) { struct sched_group *group; - schedstat_inc(sd, sbe_attempts); + schedstat_inc(sd, sbe_cnt); group = find_idlest_group(sd, current, this_cpu); - if (!group) + if (!group) { + schedstat_inc(sd, sbe_balanced); goto out; + } new_cpu = find_idlest_cpu(group, this_cpu); - if (new_cpu == -1) + if (new_cpu == -1 || new_cpu == this_cpu) { + schedstat_inc(sd, sbe_balanced); goto out; - - if (new_cpu != this_cpu) { - schedstat_inc(sd, sbe_pushed); - put_cpu(); - sched_migrate_task(current, new_cpu); - return; } + + schedstat_inc(sd, sbe_pushed); + put_cpu(); + sched_migrate_task(current, new_cpu); + return; } out: put_cpu(); -- cgit v1.2.3 From 48c08d3f8ff94fa118187e4d8d4a5707bb85e59d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 25 Jun 2005 14:57:22 -0700 Subject: [PATCH] sched: uninline task_timeslice From "Chen, Kenneth W": uninline task_timeslice() - reduces code footprint noticeably, and it's slowpath code.
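For context, task_timeslice() (shown in the hunk that follows) maps static priority to a timeslice via SCALE_PRIO. Assuming the stock constants of 2.6-era kernels (DEF_TIMESLICE of 100 ms and MIN_TIMESLICE of 5 ms at HZ=1000, MAX_PRIO = 140, MAX_USER_PRIO = 40 - values from the surrounding kernel, not from this patch), the rough worked values are:

    /*
     * SCALE_PRIO(x, prio) = max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2),
     *                           MIN_TIMESLICE)
     *
     * nice -20 (prio 100): SCALE_PRIO(400, 100) = 400 * 40 / 20 = 800 ms
     * nice   0 (prio 120): SCALE_PRIO(100, 120) = 100 * 20 / 20 = 100 ms
     * nice +19 (prio 139): max(100 * 1 / 20, 5)                 =   5 ms
     */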
Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 2711130cd973..98bf1c091da5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -166,7 +166,7 @@ #define SCALE_PRIO(x, prio) \ max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) -static inline unsigned int task_timeslice(task_t *p) +static unsigned int task_timeslice(task_t *p) { if (p->static_prio < NICE_TO_PRIO(0)) return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); -- cgit v1.2.3 From 4866cde064afbb6c2a488c265e696879de616daa Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 25 Jun 2005 14:57:23 -0700 Subject: [PATCH] sched: cleanup context switch locking Instead of requiring architecture code to interact with the scheduler's locking implementation, provide a couple of defines that can be used by the architecture to request runqueue unlocked context switches, and ask for interrupts to be enabled over the context switch. Also replaces the "switch_lock" used by these architectures with an oncpu flag (note, not a potentially slow bitflag). This eliminates one bus locked memory operation when context switching, and simplifies the task_running function. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 108 insertions(+), 24 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 98bf1c091da5..b1410577f9a8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -268,14 +268,71 @@ static DEFINE_PER_CPU(struct runqueue, runqueues); #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -/* - * Default context-switch locking: - */ #ifndef prepare_arch_switch -# define prepare_arch_switch(rq, next) do { } while (0) -# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) -# define task_running(rq, p) ((rq)->curr == (p)) +# define prepare_arch_switch(next) do { } while (0) +#endif +#ifndef finish_arch_switch +# define finish_arch_switch(prev) do { } while (0) +#endif + +#ifndef __ARCH_WANT_UNLOCKED_CTXSW +static inline int task_running(runqueue_t *rq, task_t *p) +{ + return rq->curr == p; +} + +static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) +{ +} + +static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) +{ + spin_unlock_irq(&rq->lock); +} + +#else /* __ARCH_WANT_UNLOCKED_CTXSW */ +static inline int task_running(runqueue_t *rq, task_t *p) +{ +#ifdef CONFIG_SMP + return p->oncpu; +#else + return rq->curr == p; +#endif +} + +static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) +{ +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->oncpu = 1; +#endif +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + spin_unlock_irq(&rq->lock); +#else + spin_unlock(&rq->lock); #endif +} + +static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->oncpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. 
+ */ + smp_wmb(); + prev->oncpu = 0; +#endif +#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_enable(); +#endif +} +#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ /* * task_rq_lock - lock the runqueue a given task resides on and disable @@ -1196,17 +1253,14 @@ void fastcall sched_fork(task_t *p) p->state = TASK_RUNNING; INIT_LIST_HEAD(&p->run_list); p->array = NULL; - spin_lock_init(&p->switch_lock); #ifdef CONFIG_SCHEDSTATS memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) + p->oncpu = 0; +#endif #ifdef CONFIG_PREEMPT - /* - * During context-switch we hold precisely one spinlock, which - * schedule_tail drops. (in the common case it's this_rq()->lock, - * but it also can be p->switch_lock.) So we compensate with a count - * of 1. Also, we want to start with kernel preemption disabled. - */ + /* Want to start with kernel preemption disabled. */ p->thread_info->preempt_count = 1; #endif /* @@ -1387,23 +1441,41 @@ void fastcall sched_exit(task_t * p) task_rq_unlock(rq, &flags); } +/** + * prepare_task_switch - prepare to switch tasks + * @rq: the runqueue preparing to switch + * @next: the task we are going to switch to. + * + * This is called with the rq lock held and interrupts off. It must + * be paired with a subsequent finish_task_switch after the context + * switch. + * + * prepare_task_switch sets up locking and calls architecture specific + * hooks. + */ +static inline void prepare_task_switch(runqueue_t *rq, task_t *next) +{ + prepare_lock_switch(rq, next); + prepare_arch_switch(next); +} + /** * finish_task_switch - clean up after a task-switch * @prev: the thread we just switched away from. * - * We enter this with the runqueue still locked, and finish_arch_switch() - * will unlock it along with doing any other architecture-specific cleanup - * actions. + * finish_task_switch must be called after the context switch, paired + * with a prepare_task_switch call before the context switch. + * finish_task_switch will reconcile locking set up by prepare_task_switch, + * and do any other architecture-specific cleanup actions. * * Note that we may have delayed dropping an mm in context_switch(). If * so, we finish that here outside of the runqueue lock. (Doing it * with the lock held can cause deadlocks; see schedule() for * details.) 
*/ -static inline void finish_task_switch(task_t *prev) +static inline void finish_task_switch(runqueue_t *rq, task_t *prev) __releases(rq->lock) { - runqueue_t *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; unsigned long prev_task_flags; @@ -1421,7 +1493,8 @@ static inline void finish_task_switch(task_t *prev) * Manfred Spraul */ prev_task_flags = prev->flags; - finish_arch_switch(rq, prev); + finish_arch_switch(prev); + finish_lock_switch(rq, prev); if (mm) mmdrop(mm); if (unlikely(prev_task_flags & PF_DEAD)) @@ -1435,8 +1508,12 @@ static inline void finish_task_switch(task_t *prev) asmlinkage void schedule_tail(task_t *prev) __releases(rq->lock) { - finish_task_switch(prev); - + runqueue_t *rq = this_rq(); + finish_task_switch(rq, prev); +#ifdef __ARCH_WANT_UNLOCKED_CTXSW + /* In this case, finish_task_switch does not reenable preemption */ + preempt_enable(); +#endif if (current->set_child_tid) put_user(current->pid, current->set_child_tid); } @@ -2816,11 +2893,15 @@ switch_tasks: rq->curr = next; ++*switch_count; - prepare_arch_switch(rq, next); + prepare_task_switch(rq, next); prev = context_switch(rq, prev, next); barrier(); - - finish_task_switch(prev); + /* + * this_rq must be evaluated again because prev may have moved + * CPUs since it called schedule(), thus the 'rq' on its stack + * frame will be invalid. + */ + finish_task_switch(this_rq(), prev); } else spin_unlock_irq(&rq->lock); @@ -4085,6 +4166,9 @@ void __devinit init_idle(task_t *idle, int cpu) spin_lock_irqsave(&rq->lock, flags); rq->curr = rq->idle = idle; +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) + idle->oncpu = 1; +#endif set_tsk_need_resched(idle); spin_unlock_irqrestore(&rq->lock, flags); -- cgit v1.2.3 From 41c7ce9ad9a859871dffbe7dbc8b1f9571724e3c Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 25 Jun 2005 14:57:24 -0700 Subject: [PATCH] sched: null domains Fix the last 2 places that directly access a runqueue's sched-domain and assume it cannot be NULL. That allows the use of NULL for domain, instead of a dummy domain, to signify no balancing is to happen. No functional changes. 
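The pattern this patch relies on is visible in the hunks that follow: instead of dereferencing rq->sd unconditionally, walk the (possibly empty) domain hierarchy and remember the topmost domain carrying the flag of interest. A condensed sketch of the idiom, taken from the wake_sleeping_dependent() change below:

    struct sched_domain *tmp, *sd = NULL;

    /* sd stays NULL when this CPU has no domain with the flag set */
    for_each_domain(this_cpu, tmp)
            if (tmp->flags & SD_SHARE_CPUPOWER)
                    sd = tmp;

    if (!sd)
            return;         /* e.g. an isolated CPU left on the NULL domain */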
Signed-off-by: Nick Piggin Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index b1410577f9a8..77c07c2928b9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2579,11 +2579,15 @@ out: #ifdef CONFIG_SCHED_SMT static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) { - struct sched_domain *sd = this_rq->sd; + struct sched_domain *tmp, *sd = NULL; cpumask_t sibling_map; int i; - if (!(sd->flags & SD_SHARE_CPUPOWER)) + for_each_domain(this_cpu, tmp) + if (tmp->flags & SD_SHARE_CPUPOWER) + sd = tmp; + + if (!sd) return; /* @@ -2624,13 +2628,17 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) { - struct sched_domain *sd = this_rq->sd; + struct sched_domain *tmp, *sd = NULL; cpumask_t sibling_map; prio_array_t *array; int ret = 0, i; task_t *p; - if (!(sd->flags & SD_SHARE_CPUPOWER)) + for_each_domain(this_cpu, tmp) + if (tmp->flags & SD_SHARE_CPUPOWER) + sd = tmp; + + if (!sd) return 0; /* @@ -4617,6 +4625,11 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) { int level = 0; + if (!sd) { + printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); + return; + } + printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); do { @@ -4874,7 +4887,7 @@ static void __devinit arch_init_sched_domains(void) cpus_and(cpu_default_map, cpu_default_map, cpu_online_map); /* - * Set up domains. Isolated domains just stay on the dummy domain. + * Set up domains. Isolated domains just stay on the NULL domain. */ for_each_cpu_mask(i, cpu_default_map) { int group; @@ -4987,18 +5000,11 @@ static void __devinit arch_destroy_sched_domains(void) #endif /* ARCH_HAS_SCHED_DOMAIN */ -/* - * Initial dummy domain for early boot and for hotplug cpu. Being static, - * it is initialized to zero, so all balancing flags are cleared which is - * what we want. - */ -static struct sched_domain sched_domain_dummy; - #ifdef CONFIG_HOTPLUG_CPU /* * Force a reinitialization of the sched domains hierarchy. The domains * and groups cannot be updated in place without racing with the balancing - * code, so we temporarily attach all running cpus to a "dummy" domain + * code, so we temporarily attach all running cpus to the NULL domain * which will prevent rebalancing while the sched domains are recalculated. */ static int update_sched_domains(struct notifier_block *nfb, @@ -5010,7 +5016,7 @@ static int update_sched_domains(struct notifier_block *nfb, case CPU_UP_PREPARE: case CPU_DOWN_PREPARE: for_each_online_cpu(i) - cpu_attach_domain(&sched_domain_dummy, i); + cpu_attach_domain(NULL, i); arch_destroy_sched_domains(); return NOTIFY_OK; @@ -5072,7 +5078,7 @@ void __init sched_init(void) rq->best_expired_prio = MAX_PRIO; #ifdef CONFIG_SMP - rq->sd = &sched_domain_dummy; + rq->sd = NULL; for (j = 1; j < 3; j++) rq->cpu_load[j] = 0; rq->active_balance = 0; -- cgit v1.2.3 From 245af2c7870bd5940f7bfad19a0a03b32751fbc5 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Sat, 25 Jun 2005 14:57:25 -0700 Subject: [PATCH] sched: remove degenerate domains Remove degenerate scheduler domains during the sched-domain init. For example on x86_64, we always have NUMA configured in. On Intel EM64T systems, the topmost sched domain will be the NUMA domain, with only one sched_group in it.
With fork/exec balancing (Nick's recent fixes in the -mm tree), we always end up making wrong decisions because of this topmost domain (as it contains only one group and find_idlest_group always returns NULL). We end up loading the HT package completely first, then letting active load balancing kick in and correct it. In general, this patch also makes sense without Nick's recent fixes in -mm. From Nick Piggin: Modified to account for more than just sched_groups when scanning for degenerate domains. And allow a runqueue's sd to go NULL rather than keep a single degenerate domain around (this happens when you run with maxcpus=1). Signed-off-by: Suresh Siddha Signed-off-by: Nick Piggin Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 77c07c2928b9..e75b301b5340 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4712,6 +4712,57 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) #define sched_domain_debug(sd, cpu) {} #endif +static int __devinit sd_degenerate(struct sched_domain *sd) +{ + if (cpus_weight(sd->span) == 1) + return 1; + + /* Following flags need at least 2 groups */ + if (sd->flags & (SD_LOAD_BALANCE | + SD_BALANCE_NEWIDLE | + SD_BALANCE_FORK | + SD_BALANCE_EXEC)) { + if (sd->groups != sd->groups->next) + return 0; + } + + /* Following flags don't use groups */ + if (sd->flags & (SD_WAKE_IDLE | + SD_WAKE_AFFINE | + SD_WAKE_BALANCE)) + return 0; + + return 1; +} + +static int __devinit sd_parent_degenerate(struct sched_domain *sd, + struct sched_domain *parent) +{ + unsigned long cflags = sd->flags, pflags = parent->flags; + + if (sd_degenerate(parent)) + return 1; + + if (!cpus_equal(sd->span, parent->span)) + return 0; + + /* Does parent contain flags not in child? */ + /* WAKE_BALANCE is a subset of WAKE_AFFINE */ + if (cflags & SD_WAKE_AFFINE) + pflags &= ~SD_WAKE_BALANCE; + /* Flags needing groups don't count if only 1 group in parent */ + if (parent->groups == parent->groups->next) { + pflags &= ~(SD_LOAD_BALANCE | + SD_BALANCE_NEWIDLE | + SD_BALANCE_FORK | + SD_BALANCE_EXEC); + } + if (~cflags & pflags) + return 0; + + return 1; +} + /* * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock. @@ -4722,6 +4773,19 @@ void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) unsigned long flags; runqueue_t *rq = cpu_rq(cpu); int local = 1; + struct sched_domain *tmp; + + /* Remove the sched domains which do not contribute to scheduling. */ + for (tmp = sd; tmp; tmp = tmp->parent) { + struct sched_domain *parent = tmp->parent; + if (!parent) + break; + if (sd_parent_degenerate(tmp, parent)) + tmp->parent = parent->parent; + } + + if (sd && sd_degenerate(sd)) + sd = sd->parent; sched_domain_debug(sd, cpu); -- cgit v1.2.3 From 3dbd5342074a1e570ec84edf859deb9be588006d Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 25 Jun 2005 14:57:26 -0700 Subject: [PATCH] sched: multilevel sbe sbf The fundamental problem that Suresh has with balance on exec and fork is that it only tries to balance the top level domain with the flag set. This was worked around by removing degenerate domains, but is still a problem if people want to start using more complex sched-domains, especially multilevel NUMA that ia64 is already using.
This patch makes balance on fork and exec try balancing over not just the topmost domain with the flag set, but all the way down the domain tree. Signed-off-by: Nick Piggin Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 45 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 7 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index e75b301b5340..ef32389ee768 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1319,21 +1319,24 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) sd = tmp; if (sd) { + cpumask_t span; int new_cpu; struct sched_group *group; +again: schedstat_inc(sd, sbf_cnt); + span = sd->span; cpu = task_cpu(p); group = find_idlest_group(sd, p, cpu); if (!group) { schedstat_inc(sd, sbf_balanced); - goto no_forkbalance; + goto nextlevel; } new_cpu = find_idlest_cpu(group, cpu); if (new_cpu == -1 || new_cpu == cpu) { schedstat_inc(sd, sbf_balanced); - goto no_forkbalance; + goto nextlevel; } if (cpu_isset(new_cpu, p->cpus_allowed)) { @@ -1343,9 +1346,21 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) rq = task_rq_lock(p, &flags); cpu = task_cpu(p); } + + /* Now try balancing at a lower domain level */ +nextlevel: + sd = NULL; + for_each_domain(cpu, tmp) { + if (cpus_subset(span, tmp->span)) + break; + if (tmp->flags & SD_BALANCE_FORK) + sd = tmp; + } + + if (sd) + goto again; } -no_forkbalance: #endif /* * We decrease the sleep average of forking parents @@ -1711,25 +1726,41 @@ void sched_exec(void) sd = tmp; if (sd) { + cpumask_t span; struct sched_group *group; +again: schedstat_inc(sd, sbe_cnt); + span = sd->span; group = find_idlest_group(sd, current, this_cpu); if (!group) { schedstat_inc(sd, sbe_balanced); - goto out; + goto nextlevel; } new_cpu = find_idlest_cpu(group, this_cpu); if (new_cpu == -1 || new_cpu == this_cpu) { schedstat_inc(sd, sbe_balanced); - goto out; + goto nextlevel; } schedstat_inc(sd, sbe_pushed); put_cpu(); sched_migrate_task(current, new_cpu); - return; + + /* Now try balancing at a lower domain level */ + this_cpu = get_cpu(); +nextlevel: + sd = NULL; + for_each_domain(this_cpu, tmp) { + if (cpus_subset(span, tmp->span)) + break; + if (tmp->flags & SD_BALANCE_EXEC) + sd = tmp; + } + + if (sd) + goto again; } -out: + put_cpu(); } -- cgit v1.2.3 From 674311d5b411e9042df4fdf7aef0b3c8217b6240 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 25 Jun 2005 14:57:27 -0700 Subject: [PATCH] sched: RCU domains One of the problems with the multilevel balance-on-fork/exec is that it needs to jump through hoops to satisfy sched-domain's locking semantics (that is, you may traverse your own domain when not preemptable, and you may traverse others' domains when holding their runqueue lock). balance-on-exec had to potentially migrate between more than one CPU before finding a final CPU to migrate to, and balance-on-fork needed to potentially take multiple runqueue locks. So bite the bullet and make sched-domains go completely RCU. This actually simplifies the code quite a bit. From Ingo Molnar: schedstats RCU fix, and a nice comment on for_each_domain, from Ingo.
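A sketch of the resulting read/write discipline, condensed from the hunks below (not a complete treatment of the hotplug update path):

    /* reader: domain traversal only inside a preempt-disabled section */
    preempt_disable();
    for_each_domain(cpu, sd) {
            /* ... inspect sd; no sleeping in here ... */
    }
    preempt_enable();

    /* writer (cpu_attach_domain): publish the new tree, then wait for
     * a grace period before any replaced domains may be freed. */
    rcu_assign_pointer(rq->sd, sd);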
Signed-off-by: Ingo Molnar Signed-off-by: Nick Piggin Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 60 +++++++++++++++------------------------------------------- 1 file changed, 15 insertions(+), 45 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index ef32389ee768..54ce787b6207 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -260,8 +260,15 @@ struct runqueue { static DEFINE_PER_CPU(struct runqueue, runqueues); +/* + * The domain tree (rq->sd) is protected by RCU's quiescent state transition. + * See update_sched_domains: synchronize_kernel for details. + * + * The domain tree of any CPU may only be accessed from within + * preempt-disabled sections. + */ #define for_each_domain(cpu, domain) \ - for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) +for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent) #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() (&__get_cpu_var(runqueues)) @@ -395,6 +402,7 @@ static int show_schedstat(struct seq_file *seq, void *v) #ifdef CONFIG_SMP /* domain-specific stats */ + preempt_disable(); for_each_domain(cpu, sd) { enum idle_type itype; char mask_str[NR_CPUS]; @@ -419,6 +427,7 @@ static int show_schedstat(struct seq_file *seq, void *v) sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); } + preempt_enable(); #endif } return 0; @@ -824,22 +833,12 @@ inline int task_curr(const task_t *p) } #ifdef CONFIG_SMP -enum request_type { - REQ_MOVE_TASK, - REQ_SET_DOMAIN, -}; - typedef struct { struct list_head list; - enum request_type type; - /* For REQ_MOVE_TASK */ task_t *task; int dest_cpu; - /* For REQ_SET_DOMAIN */ - struct sched_domain *sd; - struct completion done; } migration_req_t; @@ -861,7 +860,6 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) } init_completion(&req->done); - req->type = REQ_MOVE_TASK; req->task = p; req->dest_cpu = dest_cpu; list_add(&req->list, &rq->migration_queue); @@ -4378,17 +4376,9 @@ static int migration_thread(void * data) req = list_entry(head->next, migration_req_t, list); list_del_init(head->next); - if (req->type == REQ_MOVE_TASK) { - spin_unlock(&rq->lock); - __migrate_task(req->task, cpu, req->dest_cpu); - local_irq_enable(); - } else if (req->type == REQ_SET_DOMAIN) { - rq->sd = req->sd; - spin_unlock_irq(&rq->lock); - } else { - spin_unlock_irq(&rq->lock); - WARN_ON(1); - } + spin_unlock(&rq->lock); + __migrate_task(req->task, cpu, req->dest_cpu); + local_irq_enable(); complete(&req->done); } @@ -4619,7 +4609,6 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, migration_req_t *req; req = list_entry(rq->migration_queue.next, migration_req_t, list); - BUG_ON(req->type != REQ_MOVE_TASK); list_del_init(&req->list); complete(&req->done); } @@ -4800,10 +4789,7 @@ static int __devinit sd_parent_degenerate(struct sched_domain *sd, */ void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) { - migration_req_t req; - unsigned long flags; runqueue_t *rq = cpu_rq(cpu); - int local = 1; struct sched_domain *tmp; /* Remove the sched domains which do not contribute to scheduling. 
*/ @@ -4820,24 +4806,7 @@ void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) sched_domain_debug(sd, cpu); - spin_lock_irqsave(&rq->lock, flags); - - if (cpu == smp_processor_id() || !cpu_online(cpu)) { - rq->sd = sd; - } else { - init_completion(&req.done); - req.type = REQ_SET_DOMAIN; - req.sd = sd; - list_add(&req.list, &rq->migration_queue); - local = 0; - } - - spin_unlock_irqrestore(&rq->lock, flags); - - if (!local) { - wake_up_process(rq->migration_thread); - wait_for_completion(&req.done); - } + rcu_assign_pointer(rq->sd, sd); } /* cpus with isolated domains */ @@ -5112,6 +5081,7 @@ static int update_sched_domains(struct notifier_block *nfb, case CPU_DOWN_PREPARE: for_each_online_cpu(i) cpu_attach_domain(NULL, i); + synchronize_kernel(); arch_destroy_sched_domains(); return NOTIFY_OK; -- cgit v1.2.3 From 476d139c218e44e045e4bc6d4cc02b010b343939 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 25 Jun 2005 14:57:29 -0700 Subject: [PATCH] sched: consolidate sbe sbf Consolidate balance-on-exec with balance-on-fork. This is made easy by the sched-domains RCU patches. As well as the general goodness of code reduction, this allows the runqueues to be unlocked during balance-on-fork. schedstats is a problem. Maybe just have balance-on-event instead of distinguishing fork and exec? Signed-off-by: Nick Piggin Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 174 ++++++++++++++++++++++----------------------------------- 1 file changed, 68 insertions(+), 106 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 54ce787b6207..579da278e72f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1021,8 +1021,59 @@ static int find_idlest_cpu(struct sched_group *group, int this_cpu) return idlest; } +/* + * sched_balance_self: balance the current task (running on cpu) in domains + * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and + * SD_BALANCE_EXEC. + * + * Balance, ie. select the least loaded group. + * + * Returns the target CPU number, or the same CPU if no balancing is needed. + * + * preempt must be disabled. + */ +static int sched_balance_self(int cpu, int flag) +{ + struct task_struct *t = current; + struct sched_domain *tmp, *sd = NULL; -#endif + for_each_domain(cpu, tmp) + if (tmp->flags & flag) + sd = tmp; + + while (sd) { + cpumask_t span; + struct sched_group *group; + int new_cpu; + int weight; + + span = sd->span; + group = find_idlest_group(sd, t, cpu); + if (!group) + goto nextlevel; + + new_cpu = find_idlest_cpu(group, cpu); + if (new_cpu == -1 || new_cpu == cpu) + goto nextlevel; + + /* Now try balancing at a lower domain level */ + cpu = new_cpu; +nextlevel: + sd = NULL; + weight = cpus_weight(span); + for_each_domain(cpu, tmp) { + if (weight <= cpus_weight(tmp->span)) + break; + if (tmp->flags & flag) + sd = tmp; + } + /* while loop will break here if sd == NULL */ + } + + return cpu; +} + +#endif /* CONFIG_SMP */ /* * wake_idle() will wake a task on an idle cpu if task->cpu is @@ -1240,8 +1291,15 @@ int fastcall wake_up_state(task_t *p, unsigned int state) * Perform scheduler related setup for a newly forked process p. * p is forked by current. 
*/ -void fastcall sched_fork(task_t *p) +void fastcall sched_fork(task_t *p, int clone_flags) { + int cpu = get_cpu(); + +#ifdef CONFIG_SMP + cpu = sched_balance_self(cpu, SD_BALANCE_FORK); +#endif + set_task_cpu(p, cpu); + /* * We mark the process as running here, but have not actually * inserted it onto the runqueue yet. This guarantees that @@ -1282,12 +1340,10 @@ void fastcall sched_fork(task_t *p) * runqueue lock is not a problem. */ current->time_slice = 1; - preempt_disable(); scheduler_tick(); - local_irq_enable(); - preempt_enable(); - } else - local_irq_enable(); + } + local_irq_enable(); + put_cpu(); } /* @@ -1302,64 +1358,12 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) unsigned long flags; int this_cpu, cpu; runqueue_t *rq, *this_rq; -#ifdef CONFIG_SMP - struct sched_domain *tmp, *sd = NULL; -#endif rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_RUNNING); this_cpu = smp_processor_id(); cpu = task_cpu(p); -#ifdef CONFIG_SMP - for_each_domain(cpu, tmp) - if (tmp->flags & SD_BALANCE_FORK) - sd = tmp; - - if (sd) { - cpumask_t span; - int new_cpu; - struct sched_group *group; - -again: - schedstat_inc(sd, sbf_cnt); - span = sd->span; - cpu = task_cpu(p); - group = find_idlest_group(sd, p, cpu); - if (!group) { - schedstat_inc(sd, sbf_balanced); - goto nextlevel; - } - - new_cpu = find_idlest_cpu(group, cpu); - if (new_cpu == -1 || new_cpu == cpu) { - schedstat_inc(sd, sbf_balanced); - goto nextlevel; - } - - if (cpu_isset(new_cpu, p->cpus_allowed)) { - schedstat_inc(sd, sbf_pushed); - set_task_cpu(p, new_cpu); - task_rq_unlock(rq, &flags); - rq = task_rq_lock(p, &flags); - cpu = task_cpu(p); - } - - /* Now try balancing at a lower domain level */ -nextlevel: - sd = NULL; - for_each_domain(cpu, tmp) { - if (cpus_subset(span, tmp->span)) - break; - if (tmp->flags & SD_BALANCE_FORK) - sd = tmp; - } - - if (sd) - goto again; - } - -#endif /* * We decrease the sleep average of forking parents * and children as well, to keep max-interactive tasks @@ -1708,58 +1712,16 @@ out: } /* - * sched_exec(): find the highest-level, exec-balance-capable - * domain and try to migrate the task to the least loaded CPU. - * - * execve() is a valuable balancing opportunity, because at this point - * the task has the smallest effective memory and cache footprint. + * sched_exec - execve() is a valuable balancing opportunity, because at + * this point the task has the smallest effective memory and cache footprint. 
*/ void sched_exec(void) { - struct sched_domain *tmp, *sd = NULL; int new_cpu, this_cpu = get_cpu(); - - for_each_domain(this_cpu, tmp) - if (tmp->flags & SD_BALANCE_EXEC) - sd = tmp; - - if (sd) { - cpumask_t span; - struct sched_group *group; -again: - schedstat_inc(sd, sbe_cnt); - span = sd->span; - group = find_idlest_group(sd, current, this_cpu); - if (!group) { - schedstat_inc(sd, sbe_balanced); - goto nextlevel; - } - new_cpu = find_idlest_cpu(group, this_cpu); - if (new_cpu == -1 || new_cpu == this_cpu) { - schedstat_inc(sd, sbe_balanced); - goto nextlevel; - } - - schedstat_inc(sd, sbe_pushed); - put_cpu(); - sched_migrate_task(current, new_cpu); - - /* Now try balancing at a lower domain level */ - this_cpu = get_cpu(); -nextlevel: - sd = NULL; - for_each_domain(this_cpu, tmp) { - if (cpus_subset(span, tmp->span)) - break; - if (tmp->flags & SD_BALANCE_EXEC) - sd = tmp; - } - - if (sd) - goto again; - } - + new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); put_cpu(); + if (new_cpu != this_cpu) + sched_migrate_task(current, new_cpu); } /* -- cgit v1.2.3 From 77391d71681d05d2f4502f91ad62618522abf624 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 25 Jun 2005 14:57:30 -0700 Subject: [PATCH] sched: relax pinned balancing The maximum rebalance interval allowed by the multiprocessor balancing backoff is often not large enough to handle corner cases where there are lots of tasks pinned on a CPU. Suresh reported: I see system livelock's if for example I have 7000 processes pinned onto one cpu (this is on the fastest 8-way system I have access to). After this patch, the machine is reported to go well above this number. Signed-off-by: Nick Piggin Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 579da278e72f..6e452eb95ac3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2030,6 +2030,12 @@ static runqueue_t *find_busiest_queue(struct sched_group *group) return busiest; } +/* + * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but + * so long as it is large enough. + */ +#define MAX_PINNED_INTERVAL 512 + /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. @@ -2042,7 +2048,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, struct sched_group *group; runqueue_t *busiest; unsigned long imbalance; - int nr_moved, all_pinned; + int nr_moved, all_pinned = 0; int active_balance = 0; spin_lock(&this_rq->lock); @@ -2133,7 +2139,8 @@ out_balanced: sd->nr_balance_failed = 0; /* tune up the balancing interval */ - if (sd->balance_interval < sd->max_interval) + if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || + (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; return 0; -- cgit v1.2.3 From a3464a102a69a4e00efb0a763e274ce290995b4b Mon Sep 17 00:00:00 2001 From: Chen Shang Date: Sat, 25 Jun 2005 14:57:31 -0700 Subject: [PATCH] sched: micro-optimize task requeueing in schedule() micro-optimize task requeueing in schedule() & clean up recalc_task_prio(). 
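The win comes from skipping the priority-bitmap and nr_active updates when the recalculated priority turns out to be unchanged; in that common case the task only needs to move to the tail of its current queue. For reference, requeue_task() in this era's sched.c is essentially a one-line list splice (a sketch from memory, not quoted from the patch):

static void requeue_task(task_t *p, prio_array_t *array)
{
	/*
	 * Same priority, same array: just rotate the task to the tail
	 * of its list. Unlike dequeue_task()/enqueue_task(), this never
	 * touches array->nr_active or the priority bitmap.
	 */
	list_move_tail(&p->run_list, array->queue + p->prio);
}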
Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 6e452eb95ac3..a3d1c8e43d34 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -673,7 +673,7 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq) rq->nr_running++; } -static void recalc_task_prio(task_t *p, unsigned long long now) +static int recalc_task_prio(task_t *p, unsigned long long now) { /* Caller must always ensure 'now >= p->timestamp' */ unsigned long long __sleep_time = now - p->timestamp; @@ -732,7 +732,7 @@ static void recalc_task_prio(task_t *p, unsigned long long now) } } - p->prio = effective_prio(p); + return effective_prio(p); } /* @@ -755,7 +755,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) } #endif - recalc_task_prio(p, now); + p->prio = recalc_task_prio(p, now); /* * This checks to make sure it's not an uninterruptible task @@ -2751,7 +2751,7 @@ asmlinkage void __sched schedule(void) struct list_head *queue; unsigned long long now; unsigned long run_time; - int cpu, idx; + int cpu, idx, new_prio; /* * Test if we are atomic. Since do_exit() needs to call into @@ -2873,9 +2873,14 @@ go_idle: delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; array = next->array; - dequeue_task(next, array); - recalc_task_prio(next, next->timestamp + delta); - enqueue_task(next, array); + new_prio = recalc_task_prio(next, next->timestamp + delta); + + if (unlikely(next->prio != new_prio)) { + dequeue_task(next, array); + next->prio = new_prio; + enqueue_task(next, array); + } else + requeue_task(next, array); } next->activated = 0; switch_tasks: -- cgit v1.2.3 From 37e4ab3f0cba13adf3535d373fd98e5ee47b5410 Mon Sep 17 00:00:00 2001 From: Olivier Croquette Date: Sat, 25 Jun 2005 14:57:32 -0700 Subject: [PATCH] Changing RT priority without CAP_SYS_NICE Presently, a process without the capability CAP_SYS_NICE cannot change its own policy, which is OK. But it also cannot decrease its RT priority (if scheduled with policy SCHED_RR or SCHED_FIFO), which is what this patch changes. The rationale is the same as for the nice value: a process should be able to ask for a lower priority for itself. Increasing the priority is still not allowed. This is for example useful if you give a multithreaded user process an RT priority, and the process would like to organize its internal threads using priorities also. Then you can give the process the highest priority needed, N, and the process starts its threads with lower priorities: N-1, N-2... The POSIX standard says that the permissions are implementation-specific, so I think we can do that. In a sense, it makes the permissions consistent whatever the policy is: with this patch, processes scheduled under SCHED_FIFO, SCHED_RR and SCHED_OTHER can all decrease their priority. From: Ingo Molnar cleaned up and merged to -mm.
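From userspace, the new rule looks like this (illustrative only; the priority values and the assumption that the task already runs as SCHED_FIFO are examples, not from the patch):

#include <sched.h>
#include <stdio.h>

/*
 * Illustrative only: a task already running as SCHED_FIFO at, say,
 * priority 50 lowers itself to 49. After this patch the call succeeds
 * without CAP_SYS_NICE, because the policy is unchanged and the new
 * priority is below the current one; raising it would still return
 * -EPERM (unless permitted by RLIMIT_RTPRIO).
 */
int lower_own_rt_priority(void)
{
	struct sched_param sp = { .sched_priority = 49 };

	if (sched_setscheduler(0 /* self */, SCHED_FIFO, &sp) != 0) {
		perror("sched_setscheduler");
		return -1;
	}
	return 0;
}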
Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index a3d1c8e43d34..d3d81b82e378 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3531,13 +3531,24 @@ recheck: if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) return -EINVAL; - if ((policy == SCHED_FIFO || policy == SCHED_RR) && - param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur && - !capable(CAP_SYS_NICE)) - return -EPERM; - if ((current->euid != p->euid) && (current->euid != p->uid) && - !capable(CAP_SYS_NICE)) - return -EPERM; + /* + * Allow unprivileged RT tasks to decrease priority: + */ + if (!capable(CAP_SYS_NICE)) { + /* can't change policy */ + if (policy != p->policy) + return -EPERM; + /* can't increase priority */ + if (policy != SCHED_NORMAL && + param->sched_priority > p->rt_priority && + param->sched_priority > + p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) + return -EPERM; + /* can't change other user's priorities */ + if ((current->euid != p->euid) && + (current->euid != p->uid)) + return -EPERM; + } retval = security_task_setscheduler(p, policy, param); if (retval) -- cgit v1.2.3 From 1a20ff27ef75d866730ee796acd811a925af762f Mon Sep 17 00:00:00 2001 From: Dinakar Guniguntala Date: Sat, 25 Jun 2005 14:57:33 -0700 Subject: [PATCH] Dynamic sched domains: sched changes The following patches add dynamic sched domains functionality that was extensively discussed on lkml and lse-tech. I would like to see this added to -mm o The main advantage with this feature is that it ensures that the scheduler load balancing code only balances against the cpus that are in the sched domain as defined by an exclusive cpuset and not all of the cpus in the system. This removes any overhead due to load balancing code trying to pull tasks outside of the cpu exclusive cpuset only to be prevented by the tasks' cpus_allowed mask. o cpu exclusive cpusets are useful for servers running orthogonal workloads such as RT applications requiring low latency and HPC applications that are throughput sensitive o It provides a new API partition_sched_domains in sched.c that makes dynamic sched domains possible. o cpu_exclusive cpusets are now associated with a sched domain. This means that users can dynamically modify the sched domains through the cpuset file system interface o ia64 sched domain code has been updated to support this feature as well o Currently, this does not support hotplug. (However some of my tests indicate hotplug+preempt is currently broken) o I have tested it extensively on x86. o This should have very minimal impact on performance as none of the fast paths are affected Signed-off-by: Dinakar Guniguntala Acked-by: Paul Jackson Acked-by: Nick Piggin Acked-by: Matthew Dobson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 132 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 86 insertions(+), 46 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index d3d81b82e378..dee96b22635e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -262,7 +262,7 @@ static DEFINE_PER_CPU(struct runqueue, runqueues); /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. - * See update_sched_domains: synchronize_kernel for details. + * See detach_destroy_domains: synchronize_sched for details.
* * The domain tree of any CPU may only be accessed from within * preempt-disabled sections. @@ -4624,7 +4624,7 @@ int __init migration_init(void) #endif #ifdef CONFIG_SMP -#define SCHED_DOMAIN_DEBUG +#undef SCHED_DOMAIN_DEBUG #ifdef SCHED_DOMAIN_DEBUG static void sched_domain_debug(struct sched_domain *sd, int cpu) { @@ -4717,7 +4717,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) #define sched_domain_debug(sd, cpu) {} #endif -static int __devinit sd_degenerate(struct sched_domain *sd) +static int sd_degenerate(struct sched_domain *sd) { if (cpus_weight(sd->span) == 1) return 1; @@ -4740,7 +4740,7 @@ static int __devinit sd_degenerate(struct sched_domain *sd) return 1; } -static int __devinit sd_parent_degenerate(struct sched_domain *sd, +static int sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) { unsigned long cflags = sd->flags, pflags = parent->flags; @@ -4772,7 +4772,7 @@ static int __devinit sd_parent_degenerate(struct sched_domain *sd, * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock. */ -void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) +void cpu_attach_domain(struct sched_domain *sd, int cpu) { runqueue_t *rq = cpu_rq(cpu); struct sched_domain *tmp; @@ -4823,7 +4823,7 @@ __setup ("isolcpus=", isolated_cpu_setup); * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. */ -void __devinit init_sched_build_groups(struct sched_group groups[], +void init_sched_build_groups(struct sched_group groups[], cpumask_t span, int (*group_fn)(int cpu)) { struct sched_group *first = NULL, *last = NULL; @@ -4859,13 +4859,14 @@ void __devinit init_sched_build_groups(struct sched_group groups[], #ifdef ARCH_HAS_SCHED_DOMAIN -extern void __devinit arch_init_sched_domains(void); -extern void __devinit arch_destroy_sched_domains(void); +extern void build_sched_domains(const cpumask_t *cpu_map); +extern void arch_init_sched_domains(const cpumask_t *cpu_map); +extern void arch_destroy_sched_domains(const cpumask_t *cpu_map); #else #ifdef CONFIG_SCHED_SMT static DEFINE_PER_CPU(struct sched_domain, cpu_domains); static struct sched_group sched_group_cpus[NR_CPUS]; -static int __devinit cpu_to_cpu_group(int cpu) +static int cpu_to_cpu_group(int cpu) { return cpu; } @@ -4873,7 +4874,7 @@ static int __devinit cpu_to_cpu_group(int cpu) static DEFINE_PER_CPU(struct sched_domain, phys_domains); static struct sched_group sched_group_phys[NR_CPUS]; -static int __devinit cpu_to_phys_group(int cpu) +static int cpu_to_phys_group(int cpu) { #ifdef CONFIG_SCHED_SMT return first_cpu(cpu_sibling_map[cpu]); @@ -4886,7 +4887,7 @@ static int __devinit cpu_to_phys_group(int cpu) static DEFINE_PER_CPU(struct sched_domain, node_domains); static struct sched_group sched_group_nodes[MAX_NUMNODES]; -static int __devinit cpu_to_node_group(int cpu) +static int cpu_to_node_group(int cpu) { return cpu_to_node(cpu); } @@ -4917,39 +4918,28 @@ static void check_sibling_maps(void) #endif /* - * Set up scheduler domains and groups. Callers must hold the hotplug lock. + * Build sched domains for a given set of cpus and attach the sched domains + * to the individual cpus */ -static void __devinit arch_init_sched_domains(void) +static void build_sched_domains(const cpumask_t *cpu_map) { int i; - cpumask_t cpu_default_map; - -#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) - check_sibling_maps(); -#endif - /* - * Setup mask for cpus without special case scheduling requirements. 
- * For now this just excludes isolated cpus, but could be used to - * exclude other special cases in the future. - */ - cpus_complement(cpu_default_map, cpu_isolated_map); - cpus_and(cpu_default_map, cpu_default_map, cpu_online_map); /* - * Set up domains. Isolated domains just stay on the NULL domain. + * Set up domains for cpus specified by the cpu_map. */ - for_each_cpu_mask(i, cpu_default_map) { + for_each_cpu_mask(i, *cpu_map) { int group; struct sched_domain *sd = NULL, *p; cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); - cpus_and(nodemask, nodemask, cpu_default_map); + cpus_and(nodemask, nodemask, *cpu_map); #ifdef CONFIG_NUMA sd = &per_cpu(node_domains, i); group = cpu_to_node_group(i); *sd = SD_NODE_INIT; - sd->span = cpu_default_map; + sd->span = *cpu_map; sd->groups = &sched_group_nodes[group]; #endif @@ -4967,7 +4957,7 @@ static void __devinit arch_init_sched_domains(void) group = cpu_to_cpu_group(i); *sd = SD_SIBLING_INIT; sd->span = cpu_sibling_map[i]; - cpus_and(sd->span, sd->span, cpu_default_map); + cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; sd->groups = &sched_group_cpus[group]; #endif @@ -4977,7 +4967,7 @@ static void __devinit arch_init_sched_domains(void) /* Set up CPU (sibling) groups */ for_each_online_cpu(i) { cpumask_t this_sibling_map = cpu_sibling_map[i]; - cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); + cpus_and(this_sibling_map, this_sibling_map, *cpu_map); if (i != first_cpu(this_sibling_map)) continue; @@ -4990,7 +4980,7 @@ static void __devinit arch_init_sched_domains(void) for (i = 0; i < MAX_NUMNODES; i++) { cpumask_t nodemask = node_to_cpumask(i); - cpus_and(nodemask, nodemask, cpu_default_map); + cpus_and(nodemask, nodemask, *cpu_map); if (cpus_empty(nodemask)) continue; @@ -5000,12 +4990,12 @@ static void __devinit arch_init_sched_domains(void) #ifdef CONFIG_NUMA /* Set up node groups */ - init_sched_build_groups(sched_group_nodes, cpu_default_map, + init_sched_build_groups(sched_group_nodes, *cpu_map, &cpu_to_node_group); #endif /* Calculate CPU power for physical packages and nodes */ - for_each_cpu_mask(i, cpu_default_map) { + for_each_cpu_mask(i, *cpu_map) { int power; struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT @@ -5029,7 +5019,7 @@ static void __devinit arch_init_sched_domains(void) } /* Attach the domains */ - for_each_online_cpu(i) { + for_each_cpu_mask(i, *cpu_map) { struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i); @@ -5039,16 +5029,71 @@ static void __devinit arch_init_sched_domains(void) cpu_attach_domain(sd, i); } } +/* + * Set up scheduler domains and groups. Callers must hold the hotplug lock. + */ +static void arch_init_sched_domains(cpumask_t *cpu_map) +{ + cpumask_t cpu_default_map; -#ifdef CONFIG_HOTPLUG_CPU -static void __devinit arch_destroy_sched_domains(void) +#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) + check_sibling_maps(); +#endif + /* + * Setup mask for cpus without special case scheduling requirements. + * For now this just excludes isolated cpus, but could be used to + * exclude other special cases in the future. + */ + cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); + + build_sched_domains(&cpu_default_map); +} + +static void arch_destroy_sched_domains(const cpumask_t *cpu_map) { /* Do nothing: everything is statically allocated. 
*/ } -#endif #endif /* ARCH_HAS_SCHED_DOMAIN */ +/* + * Detach sched domains from a group of cpus specified in cpu_map + * These cpus will now be attached to the NULL domain + */ +static inline void detach_destroy_domains(const cpumask_t *cpu_map) +{ + int i; + + for_each_cpu_mask(i, *cpu_map) + cpu_attach_domain(NULL, i); + synchronize_sched(); + arch_destroy_sched_domains(cpu_map); +} + +/* + * Partition sched domains as specified by the cpumasks below. + * This attaches all cpus from the cpumasks to the NULL domain, + * waits for a RCU quiescent period, recalculates sched + * domain information and then attaches them back to the + * correct sched domains + * Call with hotplug lock held + */ +void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) +{ + cpumask_t change_map; + + cpus_and(*partition1, *partition1, cpu_online_map); + cpus_and(*partition2, *partition2, cpu_online_map); + cpus_or(change_map, *partition1, *partition2); + + /* Detach sched domains from all of the affected cpus */ + detach_destroy_domains(&change_map); + if (!cpus_empty(*partition1)) + build_sched_domains(partition1); + if (!cpus_empty(*partition2)) + build_sched_domains(partition2); +} + #ifdef CONFIG_HOTPLUG_CPU /* * Force a reinitialization of the sched domains hierarchy. The domains @@ -5059,15 +5104,10 @@ static void __devinit arch_destroy_sched_domains(void) static int update_sched_domains(struct notifier_block *nfb, unsigned long action, void *hcpu) { - int i; - switch (action) { case CPU_UP_PREPARE: case CPU_DOWN_PREPARE: - for_each_online_cpu(i) - cpu_attach_domain(NULL, i); - synchronize_kernel(); - arch_destroy_sched_domains(); + detach_destroy_domains(&cpu_online_map); return NOTIFY_OK; case CPU_UP_CANCELED: @@ -5083,7 +5123,7 @@ static int update_sched_domains(struct notifier_block *nfb, } /* The hotplug lock is already held by cpu_up/cpu_down */ - arch_init_sched_domains(); + arch_init_sched_domains(&cpu_online_map); return NOTIFY_OK; } @@ -5092,7 +5132,7 @@ static int update_sched_domains(struct notifier_block *nfb, void __init sched_init_smp(void) { lock_cpu_hotplug(); - arch_init_sched_domains(); + arch_init_sched_domains(&cpu_online_map); unlock_cpu_hotplug(); /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); -- cgit v1.2.3 From 3e1d1d28d99dabe63c64f7f40f1ca1d646de1f73 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 24 Jun 2005 23:13:50 -0700 Subject: [PATCH] Cleanup patch for process freezing 1. Establish a simple API for process freezing defined in linux/include/sched.h: frozen(process) Check for frozen process freezing(process) Check if a process is being frozen freeze(process) Tell a process to freeze (go to refrigerator) thaw_process(process) Restart process frozen_process(process) Process is frozen now 2. Remove all references to PF_FREEZE and PF_FROZEN from all kernel sources except sched.h 3. Fix numerous locations where try_to_freeze is manually done by a driver 4. Remove the argument that is no longer necessary from two function calls. 5. Some whitespace cleanup 6. Clear potential race in refrigerator (provides an open window of PF_FREEZE cleared before setting PF_FROZEN, recalc_sigpending does not check PF_FROZEN). This patch does not address the problem of freeze_processes() violating the rule that a task may only modify its own flags by setting PF_FREEZE. This is not clean in an SMP environment. freeze(process) is therefore not SMP safe! 
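The helpers listed above map directly onto the two task flags the message mentions; one plausible shape for the test side (a sketch assuming the PF_FREEZE/PF_FROZEN flags, not quoted from the patch) is:

/* Sketch, assuming the PF_FREEZE/PF_FROZEN task flags named above. */
static inline int frozen(struct task_struct *p)
{
	return p->flags & PF_FROZEN;	/* parked in the refrigerator */
}

static inline int freezing(struct task_struct *p)
{
	return p->flags & PF_FREEZE;	/* told to freeze, not yet frozen */
}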
Signed-off-by: Christoph Lameter Signed-off-by: Linus Torvalds --- kernel/sched.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 76080d142e3d..6fa9ea4ae44c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4174,8 +4174,7 @@ static int migration_thread(void * data) struct list_head *head; migration_req_t *req; - if (current->flags & PF_FREEZE) - refrigerator(PF_FREEZE); + try_to_freeze(); spin_lock_irq(&rq->lock); -- cgit v1.2.3
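The hunk above is the pattern this cleanup applies throughout the tree: kernel threads call try_to_freeze() at the top of their loop instead of open-coding the PF_FREEZE check. A minimal freezer-aware thread then looks roughly like this (a sketch; the thread and its work function are hypothetical):

static int example_thread(void *unused)
{
	while (!kthread_should_stop()) {
		try_to_freeze();	/* enter the refrigerator if asked */

		do_pending_work();	/* hypothetical work function */

		set_current_state(TASK_INTERRUPTIBLE);
		schedule_timeout(HZ);
	}
	return 0;
}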