diff options
author | Peter Zijlstra <a.p.zijlstra@chello.nl> | 2011-04-07 14:09:50 +0200 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2011-04-11 12:58:19 +0200 |
commit | dce840a08702bd13a9a186e07e63d1ef82256b5e (patch) | |
tree | 168bb98aed7f5761ebe31aa92c34959e9d0f238a /kernel/sched_fair.c | |
parent | a9c9a9b6bff27ac9c746344a9c1a19bf3327002c (diff) |
sched: Dynamically allocate sched_domain/sched_group data-structures
Instead of relying on static allocations for the sched_domain and
sched_group trees, dynamically allocate and RCU free them.
Allocating this dynamically also allows for some build_sched_groups()
simplification since we can now (like with other simplifications) rely
on the sched_domain tree instead of hard-coded knowledge.
One tricky to note is that detach_destroy_domains() needs to hold
rcu_read_lock() over the entire tear-down, per-cpu is not sufficient
since that can lead to partial sched_group existance (could possibly
be solved by doing the tear-down backwards but this is much more
robust).
A concequence of the above is that we can no longer print the
sched_domain debug stuff from cpu_attach_domain() since that might now
run with preemption disabled (due to classic RCU etc.) and
sched_domain_debug() does some GFP_KERNEL allocations.
Another thing to note is that we now fully rely on normal RCU and not
RCU-sched, this is because with the new and exiting RCU flavours we
grew over the years BH doesn't necessarily hold off RCU-sched grace
periods (-rt is known to break this). This would in fact already cause
us grief since we do sched_domain/sched_group iterations from softirq
context.
This patch is somewhat larger than I would like it to be, but I didn't
find any means of shrinking/splitting this.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/20110407122942.245307941@chello.nl
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r-- | kernel/sched_fair.c | 30 |
1 files changed, 24 insertions, 6 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4ee50f0af8d1..4a8ac7c2a18e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1622,6 +1622,7 @@ static int select_idle_sibling(struct task_struct *p, int target) /* * Otherwise, iterate the domains and find an elegible idle cpu. */ + rcu_read_lock(); for_each_domain(target, sd) { if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) break; @@ -1641,6 +1642,7 @@ static int select_idle_sibling(struct task_struct *p, int target) cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) break; } + rcu_read_unlock(); return target; } @@ -1673,6 +1675,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ new_cpu = prev_cpu; } + rcu_read_lock(); for_each_domain(cpu, tmp) { if (!(tmp->flags & SD_LOAD_BALANCE)) continue; @@ -1723,9 +1726,10 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ if (affine_sd) { if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) - return select_idle_sibling(p, cpu); - else - return select_idle_sibling(p, prev_cpu); + prev_cpu = cpu; + + new_cpu = select_idle_sibling(p, prev_cpu); + goto unlock; } while (sd) { @@ -1766,6 +1770,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ } /* while loop will break here if sd == NULL */ } +unlock: + rcu_read_unlock(); return new_cpu; } @@ -3462,6 +3468,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) raw_spin_unlock(&this_rq->lock); update_shares(this_cpu); + rcu_read_lock(); for_each_domain(this_cpu, sd) { unsigned long interval; int balance = 1; @@ -3483,6 +3490,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) break; } } + rcu_read_unlock(); raw_spin_lock(&this_rq->lock); @@ -3531,6 +3539,7 @@ static int active_load_balance_cpu_stop(void *data) double_lock_balance(busiest_rq, target_rq); /* Search for an sd spanning us and the target CPU. */ + rcu_read_lock(); for_each_domain(target_cpu, sd) { if ((sd->flags & SD_LOAD_BALANCE) && cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) @@ -3546,6 +3555,7 @@ static int active_load_balance_cpu_stop(void *data) else schedstat_inc(sd, alb_failed); } + rcu_read_unlock(); double_unlock_balance(busiest_rq, target_rq); out_unlock: busiest_rq->active_balance = 0; @@ -3672,6 +3682,7 @@ static int find_new_ilb(int cpu) { struct sched_domain *sd; struct sched_group *ilb_group; + int ilb = nr_cpu_ids; /* * Have idle load balancer selection from semi-idle packages only @@ -3687,20 +3698,25 @@ static int find_new_ilb(int cpu) if (cpumask_weight(nohz.idle_cpus_mask) < 2) goto out_done; + rcu_read_lock(); for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { ilb_group = sd->groups; do { - if (is_semi_idle_group(ilb_group)) - return cpumask_first(nohz.grp_idle_mask); + if (is_semi_idle_group(ilb_group)) { + ilb = cpumask_first(nohz.grp_idle_mask); + goto unlock; + } ilb_group = ilb_group->next; } while (ilb_group != sd->groups); } +unlock: + rcu_read_unlock(); out_done: - return nr_cpu_ids; + return ilb; } #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ static inline int find_new_ilb(int call_cpu) @@ -3845,6 +3861,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) update_shares(cpu); + rcu_read_lock(); for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) continue; @@ -3890,6 +3907,7 @@ out: if (!balance) break; } + rcu_read_unlock(); /* * next_balance will be updated only when there is a need. |