summaryrefslogtreecommitdiff
path: root/kernel/sched_fair.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--kernel/sched_fair.c305
1 files changed, 248 insertions, 57 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 52ab113d8bb9..fdbdb5084c49 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+/*
+ * The exponential sliding window over which load is averaged for shares
+ * distribution.
+ * (default: 10msec)
+ */
+unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
+
static const struct sched_class fair_sched_class;
/**************************************************************
@@ -143,6 +150,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
return cfs_rq->tg->cfs_rq[this_cpu];
}
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+ if (!cfs_rq->on_list) {
+ /*
+ * Ensure we either appear before our parent (if already
+ * enqueued) or force our parent to appear after us when it is
+ * enqueued. The fact that we always enqueue bottom-up
+ * reduces this to two cases.
+ */
+ if (cfs_rq->tg->parent &&
+ cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
+ list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &rq_of(cfs_rq)->leaf_cfs_rq_list);
+ } else {
+ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+ &rq_of(cfs_rq)->leaf_cfs_rq_list);
+ }
+
+ cfs_rq->on_list = 1;
+ }
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+ if (cfs_rq->on_list) {
+ list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+ cfs_rq->on_list = 0;
+ }
+}
+
/* Iterate thr' all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
@@ -246,6 +283,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
return &cpu_rq(this_cpu)->cfs;
}
+static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
+{
+}
+
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
@@ -417,7 +462,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
WRT_SYSCTL(sched_min_granularity);
WRT_SYSCTL(sched_latency);
WRT_SYSCTL(sched_wakeup_granularity);
- WRT_SYSCTL(sched_shares_ratelimit);
#undef WRT_SYSCTL
return 0;
@@ -495,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
return calc_delta_fair(sched_slice(cfs_rq, se), se);
}
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
+static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
+
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
@@ -514,6 +561,14 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
curr->vruntime += delta_exec_weighted;
update_min_vruntime(cfs_rq);
+
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+ cfs_rq->load_unacc_exec_time += delta_exec;
+ if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
+ update_cfs_load(cfs_rq, 0);
+ update_cfs_shares(cfs_rq, 0);
+ }
+#endif
}
static void update_curr(struct cfs_rq *cfs_rq)
@@ -633,7 +688,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
list_add(&se->group_node, &cfs_rq->tasks);
}
cfs_rq->nr_running++;
- se->on_rq = 1;
}
static void
@@ -647,9 +701,124 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
list_del_init(&se->group_node);
}
cfs_rq->nr_running--;
- se->on_rq = 0;
}
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
+ int global_update)
+{
+ struct task_group *tg = cfs_rq->tg;
+ long load_avg;
+
+ load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
+ load_avg -= cfs_rq->load_contribution;
+
+ if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
+ atomic_add(load_avg, &tg->load_weight);
+ cfs_rq->load_contribution += load_avg;
+ }
+}
+
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+ u64 period = sysctl_sched_shares_window;
+ u64 now, delta;
+ unsigned long load = cfs_rq->load.weight;
+
+ if (!cfs_rq)
+ return;
+
+ now = rq_of(cfs_rq)->clock;
+ delta = now - cfs_rq->load_stamp;
+
+ /* truncate load history at 4 idle periods */
+ if (cfs_rq->load_stamp > cfs_rq->load_last &&
+ now - cfs_rq->load_last > 4 * period) {
+ cfs_rq->load_period = 0;
+ cfs_rq->load_avg = 0;
+ }
+
+ cfs_rq->load_stamp = now;
+ cfs_rq->load_unacc_exec_time = 0;
+ cfs_rq->load_period += delta;
+ if (load) {
+ cfs_rq->load_last = now;
+ cfs_rq->load_avg += delta * load;
+ }
+
+ /* consider updating load contribution on each fold or truncate */
+ if (global_update || cfs_rq->load_period > period
+ || !cfs_rq->load_period)
+ update_cfs_rq_load_contribution(cfs_rq, global_update);
+
+ while (cfs_rq->load_period > period) {
+ /*
+ * Inline assembly required to prevent the compiler
+ * optimising this loop into a divmod call.
+ * See __iter_div_u64_rem() for another example of this.
+ */
+ asm("" : "+rm" (cfs_rq->load_period));
+ cfs_rq->load_period /= 2;
+ cfs_rq->load_avg /= 2;
+ }
+
+ if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
+ list_del_leaf_cfs_rq(cfs_rq);
+}
+
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+ unsigned long weight)
+{
+ if (se->on_rq)
+ account_entity_dequeue(cfs_rq, se);
+
+ update_load_set(&se->load, weight);
+
+ if (se->on_rq)
+ account_entity_enqueue(cfs_rq, se);
+}
+
+static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+{
+ struct task_group *tg;
+ struct sched_entity *se;
+ long load_weight, load, shares;
+
+ if (!cfs_rq)
+ return;
+
+ tg = cfs_rq->tg;
+ se = tg->se[cpu_of(rq_of(cfs_rq))];
+ if (!se)
+ return;
+
+ load = cfs_rq->load.weight + weight_delta;
+
+ load_weight = atomic_read(&tg->load_weight);
+ load_weight -= cfs_rq->load_contribution;
+ load_weight += load;
+
+ shares = (tg->shares * load);
+ if (load_weight)
+ shares /= load_weight;
+
+ if (shares < MIN_SHARES)
+ shares = MIN_SHARES;
+ if (shares > tg->shares)
+ shares = tg->shares;
+
+ reweight_entity(cfs_rq_of(se), se, shares);
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+}
+
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+{
+}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
@@ -771,6 +940,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+ update_cfs_load(cfs_rq, 0);
+ update_cfs_shares(cfs_rq, se->load.weight);
account_entity_enqueue(cfs_rq, se);
if (flags & ENQUEUE_WAKEUP) {
@@ -782,6 +953,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
check_spread(cfs_rq, se);
if (se != cfs_rq->curr)
__enqueue_entity(cfs_rq, se);
+ se->on_rq = 1;
+
+ if (cfs_rq->nr_running == 1)
+ list_add_leaf_cfs_rq(cfs_rq);
}
static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -825,8 +1000,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
+ se->on_rq = 0;
+ update_cfs_load(cfs_rq, 0);
account_entity_dequeue(cfs_rq, se);
update_min_vruntime(cfs_rq);
+ update_cfs_shares(cfs_rq, 0);
/*
* Normalize the entity after updating the min_vruntime because the
@@ -1055,6 +1233,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
flags = ENQUEUE_WAKEUP;
}
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ update_cfs_load(cfs_rq, 0);
+ update_cfs_shares(cfs_rq, 0);
+ }
+
hrtick_update(rq);
}
@@ -1071,12 +1256,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);
+
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight)
break;
flags |= DEQUEUE_SLEEP;
}
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ update_cfs_load(cfs_rq, 0);
+ update_cfs_shares(cfs_rq, 0);
+ }
+
hrtick_update(rq);
}
@@ -1143,51 +1336,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
* Adding load to a group doesn't make a group heavier, but can cause movement
* of group shares between cpus. Assuming the shares were perfectly aligned one
* can calculate the shift in shares.
- *
- * The problem is that perfectly aligning the shares is rather expensive, hence
- * we try to avoid doing that too often - see update_shares(), which ratelimits
- * this change.
- *
- * We compensate this by not only taking the current delta into account, but
- * also considering the delta between when the shares were last adjusted and
- * now.
- *
- * We still saw a performance dip, some tracing learned us that between
- * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
- * significantly. Therefore try to bias the error in direction of failing
- * the affine wakeup.
- *
*/
-static long effective_load(struct task_group *tg, int cpu,
- long wl, long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
struct sched_entity *se = tg->se[cpu];
if (!tg->parent)
return wl;
- /*
- * By not taking the decrease of shares on the other cpu into
- * account our error leans towards reducing the affine wakeups.
- */
- if (!wl && sched_feat(ASYM_EFF_LOAD))
- return wl;
-
for_each_sched_entity(se) {
long S, rw, s, a, b;
- long more_w;
-
- /*
- * Instead of using this increment, also add the difference
- * between when the shares were last updated and now.
- */
- more_w = se->my_q->load.weight - se->my_q->rq_weight;
- wl += more_w;
- wg += more_w;
S = se->my_q->tg->shares;
- s = se->my_q->shares;
- rw = se->my_q->rq_weight;
+ s = se->load.weight;
+ rw = se->my_q->load.weight;
a = S*(rw + wl);
b = S*rw + s*wg;
@@ -1508,23 +1670,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
sd = tmp;
}
-#ifdef CONFIG_FAIR_GROUP_SCHED
- if (sched_feat(LB_SHARES_UPDATE)) {
- /*
- * Pick the largest domain to update shares over
- */
- tmp = sd;
- if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
- tmp = affine_sd;
-
- if (tmp) {
- raw_spin_unlock(&rq->lock);
- update_shares(tmp);
- raw_spin_lock(&rq->lock);
- }
- }
-#endif
-
if (affine_sd) {
if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
return select_idle_sibling(p, cpu);
@@ -1913,6 +2058,48 @@ out:
}
#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * update tg->load_weight by folding this cpu's load_avg
+ */
+static int update_shares_cpu(struct task_group *tg, int cpu)
+{
+ struct cfs_rq *cfs_rq;
+ unsigned long flags;
+ struct rq *rq;
+
+ if (!tg->se[cpu])
+ return 0;
+
+ rq = cpu_rq(cpu);
+ cfs_rq = tg->cfs_rq[cpu];
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+
+ update_rq_clock(rq);
+ update_cfs_load(cfs_rq, 1);
+
+ /*
+ * We need to update shares after updating tg->load_weight in
+ * order to adjust the weight of groups with long running tasks.
+ */
+ update_cfs_shares(cfs_rq, 0);
+
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ return 0;
+}
+
+static void update_shares(int cpu)
+{
+ struct cfs_rq *cfs_rq;
+ struct rq *rq = cpu_rq(cpu);
+
+ rcu_read_lock();
+ for_each_leaf_cfs_rq(rq, cfs_rq)
+ update_shares_cpu(cfs_rq->tg, cpu);
+ rcu_read_unlock();
+}
+
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
@@ -1960,6 +2147,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
return max_load_move - rem_load_move;
}
#else
+static inline void update_shares(int cpu)
+{
+}
+
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
@@ -3036,7 +3227,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
schedstat_inc(sd, lb_count[idle]);
redo:
- update_shares(sd);
group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
cpus, balance);
@@ -3178,8 +3368,6 @@ out_one_pinned:
else
ld_moved = 0;
out:
- if (ld_moved)
- update_shares(sd);
return ld_moved;
}
@@ -3203,6 +3391,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
*/
raw_spin_unlock(&this_rq->lock);
+ update_shares(this_cpu);
for_each_domain(this_cpu, sd) {
unsigned long interval;
int balance = 1;
@@ -3571,6 +3760,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
int update_next_balance = 0;
int need_serialize;
+ update_shares(cpu);
+
for_each_domain(cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
continue;