diff options
-rw-r--r-- | Documentation/scheduler/sched-bwc.txt | 122 | ||||
-rw-r--r-- | include/linux/sched.h | 4 | ||||
-rw-r--r-- | include/trace/events/sched.h | 9 | ||||
-rw-r--r-- | init/Kconfig | 12 | ||||
-rw-r--r-- | kernel/sched.c | 559 | ||||
-rw-r--r-- | kernel/sched_cpupri.c | 89 | ||||
-rw-r--r-- | kernel/sched_cpupri.h | 7 | ||||
-rw-r--r-- | kernel/sched_fair.c | 716 | ||||
-rw-r--r-- | kernel/sched_features.h | 5 | ||||
-rw-r--r-- | kernel/sched_rt.c | 91 | ||||
-rw-r--r-- | kernel/sched_stoptask.c | 2 | ||||
-rw-r--r-- | kernel/sysctl.c | 10 |
12 files changed, 1439 insertions, 187 deletions
diff --git a/Documentation/scheduler/sched-bwc.txt b/Documentation/scheduler/sched-bwc.txt new file mode 100644 index 000000000000..f6b1873f68ab --- /dev/null +++ b/Documentation/scheduler/sched-bwc.txt @@ -0,0 +1,122 @@ +CFS Bandwidth Control +===================== + +[ This document only discusses CPU bandwidth control for SCHED_NORMAL. + The SCHED_RT case is covered in Documentation/scheduler/sched-rt-group.txt ] + +CFS bandwidth control is a CONFIG_FAIR_GROUP_SCHED extension which allows the +specification of the maximum CPU bandwidth available to a group or hierarchy. + +The bandwidth allowed for a group is specified using a quota and period. Within +each given "period" (microseconds), a group is allowed to consume only up to +"quota" microseconds of CPU time. When the CPU bandwidth consumption of a +group exceeds this limit (for that period), the tasks belonging to its +hierarchy will be throttled and are not allowed to run again until the next +period. + +A group's unused runtime is globally tracked, being refreshed with quota units +above at each period boundary. As threads consume this bandwidth it is +transferred to cpu-local "silos" on a demand basis. The amount transferred +within each of these updates is tunable and described as the "slice". + +Management +---------- +Quota and period are managed within the cpu subsystem via cgroupfs. + +cpu.cfs_quota_us: the total available run-time within a period (in microseconds) +cpu.cfs_period_us: the length of a period (in microseconds) +cpu.stat: exports throttling statistics [explained further below] + +The default values are: + cpu.cfs_period_us=100ms + cpu.cfs_quota=-1 + +A value of -1 for cpu.cfs_quota_us indicates that the group does not have any +bandwidth restriction in place, such a group is described as an unconstrained +bandwidth group. This represents the traditional work-conserving behavior for +CFS. + +Writing any (valid) positive value(s) will enact the specified bandwidth limit. +The minimum quota allowed for the quota or period is 1ms. There is also an +upper bound on the period length of 1s. Additional restrictions exist when +bandwidth limits are used in a hierarchical fashion, these are explained in +more detail below. + +Writing any negative value to cpu.cfs_quota_us will remove the bandwidth limit +and return the group to an unconstrained state once more. + +Any updates to a group's bandwidth specification will result in it becoming +unthrottled if it is in a constrained state. + +System wide settings +-------------------- +For efficiency run-time is transferred between the global pool and CPU local +"silos" in a batch fashion. This greatly reduces global accounting pressure +on large systems. The amount transferred each time such an update is required +is described as the "slice". + +This is tunable via procfs: + /proc/sys/kernel/sched_cfs_bandwidth_slice_us (default=5ms) + +Larger slice values will reduce transfer overheads, while smaller values allow +for more fine-grained consumption. + +Statistics +---------- +A group's bandwidth statistics are exported via 3 fields in cpu.stat. + +cpu.stat: +- nr_periods: Number of enforcement intervals that have elapsed. +- nr_throttled: Number of times the group has been throttled/limited. +- throttled_time: The total time duration (in nanoseconds) for which entities + of the group have been throttled. + +This interface is read-only. + +Hierarchical considerations +--------------------------- +The interface enforces that an individual entity's bandwidth is always +attainable, that is: max(c_i) <= C. However, over-subscription in the +aggregate case is explicitly allowed to enable work-conserving semantics +within a hierarchy. + e.g. \Sum (c_i) may exceed C +[ Where C is the parent's bandwidth, and c_i its children ] + + +There are two ways in which a group may become throttled: + a. it fully consumes its own quota within a period + b. a parent's quota is fully consumed within its period + +In case b) above, even though the child may have runtime remaining it will not +be allowed to until the parent's runtime is refreshed. + +Examples +-------- +1. Limit a group to 1 CPU worth of runtime. + + If period is 250ms and quota is also 250ms, the group will get + 1 CPU worth of runtime every 250ms. + + # echo 250000 > cpu.cfs_quota_us /* quota = 250ms */ + # echo 250000 > cpu.cfs_period_us /* period = 250ms */ + +2. Limit a group to 2 CPUs worth of runtime on a multi-CPU machine. + + With 500ms period and 1000ms quota, the group can get 2 CPUs worth of + runtime every 500ms. + + # echo 1000000 > cpu.cfs_quota_us /* quota = 1000ms */ + # echo 500000 > cpu.cfs_period_us /* period = 500ms */ + + The larger period here allows for increased burst capacity. + +3. Limit a group to 20% of 1 CPU. + + With 50ms period, 10ms quota will be equivalent to 20% of 1 CPU. + + # echo 10000 > cpu.cfs_quota_us /* quota = 10ms */ + # echo 50000 > cpu.cfs_period_us /* period = 50ms */ + + By using a small period here we are ensuring a consistent latency + response at the expense of burst capacity. + diff --git a/include/linux/sched.h b/include/linux/sched.h index 41d0237fd449..9fda2888a6ab 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2039,6 +2039,10 @@ static inline void sched_autogroup_fork(struct signal_struct *sig) { } static inline void sched_autogroup_exit(struct signal_struct *sig) { } #endif +#ifdef CONFIG_CFS_BANDWIDTH +extern unsigned int sysctl_sched_cfs_bandwidth_slice; +#endif + #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); extern void rt_mutex_setprio(struct task_struct *p, int prio); diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index f6334782a593..959ff18b63b6 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -100,7 +100,7 @@ static inline long __trace_sched_switch_state(struct task_struct *p) * For all intents and purposes a preempted task is a running task. */ if (task_thread_info(p)->preempt_count & PREEMPT_ACTIVE) - state = TASK_RUNNING; + state = TASK_RUNNING | TASK_STATE_MAX; #endif return state; @@ -137,13 +137,14 @@ TRACE_EVENT(sched_switch, __entry->next_prio = next->prio; ), - TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s ==> next_comm=%s next_pid=%d next_prio=%d", + TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, - __entry->prev_state ? - __print_flags(__entry->prev_state, "|", + __entry->prev_state & (TASK_STATE_MAX-1) ? + __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|", { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" }, { 16, "Z" }, { 32, "X" }, { 64, "x" }, { 128, "W" }) : "R", + __entry->prev_state & TASK_STATE_MAX ? "+" : "", __entry->next_comm, __entry->next_pid, __entry->next_prio) ); diff --git a/init/Kconfig b/init/Kconfig index d62778390e55..d19b3a77ab44 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -715,6 +715,18 @@ config FAIR_GROUP_SCHED depends on CGROUP_SCHED default CGROUP_SCHED +config CFS_BANDWIDTH + bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED" + depends on EXPERIMENTAL + depends on FAIR_GROUP_SCHED + default n + help + This option allows users to define CPU bandwidth rates (limits) for + tasks running within the fair group scheduler. Groups with no limit + set are considered to be unconstrained and will run with no + restriction. + See tip/Documentation/scheduler/sched-bwc.txt for more information. + config RT_GROUP_SCHED bool "Group scheduling for SCHED_RR/FIFO" depends on EXPERIMENTAL diff --git a/kernel/sched.c b/kernel/sched.c index b50b0f0c9aa9..c5cf15e1eb57 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -196,10 +196,28 @@ static inline int rt_bandwidth_enabled(void) return sysctl_sched_rt_runtime >= 0; } -static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) { - ktime_t now; + unsigned long delta; + ktime_t soft, hard, now; + + for (;;) { + if (hrtimer_active(period_timer)) + break; + + now = hrtimer_cb_get_time(period_timer); + hrtimer_forward(period_timer, now, period); + + soft = hrtimer_get_softexpires(period_timer); + hard = hrtimer_get_expires(period_timer); + delta = ktime_to_ns(ktime_sub(hard, soft)); + __hrtimer_start_range_ns(period_timer, soft, delta, + HRTIMER_MODE_ABS_PINNED, 0); + } +} +static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +{ if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return; @@ -207,22 +225,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) return; raw_spin_lock(&rt_b->rt_runtime_lock); - for (;;) { - unsigned long delta; - ktime_t soft, hard; - - if (hrtimer_active(&rt_b->rt_period_timer)) - break; - - now = hrtimer_cb_get_time(&rt_b->rt_period_timer); - hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); - - soft = hrtimer_get_softexpires(&rt_b->rt_period_timer); - hard = hrtimer_get_expires(&rt_b->rt_period_timer); - delta = ktime_to_ns(ktime_sub(hard, soft)); - __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, - HRTIMER_MODE_ABS_PINNED, 0); - } + start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); raw_spin_unlock(&rt_b->rt_runtime_lock); } @@ -247,6 +250,24 @@ struct cfs_rq; static LIST_HEAD(task_groups); +struct cfs_bandwidth { +#ifdef CONFIG_CFS_BANDWIDTH + raw_spinlock_t lock; + ktime_t period; + u64 quota, runtime; + s64 hierarchal_quota; + u64 runtime_expires; + + int idle, timer_active; + struct hrtimer period_timer, slack_timer; + struct list_head throttled_cfs_rq; + + /* statistics */ + int nr_periods, nr_throttled; + u64 throttled_time; +#endif +}; + /* task group related information */ struct task_group { struct cgroup_subsys_state css; @@ -278,6 +299,8 @@ struct task_group { #ifdef CONFIG_SCHED_AUTOGROUP struct autogroup *autogroup; #endif + + struct cfs_bandwidth cfs_bandwidth; }; /* task_group_lock serializes the addition/removal of task groups */ @@ -311,7 +334,7 @@ struct task_group root_task_group; /* CFS-related fields in a runqueue */ struct cfs_rq { struct load_weight load; - unsigned long nr_running; + unsigned long nr_running, h_nr_running; u64 exec_clock; u64 min_vruntime; @@ -377,9 +400,120 @@ struct cfs_rq { unsigned long load_contribution; #endif +#ifdef CONFIG_CFS_BANDWIDTH + int runtime_enabled; + u64 runtime_expires; + s64 runtime_remaining; + + u64 throttled_timestamp; + int throttled, throttle_count; + struct list_head throttled_list; +#endif #endif }; +#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_CFS_BANDWIDTH +static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) +{ + return &tg->cfs_bandwidth; +} + +static inline u64 default_cfs_period(void); +static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); +static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); + +static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) +{ + struct cfs_bandwidth *cfs_b = + container_of(timer, struct cfs_bandwidth, slack_timer); + do_sched_cfs_slack_timer(cfs_b); + + return HRTIMER_NORESTART; +} + +static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) +{ + struct cfs_bandwidth *cfs_b = + container_of(timer, struct cfs_bandwidth, period_timer); + ktime_t now; + int overrun; + int idle = 0; + + for (;;) { + now = hrtimer_cb_get_time(timer); + overrun = hrtimer_forward(timer, now, cfs_b->period); + + if (!overrun) + break; + + idle = do_sched_cfs_period_timer(cfs_b, overrun); + } + + return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; +} + +static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +{ + raw_spin_lock_init(&cfs_b->lock); + cfs_b->runtime = 0; + cfs_b->quota = RUNTIME_INF; + cfs_b->period = ns_to_ktime(default_cfs_period()); + + INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); + hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + cfs_b->period_timer.function = sched_cfs_period_timer; + hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + cfs_b->slack_timer.function = sched_cfs_slack_timer; +} + +static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + cfs_rq->runtime_enabled = 0; + INIT_LIST_HEAD(&cfs_rq->throttled_list); +} + +/* requires cfs_b->lock, may release to reprogram timer */ +static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +{ + /* + * The timer may be active because we're trying to set a new bandwidth + * period or because we're racing with the tear-down path + * (timer_active==0 becomes visible before the hrtimer call-back + * terminates). In either case we ensure that it's re-programmed + */ + while (unlikely(hrtimer_active(&cfs_b->period_timer))) { + raw_spin_unlock(&cfs_b->lock); + /* ensure cfs_b->lock is available while we wait */ + hrtimer_cancel(&cfs_b->period_timer); + + raw_spin_lock(&cfs_b->lock); + /* if someone else restarted the timer then we're done */ + if (cfs_b->timer_active) + return; + } + + cfs_b->timer_active = 1; + start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); +} + +static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +{ + hrtimer_cancel(&cfs_b->period_timer); + hrtimer_cancel(&cfs_b->slack_timer); +} +#else +static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} +static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} + +static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) +{ + return NULL; +} +#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ + /* Real-Time classes' related field in a runqueue: */ struct rt_rq { struct rt_prio_array active; @@ -520,8 +654,6 @@ struct rq { int cpu; int online; - unsigned long avg_load_per_task; - u64 rt_avg; u64 age_stamp; u64 idle_stamp; @@ -1471,24 +1603,28 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) update_load_sub(&rq->load, load); } -#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) +#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ + (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) typedef int (*tg_visitor)(struct task_group *, void *); /* - * Iterate the full tree, calling @down when first entering a node and @up when - * leaving it for the final time. + * Iterate task_group tree rooted at *from, calling @down when first entering a + * node and @up when leaving it for the final time. + * + * Caller must hold rcu_lock or sufficient equivalent. */ -static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) +static int walk_tg_tree_from(struct task_group *from, + tg_visitor down, tg_visitor up, void *data) { struct task_group *parent, *child; int ret; - rcu_read_lock(); - parent = &root_task_group; + parent = from; + down: ret = (*down)(parent, data); if (ret) - goto out_unlock; + goto out; list_for_each_entry_rcu(child, &parent->children, siblings) { parent = child; goto down; @@ -1497,19 +1633,29 @@ up: continue; } ret = (*up)(parent, data); - if (ret) - goto out_unlock; + if (ret || parent == from) + goto out; child = parent; parent = parent->parent; if (parent) goto up; -out_unlock: - rcu_read_unlock(); - +out: return ret; } +/* + * Iterate the full tree, calling @down when first entering a node and @up when + * leaving it for the final time. + * + * Caller must hold rcu_lock or sufficient equivalent. + */ + +static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) +{ + return walk_tg_tree_from(&root_task_group, down, up, data); +} + static int tg_nop(struct task_group *tg, void *data) { return 0; @@ -1569,11 +1715,9 @@ static unsigned long cpu_avg_load_per_task(int cpu) unsigned long nr_running = ACCESS_ONCE(rq->nr_running); if (nr_running) - rq->avg_load_per_task = rq->load.weight / nr_running; - else - rq->avg_load_per_task = 0; + return rq->load.weight / nr_running; - return rq->avg_load_per_task; + return 0; } #ifdef CONFIG_PREEMPT @@ -1806,7 +1950,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags) rq->nr_uninterruptible--; enqueue_task(rq, p, flags); - inc_nr_running(rq); } /* @@ -1818,7 +1961,6 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) rq->nr_uninterruptible++; dequeue_task(rq, p, flags); - dec_nr_running(rq); } #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -2848,19 +2990,23 @@ void sched_fork(struct task_struct *p) p->state = TASK_RUNNING; /* + * Make sure we do not leak PI boosting priority to the child. + */ + p->prio = current->normal_prio; + + /* * Revert to default priority/policy on fork if requested. */ if (unlikely(p->sched_reset_on_fork)) { - if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { + if (task_has_rt_policy(p)) { p->policy = SCHED_NORMAL; - p->normal_prio = p->static_prio; - } - - if (PRIO_TO_NICE(p->static_prio) < 0) { p->static_prio = NICE_TO_PRIO(0); - p->normal_prio = p->static_prio; - set_load_weight(p); - } + p->rt_priority = 0; + } else if (PRIO_TO_NICE(p->static_prio) < 0) + p->static_prio = NICE_TO_PRIO(0); + + p->prio = p->normal_prio = __normal_prio(p); + set_load_weight(p); /* * We don't need the reset flag anymore after the fork. It has @@ -2869,11 +3015,6 @@ void sched_fork(struct task_struct *p) p->sched_reset_on_fork = 0; } - /* - * Make sure we do not leak PI boosting priority to the child. - */ - p->prio = current->normal_prio; - if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; @@ -4239,7 +4380,7 @@ pick_next_task(struct rq *rq) * Optimization: we know that if all tasks are in * the fair class we can call that function directly: */ - if (likely(rq->nr_running == rq->cfs.nr_running)) { + if (likely(rq->nr_running == rq->cfs.h_nr_running)) { p = fair_sched_class.pick_next_task(rq); if (likely(p)) return p; @@ -6197,6 +6338,30 @@ static void calc_global_load_remove(struct rq *rq) rq->calc_load_active = 0; } +#ifdef CONFIG_CFS_BANDWIDTH +static void unthrottle_offline_cfs_rqs(struct rq *rq) +{ + struct cfs_rq *cfs_rq; + + for_each_leaf_cfs_rq(rq, cfs_rq) { + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + + if (!cfs_rq->runtime_enabled) + continue; + + /* + * clock_task is not advancing so we just need to make sure + * there's some valid quota amount + */ + cfs_rq->runtime_remaining = cfs_b->quota; + if (cfs_rq_throttled(cfs_rq)) + unthrottle_cfs_rq(cfs_rq); + } +} +#else +static void unthrottle_offline_cfs_rqs(struct rq *rq) {} +#endif + /* * Migrate all tasks from the rq, sleeping tasks will be migrated by * try_to_wake_up()->select_task_rq(). @@ -6222,6 +6387,9 @@ static void migrate_tasks(unsigned int dead_cpu) */ rq->stop = NULL; + /* Ensure any throttled groups are reachable by pick_next_task */ + unthrottle_offline_cfs_rqs(rq); + for ( ; ; ) { /* * There's this thread running, bail when that's the only @@ -7965,6 +8133,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, /* allow initial update_cfs_load() to truncate */ cfs_rq->load_stamp = 1; #endif + init_cfs_rq_runtime(cfs_rq); tg->cfs_rq[cpu] = cfs_rq; tg->se[cpu] = se; @@ -8104,6 +8273,7 @@ void __init sched_init(void) * We achieve this by letting root_task_group's tasks sit * directly in rq->cfs (i.e root_task_group->se[] = NULL). */ + init_cfs_bandwidth(&root_task_group.cfs_bandwidth); init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -8345,6 +8515,8 @@ static void free_fair_sched_group(struct task_group *tg) { int i; + destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); + for_each_possible_cpu(i) { if (tg->cfs_rq) kfree(tg->cfs_rq[i]); @@ -8372,6 +8544,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) tg->shares = NICE_0_LOAD; + init_cfs_bandwidth(tg_cfs_bandwidth(tg)); + for_each_possible_cpu(i) { cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); @@ -8647,12 +8821,7 @@ unsigned long sched_group_shares(struct task_group *tg) } #endif -#ifdef CONFIG_RT_GROUP_SCHED -/* - * Ensure that the real time constraints are schedulable. - */ -static DEFINE_MUTEX(rt_constraints_mutex); - +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) static unsigned long to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) @@ -8660,6 +8829,13 @@ static unsigned long to_ratio(u64 period, u64 runtime) return div64_u64(runtime << 20, period); } +#endif + +#ifdef CONFIG_RT_GROUP_SCHED +/* + * Ensure that the real time constraints are schedulable. + */ +static DEFINE_MUTEX(rt_constraints_mutex); /* Must be called with tasklist_lock held */ static inline int tg_has_rt_tasks(struct task_group *tg) @@ -8680,7 +8856,7 @@ struct rt_schedulable_data { u64 rt_runtime; }; -static int tg_schedulable(struct task_group *tg, void *data) +static int tg_rt_schedulable(struct task_group *tg, void *data) { struct rt_schedulable_data *d = data; struct task_group *child; @@ -8738,16 +8914,22 @@ static int tg_schedulable(struct task_group *tg, void *data) static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { + int ret; + struct rt_schedulable_data data = { .tg = tg, .rt_period = period, .rt_runtime = runtime, }; - return walk_tg_tree(tg_schedulable, tg_nop, &data); + rcu_read_lock(); + ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); + rcu_read_unlock(); + + return ret; } -static int tg_set_bandwidth(struct task_group *tg, +static int tg_set_rt_bandwidth(struct task_group *tg, u64 rt_period, u64 rt_runtime) { int i, err = 0; @@ -8786,7 +8968,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) if (rt_runtime_us < 0) rt_runtime = RUNTIME_INF; - return tg_set_bandwidth(tg, rt_period, rt_runtime); + return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); } long sched_group_rt_runtime(struct task_group *tg) @@ -8811,7 +8993,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) if (rt_period == 0) return -EINVAL; - return tg_set_bandwidth(tg, rt_period, rt_runtime); + return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); } long sched_group_rt_period(struct task_group *tg) @@ -9001,6 +9183,238 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) return (u64) scale_load_down(tg->shares); } + +#ifdef CONFIG_CFS_BANDWIDTH +static DEFINE_MUTEX(cfs_constraints_mutex); + +const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ +const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ + +static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); + +static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) +{ + int i, ret = 0, runtime_enabled; + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); + + if (tg == &root_task_group) + return -EINVAL; + + /* + * Ensure we have at some amount of bandwidth every period. This is + * to prevent reaching a state of large arrears when throttled via + * entity_tick() resulting in prolonged exit starvation. + */ + if (quota < min_cfs_quota_period || period < min_cfs_quota_period) + return -EINVAL; + + /* + * Likewise, bound things on the otherside by preventing insane quota + * periods. This also allows us to normalize in computing quota + * feasibility. + */ + if (period > max_cfs_quota_period) + return -EINVAL; + + mutex_lock(&cfs_constraints_mutex); + ret = __cfs_schedulable(tg, period, quota); + if (ret) + goto out_unlock; + + runtime_enabled = quota != RUNTIME_INF; + raw_spin_lock_irq(&cfs_b->lock); + cfs_b->period = ns_to_ktime(period); + cfs_b->quota = quota; + + __refill_cfs_bandwidth_runtime(cfs_b); + /* restart the period timer (if active) to handle new period expiry */ + if (runtime_enabled && cfs_b->timer_active) { + /* force a reprogram */ + cfs_b->timer_active = 0; + __start_cfs_bandwidth(cfs_b); + } + raw_spin_unlock_irq(&cfs_b->lock); + + for_each_possible_cpu(i) { + struct cfs_rq *cfs_rq = tg->cfs_rq[i]; + struct rq *rq = rq_of(cfs_rq); + + raw_spin_lock_irq(&rq->lock); + cfs_rq->runtime_enabled = runtime_enabled; + cfs_rq->runtime_remaining = 0; + + if (cfs_rq_throttled(cfs_rq)) + unthrottle_cfs_rq(cfs_rq); + raw_spin_unlock_irq(&rq->lock); + } +out_unlock: + mutex_unlock(&cfs_constraints_mutex); + + return ret; +} + +int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) +{ + u64 quota, period; + + period = ktime_to_ns(tg_cfs_bandwidth(tg)->period); + if (cfs_quota_us < 0) + quota = RUNTIME_INF; + else + quota = (u64)cfs_quota_us * NSEC_PER_USEC; + + return tg_set_cfs_bandwidth(tg, period, quota); +} + +long tg_get_cfs_quota(struct task_group *tg) +{ + u64 quota_us; + + if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF) + return -1; + + quota_us = tg_cfs_bandwidth(tg)->quota; + do_div(quota_us, NSEC_PER_USEC); + + return quota_us; +} + +int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) +{ + u64 quota, period; + + period = (u64)cfs_period_us * NSEC_PER_USEC; + quota = tg_cfs_bandwidth(tg)->quota; + + if (period <= 0) + return -EINVAL; + + return tg_set_cfs_bandwidth(tg, period, quota); +} + +long tg_get_cfs_period(struct task_group *tg) +{ + u64 cfs_period_us; + + cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period); + do_div(cfs_period_us, NSEC_PER_USEC); + + return cfs_period_us; +} + +static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) +{ + return tg_get_cfs_quota(cgroup_tg(cgrp)); +} + +static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, + s64 cfs_quota_us) +{ + return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); +} + +static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) +{ + return tg_get_cfs_period(cgroup_tg(cgrp)); +} + +static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, + u64 cfs_period_us) +{ + return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); +} + +struct cfs_schedulable_data { + struct task_group *tg; + u64 period, quota; +}; + +/* + * normalize group quota/period to be quota/max_period + * note: units are usecs + */ +static u64 normalize_cfs_quota(struct task_group *tg, + struct cfs_schedulable_data *d) +{ + u64 quota, period; + + if (tg == d->tg) { + period = d->period; + quota = d->quota; + } else { + period = tg_get_cfs_period(tg); + quota = tg_get_cfs_quota(tg); + } + + /* note: these should typically be equivalent */ + if (quota == RUNTIME_INF || quota == -1) + return RUNTIME_INF; + + return to_ratio(period, quota); +} + +static int tg_cfs_schedulable_down(struct task_group *tg, void *data) +{ + struct cfs_schedulable_data *d = data; + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); + s64 quota = 0, parent_quota = -1; + + if (!tg->parent) { + quota = RUNTIME_INF; + } else { + struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent); + + quota = normalize_cfs_quota(tg, d); + parent_quota = parent_b->hierarchal_quota; + + /* + * ensure max(child_quota) <= parent_quota, inherit when no + * limit is set + */ + if (quota == RUNTIME_INF) + quota = parent_quota; + else if (parent_quota != RUNTIME_INF && quota > parent_quota) + return -EINVAL; + } + cfs_b->hierarchal_quota = quota; + + return 0; +} + +static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) +{ + int ret; + struct cfs_schedulable_data data = { + .tg = tg, + .period = period, + .quota = quota, + }; + + if (quota != RUNTIME_INF) { + do_div(data.period, NSEC_PER_USEC); + do_div(data.quota, NSEC_PER_USEC); + } + + rcu_read_lock(); + ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); + rcu_read_unlock(); + + return ret; +} + +static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct task_group *tg = cgroup_tg(cgrp); + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); + + cb->fill(cb, "nr_periods", cfs_b->nr_periods); + cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); + cb->fill(cb, "throttled_time", cfs_b->throttled_time); + + return 0; +} +#endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED @@ -9035,6 +9449,22 @@ static struct cftype cpu_files[] = { .write_u64 = cpu_shares_write_u64, }, #endif +#ifdef CONFIG_CFS_BANDWIDTH + { + .name = "cfs_quota_us", + .read_s64 = cpu_cfs_quota_read_s64, + .write_s64 = cpu_cfs_quota_write_s64, + }, + { + .name = "cfs_period_us", + .read_u64 = cpu_cfs_period_read_u64, + .write_u64 = cpu_cfs_period_write_u64, + }, + { + .name = "stat", + .read_map = cpu_stats_show, + }, +#endif #ifdef CONFIG_RT_GROUP_SCHED { .name = "rt_runtime_us", @@ -9344,4 +9774,3 @@ struct cgroup_subsys cpuacct_subsys = { .subsys_id = cpuacct_subsys_id, }; #endif /* CONFIG_CGROUP_CPUACCT */ - diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 2722dc1b4138..a86cf9d9eb11 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -47,9 +47,6 @@ static int convert_prio(int prio) return cpupri; } -#define for_each_cpupri_active(array, idx) \ - for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES) - /** * cpupri_find - find the best (lowest-pri) CPU in the system * @cp: The cpupri context @@ -71,11 +68,38 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, int idx = 0; int task_pri = convert_prio(p->prio); - for_each_cpupri_active(cp->pri_active, idx) { - struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; + if (task_pri >= MAX_RT_PRIO) + return 0; - if (idx >= task_pri) - break; + for (idx = 0; idx < task_pri; idx++) { + struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; + int skip = 0; + + if (!atomic_read(&(vec)->count)) + skip = 1; + /* + * When looking at the vector, we need to read the counter, + * do a memory barrier, then read the mask. + * + * Note: This is still all racey, but we can deal with it. + * Ideally, we only want to look at masks that are set. + * + * If a mask is not set, then the only thing wrong is that we + * did a little more work than necessary. + * + * If we read a zero count but the mask is set, because of the + * memory barriers, that can only happen when the highest prio + * task for a run queue has left the run queue, in which case, + * it will be followed by a pull. If the task we are processing + * fails to find a proper place to go, that pull request will + * pull this task if the run queue is running at a lower + * priority. + */ + smp_rmb(); + + /* Need to do the rmb for every iteration */ + if (skip) + continue; if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) continue; @@ -115,7 +139,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) { int *currpri = &cp->cpu_to_pri[cpu]; int oldpri = *currpri; - unsigned long flags; + int do_mb = 0; newpri = convert_prio(newpri); @@ -128,32 +152,46 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) * If the cpu was currently mapped to a different value, we * need to map it to the new value then remove the old value. * Note, we must add the new value first, otherwise we risk the - * cpu being cleared from pri_active, and this cpu could be - * missed for a push or pull. + * cpu being missed by the priority loop in cpupri_find. */ if (likely(newpri != CPUPRI_INVALID)) { struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; - raw_spin_lock_irqsave(&vec->lock, flags); - cpumask_set_cpu(cpu, vec->mask); - vec->count++; - if (vec->count == 1) - set_bit(newpri, cp->pri_active); - - raw_spin_unlock_irqrestore(&vec->lock, flags); + /* + * When adding a new vector, we update the mask first, + * do a write memory barrier, and then update the count, to + * make sure the vector is visible when count is set. + */ + smp_mb__before_atomic_inc(); + atomic_inc(&(vec)->count); + do_mb = 1; } if (likely(oldpri != CPUPRI_INVALID)) { struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; - raw_spin_lock_irqsave(&vec->lock, flags); - - vec->count--; - if (!vec->count) - clear_bit(oldpri, cp->pri_active); + /* + * Because the order of modification of the vec->count + * is important, we must make sure that the update + * of the new prio is seen before we decrement the + * old prio. This makes sure that the loop sees + * one or the other when we raise the priority of + * the run queue. We don't care about when we lower the + * priority, as that will trigger an rt pull anyway. + * + * We only need to do a memory barrier if we updated + * the new priority vec. + */ + if (do_mb) + smp_mb__after_atomic_inc(); + + /* + * When removing from the vector, we decrement the counter first + * do a memory barrier and then clear the mask. + */ + atomic_dec(&(vec)->count); + smp_mb__after_atomic_inc(); cpumask_clear_cpu(cpu, vec->mask); - - raw_spin_unlock_irqrestore(&vec->lock, flags); } *currpri = newpri; @@ -175,8 +213,7 @@ int cpupri_init(struct cpupri *cp) for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { struct cpupri_vec *vec = &cp->pri_to_cpu[i]; - raw_spin_lock_init(&vec->lock); - vec->count = 0; + atomic_set(&vec->count, 0); if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) goto cleanup; } diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h index 9fc7d386fea4..f6d756173491 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h @@ -4,7 +4,6 @@ #include <linux/sched.h> #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) -#define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES) #define CPUPRI_INVALID -1 #define CPUPRI_IDLE 0 @@ -12,14 +11,12 @@ /* values 2-101 are RT priorities 0-99 */ struct cpupri_vec { - raw_spinlock_t lock; - int count; - cpumask_var_t mask; + atomic_t count; + cpumask_var_t mask; }; struct cpupri { struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; - long pri_active[CPUPRI_NR_PRI_WORDS]; int cpu_to_pri[NR_CPUS]; }; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index bc8ee9993814..fef0bfde7c8c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -89,6 +89,20 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL; */ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; +#ifdef CONFIG_CFS_BANDWIDTH +/* + * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool + * each time a cfs_rq requests quota. + * + * Note: in the case that the slice exceeds the runtime remaining (either due + * to consumption or the quota being specified to be smaller than the slice) + * we will always only issue the remaining available time. + * + * default: 5 msec, units: microseconds + */ +unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; +#endif + static const struct sched_class fair_sched_class; /************************************************************** @@ -292,6 +306,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) #endif /* CONFIG_FAIR_GROUP_SCHED */ +static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, + unsigned long delta_exec); /************************************************************** * Scheduling class tree data structure manipulation methods: @@ -583,6 +599,8 @@ static void update_curr(struct cfs_rq *cfs_rq) cpuacct_charge(curtask, delta_exec); account_group_exec_runtime(curtask, delta_exec); } + + account_cfs_rq_runtime(cfs_rq, delta_exec); } static inline void @@ -688,6 +706,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_FAIR_GROUP_SCHED +/* we need this in update_cfs_load and load-balance functions below */ +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); # ifdef CONFIG_SMP static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, int global_update) @@ -710,7 +730,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) u64 now, delta; unsigned long load = cfs_rq->load.weight; - if (cfs_rq->tg == &root_task_group) + if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq)) return; now = rq_of(cfs_rq)->clock_task; @@ -819,7 +839,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) tg = cfs_rq->tg; se = tg->se[cpu_of(rq_of(cfs_rq))]; - if (!se) + if (!se || throttled_hierarchy(cfs_rq)) return; #ifndef CONFIG_SMP if (likely(se->load.weight == tg->shares)) @@ -950,6 +970,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) se->vruntime = vruntime; } +static void check_enqueue_throttle(struct cfs_rq *cfs_rq); + static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { @@ -979,8 +1001,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) __enqueue_entity(cfs_rq, se); se->on_rq = 1; - if (cfs_rq->nr_running == 1) + if (cfs_rq->nr_running == 1) { list_add_leaf_cfs_rq(cfs_rq); + check_enqueue_throttle(cfs_rq); + } } static void __clear_buddies_last(struct sched_entity *se) @@ -1028,6 +1052,8 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) __clear_buddies_skip(se); } +static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); + static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { @@ -1066,6 +1092,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (!(flags & DEQUEUE_SLEEP)) se->vruntime -= cfs_rq->min_vruntime; + /* return excess runtime on last dequeue */ + return_cfs_rq_runtime(cfs_rq); + update_min_vruntime(cfs_rq); update_cfs_shares(cfs_rq); } @@ -1077,6 +1106,8 @@ static void check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) { unsigned long ideal_runtime, delta_exec; + struct sched_entity *se; + s64 delta; ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; @@ -1095,22 +1126,17 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) * narrow margin doesn't have to wait for a full slice. * This also mitigates buddy induced latencies under load. */ - if (!sched_feat(WAKEUP_PREEMPT)) - return; - if (delta_exec < sysctl_sched_min_granularity) return; - if (cfs_rq->nr_running > 1) { - struct sched_entity *se = __pick_first_entity(cfs_rq); - s64 delta = curr->vruntime - se->vruntime; + se = __pick_first_entity(cfs_rq); + delta = curr->vruntime - se->vruntime; - if (delta < 0) - return; + if (delta < 0) + return; - if (delta > ideal_runtime) - resched_task(rq_of(cfs_rq)->curr); - } + if (delta > ideal_runtime) + resched_task(rq_of(cfs_rq)->curr); } static void @@ -1185,6 +1211,8 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) return se; } +static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq); + static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) { /* @@ -1194,6 +1222,9 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) if (prev->on_rq) update_curr(cfs_rq); + /* throttle cfs_rqs exceeding runtime */ + check_cfs_rq_runtime(cfs_rq); + check_spread(cfs_rq, prev); if (prev->on_rq) { update_stats_wait_start(cfs_rq, prev); @@ -1233,10 +1264,583 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) return; #endif - if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) + if (cfs_rq->nr_running > 1) check_preempt_tick(cfs_rq, curr); } + +/************************************************** + * CFS bandwidth control machinery + */ + +#ifdef CONFIG_CFS_BANDWIDTH +/* + * default period for cfs group bandwidth. + * default: 0.1s, units: nanoseconds + */ +static inline u64 default_cfs_period(void) +{ + return 100000000ULL; +} + +static inline u64 sched_cfs_bandwidth_slice(void) +{ + return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC; +} + +/* + * Replenish runtime according to assigned quota and update expiration time. + * We use sched_clock_cpu directly instead of rq->clock to avoid adding + * additional synchronization around rq->lock. + * + * requires cfs_b->lock + */ +static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) +{ + u64 now; + + if (cfs_b->quota == RUNTIME_INF) + return; + + now = sched_clock_cpu(smp_processor_id()); + cfs_b->runtime = cfs_b->quota; + cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); +} + +/* returns 0 on failure to allocate runtime */ +static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + struct task_group *tg = cfs_rq->tg; + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); + u64 amount = 0, min_amount, expires; + + /* note: this is a positive sum as runtime_remaining <= 0 */ + min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; + + raw_spin_lock(&cfs_b->lock); + if (cfs_b->quota == RUNTIME_INF) + amount = min_amount; + else { + /* + * If the bandwidth pool has become inactive, then at least one + * period must have elapsed since the last consumption. + * Refresh the global state and ensure bandwidth timer becomes + * active. + */ + if (!cfs_b->timer_active) { + __refill_cfs_bandwidth_runtime(cfs_b); + __start_cfs_bandwidth(cfs_b); + } + + if (cfs_b->runtime > 0) { + amount = min(cfs_b->runtime, min_amount); + cfs_b->runtime -= amount; + cfs_b->idle = 0; + } + } + expires = cfs_b->runtime_expires; + raw_spin_unlock(&cfs_b->lock); + + cfs_rq->runtime_remaining += amount; + /* + * we may have advanced our local expiration to account for allowed + * spread between our sched_clock and the one on which runtime was + * issued. + */ + if ((s64)(expires - cfs_rq->runtime_expires) > 0) + cfs_rq->runtime_expires = expires; + + return cfs_rq->runtime_remaining > 0; +} + +/* + * Note: This depends on the synchronization provided by sched_clock and the + * fact that rq->clock snapshots this value. + */ +static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + struct rq *rq = rq_of(cfs_rq); + + /* if the deadline is ahead of our clock, nothing to do */ + if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0)) + return; + + if (cfs_rq->runtime_remaining < 0) + return; + + /* + * If the local deadline has passed we have to consider the + * possibility that our sched_clock is 'fast' and the global deadline + * has not truly expired. + * + * Fortunately we can check determine whether this the case by checking + * whether the global deadline has advanced. + */ + + if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) { + /* extend local deadline, drift is bounded above by 2 ticks */ + cfs_rq->runtime_expires += TICK_NSEC; + } else { + /* global deadline is ahead, expiration has passed */ + cfs_rq->runtime_remaining = 0; + } +} + +static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, + unsigned long delta_exec) +{ + /* dock delta_exec before expiring quota (as it could span periods) */ + cfs_rq->runtime_remaining -= delta_exec; + expire_cfs_rq_runtime(cfs_rq); + + if (likely(cfs_rq->runtime_remaining > 0)) + return; + + /* + * if we're unable to extend our runtime we resched so that the active + * hierarchy can be throttled + */ + if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) + resched_task(rq_of(cfs_rq)->curr); +} + +static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, + unsigned long delta_exec) +{ + if (!cfs_rq->runtime_enabled) + return; + + __account_cfs_rq_runtime(cfs_rq, delta_exec); +} + +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) +{ + return cfs_rq->throttled; +} + +/* check whether cfs_rq, or any parent, is throttled */ +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) +{ + return cfs_rq->throttle_count; +} + +/* + * Ensure that neither of the group entities corresponding to src_cpu or + * dest_cpu are members of a throttled hierarchy when performing group + * load-balance operations. + */ +static inline int throttled_lb_pair(struct task_group *tg, + int src_cpu, int dest_cpu) +{ + struct cfs_rq *src_cfs_rq, *dest_cfs_rq; + + src_cfs_rq = tg->cfs_rq[src_cpu]; + dest_cfs_rq = tg->cfs_rq[dest_cpu]; + + return throttled_hierarchy(src_cfs_rq) || + throttled_hierarchy(dest_cfs_rq); +} + +/* updated child weight may affect parent so we have to do this bottom up */ +static int tg_unthrottle_up(struct task_group *tg, void *data) +{ + struct rq *rq = data; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + + cfs_rq->throttle_count--; +#ifdef CONFIG_SMP + if (!cfs_rq->throttle_count) { + u64 delta = rq->clock_task - cfs_rq->load_stamp; + + /* leaving throttled state, advance shares averaging windows */ + cfs_rq->load_stamp += delta; + cfs_rq->load_last += delta; + + /* update entity weight now that we are on_rq again */ + update_cfs_shares(cfs_rq); + } +#endif + + return 0; +} + +static int tg_throttle_down(struct task_group *tg, void *data) +{ + struct rq *rq = data; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + + /* group is entering throttled state, record last load */ + if (!cfs_rq->throttle_count) + update_cfs_load(cfs_rq, 0); + cfs_rq->throttle_count++; + + return 0; +} + +static void throttle_cfs_rq(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + struct sched_entity *se; + long task_delta, dequeue = 1; + + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; + + /* account load preceding throttle */ + rcu_read_lock(); + walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); + rcu_read_unlock(); + + task_delta = cfs_rq->h_nr_running; + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + /* throttled entity or throttle-on-deactivate */ + if (!se->on_rq) + break; + + if (dequeue) + dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); + qcfs_rq->h_nr_running -= task_delta; + + if (qcfs_rq->load.weight) + dequeue = 0; + } + + if (!se) + rq->nr_running -= task_delta; + + cfs_rq->throttled = 1; + cfs_rq->throttled_timestamp = rq->clock; + raw_spin_lock(&cfs_b->lock); + list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); + raw_spin_unlock(&cfs_b->lock); +} + +static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + struct sched_entity *se; + int enqueue = 1; + long task_delta; + + se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; + + cfs_rq->throttled = 0; + raw_spin_lock(&cfs_b->lock); + cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp; + list_del_rcu(&cfs_rq->throttled_list); + raw_spin_unlock(&cfs_b->lock); + cfs_rq->throttled_timestamp = 0; + + update_rq_clock(rq); + /* update hierarchical throttle state */ + walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); + + if (!cfs_rq->load.weight) + return; + + task_delta = cfs_rq->h_nr_running; + for_each_sched_entity(se) { + if (se->on_rq) + enqueue = 0; + + cfs_rq = cfs_rq_of(se); + if (enqueue) + enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + cfs_rq->h_nr_running += task_delta; + + if (cfs_rq_throttled(cfs_rq)) + break; + } + + if (!se) + rq->nr_running += task_delta; + + /* determine whether we need to wake up potentially idle cpu */ + if (rq->curr == rq->idle && rq->cfs.nr_running) + resched_task(rq->curr); +} + +static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, + u64 remaining, u64 expires) +{ + struct cfs_rq *cfs_rq; + u64 runtime = remaining; + + rcu_read_lock(); + list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, + throttled_list) { + struct rq *rq = rq_of(cfs_rq); + + raw_spin_lock(&rq->lock); + if (!cfs_rq_throttled(cfs_rq)) + goto next; + + runtime = -cfs_rq->runtime_remaining + 1; + if (runtime > remaining) + runtime = remaining; + remaining -= runtime; + + cfs_rq->runtime_remaining += runtime; + cfs_rq->runtime_expires = expires; + + /* we check whether we're throttled above */ + if (cfs_rq->runtime_remaining > 0) + unthrottle_cfs_rq(cfs_rq); + +next: + raw_spin_unlock(&rq->lock); + + if (!remaining) + break; + } + rcu_read_unlock(); + + return remaining; +} + +/* + * Responsible for refilling a task_group's bandwidth and unthrottling its + * cfs_rqs as appropriate. If there has been no activity within the last + * period the timer is deactivated until scheduling resumes; cfs_b->idle is + * used to track this state. + */ +static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) +{ + u64 runtime, runtime_expires; + int idle = 1, throttled; + + raw_spin_lock(&cfs_b->lock); + /* no need to continue the timer with no bandwidth constraint */ + if (cfs_b->quota == RUNTIME_INF) + goto out_unlock; + + throttled = !list_empty(&cfs_b->throttled_cfs_rq); + /* idle depends on !throttled (for the case of a large deficit) */ + idle = cfs_b->idle && !throttled; + cfs_b->nr_periods += overrun; + + /* if we're going inactive then everything else can be deferred */ + if (idle) + goto out_unlock; + + __refill_cfs_bandwidth_runtime(cfs_b); + + if (!throttled) { + /* mark as potentially idle for the upcoming period */ + cfs_b->idle = 1; + goto out_unlock; + } + + /* account preceding periods in which throttling occurred */ + cfs_b->nr_throttled += overrun; + + /* + * There are throttled entities so we must first use the new bandwidth + * to unthrottle them before making it generally available. This + * ensures that all existing debts will be paid before a new cfs_rq is + * allowed to run. + */ + runtime = cfs_b->runtime; + runtime_expires = cfs_b->runtime_expires; + cfs_b->runtime = 0; + + /* + * This check is repeated as we are holding onto the new bandwidth + * while we unthrottle. This can potentially race with an unthrottled + * group trying to acquire new bandwidth from the global pool. + */ + while (throttled && runtime > 0) { + raw_spin_unlock(&cfs_b->lock); + /* we can't nest cfs_b->lock while distributing bandwidth */ + runtime = distribute_cfs_runtime(cfs_b, runtime, + runtime_expires); + raw_spin_lock(&cfs_b->lock); + + throttled = !list_empty(&cfs_b->throttled_cfs_rq); + } + + /* return (any) remaining runtime */ + cfs_b->runtime = runtime; + /* + * While we are ensured activity in the period following an + * unthrottle, this also covers the case in which the new bandwidth is + * insufficient to cover the existing bandwidth deficit. (Forcing the + * timer to remain active while there are any throttled entities.) + */ + cfs_b->idle = 0; +out_unlock: + if (idle) + cfs_b->timer_active = 0; + raw_spin_unlock(&cfs_b->lock); + + return idle; +} + +/* a cfs_rq won't donate quota below this amount */ +static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC; +/* minimum remaining period time to redistribute slack quota */ +static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC; +/* how long we wait to gather additional slack before distributing */ +static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; + +/* are we near the end of the current quota period? */ +static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) +{ + struct hrtimer *refresh_timer = &cfs_b->period_timer; + u64 remaining; + + /* if the call-back is running a quota refresh is already occurring */ + if (hrtimer_callback_running(refresh_timer)) + return 1; + + /* is a quota refresh about to occur? */ + remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer)); + if (remaining < min_expire) + return 1; + + return 0; +} + +static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b) +{ + u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration; + + /* if there's a quota refresh soon don't bother with slack */ + if (runtime_refresh_within(cfs_b, min_left)) + return; + + start_bandwidth_timer(&cfs_b->slack_timer, + ns_to_ktime(cfs_bandwidth_slack_period)); +} + +/* we know any runtime found here is valid as update_curr() precedes return */ +static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime; + + if (slack_runtime <= 0) + return; + + raw_spin_lock(&cfs_b->lock); + if (cfs_b->quota != RUNTIME_INF && + cfs_rq->runtime_expires == cfs_b->runtime_expires) { + cfs_b->runtime += slack_runtime; + + /* we are under rq->lock, defer unthrottling using a timer */ + if (cfs_b->runtime > sched_cfs_bandwidth_slice() && + !list_empty(&cfs_b->throttled_cfs_rq)) + start_cfs_slack_bandwidth(cfs_b); + } + raw_spin_unlock(&cfs_b->lock); + + /* even if it's not valid for return we don't want to try again */ + cfs_rq->runtime_remaining -= slack_runtime; +} + +static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running) + return; + + __return_cfs_rq_runtime(cfs_rq); +} + +/* + * This is done with a timer (instead of inline with bandwidth return) since + * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs. + */ +static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) +{ + u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); + u64 expires; + + /* confirm we're still not at a refresh boundary */ + if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) + return; + + raw_spin_lock(&cfs_b->lock); + if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { + runtime = cfs_b->runtime; + cfs_b->runtime = 0; + } + expires = cfs_b->runtime_expires; + raw_spin_unlock(&cfs_b->lock); + + if (!runtime) + return; + + runtime = distribute_cfs_runtime(cfs_b, runtime, expires); + + raw_spin_lock(&cfs_b->lock); + if (expires == cfs_b->runtime_expires) + cfs_b->runtime = runtime; + raw_spin_unlock(&cfs_b->lock); +} + +/* + * When a group wakes up we want to make sure that its quota is not already + * expired/exceeded, otherwise it may be allowed to steal additional ticks of + * runtime as update_curr() throttling can not not trigger until it's on-rq. + */ +static void check_enqueue_throttle(struct cfs_rq *cfs_rq) +{ + /* an active group must be handled by the update_curr()->put() path */ + if (!cfs_rq->runtime_enabled || cfs_rq->curr) + return; + + /* ensure the group is not already throttled */ + if (cfs_rq_throttled(cfs_rq)) + return; + + /* update runtime allocation */ + account_cfs_rq_runtime(cfs_rq, 0); + if (cfs_rq->runtime_remaining <= 0) + throttle_cfs_rq(cfs_rq); +} + +/* conditionally throttle active cfs_rq's from put_prev_entity() */ +static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) + return; + + /* + * it's possible for a throttled entity to be forced into a running + * state (e.g. set_curr_task), in this case we're finished. + */ + if (cfs_rq_throttled(cfs_rq)) + return; + + throttle_cfs_rq(cfs_rq); +} +#else +static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, + unsigned long delta_exec) {} +static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} +static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} + +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) +{ + return 0; +} + +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) +{ + return 0; +} + +static inline int throttled_lb_pair(struct task_group *tg, + int src_cpu, int dest_cpu) +{ + return 0; +} +#endif + /************************************************** * CFS operations on tasks: */ @@ -1313,16 +1917,33 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; cfs_rq = cfs_rq_of(se); enqueue_entity(cfs_rq, se, flags); + + /* + * end evaluation on encountering a throttled cfs_rq + * + * note: in the case of encountering a throttled cfs_rq we will + * post the final h_nr_running increment below. + */ + if (cfs_rq_throttled(cfs_rq)) + break; + cfs_rq->h_nr_running++; + flags = ENQUEUE_WAKEUP; } for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); + cfs_rq->h_nr_running++; + + if (cfs_rq_throttled(cfs_rq)) + break; update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); } + if (!se) + inc_nr_running(rq); hrtick_update(rq); } @@ -1343,6 +1964,16 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); + /* + * end evaluation on encountering a throttled cfs_rq + * + * note: in the case of encountering a throttled cfs_rq we will + * post the final h_nr_running decrement below. + */ + if (cfs_rq_throttled(cfs_rq)) + break; + cfs_rq->h_nr_running--; + /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { /* @@ -1361,11 +1992,17 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); + cfs_rq->h_nr_running--; + + if (cfs_rq_throttled(cfs_rq)) + break; update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); } + if (!se) + dec_nr_running(rq); hrtick_update(rq); } @@ -1434,7 +2071,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) return wl; } - #else static inline unsigned long effective_load(struct task_group *tg, int cpu, @@ -1875,6 +2511,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (unlikely(se == pse)) return; + /* + * This is possible from callers such as pull_task(), in which we + * unconditionally check_prempt_curr() after an enqueue (which may have + * lead to a throttle). This both saves work and prevents false + * next-buddy nomination below. + */ + if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) + return; + if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { set_next_buddy(pse); next_buddy_marked = 1; @@ -1883,6 +2528,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ /* * We can come here with TIF_NEED_RESCHED already set from new task * wake up path. + * + * Note: this also catches the edge-case of curr being in a throttled + * group (e.g. via set_curr_task), since update_curr() (in the + * enqueue of curr) will have resulted in resched being set. This + * prevents us from potentially nominating it as a false LAST_BUDDY + * below. */ if (test_tsk_need_resched(curr)) return; @@ -1899,10 +2550,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (unlikely(p->policy != SCHED_NORMAL)) return; - - if (!sched_feat(WAKEUP_PREEMPT)) - return; - find_matching_se(&se, &pse); update_curr(cfs_rq_of(se)); BUG_ON(!pse); @@ -2005,7 +2652,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp { struct sched_entity *se = &p->se; - if (!se->on_rq) + /* throttled hierarchies are not runnable */ + if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) return false; /* Tell the scheduler that we'd really like pse to run next. */ @@ -2102,6 +2750,9 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, for_each_leaf_cfs_rq(busiest, cfs_rq) { list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { + if (throttled_lb_pair(task_group(p), + busiest->cpu, this_cpu)) + break; if (!can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) @@ -2217,8 +2868,13 @@ static void update_shares(int cpu) * Iterates the task_group tree in a bottom up fashion, see * list_add_leaf_cfs_rq() for details. */ - for_each_leaf_cfs_rq(rq, cfs_rq) + for_each_leaf_cfs_rq(rq, cfs_rq) { + /* throttled entities do not contribute to load */ + if (throttled_hierarchy(cfs_rq)) + continue; + update_shares_cpu(cfs_rq->tg, cpu); + } rcu_read_unlock(); } @@ -2268,9 +2924,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, u64 rem_load, moved_load; /* - * empty group + * empty group or part of a throttled hierarchy */ - if (!busiest_cfs_rq->task_weight) + if (!busiest_cfs_rq->task_weight || + throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu)) continue; rem_load = (u64)rem_load_move * busiest_weight; @@ -3667,7 +4324,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) struct sched_domain *sd; for_each_domain(cpu, sd) - if (sd && (sd->flags & flag)) + if (sd->flags & flag) break; return sd; @@ -4251,8 +4908,13 @@ static void set_curr_task_fair(struct rq *rq) { struct sched_entity *se = &rq->curr->se; - for_each_sched_entity(se) - set_next_entity(cfs_rq_of(se), se); + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + set_next_entity(cfs_rq, se); + /* ensure bandwidth has been allocated on our new cfs_rq */ + account_cfs_rq_runtime(cfs_rq, 0); + } } #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 2e74677cb040..efa0a7b75dde 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -12,11 +12,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) SCHED_FEAT(START_DEBIT, 1) /* - * Should wakeups try to preempt running tasks. - */ -SCHED_FEAT(WAKEUP_PREEMPT, 1) - -/* * Based on load and program behaviour, see if it makes sense to place * a newly woken task on the same cpu as the task that woke it -- * improve cache locality. Typically used with SYNC wakeups as diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index af1177858be3..0cc188cf7664 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -124,21 +124,33 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) update_rt_migration(rt_rq); } +static inline int has_pushable_tasks(struct rq *rq) +{ + return !plist_head_empty(&rq->rt.pushable_tasks); +} + static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) { plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); plist_node_init(&p->pushable_tasks, p->prio); plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); + + /* Update the highest prio pushable task */ + if (p->prio < rq->rt.highest_prio.next) + rq->rt.highest_prio.next = p->prio; } static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) { plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); -} -static inline int has_pushable_tasks(struct rq *rq) -{ - return !plist_head_empty(&rq->rt.pushable_tasks); + /* Update the new highest prio pushable task */ + if (has_pushable_tasks(rq)) { + p = plist_first_entry(&rq->rt.pushable_tasks, + struct task_struct, pushable_tasks); + rq->rt.highest_prio.next = p->prio; + } else + rq->rt.highest_prio.next = MAX_RT_PRIO; } #else @@ -698,47 +710,13 @@ static void update_curr_rt(struct rq *rq) #if defined CONFIG_SMP -static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu); - -static inline int next_prio(struct rq *rq) -{ - struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu); - - if (next && rt_prio(next->prio)) - return next->prio; - else - return MAX_RT_PRIO; -} - static void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); - if (prio < prev_prio) { - - /* - * If the new task is higher in priority than anything on the - * run-queue, we know that the previous high becomes our - * next-highest. - */ - rt_rq->highest_prio.next = prev_prio; - - if (rq->online) - cpupri_set(&rq->rd->cpupri, rq->cpu, prio); - - } else if (prio == rt_rq->highest_prio.curr) - /* - * If the next task is equal in priority to the highest on - * the run-queue, then we implicitly know that the next highest - * task cannot be any lower than current - */ - rt_rq->highest_prio.next = prio; - else if (prio < rt_rq->highest_prio.next) - /* - * Otherwise, we need to recompute next-highest - */ - rt_rq->highest_prio.next = next_prio(rq); + if (rq->online && prio < prev_prio) + cpupri_set(&rq->rd->cpupri, rq->cpu, prio); } static void @@ -746,9 +724,6 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) { struct rq *rq = rq_of_rt_rq(rt_rq); - if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next)) - rt_rq->highest_prio.next = next_prio(rq); - if (rq->online && rt_rq->highest_prio.curr != prev_prio) cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); } @@ -961,6 +936,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); + + inc_nr_running(rq); } static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) @@ -971,6 +948,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) dequeue_rt_entity(rt_se); dequeue_pushable_task(rq, p); + + dec_nr_running(rq); } /* @@ -1017,10 +996,12 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) struct rq *rq; int cpu; - if (sd_flag != SD_BALANCE_WAKE) - return smp_processor_id(); - cpu = task_cpu(p); + + /* For anything but wake ups, just return the task_cpu */ + if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) + goto out; + rq = cpu_rq(cpu); rcu_read_lock(); @@ -1059,6 +1040,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) } rcu_read_unlock(); +out: return cpu; } @@ -1178,7 +1160,6 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) static void put_prev_task_rt(struct rq *rq, struct task_struct *p) { update_curr_rt(rq); - p->se.exec_start = 0; /* * The previous task needs to be made eligible for pushing @@ -1394,6 +1375,7 @@ static int push_rt_task(struct rq *rq) { struct task_struct *next_task; struct rq *lowest_rq; + int ret = 0; if (!rq->rt.overloaded) return 0; @@ -1426,7 +1408,7 @@ retry: if (!lowest_rq) { struct task_struct *task; /* - * find lock_lowest_rq releases rq->lock + * find_lock_lowest_rq releases rq->lock * so it is possible that next_task has migrated. * * We need to make sure that the task is still on the same @@ -1436,12 +1418,11 @@ retry: task = pick_next_pushable_task(rq); if (task_cpu(next_task) == rq->cpu && task == next_task) { /* - * If we get here, the task hasn't moved at all, but - * it has failed to push. We will not try again, - * since the other cpus will pull from us when they - * are ready. + * The task hasn't migrated, and is still the next + * eligible task, but we failed to find a run-queue + * to push it to. Do not retry in this case, since + * other cpus will pull from us when ready. */ - dequeue_pushable_task(rq, next_task); goto out; } @@ -1460,6 +1441,7 @@ retry: deactivate_task(rq, next_task, 0); set_task_cpu(next_task, lowest_rq->cpu); activate_task(lowest_rq, next_task, 0); + ret = 1; resched_task(lowest_rq->curr); @@ -1468,7 +1450,7 @@ retry: out: put_task_struct(next_task); - return 1; + return ret; } static void push_rt_tasks(struct rq *rq) @@ -1863,4 +1845,3 @@ static void print_rt_stats(struct seq_file *m, int cpu) rcu_read_unlock(); } #endif /* CONFIG_SCHED_DEBUG */ - diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 6f437632afab..8b44e7fa7fb3 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c @@ -34,11 +34,13 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) static void enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) { + inc_nr_running(rq); } static void dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) { + dec_nr_running(rq); } static void yield_task_stop(struct rq *rq) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 11d65b531e50..2d2ecdcc8cdb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -379,6 +379,16 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif +#ifdef CONFIG_CFS_BANDWIDTH + { + .procname = "sched_cfs_bandwidth_slice_us", + .data = &sysctl_sched_cfs_bandwidth_slice, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + }, +#endif #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking", |