summaryrefslogtreecommitdiff
path: root/kernel/sched
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/core.c23
-rw-r--r--kernel/sched/debug.c8
-rw-r--r--kernel/sched/fair.c57
-rw-r--r--kernel/sched/rt.c14
-rw-r--r--kernel/sched/sched.h3
5 files changed, 87 insertions, 18 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 825447720620..960acc06a818 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1499,7 +1499,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
unsigned long flags;
int cpu, success = 0;
- smp_wmb();
+ /*
+ * If we are going to wake up a thread waiting for CONDITION we
+ * need to ensure that CONDITION=1 done by the caller can not be
+ * reordered with p->state check below. This pairs with mb() in
+ * set_current_state() the waiting thread does.
+ */
+ smp_mb__before_spinlock();
raw_spin_lock_irqsave(&p->pi_lock, flags);
if (!(p->state & state))
goto out;
@@ -3007,6 +3013,12 @@ need_resched:
if (sched_feat(HRTICK))
hrtick_clear(rq);
+ /*
+ * Make sure that signal_pending_state()->signal_pending() below
+ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
+ * done by the caller to avoid the race with signal_wake_up().
+ */
+ smp_mb__before_spinlock();
raw_spin_lock_irq(&rq->lock);
switch_count = &prev->nivcsw;
@@ -7876,7 +7888,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
runtime_enabled = quota != RUNTIME_INF;
runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
- account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
+ /*
+ * If we need to toggle cfs_bandwidth_used, off->on must occur
+ * before making related changes, and on->off must occur afterwards
+ */
+ if (runtime_enabled && !runtime_was_enabled)
+ cfs_bandwidth_usage_inc();
raw_spin_lock_irq(&cfs_b->lock);
cfs_b->period = ns_to_ktime(period);
cfs_b->quota = quota;
@@ -7902,6 +7919,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
unthrottle_cfs_rq(cfs_rq);
raw_spin_unlock_irq(&rq->lock);
}
+ if (runtime_was_enabled && !runtime_enabled)
+ cfs_bandwidth_usage_dec();
out_unlock:
mutex_unlock(&cfs_constraints_mutex);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 068ad55aa641..b38d50312e25 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -224,6 +224,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
atomic_read(&cfs_rq->tg->runnable_avg));
#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+ SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active",
+ cfs_rq->tg->cfs_bandwidth.timer_active);
+ SEQ_printf(m, " .%-30s: %d\n", "throttled",
+ cfs_rq->throttled);
+ SEQ_printf(m, " .%-30s: %d\n", "throttle_count",
+ cfs_rq->throttle_count);
+#endif
print_cfs_group_stats(m, cpu, cfs_rq->tg);
#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 009a62bb859d..305ef886219e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -936,6 +936,13 @@ void task_numa_work(struct callback_head *work)
if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
continue;
+ /*
+ * Skip inaccessible VMAs to avoid any confusion between
+ * PROT_NONE and NUMA hinting ptes
+ */
+ if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+ continue;
+
do {
start = max(start, vma->vm_start);
end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
@@ -2022,13 +2029,14 @@ static inline bool cfs_bandwidth_used(void)
return static_key_false(&__cfs_bandwidth_used);
}
-void account_cfs_bandwidth_used(int enabled, int was_enabled)
+void cfs_bandwidth_usage_inc(void)
+{
+ static_key_slow_inc(&__cfs_bandwidth_used);
+}
+
+void cfs_bandwidth_usage_dec(void)
{
- /* only need to count groups transitioning between enabled/!enabled */
- if (enabled && !was_enabled)
- static_key_slow_inc(&__cfs_bandwidth_used);
- else if (!enabled && was_enabled)
- static_key_slow_dec(&__cfs_bandwidth_used);
+ static_key_slow_dec(&__cfs_bandwidth_used);
}
#else /* HAVE_JUMP_LABEL */
static bool cfs_bandwidth_used(void)
@@ -2036,7 +2044,8 @@ static bool cfs_bandwidth_used(void)
return true;
}
-void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
+void cfs_bandwidth_usage_inc(void) {}
+void cfs_bandwidth_usage_dec(void) {}
#endif /* HAVE_JUMP_LABEL */
/*
@@ -2288,6 +2297,8 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->throttled_clock = rq->clock;
raw_spin_lock(&cfs_b->lock);
list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+ if (!cfs_b->timer_active)
+ __start_cfs_bandwidth(cfs_b);
raw_spin_unlock(&cfs_b->lock);
}
@@ -2399,6 +2410,13 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
if (idle)
goto out_unlock;
+ /*
+ * if we have relooped after returning idle once, we need to update our
+ * status as actually running, so that other cpus doing
+ * __start_cfs_bandwidth will stop trying to cancel us.
+ */
+ cfs_b->timer_active = 1;
+
__refill_cfs_bandwidth_runtime(cfs_b);
if (!throttled) {
@@ -2459,7 +2477,13 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
/* how long we wait to gather additional slack before distributing */
static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
-/* are we near the end of the current quota period? */
+/*
+ * Are we near the end of the current quota period?
+ *
+ * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
+ * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
+ * migrate_hrtimers, base is never cleared, so we are fine.
+ */
static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
{
struct hrtimer *refresh_timer = &cfs_b->period_timer;
@@ -2535,10 +2559,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
u64 expires;
/* confirm we're still not at a refresh boundary */
- if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
+ raw_spin_lock(&cfs_b->lock);
+ if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
+ raw_spin_unlock(&cfs_b->lock);
return;
+ }
- raw_spin_lock(&cfs_b->lock);
if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
runtime = cfs_b->runtime;
cfs_b->runtime = 0;
@@ -2663,11 +2689,11 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
* (timer_active==0 becomes visible before the hrtimer call-back
* terminates). In either case we ensure that it's re-programmed
*/
- while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
+ while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
+ hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
+ /* bounce the lock to allow do_sched_cfs_period_timer to run */
raw_spin_unlock(&cfs_b->lock);
- /* ensure cfs_b->lock is available while we wait */
- hrtimer_cancel(&cfs_b->period_timer);
-
+ cpu_relax();
raw_spin_lock(&cfs_b->lock);
/* if someone else restarted the timer then we're done */
if (cfs_b->timer_active)
@@ -6065,7 +6091,8 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
se->cfs_rq = parent->my_q;
se->my_q = cfs_rq;
- update_load_set(&se->load, 0);
+ /* guarantee group entities always have weight */
+ update_load_set(&se->load, NICE_0_LOAD);
se->parent = parent;
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 111539a27757..8f4243e73423 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -924,6 +924,13 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
struct rq *rq = rq_of_rt_rq(rt_rq);
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * Change rq's cpupri only if rt_rq is the top queue.
+ */
+ if (&rq->rt != rt_rq)
+ return;
+#endif
if (rq->online && prio < prev_prio)
cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
}
@@ -933,6 +940,13 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
struct rq *rq = rq_of_rt_rq(rt_rq);
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * Change rq's cpupri only if rt_rq is the top queue.
+ */
+ if (&rq->rt != rt_rq)
+ return;
+#endif
if (rq->online && rt_rq->highest_prio.curr != prev_prio)
cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0fc275c70d7d..aa23bd2294e7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1354,7 +1354,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
-extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
+extern void cfs_bandwidth_usage_inc(void);
+extern void cfs_bandwidth_usage_dec(void);
#ifdef CONFIG_NO_HZ_COMMON
enum rq_nohz_flag_bits {