diff options
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/core.c | 20 | ||||
-rw-r--r-- | kernel/sched/cputime.c | 6 | ||||
-rw-r--r-- | kernel/sched/fair.c | 174 | ||||
-rw-r--r-- | kernel/sched/sched.h | 4 | ||||
-rw-r--r-- | kernel/sched/topology.c | 2 |
5 files changed, 89 insertions, 117 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b3ff73d6a4c2..97a27726ea21 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -432,10 +432,11 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) * its already queued (either by us or someone else) and will get the * wakeup due to that. * - * This cmpxchg() implies a full barrier, which pairs with the write - * barrier implied by the wakeup in wake_up_q(). + * In order to ensure that a pending wakeup will observe our pending + * state, even in the failed case, an explicit smp_mb() must be used. */ - if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) + smp_mb__before_atomic(); + if (cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)) return; get_task_struct(task); @@ -1111,7 +1112,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_equal(&p->cpus_allowed, new_mask)) goto out; - if (!cpumask_intersects(new_mask, cpu_valid_mask)) { + dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); + if (dest_cpu >= nr_cpu_ids) { ret = -EINVAL; goto out; } @@ -1132,7 +1134,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); if (task_running(rq, p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ @@ -5026,7 +5027,7 @@ long __sched io_schedule_timeout(long timeout) } EXPORT_SYMBOL(io_schedule_timeout); -void io_schedule(void) +void __sched io_schedule(void) { int token; @@ -5241,10 +5242,11 @@ void init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; + __sched_fork(0, idle); + raw_spin_lock_irqsave(&idle->pi_lock, flags); raw_spin_lock(&rq->lock); - __sched_fork(0, idle); idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); idle->flags |= PF_IDLE; @@ -6342,10 +6344,6 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; -#else - /* We don't support RT-tasks being in separate groups */ - if (task->sched_class != &fair_sched_class) - return -EINVAL; #endif /* * Serialize against wake_up_new_task() such that if its diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 14d2dbf97c53..45c2cd37fe6b 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -738,7 +738,7 @@ void vtime_account_system(struct task_struct *tsk) write_seqcount_begin(&vtime->seqcount); /* We might have scheduled out from guest path */ - if (current->flags & PF_VCPU) + if (tsk->flags & PF_VCPU) vtime_account_guest(tsk, vtime); else __vtime_account_system(tsk, vtime); @@ -781,7 +781,7 @@ void vtime_guest_enter(struct task_struct *tsk) */ write_seqcount_begin(&vtime->seqcount); __vtime_account_system(tsk, vtime); - current->flags |= PF_VCPU; + tsk->flags |= PF_VCPU; write_seqcount_end(&vtime->seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_enter); @@ -792,7 +792,7 @@ void vtime_guest_exit(struct task_struct *tsk) write_seqcount_begin(&vtime->seqcount); vtime_account_guest(tsk, vtime); - current->flags &= ~PF_VCPU; + tsk->flags &= ~PF_VCPU; write_seqcount_end(&vtime->seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_exit); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index af7de1f9906c..0b4e997fea1a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2358,13 +2358,23 @@ no_join: return; } -void task_numa_free(struct task_struct *p) +/* + * Get rid of NUMA staticstics associated with a task (either current or dead). + * If @final is set, the task is dead and has reached refcount zero, so we can + * safely free all relevant data structures. Otherwise, there might be + * concurrent reads from places like load balancing and procfs, and we should + * reset the data back to default state without freeing ->numa_faults. + */ +void task_numa_free(struct task_struct *p, bool final) { struct numa_group *grp = p->numa_group; - void *numa_faults = p->numa_faults; + unsigned long *numa_faults = p->numa_faults; unsigned long flags; int i; + if (!numa_faults) + return; + if (grp) { spin_lock_irqsave(&grp->lock, flags); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) @@ -2377,8 +2387,14 @@ void task_numa_free(struct task_struct *p) put_numa_group(grp); } - p->numa_faults = NULL; - kfree(numa_faults); + if (final) { + p->numa_faults = NULL; + kfree(numa_faults); + } else { + p->total_numa_faults = 0; + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) + numa_faults[i] = 0; + } } /* @@ -4075,23 +4091,16 @@ static inline u64 sched_cfs_bandwidth_slice(void) } /* - * Replenish runtime according to assigned quota and update expiration time. - * We use sched_clock_cpu directly instead of rq->clock to avoid adding - * additional synchronization around rq->lock. + * Replenish runtime according to assigned quota. We use sched_clock_cpu + * directly instead of rq->clock to avoid adding additional synchronization + * around rq->lock. * * requires cfs_b->lock */ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) { - u64 now; - - if (cfs_b->quota == RUNTIME_INF) - return; - - now = sched_clock_cpu(smp_processor_id()); - cfs_b->runtime = cfs_b->quota; - cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); - cfs_b->expires_seq++; + if (cfs_b->quota != RUNTIME_INF) + cfs_b->runtime = cfs_b->quota; } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -4113,8 +4122,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { struct task_group *tg = cfs_rq->tg; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); - u64 amount = 0, min_amount, expires; - int expires_seq; + u64 amount = 0, min_amount; /* note: this is a positive sum as runtime_remaining <= 0 */ min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; @@ -4131,65 +4139,23 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_b->idle = 0; } } - expires_seq = cfs_b->expires_seq; - expires = cfs_b->runtime_expires; raw_spin_unlock(&cfs_b->lock); cfs_rq->runtime_remaining += amount; - /* - * we may have advanced our local expiration to account for allowed - * spread between our sched_clock and the one on which runtime was - * issued. - */ - if (cfs_rq->expires_seq != expires_seq) { - cfs_rq->expires_seq = expires_seq; - cfs_rq->runtime_expires = expires; - } return cfs_rq->runtime_remaining > 0; } -/* - * Note: This depends on the synchronization provided by sched_clock and the - * fact that rq->clock snapshots this value. - */ -static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) -{ - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - - /* if the deadline is ahead of our clock, nothing to do */ - if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0)) - return; - - if (cfs_rq->runtime_remaining < 0) - return; - - /* - * If the local deadline has passed we have to consider the - * possibility that our sched_clock is 'fast' and the global deadline - * has not truly expired. - * - * Fortunately we can check determine whether this the case by checking - * whether the global deadline(cfs_b->expires_seq) has advanced. - */ - if (cfs_rq->expires_seq == cfs_b->expires_seq) { - /* extend local deadline, drift is bounded above by 2 ticks */ - cfs_rq->runtime_expires += TICK_NSEC; - } else { - /* global deadline is ahead, expiration has passed */ - cfs_rq->runtime_remaining = 0; - } -} - static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { /* dock delta_exec before expiring quota (as it could span periods) */ cfs_rq->runtime_remaining -= delta_exec; - expire_cfs_rq_runtime(cfs_rq); if (likely(cfs_rq->runtime_remaining > 0)) return; + if (cfs_rq->throttled) + return; /* * if we're unable to extend our runtime we resched so that the active * hierarchy can be throttled @@ -4369,8 +4335,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) resched_curr(rq); } -static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, - u64 remaining, u64 expires) +static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining) { struct cfs_rq *cfs_rq; u64 runtime; @@ -4386,13 +4351,15 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, if (!cfs_rq_throttled(cfs_rq)) goto next; + /* By the above check, this should never be true */ + SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); + runtime = -cfs_rq->runtime_remaining + 1; if (runtime > remaining) runtime = remaining; remaining -= runtime; cfs_rq->runtime_remaining += runtime; - cfs_rq->runtime_expires = expires; /* we check whether we're throttled above */ if (cfs_rq->runtime_remaining > 0) @@ -4417,7 +4384,7 @@ next: */ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) { - u64 runtime, runtime_expires; + u64 runtime; int throttled; /* no need to continue the timer with no bandwidth constraint */ @@ -4445,8 +4412,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) /* account preceding periods in which throttling occurred */ cfs_b->nr_throttled += overrun; - runtime_expires = cfs_b->runtime_expires; - /* * This check is repeated as we are holding onto the new bandwidth while * we unthrottle. This can potentially race with an unthrottled group @@ -4459,8 +4424,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) cfs_b->distribute_running = 1; raw_spin_unlock(&cfs_b->lock); /* we can't nest cfs_b->lock while distributing bandwidth */ - runtime = distribute_cfs_runtime(cfs_b, runtime, - runtime_expires); + runtime = distribute_cfs_runtime(cfs_b, runtime); raw_spin_lock(&cfs_b->lock); cfs_b->distribute_running = 0; @@ -4537,8 +4501,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) return; raw_spin_lock(&cfs_b->lock); - if (cfs_b->quota != RUNTIME_INF && - cfs_rq->runtime_expires == cfs_b->runtime_expires) { + if (cfs_b->quota != RUNTIME_INF) { cfs_b->runtime += slack_runtime; /* we are under rq->lock, defer unthrottling using a timer */ @@ -4570,7 +4533,6 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) { u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); - u64 expires; /* confirm we're still not at a refresh boundary */ raw_spin_lock(&cfs_b->lock); @@ -4587,7 +4549,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) runtime = cfs_b->runtime; - expires = cfs_b->runtime_expires; if (runtime) cfs_b->distribute_running = 1; @@ -4596,11 +4557,10 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (!runtime) return; - runtime = distribute_cfs_runtime(cfs_b, runtime, expires); + runtime = distribute_cfs_runtime(cfs_b, runtime); raw_spin_lock(&cfs_b->lock); - if (expires == cfs_b->runtime_expires) - cfs_b->runtime -= min(runtime, cfs_b->runtime); + cfs_b->runtime -= min(runtime, cfs_b->runtime); cfs_b->distribute_running = 0; raw_spin_unlock(&cfs_b->lock); } @@ -4695,20 +4655,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) if (++count > 3) { u64 new, old = ktime_to_ns(cfs_b->period); - new = (old * 147) / 128; /* ~115% */ - new = min(new, max_cfs_quota_period); - - cfs_b->period = ns_to_ktime(new); - - /* since max is 1s, this is limited to 1e9^2, which fits in u64 */ - cfs_b->quota *= new; - cfs_b->quota = div64_u64(cfs_b->quota, old); - - pr_warn_ratelimited( - "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n", - smp_processor_id(), - div_u64(new, NSEC_PER_USEC), - div_u64(cfs_b->quota, NSEC_PER_USEC)); + /* + * Grow period by a factor of 2 to avoid losing precision. + * Precision loss in the quota/period ratio can cause __cfs_schedulable + * to fail. + */ + new = old * 2; + if (new < max_cfs_quota_period) { + cfs_b->period = ns_to_ktime(new); + cfs_b->quota *= 2; + + pr_warn_ratelimited( + "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n", + smp_processor_id(), + div_u64(new, NSEC_PER_USEC), + div_u64(cfs_b->quota, NSEC_PER_USEC)); + } else { + pr_warn_ratelimited( + "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n", + smp_processor_id(), + div_u64(old, NSEC_PER_USEC), + div_u64(cfs_b->quota, NSEC_PER_USEC)); + } /* reset count so we don't come right back in here */ count = 0; @@ -8338,9 +8306,10 @@ more_balance: out_balanced: /* * We reach balance although we may have faced some affinity - * constraints. Clear the imbalance flag if it was set. + * constraints. Clear the imbalance flag only if other tasks got + * a chance to move and fix the imbalance. */ - if (sd_parent) { + if (sd_parent && !(env.flags & LBF_ALL_PINNED)) { int *group_imbalance = &sd_parent->groups->sgc->imbalance; if (*group_imbalance) @@ -8358,13 +8327,22 @@ out_all_pinned: sd->nr_balance_failed = 0; out_one_pinned: + ld_moved = 0; + + /* + * idle_balance() disregards balance intervals, so we could repeatedly + * reach this code, which would lead to balance_interval skyrocketting + * in a short amount of time. Skip the balance_interval increase logic + * to avoid that. + */ + if (env.idle == CPU_NEWLY_IDLE) + goto out; + /* tune up the balancing interval */ if (((env.flags & LBF_ALL_PINNED) && sd->balance_interval < MAX_PINNED_INTERVAL) || (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; - - ld_moved = 0; out: return ld_moved; } @@ -9401,18 +9379,18 @@ err: void online_fair_sched_group(struct task_group *tg) { struct sched_entity *se; + struct rq_flags rf; struct rq *rq; int i; for_each_possible_cpu(i) { rq = cpu_rq(i); se = tg->se[i]; - - raw_spin_lock_irq(&rq->lock); + rq_lock_irq(rq, &rf); update_rq_clock(rq); attach_entity_cfs_rq(se); sync_throttle(tg, i); - raw_spin_unlock_irq(&rq->lock); + rq_unlock_irq(rq, &rf); } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 452b56923c6d..268f560ec998 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -280,8 +280,6 @@ struct cfs_bandwidth { ktime_t period; u64 quota, runtime; s64 hierarchical_quota; - u64 runtime_expires; - int expires_seq; short idle, period_active; struct hrtimer period_timer, slack_timer; @@ -489,8 +487,6 @@ struct cfs_rq { #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; - int expires_seq; - u64 runtime_expires; s64 runtime_remaining; u64 throttled_clock, throttled_clock_task; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 9dcd80ed9d4c..867d173dab48 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1347,7 +1347,7 @@ void sched_init_numa(void) int level = 0; int i, j, k; - sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); + sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL); if (!sched_domains_numa_distance) return; |