Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/core.c     | 59
-rw-r--r-- | kernel/sched/deadline.c | 57
-rw-r--r-- | kernel/sched/debug.c    |  2
-rw-r--r-- | kernel/sched/fair.c     | 87
-rw-r--r-- | kernel/sched/features.h |  3
-rw-r--r-- | kernel/sched/idle.c     |  2
-rw-r--r-- | kernel/sched/rt.c       | 15
-rw-r--r-- | kernel/sched/sched.h    |  4
8 files changed, 159 insertions, 70 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b89ca5c83143..85be684687b0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3714,7 +3714,7 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
 	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
 }
 
-static inline bool ttwu_queue_cond(int cpu, int wake_flags)
+static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
 {
 	/*
 	 * Do not complicate things with the async wake_list while the CPU is
@@ -3723,6 +3723,10 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
 	if (!cpu_active(cpu))
 		return false;
 
+	/* Ensure the task will still be allowed to run on the CPU. */
+	if (!cpumask_test_cpu(cpu, p->cpus_ptr))
+		return false;
+
 	/*
 	 * If the CPU does not share cache, then queue the task on the
 	 * remote rqs wakelist to avoid accessing remote data.
@@ -3730,13 +3734,21 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
 	if (!cpus_share_cache(smp_processor_id(), cpu))
 		return true;
 
+	if (cpu == smp_processor_id())
+		return false;
+
 	/*
-	 * If the task is descheduling and the only running task on the
-	 * CPU then use the wakelist to offload the task activation to
-	 * the soon-to-be-idle CPU as the current CPU is likely busy.
-	 * nr_running is checked to avoid unnecessary task stacking.
+	 * If the wakee cpu is idle, or the task is descheduling and the
+	 * only running task on the CPU, then use the wakelist to offload
+	 * the task activation to the idle (or soon-to-be-idle) CPU as
+	 * the current CPU is likely busy. nr_running is checked to
+	 * avoid unnecessary task stacking.
+	 *
+	 * Note that we can only get here with (wakee) p->on_rq=0,
+	 * p->on_cpu can be whatever, we've done the dequeue, so
+	 * the wakee has been accounted out of ->nr_running.
 	 */
-	if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1)
+	if (!cpu_rq(cpu)->nr_running)
 		return true;
 
 	return false;
@@ -3744,10 +3756,7 @@
 
 static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
 {
-	if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
-		if (WARN_ON_ONCE(cpu == smp_processor_id()))
-			return false;
-
+	if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(p, cpu)) {
 		sched_clock_cpu(cpu); /* Sync clocks across CPUs */
 		__ttwu_queue_wakelist(p, cpu, wake_flags);
 		return true;
@@ -4069,7 +4078,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	 * scheduling.
 	 */
 	if (smp_load_acquire(&p->on_cpu) &&
-	    ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
+	    ttwu_queue_wakelist(p, task_cpu(p), wake_flags))
 		goto unlock;
 
 	/*
@@ -4585,7 +4594,8 @@ static inline void prepare_task(struct task_struct *next)
 	 * Claim the task as running, we do this before switching to it
 	 * such that any running task will have this set.
 	 *
-	 * See the ttwu() WF_ON_CPU case and its ordering comment.
+	 * See the smp_load_acquire(&p->on_cpu) case in ttwu() and
+	 * its ordering comment.
 	 */
 	WRITE_ONCE(next->on_cpu, 1);
 #endif
@@ -6379,8 +6389,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
 		preempt_enable_no_resched();
 	}
 
-	if (tsk_is_pi_blocked(tsk))
-		return;
+	/*
+	 * spinlock and rwlock must not flush block requests. This will
+	 * deadlock if the callback attempts to acquire a lock which is
+	 * already acquired.
+	 */
+	SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT);
 
 	/*
 	 * If we are going to sleep and we have plugged IO queued,
@@ -8737,7 +8751,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur,
 }
 
 int task_can_attach(struct task_struct *p,
-		    const struct cpumask *cs_cpus_allowed)
+		    const struct cpumask *cs_effective_cpus)
 {
 	int ret = 0;
 
@@ -8756,8 +8770,13 @@ int task_can_attach(struct task_struct *p,
 	}
 
 	if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
-					      cs_cpus_allowed))
-		ret = dl_task_can_attach(p, cs_cpus_allowed);
+					      cs_effective_cpus)) {
+		int cpu = cpumask_any_and(cpu_active_mask, cs_effective_cpus);
+
+		if (unlikely(cpu >= nr_cpu_ids))
+			return -EINVAL;
+		ret = dl_cpu_busy(cpu, p);
+	}
 
 out:
 	return ret;
@@ -9041,8 +9060,10 @@ static void cpuset_cpu_active(void)
 static int cpuset_cpu_inactive(unsigned int cpu)
 {
 	if (!cpuhp_tasks_frozen) {
-		if (dl_cpu_busy(cpu))
-			return -EBUSY;
+		int ret = dl_cpu_busy(cpu, NULL);
+
+		if (ret)
+			return ret;
 		cpuset_update_active_cpus();
 	} else {
 		num_cpus_frozen++;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fffcb1aa77b7..147b757d162b 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1561,7 +1561,10 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 		 * the throttle.
 		 */
 		p->dl.dl_throttled = 0;
-		BUG_ON(!is_dl_boosted(&p->dl) || flags != ENQUEUE_REPLENISH);
+		if (!(flags & ENQUEUE_REPLENISH))
+			printk_deferred_once("sched: DL de-boosted task PID %d: REPLENISH flag missing\n",
+					     task_pid_nr(p));
+
 		return;
 	}
 
@@ -2861,41 +2864,6 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
 }
 
 #ifdef CONFIG_SMP
-int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed)
-{
-	unsigned long flags, cap;
-	unsigned int dest_cpu;
-	struct dl_bw *dl_b;
-	bool overflow;
-	int ret;
-
-	dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
-
-	rcu_read_lock_sched();
-	dl_b = dl_bw_of(dest_cpu);
-	raw_spin_lock_irqsave(&dl_b->lock, flags);
-	cap = dl_bw_capacity(dest_cpu);
-	overflow = __dl_overflow(dl_b, cap, 0, p->dl.dl_bw);
-	if (overflow) {
-		ret = -EBUSY;
-	} else {
-		/*
-		 * We reserve space for this task in the destination
-		 * root_domain, as we can't fail after this point.
-		 * We will free resources in the source root_domain
-		 * later on (see set_cpus_allowed_dl()).
-		 */
-		int cpus = dl_bw_cpus(dest_cpu);
-
-		__dl_add(dl_b, p->dl.dl_bw, cpus);
-		ret = 0;
-	}
-	raw_spin_unlock_irqrestore(&dl_b->lock, flags);
-	rcu_read_unlock_sched();
-
-	return ret;
-}
-
 int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
 				 const struct cpumask *trial)
 {
@@ -2917,7 +2885,7 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
 	return ret;
 }
 
-bool dl_cpu_busy(unsigned int cpu)
+int dl_cpu_busy(int cpu, struct task_struct *p)
 {
 	unsigned long flags, cap;
 	struct dl_bw *dl_b;
@@ -2927,11 +2895,22 @@ bool dl_cpu_busy(unsigned int cpu)
 	dl_b = dl_bw_of(cpu);
 	raw_spin_lock_irqsave(&dl_b->lock, flags);
 	cap = dl_bw_capacity(cpu);
-	overflow = __dl_overflow(dl_b, cap, 0, 0);
+	overflow = __dl_overflow(dl_b, cap, 0, p ? p->dl.dl_bw : 0);
+
+	if (!overflow && p) {
+		/*
+		 * We reserve space for this task in the destination
+		 * root_domain, as we can't fail after this point.
+		 * We will free resources in the source root_domain
+		 * later on (see set_cpus_allowed_dl()).
+		 */
+		__dl_add(dl_b, p->dl.dl_bw, dl_bw_cpus(cpu));
+	}
+
 	raw_spin_unlock_irqrestore(&dl_b->lock, flags);
 	rcu_read_unlock_sched();
 
-	return overflow;
+	return overflow ? -EBUSY : 0;
 }
 #endif
 
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 7a2d32d2025f..34c5ff3a0669 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -416,7 +416,7 @@ void update_sched_domain_debugfs(void)
 		char buf[32];
 
 		snprintf(buf, sizeof(buf), "cpu%d", cpu);
-		debugfs_remove(debugfs_lookup(buf, sd_dentry));
+		debugfs_lookup_and_remove(buf, sd_dentry);
 		d_cpu = debugfs_create_dir(buf, sd_dentry);
 
 		i = 0;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fcbacc35d2b9..a853e4e9e3c3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6280,6 +6280,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 {
 	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
 	int i, cpu, idle_cpu = -1, nr = INT_MAX;
+	struct sched_domain_shared *sd_share;
 	struct rq *this_rq = this_rq();
 	int this = smp_processor_id();
 	struct sched_domain *this_sd;
@@ -6319,6 +6320,17 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 		time = cpu_clock(this);
 	}
 
+	if (sched_feat(SIS_UTIL)) {
+		sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
+		if (sd_share) {
+			/* because !--nr is the condition to stop scan */
+			nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
+			/* overloaded LLC is unlikely to have idle cpu/core */
+			if (nr == 1)
+				return -1;
+		}
+	}
+
 	for_each_cpu_wrap(cpu, cpus, target + 1) {
 		if (has_idle_core) {
 			i = select_idle_core(p, cpu, cpus, &idle_cpu);
@@ -9166,6 +9178,77 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 	return idlest;
 }
 
+static void update_idle_cpu_scan(struct lb_env *env,
+				 unsigned long sum_util)
+{
+	struct sched_domain_shared *sd_share;
+	int llc_weight, pct;
+	u64 x, y, tmp;
+	/*
+	 * Update the number of CPUs to scan in LLC domain, which could
+	 * be used as a hint in select_idle_cpu(). The update of sd_share
+	 * could be expensive because it is within a shared cache line.
+	 * So the write of this hint only occurs during periodic load
+	 * balancing, rather than CPU_NEWLY_IDLE, because the latter
+	 * can fire way more frequently than the former.
+	 */
+	if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE)
+		return;
+
+	llc_weight = per_cpu(sd_llc_size, env->dst_cpu);
+	if (env->sd->span_weight != llc_weight)
+		return;
+
+	sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu));
+	if (!sd_share)
+		return;
+
+	/*
+	 * The number of CPUs to search drops as sum_util increases, when
+	 * sum_util hits 85% or above, the scan stops.
+	 * The reason to choose 85% as the threshold is because this is the
+	 * imbalance_pct(117) when a LLC sched group is overloaded.
+	 *
+	 * let y = SCHED_CAPACITY_SCALE - p * x^2                       [1]
+	 * and y'= y / SCHED_CAPACITY_SCALE
+	 *
+	 * x is the ratio of sum_util compared to the CPU capacity:
+	 * x = sum_util / (llc_weight * SCHED_CAPACITY_SCALE)
+	 * y' is the ratio of CPUs to be scanned in the LLC domain,
+	 * and the number of CPUs to scan is calculated by:
+	 *
+	 * nr_scan = llc_weight * y'                                    [2]
+	 *
+	 * When x hits the threshold of overloaded, AKA, when
+	 * x = 100 / pct, y drops to 0. According to [1],
+	 * p should be SCHED_CAPACITY_SCALE * pct^2 / 10000
+	 *
+	 * Scale x by SCHED_CAPACITY_SCALE:
+	 * x' = sum_util / llc_weight;                                  [3]
+	 *
+	 * and finally [1] becomes:
+	 * y = SCHED_CAPACITY_SCALE -
+	 *     x'^2 * pct^2 / (10000 * SCHED_CAPACITY_SCALE)            [4]
+	 *
+	 */
+	/* equation [3] */
+	x = sum_util;
+	do_div(x, llc_weight);
+
+	/* equation [4] */
+	pct = env->sd->imbalance_pct;
+	tmp = x * x * pct * pct;
+	do_div(tmp, 10000 * SCHED_CAPACITY_SCALE);
+	tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE);
+	y = SCHED_CAPACITY_SCALE - tmp;
+
+	/* equation [2] */
+	y *= llc_weight;
+	do_div(y, SCHED_CAPACITY_SCALE);
+	if ((int)y != sd_share->nr_idle_scan)
+		WRITE_ONCE(sd_share->nr_idle_scan, (int)y);
+}
+
 /**
  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
  * @env: The load balancing environment.
@@ -9178,6 +9261,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats *local = &sds->local_stat;
 	struct sg_lb_stats tmp_sgs;
+	unsigned long sum_util = 0;
 	int sg_status = 0;
 
 	do {
@@ -9210,6 +9294,7 @@ next_group:
 		sds->total_load += sgs->group_load;
 		sds->total_capacity += sgs->group_capacity;
 
+		sum_util += sgs->group_util;
 		sg = sg->next;
 	} while (sg != env->sd->groups);
 
@@ -9235,6 +9320,8 @@ next_group:
 		WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
 		trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
 	}
+
+	update_idle_cpu_scan(env, sum_util);
 }
 
 #define NUMA_IMBALANCE_MIN 2
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 7f8dace0964c..c4947c1b5edb 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -55,7 +55,8 @@ SCHED_FEAT(TTWU_QUEUE, true)
 /*
  * When doing wakeups, attempt to limit superfluous scans of the LLC domain.
  */
-SCHED_FEAT(SIS_PROP, true)
+SCHED_FEAT(SIS_PROP, false)
+SCHED_FEAT(SIS_UTIL, true)
 
 /*
  * Issue a WARN when we do multiple update_rq_clock() calls
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index d17b0a5ce6ac..499a3e286cd0 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -105,7 +105,7 @@ void __cpuidle default_idle_call(void)
 		 * last -- this is very similar to the entry code.
 		 */
 		trace_hardirqs_on_prepare();
-		lockdep_hardirqs_on_prepare(_THIS_IP_);
+		lockdep_hardirqs_on_prepare();
 		rcu_idle_enter();
 		lockdep_hardirqs_on(_THIS_IP_);
 
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 8007d087a57f..f75dcd3537b8 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -444,7 +444,7 @@ static inline void rt_queue_push_tasks(struct rq *rq)
 #endif /* CONFIG_SMP */
 
 static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
-static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
+static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count);
 
 static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 {
@@ -565,7 +565,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 	rt_se = rt_rq->tg->rt_se[cpu];
 
 	if (!rt_se) {
-		dequeue_top_rt_rq(rt_rq);
+		dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
 		/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
 		cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
 	}
@@ -651,7 +651,7 @@ static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 
 static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 {
-	dequeue_top_rt_rq(rt_rq);
+	dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
 }
 
 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
@@ -1051,7 +1051,7 @@ static void update_curr_rt(struct rq *rq)
 }
 
 static void
-dequeue_top_rt_rq(struct rt_rq *rt_rq)
+dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count)
 {
 	struct rq *rq = rq_of_rt_rq(rt_rq);
 
@@ -1062,7 +1062,7 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq)
 
 	BUG_ON(!rq->nr_running);
 
-	sub_nr_running(rq, rt_rq->rt_nr_running);
+	sub_nr_running(rq, count);
 	rt_rq->rt_queued = 0;
 }
 
@@ -1342,18 +1342,21 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag
 static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
 {
 	struct sched_rt_entity *back = NULL;
+	unsigned int rt_nr_running;
 
 	for_each_sched_rt_entity(rt_se) {
 		rt_se->back = back;
 		back = rt_se;
 	}
 
-	dequeue_top_rt_rq(rt_rq_of_se(back));
+	rt_nr_running = rt_rq_of_se(back)->rt_nr_running;
 
 	for (rt_se = back; rt_se; rt_se = rt_se->back) {
 		if (on_rt_rq(rt_se))
 			__dequeue_rt_entity(rt_se, flags);
 	}
+
+	dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running);
 }
 
 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fe8be2f8a47d..e49902898253 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -348,9 +348,8 @@ extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
 extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
 extern bool __checkparam_dl(const struct sched_attr *attr);
 extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
-extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed);
 extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
-extern bool dl_cpu_busy(unsigned int cpu);
+extern int dl_cpu_busy(int cpu, struct task_struct *p);
 
 #ifdef CONFIG_CGROUP_SCHED
 
@@ -2053,7 +2052,6 @@ static inline int task_on_rq_migrating(struct task_struct *p)
 
 #define WF_SYNC     0x10 /* Waker goes to sleep after wakeup */
 #define WF_MIGRATED 0x20 /* Internal use, task got migrated */
-#define WF_ON_CPU   0x40 /* Wakee is on_cpu */
 
 #ifdef CONFIG_SMP
 static_assert(WF_EXEC == SD_BALANCE_EXEC);
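The SIS_UTIL arithmetic in update_idle_cpu_scan() is easiest to see with numbers. The following stand-alone C sketch (user-space only, not kernel code; the helper name nr_idle_scan(), the 16-CPU LLC and the 10% utilization steps are illustrative assumptions) evaluates equations [3], [4] and [2] for a domain with the default imbalance_pct of 117:

```c
#include <stdio.h>
#include <stdint.h>

#define SCHED_CAPACITY_SCALE 1024	/* same scale the kernel uses */

/*
 * Stand-alone illustration of the SIS_UTIL hint: given the summed
 * utilization of an LLC and its CPU count, compute how many CPUs
 * select_idle_cpu() would be told to scan.  Plain C99, no kernel APIs.
 */
static int nr_idle_scan(uint64_t sum_util, int llc_weight, int pct)
{
	uint64_t x, tmp, y;

	/* equation [3]: x' = sum_util / llc_weight */
	x = sum_util / llc_weight;

	/* equation [4]: y = SCHED_CAPACITY_SCALE - x'^2 * pct^2 / (10000 * SCHED_CAPACITY_SCALE) */
	tmp = x * x * (uint64_t)pct * pct / (10000ULL * SCHED_CAPACITY_SCALE);
	if (tmp > SCHED_CAPACITY_SCALE)
		tmp = SCHED_CAPACITY_SCALE;
	y = SCHED_CAPACITY_SCALE - tmp;

	/* equation [2]: nr_scan = llc_weight * y' */
	return (int)(y * llc_weight / SCHED_CAPACITY_SCALE);
}

int main(void)
{
	const int llc_weight = 16;	/* assumed 16-CPU LLC */
	const int pct = 117;		/* default imbalance_pct of an LLC domain */
	int load;

	/* sweep LLC utilization from idle to fully busy, in 10% steps */
	for (load = 0; load <= 100; load += 10) {
		uint64_t sum_util = (uint64_t)llc_weight * SCHED_CAPACITY_SCALE * load / 100;

		printf("util %3d%% -> scan %2d CPUs\n", load,
		       nr_idle_scan(sum_util, llc_weight, pct));
	}
	return 0;
}
```

At low utilization the hint covers the whole LLC (all 16 CPUs are eligible for scanning); it decays roughly quadratically and reaches zero around 85% utilization, the point the comment in update_idle_cpu_scan() identifies as an overloaded LLC, at which select_idle_cpu() gives up scanning entirely (nr == 1 after the +1 it adds to the hint).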