diff options
Diffstat (limited to 'kernel/sched')
| -rw-r--r-- | kernel/sched/core.c | 80 | ||||
| -rw-r--r-- | kernel/sched/deadline.c | 2 | ||||
| -rw-r--r-- | kernel/sched/debug.c | 4 | ||||
| -rw-r--r-- | kernel/sched/ext.c | 263 | ||||
| -rw-r--r-- | kernel/sched/ext_idle.c | 38 | ||||
| -rw-r--r-- | kernel/sched/ext_internal.h | 116 | ||||
| -rw-r--r-- | kernel/sched/fair.c | 148 | ||||
| -rw-r--r-- | kernel/sched/idle.c | 39 | ||||
| -rw-r--r-- | kernel/sched/isolation.c | 4 | ||||
| -rw-r--r-- | kernel/sched/sched.h | 14 | ||||
| -rw-r--r-- | kernel/sched/syscalls.c | 30 |
11 files changed, 535 insertions, 203 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 759777694c78..496dff740dca 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4729,8 +4729,11 @@ void sched_cancel_fork(struct task_struct *p) scx_cancel_fork(p); } +static void sched_mm_cid_fork(struct task_struct *t); + void sched_post_fork(struct task_struct *p) { + sched_mm_cid_fork(p); uclamp_post_fork(p); scx_post_fork(p); } @@ -6830,6 +6833,7 @@ static void __sched notrace __schedule(int sched_mode) /* SCX must consult the BPF scheduler to tell if rq is empty */ if (!rq->nr_running && !scx_enabled()) { next = prev; + rq->next_class = &idle_sched_class; goto picked; } } else if (!preempt && prev_state) { @@ -10616,13 +10620,10 @@ static inline void mm_cid_transit_to_cpu(struct task_struct *t, struct mm_cid_pc } } -static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm) +static void mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm) { /* Remote access to mm::mm_cid::pcpu requires rq_lock */ guard(task_rq_lock)(t); - /* If the task is not active it is not in the users count */ - if (!t->mm_cid.active) - return false; if (cid_on_task(t->mm_cid.cid)) { /* If running on the CPU, put the CID in transit mode, otherwise drop it */ if (task_rq(t)->curr == t) @@ -10630,69 +10631,43 @@ static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm else mm_unset_cid_on_task(t); } - return true; } -static void mm_cid_do_fixup_tasks_to_cpus(struct mm_struct *mm) +static void mm_cid_fixup_tasks_to_cpus(void) { - struct task_struct *p, *t; - unsigned int users; + struct mm_struct *mm = current->mm; + struct task_struct *t; - /* - * This can obviously race with a concurrent affinity change, which - * increases the number of allowed CPUs for this mm, but that does - * not affect the mode and only changes the CID constraints. A - * possible switch back to per task mode happens either in the - * deferred handler function or in the next fork()/exit(). - * - * The caller has already transferred. The newly incoming task is - * already accounted for, but not yet visible. - */ - users = mm->mm_cid.users - 2; - if (!users) - return; + lockdep_assert_held(&mm->mm_cid.mutex); - guard(rcu)(); - for_other_threads(current, t) { - if (mm_cid_fixup_task_to_cpu(t, mm)) - users--; + hlist_for_each_entry(t, &mm->mm_cid.user_list, mm_cid.node) { + /* Current has already transferred before invoking the fixup. */ + if (t != current) + mm_cid_fixup_task_to_cpu(t, mm); } - if (!users) - return; - - /* Happens only for VM_CLONE processes. */ - for_each_process_thread(p, t) { - if (t == current || t->mm != mm) - continue; - if (mm_cid_fixup_task_to_cpu(t, mm)) { - if (--users == 0) - return; - } - } -} - -static void mm_cid_fixup_tasks_to_cpus(void) -{ - struct mm_struct *mm = current->mm; - - mm_cid_do_fixup_tasks_to_cpus(mm); mm_cid_complete_transit(mm, MM_CID_ONCPU); } static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm) { + lockdep_assert_held(&mm->mm_cid.lock); + t->mm_cid.active = 1; + hlist_add_head(&t->mm_cid.node, &mm->mm_cid.user_list); mm->mm_cid.users++; return mm_update_max_cids(mm); } -void sched_mm_cid_fork(struct task_struct *t) +static void sched_mm_cid_fork(struct task_struct *t) { struct mm_struct *mm = t->mm; bool percpu; - WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET); + if (!mm) + return; + + WARN_ON_ONCE(t->mm_cid.cid != MM_CID_UNSET); guard(mutex)(&mm->mm_cid.mutex); scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { @@ -10731,12 +10706,13 @@ void sched_mm_cid_fork(struct task_struct *t) static bool sched_mm_cid_remove_user(struct task_struct *t) { + lockdep_assert_held(&t->mm->mm_cid.lock); + t->mm_cid.active = 0; - scoped_guard(preempt) { - /* Clear the transition bit */ - t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid); - mm_unset_cid_on_task(t); - } + /* Clear the transition bit */ + t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid); + mm_unset_cid_on_task(t); + hlist_del_init(&t->mm_cid.node); t->mm->mm_cid.users--; return mm_update_max_cids(t->mm); } @@ -10879,11 +10855,13 @@ void mm_init_cid(struct mm_struct *mm, struct task_struct *p) mutex_init(&mm->mm_cid.mutex); mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work); INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn); + INIT_HLIST_HEAD(&mm->mm_cid.user_list); cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); bitmap_zero(mm_cidmask(mm), num_possible_cpus()); } #else /* CONFIG_SCHED_MM_CID */ static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { } +static inline void sched_mm_cid_fork(struct task_struct *t) { } #endif /* !CONFIG_SCHED_MM_CID */ static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d08b00429323..674de6a48551 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1027,7 +1027,7 @@ static void update_dl_entity(struct sched_dl_entity *dl_se) if (dl_time_before(dl_se->deadline, rq_clock(rq)) || dl_entity_overflow(dl_se, rq_clock(rq))) { - if (unlikely(!dl_is_implicit(dl_se) && + if (unlikely((!dl_is_implicit(dl_se) || dl_se->dl_defer) && !dl_time_before(dl_se->deadline, rq_clock(rq)) && !is_dl_boosted(dl_se))) { update_dl_revised_wakeup(dl_se, rq); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index b24f40f05019..15bf45b6f912 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -902,6 +902,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread; + u64 avruntime; struct sched_entity *last, *first, *root; struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -925,6 +926,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) if (last) right_vruntime = last->vruntime; zero_vruntime = cfs_rq->zero_vruntime; + avruntime = avg_vruntime(cfs_rq); raw_spin_rq_unlock_irqrestore(rq, flags); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline", @@ -934,7 +936,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "zero_vruntime", SPLIT_NS(zero_vruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime", - SPLIT_NS(avg_vruntime(cfs_rq))); + SPLIT_NS(avruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime", SPLIT_NS(right_vruntime)); spread = right_vruntime - left_vruntime; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 62b1f3ac5630..064eaa76be4b 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -976,8 +976,12 @@ static bool scx_dsq_priq_less(struct rb_node *node_a, static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) { - /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */ - WRITE_ONCE(dsq->nr, dsq->nr + delta); + /* + * scx_bpf_dsq_nr_queued() reads ->nr without locking. Use READ_ONCE() + * on the read side and WRITE_ONCE() on the write side to properly + * annotate the concurrent lockless access and avoid KCSAN warnings. + */ + WRITE_ONCE(dsq->nr, READ_ONCE(dsq->nr) + delta); } static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p) @@ -1099,22 +1103,13 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, } /* seq records the order tasks are queued, used by BPF DSQ iterator */ - dsq->seq++; + WRITE_ONCE(dsq->seq, dsq->seq + 1); p->scx.dsq_seq = dsq->seq; dsq_mod_nr(dsq, 1); p->scx.dsq = dsq; /* - * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the - * direct dispatch path, but we clear them here because the direct - * dispatch verdict may be overridden on the enqueue path during e.g. - * bypass. - */ - p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; - p->scx.ddsp_enq_flags = 0; - - /* * We're transitioning out of QUEUEING or DISPATCHING. store_release to * match waiters' load_acquire. */ @@ -1279,12 +1274,34 @@ static void mark_direct_dispatch(struct scx_sched *sch, p->scx.ddsp_enq_flags = enq_flags; } +/* + * Clear @p direct dispatch state when leaving the scheduler. + * + * Direct dispatch state must be cleared in the following cases: + * - direct_dispatch(): cleared on the synchronous enqueue path, deferred + * dispatch keeps the state until consumed + * - process_ddsp_deferred_locals(): cleared after consuming deferred state, + * - do_enqueue_task(): cleared on enqueue fallbacks where the dispatch + * verdict is ignored (local/global/bypass) + * - dequeue_task_scx(): cleared after dispatch_dequeue(), covering deferred + * cancellation and holding_cpu races + * - scx_disable_task(): cleared for queued wakeup tasks, which are excluded by + * the scx_bypass() loop, so that stale state is not reused by a subsequent + * scheduler instance + */ +static inline void clear_direct_dispatch(struct task_struct *p) +{ + p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; + p->scx.ddsp_enq_flags = 0; +} + static void direct_dispatch(struct scx_sched *sch, struct task_struct *p, u64 enq_flags) { struct rq *rq = task_rq(p); struct scx_dispatch_q *dsq = find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p); + u64 ddsp_enq_flags; touch_core_sched_dispatch(rq, p); @@ -1325,8 +1342,10 @@ static void direct_dispatch(struct scx_sched *sch, struct task_struct *p, return; } - dispatch_enqueue(sch, dsq, p, - p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); + ddsp_enq_flags = p->scx.ddsp_enq_flags; + clear_direct_dispatch(p); + + dispatch_enqueue(sch, dsq, p, ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); } static bool scx_rq_online(struct rq *rq) @@ -1435,6 +1454,7 @@ enqueue: */ touch_core_sched(rq, p); refill_task_slice_dfl(sch, p); + clear_direct_dispatch(p); dispatch_enqueue(sch, dsq, p, enq_flags); } @@ -1466,16 +1486,15 @@ static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at) p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; } -static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags) +static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int core_enq_flags) { struct scx_sched *sch = scx_root; int sticky_cpu = p->scx.sticky_cpu; + u64 enq_flags = core_enq_flags | rq->scx.extra_enq_flags; if (enq_flags & ENQUEUE_WAKEUP) rq->scx.flags |= SCX_RQ_IN_WAKEUP; - enq_flags |= rq->scx.extra_enq_flags; - if (sticky_cpu >= 0) p->scx.sticky_cpu = -1; @@ -1607,6 +1626,7 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags sub_nr_running(rq, 1); dispatch_dequeue(rq, p); + clear_direct_dispatch(p); return true; } @@ -2290,13 +2310,15 @@ static void process_ddsp_deferred_locals(struct rq *rq) struct task_struct, scx.dsq_list.node))) { struct scx_sched *sch = scx_root; struct scx_dispatch_q *dsq; + u64 dsq_id = p->scx.ddsp_dsq_id; + u64 enq_flags = p->scx.ddsp_enq_flags; list_del_init(&p->scx.dsq_list.node); + clear_direct_dispatch(p); - dsq = find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p); + dsq = find_dsq_for_dispatch(sch, rq, dsq_id, p); if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL)) - dispatch_to_local_dsq(sch, rq, dsq, p, - p->scx.ddsp_enq_flags); + dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags); } } @@ -2401,7 +2423,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, { struct scx_sched *sch = scx_root; - /* see kick_cpus_irq_workfn() */ + /* see kick_sync_wait_bal_cb() */ smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); update_curr_scx(rq); @@ -2444,6 +2466,48 @@ switch_class: switch_class(rq, next); } +static void kick_sync_wait_bal_cb(struct rq *rq) +{ + struct scx_kick_syncs __rcu *ks = __this_cpu_read(scx_kick_syncs); + unsigned long *ksyncs = rcu_dereference_sched(ks)->syncs; + bool waited; + s32 cpu; + + /* + * Drop rq lock and enable IRQs while waiting. IRQs must be enabled + * — a target CPU may be waiting for us to process an IPI (e.g. TLB + * flush) while we wait for its kick_sync to advance. + * + * Also, keep advancing our own kick_sync so that new kick_sync waits + * targeting us, which can start after we drop the lock, cannot form + * cyclic dependencies. + */ +retry: + waited = false; + for_each_cpu(cpu, rq->scx.cpus_to_sync) { + /* + * smp_load_acquire() pairs with smp_store_release() on + * kick_sync updates on the target CPUs. + */ + if (cpu == cpu_of(rq) || + smp_load_acquire(&cpu_rq(cpu)->scx.kick_sync) != ksyncs[cpu]) { + cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync); + continue; + } + + raw_spin_rq_unlock_irq(rq); + while (READ_ONCE(cpu_rq(cpu)->scx.kick_sync) == ksyncs[cpu]) { + smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); + cpu_relax(); + } + raw_spin_rq_lock_irq(rq); + waited = true; + } + + if (waited) + goto retry; +} + static struct task_struct *first_local_task(struct rq *rq) { return list_first_entry_or_null(&rq->scx.local_dsq.list, @@ -2457,10 +2521,10 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) bool keep_prev; struct task_struct *p; - /* see kick_cpus_irq_workfn() */ + /* see kick_sync_wait_bal_cb() */ smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); - rq->next_class = &ext_sched_class; + rq_modified_begin(rq, &ext_sched_class); rq_unpin_lock(rq, rf); balance_one(rq, prev); @@ -2468,6 +2532,17 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) maybe_queue_balance_callback(rq); /* + * Defer to a balance callback which can drop rq lock and enable + * IRQs. Waiting directly in the pick path would deadlock against + * CPUs sending us IPIs (e.g. TLB flushes) while we wait for them. + */ + if (unlikely(rq->scx.kick_sync_pending)) { + rq->scx.kick_sync_pending = false; + queue_balance_callback(rq, &rq->scx.kick_sync_bal_cb, + kick_sync_wait_bal_cb); + } + + /* * If any higher-priority sched class enqueued a runnable task on * this rq during balance_one(), abort and return RETRY_TASK, so * that the scheduler loop can restart. @@ -2475,7 +2550,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) * If @force_scx is true, always try to pick a SCHED_EXT task, * regardless of any higher-priority sched classes activity. */ - if (!force_scx && sched_class_above(rq->next_class, &ext_sched_class)) + if (!force_scx && rq_modified_above(rq, &ext_sched_class)) return RETRY_TASK; keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; @@ -2735,7 +2810,7 @@ static bool check_rq_for_timeouts(struct rq *rq) unsigned long last_runnable = p->scx.runnable_at; if (unlikely(time_after(jiffies, - last_runnable + scx_watchdog_timeout))) { + last_runnable + READ_ONCE(scx_watchdog_timeout)))) { u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, @@ -2763,7 +2838,7 @@ static void scx_watchdog_workfn(struct work_struct *work) cond_resched(); } queue_delayed_work(system_unbound_wq, to_delayed_work(work), - scx_watchdog_timeout / 2); + READ_ONCE(scx_watchdog_timeout) / 2); } void scx_tick(struct rq *rq) @@ -2959,6 +3034,8 @@ static void scx_disable_task(struct task_struct *p) lockdep_assert_rq_held(rq); WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); + clear_direct_dispatch(p); + if (SCX_HAS_OP(sch, disable)) SCX_CALL_OP_TASK(sch, SCX_KF_REST, disable, rq, p); scx_set_task_state(p, SCX_TASK_READY); @@ -3585,7 +3662,6 @@ static int scx_cgroup_init(struct scx_sched *sch) ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL, css->cgroup, &args); if (ret) { - css_put(css); scx_error(sch, "ops.cgroup_init() failed (%d)", ret); return ret; } @@ -3708,7 +3784,9 @@ static void scx_kobj_release(struct kobject *kobj) static ssize_t scx_attr_ops_show(struct kobject *kobj, struct kobj_attribute *ka, char *buf) { - return sysfs_emit(buf, "%s\n", scx_root->ops.name); + struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); + + return sysfs_emit(buf, "%s\n", sch->ops.name); } SCX_ATTR(ops); @@ -3752,7 +3830,9 @@ static const struct kobj_type scx_ktype = { static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) { - return add_uevent_var(env, "SCXOPS=%s", scx_root->ops.name); + const struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); + + return add_uevent_var(env, "SCXOPS=%s", sch->ops.name); } static const struct kset_uevent_ops scx_uevent_ops = { @@ -3901,8 +3981,8 @@ static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq, * consider offloading iff the total queued duration is over the * threshold. */ - min_delta_us = scx_bypass_lb_intv_us / SCX_BYPASS_LB_MIN_DELTA_DIV; - if (delta < DIV_ROUND_UP(min_delta_us, scx_slice_bypass_us)) + min_delta_us = READ_ONCE(scx_bypass_lb_intv_us) / SCX_BYPASS_LB_MIN_DELTA_DIV; + if (delta < DIV_ROUND_UP(min_delta_us, READ_ONCE(scx_slice_bypass_us))) return 0; raw_spin_rq_lock_irq(rq); @@ -4130,7 +4210,7 @@ static void scx_bypass(bool bypass) WARN_ON_ONCE(scx_bypass_depth <= 0); if (scx_bypass_depth != 1) goto unlock; - WRITE_ONCE(scx_slice_dfl, scx_slice_bypass_us * NSEC_PER_USEC); + WRITE_ONCE(scx_slice_dfl, READ_ONCE(scx_slice_bypass_us) * NSEC_PER_USEC); bypass_timestamp = ktime_get_ns(); if (sch) scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1); @@ -4423,10 +4503,19 @@ done: scx_bypass(false); } +/* + * Claim the exit on @sch. The caller must ensure that the helper kthread work + * is kicked before the current task can be preempted. Once exit_kind is + * claimed, scx_error() can no longer trigger, so if the current task gets + * preempted and the BPF scheduler fails to schedule it back, the helper work + * will never be kicked and the whole system can wedge. + */ static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind) { int none = SCX_EXIT_NONE; + lockdep_assert_preemption_disabled(); + if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind)) return false; @@ -4449,6 +4538,7 @@ static void scx_disable(enum scx_exit_kind kind) rcu_read_lock(); sch = rcu_dereference(scx_root); if (sch) { + guard(preempt)(); scx_claim_exit(sch, kind); kthread_queue_work(sch->helper, &sch->disable_work); } @@ -4697,6 +4787,9 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) if (!cpumask_empty(rq->scx.cpus_to_wait)) dump_line(&ns, " cpus_to_wait : %*pb", cpumask_pr_args(rq->scx.cpus_to_wait)); + if (!cpumask_empty(rq->scx.cpus_to_sync)) + dump_line(&ns, " cpus_to_sync : %*pb", + cpumask_pr_args(rq->scx.cpus_to_sync)); used = seq_buf_used(&ns); if (SCX_HAS_OP(sch, dump_cpu)) { @@ -4771,6 +4864,8 @@ static bool scx_vexit(struct scx_sched *sch, { struct scx_exit_info *ei = sch->exit_info; + guard(preempt)(); + if (!scx_claim_exit(sch, kind)) return false; @@ -4955,20 +5050,30 @@ static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops) return 0; } -static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) +/* + * scx_enable() is offloaded to a dedicated system-wide RT kthread to avoid + * starvation. During the READY -> ENABLED task switching loop, the calling + * thread's sched_class gets switched from fair to ext. As fair has higher + * priority than ext, the calling thread can be indefinitely starved under + * fair-class saturation, leading to a system hang. + */ +struct scx_enable_cmd { + struct kthread_work work; + struct sched_ext_ops *ops; + int ret; +}; + +static void scx_enable_workfn(struct kthread_work *work) { + struct scx_enable_cmd *cmd = + container_of(work, struct scx_enable_cmd, work); + struct sched_ext_ops *ops = cmd->ops; struct scx_sched *sch; struct scx_task_iter sti; struct task_struct *p; unsigned long timeout; int i, cpu, ret; - if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), - cpu_possible_mask)) { - pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n"); - return -EINVAL; - } - mutex_lock(&scx_enable_mutex); if (scx_enable_state() != SCX_DISABLED) { @@ -5060,7 +5165,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) WRITE_ONCE(scx_watchdog_timeout, timeout); WRITE_ONCE(scx_watchdog_timestamp, jiffies); queue_delayed_work(system_unbound_wq, &scx_watchdog_work, - scx_watchdog_timeout / 2); + READ_ONCE(scx_watchdog_timeout) / 2); /* * Once __scx_enabled is set, %current can be switched to SCX anytime. @@ -5185,13 +5290,15 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) atomic_long_inc(&scx_enable_seq); - return 0; + cmd->ret = 0; + return; err_free_ksyncs: free_kick_syncs(); err_unlock: mutex_unlock(&scx_enable_mutex); - return ret; + cmd->ret = ret; + return; err_disable_unlock_all: scx_cgroup_unlock(); @@ -5210,7 +5317,42 @@ err_disable: */ scx_error(sch, "scx_enable() failed (%d)", ret); kthread_flush_work(&sch->disable_work); - return 0; + cmd->ret = 0; +} + +static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) +{ + static struct kthread_worker *helper; + static DEFINE_MUTEX(helper_mutex); + struct scx_enable_cmd cmd; + + if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), + cpu_possible_mask)) { + pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n"); + return -EINVAL; + } + + if (!READ_ONCE(helper)) { + mutex_lock(&helper_mutex); + if (!helper) { + struct kthread_worker *w = + kthread_run_worker(0, "scx_enable_helper"); + if (IS_ERR_OR_NULL(w)) { + mutex_unlock(&helper_mutex); + return -ENOMEM; + } + sched_set_fifo(w->task); + WRITE_ONCE(helper, w); + } + mutex_unlock(&helper_mutex); + } + + kthread_init_work(&cmd.work, scx_enable_workfn); + cmd.ops = ops; + + kthread_queue_work(READ_ONCE(helper), &cmd.work); + kthread_flush_work(&cmd.work); + return cmd.ret; } @@ -5545,11 +5687,11 @@ static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs) if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) { if (cur_class == &ext_sched_class) { + cpumask_set_cpu(cpu, this_scx->cpus_to_sync); ksyncs[cpu] = rq->scx.kick_sync; should_wait = true; - } else { - cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); } + cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); } resched_curr(rq); @@ -5604,27 +5746,15 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); } - if (!should_wait) - return; - - for_each_cpu(cpu, this_scx->cpus_to_wait) { - unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync; - - /* - * Busy-wait until the task running at the time of kicking is no - * longer running. This can be used to implement e.g. core - * scheduling. - * - * smp_cond_load_acquire() pairs with store_releases in - * pick_task_scx() and put_prev_task_scx(). The former breaks - * the wait if SCX's scheduling path is entered even if the same - * task is picked subsequently. The latter is necessary to break - * the wait when $cpu is taken by a higher sched class. - */ - if (cpu != cpu_of(this_rq)) - smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]); - - cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); + /* + * Can't wait in hardirq — kick_sync can't advance, deadlocking if + * CPUs wait for each other. Defer to kick_sync_wait_bal_cb(). + */ + if (should_wait) { + raw_spin_rq_lock(this_rq); + this_scx->kick_sync_pending = true; + resched_curr(this_rq); + raw_spin_rq_unlock(this_rq); } } @@ -5729,6 +5859,7 @@ void __init init_sched_ext_class(void) BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_sync, GFP_KERNEL, n)); rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn); rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn); diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index c5a3b0bac7c3..44c3a50c542c 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -543,7 +543,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, * piled up on it even if there is an idle core elsewhere on * the system. */ - waker_node = cpu_to_node(cpu); + waker_node = scx_cpu_node_if_enabled(cpu); if (!(current->flags & PF_EXITING) && cpu_rq(cpu)->scx.local_dsq.nr == 0 && (!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) && @@ -663,9 +663,8 @@ void scx_idle_init_masks(void) BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL)); BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.smt, GFP_KERNEL)); - /* Allocate per-node idle cpumasks */ - scx_idle_node_masks = kzalloc_objs(*scx_idle_node_masks, - num_possible_nodes()); + /* Allocate per-node idle cpumasks (use nr_node_ids for non-contiguous NUMA nodes) */ + scx_idle_node_masks = kzalloc_objs(*scx_idle_node_masks, nr_node_ids); BUG_ON(!scx_idle_node_masks); for_each_node(i) { @@ -861,25 +860,32 @@ static bool check_builtin_idle_enabled(struct scx_sched *sch) * code. * * We can't simply check whether @p->migration_disabled is set in a - * sched_ext callback, because migration is always disabled for the current - * task while running BPF code. + * sched_ext callback, because the BPF prolog (__bpf_prog_enter) may disable + * migration for the current task while running BPF code. * - * The prolog (__bpf_prog_enter) and epilog (__bpf_prog_exit) respectively - * disable and re-enable migration. For this reason, the current task - * inside a sched_ext callback is always a migration-disabled task. + * Since the BPF prolog calls migrate_disable() only when CONFIG_PREEMPT_RCU + * is enabled (via rcu_read_lock_dont_migrate()), migration_disabled == 1 for + * the current task is ambiguous only in that case: it could be from the BPF + * prolog rather than a real migrate_disable() call. * - * Therefore, when @p->migration_disabled == 1, check whether @p is the - * current task or not: if it is, then migration was not disabled before - * entering the callback, otherwise migration was disabled. + * Without CONFIG_PREEMPT_RCU, the BPF prolog never calls migrate_disable(), + * so migration_disabled == 1 always means the task is truly + * migration-disabled. + * + * Therefore, when migration_disabled == 1 and CONFIG_PREEMPT_RCU is enabled, + * check whether @p is the current task or not: if it is, then migration was + * not disabled before entering the callback, otherwise migration was disabled. * * Returns true if @p is migration-disabled, false otherwise. */ static bool is_bpf_migration_disabled(const struct task_struct *p) { - if (p->migration_disabled == 1) - return p != current; - else - return p->migration_disabled; + if (p->migration_disabled == 1) { + if (IS_ENABLED(CONFIG_PREEMPT_RCU)) + return p != current; + return true; + } + return p->migration_disabled; } static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h index 386c677e4c9a..00b450597f3e 100644 --- a/kernel/sched/ext_internal.h +++ b/kernel/sched/ext_internal.h @@ -74,7 +74,7 @@ enum scx_exit_flags { * info communication. The following flag indicates whether ops.init() * finished successfully. */ - SCX_EFLAG_INITIALIZED, + SCX_EFLAG_INITIALIZED = 1LLU << 0, }; /* @@ -1035,26 +1035,108 @@ static const char *scx_enable_state_str[] = { }; /* - * sched_ext_entity->ops_state + * Task Ownership State Machine (sched_ext_entity->ops_state) * - * Used to track the task ownership between the SCX core and the BPF scheduler. - * State transitions look as follows: + * The sched_ext core uses this state machine to track task ownership + * between the SCX core and the BPF scheduler. This allows the BPF + * scheduler to dispatch tasks without strict ordering requirements, while + * the SCX core safely rejects invalid dispatches. * - * NONE -> QUEUEING -> QUEUED -> DISPATCHING - * ^ | | - * | v v - * \-------------------------------/ + * State Transitions * - * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call - * sites for explanations on the conditions being waited upon and why they are - * safe. Transitions out of them into NONE or QUEUED must store_release and the - * waiters should load_acquire. + * .------------> NONE (owned by SCX core) + * | | ^ + * | enqueue | | direct dispatch + * | v | + * | QUEUEING -------' + * | | + * | enqueue | + * | completes | + * | v + * | QUEUED (owned by BPF scheduler) + * | | + * | dispatch | + * | | + * | v + * | DISPATCHING + * | | + * | dispatch | + * | completes | + * `---------------' * - * Tracking scx_ops_state enables sched_ext core to reliably determine whether - * any given task can be dispatched by the BPF scheduler at all times and thus - * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler - * to try to dispatch any task anytime regardless of its state as the SCX core - * can safely reject invalid dispatches. + * State Descriptions + * + * - %SCX_OPSS_NONE: + * Task is owned by the SCX core. It's either on a run queue, running, + * or being manipulated by the core scheduler. The BPF scheduler has no + * claim on this task. + * + * - %SCX_OPSS_QUEUEING: + * Transitional state while transferring a task from the SCX core to + * the BPF scheduler. The task's rq lock is held during this state. + * Since QUEUEING is both entered and exited under the rq lock, dequeue + * can never observe this state (it would be a BUG). When finishing a + * dispatch, if the task is still in %SCX_OPSS_QUEUEING the completion + * path busy-waits for it to leave this state (via wait_ops_state()) + * before retrying. + * + * - %SCX_OPSS_QUEUED: + * Task is owned by the BPF scheduler. It's on a DSQ (dispatch queue) + * and the BPF scheduler is responsible for dispatching it. A QSEQ + * (queue sequence number) is embedded in this state to detect + * dispatch/dequeue races: if a task is dequeued and re-enqueued, the + * QSEQ changes and any in-flight dispatch operations targeting the old + * QSEQ are safely ignored. + * + * - %SCX_OPSS_DISPATCHING: + * Transitional state while transferring a task from the BPF scheduler + * back to the SCX core. This state indicates the BPF scheduler has + * selected the task for execution. When dequeue needs to take the task + * off a DSQ and it is still in %SCX_OPSS_DISPATCHING, the dequeue path + * busy-waits for it to leave this state (via wait_ops_state()) before + * proceeding. Exits to %SCX_OPSS_NONE when dispatch completes. + * + * Memory Ordering + * + * Transitions out of %SCX_OPSS_QUEUEING and %SCX_OPSS_DISPATCHING into + * %SCX_OPSS_NONE or %SCX_OPSS_QUEUED must use atomic_long_set_release() + * and waiters must use atomic_long_read_acquire(). This ensures proper + * synchronization between concurrent operations. + * + * Cross-CPU Task Migration + * + * When moving a task in the %SCX_OPSS_DISPATCHING state, we can't simply + * grab the target CPU's rq lock because a concurrent dequeue might be + * waiting on %SCX_OPSS_DISPATCHING while holding the source rq lock + * (deadlock). + * + * The sched_ext core uses a "lock dancing" protocol coordinated by + * p->scx.holding_cpu. When moving a task to a different rq: + * + * 1. Verify task can be moved (CPU affinity, migration_disabled, etc.) + * 2. Set p->scx.holding_cpu to the current CPU + * 3. Set task state to %SCX_OPSS_NONE; dequeue waits while DISPATCHING + * is set, so clearing DISPATCHING first prevents the circular wait + * (safe to lock the rq we need) + * 4. Unlock the current CPU's rq + * 5. Lock src_rq (where the task currently lives) + * 6. Verify p->scx.holding_cpu == current CPU, if not, dequeue won the + * race (dequeue clears holding_cpu to -1 when it takes the task), in + * this case migration is aborted + * 7. If src_rq == dst_rq: clear holding_cpu and enqueue directly + * into dst_rq's local DSQ (no lock swap needed) + * 8. Otherwise: call move_remote_task_to_local_dsq(), which releases + * src_rq, locks dst_rq, and performs the deactivate/activate + * migration cycle (dst_rq is held on return) + * 9. Unlock dst_rq and re-lock the current CPU's rq to restore + * the lock state expected by the caller + * + * If any verification fails, abort the migration. + * + * This state tracking allows the BPF scheduler to try to dispatch any task + * at any time regardless of its state. The SCX core can safely + * reject/ignore invalid dispatches, simplifying the BPF scheduler + * implementation. */ enum scx_ops_state { SCX_OPSS_NONE, /* owned by the SCX core */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index eea99ec01a3f..ab4114712be7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -589,6 +589,21 @@ static inline bool entity_before(const struct sched_entity *a, return vruntime_cmp(a->deadline, "<", b->deadline); } +/* + * Per avg_vruntime() below, cfs_rq::zero_vruntime is only slightly stale + * and this value should be no more than two lag bounds. Which puts it in the + * general order of: + * + * (slice + TICK_NSEC) << NICE_0_LOAD_SHIFT + * + * which is around 44 bits in size (on 64bit); that is 20 for + * NICE_0_LOAD_SHIFT, another 20 for NSEC_PER_MSEC and then a handful for + * however many msec the actual slice+tick ends up begin. + * + * (disregarding the actual divide-by-weight part makes for the worst case + * weight of 2, which nicely cancels vs the fuzz in zero_vruntime not actually + * being the zero-lag point). + */ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) { return vruntime_op(se->vruntime, "-", cfs_rq->zero_vruntime); @@ -676,41 +691,65 @@ sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) } static inline -void sum_w_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) +void update_zero_vruntime(struct cfs_rq *cfs_rq, s64 delta) { /* - * v' = v + d ==> sum_w_vruntime' = sum_runtime - d*sum_weight + * v' = v + d ==> sum_w_vruntime' = sum_w_vruntime - d*sum_weight */ cfs_rq->sum_w_vruntime -= cfs_rq->sum_weight * delta; + cfs_rq->zero_vruntime += delta; } /* - * Specifically: avg_runtime() + 0 must result in entity_eligible() := true + * Specifically: avg_vruntime() + 0 must result in entity_eligible() := true * For this to be so, the result of this function must have a left bias. + * + * Called in: + * - place_entity() -- before enqueue + * - update_entity_lag() -- before dequeue + * - update_deadline() -- slice expiration + * + * This means it is one entry 'behind' but that puts it close enough to where + * the bound on entity_key() is at most two lag bounds. */ u64 avg_vruntime(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; - s64 avg = cfs_rq->sum_w_vruntime; - long load = cfs_rq->sum_weight; + long weight = cfs_rq->sum_weight; + s64 delta = 0; - if (curr && curr->on_rq) { - unsigned long weight = scale_load_down(curr->load.weight); + if (curr && !curr->on_rq) + curr = NULL; - avg += entity_key(cfs_rq, curr) * weight; - load += weight; - } + if (weight) { + s64 runtime = cfs_rq->sum_w_vruntime; + + if (curr) { + unsigned long w = scale_load_down(curr->load.weight); + + runtime += entity_key(cfs_rq, curr) * w; + weight += w; + } - if (load) { /* sign flips effective floor / ceiling */ - if (avg < 0) - avg -= (load - 1); - avg = div_s64(avg, load); + if (runtime < 0) + runtime -= (weight - 1); + + delta = div_s64(runtime, weight); + } else if (curr) { + /* + * When there is but one element, it is the average. + */ + delta = curr->vruntime - cfs_rq->zero_vruntime; } - return cfs_rq->zero_vruntime + avg; + update_zero_vruntime(cfs_rq, delta); + + return cfs_rq->zero_vruntime; } +static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq); + /* * lag_i = S - s_i = w_i * (V - v_i) * @@ -724,17 +763,16 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) * EEVDF gives the following limit for a steady state system: * * -r_max < lag < max(r_max, q) - * - * XXX could add max_slice to the augmented data to track this. */ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) { + u64 max_slice = cfs_rq_max_slice(cfs_rq) + TICK_NSEC; s64 vlag, limit; WARN_ON_ONCE(!se->on_rq); vlag = avg_vruntime(cfs_rq) - se->vruntime; - limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); + limit = calc_delta_fair(max_slice, se); se->vlag = clamp(vlag, -limit, limit); } @@ -777,16 +815,6 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) return vruntime_eligible(cfs_rq, se->vruntime); } -static void update_zero_vruntime(struct cfs_rq *cfs_rq) -{ - u64 vruntime = avg_vruntime(cfs_rq); - s64 delta = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime); - - sum_w_vruntime_update(cfs_rq, delta); - - cfs_rq->zero_vruntime = vruntime; -} - static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq) { struct sched_entity *root = __pick_root_entity(cfs_rq); @@ -802,6 +830,21 @@ static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq) return min_slice; } +static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq) +{ + struct sched_entity *root = __pick_root_entity(cfs_rq); + struct sched_entity *curr = cfs_rq->curr; + u64 max_slice = 0ULL; + + if (curr && curr->on_rq) + max_slice = curr->slice; + + if (root) + max_slice = max(max_slice, root->max_slice); + + return max_slice; +} + static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) { return entity_before(__node_2_se(a), __node_2_se(b)); @@ -826,6 +869,15 @@ static inline void __min_slice_update(struct sched_entity *se, struct rb_node *n } } +static inline void __max_slice_update(struct sched_entity *se, struct rb_node *node) +{ + if (node) { + struct sched_entity *rse = __node_2_se(node); + if (rse->max_slice > se->max_slice) + se->max_slice = rse->max_slice; + } +} + /* * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime) */ @@ -833,6 +885,7 @@ static inline bool min_vruntime_update(struct sched_entity *se, bool exit) { u64 old_min_vruntime = se->min_vruntime; u64 old_min_slice = se->min_slice; + u64 old_max_slice = se->max_slice; struct rb_node *node = &se->run_node; se->min_vruntime = se->vruntime; @@ -843,8 +896,13 @@ static inline bool min_vruntime_update(struct sched_entity *se, bool exit) __min_slice_update(se, node->rb_right); __min_slice_update(se, node->rb_left); + se->max_slice = se->slice; + __max_slice_update(se, node->rb_right); + __max_slice_update(se, node->rb_left); + return se->min_vruntime == old_min_vruntime && - se->min_slice == old_min_slice; + se->min_slice == old_min_slice && + se->max_slice == old_max_slice; } RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, @@ -856,7 +914,6 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { sum_w_vruntime_add(cfs_rq, se); - update_zero_vruntime(cfs_rq); se->min_vruntime = se->vruntime; se->min_slice = se->slice; rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, @@ -868,7 +925,6 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, &min_vruntime_cb); sum_w_vruntime_sub(cfs_rq, se); - update_zero_vruntime(cfs_rq); } struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq) @@ -1075,6 +1131,7 @@ static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) * EEVDF: vd_i = ve_i + r_i / w_i */ se->deadline = se->vruntime + calc_delta_fair(se->slice, se); + avg_vruntime(cfs_rq); /* * The task has consumed its request, reschedule. @@ -3790,6 +3847,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { bool curr = cfs_rq->curr == se; + bool rel_vprot = false; + u64 vprot; if (se->on_rq) { /* commit outstanding execution time */ @@ -3797,6 +3856,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, update_entity_lag(cfs_rq, se); se->deadline -= se->vruntime; se->rel_deadline = 1; + if (curr && protect_slice(se)) { + vprot = se->vprot - se->vruntime; + rel_vprot = true; + } + cfs_rq->nr_queued--; if (!curr) __dequeue_entity(cfs_rq, se); @@ -3812,6 +3876,9 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, if (se->rel_deadline) se->deadline = div_s64(se->deadline * se->load.weight, weight); + if (rel_vprot) + vprot = div_s64(vprot * se->load.weight, weight); + update_load_set(&se->load, weight); do { @@ -3823,6 +3890,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, enqueue_load_avg(cfs_rq, se); if (se->on_rq) { place_entity(cfs_rq, se, 0); + if (rel_vprot) + se->vprot = se->vruntime + vprot; update_load_add(&cfs_rq->load, se->load.weight); if (!curr) __enqueue_entity(cfs_rq, se); @@ -5420,7 +5489,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) } static void -set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, bool first) { clear_buddies(cfs_rq, se); @@ -5435,7 +5504,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) __dequeue_entity(cfs_rq, se); update_load_avg(cfs_rq, se, UPDATE_TG); - set_protect_slice(cfs_rq, se); + if (first) + set_protect_slice(cfs_rq, se); } update_stats_curr_start(cfs_rq, se); @@ -8948,13 +9018,13 @@ again: pse = parent_entity(pse); } if (se_depth >= pse_depth) { - set_next_entity(cfs_rq_of(se), se); + set_next_entity(cfs_rq_of(se), se, true); se = parent_entity(se); } } put_prev_entity(cfs_rq, pse); - set_next_entity(cfs_rq, se); + set_next_entity(cfs_rq, se, true); __set_next_task_fair(rq, p, true); } @@ -9054,7 +9124,7 @@ static void yield_task_fair(struct rq *rq) */ if (entity_eligible(cfs_rq, se)) { se->vruntime = se->deadline; - se->deadline += calc_delta_fair(se->slice, se); + update_deadline(cfs_rq, se); } } @@ -12908,7 +12978,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) t0 = sched_clock_cpu(this_cpu); __sched_balance_update_blocked_averages(this_rq); - this_rq->next_class = &fair_sched_class; + rq_modified_begin(this_rq, &fair_sched_class); raw_spin_rq_unlock(this_rq); for_each_domain(this_cpu, sd) { @@ -12975,7 +13045,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) pulled_task = 1; /* If a higher prio class was modified, restart the pick */ - if (sched_class_above(this_rq->next_class, &fair_sched_class)) + if (rq_modified_above(this_rq, &fair_sched_class)) pulled_task = -1; out: @@ -13568,7 +13638,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - set_next_entity(cfs_rq, se); + set_next_entity(cfs_rq, se, first); /* ensure bandwidth has been allocated on our new cfs_rq */ account_cfs_rq_runtime(cfs_rq, 0); } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 3681b6ad9276..a83be0c834dd 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -161,6 +161,14 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, return cpuidle_enter(drv, dev, next_state); } +static void idle_call_stop_or_retain_tick(bool stop_tick) +{ + if (stop_tick || tick_nohz_tick_stopped()) + tick_nohz_idle_stop_tick(); + else + tick_nohz_idle_retain_tick(); +} + /** * cpuidle_idle_call - the main idle function * @@ -170,7 +178,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, * set, and it returns with polling set. If it ever stops polling, it * must clear the polling bit. */ -static void cpuidle_idle_call(void) +static void cpuidle_idle_call(bool stop_tick) { struct cpuidle_device *dev = cpuidle_get_device(); struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); @@ -186,7 +194,7 @@ static void cpuidle_idle_call(void) } if (cpuidle_not_available(drv, dev)) { - tick_nohz_idle_stop_tick(); + idle_call_stop_or_retain_tick(stop_tick); default_idle_call(); goto exit_idle; @@ -221,24 +229,35 @@ static void cpuidle_idle_call(void) next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns); call_cpuidle(drv, dev, next_state); - } else { - bool stop_tick = true; + } else if (drv->state_count > 1) { + /* + * stop_tick is expected to be true by default by cpuidle + * governors, which allows them to select idle states with + * target residency above the tick period length. + */ + stop_tick = true; /* * Ask the cpuidle framework to choose a convenient idle state. */ next_state = cpuidle_select(drv, dev, &stop_tick); - if (stop_tick || tick_nohz_tick_stopped()) - tick_nohz_idle_stop_tick(); - else - tick_nohz_idle_retain_tick(); + idle_call_stop_or_retain_tick(stop_tick); entered_state = call_cpuidle(drv, dev, next_state); /* * Give the governor an opportunity to reflect on the outcome */ cpuidle_reflect(dev, entered_state); + } else { + idle_call_stop_or_retain_tick(stop_tick); + + /* + * If there is only a single idle state (or none), there is + * nothing meaningful for the governor to choose. Skip the + * governor and always use state 0. + */ + call_cpuidle(drv, dev, 0); } exit_idle: @@ -259,6 +278,7 @@ exit_idle: static void do_idle(void) { int cpu = smp_processor_id(); + bool got_tick = false; /* * Check if we need to update blocked load @@ -329,8 +349,9 @@ static void do_idle(void) tick_nohz_idle_restart_tick(); cpu_idle_poll(); } else { - cpuidle_idle_call(); + cpuidle_idle_call(got_tick); } + got_tick = tick_nohz_idle_got_tick(); arch_cpu_idle_exit(); } diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 3b725d39c06e..ef152d401fe2 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -123,8 +123,6 @@ int housekeeping_update(struct cpumask *isol_mask) struct cpumask *trial, *old = NULL; int err; - lockdep_assert_cpus_held(); - trial = kmalloc(cpumask_size(), GFP_KERNEL); if (!trial) return -ENOMEM; @@ -136,7 +134,7 @@ int housekeeping_update(struct cpumask *isol_mask) } if (!housekeeping.flags) - static_branch_enable_cpuslocked(&housekeeping_overridden); + static_branch_enable(&housekeeping_overridden); if (housekeeping.flags & HK_FLAG_DOMAIN) old = housekeeping_cpumask_dereference(HK_TYPE_DOMAIN); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b82fb70a9d54..1ef9ba480f51 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -805,9 +805,12 @@ struct scx_rq { cpumask_var_t cpus_to_kick_if_idle; cpumask_var_t cpus_to_preempt; cpumask_var_t cpus_to_wait; + cpumask_var_t cpus_to_sync; + bool kick_sync_pending; unsigned long kick_sync; local_t reenq_local_deferred; struct balance_callback deferred_bal_cb; + struct balance_callback kick_sync_bal_cb; struct irq_work deferred_irq_work; struct irq_work kick_cpus_irq_work; struct scx_dispatch_q bypass_dsq; @@ -2748,6 +2751,17 @@ static inline const struct sched_class *next_active_class(const struct sched_cla #define sched_class_above(_a, _b) ((_a) < (_b)) +static inline void rq_modified_begin(struct rq *rq, const struct sched_class *class) +{ + if (sched_class_above(rq->next_class, class)) + rq->next_class = class; +} + +static inline bool rq_modified_above(struct rq *rq, const struct sched_class *class) +{ + return sched_class_above(rq->next_class, class); +} + static inline bool sched_stop_runnable(struct rq *rq) { return rq->stop && task_on_rq_queued(rq->stop); diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 6f10db3646e7..cadb0e9fe19b 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -284,6 +284,35 @@ static bool check_same_owner(struct task_struct *p) uid_eq(cred->euid, pcred->uid)); } +#ifdef CONFIG_RT_MUTEXES +static inline void __setscheduler_dl_pi(int newprio, int policy, + struct task_struct *p, + struct sched_change_ctx *scope) +{ + /* + * In case a DEADLINE task (either proper or boosted) gets + * setscheduled to a lower priority class, check if it neeeds to + * inherit parameters from a potential pi_task. In that case make + * sure replenishment happens with the next enqueue. + */ + + if (dl_prio(newprio) && !dl_policy(policy)) { + struct task_struct *pi_task = rt_mutex_get_top_task(p); + + if (pi_task) { + p->dl.pi_se = pi_task->dl.pi_se; + scope->flags |= ENQUEUE_REPLENISH; + } + } +} +#else /* !CONFIG_RT_MUTEXES */ +static inline void __setscheduler_dl_pi(int newprio, int policy, + struct task_struct *p, + struct sched_change_ctx *scope) +{ +} +#endif /* !CONFIG_RT_MUTEXES */ + #ifdef CONFIG_UCLAMP_TASK static int uclamp_validate(struct task_struct *p, @@ -655,6 +684,7 @@ change: __setscheduler_params(p, attr); p->sched_class = next_class; p->prio = newprio; + __setscheduler_dl_pi(newprio, policy, p, scope); } __setscheduler_uclamp(p, attr); |
