diff options
Diffstat (limited to 'kernel/sched')
| -rw-r--r-- | kernel/sched/core.c | 432 | ||||
| -rw-r--r-- | kernel/sched/cpufreq_schedutil.c | 5 | ||||
| -rw-r--r-- | kernel/sched/deadline.c | 96 | ||||
| -rw-r--r-- | kernel/sched/debug.c | 18 | ||||
| -rw-r--r-- | kernel/sched/ext.c | 4105 | ||||
| -rw-r--r-- | kernel/sched/ext.h | 4 | ||||
| -rw-r--r-- | kernel/sched/ext_idle.c | 232 | ||||
| -rw-r--r-- | kernel/sched/ext_idle.h | 2 | ||||
| -rw-r--r-- | kernel/sched/ext_internal.h | 344 | ||||
| -rw-r--r-- | kernel/sched/fair.c | 578 | ||||
| -rw-r--r-- | kernel/sched/features.h | 8 | ||||
| -rw-r--r-- | kernel/sched/idle.c | 30 | ||||
| -rw-r--r-- | kernel/sched/rt.c | 64 | ||||
| -rw-r--r-- | kernel/sched/sched.h | 106 | ||||
| -rw-r--r-- | kernel/sched/syscalls.c | 16 | ||||
| -rw-r--r-- | kernel/sched/topology.c | 279 |
16 files changed, 4764 insertions, 1555 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 496dff740dca..da20fb6ea25a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -122,6 +122,11 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_entry_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_exit_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_set_need_resched_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_dl_throttle_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_dl_replenish_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_dl_update_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_dl_server_start_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_dl_server_stop_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); DEFINE_PER_CPU(struct rnd_state, sched_rnd_state); @@ -687,11 +692,6 @@ bool raw_spin_rq_trylock(struct rq *rq) } } -void raw_spin_rq_unlock(struct rq *rq) -{ - raw_spin_unlock(rq_lockp(rq)); -} - /* * double_rq_lock - safely lock two runqueues */ @@ -872,7 +872,14 @@ void update_rq_clock(struct rq *rq) * Use HR-timers to deliver accurate preemption points. */ -static void hrtick_clear(struct rq *rq) +enum { + HRTICK_SCHED_NONE = 0, + HRTICK_SCHED_DEFER = BIT(1), + HRTICK_SCHED_START = BIT(2), + HRTICK_SCHED_REARM_HRTIMER = BIT(3) +}; + +static void __used hrtick_clear(struct rq *rq) { if (hrtimer_active(&rq->hrtick_timer)) hrtimer_cancel(&rq->hrtick_timer); @@ -897,12 +904,24 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) return HRTIMER_NORESTART; } -static void __hrtick_restart(struct rq *rq) +static inline bool hrtick_needs_rearm(struct hrtimer *timer, ktime_t expires) +{ + /* + * Queued is false when the timer is not started or currently + * running the callback. In both cases, restart. If queued check + * whether the expiry time actually changes substantially. 
+ */ + return !hrtimer_is_queued(timer) || + abs(expires - hrtimer_get_expires(timer)) > 5000; +} + +static void hrtick_cond_restart(struct rq *rq) { struct hrtimer *timer = &rq->hrtick_timer; ktime_t time = rq->hrtick_time; - hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD); + if (hrtick_needs_rearm(timer, time)) + hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD); } /* @@ -914,7 +933,7 @@ static void __hrtick_start(void *arg) struct rq_flags rf; rq_lock(rq, &rf); - __hrtick_restart(rq); + hrtick_cond_restart(rq); rq_unlock(rq, &rf); } @@ -925,7 +944,6 @@ static void __hrtick_start(void *arg) */ void hrtick_start(struct rq *rq, u64 delay) { - struct hrtimer *timer = &rq->hrtick_timer; s64 delta; /* @@ -933,27 +951,67 @@ void hrtick_start(struct rq *rq, u64 delay) * doesn't make sense and can cause timer DoS. */ delta = max_t(s64, delay, 10000LL); - rq->hrtick_time = ktime_add_ns(hrtimer_cb_get_time(timer), delta); + + /* + * If this is in the middle of schedule() only note the delay + * and let hrtick_schedule_exit() deal with it. 
+ */ + if (rq->hrtick_sched) { + rq->hrtick_sched |= HRTICK_SCHED_START; + rq->hrtick_delay = delta; + return; + } + + rq->hrtick_time = ktime_add_ns(ktime_get(), delta); + if (!hrtick_needs_rearm(&rq->hrtick_timer, rq->hrtick_time)) + return; if (rq == this_rq()) - __hrtick_restart(rq); + hrtimer_start(&rq->hrtick_timer, rq->hrtick_time, HRTIMER_MODE_ABS_PINNED_HARD); else smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); } -static void hrtick_rq_init(struct rq *rq) +static inline void hrtick_schedule_enter(struct rq *rq) { - INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); - hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); + rq->hrtick_sched = HRTICK_SCHED_DEFER; + if (hrtimer_test_and_clear_rearm_deferred()) + rq->hrtick_sched |= HRTICK_SCHED_REARM_HRTIMER; } -#else /* !CONFIG_SCHED_HRTICK: */ -static inline void hrtick_clear(struct rq *rq) + +static inline void hrtick_schedule_exit(struct rq *rq) { + if (rq->hrtick_sched & HRTICK_SCHED_START) { + rq->hrtick_time = ktime_add_ns(ktime_get(), rq->hrtick_delay); + hrtick_cond_restart(rq); + } else if (idle_rq(rq)) { + /* + * No need for using hrtimer_is_active(). The timer is CPU local + * and interrupts are disabled, so the callback cannot be + * running and the queued state is valid. 
+ */ + if (hrtimer_is_queued(&rq->hrtick_timer)) + hrtimer_cancel(&rq->hrtick_timer); + } + + if (rq->hrtick_sched & HRTICK_SCHED_REARM_HRTIMER) + __hrtimer_rearm_deferred(); + + rq->hrtick_sched = HRTICK_SCHED_NONE; } -static inline void hrtick_rq_init(struct rq *rq) +static void hrtick_rq_init(struct rq *rq) { + INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); + rq->hrtick_sched = HRTICK_SCHED_NONE; + hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_HARD | HRTIMER_MODE_LAZY_REARM); } +#else /* !CONFIG_SCHED_HRTICK: */ +static inline void hrtick_clear(struct rq *rq) { } +static inline void hrtick_rq_init(struct rq *rq) { } +static inline void hrtick_schedule_enter(struct rq *rq) { } +static inline void hrtick_schedule_exit(struct rq *rq) { } #endif /* !CONFIG_SCHED_HRTICK */ /* @@ -3847,6 +3905,8 @@ bool cpus_share_resources(int this_cpu, int that_cpu) static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) { + int this_cpu = smp_processor_id(); + /* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */ if (!scx_allow_ttwu_queue(p)) return false; @@ -3871,10 +3931,10 @@ static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) * If the CPU does not share cache, then queue the task on the * remote rqs wakelist to avoid accessing remote data. 
*/ - if (!cpus_share_cache(smp_processor_id(), cpu)) + if (!cpus_share_cache(this_cpu, cpu)) return true; - if (cpu == smp_processor_id()) + if (cpu == this_cpu) return false; /* @@ -4721,7 +4781,7 @@ int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); - return scx_fork(p); + return scx_fork(p, kargs); } void sched_cancel_fork(struct task_struct *p) @@ -4738,7 +4798,7 @@ void sched_post_fork(struct task_struct *p) scx_post_fork(p); } -unsigned long to_ratio(u64 period, u64 runtime) +u64 to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) return BW_UNIT; @@ -4913,6 +4973,34 @@ static inline void finish_task(struct task_struct *prev) smp_store_release(&prev->on_cpu, 0); } +/* + * Only called from __schedule context + * + * There are some cases where we are going to re-do the action + * that added the balance callbacks. We may not be in a state + * where we can run them, so just zap them so they can be + * properly re-added on the next time around. This is similar + * handling to running the callbacks, except we just don't call + * them. + */ +static void zap_balance_callbacks(struct rq *rq) +{ + struct balance_callback *next, *head; + bool found = false; + + lockdep_assert_rq_held(rq); + + head = rq->balance_callback; + while (head) { + if (head == &balance_push_callback) + found = true; + next = head->next; + head->next = NULL; + head = next; + } + rq->balance_callback = found ? 
&balance_push_callback : NULL; +} + static void do_balance_callbacks(struct rq *rq, struct balance_callback *head) { void (*func)(struct rq *rq); @@ -5032,6 +5120,7 @@ static inline void finish_lock_switch(struct rq *rq) */ spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP_); __balance_callbacks(rq, NULL); + hrtick_schedule_exit(rq); raw_spin_rq_unlock_irq(rq); } @@ -5681,7 +5770,7 @@ static void sched_tick_remote(struct work_struct *work) os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); if (os == TICK_SCHED_REMOTE_RUNNING) - queue_delayed_work(system_unbound_wq, dwork, HZ); + queue_delayed_work(system_dfl_wq, dwork, HZ); } static void sched_tick_start(int cpu) @@ -5700,7 +5789,7 @@ static void sched_tick_start(int cpu) if (os == TICK_SCHED_REMOTE_OFFLINE) { twork->cpu = cpu; INIT_DELAYED_WORK(&twork->work, sched_tick_remote); - queue_delayed_work(system_unbound_wq, &twork->work, HZ); + queue_delayed_work(system_dfl_wq, &twork->work, HZ); } } @@ -6498,6 +6587,8 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p, if (signal_pending_state(task_state, p)) { WRITE_ONCE(p->__state, TASK_RUNNING); *task_state_p = TASK_RUNNING; + set_task_blocked_on_waking(p, NULL); + return false; } @@ -6535,6 +6626,21 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p, } #ifdef CONFIG_SCHED_PROXY_EXEC +static inline void proxy_set_task_cpu(struct task_struct *p, int cpu) +{ + unsigned int wake_cpu; + + /* + * Since we are enqueuing a blocked task on a cpu it may + * not be able to run on, preserve wake_cpu when we + * __set_task_cpu so we can return the task to where it + * was previously runnable. 
+ */ + wake_cpu = p->wake_cpu; + __set_task_cpu(p, cpu); + p->wake_cpu = wake_cpu; +} + static inline struct task_struct *proxy_resched_idle(struct rq *rq) { put_prev_set_next_task(rq, rq->donor, rq->idle); @@ -6543,7 +6649,7 @@ static inline struct task_struct *proxy_resched_idle(struct rq *rq) return rq->idle; } -static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor) +static bool proxy_deactivate(struct rq *rq, struct task_struct *donor) { unsigned long state = READ_ONCE(donor->__state); @@ -6563,17 +6669,140 @@ static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor) return try_to_block_task(rq, donor, &state, true); } -static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *donor) +static inline void proxy_release_rq_lock(struct rq *rq, struct rq_flags *rf) + __releases(__rq_lockp(rq)) { - if (!__proxy_deactivate(rq, donor)) { + /* + * The class scheduler may have queued a balance callback + * from pick_next_task() called earlier. + * + * So here we have to zap callbacks before unlocking the rq + * as another CPU may jump in and call sched_balance_rq + * which can trip the warning in rq_pin_lock() if we + * leave callbacks set. + * + * After we later reaquire the rq lock, we will force __schedule() + * to pick_again, so the callbacks will get re-established. + */ + zap_balance_callbacks(rq); + rq_unpin_lock(rq, rf); + raw_spin_rq_unlock(rq); +} + +static inline void proxy_reacquire_rq_lock(struct rq *rq, struct rq_flags *rf) + __acquires(__rq_lockp(rq)) +{ + raw_spin_rq_lock(rq); + rq_repin_lock(rq, rf); + update_rq_clock(rq); +} + +/* + * If the blocked-on relationship crosses CPUs, migrate @p to the + * owner's CPU. + * + * This is because we must respect the CPU affinity of execution + * contexts (owner) but we can ignore affinity for scheduling + * contexts (@p). So we have to move scheduling contexts towards + * potential execution contexts. 
+ * + * Note: The owner can disappear, but simply migrate to @target_cpu + * and leave that CPU to sort things out. + */ +static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf, + struct task_struct *p, int target_cpu) + __must_hold(__rq_lockp(rq)) +{ + struct rq *target_rq = cpu_rq(target_cpu); + + lockdep_assert_rq_held(rq); + WARN_ON(p == rq->curr); + /* + * Since we are migrating a blocked donor, it could be rq->donor, + * and we want to make sure there aren't any references from this + * rq to it before we drop the lock. This avoids another cpu + * jumping in and grabbing the rq lock and referencing rq->donor + * or cfs_rq->curr, etc after we have migrated it to another cpu, + * and before we pick_again in __schedule. + * + * So call proxy_resched_idle() to drop the rq->donor references + * before we release the lock. + */ + proxy_resched_idle(rq); + + deactivate_task(rq, p, DEQUEUE_NOCLOCK); + proxy_set_task_cpu(p, target_cpu); + + proxy_release_rq_lock(rq, rf); + + attach_one_task(target_rq, p); + + proxy_reacquire_rq_lock(rq, rf); +} + +static void proxy_force_return(struct rq *rq, struct rq_flags *rf, + struct task_struct *p) + __must_hold(__rq_lockp(rq)) +{ + struct rq *task_rq, *target_rq = NULL; + int cpu, wake_flag = WF_TTWU; + + lockdep_assert_rq_held(rq); + WARN_ON(p == rq->curr); + + if (p == rq->donor) + proxy_resched_idle(rq); + + proxy_release_rq_lock(rq, rf); + /* + * We drop the rq lock, and re-grab task_rq_lock to get + * the pi_lock (needed for select_task_rq) as well. + */ + scoped_guard (task_rq_lock, p) { + task_rq = scope.rq; + /* - * XXX: For now, if deactivation failed, set donor - * as unblocked, as we aren't doing proxy-migrations - * yet (more logic will be needed then). + * Since we let go of the rq lock, the task may have been + * woken or migrated to another rq before we got the + * task_rq_lock. So re-check we're on the same RQ. 
If + * not, the task has already been migrated and that CPU + * will handle any futher migrations. */ - donor->blocked_on = NULL; + if (task_rq != rq) + break; + + /* + * Similarly, if we've been dequeued, someone else will + * wake us + */ + if (!task_on_rq_queued(p)) + break; + + /* + * Since we should only be calling here from __schedule() + * -> find_proxy_task(), no one else should have + * assigned current out from under us. But check and warn + * if we see this, then bail. + */ + if (task_current(task_rq, p) || task_on_cpu(task_rq, p)) { + WARN_ONCE(1, "%s rq: %i current/on_cpu task %s %d on_cpu: %i\n", + __func__, cpu_of(task_rq), + p->comm, p->pid, p->on_cpu); + break; + } + + update_rq_clock(task_rq); + deactivate_task(task_rq, p, DEQUEUE_NOCLOCK); + cpu = select_task_rq(p, p->wake_cpu, &wake_flag); + set_task_cpu(p, cpu); + target_rq = cpu_rq(cpu); + clear_task_blocked_on(p, NULL); } - return NULL; + + if (target_rq) + attach_one_task(target_rq, p); + + proxy_reacquire_rq_lock(rq, rf); } /* @@ -6587,31 +6816,41 @@ static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *d * p->pi_lock * rq->lock * mutex->wait_lock + * p->blocked_lock * * Returns the task that is going to be used as execution context (the one * that is actually going to be run on cpu_of(rq)). */ static struct task_struct * find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) + __must_hold(__rq_lockp(rq)) { struct task_struct *owner = NULL; + bool curr_in_chain = false; int this_cpu = cpu_of(rq); struct task_struct *p; struct mutex *mutex; + int owner_cpu; /* Follow blocked_on chain. 
*/ - for (p = donor; task_is_blocked(p); p = owner) { - mutex = p->blocked_on; - /* Something changed in the chain, so pick again */ - if (!mutex) - return NULL; + for (p = donor; (mutex = p->blocked_on); p = owner) { + /* if its PROXY_WAKING, do return migration or run if current */ + if (mutex == PROXY_WAKING) { + if (task_current(rq, p)) { + clear_task_blocked_on(p, PROXY_WAKING); + return p; + } + goto force_return; + } + /* * By taking mutex->wait_lock we hold off concurrent mutex_unlock() * and ensure @owner sticks around. */ guard(raw_spinlock)(&mutex->wait_lock); + guard(raw_spinlock)(&p->blocked_lock); - /* Check again that p is blocked with wait_lock held */ + /* Check again that p is blocked with blocked_lock held */ if (mutex != __get_task_blocked_on(p)) { /* * Something changed in the blocked_on chain and @@ -6622,20 +6861,39 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) return NULL; } + if (task_current(rq, p)) + curr_in_chain = true; + owner = __mutex_owner(mutex); if (!owner) { - __clear_task_blocked_on(p, mutex); - return p; + /* + * If there is no owner, either clear blocked_on + * and return p (if it is current and safe to + * just run on this rq), or return-migrate the task. + */ + if (task_current(rq, p)) { + __clear_task_blocked_on(p, NULL); + return p; + } + goto force_return; } if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) { /* XXX Don't handle blocked owners/delayed dequeue yet */ - return proxy_deactivate(rq, donor); + if (curr_in_chain) + return proxy_resched_idle(rq); + goto deactivate; } - if (task_cpu(owner) != this_cpu) { - /* XXX Don't handle migrations yet */ - return proxy_deactivate(rq, donor); + owner_cpu = task_cpu(owner); + if (owner_cpu != this_cpu) { + /* + * @owner can disappear, simply migrate to @owner_cpu + * and leave that CPU to sort things out. 
+ */ + if (curr_in_chain) + return proxy_resched_idle(rq); + goto migrate_task; } if (task_on_rq_migrating(owner)) { @@ -6692,9 +6950,20 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) * guarantee its existence, as per ttwu_remote(). */ } - WARN_ON_ONCE(owner && !owner->on_rq); return owner; + +deactivate: + if (proxy_deactivate(rq, donor)) + return NULL; + /* If deactivate fails, force return */ + p = donor; +force_return: + proxy_force_return(rq, rf, p); + return NULL; +migrate_task: + proxy_migrate_task(rq, rf, p, owner_cpu); + return NULL; } #else /* SCHED_PROXY_EXEC */ static struct task_struct * @@ -6705,23 +6974,6 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) } #endif /* SCHED_PROXY_EXEC */ -static inline void proxy_tag_curr(struct rq *rq, struct task_struct *owner) -{ - if (!sched_proxy_exec()) - return; - /* - * pick_next_task() calls set_next_task() on the chosen task - * at some point, which ensures it is not push/pullable. - * However, the chosen/donor task *and* the mutex owner form an - * atomic pair wrt push/pull. - * - * Make sure owner we run is not pushable. Unfortunately we can - * only deal with that by means of a dequeue/enqueue cycle. :-/ - */ - dequeue_task(rq, owner, DEQUEUE_NOCLOCK | DEQUEUE_SAVE); - enqueue_task(rq, owner, ENQUEUE_NOCLOCK | ENQUEUE_RESTORE); -} - /* * __schedule() is the main scheduler function. 
* @@ -6785,9 +7037,6 @@ static void __sched notrace __schedule(int sched_mode) schedule_debug(prev, preempt); - if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) - hrtick_clear(rq); - klp_sched_try_switch(prev); local_irq_disable(); @@ -6814,6 +7063,8 @@ static void __sched notrace __schedule(int sched_mode) rq_lock(rq, &rf); smp_mb__after_spinlock(); + hrtick_schedule_enter(rq); + /* Promote REQ to ACT */ rq->clock_update_flags <<= 1; update_rq_clock(rq); @@ -6849,16 +7100,45 @@ static void __sched notrace __schedule(int sched_mode) } pick_again: + assert_balance_callbacks_empty(rq); next = pick_next_task(rq, rq->donor, &rf); - rq_set_donor(rq, next); rq->next_class = next->sched_class; - if (unlikely(task_is_blocked(next))) { - next = find_proxy_task(rq, next, &rf); - if (!next) - goto pick_again; - if (next == rq->idle) - goto keep_resched; + if (sched_proxy_exec()) { + struct task_struct *prev_donor = rq->donor; + + rq_set_donor(rq, next); + if (unlikely(next->blocked_on)) { + next = find_proxy_task(rq, next, &rf); + if (!next) { + zap_balance_callbacks(rq); + goto pick_again; + } + if (next == rq->idle) { + zap_balance_callbacks(rq); + goto keep_resched; + } + } + if (rq->donor == prev_donor && prev != next) { + struct task_struct *donor = rq->donor; + /* + * When transitioning like: + * + * prev next + * donor: B B + * curr: A B or C + * + * then put_prev_set_next_task() will not have done + * anything, since B == B. However, A might have + * missed a RT/DL balance opportunity due to being + * on_cpu. 
+ */ + donor->sched_class->put_prev_task(rq, donor, donor); + donor->sched_class->set_next_task(rq, donor, true); + } + } else { + rq_set_donor(rq, next); } + picked: clear_tsk_need_resched(prev); clear_preempt_need_resched(); @@ -6874,9 +7154,6 @@ keep_resched: */ RCU_INIT_POINTER(rq->curr, next); - if (!task_current_donor(rq, next)) - proxy_tag_curr(rq, next); - /* * The membarrier system call requires each architecture * to have a full memory barrier after updating @@ -6910,12 +7187,9 @@ keep_resched: /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); } else { - /* In case next was already curr but just got blocked_donor */ - if (!task_current_donor(rq, next)) - proxy_tag_curr(rq, next); - rq_unpin_lock(rq, &rf); __balance_callbacks(rq, NULL); + hrtick_schedule_exit(rq); raw_spin_rq_unlock_irq(rq); } trace_sched_exit_tp(is_switch); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 153232dd8276..ae9fd211cec1 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -461,6 +461,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, unsigned int flags) { struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; unsigned long prev_util = sg_cpu->util; unsigned long max_cap; @@ -482,10 +483,10 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, if (sugov_hold_freq(sg_cpu) && sg_cpu->util < prev_util) sg_cpu->util = prev_util; - cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min, + cpufreq_driver_adjust_perf(sg_policy->policy, sg_cpu->bw_min, sg_cpu->util, max_cap); - sg_cpu->sg_policy->last_freq_update_time = time; + sg_policy->last_freq_update_time = time; } static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d08b00429323..edca7849b165 100644 --- 
a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -18,6 +18,7 @@ #include <linux/cpuset.h> #include <linux/sched/clock.h> +#include <linux/sched/deadline.h> #include <uapi/linux/sched/types.h> #include "sched.h" #include "pelt.h" @@ -57,17 +58,6 @@ static int __init sched_dl_sysctl_init(void) late_initcall(sched_dl_sysctl_init); #endif /* CONFIG_SYSCTL */ -static bool dl_server(struct sched_dl_entity *dl_se) -{ - return dl_se->dl_server; -} - -static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) -{ - BUG_ON(dl_server(dl_se)); - return container_of(dl_se, struct task_struct, dl); -} - static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq) { return container_of(dl_rq, struct rq, dl); @@ -115,6 +105,19 @@ static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) } #endif /* !CONFIG_RT_MUTEXES */ +static inline u8 dl_get_type(struct sched_dl_entity *dl_se, struct rq *rq) +{ + if (!dl_server(dl_se)) + return DL_TASK; + if (dl_se == &rq->fair_server) + return DL_SERVER_FAIR; +#ifdef CONFIG_SCHED_CLASS_EXT + if (dl_se == &rq->ext_server) + return DL_SERVER_EXT; +#endif + return DL_OTHER; +} + static inline struct dl_bw *dl_bw_of(int i) { RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), @@ -733,6 +736,7 @@ static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, dl_se->dl_throttled = 1; dl_se->dl_defer_armed = 1; } + trace_sched_dl_replenish_tp(dl_se, cpu_of(rq), dl_get_type(dl_se, rq)); } /* @@ -848,6 +852,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) if (dl_se->dl_throttled) dl_se->dl_throttled = 0; + trace_sched_dl_replenish_tp(dl_se, cpu_of(rq), dl_get_type(dl_se, rq)); + /* * If this is the replenishment of a deferred reservation, * clear the flag and return. @@ -975,22 +981,6 @@ update_dl_revised_wakeup(struct sched_dl_entity *dl_se, struct rq *rq) } /* - * Regarding the deadline, a task with implicit deadline has a relative - * deadline == relative period. 
A task with constrained deadline has a - * relative deadline <= relative period. - * - * We support constrained deadline tasks. However, there are some restrictions - * applied only for tasks which do not have an implicit deadline. See - * update_dl_entity() to know more about such restrictions. - * - * The dl_is_implicit() returns true if the task has an implicit deadline. - */ -static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) -{ - return dl_se->dl_deadline == dl_se->dl_period; -} - -/* * When a deadline entity is placed in the runqueue, its runtime and deadline * might need to be updated. This is done by a CBS wake up rule. There are two * different rules: 1) the original CBS; and 2) the Revisited CBS. @@ -1027,7 +1017,7 @@ static void update_dl_entity(struct sched_dl_entity *dl_se) if (dl_time_before(dl_se->deadline, rq_clock(rq)) || dl_entity_overflow(dl_se, rq_clock(rq))) { - if (unlikely(!dl_is_implicit(dl_se) && + if (unlikely((!dl_is_implicit(dl_se) || dl_se->dl_defer) && !dl_time_before(dl_se->deadline, rq_clock(rq)) && !is_dl_boosted(dl_se))) { update_dl_revised_wakeup(dl_se, rq); @@ -1097,7 +1087,7 @@ static int start_dl_timer(struct sched_dl_entity *dl_se) act = ns_to_ktime(dl_next_period(dl_se)); } - now = hrtimer_cb_get_time(timer); + now = ktime_get(); delta = ktime_to_ns(now) - rq_clock(rq); act = ktime_add_ns(act, delta); @@ -1345,6 +1335,7 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) dl_time_before(rq_clock(rq), dl_next_period(dl_se))) { if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(dl_se))) return; + trace_sched_dl_throttle_tp(dl_se, cpu_of(rq), dl_get_type(dl_se, rq)); dl_se->dl_throttled = 1; if (dl_se->runtime > 0) dl_se->runtime = 0; @@ -1508,6 +1499,7 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 throttle: if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { + trace_sched_dl_throttle_tp(dl_se, cpu_of(rq), dl_get_type(dl_se, rq)); dl_se->dl_throttled 
= 1; /* If requested, inform the user about runtime overruns. */ @@ -1532,6 +1524,8 @@ throttle: if (!is_leftmost(dl_se, &rq->dl)) resched_curr(rq); + } else { + trace_sched_dl_update_tp(dl_se, cpu_of(rq), dl_get_type(dl_se, rq)); } /* @@ -1810,6 +1804,7 @@ void dl_server_start(struct sched_dl_entity *dl_se) if (WARN_ON_ONCE(!cpu_online(cpu_of(rq)))) return; + trace_sched_dl_server_start_tp(dl_se, cpu_of(rq), dl_get_type(dl_se, rq)); dl_se->dl_server_active = 1; enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP); if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &rq->curr->dl)) @@ -1821,6 +1816,8 @@ void dl_server_stop(struct sched_dl_entity *dl_se) if (!dl_server(dl_se) || !dl_server_active(dl_se)) return; + trace_sched_dl_server_stop_tp(dl_se, cpu_of(dl_se->rq), + dl_get_type(dl_se, dl_se->rq)); dequeue_dl_entity(dl_se, DEQUEUE_SLEEP); hrtimer_try_to_cancel(&dl_se->dl_timer); dl_se->dl_defer_armed = 0; @@ -2142,10 +2139,14 @@ update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, int flags) { struct task_struct *p = dl_task_of(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); if (!schedstat_enabled()) return; + if (p != rq->curr) + update_stats_wait_end_dl(dl_rq, dl_se); + if ((flags & DEQUEUE_SLEEP)) { unsigned int state; @@ -2801,12 +2802,26 @@ static int find_later_rq(struct task_struct *task) static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) { - struct task_struct *p; + struct task_struct *i, *p = NULL; + struct rb_node *next_node; if (!has_pushable_dl_tasks(rq)) return NULL; - p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root)); + next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root); + while (next_node) { + i = __node_2_pdl(next_node); + /* make sure task isn't on_cpu (possible with proxy-exec) */ + if (!task_on_cpu(rq, i)) { + p = i; + break; + } + + next_node = rb_next(next_node); + } + + if (!p) + return NULL; WARN_ON_ONCE(rq->cpu != task_cpu(p)); WARN_ON_ONCE(task_current(rq, p)); @@ -3613,13 
+3628,26 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); } -void __getparam_dl(struct task_struct *p, struct sched_attr *attr) +void __getparam_dl(struct task_struct *p, struct sched_attr *attr, unsigned int flags) { struct sched_dl_entity *dl_se = &p->dl; + struct rq *rq = task_rq(p); + u64 adj_deadline; attr->sched_priority = p->rt_priority; - attr->sched_runtime = dl_se->dl_runtime; - attr->sched_deadline = dl_se->dl_deadline; + if (flags & SCHED_GETATTR_FLAG_DL_DYNAMIC) { + guard(raw_spinlock_irq)(&rq->__lock); + update_rq_clock(rq); + if (task_current(rq, p)) + update_curr_dl(rq); + + attr->sched_runtime = dl_se->runtime; + adj_deadline = dl_se->deadline - rq_clock(rq) + ktime_get_ns(); + attr->sched_deadline = adj_deadline; + } else { + attr->sched_runtime = dl_se->dl_runtime; + attr->sched_deadline = dl_se->dl_deadline; + } attr->sched_period = dl_se->dl_period; attr->sched_flags &= ~SCHED_DL_FLAGS; attr->sched_flags |= dl_se->flags; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index b24f40f05019..74c1617cf652 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -8,6 +8,7 @@ */ #include <linux/debugfs.h> #include <linux/nmi.h> +#include <linux/log2.h> #include "sched.h" /* @@ -901,10 +902,14 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { - s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread; + s64 left_vruntime = -1, right_vruntime = -1, left_deadline = -1, spread; + s64 zero_vruntime = -1, sum_w_vruntime = -1; + u64 avruntime; struct sched_entity *last, *first, *root; struct rq *rq = cpu_rq(cpu); + unsigned int sum_shift; unsigned long flags; + u64 sum_weight; #ifdef CONFIG_FAIR_GROUP_SCHED SEQ_printf(m, "\n"); @@ -925,6 +930,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) if (last) 
right_vruntime = last->vruntime; zero_vruntime = cfs_rq->zero_vruntime; + sum_w_vruntime = cfs_rq->sum_w_vruntime; + sum_weight = cfs_rq->sum_weight; + sum_shift = cfs_rq->sum_shift; + avruntime = avg_vruntime(cfs_rq); raw_spin_rq_unlock_irqrestore(rq, flags); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline", @@ -933,8 +942,13 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(left_vruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "zero_vruntime", SPLIT_NS(zero_vruntime)); + SEQ_printf(m, " .%-30s: %Ld (%d bits)\n", "sum_w_vruntime", + sum_w_vruntime, ilog2(abs(sum_w_vruntime))); + SEQ_printf(m, " .%-30s: %Lu\n", "sum_weight", + sum_weight); + SEQ_printf(m, " .%-30s: %u\n", "sum_shift", sum_shift); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime", - SPLIT_NS(avg_vruntime(cfs_rq))); + SPLIT_NS(avruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime", SPLIT_NS(right_vruntime)); spread = right_vruntime - left_vruntime; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 26a6ac2f8826..e426e27b6794 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -9,6 +9,8 @@ #include <linux/btf_ids.h> #include "ext_idle.h" +static DEFINE_RAW_SPINLOCK(scx_sched_lock); + /* * NOTE: sched_ext is in the process of growing multiple scheduler support and * scx_root usage is in a transitional state. Naked dereferences are safe if the @@ -17,7 +19,23 @@ * are used as temporary markers to indicate that the dereferences need to be * updated to point to the associated scheduler instances rather than scx_root. */ -static struct scx_sched __rcu *scx_root; +struct scx_sched __rcu *scx_root; + +/* + * All scheds, writers must hold both scx_enable_mutex and scx_sched_lock. + * Readers can hold either or rcu_read_lock(). 
+ */ +static LIST_HEAD(scx_sched_all); + +#ifdef CONFIG_EXT_SUB_SCHED +static const struct rhashtable_params scx_sched_hash_params = { + .key_len = sizeof_field(struct scx_sched, ops.sub_cgroup_id), + .key_offset = offsetof(struct scx_sched, ops.sub_cgroup_id), + .head_offset = offsetof(struct scx_sched, hash_node), +}; + +static struct rhashtable scx_sched_hash; +#endif /* * During exit, a task may schedule after losing its PIDs. When disabling the @@ -33,37 +51,39 @@ static DEFINE_MUTEX(scx_enable_mutex); DEFINE_STATIC_KEY_FALSE(__scx_enabled); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED); -static int scx_bypass_depth; +static DEFINE_RAW_SPINLOCK(scx_bypass_lock); static cpumask_var_t scx_bypass_lb_donee_cpumask; static cpumask_var_t scx_bypass_lb_resched_cpumask; -static bool scx_aborting; static bool scx_init_task_enabled; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); -/* - * Tracks whether scx_enable() called scx_bypass(true). Used to balance bypass - * depth on enable failure. Will be removed when bypass depth is moved into the - * sched instance. - */ -static bool scx_bypassed_for_enable; - static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); +#ifdef CONFIG_EXT_SUB_SCHED /* - * A monotically increasing sequence number that is incremented every time a - * scheduler is enabled. This can be used by to check if any custom sched_ext + * The sub sched being enabled. Used by scx_disable_and_exit_task() to exit + * tasks for the sub-sched being enabled. Use a global variable instead of a + * per-task field as all enables are serialized. + */ +static struct scx_sched *scx_enabling_sub_sched; +#else +#define scx_enabling_sub_sched (struct scx_sched *)NULL +#endif /* CONFIG_EXT_SUB_SCHED */ + +/* + * A monotonically increasing sequence number that is incremented every time a + * scheduler is enabled. 
This can be used to check if any custom sched_ext * scheduler has ever been used in the system. */ static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0); /* - * The maximum amount of time in jiffies that a task may be runnable without - * being scheduled on a CPU. If this timeout is exceeded, it will trigger - * scx_error(). + * Watchdog interval. All scx_sched's share a single watchdog timer and the + * interval is half of the shortest sch->watchdog_timeout. */ -static unsigned long scx_watchdog_timeout; +static unsigned long scx_watchdog_interval; /* * The last time the delayed work was run. This delayed work relies on @@ -106,25 +126,6 @@ static const struct rhashtable_params dsq_hash_params = { static LLIST_HEAD(dsqs_to_free); -/* dispatch buf */ -struct scx_dsp_buf_ent { - struct task_struct *task; - unsigned long qseq; - u64 dsq_id; - u64 enq_flags; -}; - -static u32 scx_dsp_max_batch; - -struct scx_dsp_ctx { - struct rq *rq; - u32 cursor; - u32 nr_tasks; - struct scx_dsp_buf_ent buf[]; -}; - -static struct scx_dsp_ctx __percpu *scx_dsp_ctx; - /* string formatting from BPF */ struct scx_bstr_buf { u64 data[MAX_BPRINTF_VARARGS]; @@ -135,6 +136,8 @@ static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock); static struct scx_bstr_buf scx_exit_bstr_buf; /* ops debug dump */ +static DEFINE_RAW_SPINLOCK(scx_dump_lock); + struct scx_dump_data { s32 cpu; bool first; @@ -156,7 +159,6 @@ static struct kset *scx_kset; * There usually is no reason to modify these as normal scheduler operation * shouldn't be affected by them. The knobs are primarily for debugging. 
*/ -static u64 scx_slice_dfl = SCX_SLICE_DFL; static unsigned int scx_slice_bypass_us = SCX_SLICE_BYPASS / NSEC_PER_USEC; static unsigned int scx_bypass_lb_intv_us = SCX_BYPASS_LB_DFL_INTV_US; @@ -193,10 +195,10 @@ MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microsecond #define CREATE_TRACE_POINTS #include <trace/events/sched_ext.h> -static void process_ddsp_deferred_locals(struct rq *rq); +static void run_deferred(struct rq *rq); static bool task_dead_and_done(struct task_struct *p); -static u32 reenq_local(struct rq *rq); static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags); +static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind); static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, s64 exit_code, const char *fmt, va_list args); @@ -227,28 +229,109 @@ static long jiffies_delta_msecs(unsigned long at, unsigned long now) return -(long)jiffies_to_msecs(now - at); } -/* if the highest set bit is N, return a mask with bits [N+1, 31] set */ -static u32 higher_bits(u32 flags) +static bool u32_before(u32 a, u32 b) { - return ~((1 << fls(flags)) - 1); + return (s32)(a - b) < 0; } -/* return the mask with only the highest bit set */ -static u32 highest_bit(u32 flags) +#ifdef CONFIG_EXT_SUB_SCHED +/** + * scx_parent - Find the parent sched + * @sch: sched to find the parent of + * + * Returns the parent scheduler or %NULL if @sch is root. + */ +static struct scx_sched *scx_parent(struct scx_sched *sch) { - int bit = fls(flags); - return ((u64)1 << bit) >> 1; + if (sch->level) + return sch->ancestors[sch->level - 1]; + else + return NULL; } -static bool u32_before(u32 a, u32 b) +/** + * scx_next_descendant_pre - find the next descendant for pre-order walk + * @pos: the current position (%NULL to initiate traversal) + * @root: sched whose descendants to walk + * + * To be used by scx_for_each_descendant_pre(). Find the next descendant to + * visit for pre-order traversal of @root's descendants. 
@root is included in + * the iteration and the first node to be visited. + */ +static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, + struct scx_sched *root) { - return (s32)(a - b) < 0; + struct scx_sched *next; + + lockdep_assert(lockdep_is_held(&scx_enable_mutex) || + lockdep_is_held(&scx_sched_lock)); + + /* if first iteration, visit @root */ + if (!pos) + return root; + + /* visit the first child if exists */ + next = list_first_entry_or_null(&pos->children, struct scx_sched, sibling); + if (next) + return next; + + /* no child, visit my or the closest ancestor's next sibling */ + while (pos != root) { + if (!list_is_last(&pos->sibling, &scx_parent(pos)->children)) + return list_next_entry(pos, sibling); + pos = scx_parent(pos); + } + + return NULL; } -static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch, - struct task_struct *p) +static struct scx_sched *scx_find_sub_sched(u64 cgroup_id) { - return sch->global_dsqs[cpu_to_node(task_cpu(p))]; + return rhashtable_lookup(&scx_sched_hash, &cgroup_id, + scx_sched_hash_params); +} + +static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) +{ + rcu_assign_pointer(p->scx.sched, sch); +} +#else /* CONFIG_EXT_SUB_SCHED */ +static struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; } +static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? NULL : root; } +static struct scx_sched *scx_find_sub_sched(u64 cgroup_id) { return NULL; } +static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) {} +#endif /* CONFIG_EXT_SUB_SCHED */ + +/** + * scx_is_descendant - Test whether sched is a descendant + * @sch: sched to test + * @ancestor: ancestor sched to test against + * + * Test whether @sch is a descendant of @ancestor. 
+ */ +static bool scx_is_descendant(struct scx_sched *sch, struct scx_sched *ancestor) +{ + if (sch->level < ancestor->level) + return false; + return sch->ancestors[ancestor->level] == ancestor; +} + +/** + * scx_for_each_descendant_pre - pre-order walk of a sched's descendants + * @pos: iteration cursor + * @root: sched to walk the descendants of + * + * Walk @root's descendants. @root is included in the iteration and the first + * node to be visited. Must be called with either scx_enable_mutex or + * scx_sched_lock held. + */ +#define scx_for_each_descendant_pre(pos, root) \ + for ((pos) = scx_next_descendant_pre(NULL, (root)); (pos); \ + (pos) = scx_next_descendant_pre((pos), (root))) + +static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch, s32 cpu) +{ + return &sch->pnode[cpu_to_node(cpu)]->global_dsq; } static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id) @@ -264,28 +347,106 @@ static const struct sched_class *scx_setscheduler_class(struct task_struct *p) return __setscheduler_class(p->policy, p->prio); } -/* - * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX - * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate - * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check - * whether it's running from an allowed context. +static struct scx_dispatch_q *bypass_dsq(struct scx_sched *sch, s32 cpu) +{ + return &per_cpu_ptr(sch->pcpu, cpu)->bypass_dsq; +} + +static struct scx_dispatch_q *bypass_enq_target_dsq(struct scx_sched *sch, s32 cpu) +{ +#ifdef CONFIG_EXT_SUB_SCHED + /* + * If @sch is a sub-sched which is bypassing, its tasks should go into + * the bypass DSQs of the nearest ancestor which is not bypassing. The + * not-bypassing ancestor is responsible for scheduling all tasks from + * bypassing sub-trees. If all ancestors including root are bypassing, + * all tasks should go to the root's bypass DSQs. 
+ * + * Whenever a sched starts bypassing, all runnable tasks in its subtree + * are re-enqueued after scx_bypassing() is turned on, guaranteeing that + * all tasks are transferred to the right DSQs. + */ + while (scx_parent(sch) && scx_bypassing(sch, cpu)) + sch = scx_parent(sch); +#endif /* CONFIG_EXT_SUB_SCHED */ + + return bypass_dsq(sch, cpu); +} + +/** + * bypass_dsp_enabled - Check if bypass dispatch path is enabled + * @sch: scheduler to check + * + * When a descendant scheduler enters bypass mode, bypassed tasks are scheduled + * by the nearest non-bypassing ancestor, or the root scheduler if all ancestors + * are bypassing. In the former case, the ancestor is not itself bypassing but + * its bypass DSQs will be populated with bypassed tasks from descendants. Thus, + * the ancestor's bypass dispatch path must be active even though its own + * bypass_depth remains zero. * - * @mask is constant, always inline to cull the mask calculations. + * This function checks bypass_dsp_enable_depth which is managed separately from + * bypass_depth to enable this decoupling. See enable_bypass_dsp() and + * disable_bypass_dsp(). */ -static __always_inline void scx_kf_allow(u32 mask) +static bool bypass_dsp_enabled(struct scx_sched *sch) { - /* nesting is allowed only in increasing scx_kf_mask order */ - WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask, - "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n", - current->scx.kf_mask, mask); - current->scx.kf_mask |= mask; - barrier(); + return unlikely(atomic_read(&sch->bypass_dsp_enable_depth)); } -static void scx_kf_disallow(u32 mask) +/** + * rq_is_open - Is the rq available for immediate execution of an SCX task? + * @rq: rq to test + * @enq_flags: optional %SCX_ENQ_* of the task being enqueued + * + * Returns %true if @rq is currently open for executing an SCX task. 
After a + * %false return, @rq is guaranteed to invoke SCX dispatch path at least once + * before going to idle and not inserting a task into @rq's local DSQ after a + * %false return doesn't cause @rq to stall. + */ +static bool rq_is_open(struct rq *rq, u64 enq_flags) { - barrier(); - current->scx.kf_mask &= ~mask; + lockdep_assert_rq_held(rq); + + /* + * A higher-priority class task is either running or in the process of + * waking up on @rq. + */ + if (sched_class_above(rq->next_class, &ext_sched_class)) + return false; + + /* + * @rq is either in transition to or in idle and there is no + * higher-priority class task waking up on it. + */ + if (sched_class_above(&ext_sched_class, rq->next_class)) + return true; + + /* + * @rq is either picking, in transition to, or running an SCX task. + */ + + /* + * If we're in the dispatch path holding rq lock, $curr may or may not + * be ready depending on whether the on-going dispatch decides to extend + * $curr's slice. We say yes here and resolve it at the end of dispatch. + * See balance_one(). + */ + if (rq->scx.flags & SCX_RQ_IN_BALANCE) + return true; + + /* + * %SCX_ENQ_PREEMPT clears $curr's slice if on SCX and kicks dispatch, + * so allow it to avoid spuriously triggering reenq on a combined + * PREEMPT|IMMED insertion. + */ + if (enq_flags & SCX_ENQ_PREEMPT) + return true; + + /* + * @rq is either in transition to or running an SCX task and can't go + * idle without another SCX dispatch cycle. + */ + return false; } /* @@ -308,119 +469,77 @@ static inline void update_locked_rq(struct rq *rq) __this_cpu_write(scx_locked_rq_state, rq); } -#define SCX_CALL_OP(sch, mask, op, rq, args...) \ +#define SCX_CALL_OP(sch, op, rq, args...) 
\ do { \ if (rq) \ update_locked_rq(rq); \ - if (mask) { \ - scx_kf_allow(mask); \ - (sch)->ops.op(args); \ - scx_kf_disallow(mask); \ - } else { \ - (sch)->ops.op(args); \ - } \ + (sch)->ops.op(args); \ if (rq) \ update_locked_rq(NULL); \ } while (0) -#define SCX_CALL_OP_RET(sch, mask, op, rq, args...) \ +#define SCX_CALL_OP_RET(sch, op, rq, args...) \ ({ \ __typeof__((sch)->ops.op(args)) __ret; \ \ if (rq) \ update_locked_rq(rq); \ - if (mask) { \ - scx_kf_allow(mask); \ - __ret = (sch)->ops.op(args); \ - scx_kf_disallow(mask); \ - } else { \ - __ret = (sch)->ops.op(args); \ - } \ + __ret = (sch)->ops.op(args); \ if (rq) \ update_locked_rq(NULL); \ __ret; \ }) /* - * Some kfuncs are allowed only on the tasks that are subjects of the - * in-progress scx_ops operation for, e.g., locking guarantees. To enforce such - * restrictions, the following SCX_CALL_OP_*() variants should be used when - * invoking scx_ops operations that take task arguments. These can only be used - * for non-nesting operations due to the way the tasks are tracked. - * - * kfuncs which can only operate on such tasks can in turn use - * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on - * the specific task. + * SCX_CALL_OP_TASK*() invokes an SCX op that takes one or two task arguments + * and records them in current->scx.kf_tasks[] for the duration of the call. A + * kfunc invoked from inside such an op can then use + * scx_kf_arg_task_ok() to verify that its task argument is one of + * those subject tasks. + * + * Every SCX_CALL_OP_TASK*() call site invokes its op with @p's rq lock held - + * either via the @rq argument here, or (for ops.select_cpu()) via @p's pi_lock + * held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu. So if + * kf_tasks[] is set, @p's scheduler-protected fields are stable. + * + * kf_tasks[] can not stack, so task-based SCX ops must not nest. 
The + * WARN_ON_ONCE() in each macro catches a re-entry of any of the three variants + * while a previous one is still in progress. */ -#define SCX_CALL_OP_TASK(sch, mask, op, rq, task, args...) \ +#define SCX_CALL_OP_TASK(sch, op, rq, task, args...) \ do { \ - BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ + WARN_ON_ONCE(current->scx.kf_tasks[0]); \ current->scx.kf_tasks[0] = task; \ - SCX_CALL_OP((sch), mask, op, rq, task, ##args); \ + SCX_CALL_OP((sch), op, rq, task, ##args); \ current->scx.kf_tasks[0] = NULL; \ } while (0) -#define SCX_CALL_OP_TASK_RET(sch, mask, op, rq, task, args...) \ +#define SCX_CALL_OP_TASK_RET(sch, op, rq, task, args...) \ ({ \ __typeof__((sch)->ops.op(task, ##args)) __ret; \ - BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ + WARN_ON_ONCE(current->scx.kf_tasks[0]); \ current->scx.kf_tasks[0] = task; \ - __ret = SCX_CALL_OP_RET((sch), mask, op, rq, task, ##args); \ + __ret = SCX_CALL_OP_RET((sch), op, rq, task, ##args); \ current->scx.kf_tasks[0] = NULL; \ __ret; \ }) -#define SCX_CALL_OP_2TASKS_RET(sch, mask, op, rq, task0, task1, args...) \ +#define SCX_CALL_OP_2TASKS_RET(sch, op, rq, task0, task1, args...) \ ({ \ __typeof__((sch)->ops.op(task0, task1, ##args)) __ret; \ - BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ + WARN_ON_ONCE(current->scx.kf_tasks[0]); \ current->scx.kf_tasks[0] = task0; \ current->scx.kf_tasks[1] = task1; \ - __ret = SCX_CALL_OP_RET((sch), mask, op, rq, task0, task1, ##args); \ + __ret = SCX_CALL_OP_RET((sch), op, rq, task0, task1, ##args); \ current->scx.kf_tasks[0] = NULL; \ current->scx.kf_tasks[1] = NULL; \ __ret; \ }) -/* @mask is constant, always inline to cull unnecessary branches */ -static __always_inline bool scx_kf_allowed(struct scx_sched *sch, u32 mask) -{ - if (unlikely(!(current->scx.kf_mask & mask))) { - scx_error(sch, "kfunc with mask 0x%x called from an operation only allowing 0x%x", - mask, current->scx.kf_mask); - return false; - } - - /* - * Enforce nesting boundaries. e.g. 
A kfunc which can be called from - * DISPATCH must not be called if we're running DEQUEUE which is nested - * inside ops.dispatch(). We don't need to check boundaries for any - * blocking kfuncs as the verifier ensures they're only called from - * sleepable progs. - */ - if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE && - (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) { - scx_error(sch, "cpu_release kfunc called from a nested operation"); - return false; - } - - if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH && - (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { - scx_error(sch, "dispatch kfunc called from a nested operation"); - return false; - } - - return true; -} - /* see SCX_CALL_OP_TASK() */ -static __always_inline bool scx_kf_allowed_on_arg_tasks(struct scx_sched *sch, - u32 mask, +static __always_inline bool scx_kf_arg_task_ok(struct scx_sched *sch, struct task_struct *p) { - if (!scx_kf_allowed(sch, mask)) - return false; - if (unlikely((p != current->scx.kf_tasks[0] && p != current->scx.kf_tasks[1]))) { scx_error(sch, "called on a task not being operated on"); @@ -430,9 +549,22 @@ static __always_inline bool scx_kf_allowed_on_arg_tasks(struct scx_sched *sch, return true; } +enum scx_dsq_iter_flags { + /* iterate in the reverse dispatch order */ + SCX_DSQ_ITER_REV = 1U << 16, + + __SCX_DSQ_ITER_HAS_SLICE = 1U << 30, + __SCX_DSQ_ITER_HAS_VTIME = 1U << 31, + + __SCX_DSQ_ITER_USER_FLAGS = SCX_DSQ_ITER_REV, + __SCX_DSQ_ITER_ALL_FLAGS = __SCX_DSQ_ITER_USER_FLAGS | + __SCX_DSQ_ITER_HAS_SLICE | + __SCX_DSQ_ITER_HAS_VTIME, +}; + /** * nldsq_next_task - Iterate to the next task in a non-local DSQ - * @dsq: user dsq being iterated + * @dsq: non-local dsq being iterated * @cur: current position, %NULL to start iteration * @rev: walk backwards * @@ -472,6 +604,85 @@ static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq, for ((p) = nldsq_next_task((dsq), NULL, false); (p); \ (p) = nldsq_next_task((dsq), (p), false)) +/** + * 
nldsq_cursor_next_task - Iterate to the next task given a cursor in a non-local DSQ + * @cursor: scx_dsq_list_node initialized with INIT_DSQ_LIST_CURSOR() + * @dsq: non-local dsq being iterated + * + * Find the next task in a cursor based iteration. The caller must have + * initialized @cursor using INIT_DSQ_LIST_CURSOR() and can release the DSQ lock + * between the iteration steps. + * + * Only tasks which were queued before @cursor was initialized are visible. This + * bounds the iteration and guarantees that vtime never jumps in the other + * direction while iterating. + */ +static struct task_struct *nldsq_cursor_next_task(struct scx_dsq_list_node *cursor, + struct scx_dispatch_q *dsq) +{ + bool rev = cursor->flags & SCX_DSQ_ITER_REV; + struct task_struct *p; + + lockdep_assert_held(&dsq->lock); + BUG_ON(!(cursor->flags & SCX_DSQ_LNODE_ITER_CURSOR)); + + if (list_empty(&cursor->node)) + p = NULL; + else + p = container_of(cursor, struct task_struct, scx.dsq_list); + + /* skip cursors and tasks that were queued after @cursor init */ + do { + p = nldsq_next_task(dsq, p, rev); + } while (p && unlikely(u32_before(cursor->priv, p->scx.dsq_seq))); + + if (p) { + if (rev) + list_move_tail(&cursor->node, &p->scx.dsq_list.node); + else + list_move(&cursor->node, &p->scx.dsq_list.node); + } else { + list_del_init(&cursor->node); + } + + return p; +} + +/** + * nldsq_cursor_lost_task - Test whether someone else took the task since iteration + * @cursor: scx_dsq_list_node initialized with INIT_DSQ_LIST_CURSOR() + * @rq: rq @p was on + * @dsq: dsq @p was on + * @p: target task + * + * @p is a task returned by nldsq_cursor_next_task(). The locks may have been + * dropped and re-acquired in between. Verify that no one else took or is in the + * process of taking @p from @dsq. + * + * On %false return, the caller can assume full ownership of @p. 
+ */ +static bool nldsq_cursor_lost_task(struct scx_dsq_list_node *cursor, + struct rq *rq, struct scx_dispatch_q *dsq, + struct task_struct *p) +{ + lockdep_assert_rq_held(rq); + lockdep_assert_held(&dsq->lock); + + /* + * @p could have already left $src_dsq, got re-enqueued, or be in the + * process of being consumed by someone else. + */ + if (unlikely(p->scx.dsq != dsq || + u32_before(cursor->priv, p->scx.dsq_seq) || + p->scx.holding_cpu >= 0)) + return true; + + /* if @p has stayed on @dsq, its rq couldn't have changed */ + if (WARN_ON_ONCE(rq != task_rq(p))) + return true; + + return false; +} /* * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse] * changes without breaking backward compatibility. Can be used with * bpf_for_each(). See bpf_iter_scx_dsq_*(). */ -enum scx_dsq_iter_flags { - /* iterate in the reverse dispatch order */ - SCX_DSQ_ITER_REV = 1U << 16, - - __SCX_DSQ_ITER_HAS_SLICE = 1U << 30, - __SCX_DSQ_ITER_HAS_VTIME = 1U << 31, - - __SCX_DSQ_ITER_USER_FLAGS = SCX_DSQ_ITER_REV, - __SCX_DSQ_ITER_ALL_FLAGS = __SCX_DSQ_ITER_USER_FLAGS | - __SCX_DSQ_ITER_HAS_SLICE | - __SCX_DSQ_ITER_HAS_VTIME, -}; - struct bpf_iter_scx_dsq_kern { struct scx_dsq_list_node cursor; struct scx_dispatch_q *dsq; @@ -514,14 +712,31 @@ struct scx_task_iter { struct rq_flags rf; u32 cnt; bool list_locked; +#ifdef CONFIG_EXT_SUB_SCHED + struct cgroup *cgrp; + struct cgroup_subsys_state *css_pos; + struct css_task_iter css_iter; +#endif }; /** * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration * @iter: iterator to init + * @cgrp: Optional root of cgroup subhierarchy to iterate + * + * Initialize @iter. Once initialized, @iter must eventually be stopped with + * scx_task_iter_stop(). * - * Initialize @iter and return with scx_tasks_lock held. Once initialized, @iter - * must eventually be stopped with scx_task_iter_stop(). 
+ * If @cgrp is %NULL, scx_tasks is used for iteration and this function returns + * with scx_tasks_lock held and @iter->cursor inserted into scx_tasks. + * + * If @cgrp is not %NULL, @cgrp and its descendants' tasks are walked using + * @iter->css_iter. The caller must be holding cgroup_lock() to prevent cgroup + * task migrations. + * + * The two modes of iterations are largely independent and it's likely that + * scx_tasks can be removed in favor of always using cgroup iteration if + * CONFIG_SCHED_CLASS_EXT depends on CONFIG_CGROUPS. * * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock() * between this and the first next() call or between any two next() calls. If @@ -532,10 +747,19 @@ struct scx_task_iter { * All tasks which existed when the iteration started are guaranteed to be * visited as long as they are not dead. */ -static void scx_task_iter_start(struct scx_task_iter *iter) +static void scx_task_iter_start(struct scx_task_iter *iter, struct cgroup *cgrp) { memset(iter, 0, sizeof(*iter)); +#ifdef CONFIG_EXT_SUB_SCHED + if (cgrp) { + lockdep_assert_held(&cgroup_mutex); + iter->cgrp = cgrp; + iter->css_pos = css_next_descendant_pre(NULL, &iter->cgrp->self); + css_task_iter_start(iter->css_pos, 0, &iter->css_iter); + return; + } +#endif raw_spin_lock_irq(&scx_tasks_lock); iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; @@ -588,6 +812,14 @@ static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter) */ static void scx_task_iter_stop(struct scx_task_iter *iter) { +#ifdef CONFIG_EXT_SUB_SCHED + if (iter->cgrp) { + if (iter->css_pos) + css_task_iter_end(&iter->css_iter); + __scx_task_iter_rq_unlock(iter); + return; + } +#endif __scx_task_iter_maybe_relock(iter); list_del_init(&iter->cursor.tasks_node); scx_task_iter_unlock(iter); @@ -611,6 +843,24 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) cond_resched(); } +#ifdef CONFIG_EXT_SUB_SCHED + if (iter->cgrp) { + while 
(iter->css_pos) { + struct task_struct *p; + + p = css_task_iter_next(&iter->css_iter); + if (p) + return p; + + css_task_iter_end(&iter->css_iter); + iter->css_pos = css_next_descendant_pre(iter->css_pos, + &iter->cgrp->self); + if (iter->css_pos) + css_task_iter_start(iter->css_pos, 0, &iter->css_iter); + } + return NULL; + } +#endif __scx_task_iter_maybe_relock(iter); list_for_each_entry(pos, cursor, tasks_node) { @@ -810,16 +1060,6 @@ static int ops_sanitize_err(struct scx_sched *sch, const char *ops_name, s32 err return -EPROTO; } -static void run_deferred(struct rq *rq) -{ - process_ddsp_deferred_locals(rq); - - if (local_read(&rq->scx.reenq_local_deferred)) { - local_set(&rq->scx.reenq_local_deferred, 0); - reenq_local(rq); - } -} - static void deferred_bal_cb_workfn(struct rq *rq) { run_deferred(rq); @@ -845,10 +1085,18 @@ static void deferred_irq_workfn(struct irq_work *irq_work) static void schedule_deferred(struct rq *rq) { /* - * Queue an irq work. They are executed on IRQ re-enable which may take - * a bit longer than the scheduler hook in schedule_deferred_locked(). + * This is the fallback when schedule_deferred_locked() can't use + * the cheaper balance callback or wakeup hook paths (the target + * CPU is not in balance or wakeup). Currently, this is primarily + * hit by reenqueue operations targeting a remote CPU. + * + * Queue on the target CPU. The deferred work can run from any CPU + * correctly - the _locked() path already processes remote rqs from + * the calling CPU - but targeting the owning CPU allows IPI delivery + * without waiting for the calling CPU to re-enable IRQs and is + * cheaper as the reenqueue runs locally. 
*/ - irq_work_queue(&rq->scx.deferred_irq_work); + irq_work_queue_on(&rq->scx.deferred_irq_work, cpu_of(rq)); } /** @@ -898,6 +1146,81 @@ static void schedule_deferred_locked(struct rq *rq) schedule_deferred(rq); } +static void schedule_dsq_reenq(struct scx_sched *sch, struct scx_dispatch_q *dsq, + u64 reenq_flags, struct rq *locked_rq) +{ + struct rq *rq; + + /* + * Allowing reenqueues doesn't make sense while bypassing. This also + * blocks from new reenqueues to be scheduled on dead scheds. + */ + if (unlikely(READ_ONCE(sch->bypass_depth))) + return; + + if (dsq->id == SCX_DSQ_LOCAL) { + rq = container_of(dsq, struct rq, scx.local_dsq); + + struct scx_sched_pcpu *sch_pcpu = per_cpu_ptr(sch->pcpu, cpu_of(rq)); + struct scx_deferred_reenq_local *drl = &sch_pcpu->deferred_reenq_local; + + /* + * Pairs with smp_mb() in process_deferred_reenq_locals() and + * guarantees that there is a reenq_local() afterwards. + */ + smp_mb(); + + if (list_empty(&drl->node) || + (READ_ONCE(drl->flags) & reenq_flags) != reenq_flags) { + + guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock); + + if (list_empty(&drl->node)) + list_move_tail(&drl->node, &rq->scx.deferred_reenq_locals); + WRITE_ONCE(drl->flags, drl->flags | reenq_flags); + } + } else if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) { + rq = this_rq(); + + struct scx_dsq_pcpu *dsq_pcpu = per_cpu_ptr(dsq->pcpu, cpu_of(rq)); + struct scx_deferred_reenq_user *dru = &dsq_pcpu->deferred_reenq_user; + + /* + * Pairs with smp_mb() in process_deferred_reenq_users() and + * guarantees that there is a reenq_user() afterwards. 
+ */ + smp_mb(); + + if (list_empty(&dru->node) || + (READ_ONCE(dru->flags) & reenq_flags) != reenq_flags) { + + guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock); + + if (list_empty(&dru->node)) + list_move_tail(&dru->node, &rq->scx.deferred_reenq_users); + WRITE_ONCE(dru->flags, dru->flags | reenq_flags); + } + } else { + scx_error(sch, "DSQ 0x%llx not allowed for reenq", dsq->id); + return; + } + + if (rq == locked_rq) + schedule_deferred_locked(rq); + else + schedule_deferred(rq); +} + +static void schedule_reenq_local(struct rq *rq, u64 reenq_flags) +{ + struct scx_sched *root = rcu_dereference_sched(scx_root); + + if (WARN_ON_ONCE(!root)) + return; + + schedule_dsq_reenq(root, &rq->scx.local_dsq, reenq_flags, rq); +} + /** * touch_core_sched - Update timestamp used for core-sched task ordering * @rq: rq to read clock from, must be locked @@ -974,28 +1297,105 @@ static bool scx_dsq_priq_less(struct rb_node *node_a, return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime); } -static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) +static void dsq_inc_nr(struct scx_dispatch_q *dsq, struct task_struct *p, u64 enq_flags) { + /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */ + WRITE_ONCE(dsq->nr, dsq->nr + 1); + /* - * scx_bpf_dsq_nr_queued() reads ->nr without locking. Use READ_ONCE() - * on the read side and WRITE_ONCE() on the write side to properly - * annotate the concurrent lockless access and avoid KCSAN warnings. + * Once @p reaches a local DSQ, it can only leave it by being dispatched + * to the CPU or dequeued. In both cases, the only way @p can go back to + * the BPF sched is through enqueueing. If being inserted into a local + * DSQ with IMMED, persist the state until the next enqueueing event in + * do_enqueue_task() so that we can maintain IMMED protection through + * e.g. SAVE/RESTORE cycles and slice extensions. 
*/ - WRITE_ONCE(dsq->nr, READ_ONCE(dsq->nr) + delta); + if (enq_flags & SCX_ENQ_IMMED) { + if (unlikely(dsq->id != SCX_DSQ_LOCAL)) { + WARN_ON_ONCE(!(enq_flags & SCX_ENQ_GDSQ_FALLBACK)); + return; + } + p->scx.flags |= SCX_TASK_IMMED; + } + + if (p->scx.flags & SCX_TASK_IMMED) { + struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); + + if (WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL)) + return; + + rq->scx.nr_immed++; + + /* + * If @rq already had other tasks or the current task is not + * done yet, @p can't go on the CPU immediately. Re-enqueue. + */ + if (unlikely(dsq->nr > 1 || !rq_is_open(rq, enq_flags))) + schedule_reenq_local(rq, 0); + } +} + +static void dsq_dec_nr(struct scx_dispatch_q *dsq, struct task_struct *p) +{ + /* see dsq_inc_nr() */ + WRITE_ONCE(dsq->nr, dsq->nr - 1); + + if (p->scx.flags & SCX_TASK_IMMED) { + struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); + + if (WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL) || + WARN_ON_ONCE(rq->scx.nr_immed <= 0)) + return; + + rq->scx.nr_immed--; + } } static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p) { - p->scx.slice = READ_ONCE(scx_slice_dfl); + p->scx.slice = READ_ONCE(sch->slice_dfl); __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1); } +/* + * Return true if @p is moving due to an internal SCX migration, false + * otherwise. + */ +static inline bool task_scx_migrating(struct task_struct *p) +{ + /* + * We only need to check sticky_cpu: it is set to the destination + * CPU in move_remote_task_to_local_dsq() before deactivate_task() + * and cleared when the task is enqueued on the destination, so it + * is only non-negative during an internal SCX migration. + */ + return p->scx.sticky_cpu >= 0; +} + +/* + * Call ops.dequeue() if the task is in BPF custody and not migrating. + * Clears %SCX_TASK_IN_CUSTODY when the callback is invoked. 
+ */ +static void call_task_dequeue(struct scx_sched *sch, struct rq *rq, + struct task_struct *p, u64 deq_flags) +{ + if (!(p->scx.flags & SCX_TASK_IN_CUSTODY) || task_scx_migrating(p)) + return; + + if (SCX_HAS_OP(sch, dequeue)) + SCX_CALL_OP_TASK(sch, dequeue, rq, p, deq_flags); + + p->scx.flags &= ~SCX_TASK_IN_CUSTODY; +} + static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p, u64 enq_flags) { struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); bool preempt = false; + call_task_dequeue(scx_root, rq, p, 0); + /* * If @rq is in balance, the CPU is already vacant and looking for the * next task to run. No need to preempt or trigger resched after moving @@ -1014,8 +1414,9 @@ static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p resched_curr(rq); } -static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, - struct task_struct *p, u64 enq_flags) +static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq, + struct scx_dispatch_q *dsq, struct task_struct *p, + u64 enq_flags) { bool is_local = dsq->id == SCX_DSQ_LOCAL; @@ -1031,7 +1432,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, scx_error(sch, "attempting to dispatch to a destroyed dsq"); /* fall back to the global dsq */ raw_spin_unlock(&dsq->lock); - dsq = find_global_dsq(sch, p); + dsq = find_global_dsq(sch, task_cpu(p)); raw_spin_lock(&dsq->lock); } } @@ -1106,17 +1507,30 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, WRITE_ONCE(dsq->seq, dsq->seq + 1); p->scx.dsq_seq = dsq->seq; - dsq_mod_nr(dsq, 1); + dsq_inc_nr(dsq, p, enq_flags); p->scx.dsq = dsq; /* - * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the - * direct dispatch path, but we clear them here because the direct - * dispatch verdict may be overridden on the enqueue path during e.g. - * bypass. 
+ * Update custody and call ops.dequeue() before clearing ops_state: + * once ops_state is cleared, waiters in ops_dequeue() can proceed + * and dequeue_task_scx() will RMW p->scx.flags. If we clear + * ops_state first, both sides would modify p->scx.flags + * concurrently in a non-atomic way. */ - p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; - p->scx.ddsp_enq_flags = 0; + if (is_local) { + local_dsq_post_enq(dsq, p, enq_flags); + } else { + /* + * Task on global/bypass DSQ: leave custody, task on + * non-terminal DSQ: enter custody. + */ + if (dsq->id == SCX_DSQ_GLOBAL || dsq->id == SCX_DSQ_BYPASS) + call_task_dequeue(sch, rq, p, 0); + else + p->scx.flags |= SCX_TASK_IN_CUSTODY; + + raw_spin_unlock(&dsq->lock); + } /* * We're transitioning out of QUEUEING or DISPATCHING. store_release to @@ -1124,11 +1538,6 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, */ if (enq_flags & SCX_ENQ_CLEAR_OPSS) atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); - - if (is_local) - local_dsq_post_enq(dsq, p, enq_flags); - else - raw_spin_unlock(&dsq->lock); } static void task_unlink_from_dsq(struct task_struct *p, @@ -1143,7 +1552,7 @@ static void task_unlink_from_dsq(struct task_struct *p, } list_del_init(&p->scx.dsq_list.node); - dsq_mod_nr(dsq, -1); + dsq_dec_nr(dsq, p); if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) { struct task_struct *first_task; @@ -1222,7 +1631,7 @@ static void dispatch_dequeue_locked(struct task_struct *p, static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch, struct rq *rq, u64 dsq_id, - struct task_struct *p) + s32 tcpu) { struct scx_dispatch_q *dsq; @@ -1233,20 +1642,19 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch, s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) - return find_global_dsq(sch, p); + return find_global_dsq(sch, tcpu); return &cpu_rq(cpu)->scx.local_dsq; } if (dsq_id == 
SCX_DSQ_GLOBAL) - dsq = find_global_dsq(sch, p); + dsq = find_global_dsq(sch, tcpu); else dsq = find_user_dsq(sch, dsq_id); if (unlikely(!dsq)) { - scx_error(sch, "non-existent DSQ 0x%llx for %s[%d]", - dsq_id, p->comm, p->pid); - return find_global_dsq(sch, p); + scx_error(sch, "non-existent DSQ 0x%llx", dsq_id); + return find_global_dsq(sch, tcpu); } return dsq; @@ -1283,12 +1691,34 @@ static void mark_direct_dispatch(struct scx_sched *sch, p->scx.ddsp_enq_flags = enq_flags; } +/* + * Clear @p direct dispatch state when leaving the scheduler. + * + * Direct dispatch state must be cleared in the following cases: + * - direct_dispatch(): cleared on the synchronous enqueue path, deferred + * dispatch keeps the state until consumed + * - process_ddsp_deferred_locals(): cleared after consuming deferred state, + * - do_enqueue_task(): cleared on enqueue fallbacks where the dispatch + * verdict is ignored (local/global/bypass) + * - dequeue_task_scx(): cleared after dispatch_dequeue(), covering deferred + * cancellation and holding_cpu races + * - scx_disable_task(): cleared for queued wakeup tasks, which are excluded by + * the scx_bypass() loop, so that stale state is not reused by a subsequent + * scheduler instance + */ +static inline void clear_direct_dispatch(struct task_struct *p) +{ + p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; + p->scx.ddsp_enq_flags = 0; +} + static void direct_dispatch(struct scx_sched *sch, struct task_struct *p, u64 enq_flags) { struct rq *rq = task_rq(p); struct scx_dispatch_q *dsq = - find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p); + find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, task_cpu(p)); + u64 ddsp_enq_flags; touch_core_sched_dispatch(rq, p); @@ -1329,8 +1759,10 @@ static void direct_dispatch(struct scx_sched *sch, struct task_struct *p, return; } - dispatch_enqueue(sch, dsq, p, - p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); + ddsp_enq_flags = p->scx.ddsp_enq_flags; + clear_direct_dispatch(p); + + dispatch_enqueue(sch, rq, dsq, 
p, ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); } static bool scx_rq_online(struct rq *rq) @@ -1348,18 +1780,26 @@ static bool scx_rq_online(struct rq *rq) static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, int sticky_cpu) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); struct task_struct **ddsp_taskp; struct scx_dispatch_q *dsq; unsigned long qseq; WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); - /* rq migration */ + /* internal movements - rq migration / RESTORE */ if (sticky_cpu == cpu_of(rq)) goto local_norefill; /* + * Clear persistent TASK_IMMED for fresh enqueues, see dsq_inc_nr(). + * Note that exiting and migration-disabled tasks that skip + * ops.enqueue() below will lose IMMED protection unless + * %SCX_OPS_ENQ_EXITING / %SCX_OPS_ENQ_MIGRATION_DISABLED are set. + */ + p->scx.flags &= ~SCX_TASK_IMMED; + + /* * If !scx_rq_online(), we already told the BPF scheduler that the CPU * is offline and are just running the hotplug path. Don't bother the * BPF scheduler. @@ -1367,7 +1807,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, if (!scx_rq_online(rq)) goto local; - if (scx_rq_bypassing(rq)) { + if (scx_bypassing(sch, cpu_of(rq))) { __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); goto bypass; } @@ -1402,13 +1842,19 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, WARN_ON_ONCE(*ddsp_taskp); *ddsp_taskp = p; - SCX_CALL_OP_TASK(sch, SCX_KF_ENQUEUE, enqueue, rq, p, enq_flags); + SCX_CALL_OP_TASK(sch, enqueue, rq, p, enq_flags); *ddsp_taskp = NULL; if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) goto direct; /* + * Task is now in BPF scheduler's custody. Set %SCX_TASK_IN_CUSTODY + * so ops.dequeue() is called when it leaves custody. + */ + p->scx.flags |= SCX_TASK_IN_CUSTODY; + + /* * If not directly dispatched, QUEUEING isn't clear yet and dispatch or * dequeue may be waiting. The store_release matches their load_acquire. 
*/ @@ -1419,16 +1865,16 @@ direct: direct_dispatch(sch, p, enq_flags); return; local_norefill: - dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags); + dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, enq_flags); return; local: dsq = &rq->scx.local_dsq; goto enqueue; global: - dsq = find_global_dsq(sch, p); + dsq = find_global_dsq(sch, task_cpu(p)); goto enqueue; bypass: - dsq = &task_rq(p)->scx.bypass_dsq; + dsq = bypass_enq_target_dsq(sch, task_cpu(p)); goto enqueue; enqueue: @@ -1439,7 +1885,8 @@ enqueue: */ touch_core_sched(rq, p); refill_task_slice_dfl(sch, p); - dispatch_enqueue(sch, dsq, p, enq_flags); + clear_direct_dispatch(p); + dispatch_enqueue(sch, rq, dsq, p, enq_flags); } static bool task_runnable(const struct task_struct *p) @@ -1472,16 +1919,13 @@ static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at) static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int core_enq_flags) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); int sticky_cpu = p->scx.sticky_cpu; u64 enq_flags = core_enq_flags | rq->scx.extra_enq_flags; if (enq_flags & ENQUEUE_WAKEUP) rq->scx.flags |= SCX_RQ_IN_WAKEUP; - if (sticky_cpu >= 0) - p->scx.sticky_cpu = -1; - /* * Restoring a running task will be immediately followed by * set_next_task_scx() which expects the task to not be on the BPF @@ -1502,7 +1946,7 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int core_enq_ add_nr_running(rq, 1); if (SCX_HAS_OP(sch, runnable) && !task_on_rq_migrating(p)) - SCX_CALL_OP_TASK(sch, SCX_KF_REST, runnable, rq, p, enq_flags); + SCX_CALL_OP_TASK(sch, runnable, rq, p, enq_flags); if (enq_flags & SCX_ENQ_WAKEUP) touch_core_sched(rq, p); @@ -1512,6 +1956,9 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int core_enq_ dl_server_start(&rq->ext_server); do_enqueue_task(rq, p, enq_flags, sticky_cpu); + + if (sticky_cpu >= 0) + p->scx.sticky_cpu = -1; out: rq->scx.flags &= 
~SCX_RQ_IN_WAKEUP; @@ -1522,7 +1969,7 @@ out: static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); unsigned long opss; /* dequeue is always temporary, don't reset runnable_at */ @@ -1541,10 +1988,8 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags) */ BUG(); case SCX_OPSS_QUEUED: - if (SCX_HAS_OP(sch, dequeue)) - SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq, - p, deq_flags); - + /* A queued task must always be in BPF scheduler's custody */ + WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_IN_CUSTODY)); if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, SCX_OPSS_NONE)) break; @@ -1567,11 +2012,35 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags) BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); break; } + + /* + * Call ops.dequeue() if the task is still in BPF custody. + * + * The code that clears ops_state to %SCX_OPSS_NONE does not always + * clear %SCX_TASK_IN_CUSTODY: in dispatch_to_local_dsq(), when + * we're moving a task that was in %SCX_OPSS_DISPATCHING to a + * remote CPU's local DSQ, we only set ops_state to %SCX_OPSS_NONE + * so that a concurrent dequeue can proceed, but we clear + * %SCX_TASK_IN_CUSTODY only when we later enqueue or move the + * task. So we can see NONE + IN_CUSTODY here and we must handle + * it. Similarly, after waiting on %SCX_OPSS_DISPATCHING we see + * NONE but the task may still have %SCX_TASK_IN_CUSTODY set until + * it is enqueued on the destination. 
+ */ + call_task_dequeue(sch, rq, p, deq_flags); } -static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags) +static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int core_deq_flags) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); + u64 deq_flags = core_deq_flags; + + /* + * Set %SCX_DEQ_SCHED_CHANGE when the dequeue is due to a property + * change (not sleep or core-sched pick). + */ + if (!(deq_flags & (DEQUEUE_SLEEP | SCX_DEQ_CORE_SCHED_EXEC))) + deq_flags |= SCX_DEQ_SCHED_CHANGE; if (!(p->scx.flags & SCX_TASK_QUEUED)) { WARN_ON_ONCE(task_runnable(p)); @@ -1594,11 +2063,11 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags */ if (SCX_HAS_OP(sch, stopping) && task_current(rq, p)) { update_curr_scx(rq); - SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, false); + SCX_CALL_OP_TASK(sch, stopping, rq, p, false); } if (SCX_HAS_OP(sch, quiescent) && !task_on_rq_migrating(p)) - SCX_CALL_OP_TASK(sch, SCX_KF_REST, quiescent, rq, p, deq_flags); + SCX_CALL_OP_TASK(sch, quiescent, rq, p, deq_flags); if (deq_flags & SCX_DEQ_SLEEP) p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP; @@ -1610,32 +2079,56 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags sub_nr_running(rq, 1); dispatch_dequeue(rq, p); + clear_direct_dispatch(p); return true; } static void yield_task_scx(struct rq *rq) { - struct scx_sched *sch = scx_root; struct task_struct *p = rq->donor; + struct scx_sched *sch = scx_task_sched(p); if (SCX_HAS_OP(sch, yield)) - SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, p, NULL); + SCX_CALL_OP_2TASKS_RET(sch, yield, rq, p, NULL); else p->scx.slice = 0; } static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) { - struct scx_sched *sch = scx_root; struct task_struct *from = rq->donor; + struct scx_sched *sch = scx_task_sched(from); - if (SCX_HAS_OP(sch, yield)) - return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, - 
from, to); + if (SCX_HAS_OP(sch, yield) && sch == scx_task_sched(to)) + return SCX_CALL_OP_2TASKS_RET(sch, yield, rq, from, to); else return false; } +static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) +{ + /* + * Preemption between SCX tasks is implemented by resetting the victim + * task's slice to 0 and triggering reschedule on the target CPU. + * Nothing to do. + */ + if (p->sched_class == &ext_sched_class) + return; + + /* + * Getting preempted by a higher-priority class. Reenqueue IMMED tasks. + * This captures all preemption cases including: + * + * - A SCX task is currently running. + * + * - @rq is waking from idle due to a SCX task waking to it. + * + * - A higher-priority wakes up while SCX dispatch is in progress. + */ + if (rq->scx.nr_immed) + schedule_reenq_local(rq, 0); +} + static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct scx_dispatch_q *src_dsq, struct rq *dst_rq) @@ -1653,7 +2146,7 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags, else list_add_tail(&p->scx.dsq_list.node, &dst_dsq->list); - dsq_mod_nr(dst_dsq, 1); + dsq_inc_nr(dst_dsq, p, enq_flags); p->scx.dsq = dst_dsq; local_dsq_post_enq(dst_dsq, p, enq_flags); @@ -1673,10 +2166,13 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, { lockdep_assert_rq_held(src_rq); - /* the following marks @p MIGRATING which excludes dequeue */ + /* + * Set sticky_cpu before deactivate_task() to properly mark the + * beginning of an SCX-internal migration. 
+ */ + p->scx.sticky_cpu = cpu_of(dst_rq); deactivate_task(src_rq, p, 0); set_task_cpu(p, cpu_of(dst_rq)); - p->scx.sticky_cpu = cpu_of(dst_rq); raw_spin_rq_unlock(src_rq); raw_spin_rq_lock(dst_rq); @@ -1716,7 +2212,7 @@ static bool task_can_run_on_remote_rq(struct scx_sched *sch, struct task_struct *p, struct rq *rq, bool enforce) { - int cpu = cpu_of(rq); + s32 cpu = cpu_of(rq); WARN_ON_ONCE(task_cpu(p) == cpu); @@ -1810,13 +2306,14 @@ static bool unlink_dsq_and_lock_src_rq(struct task_struct *p, !WARN_ON_ONCE(src_rq != task_rq(p)); } -static bool consume_remote_task(struct rq *this_rq, struct task_struct *p, +static bool consume_remote_task(struct rq *this_rq, + struct task_struct *p, u64 enq_flags, struct scx_dispatch_q *dsq, struct rq *src_rq) { raw_spin_rq_unlock(this_rq); if (unlink_dsq_and_lock_src_rq(p, dsq, src_rq)) { - move_remote_task_to_local_dsq(p, 0, src_rq, this_rq); + move_remote_task_to_local_dsq(p, enq_flags, src_rq, this_rq); return true; } else { raw_spin_rq_unlock(src_rq); @@ -1856,8 +2353,9 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch, dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); if (src_rq != dst_rq && unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { - dst_dsq = find_global_dsq(sch, p); + dst_dsq = find_global_dsq(sch, task_cpu(p)); dst_rq = src_rq; + enq_flags |= SCX_ENQ_GDSQ_FALLBACK; } } else { /* no need to migrate if destination is a non-local DSQ */ @@ -1888,14 +2386,14 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch, dispatch_dequeue_locked(p, src_dsq); raw_spin_unlock(&src_dsq->lock); - dispatch_enqueue(sch, dst_dsq, p, enq_flags); + dispatch_enqueue(sch, dst_rq, dst_dsq, p, enq_flags); } return dst_rq; } static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq, - struct scx_dispatch_q *dsq) + struct scx_dispatch_q *dsq, u64 enq_flags) { struct task_struct *p; retry: @@ -1920,18 +2418,18 @@ retry: * the system into the bypass mode. 
This can easily live-lock the * machine. If aborting, exit from all non-bypass DSQs. */ - if (unlikely(READ_ONCE(scx_aborting)) && dsq->id != SCX_DSQ_BYPASS) + if (unlikely(READ_ONCE(sch->aborting)) && dsq->id != SCX_DSQ_BYPASS) break; if (rq == task_rq) { task_unlink_from_dsq(p, dsq); - move_local_task_to_local_dsq(p, 0, dsq, rq); + move_local_task_to_local_dsq(p, enq_flags, dsq, rq); raw_spin_unlock(&dsq->lock); return true; } if (task_can_run_on_remote_rq(sch, p, rq, false)) { - if (likely(consume_remote_task(rq, p, dsq, task_rq))) + if (likely(consume_remote_task(rq, p, enq_flags, dsq, task_rq))) return true; goto retry; } @@ -1945,7 +2443,7 @@ static bool consume_global_dsq(struct scx_sched *sch, struct rq *rq) { int node = cpu_to_node(cpu_of(rq)); - return consume_dispatch_q(sch, rq, sch->global_dsqs[node]); + return consume_dispatch_q(sch, rq, &sch->pnode[node]->global_dsq, 0); } /** @@ -1978,15 +2476,15 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq, * If dispatching to @rq that @p is already on, no lock dancing needed. 
*/ if (rq == src_rq && rq == dst_rq) { - dispatch_enqueue(sch, dst_dsq, p, + dispatch_enqueue(sch, rq, dst_dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); return; } if (src_rq != dst_rq && unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { - dispatch_enqueue(sch, find_global_dsq(sch, p), p, - enq_flags | SCX_ENQ_CLEAR_OPSS); + dispatch_enqueue(sch, rq, find_global_dsq(sch, task_cpu(p)), p, + enq_flags | SCX_ENQ_CLEAR_OPSS | SCX_ENQ_GDSQ_FALLBACK); return; } @@ -2023,7 +2521,7 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq, */ if (src_rq == dst_rq) { p->scx.holding_cpu = -1; - dispatch_enqueue(sch, &dst_rq->scx.local_dsq, p, + dispatch_enqueue(sch, dst_rq, &dst_rq->scx.local_dsq, p, enq_flags); } else { move_remote_task_to_local_dsq(p, enq_flags, @@ -2093,6 +2591,12 @@ retry: if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch) return; + /* see SCX_EV_INSERT_NOT_OWNED definition */ + if (unlikely(!scx_task_on_sched(sch, p))) { + __scx_add_event(sch, SCX_EV_INSERT_NOT_OWNED, 1); + return; + } + /* * While we know @p is accessible, we don't yet have a claim on * it - the BPF scheduler is allowed to dispatch tasks @@ -2117,17 +2621,17 @@ retry: BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED)); - dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, p); + dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, task_cpu(p)); if (dsq->id == SCX_DSQ_LOCAL) dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags); else - dispatch_enqueue(sch, dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); + dispatch_enqueue(sch, rq, dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); } static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq) { - struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); + struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; u32 u; for (u = 0; u < dspc->cursor; u++) { @@ -2154,67 +2658,54 @@ static inline void maybe_queue_balance_callback(struct rq *rq) rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING; } -static int balance_one(struct rq *rq, struct 
task_struct *prev) +/* + * One user of this function is scx_bpf_dispatch() which can be called + * recursively as sub-sched dispatches nest. Always inline to reduce stack usage + * from the call frame. + */ +static __always_inline bool +scx_dispatch_sched(struct scx_sched *sch, struct rq *rq, + struct task_struct *prev, bool nested) { - struct scx_sched *sch = scx_root; - struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); - bool prev_on_scx = prev->sched_class == &ext_sched_class; - bool prev_on_rq = prev->scx.flags & SCX_TASK_QUEUED; + struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; int nr_loops = SCX_DSP_MAX_LOOPS; + s32 cpu = cpu_of(rq); + bool prev_on_sch = (prev->sched_class == &ext_sched_class) && + scx_task_on_sched(sch, prev); - lockdep_assert_rq_held(rq); - rq->scx.flags |= SCX_RQ_IN_BALANCE; - rq->scx.flags &= ~SCX_RQ_BAL_KEEP; - - if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) && - unlikely(rq->scx.cpu_released)) { - /* - * If the previous sched_class for the current CPU was not SCX, - * notify the BPF scheduler that it again has control of the - * core. This callback complements ->cpu_release(), which is - * emitted in switch_class(). - */ - if (SCX_HAS_OP(sch, cpu_acquire)) - SCX_CALL_OP(sch, SCX_KF_REST, cpu_acquire, rq, - cpu_of(rq), NULL); - rq->scx.cpu_released = false; - } + if (consume_global_dsq(sch, rq)) + return true; - if (prev_on_scx) { - update_curr_scx(rq); + if (bypass_dsp_enabled(sch)) { + /* if @sch is bypassing, only the bypass DSQs are active */ + if (scx_bypassing(sch, cpu)) + return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu), 0); +#ifdef CONFIG_EXT_SUB_SCHED /* - * If @prev is runnable & has slice left, it has priority and - * fetching more just increases latency for the fetched tasks. - * Tell pick_task_scx() to keep running @prev. If the BPF - * scheduler wants to handle this explicitly, it should - * implement ->cpu_release(). 
+ * If @sch isn't bypassing but its children are, @sch is + * responsible for making forward progress for both its own + * tasks that aren't bypassing and the bypassing descendants' + * tasks. The following implements a simple built-in behavior - + * let each CPU try to run the bypass DSQ every Nth time. * - * See scx_disable_workfn() for the explanation on the bypassing - * test. + * Later, if necessary, we can add an ops flag to suppress the + * auto-consumption and a kfunc to consume the bypass DSQ and, + * so that the BPF scheduler can fully control scheduling of + * bypassed tasks. */ - if (prev_on_rq && prev->scx.slice && !scx_rq_bypassing(rq)) { - rq->scx.flags |= SCX_RQ_BAL_KEEP; - goto has_tasks; - } - } - - /* if there already are tasks to run, nothing to do */ - if (rq->scx.local_dsq.nr) - goto has_tasks; - - if (consume_global_dsq(sch, rq)) - goto has_tasks; + struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu); - if (scx_rq_bypassing(rq)) { - if (consume_dispatch_q(sch, rq, &rq->scx.bypass_dsq)) - goto has_tasks; - else - goto no_tasks; + if (!(pcpu->bypass_host_seq++ % SCX_BYPASS_HOST_NTH) && + consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu), 0)) { + __scx_add_event(sch, SCX_EV_SUB_BYPASS_DISPATCH, 1); + return true; + } +#endif /* CONFIG_EXT_SUB_SCHED */ } if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq)) - goto no_tasks; + return false; dspc->rq = rq; @@ -2228,19 +2719,25 @@ static int balance_one(struct rq *rq, struct task_struct *prev) do { dspc->nr_tasks = 0; - SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq, - cpu_of(rq), prev_on_scx ? prev : NULL); + if (nested) { + SCX_CALL_OP(sch, dispatch, rq, cpu, prev_on_sch ? prev : NULL); + } else { + /* stash @prev so that nested invocations can access it */ + rq->scx.sub_dispatch_prev = prev; + SCX_CALL_OP(sch, dispatch, rq, cpu, prev_on_sch ? 
prev : NULL); + rq->scx.sub_dispatch_prev = NULL; + } flush_dispatch_buf(sch, rq); - if (prev_on_rq && prev->scx.slice) { + if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice) { rq->scx.flags |= SCX_RQ_BAL_KEEP; - goto has_tasks; + return true; } if (rq->scx.local_dsq.nr) - goto has_tasks; + return true; if (consume_global_dsq(sch, rq)) - goto has_tasks; + return true; /* * ops.dispatch() can trap us in this loop by repeatedly @@ -2249,21 +2746,80 @@ static int balance_one(struct rq *rq, struct task_struct *prev) * balance(), we want to complete this scheduling cycle and then * start a new one. IOW, we want to call resched_curr() on the * next, most likely idle, task, not the current one. Use - * scx_kick_cpu() for deferred kicking. + * __scx_bpf_kick_cpu() for deferred kicking. */ if (unlikely(!--nr_loops)) { - scx_kick_cpu(sch, cpu_of(rq), 0); + scx_kick_cpu(sch, cpu, 0); break; } } while (dspc->nr_tasks); -no_tasks: + /* + * Prevent the CPU from going idle while bypassed descendants have tasks + * queued. Without this fallback, bypassed tasks could stall if the host + * scheduler's ops.dispatch() doesn't yield any tasks. + */ + if (bypass_dsp_enabled(sch)) + return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu), 0); + + return false; +} + +static int balance_one(struct rq *rq, struct task_struct *prev) +{ + struct scx_sched *sch = scx_root; + s32 cpu = cpu_of(rq); + + lockdep_assert_rq_held(rq); + rq->scx.flags |= SCX_RQ_IN_BALANCE; + rq->scx.flags &= ~SCX_RQ_BAL_KEEP; + + if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) && + unlikely(rq->scx.cpu_released)) { + /* + * If the previous sched_class for the current CPU was not SCX, + * notify the BPF scheduler that it again has control of the + * core. This callback complements ->cpu_release(), which is + * emitted in switch_class(). 
+ */ + if (SCX_HAS_OP(sch, cpu_acquire)) + SCX_CALL_OP(sch, cpu_acquire, rq, cpu, NULL); + rq->scx.cpu_released = false; + } + + if (prev->sched_class == &ext_sched_class) { + update_curr_scx(rq); + + /* + * If @prev is runnable & has slice left, it has priority and + * fetching more just increases latency for the fetched tasks. + * Tell pick_task_scx() to keep running @prev. If the BPF + * scheduler wants to handle this explicitly, it should + * implement ->cpu_release(). + * + * See scx_disable_workfn() for the explanation on the bypassing + * test. + */ + if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice && + !scx_bypassing(sch, cpu)) { + rq->scx.flags |= SCX_RQ_BAL_KEEP; + goto has_tasks; + } + } + + /* if there already are tasks to run, nothing to do */ + if (rq->scx.local_dsq.nr) + goto has_tasks; + + if (scx_dispatch_sched(sch, rq, prev, false)) + goto has_tasks; + /* * Didn't find another task to run. Keep running @prev unless * %SCX_OPS_ENQ_LAST is in effect. */ - if (prev_on_rq && - (!(sch->ops.flags & SCX_OPS_ENQ_LAST) || scx_rq_bypassing(rq))) { + if ((prev->scx.flags & SCX_TASK_QUEUED) && + (!(sch->ops.flags & SCX_OPS_ENQ_LAST) || scx_bypassing(sch, cpu))) { rq->scx.flags |= SCX_RQ_BAL_KEEP; __scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1); goto has_tasks; @@ -2272,40 +2828,26 @@ no_tasks: return false; has_tasks: - rq->scx.flags &= ~SCX_RQ_IN_BALANCE; - return true; -} - -static void process_ddsp_deferred_locals(struct rq *rq) -{ - struct task_struct *p; - - lockdep_assert_rq_held(rq); - /* - * Now that @rq can be unlocked, execute the deferred enqueueing of - * tasks directly dispatched to the local DSQs of other CPUs. See - * direct_dispatch(). Keep popping from the head instead of using - * list_for_each_entry_safe() as dispatch_local_dsq() may unlock @rq - * temporarily. 
+ * @rq may have extra IMMED tasks without reenq scheduled: + * + * - rq_is_open() can't reliably tell when and how slice is going to be + * modified for $curr and allows IMMED tasks to be queued while + * dispatch is in progress. + * + * - A non-IMMED HEAD task can get queued in front of an IMMED task + * between the IMMED queueing and the subsequent scheduling event. */ - while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals, - struct task_struct, scx.dsq_list.node))) { - struct scx_sched *sch = scx_root; - struct scx_dispatch_q *dsq; + if (unlikely(rq->scx.local_dsq.nr > 1 && rq->scx.nr_immed)) + schedule_reenq_local(rq, 0); - list_del_init(&p->scx.dsq_list.node); - - dsq = find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p); - if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL)) - dispatch_to_local_dsq(sch, rq, dsq, p, - p->scx.ddsp_enq_flags); - } + rq->scx.flags &= ~SCX_RQ_IN_BALANCE; + return true; } static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); if (p->scx.flags & SCX_TASK_QUEUED) { /* @@ -2320,7 +2862,7 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) /* see dequeue_task_scx() on why we skip when !QUEUED */ if (SCX_HAS_OP(sch, running) && (p->scx.flags & SCX_TASK_QUEUED)) - SCX_CALL_OP_TASK(sch, SCX_KF_REST, running, rq, p); + SCX_CALL_OP_TASK(sch, running, rq, p); clr_task_runnable(p, true); @@ -2392,8 +2934,7 @@ static void switch_class(struct rq *rq, struct task_struct *next) .task = next, }; - SCX_CALL_OP(sch, SCX_KF_CPU_RELEASE, cpu_release, rq, - cpu_of(rq), &args); + SCX_CALL_OP(sch, cpu_release, rq, cpu_of(rq), &args); } rq->scx.cpu_released = true; } @@ -2402,16 +2943,16 @@ static void switch_class(struct rq *rq, struct task_struct *next) static void put_prev_task_scx(struct rq *rq, struct task_struct *p, struct task_struct *next) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = 
scx_task_sched(p); - /* see kick_cpus_irq_workfn() */ + /* see kick_sync_wait_bal_cb() */ smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); update_curr_scx(rq); /* see dequeue_task_scx() on why we skip when !QUEUED */ if (SCX_HAS_OP(sch, stopping) && (p->scx.flags & SCX_TASK_QUEUED)) - SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, true); + SCX_CALL_OP_TASK(sch, stopping, rq, p, true); if (p->scx.flags & SCX_TASK_QUEUED) { set_task_runnable(rq, p); @@ -2420,11 +2961,17 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, * If @p has slice left and is being put, @p is getting * preempted by a higher priority scheduler class or core-sched * forcing a different task. Leave it at the head of the local - * DSQ. + * DSQ unless it was an IMMED task. IMMED tasks should not + * linger on a busy CPU, reenqueue them to the BPF scheduler. */ - if (p->scx.slice && !scx_rq_bypassing(rq)) { - dispatch_enqueue(sch, &rq->scx.local_dsq, p, - SCX_ENQ_HEAD); + if (p->scx.slice && !scx_bypassing(sch, cpu_of(rq))) { + if (p->scx.flags & SCX_TASK_IMMED) { + p->scx.flags |= SCX_TASK_REENQ_PREEMPTED; + do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); + p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; + } else { + dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, SCX_ENQ_HEAD); + } goto switch_class; } @@ -2447,6 +2994,48 @@ switch_class: switch_class(rq, next); } +static void kick_sync_wait_bal_cb(struct rq *rq) +{ + struct scx_kick_syncs __rcu *ks = __this_cpu_read(scx_kick_syncs); + unsigned long *ksyncs = rcu_dereference_sched(ks)->syncs; + bool waited; + s32 cpu; + + /* + * Drop rq lock and enable IRQs while waiting. IRQs must be enabled + * — a target CPU may be waiting for us to process an IPI (e.g. TLB + * flush) while we wait for its kick_sync to advance. + * + * Also, keep advancing our own kick_sync so that new kick_sync waits + * targeting us, which can start after we drop the lock, cannot form + * cyclic dependencies. 
+ */ +retry: + waited = false; + for_each_cpu(cpu, rq->scx.cpus_to_sync) { + /* + * smp_load_acquire() pairs with smp_store_release() on + * kick_sync updates on the target CPUs. + */ + if (cpu == cpu_of(rq) || + smp_load_acquire(&cpu_rq(cpu)->scx.kick_sync) != ksyncs[cpu]) { + cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync); + continue; + } + + raw_spin_rq_unlock_irq(rq); + while (READ_ONCE(cpu_rq(cpu)->scx.kick_sync) == ksyncs[cpu]) { + smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); + cpu_relax(); + } + raw_spin_rq_lock_irq(rq); + waited = true; + } + + if (waited) + goto retry; +} + static struct task_struct *first_local_task(struct rq *rq) { return list_first_entry_or_null(&rq->scx.local_dsq.list, @@ -2460,7 +3049,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) bool keep_prev; struct task_struct *p; - /* see kick_cpus_irq_workfn() */ + /* see kick_sync_wait_bal_cb() */ smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); rq_modified_begin(rq, &ext_sched_class); @@ -2471,6 +3060,17 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) maybe_queue_balance_callback(rq); /* + * Defer to a balance callback which can drop rq lock and enable + * IRQs. Waiting directly in the pick path would deadlock against + * CPUs sending us IPIs (e.g. TLB flushes) while we wait for them. + */ + if (unlikely(rq->scx.kick_sync_pending)) { + rq->scx.kick_sync_pending = false; + queue_balance_callback(rq, &rq->scx.kick_sync_bal_cb, + kick_sync_wait_bal_cb); + } + + /* * If any higher-priority sched class enqueued a runnable task on * this rq during balance_one(), abort and return RETRY_TASK, so * that the scheduler loop can restart. 
@@ -2496,16 +3096,17 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) if (keep_prev) { p = prev; if (!p->scx.slice) - refill_task_slice_dfl(rcu_dereference_sched(scx_root), p); + refill_task_slice_dfl(scx_task_sched(p), p); } else { p = first_local_task(rq); if (!p) return NULL; if (unlikely(!p->scx.slice)) { - struct scx_sched *sch = rcu_dereference_sched(scx_root); + struct scx_sched *sch = scx_task_sched(p); - if (!scx_rq_bypassing(rq) && !sch->warned_zero_slice) { + if (!scx_bypassing(sch, cpu_of(rq)) && + !sch->warned_zero_slice) { printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", p->comm, p->pid, __func__); sch->warned_zero_slice = true; @@ -2571,16 +3172,17 @@ void ext_server_init(struct rq *rq) bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, bool in_fi) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch_a = scx_task_sched(a); + struct scx_sched *sch_b = scx_task_sched(b); /* * The const qualifiers are dropped from task_struct pointers when * calling ops.core_sched_before(). Accesses are controlled by the * verifier. 
*/ - if (SCX_HAS_OP(sch, core_sched_before) && - !scx_rq_bypassing(task_rq(a))) - return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, core_sched_before, + if (sch_a == sch_b && SCX_HAS_OP(sch_a, core_sched_before) && + !scx_bypassing(sch_a, task_cpu(a))) + return SCX_CALL_OP_2TASKS_RET(sch_a, core_sched_before, NULL, (struct task_struct *)a, (struct task_struct *)b); @@ -2591,8 +3193,8 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) { - struct scx_sched *sch = scx_root; - bool rq_bypass; + struct scx_sched *sch = scx_task_sched(p); + bool bypassing; /* * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it @@ -2607,8 +3209,8 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag if (unlikely(wake_flags & WF_EXEC)) return prev_cpu; - rq_bypass = scx_rq_bypassing(task_rq(p)); - if (likely(SCX_HAS_OP(sch, select_cpu)) && !rq_bypass) { + bypassing = scx_bypassing(sch, task_cpu(p)); + if (likely(SCX_HAS_OP(sch, select_cpu)) && !bypassing) { s32 cpu; struct task_struct **ddsp_taskp; @@ -2616,10 +3218,9 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag WARN_ON_ONCE(*ddsp_taskp); *ddsp_taskp = p; - cpu = SCX_CALL_OP_TASK_RET(sch, - SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, - select_cpu, NULL, p, prev_cpu, - wake_flags); + this_rq()->scx.in_select_cpu = true; + cpu = SCX_CALL_OP_TASK_RET(sch, select_cpu, NULL, p, prev_cpu, wake_flags); + this_rq()->scx.in_select_cpu = false; p->scx.selected_cpu = cpu; *ddsp_taskp = NULL; if (ops_cpu_valid(sch, cpu, "from ops.select_cpu()")) @@ -2638,7 +3239,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag } p->scx.selected_cpu = cpu; - if (rq_bypass) + if (bypassing) __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); return cpu; } @@ -2652,7 +3253,7 @@ static void task_woken_scx(struct rq *rq, struct task_struct *p) static void 
set_cpus_allowed_scx(struct task_struct *p, struct affinity_context *ac) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); set_cpus_allowed_common(p, ac); @@ -2668,14 +3269,13 @@ static void set_cpus_allowed_scx(struct task_struct *p, * designation pointless. Cast it away when calling the operation. */ if (SCX_HAS_OP(sch, set_cpumask)) - SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, NULL, - p, (struct cpumask *)p->cpus_ptr); + SCX_CALL_OP_TASK(sch, set_cpumask, task_rq(p), p, (struct cpumask *)p->cpus_ptr); } static void handle_hotplug(struct rq *rq, bool online) { struct scx_sched *sch = scx_root; - int cpu = cpu_of(rq); + s32 cpu = cpu_of(rq); atomic_long_inc(&scx_hotplug_seq); @@ -2691,9 +3291,9 @@ static void handle_hotplug(struct rq *rq, bool online) scx_idle_update_selcpu_topology(&sch->ops); if (online && SCX_HAS_OP(sch, cpu_online)) - SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_online, NULL, cpu); + SCX_CALL_OP(sch, cpu_online, NULL, cpu); else if (!online && SCX_HAS_OP(sch, cpu_offline)) - SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_offline, NULL, cpu); + SCX_CALL_OP(sch, cpu_offline, NULL, cpu); else scx_exit(sch, SCX_EXIT_UNREG_KERN, SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, @@ -2721,7 +3321,6 @@ static void rq_offline_scx(struct rq *rq) rq->scx.flags &= ~SCX_RQ_ONLINE; } - static bool check_rq_for_timeouts(struct rq *rq) { struct scx_sched *sch; @@ -2735,10 +3334,11 @@ static bool check_rq_for_timeouts(struct rq *rq) goto out_unlock; list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) { + struct scx_sched *sch = scx_task_sched(p); unsigned long last_runnable = p->scx.runnable_at; if (unlikely(time_after(jiffies, - last_runnable + READ_ONCE(scx_watchdog_timeout)))) { + last_runnable + READ_ONCE(sch->watchdog_timeout)))) { u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, @@ -2755,6 +3355,7 @@ out_unlock: static void scx_watchdog_workfn(struct work_struct *work) { + 
unsigned long intv; int cpu; WRITE_ONCE(scx_watchdog_timestamp, jiffies); @@ -2765,28 +3366,30 @@ static void scx_watchdog_workfn(struct work_struct *work) cond_resched(); } - queue_delayed_work(system_unbound_wq, to_delayed_work(work), - READ_ONCE(scx_watchdog_timeout) / 2); + + intv = READ_ONCE(scx_watchdog_interval); + if (intv < ULONG_MAX) + queue_delayed_work(system_dfl_wq, to_delayed_work(work), intv); } void scx_tick(struct rq *rq) { - struct scx_sched *sch; + struct scx_sched *root; unsigned long last_check; if (!scx_enabled()) return; - sch = rcu_dereference_bh(scx_root); - if (unlikely(!sch)) + root = rcu_dereference_bh(scx_root); + if (unlikely(!root)) return; last_check = READ_ONCE(scx_watchdog_timestamp); if (unlikely(time_after(jiffies, - last_check + READ_ONCE(scx_watchdog_timeout)))) { + last_check + READ_ONCE(root->watchdog_timeout)))) { u32 dur_ms = jiffies_to_msecs(jiffies - last_check); - scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, + scx_exit(root, SCX_EXIT_ERROR_STALL, 0, "watchdog failed to check in for %u.%03us", dur_ms / 1000, dur_ms % 1000); } @@ -2796,7 +3399,7 @@ void scx_tick(struct rq *rq) static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(curr); update_curr_scx(rq); @@ -2804,11 +3407,11 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) * While disabling, always resched and refresh core-sched timestamp as * we can't trust the slice management or ops.core_sched_before(). 
*/ - if (scx_rq_bypassing(rq)) { + if (scx_bypassing(sch, cpu_of(rq))) { curr->scx.slice = 0; touch_core_sched(rq, curr); } else if (SCX_HAS_OP(sch, tick)) { - SCX_CALL_OP_TASK(sch, SCX_KF_REST, tick, rq, curr); + SCX_CALL_OP_TASK(sch, tick, rq, curr); } if (!curr->scx.slice) @@ -2837,18 +3440,16 @@ static struct cgroup *tg_cgrp(struct task_group *tg) #endif /* CONFIG_EXT_GROUP_SCHED */ -static enum scx_task_state scx_get_task_state(const struct task_struct *p) +static u32 scx_get_task_state(const struct task_struct *p) { - return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT; + return p->scx.flags & SCX_TASK_STATE_MASK; } -static void scx_set_task_state(struct task_struct *p, enum scx_task_state state) +static void scx_set_task_state(struct task_struct *p, u32 state) { - enum scx_task_state prev_state = scx_get_task_state(p); + u32 prev_state = scx_get_task_state(p); bool warn = false; - BUILD_BUG_ON(SCX_TASK_NR_STATES > (1 << SCX_TASK_STATE_BITS)); - switch (state) { case SCX_TASK_NONE: break; @@ -2862,42 +3463,45 @@ static void scx_set_task_state(struct task_struct *p, enum scx_task_state state) warn = prev_state != SCX_TASK_READY; break; default: - warn = true; + WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]", + prev_state, state, p->comm, p->pid); return; } - WARN_ONCE(warn, "sched_ext: Invalid task state transition %d -> %d for %s[%d]", + WARN_ONCE(warn, "sched_ext: Invalid task state transition 0x%x -> 0x%x for %s[%d]", prev_state, state, p->comm, p->pid); p->scx.flags &= ~SCX_TASK_STATE_MASK; - p->scx.flags |= state << SCX_TASK_STATE_SHIFT; + p->scx.flags |= state; } -static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork) +static int __scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork) { - struct scx_sched *sch = scx_root; int ret; p->scx.disallow = false; if (SCX_HAS_OP(sch, init_task)) { struct scx_init_task_args args = { - SCX_INIT_TASK_ARGS_CGROUP(tg) + 
SCX_INIT_TASK_ARGS_CGROUP(task_group(p)) .fork = fork, }; - ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init_task, NULL, - p, &args); + ret = SCX_CALL_OP_RET(sch, init_task, NULL, p, &args); if (unlikely(ret)) { ret = ops_sanitize_err(sch, "init_task", ret); return ret; } } - scx_set_task_state(p, SCX_TASK_INIT); - if (p->scx.disallow) { - if (!fork) { + if (unlikely(scx_parent(sch))) { + scx_error(sch, "non-root ops.init_task() set task->scx.disallow for %s[%d]", + p->comm, p->pid); + } else if (unlikely(fork)) { + scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork", + p->comm, p->pid); + } else { struct rq *rq; struct rq_flags rf; @@ -2916,25 +3520,43 @@ static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork } task_rq_unlock(rq, p, &rf); - } else if (p->policy == SCHED_EXT) { - scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork", - p->comm, p->pid); } } - p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; return 0; } -static void scx_enable_task(struct task_struct *p) +static int scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork) +{ + int ret; + + ret = __scx_init_task(sch, p, fork); + if (!ret) { + /* + * While @p's rq is not locked. @p is not visible to the rest of + * SCX yet and it's safe to update the flags and state. + */ + p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; + scx_set_task_state(p, SCX_TASK_INIT); + } + return ret; +} + +static void __scx_enable_task(struct scx_sched *sch, struct task_struct *p) { - struct scx_sched *sch = scx_root; struct rq *rq = task_rq(p); u32 weight; lockdep_assert_rq_held(rq); /* + * Verify the task is not in BPF scheduler's custody. If flag + * transitions are consistent, the flag should always be clear + * here. + */ + WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY); + + /* * Set the weight before calling ops.enable() so that the scheduler * doesn't see a stale value if they inspect the task struct. 
*/ @@ -2946,34 +3568,47 @@ static void scx_enable_task(struct task_struct *p) p->scx.weight = sched_weight_to_cgroup(weight); if (SCX_HAS_OP(sch, enable)) - SCX_CALL_OP_TASK(sch, SCX_KF_REST, enable, rq, p); - scx_set_task_state(p, SCX_TASK_ENABLED); + SCX_CALL_OP_TASK(sch, enable, rq, p); if (SCX_HAS_OP(sch, set_weight)) - SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq, - p, p->scx.weight); + SCX_CALL_OP_TASK(sch, set_weight, rq, p, p->scx.weight); } -static void scx_disable_task(struct task_struct *p) +static void scx_enable_task(struct scx_sched *sch, struct task_struct *p) +{ + __scx_enable_task(sch, p); + scx_set_task_state(p, SCX_TASK_ENABLED); +} + +static void scx_disable_task(struct scx_sched *sch, struct task_struct *p) { - struct scx_sched *sch = scx_root; struct rq *rq = task_rq(p); lockdep_assert_rq_held(rq); WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); + clear_direct_dispatch(p); + if (SCX_HAS_OP(sch, disable)) - SCX_CALL_OP_TASK(sch, SCX_KF_REST, disable, rq, p); + SCX_CALL_OP_TASK(sch, disable, rq, p); scx_set_task_state(p, SCX_TASK_READY); + + /* + * Verify the task is not in BPF scheduler's custody. If flag + * transitions are consistent, the flag should always be clear + * here. 
+ */ + WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY); } -static void scx_exit_task(struct task_struct *p) +static void __scx_disable_and_exit_task(struct scx_sched *sch, + struct task_struct *p) { - struct scx_sched *sch = scx_root; struct scx_exit_task_args args = { .cancelled = false, }; + lockdep_assert_held(&p->pi_lock); lockdep_assert_rq_held(task_rq(p)); switch (scx_get_task_state(p)) { @@ -2985,7 +3620,7 @@ static void scx_exit_task(struct task_struct *p) case SCX_TASK_READY: break; case SCX_TASK_ENABLED: - scx_disable_task(p); + scx_disable_task(sch, p); break; default: WARN_ON_ONCE(true); @@ -2993,8 +3628,26 @@ static void scx_exit_task(struct task_struct *p) } if (SCX_HAS_OP(sch, exit_task)) - SCX_CALL_OP_TASK(sch, SCX_KF_REST, exit_task, task_rq(p), - p, &args); + SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args); +} + +static void scx_disable_and_exit_task(struct scx_sched *sch, + struct task_struct *p) +{ + __scx_disable_and_exit_task(sch, p); + + /* + * If set, @p exited between __scx_init_task() and scx_enable_task() in + * scx_sub_enable() and is initialized for both the associated sched and + * its parent. Disable and exit for the child too. 
+ */ + if ((p->scx.flags & SCX_TASK_SUB_INIT) && + !WARN_ON_ONCE(!scx_enabling_sub_sched)) { + __scx_disable_and_exit_task(scx_enabling_sub_sched, p); + p->scx.flags &= ~SCX_TASK_SUB_INIT; + } + + scx_set_task_sched(p, NULL); scx_set_task_state(p, SCX_TASK_NONE); } @@ -3008,7 +3661,7 @@ void init_scx_entity(struct sched_ext_entity *scx) INIT_LIST_HEAD(&scx->runnable_node); scx->runnable_at = jiffies; scx->ddsp_dsq_id = SCX_DSQ_INVALID; - scx->slice = READ_ONCE(scx_slice_dfl); + scx->slice = SCX_SLICE_DFL; } void scx_pre_fork(struct task_struct *p) @@ -3022,14 +3675,25 @@ void scx_pre_fork(struct task_struct *p) percpu_down_read(&scx_fork_rwsem); } -int scx_fork(struct task_struct *p) +int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs) { + s32 ret; + percpu_rwsem_assert_held(&scx_fork_rwsem); - if (scx_init_task_enabled) - return scx_init_task(p, task_group(p), true); - else - return 0; + if (scx_init_task_enabled) { +#ifdef CONFIG_EXT_SUB_SCHED + struct scx_sched *sch = kargs->cset->dfl_cgrp->scx_sched; +#else + struct scx_sched *sch = scx_root; +#endif + ret = scx_init_task(sch, p, true); + if (!ret) + scx_set_task_sched(p, sch); + return ret; + } + + return 0; } void scx_post_fork(struct task_struct *p) @@ -3047,7 +3711,7 @@ void scx_post_fork(struct task_struct *p) struct rq *rq; rq = task_rq_lock(p, &rf); - scx_enable_task(p); + scx_enable_task(scx_task_sched(p), p); task_rq_unlock(rq, p, &rf); } } @@ -3067,7 +3731,7 @@ void scx_cancel_fork(struct task_struct *p) rq = task_rq_lock(p, &rf); WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY); - scx_exit_task(p); + scx_disable_and_exit_task(scx_task_sched(p), p); task_rq_unlock(rq, p, &rf); } @@ -3118,15 +3782,15 @@ void sched_ext_dead(struct task_struct *p) raw_spin_unlock_irqrestore(&scx_tasks_lock, flags); /* - * @p is off scx_tasks and wholly ours. scx_enable()'s READY -> ENABLED - * transitions can't race us. Disable ops for @p. + * @p is off scx_tasks and wholly ours. 
scx_root_enable()'s READY -> + * ENABLED transitions can't race us. Disable ops for @p. */ if (scx_get_task_state(p) != SCX_TASK_NONE) { struct rq_flags rf; struct rq *rq; rq = task_rq_lock(p, &rf); - scx_exit_task(p); + scx_disable_and_exit_task(scx_task_sched(p), p); task_rq_unlock(rq, p, &rf); } } @@ -3134,7 +3798,7 @@ void sched_ext_dead(struct task_struct *p) static void reweight_task_scx(struct rq *rq, struct task_struct *p, const struct load_weight *lw) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); lockdep_assert_rq_held(task_rq(p)); @@ -3143,8 +3807,7 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p, p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight)); if (SCX_HAS_OP(sch, set_weight)) - SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq, - p, p->scx.weight); + SCX_CALL_OP_TASK(sch, set_weight, rq, p, p->scx.weight); } static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio) @@ -3153,20 +3816,19 @@ static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio) static void switching_to_scx(struct rq *rq, struct task_struct *p) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = scx_task_sched(p); if (task_dead_and_done(p)) return; - scx_enable_task(p); + scx_enable_task(sch, p); /* * set_cpus_allowed_scx() is not called while @p is associated with a * different scheduler class. Keep the BPF scheduler up-to-date. 
*/ if (SCX_HAS_OP(sch, set_cpumask)) - SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, rq, - p, (struct cpumask *)p->cpus_ptr); + SCX_CALL_OP_TASK(sch, set_cpumask, rq, p, (struct cpumask *)p->cpus_ptr); } static void switched_from_scx(struct rq *rq, struct task_struct *p) @@ -3174,11 +3836,9 @@ static void switched_from_scx(struct rq *rq, struct task_struct *p) if (task_dead_and_done(p)) return; - scx_disable_task(p); + scx_disable_task(scx_task_sched(p), p); } -static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {} - static void switched_to_scx(struct rq *rq, struct task_struct *p) {} int scx_check_setscheduler(struct task_struct *p, int policy) @@ -3193,17 +3853,327 @@ int scx_check_setscheduler(struct task_struct *p, int policy) return 0; } +static void process_ddsp_deferred_locals(struct rq *rq) +{ + struct task_struct *p; + + lockdep_assert_rq_held(rq); + + /* + * Now that @rq can be unlocked, execute the deferred enqueueing of + * tasks directly dispatched to the local DSQs of other CPUs. See + * direct_dispatch(). Keep popping from the head instead of using + * list_for_each_entry_safe() as dispatch_local_dsq() may unlock @rq + * temporarily. + */ + while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals, + struct task_struct, scx.dsq_list.node))) { + struct scx_sched *sch = scx_task_sched(p); + struct scx_dispatch_q *dsq; + u64 dsq_id = p->scx.ddsp_dsq_id; + u64 enq_flags = p->scx.ddsp_enq_flags; + + list_del_init(&p->scx.dsq_list.node); + clear_direct_dispatch(p); + + dsq = find_dsq_for_dispatch(sch, rq, dsq_id, task_cpu(p)); + if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL)) + dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags); + } +} + +/* + * Determine whether @p should be reenqueued from a local DSQ. + * + * @reenq_flags is mutable and accumulates state across the DSQ walk: + * + * - %SCX_REENQ_TSR_NOT_FIRST: Set after the first task is visited. "First" + * tracks position in the DSQ list, not among IMMED tasks. 
A non-IMMED task at + * the head consumes the first slot. + * + * - %SCX_REENQ_TSR_RQ_OPEN: Set by reenq_local() before the walk if + * rq_is_open() is true. + * + * An IMMED task is kept (returns %false) only if it's the first task in the DSQ + * AND the current task is done — i.e. it will execute immediately. All other + * IMMED tasks are reenqueued. This means if a non-IMMED task sits at the head, + * every IMMED task behind it gets reenqueued. + * + * Reenqueued tasks go through ops.enqueue() with %SCX_ENQ_REENQ | + * %SCX_TASK_REENQ_IMMED. If the BPF scheduler dispatches back to the same local + * DSQ with %SCX_ENQ_IMMED while the CPU is still unavailable, this triggers + * another reenq cycle. Repetitions are bounded by %SCX_REENQ_LOCAL_MAX_REPEAT + * in process_deferred_reenq_locals(). + */ +static bool local_task_should_reenq(struct task_struct *p, u64 *reenq_flags, u32 *reason) +{ + bool first; + + first = !(*reenq_flags & SCX_REENQ_TSR_NOT_FIRST); + *reenq_flags |= SCX_REENQ_TSR_NOT_FIRST; + + *reason = SCX_TASK_REENQ_KFUNC; + + if ((p->scx.flags & SCX_TASK_IMMED) && + (!first || !(*reenq_flags & SCX_REENQ_TSR_RQ_OPEN))) { + __scx_add_event(scx_task_sched(p), SCX_EV_REENQ_IMMED, 1); + *reason = SCX_TASK_REENQ_IMMED; + return true; + } + + return *reenq_flags & SCX_REENQ_ANY; +} + +static u32 reenq_local(struct scx_sched *sch, struct rq *rq, u64 reenq_flags) +{ + LIST_HEAD(tasks); + u32 nr_enqueued = 0; + struct task_struct *p, *n; + + lockdep_assert_rq_held(rq); + + if (WARN_ON_ONCE(reenq_flags & __SCX_REENQ_TSR_MASK)) + reenq_flags &= ~__SCX_REENQ_TSR_MASK; + if (rq_is_open(rq, 0)) + reenq_flags |= SCX_REENQ_TSR_RQ_OPEN; + + /* + * The BPF scheduler may choose to dispatch tasks back to + * @rq->scx.local_dsq. Move all candidate tasks off to a private list + * first to avoid processing the same tasks repeatedly. 
+ */ + list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list, + scx.dsq_list.node) { + struct scx_sched *task_sch = scx_task_sched(p); + u32 reason; + + /* + * If @p is being migrated, @p's current CPU may not agree with + * its allowed CPUs and the migration_cpu_stop is about to + * deactivate and re-activate @p anyway. Skip re-enqueueing. + * + * While racing sched property changes may also dequeue and + * re-enqueue a migrating task while its current CPU and allowed + * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to + * the current local DSQ for running tasks and thus are not + * visible to the BPF scheduler. + */ + if (p->migration_pending) + continue; + + if (!scx_is_descendant(task_sch, sch)) + continue; + + if (!local_task_should_reenq(p, &reenq_flags, &reason)) + continue; + + dispatch_dequeue(rq, p); + + if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK)) + p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; + p->scx.flags |= reason; + + list_add_tail(&p->scx.dsq_list.node, &tasks); + } + + list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) { + list_del_init(&p->scx.dsq_list.node); + + do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); + + p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; + nr_enqueued++; + } + + return nr_enqueued; +} + +static void process_deferred_reenq_locals(struct rq *rq) +{ + u64 seq = ++rq->scx.deferred_reenq_locals_seq; + + lockdep_assert_rq_held(rq); + + while (true) { + struct scx_sched *sch; + u64 reenq_flags; + bool skip = false; + + scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) { + struct scx_deferred_reenq_local *drl = + list_first_entry_or_null(&rq->scx.deferred_reenq_locals, + struct scx_deferred_reenq_local, + node); + struct scx_sched_pcpu *sch_pcpu; + + if (!drl) + return; + + sch_pcpu = container_of(drl, struct scx_sched_pcpu, + deferred_reenq_local); + sch = sch_pcpu->sch; + + reenq_flags = drl->flags; + WRITE_ONCE(drl->flags, 0); + list_del_init(&drl->node); + + if (likely(drl->seq != seq)) { 
+ drl->seq = seq; + drl->cnt = 0; + } else { + if (unlikely(++drl->cnt > SCX_REENQ_LOCAL_MAX_REPEAT)) { + scx_error(sch, "SCX_ENQ_REENQ on SCX_DSQ_LOCAL repeated %u times", + drl->cnt); + skip = true; + } + + __scx_add_event(sch, SCX_EV_REENQ_LOCAL_REPEAT, 1); + } + } + + if (!skip) { + /* see schedule_dsq_reenq() */ + smp_mb(); + + reenq_local(sch, rq, reenq_flags); + } + } +} + +static bool user_task_should_reenq(struct task_struct *p, u64 reenq_flags, u32 *reason) +{ + *reason = SCX_TASK_REENQ_KFUNC; + return reenq_flags & SCX_REENQ_ANY; +} + +static void reenq_user(struct rq *rq, struct scx_dispatch_q *dsq, u64 reenq_flags) +{ + struct rq *locked_rq = rq; + struct scx_sched *sch = dsq->sched; + struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, dsq, 0); + struct task_struct *p; + s32 nr_enqueued = 0; + + lockdep_assert_rq_held(rq); + + raw_spin_lock(&dsq->lock); + + while (likely(!READ_ONCE(sch->bypass_depth))) { + struct rq *task_rq; + u32 reason; + + p = nldsq_cursor_next_task(&cursor, dsq); + if (!p) + break; + + if (!user_task_should_reenq(p, reenq_flags, &reason)) + continue; + + task_rq = task_rq(p); + + if (locked_rq != task_rq) { + if (locked_rq) + raw_spin_rq_unlock(locked_rq); + if (unlikely(!raw_spin_rq_trylock(task_rq))) { + raw_spin_unlock(&dsq->lock); + raw_spin_rq_lock(task_rq); + raw_spin_lock(&dsq->lock); + } + locked_rq = task_rq; + + /* did we lose @p while switching locks? 
*/ + if (nldsq_cursor_lost_task(&cursor, task_rq, dsq, p)) + continue; + } + + /* @p is on @dsq, its rq and @dsq are locked */ + dispatch_dequeue_locked(p, dsq); + raw_spin_unlock(&dsq->lock); + + if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK)) + p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; + p->scx.flags |= reason; + + do_enqueue_task(task_rq, p, SCX_ENQ_REENQ, -1); + + p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK; + + if (!(++nr_enqueued % SCX_TASK_ITER_BATCH)) { + raw_spin_rq_unlock(locked_rq); + locked_rq = NULL; + cpu_relax(); + } + + raw_spin_lock(&dsq->lock); + } + + list_del_init(&cursor.node); + raw_spin_unlock(&dsq->lock); + + if (locked_rq != rq) { + if (locked_rq) + raw_spin_rq_unlock(locked_rq); + raw_spin_rq_lock(rq); + } +} + +static void process_deferred_reenq_users(struct rq *rq) +{ + lockdep_assert_rq_held(rq); + + while (true) { + struct scx_dispatch_q *dsq; + u64 reenq_flags; + + scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) { + struct scx_deferred_reenq_user *dru = + list_first_entry_or_null(&rq->scx.deferred_reenq_users, + struct scx_deferred_reenq_user, + node); + struct scx_dsq_pcpu *dsq_pcpu; + + if (!dru) + return; + + dsq_pcpu = container_of(dru, struct scx_dsq_pcpu, + deferred_reenq_user); + dsq = dsq_pcpu->dsq; + reenq_flags = dru->flags; + WRITE_ONCE(dru->flags, 0); + list_del_init(&dru->node); + } + + /* see schedule_dsq_reenq() */ + smp_mb(); + + BUG_ON(dsq->id & SCX_DSQ_FLAG_BUILTIN); + reenq_user(rq, dsq, reenq_flags); + } +} + +static void run_deferred(struct rq *rq) +{ + process_ddsp_deferred_locals(rq); + + if (!list_empty(&rq->scx.deferred_reenq_locals)) + process_deferred_reenq_locals(rq); + + if (!list_empty(&rq->scx.deferred_reenq_users)) + process_deferred_reenq_users(rq); +} + #ifdef CONFIG_NO_HZ_FULL bool scx_can_stop_tick(struct rq *rq) { struct task_struct *p = rq->curr; - - if (scx_rq_bypassing(rq)) - return false; + struct scx_sched *sch = scx_task_sched(p); if (p->sched_class != 
&ext_sched_class) return true; + if (scx_bypassing(sch, cpu_of(rq))) + return false; + /* * @rq can dispatch from different DSQs, so we can't tell whether it * needs the tick or not by looking at nr_running. Allow stopping ticks @@ -3241,7 +4211,7 @@ int scx_tg_online(struct task_group *tg) .bw_quota_us = tg->scx.bw_quota_us, .bw_burst_us = tg->scx.bw_burst_us }; - ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, + ret = SCX_CALL_OP_RET(sch, cgroup_init, NULL, tg->css.cgroup, &args); if (ret) ret = ops_sanitize_err(sch, "cgroup_init", ret); @@ -3263,8 +4233,7 @@ void scx_tg_offline(struct task_group *tg) if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) && (tg->scx.flags & SCX_TG_INITED)) - SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, - tg->css.cgroup); + SCX_CALL_OP(sch, cgroup_exit, NULL, tg->css.cgroup); tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); } @@ -3293,8 +4262,7 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset) continue; if (SCX_HAS_OP(sch, cgroup_prep_move)) { - ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, - cgroup_prep_move, NULL, + ret = SCX_CALL_OP_RET(sch, cgroup_prep_move, NULL, p, from, css->cgroup); if (ret) goto err; @@ -3309,7 +4277,7 @@ err: cgroup_taskset_for_each(p, css, tset) { if (SCX_HAS_OP(sch, cgroup_cancel_move) && p->scx.cgrp_moving_from) - SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL, + SCX_CALL_OP(sch, cgroup_cancel_move, NULL, p, p->scx.cgrp_moving_from, css->cgroup); p->scx.cgrp_moving_from = NULL; } @@ -3330,7 +4298,7 @@ void scx_cgroup_move_task(struct task_struct *p) */ if (SCX_HAS_OP(sch, cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from)) - SCX_CALL_OP_TASK(sch, SCX_KF_UNLOCKED, cgroup_move, NULL, + SCX_CALL_OP_TASK(sch, cgroup_move, task_rq(p), p, p->scx.cgrp_moving_from, tg_cgrp(task_group(p))); p->scx.cgrp_moving_from = NULL; @@ -3348,7 +4316,7 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) cgroup_taskset_for_each(p, css, tset) { if (SCX_HAS_OP(sch, 
cgroup_cancel_move) && p->scx.cgrp_moving_from) - SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL, + SCX_CALL_OP(sch, cgroup_cancel_move, NULL, p, p->scx.cgrp_moving_from, css->cgroup); p->scx.cgrp_moving_from = NULL; } @@ -3362,8 +4330,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) && tg->scx.weight != weight) - SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL, - tg_cgrp(tg), weight); + SCX_CALL_OP(sch, cgroup_set_weight, NULL, tg_cgrp(tg), weight); tg->scx.weight = weight; @@ -3377,8 +4344,7 @@ void scx_group_set_idle(struct task_group *tg, bool idle) percpu_down_read(&scx_cgroup_ops_rwsem); if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle)) - SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_idle, NULL, - tg_cgrp(tg), idle); + SCX_CALL_OP(sch, cgroup_set_idle, NULL, tg_cgrp(tg), idle); /* Update the task group's idle state */ tg->scx.idle = idle; @@ -3397,7 +4363,7 @@ void scx_group_set_bandwidth(struct task_group *tg, (tg->scx.bw_period_us != period_us || tg->scx.bw_quota_us != quota_us || tg->scx.bw_burst_us != burst_us)) - SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_bandwidth, NULL, + SCX_CALL_OP(sch, cgroup_set_bandwidth, NULL, tg_cgrp(tg), period_us, quota_us, burst_us); tg->scx.bw_period_us = period_us; @@ -3406,33 +4372,55 @@ void scx_group_set_bandwidth(struct task_group *tg, percpu_up_read(&scx_cgroup_ops_rwsem); } +#endif /* CONFIG_EXT_GROUP_SCHED */ + +#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED) +static struct cgroup *root_cgroup(void) +{ + return &cgrp_dfl_root.cgrp; +} + +static struct cgroup *sch_cgroup(struct scx_sched *sch) +{ + return sch->cgrp; +} + +/* for each descendant of @cgrp including self, set ->scx_sched to @sch */ +static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) +{ + struct cgroup *pos; + struct cgroup_subsys_state *css; + + cgroup_for_each_live_descendant_pre(pos, css, 
cgrp) + rcu_assign_pointer(pos->scx_sched, sch); +} static void scx_cgroup_lock(void) { +#ifdef CONFIG_EXT_GROUP_SCHED percpu_down_write(&scx_cgroup_ops_rwsem); +#endif cgroup_lock(); } static void scx_cgroup_unlock(void) { cgroup_unlock(); +#ifdef CONFIG_EXT_GROUP_SCHED percpu_up_write(&scx_cgroup_ops_rwsem); +#endif } - -#else /* CONFIG_EXT_GROUP_SCHED */ - +#else /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ +static struct cgroup *root_cgroup(void) { return NULL; } +static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; } +static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {} static void scx_cgroup_lock(void) {} static void scx_cgroup_unlock(void) {} - -#endif /* CONFIG_EXT_GROUP_SCHED */ +#endif /* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */ /* * Omitted operations: * - * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task - * isn't tied to the CPU at that point. Preemption is implemented by resetting - * the victim task's slice to 0 and triggering reschedule on the target CPU. - * * - migrate_task_rq: Unnecessary as task to cpu mapping is transient. 
* * - task_fork/dead: We need fork/dead notifications for all tasks regardless of @@ -3473,13 +4461,60 @@ DEFINE_SCHED_CLASS(ext) = { #endif }; -static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) +static s32 init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id, + struct scx_sched *sch) { + s32 cpu; + memset(dsq, 0, sizeof(*dsq)); raw_spin_lock_init(&dsq->lock); INIT_LIST_HEAD(&dsq->list); dsq->id = dsq_id; + dsq->sched = sch; + + dsq->pcpu = alloc_percpu(struct scx_dsq_pcpu); + if (!dsq->pcpu) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu); + + pcpu->dsq = dsq; + INIT_LIST_HEAD(&pcpu->deferred_reenq_user.node); + } + + return 0; +} + +static void exit_dsq(struct scx_dispatch_q *dsq) +{ + s32 cpu; + + for_each_possible_cpu(cpu) { + struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu); + struct scx_deferred_reenq_user *dru = &pcpu->deferred_reenq_user; + struct rq *rq = cpu_rq(cpu); + + /* + * There must have been a RCU grace period since the last + * insertion and @dsq should be off the deferred list by now. 
+ */ + if (WARN_ON_ONCE(!list_empty(&dru->node))) { + guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock); + list_del_init(&dru->node); + } + } + + free_percpu(dsq->pcpu); +} + +static void free_dsq_rcufn(struct rcu_head *rcu) +{ + struct scx_dispatch_q *dsq = container_of(rcu, struct scx_dispatch_q, rcu); + + exit_dsq(dsq); + kfree(dsq); } static void free_dsq_irq_workfn(struct irq_work *irq_work) @@ -3488,7 +4523,7 @@ static void free_dsq_irq_workfn(struct irq_work *irq_work) struct scx_dispatch_q *dsq, *tmp_dsq; llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node) - kfree_rcu(dsq, rcu); + call_rcu(&dsq->rcu, free_dsq_rcufn); } static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); @@ -3553,8 +4588,7 @@ static void scx_cgroup_exit(struct scx_sched *sch) if (!sch->ops.cgroup_exit) continue; - SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, - css->cgroup); + SCX_CALL_OP(sch, cgroup_exit, NULL, css->cgroup); } } @@ -3585,7 +4619,7 @@ static int scx_cgroup_init(struct scx_sched *sch) continue; } - ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL, + ret = SCX_CALL_OP_RET(sch, cgroup_init, NULL, css->cgroup, &args); if (ret) { scx_error(sch, "ops.cgroup_init() failed (%d)", ret); @@ -3664,6 +4698,7 @@ static const struct attribute_group scx_global_attr_group = { .attrs = scx_global_attrs, }; +static void free_pnode(struct scx_sched_pnode *pnode); static void free_exit_info(struct scx_exit_info *ei); static void scx_sched_free_rcu_work(struct work_struct *work) @@ -3672,22 +4707,42 @@ static void scx_sched_free_rcu_work(struct work_struct *work) struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work); struct rhashtable_iter rht_iter; struct scx_dispatch_q *dsq; - int node; + int cpu, node; - irq_work_sync(&sch->error_irq_work); + irq_work_sync(&sch->disable_irq_work); kthread_destroy_worker(sch->helper); + timer_shutdown_sync(&sch->bypass_lb_timer); + +#ifdef CONFIG_EXT_SUB_SCHED + kfree(sch->cgrp_path); + if 
(sch_cgroup(sch)) + cgroup_put(sch_cgroup(sch)); +#endif /* CONFIG_EXT_SUB_SCHED */ + + for_each_possible_cpu(cpu) { + struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu); + + /* + * $sch would have entered bypass mode before the RCU grace + * period. As that blocks new deferrals, all + * deferred_reenq_local_node's must be off-list by now. + */ + WARN_ON_ONCE(!list_empty(&pcpu->deferred_reenq_local.node)); + + exit_dsq(bypass_dsq(sch, cpu)); + } free_percpu(sch->pcpu); for_each_node_state(node, N_POSSIBLE) - kfree(sch->global_dsqs[node]); - kfree(sch->global_dsqs); + free_pnode(sch->pnode[node]); + kfree(sch->pnode); rhashtable_walk_enter(&sch->dsq_hash, &rht_iter); do { rhashtable_walk_start(&rht_iter); - while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq)) + while (!IS_ERR_OR_NULL((dsq = rhashtable_walk_next(&rht_iter)))) destroy_dsq(sch, dsq->id); rhashtable_walk_stop(&rht_iter); @@ -3704,7 +4759,7 @@ static void scx_kobj_release(struct kobject *kobj) struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work); - queue_rcu_work(system_unbound_wq, &sch->rcu_work); + queue_rcu_work(system_dfl_wq, &sch->rcu_work); } static ssize_t scx_attr_ops_show(struct kobject *kobj, @@ -3733,10 +4788,14 @@ static ssize_t scx_attr_events_show(struct kobject *kobj, at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST); at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING); at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + at += scx_attr_event_show(buf, at, &events, SCX_EV_REENQ_IMMED); + at += scx_attr_event_show(buf, at, &events, SCX_EV_REENQ_LOCAL_REPEAT); at += scx_attr_event_show(buf, at, &events, SCX_EV_REFILL_SLICE_DFL); at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION); at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH); at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE); + at 
+= scx_attr_event_show(buf, at, &events, SCX_EV_INSERT_NOT_OWNED); + at += scx_attr_event_show(buf, at, &events, SCX_EV_SUB_BYPASS_DISPATCH); return at; } SCX_ATTR(events); @@ -3756,7 +4815,17 @@ static const struct kobj_type scx_ktype = { static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) { - const struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); + const struct scx_sched *sch; + + /* + * scx_uevent() can be reached by both scx_sched kobjects (scx_ktype) + * and sub-scheduler kset kobjects (kset_ktype) through the parent + * chain walk. Filter out the latter to avoid invalid casts. + */ + if (kobj->ktype != &scx_ktype) + return 0; + + sch = container_of(kobj, struct scx_sched, kobj); return add_uevent_var(env, "SCXOPS=%s", sch->ops.name); } @@ -3785,7 +4854,7 @@ bool scx_allow_ttwu_queue(const struct task_struct *p) if (!scx_enabled()) return true; - sch = rcu_dereference_sched(scx_root); + sch = scx_task_sched(p); if (unlikely(!sch)) return true; @@ -3878,7 +4947,7 @@ void scx_softlockup(u32 dur_s) * a good state before taking more drastic actions. * * Returns %true if sched_ext is enabled and abort was initiated, which may - * resolve the reported hardlockdup. %false if sched_ext is not enabled or + * resolve the reported hardlockup. %false if sched_ext is not enabled or * someone else already initiated abort. 
*/ bool scx_hardlockup(int cpu) @@ -3891,13 +4960,14 @@ bool scx_hardlockup(int cpu) return true; } -static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq, +static u32 bypass_lb_cpu(struct scx_sched *sch, s32 donor, struct cpumask *donee_mask, struct cpumask *resched_mask, u32 nr_donor_target, u32 nr_donee_target) { - struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq; + struct rq *donor_rq = cpu_rq(donor); + struct scx_dispatch_q *donor_dsq = bypass_dsq(sch, donor); struct task_struct *p, *n; - struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, 0, 0); + struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, donor_dsq, 0); s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target; u32 nr_balanced = 0, min_delta_us; @@ -3911,7 +4981,7 @@ static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq, if (delta < DIV_ROUND_UP(min_delta_us, READ_ONCE(scx_slice_bypass_us))) return 0; - raw_spin_rq_lock_irq(rq); + raw_spin_rq_lock_irq(donor_rq); raw_spin_lock(&donor_dsq->lock); list_add(&cursor.node, &donor_dsq->list); resume: @@ -3919,7 +4989,6 @@ resume: n = nldsq_next_task(donor_dsq, n, false); while ((p = n)) { - struct rq *donee_rq; struct scx_dispatch_q *donee_dsq; int donee; @@ -3935,14 +5004,13 @@ resume: if (donee >= nr_cpu_ids) continue; - donee_rq = cpu_rq(donee); - donee_dsq = &donee_rq->scx.bypass_dsq; + donee_dsq = bypass_dsq(sch, donee); /* * $p's rq is not locked but $p's DSQ lock protects its * scheduling properties making this test safe. */ - if (!task_can_run_on_remote_rq(sch, p, donee_rq, false)) + if (!task_can_run_on_remote_rq(sch, p, cpu_rq(donee), false)) continue; /* @@ -3957,7 +5025,7 @@ resume: * between bypass DSQs. */ dispatch_dequeue_locked(p, donor_dsq); - dispatch_enqueue(sch, donee_dsq, p, SCX_ENQ_NESTED); + dispatch_enqueue(sch, cpu_rq(donee), donee_dsq, p, SCX_ENQ_NESTED); /* * $donee might have been idle and need to be woken up. 
No need @@ -3972,9 +5040,9 @@ resume: if (!(nr_balanced % SCX_BYPASS_LB_BATCH) && n) { list_move_tail(&cursor.node, &n->scx.dsq_list.node); raw_spin_unlock(&donor_dsq->lock); - raw_spin_rq_unlock_irq(rq); + raw_spin_rq_unlock_irq(donor_rq); cpu_relax(); - raw_spin_rq_lock_irq(rq); + raw_spin_rq_lock_irq(donor_rq); raw_spin_lock(&donor_dsq->lock); goto resume; } @@ -3982,7 +5050,7 @@ resume: list_del_init(&cursor.node); raw_spin_unlock(&donor_dsq->lock); - raw_spin_rq_unlock_irq(rq); + raw_spin_rq_unlock_irq(donor_rq); return nr_balanced; } @@ -4000,7 +5068,7 @@ static void bypass_lb_node(struct scx_sched *sch, int node) /* count the target tasks and CPUs */ for_each_cpu_and(cpu, cpu_online_mask, node_mask) { - u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr); + u32 nr = READ_ONCE(bypass_dsq(sch, cpu)->nr); nr_tasks += nr; nr_cpus++; @@ -4022,24 +5090,21 @@ static void bypass_lb_node(struct scx_sched *sch, int node) cpumask_clear(donee_mask); for_each_cpu_and(cpu, cpu_online_mask, node_mask) { - if (READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr) < nr_target) + if (READ_ONCE(bypass_dsq(sch, cpu)->nr) < nr_target) cpumask_set_cpu(cpu, donee_mask); } /* iterate !donee CPUs and see if they should be offloaded */ cpumask_clear(resched_mask); for_each_cpu_and(cpu, cpu_online_mask, node_mask) { - struct rq *rq = cpu_rq(cpu); - struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq; - if (cpumask_empty(donee_mask)) break; if (cpumask_test_cpu(cpu, donee_mask)) continue; - if (READ_ONCE(donor_dsq->nr) <= nr_donor_target) + if (READ_ONCE(bypass_dsq(sch, cpu)->nr) <= nr_donor_target) continue; - nr_balanced += bypass_lb_cpu(sch, rq, donee_mask, resched_mask, + nr_balanced += bypass_lb_cpu(sch, cpu, donee_mask, resched_mask, nr_donor_target, nr_target); } @@ -4047,7 +5112,7 @@ static void bypass_lb_node(struct scx_sched *sch, int node) resched_cpu(cpu); for_each_cpu_and(cpu, cpu_online_mask, node_mask) { - u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr); + u32 nr = 
READ_ONCE(bypass_dsq(sch, cpu)->nr); after_min = min(nr, after_min); after_max = max(nr, after_max); @@ -4069,12 +5134,11 @@ static void bypass_lb_node(struct scx_sched *sch, int node) */ static void scx_bypass_lb_timerfn(struct timer_list *timer) { - struct scx_sched *sch; + struct scx_sched *sch = container_of(timer, struct scx_sched, bypass_lb_timer); int node; u32 intv_us; - sch = rcu_dereference_all(scx_root); - if (unlikely(!sch) || !READ_ONCE(scx_bypass_depth)) + if (!bypass_dsp_enabled(sch)) return; for_each_node_with_cpus(node) @@ -4085,10 +5149,102 @@ static void scx_bypass_lb_timerfn(struct timer_list *timer) mod_timer(timer, jiffies + usecs_to_jiffies(intv_us)); } -static DEFINE_TIMER(scx_bypass_lb_timer, scx_bypass_lb_timerfn); +static bool inc_bypass_depth(struct scx_sched *sch) +{ + lockdep_assert_held(&scx_bypass_lock); + + WARN_ON_ONCE(sch->bypass_depth < 0); + WRITE_ONCE(sch->bypass_depth, sch->bypass_depth + 1); + if (sch->bypass_depth != 1) + return false; + + WRITE_ONCE(sch->slice_dfl, READ_ONCE(scx_slice_bypass_us) * NSEC_PER_USEC); + sch->bypass_timestamp = ktime_get_ns(); + scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1); + return true; +} + +static bool dec_bypass_depth(struct scx_sched *sch) +{ + lockdep_assert_held(&scx_bypass_lock); + + WARN_ON_ONCE(sch->bypass_depth < 1); + WRITE_ONCE(sch->bypass_depth, sch->bypass_depth - 1); + if (sch->bypass_depth != 0) + return false; + + WRITE_ONCE(sch->slice_dfl, SCX_SLICE_DFL); + scx_add_event(sch, SCX_EV_BYPASS_DURATION, + ktime_get_ns() - sch->bypass_timestamp); + return true; +} + +static void enable_bypass_dsp(struct scx_sched *sch) +{ + struct scx_sched *host = scx_parent(sch) ?: sch; + u32 intv_us = READ_ONCE(scx_bypass_lb_intv_us); + s32 ret; + + /* + * @sch->bypass_depth transitioning from 0 to 1 triggers enabling. + * Shouldn't stagger. 
+ */ + if (WARN_ON_ONCE(test_and_set_bit(0, &sch->bypass_dsp_claim))) + return; + + /* + * When a sub-sched bypasses, its tasks are queued on the bypass DSQs of + * the nearest non-bypassing ancestor or root. As enable_bypass_dsp() is + * called iff @sch is not already bypassed due to an ancestor bypassing, + * we can assume that the parent is not bypassing and thus will be the + * host of the bypass DSQs. + * + * While the situation may change in the future, the following + * guarantees that the nearest non-bypassing ancestor or root has bypass + * dispatch enabled while a descendant is bypassing, which is all that's + * required. + * + * bypass_dsp_enabled() test is used to determine whether to enter the + * bypass dispatch handling path from both bypassing and hosting scheds. + * Bump enable depth on both @sch and bypass dispatch host. + */ + ret = atomic_inc_return(&sch->bypass_dsp_enable_depth); + WARN_ON_ONCE(ret <= 0); + + if (host != sch) { + ret = atomic_inc_return(&host->bypass_dsp_enable_depth); + WARN_ON_ONCE(ret <= 0); + } + + /* + * The LB timer will stop running if bypass dispatch is disabled. Start + * after enabling bypass dispatch. 
+ */ + if (intv_us && !timer_pending(&host->bypass_lb_timer)) + mod_timer(&host->bypass_lb_timer, + jiffies + usecs_to_jiffies(intv_us)); +} + +/* may be called without holding scx_bypass_lock */ +static void disable_bypass_dsp(struct scx_sched *sch) +{ + s32 ret; + + if (!test_and_clear_bit(0, &sch->bypass_dsp_claim)) + return; + + ret = atomic_dec_return(&sch->bypass_dsp_enable_depth); + WARN_ON_ONCE(ret < 0); + + if (scx_parent(sch)) { + ret = atomic_dec_return(&scx_parent(sch)->bypass_dsp_enable_depth); + WARN_ON_ONCE(ret < 0); + } +} /** * scx_bypass - [Un]bypass scx_ops and guarantee forward progress + * @sch: sched to bypass * @bypass: true for bypass, false for unbypass * * Bypassing guarantees that all runnable tasks make forward progress without @@ -4118,49 +5274,42 @@ static DEFINE_TIMER(scx_bypass_lb_timer, scx_bypass_lb_timerfn); * * - scx_prio_less() reverts to the default core_sched_at order. */ -static void scx_bypass(bool bypass) +static void scx_bypass(struct scx_sched *sch, bool bypass) { - static DEFINE_RAW_SPINLOCK(bypass_lock); - static unsigned long bypass_timestamp; - struct scx_sched *sch; + struct scx_sched *pos; unsigned long flags; int cpu; - raw_spin_lock_irqsave(&bypass_lock, flags); - sch = rcu_dereference_bh(scx_root); + raw_spin_lock_irqsave(&scx_bypass_lock, flags); if (bypass) { - u32 intv_us; - - WRITE_ONCE(scx_bypass_depth, scx_bypass_depth + 1); - WARN_ON_ONCE(scx_bypass_depth <= 0); - if (scx_bypass_depth != 1) + if (!inc_bypass_depth(sch)) goto unlock; - WRITE_ONCE(scx_slice_dfl, READ_ONCE(scx_slice_bypass_us) * NSEC_PER_USEC); - bypass_timestamp = ktime_get_ns(); - if (sch) - scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1); - - intv_us = READ_ONCE(scx_bypass_lb_intv_us); - if (intv_us && !timer_pending(&scx_bypass_lb_timer)) { - scx_bypass_lb_timer.expires = - jiffies + usecs_to_jiffies(intv_us); - add_timer_global(&scx_bypass_lb_timer); - } + + enable_bypass_dsp(sch); } else { - WRITE_ONCE(scx_bypass_depth, scx_bypass_depth - 
1); - WARN_ON_ONCE(scx_bypass_depth < 0); - if (scx_bypass_depth != 0) + if (!dec_bypass_depth(sch)) goto unlock; - WRITE_ONCE(scx_slice_dfl, SCX_SLICE_DFL); - if (sch) - scx_add_event(sch, SCX_EV_BYPASS_DURATION, - ktime_get_ns() - bypass_timestamp); } /* + * Bypass state is propagated to all descendants - an scx_sched bypasses + * if itself or any of its ancestors are in bypass mode. + */ + raw_spin_lock(&scx_sched_lock); + scx_for_each_descendant_pre(pos, sch) { + if (pos == sch) + continue; + if (bypass) + inc_bypass_depth(pos); + else + dec_bypass_depth(pos); + } + raw_spin_unlock(&scx_sched_lock); + + /* * No task property is changing. We just need to make sure all currently - * queued tasks are re-queued according to the new scx_rq_bypassing() + * queued tasks are re-queued according to the new scx_bypassing() * state. As an optimization, walk each rq's runnable_list instead of * the scx_tasks list. * @@ -4172,19 +5321,23 @@ static void scx_bypass(bool bypass) struct task_struct *p, *n; raw_spin_rq_lock(rq); + raw_spin_lock(&scx_sched_lock); - if (bypass) { - WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING); - rq->scx.flags |= SCX_RQ_BYPASSING; - } else { - WARN_ON_ONCE(!(rq->scx.flags & SCX_RQ_BYPASSING)); - rq->scx.flags &= ~SCX_RQ_BYPASSING; + scx_for_each_descendant_pre(pos, sch) { + struct scx_sched_pcpu *pcpu = per_cpu_ptr(pos->pcpu, cpu); + + if (pos->bypass_depth) + pcpu->flags |= SCX_SCHED_PCPU_BYPASSING; + else + pcpu->flags &= ~SCX_SCHED_PCPU_BYPASSING; } + raw_spin_unlock(&scx_sched_lock); + /* * We need to guarantee that no tasks are on the BPF scheduler * while bypassing. Either we see enabled or the enable path - * sees scx_rq_bypassing() before moving tasks to SCX. + * sees scx_bypassing() before moving tasks to SCX. 
*/ if (!scx_enabled()) { raw_spin_rq_unlock(rq); @@ -4200,6 +5353,9 @@ static void scx_bypass(bool bypass) */ list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, scx.runnable_node) { + if (!scx_is_descendant(scx_task_sched(p), sch)) + continue; + /* cycling deq/enq is enough, see the function comment */ scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { /* nothing */ ; @@ -4213,8 +5369,11 @@ static void scx_bypass(bool bypass) raw_spin_rq_unlock(rq); } + /* disarming must come after moving all tasks out of the bypass DSQs */ + if (!bypass) + disable_bypass_dsp(sch); unlock: - raw_spin_unlock_irqrestore(&bypass_lock, flags); + raw_spin_unlock_irqrestore(&scx_bypass_lock, flags); } static void free_exit_info(struct scx_exit_info *ei) @@ -4256,6 +5415,8 @@ static const char *scx_exit_reason(enum scx_exit_kind kind) return "unregistered from the main kernel"; case SCX_EXIT_SYSRQ: return "disabled by sysrq-S"; + case SCX_EXIT_PARENT: + return "parent exiting"; case SCX_EXIT_ERROR: return "runtime error"; case SCX_EXIT_ERROR_BPF: @@ -4281,28 +5442,279 @@ static void free_kick_syncs(void) } } -static void scx_disable_workfn(struct kthread_work *work) +static void refresh_watchdog(void) { - struct scx_sched *sch = container_of(work, struct scx_sched, disable_work); - struct scx_exit_info *ei = sch->exit_info; + struct scx_sched *sch; + unsigned long intv = ULONG_MAX; + + /* take the shortest timeout and use its half for watchdog interval */ + rcu_read_lock(); + list_for_each_entry_rcu(sch, &scx_sched_all, all) + intv = max(min(intv, sch->watchdog_timeout / 2), 1); + rcu_read_unlock(); + + WRITE_ONCE(scx_watchdog_timestamp, jiffies); + WRITE_ONCE(scx_watchdog_interval, intv); + + if (intv < ULONG_MAX) + mod_delayed_work(system_dfl_wq, &scx_watchdog_work, intv); + else + cancel_delayed_work_sync(&scx_watchdog_work); +} + +static s32 scx_link_sched(struct scx_sched *sch) +{ + scoped_guard(raw_spinlock_irq, &scx_sched_lock) { +#ifdef CONFIG_EXT_SUB_SCHED 
+ struct scx_sched *parent = scx_parent(sch); + s32 ret; + + if (parent) { + /* + * scx_claim_exit() propagates exit_kind transition to + * its sub-scheds while holding scx_sched_lock - either + * we can see the parent's non-NONE exit_kind or the + * parent can shoot us down. + */ + if (atomic_read(&parent->exit_kind) != SCX_EXIT_NONE) { + scx_error(sch, "parent disabled"); + return -ENOENT; + } + + ret = rhashtable_lookup_insert_fast(&scx_sched_hash, + &sch->hash_node, scx_sched_hash_params); + if (ret) { + scx_error(sch, "failed to insert into scx_sched_hash (%d)", ret); + return ret; + } + + list_add_tail(&sch->sibling, &parent->children); + } +#endif /* CONFIG_EXT_SUB_SCHED */ + + list_add_tail_rcu(&sch->all, &scx_sched_all); + } + + refresh_watchdog(); + return 0; +} + +static void scx_unlink_sched(struct scx_sched *sch) +{ + scoped_guard(raw_spinlock_irq, &scx_sched_lock) { +#ifdef CONFIG_EXT_SUB_SCHED + if (scx_parent(sch)) { + rhashtable_remove_fast(&scx_sched_hash, &sch->hash_node, + scx_sched_hash_params); + list_del_init(&sch->sibling); + } +#endif /* CONFIG_EXT_SUB_SCHED */ + list_del_rcu(&sch->all); + } + + refresh_watchdog(); +} + +/* + * Called to disable future dumps and wait for in-progress one while disabling + * @sch. Once @sch becomes empty during disable, there's no point in dumping it. + * This prevents calling dump ops on a dead sch. + */ +static void scx_disable_dump(struct scx_sched *sch) +{ + guard(raw_spinlock_irqsave)(&scx_dump_lock); + sch->dump_disabled = true; +} + +#ifdef CONFIG_EXT_SUB_SCHED +static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq); + +static void drain_descendants(struct scx_sched *sch) +{ + /* + * Child scheds that finished the critical part of disabling will take + * themselves off @sch->children. Wait for it to drain. As propagation + * is recursive, empty @sch->children means that all proper descendant + * scheds reached unlinking stage. 
+ */ + wait_event(scx_unlink_waitq, list_empty(&sch->children)); +} + +static void scx_fail_parent(struct scx_sched *sch, + struct task_struct *failed, s32 fail_code) +{ + struct scx_sched *parent = scx_parent(sch); struct scx_task_iter sti; struct task_struct *p; - int kind, cpu; - kind = atomic_read(&sch->exit_kind); - while (true) { - if (kind == SCX_EXIT_DONE) /* already disabled? */ - return; - WARN_ON_ONCE(kind == SCX_EXIT_NONE); - if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE)) + scx_error(parent, "ops.init_task() failed (%d) for %s[%d] while disabling a sub-scheduler", + fail_code, failed->comm, failed->pid); + + /* + * Once $parent is bypassed, it's safe to put SCX_TASK_NONE tasks into + * it. This may cause downstream failures on the BPF side but $parent is + * dying anyway. + */ + scx_bypass(parent, true); + + scx_task_iter_start(&sti, sch->cgrp); + while ((p = scx_task_iter_next_locked(&sti))) { + if (scx_task_on_sched(parent, p)) + continue; + + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { + scx_disable_and_exit_task(sch, p); + rcu_assign_pointer(p->scx.sched, parent); + } + } + scx_task_iter_stop(&sti); +} + +static void scx_sub_disable(struct scx_sched *sch) +{ + struct scx_sched *parent = scx_parent(sch); + struct scx_task_iter sti; + struct task_struct *p; + int ret; + + /* + * Guarantee forward progress and wait for descendants to be disabled. + * To limit disruptions, $parent is not bypassed. Tasks are fully + * prepped and then inserted back into $parent. + */ + scx_bypass(sch, true); + drain_descendants(sch); + + /* + * Here, every runnable task is guaranteed to make forward progress and + * we can safely use blocking synchronization constructs. Actually + * disable ops. 
+ */ + mutex_lock(&scx_enable_mutex); + percpu_down_write(&scx_fork_rwsem); + scx_cgroup_lock(); + + set_cgroup_sched(sch_cgroup(sch), parent); + + scx_task_iter_start(&sti, sch->cgrp); + while ((p = scx_task_iter_next_locked(&sti))) { + struct rq *rq; + struct rq_flags rf; + + /* filter out duplicate visits */ + if (scx_task_on_sched(parent, p)) + continue; + + /* + * By the time control reaches here, all descendant schedulers + * should already have been disabled. + */ + WARN_ON_ONCE(!scx_task_on_sched(sch, p)); + + /* + * If $p is about to be freed, nothing prevents $sch from + * unloading before $p reaches sched_ext_free(). Disable and + * exit $p right away. + */ + if (!tryget_task_struct(p)) { + scx_disable_and_exit_task(sch, p); + continue; + } + + scx_task_iter_unlock(&sti); + + /* + * $p is READY or ENABLED on @sch. Initialize for $parent, + * disable and exit from @sch, and then switch over to $parent. + * + * If a task fails to initialize for $parent, the only available + * action is disabling $parent too. While this allows disabling + * of a child sched to cause the parent scheduler to fail, the + * failure can only originate from ops.init_task() of the + * parent. A child can't directly affect the parent through its + * own failures. + */ + ret = __scx_init_task(parent, p, false); + if (ret) { + scx_fail_parent(sch, p, ret); + put_task_struct(p); break; + } + + rq = task_rq_lock(p, &rf); + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { + /* + * $p is initialized for $parent and still attached to + * @sch. Disable and exit for @sch, switch over to + * $parent, override the state to READY to account for + * $p having already been initialized, and then enable. 
+ */ + scx_disable_and_exit_task(sch, p); + scx_set_task_state(p, SCX_TASK_INIT); + rcu_assign_pointer(p->scx.sched, parent); + scx_set_task_state(p, SCX_TASK_READY); + scx_enable_task(parent, p); + } + task_rq_unlock(rq, p, &rf); + + put_task_struct(p); } - ei->kind = kind; - ei->reason = scx_exit_reason(ei->kind); + scx_task_iter_stop(&sti); + + scx_disable_dump(sch); + + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); + + /* + * All tasks are moved off of @sch but there may still be on-going + * operations (e.g. ops.select_cpu()). Drain them by flushing RCU. Use + * the expedited version as ancestors may be waiting in bypass mode. + * Also, tell the parent that there is no need to keep running bypass + * DSQs for us. + */ + synchronize_rcu_expedited(); + disable_bypass_dsp(sch); + + scx_unlink_sched(sch); + + mutex_unlock(&scx_enable_mutex); + + /* + * @sch is now unlinked from the parent's children list. Notify and call + * ops.sub_detach/exit(). Note that ops.sub_detach/exit() must be called + * after unlinking and releasing all locks. See scx_claim_exit(). 
+ */ + wake_up_all(&scx_unlink_waitq); + + if (parent->ops.sub_detach && sch->sub_attached) { + struct scx_sub_detach_args sub_detach_args = { + .ops = &sch->ops, + .cgroup_path = sch->cgrp_path, + }; + SCX_CALL_OP(parent, sub_detach, NULL, + &sub_detach_args); + } + + if (sch->ops.exit) + SCX_CALL_OP(sch, exit, NULL, sch->exit_info); + kobject_del(&sch->kobj); +} +#else /* CONFIG_EXT_SUB_SCHED */ +static void drain_descendants(struct scx_sched *sch) { } +static void scx_sub_disable(struct scx_sched *sch) { } +#endif /* CONFIG_EXT_SUB_SCHED */ + +static void scx_root_disable(struct scx_sched *sch) +{ + struct scx_exit_info *ei = sch->exit_info; + struct scx_task_iter sti; + struct task_struct *p; + int cpu; - /* guarantee forward progress by bypassing scx_ops */ - scx_bypass(true); - WRITE_ONCE(scx_aborting, false); + /* guarantee forward progress and wait for descendants to be disabled */ + scx_bypass(sch, true); + drain_descendants(sch); switch (scx_set_enable_state(SCX_DISABLING)) { case SCX_DISABLING: @@ -4329,7 +5741,7 @@ static void scx_disable_workfn(struct kthread_work *work) /* * Shut down cgroup support before tasks so that the cgroup attach path - * doesn't race against scx_exit_task(). + * doesn't race against scx_disable_and_exit_task(). 
*/ scx_cgroup_lock(); scx_cgroup_exit(sch); @@ -4343,7 +5755,7 @@ static void scx_disable_workfn(struct kthread_work *work) scx_init_task_enabled = false; - scx_task_iter_start(&sti); + scx_task_iter_start(&sti, NULL); while ((p = scx_task_iter_next_locked(&sti))) { unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; const struct sched_class *old_class = p->sched_class; @@ -4358,9 +5770,16 @@ static void scx_disable_workfn(struct kthread_work *work) p->sched_class = new_class; } - scx_exit_task(p); + scx_disable_and_exit_task(scx_task_sched(p), p); } scx_task_iter_stop(&sti); + + scx_disable_dump(sch); + + scx_cgroup_lock(); + set_cgroup_sched(sch_cgroup(sch), NULL); + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); /* @@ -4393,9 +5812,9 @@ static void scx_disable_workfn(struct kthread_work *work) } if (sch->ops.exit) - SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, ei); + SCX_CALL_OP(sch, exit, NULL, ei); - cancel_delayed_work_sync(&scx_watchdog_work); + scx_unlink_sched(sch); /* * scx_root clearing must be inside cpus_read_lock(). 
See @@ -4412,21 +5831,13 @@ static void scx_disable_workfn(struct kthread_work *work) */ kobject_del(&sch->kobj); - free_percpu(scx_dsp_ctx); - scx_dsp_ctx = NULL; - scx_dsp_max_batch = 0; free_kick_syncs(); - if (scx_bypassed_for_enable) { - scx_bypassed_for_enable = false; - scx_bypass(false); - } - mutex_unlock(&scx_enable_mutex); WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING); done: - scx_bypass(false); + scx_bypass(sch, false); } /* @@ -4442,6 +5853,9 @@ static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind) lockdep_assert_preemption_disabled(); + if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) + kind = SCX_EXIT_ERROR; + if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind)) return false; @@ -4450,25 +5864,61 @@ static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind) * flag to break potential live-lock scenarios, ensuring we can * successfully reach scx_bypass(). */ - WRITE_ONCE(scx_aborting, true); + WRITE_ONCE(sch->aborting, true); + + /* + * Propagate exits to descendants immediately. Each has a dedicated + * helper kthread and can run in parallel. While most of disabling is + * serialized, running them in separate threads allows parallelizing + * ops.exit(), which can take arbitrarily long prolonging bypass mode. + * + * To guarantee forward progress, this propagation must be in-line so + * that ->aborting is synchronously asserted for all sub-scheds. The + * propagation is also the interlocking point against sub-sched + * attachment. See scx_link_sched(). + * + * This doesn't cause recursions as propagation only takes place for + * non-propagation exits. 
+ */ + if (kind != SCX_EXIT_PARENT) { + scoped_guard (raw_spinlock_irqsave, &scx_sched_lock) { + struct scx_sched *pos; + scx_for_each_descendant_pre(pos, sch) + scx_disable(pos, SCX_EXIT_PARENT); + } + } + return true; } -static void scx_disable(enum scx_exit_kind kind) +static void scx_disable_workfn(struct kthread_work *work) { - struct scx_sched *sch; - - if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) - kind = SCX_EXIT_ERROR; + struct scx_sched *sch = container_of(work, struct scx_sched, disable_work); + struct scx_exit_info *ei = sch->exit_info; + int kind; - rcu_read_lock(); - sch = rcu_dereference(scx_root); - if (sch) { - guard(preempt)(); - scx_claim_exit(sch, kind); - kthread_queue_work(sch->helper, &sch->disable_work); + kind = atomic_read(&sch->exit_kind); + while (true) { + if (kind == SCX_EXIT_DONE) /* already disabled? */ + return; + WARN_ON_ONCE(kind == SCX_EXIT_NONE); + if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE)) + break; } - rcu_read_unlock(); + ei->kind = kind; + ei->reason = scx_exit_reason(ei->kind); + + if (scx_parent(sch)) + scx_sub_disable(sch); + else + scx_root_disable(sch); +} + +static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind) +{ + guard(preempt)(); + if (scx_claim_exit(sch, kind)) + irq_work_queue(&sch->disable_irq_work); } static void dump_newline(struct seq_buf *s) @@ -4486,14 +5936,14 @@ static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...) 
#ifdef CONFIG_TRACEPOINTS if (trace_sched_ext_dump_enabled()) { - /* protected by scx_dump_state()::dump_lock */ + /* protected by scx_dump_lock */ static char line_buf[SCX_EXIT_MSG_LEN]; va_start(args, fmt); vscnprintf(line_buf, sizeof(line_buf), fmt, args); va_end(args); - trace_sched_ext_dump(line_buf); + trace_call__sched_ext_dump(line_buf); } #endif /* @s may be zero sized and seq_buf triggers WARN if so */ @@ -4582,25 +6032,38 @@ static void ops_dump_exit(void) scx_dump_data.cpu = -1; } -static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, +static void scx_dump_task(struct scx_sched *sch, + struct seq_buf *s, struct scx_dump_ctx *dctx, struct task_struct *p, char marker) { static unsigned long bt[SCX_EXIT_BT_LEN]; - struct scx_sched *sch = scx_root; + struct scx_sched *task_sch = scx_task_sched(p); + const char *own_marker; + char sch_id_buf[32]; char dsq_id_buf[19] = "(n/a)"; unsigned long ops_state = atomic_long_read(&p->scx.ops_state); unsigned int bt_len = 0; + own_marker = task_sch == sch ? 
"*" : ""; + + if (task_sch->level == 0) + scnprintf(sch_id_buf, sizeof(sch_id_buf), "root"); + else + scnprintf(sch_id_buf, sizeof(sch_id_buf), "sub%d-%llu", + task_sch->level, task_sch->ops.sub_cgroup_id); + if (p->scx.dsq) scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx", (unsigned long long)p->scx.dsq->id); dump_newline(s); - dump_line(s, " %c%c %s[%d] %+ldms", + dump_line(s, " %c%c %s[%d] %s%s %+ldms", marker, task_state_to_char(p), p->comm, p->pid, + own_marker, sch_id_buf, jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies)); dump_line(s, " scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu", - scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK, + scx_get_task_state(p) >> SCX_TASK_STATE_SHIFT, + p->scx.flags & ~SCX_TASK_STATE_MASK, p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK, ops_state >> SCX_OPSS_QSEQ_SHIFT); dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s", @@ -4612,7 +6075,7 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, if (SCX_HAS_OP(sch, dump_task)) { ops_dump_init(s, " "); - SCX_CALL_OP(sch, SCX_KF_REST, dump_task, NULL, dctx, p); + SCX_CALL_OP(sch, dump_task, NULL, dctx, p); ops_dump_exit(); } @@ -4625,11 +6088,17 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, } } -static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) +/* + * Dump scheduler state. If @dump_all_tasks is true, dump all tasks regardless + * of which scheduler they belong to. If false, only dump tasks owned by @sch. + * For SysRq-D dumps, @dump_all_tasks=false since all schedulers are dumped + * separately. For error dumps, @dump_all_tasks=true since only the failing + * scheduler is dumped. 
+ */ +static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei, + size_t dump_len, bool dump_all_tasks) { - static DEFINE_SPINLOCK(dump_lock); static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n"; - struct scx_sched *sch = scx_root; struct scx_dump_ctx dctx = { .kind = ei->kind, .exit_code = ei->exit_code, @@ -4639,14 +6108,24 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) }; struct seq_buf s; struct scx_event_stats events; - unsigned long flags; char *buf; int cpu; - spin_lock_irqsave(&dump_lock, flags); + guard(raw_spinlock_irqsave)(&scx_dump_lock); + + if (sch->dump_disabled) + return; seq_buf_init(&s, ei->dump, dump_len); +#ifdef CONFIG_EXT_SUB_SCHED + if (sch->level == 0) + dump_line(&s, "%s: root", sch->ops.name); + else + dump_line(&s, "%s: sub%d-%llu %s", + sch->ops.name, sch->level, sch->ops.sub_cgroup_id, + sch->cgrp_path); +#endif if (ei->kind == SCX_EXIT_NONE) { dump_line(&s, "Debug dump triggered by %s", ei->reason); } else { @@ -4660,7 +6139,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) if (SCX_HAS_OP(sch, dump)) { ops_dump_init(&s, ""); - SCX_CALL_OP(sch, SCX_KF_UNLOCKED, dump, NULL, &dctx); + SCX_CALL_OP(sch, dump, NULL, &dctx); ops_dump_exit(); } @@ -4713,11 +6192,14 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) if (!cpumask_empty(rq->scx.cpus_to_wait)) dump_line(&ns, " cpus_to_wait : %*pb", cpumask_pr_args(rq->scx.cpus_to_wait)); + if (!cpumask_empty(rq->scx.cpus_to_sync)) + dump_line(&ns, " cpus_to_sync : %*pb", + cpumask_pr_args(rq->scx.cpus_to_sync)); used = seq_buf_used(&ns); if (SCX_HAS_OP(sch, dump_cpu)) { ops_dump_init(&ns, " "); - SCX_CALL_OP(sch, SCX_KF_REST, dump_cpu, NULL, + SCX_CALL_OP(sch, dump_cpu, NULL, &dctx, cpu, idle); ops_dump_exit(); } @@ -4739,11 +6221,13 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) seq_buf_set_overflow(&s); } - if (rq->curr->sched_class == &ext_sched_class) - 
scx_dump_task(&s, &dctx, rq->curr, '*'); + if (rq->curr->sched_class == &ext_sched_class && + (dump_all_tasks || scx_task_on_sched(sch, rq->curr))) + scx_dump_task(sch, &s, &dctx, rq->curr, '*'); list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) - scx_dump_task(&s, &dctx, p, ' '); + if (dump_all_tasks || scx_task_on_sched(sch, p)) + scx_dump_task(sch, &s, &dctx, p, ' '); next: rq_unlock_irqrestore(rq, &rf); } @@ -4758,25 +6242,27 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST); scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING); scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + scx_dump_event(s, &events, SCX_EV_REENQ_IMMED); + scx_dump_event(s, &events, SCX_EV_REENQ_LOCAL_REPEAT); scx_dump_event(s, &events, SCX_EV_REFILL_SLICE_DFL); scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION); scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH); scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE); + scx_dump_event(s, &events, SCX_EV_INSERT_NOT_OWNED); + scx_dump_event(s, &events, SCX_EV_SUB_BYPASS_DISPATCH); if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker)) memcpy(ei->dump + dump_len - sizeof(trunc_marker), trunc_marker, sizeof(trunc_marker)); - - spin_unlock_irqrestore(&dump_lock, flags); } -static void scx_error_irq_workfn(struct irq_work *irq_work) +static void scx_disable_irq_workfn(struct irq_work *irq_work) { - struct scx_sched *sch = container_of(irq_work, struct scx_sched, error_irq_work); + struct scx_sched *sch = container_of(irq_work, struct scx_sched, disable_irq_work); struct scx_exit_info *ei = sch->exit_info; if (ei->kind >= SCX_EXIT_ERROR) - scx_dump_state(ei, sch->ops.exit_dump_len); + scx_dump_state(sch, ei, sch->ops.exit_dump_len, true); kthread_queue_work(sch->helper, &sch->disable_work); } @@ -4806,7 +6292,7 @@ static bool scx_vexit(struct scx_sched *sch, ei->kind = kind; ei->reason = scx_exit_reason(ei->kind); - 
irq_work_queue(&sch->error_irq_work); + irq_work_queue(&sch->disable_irq_work); return true; } @@ -4837,14 +6323,47 @@ static int alloc_kick_syncs(void) return 0; } -static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) +static void free_pnode(struct scx_sched_pnode *pnode) +{ + if (!pnode) + return; + exit_dsq(&pnode->global_dsq); + kfree(pnode); +} + +static struct scx_sched_pnode *alloc_pnode(struct scx_sched *sch, int node) +{ + struct scx_sched_pnode *pnode; + + pnode = kzalloc_node(sizeof(*pnode), GFP_KERNEL, node); + if (!pnode) + return NULL; + + if (init_dsq(&pnode->global_dsq, SCX_DSQ_GLOBAL, sch)) { + kfree(pnode); + return NULL; + } + + return pnode; +} + +/* + * Allocate and initialize a new scx_sched. @cgrp's reference is always + * consumed whether the function succeeds or fails. + */ +static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops, + struct cgroup *cgrp, + struct scx_sched *parent) { struct scx_sched *sch; - int node, ret; + s32 level = parent ? 
parent->level + 1 : 0; + s32 node, cpu, ret, bypass_fail_cpu = nr_cpu_ids; - sch = kzalloc_obj(*sch); - if (!sch) - return ERR_PTR(-ENOMEM); + sch = kzalloc_flex(*sch, ancestors, level + 1); + if (!sch) { + ret = -ENOMEM; + goto err_put_cgrp; + } sch->exit_info = alloc_exit_info(ops->exit_dump_len); if (!sch->exit_info) { @@ -4856,29 +6375,42 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) if (ret < 0) goto err_free_ei; - sch->global_dsqs = kzalloc_objs(sch->global_dsqs[0], nr_node_ids); - if (!sch->global_dsqs) { + sch->pnode = kzalloc_objs(sch->pnode[0], nr_node_ids); + if (!sch->pnode) { ret = -ENOMEM; goto err_free_hash; } for_each_node_state(node, N_POSSIBLE) { - struct scx_dispatch_q *dsq; - - dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node); - if (!dsq) { + sch->pnode[node] = alloc_pnode(sch, node); + if (!sch->pnode[node]) { ret = -ENOMEM; - goto err_free_gdsqs; + goto err_free_pnode; } - - init_dsq(dsq, SCX_DSQ_GLOBAL); - sch->global_dsqs[node] = dsq; } - sch->pcpu = alloc_percpu(struct scx_sched_pcpu); + sch->dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH; + sch->pcpu = __alloc_percpu(struct_size_t(struct scx_sched_pcpu, + dsp_ctx.buf, sch->dsp_max_batch), + __alignof__(struct scx_sched_pcpu)); if (!sch->pcpu) { ret = -ENOMEM; - goto err_free_gdsqs; + goto err_free_pnode; + } + + for_each_possible_cpu(cpu) { + ret = init_dsq(bypass_dsq(sch, cpu), SCX_DSQ_BYPASS, sch); + if (ret) { + bypass_fail_cpu = cpu; + goto err_free_pcpu; + } + } + + for_each_possible_cpu(cpu) { + struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu); + + pcpu->sch = sch; + INIT_LIST_HEAD(&pcpu->deferred_reenq_local.node); } sch->helper = kthread_run_worker(0, "sched_ext_helper"); @@ -4889,33 +6421,98 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) sched_set_fifo(sch->helper->task); + if (parent) + memcpy(sch->ancestors, parent->ancestors, + level * sizeof(parent->ancestors[0])); + 
sch->ancestors[level] = sch; + sch->level = level; + + if (ops->timeout_ms) + sch->watchdog_timeout = msecs_to_jiffies(ops->timeout_ms); + else + sch->watchdog_timeout = SCX_WATCHDOG_MAX_TIMEOUT; + + sch->slice_dfl = SCX_SLICE_DFL; atomic_set(&sch->exit_kind, SCX_EXIT_NONE); - init_irq_work(&sch->error_irq_work, scx_error_irq_workfn); + init_irq_work(&sch->disable_irq_work, scx_disable_irq_workfn); kthread_init_work(&sch->disable_work, scx_disable_workfn); + timer_setup(&sch->bypass_lb_timer, scx_bypass_lb_timerfn, 0); sch->ops = *ops; - ops->priv = sch; + rcu_assign_pointer(ops->priv, sch); sch->kobj.kset = scx_kset; - ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); - if (ret < 0) + +#ifdef CONFIG_EXT_SUB_SCHED + char *buf = kzalloc(PATH_MAX, GFP_KERNEL); + if (!buf) { + ret = -ENOMEM; goto err_stop_helper; + } + cgroup_path(cgrp, buf, PATH_MAX); + sch->cgrp_path = kstrdup(buf, GFP_KERNEL); + kfree(buf); + if (!sch->cgrp_path) { + ret = -ENOMEM; + goto err_stop_helper; + } + + sch->cgrp = cgrp; + INIT_LIST_HEAD(&sch->children); + INIT_LIST_HEAD(&sch->sibling); + + if (parent) + ret = kobject_init_and_add(&sch->kobj, &scx_ktype, + &parent->sub_kset->kobj, + "sub-%llu", cgroup_id(cgrp)); + else + ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); + + if (ret < 0) { + kobject_put(&sch->kobj); + return ERR_PTR(ret); + } + if (ops->sub_attach) { + sch->sub_kset = kset_create_and_add("sub", NULL, &sch->kobj); + if (!sch->sub_kset) { + kobject_put(&sch->kobj); + return ERR_PTR(-ENOMEM); + } + } +#else /* CONFIG_EXT_SUB_SCHED */ + ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root"); + if (ret < 0) { + kobject_put(&sch->kobj); + return ERR_PTR(ret); + } +#endif /* CONFIG_EXT_SUB_SCHED */ return sch; +#ifdef CONFIG_EXT_SUB_SCHED err_stop_helper: kthread_destroy_worker(sch->helper); +#endif err_free_pcpu: + for_each_possible_cpu(cpu) { + if (cpu == bypass_fail_cpu) + break; + exit_dsq(bypass_dsq(sch, cpu)); + } 
free_percpu(sch->pcpu); -err_free_gdsqs: +err_free_pnode: for_each_node_state(node, N_POSSIBLE) - kfree(sch->global_dsqs[node]); - kfree(sch->global_dsqs); + free_pnode(sch->pnode[node]); + kfree(sch->pnode); err_free_hash: rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL); err_free_ei: free_exit_info(sch->exit_info); err_free_sch: kfree(sch); +err_put_cgrp: +#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED) + cgroup_put(cgrp); +#endif return ERR_PTR(ret); } @@ -4964,9 +6561,6 @@ static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops) return -EINVAL; } - if (ops->flags & SCX_OPS_HAS_CGROUP_WEIGHT) - pr_warn("SCX_OPS_HAS_CGROUP_WEIGHT is deprecated and a noop\n"); - if (ops->cpu_acquire || ops->cpu_release) pr_warn("ops->cpu_acquire/release() are deprecated, use sched_switch TP instead\n"); @@ -4986,15 +6580,14 @@ struct scx_enable_cmd { int ret; }; -static void scx_enable_workfn(struct kthread_work *work) +static void scx_root_enable_workfn(struct kthread_work *work) { - struct scx_enable_cmd *cmd = - container_of(work, struct scx_enable_cmd, work); + struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work); struct sched_ext_ops *ops = cmd->ops; + struct cgroup *cgrp = root_cgroup(); struct scx_sched *sch; struct scx_task_iter sti; struct task_struct *p; - unsigned long timeout; int i, cpu, ret; mutex_lock(&scx_enable_mutex); @@ -5008,7 +6601,10 @@ static void scx_enable_workfn(struct kthread_work *work) if (ret) goto err_unlock; - sch = scx_alloc_and_add_sched(ops); +#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED) + cgroup_get(cgrp); +#endif + sch = scx_alloc_and_add_sched(ops, cgrp, NULL); if (IS_ERR(sch)) { ret = PTR_ERR(sch); goto err_free_ksyncs; @@ -5020,13 +6616,15 @@ static void scx_enable_workfn(struct kthread_work *work) */ WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED); WARN_ON_ONCE(scx_root); - if (WARN_ON_ONCE(READ_ONCE(scx_aborting))) - 
WRITE_ONCE(scx_aborting, false); atomic_long_set(&scx_nr_rejected, 0); - for_each_possible_cpu(cpu) - cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE; + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + rq->scx.local_dsq.sched = sch; + rq->scx.cpuperf_target = SCX_CPUPERF_ONE; + } /* * Keep CPUs stable during enable so that the BPF scheduler can track @@ -5040,10 +6638,14 @@ static void scx_enable_workfn(struct kthread_work *work) */ rcu_assign_pointer(scx_root, sch); + ret = scx_link_sched(sch); + if (ret) + goto err_disable; + scx_idle_enable(ops); if (sch->ops.init) { - ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init, NULL); + ret = SCX_CALL_OP_RET(sch, init, NULL); if (ret) { ret = ops_sanitize_err(sch, "init", ret); cpus_read_unlock(); @@ -5070,34 +6672,13 @@ static void scx_enable_workfn(struct kthread_work *work) if (ret) goto err_disable; - WARN_ON_ONCE(scx_dsp_ctx); - scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH; - scx_dsp_ctx = __alloc_percpu(struct_size_t(struct scx_dsp_ctx, buf, - scx_dsp_max_batch), - __alignof__(struct scx_dsp_ctx)); - if (!scx_dsp_ctx) { - ret = -ENOMEM; - goto err_disable; - } - - if (ops->timeout_ms) - timeout = msecs_to_jiffies(ops->timeout_ms); - else - timeout = SCX_WATCHDOG_MAX_TIMEOUT; - - WRITE_ONCE(scx_watchdog_timeout, timeout); - WRITE_ONCE(scx_watchdog_timestamp, jiffies); - queue_delayed_work(system_unbound_wq, &scx_watchdog_work, - READ_ONCE(scx_watchdog_timeout) / 2); - /* * Once __scx_enabled is set, %current can be switched to SCX anytime. * This can lead to stalls as some BPF schedulers (e.g. userspace * scheduling) may not function correctly before all tasks are switched. * Init in bypass mode to guarantee forward progress. 
*/ - scx_bypass(true); - scx_bypassed_for_enable = true; + scx_bypass(sch, true); for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) if (((void (**)(void))ops)[i]) @@ -5129,11 +6710,12 @@ static void scx_enable_workfn(struct kthread_work *work) * never sees uninitialized tasks. */ scx_cgroup_lock(); + set_cgroup_sched(sch_cgroup(sch), sch); ret = scx_cgroup_init(sch); if (ret) goto err_disable_unlock_all; - scx_task_iter_start(&sti); + scx_task_iter_start(&sti, NULL); while ((p = scx_task_iter_next_locked(&sti))) { /* * @p may already be dead, have lost all its usages counts and @@ -5145,7 +6727,7 @@ static void scx_enable_workfn(struct kthread_work *work) scx_task_iter_unlock(&sti); - ret = scx_init_task(p, task_group(p), false); + ret = scx_init_task(sch, p, false); if (ret) { put_task_struct(p); scx_task_iter_stop(&sti); @@ -5154,6 +6736,7 @@ static void scx_enable_workfn(struct kthread_work *work) goto err_disable_unlock_all; } + scx_set_task_sched(p, sch); scx_set_task_state(p, SCX_TASK_READY); put_task_struct(p); @@ -5175,7 +6758,7 @@ static void scx_enable_workfn(struct kthread_work *work) * scx_tasks_lock. 
*/ percpu_down_write(&scx_fork_rwsem); - scx_task_iter_start(&sti); + scx_task_iter_start(&sti, NULL); while ((p = scx_task_iter_next_locked(&sti))) { unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; const struct sched_class *old_class = p->sched_class; @@ -5188,15 +6771,14 @@ static void scx_enable_workfn(struct kthread_work *work) queue_flags |= DEQUEUE_CLASS; scoped_guard (sched_change, p, queue_flags) { - p->scx.slice = READ_ONCE(scx_slice_dfl); + p->scx.slice = READ_ONCE(sch->slice_dfl); p->sched_class = new_class; } } scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); - scx_bypassed_for_enable = false; - scx_bypass(false); + scx_bypass(sch, false); if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) { WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE); @@ -5238,12 +6820,318 @@ err_disable: * Flush scx_disable_work to ensure that error is reported before init * completion. sch's base reference will be put by bpf_scx_unreg(). */ - scx_error(sch, "scx_enable() failed (%d)", ret); + scx_error(sch, "scx_root_enable() failed (%d)", ret); kthread_flush_work(&sch->disable_work); cmd->ret = 0; } -static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) +#ifdef CONFIG_EXT_SUB_SCHED +/* verify that a scheduler can be attached to @cgrp and return the parent */ +static struct scx_sched *find_parent_sched(struct cgroup *cgrp) +{ + struct scx_sched *parent = cgrp->scx_sched; + struct scx_sched *pos; + + lockdep_assert_held(&scx_sched_lock); + + /* can't attach twice to the same cgroup */ + if (parent->cgrp == cgrp) + return ERR_PTR(-EBUSY); + + /* does $parent allow sub-scheds? 
*/ + if (!parent->ops.sub_attach) + return ERR_PTR(-EOPNOTSUPP); + + /* can't insert between $parent and its exiting children */ + list_for_each_entry(pos, &parent->children, sibling) + if (cgroup_is_descendant(pos->cgrp, cgrp)) + return ERR_PTR(-EBUSY); + + return parent; +} + +static bool assert_task_ready_or_enabled(struct task_struct *p) +{ + u32 state = scx_get_task_state(p); + + switch (state) { + case SCX_TASK_READY: + case SCX_TASK_ENABLED: + return true; + default: + WARN_ONCE(true, "sched_ext: Invalid task state %d for %s[%d] during enabling sub sched", + state, p->comm, p->pid); + return false; + } +} + +static void scx_sub_enable_workfn(struct kthread_work *work) +{ + struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work); + struct sched_ext_ops *ops = cmd->ops; + struct cgroup *cgrp; + struct scx_sched *parent, *sch; + struct scx_task_iter sti; + struct task_struct *p; + s32 i, ret; + + mutex_lock(&scx_enable_mutex); + + if (!scx_enabled()) { + ret = -ENODEV; + goto out_unlock; + } + + cgrp = cgroup_get_from_id(ops->sub_cgroup_id); + if (IS_ERR(cgrp)) { + ret = PTR_ERR(cgrp); + goto out_unlock; + } + + raw_spin_lock_irq(&scx_sched_lock); + parent = find_parent_sched(cgrp); + if (IS_ERR(parent)) { + raw_spin_unlock_irq(&scx_sched_lock); + ret = PTR_ERR(parent); + goto out_put_cgrp; + } + kobject_get(&parent->kobj); + raw_spin_unlock_irq(&scx_sched_lock); + + /* scx_alloc_and_add_sched() consumes @cgrp whether it succeeds or not */ + sch = scx_alloc_and_add_sched(ops, cgrp, parent); + kobject_put(&parent->kobj); + if (IS_ERR(sch)) { + ret = PTR_ERR(sch); + goto out_unlock; + } + + ret = scx_link_sched(sch); + if (ret) + goto err_disable; + + if (sch->level >= SCX_SUB_MAX_DEPTH) { + scx_error(sch, "max nesting depth %d violated", + SCX_SUB_MAX_DEPTH); + goto err_disable; + } + + if (sch->ops.init) { + ret = SCX_CALL_OP_RET(sch, init, NULL); + if (ret) { + ret = ops_sanitize_err(sch, "init", ret); + scx_error(sch, "ops.init() failed 
(%d)", ret); + goto err_disable; + } + sch->exit_info->flags |= SCX_EFLAG_INITIALIZED; + } + + if (validate_ops(sch, ops)) + goto err_disable; + + struct scx_sub_attach_args sub_attach_args = { + .ops = &sch->ops, + .cgroup_path = sch->cgrp_path, + }; + + ret = SCX_CALL_OP_RET(parent, sub_attach, NULL, + &sub_attach_args); + if (ret) { + ret = ops_sanitize_err(sch, "sub_attach", ret); + scx_error(sch, "parent rejected (%d)", ret); + goto err_disable; + } + sch->sub_attached = true; + + scx_bypass(sch, true); + + for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) + if (((void (**)(void))ops)[i]) + set_bit(i, sch->has_op); + + percpu_down_write(&scx_fork_rwsem); + scx_cgroup_lock(); + + /* + * Set cgroup->scx_sched's and check CSS_ONLINE. Either we see + * !CSS_ONLINE or scx_cgroup_lifetime_notify() sees and shoots us down. + */ + set_cgroup_sched(sch_cgroup(sch), sch); + if (!(cgrp->self.flags & CSS_ONLINE)) { + scx_error(sch, "cgroup is not online"); + goto err_unlock_and_disable; + } + + /* + * Initialize tasks for the new child $sch without exiting them for + * $parent so that the tasks can always be reverted back to $parent + * sched on child init failure. + */ + WARN_ON_ONCE(scx_enabling_sub_sched); + scx_enabling_sub_sched = sch; + + scx_task_iter_start(&sti, sch->cgrp); + while ((p = scx_task_iter_next_locked(&sti))) { + struct rq *rq; + struct rq_flags rf; + + /* + * Task iteration may visit the same task twice when racing + * against exiting. Use %SCX_TASK_SUB_INIT to mark tasks which + * finished __scx_init_task() and skip if set. + * + * A task may exit and get freed between __scx_init_task() + * completion and scx_enable_task(). In such cases, + * scx_disable_and_exit_task() must exit the task for both the + * parent and child scheds. 
+ */ + if (p->scx.flags & SCX_TASK_SUB_INIT) + continue; + + /* see scx_root_enable() */ + if (!tryget_task_struct(p)) + continue; + + if (!assert_task_ready_or_enabled(p)) { + ret = -EINVAL; + goto abort; + } + + scx_task_iter_unlock(&sti); + + /* + * As $p is still on $parent, it can't be transitioned to INIT. + * Let's worry about task state later. Use __scx_init_task(). + */ + ret = __scx_init_task(sch, p, false); + if (ret) + goto abort; + + rq = task_rq_lock(p, &rf); + p->scx.flags |= SCX_TASK_SUB_INIT; + task_rq_unlock(rq, p, &rf); + + put_task_struct(p); + } + scx_task_iter_stop(&sti); + + /* + * All tasks are prepped. Disable/exit tasks for $parent and enable for + * the new @sch. + */ + scx_task_iter_start(&sti, sch->cgrp); + while ((p = scx_task_iter_next_locked(&sti))) { + /* + * Use clearing of %SCX_TASK_SUB_INIT to detect and skip + * duplicate iterations. + */ + if (!(p->scx.flags & SCX_TASK_SUB_INIT)) + continue; + + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { + /* + * $p must be either READY or ENABLED. If ENABLED, + * __scx_disabled_and_exit_task() first disables and + * makes it READY. However, after exiting $p, it will + * leave $p as READY. + */ + assert_task_ready_or_enabled(p); + __scx_disable_and_exit_task(parent, p); + + /* + * $p is now only initialized for @sch and READY, which + * is what we want. Assign it to @sch and enable. 
+ */ + rcu_assign_pointer(p->scx.sched, sch); + scx_enable_task(sch, p); + + p->scx.flags &= ~SCX_TASK_SUB_INIT; + } + } + scx_task_iter_stop(&sti); + + scx_enabling_sub_sched = NULL; + + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); + + scx_bypass(sch, false); + + pr_info("sched_ext: BPF sub-scheduler \"%s\" enabled\n", sch->ops.name); + kobject_uevent(&sch->kobj, KOBJ_ADD); + ret = 0; + goto out_unlock; + +out_put_cgrp: + cgroup_put(cgrp); +out_unlock: + mutex_unlock(&scx_enable_mutex); + cmd->ret = ret; + return; + +abort: + put_task_struct(p); + scx_task_iter_stop(&sti); + scx_enabling_sub_sched = NULL; + + scx_task_iter_start(&sti, sch->cgrp); + while ((p = scx_task_iter_next_locked(&sti))) { + if (p->scx.flags & SCX_TASK_SUB_INIT) { + __scx_disable_and_exit_task(sch, p); + p->scx.flags &= ~SCX_TASK_SUB_INIT; + } + } + scx_task_iter_stop(&sti); +err_unlock_and_disable: + /* we'll soon enter disable path, keep bypass on */ + scx_cgroup_unlock(); + percpu_up_write(&scx_fork_rwsem); +err_disable: + mutex_unlock(&scx_enable_mutex); + kthread_flush_work(&sch->disable_work); + cmd->ret = 0; +} + +static s32 scx_cgroup_lifetime_notify(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct cgroup *cgrp = data; + struct cgroup *parent = cgroup_parent(cgrp); + + if (!cgroup_on_dfl(cgrp)) + return NOTIFY_OK; + + switch (action) { + case CGROUP_LIFETIME_ONLINE: + /* inherit ->scx_sched from $parent */ + if (parent) + rcu_assign_pointer(cgrp->scx_sched, parent->scx_sched); + break; + case CGROUP_LIFETIME_OFFLINE: + /* if there is a sched attached, shoot it down */ + if (cgrp->scx_sched && cgrp->scx_sched->cgrp == cgrp) + scx_exit(cgrp->scx_sched, SCX_EXIT_UNREG_KERN, + SCX_ECODE_RSN_CGROUP_OFFLINE, + "cgroup %llu going offline", cgroup_id(cgrp)); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block scx_cgroup_lifetime_nb = { + .notifier_call = scx_cgroup_lifetime_notify, +}; + +static s32 __init 
scx_cgroup_lifetime_notifier_init(void) +{ + return blocking_notifier_chain_register(&cgroup_lifetime_notifier, + &scx_cgroup_lifetime_nb); +} +core_initcall(scx_cgroup_lifetime_notifier_init); +#endif /* CONFIG_EXT_SUB_SCHED */ + +static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) { static struct kthread_worker *helper; static DEFINE_MUTEX(helper_mutex); @@ -5270,7 +7158,12 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) mutex_unlock(&helper_mutex); } - kthread_init_work(&cmd.work, scx_enable_workfn); +#ifdef CONFIG_EXT_SUB_SCHED + if (ops->sub_cgroup_id > 1) + kthread_init_work(&cmd.work, scx_sub_enable_workfn); + else +#endif /* CONFIG_EXT_SUB_SCHED */ + kthread_init_work(&cmd.work, scx_root_enable_workfn); cmd.ops = ops; kthread_queue_work(READ_ONCE(helper), &cmd.work); @@ -5311,12 +7204,17 @@ static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, t = btf_type_by_id(reg->btf, reg->btf_id); if (t == task_struct_type) { - if (off >= offsetof(struct task_struct, scx.slice) && - off + size <= offsetofend(struct task_struct, scx.slice)) - return SCALAR_VALUE; - if (off >= offsetof(struct task_struct, scx.dsq_vtime) && - off + size <= offsetofend(struct task_struct, scx.dsq_vtime)) + /* + * COMPAT: Will be removed in v6.23. 
+ */ + if ((off >= offsetof(struct task_struct, scx.slice) && + off + size <= offsetofend(struct task_struct, scx.slice)) || + (off >= offsetof(struct task_struct, scx.dsq_vtime) && + off + size <= offsetofend(struct task_struct, scx.dsq_vtime))) { + pr_warn("sched_ext: Writing directly to p->scx.slice/dsq_vtime is deprecated, use scx_bpf_task_set_slice/dsq_vtime()"); return SCALAR_VALUE; + } + if (off >= offsetof(struct task_struct, scx.disallow) && off + size <= offsetofend(struct task_struct, scx.disallow)) return SCALAR_VALUE; @@ -5372,11 +7270,30 @@ static int bpf_scx_init_member(const struct btf_type *t, case offsetof(struct sched_ext_ops, hotplug_seq): ops->hotplug_seq = *(u64 *)(udata + moff); return 1; +#ifdef CONFIG_EXT_SUB_SCHED + case offsetof(struct sched_ext_ops, sub_cgroup_id): + ops->sub_cgroup_id = *(u64 *)(udata + moff); + return 1; +#endif /* CONFIG_EXT_SUB_SCHED */ } return 0; } +#ifdef CONFIG_EXT_SUB_SCHED +static void scx_pstack_recursion_on_dispatch(struct bpf_prog *prog) +{ + struct scx_sched *sch; + + guard(rcu)(); + sch = scx_prog_sched(prog->aux); + if (unlikely(!sch)) + return; + + scx_error(sch, "dispatch recursion detected"); +} +#endif /* CONFIG_EXT_SUB_SCHED */ + static int bpf_scx_check_member(const struct btf_type *t, const struct btf_member *member, const struct bpf_prog *prog) @@ -5394,12 +7311,30 @@ static int bpf_scx_check_member(const struct btf_type *t, case offsetof(struct sched_ext_ops, cpu_offline): case offsetof(struct sched_ext_ops, init): case offsetof(struct sched_ext_ops, exit): + case offsetof(struct sched_ext_ops, sub_attach): + case offsetof(struct sched_ext_ops, sub_detach): break; default: if (prog->sleepable) return -EINVAL; } +#ifdef CONFIG_EXT_SUB_SCHED + /* + * Enable private stack for operations that can nest along the + * hierarchy. + * + * XXX - Ideally, we should only do this for scheds that allow + * sub-scheds and sub-scheds themselves but I don't know how to access + * struct_ops from here. 
+ */ + switch (moff) { + case offsetof(struct sched_ext_ops, dispatch): + prog->aux->priv_stack_requested = true; + prog->aux->recursion_detected = scx_pstack_recursion_on_dispatch; + } +#endif /* CONFIG_EXT_SUB_SCHED */ + return 0; } @@ -5411,10 +7346,11 @@ static int bpf_scx_reg(void *kdata, struct bpf_link *link) static void bpf_scx_unreg(void *kdata, struct bpf_link *link) { struct sched_ext_ops *ops = kdata; - struct scx_sched *sch = ops->priv; + struct scx_sched *sch = rcu_dereference_protected(ops->priv, true); - scx_disable(SCX_EXIT_UNREG); + scx_disable(sch, SCX_EXIT_UNREG); kthread_flush_work(&sch->disable_work); + RCU_INIT_POINTER(ops->priv, NULL); kobject_put(&sch->kobj); } @@ -5471,7 +7407,9 @@ static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgro static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {} static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {} static void sched_ext_ops__cgroup_set_idle(struct cgroup *cgrp, bool idle) {} -#endif +#endif /* CONFIG_EXT_GROUP_SCHED */ +static s32 sched_ext_ops__sub_attach(struct scx_sub_attach_args *args) { return -EINVAL; } +static void sched_ext_ops__sub_detach(struct scx_sub_detach_args *args) {} static void sched_ext_ops__cpu_online(s32 cpu) {} static void sched_ext_ops__cpu_offline(s32 cpu) {} static s32 sched_ext_ops__init(void) { return -EINVAL; } @@ -5511,6 +7449,8 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = { .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth, .cgroup_set_idle = sched_ext_ops__cgroup_set_idle, #endif + .sub_attach = sched_ext_ops__sub_attach, + .sub_detach = sched_ext_ops__sub_detach, .cpu_online = sched_ext_ops__cpu_online, .cpu_offline = sched_ext_ops__cpu_offline, .init = sched_ext_ops__init, @@ -5541,7 +7481,15 @@ static struct bpf_struct_ops bpf_sched_ext_ops = { static void sysrq_handle_sched_ext_reset(u8 key) { - scx_disable(SCX_EXIT_SYSRQ); + 
struct scx_sched *sch; + + rcu_read_lock(); + sch = rcu_dereference(scx_root); + if (likely(sch)) + scx_disable(sch, SCX_EXIT_SYSRQ); + else + pr_info("sched_ext: BPF schedulers not loaded\n"); + rcu_read_unlock(); } static const struct sysrq_key_op sysrq_sched_ext_reset_op = { @@ -5554,9 +7502,10 @@ static const struct sysrq_key_op sysrq_sched_ext_reset_op = { static void sysrq_handle_sched_ext_dump(u8 key) { struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" }; + struct scx_sched *sch; - if (scx_enabled()) - scx_dump_state(&ei, 0); + list_for_each_entry_rcu(sch, &scx_sched_all, all) + scx_dump_state(sch, &ei, 0, false); } static const struct sysrq_key_op sysrq_sched_ext_dump_op = { @@ -5610,11 +7559,11 @@ static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *ksyncs) if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) { if (cur_class == &ext_sched_class) { + cpumask_set_cpu(cpu, this_scx->cpus_to_sync); ksyncs[cpu] = rq->scx.kick_sync; should_wait = true; - } else { - cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); } + cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); } resched_curr(rq); @@ -5651,10 +7600,9 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) unsigned long *ksyncs; s32 cpu; - if (unlikely(!ksyncs_pcpu)) { - pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_syncs"); + /* can race with free_kick_syncs() during scheduler disable */ + if (unlikely(!ksyncs_pcpu)) return; - } ksyncs = rcu_dereference_bh(ksyncs_pcpu)->syncs; @@ -5669,27 +7617,15 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); } - if (!should_wait) - return; - - for_each_cpu(cpu, this_scx->cpus_to_wait) { - unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync; - - /* - * Busy-wait until the task running at the time of kicking is no - * longer running. This can be used to implement e.g. core - * scheduling. 
- * - * smp_cond_load_acquire() pairs with store_releases in - * pick_task_scx() and put_prev_task_scx(). The former breaks - * the wait if SCX's scheduling path is entered even if the same - * task is picked subsequently. The latter is necessary to break - * the wait when $cpu is taken by a higher sched class. - */ - if (cpu != cpu_of(this_rq)) - smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]); - - cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); + /* + * Can't wait in hardirq — kick_sync can't advance, deadlocking if + * CPUs wait for each other. Defer to kick_sync_wait_bal_cb(). + */ + if (should_wait) { + raw_spin_rq_lock(this_rq); + this_scx->kick_sync_pending = true; + resched_curr(this_rq); + raw_spin_rq_unlock(this_rq); } } @@ -5707,14 +7643,18 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work) */ void print_scx_info(const char *log_lvl, struct task_struct *p) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch; enum scx_enable_state state = scx_enable_state(); const char *all = READ_ONCE(scx_switching_all) ? "+all" : ""; char runnable_at_buf[22] = "?"; struct sched_class *class; unsigned long runnable_at; - if (state == SCX_DISABLED) + guard(rcu)(); + + sch = scx_task_sched_rcu(p); + + if (!sch) return; /* @@ -5741,6 +7681,14 @@ void print_scx_info(const char *log_lvl, struct task_struct *p) static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *ptr) { + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (!sch) + return NOTIFY_OK; + /* * SCX schedulers often have userspace components which are sometimes * involved in critial scheduling paths. 
PM operations involve freezing @@ -5751,12 +7699,12 @@ static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void * case PM_HIBERNATION_PREPARE: case PM_SUSPEND_PREPARE: case PM_RESTORE_PREPARE: - scx_bypass(true); + scx_bypass(sch, true); break; case PM_POST_HIBERNATION: case PM_POST_SUSPEND: case PM_POST_RESTORE: - scx_bypass(false); + scx_bypass(sch, false); break; } @@ -5785,8 +7733,9 @@ void __init init_sched_ext_class(void) struct rq *rq = cpu_rq(cpu); int n = cpu_to_node(cpu); - init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); - init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS); + /* local_dsq's sch will be set during scx_root_enable() */ + BUG_ON(init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL, NULL)); + INIT_LIST_HEAD(&rq->scx.runnable_list); INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals); @@ -5794,6 +7743,10 @@ void __init init_sched_ext_class(void) BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_sync, GFP_KERNEL, n)); + raw_spin_lock_init(&rq->scx.deferred_reenq_lock); + INIT_LIST_HEAD(&rq->scx.deferred_reenq_locals); + INIT_LIST_HEAD(&rq->scx.deferred_reenq_users); rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn); rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn); @@ -5804,18 +7757,36 @@ void __init init_sched_ext_class(void) register_sysrq_key('S', &sysrq_sched_ext_reset_op); register_sysrq_key('D', &sysrq_sched_ext_dump_op); INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn); + +#ifdef CONFIG_EXT_SUB_SCHED + BUG_ON(rhashtable_init(&scx_sched_hash, &scx_sched_hash_params)); +#endif /* CONFIG_EXT_SUB_SCHED */ } /******************************************************************************** * Helpers that can be called from the BPF scheduler. 
*/ -static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p, - u64 enq_flags) +static bool scx_vet_enq_flags(struct scx_sched *sch, u64 dsq_id, u64 *enq_flags) { - if (!scx_kf_allowed(sch, SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) - return false; + bool is_local = dsq_id == SCX_DSQ_LOCAL || + (dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON; + + if (*enq_flags & SCX_ENQ_IMMED) { + if (unlikely(!is_local)) { + scx_error(sch, "SCX_ENQ_IMMED on a non-local DSQ 0x%llx", dsq_id); + return false; + } + } else if ((sch->ops.flags & SCX_OPS_ALWAYS_ENQ_IMMED) && is_local) { + *enq_flags |= SCX_ENQ_IMMED; + } + + return true; +} +static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p, + u64 dsq_id, u64 *enq_flags) +{ lockdep_assert_irqs_disabled(); if (unlikely(!p)) { @@ -5823,18 +7794,27 @@ static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p return false; } - if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) { - scx_error(sch, "invalid enq_flags 0x%llx", enq_flags); + if (unlikely(*enq_flags & __SCX_ENQ_INTERNAL_MASK)) { + scx_error(sch, "invalid enq_flags 0x%llx", *enq_flags); return false; } + /* see SCX_EV_INSERT_NOT_OWNED definition */ + if (unlikely(!scx_task_on_sched(sch, p))) { + __scx_add_event(sch, SCX_EV_INSERT_NOT_OWNED, 1); + return false; + } + + if (!scx_vet_enq_flags(sch, dsq_id, enq_flags)) + return false; + return true; } static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p, u64 dsq_id, u64 enq_flags) { - struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); + struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; struct task_struct *ddsp_task; ddsp_task = __this_cpu_read(direct_dispatch_task); @@ -5843,7 +7823,7 @@ static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p, return; } - if (unlikely(dspc->cursor >= scx_dsp_max_batch)) { + if (unlikely(dspc->cursor >= sch->dsp_max_batch)) { scx_error(sch, "dispatch buffer overflow"); 
return; } @@ -5864,6 +7844,7 @@ __bpf_kfunc_start_defs(); * @dsq_id: DSQ to insert into * @slice: duration @p can run for in nsecs, 0 to keep the current value * @enq_flags: SCX_ENQ_* + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Insert @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe to * call this function spuriously. Can be called from ops.enqueue(), @@ -5898,16 +7879,17 @@ __bpf_kfunc_start_defs(); * to check the return value. */ __bpf_kfunc bool scx_bpf_dsq_insert___v2(struct task_struct *p, u64 dsq_id, - u64 slice, u64 enq_flags) + u64 slice, u64 enq_flags, + const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return false; - if (!scx_dsq_insert_preamble(sch, p, enq_flags)) + if (!scx_dsq_insert_preamble(sch, p, dsq_id, &enq_flags)) return false; if (slice) @@ -5924,15 +7906,16 @@ __bpf_kfunc bool scx_bpf_dsq_insert___v2(struct task_struct *p, u64 dsq_id, * COMPAT: Will be removed in v6.23 along with the ___v2 suffix. 
*/ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, - u64 slice, u64 enq_flags) + u64 slice, u64 enq_flags, + const struct bpf_prog_aux *aux) { - scx_bpf_dsq_insert___v2(p, dsq_id, slice, enq_flags); + scx_bpf_dsq_insert___v2(p, dsq_id, slice, enq_flags, aux); } static bool scx_dsq_insert_vtime(struct scx_sched *sch, struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) { - if (!scx_dsq_insert_preamble(sch, p, enq_flags)) + if (!scx_dsq_insert_preamble(sch, p, dsq_id, &enq_flags)) return false; if (slice) @@ -5963,6 +7946,7 @@ struct scx_bpf_dsq_insert_vtime_args { * @args->slice: duration @p can run for in nsecs, 0 to keep the current value * @args->vtime: @p's ordering inside the vtime-sorted queue of the target DSQ * @args->enq_flags: SCX_ENQ_* + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument * limit. BPF programs should use scx_bpf_dsq_insert_vtime() which is provided @@ -5987,13 +7971,14 @@ struct scx_bpf_dsq_insert_vtime_args { */ __bpf_kfunc bool __scx_bpf_dsq_insert_vtime(struct task_struct *p, - struct scx_bpf_dsq_insert_vtime_args *args) + struct scx_bpf_dsq_insert_vtime_args *args, + const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return false; @@ -6015,44 +8000,61 @@ __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, if (unlikely(!sch)) return; +#ifdef CONFIG_EXT_SUB_SCHED + /* + * Disallow if any sub-scheds are attached. There is no way to tell + * which scheduler called us, just error out @p's scheduler. 
+ */ + if (unlikely(!list_empty(&sch->children))) { + scx_error(scx_task_sched(p), "__scx_bpf_dsq_insert_vtime() must be used"); + return; + } +#endif + scx_dsq_insert_vtime(sch, p, dsq_id, slice, vtime, enq_flags); } __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) -BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_dsq_insert___v2, KF_RCU) -BTF_ID_FLAGS(func, __scx_bpf_dsq_insert_vtime, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_IMPLICIT_ARGS | KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dsq_insert___v2, KF_IMPLICIT_ARGS | KF_RCU) +BTF_ID_FLAGS(func, __scx_bpf_dsq_insert_vtime, KF_IMPLICIT_ARGS | KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { .owner = THIS_MODULE, .set = &scx_kfunc_ids_enqueue_dispatch, + .filter = scx_kfunc_context_filter, }; static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, struct task_struct *p, u64 dsq_id, u64 enq_flags) { - struct scx_sched *sch = scx_root; struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq; + struct scx_sched *sch = src_dsq->sched; struct rq *this_rq, *src_rq, *locked_rq; bool dispatched = false; bool in_balance; unsigned long flags; - if (!scx_kf_allowed_if_unlocked() && - !scx_kf_allowed(sch, SCX_KF_DISPATCH)) + if (!scx_vet_enq_flags(sch, dsq_id, &enq_flags)) return false; /* * If the BPF scheduler keeps calling this function repeatedly, it can * cause similar live-lock conditions as consume_dispatch_q(). */ - if (unlikely(READ_ONCE(scx_aborting))) + if (unlikely(READ_ONCE(sch->aborting))) return false; + if (unlikely(!scx_task_on_sched(sch, p))) { + scx_error(sch, "scx_bpf_dsq_move[_vtime]() on %s[%d] but the task belongs to a different scheduler", + p->comm, p->pid); + return false; + } + /* * Can be called from either ops.dispatch() locking this_rq() or any * context where no rq lock is held. 
If latter, lock @p's task_rq which @@ -6076,20 +8078,14 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, locked_rq = src_rq; raw_spin_lock(&src_dsq->lock); - /* - * Did someone else get to it? @p could have already left $src_dsq, got - * re-enqueud, or be in the process of being consumed by someone else. - */ - if (unlikely(p->scx.dsq != src_dsq || - u32_before(kit->cursor.priv, p->scx.dsq_seq) || - p->scx.holding_cpu >= 0) || - WARN_ON_ONCE(src_rq != task_rq(p))) { + /* did someone else get to it while we dropped the locks? */ + if (nldsq_cursor_lost_task(&kit->cursor, src_rq, src_dsq, p)) { raw_spin_unlock(&src_dsq->lock); goto out; } /* @p is still on $src_dsq and stable, determine the destination */ - dst_dsq = find_dsq_for_dispatch(sch, this_rq, dsq_id, p); + dst_dsq = find_dsq_for_dispatch(sch, this_rq, dsq_id, task_cpu(p)); /* * Apply vtime and slice updates before moving so that the new time is @@ -6123,44 +8119,42 @@ __bpf_kfunc_start_defs(); /** * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Can only be called from ops.dispatch(). */ -__bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void) +__bpf_kfunc u32 scx_bpf_dispatch_nr_slots(const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return 0; - if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) - return 0; - - return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor); + return sch->dsp_max_batch - __this_cpu_read(sch->pcpu->dsp_ctx.cursor); } /** * scx_bpf_dispatch_cancel - Cancel the latest dispatch + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Cancel the latest dispatch. Can be called multiple times to cancel further * dispatches. Can only be called from ops.dispatch(). 
*/ -__bpf_kfunc void scx_bpf_dispatch_cancel(void) +__bpf_kfunc void scx_bpf_dispatch_cancel(const struct bpf_prog_aux *aux) { - struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); struct scx_sched *sch; + struct scx_dsp_ctx *dspc; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return; - if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) - return; + dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; if (dspc->cursor > 0) dspc->cursor--; @@ -6170,10 +8164,21 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void) /** * scx_bpf_dsq_move_to_local - move a task from a DSQ to the current CPU's local DSQ - * @dsq_id: DSQ to move task from + * @dsq_id: DSQ to move task from. Must be a user-created DSQ + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs + * @enq_flags: %SCX_ENQ_* * * Move a task from the non-local DSQ identified by @dsq_id to the current CPU's - * local DSQ for execution. Can only be called from ops.dispatch(). + * local DSQ for execution with @enq_flags applied. Can only be called from + * ops.dispatch(). + * + * Built-in DSQs (%SCX_DSQ_GLOBAL and %SCX_DSQ_LOCAL*) are not supported as + * sources. Local DSQs support reenqueueing (a task can be picked up for + * execution, dequeued for property changes, or reenqueued), but the BPF + * scheduler cannot directly iterate or move tasks from them. %SCX_DSQ_GLOBAL + * is similar but also doesn't support reenqueueing, as it maps to multiple + * per-node DSQs making the scope difficult to define; this may change in the + * future. * * This function flushes the in-flight dispatches from scx_bpf_dsq_insert() * before trying to move from the specified DSQ. It may also grab rq locks and @@ -6182,21 +8187,24 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void) * Returns %true if a task has been moved, %false if there isn't any task to * move. 
*/ -__bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id) +__bpf_kfunc bool scx_bpf_dsq_move_to_local___v2(u64 dsq_id, u64 enq_flags, + const struct bpf_prog_aux *aux) { - struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); struct scx_dispatch_q *dsq; struct scx_sched *sch; + struct scx_dsp_ctx *dspc; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return false; - if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) + if (!scx_vet_enq_flags(sch, SCX_DSQ_LOCAL, &enq_flags)) return false; + dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx; + flush_dispatch_buf(sch, dspc->rq); dsq = find_user_dsq(sch, dsq_id); @@ -6205,7 +8213,7 @@ __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id) return false; } - if (consume_dispatch_q(sch, dspc->rq, dsq)) { + if (consume_dispatch_q(sch, dspc->rq, dsq, enq_flags)) { /* * A successfully consumed task can be dequeued before it starts * running while the CPU is trying to migrate other dispatched @@ -6219,6 +8227,14 @@ __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id) } } +/* + * COMPAT: ___v2 was introduced in v7.1. Remove this and ___v2 tag in the future. + */ +__bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id, const struct bpf_prog_aux *aux) +{ + return scx_bpf_dsq_move_to_local___v2(dsq_id, 0, aux); +} + /** * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs * @it__iter: DSQ iterator in progress @@ -6314,105 +8330,104 @@ __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); } +#ifdef CONFIG_EXT_SUB_SCHED +/** + * scx_bpf_sub_dispatch - Trigger dispatching on a child scheduler + * @cgroup_id: cgroup ID of the child scheduler to dispatch + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs + * + * Allows a parent scheduler to trigger dispatching on one of its direct + * child schedulers. 
The child scheduler runs its dispatch operation to + * move tasks from dispatch queues to the local runqueue. + * + * Returns: true on success, false if cgroup_id is invalid, not a direct + * child, or caller lacks dispatch permission. + */ +__bpf_kfunc bool scx_bpf_sub_dispatch(u64 cgroup_id, const struct bpf_prog_aux *aux) +{ + struct rq *this_rq = this_rq(); + struct scx_sched *parent, *child; + + guard(rcu)(); + parent = scx_prog_sched(aux); + if (unlikely(!parent)) + return false; + + child = scx_find_sub_sched(cgroup_id); + + if (unlikely(!child)) + return false; + + if (unlikely(scx_parent(child) != parent)) { + scx_error(parent, "trying to dispatch a distant sub-sched on cgroup %llu", + cgroup_id); + return false; + } + + return scx_dispatch_sched(child, this_rq, this_rq->scx.sub_dispatch_prev, + true); +} +#endif /* CONFIG_EXT_SUB_SCHED */ + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_dispatch) -BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) -BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel) -BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local) +BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local___v2, KF_IMPLICIT_ARGS) +/* scx_bpf_dsq_move*() also in scx_kfunc_ids_unlocked: callable from unlocked contexts */ BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) +#ifdef CONFIG_EXT_SUB_SCHED +BTF_ID_FLAGS(func, scx_bpf_sub_dispatch, KF_IMPLICIT_ARGS) +#endif BTF_KFUNCS_END(scx_kfunc_ids_dispatch) static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { .owner = THIS_MODULE, .set = &scx_kfunc_ids_dispatch, + .filter = scx_kfunc_context_filter, }; -static u32 reenq_local(struct rq *rq) -{ - LIST_HEAD(tasks); - u32 
nr_enqueued = 0; - struct task_struct *p, *n; - - lockdep_assert_rq_held(rq); - - /* - * The BPF scheduler may choose to dispatch tasks back to - * @rq->scx.local_dsq. Move all candidate tasks off to a private list - * first to avoid processing the same tasks repeatedly. - */ - list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list, - scx.dsq_list.node) { - /* - * If @p is being migrated, @p's current CPU may not agree with - * its allowed CPUs and the migration_cpu_stop is about to - * deactivate and re-activate @p anyway. Skip re-enqueueing. - * - * While racing sched property changes may also dequeue and - * re-enqueue a migrating task while its current CPU and allowed - * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to - * the current local DSQ for running tasks and thus are not - * visible to the BPF scheduler. - */ - if (p->migration_pending) - continue; - - dispatch_dequeue(rq, p); - list_add_tail(&p->scx.dsq_list.node, &tasks); - } - - list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) { - list_del_init(&p->scx.dsq_list.node); - do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); - nr_enqueued++; - } - - return nr_enqueued; -} - __bpf_kfunc_start_defs(); /** * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Iterate over all of the tasks currently enqueued on the local DSQ of the * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of * processed tasks. Can only be called from ops.cpu_release(). - * - * COMPAT: Will be removed in v6.23 along with the ___v2 suffix on the void - * returning variant that can be called from anywhere. 
*/ -__bpf_kfunc u32 scx_bpf_reenqueue_local(void) +__bpf_kfunc u32 scx_bpf_reenqueue_local(const struct bpf_prog_aux *aux) { struct scx_sched *sch; struct rq *rq; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return 0; - if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE)) - return 0; - rq = cpu_rq(smp_processor_id()); lockdep_assert_rq_held(rq); - return reenq_local(rq); + return reenq_local(sch, rq, SCX_REENQ_ANY); } __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_cpu_release) -BTF_ID_FLAGS(func, scx_bpf_reenqueue_local) +BTF_ID_FLAGS(func, scx_bpf_reenqueue_local, KF_IMPLICIT_ARGS) BTF_KFUNCS_END(scx_kfunc_ids_cpu_release) static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = { .owner = THIS_MODULE, .set = &scx_kfunc_ids_cpu_release, + .filter = scx_kfunc_context_filter, }; __bpf_kfunc_start_defs(); @@ -6421,11 +8436,12 @@ __bpf_kfunc_start_defs(); * scx_bpf_create_dsq - Create a custom DSQ * @dsq_id: DSQ to create * @node: NUMA node to allocate from + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Create a custom DSQ identified by @dsq_id. Can be called from any sleepable * scx callback, and any BPF_PROG_TYPE_SYSCALL prog. */ -__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) +__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node, const struct bpf_prog_aux *aux) { struct scx_dispatch_q *dsq; struct scx_sched *sch; @@ -6442,36 +8458,54 @@ __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) if (!dsq) return -ENOMEM; - init_dsq(dsq, dsq_id); + /* + * init_dsq() must be called in GFP_KERNEL context. Init it with NULL + * @sch and update afterwards. 
+ */ + ret = init_dsq(dsq, dsq_id, NULL); + if (ret) { + kfree(dsq); + return ret; + } rcu_read_lock(); - sch = rcu_dereference(scx_root); - if (sch) + sch = scx_prog_sched(aux); + if (sch) { + dsq->sched = sch; ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node, dsq_hash_params); - else + } else { ret = -ENODEV; + } rcu_read_unlock(); - if (ret) + if (ret) { + exit_dsq(dsq); kfree(dsq); + } return ret; } __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_unlocked) -BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE) +BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_IMPLICIT_ARGS | KF_SLEEPABLE) +/* also in scx_kfunc_ids_dispatch: also callable from ops.dispatch() */ BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) +/* also in scx_kfunc_ids_select_cpu: also callable from ops.select_cpu()/ops.enqueue() */ +BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_IMPLICIT_ARGS | KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_IMPLICIT_ARGS | KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_unlocked) static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = { .owner = THIS_MODULE, .set = &scx_kfunc_ids_unlocked, + .filter = scx_kfunc_context_filter, }; __bpf_kfunc_start_defs(); @@ -6480,12 +8514,21 @@ __bpf_kfunc_start_defs(); * scx_bpf_task_set_slice - Set task's time slice * @p: task of interest * @slice: time slice to set in nsecs + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Set @p's time slice to @slice. Returns %true on success, %false if the * calling scheduler doesn't have authority over @p. 
*/ -__bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice) +__bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice, + const struct bpf_prog_aux *aux) { + struct scx_sched *sch; + + guard(rcu)(); + sch = scx_prog_sched(aux); + if (unlikely(!scx_task_on_sched(sch, p))) + return false; + p->scx.slice = slice; return true; } @@ -6494,12 +8537,21 @@ __bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice) * scx_bpf_task_set_dsq_vtime - Set task's virtual time for DSQ ordering * @p: task of interest * @vtime: virtual time to set + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Set @p's virtual time to @vtime. Returns %true on success, %false if the * calling scheduler doesn't have authority over @p. */ -__bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime) +__bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime, + const struct bpf_prog_aux *aux) { + struct scx_sched *sch; + + guard(rcu)(); + sch = scx_prog_sched(aux); + if (unlikely(!scx_task_on_sched(sch, p))) + return false; + p->scx.dsq_vtime = vtime; return true; } @@ -6521,7 +8573,7 @@ static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags) * lead to irq_work_queue() malfunction such as infinite busy wait for * IRQ status update. Suppress kicking. */ - if (scx_rq_bypassing(this_rq)) + if (scx_bypassing(sch, cpu_of(this_rq))) goto out; /* @@ -6561,18 +8613,19 @@ out: * scx_bpf_kick_cpu - Trigger reschedule on a CPU * @cpu: cpu to kick * @flags: %SCX_KICK_* flags + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or * trigger rescheduling on a busy CPU. This can be called from any online * scx_ops operation and the actual kicking is performed asynchronously through * an irq work. 
*/ -__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) +__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags, const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (likely(sch)) scx_kick_cpu(sch, cpu, flags); } @@ -6646,13 +8699,14 @@ __bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id) * @it: iterator to initialize * @dsq_id: DSQ to iterate * @flags: %SCX_DSQ_ITER_* + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Initialize BPF iterator @it which can be used with bpf_for_each() to walk * tasks in the DSQ specified by @dsq_id. Iteration using @it only includes * tasks which are already queued when this function is invoked. */ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, - u64 flags) + u64 flags, const struct bpf_prog_aux *aux) { struct bpf_iter_scx_dsq_kern *kit = (void *)it; struct scx_sched *sch; @@ -6670,7 +8724,7 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, */ kit->dsq = NULL; - sch = rcu_dereference_check(scx_root, rcu_read_lock_bh_held()); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return -ENODEV; @@ -6681,8 +8735,7 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, if (!kit->dsq) return -ENOENT; - kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, flags, - READ_ONCE(kit->dsq->seq)); + kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, kit->dsq, flags); return 0; } @@ -6696,41 +8749,13 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, __bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) { struct bpf_iter_scx_dsq_kern *kit = (void *)it; - bool rev = kit->cursor.flags & SCX_DSQ_ITER_REV; - struct task_struct *p; - unsigned long flags; if (!kit->dsq) return NULL; - raw_spin_lock_irqsave(&kit->dsq->lock, flags); - - if (list_empty(&kit->cursor.node)) - p = NULL; - else - p = 
container_of(&kit->cursor, struct task_struct, scx.dsq_list); - - /* - * Only tasks which were queued before the iteration started are - * visible. This bounds BPF iterations and guarantees that vtime never - * jumps in the other direction while iterating. - */ - do { - p = nldsq_next_task(kit->dsq, p, rev); - } while (p && unlikely(u32_before(kit->cursor.priv, p->scx.dsq_seq))); - - if (p) { - if (rev) - list_move_tail(&kit->cursor.node, &p->scx.dsq_list.node); - else - list_move(&kit->cursor.node, &p->scx.dsq_list.node); - } else { - list_del_init(&kit->cursor.node); - } - - raw_spin_unlock_irqrestore(&kit->dsq->lock, flags); + guard(raw_spinlock_irqsave)(&kit->dsq->lock); - return p; + return nldsq_cursor_next_task(&kit->cursor, kit->dsq); } /** @@ -6759,6 +8784,7 @@ __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) /** * scx_bpf_dsq_peek - Lockless peek at the first element. * @dsq_id: DSQ to examine. + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Read the first element in the DSQ. This is semantically equivalent to using * the DSQ iterator, but is lockfree. Of course, like any lockless operation, @@ -6767,12 +8793,13 @@ __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) * * Returns the pointer, or NULL indicates an empty queue OR internal error. 
*/ -__bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) +__bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id, + const struct bpf_prog_aux *aux) { struct scx_sched *sch; struct scx_dispatch_q *dsq; - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return NULL; @@ -6790,6 +8817,62 @@ __bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) return rcu_dereference(dsq->first_task); } +/** + * scx_bpf_dsq_reenq - Re-enqueue tasks on a DSQ + * @dsq_id: DSQ to re-enqueue + * @reenq_flags: %SCX_RENQ_* + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs + * + * Iterate over all of the tasks currently enqueued on the DSQ identified by + * @dsq_id, and re-enqueue them in the BPF scheduler. The following DSQs are + * supported: + * + * - Local DSQs (%SCX_DSQ_LOCAL or %SCX_DSQ_LOCAL_ON | $cpu) + * - User DSQs + * + * Re-enqueues are performed asynchronously. Can be called from anywhere. + */ +__bpf_kfunc void scx_bpf_dsq_reenq(u64 dsq_id, u64 reenq_flags, + const struct bpf_prog_aux *aux) +{ + struct scx_sched *sch; + struct scx_dispatch_q *dsq; + + guard(preempt)(); + + sch = scx_prog_sched(aux); + if (unlikely(!sch)) + return; + + if (unlikely(reenq_flags & ~__SCX_REENQ_USER_MASK)) { + scx_error(sch, "invalid SCX_REENQ flags 0x%llx", reenq_flags); + return; + } + + /* not specifying any filter bits is the same as %SCX_REENQ_ANY */ + if (!(reenq_flags & __SCX_REENQ_FILTER_MASK)) + reenq_flags |= SCX_REENQ_ANY; + + dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, smp_processor_id()); + schedule_dsq_reenq(sch, dsq, reenq_flags, scx_locked_rq()); +} + +/** + * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs + * + * Iterate over all of the tasks currently enqueued on the local DSQ of the + * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from + * anywhere. 
+ * + * This is now a special case of scx_bpf_dsq_reenq() and may be removed in the + * future. + */ +__bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux) +{ + scx_bpf_dsq_reenq(SCX_DSQ_LOCAL, 0, aux); +} + __bpf_kfunc_end_defs(); static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf, @@ -6844,18 +8927,20 @@ __bpf_kfunc_start_defs(); * @fmt: error message format string * @data: format string parameters packaged using ___bpf_fill() macro * @data__sz: @data len, must end in '__sz' for the verifier + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops * disabling. */ __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, - unsigned long long *data, u32 data__sz) + unsigned long long *data, u32 data__sz, + const struct bpf_prog_aux *aux) { struct scx_sched *sch; unsigned long flags; raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); - sch = rcu_dereference_bh(scx_root); + sch = scx_prog_sched(aux); if (likely(sch) && bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) scx_exit(sch, SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line); @@ -6867,18 +8952,19 @@ __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, * @fmt: error message format string * @data: format string parameters packaged using ___bpf_fill() macro * @data__sz: @data len, must end in '__sz' for the verifier + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Indicate that the BPF scheduler encountered a fatal error and initiate ops * disabling. 
*/ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, - u32 data__sz) + u32 data__sz, const struct bpf_prog_aux *aux) { struct scx_sched *sch; unsigned long flags; raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); - sch = rcu_dereference_bh(scx_root); + sch = scx_prog_sched(aux); if (likely(sch) && bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) scx_exit(sch, SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line); @@ -6890,6 +8976,7 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, * @fmt: format string * @data: format string parameters packaged using ___bpf_fill() macro * @data__sz: @data len, must end in '__sz' for the verifier + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * To be called through scx_bpf_dump() helper from ops.dump(), dump_cpu() and * dump_task() to generate extra debug dump specific to the BPF scheduler. @@ -6898,7 +8985,7 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, * multiple calls. The last line is automatically terminated. */ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, - u32 data__sz) + u32 data__sz, const struct bpf_prog_aux *aux) { struct scx_sched *sch; struct scx_dump_data *dd = &scx_dump_data; @@ -6907,7 +8994,7 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return; @@ -6944,38 +9031,21 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, } /** - * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ - * - * Iterate over all of the tasks currently enqueued on the local DSQ of the - * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from - * anywhere. 
- */ -__bpf_kfunc void scx_bpf_reenqueue_local___v2(void) -{ - struct rq *rq; - - guard(preempt)(); - - rq = this_rq(); - local_set(&rq->scx.reenq_local_deferred, 1); - schedule_deferred(rq); -} - -/** * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU * @cpu: CPU of interest + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Return the maximum relative capacity of @cpu in relation to the most * performant CPU in the system. The return value is in the range [1, * %SCX_CPUPERF_ONE]. See scx_bpf_cpuperf_cur(). */ -__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu) +__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu, const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (likely(sch) && ops_cpu_valid(sch, cpu, NULL)) return arch_scale_cpu_capacity(cpu); else @@ -6985,6 +9055,7 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu) /** * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU * @cpu: CPU of interest + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Return the current relative performance of @cpu in relation to its maximum. * The return value is in the range [1, %SCX_CPUPERF_ONE]. @@ -6996,13 +9067,13 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu) * * The result is in the range [1, %SCX_CPUPERF_ONE]. 
*/ -__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu) +__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu, const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (likely(sch) && ops_cpu_valid(sch, cpu, NULL)) return arch_scale_freq_capacity(cpu); else @@ -7013,6 +9084,7 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu) * scx_bpf_cpuperf_set - Set the relative performance target of a CPU * @cpu: CPU of interest * @perf: target performance level [0, %SCX_CPUPERF_ONE] + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Set the target performance level of @cpu to @perf. @perf is in linear * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the @@ -7023,13 +9095,13 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu) * use. Consult hardware and cpufreq documentation for more information. The * current performance level can be monitored using scx_bpf_cpuperf_cur(). */ -__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf) +__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf, const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return; @@ -7139,14 +9211,15 @@ __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p) /** * scx_bpf_cpu_rq - Fetch the rq of a CPU * @cpu: CPU of the rq + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs */ -__bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu) +__bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu, const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return NULL; @@ -7165,18 +9238,19 @@ __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu) /** * scx_bpf_locked_rq - Return the rq currently locked by SCX + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Returns the rq if a rq 
lock is currently held by SCX. * Otherwise emits an error and returns NULL. */ -__bpf_kfunc struct rq *scx_bpf_locked_rq(void) +__bpf_kfunc struct rq *scx_bpf_locked_rq(const struct bpf_prog_aux *aux) { struct scx_sched *sch; struct rq *rq; guard(preempt)(); - sch = rcu_dereference_sched(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return NULL; @@ -7192,16 +9266,17 @@ __bpf_kfunc struct rq *scx_bpf_locked_rq(void) /** * scx_bpf_cpu_curr - Return remote CPU's curr task * @cpu: CPU of interest + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Callers must hold RCU read lock (KF_RCU). */ -__bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu) +__bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu, const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return NULL; @@ -7212,41 +9287,6 @@ __bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu) } /** - * scx_bpf_task_cgroup - Return the sched cgroup of a task - * @p: task of interest - * - * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with - * from the scheduler's POV. SCX operations should use this function to - * determine @p's current cgroup as, unlike following @p->cgroups, - * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all - * rq-locked operations. Can be called on the parameter tasks of rq-locked - * operations. The restriction guarantees that @p's rq is locked by the caller. 
- */ -#ifdef CONFIG_CGROUP_SCHED -__bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) -{ - struct task_group *tg = p->sched_task_group; - struct cgroup *cgrp = &cgrp_dfl_root.cgrp; - struct scx_sched *sch; - - guard(rcu)(); - - sch = rcu_dereference(scx_root); - if (unlikely(!sch)) - goto out; - - if (!scx_kf_allowed_on_arg_tasks(sch, __SCX_KF_RQ_LOCKED, p)) - goto out; - - cgrp = tg_cgrp(tg); - -out: - cgroup_get(cgrp); - return cgrp; -} -#endif - -/** * scx_bpf_now - Returns a high-performance monotonically non-decreasing * clock for the current CPU. The clock returned is in nanoseconds. * @@ -7322,10 +9362,14 @@ static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *event scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_EXITING); scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED); + scx_agg_event(events, e_cpu, SCX_EV_REENQ_IMMED); + scx_agg_event(events, e_cpu, SCX_EV_REENQ_LOCAL_REPEAT); scx_agg_event(events, e_cpu, SCX_EV_REFILL_SLICE_DFL); scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DURATION); scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DISPATCH); scx_agg_event(events, e_cpu, SCX_EV_BYPASS_ACTIVATE); + scx_agg_event(events, e_cpu, SCX_EV_INSERT_NOT_OWNED); + scx_agg_event(events, e_cpu, SCX_EV_SUB_BYPASS_DISPATCH); } } @@ -7359,25 +9403,62 @@ __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events, memcpy(events, &e_sys, events__sz); } +#ifdef CONFIG_CGROUP_SCHED +/** + * scx_bpf_task_cgroup - Return the sched cgroup of a task + * @p: task of interest + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs + * + * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with + * from the scheduler's POV. SCX operations should use this function to + * determine @p's current cgroup as, unlike following @p->cgroups, + * @p->sched_task_group is stable for the duration of the SCX op. 
See + * SCX_CALL_OP_TASK() for details. + */ +__bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p, + const struct bpf_prog_aux *aux) +{ + struct task_group *tg = p->sched_task_group; + struct cgroup *cgrp = &cgrp_dfl_root.cgrp; + struct scx_sched *sch; + + guard(rcu)(); + + sch = scx_prog_sched(aux); + if (unlikely(!sch)) + goto out; + + if (!scx_kf_arg_task_ok(sch, p)) + goto out; + + cgrp = tg_cgrp(tg); + +out: + cgroup_get(cgrp); + return cgrp; +} +#endif /* CONFIG_CGROUP_SCHED */ + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_any) -BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_RCU); -BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_RCU); -BTF_ID_FLAGS(func, scx_bpf_kick_cpu) +BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_IMPLICIT_ARGS | KF_RCU); +BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_IMPLICIT_ARGS | KF_RCU); +BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) -BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_RCU_PROTECTED | KF_RET_NULL) -BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED) +BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_IMPLICIT_ARGS | KF_RCU_PROTECTED | KF_RET_NULL) +BTF_ID_FLAGS(func, scx_bpf_dsq_reenq, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_IMPLICIT_ARGS | KF_ITER_NEW | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY) -BTF_ID_FLAGS(func, scx_bpf_exit_bstr) -BTF_ID_FLAGS(func, scx_bpf_error_bstr) -BTF_ID_FLAGS(func, scx_bpf_dump_bstr) -BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2) -BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap) -BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur) -BTF_ID_FLAGS(func, scx_bpf_cpuperf_set) +BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_IMPLICIT_ARGS) 
+BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_cpuperf_set, KF_IMPLICIT_ARGS) BTF_ID_FLAGS(func, scx_bpf_nr_node_ids) BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) @@ -7385,14 +9466,14 @@ BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_cpu_rq) -BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_RET_NULL) -BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_RET_NULL | KF_RCU_PROTECTED) -#ifdef CONFIG_CGROUP_SCHED -BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) -#endif +BTF_ID_FLAGS(func, scx_bpf_cpu_rq, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_IMPLICIT_ARGS | KF_RET_NULL) +BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED) BTF_ID_FLAGS(func, scx_bpf_now) BTF_ID_FLAGS(func, scx_bpf_events) +#ifdef CONFIG_CGROUP_SCHED +BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_IMPLICIT_ARGS | KF_RCU | KF_ACQUIRE) +#endif BTF_KFUNCS_END(scx_kfunc_ids_any) static const struct btf_kfunc_id_set scx_kfunc_set_any = { @@ -7400,6 +9481,115 @@ static const struct btf_kfunc_id_set scx_kfunc_set_any = { .set = &scx_kfunc_ids_any, }; +/* + * Per-op kfunc allow flags. Each bit corresponds to a context-sensitive kfunc + * group; an op may permit zero or more groups, with the union expressed in + * scx_kf_allow_flags[]. The verifier-time filter (scx_kfunc_context_filter()) + * consults this table to decide whether a context-sensitive kfunc is callable + * from a given SCX op. 
+ */ +enum scx_kf_allow_flags { + SCX_KF_ALLOW_UNLOCKED = 1 << 0, + SCX_KF_ALLOW_CPU_RELEASE = 1 << 1, + SCX_KF_ALLOW_DISPATCH = 1 << 2, + SCX_KF_ALLOW_ENQUEUE = 1 << 3, + SCX_KF_ALLOW_SELECT_CPU = 1 << 4, +}; + +/* + * Map each SCX op to the union of kfunc groups it permits, indexed by + * SCX_OP_IDX(op). Ops not listed only permit kfuncs that are not + * context-sensitive. + */ +static const u32 scx_kf_allow_flags[] = { + [SCX_OP_IDX(select_cpu)] = SCX_KF_ALLOW_SELECT_CPU | SCX_KF_ALLOW_ENQUEUE, + [SCX_OP_IDX(enqueue)] = SCX_KF_ALLOW_SELECT_CPU | SCX_KF_ALLOW_ENQUEUE, + [SCX_OP_IDX(dispatch)] = SCX_KF_ALLOW_ENQUEUE | SCX_KF_ALLOW_DISPATCH, + [SCX_OP_IDX(cpu_release)] = SCX_KF_ALLOW_CPU_RELEASE, + [SCX_OP_IDX(init_task)] = SCX_KF_ALLOW_UNLOCKED, + [SCX_OP_IDX(dump)] = SCX_KF_ALLOW_UNLOCKED, +#ifdef CONFIG_EXT_GROUP_SCHED + [SCX_OP_IDX(cgroup_init)] = SCX_KF_ALLOW_UNLOCKED, + [SCX_OP_IDX(cgroup_exit)] = SCX_KF_ALLOW_UNLOCKED, + [SCX_OP_IDX(cgroup_prep_move)] = SCX_KF_ALLOW_UNLOCKED, + [SCX_OP_IDX(cgroup_cancel_move)] = SCX_KF_ALLOW_UNLOCKED, + [SCX_OP_IDX(cgroup_set_weight)] = SCX_KF_ALLOW_UNLOCKED, + [SCX_OP_IDX(cgroup_set_bandwidth)] = SCX_KF_ALLOW_UNLOCKED, + [SCX_OP_IDX(cgroup_set_idle)] = SCX_KF_ALLOW_UNLOCKED, +#endif /* CONFIG_EXT_GROUP_SCHED */ + [SCX_OP_IDX(sub_attach)] = SCX_KF_ALLOW_UNLOCKED, + [SCX_OP_IDX(sub_detach)] = SCX_KF_ALLOW_UNLOCKED, + [SCX_OP_IDX(cpu_online)] = SCX_KF_ALLOW_UNLOCKED, + [SCX_OP_IDX(cpu_offline)] = SCX_KF_ALLOW_UNLOCKED, + [SCX_OP_IDX(init)] = SCX_KF_ALLOW_UNLOCKED, + [SCX_OP_IDX(exit)] = SCX_KF_ALLOW_UNLOCKED, +}; + +/* + * Verifier-time filter for context-sensitive SCX kfuncs. Registered via the + * .filter field on each per-group btf_kfunc_id_set. The BPF core invokes this + * for every kfunc call in the registered hook (BPF_PROG_TYPE_STRUCT_OPS or + * BPF_PROG_TYPE_SYSCALL), regardless of which set originally introduced the + * kfunc - so the filter must short-circuit on kfuncs it doesn't govern (e.g. 
+ * scx_kfunc_ids_any) by falling through to "allow" when none of the + * context-sensitive sets contain the kfunc. + */ +int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id) +{ + bool in_unlocked = btf_id_set8_contains(&scx_kfunc_ids_unlocked, kfunc_id); + bool in_select_cpu = btf_id_set8_contains(&scx_kfunc_ids_select_cpu, kfunc_id); + bool in_enqueue = btf_id_set8_contains(&scx_kfunc_ids_enqueue_dispatch, kfunc_id); + bool in_dispatch = btf_id_set8_contains(&scx_kfunc_ids_dispatch, kfunc_id); + bool in_cpu_release = btf_id_set8_contains(&scx_kfunc_ids_cpu_release, kfunc_id); + u32 moff, flags; + + /* Not a context-sensitive kfunc (e.g. from scx_kfunc_ids_any) - allow. */ + if (!(in_unlocked || in_select_cpu || in_enqueue || in_dispatch || in_cpu_release)) + return 0; + + /* SYSCALL progs (e.g. BPF test_run()) may call unlocked and select_cpu kfuncs. */ + if (prog->type == BPF_PROG_TYPE_SYSCALL) + return (in_unlocked || in_select_cpu) ? 0 : -EACCES; + + if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) + return -EACCES; + + /* + * add_subprog_and_kfunc() collects all kfunc calls, including dead code + * guarded by bpf_ksym_exists(), before check_attach_btf_id() sets + * prog->aux->st_ops. Allow all kfuncs when st_ops is not yet set; + * do_check_main() re-runs the filter with st_ops set and enforces the + * actual restrictions. + */ + if (!prog->aux->st_ops) + return 0; + + /* + * Non-SCX struct_ops: only unlocked kfuncs are safe. The other + * context-sensitive kfuncs assume the rq lock is held by the SCX + * dispatch path, which doesn't apply to other struct_ops users. + */ + if (prog->aux->st_ops != &bpf_sched_ext_ops) + return in_unlocked ? 0 : -EACCES; + + /* SCX struct_ops: check the per-op allow list. 
*/ + moff = prog->aux->attach_st_ops_member_off; + flags = scx_kf_allow_flags[SCX_MOFF_IDX(moff)]; + + if ((flags & SCX_KF_ALLOW_UNLOCKED) && in_unlocked) + return 0; + if ((flags & SCX_KF_ALLOW_CPU_RELEASE) && in_cpu_release) + return 0; + if ((flags & SCX_KF_ALLOW_DISPATCH) && in_dispatch) + return 0; + if ((flags & SCX_KF_ALLOW_ENQUEUE) && in_enqueue) + return 0; + if ((flags & SCX_KF_ALLOW_SELECT_CPU) && in_select_cpu) + return 0; + + return -EACCES; +} + static int __init scx_init(void) { int ret; @@ -7409,11 +9599,12 @@ static int __init scx_init(void) * register_btf_kfunc_id_set() needs most of the system to be up. * * Some kfuncs are context-sensitive and can only be called from - * specific SCX ops. They are grouped into BTF sets accordingly. - * Unfortunately, BPF currently doesn't have a way of enforcing such - * restrictions. Eventually, the verifier should be able to enforce - * them. For now, register them the same and make each kfunc explicitly - * check using scx_kf_allowed(). + * specific SCX ops. They are grouped into per-context BTF sets, each + * registered with scx_kfunc_context_filter as its .filter callback. The + * BPF core dedups identical filter pointers per hook + * (btf_populate_kfunc_set()), so the filter is invoked exactly once per + * kfunc lookup; it consults scx_kf_allow_flags[] to enforce per-op + * restrictions at verify time. 
*/ if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_enqueue_dispatch)) || diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 43429b33e52c..0b7fc46aee08 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -11,7 +11,7 @@ void scx_tick(struct rq *rq); void init_scx_entity(struct sched_ext_entity *scx); void scx_pre_fork(struct task_struct *p); -int scx_fork(struct task_struct *p); +int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs); void scx_post_fork(struct task_struct *p); void scx_cancel_fork(struct task_struct *p); bool scx_can_stop_tick(struct rq *rq); @@ -44,7 +44,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, static inline void scx_tick(struct rq *rq) {} static inline void scx_pre_fork(struct task_struct *p) {} -static inline int scx_fork(struct task_struct *p) { return 0; } +static inline int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs) { return 0; } static inline void scx_post_fork(struct task_struct *p) {} static inline void scx_cancel_fork(struct task_struct *p) {} static inline u32 scx_cpuperf_target(s32 cpu) { return 0; } diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index ba298ac3ce6c..443d12a3df67 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -368,7 +368,7 @@ void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) /* * Enable NUMA optimization only when there are multiple NUMA domains - * among the online CPUs and the NUMA domains don't perfectly overlaps + * among the online CPUs and the NUMA domains don't perfectly overlap * with the LLC domains. * * If all CPUs belong to the same NUMA node and the same LLC domain, @@ -424,18 +424,24 @@ static inline bool task_affinity_all(const struct task_struct *p) * - prefer the last used CPU to take advantage of cached data (L1, L2) and * branch prediction optimizations. * - * 3. Pick a CPU within the same LLC (Last-Level Cache): + * 3. 
Prefer @prev_cpu's SMT sibling: + * - if @prev_cpu is busy and no fully idle core is available, try to + * place the task on an idle SMT sibling of @prev_cpu; keeping the + * task on the same core makes migration cheaper, preserves L1 cache + * locality and reduces wakeup latency. + * + * 4. Pick a CPU within the same LLC (Last-Level Cache): * - if the above conditions aren't met, pick a CPU that shares the same * LLC, if the LLC domain is a subset of @cpus_allowed, to maintain * cache locality. * - * 4. Pick a CPU within the same NUMA node, if enabled: + * 5. Pick a CPU within the same NUMA node, if enabled: * - choose a CPU from the same NUMA node, if the node cpumask is a * subset of @cpus_allowed, to reduce memory access latency. * - * 5. Pick any idle CPU within the @cpus_allowed domain. + * 6. Pick any idle CPU within the @cpus_allowed domain. * - * Step 3 and 4 are performed only if the system has, respectively, + * Step 4 and 5 are performed only if the system has, respectively, * multiple LLCs / multiple NUMA nodes (see scx_selcpu_topo_llc and * scx_selcpu_topo_numa) and they don't contain the same subset of CPUs. * @@ -543,7 +549,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, * piled up on it even if there is an idle core elsewhere on * the system. */ - waker_node = cpu_to_node(cpu); + waker_node = scx_cpu_node_if_enabled(cpu); if (!(current->flags & PF_EXITING) && cpu_rq(cpu)->scx.local_dsq.nr == 0 && (!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) && @@ -616,6 +622,20 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, goto out_unlock; } +#ifdef CONFIG_SCHED_SMT + /* + * Use @prev_cpu's sibling if it's idle. + */ + if (sched_smt_active()) { + for_each_cpu_and(cpu, cpu_smt_mask(prev_cpu), allowed) { + if (cpu == prev_cpu) + continue; + if (scx_idle_test_and_clear_cpu(cpu)) + goto out_unlock; + } + } +#endif + /* * Search for any idle CPU in the same LLC domain. 
*/ @@ -767,8 +787,9 @@ void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) * either enqueue() sees the idle bit or update_idle() sees the task * that enqueue() queued. */ - if (SCX_HAS_OP(sch, update_idle) && do_notify && !scx_rq_bypassing(rq)) - SCX_CALL_OP(sch, SCX_KF_REST, update_idle, rq, cpu_of(rq), idle); + if (SCX_HAS_OP(sch, update_idle) && do_notify && + !scx_bypassing(sch, cpu_of(rq))) + SCX_CALL_OP(sch, update_idle, rq, cpu_of(rq), idle); } static void reset_idle_masks(struct sched_ext_ops *ops) @@ -860,33 +881,40 @@ static bool check_builtin_idle_enabled(struct scx_sched *sch) * code. * * We can't simply check whether @p->migration_disabled is set in a - * sched_ext callback, because migration is always disabled for the current - * task while running BPF code. + * sched_ext callback, because the BPF prolog (__bpf_prog_enter) may disable + * migration for the current task while running BPF code. + * + * Since the BPF prolog calls migrate_disable() only when CONFIG_PREEMPT_RCU + * is enabled (via rcu_read_lock_dont_migrate()), migration_disabled == 1 for + * the current task is ambiguous only in that case: it could be from the BPF + * prolog rather than a real migrate_disable() call. * - * The prolog (__bpf_prog_enter) and epilog (__bpf_prog_exit) respectively - * disable and re-enable migration. For this reason, the current task - * inside a sched_ext callback is always a migration-disabled task. + * Without CONFIG_PREEMPT_RCU, the BPF prolog never calls migrate_disable(), + * so migration_disabled == 1 always means the task is truly + * migration-disabled. * - * Therefore, when @p->migration_disabled == 1, check whether @p is the - * current task or not: if it is, then migration was not disabled before - * entering the callback, otherwise migration was disabled. 
+ * Therefore, when migration_disabled == 1 and CONFIG_PREEMPT_RCU is enabled, + * check whether @p is the current task or not: if it is, then migration was + * not disabled before entering the callback, otherwise migration was disabled. * * Returns true if @p is migration-disabled, false otherwise. */ static bool is_bpf_migration_disabled(const struct task_struct *p) { - if (p->migration_disabled == 1) - return p != current; - else - return p->migration_disabled; + if (p->migration_disabled == 1) { + if (IS_ENABLED(CONFIG_PREEMPT_RCU)) + return p != current; + return true; + } + return p->migration_disabled; } static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, s32 prev_cpu, u64 wake_flags, const struct cpumask *allowed, u64 flags) { - struct rq *rq; - struct rq_flags rf; + unsigned long irq_flags; + bool we_locked = false; s32 cpu; if (!ops_cpu_valid(sch, prev_cpu, NULL)) @@ -896,27 +924,20 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, return -EBUSY; /* - * If called from an unlocked context, acquire the task's rq lock, - * so that we can safely access p->cpus_ptr and p->nr_cpus_allowed. + * Accessing p->cpus_ptr / p->nr_cpus_allowed needs either @p's rq + * lock or @p's pi_lock. Three cases: * - * Otherwise, allow to use this kfunc only from ops.select_cpu() - * and ops.select_enqueue(). - */ - if (scx_kf_allowed_if_unlocked()) { - rq = task_rq_lock(p, &rf); - } else { - if (!scx_kf_allowed(sch, SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE)) - return -EPERM; - rq = scx_locked_rq(); - } - - /* - * Validate locking correctness to access p->cpus_ptr and - * p->nr_cpus_allowed: if we're holding an rq lock, we're safe; - * otherwise, assert that p->pi_lock is held. + * - inside ops.select_cpu(): try_to_wake_up() holds @p's pi_lock. + * - other rq-locked SCX op: scx_locked_rq() points at the held rq. + * - truly unlocked (UNLOCKED ops, SYSCALL, non-SCX struct_ops): + * nothing held, take pi_lock ourselves. 
*/ - if (!rq) + if (this_rq()->scx.in_select_cpu) { lockdep_assert_held(&p->pi_lock); + } else if (!scx_locked_rq()) { + raw_spin_lock_irqsave(&p->pi_lock, irq_flags); + we_locked = true; + } /* * This may also be called from ops.enqueue(), so we need to handle @@ -935,8 +956,8 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, allowed ?: p->cpus_ptr, flags); } - if (scx_kf_allowed_if_unlocked()) - task_rq_unlock(rq, p, &rf); + if (we_locked) + raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags); return cpu; } @@ -945,14 +966,15 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, * scx_bpf_cpu_node - Return the NUMA node the given @cpu belongs to, or * trigger an error if @cpu is invalid * @cpu: target CPU + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs */ -__bpf_kfunc int scx_bpf_cpu_node(s32 cpu) +__bpf_kfunc s32 scx_bpf_cpu_node(s32 cpu, const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch) || !ops_cpu_valid(sch, cpu, NULL)) return NUMA_NO_NODE; return cpu_to_node(cpu); @@ -964,6 +986,7 @@ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu) * @prev_cpu: CPU @p was on previously * @wake_flags: %SCX_WAKE_* flags * @is_idle: out parameter indicating whether the returned CPU is idle + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked * context such as a BPF test_run() call, as long as built-in CPU selection @@ -974,14 +997,15 @@ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu) * currently idle and thus a good candidate for direct dispatching. 
*/ __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, - u64 wake_flags, bool *is_idle) + u64 wake_flags, bool *is_idle, + const struct bpf_prog_aux *aux) { struct scx_sched *sch; s32 cpu; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return -ENODEV; @@ -1009,6 +1033,7 @@ struct scx_bpf_select_cpu_and_args { * @args->prev_cpu: CPU @p was on previously * @args->wake_flags: %SCX_WAKE_* flags * @args->flags: %SCX_PICK_IDLE* flags + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument * limit. BPF programs should use scx_bpf_select_cpu_and() which is provided @@ -1027,13 +1052,14 @@ struct scx_bpf_select_cpu_and_args { */ __bpf_kfunc s32 __scx_bpf_select_cpu_and(struct task_struct *p, const struct cpumask *cpus_allowed, - struct scx_bpf_select_cpu_and_args *args) + struct scx_bpf_select_cpu_and_args *args, + const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return -ENODEV; @@ -1055,6 +1081,17 @@ __bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 if (unlikely(!sch)) return -ENODEV; +#ifdef CONFIG_EXT_SUB_SCHED + /* + * Disallow if any sub-scheds are attached. There is no way to tell + * which scheduler called us, just error out @p's scheduler. + */ + if (unlikely(!list_empty(&sch->children))) { + scx_error(scx_task_sched(p), "__scx_bpf_select_cpu_and() must be used"); + return -EINVAL; + } +#endif + return select_cpu_from_kfunc(sch, p, prev_cpu, wake_flags, cpus_allowed, flags); } @@ -1063,18 +1100,20 @@ __bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 * scx_bpf_get_idle_cpumask_node - Get a referenced kptr to the * idle-tracking per-CPU cpumask of a target NUMA node. 
* @node: target NUMA node + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Returns an empty cpumask if idle tracking is not enabled, if @node is * not valid, or running on a UP kernel. In this case the actual error will * be reported to the BPF scheduler via scx_error(). */ -__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) +__bpf_kfunc const struct cpumask * +scx_bpf_get_idle_cpumask_node(s32 node, const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return cpu_none_mask; @@ -1088,17 +1127,18 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) /** * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking * per-CPU cpumask. + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Returns an empty mask if idle tracking is not enabled, or running on a * UP kernel. */ -__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return cpu_none_mask; @@ -1118,18 +1158,20 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) * idle-tracking, per-physical-core cpumask of a target NUMA node. Can be * used to determine if an entire physical core is free. * @node: target NUMA node + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Returns an empty cpumask if idle tracking is not enabled, if @node is * not valid, or running on a UP kernel. In this case the actual error will * be reported to the BPF scheduler via scx_error(). 
*/ -__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) +__bpf_kfunc const struct cpumask * +scx_bpf_get_idle_smtmask_node(s32 node, const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return cpu_none_mask; @@ -1147,17 +1189,18 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking, * per-physical-core cpumask. Can be used to determine if an entire physical * core is free. + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Returns an empty mask if idle tracking is not enabled, or running on a * UP kernel. */ -__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) +__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return cpu_none_mask; @@ -1193,6 +1236,7 @@ __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) /** * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state * @cpu: cpu to test and clear idle for + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Returns %true if @cpu was idle and its idle state was successfully cleared. * %false otherwise. @@ -1200,13 +1244,13 @@ __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) * Unavailable if ops.update_idle() is implemented and * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. 
*/ -__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) +__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu, const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return false; @@ -1224,6 +1268,7 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) * @cpus_allowed: Allowed cpumask * @node: target NUMA node * @flags: %SCX_PICK_IDLE_* flags + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Pick and claim an idle cpu in @cpus_allowed from the NUMA node @node. * @@ -1239,13 +1284,14 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) * %SCX_OPS_BUILTIN_IDLE_PER_NODE is not set. */ __bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed, - int node, u64 flags) + s32 node, u64 flags, + const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return -ENODEV; @@ -1260,6 +1306,7 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed, * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu * @cpus_allowed: Allowed cpumask * @flags: %SCX_PICK_IDLE_CPU_* flags + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu * number on success. -%EBUSY if no matching cpu was found. @@ -1279,13 +1326,13 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed, * scx_bpf_pick_idle_cpu_node() instead. 
*/ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, - u64 flags) + u64 flags, const struct bpf_prog_aux *aux) { struct scx_sched *sch; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return -ENODEV; @@ -1306,6 +1353,7 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, * @cpus_allowed: Allowed cpumask * @node: target NUMA node * @flags: %SCX_PICK_IDLE_CPU_* flags + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu @@ -1322,14 +1370,15 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, * CPU. */ __bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed, - int node, u64 flags) + s32 node, u64 flags, + const struct bpf_prog_aux *aux) { struct scx_sched *sch; s32 cpu; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return -ENODEV; @@ -1355,6 +1404,7 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed, * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU * @cpus_allowed: Allowed cpumask * @flags: %SCX_PICK_IDLE_CPU_* flags + * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs * * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu @@ -1369,14 +1419,14 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed, * scx_bpf_pick_any_cpu_node() instead. 
*/ __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, - u64 flags) + u64 flags, const struct bpf_prog_aux *aux) { struct scx_sched *sch; s32 cpu; guard(rcu)(); - sch = rcu_dereference(scx_root); + sch = scx_prog_sched(aux); if (unlikely(!sch)) return -ENODEV; @@ -1401,20 +1451,17 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_idle) -BTF_ID_FLAGS(func, scx_bpf_cpu_node) -BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask_node, KF_ACQUIRE) -BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) -BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_ACQUIRE) -BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_cpu_node, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask_node, KF_IMPLICIT_ARGS | KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_IMPLICIT_ARGS | KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_IMPLICIT_ARGS | KF_ACQUIRE) +BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_IMPLICIT_ARGS | KF_ACQUIRE) BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) -BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) -BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) -BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU) -BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle, KF_IMPLICIT_ARGS) +BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_IMPLICIT_ARGS | KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_IMPLICIT_ARGS | KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_IMPLICIT_ARGS | KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_IMPLICIT_ARGS | KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_idle) static const struct 
btf_kfunc_id_set scx_kfunc_set_idle = { @@ -1422,13 +1469,38 @@ static const struct btf_kfunc_id_set scx_kfunc_set_idle = { .set = &scx_kfunc_ids_idle, }; +/* + * The select_cpu kfuncs internally call task_rq_lock() when invoked from an + * rq-unlocked context, and thus cannot be safely called from arbitrary tracing + * contexts where @p's pi_lock state is unknown. Keep them out of + * BPF_PROG_TYPE_TRACING by registering them in their own set which is exposed + * only to STRUCT_OPS and SYSCALL programs. + * + * These kfuncs are also members of scx_kfunc_ids_unlocked (see ext.c) because + * they're callable from unlocked contexts in addition to ops.select_cpu() and + * ops.enqueue(). + */ +BTF_KFUNCS_START(scx_kfunc_ids_select_cpu) +BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_IMPLICIT_ARGS | KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_IMPLICIT_ARGS | KF_RCU) +BTF_KFUNCS_END(scx_kfunc_ids_select_cpu) + +static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { + .owner = THIS_MODULE, + .set = &scx_kfunc_ids_select_cpu, + .filter = scx_kfunc_context_filter, +}; + int scx_idle_init(void) { int ret; ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) || register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) || - register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle); + register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle) || + register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) || + register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_select_cpu); return ret; } diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h index fa583f141f35..dc35f850481e 100644 --- a/kernel/sched/ext_idle.h +++ b/kernel/sched/ext_idle.h @@ -12,6 +12,8 @@ struct sched_ext_ops; +extern struct btf_id_set8 scx_kfunc_ids_select_cpu; + void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops); void 
scx_idle_init_masks(void); diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h index 00b450597f3e..62ce4eaf6a3f 100644 --- a/kernel/sched/ext_internal.h +++ b/kernel/sched/ext_internal.h @@ -6,6 +6,7 @@ * Copyright (c) 2025 Tejun Heo <tj@kernel.org> */ #define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) +#define SCX_MOFF_IDX(moff) ((moff) / sizeof(void (*)(void))) enum scx_consts { SCX_DSP_DFL_MAX_BATCH = 32, @@ -24,10 +25,16 @@ enum scx_consts { */ SCX_TASK_ITER_BATCH = 32, + SCX_BYPASS_HOST_NTH = 2, + SCX_BYPASS_LB_DFL_INTV_US = 500 * USEC_PER_MSEC, SCX_BYPASS_LB_DONOR_PCT = 125, SCX_BYPASS_LB_MIN_DELTA_DIV = 4, SCX_BYPASS_LB_BATCH = 256, + + SCX_REENQ_LOCAL_MAX_REPEAT = 256, + + SCX_SUB_MAX_DEPTH = 4, }; enum scx_exit_kind { @@ -38,6 +45,7 @@ enum scx_exit_kind { SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */ SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */ SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ + SCX_EXIT_PARENT, /* parent exiting */ SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ @@ -62,6 +70,7 @@ enum scx_exit_kind { enum scx_exit_code { /* Reasons */ SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, + SCX_ECODE_RSN_CGROUP_OFFLINE = 2LLU << 32, /* Actions */ SCX_ECODE_ACT_RESTART = 1LLU << 48, @@ -175,9 +184,10 @@ enum scx_ops_flags { SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6, /* - * CPU cgroup support flags + * If set, %SCX_ENQ_IMMED is assumed to be set on all local DSQ + * enqueues. 
*/ - SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */ + SCX_OPS_ALWAYS_ENQ_IMMED = 1LLU << 7, SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | SCX_OPS_ENQ_LAST | @@ -186,7 +196,7 @@ enum scx_ops_flags { SCX_OPS_ALLOW_QUEUED_WAKEUP | SCX_OPS_SWITCH_PARTIAL | SCX_OPS_BUILTIN_IDLE_PER_NODE | - SCX_OPS_HAS_CGROUP_WEIGHT, + SCX_OPS_ALWAYS_ENQ_IMMED, /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */ __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56, @@ -213,7 +223,7 @@ struct scx_exit_task_args { bool cancelled; }; -/* argument container for ops->cgroup_init() */ +/* argument container for ops.cgroup_init() */ struct scx_cgroup_init_args { /* the weight of the cgroup [1..10000] */ u32 weight; @@ -236,12 +246,12 @@ enum scx_cpu_preempt_reason { }; /* - * Argument container for ops->cpu_acquire(). Currently empty, but may be + * Argument container for ops.cpu_acquire(). Currently empty, but may be * expanded in the future. */ struct scx_cpu_acquire_args {}; -/* argument container for ops->cpu_release() */ +/* argument container for ops.cpu_release() */ struct scx_cpu_release_args { /* the reason the CPU was preempted */ enum scx_cpu_preempt_reason reason; @@ -250,9 +260,7 @@ struct scx_cpu_release_args { struct task_struct *task; }; -/* - * Informational context provided to dump operations. 
- */ +/* informational context provided to dump operations */ struct scx_dump_ctx { enum scx_exit_kind kind; s64 exit_code; @@ -261,6 +269,18 @@ struct scx_dump_ctx { u64 at_jiffies; }; +/* argument container for ops.sub_attach() */ +struct scx_sub_attach_args { + struct sched_ext_ops *ops; + char *cgroup_path; +}; + +/* argument container for ops.sub_detach() */ +struct scx_sub_detach_args { + struct sched_ext_ops *ops; + char *cgroup_path; +}; + /** * struct sched_ext_ops - Operation table for BPF scheduler implementation * @@ -721,6 +741,20 @@ struct sched_ext_ops { #endif /* CONFIG_EXT_GROUP_SCHED */ + /** + * @sub_attach: Attach a sub-scheduler + * @args: argument container, see the struct definition + * + * Return 0 to accept the sub-scheduler. -errno to reject. + */ + s32 (*sub_attach)(struct scx_sub_attach_args *args); + + /** + * @sub_detach: Detach a sub-scheduler + * @args: argument container, see the struct definition + */ + void (*sub_detach)(struct scx_sub_detach_args *args); + /* * All online ops must come before ops.cpu_online(). */ @@ -762,6 +796,10 @@ struct sched_ext_ops { */ void (*exit)(struct scx_exit_info *info); + /* + * Data fields must comes after all ops fields. + */ + /** * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch */ @@ -797,6 +835,12 @@ struct sched_ext_ops { u64 hotplug_seq; /** + * @cgroup_id: When >1, attach the scheduler as a sub-scheduler on the + * specified cgroup. + */ + u64 sub_cgroup_id; + + /** * @name: BPF scheduler's name * * Must be a non-zero valid BPF object name including only isalnum(), @@ -806,7 +850,7 @@ struct sched_ext_ops { char name[SCX_OPS_NAME_LEN]; /* internal use only, must be NULL */ - void *priv; + void __rcu *priv; }; enum scx_opi { @@ -854,6 +898,24 @@ struct scx_event_stats { s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED; /* + * The number of times a task, enqueued on a local DSQ with + * SCX_ENQ_IMMED, was re-enqueued because the CPU was not available for + * immediate execution. 
+ */ + s64 SCX_EV_REENQ_IMMED; + + /* + * The number of times a reenq of local DSQ caused another reenq of + * local DSQ. This can happen when %SCX_ENQ_IMMED races against a higher + * priority class task even if the BPF scheduler always satisfies the + * prerequisites for %SCX_ENQ_IMMED at the time of enqueue. However, + * that scenario is very unlikely and this count going up regularly + * indicates that the BPF scheduler is handling %SCX_ENQ_REENQ + * incorrectly causing recursive reenqueues. + */ + s64 SCX_EV_REENQ_LOCAL_REPEAT; + + /* * Total number of times a task's time slice was refilled with the * default value (SCX_SLICE_DFL). */ @@ -873,15 +935,77 @@ struct scx_event_stats { * The number of times the bypassing mode has been activated. */ s64 SCX_EV_BYPASS_ACTIVATE; + + /* + * The number of times the scheduler attempted to insert a task that it + * doesn't own into a DSQ. Such attempts are ignored. + * + * As BPF schedulers are allowed to ignore dequeues, it's difficult to + * tell whether such an attempt is from a scheduler malfunction or an + * ignored dequeue around sub-sched enabling. If this count keeps going + * up regardless of sub-sched enabling, it likely indicates a bug in the + * scheduler. + */ + s64 SCX_EV_INSERT_NOT_OWNED; + + /* + * The number of times tasks from bypassing descendants are scheduled + * from sub_bypass_dsq's. 
+ */ + s64 SCX_EV_SUB_BYPASS_DISPATCH; +}; + +struct scx_sched; + +enum scx_sched_pcpu_flags { + SCX_SCHED_PCPU_BYPASSING = 1LLU << 0, +}; + +/* dispatch buf */ +struct scx_dsp_buf_ent { + struct task_struct *task; + unsigned long qseq; + u64 dsq_id; + u64 enq_flags; +}; + +struct scx_dsp_ctx { + struct rq *rq; + u32 cursor; + u32 nr_tasks; + struct scx_dsp_buf_ent buf[]; +}; + +struct scx_deferred_reenq_local { + struct list_head node; + u64 flags; + u64 seq; + u32 cnt; }; struct scx_sched_pcpu { + struct scx_sched *sch; + u64 flags; /* protected by rq lock */ + /* * The event counters are in a per-CPU variable to minimize the * accounting overhead. A system-wide view on the event counter is * constructed when requested by scx_bpf_events(). */ struct scx_event_stats event_stats; + + struct scx_deferred_reenq_local deferred_reenq_local; + struct scx_dispatch_q bypass_dsq; +#ifdef CONFIG_EXT_SUB_SCHED + u32 bypass_host_seq; +#endif + + /* must be the last entry - contains flex array */ + struct scx_dsp_ctx dsp_ctx; +}; + +struct scx_sched_pnode { + struct scx_dispatch_q global_dsq; }; struct scx_sched { @@ -897,15 +1021,50 @@ struct scx_sched { * per-node split isn't sufficient, it can be further split. */ struct rhashtable dsq_hash; - struct scx_dispatch_q **global_dsqs; + struct scx_sched_pnode **pnode; struct scx_sched_pcpu __percpu *pcpu; + u64 slice_dfl; + u64 bypass_timestamp; + s32 bypass_depth; + + /* bypass dispatch path enable state, see bypass_dsp_enabled() */ + unsigned long bypass_dsp_claim; + atomic_t bypass_dsp_enable_depth; + + bool aborting; + bool dump_disabled; /* protected by scx_dump_lock */ + u32 dsp_max_batch; + s32 level; + /* * Updates to the following warned bitfields can race causing RMW issues * but it doesn't really matter. 
*/ bool warned_zero_slice:1; bool warned_deprecated_rq:1; + bool warned_unassoc_progs:1; + + struct list_head all; + +#ifdef CONFIG_EXT_SUB_SCHED + struct rhash_head hash_node; + + struct list_head children; + struct list_head sibling; + struct cgroup *cgrp; + char *cgrp_path; + struct kset *sub_kset; + + bool sub_attached; +#endif /* CONFIG_EXT_SUB_SCHED */ + + /* + * The maximum amount of time in jiffies that a task may be runnable + * without being scheduled on a CPU. If this timeout is exceeded, it + * will trigger scx_error(). + */ + unsigned long watchdog_timeout; atomic_t exit_kind; struct scx_exit_info *exit_info; @@ -913,9 +1072,13 @@ struct scx_sched { struct kobject kobj; struct kthread_worker *helper; - struct irq_work error_irq_work; + struct irq_work disable_irq_work; struct kthread_work disable_work; + struct timer_list bypass_lb_timer; struct rcu_work rcu_work; + + /* all ancestors including self */ + struct scx_sched *ancestors[]; }; enum scx_wake_flags { @@ -942,13 +1105,27 @@ enum scx_enq_flags { SCX_ENQ_PREEMPT = 1LLU << 32, /* - * The task being enqueued was previously enqueued on the current CPU's - * %SCX_DSQ_LOCAL, but was removed from it in a call to the - * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was - * invoked in a ->cpu_release() callback, and the task is again - * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the - * task will not be scheduled on the CPU until at least the next invocation - * of the ->cpu_acquire() callback. + * Only allowed on local DSQs. Guarantees that the task either gets + * on the CPU immediately and stays on it, or gets reenqueued back + * to the BPF scheduler. It will never linger on a local DSQ or be + * silently put back after preemption. + * + * The protection persists until the next fresh enqueue - it + * survives SAVE/RESTORE cycles, slice extensions and preemption. + * If the task can't stay on the CPU for any reason, it gets + * reenqueued back to the BPF scheduler. 
+ * + * Exiting and migration-disabled tasks bypass ops.enqueue() and + * are placed directly on a local DSQ without IMMED protection + * unless %SCX_OPS_ENQ_EXITING and %SCX_OPS_ENQ_MIGRATION_DISABLED + * are set respectively. + */ + SCX_ENQ_IMMED = 1LLU << 33, + + /* + * The task being enqueued was previously enqueued on a DSQ, but was + * removed and is being re-enqueued. See SCX_TASK_REENQ_* flags to find + * out why a given task is being reenqueued. */ SCX_ENQ_REENQ = 1LLU << 40, @@ -969,6 +1146,7 @@ enum scx_enq_flags { SCX_ENQ_CLEAR_OPSS = 1LLU << 56, SCX_ENQ_DSQ_PRIQ = 1LLU << 57, SCX_ENQ_NESTED = 1LLU << 58, + SCX_ENQ_GDSQ_FALLBACK = 1LLU << 59, /* fell back to global DSQ */ }; enum scx_deq_flags { @@ -982,6 +1160,28 @@ enum scx_deq_flags { * it hasn't been dispatched yet. Dequeue from the BPF side. */ SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, + + /* + * The task is being dequeued due to a property change (e.g., + * sched_setaffinity(), sched_setscheduler(), set_user_nice(), + * etc.). + */ + SCX_DEQ_SCHED_CHANGE = 1LLU << 33, +}; + +enum scx_reenq_flags { + /* low 16bits determine which tasks should be reenqueued */ + SCX_REENQ_ANY = 1LLU << 0, /* all tasks */ + + __SCX_REENQ_FILTER_MASK = 0xffffLLU, + + __SCX_REENQ_USER_MASK = SCX_REENQ_ANY, + + /* bits 32-35 used by task_should_reenq() */ + SCX_REENQ_TSR_RQ_OPEN = 1LLU << 32, + SCX_REENQ_TSR_NOT_FIRST = 1LLU << 33, + + __SCX_REENQ_TSR_MASK = 0xfLLU << 32, }; enum scx_pick_idle_cpu_flags { @@ -1161,8 +1361,11 @@ enum scx_ops_state { #define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) #define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) +extern struct scx_sched __rcu *scx_root; DECLARE_PER_CPU(struct rq *, scx_locked_rq_state); +int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id); + /* * Return the rq currently locked from an scx callback, or NULL if no rq is * locked. 
@@ -1172,12 +1375,107 @@ static inline struct rq *scx_locked_rq(void) return __this_cpu_read(scx_locked_rq_state); } -static inline bool scx_kf_allowed_if_unlocked(void) +static inline bool scx_bypassing(struct scx_sched *sch, s32 cpu) +{ + return unlikely(per_cpu_ptr(sch->pcpu, cpu)->flags & + SCX_SCHED_PCPU_BYPASSING); +} + +#ifdef CONFIG_EXT_SUB_SCHED +/** + * scx_task_sched - Find scx_sched scheduling a task + * @p: task of interest + * + * Return @p's scheduler instance. Must be called with @p's pi_lock or rq lock + * held. + */ +static inline struct scx_sched *scx_task_sched(const struct task_struct *p) +{ + return rcu_dereference_protected(p->scx.sched, + lockdep_is_held(&p->pi_lock) || + lockdep_is_held(__rq_lockp(task_rq(p)))); +} + +/** + * scx_task_sched_rcu - Find scx_sched scheduling a task + * @p: task of interest + * + * Return @p's scheduler instance. The returned scx_sched is RCU protected. + */ +static inline struct scx_sched *scx_task_sched_rcu(const struct task_struct *p) +{ + return rcu_dereference_all(p->scx.sched); +} + +/** + * scx_task_on_sched - Is a task on the specified sched? + * @sch: sched to test against + * @p: task of interest + * + * Returns %true if @p is on @sch, %false otherwise. + */ +static inline bool scx_task_on_sched(struct scx_sched *sch, + const struct task_struct *p) +{ + return rcu_access_pointer(p->scx.sched) == sch; +} + +/** + * scx_prog_sched - Find scx_sched associated with a BPF prog + * @aux: aux passed in from BPF to a kfunc + * + * To be called from kfuncs. Return the scheduler instance associated with the + * BPF program given the implicit kfunc argument aux. The returned scx_sched is + * RCU protected. 
+ */ +static inline struct scx_sched *scx_prog_sched(const struct bpf_prog_aux *aux) +{ + struct sched_ext_ops *ops; + struct scx_sched *root; + + ops = bpf_prog_get_assoc_struct_ops(aux); + if (likely(ops)) + return rcu_dereference_all(ops->priv); + + root = rcu_dereference_all(scx_root); + if (root) { + /* + * COMPAT-v6.19: Schedulers built before sub-sched support was + * introduced may have unassociated non-struct_ops programs. + */ + if (!root->ops.sub_attach) + return root; + + if (!root->warned_unassoc_progs) { + printk_deferred(KERN_WARNING "sched_ext: Unassociated program %s (id %d)\n", + aux->name, aux->id); + root->warned_unassoc_progs = true; + } + } + + return NULL; +} +#else /* CONFIG_EXT_SUB_SCHED */ +static inline struct scx_sched *scx_task_sched(const struct task_struct *p) +{ + return rcu_dereference_protected(scx_root, + lockdep_is_held(&p->pi_lock) || + lockdep_is_held(__rq_lockp(task_rq(p)))); +} + +static inline struct scx_sched *scx_task_sched_rcu(const struct task_struct *p) +{ + return rcu_dereference_all(scx_root); +} + +static inline bool scx_task_on_sched(struct scx_sched *sch, + const struct task_struct *p) { - return !current->scx.kf_mask; + return true; } -static inline bool scx_rq_bypassing(struct rq *rq) +static struct scx_sched *scx_prog_sched(const struct bpf_prog_aux *aux) { - return unlikely(rq->scx.flags & SCX_RQ_BYPASSING); + return rcu_dereference_all(scx_root); } +#endif /* CONFIG_EXT_SUB_SCHED */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bf948db905ed..69361c63353a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -225,6 +225,7 @@ void __init sched_init_granularity(void) update_sysctl(); } +#ifndef CONFIG_64BIT #define WMULT_CONST (~0U) #define WMULT_SHIFT 32 @@ -283,6 +284,12 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight return mul_u64_u32_shr(delta_exec, fact, shift); } +#else +static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct 
load_weight *lw) +{ + return (delta_exec * weight) / lw->weight; +} +#endif /* * delta /= w @@ -665,25 +672,83 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) * Since zero_vruntime closely tracks the per-task service, these * deltas: (v_i - v0), will be in the order of the maximal (virtual) lag * induced in the system due to quantisation. - * - * Also, we use scale_load_down() to reduce the size. - * - * As measured, the max (key * weight) value was ~44 bits for a kernel build. */ +static inline unsigned long avg_vruntime_weight(struct cfs_rq *cfs_rq, unsigned long w) +{ +#ifdef CONFIG_64BIT + if (cfs_rq->sum_shift) + w = max(2UL, w >> cfs_rq->sum_shift); +#endif + return w; +} + +static inline void +__sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight); + s64 w_vruntime, key = entity_key(cfs_rq, se); + + w_vruntime = key * weight; + WARN_ON_ONCE((w_vruntime >> 63) != (w_vruntime >> 62)); + + cfs_rq->sum_w_vruntime += w_vruntime; + cfs_rq->sum_weight += weight; +} + static void -sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) +sum_w_vruntime_add_paranoid(struct cfs_rq *cfs_rq, struct sched_entity *se) { - unsigned long weight = scale_load_down(se->load.weight); - s64 key = entity_key(cfs_rq, se); + unsigned long weight; + s64 key, tmp; - cfs_rq->sum_w_vruntime += key * weight; +again: + weight = avg_vruntime_weight(cfs_rq, se->load.weight); + key = entity_key(cfs_rq, se); + + if (check_mul_overflow(key, weight, &key)) + goto overflow; + + if (check_add_overflow(cfs_rq->sum_w_vruntime, key, &tmp)) + goto overflow; + + cfs_rq->sum_w_vruntime = tmp; cfs_rq->sum_weight += weight; + return; + +overflow: + /* + * There's gotta be a limit -- if we're still failing at this point + * there's really nothing much to be done about things. 
+ */ + BUG_ON(cfs_rq->sum_shift >= 10); + cfs_rq->sum_shift++; + + /* + * Note: \Sum (k_i * (w_i >> 1)) != (\Sum (k_i * w_i)) >> 1 + */ + cfs_rq->sum_w_vruntime = 0; + cfs_rq->sum_weight = 0; + + for (struct rb_node *node = cfs_rq->tasks_timeline.rb_leftmost; + node; node = rb_next(node)) + __sum_w_vruntime_add(cfs_rq, __node_2_se(node)); + + goto again; +} + +static void +sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + if (sched_feat(PARANOID_AVG)) + return sum_w_vruntime_add_paranoid(cfs_rq, se); + + __sum_w_vruntime_add(cfs_rq, se); } static void sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) { - unsigned long weight = scale_load_down(se->load.weight); + unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight); s64 key = entity_key(cfs_rq, se); cfs_rq->sum_w_vruntime -= key * weight; @@ -707,7 +772,7 @@ void update_zero_vruntime(struct cfs_rq *cfs_rq, s64 delta) * Called in: * - place_entity() -- before enqueue * - update_entity_lag() -- before dequeue - * - entity_tick() + * - update_deadline() -- slice expiration * * This means it is one entry 'behind' but that puts it close enough to where * the bound on entity_key() is at most two lag bounds. @@ -725,7 +790,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) s64 runtime = cfs_rq->sum_w_vruntime; if (curr) { - unsigned long w = scale_load_down(curr->load.weight); + unsigned long w = avg_vruntime_weight(cfs_rq, curr->load.weight); runtime += entity_key(cfs_rq, curr) * w; weight += w; @@ -735,7 +800,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) if (runtime < 0) runtime -= (weight - 1); - delta = div_s64(runtime, weight); + delta = div64_long(runtime, weight); } else if (curr) { /* * When there is but one element, it is the average. 
@@ -764,17 +829,44 @@ static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq); * * -r_max < lag < max(r_max, q) */ -static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) +static s64 entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 avruntime) { u64 max_slice = cfs_rq_max_slice(cfs_rq) + TICK_NSEC; s64 vlag, limit; + vlag = avruntime - se->vruntime; + limit = calc_delta_fair(max_slice, se); + + return clamp(vlag, -limit, limit); +} + +/* + * Delayed dequeue aims to reduce the negative lag of a dequeued task. While + * updating the lag of an entity, check that negative lag didn't increase + * during the delayed dequeue period which would be unfair. + * Similarly, check that the entity didn't gain positive lag when DELAY_ZERO + * is set. + * + * Return true if the lag has been adjusted. + */ +static __always_inline +bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + s64 vlag = entity_lag(cfs_rq, se, avg_vruntime(cfs_rq)); + bool ret; + WARN_ON_ONCE(!se->on_rq); - vlag = avg_vruntime(cfs_rq) - se->vruntime; - limit = calc_delta_fair(max_slice, se); + if (se->sched_delayed) { + /* previous vlag < 0 otherwise se would not be delayed */ + vlag = max(vlag, se->vlag); + if (sched_feat(DELAY_ZERO)) + vlag = min(vlag, 0); + } + ret = (vlag == se->vlag); + se->vlag = vlag; - se->vlag = clamp(vlag, -limit, limit); + return ret; } /* @@ -801,7 +893,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) long load = cfs_rq->sum_weight; if (curr && curr->on_rq) { - unsigned long weight = scale_load_down(curr->load.weight); + unsigned long weight = avg_vruntime_weight(cfs_rq, curr->load.weight); avg += entity_key(cfs_rq, curr) * weight; load += weight; @@ -1024,7 +1116,7 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect) /* * Picking the ->next buddy will affect latency but not fairness. 
*/ - if (sched_feat(PICK_BUDDY) && + if (sched_feat(PICK_BUDDY) && protect && cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { /* ->next will never be delayed */ WARN_ON_ONCE(cfs_rq->next->sched_delayed); @@ -1131,6 +1223,7 @@ static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) * EEVDF: vd_i = ve_i + r_i / w_i */ se->deadline = se->vruntime + calc_delta_fair(se->slice, se); + avg_vruntime(cfs_rq); /* * The task has consumed its request, reschedule. @@ -3840,23 +3933,125 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) se_weight(se) * -se->avg.load_sum); } -static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); +static void +rescale_entity(struct sched_entity *se, unsigned long weight, bool rel_vprot) +{ + unsigned long old_weight = se->load.weight; + + /* + * VRUNTIME + * -------- + * + * COROLLARY #1: The virtual runtime of the entity needs to be + * adjusted if re-weight at !0-lag point. + * + * Proof: For contradiction assume this is not true, so we can + * re-weight without changing vruntime at !0-lag point. 
+ * + * Weight VRuntime Avg-VRuntime + * before w v V + * after w' v' V' + * + * Since lag needs to be preserved through re-weight: + * + * lag = (V - v)*w = (V'- v')*w', where v = v' + * ==> V' = (V - v)*w/w' + v (1) + * + * Let W be the total weight of the entities before reweight, + * since V' is the new weighted average of entities: + * + * V' = (WV + w'v - wv) / (W + w' - w) (2) + * + * by using (1) & (2) we obtain: + * + * (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v + * ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v + * ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v + * ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3) + * + * Since we are doing at !0-lag point which means V != v, we + * can simplify (3): + * + * ==> W / (W + w' - w) = w / w' + * ==> Ww' = Ww + ww' - ww + * ==> W * (w' - w) = w * (w' - w) + * ==> W = w (re-weight indicates w' != w) + * + * So the cfs_rq contains only one entity, hence vruntime of + * the entity @v should always equal to the cfs_rq's weighted + * average vruntime @V, which means we will always re-weight + * at 0-lag point, thus breach assumption. Proof completed. + * + * + * COROLLARY #2: Re-weight does NOT affect weighted average + * vruntime of all the entities. + * + * Proof: According to corollary #1, Eq. (1) should be: + * + * (V - v)*w = (V' - v')*w' + * ==> v' = V' - (V - v)*w/w' (4) + * + * According to the weighted average formula, we have: + * + * V' = (WV - wv + w'v') / (W - w + w') + * = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w') + * = (WV - wv + w'V' - Vw + wv) / (W - w + w') + * = (WV + w'V' - Vw) / (W - w + w') + * + * ==> V'*(W - w + w') = WV + w'V' - Vw + * ==> V' * (W - w) = (W - w) * V (5) + * + * If the entity is the only one in the cfs_rq, then reweight + * always occurs at 0-lag point, so V won't change. Or else + * there are other entities, hence W != w, then Eq. (5) turns + * into V' = V. So V won't change in either case, proof done. 
+ * + * + * So according to corollary #1 & #2, the effect of re-weight + * on vruntime should be: + * + * v' = V' - (V - v) * w / w' (4) + * = V - (V - v) * w / w' + * = V - vl * w / w' + * = V - vl' + */ + se->vlag = div64_long(se->vlag * old_weight, weight); + + /* + * DEADLINE + * -------- + * + * When the weight changes, the virtual time slope changes and + * we should adjust the relative virtual deadline accordingly. + * + * d' = v' + (d - v)*w/w' + * = V' - (V - v)*w/w' + (d - v)*w/w' + * = V - (V - v)*w/w' + (d - v)*w/w' + * = V + (d - V)*w/w' + */ + if (se->rel_deadline) + se->deadline = div64_long(se->deadline * old_weight, weight); + + if (rel_vprot) + se->vprot = div64_long(se->vprot * old_weight, weight); +} static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { bool curr = cfs_rq->curr == se; bool rel_vprot = false; - u64 vprot; + u64 avruntime = 0; if (se->on_rq) { /* commit outstanding execution time */ update_curr(cfs_rq); - update_entity_lag(cfs_rq, se); - se->deadline -= se->vruntime; + avruntime = avg_vruntime(cfs_rq); + se->vlag = entity_lag(cfs_rq, se, avruntime); + se->deadline -= avruntime; se->rel_deadline = 1; if (curr && protect_slice(se)) { - vprot = se->vprot - se->vruntime; + se->vprot -= avruntime; rel_vprot = true; } @@ -3867,30 +4062,23 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, } dequeue_load_avg(cfs_rq, se); - /* - * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i), - * we need to scale se->vlag when w_i changes. 
- */ - se->vlag = div_s64(se->vlag * se->load.weight, weight); - if (se->rel_deadline) - se->deadline = div_s64(se->deadline * se->load.weight, weight); - - if (rel_vprot) - vprot = div_s64(vprot * se->load.weight, weight); + rescale_entity(se, weight, rel_vprot); update_load_set(&se->load, weight); do { u32 divider = get_pelt_divider(&se->avg); - se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); } while (0); enqueue_load_avg(cfs_rq, se); if (se->on_rq) { - place_entity(cfs_rq, se, 0); if (rel_vprot) - se->vprot = se->vruntime + vprot; + se->vprot += avruntime; + se->deadline += avruntime; + se->rel_deadline = 0; + se->vruntime = avruntime - se->vlag; + update_load_add(&cfs_rq->load, se->load.weight); if (!curr) __enqueue_entity(cfs_rq, se); @@ -5164,6 +5352,7 @@ static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { u64 vslice, vruntime = avg_vruntime(cfs_rq); + bool update_zero = false; s64 lag = 0; if (!se->custom_slice) @@ -5180,7 +5369,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) { struct sched_entity *curr = cfs_rq->curr; - unsigned long load; + long load, weight; lag = se->vlag; @@ -5238,17 +5427,44 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ load = cfs_rq->sum_weight; if (curr && curr->on_rq) - load += scale_load_down(curr->load.weight); + load += avg_vruntime_weight(cfs_rq, curr->load.weight); - lag *= load + scale_load_down(se->load.weight); + weight = avg_vruntime_weight(cfs_rq, se->load.weight); + lag *= load + weight; if (WARN_ON_ONCE(!load)) load = 1; - lag = div_s64(lag, load); + lag = div64_long(lag, load); + + /* + * A heavy entity (relative to the tree) will pull the + * avg_vruntime close to its vruntime position on enqueue. But + * the zero_vruntime point is only updated at the next + * update_deadline()/place_entity()/update_entity_lag(). 
+ * + * Specifically (see the comment near avg_vruntime_weight()): + * + * sum_w_vruntime = \Sum (v_i - v0) * w_i + * + * Note that if v0 is near a light entity, both terms will be + * small for the light entity, while in that case both terms + * are large for the heavy entity, leading to risk of + * overflow. + * + * OTOH if v0 is near the heavy entity, then the difference is + * larger for the light entity, but the factor is small, while + * for the heavy entity the difference is small but the factor + * is large. Avoiding the multiplication overflow. + */ + if (weight > load) + update_zero = true; } se->vruntime = vruntime - lag; - if (se->rel_deadline) { + if (update_zero) + update_zero_vruntime(cfs_rq, -lag); + + if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) { se->deadline += se->vruntime; se->rel_deadline = 0; return; @@ -5398,13 +5614,6 @@ static void clear_delayed(struct sched_entity *se) } } -static inline void finish_delayed_dequeue_entity(struct sched_entity *se) -{ - clear_delayed(se); - if (sched_feat(DELAY_ZERO) && se->vlag > 0) - se->vlag = 0; -} - static bool dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { @@ -5430,6 +5639,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (sched_feat(DELAY_DEQUEUE) && delay && !entity_eligible(cfs_rq, se)) { update_load_avg(cfs_rq, se, 0); + update_entity_lag(cfs_rq, se); set_delayed(se); return false; } @@ -5469,7 +5679,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_cfs_group(se); if (flags & DEQUEUE_DELAYED) - finish_delayed_dequeue_entity(se); + clear_delayed(se); if (cfs_rq->nr_queued == 0) { update_idle_cfs_rq_clock_pelt(cfs_rq); @@ -5593,18 +5803,13 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) update_load_avg(cfs_rq, curr, UPDATE_TG); update_cfs_group(curr); - /* - * Pulls along cfs_rq::zero_vruntime. 
- */ - avg_vruntime(cfs_rq); - #ifdef CONFIG_SCHED_HRTICK /* * queued ticks are scheduled to match the slice, so don't bother * validating it and just reschedule. */ if (queued) { - resched_curr_lazy(rq_of(cfs_rq)); + resched_curr(rq_of(cfs_rq)); return; } #endif @@ -6809,27 +7014,41 @@ static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct static void hrtick_start_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; + unsigned long scale = 1024; + unsigned long util = 0; + u64 vdelta; + u64 delta; WARN_ON_ONCE(task_rq(p) != rq); - if (rq->cfs.h_nr_queued > 1) { - u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; - u64 slice = se->slice; - s64 delta = slice - ran; + if (rq->cfs.h_nr_queued <= 1) + return; - if (delta < 0) { - if (task_current_donor(rq, p)) - resched_curr(rq); - return; - } - hrtick_start(rq, delta); + /* + * Compute time until virtual deadline + */ + vdelta = se->deadline - se->vruntime; + if ((s64)vdelta < 0) { + if (task_current_donor(rq, p)) + resched_curr(rq); + return; } + delta = (se->load.weight * vdelta) / NICE_0_LOAD; + + /* + * Correct for instantaneous load of other classes. + */ + util += cpu_util_irq(rq); + if (util && util < 1024) { + scale *= 1024; + scale /= (1024 - util); + } + + hrtick_start(rq, (scale * delta) / 1024); } /* - * called from enqueue/dequeue and updates the hrtick when the - * current task is from our class and nr_running is low enough - * to matter. + * Called on enqueue to start the hrtick when h_nr_queued becomes more than 1. 
*/ static void hrtick_update(struct rq *rq) { @@ -6838,6 +7057,9 @@ static void hrtick_update(struct rq *rq) if (!hrtick_enabled_fair(rq) || donor->sched_class != &fair_sched_class) return; + if (hrtick_active(rq)) + return; + hrtick_start_fair(rq, donor); } #else /* !CONFIG_SCHED_HRTICK: */ @@ -6853,16 +7075,15 @@ static inline void hrtick_update(struct rq *rq) static inline bool cpu_overutilized(int cpu) { - unsigned long rq_util_min, rq_util_max; + unsigned long rq_util_max; if (!sched_energy_enabled()) return false; - rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); /* Return true only if the utilization doesn't fit CPU's capacity */ - return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); + return !util_fits_cpu(cpu_util_cfs(cpu), 0, rq_util_max, cpu); } /* @@ -6900,9 +7121,15 @@ static int sched_idle_rq(struct rq *rq) rq->nr_running); } -static int sched_idle_cpu(int cpu) +static int choose_sched_idle_rq(struct rq *rq, struct task_struct *p) +{ + return sched_idle_rq(rq) && !task_has_idle_policy(p); +} + +static int choose_idle_cpu(int cpu, struct task_struct *p) { - return sched_idle_rq(cpu_rq(cpu)); + return available_idle_cpu(cpu) || + choose_sched_idle_rq(cpu_rq(cpu), p); } static void @@ -6918,18 +7145,14 @@ requeue_delayed_entity(struct sched_entity *se) WARN_ON_ONCE(!se->sched_delayed); WARN_ON_ONCE(!se->on_rq); - if (sched_feat(DELAY_ZERO)) { - update_entity_lag(cfs_rq, se); - if (se->vlag > 0) { - cfs_rq->nr_queued--; - if (se != cfs_rq->curr) - __dequeue_entity(cfs_rq, se); - se->vlag = 0; - place_entity(cfs_rq, se, 0); - if (se != cfs_rq->curr) - __enqueue_entity(cfs_rq, se); - cfs_rq->nr_queued++; - } + if (update_entity_lag(cfs_rq, se)) { + cfs_rq->nr_queued--; + if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); + place_entity(cfs_rq, se, 0); + if (se != cfs_rq->curr) + __enqueue_entity(cfs_rq, se); + cfs_rq->nr_queued++; } update_load_avg(cfs_rq, se, 0); @@ 
-7160,9 +7383,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) WARN_ON_ONCE(!task_sleep); WARN_ON_ONCE(p->on_rq != 1); - /* Fix-up what dequeue_task_fair() skipped */ - hrtick_update(rq); - /* * Fix-up what block_task() skipped. * @@ -7196,8 +7416,6 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* * Must not reference @p after dequeue_entities(DEQUEUE_DELAYED). */ - - hrtick_update(rq); return true; } @@ -7467,7 +7685,7 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct * if (!sched_core_cookie_match(rq, p)) continue; - if (sched_idle_cpu(i)) + if (choose_sched_idle_rq(rq, p)) return i; if (available_idle_cpu(i)) { @@ -7558,8 +7776,7 @@ static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct tas static inline int __select_idle_cpu(int cpu, struct task_struct *p) { - if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) && - sched_cpu_cookie_match(cpu_rq(cpu), p)) + if (choose_idle_cpu(cpu, p) && sched_cpu_cookie_match(cpu_rq(cpu), p)) return cpu; return -1; @@ -7632,7 +7849,8 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu if (!available_idle_cpu(cpu)) { idle = false; if (*idle_cpu == -1) { - if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) { + if (choose_sched_idle_rq(cpu_rq(cpu), p) && + cpumask_test_cpu(cpu, cpus)) { *idle_cpu = cpu; break; } @@ -7667,7 +7885,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t */ if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) continue; - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) + if (choose_idle_cpu(cpu, p)) return cpu; } @@ -7706,21 +7924,26 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); int i, cpu, idle_cpu = -1, nr = INT_MAX; - struct sched_domain_shared *sd_share; - - cpumask_and(cpus, sched_domain_span(sd), 
p->cpus_ptr); if (sched_feat(SIS_UTIL)) { - sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, target)); - if (sd_share) { - /* because !--nr is the condition to stop scan */ - nr = READ_ONCE(sd_share->nr_idle_scan) + 1; - /* overloaded LLC is unlikely to have idle cpu/core */ - if (nr == 1) - return -1; - } + /* + * Increment because !--nr is the condition to stop scan. + * + * Since "sd" is "sd_llc" for target CPU dereferenced in the + * caller, it is safe to directly dereference "sd->shared". + * Topology bits always ensure it assigned for "sd_llc" abd it + * cannot disappear as long as we have a RCU protected + * reference to one the associated "sd" here. + */ + nr = READ_ONCE(sd->shared->nr_idle_scan) + 1; + /* overloaded LLC is unlikely to have idle cpu/core */ + if (nr == 1) + return -1; } + if (!cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr)) + return -1; + if (static_branch_unlikely(&sched_cluster_active)) { struct sched_group *sg = sd->groups; @@ -7789,7 +8012,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) for_each_cpu_wrap(cpu, cpus, target) { unsigned long cpu_cap = capacity_of(cpu); - if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) + if (!choose_idle_cpu(cpu, p)) continue; fits = util_fits_cpu(task_util, util_min, util_max, cpu); @@ -7860,7 +8083,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ lockdep_assert_irqs_disabled(); - if ((available_idle_cpu(target) || sched_idle_cpu(target)) && + if (choose_idle_cpu(target, p) && asym_fits_cpu(task_util, util_min, util_max, target)) return target; @@ -7868,7 +8091,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * If the previous CPU is cache affine and idle, don't be stupid: */ if (prev != target && cpus_share_cache(prev, target) && - (available_idle_cpu(prev) || sched_idle_cpu(prev)) && + choose_idle_cpu(prev, p) && asym_fits_cpu(task_util, util_min, util_max, prev)) { if 
(!static_branch_unlikely(&sched_cluster_active) || @@ -7900,7 +8123,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (recent_used_cpu != prev && recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && - (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && + choose_idle_cpu(recent_used_cpu, p) && cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { @@ -8400,10 +8623,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) struct perf_domain *pd; struct energy_env eenv; - rcu_read_lock(); pd = rcu_dereference_all(rd->pd); if (!pd) - goto unlock; + return target; /* * Energy-aware wake-up happens on the lowest sched_domain starting @@ -8413,13 +8635,13 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) sd = sd->parent; if (!sd) - goto unlock; + return target; target = prev_cpu; sync_entity_load_avg(&p->se); if (!task_util_est(p) && p_util_min == 0) - goto unlock; + return target; eenv_task_busy_time(&eenv, p, prev_cpu); @@ -8514,7 +8736,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) prev_cpu); /* CPU utilization has changed */ if (prev_delta < base_energy) - goto unlock; + return target; prev_delta -= base_energy; prev_actual_cap = cpu_actual_cap; best_delta = min(best_delta, prev_delta); @@ -8538,7 +8760,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) max_spare_cap_cpu); /* CPU utilization has changed */ if (cur_delta < base_energy) - goto unlock; + return target; cur_delta -= base_energy; /* @@ -8555,7 +8777,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) best_actual_cap = cpu_actual_cap; } } - rcu_read_unlock(); if ((best_fits > prev_fits) || ((best_fits > 0) && (best_delta < prev_delta)) || @@ -8563,11 +8784,6 @@ static int 
find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) target = best_energy_cpu; return target; - -unlock: - rcu_read_unlock(); - - return target; } /* @@ -8612,7 +8828,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); } - rcu_read_lock(); for_each_domain(cpu, tmp) { /* * If both 'cpu' and 'prev_cpu' are part of this domain, @@ -8638,14 +8853,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) break; } - if (unlikely(sd)) { - /* Slow path */ - new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag); - } else if (wake_flags & WF_TTWU) { /* XXX always ? */ - /* Fast path */ - new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); - } - rcu_read_unlock(); + /* Slow path */ + if (unlikely(sd)) + return sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag); + + /* Fast path */ + if (wake_flags & WF_TTWU) + return select_idle_sibling(p, prev_cpu, new_cpu); return new_cpu; } @@ -8936,8 +9150,10 @@ pick: return; preempt: - if (preempt_action == PREEMPT_WAKEUP_SHORT) + if (preempt_action == PREEMPT_WAKEUP_SHORT) { cancel_protect_slice(se); + clear_buddies(cfs_rq, se); + } resched_curr_lazy(rq); } @@ -9128,7 +9344,7 @@ static void yield_task_fair(struct rq *rq) */ if (entity_eligible(cfs_rq, se)) { se->vruntime = se->deadline; - se->deadline += calc_delta_fair(se->slice, se); + update_deadline(cfs_rq, se); } } @@ -9785,32 +10001,6 @@ next: } /* - * attach_task() -- attach the task detached by detach_task() to its new rq. - */ -static void attach_task(struct rq *rq, struct task_struct *p) -{ - lockdep_assert_rq_held(rq); - - WARN_ON_ONCE(task_rq(p) != rq); - activate_task(rq, p, ENQUEUE_NOCLOCK); - wakeup_preempt(rq, p, 0); -} - -/* - * attach_one_task() -- attaches the task returned from detach_one_task() to - * its new rq. 
- */ -static void attach_one_task(struct rq *rq, struct task_struct *p) -{ - struct rq_flags rf; - - rq_lock(rq, &rf); - update_rq_clock(rq); - attach_task(rq, p); - rq_unlock(rq, &rf); -} - -/* * attach_tasks() -- attaches all tasks detached by detach_tasks() to their * new rq. */ @@ -10047,6 +10237,7 @@ struct sg_lb_stats { unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ unsigned int group_smt_balance; /* Task on busy SMT be moved */ unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ + unsigned int group_overutilized; /* At least one CPU is overutilized in the group */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -10279,6 +10470,13 @@ group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs) static inline bool group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs) { + /* + * With EAS and uclamp, 1 CPU in the group must be overutilized to + * consider the group overloaded. + */ + if (sched_energy_enabled() && !sgs->group_overutilized) + return false; + if (sgs->sum_nr_running <= sgs->group_weight) return false; @@ -10462,14 +10660,12 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) * @group: sched_group whose statistics are to be updated. * @sgs: variable to hold the statistics for this group. 
* @sg_overloaded: sched_group is overloaded - * @sg_overutilized: sched_group is overutilized */ static inline void update_sg_lb_stats(struct lb_env *env, struct sd_lb_stats *sds, struct sched_group *group, struct sg_lb_stats *sgs, - bool *sg_overloaded, - bool *sg_overutilized) + bool *sg_overloaded) { int i, nr_running, local_group, sd_flags = env->sd->flags; bool balancing_at_rd = !env->sd->parent; @@ -10491,7 +10687,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->sum_nr_running += nr_running; if (cpu_overutilized(i)) - *sg_overutilized = 1; + sgs->group_overutilized = 1; /* * No need to call idle_cpu() if nr_running is not 0 @@ -11067,6 +11263,7 @@ static void update_idle_cpu_scan(struct lb_env *env, unsigned long sum_util) { struct sched_domain_shared *sd_share; + struct sched_domain *sd = env->sd; int llc_weight, pct; u64 x, y, tmp; /* @@ -11080,11 +11277,7 @@ static void update_idle_cpu_scan(struct lb_env *env, if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE) return; - llc_weight = per_cpu(sd_llc_size, env->dst_cpu); - if (env->sd->span_weight != llc_weight) - return; - - sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, env->dst_cpu)); + sd_share = sd->shared; if (!sd_share) return; @@ -11118,10 +11311,11 @@ static void update_idle_cpu_scan(struct lb_env *env, */ /* equation [3] */ x = sum_util; + llc_weight = sd->span_weight; do_div(x, llc_weight); /* equation [4] */ - pct = env->sd->imbalance_pct; + pct = sd->imbalance_pct; tmp = x * x * pct * pct; do_div(tmp, 10000 * SCHED_CAPACITY_SCALE); tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE); @@ -11162,13 +11356,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd update_group_capacity(env->sd, env->dst_cpu); } - update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized); + update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded); if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; sds->busiest_stat = 
*sgs; } + sg_overutilized |= sgs->group_overutilized; + /* Now, start updating sd_lb_stats */ sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity; @@ -12289,7 +12485,30 @@ static inline void update_newidle_stats(struct sched_domain *sd, unsigned int su sd->newidle_success += success; if (sd->newidle_call >= 1024) { - sd->newidle_ratio = sd->newidle_success; + u64 now = sched_clock(); + s64 delta = now - sd->newidle_stamp; + sd->newidle_stamp = now; + int ratio = 0; + + if (delta < 0) + delta = 0; + + if (sched_feat(NI_RATE)) { + /* + * ratio delta freq + * + * 1024 - 4 s - 128 Hz + * 512 - 2 s - 256 Hz + * 256 - 1 s - 512 Hz + * 128 - .5 s - 1024 Hz + * 64 - .25 s - 2048 Hz + */ + ratio = delta >> 22; + } + + ratio += sd->newidle_success; + + sd->newidle_ratio = min(1024, ratio); sd->newidle_call /= 2; sd->newidle_success /= 2; } @@ -12336,7 +12555,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) { int continue_balancing = 1; int cpu = rq->cpu; - int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); + int busy = idle != CPU_IDLE && !sched_idle_rq(rq); unsigned long interval; struct sched_domain *sd; /* Earliest time when we have to do rebalance again */ @@ -12374,7 +12593,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) * state even if we migrated tasks. Update it. 
*/ idle = idle_cpu(cpu); - busy = !idle && !sched_idle_cpu(cpu); + busy = !idle && !sched_idle_rq(rq); } sd->last_balance = jiffies; interval = get_sd_balance_interval(sd, busy); @@ -12419,14 +12638,14 @@ static inline int on_null_domain(struct rq *rq) */ static inline int find_new_ilb(void) { + int this_cpu = smp_processor_id(); const struct cpumask *hk_mask; int ilb_cpu; hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE); for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) { - - if (ilb_cpu == smp_processor_id()) + if (ilb_cpu == this_cpu) continue; if (idle_cpu(ilb_cpu)) @@ -12996,7 +13215,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) if (sd->flags & SD_BALANCE_NEWIDLE) { unsigned int weight = 1; - if (sched_feat(NI_RANDOM)) { + if (sched_feat(NI_RANDOM) && sd->newidle_ratio < 1024) { /* * Throw a 1k sided dice; and only run * newidle_balance according to the success @@ -13439,11 +13658,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) entity_tick(cfs_rq, se, queued); } - if (queued) { - if (!need_resched()) - hrtick_start_fair(rq, curr); + if (queued) return; - } if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); @@ -14025,7 +14241,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)]; } if (ng) { - gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)], + gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)]; gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)]; } print_numa_stats(m, node, tsf, tpf, gsf, gpf); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 136a6584be79..84c4fe3abd74 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -58,13 +58,20 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) SCHED_FEAT(DELAY_DEQUEUE, true) SCHED_FEAT(DELAY_ZERO, true) +SCHED_FEAT(PARANOID_AVG, false) + /* * Allow wakeup-time preemption of the current task: */ 
SCHED_FEAT(WAKEUP_PREEMPTION, true) +#ifdef CONFIG_HRTIMER_REARM_DEFERRED +SCHED_FEAT(HRTICK, true) +SCHED_FEAT(HRTICK_DL, true) +#else SCHED_FEAT(HRTICK, false) SCHED_FEAT(HRTICK_DL, false) +#endif /* * Decrement CPU capacity based on time not spent running tasks @@ -126,3 +133,4 @@ SCHED_FEAT(LATENCY_WARN, false) * Do newidle balancing proportional to its success rate using randomization. */ SCHED_FEAT(NI_RANDOM, true) +SCHED_FEAT(NI_RATE, true) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index b95449165122..a83be0c834dd 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -161,6 +161,14 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, return cpuidle_enter(drv, dev, next_state); } +static void idle_call_stop_or_retain_tick(bool stop_tick) +{ + if (stop_tick || tick_nohz_tick_stopped()) + tick_nohz_idle_stop_tick(); + else + tick_nohz_idle_retain_tick(); +} + /** * cpuidle_idle_call - the main idle function * @@ -170,7 +178,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, * set, and it returns with polling set. If it ever stops polling, it * must clear the polling bit. */ -static void cpuidle_idle_call(void) +static void cpuidle_idle_call(bool stop_tick) { struct cpuidle_device *dev = cpuidle_get_device(); struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); @@ -186,7 +194,7 @@ static void cpuidle_idle_call(void) } if (cpuidle_not_available(drv, dev)) { - tick_nohz_idle_stop_tick(); + idle_call_stop_or_retain_tick(stop_tick); default_idle_call(); goto exit_idle; @@ -222,17 +230,19 @@ static void cpuidle_idle_call(void) next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns); call_cpuidle(drv, dev, next_state); } else if (drv->state_count > 1) { - bool stop_tick = true; + /* + * stop_tick is expected to be true by default by cpuidle + * governors, which allows them to select idle states with + * target residency above the tick period length. 
+ */ + stop_tick = true; /* * Ask the cpuidle framework to choose a convenient idle state. */ next_state = cpuidle_select(drv, dev, &stop_tick); - if (stop_tick || tick_nohz_tick_stopped()) - tick_nohz_idle_stop_tick(); - else - tick_nohz_idle_retain_tick(); + idle_call_stop_or_retain_tick(stop_tick); entered_state = call_cpuidle(drv, dev, next_state); /* @@ -240,7 +250,7 @@ static void cpuidle_idle_call(void) */ cpuidle_reflect(dev, entered_state); } else { - tick_nohz_idle_retain_tick(); + idle_call_stop_or_retain_tick(stop_tick); /* * If there is only a single idle state (or none), there is @@ -268,6 +278,7 @@ exit_idle: static void do_idle(void) { int cpu = smp_processor_id(); + bool got_tick = false; /* * Check if we need to update blocked load @@ -338,8 +349,9 @@ static void do_idle(void) tick_nohz_idle_restart_tick(); cpu_idle_poll(); } else { - cpuidle_idle_call(); + cpuidle_idle_call(got_tick); } + got_tick = tick_nohz_idle_got_tick(); arch_cpu_idle_exit(); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f69e1f16d923..4ee8faf01441 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1302,13 +1302,18 @@ update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int flags) { struct task_struct *p = NULL; + struct rq *rq = rq_of_rt_rq(rt_rq); if (!schedstat_enabled()) return; - if (rt_entity_is_task(rt_se)) + if (rt_entity_is_task(rt_se)) { p = rt_task_of(rt_se); + if (p != rq->curr) + update_stats_wait_end_rt(rt_rq, rt_se); + } + if ((flags & DEQUEUE_SLEEP) && p) { unsigned int state; @@ -1853,13 +1858,22 @@ static int find_lowest_rq(struct task_struct *task) static struct task_struct *pick_next_pushable_task(struct rq *rq) { - struct task_struct *p; + struct plist_head *head = &rq->rt.pushable_tasks; + struct task_struct *i, *p = NULL; if (!has_pushable_tasks(rq)) return NULL; - p = plist_first_entry(&rq->rt.pushable_tasks, - struct task_struct, pushable_tasks); + plist_for_each_entry(i, head, pushable_tasks) { + /* make 
sure task isn't on_cpu (possible with proxy-exec) */ + if (!task_on_cpu(rq, i)) { + p = i; + break; + } + } + + if (!p) + return NULL; BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); @@ -2652,7 +2666,7 @@ static int tg_rt_schedulable(struct task_group *tg, void *data) { struct rt_schedulable_data *d = data; struct task_group *child; - unsigned long total, sum = 0; + u64 total, sum = 0; u64 period, runtime; period = ktime_to_ns(tg->rt_bandwidth.rt_period); @@ -2676,9 +2690,6 @@ static int tg_rt_schedulable(struct task_group *tg, void *data) tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg)) return -EBUSY; - if (WARN_ON(!rt_group_sched_enabled() && tg != &root_task_group)) - return -EBUSY; - total = to_ratio(period, runtime); /* @@ -2818,19 +2829,6 @@ long sched_group_rt_period(struct task_group *tg) return rt_period_us; } -#ifdef CONFIG_SYSCTL -static int sched_rt_global_constraints(void) -{ - int ret = 0; - - mutex_lock(&rt_constraints_mutex); - ret = __rt_schedulable(NULL, 0, 0); - mutex_unlock(&rt_constraints_mutex); - - return ret; -} -#endif /* CONFIG_SYSCTL */ - int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) { /* Don't accept real-time tasks when there is no way for them to run */ @@ -2840,14 +2838,6 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) return 1; } -#else /* !CONFIG_RT_GROUP_SCHED: */ - -#ifdef CONFIG_SYSCTL -static int sched_rt_global_constraints(void) -{ - return 0; -} -#endif /* CONFIG_SYSCTL */ #endif /* !CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_SYSCTL @@ -2859,11 +2849,14 @@ static int sched_rt_global_validate(void) NSEC_PER_USEC > max_rt_runtime))) return -EINVAL; - return 0; -} +#ifdef CONFIG_RT_GROUP_SCHED + if (!rt_group_sched_enabled()) + return 0; -static void sched_rt_do_global(void) -{ + scoped_guard(mutex, &rt_constraints_mutex) + return __rt_schedulable(NULL, 0, 0); +#endif + return 0; } static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer, 
@@ -2889,11 +2882,6 @@ static int sched_rt_handler(const struct ctl_table *table, int write, void *buff if (ret) goto undo; - ret = sched_rt_global_constraints(); - if (ret) - goto undo; - - sched_rt_do_global(); sched_dl_do_global(); } if (0) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 43bbf0693cca..9f63b15d309d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -356,7 +356,7 @@ extern int sched_dl_global_validate(void); extern void sched_dl_do_global(void); extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr); extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); -extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); +extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr, unsigned int flags); extern bool __checkparam_dl(const struct sched_attr *attr); extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); @@ -684,8 +684,9 @@ struct cfs_rq { s64 sum_w_vruntime; u64 sum_weight; - u64 zero_vruntime; + unsigned int sum_shift; + #ifdef CONFIG_SCHED_CORE unsigned int forceidle_seq; u64 zero_vruntime_fi; @@ -782,7 +783,6 @@ enum scx_rq_flags { SCX_RQ_ONLINE = 1 << 0, SCX_RQ_CAN_STOP_TICK = 1 << 1, SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */ - SCX_RQ_BYPASSING = 1 << 4, SCX_RQ_CLK_VALID = 1 << 5, /* RQ clock is fresh and valid */ SCX_RQ_BAL_CB_PENDING = 1 << 6, /* must queue a cb after dispatching */ @@ -798,19 +798,29 @@ struct scx_rq { u64 extra_enq_flags; /* see move_task_to_local_dsq() */ u32 nr_running; u32 cpuperf_target; /* [0, SCHED_CAPACITY_SCALE] */ + bool in_select_cpu; bool cpu_released; u32 flags; + u32 nr_immed; /* ENQ_IMMED tasks on local_dsq */ u64 clock; /* current per-rq clock -- see scx_bpf_now() */ cpumask_var_t cpus_to_kick; cpumask_var_t cpus_to_kick_if_idle; 
cpumask_var_t cpus_to_preempt; cpumask_var_t cpus_to_wait; + cpumask_var_t cpus_to_sync; + bool kick_sync_pending; unsigned long kick_sync; - local_t reenq_local_deferred; + + struct task_struct *sub_dispatch_prev; + + raw_spinlock_t deferred_reenq_lock; + u64 deferred_reenq_locals_seq; + struct list_head deferred_reenq_locals; /* scheds requesting reenq of local DSQ */ + struct list_head deferred_reenq_users; /* user DSQs requesting reenq */ struct balance_callback deferred_bal_cb; + struct balance_callback kick_sync_bal_cb; struct irq_work deferred_irq_work; struct irq_work kick_cpus_irq_work; - struct scx_dispatch_q bypass_dsq; }; #endif /* CONFIG_SCHED_CLASS_EXT */ @@ -1285,6 +1295,8 @@ struct rq { call_single_data_t hrtick_csd; struct hrtimer hrtick_timer; ktime_t hrtick_time; + ktime_t hrtick_delay; + unsigned int hrtick_sched; #endif #ifdef CONFIG_SCHEDSTATS @@ -1606,15 +1618,18 @@ extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass) extern bool raw_spin_rq_trylock(struct rq *rq) __cond_acquires(true, __rq_lockp(rq)); -extern void raw_spin_rq_unlock(struct rq *rq) - __releases(__rq_lockp(rq)); - static inline void raw_spin_rq_lock(struct rq *rq) __acquires(__rq_lockp(rq)) { raw_spin_rq_lock_nested(rq, 0); } +static inline void raw_spin_rq_unlock(struct rq *rq) + __releases(__rq_lockp(rq)) +{ + raw_spin_unlock(rq_lockp(rq)); +} + static inline void raw_spin_rq_lock_irq(struct rq *rq) __acquires(__rq_lockp(rq)) { @@ -1853,6 +1868,13 @@ static inline void scx_rq_clock_update(struct rq *rq, u64 clock) {} static inline void scx_rq_clock_invalidate(struct rq *rq) {} #endif /* !CONFIG_SCHED_CLASS_EXT */ +static inline void assert_balance_callbacks_empty(struct rq *rq) +{ + WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_LOCKING) && + rq->balance_callback && + rq->balance_callback != &balance_push_callback); +} + /* * Lockdep annotation that avoids accidental unlocks; it's like a * sticky/continuous lockdep_assert_held(). 
@@ -1869,7 +1891,7 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); rf->clock_update_flags = 0; - WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback); + assert_balance_callbacks_empty(rq); } static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) @@ -2849,7 +2871,7 @@ static inline void idle_set_state(struct rq *rq, static inline struct cpuidle_state *idle_get_state(struct rq *rq) { - WARN_ON_ONCE(!rcu_read_lock_held()); + lockdep_assert(rcu_read_lock_any_held()); return rq->idle_state; } @@ -2896,7 +2918,7 @@ extern void init_cfs_throttle_work(struct task_struct *p); #define MAX_BW_BITS (64 - BW_SHIFT) #define MAX_BW ((1ULL << MAX_BW_BITS) - 1) -extern unsigned long to_ratio(u64 period, u64 runtime); +extern u64 to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); extern void post_init_entity_util_avg(struct task_struct *p); @@ -3001,6 +3023,29 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); +/* + * attach_task() -- attach the task detached by detach_task() to its new rq. + */ +static inline void attach_task(struct rq *rq, struct task_struct *p) +{ + lockdep_assert_rq_held(rq); + + WARN_ON_ONCE(task_rq(p) != rq); + activate_task(rq, p, ENQUEUE_NOCLOCK); + wakeup_preempt(rq, p, 0); +} + +/* + * attach_one_task() -- attaches the task returned from detach_one_task() to + * its new rq. 
+ */ +static inline void attach_one_task(struct rq *rq, struct task_struct *p) +{ + guard(rq_lock)(rq); + update_rq_clock(rq); + attach_task(rq, p); +} + #ifdef CONFIG_PREEMPT_RT # define SCHED_NR_MIGRATE_BREAK 8 #else @@ -3030,46 +3075,31 @@ extern unsigned int sysctl_numa_balancing_hot_threshold; * - enabled by features * - hrtimer is actually high res */ -static inline int hrtick_enabled(struct rq *rq) +static inline bool hrtick_enabled(struct rq *rq) { - if (!cpu_active(cpu_of(rq))) - return 0; - return hrtimer_is_hres_active(&rq->hrtick_timer); + return cpu_active(cpu_of(rq)) && hrtimer_highres_enabled(); } -static inline int hrtick_enabled_fair(struct rq *rq) +static inline bool hrtick_enabled_fair(struct rq *rq) { - if (!sched_feat(HRTICK)) - return 0; - return hrtick_enabled(rq); + return sched_feat(HRTICK) && hrtick_enabled(rq); } -static inline int hrtick_enabled_dl(struct rq *rq) +static inline bool hrtick_enabled_dl(struct rq *rq) { - if (!sched_feat(HRTICK_DL)) - return 0; - return hrtick_enabled(rq); + return sched_feat(HRTICK_DL) && hrtick_enabled(rq); } extern void hrtick_start(struct rq *rq, u64 delay); - -#else /* !CONFIG_SCHED_HRTICK: */ - -static inline int hrtick_enabled_fair(struct rq *rq) +static inline bool hrtick_active(struct rq *rq) { - return 0; -} - -static inline int hrtick_enabled_dl(struct rq *rq) -{ - return 0; -} - -static inline int hrtick_enabled(struct rq *rq) -{ - return 0; + return hrtimer_active(&rq->hrtick_timer); } +#else /* !CONFIG_SCHED_HRTICK: */ +static inline bool hrtick_enabled_fair(struct rq *rq) { return false; } +static inline bool hrtick_enabled_dl(struct rq *rq) { return false; } +static inline bool hrtick_enabled(struct rq *rq) { return false; } #endif /* !CONFIG_SCHED_HRTICK */ #ifndef arch_scale_freq_tick diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index cadb0e9fe19b..b215b0ead9a6 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -911,10 +911,10 @@ err_size: return 
-E2BIG; } -static void get_params(struct task_struct *p, struct sched_attr *attr) +static void get_params(struct task_struct *p, struct sched_attr *attr, unsigned int flags) { if (task_has_dl_policy(p)) { - __getparam_dl(p, attr); + __getparam_dl(p, attr, flags); } else if (task_has_rt_policy(p)) { attr->sched_priority = p->rt_priority; } else { @@ -980,7 +980,7 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, return -ESRCH; if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) - get_params(p, &attr); + get_params(p, &attr, 0); return sched_setattr(p, &attr); } @@ -1065,7 +1065,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, int retval; if (unlikely(!uattr || pid < 0 || usize > PAGE_SIZE || - usize < SCHED_ATTR_SIZE_VER0 || flags)) + usize < SCHED_ATTR_SIZE_VER0)) return -EINVAL; scoped_guard (rcu) { @@ -1073,6 +1073,12 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, if (!p) return -ESRCH; + if (flags) { + if (!task_has_dl_policy(p) || + flags != SCHED_GETATTR_FLAG_DL_DYNAMIC) + return -EINVAL; + } + retval = security_task_getscheduler(p); if (retval) return retval; @@ -1080,7 +1086,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, kattr.sched_policy = p->policy; if (p->sched_reset_on_fork) kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; - get_params(p, &kattr); + get_params(p, &kattr, flags); kattr.sched_flags &= SCHED_FLAG_ALL; #ifdef CONFIG_UCLAMP_TASK diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 32dcddaead82..5847b83d9d55 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -4,6 +4,7 @@ */ #include <linux/sched/isolation.h> +#include <linux/sched/clock.h> #include <linux/bsearch.h> #include "sched.h" @@ -272,7 +273,7 @@ void rebuild_sched_domains_energy(void) static int sched_energy_aware_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - int ret, state; 
+ int ret; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; @@ -288,8 +289,7 @@ static int sched_energy_aware_handler(const struct ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) { - state = static_branch_unlikely(&sched_energy_present); - if (state != sysctl_sched_energy_aware) + if (sysctl_sched_energy_aware != sched_energy_enabled()) rebuild_sched_domains_energy(); } @@ -387,11 +387,11 @@ static void destroy_perf_domain_rcu(struct rcu_head *rp) static void sched_energy_set(bool has_eas) { - if (!has_eas && static_branch_unlikely(&sched_energy_present)) { + if (!has_eas && sched_energy_enabled()) { if (sched_debug()) pr_info("%s: stopping EAS\n", __func__); static_branch_disable_cpuslocked(&sched_energy_present); - } else if (has_eas && !static_branch_unlikely(&sched_energy_present)) { + } else if (has_eas && !sched_energy_enabled()) { if (sched_debug()) pr_info("%s: starting EAS\n", __func__); static_branch_enable_cpuslocked(&sched_energy_present); @@ -684,6 +684,9 @@ static void update_top_cache_domain(int cpu) if (sd) { id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); + + /* If sd_llc exists, sd_llc_shared should exist too. */ + WARN_ON_ONCE(!sd->shared); sds = sd->shared; } @@ -732,6 +735,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) if (sd_parent_degenerate(tmp, parent)) { tmp->parent = parent->parent; + /* Pick reference to parent->shared. 
*/ + if (parent->shared) { + WARN_ON_ONCE(tmp->shared); + tmp->shared = parent->shared; + parent->shared = NULL; + } + if (parent->parent) { parent->parent->child = tmp; parent->parent->groups->flags = tmp->flags; @@ -781,6 +791,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) } struct s_data { + struct sched_domain_shared * __percpu *sds; struct sched_domain * __percpu *sd; struct root_domain *rd; }; @@ -788,6 +799,7 @@ struct s_data { enum s_alloc { sa_rootdomain, sa_sd, + sa_sd_shared, sa_sd_storage, sa_none, }; @@ -1534,6 +1546,9 @@ static void set_domain_attribute(struct sched_domain *sd, static void __sdt_free(const struct cpumask *cpu_map); static int __sdt_alloc(const struct cpumask *cpu_map); +static void __sds_free(struct s_data *d, const struct cpumask *cpu_map); +static int __sds_alloc(struct s_data *d, const struct cpumask *cpu_map); + static void __free_domain_allocs(struct s_data *d, enum s_alloc what, const struct cpumask *cpu_map) { @@ -1545,6 +1560,9 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, case sa_sd: free_percpu(d->sd); fallthrough; + case sa_sd_shared: + __sds_free(d, cpu_map); + fallthrough; case sa_sd_storage: __sdt_free(cpu_map); fallthrough; @@ -1560,9 +1578,11 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) if (__sdt_alloc(cpu_map)) return sa_sd_storage; + if (__sds_alloc(d, cpu_map)) + return sa_sd_shared; d->sd = alloc_percpu(struct sched_domain *); if (!d->sd) - return sa_sd_storage; + return sa_sd_shared; d->rd = alloc_rootdomain(); if (!d->rd) return sa_sd; @@ -1575,21 +1595,25 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) * sched_group structure so that the subsequent __free_domain_allocs() * will not free the data we're using. 
*/ -static void claim_allocations(int cpu, struct sched_domain *sd) +static void claim_allocations(int cpu, struct s_data *d) { - struct sd_data *sdd = sd->private; + struct sched_domain *sd; + + if (atomic_read(&(*per_cpu_ptr(d->sds, cpu))->ref)) + *per_cpu_ptr(d->sds, cpu) = NULL; - WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); - *per_cpu_ptr(sdd->sd, cpu) = NULL; + for (sd = *per_cpu_ptr(d->sd, cpu); sd; sd = sd->parent) { + struct sd_data *sdd = sd->private; - if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) - *per_cpu_ptr(sdd->sds, cpu) = NULL; + WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); + *per_cpu_ptr(sdd->sd, cpu) = NULL; - if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) - *per_cpu_ptr(sdd->sg, cpu) = NULL; + if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) + *per_cpu_ptr(sdd->sg, cpu) = NULL; - if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) - *per_cpu_ptr(sdd->sgc, cpu) = NULL; + if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) + *per_cpu_ptr(sdd->sgc, cpu) = NULL; + } } #ifdef CONFIG_NUMA @@ -1642,14 +1666,19 @@ sd_init(struct sched_domain_topology_level *tl, struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); int sd_id, sd_weight, sd_flags = 0; struct cpumask *sd_span; + u64 now = sched_clock(); - sd_weight = cpumask_weight(tl->mask(tl, cpu)); + sd_span = sched_domain_span(sd); + cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu)); + sd_weight = cpumask_weight(sd_span); + sd_id = cpumask_first(sd_span); if (tl->sd_flags) sd_flags = (*tl->sd_flags)(); if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, - "wrong sd_flags in topology description\n")) + "wrong sd_flags in topology description\n")) sd_flags &= TOPOLOGY_SD_FLAGS; + sd_flags |= asym_cpu_capacity_classify(sd_span, cpu_map); *sd = (struct sched_domain){ .min_interval = sd_weight, @@ -1679,6 +1708,7 @@ sd_init(struct sched_domain_topology_level *tl, .newidle_call = 512, .newidle_success = 256, .newidle_ratio = 512, + .newidle_stamp = now, .max_newidle_lb_cost = 0, 
.last_decay_max_lb_cost = jiffies, @@ -1686,12 +1716,6 @@ sd_init(struct sched_domain_topology_level *tl, .name = tl->name, }; - sd_span = sched_domain_span(sd); - cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu)); - sd_id = cpumask_first(sd_span); - - sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map); - WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY)) == (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY), "CPU capacity asymmetry not supported on SMT\n"); @@ -1727,16 +1751,6 @@ sd_init(struct sched_domain_topology_level *tl, sd->cache_nice_tries = 1; } - /* - * For all levels sharing cache; connect a sched_domain_shared - * instance. - */ - if (sd->flags & SD_SHARE_LLC) { - sd->shared = *per_cpu_ptr(sdd->sds, sd_id); - atomic_inc(&sd->shared->ref); - atomic_set(&sd->shared->nr_busy_cpus, sd_weight); - } - sd->private = sdd; return sd; @@ -2372,10 +2386,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sdd->sd) return -ENOMEM; - sdd->sds = alloc_percpu(struct sched_domain_shared *); - if (!sdd->sds) - return -ENOMEM; - sdd->sg = alloc_percpu(struct sched_group *); if (!sdd->sg) return -ENOMEM; @@ -2386,7 +2396,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map) for_each_cpu(j, cpu_map) { struct sched_domain *sd; - struct sched_domain_shared *sds; struct sched_group *sg; struct sched_group_capacity *sgc; @@ -2397,13 +2406,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map) *per_cpu_ptr(sdd->sd, j) = sd; - sds = kzalloc_node(sizeof(struct sched_domain_shared), - GFP_KERNEL, cpu_to_node(j)); - if (!sds) - return -ENOMEM; - - *per_cpu_ptr(sdd->sds, j) = sds; - sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); if (!sg) @@ -2445,8 +2447,6 @@ static void __sdt_free(const struct cpumask *cpu_map) kfree(*per_cpu_ptr(sdd->sd, j)); } - if (sdd->sds) - kfree(*per_cpu_ptr(sdd->sds, j)); if (sdd->sg) kfree(*per_cpu_ptr(sdd->sg, j)); if (sdd->sgc) @@ -2454,8 +2454,6 @@ static void 
__sdt_free(const struct cpumask *cpu_map) } free_percpu(sdd->sd); sdd->sd = NULL; - free_percpu(sdd->sds); - sdd->sds = NULL; free_percpu(sdd->sg); sdd->sg = NULL; free_percpu(sdd->sgc); @@ -2463,6 +2461,42 @@ static void __sdt_free(const struct cpumask *cpu_map) } } +static int __sds_alloc(struct s_data *d, const struct cpumask *cpu_map) +{ + int j; + + d->sds = alloc_percpu(struct sched_domain_shared *); + if (!d->sds) + return -ENOMEM; + + for_each_cpu(j, cpu_map) { + struct sched_domain_shared *sds; + + sds = kzalloc_node(sizeof(struct sched_domain_shared), + GFP_KERNEL, cpu_to_node(j)); + if (!sds) + return -ENOMEM; + + *per_cpu_ptr(d->sds, j) = sds; + } + + return 0; +} + +static void __sds_free(struct s_data *d, const struct cpumask *cpu_map) +{ + int j; + + if (!d->sds) + return; + + for_each_cpu(j, cpu_map) + kfree(*per_cpu_ptr(d->sds, j)); + + free_percpu(d->sds); + d->sds = NULL; +} + static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int cpu) @@ -2549,6 +2583,74 @@ static bool topology_span_sane(const struct cpumask *cpu_map) } /* + * Calculate an allowed NUMA imbalance such that LLCs do not get + * imbalanced. + */ +static void adjust_numa_imbalance(struct sched_domain *sd_llc) +{ + struct sched_domain *parent; + unsigned int imb_span = 1; + unsigned int imb = 0; + unsigned int nr_llcs; + + WARN_ON(!(sd_llc->flags & SD_SHARE_LLC)); + WARN_ON(!sd_llc->parent); + + /* + * For a single LLC per node, allow an + * imbalance up to 12.5% of the node. This is + * arbitrary cutoff based two factors -- SMT and + * memory channels. For SMT-2, the intent is to + * avoid premature sharing of HT resources but + * SMT-4 or SMT-8 *may* benefit from a different + * cutoff. For memory channels, this is a very + * rough estimate of how many channels may be + * active and is based on recent CPUs with + * many cores. 
+ * + * For multiple LLCs, allow an imbalance + * until multiple tasks would share an LLC + * on one node while LLCs on another node + * remain idle. This assumes that there are + * enough logical CPUs per LLC to avoid SMT + * factors and that there is a correlation + * between LLCs and memory channels. + */ + nr_llcs = sd_llc->parent->span_weight / sd_llc->span_weight; + if (nr_llcs == 1) + imb = sd_llc->parent->span_weight >> 3; + else + imb = nr_llcs; + + imb = max(1U, imb); + sd_llc->parent->imb_numa_nr = imb; + + /* + * Set span based on the first NUMA domain. + * + * NUMA systems always add a NODE domain before + * iterating the NUMA domains. Since this is before + * degeneration, start from sd_llc's parent's + * parent which is the lowest an SD_NUMA domain can + * be relative to sd_llc. + */ + parent = sd_llc->parent->parent; + while (parent && !(parent->flags & SD_NUMA)) + parent = parent->parent; + + imb_span = parent ? parent->span_weight : sd_llc->parent->span_weight; + + /* Update the upper remainder of the topology */ + parent = sd_llc->parent; + while (parent) { + int factor = max(1U, (parent->span_weight / imb_span)); + + parent->imb_numa_nr = imb * factor; + parent = parent->parent; + } +} + +/* * Build sched domains for a given set of CPUs and attach the sched domains * to the individual CPUs */ @@ -2605,61 +2707,28 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } } - /* - * Calculate an allowed NUMA imbalance such that LLCs do not get - * imbalanced. - */ for_each_cpu(i, cpu_map) { - unsigned int imb = 0; - unsigned int imb_span = 1; + sd = *per_cpu_ptr(d.sd, i); + if (!sd) + continue; - for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { - struct sched_domain *child = sd->child; - - if (!(sd->flags & SD_SHARE_LLC) && child && - (child->flags & SD_SHARE_LLC)) { - struct sched_domain __rcu *top_p; - unsigned int nr_llcs; - - /* - * For a single LLC per node, allow an - * imbalance up to 12.5% of the node. 
This is - * arbitrary cutoff based two factors -- SMT and - * memory channels. For SMT-2, the intent is to - * avoid premature sharing of HT resources but - * SMT-4 or SMT-8 *may* benefit from a different - * cutoff. For memory channels, this is a very - * rough estimate of how many channels may be - * active and is based on recent CPUs with - * many cores. - * - * For multiple LLCs, allow an imbalance - * until multiple tasks would share an LLC - * on one node while LLCs on another node - * remain idle. This assumes that there are - * enough logical CPUs per LLC to avoid SMT - * factors and that there is a correlation - * between LLCs and memory channels. - */ - nr_llcs = sd->span_weight / child->span_weight; - if (nr_llcs == 1) - imb = sd->span_weight >> 3; - else - imb = nr_llcs; - imb = max(1U, imb); - sd->imb_numa_nr = imb; - - /* Set span based on the first NUMA domain. */ - top_p = sd->parent; - while (top_p && !(top_p->flags & SD_NUMA)) { - top_p = top_p->parent; - } - imb_span = top_p ? top_p->span_weight : sd->span_weight; - } else { - int factor = max(1U, (sd->span_weight / imb_span)); + /* First, find the topmost SD_SHARE_LLC domain */ + while (sd->parent && (sd->parent->flags & SD_SHARE_LLC)) + sd = sd->parent; - sd->imb_numa_nr = imb * factor; - } + if (sd->flags & SD_SHARE_LLC) { + int sd_id = cpumask_first(sched_domain_span(sd)); + + sd->shared = *per_cpu_ptr(d.sds, sd_id); + atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight); + atomic_inc(&sd->shared->ref); + + /* + * In presence of higher domains, adjust the + * NUMA imbalance stats for the hierarchy. 
+ */ + if (IS_ENABLED(CONFIG_NUMA) && sd->parent) + adjust_numa_imbalance(sd); } } @@ -2668,10 +2737,10 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att if (!cpumask_test_cpu(i, cpu_map)) continue; - for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { - claim_allocations(i, sd); + claim_allocations(i, &d); + + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) init_sched_groups_capacity(i, sd); - } } /* Attach the domains */ |
