Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/core.c          |  80
-rw-r--r--  kernel/sched/ext.c           | 119
-rw-r--r--  kernel/sched/ext_idle.c      |   5
-rw-r--r--  kernel/sched/ext_internal.h  | 116
-rw-r--r--  kernel/sched/fair.c          | 150
-rw-r--r--  kernel/sched/idle.c          |  11
-rw-r--r--  kernel/sched/isolation.c     |   4
-rw-r--r--  kernel/sched/sched.h         |  11
-rw-r--r--  kernel/sched/syscalls.c      |  30
9 files changed, 386 insertions, 140 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 759777694c78..496dff740dca 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4729,8 +4729,11 @@ void sched_cancel_fork(struct task_struct *p)
scx_cancel_fork(p);
}
+static void sched_mm_cid_fork(struct task_struct *t);
+
void sched_post_fork(struct task_struct *p)
{
+ sched_mm_cid_fork(p);
uclamp_post_fork(p);
scx_post_fork(p);
}
@@ -6830,6 +6833,7 @@ static void __sched notrace __schedule(int sched_mode)
/* SCX must consult the BPF scheduler to tell if rq is empty */
if (!rq->nr_running && !scx_enabled()) {
next = prev;
+ rq->next_class = &idle_sched_class;
goto picked;
}
} else if (!preempt && prev_state) {
@@ -10616,13 +10620,10 @@ static inline void mm_cid_transit_to_cpu(struct task_struct *t, struct mm_cid_pc
}
}
-static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
+static void mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
{
/* Remote access to mm::mm_cid::pcpu requires rq_lock */
guard(task_rq_lock)(t);
- /* If the task is not active it is not in the users count */
- if (!t->mm_cid.active)
- return false;
if (cid_on_task(t->mm_cid.cid)) {
/* If running on the CPU, put the CID in transit mode, otherwise drop it */
if (task_rq(t)->curr == t)
@@ -10630,69 +10631,43 @@ static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm
else
mm_unset_cid_on_task(t);
}
- return true;
}
-static void mm_cid_do_fixup_tasks_to_cpus(struct mm_struct *mm)
+static void mm_cid_fixup_tasks_to_cpus(void)
{
- struct task_struct *p, *t;
- unsigned int users;
+ struct mm_struct *mm = current->mm;
+ struct task_struct *t;
- /*
- * This can obviously race with a concurrent affinity change, which
- * increases the number of allowed CPUs for this mm, but that does
- * not affect the mode and only changes the CID constraints. A
- * possible switch back to per task mode happens either in the
- * deferred handler function or in the next fork()/exit().
- *
- * The caller has already transferred. The newly incoming task is
- * already accounted for, but not yet visible.
- */
- users = mm->mm_cid.users - 2;
- if (!users)
- return;
+ lockdep_assert_held(&mm->mm_cid.mutex);
- guard(rcu)();
- for_other_threads(current, t) {
- if (mm_cid_fixup_task_to_cpu(t, mm))
- users--;
+ hlist_for_each_entry(t, &mm->mm_cid.user_list, mm_cid.node) {
+ /* Current has already transferred before invoking the fixup. */
+ if (t != current)
+ mm_cid_fixup_task_to_cpu(t, mm);
}
- if (!users)
- return;
-
- /* Happens only for VM_CLONE processes. */
- for_each_process_thread(p, t) {
- if (t == current || t->mm != mm)
- continue;
- if (mm_cid_fixup_task_to_cpu(t, mm)) {
- if (--users == 0)
- return;
- }
- }
-}
-
-static void mm_cid_fixup_tasks_to_cpus(void)
-{
- struct mm_struct *mm = current->mm;
-
- mm_cid_do_fixup_tasks_to_cpus(mm);
mm_cid_complete_transit(mm, MM_CID_ONCPU);
}
static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
{
+ lockdep_assert_held(&mm->mm_cid.lock);
+
t->mm_cid.active = 1;
+ hlist_add_head(&t->mm_cid.node, &mm->mm_cid.user_list);
mm->mm_cid.users++;
return mm_update_max_cids(mm);
}
-void sched_mm_cid_fork(struct task_struct *t)
+static void sched_mm_cid_fork(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
bool percpu;
- WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
+ if (!mm)
+ return;
+
+ WARN_ON_ONCE(t->mm_cid.cid != MM_CID_UNSET);
guard(mutex)(&mm->mm_cid.mutex);
scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
@@ -10731,12 +10706,13 @@ void sched_mm_cid_fork(struct task_struct *t)
static bool sched_mm_cid_remove_user(struct task_struct *t)
{
+ lockdep_assert_held(&t->mm->mm_cid.lock);
+
t->mm_cid.active = 0;
- scoped_guard(preempt) {
- /* Clear the transition bit */
- t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid);
- mm_unset_cid_on_task(t);
- }
+ /* Clear the transition bit */
+ t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid);
+ mm_unset_cid_on_task(t);
+ hlist_del_init(&t->mm_cid.node);
t->mm->mm_cid.users--;
return mm_update_max_cids(t->mm);
}
@@ -10879,11 +10855,13 @@ void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
mutex_init(&mm->mm_cid.mutex);
mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work);
INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn);
+ INIT_HLIST_HEAD(&mm->mm_cid.user_list);
cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
bitmap_zero(mm_cidmask(mm), num_possible_cpus());
}
#else /* CONFIG_SCHED_MM_CID */
static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { }
+static inline void sched_mm_cid_fork(struct task_struct *t) { }
#endif /* !CONFIG_SCHED_MM_CID */
static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx);
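The core.c hunks above replace the global task-list walks with an hlist of mm_cid users anchored in the mm, so the fixup only visits tasks that actually share the mm. A minimal sketch of that pattern follows; the struct layout and names are simplified illustrations, not the patch's exact types.

struct mm_cid_demo {
	raw_spinlock_t lock;		/* protects user_list and users */
	struct hlist_head user_list;	/* tasks holding an active CID */
	unsigned int users;
};

struct task_cid_demo {
	struct hlist_node node;
	int active;
};

/* Registration mirrors sched_mm_cid_add_user(): runs under mm_cid.lock. */
static void demo_add_user(struct mm_cid_demo *mc, struct task_cid_demo *tc)
{
	lockdep_assert_held(&mc->lock);
	tc->active = 1;
	hlist_add_head(&tc->node, &mc->user_list);
	mc->users++;
}

/* The fixup walks only this mm's users instead of every thread on the box. */
static void demo_fixup_users(struct mm_cid_demo *mc)
{
	struct task_cid_demo *tc;

	hlist_for_each_entry(tc, &mc->user_list, node) {
		/* per-task work, cf. mm_cid_fixup_task_to_cpu() */
	}
}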
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 62b1f3ac5630..26a6ac2f8826 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -976,8 +976,12 @@ static bool scx_dsq_priq_less(struct rb_node *node_a,
static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
{
- /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */
- WRITE_ONCE(dsq->nr, dsq->nr + delta);
+ /*
+ * scx_bpf_dsq_nr_queued() reads ->nr without locking. Use READ_ONCE()
+ * on the read side and WRITE_ONCE() on the write side to properly
+ * annotate the concurrent lockless access and avoid KCSAN warnings.
+ */
+ WRITE_ONCE(dsq->nr, READ_ONCE(dsq->nr) + delta);
}
static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p)
@@ -1099,7 +1103,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
}
/* seq records the order tasks are queued, used by BPF DSQ iterator */
- dsq->seq++;
+ WRITE_ONCE(dsq->seq, dsq->seq + 1);
p->scx.dsq_seq = dsq->seq;
dsq_mod_nr(dsq, 1);
@@ -1466,16 +1470,15 @@ static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at)
p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
}
-static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags)
+static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int core_enq_flags)
{
struct scx_sched *sch = scx_root;
int sticky_cpu = p->scx.sticky_cpu;
+ u64 enq_flags = core_enq_flags | rq->scx.extra_enq_flags;
if (enq_flags & ENQUEUE_WAKEUP)
rq->scx.flags |= SCX_RQ_IN_WAKEUP;
- enq_flags |= rq->scx.extra_enq_flags;
-
if (sticky_cpu >= 0)
p->scx.sticky_cpu = -1;
@@ -2460,7 +2463,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
/* see kick_cpus_irq_workfn() */
smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
- rq->next_class = &ext_sched_class;
+ rq_modified_begin(rq, &ext_sched_class);
rq_unpin_lock(rq, rf);
balance_one(rq, prev);
@@ -2475,7 +2478,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
* If @force_scx is true, always try to pick a SCHED_EXT task,
* regardless of any higher-priority sched classes activity.
*/
- if (!force_scx && sched_class_above(rq->next_class, &ext_sched_class))
+ if (!force_scx && rq_modified_above(rq, &ext_sched_class))
return RETRY_TASK;
keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
@@ -2735,7 +2738,7 @@ static bool check_rq_for_timeouts(struct rq *rq)
unsigned long last_runnable = p->scx.runnable_at;
if (unlikely(time_after(jiffies,
- last_runnable + scx_watchdog_timeout))) {
+ last_runnable + READ_ONCE(scx_watchdog_timeout)))) {
u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
@@ -2763,7 +2766,7 @@ static void scx_watchdog_workfn(struct work_struct *work)
cond_resched();
}
queue_delayed_work(system_unbound_wq, to_delayed_work(work),
- scx_watchdog_timeout / 2);
+ READ_ONCE(scx_watchdog_timeout) / 2);
}
void scx_tick(struct rq *rq)
@@ -3585,7 +3588,6 @@ static int scx_cgroup_init(struct scx_sched *sch)
ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL,
css->cgroup, &args);
if (ret) {
- css_put(css);
scx_error(sch, "ops.cgroup_init() failed (%d)", ret);
return ret;
}
@@ -3708,7 +3710,9 @@ static void scx_kobj_release(struct kobject *kobj)
static ssize_t scx_attr_ops_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
- return sysfs_emit(buf, "%s\n", scx_root->ops.name);
+ struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
+
+ return sysfs_emit(buf, "%s\n", sch->ops.name);
}
SCX_ATTR(ops);
@@ -3752,7 +3756,9 @@ static const struct kobj_type scx_ktype = {
static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
{
- return add_uevent_var(env, "SCXOPS=%s", scx_root->ops.name);
+ const struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
+
+ return add_uevent_var(env, "SCXOPS=%s", sch->ops.name);
}
static const struct kset_uevent_ops scx_uevent_ops = {
@@ -3901,8 +3907,8 @@ static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq,
* consider offloading iff the total queued duration is over the
* threshold.
*/
- min_delta_us = scx_bypass_lb_intv_us / SCX_BYPASS_LB_MIN_DELTA_DIV;
- if (delta < DIV_ROUND_UP(min_delta_us, scx_slice_bypass_us))
+ min_delta_us = READ_ONCE(scx_bypass_lb_intv_us) / SCX_BYPASS_LB_MIN_DELTA_DIV;
+ if (delta < DIV_ROUND_UP(min_delta_us, READ_ONCE(scx_slice_bypass_us)))
return 0;
raw_spin_rq_lock_irq(rq);
@@ -4130,7 +4136,7 @@ static void scx_bypass(bool bypass)
WARN_ON_ONCE(scx_bypass_depth <= 0);
if (scx_bypass_depth != 1)
goto unlock;
- WRITE_ONCE(scx_slice_dfl, scx_slice_bypass_us * NSEC_PER_USEC);
+ WRITE_ONCE(scx_slice_dfl, READ_ONCE(scx_slice_bypass_us) * NSEC_PER_USEC);
bypass_timestamp = ktime_get_ns();
if (sch)
scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
@@ -4423,10 +4429,19 @@ done:
scx_bypass(false);
}
+/*
+ * Claim the exit on @sch. The caller must ensure that the helper kthread work
+ * is kicked before the current task can be preempted. Once exit_kind is
+ * claimed, scx_error() can no longer trigger, so if the current task gets
+ * preempted and the BPF scheduler fails to schedule it back, the helper work
+ * will never be kicked and the whole system can wedge.
+ */
static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind)
{
int none = SCX_EXIT_NONE;
+ lockdep_assert_preemption_disabled();
+
if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind))
return false;
@@ -4449,6 +4464,7 @@ static void scx_disable(enum scx_exit_kind kind)
rcu_read_lock();
sch = rcu_dereference(scx_root);
if (sch) {
+ guard(preempt)();
scx_claim_exit(sch, kind);
kthread_queue_work(sch->helper, &sch->disable_work);
}
@@ -4771,6 +4787,8 @@ static bool scx_vexit(struct scx_sched *sch,
{
struct scx_exit_info *ei = sch->exit_info;
+ guard(preempt)();
+
if (!scx_claim_exit(sch, kind))
return false;
@@ -4955,20 +4973,30 @@ static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops)
return 0;
}
-static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
+/*
+ * scx_enable() is offloaded to a dedicated system-wide RT kthread to avoid
+ * starvation. During the READY -> ENABLED task switching loop, the calling
+ * thread's sched_class gets switched from fair to ext. As fair has higher
+ * priority than ext, the calling thread can be indefinitely starved under
+ * fair-class saturation, leading to a system hang.
+ */
+struct scx_enable_cmd {
+ struct kthread_work work;
+ struct sched_ext_ops *ops;
+ int ret;
+};
+
+static void scx_enable_workfn(struct kthread_work *work)
{
+ struct scx_enable_cmd *cmd =
+ container_of(work, struct scx_enable_cmd, work);
+ struct sched_ext_ops *ops = cmd->ops;
struct scx_sched *sch;
struct scx_task_iter sti;
struct task_struct *p;
unsigned long timeout;
int i, cpu, ret;
- if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
- cpu_possible_mask)) {
- pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
- return -EINVAL;
- }
-
mutex_lock(&scx_enable_mutex);
if (scx_enable_state() != SCX_DISABLED) {
@@ -5060,7 +5088,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
WRITE_ONCE(scx_watchdog_timeout, timeout);
WRITE_ONCE(scx_watchdog_timestamp, jiffies);
queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
- scx_watchdog_timeout / 2);
+ READ_ONCE(scx_watchdog_timeout) / 2);
/*
* Once __scx_enabled is set, %current can be switched to SCX anytime.
@@ -5185,13 +5213,15 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
atomic_long_inc(&scx_enable_seq);
- return 0;
+ cmd->ret = 0;
+ return;
err_free_ksyncs:
free_kick_syncs();
err_unlock:
mutex_unlock(&scx_enable_mutex);
- return ret;
+ cmd->ret = ret;
+ return;
err_disable_unlock_all:
scx_cgroup_unlock();
@@ -5210,7 +5240,42 @@ err_disable:
*/
scx_error(sch, "scx_enable() failed (%d)", ret);
kthread_flush_work(&sch->disable_work);
- return 0;
+ cmd->ret = 0;
+}
+
+static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
+{
+ static struct kthread_worker *helper;
+ static DEFINE_MUTEX(helper_mutex);
+ struct scx_enable_cmd cmd;
+
+ if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
+ cpu_possible_mask)) {
+ pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
+ return -EINVAL;
+ }
+
+ if (!READ_ONCE(helper)) {
+ mutex_lock(&helper_mutex);
+ if (!helper) {
+ struct kthread_worker *w =
+ kthread_run_worker(0, "scx_enable_helper");
+ if (IS_ERR_OR_NULL(w)) {
+ mutex_unlock(&helper_mutex);
+ return -ENOMEM;
+ }
+ sched_set_fifo(w->task);
+ WRITE_ONCE(helper, w);
+ }
+ mutex_unlock(&helper_mutex);
+ }
+
+ kthread_init_work(&cmd.work, scx_enable_workfn);
+ cmd.ops = ops;
+
+ kthread_queue_work(READ_ONCE(helper), &cmd.work);
+ kthread_flush_work(&cmd.work);
+ return cmd.ret;
}
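The tail of the ext.c diff turns scx_enable() into a synchronous offload: the arguments and a return slot live in an on-stack kthread_work, which is queued on a FIFO kthread worker and then flushed. A hedged sketch of the idiom; do_heavy_op() and the other names here are hypothetical, not from the patch.

static int do_heavy_op(int arg);	/* hypothetical: the offloaded operation */

struct offload_cmd {
	struct kthread_work work;
	int arg;
	int ret;
};

static void offload_workfn(struct kthread_work *work)
{
	struct offload_cmd *cmd = container_of(work, struct offload_cmd, work);

	cmd->ret = do_heavy_op(cmd->arg);	/* runs in the RT worker's context */
}

static int offload_call(struct kthread_worker *worker, int arg)
{
	struct offload_cmd cmd = { .arg = arg };

	kthread_init_work(&cmd.work, offload_workfn);
	kthread_queue_work(worker, &cmd.work);
	kthread_flush_work(&cmd.work);	/* safe: cmd stays on our stack until here */
	return cmd.ret;
}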
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index c5a3b0bac7c3..ba298ac3ce6c 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -663,9 +663,8 @@ void scx_idle_init_masks(void)
BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.smt, GFP_KERNEL));
- /* Allocate per-node idle cpumasks */
- scx_idle_node_masks = kzalloc_objs(*scx_idle_node_masks,
- num_possible_nodes());
+ /* Allocate per-node idle cpumasks (use nr_node_ids for non-contiguous NUMA nodes) */
+ scx_idle_node_masks = kzalloc_objs(*scx_idle_node_masks, nr_node_ids);
BUG_ON(!scx_idle_node_masks);
for_each_node(i) {
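The ext_idle.c fix matters because possible NUMA node IDs need not be contiguous: with possible nodes {0, 2}, num_possible_nodes() is 2 but the highest ID is 2, so an array indexed by node ID needs nr_node_ids (3 here) slots. A small illustrative sketch, with alloc_node_mask() standing in for the real per-node allocation:

static struct cpumask **demo_alloc_per_node(void)
{
	/* nr_node_ids == highest possible node ID + 1 */
	struct cpumask **masks = kcalloc(nr_node_ids, sizeof(*masks), GFP_KERNEL);
	int node;

	if (!masks)
		return NULL;
	for_each_node(node)			/* visits only the possible IDs */
		masks[node] = alloc_node_mask(node);	/* hypothetical helper */
	return masks;
}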
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 386c677e4c9a..00b450597f3e 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -74,7 +74,7 @@ enum scx_exit_flags {
* info communication. The following flag indicates whether ops.init()
* finished successfully.
*/
- SCX_EFLAG_INITIALIZED,
+ SCX_EFLAG_INITIALIZED = 1LLU << 0,
};
/*
@@ -1035,26 +1035,108 @@ static const char *scx_enable_state_str[] = {
};
/*
- * sched_ext_entity->ops_state
+ * Task Ownership State Machine (sched_ext_entity->ops_state)
*
- * Used to track the task ownership between the SCX core and the BPF scheduler.
- * State transitions look as follows:
+ * The sched_ext core uses this state machine to track task ownership
+ * between the SCX core and the BPF scheduler. This allows the BPF
+ * scheduler to dispatch tasks without strict ordering requirements, while
+ * the SCX core safely rejects invalid dispatches.
*
- * NONE -> QUEUEING -> QUEUED -> DISPATCHING
- * ^ | |
- * | v v
- * \-------------------------------/
+ * State Transitions
*
- * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
- * sites for explanations on the conditions being waited upon and why they are
- * safe. Transitions out of them into NONE or QUEUED must store_release and the
- * waiters should load_acquire.
+ * .------------> NONE (owned by SCX core)
+ * | | ^
+ * | enqueue | | direct dispatch
+ * | v |
+ * | QUEUEING -------'
+ * | |
+ * | enqueue |
+ * | completes |
+ * | v
+ * | QUEUED (owned by BPF scheduler)
+ * | |
+ * | dispatch |
+ * | |
+ * | v
+ * | DISPATCHING
+ * | |
+ * | dispatch |
+ * | completes |
+ * `---------------'
*
- * Tracking scx_ops_state enables sched_ext core to reliably determine whether
- * any given task can be dispatched by the BPF scheduler at all times and thus
- * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
- * to try to dispatch any task anytime regardless of its state as the SCX core
- * can safely reject invalid dispatches.
+ * State Descriptions
+ *
+ * - %SCX_OPSS_NONE:
+ * Task is owned by the SCX core. It's either on a run queue, running,
+ * or being manipulated by the core scheduler. The BPF scheduler has no
+ * claim on this task.
+ *
+ * - %SCX_OPSS_QUEUEING:
+ * Transitional state while transferring a task from the SCX core to
+ * the BPF scheduler. The task's rq lock is held during this state.
+ * Since QUEUEING is both entered and exited under the rq lock, dequeue
+ * can never observe this state (it would be a BUG). When finishing a
+ * dispatch, if the task is still in %SCX_OPSS_QUEUEING the completion
+ * path busy-waits for it to leave this state (via wait_ops_state())
+ * before retrying.
+ *
+ * - %SCX_OPSS_QUEUED:
+ * Task is owned by the BPF scheduler. It's on a DSQ (dispatch queue)
+ * and the BPF scheduler is responsible for dispatching it. A QSEQ
+ * (queue sequence number) is embedded in this state to detect
+ * dispatch/dequeue races: if a task is dequeued and re-enqueued, the
+ * QSEQ changes and any in-flight dispatch operations targeting the old
+ * QSEQ are safely ignored.
+ *
+ * - %SCX_OPSS_DISPATCHING:
+ * Transitional state while transferring a task from the BPF scheduler
+ * back to the SCX core. This state indicates the BPF scheduler has
+ * selected the task for execution. When dequeue needs to take the task
+ * off a DSQ and it is still in %SCX_OPSS_DISPATCHING, the dequeue path
+ * busy-waits for it to leave this state (via wait_ops_state()) before
+ * proceeding. Exits to %SCX_OPSS_NONE when dispatch completes.
+ *
+ * Memory Ordering
+ *
+ * Transitions out of %SCX_OPSS_QUEUEING and %SCX_OPSS_DISPATCHING into
+ * %SCX_OPSS_NONE or %SCX_OPSS_QUEUED must use atomic_long_set_release()
+ * and waiters must use atomic_long_read_acquire(). This ensures proper
+ * synchronization between concurrent operations.
+ *
+ * Cross-CPU Task Migration
+ *
+ * When moving a task in the %SCX_OPSS_DISPATCHING state, we can't simply
+ * grab the target CPU's rq lock because a concurrent dequeue might be
+ * waiting on %SCX_OPSS_DISPATCHING while holding the source rq lock
+ * (deadlock).
+ *
+ * The sched_ext core uses a "lock dancing" protocol coordinated by
+ * p->scx.holding_cpu. When moving a task to a different rq:
+ *
+ * 1. Verify task can be moved (CPU affinity, migration_disabled, etc.)
+ * 2. Set p->scx.holding_cpu to the current CPU
+ * 3. Set task state to %SCX_OPSS_NONE; dequeue waits while DISPATCHING
+ * is set, so clearing DISPATCHING first prevents the circular wait
+ * (safe to lock the rq we need)
+ * 4. Unlock the current CPU's rq
+ * 5. Lock src_rq (where the task currently lives)
+ * 6. Verify p->scx.holding_cpu == current CPU; if not, dequeue won the
+ * race (dequeue clears holding_cpu to -1 when it takes the task) and
+ * the migration is aborted
+ * 7. If src_rq == dst_rq: clear holding_cpu and enqueue directly
+ * into dst_rq's local DSQ (no lock swap needed)
+ * 8. Otherwise: call move_remote_task_to_local_dsq(), which releases
+ * src_rq, locks dst_rq, and performs the deactivate/activate
+ * migration cycle (dst_rq is held on return)
+ * 9. Unlock dst_rq and re-lock the current CPU's rq to restore
+ * the lock state expected by the caller
+ *
+ * If any verification fails, abort the migration.
+ *
+ * This state tracking allows the BPF scheduler to try to dispatch any task
+ * at any time regardless of its state. The SCX core can safely
+ * reject/ignore invalid dispatches, simplifying the BPF scheduler
+ * implementation.
*/
enum scx_ops_state {
SCX_OPSS_NONE, /* owned by the SCX core */
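A sketch of the release/acquire contract the new comment spells out. This is simplified relative to the real wait_ops_state() in ext.c (which also handles the QSEQ-embedded QUEUED values), but it shows the pairing the "Memory Ordering" section requires:

/* Leaving a transitional state: publish all prior stores first. */
static void demo_finish_transition(atomic_long_t *ops_state, long next)
{
	atomic_long_set_release(ops_state, next);
}

/* Waiting side: spin until the task leaves QUEUEING/DISPATCHING. */
static void demo_wait_stable(atomic_long_t *ops_state)
{
	long s;

	for (;;) {
		s = atomic_long_read_acquire(ops_state);	/* pairs with the release above */
		if (s != SCX_OPSS_QUEUEING && s != SCX_OPSS_DISPATCHING)
			return;
		cpu_relax();
	}
}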
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index eea99ec01a3f..bf948db905ed 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -589,6 +589,21 @@ static inline bool entity_before(const struct sched_entity *a,
return vruntime_cmp(a->deadline, "<", b->deadline);
}
+/*
+ * Per avg_vruntime() below, cfs_rq::zero_vruntime is only slightly stale
+ * and this value should be no more than two lag bounds. Which puts it in the
+ * general order of:
+ *
+ * (slice + TICK_NSEC) << NICE_0_LOAD_SHIFT
+ *
+ * which is around 44 bits in size (on 64bit); that is 20 for
+ * NICE_0_LOAD_SHIFT, another 20 for NSEC_PER_MSEC and then a handful for
+ * however many msec the actual slice+tick ends up being.
+ *
+ * (disregarding the actual divide-by-weight part makes for the worst case
+ * weight of 2, which nicely cancels vs the fuzz in zero_vruntime not actually
+ * being the zero-lag point).
+ */
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
return vruntime_op(se->vruntime, "-", cfs_rq->zero_vruntime);
@@ -676,41 +691,65 @@ sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
static inline
-void sum_w_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
+void update_zero_vruntime(struct cfs_rq *cfs_rq, s64 delta)
{
/*
- * v' = v + d ==> sum_w_vruntime' = sum_runtime - d*sum_weight
+ * v' = v + d ==> sum_w_vruntime' = sum_w_vruntime - d*sum_weight
*/
cfs_rq->sum_w_vruntime -= cfs_rq->sum_weight * delta;
+ cfs_rq->zero_vruntime += delta;
}
/*
- * Specifically: avg_runtime() + 0 must result in entity_eligible() := true
+ * Specifically: avg_vruntime() + 0 must result in entity_eligible() := true
* For this to be so, the result of this function must have a left bias.
+ *
+ * Called in:
+ * - place_entity() -- before enqueue
+ * - update_entity_lag() -- before dequeue
+ * - entity_tick()
+ *
+ * This means it is one entry 'behind' but that puts it close enough to where
+ * the bound on entity_key() is at most two lag bounds.
*/
u64 avg_vruntime(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
- s64 avg = cfs_rq->sum_w_vruntime;
- long load = cfs_rq->sum_weight;
+ long weight = cfs_rq->sum_weight;
+ s64 delta = 0;
- if (curr && curr->on_rq) {
- unsigned long weight = scale_load_down(curr->load.weight);
+ if (curr && !curr->on_rq)
+ curr = NULL;
- avg += entity_key(cfs_rq, curr) * weight;
- load += weight;
- }
+ if (weight) {
+ s64 runtime = cfs_rq->sum_w_vruntime;
+
+ if (curr) {
+ unsigned long w = scale_load_down(curr->load.weight);
+
+ runtime += entity_key(cfs_rq, curr) * w;
+ weight += w;
+ }
- if (load) {
/* sign flips effective floor / ceiling */
- if (avg < 0)
- avg -= (load - 1);
- avg = div_s64(avg, load);
+ if (runtime < 0)
+ runtime -= (weight - 1);
+
+ delta = div_s64(runtime, weight);
+ } else if (curr) {
+ /*
+ * When there is but one element, it is the average.
+ */
+ delta = curr->vruntime - cfs_rq->zero_vruntime;
}
- return cfs_rq->zero_vruntime + avg;
+ update_zero_vruntime(cfs_rq, delta);
+
+ return cfs_rq->zero_vruntime;
}
+static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq);
+
/*
* lag_i = S - s_i = w_i * (V - v_i)
*
@@ -724,17 +763,16 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
* EEVDF gives the following limit for a steady state system:
*
* -r_max < lag < max(r_max, q)
- *
- * XXX could add max_slice to the augmented data to track this.
*/
static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
+ u64 max_slice = cfs_rq_max_slice(cfs_rq) + TICK_NSEC;
s64 vlag, limit;
WARN_ON_ONCE(!se->on_rq);
vlag = avg_vruntime(cfs_rq) - se->vruntime;
- limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
+ limit = calc_delta_fair(max_slice, se);
se->vlag = clamp(vlag, -limit, limit);
}
@@ -777,16 +815,6 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se)
return vruntime_eligible(cfs_rq, se->vruntime);
}
-static void update_zero_vruntime(struct cfs_rq *cfs_rq)
-{
- u64 vruntime = avg_vruntime(cfs_rq);
- s64 delta = vruntime_op(vruntime, "-", cfs_rq->zero_vruntime);
-
- sum_w_vruntime_update(cfs_rq, delta);
-
- cfs_rq->zero_vruntime = vruntime;
-}
-
static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
{
struct sched_entity *root = __pick_root_entity(cfs_rq);
@@ -802,6 +830,21 @@ static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq)
return min_slice;
}
+static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq)
+{
+ struct sched_entity *root = __pick_root_entity(cfs_rq);
+ struct sched_entity *curr = cfs_rq->curr;
+ u64 max_slice = 0ULL;
+
+ if (curr && curr->on_rq)
+ max_slice = curr->slice;
+
+ if (root)
+ max_slice = max(max_slice, root->max_slice);
+
+ return max_slice;
+}
+
static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
{
return entity_before(__node_2_se(a), __node_2_se(b));
@@ -826,6 +869,15 @@ static inline void __min_slice_update(struct sched_entity *se, struct rb_node *n
}
}
+static inline void __max_slice_update(struct sched_entity *se, struct rb_node *node)
+{
+ if (node) {
+ struct sched_entity *rse = __node_2_se(node);
+ if (rse->max_slice > se->max_slice)
+ se->max_slice = rse->max_slice;
+ }
+}
+
/*
* se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime)
*/
@@ -833,6 +885,7 @@ static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
{
u64 old_min_vruntime = se->min_vruntime;
u64 old_min_slice = se->min_slice;
+ u64 old_max_slice = se->max_slice;
struct rb_node *node = &se->run_node;
se->min_vruntime = se->vruntime;
@@ -843,8 +896,13 @@ static inline bool min_vruntime_update(struct sched_entity *se, bool exit)
__min_slice_update(se, node->rb_right);
__min_slice_update(se, node->rb_left);
+ se->max_slice = se->slice;
+ __max_slice_update(se, node->rb_right);
+ __max_slice_update(se, node->rb_left);
+
return se->min_vruntime == old_min_vruntime &&
- se->min_slice == old_min_slice;
+ se->min_slice == old_min_slice &&
+ se->max_slice == old_max_slice;
}
RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
@@ -856,7 +914,6 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity,
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
sum_w_vruntime_add(cfs_rq, se);
- update_zero_vruntime(cfs_rq);
se->min_vruntime = se->vruntime;
se->min_slice = se->slice;
rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
@@ -868,7 +925,6 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
&min_vruntime_cb);
sum_w_vruntime_sub(cfs_rq, se);
- update_zero_vruntime(cfs_rq);
}
struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq)
@@ -3790,6 +3846,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
bool curr = cfs_rq->curr == se;
+ bool rel_vprot = false;
+ u64 vprot;
if (se->on_rq) {
/* commit outstanding execution time */
@@ -3797,6 +3855,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
update_entity_lag(cfs_rq, se);
se->deadline -= se->vruntime;
se->rel_deadline = 1;
+ if (curr && protect_slice(se)) {
+ vprot = se->vprot - se->vruntime;
+ rel_vprot = true;
+ }
+
cfs_rq->nr_queued--;
if (!curr)
__dequeue_entity(cfs_rq, se);
@@ -3812,6 +3875,9 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
if (se->rel_deadline)
se->deadline = div_s64(se->deadline * se->load.weight, weight);
+ if (rel_vprot)
+ vprot = div_s64(vprot * se->load.weight, weight);
+
update_load_set(&se->load, weight);
do {
@@ -3823,6 +3889,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
enqueue_load_avg(cfs_rq, se);
if (se->on_rq) {
place_entity(cfs_rq, se, 0);
+ if (rel_vprot)
+ se->vprot = se->vruntime + vprot;
update_load_add(&cfs_rq->load, se->load.weight);
if (!curr)
__enqueue_entity(cfs_rq, se);
@@ -5420,7 +5488,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
}
static void
-set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
+set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, bool first)
{
clear_buddies(cfs_rq, se);
@@ -5435,7 +5503,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
__dequeue_entity(cfs_rq, se);
update_load_avg(cfs_rq, se, UPDATE_TG);
- set_protect_slice(cfs_rq, se);
+ if (first)
+ set_protect_slice(cfs_rq, se);
}
update_stats_curr_start(cfs_rq, se);
@@ -5524,6 +5593,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
update_load_avg(cfs_rq, curr, UPDATE_TG);
update_cfs_group(curr);
+ /*
+ * Pulls along cfs_rq::zero_vruntime.
+ */
+ avg_vruntime(cfs_rq);
+
#ifdef CONFIG_SCHED_HRTICK
/*
* queued ticks are scheduled to match the slice, so don't bother
@@ -8948,13 +9022,13 @@ again:
pse = parent_entity(pse);
}
if (se_depth >= pse_depth) {
- set_next_entity(cfs_rq_of(se), se);
+ set_next_entity(cfs_rq_of(se), se, true);
se = parent_entity(se);
}
}
put_prev_entity(cfs_rq, pse);
- set_next_entity(cfs_rq, se);
+ set_next_entity(cfs_rq, se, true);
__set_next_task_fair(rq, p, true);
}
@@ -12908,7 +12982,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
t0 = sched_clock_cpu(this_cpu);
__sched_balance_update_blocked_averages(this_rq);
- this_rq->next_class = &fair_sched_class;
+ rq_modified_begin(this_rq, &fair_sched_class);
raw_spin_rq_unlock(this_rq);
for_each_domain(this_cpu, sd) {
@@ -12975,7 +13049,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
pulled_task = 1;
/* If a higher prio class was modified, restart the pick */
- if (sched_class_above(this_rq->next_class, &fair_sched_class))
+ if (rq_modified_above(this_rq, &fair_sched_class))
pulled_task = -1;
out:
@@ -13568,7 +13642,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- set_next_entity(cfs_rq, se);
+ set_next_entity(cfs_rq, se, first);
/* ensure bandwidth has been allocated on our new cfs_rq */
account_cfs_rq_runtime(cfs_rq, 0);
}
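The net effect of the fair.c rework: avg_vruntime() now computes the weighted average as a small offset from cfs_rq::zero_vruntime and immediately folds that offset back into the reference, so entity keys stay within the ~44-bit bound discussed above. A toy model of the update, omitting the left-bias rounding and the special-casing of curr in the real code:

struct toy_cfs_rq {
	long long zero_vruntime;	/* moving reference (approx. zero-lag point) */
	long long sum_w_vruntime;	/* sum of weight * (vruntime - reference)    */
	long sum_weight;		/* total weight of queued entities           */
};

static long long toy_avg_vruntime(struct toy_cfs_rq *q)
{
	if (q->sum_weight) {
		long long delta = q->sum_w_vruntime / q->sum_weight;

		/* v' = v + d  ==>  sum_w_vruntime' = sum_w_vruntime - d*sum_weight */
		q->sum_w_vruntime -= delta * q->sum_weight;
		q->zero_vruntime += delta;
	}
	return q->zero_vruntime;
}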
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 3681b6ad9276..b95449165122 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -221,7 +221,7 @@ static void cpuidle_idle_call(void)
next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns);
call_cpuidle(drv, dev, next_state);
- } else {
+ } else if (drv->state_count > 1) {
bool stop_tick = true;
/*
@@ -239,6 +239,15 @@ static void cpuidle_idle_call(void)
* Give the governor an opportunity to reflect on the outcome
*/
cpuidle_reflect(dev, entered_state);
+ } else {
+ tick_nohz_idle_retain_tick();
+
+ /*
+ * If there is only a single idle state (or none), there is
+ * nothing meaningful for the governor to choose. Skip the
+ * governor and always use state 0.
+ */
+ call_cpuidle(drv, dev, 0);
}
exit_idle:
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 3b725d39c06e..ef152d401fe2 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -123,8 +123,6 @@ int housekeeping_update(struct cpumask *isol_mask)
struct cpumask *trial, *old = NULL;
int err;
- lockdep_assert_cpus_held();
-
trial = kmalloc(cpumask_size(), GFP_KERNEL);
if (!trial)
return -ENOMEM;
@@ -136,7 +134,7 @@ int housekeeping_update(struct cpumask *isol_mask)
}
if (!housekeeping.flags)
- static_branch_enable_cpuslocked(&housekeeping_overridden);
+ static_branch_enable(&housekeeping_overridden);
if (housekeeping.flags & HK_FLAG_DOMAIN)
old = housekeeping_cpumask_dereference(HK_TYPE_DOMAIN);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b82fb70a9d54..43bbf0693cca 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2748,6 +2748,17 @@ static inline const struct sched_class *next_active_class(const struct sched_cla
#define sched_class_above(_a, _b) ((_a) < (_b))
+static inline void rq_modified_begin(struct rq *rq, const struct sched_class *class)
+{
+ if (sched_class_above(rq->next_class, class))
+ rq->next_class = class;
+}
+
+static inline bool rq_modified_above(struct rq *rq, const struct sched_class *class)
+{
+ return sched_class_above(rq->next_class, class);
+}
+
static inline bool sched_stop_runnable(struct rq *rq)
{
return rq->stop && task_on_rq_queued(rq->stop);
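The two new sched.h helpers encapsulate the "record, then recheck" protocol used by the pick/balance paths above: a class that drops the rq lock first records itself as the pick watermark, then restarts the pick if a higher-priority class was modified while the lock was released. An illustrative, non-verbatim flow:

static struct task_struct *demo_pick(struct rq *rq, struct rq_flags *rf,
				     const struct sched_class *class)
{
	rq_modified_begin(rq, class);	/* we are picking at this class level */

	rq_unpin_lock(rq, rf);
	/* ... drop rq->lock, pull/balance, re-acquire rq->lock ... */
	rq_repin_lock(rq, rf);

	/* A higher class got new work while unlocked: restart the pick. */
	if (rq_modified_above(rq, class))
		return RETRY_TASK;

	/* ... pick a task from this class ... */
	return NULL;
}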
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 6f10db3646e7..cadb0e9fe19b 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -284,6 +284,35 @@ static bool check_same_owner(struct task_struct *p)
uid_eq(cred->euid, pcred->uid));
}
+#ifdef CONFIG_RT_MUTEXES
+static inline void __setscheduler_dl_pi(int newprio, int policy,
+ struct task_struct *p,
+ struct sched_change_ctx *scope)
+{
+ /*
+ * In case a DEADLINE task (either proper or boosted) gets
+ * setscheduled to a lower priority class, check if it neeeds to
+ * inherit parameters from a potential pi_task. In that case make
+ * sure replenishment happens with the next enqueue.
+ */
+
+ if (dl_prio(newprio) && !dl_policy(policy)) {
+ struct task_struct *pi_task = rt_mutex_get_top_task(p);
+
+ if (pi_task) {
+ p->dl.pi_se = pi_task->dl.pi_se;
+ scope->flags |= ENQUEUE_REPLENISH;
+ }
+ }
+}
+#else /* !CONFIG_RT_MUTEXES */
+static inline void __setscheduler_dl_pi(int newprio, int policy,
+ struct task_struct *p,
+ struct sched_change_ctx *scope)
+{
+}
+#endif /* !CONFIG_RT_MUTEXES */
+
#ifdef CONFIG_UCLAMP_TASK
static int uclamp_validate(struct task_struct *p,
@@ -655,6 +684,7 @@ change:
__setscheduler_params(p, attr);
p->sched_class = next_class;
p->prio = newprio;
+ __setscheduler_dl_pi(newprio, policy, p, scope);
}
__setscheduler_uclamp(p, attr);