summary | refs | log | tree | commit | diff
path: root/kernel/sched/ext.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/ext.c')
-rw-r--r-- kernel/sched/ext.c 119
1 files changed, 92 insertions, 27 deletions
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 62b1f3ac5630..26a6ac2f8826 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -976,8 +976,12 @@ static bool scx_dsq_priq_less(struct rb_node *node_a,
static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
{
- /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */
- WRITE_ONCE(dsq->nr, dsq->nr + delta);
+ /*
+ * scx_bpf_dsq_nr_queued() reads ->nr without locking. Use READ_ONCE()
+ * on the read side and WRITE_ONCE() on the write side to properly
+ * annotate the concurrent lockless access and avoid KCSAN warnings.
+ */
+ WRITE_ONCE(dsq->nr, READ_ONCE(dsq->nr) + delta);
}
static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p)
@@ -1099,7 +1103,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
}
/* seq records the order tasks are queued, used by BPF DSQ iterator */
- dsq->seq++;
+ WRITE_ONCE(dsq->seq, dsq->seq + 1);
p->scx.dsq_seq = dsq->seq;
dsq_mod_nr(dsq, 1);
@@ -1466,16 +1470,15 @@ static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at)
p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
}
-static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags)
+static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int core_enq_flags)
{
struct scx_sched *sch = scx_root;
int sticky_cpu = p->scx.sticky_cpu;
+ u64 enq_flags = core_enq_flags | rq->scx.extra_enq_flags;
if (enq_flags & ENQUEUE_WAKEUP)
rq->scx.flags |= SCX_RQ_IN_WAKEUP;
- enq_flags |= rq->scx.extra_enq_flags;
-
if (sticky_cpu >= 0)
p->scx.sticky_cpu = -1;
@@ -2460,7 +2463,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
/* see kick_cpus_irq_workfn() */
smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
- rq->next_class = &ext_sched_class;
+ rq_modified_begin(rq, &ext_sched_class);
rq_unpin_lock(rq, rf);
balance_one(rq, prev);
@@ -2475,7 +2478,7 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
* If @force_scx is true, always try to pick a SCHED_EXT task,
* regardless of any higher-priority sched classes activity.
*/
- if (!force_scx && sched_class_above(rq->next_class, &ext_sched_class))
+ if (!force_scx && rq_modified_above(rq, &ext_sched_class))
return RETRY_TASK;
keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
@@ -2735,7 +2738,7 @@ static bool check_rq_for_timeouts(struct rq *rq)
unsigned long last_runnable = p->scx.runnable_at;
if (unlikely(time_after(jiffies,
- last_runnable + scx_watchdog_timeout))) {
+ last_runnable + READ_ONCE(scx_watchdog_timeout)))) {
u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
@@ -2763,7 +2766,7 @@ static void scx_watchdog_workfn(struct work_struct *work)
cond_resched();
}
queue_delayed_work(system_unbound_wq, to_delayed_work(work),
- scx_watchdog_timeout / 2);
+ READ_ONCE(scx_watchdog_timeout) / 2);
}
void scx_tick(struct rq *rq)
@@ -3585,7 +3588,6 @@ static int scx_cgroup_init(struct scx_sched *sch)
ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL,
css->cgroup, &args);
if (ret) {
- css_put(css);
scx_error(sch, "ops.cgroup_init() failed (%d)", ret);
return ret;
}
@@ -3708,7 +3710,9 @@ static void scx_kobj_release(struct kobject *kobj)
static ssize_t scx_attr_ops_show(struct kobject *kobj,
struct kobj_attribute *ka, char *buf)
{
- return sysfs_emit(buf, "%s\n", scx_root->ops.name);
+ struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
+
+ return sysfs_emit(buf, "%s\n", sch->ops.name);
}
SCX_ATTR(ops);
@@ -3752,7 +3756,9 @@ static const struct kobj_type scx_ktype = {
static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
{
- return add_uevent_var(env, "SCXOPS=%s", scx_root->ops.name);
+ const struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
+
+ return add_uevent_var(env, "SCXOPS=%s", sch->ops.name);
}
static const struct kset_uevent_ops scx_uevent_ops = {
@@ -3901,8 +3907,8 @@ static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq,
* consider offloading iff the total queued duration is over the
* threshold.
*/
- min_delta_us = scx_bypass_lb_intv_us / SCX_BYPASS_LB_MIN_DELTA_DIV;
- if (delta < DIV_ROUND_UP(min_delta_us, scx_slice_bypass_us))
+ min_delta_us = READ_ONCE(scx_bypass_lb_intv_us) / SCX_BYPASS_LB_MIN_DELTA_DIV;
+ if (delta < DIV_ROUND_UP(min_delta_us, READ_ONCE(scx_slice_bypass_us)))
return 0;
raw_spin_rq_lock_irq(rq);
@@ -4130,7 +4136,7 @@ static void scx_bypass(bool bypass)
WARN_ON_ONCE(scx_bypass_depth <= 0);
if (scx_bypass_depth != 1)
goto unlock;
- WRITE_ONCE(scx_slice_dfl, scx_slice_bypass_us * NSEC_PER_USEC);
+ WRITE_ONCE(scx_slice_dfl, READ_ONCE(scx_slice_bypass_us) * NSEC_PER_USEC);
bypass_timestamp = ktime_get_ns();
if (sch)
scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
@@ -4423,10 +4429,19 @@ done:
scx_bypass(false);
}
+/*
+ * Claim the exit on @sch. The caller must ensure that the helper kthread work
+ * is kicked before the current task can be preempted. Once exit_kind is
+ * claimed, scx_error() can no longer trigger, so if the current task gets
+ * preempted and the BPF scheduler fails to schedule it back, the helper work
+ * will never be kicked and the whole system can wedge.
+ */
static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind)
{
int none = SCX_EXIT_NONE;
+ lockdep_assert_preemption_disabled();
+
if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind))
return false;
@@ -4449,6 +4464,7 @@ static void scx_disable(enum scx_exit_kind kind)
rcu_read_lock();
sch = rcu_dereference(scx_root);
if (sch) {
+ guard(preempt)();
scx_claim_exit(sch, kind);
kthread_queue_work(sch->helper, &sch->disable_work);
}
@@ -4771,6 +4787,8 @@ static bool scx_vexit(struct scx_sched *sch,
{
struct scx_exit_info *ei = sch->exit_info;
+ guard(preempt)();
+
if (!scx_claim_exit(sch, kind))
return false;
@@ -4955,20 +4973,30 @@ static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops)
return 0;
}
-static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
+/*
+ * scx_enable() is offloaded to a dedicated system-wide RT kthread to avoid
+ * starvation. During the READY -> ENABLED task switching loop, the calling
+ * thread's sched_class gets switched from fair to ext. As fair has higher
+ * priority than ext, the calling thread can be indefinitely starved under
+ * fair-class saturation, leading to a system hang.
+ */
+struct scx_enable_cmd { /* one enable request handed from scx_enable() to scx_enable_workfn() */
+ struct kthread_work work; /* work item queued on the enable helper worker */
+ struct sched_ext_ops *ops; /* ops to enable; read by the work function */
+ int ret; /* result written by the work function, returned by scx_enable() */
+};
+
+static void scx_enable_workfn(struct kthread_work *work)
{
+ struct scx_enable_cmd *cmd =
+ container_of(work, struct scx_enable_cmd, work);
+ struct sched_ext_ops *ops = cmd->ops;
struct scx_sched *sch;
struct scx_task_iter sti;
struct task_struct *p;
unsigned long timeout;
int i, cpu, ret;
- if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
- cpu_possible_mask)) {
- pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
- return -EINVAL;
- }
-
mutex_lock(&scx_enable_mutex);
if (scx_enable_state() != SCX_DISABLED) {
@@ -5060,7 +5088,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
WRITE_ONCE(scx_watchdog_timeout, timeout);
WRITE_ONCE(scx_watchdog_timestamp, jiffies);
queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
- scx_watchdog_timeout / 2);
+ READ_ONCE(scx_watchdog_timeout) / 2);
/*
* Once __scx_enabled is set, %current can be switched to SCX anytime.
@@ -5185,13 +5213,15 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
atomic_long_inc(&scx_enable_seq);
- return 0;
+ cmd->ret = 0;
+ return;
err_free_ksyncs:
free_kick_syncs();
err_unlock:
mutex_unlock(&scx_enable_mutex);
- return ret;
+ cmd->ret = ret;
+ return;
err_disable_unlock_all:
scx_cgroup_unlock();
@@ -5210,7 +5240,42 @@ err_disable:
*/
scx_error(sch, "scx_enable() failed (%d)", ret);
kthread_flush_work(&sch->disable_work);
- return 0;
+ cmd->ret = 0;
+}
+
+static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) /* @link is not referenced in this body */
+{
+ static struct kthread_worker *helper; /* created on first use, then reused by every later call */
+ static DEFINE_MUTEX(helper_mutex); /* serializes creation of @helper */
+ struct scx_enable_cmd cmd; /* lives on this stack frame; see the flush below */
+ /* Reject domain-isolated configurations before doing any other work. */
+ if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
+ cpu_possible_mask)) {
+ pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
+ return -EINVAL;
+ }
+ /* Lazily create the helper: unlocked check first, then re-check under the mutex. */
+ if (!READ_ONCE(helper)) {
+ mutex_lock(&helper_mutex);
+ if (!helper) { /* re-check: another caller may have created it meanwhile */
+ struct kthread_worker *w =
+ kthread_run_worker(0, "scx_enable_helper");
+ if (IS_ERR_OR_NULL(w)) { /* any creation failure is reported as -ENOMEM */
+ mutex_unlock(&helper_mutex);
+ return -ENOMEM; /* @helper stays NULL, so a later call retries */
+ }
+ sched_set_fifo(w->task); /* presumably gives the worker RT FIFO policy - confirm */
+ WRITE_ONCE(helper, w); /* publish only after the worker is fully set up */
+ }
+ mutex_unlock(&helper_mutex);
+ }
+ /* Hand the request to the helper and wait for it to finish. */
+ kthread_init_work(&cmd.work, scx_enable_workfn);
+ cmd.ops = ops;
+ /* cmd.ret is left unset here; the work function is expected to set it. */
+ kthread_queue_work(READ_ONCE(helper), &cmd.work);
+ kthread_flush_work(&cmd.work); /* @cmd is on the stack: must not return before the work is done */
+ return cmd.ret;
}