summaryrefslogtreecommitdiff
path: root/tools/sched_ext/include
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2025-10-27 08:19:40 -1000
committerTejun Heo <tj@kernel.org>2025-10-29 05:29:04 -1000
commita3f5d48222532484c1e85ef27cc6893803e4cd17 (patch)
tree57a7c39bbf6d170ff983fd50233df1a4b01f0409 /tools/sched_ext/include
parent8803e6a7fb687795ab4326f3e96e9f666605d883 (diff)
sched_ext: Allow scx_bpf_reenqueue_local() to be called from anywhere
The ops.cpu_acquire/release() callbacks miss events under multiple conditions. There are two distinct task dispatch gaps that can cause cpu_released flag desynchronization: 1. balance-to-pick_task gap: This is what was originally reported. balance_scx() can enqueue a task, but during consume_remote_task() when the rq lock is released, a higher priority task can be enqueued and ultimately picked while cpu_released remains false. This gap is closeable via RETRY_TASK handling. 2. ttwu-to-pick_task gap: ttwu() can directly dispatch a task to a CPU's local DSQ. By the time the sched path runs on the target CPU, higher class tasks may already be queued. In such cases, nothing on sched_ext side will be invoked, and the only solution would be a hook invoked regardless of sched class, which isn't desirable. Rather than adding invasive core hooks, BPF schedulers can use generic BPF mechanisms like tracepoints. From SCX scheduler's perspective, this is congruent with other mechanisms it already uses and doesn't add further friction. The main use case for cpu_release() was calling scx_bpf_reenqueue_local() when a CPU gets preempted by a higher priority scheduling class. However, the old scx_bpf_reenqueue_local() could only be called from cpu_release() context. Add a new version of scx_bpf_reenqueue_local() that can be called from any context by deferring the actual re-enqueue operation. This eliminates the need for cpu_acquire/release() ops entirely. Schedulers can now use standard BPF mechanisms like the sched_switch tracepoint to detect and handle CPU preemption. Update scx_qmap to demonstrate the new approach using sched_switch instead of cpu_release, with compat support for older kernels. Mark cpu_acquire/release() as deprecated. The old scx_bpf_reenqueue_local() variant will be removed in v6.23. Reported-by: Wen-Fang Liu <liuwenfang@honor.com> Link: https://lore.kernel.org/all/8d64c74118c6440f81bcf5a4ac6b9f00@honor.com/ Cc: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Tejun Heo <tj@kernel.org>
Diffstat (limited to 'tools/sched_ext/include')
-rw-r--r--tools/sched_ext/include/scx/common.bpf.h1
-rw-r--r--tools/sched_ext/include/scx/compat.bpf.h23
2 files changed, 23 insertions, 1 deletions
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index e65b1eb668ea..82a798c3fb22 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -70,7 +70,6 @@ void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __
void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
-u32 scx_bpf_reenqueue_local(void) __ksym;
void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h
index 26bead92fa04..0bfb8abe2a46 100644
--- a/tools/sched_ext/include/scx/compat.bpf.h
+++ b/tools/sched_ext/include/scx/compat.bpf.h
@@ -279,6 +279,29 @@ static inline void scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime)
}
/*
+ * v6.19: The new void variant can be called from anywhere while the older v1
+ * variant can only be called from ops.cpu_release(). The double ___ prefixes on
+ * the v2 variant need to be removed once libbpf is updated to ignore ___ prefix
+ * on kernel side. Drop the wrapper and move the decl to common.bpf.h after
+ * v6.22.
+ */
+u32 scx_bpf_reenqueue_local___v1(void) __ksym __weak;
+void scx_bpf_reenqueue_local___v2___compat(void) __ksym __weak;
+
+static inline bool __COMPAT_scx_bpf_reenqueue_local_from_anywhere(void)
+{
+ return bpf_ksym_exists(scx_bpf_reenqueue_local___v2___compat);
+}
+
+static inline void scx_bpf_reenqueue_local(void)
+{
+ if (__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
+ scx_bpf_reenqueue_local___v2___compat();
+ else
+ scx_bpf_reenqueue_local___v1();
+}
+
+/*
* Define sched_ext_ops. This may be expanded to define multiple variants for
* backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().
*/