-rw-r--r--  include/linux/sched/ext.h                              |    6
-rw-r--r--  kernel/sched/build_policy.c                            |    1
-rw-r--r--  kernel/sched/core.c                                    |    2
-rw-r--r--  kernel/sched/ext.c                                     | 1556
-rw-r--r--  kernel/sched/ext.h                                     |   25
-rw-r--r--  kernel/sched/ext_idle.c                                |  146
-rw-r--r--  kernel/sched/ext_internal.h                            | 1078
-rw-r--r--  tools/sched_ext/include/scx/bpf_arena_common.bpf.h     |  175
-rw-r--r--  tools/sched_ext/include/scx/bpf_arena_common.h         |   33
-rw-r--r--  tools/sched_ext/include/scx/common.bpf.h               |  104
-rw-r--r--  tools/sched_ext/include/scx/common.h                   |    5
-rw-r--r--  tools/sched_ext/include/scx/compat.bpf.h               |   22
-rw-r--r--  tools/sched_ext/include/scx/user_exit_info.bpf.h       |   40
-rw-r--r--  tools/sched_ext/include/scx/user_exit_info.h           |   49
-rw-r--r--  tools/sched_ext/include/scx/user_exit_info_common.h    |   30
-rw-r--r--  tools/sched_ext/scx_central.bpf.c                      |    2
-rw-r--r--  tools/sched_ext/scx_central.c                          |    1
-rw-r--r--  tools/sched_ext/scx_flatcg.bpf.c                       |    2
-rw-r--r--  tools/sched_ext/scx_flatcg.c                           |    2
-rw-r--r--  tools/sched_ext/scx_qmap.bpf.c                         |   98
-rw-r--r--  tools/sched_ext/scx_qmap.c                             |   12
-rw-r--r--  tools/sched_ext/scx_simple.c                           |    2

22 files changed, 1992 insertions(+), 1399 deletions(-)
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 7047101dbf58..d82b7a9b0658 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -108,7 +108,11 @@ enum scx_kf_mask {
SCX_KF_UNLOCKED = 0, /* sleepable and not rq locked */
/* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */
SCX_KF_CPU_RELEASE = 1 << 0, /* ops.cpu_release() */
- /* ops.dequeue (in REST) may be nested inside DISPATCH */
+ /*
+ * ops.dispatch() may release rq lock temporarily and thus ENQUEUE and
+ * SELECT_CPU may be nested inside. ops.dequeue (in REST) may also be
+ * nested inside DISPATCH.
+ */
SCX_KF_DISPATCH = 1 << 1, /* ops.dispatch() */
SCX_KF_ENQUEUE = 1 << 2, /* ops.enqueue() and ops.select_cpu() */
SCX_KF_SELECT_CPU = 1 << 3, /* ops.select_cpu() */
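
The expanded comment documents that ops.enqueue() and ops.select_cpu() may now nest inside ops.dispatch(). The bit ordering of these masks is what lets scx_kf_allowed() (updated later in this patch) cheaply reject, for example, a dispatch-only kfunc called from such a nested operation. Below is a minimal, self-contained userspace sketch of that check; higher_bits() is modeled after the kernel helper and all names are illustrative, not the in-tree implementation.

#include <stdio.h>

enum {
	SCX_KF_CPU_RELEASE	= 1u << 0,	/* ops.cpu_release() */
	SCX_KF_DISPATCH		= 1u << 1,	/* ops.dispatch() */
	SCX_KF_ENQUEUE		= 1u << 2,	/* ops.enqueue() */
	SCX_KF_SELECT_CPU	= 1u << 3,	/* ops.select_cpu() */
};

/* for a single-bit @flag, the mask of all strictly higher bits */
static unsigned int higher_bits(unsigned int flag)
{
	return ~((flag << 1) - 1);
}

/*
 * A dispatch-class kfunc is allowed only when DISPATCH is set in the
 * current context and nothing above it is, i.e. not from an ENQUEUE or
 * SELECT_CPU operation nested inside ops.dispatch().
 */
static int dispatch_kfunc_allowed(unsigned int cur_kf_mask)
{
	return (cur_kf_mask & SCX_KF_DISPATCH) &&
	       !(cur_kf_mask & higher_bits(SCX_KF_DISPATCH));
}

int main(void)
{
	printf("%d\n", dispatch_kfunc_allowed(SCX_KF_DISPATCH));			/* 1 */
	printf("%d\n", dispatch_kfunc_allowed(SCX_KF_DISPATCH | SCX_KF_ENQUEUE));	/* 0 */
	return 0;
}
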
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index c4a488e67aa7..755883faf751 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -58,6 +58,7 @@
#include "deadline.c"
#ifdef CONFIG_SCHED_CLASS_EXT
+# include "ext_internal.h"
# include "ext.c"
# include "ext_idle.c"
#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ec33b0353027..523a97f28bfc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9362,8 +9362,6 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
cgroup_taskset_for_each(task, css, tset)
sched_move_task(task, false);
-
- scx_cgroup_finish_attach();
}
static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 088ceff38c8a..2b0e88206d07 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -9,1040 +9,6 @@
#include <linux/btf_ids.h>
#include "ext_idle.h"
-#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
-
-enum scx_consts {
- SCX_DSP_DFL_MAX_BATCH = 32,
- SCX_DSP_MAX_LOOPS = 32,
- SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ,
-
- SCX_EXIT_BT_LEN = 64,
- SCX_EXIT_MSG_LEN = 1024,
- SCX_EXIT_DUMP_DFL_LEN = 32768,
-
- SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE,
-
- /*
- * Iterating all tasks may take a while. Periodically drop
- * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
- */
- SCX_TASK_ITER_BATCH = 32,
-};
-
-enum scx_exit_kind {
- SCX_EXIT_NONE,
- SCX_EXIT_DONE,
-
- SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */
- SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */
- SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */
- SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */
-
- SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */
- SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */
- SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */
-};
-
-/*
- * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(),
- * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes
- * are 64bit of the format:
- *
- * Bits: [63 .. 48 47 .. 32 31 .. 0]
- * [ SYS ACT ] [ SYS RSN ] [ USR ]
- *
- * SYS ACT: System-defined exit actions
- * SYS RSN: System-defined exit reasons
- * USR : User-defined exit codes and reasons
- *
- * Using the above, users may communicate intention and context by ORing system
- * actions and/or system reasons with a user-defined exit code.
- */
-enum scx_exit_code {
- /* Reasons */
- SCX_ECODE_RSN_HOTPLUG = 1LLU << 32,
-
- /* Actions */
- SCX_ECODE_ACT_RESTART = 1LLU << 48,
-};
-
-/*
- * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is
- * being disabled.
- */
-struct scx_exit_info {
- /* %SCX_EXIT_* - broad category of the exit reason */
- enum scx_exit_kind kind;
-
- /* exit code if gracefully exiting */
- s64 exit_code;
-
- /* textual representation of the above */
- const char *reason;
-
- /* backtrace if exiting due to an error */
- unsigned long *bt;
- u32 bt_len;
-
- /* informational message */
- char *msg;
-
- /* debug dump */
- char *dump;
-};
-
-/* sched_ext_ops.flags */
-enum scx_ops_flags {
- /*
- * Keep built-in idle tracking even if ops.update_idle() is implemented.
- */
- SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
-
- /*
- * By default, if there are no other tasks to run on the CPU, ext core
- * keeps running the current task even after its slice expires. If this
- * flag is specified, such tasks are passed to ops.enqueue() with
- * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
- */
- SCX_OPS_ENQ_LAST = 1LLU << 1,
-
- /*
- * An exiting task may schedule after PF_EXITING is set. In such cases,
- * bpf_task_from_pid() may not be able to find the task and if the BPF
- * scheduler depends on pid lookup for dispatching, the task will be
- * lost leading to various issues including RCU grace period stalls.
- *
- * To mask this problem, by default, unhashed tasks are automatically
- * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't
- * depend on pid lookups and wants to handle these tasks directly, the
- * following flag can be used.
- */
- SCX_OPS_ENQ_EXITING = 1LLU << 2,
-
- /*
- * If set, only tasks with policy set to SCHED_EXT are attached to
- * sched_ext. If clear, SCHED_NORMAL tasks are also included.
- */
- SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
-
- /*
- * A migration disabled task can only execute on its current CPU. By
- * default, such tasks are automatically put on the CPU's local DSQ with
- * the default slice on enqueue. If this ops flag is set, they also go
- * through ops.enqueue().
- *
- * A migration disabled task never invokes ops.select_cpu() as it can
- * only select the current CPU. Also, p->cpus_ptr will only contain its
- * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
- * and thus may disagree with cpumask_weight(p->cpus_ptr).
- */
- SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
-
- /*
- * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes
- * ops.enqueue() on the ops.select_cpu() selected or the wakee's
- * previous CPU via IPI (inter-processor interrupt) to reduce cacheline
- * transfers. When this optimization is enabled, ops.select_cpu() is
- * skipped in some cases (when racing against the wakee switching out).
- * As the BPF scheduler may depend on ops.select_cpu() being invoked
- * during wakeups, queued wakeup is disabled by default.
- *
- * If this ops flag is set, queued wakeup optimization is enabled and
- * the BPF scheduler must be able to handle ops.enqueue() invoked on the
- * wakee's CPU without preceding ops.select_cpu() even for tasks which
- * may be executed on multiple CPUs.
- */
- SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5,
-
- /*
- * If set, enable per-node idle cpumasks. If clear, use a single global
- * flat idle cpumask.
- */
- SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6,
-
- /*
- * CPU cgroup support flags
- */
- SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */
-
- SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
- SCX_OPS_ENQ_LAST |
- SCX_OPS_ENQ_EXITING |
- SCX_OPS_ENQ_MIGRATION_DISABLED |
- SCX_OPS_ALLOW_QUEUED_WAKEUP |
- SCX_OPS_SWITCH_PARTIAL |
- SCX_OPS_BUILTIN_IDLE_PER_NODE |
- SCX_OPS_HAS_CGROUP_WEIGHT,
-
- /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */
- __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56,
-
- SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56,
-};
-
-/* argument container for ops.init_task() */
-struct scx_init_task_args {
- /*
- * Set if ops.init_task() is being invoked on the fork path, as opposed
- * to the scheduler transition path.
- */
- bool fork;
-#ifdef CONFIG_EXT_GROUP_SCHED
- /* the cgroup the task is joining */
- struct cgroup *cgroup;
-#endif
-};
-
-/* argument container for ops.exit_task() */
-struct scx_exit_task_args {
- /* Whether the task exited before running on sched_ext. */
- bool cancelled;
-};
-
-/* argument container for ops->cgroup_init() */
-struct scx_cgroup_init_args {
- /* the weight of the cgroup [1..10000] */
- u32 weight;
-
- /* bandwidth control parameters from cpu.max and cpu.max.burst */
- u64 bw_period_us;
- u64 bw_quota_us;
- u64 bw_burst_us;
-};
-
-enum scx_cpu_preempt_reason {
- /* next task is being scheduled by &sched_class_rt */
- SCX_CPU_PREEMPT_RT,
- /* next task is being scheduled by &sched_class_dl */
- SCX_CPU_PREEMPT_DL,
- /* next task is being scheduled by &sched_class_stop */
- SCX_CPU_PREEMPT_STOP,
- /* unknown reason for SCX being preempted */
- SCX_CPU_PREEMPT_UNKNOWN,
-};
-
-/*
- * Argument container for ops->cpu_acquire(). Currently empty, but may be
- * expanded in the future.
- */
-struct scx_cpu_acquire_args {};
-
-/* argument container for ops->cpu_release() */
-struct scx_cpu_release_args {
- /* the reason the CPU was preempted */
- enum scx_cpu_preempt_reason reason;
-
- /* the task that's going to be scheduled on the CPU */
- struct task_struct *task;
-};
-
-/*
- * Informational context provided to dump operations.
- */
-struct scx_dump_ctx {
- enum scx_exit_kind kind;
- s64 exit_code;
- const char *reason;
- u64 at_ns;
- u64 at_jiffies;
-};
-
-/**
- * struct sched_ext_ops - Operation table for BPF scheduler implementation
- *
- * A BPF scheduler can implement an arbitrary scheduling policy by
- * implementing and loading operations in this table. Note that a userland
- * scheduling policy can also be implemented using the BPF scheduler
- * as a shim layer.
- */
-struct sched_ext_ops {
- /**
- * @select_cpu: Pick the target CPU for a task which is being woken up
- * @p: task being woken up
- * @prev_cpu: the cpu @p was on before sleeping
- * @wake_flags: SCX_WAKE_*
- *
- * Decision made here isn't final. @p may be moved to any CPU while it
- * is getting dispatched for execution later. However, as @p is not on
- * the rq at this point, getting the eventual execution CPU right here
- * saves a small bit of overhead down the line.
- *
- * If an idle CPU is returned, the CPU is kicked and will try to
- * dispatch. While an explicit custom mechanism can be added,
- * select_cpu() serves as the default way to wake up idle CPUs.
- *
- * @p may be inserted into a DSQ directly by calling
- * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped.
- * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ
- * of the CPU returned by this operation.
- *
- * Note that select_cpu() is never called for tasks that can only run
- * on a single CPU or tasks with migration disabled, as they don't have
- * the option to select a different CPU. See select_task_rq() for
- * details.
- */
- s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);
-
- /**
- * @enqueue: Enqueue a task on the BPF scheduler
- * @p: task being enqueued
- * @enq_flags: %SCX_ENQ_*
- *
- * @p is ready to run. Insert directly into a DSQ by calling
- * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly
- * inserted, the bpf scheduler owns @p and if it fails to dispatch @p,
- * the task will stall.
- *
- * If @p was inserted into a DSQ from ops.select_cpu(), this callback is
- * skipped.
- */
- void (*enqueue)(struct task_struct *p, u64 enq_flags);
-
- /**
- * @dequeue: Remove a task from the BPF scheduler
- * @p: task being dequeued
- * @deq_flags: %SCX_DEQ_*
- *
- * Remove @p from the BPF scheduler. This is usually called to isolate
- * the task while updating its scheduling properties (e.g. priority).
- *
- * The ext core keeps track of whether the BPF side owns a given task or
- * not and can gracefully ignore spurious dispatches from BPF side,
- * which makes it safe to not implement this method. However, depending
- * on the scheduling logic, this can lead to confusing behaviors - e.g.
- * scheduling position not being updated across a priority change.
- */
- void (*dequeue)(struct task_struct *p, u64 deq_flags);
-
- /**
- * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs
- * @cpu: CPU to dispatch tasks for
- * @prev: previous task being switched out
- *
- * Called when a CPU's local dsq is empty. The operation should dispatch
- * one or more tasks from the BPF scheduler into the DSQs using
- * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ
- * using scx_bpf_dsq_move_to_local().
- *
- * The maximum number of times scx_bpf_dsq_insert() can be called
- * without an intervening scx_bpf_dsq_move_to_local() is specified by
- * ops.dispatch_max_batch. See the comments on top of the two functions
- * for more details.
- *
- * When not %NULL, @prev is an SCX task with its slice depleted. If
- * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in
- * @prev->scx.flags, it is not enqueued yet and will be enqueued after
- * ops.dispatch() returns. To keep executing @prev, return without
- * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST.
- */
- void (*dispatch)(s32 cpu, struct task_struct *prev);
-
- /**
- * @tick: Periodic tick
- * @p: task running currently
- *
- * This operation is called every 1/HZ seconds on CPUs which are
- * executing an SCX task. Setting @p->scx.slice to 0 will trigger an
- * immediate dispatch cycle on the CPU.
- */
- void (*tick)(struct task_struct *p);
-
- /**
- * @runnable: A task is becoming runnable on its associated CPU
- * @p: task becoming runnable
- * @enq_flags: %SCX_ENQ_*
- *
- * This and the following three functions can be used to track a task's
- * execution state transitions. A task becomes ->runnable() on a CPU,
- * and then goes through one or more ->running() and ->stopping() pairs
- * as it runs on the CPU, and eventually becomes ->quiescent() when it's
- * done running on the CPU.
- *
- * @p is becoming runnable on the CPU because it's
- *
- * - waking up (%SCX_ENQ_WAKEUP)
- * - being moved from another CPU
- * - being restored after temporarily taken off the queue for an
- * attribute change.
- *
- * This and ->enqueue() are related but not coupled. This operation
- * notifies @p's state transition and may not be followed by ->enqueue()
- * e.g. when @p is being dispatched to a remote CPU, or when @p is
- * being enqueued on a CPU experiencing a hotplug event. Likewise, a
- * task may be ->enqueue()'d without being preceded by this operation
- * e.g. after exhausting its slice.
- */
- void (*runnable)(struct task_struct *p, u64 enq_flags);
-
- /**
- * @running: A task is starting to run on its associated CPU
- * @p: task starting to run
- *
- * Note that this callback may be called from a CPU other than the
- * one the task is going to run on. This can happen when a task
- * property is changed (i.e., affinity), since scx_next_task_scx(),
- * which triggers this callback, may run on a CPU different from
- * the task's assigned CPU.
- *
- * Therefore, always use scx_bpf_task_cpu(@p) to determine the
- * target CPU the task is going to use.
- *
- * See ->runnable() for explanation on the task state notifiers.
- */
- void (*running)(struct task_struct *p);
-
- /**
- * @stopping: A task is stopping execution
- * @p: task stopping to run
- * @runnable: is task @p still runnable?
- *
- * Note that this callback may be called from a CPU other than the
- * one the task was running on. This can happen when a task
- * property is changed (i.e., affinity), since dequeue_task_scx(),
- * which triggers this callback, may run on a CPU different from
- * the task's assigned CPU.
- *
- * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU
- * the task was running on.
- *
- * See ->runnable() for explanation on the task state notifiers. If
- * !@runnable, ->quiescent() will be invoked after this operation
- * returns.
- */
- void (*stopping)(struct task_struct *p, bool runnable);
-
- /**
- * @quiescent: A task is becoming not runnable on its associated CPU
- * @p: task becoming not runnable
- * @deq_flags: %SCX_DEQ_*
- *
- * See ->runnable() for explanation on the task state notifiers.
- *
- * @p is becoming quiescent on the CPU because it's
- *
- * - sleeping (%SCX_DEQ_SLEEP)
- * - being moved to another CPU
- * - being temporarily taken off the queue for an attribute change
- * (%SCX_DEQ_SAVE)
- *
- * This and ->dequeue() are related but not coupled. This operation
- * notifies @p's state transition and may not be preceded by ->dequeue()
- * e.g. when @p is being dispatched to a remote CPU.
- */
- void (*quiescent)(struct task_struct *p, u64 deq_flags);
-
- /**
- * @yield: Yield CPU
- * @from: yielding task
- * @to: optional yield target task
- *
- * If @to is NULL, @from is yielding the CPU to other runnable tasks.
- * The BPF scheduler should ensure that other available tasks are
- * dispatched before the yielding task. Return value is ignored in this
- * case.
- *
- * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf
- * scheduler can implement the request, return %true; otherwise, %false.
- */
- bool (*yield)(struct task_struct *from, struct task_struct *to);
-
- /**
- * @core_sched_before: Task ordering for core-sched
- * @a: task A
- * @b: task B
- *
- * Used by core-sched to determine the ordering between two tasks. See
- * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on
- * core-sched.
- *
- * Both @a and @b are runnable and may or may not currently be queued on
- * the BPF scheduler. Should return %true if @a should run before @b.
- * %false if there's no required ordering or @b should run before @a.
- *
- * If not specified, the default is ordering them according to when they
- * became runnable.
- */
- bool (*core_sched_before)(struct task_struct *a, struct task_struct *b);
-
- /**
- * @set_weight: Set task weight
- * @p: task to set weight for
- * @weight: new weight [1..10000]
- *
- * Update @p's weight to @weight.
- */
- void (*set_weight)(struct task_struct *p, u32 weight);
-
- /**
- * @set_cpumask: Set CPU affinity
- * @p: task to set CPU affinity for
- * @cpumask: cpumask of cpus that @p can run on
- *
- * Update @p's CPU affinity to @cpumask.
- */
- void (*set_cpumask)(struct task_struct *p,
- const struct cpumask *cpumask);
-
- /**
- * @update_idle: Update the idle state of a CPU
- * @cpu: CPU to update the idle state for
- * @idle: whether entering or exiting the idle state
- *
- * This operation is called when @rq's CPU goes or leaves the idle
- * state. By default, implementing this operation disables the built-in
- * idle CPU tracking and the following helpers become unavailable:
- *
- * - scx_bpf_select_cpu_dfl()
- * - scx_bpf_select_cpu_and()
- * - scx_bpf_test_and_clear_cpu_idle()
- * - scx_bpf_pick_idle_cpu()
- *
- * The user also must implement ops.select_cpu() as the default
- * implementation relies on scx_bpf_select_cpu_dfl().
- *
- * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle
- * tracking.
- */
- void (*update_idle)(s32 cpu, bool idle);
-
- /**
- * @cpu_acquire: A CPU is becoming available to the BPF scheduler
- * @cpu: The CPU being acquired by the BPF scheduler.
- * @args: Acquire arguments, see the struct definition.
- *
- * A CPU that was previously released from the BPF scheduler is now once
- * again under its control.
- */
- void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
-
- /**
- * @cpu_release: A CPU is taken away from the BPF scheduler
- * @cpu: The CPU being released by the BPF scheduler.
- * @args: Release arguments, see the struct definition.
- *
- * The specified CPU is no longer under the control of the BPF
- * scheduler. This could be because it was preempted by a higher
- * priority sched_class, though there may be other reasons as well. The
- * caller should consult @args->reason to determine the cause.
- */
- void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
-
- /**
- * @init_task: Initialize a task to run in a BPF scheduler
- * @p: task to initialize for BPF scheduling
- * @args: init arguments, see the struct definition
- *
- * Either we're loading a BPF scheduler or a new task is being forked.
- * Initialize @p for BPF scheduling. This operation may block and can
- * be used for allocations, and is called exactly once for a task.
- *
- * Return 0 for success, -errno for failure. An error return while
- * loading will abort loading of the BPF scheduler. During a fork, it
- * will abort that specific fork.
- */
- s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args);
-
- /**
- * @exit_task: Exit a previously-running task from the system
- * @p: task to exit
- * @args: exit arguments, see the struct definition
- *
- * @p is exiting or the BPF scheduler is being unloaded. Perform any
- * necessary cleanup for @p.
- */
- void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args);
-
- /**
- * @enable: Enable BPF scheduling for a task
- * @p: task to enable BPF scheduling for
- *
- * Enable @p for BPF scheduling. enable() is called on @p any time it
- * enters SCX, and is always paired with a matching disable().
- */
- void (*enable)(struct task_struct *p);
-
- /**
- * @disable: Disable BPF scheduling for a task
- * @p: task to disable BPF scheduling for
- *
- * @p is exiting, leaving SCX or the BPF scheduler is being unloaded.
- * Disable BPF scheduling for @p. A disable() call is always matched
- * with a prior enable() call.
- */
- void (*disable)(struct task_struct *p);
-
- /**
- * @dump: Dump BPF scheduler state on error
- * @ctx: debug dump context
- *
- * Use scx_bpf_dump() to generate BPF scheduler specific debug dump.
- */
- void (*dump)(struct scx_dump_ctx *ctx);
-
- /**
- * @dump_cpu: Dump BPF scheduler state for a CPU on error
- * @ctx: debug dump context
- * @cpu: CPU to generate debug dump for
- * @idle: @cpu is currently idle without any runnable tasks
- *
- * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
- * @cpu. If @idle is %true and this operation doesn't produce any
- * output, @cpu is skipped for dump.
- */
- void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle);
-
- /**
- * @dump_task: Dump BPF scheduler state for a runnable task on error
- * @ctx: debug dump context
- * @p: runnable task to generate debug dump for
- *
- * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
- * @p.
- */
- void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p);
-
-#ifdef CONFIG_EXT_GROUP_SCHED
- /**
- * @cgroup_init: Initialize a cgroup
- * @cgrp: cgroup being initialized
- * @args: init arguments, see the struct definition
- *
- * Either the BPF scheduler is being loaded or @cgrp created, initialize
- * @cgrp for sched_ext. This operation may block.
- *
- * Return 0 for success, -errno for failure. An error return while
- * loading will abort loading of the BPF scheduler. During cgroup
- * creation, it will abort the specific cgroup creation.
- */
- s32 (*cgroup_init)(struct cgroup *cgrp,
- struct scx_cgroup_init_args *args);
-
- /**
- * @cgroup_exit: Exit a cgroup
- * @cgrp: cgroup being exited
- *
- * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit
- * @cgrp for sched_ext. This operation may block.
- */
- void (*cgroup_exit)(struct cgroup *cgrp);
-
- /**
- * @cgroup_prep_move: Prepare a task to be moved to a different cgroup
- * @p: task being moved
- * @from: cgroup @p is being moved from
- * @to: cgroup @p is being moved to
- *
- * Prepare @p for move from cgroup @from to @to. This operation may
- * block and can be used for allocations.
- *
- * Return 0 for success, -errno for failure. An error return aborts the
- * migration.
- */
- s32 (*cgroup_prep_move)(struct task_struct *p,
- struct cgroup *from, struct cgroup *to);
-
- /**
- * @cgroup_move: Commit cgroup move
- * @p: task being moved
- * @from: cgroup @p is being moved from
- * @to: cgroup @p is being moved to
- *
- * Commit the move. @p is dequeued during this operation.
- */
- void (*cgroup_move)(struct task_struct *p,
- struct cgroup *from, struct cgroup *to);
-
- /**
- * @cgroup_cancel_move: Cancel cgroup move
- * @p: task whose cgroup move is being canceled
- * @from: cgroup @p was being moved from
- * @to: cgroup @p was being moved to
- *
- * @p was cgroup_prep_move()'d but failed before reaching cgroup_move().
- * Undo the preparation.
- */
- void (*cgroup_cancel_move)(struct task_struct *p,
- struct cgroup *from, struct cgroup *to);
-
- /**
- * @cgroup_set_weight: A cgroup's weight is being changed
- * @cgrp: cgroup whose weight is being updated
- * @weight: new weight [1..10000]
- *
- * Update @cgrp's weight to @weight.
- */
- void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
-
- /**
- * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed
- * @cgrp: cgroup whose bandwidth is being updated
- * @period_us: bandwidth control period
- * @quota_us: bandwidth control quota
- * @burst_us: bandwidth control burst
- *
- * Update @cgrp's bandwidth control parameters. This is from the cpu.max
- * cgroup interface.
- *
- * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled
- * to. For example, if @period_us is 1_000_000 and @quota_us is
- * 2_500_000, @cgrp is entitled to 2.5 CPUs. @burst_us can be
- * interpreted in the same fashion and specifies how much @cgrp can
- * burst temporarily. The specific control mechanism and thus the
- * interpretation of @period_us and burstiness is up to the BPF
- * scheduler.
- */
- void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
- u64 period_us, u64 quota_us, u64 burst_us);
-
-#endif /* CONFIG_EXT_GROUP_SCHED */
-
- /*
- * All online ops must come before ops.cpu_online().
- */
-
- /**
- * @cpu_online: A CPU became online
- * @cpu: CPU which just came up
- *
- * @cpu just came online. @cpu will not call ops.enqueue() or
- * ops.dispatch(), nor run tasks associated with other CPUs beforehand.
- */
- void (*cpu_online)(s32 cpu);
-
- /**
- * @cpu_offline: A CPU is going offline
- * @cpu: CPU which is going offline
- *
- * @cpu is going offline. @cpu will not call ops.enqueue() or
- * ops.dispatch(), nor run tasks associated with other CPUs afterwards.
- */
- void (*cpu_offline)(s32 cpu);
-
- /*
- * All CPU hotplug ops must come before ops.init().
- */
-
- /**
- * @init: Initialize the BPF scheduler
- */
- s32 (*init)(void);
-
- /**
- * @exit: Clean up after the BPF scheduler
- * @info: Exit info
- *
- * ops.exit() is also called on ops.init() failure, which is a bit
- * unusual. This is to allow rich reporting through @info on how
- * ops.init() failed.
- */
- void (*exit)(struct scx_exit_info *info);
-
- /**
- * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch
- */
- u32 dispatch_max_batch;
-
- /**
- * @flags: %SCX_OPS_* flags
- */
- u64 flags;
-
- /**
- * @timeout_ms: The maximum amount of time, in milliseconds, that a
- * runnable task should be able to wait before being scheduled. The
- * maximum timeout may not exceed the default timeout of 30 seconds.
- *
- * Defaults to the maximum allowed timeout value of 30 seconds.
- */
- u32 timeout_ms;
-
- /**
- * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default
- * value of 32768 is used.
- */
- u32 exit_dump_len;
-
- /**
- * @hotplug_seq: A sequence number that may be set by the scheduler to
- * detect when a hotplug event has occurred during the loading process.
- * If 0, no detection occurs. Otherwise, the scheduler will fail to
- * load if the sequence number does not match @scx_hotplug_seq on the
- * enable path.
- */
- u64 hotplug_seq;
-
- /**
- * @name: BPF scheduler's name
- *
- * Must be a non-zero valid BPF object name including only isalnum(),
- * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the
- * BPF scheduler is enabled.
- */
- char name[SCX_OPS_NAME_LEN];
-
- /* internal use only, must be NULL */
- void *priv;
-};
-
-enum scx_opi {
- SCX_OPI_BEGIN = 0,
- SCX_OPI_NORMAL_BEGIN = 0,
- SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online),
- SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online),
- SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init),
- SCX_OPI_END = SCX_OP_IDX(init),
-};
-
-/*
- * Collection of event counters. Event types are placed in descending order.
- */
-struct scx_event_stats {
- /*
- * If ops.select_cpu() returns a CPU which can't be used by the task,
- * the core scheduler code silently picks a fallback CPU.
- */
- s64 SCX_EV_SELECT_CPU_FALLBACK;
-
- /*
- * When dispatching to a local DSQ, the CPU may have gone offline in
- * the meantime. In this case, the task is bounced to the global DSQ.
- */
- s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
-
- /*
- * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
- * continued to run because there were no other tasks on the CPU.
- */
- s64 SCX_EV_DISPATCH_KEEP_LAST;
-
- /*
- * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
- * is dispatched to a local DSQ when exiting.
- */
- s64 SCX_EV_ENQ_SKIP_EXITING;
-
- /*
- * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
- * migration disabled task skips ops.enqueue() and is dispatched to its
- * local DSQ.
- */
- s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
-
- /*
- * Total number of times a task's time slice was refilled with the
- * default value (SCX_SLICE_DFL).
- */
- s64 SCX_EV_REFILL_SLICE_DFL;
-
- /*
- * The total duration of bypass modes in nanoseconds.
- */
- s64 SCX_EV_BYPASS_DURATION;
-
- /*
- * The number of tasks dispatched in the bypassing mode.
- */
- s64 SCX_EV_BYPASS_DISPATCH;
-
- /*
- * The number of times the bypassing mode has been activated.
- */
- s64 SCX_EV_BYPASS_ACTIVATE;
-};
-
-struct scx_sched {
- struct sched_ext_ops ops;
- DECLARE_BITMAP(has_op, SCX_OPI_END);
-
- /*
- * Dispatch queues.
- *
- * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability.
- * This is to avoid live-locking in bypass mode where all tasks are
- * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If
- * per-node split isn't sufficient, it can be further split.
- */
- struct rhashtable dsq_hash;
- struct scx_dispatch_q **global_dsqs;
-
- /*
- * The event counters are in a per-CPU variable to minimize the
- * accounting overhead. A system-wide view on the event counter is
- * constructed when requested by scx_bpf_events().
- */
- struct scx_event_stats __percpu *event_stats_cpu;
-
- bool warned_zero_slice;
-
- atomic_t exit_kind;
- struct scx_exit_info *exit_info;
-
- struct kobject kobj;
-
- struct kthread_worker *helper;
- struct irq_work error_irq_work;
- struct kthread_work disable_work;
- struct rcu_work rcu_work;
-};
-
-enum scx_wake_flags {
- /* expose select WF_* flags as enums */
- SCX_WAKE_FORK = WF_FORK,
- SCX_WAKE_TTWU = WF_TTWU,
- SCX_WAKE_SYNC = WF_SYNC,
-};
-
-enum scx_enq_flags {
- /* expose select ENQUEUE_* flags as enums */
- SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP,
- SCX_ENQ_HEAD = ENQUEUE_HEAD,
- SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED,
-
- /* high 32bits are SCX specific */
-
- /*
- * Set the following to trigger preemption when calling
- * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the
- * current task is cleared to zero and the CPU is kicked into the
- * scheduling path. Implies %SCX_ENQ_HEAD.
- */
- SCX_ENQ_PREEMPT = 1LLU << 32,
-
- /*
- * The task being enqueued was previously enqueued on the current CPU's
- * %SCX_DSQ_LOCAL, but was removed from it in a call to the
- * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was
- * invoked in a ->cpu_release() callback, and the task is again
- * dispatched back to %SCX_DSQ_LOCAL by this current ->enqueue(), the
- * task will not be scheduled on the CPU until at least the next invocation
- * of the ->cpu_acquire() callback.
- */
- SCX_ENQ_REENQ = 1LLU << 40,
-
- /*
- * The task being enqueued is the only task available for the cpu. By
- * default, ext core keeps executing such tasks but when
- * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the
- * %SCX_ENQ_LAST flag set.
- *
- * The BPF scheduler is responsible for triggering a follow-up
- * scheduling event. Otherwise, execution may stall.
- */
- SCX_ENQ_LAST = 1LLU << 41,
-
- /* high 8 bits are internal */
- __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56,
-
- SCX_ENQ_CLEAR_OPSS = 1LLU << 56,
- SCX_ENQ_DSQ_PRIQ = 1LLU << 57,
-};
-
-enum scx_deq_flags {
- /* expose select DEQUEUE_* flags as enums */
- SCX_DEQ_SLEEP = DEQUEUE_SLEEP,
-
- /* high 32bits are SCX specific */
-
- /*
- * The generic core-sched layer decided to execute the task even though
- * it hasn't been dispatched yet. Dequeue from the BPF side.
- */
- SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32,
-};
-
-enum scx_pick_idle_cpu_flags {
- SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */
- SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */
-};
-
-enum scx_kick_flags {
- /*
- * Kick the target CPU if idle. Guarantees that the target CPU goes
- * through at least one full scheduling cycle before going idle. If the
- * target CPU can be determined to be currently not idle and going to go
- * through a scheduling cycle before going idle, noop.
- */
- SCX_KICK_IDLE = 1LLU << 0,
-
- /*
- * Preempt the current task and execute the dispatch path. If the
- * current task of the target CPU is an SCX task, its ->scx.slice is
- * cleared to zero before the scheduling path is invoked so that the
- * task expires and the dispatch path is invoked.
- */
- SCX_KICK_PREEMPT = 1LLU << 1,
-
- /*
- * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will
- * return after the target CPU finishes picking the next task.
- */
- SCX_KICK_WAIT = 1LLU << 2,
-};
-
-enum scx_tg_flags {
- SCX_TG_ONLINE = 1U << 0,
- SCX_TG_INITED = 1U << 1,
-};
-
-enum scx_enable_state {
- SCX_ENABLING,
- SCX_ENABLED,
- SCX_DISABLING,
- SCX_DISABLED,
-};
-
-static const char *scx_enable_state_str[] = {
- [SCX_ENABLING] = "enabling",
- [SCX_ENABLED] = "enabled",
- [SCX_DISABLING] = "disabling",
- [SCX_DISABLED] = "disabled",
-};
-
-/*
- * sched_ext_entity->ops_state
- *
- * Used to track the task ownership between the SCX core and the BPF scheduler.
- * State transitions look as follows:
- *
- * NONE -> QUEUEING -> QUEUED -> DISPATCHING
- * ^ | |
- * | v v
- * \-------------------------------/
- *
- * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
- * sites for explanations on the conditions being waited upon and why they are
- * safe. Transitions out of them into NONE or QUEUED must store_release and the
- * waiters should load_acquire.
- *
- * Tracking scx_ops_state enables sched_ext core to reliably determine whether
- * any given task can be dispatched by the BPF scheduler at all times and thus
- * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
- * to try to dispatch any task anytime regardless of its state as the SCX core
- * can safely reject invalid dispatches.
- */
-enum scx_ops_state {
- SCX_OPSS_NONE, /* owned by the SCX core */
- SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */
- SCX_OPSS_QUEUED, /* owned by the BPF scheduler */
- SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */
-
- /*
- * QSEQ brands each QUEUED instance so that, when dispatch races
- * dequeue/requeue, the dispatcher can tell whether it still has a claim
- * on the task being dispatched.
- *
- * As some 32bit archs can't do 64bit store_release/load_acquire,
- * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on
- * 32bit machines. The dispatch race window QSEQ protects is very narrow
- * and runs with IRQ disabled. 30 bits should be sufficient.
- */
- SCX_OPSS_QSEQ_SHIFT = 2,
-};
-
-/* Use macros to ensure that the type is unsigned long for the masks */
-#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
-#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK)
-
/*
* NOTE: sched_ext is in the process of growing multiple scheduler support and
* scx_root usage is in a transitional state. Naked dereferences are safe if the
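
The hunk above moves roughly a thousand lines of definitions out of ext.c; per the diffstat and the build_policy.c hunk, they now appear to live in kernel/sched/ext_internal.h. Since the removed block is dominated by the sched_ext_ops documentation, a compressed sketch of how those callbacks are typically wired together from the BPF side may help orient the reader. This is illustrative only: the DSQ id and all "sketch_" names are made up, while the helpers are the ones the ops comments refer to (scx_bpf_create_dsq(), scx_bpf_dsq_insert(), scx_bpf_dsq_move_to_local()).

#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

#define SHARED_DSQ	0	/* arbitrary user DSQ id, illustrative */

s32 BPF_STRUCT_OPS_SLEEPABLE(sketch_init)
{
	/* create the shared DSQ; -1 = any NUMA node */
	return scx_bpf_create_dsq(SHARED_DSQ, -1);
}

void BPF_STRUCT_OPS(sketch_enqueue, struct task_struct *p, u64 enq_flags)
{
	/* hand @p to the shared DSQ with the default slice */
	scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
}

void BPF_STRUCT_OPS(sketch_dispatch, s32 cpu, struct task_struct *prev)
{
	/* refill this CPU's local DSQ from the shared DSQ when it runs dry */
	scx_bpf_dsq_move_to_local(SHARED_DSQ);
}

SCX_OPS_DEFINE(sketch_ops,
	       .enqueue		= (void *)sketch_enqueue,
	       .dispatch	= (void *)sketch_dispatch,
	       .init		= (void *)sketch_init,
	       .name		= "sketch");
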
@@ -1170,7 +136,7 @@ static struct kset *scx_kset;
#include <trace/events/sched_ext.h>
static void process_ddsp_deferred_locals(struct rq *rq);
-static void scx_bpf_kick_cpu(s32 cpu, u64 flags);
+static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags);
static void scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
s64 exit_code, const char *fmt, va_list args);
@@ -1185,24 +151,7 @@ static __printf(4, 5) void scx_exit(struct scx_sched *sch,
va_end(args);
}
-static __printf(3, 4) void scx_kf_exit(enum scx_exit_kind kind, s64 exit_code,
- const char *fmt, ...)
-{
- struct scx_sched *sch;
- va_list args;
-
- rcu_read_lock();
- sch = rcu_dereference(scx_root);
- if (sch) {
- va_start(args, fmt);
- scx_vexit(sch, kind, exit_code, fmt, args);
- va_end(args);
- }
- rcu_read_unlock();
-}
-
#define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args)
-#define scx_kf_error(fmt, args...) scx_kf_exit(SCX_EXIT_ERROR, 0, fmt, ##args)
#define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op)
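
With scx_kf_exit()/scx_kf_error() gone, errors are always reported against an explicit scx_sched. On the BPF side, graceful exits still go through scx_bpf_exit(), whose code argument follows the bit layout documented in the removed scx_exit_code comment (system actions in bits 63..48, system reasons in 47..32, user code in 31..0). A hedged example of composing such a code; MY_ECODE_CPU_HOTPLUG is a scheduler-private value, not a kernel constant.

enum { MY_ECODE_CPU_HOTPLUG = 1 };	/* illustrative user-defined code */

void BPF_STRUCT_OPS(sketch_cpu_online, s32 cpu)
{
	/* ask the userspace loader to restart the scheduler after a hotplug event */
	scx_bpf_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG |
		     MY_ECODE_CPU_HOTPLUG,
		     "cpu %d came online, exiting for restart", cpu);
}
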
@@ -1232,10 +181,9 @@ static bool u32_before(u32 a, u32 b)
return (s32)(a - b) < 0;
}
-static struct scx_dispatch_q *find_global_dsq(struct task_struct *p)
+static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch,
+ struct task_struct *p)
{
- struct scx_sched *sch = scx_root;
-
return sch->global_dsqs[cpu_to_node(task_cpu(p))];
}
@@ -1363,11 +311,11 @@ do { \
})
/* @mask is constant, always inline to cull unnecessary branches */
-static __always_inline bool scx_kf_allowed(u32 mask)
+static __always_inline bool scx_kf_allowed(struct scx_sched *sch, u32 mask)
{
if (unlikely(!(current->scx.kf_mask & mask))) {
- scx_kf_error("kfunc with mask 0x%x called from an operation only allowing 0x%x",
- mask, current->scx.kf_mask);
+ scx_error(sch, "kfunc with mask 0x%x called from an operation only allowing 0x%x",
+ mask, current->scx.kf_mask);
return false;
}
@@ -1380,13 +328,13 @@ static __always_inline bool scx_kf_allowed(u32 mask)
*/
if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE &&
(current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) {
- scx_kf_error("cpu_release kfunc called from a nested operation");
+ scx_error(sch, "cpu_release kfunc called from a nested operation");
return false;
}
if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH &&
(current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) {
- scx_kf_error("dispatch kfunc called from a nested operation");
+ scx_error(sch, "dispatch kfunc called from a nested operation");
return false;
}
@@ -1394,15 +342,16 @@ static __always_inline bool scx_kf_allowed(u32 mask)
}
/* see SCX_CALL_OP_TASK() */
-static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask,
+static __always_inline bool scx_kf_allowed_on_arg_tasks(struct scx_sched *sch,
+ u32 mask,
struct task_struct *p)
{
- if (!scx_kf_allowed(mask))
+ if (!scx_kf_allowed(sch, mask))
return false;
if (unlikely((p != current->scx.kf_tasks[0] &&
p != current->scx.kf_tasks[1]))) {
- scx_kf_error("called on a task not being operated on");
+ scx_error(sch, "called on a task not being operated on");
return false;
}
@@ -1488,10 +437,11 @@ struct bpf_iter_scx_dsq {
*/
struct scx_task_iter {
struct sched_ext_entity cursor;
- struct task_struct *locked;
+ struct task_struct *locked_task;
struct rq *rq;
struct rq_flags rf;
u32 cnt;
+ bool list_locked;
};
/**
@@ -1519,15 +469,16 @@ static void scx_task_iter_start(struct scx_task_iter *iter)
iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
list_add(&iter->cursor.tasks_node, &scx_tasks);
- iter->locked = NULL;
+ iter->locked_task = NULL;
iter->cnt = 0;
+ iter->list_locked = true;
}
static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
{
- if (iter->locked) {
- task_rq_unlock(iter->rq, iter->locked, &iter->rf);
- iter->locked = NULL;
+ if (iter->locked_task) {
+ task_rq_unlock(iter->rq, iter->locked_task, &iter->rf);
+ iter->locked_task = NULL;
}
}
@@ -1537,24 +488,24 @@ static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter)
*
* If @iter is in the middle of a locked iteration, it may be locking the rq of
* the task currently being visited in addition to scx_tasks_lock. Unlock both.
- * This function can be safely called anytime during an iteration.
+ * This function can be safely called anytime during an iteration. The next
+ * iterator operation will automatically restore the necessary locking.
*/
static void scx_task_iter_unlock(struct scx_task_iter *iter)
{
__scx_task_iter_rq_unlock(iter);
- spin_unlock_irq(&scx_tasks_lock);
+ if (iter->list_locked) {
+ iter->list_locked = false;
+ spin_unlock_irq(&scx_tasks_lock);
+ }
}
-/**
- * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock()
- * @iter: iterator to re-lock
- *
- * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it
- * doesn't re-lock the rq lock. Must be called before other iterator operations.
- */
-static void scx_task_iter_relock(struct scx_task_iter *iter)
+static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter)
{
- spin_lock_irq(&scx_tasks_lock);
+ if (!iter->list_locked) {
+ spin_lock_irq(&scx_tasks_lock);
+ iter->list_locked = true;
+ }
}
/**
@@ -1567,6 +518,7 @@ static void scx_task_iter_relock(struct scx_task_iter *iter)
*/
static void scx_task_iter_stop(struct scx_task_iter *iter)
{
+ __scx_task_iter_maybe_relock(iter);
list_del_init(&iter->cursor.tasks_node);
scx_task_iter_unlock(iter);
}
@@ -1584,10 +536,12 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
struct list_head *cursor = &iter->cursor.tasks_node;
struct sched_ext_entity *pos;
+ __scx_task_iter_maybe_relock(iter);
+
if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) {
scx_task_iter_unlock(iter);
cond_resched();
- scx_task_iter_relock(iter);
+ __scx_task_iter_maybe_relock(iter);
}
list_for_each_entry(pos, cursor, tasks_node) {
@@ -1650,7 +604,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
return NULL;
iter->rq = task_rq_lock(p, &iter->rf);
- iter->locked = p;
+ iter->locked_task = p;
return p;
}
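
The iterator rework above folds the old scx_task_iter_relock() into the iterator itself: dropping the locks with scx_task_iter_unlock() is now safe at any point, and the next iterator operation reacquires scx_tasks_lock as needed. The calling pattern stays what scx_enable() uses later in this patch, roughly as sketched below (do_something() is a placeholder):

	struct scx_task_iter sti;
	struct task_struct *p;

	scx_task_iter_start(&sti);
	while ((p = scx_task_iter_next_locked(&sti))) {
		/*
		 * @p's rq is locked here. Every SCX_TASK_ITER_BATCH tasks the
		 * iterator drops and retakes scx_tasks_lock on its own to
		 * avoid e.g. CSD and RCU stalls.
		 */
		do_something(p);
	}
	scx_task_iter_stop(&sti);
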
@@ -1664,7 +618,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
* This can be used when preemption is not disabled.
*/
#define scx_add_event(sch, name, cnt) do { \
- this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \
+ this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \
trace_sched_ext_event(#name, (cnt)); \
} while(0)
@@ -1677,7 +631,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
* This should be used only when preemption is disabled.
*/
#define __scx_add_event(sch, name, cnt) do { \
- __this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \
+ __this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \
trace_sched_ext_event(#name, cnt); \
} while(0)
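
The per-CPU event counters move from sch->event_stats_cpu into the new sch->pcpu container, but the aggregation model is unchanged: a system-wide view is built on demand. From a BPF scheduler that aggregate is read with scx_bpf_events(), roughly as in the hedged sketch below; the field used is one of the SCX_EV_* counters from the struct removed earlier in this patch.

	struct scx_event_stats events;

	scx_bpf_events(&events, sizeof(events));
	bpf_printk("default slice refills: %lld",
		   events.SCX_EV_REFILL_SLICE_DFL);
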
@@ -1766,23 +720,6 @@ static bool ops_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where)
}
/**
- * kf_cpu_valid - Verify a CPU number, to be used on kfunc input args
- * @cpu: cpu number which came from a BPF ops
- * @where: extra information reported on error
- *
- * The same as ops_cpu_valid() but @sch is implicit.
- */
-static bool kf_cpu_valid(u32 cpu, const char *where)
-{
- if (__cpu_valid(cpu)) {
- return true;
- } else {
- scx_kf_error("invalid CPU %d%s%s", cpu, where ? " " : "", where ?: "");
- return false;
- }
-}
-
-/**
* ops_sanitize_err - Sanitize a -errno value
* @sch: scx_sched to error out on error
* @ops_name: operation to blame on failure
@@ -1942,10 +879,10 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
WRITE_ONCE(dsq->nr, dsq->nr + delta);
}
-static void refill_task_slice_dfl(struct task_struct *p)
+static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p)
{
p->scx.slice = SCX_SLICE_DFL;
- __scx_add_event(scx_root, SCX_EV_REFILL_SLICE_DFL, 1);
+ __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1);
}
static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
@@ -1963,7 +900,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
scx_error(sch, "attempting to dispatch to a destroyed dsq");
/* fall back to the global dsq */
raw_spin_unlock(&dsq->lock);
- dsq = find_global_dsq(p);
+ dsq = find_global_dsq(sch, p);
raw_spin_lock(&dsq->lock);
}
}
@@ -2142,26 +1079,27 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch,
s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
- return find_global_dsq(p);
+ return find_global_dsq(sch, p);
return &cpu_rq(cpu)->scx.local_dsq;
}
if (dsq_id == SCX_DSQ_GLOBAL)
- dsq = find_global_dsq(p);
+ dsq = find_global_dsq(sch, p);
else
dsq = find_user_dsq(sch, dsq_id);
if (unlikely(!dsq)) {
scx_error(sch, "non-existent DSQ 0x%llx for %s[%d]",
dsq_id, p->comm, p->pid);
- return find_global_dsq(p);
+ return find_global_dsq(sch, p);
}
return dsq;
}
-static void mark_direct_dispatch(struct task_struct *ddsp_task,
+static void mark_direct_dispatch(struct scx_sched *sch,
+ struct task_struct *ddsp_task,
struct task_struct *p, u64 dsq_id,
u64 enq_flags)
{
@@ -2175,10 +1113,10 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task,
/* @p must match the task on the enqueue path */
if (unlikely(p != ddsp_task)) {
if (IS_ERR(ddsp_task))
- scx_kf_error("%s[%d] already direct-dispatched",
+ scx_error(sch, "%s[%d] already direct-dispatched",
p->comm, p->pid);
else
- scx_kf_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
+ scx_error(sch, "scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
ddsp_task->comm, ddsp_task->pid,
p->comm, p->pid);
return;
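
mark_direct_dispatch() above backs the "direct dispatch" path documented for ops.select_cpu(): if the task is inserted into a DSQ from there, ops.enqueue() is skipped for it. The usual idiom, sketched with the built-in idle-CPU helper (this mirrors the scx_simple tooling example rather than anything added by this patch):

s32 BPF_STRUCT_OPS(sketch_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	bool is_idle = false;
	s32 cpu;

	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
	if (is_idle)
		/* direct dispatch: ops.enqueue() will be skipped for @p */
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);

	return cpu;
}
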
@@ -2333,15 +1271,15 @@ local:
* higher priority it becomes from scx_prio_less()'s POV.
*/
touch_core_sched(rq, p);
- refill_task_slice_dfl(p);
+ refill_task_slice_dfl(sch, p);
local_norefill:
dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags);
return;
global:
touch_core_sched(rq, p); /* see the comment in local: */
- refill_task_slice_dfl(p);
- dispatch_enqueue(sch, find_global_dsq(p), p, enq_flags);
+ refill_task_slice_dfl(sch, p);
+ dispatch_enqueue(sch, find_global_dsq(sch, p), p, enq_flags);
}
static bool task_runnable(const struct task_struct *p)
@@ -2651,8 +1589,7 @@ static bool task_can_run_on_remote_rq(struct scx_sched *sch,
if (!scx_rq_online(rq)) {
if (enforce)
- __scx_add_event(scx_root,
- SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
+ __scx_add_event(sch, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
return false;
}
@@ -2754,7 +1691,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch,
dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
if (src_rq != dst_rq &&
unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
- dst_dsq = find_global_dsq(p);
+ dst_dsq = find_global_dsq(sch, p);
dst_rq = src_rq;
}
} else {
@@ -2910,7 +1847,7 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
if (src_rq != dst_rq &&
unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
- dispatch_enqueue(sch, find_global_dsq(p), p,
+ dispatch_enqueue(sch, find_global_dsq(sch, p), p,
enq_flags | SCX_ENQ_CLEAR_OPSS);
return;
}
@@ -3155,10 +2092,10 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* balance(), we want to complete this scheduling cycle and then
* start a new one. IOW, we want to call resched_curr() on the
* next, most likely idle, task, not the current one. Use
- * scx_bpf_kick_cpu() for deferred kicking.
+ * scx_kick_cpu() for deferred kicking.
*/
if (unlikely(!--nr_loops)) {
- scx_bpf_kick_cpu(cpu_of(rq), 0);
+ scx_kick_cpu(sch, cpu_of(rq), 0);
break;
}
} while (dspc->nr_tasks);
@@ -3442,24 +2379,25 @@ static struct task_struct *pick_task_scx(struct rq *rq)
if (keep_prev) {
p = prev;
if (!p->scx.slice)
- refill_task_slice_dfl(p);
+ refill_task_slice_dfl(rcu_dereference_sched(scx_root), p);
} else {
p = first_local_task(rq);
if (!p) {
if (kick_idle)
- scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE);
+ scx_kick_cpu(rcu_dereference_sched(scx_root),
+ cpu_of(rq), SCX_KICK_IDLE);
return NULL;
}
if (unlikely(!p->scx.slice)) {
- struct scx_sched *sch = scx_root;
+ struct scx_sched *sch = rcu_dereference_sched(scx_root);
if (!scx_rq_bypassing(rq) && !sch->warned_zero_slice) {
printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n",
p->comm, p->pid, __func__);
sch->warned_zero_slice = true;
}
- refill_task_slice_dfl(p);
+ refill_task_slice_dfl(sch, p);
}
}
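
pick_task_scx() now calls the internal scx_kick_cpu() with an explicit scx_sched; the BPF-facing kfunc scx_bpf_kick_cpu() is unchanged by the rename, and its SCX_KICK_* flags are the ones documented in the block removed earlier. A hedged example of pairing a global-DSQ insertion with a kick so an idle CPU picks the work up; the callback name is illustrative.

void BPF_STRUCT_OPS(sketch_enqueue_kick, struct task_struct *p, u64 enq_flags)
{
	s32 cpu = scx_bpf_task_cpu(p);	/* CPU @p last ran on */

	scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
	/* guarantee an idle @cpu goes through a full dispatch cycle */
	scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
}
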
@@ -3548,7 +2486,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0);
if (cpu >= 0) {
- refill_task_slice_dfl(p);
+ refill_task_slice_dfl(sch, p);
p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL;
} else {
cpu = prev_cpu;
@@ -4084,7 +3022,7 @@ bool scx_can_stop_tick(struct rq *rq)
#ifdef CONFIG_EXT_GROUP_SCHED
-DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem);
+DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_ops_rwsem);
static bool scx_cgroup_enabled;
void scx_tg_init(struct task_group *tg)
@@ -4101,8 +3039,6 @@ int scx_tg_online(struct task_group *tg)
WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED));
- percpu_down_read(&scx_cgroup_rwsem);
-
if (scx_cgroup_enabled) {
if (SCX_HAS_OP(sch, cgroup_init)) {
struct scx_cgroup_init_args args =
@@ -4122,7 +3058,6 @@ int scx_tg_online(struct task_group *tg)
tg->scx.flags |= SCX_TG_ONLINE;
}
- percpu_up_read(&scx_cgroup_rwsem);
return ret;
}
@@ -4132,15 +3067,11 @@ void scx_tg_offline(struct task_group *tg)
WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE));
- percpu_down_read(&scx_cgroup_rwsem);
-
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) &&
(tg->scx.flags & SCX_TG_INITED))
SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
tg->css.cgroup);
tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
-
- percpu_up_read(&scx_cgroup_rwsem);
}
int scx_cgroup_can_attach(struct cgroup_taskset *tset)
@@ -4150,9 +3081,6 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
struct task_struct *p;
int ret;
- /* released in scx_finish/cancel_attach() */
- percpu_down_read(&scx_cgroup_rwsem);
-
if (!scx_cgroup_enabled)
return 0;
@@ -4192,7 +3120,6 @@ err:
p->scx.cgrp_moving_from = NULL;
}
- percpu_up_read(&scx_cgroup_rwsem);
return ops_sanitize_err(sch, "cgroup_prep_move", ret);
}
@@ -4215,11 +3142,6 @@ void scx_cgroup_move_task(struct task_struct *p)
p->scx.cgrp_moving_from = NULL;
}
-void scx_cgroup_finish_attach(void)
-{
- percpu_up_read(&scx_cgroup_rwsem);
-}
-
void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
struct scx_sched *sch = scx_root;
@@ -4227,7 +3149,7 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
struct task_struct *p;
if (!scx_cgroup_enabled)
- goto out_unlock;
+ return;
cgroup_taskset_for_each(p, css, tset) {
if (SCX_HAS_OP(sch, cgroup_cancel_move) &&
@@ -4236,15 +3158,13 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
p, p->scx.cgrp_moving_from, css->cgroup);
p->scx.cgrp_moving_from = NULL;
}
-out_unlock:
- percpu_up_read(&scx_cgroup_rwsem);
}
void scx_group_set_weight(struct task_group *tg, unsigned long weight)
{
struct scx_sched *sch = scx_root;
- percpu_down_read(&scx_cgroup_rwsem);
+ percpu_down_read(&scx_cgroup_ops_rwsem);
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) &&
tg->scx.weight != weight)
@@ -4253,7 +3173,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight)
tg->scx.weight = weight;
- percpu_up_read(&scx_cgroup_rwsem);
+ percpu_up_read(&scx_cgroup_ops_rwsem);
}
void scx_group_set_idle(struct task_group *tg, bool idle)
@@ -4266,7 +3186,7 @@ void scx_group_set_bandwidth(struct task_group *tg,
{
struct scx_sched *sch = scx_root;
- percpu_down_read(&scx_cgroup_rwsem);
+ percpu_down_read(&scx_cgroup_ops_rwsem);
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) &&
(tg->scx.bw_period_us != period_us ||
@@ -4279,23 +3199,25 @@ void scx_group_set_bandwidth(struct task_group *tg,
tg->scx.bw_quota_us = quota_us;
tg->scx.bw_burst_us = burst_us;
- percpu_up_read(&scx_cgroup_rwsem);
+ percpu_up_read(&scx_cgroup_ops_rwsem);
}
static void scx_cgroup_lock(void)
{
- percpu_down_write(&scx_cgroup_rwsem);
+ percpu_down_write(&scx_cgroup_ops_rwsem);
+ cgroup_lock();
}
static void scx_cgroup_unlock(void)
{
- percpu_up_write(&scx_cgroup_rwsem);
+ cgroup_unlock();
+ percpu_up_write(&scx_cgroup_ops_rwsem);
}
#else /* CONFIG_EXT_GROUP_SCHED */
-static inline void scx_cgroup_lock(void) {}
-static inline void scx_cgroup_unlock(void) {}
+static void scx_cgroup_lock(void) {}
+static void scx_cgroup_unlock(void) {}
#endif /* CONFIG_EXT_GROUP_SCHED */
@@ -4411,15 +3333,12 @@ static void scx_cgroup_exit(struct scx_sched *sch)
{
struct cgroup_subsys_state *css;
- percpu_rwsem_assert_held(&scx_cgroup_rwsem);
-
scx_cgroup_enabled = false;
/*
- * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
+ * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk
* cgroups and exit all the inited ones, all online cgroups are exited.
*/
- rcu_read_lock();
css_for_each_descendant_post(css, &root_task_group.css) {
struct task_group *tg = css_tg(css);
@@ -4430,17 +3349,9 @@ static void scx_cgroup_exit(struct scx_sched *sch)
if (!sch->ops.cgroup_exit)
continue;
- if (WARN_ON_ONCE(!css_tryget(css)))
- continue;
- rcu_read_unlock();
-
SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
css->cgroup);
-
- rcu_read_lock();
- css_put(css);
}
- rcu_read_unlock();
}
static int scx_cgroup_init(struct scx_sched *sch)
@@ -4448,13 +3359,10 @@ static int scx_cgroup_init(struct scx_sched *sch)
struct cgroup_subsys_state *css;
int ret;
- percpu_rwsem_assert_held(&scx_cgroup_rwsem);
-
/*
- * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
+ * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk
* cgroups and init, all online cgroups are initialized.
*/
- rcu_read_lock();
css_for_each_descendant_pre(css, &root_task_group.css) {
struct task_group *tg = css_tg(css);
struct scx_cgroup_init_args args = {
@@ -4473,10 +3381,6 @@ static int scx_cgroup_init(struct scx_sched *sch)
continue;
}
- if (WARN_ON_ONCE(!css_tryget(css)))
- continue;
- rcu_read_unlock();
-
ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL,
css->cgroup, &args);
if (ret) {
@@ -4485,11 +3389,7 @@ static int scx_cgroup_init(struct scx_sched *sch)
return ret;
}
tg->scx.flags |= SCX_TG_INITED;
-
- rcu_read_lock();
- css_put(css);
}
- rcu_read_unlock();
WARN_ON_ONCE(scx_cgroup_enabled);
scx_cgroup_enabled = true;
@@ -4572,7 +3472,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
int node;
kthread_stop(sch->helper->task);
- free_percpu(sch->event_stats_cpu);
+ free_percpu(sch->pcpu);
for_each_node_state(node, N_POSSIBLE)
kfree(sch->global_dsqs[node]);
@@ -4671,9 +3571,22 @@ bool task_should_scx(int policy)
bool scx_allow_ttwu_queue(const struct task_struct *p)
{
- return !scx_enabled() ||
- (scx_root->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) ||
- p->sched_class != &ext_sched_class;
+ struct scx_sched *sch;
+
+ if (!scx_enabled())
+ return true;
+
+ sch = rcu_dereference_sched(scx_root);
+ if (unlikely(!sch))
+ return true;
+
+ if (sch->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP)
+ return true;
+
+ if (unlikely(p->sched_class != &ext_sched_class))
+ return true;
+
+ return false;
}
/**
@@ -4789,7 +3702,7 @@ static void scx_clear_softlockup(void)
*
* - pick_next_task() suppresses zero slice warning.
*
- * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM
+ * - scx_kick_cpu() is disabled to avoid irq_work malfunction during PM
* operations.
*
* - scx_prio_less() reverts to the default core_sched_at order.
@@ -5234,7 +4147,8 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf);
dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u",
p->scx.dsq_vtime, p->scx.slice, p->scx.weight);
- dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr));
+ dump_line(s, " cpus=%*pb no_mig=%u", cpumask_pr_args(p->cpus_ptr),
+ p->migration_disabled);
if (SCX_HAS_OP(sch, dump_task)) {
ops_dump_init(s, " ");
@@ -5473,13 +4387,13 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
sch->global_dsqs[node] = dsq;
}
- sch->event_stats_cpu = alloc_percpu(struct scx_event_stats);
- if (!sch->event_stats_cpu)
+ sch->pcpu = alloc_percpu(struct scx_sched_pcpu);
+ if (!sch->pcpu)
goto err_free_gdsqs;
sch->helper = kthread_run_worker(0, "sched_ext_helper");
if (!sch->helper)
- goto err_free_event_stats;
+ goto err_free_pcpu;
sched_set_fifo(sch->helper->task);
atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
@@ -5497,8 +4411,8 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
err_stop_helper:
kthread_stop(sch->helper->task);
-err_free_event_stats:
- free_percpu(sch->event_stats_cpu);
+err_free_pcpu:
+ free_percpu(sch->pcpu);
err_free_gdsqs:
for_each_node_state(node, N_POSSIBLE)
kfree(sch->global_dsqs[node]);
@@ -5621,6 +4535,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_error(sch, "ops.init() failed (%d)", ret);
goto err_disable;
}
+ sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
}
for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
@@ -5713,7 +4628,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
ret = scx_init_task(p, task_group(p), false);
if (ret) {
put_task_struct(p);
- scx_task_iter_relock(&sti);
scx_task_iter_stop(&sti);
scx_error(sch, "ops.init_task() failed (%d) for %s[%d]",
ret, p->comm, p->pid);
@@ -5723,7 +4637,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_set_task_state(p, SCX_TASK_READY);
put_task_struct(p);
- scx_task_iter_relock(&sti);
}
scx_task_iter_stop(&sti);
scx_cgroup_unlock();
@@ -5795,7 +4708,7 @@ err_unlock:
err_disable_unlock_all:
scx_cgroup_unlock();
percpu_up_write(&scx_fork_rwsem);
- scx_bypass(false);
+ /* we'll soon enter disable path, keep bypass on */
err_disable:
mutex_unlock(&scx_enable_mutex);
/*
@@ -6328,40 +5241,41 @@ void __init init_sched_ext_class(void)
/********************************************************************************
* Helpers that can be called from the BPF scheduler.
*/
-static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags)
+static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p,
+ u64 enq_flags)
{
- if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
+ if (!scx_kf_allowed(sch, SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
return false;
lockdep_assert_irqs_disabled();
if (unlikely(!p)) {
- scx_kf_error("called with NULL task");
+ scx_error(sch, "called with NULL task");
return false;
}
if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) {
- scx_kf_error("invalid enq_flags 0x%llx", enq_flags);
+ scx_error(sch, "invalid enq_flags 0x%llx", enq_flags);
return false;
}
return true;
}
-static void scx_dsq_insert_commit(struct task_struct *p, u64 dsq_id,
- u64 enq_flags)
+static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p,
+ u64 dsq_id, u64 enq_flags)
{
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
struct task_struct *ddsp_task;
ddsp_task = __this_cpu_read(direct_dispatch_task);
if (ddsp_task) {
- mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags);
+ mark_direct_dispatch(sch, ddsp_task, p, dsq_id, enq_flags);
return;
}
if (unlikely(dspc->cursor >= scx_dsp_max_batch)) {
- scx_kf_error("dispatch buffer overflow");
+ scx_error(sch, "dispatch buffer overflow");
return;
}
@@ -6413,7 +5327,14 @@ __bpf_kfunc_start_defs();
__bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice,
u64 enq_flags)
{
- if (!scx_dsq_insert_preamble(p, enq_flags))
+ struct scx_sched *sch;
+
+ guard(rcu)();
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return;
+
+ if (!scx_dsq_insert_preamble(sch, p, enq_flags))
return;
if (slice)
@@ -6421,7 +5342,7 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice
else
p->scx.slice = p->scx.slice ?: 1;
- scx_dsq_insert_commit(p, dsq_id, enq_flags);
+ scx_dsq_insert_commit(sch, p, dsq_id, enq_flags);
}
/**
@@ -6448,7 +5369,14 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice
__bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
u64 slice, u64 vtime, u64 enq_flags)
{
- if (!scx_dsq_insert_preamble(p, enq_flags))
+ struct scx_sched *sch;
+
+ guard(rcu)();
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return;
+
+ if (!scx_dsq_insert_preamble(sch, p, enq_flags))
return;
if (slice)
@@ -6458,7 +5386,7 @@ __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
p->scx.dsq_vtime = vtime;
- scx_dsq_insert_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
+ scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
}
__bpf_kfunc_end_defs();
@@ -6483,7 +5411,8 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
bool in_balance;
unsigned long flags;
- if (!scx_kf_allowed_if_unlocked() && !scx_kf_allowed(SCX_KF_DISPATCH))
+ if (!scx_kf_allowed_if_unlocked() &&
+ !scx_kf_allowed(sch, SCX_KF_DISPATCH))
return false;
/*
@@ -6568,7 +5497,15 @@ __bpf_kfunc_start_defs();
*/
__bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void)
{
- if (!scx_kf_allowed(SCX_KF_DISPATCH))
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return 0;
+
+ if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
return 0;
return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor);
@@ -6583,14 +5520,21 @@ __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void)
__bpf_kfunc void scx_bpf_dispatch_cancel(void)
{
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
+ struct scx_sched *sch;
+
+ guard(rcu)();
- if (!scx_kf_allowed(SCX_KF_DISPATCH))
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return;
+
+ if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
return;
if (dspc->cursor > 0)
dspc->cursor--;
else
- scx_kf_error("dispatch buffer underflow");
+ scx_error(sch, "dispatch buffer underflow");
}
/**
@@ -6609,11 +5553,17 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void)
*/
__bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id)
{
- struct scx_sched *sch = scx_root;
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
struct scx_dispatch_q *dsq;
+ struct scx_sched *sch;
- if (!scx_kf_allowed(SCX_KF_DISPATCH))
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return false;
+
+ if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
return false;
flush_dispatch_buf(sch, dspc->rq);
@@ -6760,12 +5710,18 @@ __bpf_kfunc_start_defs();
*/
__bpf_kfunc u32 scx_bpf_reenqueue_local(void)
{
+ struct scx_sched *sch;
LIST_HEAD(tasks);
u32 nr_enqueued = 0;
struct rq *rq;
struct task_struct *p, *n;
- if (!scx_kf_allowed(SCX_KF_CPU_RELEASE))
+ guard(rcu)();
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return 0;
+
+ if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE))
return 0;
rq = cpu_rq(smp_processor_id());
@@ -6877,22 +5833,12 @@ static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = {
__bpf_kfunc_start_defs();
-/**
- * scx_bpf_kick_cpu - Trigger reschedule on a CPU
- * @cpu: cpu to kick
- * @flags: %SCX_KICK_* flags
- *
- * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or
- * trigger rescheduling on a busy CPU. This can be called from any online
- * scx_ops operation and the actual kicking is performed asynchronously through
- * an irq work.
- */
-__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
+static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags)
{
struct rq *this_rq;
unsigned long irq_flags;
- if (!kf_cpu_valid(cpu, NULL))
+ if (!ops_cpu_valid(sch, cpu, NULL))
return;
local_irq_save(irq_flags);
@@ -6916,7 +5862,7 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
struct rq *target_rq = cpu_rq(cpu);
if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT)))
- scx_kf_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE");
+ scx_error(sch, "PREEMPT/WAIT cannot be used with SCX_KICK_IDLE");
if (raw_spin_rq_trylock(target_rq)) {
if (can_skip_idle_kick(target_rq)) {
@@ -6941,6 +5887,26 @@ out:
}
/**
+ * scx_bpf_kick_cpu - Trigger reschedule on a CPU
+ * @cpu: cpu to kick
+ * @flags: %SCX_KICK_* flags
+ *
+ * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or
+ * trigger rescheduling on a busy CPU. This can be called from any online
+ * scx_ops operation and the actual kicking is performed asynchronously through
+ * an irq work.
+ */
+__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+ sch = rcu_dereference(scx_root);
+ if (likely(sch))
+ scx_kick_cpu(sch, cpu, flags);
+}
+
+/**
* scx_bpf_dsq_nr_queued - Return the number of queued tasks
* @dsq_id: id of the DSQ
*
@@ -7120,28 +6086,29 @@ __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it)
__bpf_kfunc_end_defs();
-static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size,
- char *fmt, unsigned long long *data, u32 data__sz)
+static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf,
+ size_t line_size, char *fmt, unsigned long long *data,
+ u32 data__sz)
{
struct bpf_bprintf_data bprintf_data = { .get_bin_args = true };
s32 ret;
if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 ||
(data__sz && !data)) {
- scx_kf_error("invalid data=%p and data__sz=%u", (void *)data, data__sz);
+ scx_error(sch, "invalid data=%p and data__sz=%u", (void *)data, data__sz);
return -EINVAL;
}
ret = copy_from_kernel_nofault(data_buf, data, data__sz);
if (ret < 0) {
- scx_kf_error("failed to read data fields (%d)", ret);
+ scx_error(sch, "failed to read data fields (%d)", ret);
return ret;
}
ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8,
&bprintf_data);
if (ret < 0) {
- scx_kf_error("format preparation failed (%d)", ret);
+ scx_error(sch, "format preparation failed (%d)", ret);
return ret;
}
@@ -7149,17 +6116,17 @@ static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size,
bprintf_data.bin_args);
bpf_bprintf_cleanup(&bprintf_data);
if (ret < 0) {
- scx_kf_error("(\"%s\", %p, %u) failed to format", fmt, data, data__sz);
+ scx_error(sch, "(\"%s\", %p, %u) failed to format", fmt, data, data__sz);
return ret;
}
return ret;
}
-static s32 bstr_format(struct scx_bstr_buf *buf,
+static s32 bstr_format(struct scx_sched *sch, struct scx_bstr_buf *buf,
char *fmt, unsigned long long *data, u32 data__sz)
{
- return __bstr_format(buf->data, buf->line, sizeof(buf->line),
+ return __bstr_format(sch, buf->data, buf->line, sizeof(buf->line),
fmt, data, data__sz);
}
@@ -7178,11 +6145,14 @@ __bpf_kfunc_start_defs();
__bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
unsigned long long *data, u32 data__sz)
{
+ struct scx_sched *sch;
unsigned long flags;
raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
- if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
- scx_kf_exit(SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line);
+ sch = rcu_dereference_bh(scx_root);
+ if (likely(sch) &&
+ bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
+ scx_exit(sch, SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line);
raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
}
@@ -7198,11 +6168,14 @@ __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
__bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
u32 data__sz)
{
+ struct scx_sched *sch;
unsigned long flags;
raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
- if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
- scx_kf_exit(SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line);
+ sch = rcu_dereference_bh(scx_root);
+ if (likely(sch) &&
+ bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
+ scx_exit(sch, SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line);
raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
}
@@ -7221,17 +6194,24 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
__bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
u32 data__sz)
{
+ struct scx_sched *sch;
struct scx_dump_data *dd = &scx_dump_data;
struct scx_bstr_buf *buf = &dd->buf;
s32 ret;
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return;
+
if (raw_smp_processor_id() != dd->cpu) {
- scx_kf_error("scx_bpf_dump() must only be called from ops.dump() and friends");
+ scx_error(sch, "scx_bpf_dump() must only be called from ops.dump() and friends");
return;
}
/* append the formatted string to the line buf */
- ret = __bstr_format(buf->data, buf->line + dd->cursor,
+ ret = __bstr_format(sch, buf->data, buf->line + dd->cursor,
sizeof(buf->line) - dd->cursor, fmt, data, data__sz);
if (ret < 0) {
dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)",
@@ -7267,7 +6247,12 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
*/
__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
{
- if (kf_cpu_valid(cpu, NULL))
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (likely(sch) && ops_cpu_valid(sch, cpu, NULL))
return arch_scale_cpu_capacity(cpu);
else
return SCX_CPUPERF_ONE;
@@ -7289,7 +6274,12 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
*/
__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
{
- if (kf_cpu_valid(cpu, NULL))
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (likely(sch) && ops_cpu_valid(sch, cpu, NULL))
return arch_scale_freq_capacity(cpu);
else
return SCX_CPUPERF_ONE;
@@ -7311,12 +6301,20 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
*/
__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+	sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return;
+
if (unlikely(perf > SCX_CPUPERF_ONE)) {
- scx_kf_error("Invalid cpuperf target %u for CPU %d", perf, cpu);
+ scx_error(sch, "Invalid cpuperf target %u for CPU %d", perf, cpu);
return;
}
- if (kf_cpu_valid(cpu, NULL)) {
+ if (ops_cpu_valid(sch, cpu, NULL)) {
struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq();
struct rq_flags rf;
@@ -7325,7 +6323,7 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
* to the corresponding CPU to prevent ABBA deadlocks.
*/
if (locked_rq && rq != locked_rq) {
- scx_kf_error("Invalid target CPU %d", cpu);
+ scx_error(sch, "Invalid target CPU %d", cpu);
return;
}
@@ -7420,13 +6418,76 @@ __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p)
*/
__bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu)
{
- if (!kf_cpu_valid(cpu, NULL))
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
return NULL;
+ if (!ops_cpu_valid(sch, cpu, NULL))
+ return NULL;
+
+ if (!sch->warned_deprecated_rq) {
+ printk_deferred(KERN_WARNING "sched_ext: %s() is deprecated; "
+ "use scx_bpf_locked_rq() when holding rq lock "
+ "or scx_bpf_cpu_curr() to read remote curr safely.\n", __func__);
+ sch->warned_deprecated_rq = true;
+ }
+
return cpu_rq(cpu);
}
/**
+ * scx_bpf_locked_rq - Return the rq currently locked by SCX
+ *
+ * Returns the rq if a rq lock is currently held by SCX.
+ * Otherwise emits an error and returns NULL.
+ */
+__bpf_kfunc struct rq *scx_bpf_locked_rq(void)
+{
+ struct scx_sched *sch;
+ struct rq *rq;
+
+ guard(preempt)();
+
+ sch = rcu_dereference_sched(scx_root);
+ if (unlikely(!sch))
+ return NULL;
+
+ rq = scx_locked_rq();
+ if (!rq) {
+ scx_error(sch, "accessing rq without holding rq lock");
+ return NULL;
+ }
+
+ return rq;
+}
+
+/**
+ * scx_bpf_cpu_curr - Return remote CPU's curr task
+ * @cpu: CPU of interest
+ *
+ * Callers must hold RCU read lock (KF_RCU_PROTECTED).
+ */
+__bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu)
+{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return NULL;
+
+ if (!ops_cpu_valid(sch, cpu, NULL))
+ return NULL;
+
+ return rcu_dereference(cpu_rq(cpu)->curr);
+}
+
+/**
* scx_bpf_task_cgroup - Return the sched cgroup of a task
* @p: task of interest
*
@@ -7442,8 +6503,15 @@ __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p)
{
struct task_group *tg = p->sched_task_group;
struct cgroup *cgrp = &cgrp_dfl_root.cgrp;
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ goto out;
- if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p))
+ if (!scx_kf_allowed_on_arg_tasks(sch, __SCX_KF_RQ_LOCKED, p))
goto out;
cgrp = tg_cgrp(tg);
@@ -7524,7 +6592,7 @@ static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *event
/* Aggregate per-CPU event counters into @events. */
memset(events, 0, sizeof(*events));
for_each_possible_cpu(cpu) {
- e_cpu = per_cpu_ptr(sch->event_stats_cpu, cpu);
+ e_cpu = &per_cpu_ptr(sch->pcpu, cpu)->event_stats;
scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
@@ -7590,6 +6658,8 @@ BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
+BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_RET_NULL)
+BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_RET_NULL | KF_RCU_PROTECTED)
#ifdef CONFIG_CGROUP_SCHED
BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
#endif
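
A minimal BPF-side sketch of the migration implied by the scx_bpf_cpu_rq() deprecation above (not part of this patch; the helper name is hypothetical and the kfunc declarations from <scx/common.bpf.h> are assumed). Instead of dereferencing scx_bpf_cpu_rq(cpu)->curr, a scheduler reads the remote CPU's current task under RCU via scx_bpf_cpu_curr():

	static bool remote_cpu_is_idle(s32 cpu)
	{
		struct task_struct *curr;
		bool idle = false;

		bpf_rcu_read_lock();		/* KF_RCU_PROTECTED: must be called in an RCU section */
		curr = scx_bpf_cpu_curr(cpu);	/* NULL if @cpu is invalid or no scheduler is attached */
		if (curr)
			idle = curr->pid == 0;	/* pid 0 is the swapper, i.e. the idle task */
		bpf_rcu_read_unlock();

		return idle;
	}
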
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index 292bb41a242e..43429b33e52c 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -8,29 +8,6 @@
*/
#ifdef CONFIG_SCHED_CLASS_EXT
-static inline bool scx_kf_allowed_if_unlocked(void)
-{
- return !current->scx.kf_mask;
-}
-
-static inline bool scx_rq_bypassing(struct rq *rq)
-{
- return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
-}
-
-DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
-
-DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
-
-/*
- * Return the rq currently locked from an scx callback, or NULL if no rq is
- * locked.
- */
-static inline struct rq *scx_locked_rq(void)
-{
- return __this_cpu_read(scx_locked_rq_state);
-}
-
void scx_tick(struct rq *rq);
void init_scx_entity(struct sched_ext_entity *scx);
void scx_pre_fork(struct task_struct *p);
@@ -100,7 +77,6 @@ int scx_tg_online(struct task_group *tg);
void scx_tg_offline(struct task_group *tg);
int scx_cgroup_can_attach(struct cgroup_taskset *tset);
void scx_cgroup_move_task(struct task_struct *p);
-void scx_cgroup_finish_attach(void);
void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
void scx_group_set_idle(struct task_group *tg, bool idle);
@@ -111,7 +87,6 @@ static inline int scx_tg_online(struct task_group *tg) { return 0; }
static inline void scx_tg_offline(struct task_group *tg) {}
static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; }
static inline void scx_cgroup_move_task(struct task_struct *p) {}
-static inline void scx_cgroup_finish_attach(void) {}
static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {}
static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {}
static inline void scx_group_set_idle(struct task_group *tg, bool idle) {}
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index 537c6992bb63..d2434c954848 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -819,10 +819,10 @@ void scx_idle_disable(void)
* Helpers that can be called from the BPF scheduler.
*/
-static int validate_node(int node)
+static int validate_node(struct scx_sched *sch, int node)
{
if (!static_branch_likely(&scx_builtin_idle_per_node)) {
- scx_kf_error("per-node idle tracking is disabled");
+ scx_error(sch, "per-node idle tracking is disabled");
return -EOPNOTSUPP;
}
@@ -832,13 +832,13 @@ static int validate_node(int node)
/* Make sure node is in a valid range */
if (node < 0 || node >= nr_node_ids) {
- scx_kf_error("invalid node %d", node);
+ scx_error(sch, "invalid node %d", node);
return -EINVAL;
}
/* Make sure the node is part of the set of possible nodes */
if (!node_possible(node)) {
- scx_kf_error("unavailable node %d", node);
+ scx_error(sch, "unavailable node %d", node);
return -EINVAL;
}
@@ -847,12 +847,12 @@ static int validate_node(int node)
__bpf_kfunc_start_defs();
-static bool check_builtin_idle_enabled(void)
+static bool check_builtin_idle_enabled(struct scx_sched *sch)
{
if (static_branch_likely(&scx_builtin_idle_enabled))
return true;
- scx_kf_error("built-in idle tracking is disabled");
+ scx_error(sch, "built-in idle tracking is disabled");
return false;
}
@@ -882,17 +882,18 @@ static bool is_bpf_migration_disabled(const struct task_struct *p)
return p->migration_disabled;
}
-static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p,
+ s32 prev_cpu, u64 wake_flags,
const struct cpumask *allowed, u64 flags)
{
struct rq *rq;
struct rq_flags rf;
s32 cpu;
- if (!kf_cpu_valid(prev_cpu, NULL))
+ if (!ops_cpu_valid(sch, prev_cpu, NULL))
return -EINVAL;
- if (!check_builtin_idle_enabled())
+ if (!check_builtin_idle_enabled(sch))
return -EBUSY;
/*
@@ -905,7 +906,7 @@ static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_f
if (scx_kf_allowed_if_unlocked()) {
rq = task_rq_lock(p, &rf);
} else {
- if (!scx_kf_allowed(SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE))
+ if (!scx_kf_allowed(sch, SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE))
return -EPERM;
rq = scx_locked_rq();
}
@@ -948,9 +949,13 @@ static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_f
*/
__bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
{
- if (!kf_cpu_valid(cpu, NULL))
- return NUMA_NO_NODE;
+ struct scx_sched *sch;
+
+ guard(rcu)();
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch) || !ops_cpu_valid(sch, cpu, NULL))
+ return NUMA_NO_NODE;
return cpu_to_node(cpu);
}
@@ -972,15 +977,21 @@ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
u64 wake_flags, bool *is_idle)
{
+ struct scx_sched *sch;
s32 cpu;
- cpu = select_cpu_from_kfunc(p, prev_cpu, wake_flags, NULL, 0);
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
+ cpu = select_cpu_from_kfunc(sch, p, prev_cpu, wake_flags, NULL, 0);
if (cpu >= 0) {
*is_idle = true;
return cpu;
}
*is_idle = false;
-
return prev_cpu;
}
@@ -1007,7 +1018,16 @@ __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
__bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
const struct cpumask *cpus_allowed, u64 flags)
{
- return select_cpu_from_kfunc(p, prev_cpu, wake_flags, cpus_allowed, flags);
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
+ return select_cpu_from_kfunc(sch, p, prev_cpu, wake_flags,
+ cpus_allowed, flags);
}
/**
@@ -1021,7 +1041,15 @@ __bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
{
- node = validate_node(node);
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return cpu_none_mask;
+
+ node = validate_node(sch, node);
if (node < 0)
return cpu_none_mask;
@@ -1037,12 +1065,20 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return cpu_none_mask;
+
if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
- scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
+ scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
return cpu_none_mask;
}
- if (!check_builtin_idle_enabled())
+ if (!check_builtin_idle_enabled(sch))
return cpu_none_mask;
return idle_cpumask(NUMA_NO_NODE)->cpu;
@@ -1060,7 +1096,15 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
{
- node = validate_node(node);
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return cpu_none_mask;
+
+ node = validate_node(sch, node);
if (node < 0)
return cpu_none_mask;
@@ -1080,12 +1124,20 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return cpu_none_mask;
+
if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
- scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
+ scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
return cpu_none_mask;
}
- if (!check_builtin_idle_enabled())
+ if (!check_builtin_idle_enabled(sch))
return cpu_none_mask;
if (sched_smt_active())
@@ -1121,10 +1173,18 @@ __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
*/
__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
{
- if (!check_builtin_idle_enabled())
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
return false;
- if (!kf_cpu_valid(cpu, NULL))
+ if (!check_builtin_idle_enabled(sch))
+ return false;
+
+ if (!ops_cpu_valid(sch, cpu, NULL))
return false;
return scx_idle_test_and_clear_cpu(cpu);
@@ -1152,7 +1212,15 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
__bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed,
int node, u64 flags)
{
- node = validate_node(node);
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
+ node = validate_node(sch, node);
if (node < 0)
return node;
@@ -1184,12 +1252,20 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed,
__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
u64 flags)
{
+ struct scx_sched *sch;
+
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
- scx_kf_error("per-node idle tracking is enabled");
+ scx_error(sch, "per-node idle tracking is enabled");
return -EBUSY;
}
- if (!check_builtin_idle_enabled())
+ if (!check_builtin_idle_enabled(sch))
return -EBUSY;
return scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags);
@@ -1219,9 +1295,16 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
__bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed,
int node, u64 flags)
{
+ struct scx_sched *sch;
s32 cpu;
- node = validate_node(node);
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
+ node = validate_node(sch, node);
if (node < 0)
return node;
@@ -1259,10 +1342,17 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed,
__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
u64 flags)
{
+ struct scx_sched *sch;
s32 cpu;
+ guard(rcu)();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch))
+ return -ENODEV;
+
if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
- scx_kf_error("per-node idle tracking is enabled");
+ scx_error(sch, "per-node idle tracking is enabled");
return -EBUSY;
}
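
As a rough usage sketch of the reworked idle kfuncs above (hypothetical ops.select_cpu() body, not part of this patch; BPF_STRUCT_OPS and the kfunc declarations from <scx/common.bpf.h> are assumed), a scheduler can treat any negative return from scx_bpf_select_cpu_and() as "no idle CPU found" and fall back to @prev_cpu:

	s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
			   s32 prev_cpu, u64 wake_flags)
	{
		s32 cpu;

		/* a negative return covers -EBUSY, -EPERM and the new -ENODEV cases */
		cpu = scx_bpf_select_cpu_and(p, prev_cpu, wake_flags, p->cpus_ptr, 0);
		if (cpu >= 0) {
			/* direct dispatch from select_cpu() skips ops.enqueue() */
			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
			return cpu;
		}

		return prev_cpu;
	}
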
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
new file mode 100644
index 000000000000..b3617abed510
--- /dev/null
+++ b/kernel/sched/ext_internal.h
@@ -0,0 +1,1078 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025 Tejun Heo <tj@kernel.org>
+ */
+#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
+
+enum scx_consts {
+ SCX_DSP_DFL_MAX_BATCH = 32,
+ SCX_DSP_MAX_LOOPS = 32,
+ SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ,
+
+ SCX_EXIT_BT_LEN = 64,
+ SCX_EXIT_MSG_LEN = 1024,
+ SCX_EXIT_DUMP_DFL_LEN = 32768,
+
+ SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE,
+
+ /*
+ * Iterating all tasks may take a while. Periodically drop
+ * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls.
+ */
+ SCX_TASK_ITER_BATCH = 32,
+};
+
+enum scx_exit_kind {
+ SCX_EXIT_NONE,
+ SCX_EXIT_DONE,
+
+ SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */
+ SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */
+ SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */
+ SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */
+
+ SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */
+ SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */
+ SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */
+};
+
+/*
+ * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(),
+ * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes
+ * are 64bit of the format:
+ *
+ * Bits: [63 .. 48 47 .. 32 31 .. 0]
+ * [ SYS ACT ] [ SYS RSN ] [ USR ]
+ *
+ * SYS ACT: System-defined exit actions
+ * SYS RSN: System-defined exit reasons
+ * USR : User-defined exit codes and reasons
+ *
+ * Using the above, users may communicate intention and context by ORing system
+ * actions and/or system reasons with a user-defined exit code.
+ */
+enum scx_exit_code {
+ /* Reasons */
+ SCX_ECODE_RSN_HOTPLUG = 1LLU << 32,
+
+ /* Actions */
+ SCX_ECODE_ACT_RESTART = 1LLU << 48,
+};
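+
+/*
+ * Example (hypothetical, for illustration only): a scheduler exiting because
+ * of a hotplug event and asking to be restarted could combine the fields
+ * above with a user-defined code in the low 32 bits, e.g.:
+ *
+ *	scx_bpf_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG | MY_USR_CODE,
+ *		     "CPU %d went offline", cpu);
+ */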
+
+enum scx_exit_flags {
+ /*
+ * ops.exit() may be called even if the loading failed before ops.init()
+ * finishes successfully. This is because ops.exit() allows rich exit
+ * info communication. The following flag indicates whether ops.init()
+ * finished successfully.
+ */
+	SCX_EFLAG_INITIALIZED	= 1LLU << 0,
+};
+
+/*
+ * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is
+ * being disabled.
+ */
+struct scx_exit_info {
+ /* %SCX_EXIT_* - broad category of the exit reason */
+ enum scx_exit_kind kind;
+
+ /* exit code if gracefully exiting */
+ s64 exit_code;
+
+ /* %SCX_EFLAG_* */
+ u64 flags;
+
+ /* textual representation of the above */
+ const char *reason;
+
+ /* backtrace if exiting due to an error */
+ unsigned long *bt;
+ u32 bt_len;
+
+ /* informational message */
+ char *msg;
+
+ /* debug dump */
+ char *dump;
+};
+
+/* sched_ext_ops.flags */
+enum scx_ops_flags {
+ /*
+ * Keep built-in idle tracking even if ops.update_idle() is implemented.
+ */
+ SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
+
+ /*
+	 * By default, if there are no other tasks to run on the CPU, ext core
+ * keeps running the current task even after its slice expires. If this
+ * flag is specified, such tasks are passed to ops.enqueue() with
+ * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
+ */
+ SCX_OPS_ENQ_LAST = 1LLU << 1,
+
+ /*
+ * An exiting task may schedule after PF_EXITING is set. In such cases,
+ * bpf_task_from_pid() may not be able to find the task and if the BPF
+ * scheduler depends on pid lookup for dispatching, the task will be
+ * lost leading to various issues including RCU grace period stalls.
+ *
+ * To mask this problem, by default, unhashed tasks are automatically
+ * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't
+ * depend on pid lookups and wants to handle these tasks directly, the
+ * following flag can be used.
+ */
+ SCX_OPS_ENQ_EXITING = 1LLU << 2,
+
+ /*
+ * If set, only tasks with policy set to SCHED_EXT are attached to
+ * sched_ext. If clear, SCHED_NORMAL tasks are also included.
+ */
+ SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
+
+ /*
+ * A migration disabled task can only execute on its current CPU. By
+ * default, such tasks are automatically put on the CPU's local DSQ with
+ * the default slice on enqueue. If this ops flag is set, they also go
+ * through ops.enqueue().
+ *
+ * A migration disabled task never invokes ops.select_cpu() as it can
+ * only select the current CPU. Also, p->cpus_ptr will only contain its
+ * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
+ * and thus may disagree with cpumask_weight(p->cpus_ptr).
+ */
+ SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
+
+ /*
+ * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes
+ * ops.enqueue() on the ops.select_cpu() selected or the wakee's
+ * previous CPU via IPI (inter-processor interrupt) to reduce cacheline
+ * transfers. When this optimization is enabled, ops.select_cpu() is
+ * skipped in some cases (when racing against the wakee switching out).
+ * As the BPF scheduler may depend on ops.select_cpu() being invoked
+ * during wakeups, queued wakeup is disabled by default.
+ *
+ * If this ops flag is set, queued wakeup optimization is enabled and
+ * the BPF scheduler must be able to handle ops.enqueue() invoked on the
+ * wakee's CPU without preceding ops.select_cpu() even for tasks which
+ * may be executed on multiple CPUs.
+ */
+ SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5,
+
+ /*
+ * If set, enable per-node idle cpumasks. If clear, use a single global
+ * flat idle cpumask.
+ */
+ SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6,
+
+ /*
+ * CPU cgroup support flags
+ */
+	SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed in 6.18 */
+
+ SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
+ SCX_OPS_ENQ_LAST |
+ SCX_OPS_ENQ_EXITING |
+ SCX_OPS_ENQ_MIGRATION_DISABLED |
+ SCX_OPS_ALLOW_QUEUED_WAKEUP |
+ SCX_OPS_SWITCH_PARTIAL |
+ SCX_OPS_BUILTIN_IDLE_PER_NODE |
+ SCX_OPS_HAS_CGROUP_WEIGHT,
+
+ /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */
+ __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56,
+
+ SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56,
+};
+
+/* argument container for ops.init_task() */
+struct scx_init_task_args {
+ /*
+ * Set if ops.init_task() is being invoked on the fork path, as opposed
+ * to the scheduler transition path.
+ */
+ bool fork;
+#ifdef CONFIG_EXT_GROUP_SCHED
+ /* the cgroup the task is joining */
+ struct cgroup *cgroup;
+#endif
+};
+
+/* argument container for ops.exit_task() */
+struct scx_exit_task_args {
+ /* Whether the task exited before running on sched_ext. */
+ bool cancelled;
+};
+
+/* argument container for ops->cgroup_init() */
+struct scx_cgroup_init_args {
+ /* the weight of the cgroup [1..10000] */
+ u32 weight;
+
+ /* bandwidth control parameters from cpu.max and cpu.max.burst */
+ u64 bw_period_us;
+ u64 bw_quota_us;
+ u64 bw_burst_us;
+};
+
+enum scx_cpu_preempt_reason {
+ /* next task is being scheduled by &sched_class_rt */
+ SCX_CPU_PREEMPT_RT,
+ /* next task is being scheduled by &sched_class_dl */
+ SCX_CPU_PREEMPT_DL,
+ /* next task is being scheduled by &sched_class_stop */
+ SCX_CPU_PREEMPT_STOP,
+ /* unknown reason for SCX being preempted */
+ SCX_CPU_PREEMPT_UNKNOWN,
+};
+
+/*
+ * Argument container for ops->cpu_acquire(). Currently empty, but may be
+ * expanded in the future.
+ */
+struct scx_cpu_acquire_args {};
+
+/* argument container for ops->cpu_release() */
+struct scx_cpu_release_args {
+ /* the reason the CPU was preempted */
+ enum scx_cpu_preempt_reason reason;
+
+ /* the task that's going to be scheduled on the CPU */
+ struct task_struct *task;
+};
+
+/*
+ * Informational context provided to dump operations.
+ */
+struct scx_dump_ctx {
+ enum scx_exit_kind kind;
+ s64 exit_code;
+ const char *reason;
+ u64 at_ns;
+ u64 at_jiffies;
+};
+
+/**
+ * struct sched_ext_ops - Operation table for BPF scheduler implementation
+ *
+ * A BPF scheduler can implement an arbitrary scheduling policy by
+ * implementing and loading operations in this table. Note that a userland
+ * scheduling policy can also be implemented using the BPF scheduler
+ * as a shim layer.
+ */
+struct sched_ext_ops {
+ /**
+ * @select_cpu: Pick the target CPU for a task which is being woken up
+ * @p: task being woken up
+ * @prev_cpu: the cpu @p was on before sleeping
+ * @wake_flags: SCX_WAKE_*
+ *
+ * Decision made here isn't final. @p may be moved to any CPU while it
+ * is getting dispatched for execution later. However, as @p is not on
+ * the rq at this point, getting the eventual execution CPU right here
+ * saves a small bit of overhead down the line.
+ *
+ * If an idle CPU is returned, the CPU is kicked and will try to
+ * dispatch. While an explicit custom mechanism can be added,
+ * select_cpu() serves as the default way to wake up idle CPUs.
+ *
+ * @p may be inserted into a DSQ directly by calling
+ * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped.
+ * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ
+ * of the CPU returned by this operation.
+ *
+ * Note that select_cpu() is never called for tasks that can only run
+ * on a single CPU or tasks with migration disabled, as they don't have
+ * the option to select a different CPU. See select_task_rq() for
+ * details.
+ */
+ s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);
+
+ /**
+ * @enqueue: Enqueue a task on the BPF scheduler
+ * @p: task being enqueued
+ * @enq_flags: %SCX_ENQ_*
+ *
+ * @p is ready to run. Insert directly into a DSQ by calling
+ * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly
+ * inserted, the bpf scheduler owns @p and if it fails to dispatch @p,
+ * the task will stall.
+ *
+ * If @p was inserted into a DSQ from ops.select_cpu(), this callback is
+ * skipped.
+ */
+ void (*enqueue)(struct task_struct *p, u64 enq_flags);
+
+ /**
+ * @dequeue: Remove a task from the BPF scheduler
+ * @p: task being dequeued
+ * @deq_flags: %SCX_DEQ_*
+ *
+ * Remove @p from the BPF scheduler. This is usually called to isolate
+ * the task while updating its scheduling properties (e.g. priority).
+ *
+ * The ext core keeps track of whether the BPF side owns a given task or
+ * not and can gracefully ignore spurious dispatches from BPF side,
+ * which makes it safe to not implement this method. However, depending
+ * on the scheduling logic, this can lead to confusing behaviors - e.g.
+ * scheduling position not being updated across a priority change.
+ */
+ void (*dequeue)(struct task_struct *p, u64 deq_flags);
+
+ /**
+ * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs
+ * @cpu: CPU to dispatch tasks for
+ * @prev: previous task being switched out
+ *
+ * Called when a CPU's local dsq is empty. The operation should dispatch
+ * one or more tasks from the BPF scheduler into the DSQs using
+ * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ
+ * using scx_bpf_dsq_move_to_local().
+ *
+ * The maximum number of times scx_bpf_dsq_insert() can be called
+ * without an intervening scx_bpf_dsq_move_to_local() is specified by
+ * ops.dispatch_max_batch. See the comments on top of the two functions
+ * for more details.
+ *
+ * When not %NULL, @prev is an SCX task with its slice depleted. If
+ * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in
+ * @prev->scx.flags, it is not enqueued yet and will be enqueued after
+ * ops.dispatch() returns. To keep executing @prev, return without
+ * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST.
+ */
+ void (*dispatch)(s32 cpu, struct task_struct *prev);
+
+ /**
+ * @tick: Periodic tick
+ * @p: task running currently
+ *
+ * This operation is called every 1/HZ seconds on CPUs which are
+ * executing an SCX task. Setting @p->scx.slice to 0 will trigger an
+ * immediate dispatch cycle on the CPU.
+ */
+ void (*tick)(struct task_struct *p);
+
+ /**
+ * @runnable: A task is becoming runnable on its associated CPU
+ * @p: task becoming runnable
+ * @enq_flags: %SCX_ENQ_*
+ *
+ * This and the following three functions can be used to track a task's
+ * execution state transitions. A task becomes ->runnable() on a CPU,
+ * and then goes through one or more ->running() and ->stopping() pairs
+ * as it runs on the CPU, and eventually becomes ->quiescent() when it's
+ * done running on the CPU.
+ *
+ * @p is becoming runnable on the CPU because it's
+ *
+ * - waking up (%SCX_ENQ_WAKEUP)
+ * - being moved from another CPU
+ * - being restored after temporarily taken off the queue for an
+ * attribute change.
+ *
+ * This and ->enqueue() are related but not coupled. This operation
+ * notifies @p's state transition and may not be followed by ->enqueue()
+ * e.g. when @p is being dispatched to a remote CPU, or when @p is
+ * being enqueued on a CPU experiencing a hotplug event. Likewise, a
+ * task may be ->enqueue()'d without being preceded by this operation
+ * e.g. after exhausting its slice.
+ */
+ void (*runnable)(struct task_struct *p, u64 enq_flags);
+
+ /**
+ * @running: A task is starting to run on its associated CPU
+ * @p: task starting to run
+ *
+ * Note that this callback may be called from a CPU other than the
+ * one the task is going to run on. This can happen when a task
+	 * property is changed (i.e., affinity), since set_next_task_scx(),
+ * which triggers this callback, may run on a CPU different from
+ * the task's assigned CPU.
+ *
+ * Therefore, always use scx_bpf_task_cpu(@p) to determine the
+ * target CPU the task is going to use.
+ *
+ * See ->runnable() for explanation on the task state notifiers.
+ */
+ void (*running)(struct task_struct *p);
+
+ /**
+ * @stopping: A task is stopping execution
+ * @p: task stopping to run
+ * @runnable: is task @p still runnable?
+ *
+ * Note that this callback may be called from a CPU other than the
+ * one the task was running on. This can happen when a task
+ * property is changed (i.e., affinity), since dequeue_task_scx(),
+ * which triggers this callback, may run on a CPU different from
+ * the task's assigned CPU.
+ *
+ * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU
+ * the task was running on.
+ *
+ * See ->runnable() for explanation on the task state notifiers. If
+ * !@runnable, ->quiescent() will be invoked after this operation
+ * returns.
+ */
+ void (*stopping)(struct task_struct *p, bool runnable);
+
+ /**
+ * @quiescent: A task is becoming not runnable on its associated CPU
+ * @p: task becoming not runnable
+ * @deq_flags: %SCX_DEQ_*
+ *
+ * See ->runnable() for explanation on the task state notifiers.
+ *
+ * @p is becoming quiescent on the CPU because it's
+ *
+ * - sleeping (%SCX_DEQ_SLEEP)
+ * - being moved to another CPU
+ * - being temporarily taken off the queue for an attribute change
+ * (%SCX_DEQ_SAVE)
+ *
+ * This and ->dequeue() are related but not coupled. This operation
+ * notifies @p's state transition and may not be preceded by ->dequeue()
+ * e.g. when @p is being dispatched to a remote CPU.
+ */
+ void (*quiescent)(struct task_struct *p, u64 deq_flags);
+
+ /**
+ * @yield: Yield CPU
+ * @from: yielding task
+ * @to: optional yield target task
+ *
+ * If @to is NULL, @from is yielding the CPU to other runnable tasks.
+ * The BPF scheduler should ensure that other available tasks are
+ * dispatched before the yielding task. Return value is ignored in this
+ * case.
+ *
+ * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf
+ * scheduler can implement the request, return %true; otherwise, %false.
+ */
+ bool (*yield)(struct task_struct *from, struct task_struct *to);
+
+ /**
+ * @core_sched_before: Task ordering for core-sched
+ * @a: task A
+ * @b: task B
+ *
+ * Used by core-sched to determine the ordering between two tasks. See
+ * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on
+ * core-sched.
+ *
+ * Both @a and @b are runnable and may or may not currently be queued on
+ * the BPF scheduler. Should return %true if @a should run before @b.
+ * %false if there's no required ordering or @b should run before @a.
+ *
+ * If not specified, the default is ordering them according to when they
+ * became runnable.
+ */
+ bool (*core_sched_before)(struct task_struct *a, struct task_struct *b);
+
+ /**
+ * @set_weight: Set task weight
+ * @p: task to set weight for
+ * @weight: new weight [1..10000]
+ *
+ * Update @p's weight to @weight.
+ */
+ void (*set_weight)(struct task_struct *p, u32 weight);
+
+ /**
+ * @set_cpumask: Set CPU affinity
+ * @p: task to set CPU affinity for
+ * @cpumask: cpumask of cpus that @p can run on
+ *
+ * Update @p's CPU affinity to @cpumask.
+ */
+ void (*set_cpumask)(struct task_struct *p,
+ const struct cpumask *cpumask);
+
+ /**
+ * @update_idle: Update the idle state of a CPU
+ * @cpu: CPU to update the idle state for
+ * @idle: whether entering or exiting the idle state
+ *
+ * This operation is called when @rq's CPU goes or leaves the idle
+ * state. By default, implementing this operation disables the built-in
+ * idle CPU tracking and the following helpers become unavailable:
+ *
+ * - scx_bpf_select_cpu_dfl()
+ * - scx_bpf_select_cpu_and()
+ * - scx_bpf_test_and_clear_cpu_idle()
+ * - scx_bpf_pick_idle_cpu()
+ *
+ * The user also must implement ops.select_cpu() as the default
+ * implementation relies on scx_bpf_select_cpu_dfl().
+ *
+ * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle
+ * tracking.
+ */
+ void (*update_idle)(s32 cpu, bool idle);
+
+ /**
+ * @cpu_acquire: A CPU is becoming available to the BPF scheduler
+ * @cpu: The CPU being acquired by the BPF scheduler.
+ * @args: Acquire arguments, see the struct definition.
+ *
+ * A CPU that was previously released from the BPF scheduler is now once
+ * again under its control.
+ */
+ void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
+
+ /**
+ * @cpu_release: A CPU is taken away from the BPF scheduler
+ * @cpu: The CPU being released by the BPF scheduler.
+ * @args: Release arguments, see the struct definition.
+ *
+ * The specified CPU is no longer under the control of the BPF
+ * scheduler. This could be because it was preempted by a higher
+ * priority sched_class, though there may be other reasons as well. The
+ * caller should consult @args->reason to determine the cause.
+ */
+ void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
+
+ /**
+ * @init_task: Initialize a task to run in a BPF scheduler
+ * @p: task to initialize for BPF scheduling
+ * @args: init arguments, see the struct definition
+ *
+ * Either we're loading a BPF scheduler or a new task is being forked.
+ * Initialize @p for BPF scheduling. This operation may block and can
+ * be used for allocations, and is called exactly once for a task.
+ *
+ * Return 0 for success, -errno for failure. An error return while
+ * loading will abort loading of the BPF scheduler. During a fork, it
+ * will abort that specific fork.
+ */
+ s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args);
+
+ /**
+ * @exit_task: Exit a previously-running task from the system
+ * @p: task to exit
+ * @args: exit arguments, see the struct definition
+ *
+ * @p is exiting or the BPF scheduler is being unloaded. Perform any
+ * necessary cleanup for @p.
+ */
+ void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args);
+
+ /**
+ * @enable: Enable BPF scheduling for a task
+ * @p: task to enable BPF scheduling for
+ *
+ * Enable @p for BPF scheduling. enable() is called on @p any time it
+ * enters SCX, and is always paired with a matching disable().
+ */
+ void (*enable)(struct task_struct *p);
+
+ /**
+ * @disable: Disable BPF scheduling for a task
+ * @p: task to disable BPF scheduling for
+ *
+ * @p is exiting, leaving SCX or the BPF scheduler is being unloaded.
+ * Disable BPF scheduling for @p. A disable() call is always matched
+ * with a prior enable() call.
+ */
+ void (*disable)(struct task_struct *p);
+
+ /**
+ * @dump: Dump BPF scheduler state on error
+ * @ctx: debug dump context
+ *
+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump.
+ */
+ void (*dump)(struct scx_dump_ctx *ctx);
+
+ /**
+ * @dump_cpu: Dump BPF scheduler state for a CPU on error
+ * @ctx: debug dump context
+ * @cpu: CPU to generate debug dump for
+ * @idle: @cpu is currently idle without any runnable tasks
+ *
+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
+ * @cpu. If @idle is %true and this operation doesn't produce any
+ * output, @cpu is skipped for dump.
+ */
+ void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle);
+
+ /**
+ * @dump_task: Dump BPF scheduler state for a runnable task on error
+ * @ctx: debug dump context
+ * @p: runnable task to generate debug dump for
+ *
+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
+ * @p.
+ */
+ void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p);
+
+#ifdef CONFIG_EXT_GROUP_SCHED
+ /**
+ * @cgroup_init: Initialize a cgroup
+ * @cgrp: cgroup being initialized
+ * @args: init arguments, see the struct definition
+ *
+ * Either the BPF scheduler is being loaded or @cgrp created, initialize
+ * @cgrp for sched_ext. This operation may block.
+ *
+ * Return 0 for success, -errno for failure. An error return while
+ * loading will abort loading of the BPF scheduler. During cgroup
+ * creation, it will abort the specific cgroup creation.
+ */
+ s32 (*cgroup_init)(struct cgroup *cgrp,
+ struct scx_cgroup_init_args *args);
+
+ /**
+ * @cgroup_exit: Exit a cgroup
+ * @cgrp: cgroup being exited
+ *
+ * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit
+	 * @cgrp for sched_ext. This operation may block.
+ */
+ void (*cgroup_exit)(struct cgroup *cgrp);
+
+ /**
+ * @cgroup_prep_move: Prepare a task to be moved to a different cgroup
+ * @p: task being moved
+ * @from: cgroup @p is being moved from
+ * @to: cgroup @p is being moved to
+ *
+ * Prepare @p for move from cgroup @from to @to. This operation may
+ * block and can be used for allocations.
+ *
+ * Return 0 for success, -errno for failure. An error return aborts the
+ * migration.
+ */
+ s32 (*cgroup_prep_move)(struct task_struct *p,
+ struct cgroup *from, struct cgroup *to);
+
+ /**
+ * @cgroup_move: Commit cgroup move
+ * @p: task being moved
+ * @from: cgroup @p is being moved from
+ * @to: cgroup @p is being moved to
+ *
+ * Commit the move. @p is dequeued during this operation.
+ */
+ void (*cgroup_move)(struct task_struct *p,
+ struct cgroup *from, struct cgroup *to);
+
+ /**
+ * @cgroup_cancel_move: Cancel cgroup move
+ * @p: task whose cgroup move is being canceled
+ * @from: cgroup @p was being moved from
+ * @to: cgroup @p was being moved to
+ *
+ * @p was cgroup_prep_move()'d but failed before reaching cgroup_move().
+ * Undo the preparation.
+ */
+ void (*cgroup_cancel_move)(struct task_struct *p,
+ struct cgroup *from, struct cgroup *to);
+
+ /**
+ * @cgroup_set_weight: A cgroup's weight is being changed
+ * @cgrp: cgroup whose weight is being updated
+ * @weight: new weight [1..10000]
+ *
+ * Update @cgrp's weight to @weight.
+ */
+ void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
+
+ /**
+ * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed
+ * @cgrp: cgroup whose bandwidth is being updated
+ * @period_us: bandwidth control period
+ * @quota_us: bandwidth control quota
+ * @burst_us: bandwidth control burst
+ *
+ * Update @cgrp's bandwidth control parameters. This is from the cpu.max
+ * cgroup interface.
+ *
+ * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled
+ * to. For example, if @period_us is 1_000_000 and @quota_us is
+	 * 2_500_000, @cgrp is entitled to 2.5 CPUs. @burst_us can be
+ * interpreted in the same fashion and specifies how much @cgrp can
+ * burst temporarily. The specific control mechanism and thus the
+	 * interpretation of @period_us and burstiness is up to the BPF
+ * scheduler.
+ */
+ void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
+ u64 period_us, u64 quota_us, u64 burst_us);
+
+#endif /* CONFIG_EXT_GROUP_SCHED */
+
+ /*
+ * All online ops must come before ops.cpu_online().
+ */
+
+ /**
+ * @cpu_online: A CPU became online
+ * @cpu: CPU which just came up
+ *
+ * @cpu just came online. @cpu will not call ops.enqueue() or
+ * ops.dispatch(), nor run tasks associated with other CPUs beforehand.
+ */
+ void (*cpu_online)(s32 cpu);
+
+ /**
+ * @cpu_offline: A CPU is going offline
+ * @cpu: CPU which is going offline
+ *
+ * @cpu is going offline. @cpu will not call ops.enqueue() or
+ * ops.dispatch(), nor run tasks associated with other CPUs afterwards.
+ */
+ void (*cpu_offline)(s32 cpu);
+
+ /*
+ * All CPU hotplug ops must come before ops.init().
+ */
+
+ /**
+ * @init: Initialize the BPF scheduler
+ */
+ s32 (*init)(void);
+
+ /**
+ * @exit: Clean up after the BPF scheduler
+ * @info: Exit info
+ *
+ * ops.exit() is also called on ops.init() failure, which is a bit
+ * unusual. This is to allow rich reporting through @info on how
+ * ops.init() failed.
+ */
+ void (*exit)(struct scx_exit_info *info);
+
+ /**
+ * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch
+ */
+ u32 dispatch_max_batch;
+
+ /**
+ * @flags: %SCX_OPS_* flags
+ */
+ u64 flags;
+
+ /**
+ * @timeout_ms: The maximum amount of time, in milliseconds, that a
+ * runnable task should be able to wait before being scheduled. The
+ * maximum timeout may not exceed the default timeout of 30 seconds.
+ *
+ * Defaults to the maximum allowed timeout value of 30 seconds.
+ */
+ u32 timeout_ms;
+
+ /**
+ * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default
+ * value of 32768 is used.
+ */
+ u32 exit_dump_len;
+
+ /**
+ * @hotplug_seq: A sequence number that may be set by the scheduler to
+ * detect when a hotplug event has occurred during the loading process.
+ * If 0, no detection occurs. Otherwise, the scheduler will fail to
+ * load if the sequence number does not match @scx_hotplug_seq on the
+ * enable path.
+ */
+ u64 hotplug_seq;
+
+ /**
+ * @name: BPF scheduler's name
+ *
+ * Must be a non-zero valid BPF object name including only isalnum(),
+ * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the
+ * BPF scheduler is enabled.
+ */
+ char name[SCX_OPS_NAME_LEN];
+
+ /* internal use only, must be NULL */
+ void *priv;
+};
+
+enum scx_opi {
+ SCX_OPI_BEGIN = 0,
+ SCX_OPI_NORMAL_BEGIN = 0,
+ SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online),
+ SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online),
+ SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init),
+ SCX_OPI_END = SCX_OP_IDX(init),
+};
+
+/*
+ * Collection of event counters. Event types are placed in descending order.
+ */
+struct scx_event_stats {
+ /*
+ * If ops.select_cpu() returns a CPU which can't be used by the task,
+ * the core scheduler code silently picks a fallback CPU.
+ */
+ s64 SCX_EV_SELECT_CPU_FALLBACK;
+
+ /*
+ * When dispatching to a local DSQ, the CPU may have gone offline in
+ * the meantime. In this case, the task is bounced to the global DSQ.
+ */
+ s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
+
+ /*
+ * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
+ * continued to run because there were no other tasks on the CPU.
+ */
+ s64 SCX_EV_DISPATCH_KEEP_LAST;
+
+ /*
+ * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
+ * is dispatched to a local DSQ when exiting.
+ */
+ s64 SCX_EV_ENQ_SKIP_EXITING;
+
+ /*
+ * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
+ * migration disabled task skips ops.enqueue() and is dispatched to its
+ * local DSQ.
+ */
+ s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
+
+ /*
+ * Total number of times a task's time slice was refilled with the
+ * default value (SCX_SLICE_DFL).
+ */
+ s64 SCX_EV_REFILL_SLICE_DFL;
+
+ /*
+ * The total duration of bypass modes in nanoseconds.
+ */
+ s64 SCX_EV_BYPASS_DURATION;
+
+ /*
+ * The number of tasks dispatched in the bypassing mode.
+ */
+ s64 SCX_EV_BYPASS_DISPATCH;
+
+ /*
+ * The number of times the bypassing mode has been activated.
+ */
+ s64 SCX_EV_BYPASS_ACTIVATE;
+};
+
+struct scx_sched_pcpu {
+ /*
+ * The event counters are in a per-CPU variable to minimize the
+ * accounting overhead. A system-wide view on the event counter is
+ * constructed when requested by scx_bpf_events().
+ */
+ struct scx_event_stats event_stats;
+};
+
+struct scx_sched {
+ struct sched_ext_ops ops;
+ DECLARE_BITMAP(has_op, SCX_OPI_END);
+
+ /*
+ * Dispatch queues.
+ *
+ * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability.
+ * This is to avoid live-locking in bypass mode where all tasks are
+ * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If
+ * per-node split isn't sufficient, it can be further split.
+ */
+ struct rhashtable dsq_hash;
+ struct scx_dispatch_q **global_dsqs;
+ struct scx_sched_pcpu __percpu *pcpu;
+
+ bool warned_zero_slice:1;
+ bool warned_deprecated_rq:1;
+
+ atomic_t exit_kind;
+ struct scx_exit_info *exit_info;
+
+ struct kobject kobj;
+
+ struct kthread_worker *helper;
+ struct irq_work error_irq_work;
+ struct kthread_work disable_work;
+ struct rcu_work rcu_work;
+};
+
+enum scx_wake_flags {
+ /* expose select WF_* flags as enums */
+ SCX_WAKE_FORK = WF_FORK,
+ SCX_WAKE_TTWU = WF_TTWU,
+ SCX_WAKE_SYNC = WF_SYNC,
+};
+
+enum scx_enq_flags {
+ /* expose select ENQUEUE_* flags as enums */
+ SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP,
+ SCX_ENQ_HEAD = ENQUEUE_HEAD,
+ SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED,
+
+ /* high 32bits are SCX specific */
+
+ /*
+ * Set the following to trigger preemption when calling
+ * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the
+ * current task is cleared to zero and the CPU is kicked into the
+ * scheduling path. Implies %SCX_ENQ_HEAD.
+ */
+ SCX_ENQ_PREEMPT = 1LLU << 32,
+
+ /*
+ * The task being enqueued was previously enqueued on the current CPU's
+ * %SCX_DSQ_LOCAL, but was removed from it in a call to the
+ * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was
+ * invoked in a ->cpu_release() callback, and the task is again
+	 * dispatched back to %SCX_DSQ_LOCAL by the current ->enqueue(), the
+	 * task will not be scheduled on the CPU until at least the next
+	 * invocation of the ->cpu_acquire() callback.
+ */
+ SCX_ENQ_REENQ = 1LLU << 40,
+
+ /*
+	 * The task being enqueued is the only task available for the CPU. By
+ * default, ext core keeps executing such tasks but when
+ * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the
+ * %SCX_ENQ_LAST flag set.
+ *
+ * The BPF scheduler is responsible for triggering a follow-up
+	 * scheduling event. Otherwise, execution may stall.
+ */
+ SCX_ENQ_LAST = 1LLU << 41,
+
+ /* high 8 bits are internal */
+ __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56,
+
+ SCX_ENQ_CLEAR_OPSS = 1LLU << 56,
+ SCX_ENQ_DSQ_PRIQ = 1LLU << 57,
+};
+
+enum scx_deq_flags {
+ /* expose select DEQUEUE_* flags as enums */
+ SCX_DEQ_SLEEP = DEQUEUE_SLEEP,
+
+ /* high 32bits are SCX specific */
+
+ /*
+ * The generic core-sched layer decided to execute the task even though
+ * it hasn't been dispatched yet. Dequeue from the BPF side.
+ */
+ SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32,
+};
+
+enum scx_pick_idle_cpu_flags {
+ SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */
+ SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */
+};
+
+enum scx_kick_flags {
+ /*
+ * Kick the target CPU if idle. Guarantees that the target CPU goes
+ * through at least one full scheduling cycle before going idle. If the
+	 * target CPU can be determined to be currently not idle and about to go
+	 * through a scheduling cycle before going idle, this is a noop.
+ */
+ SCX_KICK_IDLE = 1LLU << 0,
+
+ /*
+ * Preempt the current task and execute the dispatch path. If the
+ * current task of the target CPU is an SCX task, its ->scx.slice is
+ * cleared to zero before the scheduling path is invoked so that the
+ * task expires and the dispatch path is invoked.
+ */
+ SCX_KICK_PREEMPT = 1LLU << 1,
+
+ /*
+ * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will
+ * return after the target CPU finishes picking the next task.
+ */
+ SCX_KICK_WAIT = 1LLU << 2,
+};
+
+enum scx_tg_flags {
+ SCX_TG_ONLINE = 1U << 0,
+ SCX_TG_INITED = 1U << 1,
+};
+
+enum scx_enable_state {
+ SCX_ENABLING,
+ SCX_ENABLED,
+ SCX_DISABLING,
+ SCX_DISABLED,
+};
+
+static const char *scx_enable_state_str[] = {
+ [SCX_ENABLING] = "enabling",
+ [SCX_ENABLED] = "enabled",
+ [SCX_DISABLING] = "disabling",
+ [SCX_DISABLED] = "disabled",
+};
+
+/*
+ * sched_ext_entity->ops_state
+ *
+ * Used to track the task ownership between the SCX core and the BPF scheduler.
+ * State transitions look as follows:
+ *
+ * NONE -> QUEUEING -> QUEUED -> DISPATCHING
+ * ^ | |
+ * | v v
+ * \-------------------------------/
+ *
+ * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
+ * sites for explanations on the conditions being waited upon and why they are
+ * safe. Transitions out of them into NONE or QUEUED must store_release and the
+ * waiters should load_acquire.
+ *
+ * Tracking scx_ops_state enables sched_ext core to reliably determine whether
+ * any given task can be dispatched by the BPF scheduler at all times and thus
+ * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
+ * to try to dispatch any task anytime regardless of its state as the SCX core
+ * can safely reject invalid dispatches.
+ */
+enum scx_ops_state {
+ SCX_OPSS_NONE, /* owned by the SCX core */
+ SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */
+ SCX_OPSS_QUEUED, /* owned by the BPF scheduler */
+ SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */
+
+ /*
+ * QSEQ brands each QUEUED instance so that, when dispatch races
+ * dequeue/requeue, the dispatcher can tell whether it still has a claim
+ * on the task being dispatched.
+ *
+ * As some 32bit archs can't do 64bit store_release/load_acquire,
+ * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on
+ * 32bit machines. The dispatch race window QSEQ protects is very narrow
+ * and runs with IRQ disabled. 30 bits should be sufficient.
+ */
+ SCX_OPSS_QSEQ_SHIFT = 2,
+};
+
+/* Use macros to ensure that the type is unsigned long for the masks */
+#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
+#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK)
+
+DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
+
+/*
+ * Return the rq currently locked from an scx callback, or NULL if no rq is
+ * locked.
+ */
+static inline struct rq *scx_locked_rq(void)
+{
+ return __this_cpu_read(scx_locked_rq_state);
+}
+
+static inline bool scx_kf_allowed_if_unlocked(void)
+{
+ return !current->scx.kf_mask;
+}
+
+static inline bool scx_rq_bypassing(struct rq *rq)
+{
+ return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
+}
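
The ops_state encoding above packs a 2-bit state and a QSEQ brand into p->scx.ops_state. The following is a minimal sketch, not code from this patch, of how a dispatcher could use the masks to confirm it still owns the QUEUED instance it was handed; claim_queued_task() and qseq_at_dispatch are hypothetical names.

static bool claim_queued_task(struct task_struct *p, unsigned long qseq_at_dispatch)
{
	long opss = atomic_long_read_acquire(&p->scx.ops_state);

	/* must still be QUEUED and still be the same QUEUED instance */
	if ((opss & SCX_OPSS_STATE_MASK) != SCX_OPSS_QUEUED ||
	    (opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch)
		return false;

	/* QUEUED -> DISPATCHING; losing the cmpxchg means a dequeue/requeue raced us */
	return atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
				       SCX_OPSS_DISPATCHING | qseq_at_dispatch);
}

A real dispatcher would also have to follow the store_release/load_acquire rules described above when later moving the task back to NONE or QUEUED.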
diff --git a/tools/sched_ext/include/scx/bpf_arena_common.bpf.h b/tools/sched_ext/include/scx/bpf_arena_common.bpf.h
new file mode 100644
index 000000000000..4366fb3c91ce
--- /dev/null
+++ b/tools/sched_ext/include/scx/bpf_arena_common.bpf.h
@@ -0,0 +1,175 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#pragma once
+
+#ifndef PAGE_SIZE
+#define PAGE_SIZE __PAGE_SIZE
+/*
+ * For older kernels, try sizeof(struct genradix_node), or the more flexible:
+ * static inline long __bpf_page_size(void) {
+ * return bpf_core_enum_value(enum page_size_enum___l, __PAGE_SIZE___l) ?: sizeof(struct genradix_node);
+ * }
+ * but the generated code is not great.
+ */
+#endif
+
+#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) && !defined(BPF_ARENA_FORCE_ASM)
+#define __arena __attribute__((address_space(1)))
+#define __arena_global __attribute__((address_space(1)))
+#define cast_kern(ptr) /* nop for bpf prog. emitted by LLVM */
+#define cast_user(ptr) /* nop for bpf prog. emitted by LLVM */
+#else
+
+/* emit instruction:
+ * rX = rX .off = BPF_ADDR_SPACE_CAST .imm32 = (dst_as << 16) | src_as
+ *
+ * This is a workaround for LLVM compiler versions without
+ * __BPF_FEATURE_ADDR_SPACE_CAST that do not automatically cast between arena
+ * pointers and native kernel/userspace ones. In this case we explicitly do so
+ * with cast_kern() and cast_user(). E.g., in the Linux kernel tree,
+ * tools/testing/selftests/bpf includes tests that use these macros to implement
+ * linked lists and hashtables backed by arena memory. In sched_ext, we use
+ * cast_kern() and cast_user() for compatibility with older LLVM toolchains.
+ */
+#ifndef bpf_addr_space_cast
+#define bpf_addr_space_cast(var, dst_as, src_as)\
+ asm volatile(".byte 0xBF; \
+ .ifc %[reg], r0; \
+ .byte 0x00; \
+ .endif; \
+ .ifc %[reg], r1; \
+ .byte 0x11; \
+ .endif; \
+ .ifc %[reg], r2; \
+ .byte 0x22; \
+ .endif; \
+ .ifc %[reg], r3; \
+ .byte 0x33; \
+ .endif; \
+ .ifc %[reg], r4; \
+ .byte 0x44; \
+ .endif; \
+ .ifc %[reg], r5; \
+ .byte 0x55; \
+ .endif; \
+ .ifc %[reg], r6; \
+ .byte 0x66; \
+ .endif; \
+ .ifc %[reg], r7; \
+ .byte 0x77; \
+ .endif; \
+ .ifc %[reg], r8; \
+ .byte 0x88; \
+ .endif; \
+ .ifc %[reg], r9; \
+ .byte 0x99; \
+ .endif; \
+ .short %[off]; \
+ .long %[as]" \
+ : [reg]"+r"(var) \
+ : [off]"i"(BPF_ADDR_SPACE_CAST) \
+ , [as]"i"((dst_as << 16) | src_as));
+#endif
+
+#define __arena
+#define __arena_global SEC(".addr_space.1")
+#define cast_kern(ptr) bpf_addr_space_cast(ptr, 0, 1)
+#define cast_user(ptr) bpf_addr_space_cast(ptr, 1, 0)
+#endif
+
+void __arena* bpf_arena_alloc_pages(void *map, void __arena *addr, __u32 page_cnt,
+ int node_id, __u64 flags) __ksym __weak;
+void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym __weak;
+
+/*
+ * Note that cond_break can only be portably used in the body of a breakable
+ * construct, whereas can_loop can be used anywhere.
+ */
+#ifdef TEST
+#define can_loop true
+#define __cond_break(expr) expr
+#else
+#ifdef __BPF_FEATURE_MAY_GOTO
+#define can_loop \
+ ({ __label__ l_break, l_continue; \
+ bool ret = true; \
+ asm volatile goto("may_goto %l[l_break]" \
+ :::: l_break); \
+ goto l_continue; \
+ l_break: ret = false; \
+ l_continue:; \
+ ret; \
+ })
+
+#define __cond_break(expr) \
+ ({ __label__ l_break, l_continue; \
+ asm volatile goto("may_goto %l[l_break]" \
+ :::: l_break); \
+ goto l_continue; \
+ l_break: expr; \
+ l_continue:; \
+ })
+#else
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define can_loop \
+ ({ __label__ l_break, l_continue; \
+ bool ret = true; \
+ asm volatile goto("1:.byte 0xe5; \
+ .byte 0; \
+ .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \
+ .short 0" \
+ :::: l_break); \
+ goto l_continue; \
+ l_break: ret = false; \
+ l_continue:; \
+ ret; \
+ })
+
+#define __cond_break(expr) \
+ ({ __label__ l_break, l_continue; \
+ asm volatile goto("1:.byte 0xe5; \
+ .byte 0; \
+ .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \
+ .short 0" \
+ :::: l_break); \
+ goto l_continue; \
+ l_break: expr; \
+ l_continue:; \
+ })
+#else
+#define can_loop \
+ ({ __label__ l_break, l_continue; \
+ bool ret = true; \
+ asm volatile goto("1:.byte 0xe5; \
+ .byte 0; \
+ .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \
+ .short 0" \
+ :::: l_break); \
+ goto l_continue; \
+ l_break: ret = false; \
+ l_continue:; \
+ ret; \
+ })
+
+#define __cond_break(expr) \
+ ({ __label__ l_break, l_continue; \
+ asm volatile goto("1:.byte 0xe5; \
+ .byte 0; \
+ .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \
+ .short 0" \
+ :::: l_break); \
+ goto l_continue; \
+ l_break: expr; \
+ l_continue:; \
+ })
+#endif /* __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ */
+#endif /* __BPF_FEATURE_MAY_GOTO */
+#endif /* TEST */
+
+#define cond_break __cond_break(break)
+#define cond_break_label(label) __cond_break(goto label)
+
+
+void bpf_preempt_disable(void) __weak __ksym;
+void bpf_preempt_enable(void) __weak __ksym;
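
can_loop and cond_break above let BPF programs bound loops for the verifier. A short usage sketch; sketch_queue and drain_sketch_queue() are made-up names and not part of this header.

struct {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 4096);
	__type(value, u32);
} sketch_queue SEC(".maps");

static __always_inline u32 drain_sketch_queue(void)
{
	u32 pid, cnt = 0;

	/* can_loop can gate the loop condition directly */
	while (bpf_map_pop_elem(&sketch_queue, &pid) == 0 && can_loop)
		cnt++;

	/* cond_break must sit inside a breakable construct such as this loop body */
	bpf_repeat(4096) {
		if (bpf_map_pop_elem(&sketch_queue, &pid))
			break;
		cnt++;
		cond_break;
	}

	return cnt;
}

Both expand to the may_goto instruction (or its raw byte encoding above), which lets the verifier bound otherwise open-ended loops.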
diff --git a/tools/sched_ext/include/scx/bpf_arena_common.h b/tools/sched_ext/include/scx/bpf_arena_common.h
new file mode 100644
index 000000000000..10141db0b59d
--- /dev/null
+++ b/tools/sched_ext/include/scx/bpf_arena_common.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#pragma once
+
+#ifndef arena_container_of
+#define arena_container_of(ptr, type, member) \
+ ({ \
+ void __arena *__mptr = (void __arena *)(ptr); \
+ ((type *)(__mptr - offsetof(type, member))); \
+ })
+#endif
+
+/* Provide the definition of PAGE_SIZE. */
+#include <sys/user.h>
+
+#define __arena
+#define __arg_arena
+#define cast_kern(ptr) /* nop for user space */
+#define cast_user(ptr) /* nop for user space */
+char __attribute__((weak)) arena[1];
+
+#ifndef offsetof
+#define offsetof(type, member) ((unsigned long)&((type *)0)->member)
+#endif
+
+static inline void __arena* bpf_arena_alloc_pages(void *map, void *addr, __u32 page_cnt,
+ int node_id, __u64 flags)
+{
+ return NULL;
+}
+static inline void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt)
+{
+}
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index d4e21558e982..06e2551033cb 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -24,14 +24,26 @@
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <asm-generic/errno.h>
-#include "user_exit_info.h"
+#include "user_exit_info.bpf.h"
#include "enum_defs.autogen.h"
+#define PF_IDLE 0x00000002 /* I am an IDLE thread */
+#define PF_IO_WORKER 0x00000010 /* Task is an IO worker */
#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */
+#define PF_KCOMPACTD 0x00010000 /* I am kcompactd */
+#define PF_KSWAPD 0x00020000 /* I am kswapd */
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
#define PF_EXITING 0x00000004
#define CLOCK_MONOTONIC 1
+#ifndef NR_CPUS
+#define NR_CPUS 1024
+#endif
+
+#ifndef NUMA_NO_NODE
+#define NUMA_NO_NODE (-1)
+#endif
+
extern int LINUX_KERNEL_VERSION __kconfig;
extern const char CONFIG_CC_VERSION_TEXT[64] __kconfig __weak;
extern const char CONFIG_LOCALVERSION[64] __kconfig __weak;
@@ -91,6 +103,8 @@ s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
bool scx_bpf_task_running(const struct task_struct *p) __ksym;
s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym;
+struct rq *scx_bpf_locked_rq(void) __ksym;
+struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak;
struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak;
u64 scx_bpf_now(void) __ksym __weak;
void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak;
@@ -107,6 +121,9 @@ void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __
static inline __attribute__((format(printf, 1, 2)))
void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {}
+#define SCX_STRINGIFY(x) #x
+#define SCX_TOSTRING(x) SCX_STRINGIFY(x)
+
/*
* Helper macro for initializing the fmt and variadic argument inputs to both
* bstr exit kfuncs. Callers to this function should use ___fmt and ___param to
@@ -141,13 +158,15 @@ void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {}
* scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments
* instead of an array of u64. Invoking this macro will cause the scheduler to
* exit in an erroneous state, with diagnostic information being passed to the
- * user.
+ * user. It appends the file and line number to aid debugging.
*/
#define scx_bpf_error(fmt, args...) \
({ \
- scx_bpf_bstr_preamble(fmt, args) \
+ scx_bpf_bstr_preamble( \
+ __FILE__ ":" SCX_TOSTRING(__LINE__) ": " fmt, ##args) \
scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \
- ___scx_bpf_bstr_format_checker(fmt, ##args); \
+ ___scx_bpf_bstr_format_checker( \
+ __FILE__ ":" SCX_TOSTRING(__LINE__) ": " fmt, ##args); \
})
/*
@@ -229,6 +248,7 @@ BPF_PROG(name, ##args)
* be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of
* `MEMBER_VPTR(ptr, ->member)`.
*/
+#ifndef MEMBER_VPTR
#define MEMBER_VPTR(base, member) (typeof((base) member) *) \
({ \
u64 __base = (u64)&(base); \
@@ -245,6 +265,7 @@ BPF_PROG(name, ##args)
[max]"i"(sizeof(base) - sizeof((base) member))); \
__addr; \
})
+#endif /* MEMBER_VPTR */
/**
* ARRAY_ELEM_PTR - Obtain the verified pointer to an array element
@@ -260,6 +281,7 @@ BPF_PROG(name, ##args)
* size of the array to compute the max, which will result in rejection by
* the verifier.
*/
+#ifndef ARRAY_ELEM_PTR
#define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *) \
({ \
u64 __base = (u64)arr; \
@@ -274,7 +296,7 @@ BPF_PROG(name, ##args)
[max]"r"(sizeof(arr[0]) * ((n) - 1))); \
__addr; \
})
-
+#endif /* ARRAY_ELEM_PTR */
/*
* BPF declarations and helpers
@@ -438,8 +460,27 @@ static __always_inline const struct cpumask *cast_mask(struct bpf_cpumask *mask)
*/
static inline bool is_migration_disabled(const struct task_struct *p)
{
- if (bpf_core_field_exists(p->migration_disabled))
- return p->migration_disabled;
+ /*
+	 * Testing p->migration_disabled in BPF code is tricky because migration
+	 * is _always_ disabled while BPF code is running.
+ * The prolog (__bpf_prog_enter) and epilog (__bpf_prog_exit) for BPF
+ * code execution disable and re-enable the migration of the current
+ * task, respectively. So, the _current_ task of the sched_ext ops is
+ * always migration-disabled. Moreover, p->migration_disabled could be
+	 * two or greater when a sched_ext ops BPF program (e.g., ops.tick) is
+	 * executed in the middle of another BPF program's execution.
+ *
+ * Therefore, we should decide that the _current_ task is
+ * migration-disabled only when its migration_disabled count is greater
+ * than one. In other words, when p->migration_disabled == 1, there is
+ * an ambiguity, so we should check if @p is the current task or not.
+ */
+ if (bpf_core_field_exists(p->migration_disabled)) {
+ if (p->migration_disabled == 1)
+ return bpf_get_current_task_btf() != p;
+ else
+ return p->migration_disabled;
+ }
return false;
}
@@ -476,7 +517,7 @@ static inline s64 time_delta(u64 after, u64 before)
*/
static inline bool time_after(u64 a, u64 b)
{
- return (s64)(b - a) < 0;
+ return (s64)(b - a) < 0;
}
/**
@@ -500,7 +541,7 @@ static inline bool time_before(u64 a, u64 b)
*/
static inline bool time_after_eq(u64 a, u64 b)
{
- return (s64)(a - b) >= 0;
+ return (s64)(a - b) >= 0;
}
/**
@@ -547,9 +588,15 @@ static inline bool time_in_range_open(u64 a, u64 b, u64 c)
*/
/* useful compiler attributes */
+#ifndef likely
#define likely(x) __builtin_expect(!!(x), 1)
+#endif
+#ifndef unlikely
#define unlikely(x) __builtin_expect(!!(x), 0)
+#endif
+#ifndef __maybe_unused
#define __maybe_unused __attribute__((__unused__))
+#endif
/*
* READ/WRITE_ONCE() are from kernel (include/asm-generic/rwonce.h). They
@@ -633,6 +680,26 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
})
/*
+ * __calc_avg - Calculate an exponentially weighted moving average (EWMA) of
+ * @old and @new. @decay is a shift count controlling how much of the @old
+ * value is retained. With a larger @decay, the moving average changes more
+ * slowly and exhibits fewer fluctuations.
+ */
+#define __calc_avg(old, new, decay) ({ \
+ typeof(decay) thr = 1 << (decay); \
+ typeof(old) ret; \
+ if (((old) < thr) || ((new) < thr)) { \
+ if (((old) == 1) && ((new) == 0)) \
+ ret = 0; \
+ else \
+ ret = ((old) - ((old) >> 1)) + ((new) >> 1); \
+ } else { \
+ ret = ((old) - ((old) >> (decay))) + ((new) >> (decay)); \
+ } \
+ ret; \
+})
+
+/*
* log2_u32 - Compute the base 2 logarithm of a 32-bit exponential value.
* @v: The value for which we're computing the base 2 logarithm.
*/
@@ -663,6 +730,25 @@ static inline u32 log2_u64(u64 v)
}
/*
+ * __sqrt_u64 - Calculate the square root of value @x using Newton's method.
+ */
+static inline u64 __sqrt_u64(u64 x)
+{
+ if (x == 0 || x == 1)
+ return x;
+
+ u64 r = ((1ULL << 32) > x) ? x : (1ULL << 32);
+
+ for (int i = 0; i < 8; ++i) {
+ u64 q = x / r;
+ if (r <= q)
+ break;
+ r = (r + q) >> 1;
+ }
+ return r;
+}
+
+/*
* Return a value proportionally scaled to the task's weight.
*/
static inline u64 scale_by_task_weight(const struct task_struct *p, u64 value)
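
As a usage sketch for __calc_avg() above, a scheduler could keep an EWMA of observed runtimes; avg_runtime_ns and update_avg_runtime() are hypothetical names, not part of this header.

static u64 avg_runtime_ns;	/* assumed per-scheduler state */

static __always_inline void update_avg_runtime(u64 sample_ns)
{
	/* decay == 2: keep roughly 3/4 of the old average, take roughly 1/4 of the sample */
	avg_runtime_ns = __calc_avg(avg_runtime_ns, sample_ns, 2);
}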
diff --git a/tools/sched_ext/include/scx/common.h b/tools/sched_ext/include/scx/common.h
index 1dc76bd84296..b3c6372bcf81 100644
--- a/tools/sched_ext/include/scx/common.h
+++ b/tools/sched_ext/include/scx/common.h
@@ -75,8 +75,9 @@ typedef int64_t s64;
#include "enums.h"
/* not available when building kernel tools/sched_ext */
-#if __has_include(<lib/sdt_task.h>)
-#include <lib/sdt_task.h>
+#if __has_include(<lib/sdt_task_defs.h>)
+#include "bpf_arena_common.h"
+#include <lib/sdt_task_defs.h>
#endif
#endif /* __SCHED_EXT_COMMON_H */
diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h
index 9252e1a00556..dd9144624dc9 100644
--- a/tools/sched_ext/include/scx/compat.bpf.h
+++ b/tools/sched_ext/include/scx/compat.bpf.h
@@ -38,6 +38,7 @@ void scx_bpf_dispatch_from_dsq_set_slice___compat(struct bpf_iter_scx_dsq *it__i
void scx_bpf_dispatch_from_dsq_set_vtime___compat(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
bool scx_bpf_dispatch_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
+int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym __weak;
#define scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags) \
(bpf_ksym_exists(scx_bpf_dsq_insert) ? \
@@ -82,6 +83,10 @@ bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter,
scx_bpf_dispatch_vtime_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \
false))
+#define __COMPAT_bpf_cpumask_populate(cpumask, src, size__sz) \
+ (bpf_ksym_exists(bpf_cpumask_populate) ? \
+ (bpf_cpumask_populate(cpumask, src, size__sz)) : -EOPNOTSUPP)
+
#define scx_bpf_dispatch(p, dsq_id, slice, enq_flags) \
_Static_assert(false, "scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()")
@@ -226,6 +231,23 @@ static inline bool __COMPAT_is_enq_cpu_selected(u64 enq_flags)
scx_bpf_pick_any_cpu(cpus_allowed, flags))
/*
+ * v6.18: Add a helper to retrieve the current task running on a CPU.
+ *
+ * Keep this helper available until v6.20 for compatibility.
+ */
+static inline struct task_struct *__COMPAT_scx_bpf_cpu_curr(int cpu)
+{
+ struct rq *rq;
+
+ if (bpf_ksym_exists(scx_bpf_cpu_curr))
+ return scx_bpf_cpu_curr(cpu);
+
+ rq = scx_bpf_cpu_rq(cpu);
+
+ return rq ? rq->curr : NULL;
+}
+
+/*
* Define sched_ext_ops. This may be expanded to define multiple variants for
* backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().
*/
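
A hedged usage sketch for __COMPAT_scx_bpf_cpu_curr() above; count_running_kthreads() is a made-up name and nr_cpu_ids is assumed to be a rodata constant set by the loader. The RCU read lock is held around the access since the fallback path dereferences rq->curr.

const volatile u32 nr_cpu_ids = 1;	/* assumed to be overridden before load */

static void count_running_kthreads(void)
{
	u32 nr_kthreads = 0;
	s32 cpu;

	bpf_for(cpu, 0, nr_cpu_ids) {
		struct task_struct *p;

		bpf_rcu_read_lock();
		p = __COMPAT_scx_bpf_cpu_curr(cpu);
		if (p && (p->flags & PF_KTHREAD))
			nr_kthreads++;
		bpf_rcu_read_unlock();
	}

	bpf_printk("running kthreads: %u", nr_kthreads);
}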
diff --git a/tools/sched_ext/include/scx/user_exit_info.bpf.h b/tools/sched_ext/include/scx/user_exit_info.bpf.h
new file mode 100644
index 000000000000..e7ac6611a990
--- /dev/null
+++ b/tools/sched_ext/include/scx/user_exit_info.bpf.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Define struct user_exit_info which is shared between BPF and userspace parts
+ * to communicate exit status and other information.
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+
+#ifndef __USER_EXIT_INFO_BPF_H
+#define __USER_EXIT_INFO_BPF_H
+
+#ifndef LSP
+#include "vmlinux.h"
+#endif
+#include <bpf/bpf_core_read.h>
+
+#include "user_exit_info_common.h"
+
+#define UEI_DEFINE(__name) \
+ char RESIZABLE_ARRAY(data, __name##_dump); \
+ const volatile u32 __name##_dump_len; \
+ struct user_exit_info __name SEC(".data")
+
+#define UEI_RECORD(__uei_name, __ei) ({ \
+ bpf_probe_read_kernel_str(__uei_name.reason, \
+ sizeof(__uei_name.reason), (__ei)->reason); \
+ bpf_probe_read_kernel_str(__uei_name.msg, \
+ sizeof(__uei_name.msg), (__ei)->msg); \
+ bpf_probe_read_kernel_str(__uei_name##_dump, \
+ __uei_name##_dump_len, (__ei)->dump); \
+ if (bpf_core_field_exists((__ei)->exit_code)) \
+ __uei_name.exit_code = (__ei)->exit_code; \
+ /* use __sync to force memory barrier */ \
+ __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \
+ (__ei)->kind); \
+})
+
+#endif /* __USER_EXIT_INFO_BPF_H */
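
The macros above mirror how the example schedulers in this tree record exit state on the BPF side; a minimal sketch, with sketch_exit() as a placeholder name.

UEI_DEFINE(uei);

void BPF_STRUCT_OPS(sketch_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

Userspace then consumes the same struct user_exit_info through the macros that remain in user_exit_info.h, e.g. UEI_SET_SIZE() and UEI_ECODE_RESTART() seen later in this diff.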
diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h
index 66f856640ee7..399697fa372f 100644
--- a/tools/sched_ext/include/scx/user_exit_info.h
+++ b/tools/sched_ext/include/scx/user_exit_info.h
@@ -10,55 +10,11 @@
#ifndef __USER_EXIT_INFO_H
#define __USER_EXIT_INFO_H
-#ifdef LSP
-#define __bpf__
-#include "../vmlinux.h"
-#endif
-
-enum uei_sizes {
- UEI_REASON_LEN = 128,
- UEI_MSG_LEN = 1024,
- UEI_DUMP_DFL_LEN = 32768,
-};
-
-struct user_exit_info {
- int kind;
- s64 exit_code;
- char reason[UEI_REASON_LEN];
- char msg[UEI_MSG_LEN];
-};
-
-#ifdef __bpf__
-
-#ifndef LSP
-#include "vmlinux.h"
-#endif
-#include <bpf/bpf_core_read.h>
-
-#define UEI_DEFINE(__name) \
- char RESIZABLE_ARRAY(data, __name##_dump); \
- const volatile u32 __name##_dump_len; \
- struct user_exit_info __name SEC(".data")
-
-#define UEI_RECORD(__uei_name, __ei) ({ \
- bpf_probe_read_kernel_str(__uei_name.reason, \
- sizeof(__uei_name.reason), (__ei)->reason); \
- bpf_probe_read_kernel_str(__uei_name.msg, \
- sizeof(__uei_name.msg), (__ei)->msg); \
- bpf_probe_read_kernel_str(__uei_name##_dump, \
- __uei_name##_dump_len, (__ei)->dump); \
- if (bpf_core_field_exists((__ei)->exit_code)) \
- __uei_name.exit_code = (__ei)->exit_code; \
- /* use __sync to force memory barrier */ \
- __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \
- (__ei)->kind); \
-})
-
-#else /* !__bpf__ */
-
#include <stdio.h>
#include <stdbool.h>
+#include "user_exit_info_common.h"
+
/* no need to call the following explicitly if SCX_OPS_LOAD() is used */
#define UEI_SET_SIZE(__skel, __ops_name, __uei_name) ({ \
u32 __len = (__skel)->struct_ops.__ops_name->exit_dump_len ?: UEI_DUMP_DFL_LEN; \
@@ -114,5 +70,4 @@ enum uei_ecode_mask {
#define UEI_ECODE_RESTART(__ecode) (UEI_ECODE_SYS_ACT((__ecode)) == SCX_ECODE_ACT_RESTART)
-#endif /* __bpf__ */
#endif /* __USER_EXIT_INFO_H */
diff --git a/tools/sched_ext/include/scx/user_exit_info_common.h b/tools/sched_ext/include/scx/user_exit_info_common.h
new file mode 100644
index 000000000000..2d0981aedd89
--- /dev/null
+++ b/tools/sched_ext/include/scx/user_exit_info_common.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Define struct user_exit_info which is shared between BPF and userspace parts
+ * to communicate exit status and other information.
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+#ifndef __USER_EXIT_INFO_COMMON_H
+#define __USER_EXIT_INFO_COMMON_H
+
+#ifdef LSP
+#include "../vmlinux.h"
+#endif
+
+enum uei_sizes {
+ UEI_REASON_LEN = 128,
+ UEI_MSG_LEN = 1024,
+ UEI_DUMP_DFL_LEN = 32768,
+};
+
+struct user_exit_info {
+ int kind;
+ s64 exit_code;
+ char reason[UEI_REASON_LEN];
+ char msg[UEI_MSG_LEN];
+};
+
+#endif /* __USER_EXIT_INFO_COMMON_H */
diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c
index 50bc1737c167..55df8b798865 100644
--- a/tools/sched_ext/scx_central.bpf.c
+++ b/tools/sched_ext/scx_central.bpf.c
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
- * A central FIFO sched_ext scheduler which demonstrates the followings:
+ * A central FIFO sched_ext scheduler which demonstrates the following:
*
* a. Making all scheduling decisions from one CPU:
*
diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c
index 6ba6e610eeaa..55931a4cd71c 100644
--- a/tools/sched_ext/scx_central.c
+++ b/tools/sched_ext/scx_central.c
@@ -61,6 +61,7 @@ restart:
skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus();
skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL");
+ assert(skel->rodata->nr_cpu_ids > 0);
assert(skel->rodata->nr_cpu_ids <= INT32_MAX);
while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) {
diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c
index fdc7170639e6..2c720e3ecad5 100644
--- a/tools/sched_ext/scx_flatcg.bpf.c
+++ b/tools/sched_ext/scx_flatcg.bpf.c
@@ -950,5 +950,5 @@ SCX_OPS_DEFINE(flatcg_ops,
.cgroup_move = (void *)fcg_cgroup_move,
.init = (void *)fcg_init,
.exit = (void *)fcg_exit,
- .flags = SCX_OPS_ENQ_EXITING,
+ .flags = SCX_OPS_HAS_CGROUP_WEIGHT | SCX_OPS_ENQ_EXITING,
.name = "flatcg");
diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c
index 6dd423eeb4ff..cd85eb401179 100644
--- a/tools/sched_ext/scx_flatcg.c
+++ b/tools/sched_ext/scx_flatcg.c
@@ -6,6 +6,7 @@
*/
#include <stdio.h>
#include <signal.h>
+#include <assert.h>
#include <unistd.h>
#include <libgen.h>
#include <limits.h>
@@ -137,6 +138,7 @@ restart:
skel = SCX_OPS_OPEN(flatcg_ops, scx_flatcg);
skel->rodata->nr_cpus = libbpf_num_possible_cpus();
+ assert(skel->rodata->nr_cpus > 0);
skel->rodata->cgrp_slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL");
while ((opt = getopt(argc, argv, "s:i:dfvh")) != -1) {
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 69d877501cb7..3072b593f898 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -39,7 +39,8 @@ const volatile u32 stall_kernel_nth;
const volatile u32 dsp_inf_loop_after;
const volatile u32 dsp_batch;
const volatile bool highpri_boosting;
-const volatile bool print_shared_dsq;
+const volatile bool print_dsqs_and_events;
+const volatile bool print_msgs;
const volatile s32 disallow_tgid;
const volatile bool suppress_dump;
@@ -56,7 +57,8 @@ struct qmap {
queue1 SEC(".maps"),
queue2 SEC(".maps"),
queue3 SEC(".maps"),
- queue4 SEC(".maps");
+ queue4 SEC(".maps"),
+ dump_store SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
@@ -578,11 +580,26 @@ void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
return;
scx_bpf_dump("QMAP FIFO[%d]:", i);
+
+ /*
+ * Dump can be invoked anytime and there is no way to iterate in
+ * a non-destructive way. Pop and store in dump_store and then
+ * restore afterwards. If racing against new enqueues, ordering
+ * can get mixed up.
+ */
bpf_repeat(4096) {
if (bpf_map_pop_elem(fifo, &pid))
break;
+ bpf_map_push_elem(&dump_store, &pid, 0);
scx_bpf_dump(" %d", pid);
}
+
+ bpf_repeat(4096) {
+ if (bpf_map_pop_elem(&dump_store, &pid))
+ break;
+ bpf_map_push_elem(fifo, &pid, 0);
+ }
+
scx_bpf_dump("\n");
}
}
@@ -617,22 +634,25 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc
s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args)
{
- bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu",
- cgrp->kn->id, args->weight, args->bw_period_us,
- args->bw_quota_us, args->bw_burst_us);
+ if (print_msgs)
+ bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu",
+ cgrp->kn->id, args->weight, args->bw_period_us,
+ args->bw_quota_us, args->bw_burst_us);
return 0;
}
void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
{
- bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight);
+ if (print_msgs)
+ bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight);
}
void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp,
u64 period_us, u64 quota_us, u64 burst_us)
{
- bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu", cgrp->kn->id,
- period_us, quota_us, burst_us);
+ if (print_msgs)
+ bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu",
+ cgrp->kn->id, period_us, quota_us, burst_us);
}
/*
@@ -676,16 +696,20 @@ static void print_cpus(void)
void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
{
- bpf_printk("CPU %d coming online", cpu);
- /* @cpu is already online at this point */
- print_cpus();
+ if (print_msgs) {
+ bpf_printk("CPU %d coming online", cpu);
+ /* @cpu is already online at this point */
+ print_cpus();
+ }
}
void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
{
- bpf_printk("CPU %d going offline", cpu);
- /* @cpu is still online at this point */
- print_cpus();
+ if (print_msgs) {
+ bpf_printk("CPU %d going offline", cpu);
+ /* @cpu is still online at this point */
+ print_cpus();
+ }
}
struct monitor_timer {
@@ -783,35 +807,36 @@ static void dump_shared_dsq(void)
static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
{
- struct scx_event_stats events;
-
bpf_rcu_read_lock();
dispatch_highpri(true);
bpf_rcu_read_unlock();
monitor_cpuperf();
- if (print_shared_dsq)
+ if (print_dsqs_and_events) {
+ struct scx_event_stats events;
+
dump_shared_dsq();
- __COMPAT_scx_bpf_events(&events, sizeof(events));
-
- bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK",
- scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK));
- bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE",
- scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE));
- bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST",
- scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST));
- bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING",
- scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING));
- bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL",
- scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL));
- bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION",
- scx_read_event(&events, SCX_EV_BYPASS_DURATION));
- bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH",
- scx_read_event(&events, SCX_EV_BYPASS_DISPATCH));
- bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE",
- scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE));
+ __COMPAT_scx_bpf_events(&events, sizeof(events));
+
+ bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK",
+ scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK));
+ bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE",
+ scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE));
+ bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST",
+ scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST));
+ bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING",
+ scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING));
+ bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL",
+ scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL));
+ bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION",
+ scx_read_event(&events, SCX_EV_BYPASS_DURATION));
+ bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH",
+ scx_read_event(&events, SCX_EV_BYPASS_DISPATCH));
+ bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE",
+ scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE));
+ }
bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
return 0;
@@ -823,7 +848,8 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
struct bpf_timer *timer;
s32 ret;
- print_cpus();
+ if (print_msgs)
+ print_cpus();
ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
if (ret)
diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
index c4912ab2e76f..ef701d45ba43 100644
--- a/tools/sched_ext/scx_qmap.c
+++ b/tools/sched_ext/scx_qmap.c
@@ -20,7 +20,7 @@ const char help_fmt[] =
"See the top-level comment in .bpf.c for more details.\n"
"\n"
"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n"
-" [-P] [-d PID] [-D LEN] [-p] [-v]\n"
+" [-P] [-M] [-d PID] [-D LEN] [-p] [-v]\n"
"\n"
" -s SLICE_US Override slice duration\n"
" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n"
@@ -28,7 +28,8 @@ const char help_fmt[] =
" -T COUNT Stall every COUNT'th kernel thread\n"
" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n"
"  -b COUNT      Dispatch up to COUNT tasks together\n"
-" -P Print out DSQ content to trace_pipe every second, use with -b\n"
+" -P Print out DSQ content and event counters to trace_pipe every second\n"
+" -M Print out debug messages to trace_pipe\n"
" -H Boost nice -20 tasks in SHARED_DSQ, use with -b\n"
" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n"
" -D LEN Set scx_exit_info.dump buffer length\n"
@@ -66,7 +67,7 @@ int main(int argc, char **argv)
skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL");
- while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PHd:D:Spvh")) != -1) {
+ while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PMHd:D:Spvh")) != -1) {
switch (opt) {
case 's':
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
@@ -87,7 +88,10 @@ int main(int argc, char **argv)
skel->rodata->dsp_batch = strtoul(optarg, NULL, 0);
break;
case 'P':
- skel->rodata->print_shared_dsq = true;
+ skel->rodata->print_dsqs_and_events = true;
+ break;
+ case 'M':
+ skel->rodata->print_msgs = true;
break;
case 'H':
skel->rodata->highpri_boosting = true;
diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c
index 76d83199545c..06d4b13bf76b 100644
--- a/tools/sched_ext/scx_simple.c
+++ b/tools/sched_ext/scx_simple.c
@@ -7,6 +7,7 @@
#include <stdio.h>
#include <unistd.h>
#include <signal.h>
+#include <assert.h>
#include <libgen.h>
#include <bpf/bpf.h>
#include <scx/common.h>
@@ -41,6 +42,7 @@ static void sigint_handler(int simple)
static void read_stats(struct scx_simple *skel, __u64 *stats)
{
int nr_cpus = libbpf_num_possible_cpus();
+ assert(nr_cpus > 0);
__u64 cnts[2][nr_cpus];
__u32 idx;