22 files changed, 1992 insertions, 1399 deletions
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 7047101dbf58..d82b7a9b0658 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -108,7 +108,11 @@ enum scx_kf_mask { SCX_KF_UNLOCKED = 0, /* sleepable and not rq locked */ /* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */ SCX_KF_CPU_RELEASE = 1 << 0, /* ops.cpu_release() */ - /* ops.dequeue (in REST) may be nested inside DISPATCH */ + /* + * ops.dispatch() may release rq lock temporarily and thus ENQUEUE and + * SELECT_CPU may be nested inside. ops.dequeue (in REST) may also be + * nested inside DISPATCH. + */ SCX_KF_DISPATCH = 1 << 1, /* ops.dispatch() */ SCX_KF_ENQUEUE = 1 << 2, /* ops.enqueue() and ops.select_cpu() */ SCX_KF_SELECT_CPU = 1 << 3, /* ops.select_cpu() */ diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index c4a488e67aa7..755883faf751 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -58,6 +58,7 @@ #include "deadline.c" #ifdef CONFIG_SCHED_CLASS_EXT +# include "ext_internal.h" # include "ext.c" # include "ext_idle.c" #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ec33b0353027..523a97f28bfc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9362,8 +9362,6 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) cgroup_taskset_for_each(task, css, tset) sched_move_task(task, false); - - scx_cgroup_finish_attach(); } static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 088ceff38c8a..2b0e88206d07 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -9,1040 +9,6 @@ #include <linux/btf_ids.h> #include "ext_idle.h" -#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) - -enum scx_consts { - SCX_DSP_DFL_MAX_BATCH = 32, - SCX_DSP_MAX_LOOPS = 32, - SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, - - SCX_EXIT_BT_LEN = 64, - SCX_EXIT_MSG_LEN = 1024, - SCX_EXIT_DUMP_DFL_LEN = 32768, - - SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, - - /* - * Iterating all tasks may take a while. Periodically drop - * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. - */ - SCX_TASK_ITER_BATCH = 32, -}; - -enum scx_exit_kind { - SCX_EXIT_NONE, - SCX_EXIT_DONE, - - SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */ - SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */ - SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */ - SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ - - SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ - SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ - SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ -}; - -/* - * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(), - * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes - * are 64bit of the format: - * - * Bits: [63 .. 48 47 .. 32 31 .. 0] - * [ SYS ACT ] [ SYS RSN ] [ USR ] - * - * SYS ACT: System-defined exit actions - * SYS RSN: System-defined exit reasons - * USR : User-defined exit codes and reasons - * - * Using the above, users may communicate intention and context by ORing system - * actions and/or system reasons with a user-defined exit code. - */ -enum scx_exit_code { - /* Reasons */ - SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, - - /* Actions */ - SCX_ECODE_ACT_RESTART = 1LLU << 48, -}; - -/* - * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is - * being disabled. 
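For illustration, a scheduler that wants its loader to restart it after a hotplug event could OR the two system namespaces with a user-defined code when exiting. A minimal sketch, assuming the scx_bpf_exit() convenience macro from the sched_ext BPF tooling; the user code 0x1 and the function name are hypothetical:

	static void sketch_request_restart(s32 cpu)
	{
		/* system action | system reason | user-defined code */
		scx_bpf_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG | 0x1,
			     "CPU %d went offline, requesting restart", cpu);
	}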
- */ -struct scx_exit_info { - /* %SCX_EXIT_* - broad category of the exit reason */ - enum scx_exit_kind kind; - - /* exit code if gracefully exiting */ - s64 exit_code; - - /* textual representation of the above */ - const char *reason; - - /* backtrace if exiting due to an error */ - unsigned long *bt; - u32 bt_len; - - /* informational message */ - char *msg; - - /* debug dump */ - char *dump; -}; - -/* sched_ext_ops.flags */ -enum scx_ops_flags { - /* - * Keep built-in idle tracking even if ops.update_idle() is implemented. - */ - SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, - - /* - * By default, if there are no other task to run on the CPU, ext core - * keeps running the current task even after its slice expires. If this - * flag is specified, such tasks are passed to ops.enqueue() with - * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. - */ - SCX_OPS_ENQ_LAST = 1LLU << 1, - - /* - * An exiting task may schedule after PF_EXITING is set. In such cases, - * bpf_task_from_pid() may not be able to find the task and if the BPF - * scheduler depends on pid lookup for dispatching, the task will be - * lost leading to various issues including RCU grace period stalls. - * - * To mask this problem, by default, unhashed tasks are automatically - * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't - * depend on pid lookups and wants to handle these tasks directly, the - * following flag can be used. - */ - SCX_OPS_ENQ_EXITING = 1LLU << 2, - - /* - * If set, only tasks with policy set to SCHED_EXT are attached to - * sched_ext. If clear, SCHED_NORMAL tasks are also included. - */ - SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, - - /* - * A migration disabled task can only execute on its current CPU. By - * default, such tasks are automatically put on the CPU's local DSQ with - * the default slice on enqueue. If this ops flag is set, they also go - * through ops.enqueue(). - * - * A migration disabled task never invokes ops.select_cpu() as it can - * only select the current CPU. Also, p->cpus_ptr will only contain its - * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr - * and thus may disagree with cpumask_weight(p->cpus_ptr). - */ - SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, - - /* - * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes - * ops.enqueue() on the ops.select_cpu() selected or the wakee's - * previous CPU via IPI (inter-processor interrupt) to reduce cacheline - * transfers. When this optimization is enabled, ops.select_cpu() is - * skipped in some cases (when racing against the wakee switching out). - * As the BPF scheduler may depend on ops.select_cpu() being invoked - * during wakeups, queued wakeup is disabled by default. - * - * If this ops flag is set, queued wakeup optimization is enabled and - * the BPF scheduler must be able to handle ops.enqueue() invoked on the - * wakee's CPU without preceding ops.select_cpu() even for tasks which - * may be executed on multiple CPUs. - */ - SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5, - - /* - * If set, enable per-node idle cpumasks. If clear, use a single global - * flat idle cpumask. 
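On the BPF side, these ops flags are declared up front in the struct_ops definition. A minimal sketch, assuming the SCX_OPS_DEFINE() helper from tools/sched_ext; the sketch_* callbacks are hypothetical:

	SCX_OPS_DEFINE(sketch_ops,
		       .select_cpu	= (void *)sketch_select_cpu,
		       .enqueue		= (void *)sketch_enqueue,
		       .dispatch	= (void *)sketch_dispatch,
		       /* opt in: exiting and last-runnable tasks reach ops.enqueue() */
		       .flags		= SCX_OPS_ENQ_EXITING | SCX_OPS_ENQ_LAST,
		       .name		= "sketch");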
- */ - SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6, - - /* - * CPU cgroup support flags - */ - SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */ - - SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | - SCX_OPS_ENQ_LAST | - SCX_OPS_ENQ_EXITING | - SCX_OPS_ENQ_MIGRATION_DISABLED | - SCX_OPS_ALLOW_QUEUED_WAKEUP | - SCX_OPS_SWITCH_PARTIAL | - SCX_OPS_BUILTIN_IDLE_PER_NODE | - SCX_OPS_HAS_CGROUP_WEIGHT, - - /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */ - __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56, - - SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56, -}; - -/* argument container for ops.init_task() */ -struct scx_init_task_args { - /* - * Set if ops.init_task() is being invoked on the fork path, as opposed - * to the scheduler transition path. - */ - bool fork; -#ifdef CONFIG_EXT_GROUP_SCHED - /* the cgroup the task is joining */ - struct cgroup *cgroup; -#endif -}; - -/* argument container for ops.exit_task() */ -struct scx_exit_task_args { - /* Whether the task exited before running on sched_ext. */ - bool cancelled; -}; - -/* argument container for ops->cgroup_init() */ -struct scx_cgroup_init_args { - /* the weight of the cgroup [1..10000] */ - u32 weight; - - /* bandwidth control parameters from cpu.max and cpu.max.burst */ - u64 bw_period_us; - u64 bw_quota_us; - u64 bw_burst_us; -}; - -enum scx_cpu_preempt_reason { - /* next task is being scheduled by &sched_class_rt */ - SCX_CPU_PREEMPT_RT, - /* next task is being scheduled by &sched_class_dl */ - SCX_CPU_PREEMPT_DL, - /* next task is being scheduled by &sched_class_stop */ - SCX_CPU_PREEMPT_STOP, - /* unknown reason for SCX being preempted */ - SCX_CPU_PREEMPT_UNKNOWN, -}; - -/* - * Argument container for ops->cpu_acquire(). Currently empty, but may be - * expanded in the future. - */ -struct scx_cpu_acquire_args {}; - -/* argument container for ops->cpu_release() */ -struct scx_cpu_release_args { - /* the reason the CPU was preempted */ - enum scx_cpu_preempt_reason reason; - - /* the task that's going to be scheduled on the CPU */ - struct task_struct *task; -}; - -/* - * Informational context provided to dump operations. - */ -struct scx_dump_ctx { - enum scx_exit_kind kind; - s64 exit_code; - const char *reason; - u64 at_ns; - u64 at_jiffies; -}; - -/** - * struct sched_ext_ops - Operation table for BPF scheduler implementation - * - * A BPF scheduler can implement an arbitrary scheduling policy by - * implementing and loading operations in this table. Note that a userland - * scheduling policy can also be implemented using the BPF scheduler - * as a shim layer. - */ -struct sched_ext_ops { - /** - * @select_cpu: Pick the target CPU for a task which is being woken up - * @p: task being woken up - * @prev_cpu: the cpu @p was on before sleeping - * @wake_flags: SCX_WAKE_* - * - * Decision made here isn't final. @p may be moved to any CPU while it - * is getting dispatched for execution later. However, as @p is not on - * the rq at this point, getting the eventual execution CPU right here - * saves a small bit of overhead down the line. - * - * If an idle CPU is returned, the CPU is kicked and will try to - * dispatch. While an explicit custom mechanism can be added, - * select_cpu() serves as the default way to wake up idle CPUs. - * - * @p may be inserted into a DSQ directly by calling - * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped. - * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ - * of the CPU returned by this operation. 
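The dsq_id argument of scx_bpf_dsq_insert() selects between the DSQ namespaces mentioned above. A sketch with an arbitrary helper name, assuming the caller holds a valid task pointer:

	static void sketch_send_to(struct task_struct *p, s32 cpu)
	{
		if (cpu >= 0)
			/* a specific CPU's local DSQ */
			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu,
					   SCX_SLICE_DFL, 0);
		else
			/* the global DSQ (split per NUMA node internally) */
			scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
	}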
- * - * Note that select_cpu() is never called for tasks that can only run - * on a single CPU or tasks with migration disabled, as they don't have - * the option to select a different CPU. See select_task_rq() for - * details. - */ - s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); - - /** - * @enqueue: Enqueue a task on the BPF scheduler - * @p: task being enqueued - * @enq_flags: %SCX_ENQ_* - * - * @p is ready to run. Insert directly into a DSQ by calling - * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly - * inserted, the bpf scheduler owns @p and if it fails to dispatch @p, - * the task will stall. - * - * If @p was inserted into a DSQ from ops.select_cpu(), this callback is - * skipped. - */ - void (*enqueue)(struct task_struct *p, u64 enq_flags); - - /** - * @dequeue: Remove a task from the BPF scheduler - * @p: task being dequeued - * @deq_flags: %SCX_DEQ_* - * - * Remove @p from the BPF scheduler. This is usually called to isolate - * the task while updating its scheduling properties (e.g. priority). - * - * The ext core keeps track of whether the BPF side owns a given task or - * not and can gracefully ignore spurious dispatches from BPF side, - * which makes it safe to not implement this method. However, depending - * on the scheduling logic, this can lead to confusing behaviors - e.g. - * scheduling position not being updated across a priority change. - */ - void (*dequeue)(struct task_struct *p, u64 deq_flags); - - /** - * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs - * @cpu: CPU to dispatch tasks for - * @prev: previous task being switched out - * - * Called when a CPU's local dsq is empty. The operation should dispatch - * one or more tasks from the BPF scheduler into the DSQs using - * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ - * using scx_bpf_dsq_move_to_local(). - * - * The maximum number of times scx_bpf_dsq_insert() can be called - * without an intervening scx_bpf_dsq_move_to_local() is specified by - * ops.dispatch_max_batch. See the comments on top of the two functions - * for more details. - * - * When not %NULL, @prev is an SCX task with its slice depleted. If - * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in - * @prev->scx.flags, it is not enqueued yet and will be enqueued after - * ops.dispatch() returns. To keep executing @prev, return without - * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST. - */ - void (*dispatch)(s32 cpu, struct task_struct *prev); - - /** - * @tick: Periodic tick - * @p: task running currently - * - * This operation is called every 1/HZ seconds on CPUs which are - * executing an SCX task. Setting @p->scx.slice to 0 will trigger an - * immediate dispatch cycle on the CPU. - */ - void (*tick)(struct task_struct *p); - - /** - * @runnable: A task is becoming runnable on its associated CPU - * @p: task becoming runnable - * @enq_flags: %SCX_ENQ_* - * - * This and the following three functions can be used to track a task's - * execution state transitions. A task becomes ->runnable() on a CPU, - * and then goes through one or more ->running() and ->stopping() pairs - * as it runs on the CPU, and eventually becomes ->quiescent() when it's - * done running on the CPU. - * - * @p is becoming runnable on the CPU because it's - * - * - waking up (%SCX_ENQ_WAKEUP) - * - being moved from another CPU - * - being restored after temporarily taken off the queue for an - * attribute change. 
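The ops.select_cpu()/direct-dispatch interplay documented above is what the in-tree example schedulers use for the common wakeup path. A minimal sketch (hypothetical sketch_ prefix):

	s32 BPF_STRUCT_OPS(sketch_select_cpu, struct task_struct *p,
			   s32 prev_cpu, u64 wake_flags)
	{
		bool is_idle = false;
		s32 cpu;

		cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
		if (is_idle)
			/* idle CPU found: insert directly, ops.enqueue() is skipped */
			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);

		return cpu;
	}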
- * - * This and ->enqueue() are related but not coupled. This operation - * notifies @p's state transition and may not be followed by ->enqueue() - * e.g. when @p is being dispatched to a remote CPU, or when @p is - * being enqueued on a CPU experiencing a hotplug event. Likewise, a - * task may be ->enqueue()'d without being preceded by this operation - * e.g. after exhausting its slice. - */ - void (*runnable)(struct task_struct *p, u64 enq_flags); - - /** - * @running: A task is starting to run on its associated CPU - * @p: task starting to run - * - * Note that this callback may be called from a CPU other than the - * one the task is going to run on. This can happen when a task - * property is changed (i.e., affinity), since scx_next_task_scx(), - * which triggers this callback, may run on a CPU different from - * the task's assigned CPU. - * - * Therefore, always use scx_bpf_task_cpu(@p) to determine the - * target CPU the task is going to use. - * - * See ->runnable() for explanation on the task state notifiers. - */ - void (*running)(struct task_struct *p); - - /** - * @stopping: A task is stopping execution - * @p: task stopping to run - * @runnable: is task @p still runnable? - * - * Note that this callback may be called from a CPU other than the - * one the task was running on. This can happen when a task - * property is changed (i.e., affinity), since dequeue_task_scx(), - * which triggers this callback, may run on a CPU different from - * the task's assigned CPU. - * - * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU - * the task was running on. - * - * See ->runnable() for explanation on the task state notifiers. If - * !@runnable, ->quiescent() will be invoked after this operation - * returns. - */ - void (*stopping)(struct task_struct *p, bool runnable); - - /** - * @quiescent: A task is becoming not runnable on its associated CPU - * @p: task becoming not runnable - * @deq_flags: %SCX_DEQ_* - * - * See ->runnable() for explanation on the task state notifiers. - * - * @p is becoming quiescent on the CPU because it's - * - * - sleeping (%SCX_DEQ_SLEEP) - * - being moved to another CPU - * - being temporarily taken off the queue for an attribute change - * (%SCX_DEQ_SAVE) - * - * This and ->dequeue() are related but not coupled. This operation - * notifies @p's state transition and may not be preceded by ->dequeue() - * e.g. when @p is being dispatched to a remote CPU. - */ - void (*quiescent)(struct task_struct *p, u64 deq_flags); - - /** - * @yield: Yield CPU - * @from: yielding task - * @to: optional yield target task - * - * If @to is NULL, @from is yielding the CPU to other runnable tasks. - * The BPF scheduler should ensure that other available tasks are - * dispatched before the yielding task. Return value is ignored in this - * case. - * - * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf - * scheduler can implement the request, return %true; otherwise, %false. - */ - bool (*yield)(struct task_struct *from, struct task_struct *to); - - /** - * @core_sched_before: Task ordering for core-sched - * @a: task A - * @b: task B - * - * Used by core-sched to determine the ordering between two tasks. See - * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on - * core-sched. - * - * Both @a and @b are runnable and may or may not currently be queued on - * the BPF scheduler. Should return %true if @a should run before @b. - * %false if there's no required ordering or @b should run before @a. 
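The runnable/running/stopping/quiescent notifiers described above are commonly used for vtime accounting. A sketch in the style of the in-tree examples, charging the consumed slice scaled inversely by weight (100 is the default weight):

	void BPF_STRUCT_OPS(sketch_stopping, struct task_struct *p, bool runnable)
	{
		/* charge only the portion of the slice actually consumed */
		p->scx.dsq_vtime +=
			(SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight;
	}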
- * - * If not specified, the default is ordering them according to when they - * became runnable. - */ - bool (*core_sched_before)(struct task_struct *a, struct task_struct *b); - - /** - * @set_weight: Set task weight - * @p: task to set weight for - * @weight: new weight [1..10000] - * - * Update @p's weight to @weight. - */ - void (*set_weight)(struct task_struct *p, u32 weight); - - /** - * @set_cpumask: Set CPU affinity - * @p: task to set CPU affinity for - * @cpumask: cpumask of cpus that @p can run on - * - * Update @p's CPU affinity to @cpumask. - */ - void (*set_cpumask)(struct task_struct *p, - const struct cpumask *cpumask); - - /** - * @update_idle: Update the idle state of a CPU - * @cpu: CPU to update the idle state for - * @idle: whether entering or exiting the idle state - * - * This operation is called when @rq's CPU goes or leaves the idle - * state. By default, implementing this operation disables the built-in - * idle CPU tracking and the following helpers become unavailable: - * - * - scx_bpf_select_cpu_dfl() - * - scx_bpf_select_cpu_and() - * - scx_bpf_test_and_clear_cpu_idle() - * - scx_bpf_pick_idle_cpu() - * - * The user also must implement ops.select_cpu() as the default - * implementation relies on scx_bpf_select_cpu_dfl(). - * - * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle - * tracking. - */ - void (*update_idle)(s32 cpu, bool idle); - - /** - * @cpu_acquire: A CPU is becoming available to the BPF scheduler - * @cpu: The CPU being acquired by the BPF scheduler. - * @args: Acquire arguments, see the struct definition. - * - * A CPU that was previously released from the BPF scheduler is now once - * again under its control. - */ - void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); - - /** - * @cpu_release: A CPU is taken away from the BPF scheduler - * @cpu: The CPU being released by the BPF scheduler. - * @args: Release arguments, see the struct definition. - * - * The specified CPU is no longer under the control of the BPF - * scheduler. This could be because it was preempted by a higher - * priority sched_class, though there may be other reasons as well. The - * caller should consult @args->reason to determine the cause. - */ - void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); - - /** - * @init_task: Initialize a task to run in a BPF scheduler - * @p: task to initialize for BPF scheduling - * @args: init arguments, see the struct definition - * - * Either we're loading a BPF scheduler or a new task is being forked. - * Initialize @p for BPF scheduling. This operation may block and can - * be used for allocations, and is called exactly once for a task. - * - * Return 0 for success, -errno for failure. An error return while - * loading will abort loading of the BPF scheduler. During a fork, it - * will abort that specific fork. - */ - s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args); - - /** - * @exit_task: Exit a previously-running task from the system - * @p: task to exit - * @args: exit arguments, see the struct definition - * - * @p is exiting or the BPF scheduler is being unloaded. Perform any - * necessary cleanup for @p. - */ - void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args); - - /** - * @enable: Enable BPF scheduling for a task - * @p: task to enable BPF scheduling for - * - * Enable @p for BPF scheduling. enable() is called on @p any time it - * enters SCX, and is always paired with a matching disable(). 
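Because ops.init_task() may block and runs exactly once per task, it is the natural place for per-task setup. A minimal sketch (hypothetical name):

	s32 BPF_STRUCT_OPS_SLEEPABLE(sketch_init_task, struct task_struct *p,
				     struct scx_init_task_args *args)
	{
		/*
		 * args->fork distinguishes the fork path from the scheduler
		 * load path. A negative return aborts the fork or the load
		 * respectively.
		 */
		p->scx.dsq_vtime = 0;
		return 0;
	}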
- */ - void (*enable)(struct task_struct *p); - - /** - * @disable: Disable BPF scheduling for a task - * @p: task to disable BPF scheduling for - * - * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. - * Disable BPF scheduling for @p. A disable() call is always matched - * with a prior enable() call. - */ - void (*disable)(struct task_struct *p); - - /** - * @dump: Dump BPF scheduler state on error - * @ctx: debug dump context - * - * Use scx_bpf_dump() to generate BPF scheduler specific debug dump. - */ - void (*dump)(struct scx_dump_ctx *ctx); - - /** - * @dump_cpu: Dump BPF scheduler state for a CPU on error - * @ctx: debug dump context - * @cpu: CPU to generate debug dump for - * @idle: @cpu is currently idle without any runnable tasks - * - * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for - * @cpu. If @idle is %true and this operation doesn't produce any - * output, @cpu is skipped for dump. - */ - void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle); - - /** - * @dump_task: Dump BPF scheduler state for a runnable task on error - * @ctx: debug dump context - * @p: runnable task to generate debug dump for - * - * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for - * @p. - */ - void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); - -#ifdef CONFIG_EXT_GROUP_SCHED - /** - * @cgroup_init: Initialize a cgroup - * @cgrp: cgroup being initialized - * @args: init arguments, see the struct definition - * - * Either the BPF scheduler is being loaded or @cgrp created, initialize - * @cgrp for sched_ext. This operation may block. - * - * Return 0 for success, -errno for failure. An error return while - * loading will abort loading of the BPF scheduler. During cgroup - * creation, it will abort the specific cgroup creation. - */ - s32 (*cgroup_init)(struct cgroup *cgrp, - struct scx_cgroup_init_args *args); - - /** - * @cgroup_exit: Exit a cgroup - * @cgrp: cgroup being exited - * - * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit - * @cgrp for sched_ext. This operation my block. - */ - void (*cgroup_exit)(struct cgroup *cgrp); - - /** - * @cgroup_prep_move: Prepare a task to be moved to a different cgroup - * @p: task being moved - * @from: cgroup @p is being moved from - * @to: cgroup @p is being moved to - * - * Prepare @p for move from cgroup @from to @to. This operation may - * block and can be used for allocations. - * - * Return 0 for success, -errno for failure. An error return aborts the - * migration. - */ - s32 (*cgroup_prep_move)(struct task_struct *p, - struct cgroup *from, struct cgroup *to); - - /** - * @cgroup_move: Commit cgroup move - * @p: task being moved - * @from: cgroup @p is being moved from - * @to: cgroup @p is being moved to - * - * Commit the move. @p is dequeued during this operation. - */ - void (*cgroup_move)(struct task_struct *p, - struct cgroup *from, struct cgroup *to); - - /** - * @cgroup_cancel_move: Cancel cgroup move - * @p: task whose cgroup move is being canceled - * @from: cgroup @p was being moved from - * @to: cgroup @p was being moved to - * - * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). - * Undo the preparation. - */ - void (*cgroup_cancel_move)(struct task_struct *p, - struct cgroup *from, struct cgroup *to); - - /** - * @cgroup_set_weight: A cgroup's weight is being changed - * @cgrp: cgroup whose weight is being updated - * @weight: new weight [1..10000] - * - * Update @cgrp's weight to @weight. 
- */ - void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); - - /** - * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed - * @cgrp: cgroup whose bandwidth is being updated - * @period_us: bandwidth control period - * @quota_us: bandwidth control quota - * @burst_us: bandwidth control burst - * - * Update @cgrp's bandwidth control parameters. This is from the cpu.max - * cgroup interface. - * - * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled - * to. For example, if @period_us is 1_000_000 and @quota_us is - * 2_500_000. @cgrp is entitled to 2.5 CPUs. @burst_us can be - * interpreted in the same fashion and specifies how much @cgrp can - * burst temporarily. The specific control mechanism and thus the - * interpretation of @period_us and burstiness is upto to the BPF - * scheduler. - */ - void (*cgroup_set_bandwidth)(struct cgroup *cgrp, - u64 period_us, u64 quota_us, u64 burst_us); - -#endif /* CONFIG_EXT_GROUP_SCHED */ - - /* - * All online ops must come before ops.cpu_online(). - */ - - /** - * @cpu_online: A CPU became online - * @cpu: CPU which just came up - * - * @cpu just came online. @cpu will not call ops.enqueue() or - * ops.dispatch(), nor run tasks associated with other CPUs beforehand. - */ - void (*cpu_online)(s32 cpu); - - /** - * @cpu_offline: A CPU is going offline - * @cpu: CPU which is going offline - * - * @cpu is going offline. @cpu will not call ops.enqueue() or - * ops.dispatch(), nor run tasks associated with other CPUs afterwards. - */ - void (*cpu_offline)(s32 cpu); - - /* - * All CPU hotplug ops must come before ops.init(). - */ - - /** - * @init: Initialize the BPF scheduler - */ - s32 (*init)(void); - - /** - * @exit: Clean up after the BPF scheduler - * @info: Exit info - * - * ops.exit() is also called on ops.init() failure, which is a bit - * unusual. This is to allow rich reporting through @info on how - * ops.init() failed. - */ - void (*exit)(struct scx_exit_info *info); - - /** - * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch - */ - u32 dispatch_max_batch; - - /** - * @flags: %SCX_OPS_* flags - */ - u64 flags; - - /** - * @timeout_ms: The maximum amount of time, in milliseconds, that a - * runnable task should be able to wait before being scheduled. The - * maximum timeout may not exceed the default timeout of 30 seconds. - * - * Defaults to the maximum allowed timeout value of 30 seconds. - */ - u32 timeout_ms; - - /** - * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default - * value of 32768 is used. - */ - u32 exit_dump_len; - - /** - * @hotplug_seq: A sequence number that may be set by the scheduler to - * detect when a hotplug event has occurred during the loading process. - * If 0, no detection occurs. Otherwise, the scheduler will fail to - * load if the sequence number does not match @scx_hotplug_seq on the - * enable path. - */ - u64 hotplug_seq; - - /** - * @name: BPF scheduler's name - * - * Must be a non-zero valid BPF object name including only isalnum(), - * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the - * BPF scheduler is enabled. - */ - char name[SCX_OPS_NAME_LEN]; - - /* internal use only, must be NULL */ - void *priv; -}; - -enum scx_opi { - SCX_OPI_BEGIN = 0, - SCX_OPI_NORMAL_BEGIN = 0, - SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), - SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), - SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), - SCX_OPI_END = SCX_OP_IDX(init), -}; - -/* - * Collection of event counters. 
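The @quota_us / @period_us relationship is easiest to see in fixed point. A sketch of a cgroup_set_bandwidth() implementation that merely logs the entitlement (hypothetical name; cgrp->kn->id assumed usable as a stable identifier):

	void BPF_STRUCT_OPS(sketch_cgroup_set_bandwidth, struct cgroup *cgrp,
			    u64 period_us, u64 quota_us, u64 burst_us)
	{
		/* e.g. 2,500,000us quota per 1,000,000us period -> 2.50 CPUs */
		u64 cpus_x100 = quota_us * 100 / period_us;

		bpf_printk("cgroup %llu entitled to %llu.%02llu CPUs",
			   cgrp->kn->id, cpus_x100 / 100, cpus_x100 % 100);
	}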
Event types are placed in descending order. - */ -struct scx_event_stats { - /* - * If ops.select_cpu() returns a CPU which can't be used by the task, - * the core scheduler code silently picks a fallback CPU. - */ - s64 SCX_EV_SELECT_CPU_FALLBACK; - - /* - * When dispatching to a local DSQ, the CPU may have gone offline in - * the meantime. In this case, the task is bounced to the global DSQ. - */ - s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE; - - /* - * If SCX_OPS_ENQ_LAST is not set, the number of times that a task - * continued to run because there were no other tasks on the CPU. - */ - s64 SCX_EV_DISPATCH_KEEP_LAST; - - /* - * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task - * is dispatched to a local DSQ when exiting. - */ - s64 SCX_EV_ENQ_SKIP_EXITING; - - /* - * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a - * migration disabled task skips ops.enqueue() and is dispatched to its - * local DSQ. - */ - s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED; - - /* - * Total number of times a task's time slice was refilled with the - * default value (SCX_SLICE_DFL). - */ - s64 SCX_EV_REFILL_SLICE_DFL; - - /* - * The total duration of bypass modes in nanoseconds. - */ - s64 SCX_EV_BYPASS_DURATION; - - /* - * The number of tasks dispatched in the bypassing mode. - */ - s64 SCX_EV_BYPASS_DISPATCH; - - /* - * The number of times the bypassing mode has been activated. - */ - s64 SCX_EV_BYPASS_ACTIVATE; -}; - -struct scx_sched { - struct sched_ext_ops ops; - DECLARE_BITMAP(has_op, SCX_OPI_END); - - /* - * Dispatch queues. - * - * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. - * This is to avoid live-locking in bypass mode where all tasks are - * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If - * per-node split isn't sufficient, it can be further split. - */ - struct rhashtable dsq_hash; - struct scx_dispatch_q **global_dsqs; - - /* - * The event counters are in a per-CPU variable to minimize the - * accounting overhead. A system-wide view on the event counter is - * constructed when requested by scx_bpf_events(). - */ - struct scx_event_stats __percpu *event_stats_cpu; - - bool warned_zero_slice; - - atomic_t exit_kind; - struct scx_exit_info *exit_info; - - struct kobject kobj; - - struct kthread_worker *helper; - struct irq_work error_irq_work; - struct kthread_work disable_work; - struct rcu_work rcu_work; -}; - -enum scx_wake_flags { - /* expose select WF_* flags as enums */ - SCX_WAKE_FORK = WF_FORK, - SCX_WAKE_TTWU = WF_TTWU, - SCX_WAKE_SYNC = WF_SYNC, -}; - -enum scx_enq_flags { - /* expose select ENQUEUE_* flags as enums */ - SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, - SCX_ENQ_HEAD = ENQUEUE_HEAD, - SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED, - - /* high 32bits are SCX specific */ - - /* - * Set the following to trigger preemption when calling - * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the - * current task is cleared to zero and the CPU is kicked into the - * scheduling path. Implies %SCX_ENQ_HEAD. - */ - SCX_ENQ_PREEMPT = 1LLU << 32, - - /* - * The task being enqueued was previously enqueued on the current CPU's - * %SCX_DSQ_LOCAL, but was removed from it in a call to the - * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was - * invoked in a ->cpu_release() callback, and the task is again - * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the - * task will not be scheduled on the CPU until at least the next invocation - * of the ->cpu_acquire() callback. 
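%SCX_ENQ_REENQ pairs with the scx_bpf_reenqueue_local() kfunc; the canonical caller is ops.cpu_release() when a higher-priority class takes the CPU. A sketch (hypothetical name):

	void BPF_STRUCT_OPS(sketch_cpu_release, s32 cpu,
			    struct scx_cpu_release_args *args)
	{
		/*
		 * Tasks left on this CPU's local DSQ would wait behind the
		 * preempting class; pull them back so they re-enter
		 * ops.enqueue() with %SCX_ENQ_REENQ set and can go elsewhere.
		 */
		scx_bpf_reenqueue_local();
	}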
- */ - SCX_ENQ_REENQ = 1LLU << 40, - - /* - * The task being enqueued is the only task available for the cpu. By - * default, ext core keeps executing such tasks but when - * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the - * %SCX_ENQ_LAST flag set. - * - * The BPF scheduler is responsible for triggering a follow-up - * scheduling event. Otherwise, Execution may stall. - */ - SCX_ENQ_LAST = 1LLU << 41, - - /* high 8 bits are internal */ - __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, - - SCX_ENQ_CLEAR_OPSS = 1LLU << 56, - SCX_ENQ_DSQ_PRIQ = 1LLU << 57, -}; - -enum scx_deq_flags { - /* expose select DEQUEUE_* flags as enums */ - SCX_DEQ_SLEEP = DEQUEUE_SLEEP, - - /* high 32bits are SCX specific */ - - /* - * The generic core-sched layer decided to execute the task even though - * it hasn't been dispatched yet. Dequeue from the BPF side. - */ - SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, -}; - -enum scx_pick_idle_cpu_flags { - SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ - SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */ -}; - -enum scx_kick_flags { - /* - * Kick the target CPU if idle. Guarantees that the target CPU goes - * through at least one full scheduling cycle before going idle. If the - * target CPU can be determined to be currently not idle and going to go - * through a scheduling cycle before going idle, noop. - */ - SCX_KICK_IDLE = 1LLU << 0, - - /* - * Preempt the current task and execute the dispatch path. If the - * current task of the target CPU is an SCX task, its ->scx.slice is - * cleared to zero before the scheduling path is invoked so that the - * task expires and the dispatch path is invoked. - */ - SCX_KICK_PREEMPT = 1LLU << 1, - - /* - * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will - * return after the target CPU finishes picking the next task. - */ - SCX_KICK_WAIT = 1LLU << 2, -}; - -enum scx_tg_flags { - SCX_TG_ONLINE = 1U << 0, - SCX_TG_INITED = 1U << 1, -}; - -enum scx_enable_state { - SCX_ENABLING, - SCX_ENABLED, - SCX_DISABLING, - SCX_DISABLED, -}; - -static const char *scx_enable_state_str[] = { - [SCX_ENABLING] = "enabling", - [SCX_ENABLED] = "enabled", - [SCX_DISABLING] = "disabling", - [SCX_DISABLED] = "disabled", -}; - -/* - * sched_ext_entity->ops_state - * - * Used to track the task ownership between the SCX core and the BPF scheduler. - * State transitions look as follows: - * - * NONE -> QUEUEING -> QUEUED -> DISPATCHING - * ^ | | - * | v v - * \-------------------------------/ - * - * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call - * sites for explanations on the conditions being waited upon and why they are - * safe. Transitions out of them into NONE or QUEUED must store_release and the - * waiters should load_acquire. - * - * Tracking scx_ops_state enables sched_ext core to reliably determine whether - * any given task can be dispatched by the BPF scheduler at all times and thus - * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler - * to try to dispatch any task anytime regardless of its state as the SCX core - * can safely reject invalid dispatches. 
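From the BPF side, the kick flags above map directly onto scx_bpf_kick_cpu() calls. A sketch of the two common modes (hypothetical helper):

	static void sketch_kick(s32 cpu, bool preempt)
	{
		if (preempt)
			/* clear the running task's slice and re-dispatch now */
			scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT);
		else
			/* only guarantee one full scheduling cycle if idle */
			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
	}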
- */ -enum scx_ops_state { - SCX_OPSS_NONE, /* owned by the SCX core */ - SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */ - SCX_OPSS_QUEUED, /* owned by the BPF scheduler */ - SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */ - - /* - * QSEQ brands each QUEUED instance so that, when dispatch races - * dequeue/requeue, the dispatcher can tell whether it still has a claim - * on the task being dispatched. - * - * As some 32bit archs can't do 64bit store_release/load_acquire, - * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on - * 32bit machines. The dispatch race window QSEQ protects is very narrow - * and runs with IRQ disabled. 30 bits should be sufficient. - */ - SCX_OPSS_QSEQ_SHIFT = 2, -}; - -/* Use macros to ensure that the type is unsigned long for the masks */ -#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) -#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) - /* * NOTE: sched_ext is in the process of growing multiple scheduler support and * scx_root usage is in a transitional state. Naked dereferences are safe if the @@ -1170,7 +136,7 @@ static struct kset *scx_kset; #include <trace/events/sched_ext.h> static void process_ddsp_deferred_locals(struct rq *rq); -static void scx_bpf_kick_cpu(s32 cpu, u64 flags); +static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags); static void scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, s64 exit_code, const char *fmt, va_list args); @@ -1185,24 +151,7 @@ static __printf(4, 5) void scx_exit(struct scx_sched *sch, va_end(args); } -static __printf(3, 4) void scx_kf_exit(enum scx_exit_kind kind, s64 exit_code, - const char *fmt, ...) -{ - struct scx_sched *sch; - va_list args; - - rcu_read_lock(); - sch = rcu_dereference(scx_root); - if (sch) { - va_start(args, fmt); - scx_vexit(sch, kind, exit_code, fmt, args); - va_end(args); - } - rcu_read_unlock(); -} - #define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args) -#define scx_kf_error(fmt, args...) 
scx_kf_exit(SCX_EXIT_ERROR, 0, fmt, ##args) #define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op) @@ -1232,10 +181,9 @@ static bool u32_before(u32 a, u32 b) return (s32)(a - b) < 0; } -static struct scx_dispatch_q *find_global_dsq(struct task_struct *p) +static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch, + struct task_struct *p) { - struct scx_sched *sch = scx_root; - return sch->global_dsqs[cpu_to_node(task_cpu(p))]; } @@ -1363,11 +311,11 @@ do { \ }) /* @mask is constant, always inline to cull unnecessary branches */ -static __always_inline bool scx_kf_allowed(u32 mask) +static __always_inline bool scx_kf_allowed(struct scx_sched *sch, u32 mask) { if (unlikely(!(current->scx.kf_mask & mask))) { - scx_kf_error("kfunc with mask 0x%x called from an operation only allowing 0x%x", - mask, current->scx.kf_mask); + scx_error(sch, "kfunc with mask 0x%x called from an operation only allowing 0x%x", + mask, current->scx.kf_mask); return false; } @@ -1380,13 +328,13 @@ static __always_inline bool scx_kf_allowed(u32 mask) */ if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE && (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) { - scx_kf_error("cpu_release kfunc called from a nested operation"); + scx_error(sch, "cpu_release kfunc called from a nested operation"); return false; } if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH && (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { - scx_kf_error("dispatch kfunc called from a nested operation"); + scx_error(sch, "dispatch kfunc called from a nested operation"); return false; } @@ -1394,15 +342,16 @@ static __always_inline bool scx_kf_allowed(u32 mask) } /* see SCX_CALL_OP_TASK() */ -static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask, +static __always_inline bool scx_kf_allowed_on_arg_tasks(struct scx_sched *sch, + u32 mask, struct task_struct *p) { - if (!scx_kf_allowed(mask)) + if (!scx_kf_allowed(sch, mask)) return false; if (unlikely((p != current->scx.kf_tasks[0] && p != current->scx.kf_tasks[1]))) { - scx_kf_error("called on a task not being operated on"); + scx_error(sch, "called on a task not being operated on"); return false; } @@ -1488,10 +437,11 @@ struct bpf_iter_scx_dsq { */ struct scx_task_iter { struct sched_ext_entity cursor; - struct task_struct *locked; + struct task_struct *locked_task; struct rq *rq; struct rq_flags rf; u32 cnt; + bool list_locked; }; /** @@ -1519,15 +469,16 @@ static void scx_task_iter_start(struct scx_task_iter *iter) iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; list_add(&iter->cursor.tasks_node, &scx_tasks); - iter->locked = NULL; + iter->locked_task = NULL; iter->cnt = 0; + iter->list_locked = true; } static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter) { - if (iter->locked) { - task_rq_unlock(iter->rq, iter->locked, &iter->rf); - iter->locked = NULL; + if (iter->locked_task) { + task_rq_unlock(iter->rq, iter->locked_task, &iter->rf); + iter->locked_task = NULL; } } @@ -1537,24 +488,24 @@ static void __scx_task_iter_rq_unlock(struct scx_task_iter *iter) * * If @iter is in the middle of a locked iteration, it may be locking the rq of * the task currently being visited in addition to scx_tasks_lock. Unlock both. - * This function can be safely called anytime during an iteration. + * This function can be safely called anytime during an iteration. The next + * iterator operation will automatically restore the necessary locking. 
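With list_locked tracked inside the iterator, callers no longer need an explicit relock step. The kernel-internal usage pattern this enables, sketched:

	struct scx_task_iter sti;
	struct task_struct *p;

	scx_task_iter_start(&sti);
	while ((p = scx_task_iter_next_locked(&sti))) {
		/* @p's rq lock and scx_tasks_lock are held here */
		scx_task_iter_unlock(&sti);	/* both may be dropped anytime */
		/* ... the next iterator call restores the locking ... */
	}
	scx_task_iter_stop(&sti);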
*/ static void scx_task_iter_unlock(struct scx_task_iter *iter) { __scx_task_iter_rq_unlock(iter); - spin_unlock_irq(&scx_tasks_lock); + if (iter->list_locked) { + iter->list_locked = false; + spin_unlock_irq(&scx_tasks_lock); + } } -/** - * scx_task_iter_relock - Lock scx_tasks_lock released by scx_task_iter_unlock() - * @iter: iterator to re-lock - * - * Re-lock scx_tasks_lock unlocked by scx_task_iter_unlock(). Note that it - * doesn't re-lock the rq lock. Must be called before other iterator operations. - */ -static void scx_task_iter_relock(struct scx_task_iter *iter) +static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter) { - spin_lock_irq(&scx_tasks_lock); + if (!iter->list_locked) { + spin_lock_irq(&scx_tasks_lock); + iter->list_locked = true; + } } /** @@ -1567,6 +518,7 @@ static void scx_task_iter_relock(struct scx_task_iter *iter) */ static void scx_task_iter_stop(struct scx_task_iter *iter) { + __scx_task_iter_maybe_relock(iter); list_del_init(&iter->cursor.tasks_node); scx_task_iter_unlock(iter); } @@ -1584,10 +536,12 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) struct list_head *cursor = &iter->cursor.tasks_node; struct sched_ext_entity *pos; + __scx_task_iter_maybe_relock(iter); + if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) { scx_task_iter_unlock(iter); cond_resched(); - scx_task_iter_relock(iter); + __scx_task_iter_maybe_relock(iter); } list_for_each_entry(pos, cursor, tasks_node) { @@ -1650,7 +604,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) return NULL; iter->rq = task_rq_lock(p, &iter->rf); - iter->locked = p; + iter->locked_task = p; return p; } @@ -1664,7 +618,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) * This can be used when preemption is not disabled. */ #define scx_add_event(sch, name, cnt) do { \ - this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \ + this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \ trace_sched_ext_event(#name, (cnt)); \ } while(0) @@ -1677,7 +631,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter) * This should be used only when preemption is disabled. */ #define __scx_add_event(sch, name, cnt) do { \ - __this_cpu_add((sch)->event_stats_cpu->name, (cnt)); \ + __this_cpu_add((sch)->pcpu->event_stats.name, (cnt)); \ trace_sched_ext_event(#name, cnt); \ } while(0) @@ -1766,23 +720,6 @@ static bool ops_cpu_valid(struct scx_sched *sch, s32 cpu, const char *where) } /** - * kf_cpu_valid - Verify a CPU number, to be used on kfunc input args - * @cpu: cpu number which came from a BPF ops - * @where: extra information reported on error - * - * The same as ops_cpu_valid() but @sch is implicit. - */ -static bool kf_cpu_valid(u32 cpu, const char *where) -{ - if (__cpu_valid(cpu)) { - return true; - } else { - scx_kf_error("invalid CPU %d%s%s", cpu, where ? 
" " : "", where ?: ""); - return false; - } -} - -/** * ops_sanitize_err - Sanitize a -errno value * @sch: scx_sched to error out on error * @ops_name: operation to blame on failure @@ -1942,10 +879,10 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) WRITE_ONCE(dsq->nr, dsq->nr + delta); } -static void refill_task_slice_dfl(struct task_struct *p) +static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p) { p->scx.slice = SCX_SLICE_DFL; - __scx_add_event(scx_root, SCX_EV_REFILL_SLICE_DFL, 1); + __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1); } static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, @@ -1963,7 +900,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, scx_error(sch, "attempting to dispatch to a destroyed dsq"); /* fall back to the global dsq */ raw_spin_unlock(&dsq->lock); - dsq = find_global_dsq(p); + dsq = find_global_dsq(sch, p); raw_spin_lock(&dsq->lock); } } @@ -2142,26 +1079,27 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch, s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) - return find_global_dsq(p); + return find_global_dsq(sch, p); return &cpu_rq(cpu)->scx.local_dsq; } if (dsq_id == SCX_DSQ_GLOBAL) - dsq = find_global_dsq(p); + dsq = find_global_dsq(sch, p); else dsq = find_user_dsq(sch, dsq_id); if (unlikely(!dsq)) { scx_error(sch, "non-existent DSQ 0x%llx for %s[%d]", dsq_id, p->comm, p->pid); - return find_global_dsq(p); + return find_global_dsq(sch, p); } return dsq; } -static void mark_direct_dispatch(struct task_struct *ddsp_task, +static void mark_direct_dispatch(struct scx_sched *sch, + struct task_struct *ddsp_task, struct task_struct *p, u64 dsq_id, u64 enq_flags) { @@ -2175,10 +1113,10 @@ static void mark_direct_dispatch(struct task_struct *ddsp_task, /* @p must match the task on the enqueue path */ if (unlikely(p != ddsp_task)) { if (IS_ERR(ddsp_task)) - scx_kf_error("%s[%d] already direct-dispatched", + scx_error(sch, "%s[%d] already direct-dispatched", p->comm, p->pid); else - scx_kf_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]", + scx_error(sch, "scheduling for %s[%d] but trying to direct-dispatch %s[%d]", ddsp_task->comm, ddsp_task->pid, p->comm, p->pid); return; @@ -2333,15 +1271,15 @@ local: * higher priority it becomes from scx_prio_less()'s POV. 
*/ touch_core_sched(rq, p); - refill_task_slice_dfl(p); + refill_task_slice_dfl(sch, p); local_norefill: dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags); return; global: touch_core_sched(rq, p); /* see the comment in local: */ - refill_task_slice_dfl(p); - dispatch_enqueue(sch, find_global_dsq(p), p, enq_flags); + refill_task_slice_dfl(sch, p); + dispatch_enqueue(sch, find_global_dsq(sch, p), p, enq_flags); } static bool task_runnable(const struct task_struct *p) @@ -2651,8 +1589,7 @@ static bool task_can_run_on_remote_rq(struct scx_sched *sch, if (!scx_rq_online(rq)) { if (enforce) - __scx_add_event(scx_root, - SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); + __scx_add_event(sch, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1); return false; } @@ -2754,7 +1691,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch, dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); if (src_rq != dst_rq && unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { - dst_dsq = find_global_dsq(p); + dst_dsq = find_global_dsq(sch, p); dst_rq = src_rq; } } else { @@ -2910,7 +1847,7 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq, if (src_rq != dst_rq && unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) { - dispatch_enqueue(sch, find_global_dsq(p), p, + dispatch_enqueue(sch, find_global_dsq(sch, p), p, enq_flags | SCX_ENQ_CLEAR_OPSS); return; } @@ -3155,10 +2092,10 @@ static int balance_one(struct rq *rq, struct task_struct *prev) * balance(), we want to complete this scheduling cycle and then * start a new one. IOW, we want to call resched_curr() on the * next, most likely idle, task, not the current one. Use - * scx_bpf_kick_cpu() for deferred kicking. + * scx_kick_cpu() for deferred kicking. */ if (unlikely(!--nr_loops)) { - scx_bpf_kick_cpu(cpu_of(rq), 0); + scx_kick_cpu(sch, cpu_of(rq), 0); break; } } while (dspc->nr_tasks); @@ -3442,24 +2379,25 @@ static struct task_struct *pick_task_scx(struct rq *rq) if (keep_prev) { p = prev; if (!p->scx.slice) - refill_task_slice_dfl(p); + refill_task_slice_dfl(rcu_dereference_sched(scx_root), p); } else { p = first_local_task(rq); if (!p) { if (kick_idle) - scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE); + scx_kick_cpu(rcu_dereference_sched(scx_root), + cpu_of(rq), SCX_KICK_IDLE); return NULL; } if (unlikely(!p->scx.slice)) { - struct scx_sched *sch = scx_root; + struct scx_sched *sch = rcu_dereference_sched(scx_root); if (!scx_rq_bypassing(rq) && !sch->warned_zero_slice) { printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n", p->comm, p->pid, __func__); sch->warned_zero_slice = true; } - refill_task_slice_dfl(p); + refill_task_slice_dfl(sch, p); } } @@ -3548,7 +2486,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, NULL, 0); if (cpu >= 0) { - refill_task_slice_dfl(p); + refill_task_slice_dfl(sch, p); p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; } else { cpu = prev_cpu; @@ -4084,7 +3022,7 @@ bool scx_can_stop_tick(struct rq *rq) #ifdef CONFIG_EXT_GROUP_SCHED -DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); +DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_ops_rwsem); static bool scx_cgroup_enabled; void scx_tg_init(struct task_group *tg) @@ -4101,8 +3039,6 @@ int scx_tg_online(struct task_group *tg) WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED)); - percpu_down_read(&scx_cgroup_rwsem); - if (scx_cgroup_enabled) { if (SCX_HAS_OP(sch, cgroup_init)) { struct scx_cgroup_init_args args = @@ -4122,7 +3058,6 @@ int 
scx_tg_online(struct task_group *tg) tg->scx.flags |= SCX_TG_ONLINE; } - percpu_up_read(&scx_cgroup_rwsem); return ret; } @@ -4132,15 +3067,11 @@ void scx_tg_offline(struct task_group *tg) WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE)); - percpu_down_read(&scx_cgroup_rwsem); - if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) && (tg->scx.flags & SCX_TG_INITED)) SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, tg->css.cgroup); tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); - - percpu_up_read(&scx_cgroup_rwsem); } int scx_cgroup_can_attach(struct cgroup_taskset *tset) @@ -4150,9 +3081,6 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset) struct task_struct *p; int ret; - /* released in scx_finish/cancel_attach() */ - percpu_down_read(&scx_cgroup_rwsem); - if (!scx_cgroup_enabled) return 0; @@ -4192,7 +3120,6 @@ err: p->scx.cgrp_moving_from = NULL; } - percpu_up_read(&scx_cgroup_rwsem); return ops_sanitize_err(sch, "cgroup_prep_move", ret); } @@ -4215,11 +3142,6 @@ void scx_cgroup_move_task(struct task_struct *p) p->scx.cgrp_moving_from = NULL; } -void scx_cgroup_finish_attach(void) -{ - percpu_up_read(&scx_cgroup_rwsem); -} - void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) { struct scx_sched *sch = scx_root; @@ -4227,7 +3149,7 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) struct task_struct *p; if (!scx_cgroup_enabled) - goto out_unlock; + return; cgroup_taskset_for_each(p, css, tset) { if (SCX_HAS_OP(sch, cgroup_cancel_move) && @@ -4236,15 +3158,13 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) p, p->scx.cgrp_moving_from, css->cgroup); p->scx.cgrp_moving_from = NULL; } -out_unlock: - percpu_up_read(&scx_cgroup_rwsem); } void scx_group_set_weight(struct task_group *tg, unsigned long weight) { struct scx_sched *sch = scx_root; - percpu_down_read(&scx_cgroup_rwsem); + percpu_down_read(&scx_cgroup_ops_rwsem); if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) && tg->scx.weight != weight) @@ -4253,7 +3173,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) tg->scx.weight = weight; - percpu_up_read(&scx_cgroup_rwsem); + percpu_up_read(&scx_cgroup_ops_rwsem); } void scx_group_set_idle(struct task_group *tg, bool idle) @@ -4266,7 +3186,7 @@ void scx_group_set_bandwidth(struct task_group *tg, { struct scx_sched *sch = scx_root; - percpu_down_read(&scx_cgroup_rwsem); + percpu_down_read(&scx_cgroup_ops_rwsem); if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) && (tg->scx.bw_period_us != period_us || @@ -4279,23 +3199,25 @@ void scx_group_set_bandwidth(struct task_group *tg, tg->scx.bw_quota_us = quota_us; tg->scx.bw_burst_us = burst_us; - percpu_up_read(&scx_cgroup_rwsem); + percpu_up_read(&scx_cgroup_ops_rwsem); } static void scx_cgroup_lock(void) { - percpu_down_write(&scx_cgroup_rwsem); + percpu_down_write(&scx_cgroup_ops_rwsem); + cgroup_lock(); } static void scx_cgroup_unlock(void) { - percpu_up_write(&scx_cgroup_rwsem); + cgroup_unlock(); + percpu_up_write(&scx_cgroup_ops_rwsem); } #else /* CONFIG_EXT_GROUP_SCHED */ -static inline void scx_cgroup_lock(void) {} -static inline void scx_cgroup_unlock(void) {} +static void scx_cgroup_lock(void) {} +static void scx_cgroup_unlock(void) {} #endif /* CONFIG_EXT_GROUP_SCHED */ @@ -4411,15 +3333,12 @@ static void scx_cgroup_exit(struct scx_sched *sch) { struct cgroup_subsys_state *css; - percpu_rwsem_assert_held(&scx_cgroup_rwsem); - scx_cgroup_enabled = false; /* - * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. 
If we walk + * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk * cgroups and exit all the inited ones, all online cgroups are exited. */ - rcu_read_lock(); css_for_each_descendant_post(css, &root_task_group.css) { struct task_group *tg = css_tg(css); @@ -4430,17 +3349,9 @@ static void scx_cgroup_exit(struct scx_sched *sch) if (!sch->ops.cgroup_exit) continue; - if (WARN_ON_ONCE(!css_tryget(css))) - continue; - rcu_read_unlock(); - SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL, css->cgroup); - - rcu_read_lock(); - css_put(css); } - rcu_read_unlock(); } static int scx_cgroup_init(struct scx_sched *sch) @@ -4448,13 +3359,10 @@ static int scx_cgroup_init(struct scx_sched *sch) struct cgroup_subsys_state *css; int ret; - percpu_rwsem_assert_held(&scx_cgroup_rwsem); - /* - * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk + * scx_tg_on/offline() are excluded through cgroup_lock(). If we walk * cgroups and init, all online cgroups are initialized. */ - rcu_read_lock(); css_for_each_descendant_pre(css, &root_task_group.css) { struct task_group *tg = css_tg(css); struct scx_cgroup_init_args args = { @@ -4473,10 +3381,6 @@ static int scx_cgroup_init(struct scx_sched *sch) continue; } - if (WARN_ON_ONCE(!css_tryget(css))) - continue; - rcu_read_unlock(); - ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL, css->cgroup, &args); if (ret) { @@ -4485,11 +3389,7 @@ static int scx_cgroup_init(struct scx_sched *sch) return ret; } tg->scx.flags |= SCX_TG_INITED; - - rcu_read_lock(); - css_put(css); } - rcu_read_unlock(); WARN_ON_ONCE(scx_cgroup_enabled); scx_cgroup_enabled = true; @@ -4572,7 +3472,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work) int node; kthread_stop(sch->helper->task); - free_percpu(sch->event_stats_cpu); + free_percpu(sch->pcpu); for_each_node_state(node, N_POSSIBLE) kfree(sch->global_dsqs[node]); @@ -4671,9 +3571,22 @@ bool task_should_scx(int policy) bool scx_allow_ttwu_queue(const struct task_struct *p) { - return !scx_enabled() || - (scx_root->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) || - p->sched_class != &ext_sched_class; + struct scx_sched *sch; + + if (!scx_enabled()) + return true; + + sch = rcu_dereference_sched(scx_root); + if (unlikely(!sch)) + return true; + + if (sch->ops.flags & SCX_OPS_ALLOW_QUEUED_WAKEUP) + return true; + + if (unlikely(p->sched_class != &ext_sched_class)) + return true; + + return false; } /** @@ -4789,7 +3702,7 @@ static void scx_clear_softlockup(void) * * - pick_next_task() suppresses zero slice warning. * - * - scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM + * - scx_kick_cpu() is disabled to avoid irq_work malfunction during PM * operations. * * - scx_prio_less() reverts to the default core_sched_at order. 
@@ -5234,7 +4147,8 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u", p->scx.dsq_vtime, p->scx.slice, p->scx.weight); - dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); + dump_line(s, " cpus=%*pb no_mig=%u", cpumask_pr_args(p->cpus_ptr), + p->migration_disabled); if (SCX_HAS_OP(sch, dump_task)) { ops_dump_init(s, " "); @@ -5473,13 +4387,13 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) sch->global_dsqs[node] = dsq; } - sch->event_stats_cpu = alloc_percpu(struct scx_event_stats); - if (!sch->event_stats_cpu) + sch->pcpu = alloc_percpu(struct scx_sched_pcpu); + if (!sch->pcpu) goto err_free_gdsqs; sch->helper = kthread_run_worker(0, "sched_ext_helper"); if (!sch->helper) - goto err_free_event_stats; + goto err_free_pcpu; sched_set_fifo(sch->helper->task); atomic_set(&sch->exit_kind, SCX_EXIT_NONE); @@ -5497,8 +4411,8 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) err_stop_helper: kthread_stop(sch->helper->task); -err_free_event_stats: - free_percpu(sch->event_stats_cpu); +err_free_pcpu: + free_percpu(sch->pcpu); err_free_gdsqs: for_each_node_state(node, N_POSSIBLE) kfree(sch->global_dsqs[node]); @@ -5621,6 +4535,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_error(sch, "ops.init() failed (%d)", ret); goto err_disable; } + sch->exit_info->flags |= SCX_EFLAG_INITIALIZED; } for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) @@ -5713,7 +4628,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) ret = scx_init_task(p, task_group(p), false); if (ret) { put_task_struct(p); - scx_task_iter_relock(&sti); scx_task_iter_stop(&sti); scx_error(sch, "ops.init_task() failed (%d) for %s[%d]", ret, p->comm, p->pid); @@ -5723,7 +4637,6 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_set_task_state(p, SCX_TASK_READY); put_task_struct(p); - scx_task_iter_relock(&sti); } scx_task_iter_stop(&sti); scx_cgroup_unlock(); @@ -5795,7 +4708,7 @@ err_unlock: err_disable_unlock_all: scx_cgroup_unlock(); percpu_up_write(&scx_fork_rwsem); - scx_bypass(false); + /* we'll soon enter disable path, keep bypass on */ err_disable: mutex_unlock(&scx_enable_mutex); /* @@ -6328,40 +5241,41 @@ void __init init_sched_ext_class(void) /******************************************************************************** * Helpers that can be called from the BPF scheduler. 
*/ -static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags) +static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p, + u64 enq_flags) { - if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) + if (!scx_kf_allowed(sch, SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) return false; lockdep_assert_irqs_disabled(); if (unlikely(!p)) { - scx_kf_error("called with NULL task"); + scx_error(sch, "called with NULL task"); return false; } if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) { - scx_kf_error("invalid enq_flags 0x%llx", enq_flags); + scx_error(sch, "invalid enq_flags 0x%llx", enq_flags); return false; } return true; } -static void scx_dsq_insert_commit(struct task_struct *p, u64 dsq_id, - u64 enq_flags) +static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p, + u64 dsq_id, u64 enq_flags) { struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); struct task_struct *ddsp_task; ddsp_task = __this_cpu_read(direct_dispatch_task); if (ddsp_task) { - mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags); + mark_direct_dispatch(sch, ddsp_task, p, dsq_id, enq_flags); return; } if (unlikely(dspc->cursor >= scx_dsp_max_batch)) { - scx_kf_error("dispatch buffer overflow"); + scx_error(sch, "dispatch buffer overflow"); return; } @@ -6413,7 +5327,14 @@ __bpf_kfunc_start_defs(); __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) { - if (!scx_dsq_insert_preamble(p, enq_flags)) + struct scx_sched *sch; + + guard(rcu)(); + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return; + + if (!scx_dsq_insert_preamble(sch, p, enq_flags)) return; if (slice) @@ -6421,7 +5342,7 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice else p->scx.slice = p->scx.slice ?: 1; - scx_dsq_insert_commit(p, dsq_id, enq_flags); + scx_dsq_insert_commit(sch, p, dsq_id, enq_flags); } /** @@ -6448,7 +5369,14 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) { - if (!scx_dsq_insert_preamble(p, enq_flags)) + struct scx_sched *sch; + + guard(rcu)(); + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return; + + if (!scx_dsq_insert_preamble(sch, p, enq_flags)) return; if (slice) @@ -6458,7 +5386,7 @@ __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, p->scx.dsq_vtime = vtime; - scx_dsq_insert_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); + scx_dsq_insert_commit(sch, p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); } __bpf_kfunc_end_defs(); @@ -6483,7 +5411,8 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, bool in_balance; unsigned long flags; - if (!scx_kf_allowed_if_unlocked() && !scx_kf_allowed(SCX_KF_DISPATCH)) + if (!scx_kf_allowed_if_unlocked() && + !scx_kf_allowed(sch, SCX_KF_DISPATCH)) return false; /* @@ -6568,7 +5497,15 @@ __bpf_kfunc_start_defs(); */ __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void) { - if (!scx_kf_allowed(SCX_KF_DISPATCH)) + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return 0; + + if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) return 0; return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor); @@ -6583,14 +5520,21 @@ __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void) __bpf_kfunc void scx_bpf_dispatch_cancel(void) { struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); + struct scx_sched *sch; + + guard(rcu)(); - if 
(!scx_kf_allowed(SCX_KF_DISPATCH)) + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return; + + if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) return; if (dspc->cursor > 0) dspc->cursor--; else - scx_kf_error("dispatch buffer underflow"); + scx_error(sch, "dispatch buffer underflow"); } /** @@ -6609,11 +5553,17 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void) */ __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id) { - struct scx_sched *sch = scx_root; struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); struct scx_dispatch_q *dsq; + struct scx_sched *sch; - if (!scx_kf_allowed(SCX_KF_DISPATCH)) + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return false; + + if (!scx_kf_allowed(sch, SCX_KF_DISPATCH)) return false; flush_dispatch_buf(sch, dspc->rq); @@ -6760,12 +5710,18 @@ __bpf_kfunc_start_defs(); */ __bpf_kfunc u32 scx_bpf_reenqueue_local(void) { + struct scx_sched *sch; LIST_HEAD(tasks); u32 nr_enqueued = 0; struct rq *rq; struct task_struct *p, *n; - if (!scx_kf_allowed(SCX_KF_CPU_RELEASE)) + guard(rcu)(); + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return 0; + + if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE)) return 0; rq = cpu_rq(smp_processor_id()); @@ -6877,22 +5833,12 @@ static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = { __bpf_kfunc_start_defs(); -/** - * scx_bpf_kick_cpu - Trigger reschedule on a CPU - * @cpu: cpu to kick - * @flags: %SCX_KICK_* flags - * - * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or - * trigger rescheduling on a busy CPU. This can be called from any online - * scx_ops operation and the actual kicking is performed asynchronously through - * an irq work. - */ -__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) +static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags) { struct rq *this_rq; unsigned long irq_flags; - if (!kf_cpu_valid(cpu, NULL)) + if (!ops_cpu_valid(sch, cpu, NULL)) return; local_irq_save(irq_flags); @@ -6916,7 +5862,7 @@ __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) struct rq *target_rq = cpu_rq(cpu); if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT))) - scx_kf_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); + scx_error(sch, "PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); if (raw_spin_rq_trylock(target_rq)) { if (can_skip_idle_kick(target_rq)) { @@ -6941,6 +5887,26 @@ out: } /** + * scx_bpf_kick_cpu - Trigger reschedule on a CPU + * @cpu: cpu to kick + * @flags: %SCX_KICK_* flags + * + * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or + * trigger rescheduling on a busy CPU. This can be called from any online + * scx_ops operation and the actual kicking is performed asynchronously through + * an irq work. 
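+ *
+ * For example, a BPF scheduler could wake an idle CPU after queueing new
+ * work (a sketch; MYOPS_DSQ is a hypothetical user-created DSQ):
+ *
+ *	s32 cpu;
+ *
+ *	scx_bpf_dsq_insert(p, MYOPS_DSQ, SCX_SLICE_DFL, 0);
+ *	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
+ *	if (cpu >= 0)
+ *		scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);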
+ */ +__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) +{ + struct scx_sched *sch; + + guard(rcu)(); + sch = rcu_dereference(scx_root); + if (likely(sch)) + scx_kick_cpu(sch, cpu, flags); +} + +/** * scx_bpf_dsq_nr_queued - Return the number of queued tasks * @dsq_id: id of the DSQ * @@ -7120,28 +6086,29 @@ __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __bpf_kfunc_end_defs(); -static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size, - char *fmt, unsigned long long *data, u32 data__sz) +static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf, + size_t line_size, char *fmt, unsigned long long *data, + u32 data__sz) { struct bpf_bprintf_data bprintf_data = { .get_bin_args = true }; s32 ret; if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 || (data__sz && !data)) { - scx_kf_error("invalid data=%p and data__sz=%u", (void *)data, data__sz); + scx_error(sch, "invalid data=%p and data__sz=%u", (void *)data, data__sz); return -EINVAL; } ret = copy_from_kernel_nofault(data_buf, data, data__sz); if (ret < 0) { - scx_kf_error("failed to read data fields (%d)", ret); + scx_error(sch, "failed to read data fields (%d)", ret); return ret; } ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8, &bprintf_data); if (ret < 0) { - scx_kf_error("format preparation failed (%d)", ret); + scx_error(sch, "format preparation failed (%d)", ret); return ret; } @@ -7149,17 +6116,17 @@ static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size, bprintf_data.bin_args); bpf_bprintf_cleanup(&bprintf_data); if (ret < 0) { - scx_kf_error("(\"%s\", %p, %u) failed to format", fmt, data, data__sz); + scx_error(sch, "(\"%s\", %p, %u) failed to format", fmt, data, data__sz); return ret; } return ret; } -static s32 bstr_format(struct scx_bstr_buf *buf, +static s32 bstr_format(struct scx_sched *sch, struct scx_bstr_buf *buf, char *fmt, unsigned long long *data, u32 data__sz) { - return __bstr_format(buf->data, buf->line, sizeof(buf->line), + return __bstr_format(sch, buf->data, buf->line, sizeof(buf->line), fmt, data, data__sz); } @@ -7178,11 +6145,14 @@ __bpf_kfunc_start_defs(); __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) { + struct scx_sched *sch; unsigned long flags; raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); - if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) - scx_kf_exit(SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line); + sch = rcu_dereference_bh(scx_root); + if (likely(sch) && + bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) + scx_exit(sch, SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line); raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); } @@ -7198,11 +6168,14 @@ __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data__sz) { + struct scx_sched *sch; unsigned long flags; raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); - if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) - scx_kf_exit(SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line); + sch = rcu_dereference_bh(scx_root); + if (likely(sch) && + bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0) + scx_exit(sch, SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line); raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); } @@ -7221,17 +6194,24 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, 
__bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data__sz) { + struct scx_sched *sch; struct scx_dump_data *dd = &scx_dump_data; struct scx_bstr_buf *buf = &dd->buf; s32 ret; + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return; + if (raw_smp_processor_id() != dd->cpu) { - scx_kf_error("scx_bpf_dump() must only be called from ops.dump() and friends"); + scx_error(sch, "scx_bpf_dump() must only be called from ops.dump() and friends"); return; } /* append the formatted string to the line buf */ - ret = __bstr_format(buf->data, buf->line + dd->cursor, + ret = __bstr_format(sch, buf->data, buf->line + dd->cursor, sizeof(buf->line) - dd->cursor, fmt, data, data__sz); if (ret < 0) { dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)", @@ -7267,7 +6247,12 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, */ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu) { - if (kf_cpu_valid(cpu, NULL)) + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (likely(sch) && ops_cpu_valid(sch, cpu, NULL)) return arch_scale_cpu_capacity(cpu); else return SCX_CPUPERF_ONE; @@ -7289,7 +6274,12 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu) */ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu) { - if (kf_cpu_valid(cpu, NULL)) + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (likely(sch) && ops_cpu_valid(sch, cpu, NULL)) return arch_scale_freq_capacity(cpu); else return SCX_CPUPERF_ONE; @@ -7311,12 +6301,20 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu) */ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf) { + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return; + if (unlikely(perf > SCX_CPUPERF_ONE)) { - scx_kf_error("Invalid cpuperf target %u for CPU %d", perf, cpu); + scx_error(sch, "Invalid cpuperf target %u for CPU %d", perf, cpu); return; } - if (kf_cpu_valid(cpu, NULL)) { + if (ops_cpu_valid(sch, cpu, NULL)) { struct rq *rq = cpu_rq(cpu), *locked_rq = scx_locked_rq(); struct rq_flags rf; @@ -7325,7 +6323,7 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf) * to the corresponding CPU to prevent ABBA deadlocks. */ if (locked_rq && rq != locked_rq) { - scx_kf_error("Invalid target CPU %d", cpu); + scx_error(sch, "Invalid target CPU %d", cpu); return; } @@ -7420,13 +6418,76 @@ __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p) */ __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu) { - if (!kf_cpu_valid(cpu, NULL)) + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) return NULL; + if (!ops_cpu_valid(sch, cpu, NULL)) + return NULL; + + if (!sch->warned_deprecated_rq) { + printk_deferred(KERN_WARNING "sched_ext: %s() is deprecated; " + "use scx_bpf_locked_rq() when holding rq lock " + "or scx_bpf_cpu_curr() to read remote curr safely.\n", __func__); + sch->warned_deprecated_rq = true; + } + + return cpu_rq(cpu); } /** + * scx_bpf_locked_rq - Return the rq currently locked by SCX + * + * Returns the rq if a rq lock is currently held by SCX. + * Otherwise emits an error and returns NULL.
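+ *
+ * A sketch of the intended use, replacing the deprecated scx_bpf_cpu_rq()
+ * (the calling context is illustrative):
+ *
+ *	// from an rq-locked operation such as ops.tick()
+ *	struct rq *rq = scx_bpf_locked_rq();
+ *
+ *	if (!rq)
+ *		return;	// no rq lock held, an error has been emitted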
+ */ +__bpf_kfunc struct rq *scx_bpf_locked_rq(void) +{ + struct scx_sched *sch; + struct rq *rq; + + guard(preempt)(); + + sch = rcu_dereference_sched(scx_root); + if (unlikely(!sch)) + return NULL; + + rq = scx_locked_rq(); + if (!rq) { + scx_error(sch, "accessing rq without holding rq lock"); + return NULL; + } + + return rq; +} + +/** + * scx_bpf_cpu_curr - Return remote CPU's curr task + * @cpu: CPU of interest + * + * Callers must hold RCU read lock (KF_RCU). + */ +__bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu) +{ + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return NULL; + + if (!ops_cpu_valid(sch, cpu, NULL)) + return NULL; + + return rcu_dereference(cpu_rq(cpu)->curr); +} + +/** * scx_bpf_task_cgroup - Return the sched cgroup of a task * @p: task of interest * @@ -7442,8 +6503,15 @@ __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) { struct task_group *tg = p->sched_task_group; struct cgroup *cgrp = &cgrp_dfl_root.cgrp; + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + goto out; - if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p)) + if (!scx_kf_allowed_on_arg_tasks(sch, __SCX_KF_RQ_LOCKED, p)) goto out; cgrp = tg_cgrp(tg); @@ -7524,7 +6592,7 @@ static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *event /* Aggregate per-CPU event counters into @events. */ memset(events, 0, sizeof(*events)); for_each_possible_cpu(cpu) { - e_cpu = per_cpu_ptr(sch->event_stats_cpu, cpu); + e_cpu = &per_cpu_ptr(sch->pcpu, cpu)->event_stats; scx_agg_event(events, e_cpu, SCX_EV_SELECT_CPU_FALLBACK); scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE); scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST); @@ -7590,6 +6658,8 @@ BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_cpu_rq) +BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_RET_NULL) +BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_RET_NULL | KF_RCU_PROTECTED) #ifdef CONFIG_CGROUP_SCHED BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) #endif diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 292bb41a242e..43429b33e52c 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -8,29 +8,6 @@ */ #ifdef CONFIG_SCHED_CLASS_EXT -static inline bool scx_kf_allowed_if_unlocked(void) -{ - return !current->scx.kf_mask; -} - -static inline bool scx_rq_bypassing(struct rq *rq) -{ - return unlikely(rq->scx.flags & SCX_RQ_BYPASSING); -} - -DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup); - -DECLARE_PER_CPU(struct rq *, scx_locked_rq_state); - -/* - * Return the rq currently locked from an scx callback, or NULL if no rq is - * locked. 
- */ -static inline struct rq *scx_locked_rq(void) -{ - return __this_cpu_read(scx_locked_rq_state); -} - void scx_tick(struct rq *rq); void init_scx_entity(struct sched_ext_entity *scx); void scx_pre_fork(struct task_struct *p); @@ -100,7 +77,6 @@ int scx_tg_online(struct task_group *tg); void scx_tg_offline(struct task_group *tg); int scx_cgroup_can_attach(struct cgroup_taskset *tset); void scx_cgroup_move_task(struct task_struct *p); -void scx_cgroup_finish_attach(void); void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); void scx_group_set_idle(struct task_group *tg, bool idle); @@ -111,7 +87,6 @@ static inline int scx_tg_online(struct task_group *tg) { return 0; } static inline void scx_tg_offline(struct task_group *tg) {} static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } static inline void scx_cgroup_move_task(struct task_struct *p) {} -static inline void scx_cgroup_finish_attach(void) {} static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} static inline void scx_group_set_idle(struct task_group *tg, bool idle) {} diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c index 537c6992bb63..d2434c954848 100644 --- a/kernel/sched/ext_idle.c +++ b/kernel/sched/ext_idle.c @@ -819,10 +819,10 @@ void scx_idle_disable(void) * Helpers that can be called from the BPF scheduler. */ -static int validate_node(int node) +static int validate_node(struct scx_sched *sch, int node) { if (!static_branch_likely(&scx_builtin_idle_per_node)) { - scx_kf_error("per-node idle tracking is disabled"); + scx_error(sch, "per-node idle tracking is disabled"); return -EOPNOTSUPP; } @@ -832,13 +832,13 @@ static int validate_node(int node) /* Make sure node is in a valid range */ if (node < 0 || node >= nr_node_ids) { - scx_kf_error("invalid node %d", node); + scx_error(sch, "invalid node %d", node); return -EINVAL; } /* Make sure the node is part of the set of possible nodes */ if (!node_possible(node)) { - scx_kf_error("unavailable node %d", node); + scx_error(sch, "unavailable node %d", node); return -EINVAL; } @@ -847,12 +847,12 @@ static int validate_node(int node) __bpf_kfunc_start_defs(); -static bool check_builtin_idle_enabled(void) +static bool check_builtin_idle_enabled(struct scx_sched *sch) { if (static_branch_likely(&scx_builtin_idle_enabled)) return true; - scx_kf_error("built-in idle tracking is disabled"); + scx_error(sch, "built-in idle tracking is disabled"); return false; } @@ -882,17 +882,18 @@ static bool is_bpf_migration_disabled(const struct task_struct *p) return p->migration_disabled; } -static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags, +static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p, + s32 prev_cpu, u64 wake_flags, const struct cpumask *allowed, u64 flags) { struct rq *rq; struct rq_flags rf; s32 cpu; - if (!kf_cpu_valid(prev_cpu, NULL)) + if (!ops_cpu_valid(sch, prev_cpu, NULL)) return -EINVAL; - if (!check_builtin_idle_enabled()) + if (!check_builtin_idle_enabled(sch)) return -EBUSY; /* @@ -905,7 +906,7 @@ static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_f if (scx_kf_allowed_if_unlocked()) { rq = task_rq_lock(p, &rf); } else { - if (!scx_kf_allowed(SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE)) + if (!scx_kf_allowed(sch, SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE)) return 
-EPERM; rq = scx_locked_rq(); } @@ -948,9 +949,13 @@ static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_f */ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu) { - if (!kf_cpu_valid(cpu, NULL)) - return NUMA_NO_NODE; + struct scx_sched *sch; + + guard(rcu)(); + sch = rcu_dereference(scx_root); + if (unlikely(!sch) || !ops_cpu_valid(sch, cpu, NULL)) + return NUMA_NO_NODE; return cpu_to_node(cpu); } @@ -972,15 +977,21 @@ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu) __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) { + struct scx_sched *sch; s32 cpu; - cpu = select_cpu_from_kfunc(p, prev_cpu, wake_flags, NULL, 0); + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + + cpu = select_cpu_from_kfunc(sch, p, prev_cpu, wake_flags, NULL, 0); if (cpu >= 0) { *is_idle = true; return cpu; } *is_idle = false; - return prev_cpu; } @@ -1007,7 +1018,16 @@ __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, __bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags, const struct cpumask *cpus_allowed, u64 flags) { - return select_cpu_from_kfunc(p, prev_cpu, wake_flags, cpus_allowed, flags); + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + + return select_cpu_from_kfunc(sch, p, prev_cpu, wake_flags, + cpus_allowed, flags); } /** @@ -1021,7 +1041,15 @@ __bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 */ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) { - node = validate_node(node); + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return cpu_none_mask; + + node = validate_node(sch, node); if (node < 0) return cpu_none_mask; @@ -1037,12 +1065,20 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) */ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) { + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return cpu_none_mask; + if (static_branch_unlikely(&scx_builtin_idle_per_node)) { - scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); + scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); return cpu_none_mask; } - if (!check_builtin_idle_enabled()) + if (!check_builtin_idle_enabled(sch)) return cpu_none_mask; return idle_cpumask(NUMA_NO_NODE)->cpu; @@ -1060,7 +1096,15 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) */ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) { - node = validate_node(node); + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return cpu_none_mask; + + node = validate_node(sch, node); if (node < 0) return cpu_none_mask; @@ -1080,12 +1124,20 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) */ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) { + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return cpu_none_mask; + if (static_branch_unlikely(&scx_builtin_idle_per_node)) { - scx_kf_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); + scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE enabled"); return cpu_none_mask; } - if (!check_builtin_idle_enabled()) + if (!check_builtin_idle_enabled(sch)) return cpu_none_mask; if (sched_smt_active()) @@ -1121,10 +1173,18 @@ 
__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) */ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) { - if (!check_builtin_idle_enabled()) + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) return false; - if (!kf_cpu_valid(cpu, NULL)) + if (!check_builtin_idle_enabled(sch)) + return false; + + if (!ops_cpu_valid(sch, cpu, NULL)) return false; return scx_idle_test_and_clear_cpu(cpu); @@ -1152,7 +1212,15 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed, int node, u64 flags) { - node = validate_node(node); + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + + node = validate_node(sch, node); if (node < 0) return node; @@ -1184,12 +1252,20 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed, __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { + struct scx_sched *sch; + + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) { - scx_kf_error("per-node idle tracking is enabled"); + scx_error(sch, "per-node idle tracking is enabled"); return -EBUSY; } - if (!check_builtin_idle_enabled()) + if (!check_builtin_idle_enabled(sch)) return -EBUSY; return scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags); @@ -1219,9 +1295,16 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, __bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed, int node, u64 flags) { + struct scx_sched *sch; s32 cpu; - node = validate_node(node); + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + + node = validate_node(sch, node); if (node < 0) return node; @@ -1259,10 +1342,17 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed, __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, u64 flags) { + struct scx_sched *sch; s32 cpu; + guard(rcu)(); + + sch = rcu_dereference(scx_root); + if (unlikely(!sch)) + return -ENODEV; + if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) { - scx_kf_error("per-node idle tracking is enabled"); + scx_error(sch, "per-node idle tracking is enabled"); return -EBUSY; } diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h new file mode 100644 index 000000000000..b3617abed510 --- /dev/null +++ b/kernel/sched/ext_internal.h @@ -0,0 +1,1078 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst + * + * Copyright (c) 2025 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2025 Tejun Heo <tj@kernel.org> + */ +#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) + +enum scx_consts { + SCX_DSP_DFL_MAX_BATCH = 32, + SCX_DSP_MAX_LOOPS = 32, + SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, + + SCX_EXIT_BT_LEN = 64, + SCX_EXIT_MSG_LEN = 1024, + SCX_EXIT_DUMP_DFL_LEN = 32768, + + SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, + + /* + * Iterating all tasks may take a while. Periodically drop + * scx_tasks_lock to avoid causing e.g. CSD and RCU stalls. 
+ */ + SCX_TASK_ITER_BATCH = 32, +}; + +enum scx_exit_kind { + SCX_EXIT_NONE, + SCX_EXIT_DONE, + + SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */ + SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */ + SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */ + SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ + + SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ + SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ + SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ +}; + +/* + * An exit code can be specified when exiting with scx_bpf_exit() or scx_exit(), + * corresponding to exit_kind UNREG_BPF and UNREG_KERN respectively. The codes + * are 64bit of the format: + * + * Bits: [63 .. 48 47 .. 32 31 .. 0] + * [ SYS ACT ] [ SYS RSN ] [ USR ] + * + * SYS ACT: System-defined exit actions + * SYS RSN: System-defined exit reasons + * USR : User-defined exit codes and reasons + * + * Using the above, users may communicate intention and context by ORing system + * actions and/or system reasons with a user-defined exit code. + */ +enum scx_exit_code { + /* Reasons */ + SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, + + /* Actions */ + SCX_ECODE_ACT_RESTART = 1LLU << 48, +}; + +enum scx_exit_flags { + /* + * ops.exit() may be called even if the loading failed before ops.init() + * finishes successfully. This is because ops.exit() allows rich exit + * info communication. The following flag indicates whether ops.init() + * finished successfully. + */ + SCX_EFLAG_INITIALIZED = 1LLU << 0, +}; + +/* + * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is + * being disabled. + */ +struct scx_exit_info { + /* %SCX_EXIT_* - broad category of the exit reason */ + enum scx_exit_kind kind; + + /* exit code if gracefully exiting */ + s64 exit_code; + + /* %SCX_EFLAG_* */ + u64 flags; + + /* textual representation of the above */ + const char *reason; + + /* backtrace if exiting due to an error */ + unsigned long *bt; + u32 bt_len; + + /* informational message */ + char *msg; + + /* debug dump */ + char *dump; +}; + +/* sched_ext_ops.flags */ +enum scx_ops_flags { + /* + * Keep built-in idle tracking even if ops.update_idle() is implemented. + */ + SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, + + /* + * By default, if there are no other tasks to run on the CPU, ext core + * keeps running the current task even after its slice expires. If this + * flag is specified, such tasks are passed to ops.enqueue() with + * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. + */ + SCX_OPS_ENQ_LAST = 1LLU << 1, + + /* + * An exiting task may schedule after PF_EXITING is set. In such cases, + * bpf_task_from_pid() may not be able to find the task and if the BPF + * scheduler depends on pid lookup for dispatching, the task will be + * lost, leading to various issues including RCU grace period stalls. + * + * To mask this problem, by default, unhashed tasks are automatically + * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't + * depend on pid lookups and wants to handle these tasks directly, the + * following flag can be used. + */ + SCX_OPS_ENQ_EXITING = 1LLU << 2, + + /* + * If set, only tasks with policy set to SCHED_EXT are attached to + * sched_ext. If clear, SCHED_NORMAL tasks are also included. + */ + SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, + + /* + * A migration disabled task can only execute on its current CPU. By + * default, such tasks are automatically put on the CPU's local DSQ with + * the default slice on enqueue.
If this ops flag is set, they also go + * through ops.enqueue(). + * + * A migration disabled task never invokes ops.select_cpu() as it can + * only select the current CPU. Also, p->cpus_ptr will only contain its + * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr + * and thus may disagree with cpumask_weight(p->cpus_ptr). + */ + SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4, + + /* + * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes + * ops.enqueue() on the CPU selected by ops.select_cpu() or on the + * wakee's previous CPU via IPI (inter-processor interrupt) to reduce + * cacheline transfers. When this optimization is enabled, + * ops.select_cpu() is skipped in some cases (when racing against the + * wakee switching out). As the BPF scheduler may depend on + * ops.select_cpu() being invoked during wakeups, queued wakeup is + * disabled by default. + * + * If this ops flag is set, queued wakeup optimization is enabled and + * the BPF scheduler must be able to handle ops.enqueue() invoked on the + * wakee's CPU without preceding ops.select_cpu() even for tasks which + * may be executed on multiple CPUs. + */ + SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5, + + /* + * If set, enable per-node idle cpumasks. If clear, use a single global + * flat idle cpumask. + */ + SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6, + + /* + * CPU cgroup support flags + */ + SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed in 6.18 */ + + SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | + SCX_OPS_ENQ_LAST | + SCX_OPS_ENQ_EXITING | + SCX_OPS_ENQ_MIGRATION_DISABLED | + SCX_OPS_ALLOW_QUEUED_WAKEUP | + SCX_OPS_SWITCH_PARTIAL | + SCX_OPS_BUILTIN_IDLE_PER_NODE | + SCX_OPS_HAS_CGROUP_WEIGHT, + + /* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */ + __SCX_OPS_INTERNAL_MASK = 0xffLLU << 56, + + SCX_OPS_HAS_CPU_PREEMPT = 1LLU << 56, +}; + +/* argument container for ops.init_task() */ +struct scx_init_task_args { + /* + * Set if ops.init_task() is being invoked on the fork path, as opposed + * to the scheduler transition path. + */ + bool fork; +#ifdef CONFIG_EXT_GROUP_SCHED + /* the cgroup the task is joining */ + struct cgroup *cgroup; +#endif +}; + +/* argument container for ops.exit_task() */ +struct scx_exit_task_args { + /* Whether the task exited before running on sched_ext. */ + bool cancelled; +}; + +/* argument container for ops->cgroup_init() */ +struct scx_cgroup_init_args { + /* the weight of the cgroup [1..10000] */ + u32 weight; + + /* bandwidth control parameters from cpu.max and cpu.max.burst */ + u64 bw_period_us; + u64 bw_quota_us; + u64 bw_burst_us; +}; + +enum scx_cpu_preempt_reason { + /* next task is being scheduled by &sched_class_rt */ + SCX_CPU_PREEMPT_RT, + /* next task is being scheduled by &sched_class_dl */ + SCX_CPU_PREEMPT_DL, + /* next task is being scheduled by &sched_class_stop */ + SCX_CPU_PREEMPT_STOP, + /* unknown reason for SCX being preempted */ + SCX_CPU_PREEMPT_UNKNOWN, +}; + +/* + * Argument container for ops->cpu_acquire(). Currently empty, but may be + * expanded in the future. + */ +struct scx_cpu_acquire_args {}; + +/* argument container for ops->cpu_release() */ +struct scx_cpu_release_args { + /* the reason the CPU was preempted */ + enum scx_cpu_preempt_reason reason; + + /* the task that's going to be scheduled on the CPU */ + struct task_struct *task; +}; + +/* + * Informational context provided to dump operations.
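+ *
+ * For example, a hypothetical ops.dump() implementation could forward a
+ * few of these fields through scx_bpf_dump() (a sketch; BPF_STRUCT_OPS
+ * comes from the scx tooling headers):
+ *
+ *	void BPF_STRUCT_OPS(myops_dump, struct scx_dump_ctx *dctx)
+ *	{
+ *		scx_bpf_dump("exit kind=%d code=%lld at=%llu ns",
+ *			     dctx->kind, dctx->exit_code, dctx->at_ns);
+ *	}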
+ */ +struct scx_dump_ctx { + enum scx_exit_kind kind; + s64 exit_code; + const char *reason; + u64 at_ns; + u64 at_jiffies; +}; + +/** + * struct sched_ext_ops - Operation table for BPF scheduler implementation + * + * A BPF scheduler can implement an arbitrary scheduling policy by + * implementing and loading operations in this table. Note that a userland + * scheduling policy can also be implemented using the BPF scheduler + * as a shim layer. + */ +struct sched_ext_ops { + /** + * @select_cpu: Pick the target CPU for a task which is being woken up + * @p: task being woken up + * @prev_cpu: the cpu @p was on before sleeping + * @wake_flags: SCX_WAKE_* + * + * Decision made here isn't final. @p may be moved to any CPU while it + * is getting dispatched for execution later. However, as @p is not on + * the rq at this point, getting the eventual execution CPU right here + * saves a small bit of overhead down the line. + * + * If an idle CPU is returned, the CPU is kicked and will try to + * dispatch. While an explicit custom mechanism can be added, + * select_cpu() serves as the default way to wake up idle CPUs. + * + * @p may be inserted into a DSQ directly by calling + * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped. + * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ + * of the CPU returned by this operation. + * + * Note that select_cpu() is never called for tasks that can only run + * on a single CPU or tasks with migration disabled, as they don't have + * the option to select a different CPU. See select_task_rq() for + * details. + */ + s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); + + /** + * @enqueue: Enqueue a task on the BPF scheduler + * @p: task being enqueued + * @enq_flags: %SCX_ENQ_* + * + * @p is ready to run. Insert directly into a DSQ by calling + * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly + * inserted, the bpf scheduler owns @p and if it fails to dispatch @p, + * the task will stall. + * + * If @p was inserted into a DSQ from ops.select_cpu(), this callback is + * skipped. + */ + void (*enqueue)(struct task_struct *p, u64 enq_flags); + + /** + * @dequeue: Remove a task from the BPF scheduler + * @p: task being dequeued + * @deq_flags: %SCX_DEQ_* + * + * Remove @p from the BPF scheduler. This is usually called to isolate + * the task while updating its scheduling properties (e.g. priority). + * + * The ext core keeps track of whether the BPF side owns a given task or + * not and can gracefully ignore spurious dispatches from BPF side, + * which makes it safe to not implement this method. However, depending + * on the scheduling logic, this can lead to confusing behaviors - e.g. + * scheduling position not being updated across a priority change. + */ + void (*dequeue)(struct task_struct *p, u64 deq_flags); + + /** + * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs + * @cpu: CPU to dispatch tasks for + * @prev: previous task being switched out + * + * Called when a CPU's local dsq is empty. The operation should dispatch + * one or more tasks from the BPF scheduler into the DSQs using + * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ + * using scx_bpf_dsq_move_to_local(). + * + * The maximum number of times scx_bpf_dsq_insert() can be called + * without an intervening scx_bpf_dsq_move_to_local() is specified by + * ops.dispatch_max_batch. See the comments on top of the two functions + * for more details. 
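+ *
+ * A minimal illustrative implementation (a sketch; MYOPS_SHARED_DSQ is a
+ * hypothetical DSQ created by the scheduler):
+ *
+ *	void BPF_STRUCT_OPS(myops_dispatch, s32 cpu, struct task_struct *prev)
+ *	{
+ *		// run the first task queued on the shared DSQ, if any;
+ *		// otherwise the CPU keeps @prev or goes idle
+ *		scx_bpf_dsq_move_to_local(MYOPS_SHARED_DSQ);
+ *	}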
+ * + * When not %NULL, @prev is an SCX task with its slice depleted. If + * @prev is still runnable, as indicated by %SCX_TASK_QUEUED being set in + * @prev->scx.flags, it is not enqueued yet and will be enqueued after + * ops.dispatch() returns. To keep executing @prev, return without + * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST. + */ + void (*dispatch)(s32 cpu, struct task_struct *prev); + + /** + * @tick: Periodic tick + * @p: task running currently + * + * This operation is called every 1/HZ seconds on CPUs which are + * executing an SCX task. Setting @p->scx.slice to 0 will trigger an + * immediate dispatch cycle on the CPU. + */ + void (*tick)(struct task_struct *p); + + /** + * @runnable: A task is becoming runnable on its associated CPU + * @p: task becoming runnable + * @enq_flags: %SCX_ENQ_* + * + * This and the following three functions can be used to track a task's + * execution state transitions. A task becomes ->runnable() on a CPU, + * and then goes through one or more ->running() and ->stopping() pairs + * as it runs on the CPU, and eventually becomes ->quiescent() when it's + * done running on the CPU. + * + * @p is becoming runnable on the CPU because it's + * + * - waking up (%SCX_ENQ_WAKEUP) + * - being moved from another CPU + * - being restored after temporarily taken off the queue for an + * attribute change. + * + * This and ->enqueue() are related but not coupled. This operation + * notifies @p's state transition and may not be followed by ->enqueue() + * e.g. when @p is being dispatched to a remote CPU, or when @p is + * being enqueued on a CPU experiencing a hotplug event. Likewise, a + * task may be ->enqueue()'d without being preceded by this operation + * e.g. after exhausting its slice. + */ + void (*runnable)(struct task_struct *p, u64 enq_flags); + + /** + * @running: A task is starting to run on its associated CPU + * @p: task starting to run + * + * Note that this callback may be called from a CPU other than the + * one the task is going to run on. This can happen when a task + * property is changed (e.g., affinity), since scx_next_task_scx(), + * which triggers this callback, may run on a CPU different from + * the task's assigned CPU. + * + * Therefore, always use scx_bpf_task_cpu(@p) to determine the + * target CPU the task is going to use. + * + * See ->runnable() for explanation on the task state notifiers. + */ + void (*running)(struct task_struct *p); + + /** + * @stopping: A task is stopping execution + * @p: task that is stopping + * @runnable: is task @p still runnable? + * + * Note that this callback may be called from a CPU other than the + * one the task was running on. This can happen when a task + * property is changed (e.g., affinity), since dequeue_task_scx(), + * which triggers this callback, may run on a CPU different from + * the task's assigned CPU. + * + * Therefore, always use scx_bpf_task_cpu(@p) to retrieve the CPU + * the task was running on. + * + * See ->runnable() for explanation on the task state notifiers. If + * !@runnable, ->quiescent() will be invoked after this operation + * returns. + */ + void (*stopping)(struct task_struct *p, bool runnable); + + /** + * @quiescent: A task is becoming not runnable on its associated CPU + * @p: task becoming not runnable + * @deq_flags: %SCX_DEQ_* + * + * See ->runnable() for explanation on the task state notifiers.
+ * + * @p is becoming quiescent on the CPU because it's + * + * - sleeping (%SCX_DEQ_SLEEP) + * - being moved to another CPU + * - being temporarily taken off the queue for an attribute change + * (%SCX_DEQ_SAVE) + * + * This and ->dequeue() are related but not coupled. This operation + * notifies @p's state transition and may not be preceded by ->dequeue() + * e.g. when @p is being dispatched to a remote CPU. + */ + void (*quiescent)(struct task_struct *p, u64 deq_flags); + + /** + * @yield: Yield CPU + * @from: yielding task + * @to: optional yield target task + * + * If @to is NULL, @from is yielding the CPU to other runnable tasks. + * The BPF scheduler should ensure that other available tasks are + * dispatched before the yielding task. Return value is ignored in this + * case. + * + * If @to is non-NULL, @from wants to yield the CPU to @to. If the BPF + * scheduler can implement the request, return %true; otherwise, %false. + */ + bool (*yield)(struct task_struct *from, struct task_struct *to); + + /** + * @core_sched_before: Task ordering for core-sched + * @a: task A + * @b: task B + * + * Used by core-sched to determine the ordering between two tasks. See + * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on + * core-sched. + * + * Both @a and @b are runnable and may or may not currently be queued on + * the BPF scheduler. Should return %true if @a should run before @b. + * %false if there's no required ordering or @b should run before @a. + * + * If not specified, the default is ordering them according to when they + * became runnable. + */ + bool (*core_sched_before)(struct task_struct *a, struct task_struct *b); + + /** + * @set_weight: Set task weight + * @p: task to set weight for + * @weight: new weight [1..10000] + * + * Update @p's weight to @weight. + */ + void (*set_weight)(struct task_struct *p, u32 weight); + + /** + * @set_cpumask: Set CPU affinity + * @p: task to set CPU affinity for + * @cpumask: cpumask of cpus that @p can run on + * + * Update @p's CPU affinity to @cpumask. + */ + void (*set_cpumask)(struct task_struct *p, + const struct cpumask *cpumask); + + /** + * @update_idle: Update the idle state of a CPU + * @cpu: CPU to update the idle state for + * @idle: whether entering or exiting the idle state + * + * This operation is called when @rq's CPU enters or leaves the idle + * state. By default, implementing this operation disables the built-in + * idle CPU tracking and the following helpers become unavailable: + * + * - scx_bpf_select_cpu_dfl() + * - scx_bpf_select_cpu_and() + * - scx_bpf_test_and_clear_cpu_idle() + * - scx_bpf_pick_idle_cpu() + * + * The user also must implement ops.select_cpu() as the default + * implementation relies on scx_bpf_select_cpu_dfl(). + * + * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle + * tracking. + */ + void (*update_idle)(s32 cpu, bool idle); + + /** + * @cpu_acquire: A CPU is becoming available to the BPF scheduler + * @cpu: The CPU being acquired by the BPF scheduler. + * @args: Acquire arguments, see the struct definition. + * + * A CPU that was previously released from the BPF scheduler is now once + * again under its control. + */ + void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); + + /** + * @cpu_release: A CPU is taken away from the BPF scheduler + * @cpu: The CPU being released by the BPF scheduler. + * @args: Release arguments, see the struct definition. + * + * The specified CPU is no longer under the control of the BPF + * scheduler.
This could be because it was preempted by a higher + * priority sched_class, though there may be other reasons as well. The + * caller should consult @args->reason to determine the cause. + */ + void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); + + /** + * @init_task: Initialize a task to run in a BPF scheduler + * @p: task to initialize for BPF scheduling + * @args: init arguments, see the struct definition + * + * Either we're loading a BPF scheduler or a new task is being forked. + * Initialize @p for BPF scheduling. This operation may block and can + * be used for allocations, and is called exactly once for a task. + * + * Return 0 for success, -errno for failure. An error return while + * loading will abort loading of the BPF scheduler. During a fork, it + * will abort that specific fork. + */ + s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args); + + /** + * @exit_task: Exit a previously-running task from the system + * @p: task to exit + * @args: exit arguments, see the struct definition + * + * @p is exiting or the BPF scheduler is being unloaded. Perform any + * necessary cleanup for @p. + */ + void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args); + + /** + * @enable: Enable BPF scheduling for a task + * @p: task to enable BPF scheduling for + * + * Enable @p for BPF scheduling. enable() is called on @p any time it + * enters SCX, and is always paired with a matching disable(). + */ + void (*enable)(struct task_struct *p); + + /** + * @disable: Disable BPF scheduling for a task + * @p: task to disable BPF scheduling for + * + * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. + * Disable BPF scheduling for @p. A disable() call is always matched + * with a prior enable() call. + */ + void (*disable)(struct task_struct *p); + + /** + * @dump: Dump BPF scheduler state on error + * @ctx: debug dump context + * + * Use scx_bpf_dump() to generate BPF scheduler specific debug dump. + */ + void (*dump)(struct scx_dump_ctx *ctx); + + /** + * @dump_cpu: Dump BPF scheduler state for a CPU on error + * @ctx: debug dump context + * @cpu: CPU to generate debug dump for + * @idle: @cpu is currently idle without any runnable tasks + * + * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for + * @cpu. If @idle is %true and this operation doesn't produce any + * output, @cpu is skipped for dump. + */ + void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle); + + /** + * @dump_task: Dump BPF scheduler state for a runnable task on error + * @ctx: debug dump context + * @p: runnable task to generate debug dump for + * + * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for + * @p. + */ + void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); + +#ifdef CONFIG_EXT_GROUP_SCHED + /** + * @cgroup_init: Initialize a cgroup + * @cgrp: cgroup being initialized + * @args: init arguments, see the struct definition + * + * Either the BPF scheduler is being loaded or @cgrp created, initialize + * @cgrp for sched_ext. This operation may block. + * + * Return 0 for success, -errno for failure. An error return while + * loading will abort loading of the BPF scheduler. During cgroup + * creation, it will abort the specific cgroup creation. + */ + s32 (*cgroup_init)(struct cgroup *cgrp, + struct scx_cgroup_init_args *args); + + /** + * @cgroup_exit: Exit a cgroup + * @cgrp: cgroup being exited + * + * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit + * @cgrp for sched_ext. 
This operation may block. + */ + void (*cgroup_exit)(struct cgroup *cgrp); + + /** + * @cgroup_prep_move: Prepare a task to be moved to a different cgroup + * @p: task being moved + * @from: cgroup @p is being moved from + * @to: cgroup @p is being moved to + * + * Prepare @p for move from cgroup @from to @to. This operation may + * block and can be used for allocations. + * + * Return 0 for success, -errno for failure. An error return aborts the + * migration. + */ + s32 (*cgroup_prep_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * @cgroup_move: Commit cgroup move + * @p: task being moved + * @from: cgroup @p is being moved from + * @to: cgroup @p is being moved to + * + * Commit the move. @p is dequeued during this operation. + */ + void (*cgroup_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * @cgroup_cancel_move: Cancel cgroup move + * @p: task whose cgroup move is being canceled + * @from: cgroup @p was being moved from + * @to: cgroup @p was being moved to + * + * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). + * Undo the preparation. + */ + void (*cgroup_cancel_move)(struct task_struct *p, + struct cgroup *from, struct cgroup *to); + + /** + * @cgroup_set_weight: A cgroup's weight is being changed + * @cgrp: cgroup whose weight is being updated + * @weight: new weight [1..10000] + * + * Update @cgrp's weight to @weight. + */ + void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); + + /** + * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed + * @cgrp: cgroup whose bandwidth is being updated + * @period_us: bandwidth control period + * @quota_us: bandwidth control quota + * @burst_us: bandwidth control burst + * + * Update @cgrp's bandwidth control parameters. This is from the cpu.max + * cgroup interface. + * + * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled + * to. For example, if @period_us is 1_000_000 and @quota_us is + * 2_500_000, @cgrp is entitled to 2.5 CPUs. @burst_us can be + * interpreted in the same fashion and specifies how much @cgrp can + * burst temporarily. The specific control mechanism and thus the + * interpretation of @period_us and burstiness is up to the BPF + * scheduler. + */ + void (*cgroup_set_bandwidth)(struct cgroup *cgrp, + u64 period_us, u64 quota_us, u64 burst_us); + +#endif /* CONFIG_EXT_GROUP_SCHED */ + + /* + * All online ops must come before ops.cpu_online(). + */ + + /** + * @cpu_online: A CPU became online + * @cpu: CPU which just came up + * + * @cpu just came online. @cpu will not call ops.enqueue() or + * ops.dispatch(), nor run tasks associated with other CPUs beforehand. + */ + void (*cpu_online)(s32 cpu); + + /** + * @cpu_offline: A CPU is going offline + * @cpu: CPU which is going offline + * + * @cpu is going offline. @cpu will not call ops.enqueue() or + * ops.dispatch(), nor run tasks associated with other CPUs afterwards. + */ + void (*cpu_offline)(s32 cpu); + + /* + * All CPU hotplug ops must come before ops.init(). + */ + + /** + * @init: Initialize the BPF scheduler + */ + s32 (*init)(void); + + /** + * @exit: Clean up after the BPF scheduler + * @info: Exit info + * + * ops.exit() is also called on ops.init() failure, which is a bit + * unusual. This is to allow rich reporting through @info on how + * ops.init() failed.
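+ *
+ * A sketch of an exit handler that only logs why the scheduler is going
+ * away (names are illustrative):
+ *
+ *	void BPF_STRUCT_OPS(myops_exit, struct scx_exit_info *ei)
+ *	{
+ *		bpf_printk("unloading: kind=%d code=%lld (%s)",
+ *			   ei->kind, ei->exit_code, ei->reason);
+ *	}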
+ */ + void (*exit)(struct scx_exit_info *info); + + /** + * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch + */ + u32 dispatch_max_batch; + + /** + * @flags: %SCX_OPS_* flags + */ + u64 flags; + + /** + * @timeout_ms: The maximum amount of time, in milliseconds, that a + * runnable task should be able to wait before being scheduled. The + * maximum timeout may not exceed the default timeout of 30 seconds. + * + * Defaults to the maximum allowed timeout value of 30 seconds. + */ + u32 timeout_ms; + + /** + * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default + * value of 32768 is used. + */ + u32 exit_dump_len; + + /** + * @hotplug_seq: A sequence number that may be set by the scheduler to + * detect when a hotplug event has occurred during the loading process. + * If 0, no detection occurs. Otherwise, the scheduler will fail to + * load if the sequence number does not match @scx_hotplug_seq on the + * enable path. + */ + u64 hotplug_seq; + + /** + * @name: BPF scheduler's name + * + * Must be a non-zero valid BPF object name including only isalnum(), + * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the + * BPF scheduler is enabled. + */ + char name[SCX_OPS_NAME_LEN]; + + /* internal use only, must be NULL */ + void *priv; +}; + +enum scx_opi { + SCX_OPI_BEGIN = 0, + SCX_OPI_NORMAL_BEGIN = 0, + SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), + SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), + SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), + SCX_OPI_END = SCX_OP_IDX(init), +}; + +/* + * Collection of event counters. Event types are placed in descending order. + */ +struct scx_event_stats { + /* + * If ops.select_cpu() returns a CPU which can't be used by the task, + * the core scheduler code silently picks a fallback CPU. + */ + s64 SCX_EV_SELECT_CPU_FALLBACK; + + /* + * When dispatching to a local DSQ, the CPU may have gone offline in + * the meantime. In this case, the task is bounced to the global DSQ. + */ + s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE; + + /* + * If SCX_OPS_ENQ_LAST is not set, the number of times that a task + * continued to run because there were no other tasks on the CPU. + */ + s64 SCX_EV_DISPATCH_KEEP_LAST; + + /* + * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task + * is dispatched to a local DSQ when exiting. + */ + s64 SCX_EV_ENQ_SKIP_EXITING; + + /* + * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a + * migration disabled task skips ops.enqueue() and is dispatched to its + * local DSQ. + */ + s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED; + + /* + * Total number of times a task's time slice was refilled with the + * default value (SCX_SLICE_DFL). + */ + s64 SCX_EV_REFILL_SLICE_DFL; + + /* + * The total duration of bypass modes in nanoseconds. + */ + s64 SCX_EV_BYPASS_DURATION; + + /* + * The number of tasks dispatched in the bypassing mode. + */ + s64 SCX_EV_BYPASS_DISPATCH; + + /* + * The number of times the bypassing mode has been activated. + */ + s64 SCX_EV_BYPASS_ACTIVATE; +}; + +struct scx_sched_pcpu { + /* + * The event counters are in a per-CPU variable to minimize the + * accounting overhead. A system-wide view on the event counter is + * constructed when requested by scx_bpf_events(). + */ + struct scx_event_stats event_stats; +}; + +struct scx_sched { + struct sched_ext_ops ops; + DECLARE_BITMAP(has_op, SCX_OPI_END); + + /* + * Dispatch queues. + * + * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. 
+ * This is to avoid live-locking in bypass mode where all tasks are + * dispatched to %SCX_DSQ_GLOBAL and all CPUs consume from it. If + * per-node split isn't sufficient, it can be further split. + */ + struct rhashtable dsq_hash; + struct scx_dispatch_q **global_dsqs; + struct scx_sched_pcpu __percpu *pcpu; + + bool warned_zero_slice:1; + bool warned_deprecated_rq:1; + + atomic_t exit_kind; + struct scx_exit_info *exit_info; + + struct kobject kobj; + + struct kthread_worker *helper; + struct irq_work error_irq_work; + struct kthread_work disable_work; + struct rcu_work rcu_work; +}; + +enum scx_wake_flags { + /* expose select WF_* flags as enums */ + SCX_WAKE_FORK = WF_FORK, + SCX_WAKE_TTWU = WF_TTWU, + SCX_WAKE_SYNC = WF_SYNC, +}; + +enum scx_enq_flags { + /* expose select ENQUEUE_* flags as enums */ + SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, + SCX_ENQ_HEAD = ENQUEUE_HEAD, + SCX_ENQ_CPU_SELECTED = ENQUEUE_RQ_SELECTED, + + /* high 32bits are SCX specific */ + + /* + * Set the following to trigger preemption when calling + * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the + * current task is cleared to zero and the CPU is kicked into the + * scheduling path. Implies %SCX_ENQ_HEAD. + */ + SCX_ENQ_PREEMPT = 1LLU << 32, + + /* + * The task being enqueued was previously enqueued on the current CPU's + * %SCX_DSQ_LOCAL, but was removed from it in a call to the + * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was + * invoked in a ->cpu_release() callback, and the task is again + * dispatched back to %SCX_DSQ_LOCAL by this current ->enqueue(), the + * task will not be scheduled on the CPU until at least the next invocation + * of the ->cpu_acquire() callback. + */ + SCX_ENQ_REENQ = 1LLU << 40, + + /* + * The task being enqueued is the only task available for the CPU. By + * default, ext core keeps executing such tasks but when + * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the + * %SCX_ENQ_LAST flag set. + * + * The BPF scheduler is responsible for triggering a follow-up + * scheduling event. Otherwise, execution may stall. + */ + SCX_ENQ_LAST = 1LLU << 41, + + /* high 8 bits are internal */ + __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, + + SCX_ENQ_CLEAR_OPSS = 1LLU << 56, + SCX_ENQ_DSQ_PRIQ = 1LLU << 57, +}; + +enum scx_deq_flags { + /* expose select DEQUEUE_* flags as enums */ + SCX_DEQ_SLEEP = DEQUEUE_SLEEP, + + /* high 32bits are SCX specific */ + + /* + * The generic core-sched layer decided to execute the task even though + * it hasn't been dispatched yet. Dequeue from the BPF side. + */ + SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, +}; + +enum scx_pick_idle_cpu_flags { + SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ + SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */ +}; + +enum scx_kick_flags { + /* + * Kick the target CPU if idle. Guarantees that the target CPU goes + * through at least one full scheduling cycle before going idle. If the + * target CPU can be determined to be currently not idle and going to go + * through a scheduling cycle before going idle, noop. + */ + SCX_KICK_IDLE = 1LLU << 0, + + /* + * Preempt the current task and execute the dispatch path. If the + * current task of the target CPU is an SCX task, its ->scx.slice is + * cleared to zero before the scheduling path is invoked so that the + * task expires and the dispatch path is invoked. + */ + SCX_KICK_PREEMPT = 1LLU << 1, + + /* + * Wait for the CPU to be rescheduled.
The scx_bpf_kick_cpu() call will + * return after the target CPU finishes picking the next task. + */ + SCX_KICK_WAIT = 1LLU << 2, +}; + +enum scx_tg_flags { + SCX_TG_ONLINE = 1U << 0, + SCX_TG_INITED = 1U << 1, +}; + +enum scx_enable_state { + SCX_ENABLING, + SCX_ENABLED, + SCX_DISABLING, + SCX_DISABLED, +}; + +static const char *scx_enable_state_str[] = { + [SCX_ENABLING] = "enabling", + [SCX_ENABLED] = "enabled", + [SCX_DISABLING] = "disabling", + [SCX_DISABLED] = "disabled", +}; + +/* + * sched_ext_entity->ops_state + * + * Used to track the task ownership between the SCX core and the BPF scheduler. + * State transitions look as follows: + * + * NONE -> QUEUEING -> QUEUED -> DISPATCHING + * ^ | | + * | v v + * \-------------------------------/ + * + * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call + * sites for explanations on the conditions being waited upon and why they are + * safe. Transitions out of them into NONE or QUEUED must store_release and the + * waiters should load_acquire. + * + * Tracking scx_ops_state enables sched_ext core to reliably determine whether + * any given task can be dispatched by the BPF scheduler at all times and thus + * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler + * to try to dispatch any task anytime regardless of its state as the SCX core + * can safely reject invalid dispatches. + */ +enum scx_ops_state { + SCX_OPSS_NONE, /* owned by the SCX core */ + SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */ + SCX_OPSS_QUEUED, /* owned by the BPF scheduler */ + SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */ + + /* + * QSEQ brands each QUEUED instance so that, when dispatch races + * dequeue/requeue, the dispatcher can tell whether it still has a claim + * on the task being dispatched. + * + * As some 32bit archs can't do 64bit store_release/load_acquire, + * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on + * 32bit machines. The dispatch race window QSEQ protects is very narrow + * and runs with IRQ disabled. 30 bits should be sufficient. + */ + SCX_OPSS_QSEQ_SHIFT = 2, +}; + +/* Use macros to ensure that the type is unsigned long for the masks */ +#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) +#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) + +DECLARE_PER_CPU(struct rq *, scx_locked_rq_state); + +/* + * Return the rq currently locked from an scx callback, or NULL if no rq is + * locked. + */ +static inline struct rq *scx_locked_rq(void) +{ + return __this_cpu_read(scx_locked_rq_state); +} + +static inline bool scx_kf_allowed_if_unlocked(void) +{ + return !current->scx.kf_mask; +} + +static inline bool scx_rq_bypassing(struct rq *rq) +{ + return unlikely(rq->scx.flags & SCX_RQ_BYPASSING); +} diff --git a/tools/sched_ext/include/scx/bpf_arena_common.bpf.h b/tools/sched_ext/include/scx/bpf_arena_common.bpf.h new file mode 100644 index 000000000000..4366fb3c91ce --- /dev/null +++ b/tools/sched_ext/include/scx/bpf_arena_common.bpf.h @@ -0,0 +1,175 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#pragma once + +#ifndef PAGE_SIZE +#define PAGE_SIZE __PAGE_SIZE +/* + * for older kernels try sizeof(struct genradix_node) + * or flexible: + * static inline long __bpf_page_size(void) { + * return bpf_core_enum_value(enum page_size_enum___l, __PAGE_SIZE___l) ?: sizeof(struct genradix_node); + * } + * but generated code is not great. 
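+ *
+ * As a rough, hypothetical illustration (the map name `arena` and `size`
+ * below are made up, not part of this header), PAGE_SIZE is typically used
+ * to convert a byte count into a page count for the arena allocator
+ * declared later in this file:
+ *
+ *	void __arena *buf;
+ *
+ *	buf = bpf_arena_alloc_pages(&arena, NULL,
+ *				    (size + PAGE_SIZE - 1) / PAGE_SIZE,
+ *				    NUMA_NO_NODE, 0);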
+ */ +#endif + +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) && !defined(BPF_ARENA_FORCE_ASM) +#define __arena __attribute__((address_space(1))) +#define __arena_global __attribute__((address_space(1))) +#define cast_kern(ptr) /* nop for bpf prog. emitted by LLVM */ +#define cast_user(ptr) /* nop for bpf prog. emitted by LLVM */ +#else + +/* emit instruction: + * rX = rX .off = BPF_ADDR_SPACE_CAST .imm32 = (dst_as << 16) | src_as + * + * This is a workaround for LLVM compiler versions without + * __BPF_FEATURE_ADDR_SPACE_CAST that do not automatically cast between arena + * pointers and native kernel/userspace ones. In this case we explicitly do so + * with cast_kern() and cast_user(). E.g., in the Linux kernel tree, + * tools/testing/selftests/bpf includes tests that use these macros to implement + * linked lists and hashtables backed by arena memory. In sched_ext, we use + * cast_kern() and cast_user() for compatibility with older LLVM toolchains. + */ +#ifndef bpf_addr_space_cast +#define bpf_addr_space_cast(var, dst_as, src_as)\ + asm volatile(".byte 0xBF; \ + .ifc %[reg], r0; \ + .byte 0x00; \ + .endif; \ + .ifc %[reg], r1; \ + .byte 0x11; \ + .endif; \ + .ifc %[reg], r2; \ + .byte 0x22; \ + .endif; \ + .ifc %[reg], r3; \ + .byte 0x33; \ + .endif; \ + .ifc %[reg], r4; \ + .byte 0x44; \ + .endif; \ + .ifc %[reg], r5; \ + .byte 0x55; \ + .endif; \ + .ifc %[reg], r6; \ + .byte 0x66; \ + .endif; \ + .ifc %[reg], r7; \ + .byte 0x77; \ + .endif; \ + .ifc %[reg], r8; \ + .byte 0x88; \ + .endif; \ + .ifc %[reg], r9; \ + .byte 0x99; \ + .endif; \ + .short %[off]; \ + .long %[as]" \ + : [reg]"+r"(var) \ + : [off]"i"(BPF_ADDR_SPACE_CAST) \ + , [as]"i"((dst_as << 16) | src_as)); +#endif + +#define __arena +#define __arena_global SEC(".addr_space.1") +#define cast_kern(ptr) bpf_addr_space_cast(ptr, 0, 1) +#define cast_user(ptr) bpf_addr_space_cast(ptr, 1, 0) +#endif + +void __arena* bpf_arena_alloc_pages(void *map, void __arena *addr, __u32 page_cnt, + int node_id, __u64 flags) __ksym __weak; +void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym __weak; + +/* + * Note that cond_break can only be portably used in the body of a breakable + * construct, whereas can_loop can be used anywhere. 
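+ *
+ * A sketch (the loop bodies and helpers here are illustrative only):
+ *
+ *	while (!done && can_loop)
+ *		done = process_one();
+ *
+ * whereas cond_break must appear inside a loop (or other breakable
+ * construct) for the emitted break to exit:
+ *
+ *	bpf_repeat(4096) {
+ *		process_one();
+ *		cond_break;
+ *	}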
+ */ +#ifdef TEST +#define can_loop true +#define __cond_break(expr) expr +#else +#ifdef __BPF_FEATURE_MAY_GOTO +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("may_goto %l[l_break]" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define __cond_break(expr) \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("may_goto %l[l_break]" \ + :::: l_break); \ + goto l_continue; \ + l_break: expr; \ + l_continue:; \ + }) +#else +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define __cond_break(expr) \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: expr; \ + l_continue:; \ + }) +#else +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define __cond_break(expr) \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: expr; \ + l_continue:; \ + }) +#endif /* __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ */ +#endif /* __BPF_FEATURE_MAY_GOTO */ +#endif /* TEST */ + +#define cond_break __cond_break(break) +#define cond_break_label(label) __cond_break(goto label) + + +void bpf_preempt_disable(void) __weak __ksym; +void bpf_preempt_enable(void) __weak __ksym; diff --git a/tools/sched_ext/include/scx/bpf_arena_common.h b/tools/sched_ext/include/scx/bpf_arena_common.h new file mode 100644 index 000000000000..10141db0b59d --- /dev/null +++ b/tools/sched_ext/include/scx/bpf_arena_common.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#pragma once + +#ifndef arena_container_of +#define arena_container_of(ptr, type, member) \ + ({ \ + void __arena *__mptr = (void __arena *)(ptr); \ + ((type *)(__mptr - offsetof(type, member))); \ + }) +#endif + +/* Provide the definition of PAGE_SIZE. 
*/ +#include <sys/user.h> + +#define __arena +#define __arg_arena +#define cast_kern(ptr) /* nop for user space */ +#define cast_user(ptr) /* nop for user space */ +char __attribute__((weak)) arena[1]; + +#ifndef offsetof +#define offsetof(type, member) ((unsigned long)&((type *)0)->member) +#endif + +static inline void __arena* bpf_arena_alloc_pages(void *map, void *addr, __u32 page_cnt, + int node_id, __u64 flags) +{ + return NULL; +} +static inline void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) +{ +} diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index d4e21558e982..06e2551033cb 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -24,14 +24,26 @@ #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> #include <asm-generic/errno.h> -#include "user_exit_info.h" +#include "user_exit_info.bpf.h" #include "enum_defs.autogen.h" +#define PF_IDLE 0x00000002 /* I am an IDLE thread */ +#define PF_IO_WORKER 0x00000010 /* Task is an IO worker */ #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ +#define PF_KCOMPACTD 0x00010000 /* I am kcompactd */ +#define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_EXITING 0x00000004 #define CLOCK_MONOTONIC 1 +#ifndef NR_CPUS +#define NR_CPUS 1024 +#endif + +#ifndef NUMA_NO_NODE +#define NUMA_NO_NODE (-1) +#endif + extern int LINUX_KERNEL_VERSION __kconfig; extern const char CONFIG_CC_VERSION_TEXT[64] __kconfig __weak; extern const char CONFIG_LOCALVERSION[64] __kconfig __weak; @@ -91,6 +103,8 @@ s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; bool scx_bpf_task_running(const struct task_struct *p) __ksym; s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; +struct rq *scx_bpf_locked_rq(void) __ksym; +struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak; struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak; u64 scx_bpf_now(void) __ksym __weak; void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak; @@ -107,6 +121,9 @@ void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __ static inline __attribute__((format(printf, 1, 2))) void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} +#define SCX_STRINGIFY(x) #x +#define SCX_TOSTRING(x) SCX_STRINGIFY(x) + /* * Helper macro for initializing the fmt and variadic argument inputs to both * bstr exit kfuncs. Callers to this function should use ___fmt and ___param to @@ -141,13 +158,15 @@ void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments * instead of an array of u64. Invoking this macro will cause the scheduler to * exit in an erroneous state, with diagnostic information being passed to the - * user. + * user. It appends the file and line number to aid debugging. */ #define scx_bpf_error(fmt, args...) \ ({ \ - scx_bpf_bstr_preamble(fmt, args) \ + scx_bpf_bstr_preamble( \ + __FILE__ ":" SCX_TOSTRING(__LINE__) ": " fmt, ##args) \ scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \ - ___scx_bpf_bstr_format_checker(fmt, ##args); \ + ___scx_bpf_bstr_format_checker( \ + __FILE__ ":" SCX_TOSTRING(__LINE__) ": " fmt, ##args); \ }) /* @@ -229,6 +248,7 @@ BPF_PROG(name, ##args) * be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of * `MEMBER_VPTR(ptr, ->member)`. 
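 *
 * A sketch of typical usage (@p, a task_struct pointer, and @slice are
 * illustrative here, not part of this header):
 *
 *	u64 *vtime = MEMBER_VPTR(*p, .scx.dsq_vtime);
 *
 *	if (vtime)
 *		*vtime += slice;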
 */
+#ifndef MEMBER_VPTR
 #define MEMBER_VPTR(base, member) (typeof((base) member) *)		\
 ({									\
	u64 __base = (u64)&(base);					\
@@ -245,6 +265,7 @@ BPF_PROG(name, ##args)
	[max]"i"(sizeof(base) - sizeof((base) member)));		\
	__addr;								\
 })
+#endif /* MEMBER_VPTR */
 
 /**
  * ARRAY_ELEM_PTR - Obtain the verified pointer to an array element
@@ -260,6 +281,7 @@ BPF_PROG(name, ##args)
  * size of the array to compute the max, which will result in rejection by
  * the verifier.
  */
+#ifndef ARRAY_ELEM_PTR
 #define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *)			\
 ({									\
	u64 __base = (u64)arr;						\
@@ -274,7 +296,7 @@ BPF_PROG(name, ##args)
	[max]"r"(sizeof(arr[0]) * ((n) - 1)));				\
	__addr;								\
 })
-
+#endif /* ARRAY_ELEM_PTR */
 
 /*
  * BPF declarations and helpers
@@ -438,8 +460,27 @@ static __always_inline const struct cpumask *cast_mask(struct bpf_cpumask *mask)
  */
 static inline bool is_migration_disabled(const struct task_struct *p)
 {
-	if (bpf_core_field_exists(p->migration_disabled))
-		return p->migration_disabled;
+	/*
+	 * Testing p->migration_disabled in BPF code is tricky because
+	 * migration is _always_ disabled while BPF code is running. The
+	 * prolog (__bpf_prog_enter) and epilog (__bpf_prog_exit) of BPF
+	 * program execution disable and re-enable migration of the current
+	 * task, respectively. So, the _current_ task of a sched_ext ops is
+	 * always migration-disabled. Moreover, p->migration_disabled can be
+	 * two or greater when a sched_ext ops BPF program (e.g., ops.tick)
+	 * runs in the middle of another BPF program's execution.
+	 *
+	 * Therefore, the _current_ task should be considered
+	 * migration-disabled only when its migration_disabled count is
+	 * greater than one. In other words, when p->migration_disabled == 1,
+	 * there is an ambiguity, so we should check whether @p is the
+	 * current task.
+	 */
+	if (bpf_core_field_exists(p->migration_disabled)) {
+		if (p->migration_disabled == 1)
+			return bpf_get_current_task_btf() != p;
+		else
+			return p->migration_disabled;
+	}
 	return false;
 }
 
@@ -476,7 +517,7 @@ static inline s64 time_delta(u64 after, u64 before)
  */
 static inline bool time_after(u64 a, u64 b)
 {
-	return (s64)(b - a) < 0;
+	return (s64)(b - a) < 0;
 }
 
 /**
@@ -500,7 +541,7 @@ static inline bool time_before(u64 a, u64 b)
  */
 static inline bool time_after_eq(u64 a, u64 b)
 {
-	return (s64)(a - b) >= 0;
+	return (s64)(a - b) >= 0;
 }
 
 /**
@@ -547,9 +588,15 @@ static inline bool time_in_range_open(u64 a, u64 b, u64 c)
  */
 
 /* useful compiler attributes */
+#ifndef likely
 #define likely(x) __builtin_expect(!!(x), 1)
+#endif
+#ifndef unlikely
 #define unlikely(x) __builtin_expect(!!(x), 0)
+#endif
+#ifndef __maybe_unused
 #define __maybe_unused __attribute__((__unused__))
+#endif
 
 /*
  * READ/WRITE_ONCE() are from kernel (include/asm-generic/rwonce.h). They
@@ -633,6 +680,26 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
 })
 
 /*
+ * __calc_avg - Calculate the exponentially weighted moving average (EWMA)
+ * of @old and @new. @decay determines how much weight @old retains: the
+ * larger @decay is, the more slowly the moving average changes, exhibiting
+ * fewer fluctuations.
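+ *
+ * For example (variable names are illustrative), with @decay == 2 each
+ * update retains 3/4 of @old:
+ *
+ *	avg = __calc_avg(avg, sample, 2);
+ *
+ * so old == 1000 and new == 200 yield 1000 - (1000 >> 2) + (200 >> 2) = 800.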
+ */
+#define __calc_avg(old, new, decay) ({					\
+	typeof(decay) thr = 1 << (decay);				\
+	typeof(old) ret;						\
+	if (((old) < thr) || ((new) < thr)) {				\
+		if (((old) == 1) && ((new) == 0))			\
+			ret = 0;					\
+		else							\
+			ret = ((old) - ((old) >> 1)) + ((new) >> 1);	\
+	} else {							\
+		ret = ((old) - ((old) >> (decay))) + ((new) >> (decay)); \
+	}								\
+	ret;								\
+})
+
+/*
 * log2_u32 - Compute the base 2 logarithm of a 32-bit exponential value.
 * @v: The value for which we're computing the base 2 logarithm.
 */
@@ -663,6 +730,25 @@ static inline u32 log2_u64(u64 v)
 }
 
 /*
+ * __sqrt_u64 - Approximate the integer square root of @x using Newton's method.
+ */
+static inline u64 __sqrt_u64(u64 x)
+{
+	if (x == 0 || x == 1)
+		return x;
+
+	u64 r = ((1ULL << 32) > x) ? x : (1ULL << 32);
+
+	for (int i = 0; i < 8; ++i) {
+		u64 q = x / r;
+		if (r <= q)
+			break;
+		r = (r + q) >> 1;
+	}
+	return r;
+}
+
+/*
 * Return a value proportionally scaled to the task's weight.
 */
 static inline u64 scale_by_task_weight(const struct task_struct *p, u64 value)
diff --git a/tools/sched_ext/include/scx/common.h b/tools/sched_ext/include/scx/common.h
index 1dc76bd84296..b3c6372bcf81 100644
--- a/tools/sched_ext/include/scx/common.h
+++ b/tools/sched_ext/include/scx/common.h
@@ -75,8 +75,9 @@ typedef int64_t s64;
 #include "enums.h"
 
 /* not available when building kernel tools/sched_ext */
-#if __has_include(<lib/sdt_task.h>)
-#include <lib/sdt_task.h>
+#if __has_include(<lib/sdt_task_defs.h>)
+#include "bpf_arena_common.h"
+#include <lib/sdt_task_defs.h>
 #endif
 
 #endif /* __SCHED_EXT_COMMON_H */
diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h
index 9252e1a00556..dd9144624dc9 100644
--- a/tools/sched_ext/include/scx/compat.bpf.h
+++ b/tools/sched_ext/include/scx/compat.bpf.h
@@ -38,6 +38,7 @@ void scx_bpf_dispatch_from_dsq_set_slice___compat(struct bpf_iter_scx_dsq *it__i
 void scx_bpf_dispatch_from_dsq_set_vtime___compat(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
 bool scx_bpf_dispatch_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
 bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
+int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym __weak;
 
 #define scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags) \
 	(bpf_ksym_exists(scx_bpf_dsq_insert) ? \
@@ -82,6 +83,10 @@ bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter,
 		scx_bpf_dispatch_vtime_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \
 		false))
 
+#define __COMPAT_bpf_cpumask_populate(cpumask, src, size__sz) \
+	(bpf_ksym_exists(bpf_cpumask_populate) ? \
+	 (bpf_cpumask_populate(cpumask, src, size__sz)) : -EOPNOTSUPP)
+
 #define scx_bpf_dispatch(p, dsq_id, slice, enq_flags) \
 	_Static_assert(false, "scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()")
 
@@ -226,6 +231,23 @@ static inline bool __COMPAT_is_enq_cpu_selected(u64 enq_flags)
 	 scx_bpf_pick_any_cpu(cpus_allowed, flags))
 
 /*
+ * v6.18: Add a helper to retrieve the current task running on a CPU.
+ *
+ * Keep this helper available until v6.20 for compatibility.
+ */
+static inline struct task_struct *__COMPAT_scx_bpf_cpu_curr(int cpu)
+{
+	struct rq *rq;
+
+	if (bpf_ksym_exists(scx_bpf_cpu_curr))
+		return scx_bpf_cpu_curr(cpu);
+
+	rq = scx_bpf_cpu_rq(cpu);
+
+	return rq ? rq->curr : NULL;
+}
+
+/*
 * Define sched_ext_ops.
This may be expanded to define multiple variants for * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). */ diff --git a/tools/sched_ext/include/scx/user_exit_info.bpf.h b/tools/sched_ext/include/scx/user_exit_info.bpf.h new file mode 100644 index 000000000000..e7ac6611a990 --- /dev/null +++ b/tools/sched_ext/include/scx/user_exit_info.bpf.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Define struct user_exit_info which is shared between BPF and userspace parts + * to communicate exit status and other information. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ + +#ifndef __USER_EXIT_INFO_BPF_H +#define __USER_EXIT_INFO_BPF_H + +#ifndef LSP +#include "vmlinux.h" +#endif +#include <bpf/bpf_core_read.h> + +#include "user_exit_info_common.h" + +#define UEI_DEFINE(__name) \ + char RESIZABLE_ARRAY(data, __name##_dump); \ + const volatile u32 __name##_dump_len; \ + struct user_exit_info __name SEC(".data") + +#define UEI_RECORD(__uei_name, __ei) ({ \ + bpf_probe_read_kernel_str(__uei_name.reason, \ + sizeof(__uei_name.reason), (__ei)->reason); \ + bpf_probe_read_kernel_str(__uei_name.msg, \ + sizeof(__uei_name.msg), (__ei)->msg); \ + bpf_probe_read_kernel_str(__uei_name##_dump, \ + __uei_name##_dump_len, (__ei)->dump); \ + if (bpf_core_field_exists((__ei)->exit_code)) \ + __uei_name.exit_code = (__ei)->exit_code; \ + /* use __sync to force memory barrier */ \ + __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \ + (__ei)->kind); \ +}) + +#endif /* __USER_EXIT_INFO_BPF_H */ diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h index 66f856640ee7..399697fa372f 100644 --- a/tools/sched_ext/include/scx/user_exit_info.h +++ b/tools/sched_ext/include/scx/user_exit_info.h @@ -10,55 +10,11 @@ #ifndef __USER_EXIT_INFO_H #define __USER_EXIT_INFO_H -#ifdef LSP -#define __bpf__ -#include "../vmlinux.h" -#endif - -enum uei_sizes { - UEI_REASON_LEN = 128, - UEI_MSG_LEN = 1024, - UEI_DUMP_DFL_LEN = 32768, -}; - -struct user_exit_info { - int kind; - s64 exit_code; - char reason[UEI_REASON_LEN]; - char msg[UEI_MSG_LEN]; -}; - -#ifdef __bpf__ - -#ifndef LSP -#include "vmlinux.h" -#endif -#include <bpf/bpf_core_read.h> - -#define UEI_DEFINE(__name) \ - char RESIZABLE_ARRAY(data, __name##_dump); \ - const volatile u32 __name##_dump_len; \ - struct user_exit_info __name SEC(".data") - -#define UEI_RECORD(__uei_name, __ei) ({ \ - bpf_probe_read_kernel_str(__uei_name.reason, \ - sizeof(__uei_name.reason), (__ei)->reason); \ - bpf_probe_read_kernel_str(__uei_name.msg, \ - sizeof(__uei_name.msg), (__ei)->msg); \ - bpf_probe_read_kernel_str(__uei_name##_dump, \ - __uei_name##_dump_len, (__ei)->dump); \ - if (bpf_core_field_exists((__ei)->exit_code)) \ - __uei_name.exit_code = (__ei)->exit_code; \ - /* use __sync to force memory barrier */ \ - __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \ - (__ei)->kind); \ -}) - -#else /* !__bpf__ */ - #include <stdio.h> #include <stdbool.h> +#include "user_exit_info_common.h" + /* no need to call the following explicitly if SCX_OPS_LOAD() is used */ #define UEI_SET_SIZE(__skel, __ops_name, __uei_name) ({ \ u32 __len = (__skel)->struct_ops.__ops_name->exit_dump_len ?: UEI_DUMP_DFL_LEN; \ @@ -114,5 +70,4 @@ enum uei_ecode_mask { #define UEI_ECODE_RESTART(__ecode) (UEI_ECODE_SYS_ACT((__ecode)) == SCX_ECODE_ACT_RESTART) -#endif /* __bpf__ */ #endif /* 
__USER_EXIT_INFO_H */ diff --git a/tools/sched_ext/include/scx/user_exit_info_common.h b/tools/sched_ext/include/scx/user_exit_info_common.h new file mode 100644 index 000000000000..2d0981aedd89 --- /dev/null +++ b/tools/sched_ext/include/scx/user_exit_info_common.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Define struct user_exit_info which is shared between BPF and userspace parts + * to communicate exit status and other information. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ +#ifndef __USER_EXIT_INFO_COMMON_H +#define __USER_EXIT_INFO_COMMON_H + +#ifdef LSP +#include "../vmlinux.h" +#endif + +enum uei_sizes { + UEI_REASON_LEN = 128, + UEI_MSG_LEN = 1024, + UEI_DUMP_DFL_LEN = 32768, +}; + +struct user_exit_info { + int kind; + s64 exit_code; + char reason[UEI_REASON_LEN]; + char msg[UEI_MSG_LEN]; +}; + +#endif /* __USER_EXIT_INFO_COMMON_H */ diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index 50bc1737c167..55df8b798865 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * A central FIFO sched_ext scheduler which demonstrates the followings: + * A central FIFO sched_ext scheduler which demonstrates the following: * * a. Making all scheduling decisions from one CPU: * diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index 6ba6e610eeaa..55931a4cd71c 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -61,6 +61,7 @@ restart: skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); + assert(skel->rodata->nr_cpu_ids > 0); assert(skel->rodata->nr_cpu_ids <= INT32_MAX); while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) { diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c index fdc7170639e6..2c720e3ecad5 100644 --- a/tools/sched_ext/scx_flatcg.bpf.c +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -950,5 +950,5 @@ SCX_OPS_DEFINE(flatcg_ops, .cgroup_move = (void *)fcg_cgroup_move, .init = (void *)fcg_init, .exit = (void *)fcg_exit, - .flags = SCX_OPS_ENQ_EXITING, + .flags = SCX_OPS_HAS_CGROUP_WEIGHT | SCX_OPS_ENQ_EXITING, .name = "flatcg"); diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c index 6dd423eeb4ff..cd85eb401179 100644 --- a/tools/sched_ext/scx_flatcg.c +++ b/tools/sched_ext/scx_flatcg.c @@ -6,6 +6,7 @@ */ #include <stdio.h> #include <signal.h> +#include <assert.h> #include <unistd.h> #include <libgen.h> #include <limits.h> @@ -137,6 +138,7 @@ restart: skel = SCX_OPS_OPEN(flatcg_ops, scx_flatcg); skel->rodata->nr_cpus = libbpf_num_possible_cpus(); + assert(skel->rodata->nr_cpus > 0); skel->rodata->cgrp_slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); while ((opt = getopt(argc, argv, "s:i:dfvh")) != -1) { diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 69d877501cb7..3072b593f898 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -39,7 +39,8 @@ const volatile u32 stall_kernel_nth; const volatile u32 dsp_inf_loop_after; const volatile u32 dsp_batch; const volatile bool highpri_boosting; -const volatile bool print_shared_dsq; +const volatile bool print_dsqs_and_events; +const volatile bool print_msgs; const volatile s32 disallow_tgid; 
const volatile bool suppress_dump; @@ -56,7 +57,8 @@ struct qmap { queue1 SEC(".maps"), queue2 SEC(".maps"), queue3 SEC(".maps"), - queue4 SEC(".maps"); + queue4 SEC(".maps"), + dump_store SEC(".maps"); struct { __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); @@ -578,11 +580,26 @@ void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx) return; scx_bpf_dump("QMAP FIFO[%d]:", i); + + /* + * Dump can be invoked anytime and there is no way to iterate in + * a non-destructive way. Pop and store in dump_store and then + * restore afterwards. If racing against new enqueues, ordering + * can get mixed up. + */ bpf_repeat(4096) { if (bpf_map_pop_elem(fifo, &pid)) break; + bpf_map_push_elem(&dump_store, &pid, 0); scx_bpf_dump(" %d", pid); } + + bpf_repeat(4096) { + if (bpf_map_pop_elem(&dump_store, &pid)) + break; + bpf_map_push_elem(fifo, &pid, 0); + } + scx_bpf_dump("\n"); } } @@ -617,22 +634,25 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args) { - bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu", - cgrp->kn->id, args->weight, args->bw_period_us, - args->bw_quota_us, args->bw_burst_us); + if (print_msgs) + bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu", + cgrp->kn->id, args->weight, args->bw_period_us, + args->bw_quota_us, args->bw_burst_us); return 0; } void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight) { - bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight); + if (print_msgs) + bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight); } void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) { - bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu", cgrp->kn->id, - period_us, quota_us, burst_us); + if (print_msgs) + bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu", + cgrp->kn->id, period_us, quota_us, burst_us); } /* @@ -676,16 +696,20 @@ static void print_cpus(void) void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu) { - bpf_printk("CPU %d coming online", cpu); - /* @cpu is already online at this point */ - print_cpus(); + if (print_msgs) { + bpf_printk("CPU %d coming online", cpu); + /* @cpu is already online at this point */ + print_cpus(); + } } void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu) { - bpf_printk("CPU %d going offline", cpu); - /* @cpu is still online at this point */ - print_cpus(); + if (print_msgs) { + bpf_printk("CPU %d going offline", cpu); + /* @cpu is still online at this point */ + print_cpus(); + } } struct monitor_timer { @@ -783,35 +807,36 @@ static void dump_shared_dsq(void) static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) { - struct scx_event_stats events; - bpf_rcu_read_lock(); dispatch_highpri(true); bpf_rcu_read_unlock(); monitor_cpuperf(); - if (print_shared_dsq) + if (print_dsqs_and_events) { + struct scx_event_stats events; + dump_shared_dsq(); - __COMPAT_scx_bpf_events(&events, sizeof(events)); - - bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK", - scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK)); - bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE", - scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE)); - bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST", - scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST)); - bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING", - scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING)); - bpf_printk("%35s: %lld", 
"SCX_EV_REFILL_SLICE_DFL", - scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL)); - bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION", - scx_read_event(&events, SCX_EV_BYPASS_DURATION)); - bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH", - scx_read_event(&events, SCX_EV_BYPASS_DISPATCH)); - bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE", - scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE)); + __COMPAT_scx_bpf_events(&events, sizeof(events)); + + bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK", + scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK)); + bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE", + scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE)); + bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST", + scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST)); + bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING", + scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING)); + bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL", + scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL)); + bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION", + scx_read_event(&events, SCX_EV_BYPASS_DURATION)); + bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH", + scx_read_event(&events, SCX_EV_BYPASS_DISPATCH)); + bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE", + scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE)); + } bpf_timer_start(timer, ONE_SEC_IN_NS, 0); return 0; @@ -823,7 +848,8 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) struct bpf_timer *timer; s32 ret; - print_cpus(); + if (print_msgs) + print_cpus(); ret = scx_bpf_create_dsq(SHARED_DSQ, -1); if (ret) diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index c4912ab2e76f..ef701d45ba43 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -20,7 +20,7 @@ const char help_fmt[] = "See the top-level comment in .bpf.c for more details.\n" "\n" "Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n" -" [-P] [-d PID] [-D LEN] [-p] [-v]\n" +" [-P] [-M] [-d PID] [-D LEN] [-p] [-v]\n" "\n" " -s SLICE_US Override slice duration\n" " -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" @@ -28,7 +28,8 @@ const char help_fmt[] = " -T COUNT Stall every COUNT'th kernel thread\n" " -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n" " -b COUNT Dispatch upto COUNT tasks together\n" -" -P Print out DSQ content to trace_pipe every second, use with -b\n" +" -P Print out DSQ content and event counters to trace_pipe every second\n" +" -M Print out debug messages to trace_pipe\n" " -H Boost nice -20 tasks in SHARED_DSQ, use with -b\n" " -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" " -D LEN Set scx_exit_info.dump buffer length\n" @@ -66,7 +67,7 @@ int main(int argc, char **argv) skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL"); - while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PHd:D:Spvh")) != -1) { + while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PMHd:D:Spvh")) != -1) { switch (opt) { case 's': skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; @@ -87,7 +88,10 @@ int main(int argc, char **argv) skel->rodata->dsp_batch = strtoul(optarg, NULL, 0); break; case 'P': - skel->rodata->print_shared_dsq = true; + skel->rodata->print_dsqs_and_events = true; + break; + case 'M': + skel->rodata->print_msgs = true; break; case 'H': skel->rodata->highpri_boosting = true; diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c index 76d83199545c..06d4b13bf76b 100644 --- a/tools/sched_ext/scx_simple.c +++ 
b/tools/sched_ext/scx_simple.c
@@ -7,6 +7,7 @@
 #include <stdio.h>
 #include <unistd.h>
 #include <signal.h>
+#include <assert.h>
 #include <libgen.h>
 #include <bpf/bpf.h>
 #include <scx/common.h>
@@ -41,6 +42,7 @@ static void sigint_handler(int simple)
 static void read_stats(struct scx_simple *skel, __u64 *stats)
 {
 	int nr_cpus = libbpf_num_possible_cpus();
+	assert(nr_cpus > 0);
 	__u64 cnts[2][nr_cpus];
 	__u32 idx;
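	/*
	 * A sketch of the per-CPU aggregation that typically follows
	 * (assuming `stats` is a two-slot BPF_MAP_TYPE_PERCPU_ARRAY in the
	 * skeleton; error handling elided):
	 *
	 *	memset(stats, 0, sizeof(stats[0]) * 2);
	 *	for (idx = 0; idx < 2; idx++) {
	 *		if (bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats),
	 *					&idx, cnts[idx]))
	 *			continue;
	 *		for (int cpu = 0; cpu < nr_cpus; cpu++)
	 *			stats[idx] += cnts[idx][cpu];
	 *	}
	 */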
