diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2025-12-03 13:25:39 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-12-03 13:25:39 -0800 |
| commit | 02baaa67d9afc2e56c6e1ac6a1fb1f1dd2be366f (patch) | |
| tree | 13ae2fec8be92b2f774cfb3fd725c027740be3ac /include/linux/sched/ext.h | |
| parent | 8449d3252c2603a51ffc7c36cb5bd94874378b7d (diff) | |
| parent | 1dd6c84f1c544e552848a8968599220bd464e338 (diff) | |
Merge tag 'sched_ext-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext
Pull sched_ext updates from Tejun Heo:
- Improve recovery from misbehaving BPF schedulers.

  When a scheduler puts many tasks with varying affinity restrictions
  on a shared DSQ, CPUs scanning through tasks they cannot run can
  overwhelm the system, causing lockups.

  Bypass mode now uses per-CPU DSQs with a load balancer to avoid this,
  and hooks into the hardlockup detector to attempt recovery.

  Add scx_cpu0 example scheduler to demonstrate this scenario.
- Add lockless peek operation for DSQs to reduce lock contention for
  schedulers that need to query queue state during load balancing.
- Allow scx_bpf_reenqueue_local() to be called from anywhere in
  preparation for deprecating cpu_acquire/release() callbacks in favor
  of generic BPF hooks.
- Prepare for hierarchical scheduler support: add
  scx_bpf_task_set_slice() and scx_bpf_task_set_dsq_vtime() kfuncs,
  make scx_bpf_dsq_insert*() return bool, and wrap kfunc args in
  structs for future aux__prog parameter.
- Implement cgroup_set_idle() callback to notify BPF schedulers when a
  cgroup's idle state changes.
- Fix migration tasks being incorrectly downgraded from
  stop_sched_class to rt_sched_class across sched_ext enable/disable.
  Applied late as the fix is low risk and the bug subtle but needs
  stable backporting.
- Various fixes and cleanups including cgroup exit ordering,
  SCX_KICK_WAIT reliability, and backward compatibility improvements.
* tag 'sched_ext-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext: (44 commits)
sched_ext: Fix incorrect sched_class settings for per-cpu migration tasks
sched_ext: tools: Removing duplicate targets during non-cross compilation
sched_ext: Use kvfree_rcu() to release per-cpu ksyncs object
sched_ext: Pass locked CPU parameter to scx_hardlockup() and add docs
sched_ext: Update comments replacing breather with aborting mechanism
sched_ext: Implement load balancer for bypass mode
sched_ext: Factor out abbreviated dispatch dequeue into dispatch_dequeue_locked()
sched_ext: Factor out scx_dsq_list_node cursor initialization into INIT_DSQ_LIST_CURSOR
sched_ext: Add scx_cpu0 example scheduler
sched_ext: Hook up hardlockup detector
sched_ext: Make handle_lockup() propagate scx_verror() result
sched_ext: Refactor lockup handlers into handle_lockup()
sched_ext: Make scx_exit() and scx_vexit() return bool
sched_ext: Exit dispatch and move operations immediately when aborting
sched_ext: Simplify breather mechanism with scx_aborting flag
sched_ext: Use per-CPU DSQs instead of per-node global DSQs in bypass mode
sched_ext: Refactor do_enqueue_task() local and global DSQ paths
sched_ext: Use shorter slice in bypass mode
sched_ext: Mark racy bitfields to prevent adding fields that can't tolerate races
sched_ext: Minor cleanups to scx_task_iter
...
Diffstat (limited to 'include/linux/sched/ext.h')
| -rw-r--r-- | include/linux/sched/ext.h | 27 |
1 files changed, 25 insertions, 2 deletions
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index d82b7a9b0658..bcb962d5ee7d 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -17,7 +17,18 @@
 enum scx_public_consts {
 	SCX_OPS_NAME_LEN = 128,
 
+	/*
+	 * %SCX_SLICE_DFL is used to refill slices when the BPF scheduler misses
+	 * to set the slice for a task that is selected for execution.
+	 * %SCX_EV_REFILL_SLICE_DFL counts the number of times the default slice
+	 * refill has been triggered.
+	 *
+	 * %SCX_SLICE_BYPASS is used as the slice for all tasks in the bypass
+	 * mode. As making forward progress for all tasks is the main goal of
+	 * the bypass mode, a shorter slice is used.
+	 */
 	SCX_SLICE_DFL = 20 * 1000000, /* 20ms */
+	SCX_SLICE_BYPASS = 5 * 1000000, /* 5ms */
 	SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */
 };
 
@@ -46,6 +57,7 @@ enum scx_dsq_id_flags {
 	SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0,
 	SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1,
 	SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2,
+	SCX_DSQ_BYPASS = SCX_DSQ_FLAG_BUILTIN | 3,
 	SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON,
 	SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU,
 };
@@ -58,6 +70,7 @@ enum scx_dsq_id_flags {
  */
 struct scx_dispatch_q {
 	raw_spinlock_t lock;
+	struct task_struct __rcu *first_task; /* lockless peek at head */
 	struct list_head list; /* tasks in dispatch order */
 	struct rb_root priq; /* used to order by p->scx.dsq_vtime */
 	u32 nr;
@@ -136,6 +149,13 @@ struct scx_dsq_list_node {
 	u32 priv; /* can be used by iter cursor */
 };
 
+#define INIT_DSQ_LIST_CURSOR(__node, __flags, __priv) \
+	(struct scx_dsq_list_node) { \
+		.node = LIST_HEAD_INIT((__node).node), \
+		.flags = SCX_DSQ_LNODE_ITER_CURSOR | (__flags), \
+		.priv = (__priv), \
+	}
+
 /*
  * The following is embedded in task_struct and contains all fields necessary
  * for a task to be scheduled by SCX.
@@ -207,16 +227,18 @@ struct sched_ext_entity {
 	struct list_head tasks_node;
 };
 
-void sched_ext_free(struct task_struct *p);
+void sched_ext_dead(struct task_struct *p);
 void print_scx_info(const char *log_lvl, struct task_struct *p);
 void scx_softlockup(u32 dur_s);
+bool scx_hardlockup(int cpu);
 bool scx_rcu_cpu_stall(void);
 
 #else /* !CONFIG_SCHED_CLASS_EXT */
 
-static inline void sched_ext_free(struct task_struct *p) {}
+static inline void sched_ext_dead(struct task_struct *p) {}
 static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
 static inline void scx_softlockup(u32 dur_s) {}
+static inline bool scx_hardlockup(int cpu) { return false; }
 static inline bool scx_rcu_cpu_stall(void) { return false; }
 
 #endif /* CONFIG_SCHED_CLASS_EXT */
@@ -228,6 +250,7 @@ struct scx_task_group {
 	u64 bw_period_us;
 	u64 bw_quota_us;
 	u64 bw_burst_us;
+	bool idle;
 #endif
 };
 
