diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2025-12-03 13:25:39 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-12-03 13:25:39 -0800 |
| commit | 02baaa67d9afc2e56c6e1ac6a1fb1f1dd2be366f (patch) | |
| tree | 13ae2fec8be92b2f774cfb3fd725c027740be3ac /include | |
| parent | 8449d3252c2603a51ffc7c36cb5bd94874378b7d (diff) | |
| parent | 1dd6c84f1c544e552848a8968599220bd464e338 (diff) | |
Merge tag 'sched_ext-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext
Pull sched_ext updates from Tejun Heo:
- Improve recovery from misbehaving BPF schedulers.
When a scheduler puts many tasks with varying affinity restrictions
on a shared DSQ, CPUs scanning through tasks they cannot run can
overwhelm the system, causing lockups.
Bypass mode now uses per-CPU DSQs with a load balancer to avoid this,
and hooks into the hardlockup detector to attempt recovery.
Add scx_cpu0 example scheduler to demonstrate this scenario.
- Add lockless peek operation for DSQs to reduce lock contention for
schedulers that need to query queue state during load balancing.
- Allow scx_bpf_reenqueue_local() to be called from anywhere in
preparation for deprecating cpu_acquire/release() callbacks in favor
of generic BPF hooks.
- Prepare for hierarchical scheduler support: add
scx_bpf_task_set_slice() and scx_bpf_task_set_dsq_vtime() kfuncs,
make scx_bpf_dsq_insert*() return bool, and wrap kfunc args in
structs for future aux__prog parameter.
- Implement cgroup_set_idle() callback to notify BPF schedulers when a
cgroup's idle state changes.
- Fix migration tasks being incorrectly downgraded from
stop_sched_class to rt_sched_class across sched_ext enable/disable.
Applied late as the fix is low risk and the bug subtle but needs
stable backporting.
- Various fixes and cleanups including cgroup exit ordering,
SCX_KICK_WAIT reliability, and backward compatibility improvements.
* tag 'sched_ext-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext: (44 commits)
sched_ext: Fix incorrect sched_class settings for per-cpu migration tasks
sched_ext: tools: Removing duplicate targets during non-cross compilation
sched_ext: Use kvfree_rcu() to release per-cpu ksyncs object
sched_ext: Pass locked CPU parameter to scx_hardlockup() and add docs
sched_ext: Update comments replacing breather with aborting mechanism
sched_ext: Implement load balancer for bypass mode
sched_ext: Factor out abbreviated dispatch dequeue into dispatch_dequeue_locked()
sched_ext: Factor out scx_dsq_list_node cursor initialization into INIT_DSQ_LIST_CURSOR
sched_ext: Add scx_cpu0 example scheduler
sched_ext: Hook up hardlockup detector
sched_ext: Make handle_lockup() propagate scx_verror() result
sched_ext: Refactor lockup handlers into handle_lockup()
sched_ext: Make scx_exit() and scx_vexit() return bool
sched_ext: Exit dispatch and move operations immediately when aborting
sched_ext: Simplify breather mechanism with scx_aborting flag
sched_ext: Use per-CPU DSQs instead of per-node global DSQs in bypass mode
sched_ext: Refactor do_enqueue_task() local and global DSQ paths
sched_ext: Use shorter slice in bypass mode
sched_ext: Mark racy bitfields to prevent adding fields that can't tolerate races
sched_ext: Minor cleanups to scx_task_iter
...
Diffstat (limited to 'include')
| -rw-r--r-- | include/linux/sched/ext.h | 27 | ||||
| -rw-r--r-- | include/trace/events/sched_ext.h | 39 |
2 files changed, 64 insertions, 2 deletions
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index d82b7a9b0658..bcb962d5ee7d 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -17,7 +17,18 @@ enum scx_public_consts { SCX_OPS_NAME_LEN = 128, + /* + * %SCX_SLICE_DFL is used to refill slices when the BPF scheduler misses + * to set the slice for a task that is selected for execution. + * %SCX_EV_REFILL_SLICE_DFL counts the number of times the default slice + * refill has been triggered. + * + * %SCX_SLICE_BYPASS is used as the slice for all tasks in the bypass + * mode. As making forward progress for all tasks is the main goal of + * the bypass mode, a shorter slice is used. + */ SCX_SLICE_DFL = 20 * 1000000, /* 20ms */ + SCX_SLICE_BYPASS = 5 * 1000000, /* 5ms */ SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */ }; @@ -46,6 +57,7 @@ enum scx_dsq_id_flags { SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0, SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1, SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2, + SCX_DSQ_BYPASS = SCX_DSQ_FLAG_BUILTIN | 3, SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON, SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, }; @@ -58,6 +70,7 @@ enum scx_dsq_id_flags { */ struct scx_dispatch_q { raw_spinlock_t lock; + struct task_struct __rcu *first_task; /* lockless peek at head */ struct list_head list; /* tasks in dispatch order */ struct rb_root priq; /* used to order by p->scx.dsq_vtime */ u32 nr; @@ -136,6 +149,13 @@ struct scx_dsq_list_node { u32 priv; /* can be used by iter cursor */ }; +#define INIT_DSQ_LIST_CURSOR(__node, __flags, __priv) \ + (struct scx_dsq_list_node) { \ + .node = LIST_HEAD_INIT((__node).node), \ + .flags = SCX_DSQ_LNODE_ITER_CURSOR | (__flags), \ + .priv = (__priv), \ + } + /* * The following is embedded in task_struct and contains all fields necessary * for a task to be scheduled by SCX. @@ -207,16 +227,18 @@ struct sched_ext_entity { struct list_head tasks_node; }; -void sched_ext_free(struct task_struct *p); +void sched_ext_dead(struct task_struct *p); void print_scx_info(const char *log_lvl, struct task_struct *p); void scx_softlockup(u32 dur_s); +bool scx_hardlockup(int cpu); bool scx_rcu_cpu_stall(void); #else /* !CONFIG_SCHED_CLASS_EXT */ -static inline void sched_ext_free(struct task_struct *p) {} +static inline void sched_ext_dead(struct task_struct *p) {} static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} static inline void scx_softlockup(u32 dur_s) {} +static inline bool scx_hardlockup(int cpu) { return false; } static inline bool scx_rcu_cpu_stall(void) { return false; } #endif /* CONFIG_SCHED_CLASS_EXT */ @@ -228,6 +250,7 @@ struct scx_task_group { u64 bw_period_us; u64 bw_quota_us; u64 bw_burst_us; + bool idle; #endif }; diff --git a/include/trace/events/sched_ext.h b/include/trace/events/sched_ext.h index 50e4b712735a..d1bf5acd59c5 100644 --- a/include/trace/events/sched_ext.h +++ b/include/trace/events/sched_ext.h @@ -45,6 +45,45 @@ TRACE_EVENT(sched_ext_event, ) ); +TRACE_EVENT(sched_ext_bypass_lb, + + TP_PROTO(__u32 node, __u32 nr_cpus, __u32 nr_tasks, __u32 nr_balanced, + __u32 before_min, __u32 before_max, + __u32 after_min, __u32 after_max), + + TP_ARGS(node, nr_cpus, nr_tasks, nr_balanced, + before_min, before_max, after_min, after_max), + + TP_STRUCT__entry( + __field( __u32, node ) + __field( __u32, nr_cpus ) + __field( __u32, nr_tasks ) + __field( __u32, nr_balanced ) + __field( __u32, before_min ) + __field( __u32, before_max ) + __field( __u32, after_min ) + __field( __u32, after_max ) + ), + + TP_fast_assign( + __entry->node = node; + __entry->nr_cpus = nr_cpus; + __entry->nr_tasks = nr_tasks; + __entry->nr_balanced = nr_balanced; + __entry->before_min = before_min; + __entry->before_max = before_max; + __entry->after_min = after_min; + __entry->after_max = after_max; + ), + + TP_printk("node %u: nr_cpus=%u nr_tasks=%u nr_balanced=%u min=%u->%u max=%u->%u", + __entry->node, __entry->nr_cpus, + __entry->nr_tasks, __entry->nr_balanced, + __entry->before_min, __entry->after_min, + __entry->before_max, __entry->after_max + ) +); + #endif /* _TRACE_SCHED_EXT_H */ /* This part must be outside protection */ |
