From 6e3c0a4e1ad1e0455b7880fad02b3ee179f56c09 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 Apr 2025 12:16:28 +0200 Subject: sched/fair: Fix lag clamp Vincent reported that he was seeing undue lag clamping in a mixed slice workload. Implement the max_slice tracking as per the todo comment. Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy") Reported-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Tested-by: Vincent Guittot Tested-by: K Prateek Nayak Tested-by: Shubhang Kaushik Link: https://patch.msgid.link/20250422101628.GA33555@noisy.programming.kicks-ass.net --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 074ad4ef3d81..a7b4a980eb2f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -579,6 +579,7 @@ struct sched_entity { u64 deadline; u64 min_vruntime; u64 min_slice; + u64 max_slice; struct list_head group_node; unsigned char on_rq; -- cgit v1.2.3 From 4c652a47722f69c6f2685f05b17490ea97f643a8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 6 Feb 2026 08:41:13 +0100 Subject: rseq: Mark rseq_arm_slice_extension_timer() __always_inline objtool warns about this function being called inside of a uaccess section: kernel/entry/common.o: warning: objtool: irqentry_exit+0x1dc: call to rseq_arm_slice_extension_timer() with UACCESS enabled Interestingly, this happens with CONFIG_RSEQ_SLICE_EXTENSION disabled, so this is an empty function, as the normal implementation is already marked __always_inline. I could reproduce this multiple times with gcc-11 but not with gcc-15, so the compiler probably got better at identifying the trivial function. Mark all the empty helpers for !RSEQ_SLICE_EXTENSION as __always_inline for consistency, avoiding this warning. 
Fixes: 0ac3b5c3dc45 ("rseq: Implement time slice extension enforcement timer") Signed-off-by: Arnd Bergmann Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260206074122.709580-1-arnd@kernel.org --- include/linux/rseq_entry.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index cbc4a791618b..c6831c93cd6e 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -216,10 +216,10 @@ efault: } #else /* CONFIG_RSEQ_SLICE_EXTENSION */ -static inline bool rseq_slice_extension_enabled(void) { return false; } -static inline bool rseq_arm_slice_extension_timer(void) { return false; } -static inline void rseq_slice_clear_grant(struct task_struct *t) { } -static inline bool rseq_grant_slice_extension(bool work_pending) { return false; } +static __always_inline bool rseq_slice_extension_enabled(void) { return false; } +static __always_inline bool rseq_arm_slice_extension_timer(void) { return false; } +static __always_inline void rseq_slice_clear_grant(struct task_struct *t) { } +static __always_inline bool rseq_grant_slice_extension(bool work_pending) { return false; } #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */ bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); -- cgit v1.2.3 From 3b68df978133ac3d46d570af065a73debbb68248 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 20 Feb 2026 15:06:41 -0500 Subject: rseq: slice ext: Ensure rseq feature size differs from original rseq size Before rseq became extensible, its original size was 32 bytes even though the active rseq area was only 20 bytes. This had the following impact in terms of userspace ecosystem evolution: * GNU libc versions 2.35 through 2.39 expose a __rseq_size symbol set to 32, even though the size of the active rseq area is really 20. 
* The GNU libc 2.40 changes this __rseq_size to 20, thus making it express the active rseq area. * Starting from glibc 2.41, __rseq_size corresponds to the AT_RSEQ_FEATURE_SIZE from getauxval(3). This means that users of __rseq_size can always expect it to correspond to the active rseq area, except for the value 32, for which the active rseq area is 20 bytes. Exposing a 32-byte feature size would make life needlessly painful for userspace. Therefore, add a reserved field at the end of the rseq area to bump the feature size to 33 bytes. This reserved field is expected to be replaced by whatever field comes next, on the assumption that the new field will be larger than 1 byte. The effect of this change is to increase the size from 32 to 64 bytes before we actually have fields using that memory. Clarify the allocation size and alignment requirements in the struct rseq uapi comment. Change the value returned by getauxval(AT_RSEQ_ALIGN) to return the value of the active rseq area size rounded up to the next power of 2, which guarantees that the rseq structure will always be aligned on the nearest power of two large enough to contain it, even as it grows. Change the alignment check in the rseq registration accordingly. This will minimize the number of ABI corner cases we need to document and require userspace to play games with. 
The rule stays simple when __rseq_size != 32: #define rseq_field_available(field) (__rseq_size >= offsetofend(struct rseq_abi, field)) Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260220200642.1317826-3-mathieu.desnoyers@efficios.com --- include/linux/rseq.h | 12 ++++++++++++ include/uapi/linux/rseq.h | 26 ++++++++++++++++++++++---- 2 files changed, 34 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 7a01a0760405..b9d62fc2140d 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -146,6 +146,18 @@ static inline void rseq_fork(struct task_struct *t, u64 clone_flags) t->rseq = current->rseq; } +/* + * Value returned by getauxval(AT_RSEQ_ALIGN) and expected by rseq + * registration. This is the active rseq area size rounded up to next + * power of 2, which guarantees that the rseq structure will always be + * aligned on the nearest power of two large enough to contain it, even + * as it grows. + */ +static inline unsigned int rseq_alloc_align(void) +{ + return 1U << get_count_order(offsetof(struct rseq, end)); +} + #else /* CONFIG_RSEQ */ static inline void rseq_handle_slowpath(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h index 863c4a00a66b..f69344fe6c08 100644 --- a/include/uapi/linux/rseq.h +++ b/include/uapi/linux/rseq.h @@ -87,10 +87,17 @@ struct rseq_slice_ctrl { }; /* - * struct rseq is aligned on 4 * 8 bytes to ensure it is always - * contained within a single cache-line. + * The original size and alignment of the allocation for struct rseq is + * 32 bytes. * - * A single struct rseq per thread is allowed. + * The allocation size needs to be greater or equal to + * max(getauxval(AT_RSEQ_FEATURE_SIZE), 32), and the allocation needs to + * be aligned on max(getauxval(AT_RSEQ_ALIGN), 32). 
+ * + * As an alternative, userspace is allowed to use both the original size + * and alignment of 32 bytes for backward compatibility. + * + * A single active struct rseq registration per thread is allowed. */ struct rseq { /* @@ -180,10 +187,21 @@ struct rseq { */ struct rseq_slice_ctrl slice_ctrl; + /* + * Before rseq became extensible, its original size was 32 bytes even + * though the active rseq area was only 20 bytes. + * Exposing a 32 bytes feature size would make life needlessly painful + * for userspace. Therefore, add a reserved byte after byte 32 + * to bump the rseq feature size from 32 to 33. + * The next field to be added to the rseq area will be larger + * than one byte, and will replace this reserved byte. + */ + __u8 __reserved; + /* * Flexible array member at end of structure, after last feature field. */ char end[]; -} __attribute__((aligned(4 * sizeof(__u64)))); +} __attribute__((aligned(32))); #endif /* _UAPI_LINUX_RSEQ_H */ -- cgit v1.2.3