From d7a5da7a0f7fa7ff081140c4f6f971db98882703 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 15 Dec 2025 17:52:04 +0100 Subject: rseq: Add fields and constants for time slice extension Aside of a Kconfig knob add the following items: - Two flag bits for the rseq user space ABI, which allow user space to query the availability and enablement without a syscall. - A new member to the user space ABI struct rseq, which is going to be used to communicate request and grant between kernel and user space. - A rseq state struct to hold the kernel state of this - Documentation of the new mechanism Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251215155708.669472597@linutronix.de --- kernel/rseq.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel/rseq.c') diff --git a/kernel/rseq.c b/kernel/rseq.c index 395d8b002350..07c324d5a201 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -389,6 +389,8 @@ static bool rseq_reset_ids(void) */ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) { + u32 rseqfl = 0; + if (flags & RSEQ_FLAG_UNREGISTER) { if (flags & ~RSEQ_FLAG_UNREGISTER) return -EINVAL; @@ -440,6 +442,9 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 if (!access_ok(rseq, rseq_len)) return -EFAULT; + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) + rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; + scoped_user_write_access(rseq, efault) { /* * If the rseq_cs pointer is non-NULL on registration, clear it to @@ -449,11 +454,13 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 * clearing the fields. Don't bother reading it, just reset it. */ unsafe_put_user(0UL, &rseq->rseq_cs, efault); + unsafe_put_user(rseqfl, &rseq->flags, efault); /* Initialize IDs in user space */ unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault); unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault); unsafe_put_user(0U, &rseq->node_id, efault); unsafe_put_user(0U, &rseq->mm_cid, efault); + unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); } /* -- cgit v1.2.3 From f8380f976804533df4c6c3d3a0b2cd03c2d262bc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 15 Dec 2025 17:52:06 +0100 Subject: rseq: Provide static branch for time slice extensions Guard the time slice extension functionality with a static key, which can be disabled on the kernel command line. 
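As a minimal user-space sketch (not part of the series), the two new flag bits can be queried without a syscall. The code assumes glibc has already registered rseq for the thread and exports __rseq_offset/__rseq_size (<sys/rseq.h>, glibc 2.35+), and that the uapi header in use carries the RSEQ_CS_FLAG_SLICE_EXT_* bits added by these patches:

/*
 * Sketch: query availability and per-thread enablement without a syscall.
 * Assumptions: glibc-registered rseq area, headers with the new flag bits.
 */
#include <stdio.h>
#include <sys/rseq.h>

static struct rseq *this_thread_rseq(void)
{
	/* The registered rseq area sits at __rseq_offset from the TLS base. */
	return (struct rseq *)((char *)__builtin_thread_pointer() + __rseq_offset);
}

int main(void)
{
	struct rseq *rs = this_thread_rseq();

	if (!__rseq_size)
		return 1;	/* glibc did not register rseq */

	printf("slice extension available: %d\n",
	       !!(rs->flags & RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE));
	printf("slice extension enabled:   %d\n",
	       !!(rs->flags & RSEQ_CS_FLAG_SLICE_EXT_ENABLED));
	return 0;
}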
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251215155708.733429292@linutronix.de --- kernel/rseq.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel/rseq.c') diff --git a/kernel/rseq.c b/kernel/rseq.c index 07c324d5a201..bf75268580ef 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -483,3 +483,20 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 efault: return -EFAULT; } + +#ifdef CONFIG_RSEQ_SLICE_EXTENSION +DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key); + +static int __init rseq_slice_cmdline(char *str) +{ + bool on; + + if (kstrtobool(str, &on)) + return 0; + + if (!on) + static_branch_disable(&rseq_slice_extension_key); + return 1; +} +__setup("rseq_slice_ext=", rseq_slice_cmdline); +#endif /* CONFIG_RSEQ_SLICE_EXTENSION */ -- cgit v1.2.3 From b5b8282441bc4f8f1ff505e19d566dbd7b805761 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 15 Dec 2025 17:52:09 +0100 Subject: rseq: Add statistics for time slice extensions Extend the quick statistics with time slice specific fields. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251215155708.795202254@linutronix.de --- kernel/rseq.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel/rseq.c') diff --git a/kernel/rseq.c b/kernel/rseq.c index bf75268580ef..415d75b6df2c 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -138,6 +138,13 @@ static int rseq_stats_show(struct seq_file *m, void *p) stats.cs += data_race(per_cpu(rseq_stats.cs, cpu)); stats.clear += data_race(per_cpu(rseq_stats.clear, cpu)); stats.fixup += data_race(per_cpu(rseq_stats.fixup, cpu)); + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) { + stats.s_granted += data_race(per_cpu(rseq_stats.s_granted, cpu)); + stats.s_expired += data_race(per_cpu(rseq_stats.s_expired, cpu)); + stats.s_revoked += data_race(per_cpu(rseq_stats.s_revoked, cpu)); + stats.s_yielded += data_race(per_cpu(rseq_stats.s_yielded, cpu)); + stats.s_aborted += data_race(per_cpu(rseq_stats.s_aborted, cpu)); + } } seq_printf(m, "exit: %16lu\n", stats.exit); @@ -148,6 +155,13 @@ static int rseq_stats_show(struct seq_file *m, void *p) seq_printf(m, "cs: %16lu\n", stats.cs); seq_printf(m, "clear: %16lu\n", stats.clear); seq_printf(m, "fixup: %16lu\n", stats.fixup); + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) { + seq_printf(m, "sgrant: %16lu\n", stats.s_granted); + seq_printf(m, "sexpir: %16lu\n", stats.s_expired); + seq_printf(m, "srevok: %16lu\n", stats.s_revoked); + seq_printf(m, "syield: %16lu\n", stats.s_yielded); + seq_printf(m, "sabort: %16lu\n", stats.s_aborted); + } return 0; } -- cgit v1.2.3 From 28621ec2d46c6adf7d33a6facbd83e2fa566bd34 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 15 Dec 2025 17:52:12 +0100 Subject: rseq: Add prctl() to enable time slice extensions Implement a prctl() so that tasks can enable the time slice extension mechanism. This fails, when time slice extensions are disabled at compile time or on the kernel command line and when no rseq pointer is registered in the kernel. That allows to implement a single trivial check in the exit to user mode hotpath, to decide whether the whole mechanism needs to be invoked. 
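A hedged sketch of the enabling sequence from user space follows. PR_RSEQ_SLICE_EXTENSION is assumed to be the prctl() option that routes to rseq_slice_extension_prctl(); the GET/SET sub-commands and the ENABLE bit are taken from this patch. The call fails unless an rseq area is registered and the mechanism is built in and not disabled on the command line:

/* Sketch: opt the current thread in to time slice extensions via prctl(). */
#include <sys/prctl.h>
#include <errno.h>
#include <stdio.h>

static int rseq_slice_ext_enable(void)
{
	if (prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET,
		  PR_RSEQ_SLICE_EXT_ENABLE, 0, 0))
		return -errno;	/* ENOTSUPP, ENXIO (no rseq), EINVAL, ... */

	/* Read the state back via the GET sub-command: 1 if enabled. */
	return prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_GET, 0, 0, 0);
}

int main(void)
{
	printf("enable: %d\n", rseq_slice_ext_enable());
	return 0;
}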
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251215155708.858717691@linutronix.de --- kernel/rseq.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'kernel/rseq.c') diff --git a/kernel/rseq.c b/kernel/rseq.c index 415d75b6df2c..09848bb14ec2 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -71,6 +71,7 @@ #define RSEQ_BUILD_SLOW_PATH #include +#include #include #include #include @@ -501,6 +502,57 @@ efault: #ifdef CONFIG_RSEQ_SLICE_EXTENSION DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key); +int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3) +{ + switch (arg2) { + case PR_RSEQ_SLICE_EXTENSION_GET: + if (arg3) + return -EINVAL; + return current->rseq.slice.state.enabled ? PR_RSEQ_SLICE_EXT_ENABLE : 0; + + case PR_RSEQ_SLICE_EXTENSION_SET: { + u32 rflags, valid = RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; + bool enable = !!(arg3 & PR_RSEQ_SLICE_EXT_ENABLE); + + if (arg3 & ~PR_RSEQ_SLICE_EXT_ENABLE) + return -EINVAL; + if (!rseq_slice_extension_enabled()) + return -ENOTSUPP; + if (!current->rseq.usrptr) + return -ENXIO; + + /* No change? */ + if (enable == !!current->rseq.slice.state.enabled) + return 0; + + if (get_user(rflags, ¤t->rseq.usrptr->flags)) + goto die; + + if (current->rseq.slice.state.enabled) + valid |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + + if ((rflags & valid) != valid) + goto die; + + rflags &= ~RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + rflags |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; + if (enable) + rflags |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + + if (put_user(rflags, ¤t->rseq.usrptr->flags)) + goto die; + + current->rseq.slice.state.enabled = enable; + return 0; + } + default: + return -EINVAL; + } +die: + force_sig(SIGSEGV); + return -EFAULT; +} + static int __init rseq_slice_cmdline(char *str) { bool on; -- cgit v1.2.3 From 99d2592023e5d0a31f5f5a83c694df48239a1e6c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 15 Dec 2025 17:52:15 +0100 Subject: rseq: Implement sys_rseq_slice_yield() Provide a new syscall which has the only purpose to yield the CPU after the kernel granted a time slice extension. sched_yield() is not suitable for that because it unconditionally schedules, but the end of the time slice extension is not required to schedule when the task was already preempted. This also allows to have a strict check for termination to catch user space invoking random syscalls including sched_yield() from a time slice extension region. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mathieu Desnoyers Acked-by: Arnd Bergmann Link: https://patch.msgid.link/20251215155708.929634896@linutronix.de --- kernel/rseq.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel/rseq.c') diff --git a/kernel/rseq.c b/kernel/rseq.c index 09848bb14ec2..d8e1992edffa 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -553,6 +553,27 @@ die: return -EFAULT; } +/** + * sys_rseq_slice_yield - yield the current processor side effect free if a + * task granted with a time slice extension is done with + * the critical work before being forced out. + * + * Return: 1 if the task successfully yielded the CPU within the granted slice. + * 0 if the slice extension was either never granted or was revoked by + * going over the granted extension, using a syscall other than this one + * or being scheduled out earlier due to a subsequent interrupt. 
+ * + * The syscall does not schedule because the syscall entry work immediately + * relinquishes the CPU and schedules if required. + */ +SYSCALL_DEFINE0(rseq_slice_yield) +{ + int yielded = !!current->rseq.slice.yielded; + + current->rseq.slice.yielded = 0; + return yielded; +} + static int __init rseq_slice_cmdline(char *str) { bool on; -- cgit v1.2.3 From dd0a04606937af5810e9117d343ee3792635bd3d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 15 Dec 2025 17:52:19 +0100 Subject: rseq: Implement syscall entry work for time slice extensions The kernel sets SYSCALL_WORK_RSEQ_SLICE when it grants a time slice extension. This allows to handle the rseq_slice_yield() syscall, which is used by user space to relinquish the CPU after finishing the critical section for which it requested an extension. In case the kernel state is still GRANTED, the kernel resets both kernel and user space state with a set of sanity checks. If the kernel state is already cleared, then this raced against the timer or some other interrupt and just clears the work bit. Doing it in syscall entry work allows to catch misbehaving user space, which issues an arbitrary syscall, i.e. not rseq_slice_yield(), from the critical section. Contrary to the initial strict requirement to use rseq_slice_yield() arbitrary syscalls are not considered a violation of the ABI contract anymore to allow onion architecture applications, which cannot control the code inside a critical section, to utilize this as well. If the code detects inconsistent user space that result in a SIGSEGV for the application. If the grant was still active and the task was not preempted yet, the work code reschedules immediately before continuing through the syscall. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251215155709.005777059@linutronix.de --- kernel/rseq.c | 91 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) (limited to 'kernel/rseq.c') diff --git a/kernel/rseq.c b/kernel/rseq.c index d8e1992edffa..8aa4821e3979 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -502,6 +502,97 @@ efault: #ifdef CONFIG_RSEQ_SLICE_EXTENSION DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key); +static inline void rseq_slice_set_need_resched(struct task_struct *curr) +{ + /* + * The interrupt guard is required to prevent inconsistent state in + * this case: + * + * set_tsk_need_resched() + * --> Interrupt + * wakeup() + * set_tsk_need_resched() + * set_preempt_need_resched() + * schedule_on_return() + * clear_tsk_need_resched() + * clear_preempt_need_resched() + * set_preempt_need_resched() <- Inconsistent state + * + * This is safe vs. a remote set of TIF_NEED_RESCHED because that + * only sets the already set bit and does not create inconsistent + * state. + */ + scoped_guard(irq) + set_need_resched_current(); +} + +static void rseq_slice_validate_ctrl(u32 expected) +{ + u32 __user *sctrl = ¤t->rseq.usrptr->slice_ctrl.all; + u32 uval; + + if (get_user(uval, sctrl) || uval != expected) + force_sig(SIGSEGV); +} + +/* + * Invoked from syscall entry if a time slice extension was granted and the + * kernel did not clear it before user space left the critical section. + * + * While the recommended way to relinquish the CPU side effect free is + * rseq_slice_yield(2), any syscall within a granted slice terminates the + * grant and immediately reschedules if required. 
This supports onion layer + * applications, where the code requesting the grant cannot control the + * code within the critical section. + */ +void rseq_syscall_enter_work(long syscall) +{ + struct task_struct *curr = current; + struct rseq_slice_ctrl ctrl = { .granted = curr->rseq.slice.state.granted }; + + clear_task_syscall_work(curr, SYSCALL_RSEQ_SLICE); + + if (static_branch_unlikely(&rseq_debug_enabled)) + rseq_slice_validate_ctrl(ctrl.all); + + /* + * The kernel might have raced, revoked the grant and updated + * userspace, but kept the SLICE work set. + */ + if (!ctrl.granted) + return; + + /* + * Required to make set_tsk_need_resched() correct on PREEMPT[RT] + * kernels. Leaving the scope will reschedule on preemption models + * FULL, LAZY and RT if necessary. + */ + scoped_guard(preempt) { + /* + * Now that preemption is disabled, quickly check whether + * the task was already rescheduled before arriving here. + */ + if (!curr->rseq.event.sched_switch) { + rseq_slice_set_need_resched(curr); + + if (syscall == __NR_rseq_slice_yield) { + rseq_stat_inc(rseq_stats.s_yielded); + /* Update the yielded state for syscall return */ + curr->rseq.slice.yielded = 1; + } else { + rseq_stat_inc(rseq_stats.s_aborted); + } + } + } + /* Reschedule on NONE/VOLUNTARY preemption models */ + cond_resched(); + + /* Clear the grant in kernel state and user space */ + curr->rseq.slice.state.granted = false; + if (put_user(0U, &curr->rseq.usrptr->slice_ctrl.all)) + force_sig(SIGSEGV); +} + int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3) { switch (arg2) { -- cgit v1.2.3 From 0ac3b5c3dc45085b28a10ee730fb2860841f08ef Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 15 Dec 2025 17:52:22 +0100 Subject: rseq: Implement time slice extension enforcement timer If a time slice extension is granted and the reschedule delayed, the kernel has to ensure that user space cannot abuse the extension and exceed the maximum granted time. It was suggested to implement this via the existing hrtick() timer in the scheduler, but that turned out to be problematic for several reasons: 1) It creates a dependency on CONFIG_SCHED_HRTICK, which can be disabled independently of CONFIG_HIGHRES_TIMERS 2) HRTICK usage in the scheduler can be runtime disabled or is only used for certain aspects of scheduling. 3) The function is calling into the scheduler code and that might have unexpected consequences when this is invoked due to a time slice enforcement expiry. Especially when the task managed to clear the grant via sched_yield(0). It would be possible to address #2 and #3 by storing state in the scheduler, but that is extra complexity and fragility for no value. Implement a dedicated per CPU hrtimer instead, which is solely used for the purpose of time slice enforcement. The timer is armed when an extension was granted right before actually returning to user mode in rseq_exit_to_user_mode_restart(). It is disarmed, when the task relinquishes the CPU. This is expensive as the timer is probably the first expiring timer on the CPU, which means it has to reprogram the hardware. But that's less expensive than going through a full hrtimer interrupt cycle for nothing. 
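Putting the yield syscall, the syscall entry work and the enforcement timer together, the user-space side of a slice-extended critical section might look roughly like the sketch below. The 'request' and 'granted' sub-field names of slice_ctrl are assumptions for illustration (these hunks only show the kernel's handling of slice_ctrl.all and the granted state), and __NR_rseq_slice_yield requires uapi headers that include the new syscall:

/*
 * Illustrative wrappers around a short critical section; field names of
 * slice_ctrl are assumed, not taken verbatim from the hunks above.
 */
#include <linux/rseq.h>		/* uapi header from a kernel with this series */
#include <sys/syscall.h>
#include <unistd.h>

static inline void slice_ext_begin(struct rseq *rs)
{
	/* Ask the kernel to briefly defer preemption while the work is done. */
	__atomic_store_n(&rs->slice_ctrl.request, 1, __ATOMIC_RELAXED);
}

static inline void slice_ext_end(struct rseq *rs)
{
	__atomic_store_n(&rs->slice_ctrl.request, 0, __ATOMIC_RELAXED);

	/*
	 * If an extension was granted, hand the CPU back right away. The
	 * syscall entry work clears the grant and reschedules if required;
	 * rseq_slice_yield() itself only reports whether the yield was in time.
	 */
	if (__atomic_load_n(&rs->slice_ctrl.granted, __ATOMIC_RELAXED))
		syscall(__NR_rseq_slice_yield);
}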
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251215155709.068329497@linutronix.de --- kernel/rseq.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 129 insertions(+), 3 deletions(-) (limited to 'kernel/rseq.c') diff --git a/kernel/rseq.c b/kernel/rseq.c index 8aa4821e3979..275d70114107 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -71,6 +71,8 @@ #define RSEQ_BUILD_SLOW_PATH #include +#include +#include #include #include #include @@ -500,8 +502,91 @@ efault: } #ifdef CONFIG_RSEQ_SLICE_EXTENSION +struct slice_timer { + struct hrtimer timer; + void *cookie; +}; + +unsigned int rseq_slice_ext_nsecs __read_mostly = 10 * NSEC_PER_USEC; +static DEFINE_PER_CPU(struct slice_timer, slice_timer); DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key); +/* + * When the timer expires and the task is still in user space, the return + * from interrupt will revoke the grant and schedule. If the task already + * entered the kernel via a syscall and the timer fires before the syscall + * work was able to cancel it, then depending on the preemption model this + * will either reschedule on return from interrupt or in the syscall work + * below. + */ +static enum hrtimer_restart rseq_slice_expired(struct hrtimer *tmr) +{ + struct slice_timer *st = container_of(tmr, struct slice_timer, timer); + + /* + * Validate that the task which armed the timer is still on the + * CPU. It could have been scheduled out without canceling the + * timer. + */ + if (st->cookie == current && current->rseq.slice.state.granted) { + rseq_stat_inc(rseq_stats.s_expired); + set_need_resched_current(); + } + return HRTIMER_NORESTART; +} + +bool __rseq_arm_slice_extension_timer(void) +{ + struct slice_timer *st = this_cpu_ptr(&slice_timer); + struct task_struct *curr = current; + + lockdep_assert_irqs_disabled(); + + /* + * This check prevents a task, which got a time slice extension + * granted, from exceeding the maximum scheduling latency when the + * grant expired before going out to user space. Don't bother to + * clear the grant here, it will be cleaned up automatically before + * going out to user space after being scheduled back in. + */ + if ((unlikely(curr->rseq.slice.expires < ktime_get_mono_fast_ns()))) { + set_need_resched_current(); + return true; + } + + /* + * Store the task pointer as a cookie for comparison in the timer + * function. This is safe as the timer is CPU local and cannot be + * in the expiry function at this point. + */ + st->cookie = curr; + hrtimer_start(&st->timer, curr->rseq.slice.expires, HRTIMER_MODE_ABS_PINNED_HARD); + /* Arm the syscall entry work */ + set_task_syscall_work(curr, SYSCALL_RSEQ_SLICE); + return false; +} + +static void rseq_cancel_slice_extension_timer(void) +{ + struct slice_timer *st = this_cpu_ptr(&slice_timer); + + /* + * st->cookie can be safely read as preemption is disabled and the + * timer is CPU local. + * + * As this is most probably the first expiring timer, the cancel is + * expensive as it has to reprogram the hardware, but that's less + * expensive than going through a full hrtimer_interrupt() cycle + * for nothing. + * + * hrtimer_try_to_cancel() is sufficient here as the timer is CPU + * local and once the hrtimer code disabled interrupts the timer + * callback cannot be running. 
+ */ + if (st->cookie == current) + hrtimer_try_to_cancel(&st->timer); +} + static inline void rseq_slice_set_need_resched(struct task_struct *curr) { /* @@ -563,11 +648,14 @@ void rseq_syscall_enter_work(long syscall) return; /* - * Required to make set_tsk_need_resched() correct on PREEMPT[RT] - * kernels. Leaving the scope will reschedule on preemption models - * FULL, LAZY and RT if necessary. + * Required to stabilize the per CPU timer pointer and to make + * set_tsk_need_resched() correct on PREEMPT[RT] kernels. + * + * Leaving the scope will reschedule on preemption models FULL, + * LAZY and RT if necessary. */ scoped_guard(preempt) { + rseq_cancel_slice_extension_timer(); /* * Now that preemption is disabled, quickly check whether * the task was already rescheduled before arriving here. @@ -665,6 +753,31 @@ SYSCALL_DEFINE0(rseq_slice_yield) return yielded; } +#ifdef CONFIG_SYSCTL +static const unsigned int rseq_slice_ext_nsecs_min = 10 * NSEC_PER_USEC; +static const unsigned int rseq_slice_ext_nsecs_max = 50 * NSEC_PER_USEC; + +static const struct ctl_table rseq_slice_ext_sysctl[] = { + { + .procname = "rseq_slice_extension_nsec", + .data = &rseq_slice_ext_nsecs, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = (unsigned int *)&rseq_slice_ext_nsecs_min, + .extra2 = (unsigned int *)&rseq_slice_ext_nsecs_max, + }, +}; + +static void rseq_slice_sysctl_init(void) +{ + if (rseq_slice_extension_enabled()) + register_sysctl_init("kernel", rseq_slice_ext_sysctl); +} +#else /* CONFIG_SYSCTL */ +static inline void rseq_slice_sysctl_init(void) { } +#endif /* !CONFIG_SYSCTL */ + static int __init rseq_slice_cmdline(char *str) { bool on; @@ -677,4 +790,17 @@ static int __init rseq_slice_cmdline(char *str) return 1; } __setup("rseq_slice_ext=", rseq_slice_cmdline); + +static int __init rseq_slice_init(void) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) { + hrtimer_setup(per_cpu_ptr(&slice_timer.timer, cpu), rseq_slice_expired, + CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_HARD); + } + rseq_slice_sysctl_init(); + return 0; +} +device_initcall(rseq_slice_init); #endif /* CONFIG_RSEQ_SLICE_EXTENSION */ -- cgit v1.2.3 From d6200245c75e832af2087bc60ba2e6641a90eee9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 19 Jan 2026 11:23:57 +0100 Subject: rseq: Allow registering RSEQ with slice extension Since glibc cares about the number of syscalls required to initialize a new thread, allow initializing rseq with slice extension on. This avoids having to do another prctl(). 
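For threads that register rseq themselves, a sketch of the resulting one-syscall registration with slice extensions enabled; the signature is an arbitrary example value, and glibc must not have registered rseq already (otherwise the kernel returns -EBUSY):

/* Sketch: register rseq and enable slice extensions in a single syscall. */
#include <linux/rseq.h>
#include <sys/syscall.h>
#include <unistd.h>

#define EXAMPLE_RSEQ_SIG	0x53053053

static __thread struct rseq rseq_area __attribute__((aligned(32)));

static int register_rseq_slice_ext(void)
{
	return syscall(__NR_rseq, &rseq_area, sizeof(rseq_area),
		       RSEQ_FLAG_SLICE_EXT_DEFAULT_ON, EXAMPLE_RSEQ_SIG);
}

Note that a kernel with the feature compiled out, or disabled via rseq_slice_ext=off, still accepts the flag but leaves RSEQ_CS_FLAG_SLICE_EXT_ENABLED clear, so user space should read back rseq.flags rather than rely on the registration succeeding.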
Requested-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260121143207.814193010@infradead.org --- kernel/rseq.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel/rseq.c') diff --git a/kernel/rseq.c b/kernel/rseq.c index 275d70114107..1c5490a172a8 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -424,7 +424,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 return 0; } - if (unlikely(flags)) + if (unlikely(flags & ~(RSEQ_FLAG_SLICE_EXT_DEFAULT_ON))) return -EINVAL; if (current->rseq.usrptr) { @@ -459,8 +459,12 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 if (!access_ok(rseq, rseq_len)) return -EFAULT; - if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) { rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; + if (rseq_slice_extension_enabled() && + (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON)) + rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + } scoped_user_write_access(rseq, efault) { /* @@ -488,6 +492,10 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32 current->rseq.len = rseq_len; current->rseq.sig = sig; +#ifdef CONFIG_RSEQ_SLICE_EXTENSION + current->rseq.slice.state.enabled = !!(rseqfl & RSEQ_CS_FLAG_SLICE_EXT_ENABLED); +#endif + /* * If rseq was previously inactive, and has just been * registered, ensure the cpu_id_start and cpu_id fields -- cgit v1.2.3 From e1d7f54900f1e1d3003a85b78cd7105a64203ff7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 21 Jan 2026 14:21:51 +0100 Subject: rseq: Move slice_ext_nsec to debugfs Move changing the slice ext duration to debugfs, a slightly less permanent interface. Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260121143207.923520192@infradead.org --- kernel/rseq.c | 69 +++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 23 deletions(-) (limited to 'kernel/rseq.c') diff --git a/kernel/rseq.c b/kernel/rseq.c index 1c5490a172a8..e423a9bc0a2c 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -123,7 +123,6 @@ void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, } #endif /* CONFIG_TRACEPOINTS */ -#ifdef CONFIG_DEBUG_FS #ifdef CONFIG_RSEQ_STATS DEFINE_PER_CPU(struct rseq_stats, rseq_stats); @@ -222,16 +221,19 @@ static const struct file_operations debug_ops = { .release = single_release, }; +static void rseq_slice_ext_init(struct dentry *root_dir); + static int __init rseq_debugfs_init(void) { struct dentry *root_dir = debugfs_create_dir("rseq", NULL); debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops); rseq_stats_init(root_dir); + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) + rseq_slice_ext_init(root_dir); return 0; } __initcall(rseq_debugfs_init); -#endif /* CONFIG_DEBUG_FS */ static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id) { @@ -515,7 +517,9 @@ struct slice_timer { void *cookie; }; -unsigned int rseq_slice_ext_nsecs __read_mostly = 10 * NSEC_PER_USEC; +static const unsigned int rseq_slice_ext_nsecs_min = 10 * NSEC_PER_USEC; +static const unsigned int rseq_slice_ext_nsecs_max = 50 * NSEC_PER_USEC; +unsigned int rseq_slice_ext_nsecs __read_mostly = rseq_slice_ext_nsecs_min; static DEFINE_PER_CPU(struct slice_timer, slice_timer); DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key); @@ -761,30 +765,48 @@ SYSCALL_DEFINE0(rseq_slice_yield) return yielded; } -#ifdef CONFIG_SYSCTL -static const unsigned int 
rseq_slice_ext_nsecs_min = 10 * NSEC_PER_USEC; -static const unsigned int rseq_slice_ext_nsecs_max = 50 * NSEC_PER_USEC; +static int rseq_slice_ext_show(struct seq_file *m, void *p) +{ + seq_printf(m, "%d\n", rseq_slice_ext_nsecs); + return 0; +} + +static ssize_t rseq_slice_ext_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + unsigned int nsecs; + + if (kstrtouint_from_user(ubuf, count, 10, &nsecs)) + return -EINVAL; + + if (nsecs < rseq_slice_ext_nsecs_min) + return -ERANGE; -static const struct ctl_table rseq_slice_ext_sysctl[] = { - { - .procname = "rseq_slice_extension_nsec", - .data = &rseq_slice_ext_nsecs, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_douintvec_minmax, - .extra1 = (unsigned int *)&rseq_slice_ext_nsecs_min, - .extra2 = (unsigned int *)&rseq_slice_ext_nsecs_max, - }, + if (nsecs > rseq_slice_ext_nsecs_max) + return -ERANGE; + + rseq_slice_ext_nsecs = nsecs; + + return count; +} + +static int rseq_slice_ext_open(struct inode *inode, struct file *file) +{ + return single_open(file, rseq_slice_ext_show, inode->i_private); +} + +static const struct file_operations slice_ext_ops = { + .open = rseq_slice_ext_open, + .read = seq_read, + .write = rseq_slice_ext_write, + .llseek = seq_lseek, + .release = single_release, }; -static void rseq_slice_sysctl_init(void) +static void rseq_slice_ext_init(struct dentry *root_dir) { - if (rseq_slice_extension_enabled()) - register_sysctl_init("kernel", rseq_slice_ext_sysctl); + debugfs_create_file("slice_ext_nsec", 0644, root_dir, NULL, &slice_ext_ops); } -#else /* CONFIG_SYSCTL */ -static inline void rseq_slice_sysctl_init(void) { } -#endif /* !CONFIG_SYSCTL */ static int __init rseq_slice_cmdline(char *str) { @@ -807,8 +829,9 @@ static int __init rseq_slice_init(void) hrtimer_setup(per_cpu_ptr(&slice_timer.timer, cpu), rseq_slice_expired, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_HARD); } - rseq_slice_sysctl_init(); return 0; } device_initcall(rseq_slice_init); +#else +static void rseq_slice_ext_init(struct dentry *root_dir) { } #endif /* CONFIG_RSEQ_SLICE_EXTENSION */ -- cgit v1.2.3 From 21c0e92d0681fbd10ac024311bd09bca439e0bb1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 21 Jan 2026 14:25:04 +0100 Subject: rseq: Lower default slice extension Change the minimum slice extension to 5 usec. Since slice_test selftest reaches a staggering ~350 nsec extension: Task: slice_test Mean: 350.266 ns Latency (us) | Count ------------------------------ EXPIRED | 238 0 us | 143189 1 us | 167 2 us | 26 3 us | 11 4 us | 28 5 us | 31 6 us | 22 7 us | 23 8 us | 32 9 us | 16 10 us | 35 Lower the minimal (and default) value to 5 usecs -- which is still massive. Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260121143208.073200729@infradead.org --- kernel/rseq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/rseq.c') diff --git a/kernel/rseq.c b/kernel/rseq.c index e423a9bc0a2c..b0973d19f366 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -517,7 +517,7 @@ struct slice_timer { void *cookie; }; -static const unsigned int rseq_slice_ext_nsecs_min = 10 * NSEC_PER_USEC; +static const unsigned int rseq_slice_ext_nsecs_min = 5 * NSEC_PER_USEC; static const unsigned int rseq_slice_ext_nsecs_max = 50 * NSEC_PER_USEC; unsigned int rseq_slice_ext_nsecs __read_mostly = rseq_slice_ext_nsecs_min; static DEFINE_PER_CPU(struct slice_timer, slice_timer); -- cgit v1.2.3
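For completeness, a sketch of driving the debugfs knob from a test program; the path assumes debugfs is mounted at /sys/kernel/debug, and after the final patch the kernel accepts values between 5,000 and 50,000 ns:

/* Sketch: adjust the grant duration through the debugfs file added above. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int rseq_slice_ext_set_nsec(unsigned int nsecs)
{
	char buf[16];
	int fd, len, ret;

	fd = open("/sys/kernel/debug/rseq/slice_ext_nsec", O_WRONLY);
	if (fd < 0)
		return -1;	/* CONFIG_RSEQ_SLICE_EXTENSION=n or debugfs not mounted */

	len = snprintf(buf, sizeof(buf), "%u", nsecs);
	ret = write(fd, buf, len) == len ? 0 : -1;	/* out-of-range values fail with ERANGE */
	close(fd);
	return ret;
}

int main(void)
{
	/* 20 usec: inside the 5..50 usec window. */
	return rseq_slice_ext_set_nsec(20000);
}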