From 38a68b982dd0b10e3da943f100e034598326eafe Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 27 Nov 2025 16:39:44 +0100
Subject: Add the __signed_scalar_typeof() helper

Define __signed_scalar_typeof() to declare a signed scalar type, leaving
non-scalar types unchanged.

To be used to clean up the scheduler load-balancing code a bit.

[ mingo: Split off this patch from the scheduler patch. ]

Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Ingo Molnar
Cc: Dietmar Eggemann
Cc: Juri Lelli
Cc: Linus Torvalds
Cc: Mel Gorman
Cc: Peter Zijlstra
Cc: Shrikanth Hegde
Cc: Valentin Schneider
Cc: Vincent Guittot
Link: https://patch.msgid.link/20251127154725.413564507@infradead.org
---
 include/linux/compiler_types.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include')

diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 1280693766b9..280b4ac0990f 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -586,6 +586,25 @@ struct ftrace_likely_data {
 		__scalar_type_to_expr_cases(long long),		\
 		default: (x)))
 
+/*
+ * __signed_scalar_typeof(x) - Declare a signed scalar type, leaving
+ * non-scalar types unchanged.
+ */
+
+#define __scalar_type_to_signed_cases(type)		\
+	unsigned type:	(signed type)0,			\
+	signed type:	(signed type)0
+
+#define __signed_scalar_typeof(x) typeof(		\
+	_Generic((x),					\
+		char:	(signed char)0,			\
+		__scalar_type_to_signed_cases(char),	\
+		__scalar_type_to_signed_cases(short),	\
+		__scalar_type_to_signed_cases(int),	\
+		__scalar_type_to_signed_cases(long),	\
+		__scalar_type_to_signed_cases(long long),\
+		default: (x)))
+
 /* Is this type a native word size -- useful for atomic operations */
 #define __native_word(t) \
 	(sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || \
-- 
cgit v1.2.3


From 80390ead2080071cbd6f427ff8deb94d10a4a50f Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Wed, 26 Nov 2025 05:31:28 +0100
Subject: sched/fair: Separate se->vlag from se->vprot

There are no real space concerns here, and keeping these fields in a
union makes reading (and tracing) the scheduler code harder.

Signed-off-by: Ingo Molnar
Link: https://patch.msgid.link/20251201064647.1851919-4-mingo@kernel.org
---
 include/linux/sched.h | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d395f2810fac..bf96a7d595e2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -586,15 +586,10 @@ struct sched_entity {
 	u64				sum_exec_runtime;
 	u64				prev_sum_exec_runtime;
 	u64				vruntime;
-	union {
-		/*
-		 * When !@on_rq this field is vlag.
-		 * When cfs_rq->curr == se (which implies @on_rq)
-		 * this field is vprot. See protect_slice().
-		 */
-		s64			vlag;
-		u64			vprot;
-	};
+	/* Approximated virtual lag: */
+	s64				vlag;
+	/* 'Protected' deadline, to give out minimum quantums: */
+	u64				vprot;
 
 	u64				slice;
 
 	u64				nr_migrations;
-- 
cgit v1.2.3


From d7a5da7a0f7fa7ff081140c4f6f971db98882703 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 15 Dec 2025 17:52:04 +0100
Subject: rseq: Add fields and constants for time slice extension

Aside from a Kconfig knob, add the following items:

  - Two flag bits for the rseq user space ABI, which allow user space to
    query the availability and enablement without a syscall.

  - A new member to the user space ABI struct rseq, which is going to be
    used to communicate request and grant between kernel and user space.
  - An rseq state struct to hold the kernel state of this mechanism.

  - Documentation of the new mechanism.

Signed-off-by: Thomas Gleixner
Signed-off-by: Peter Zijlstra (Intel)
Link: https://patch.msgid.link/20251215155708.669472597@linutronix.de
---
 include/linux/rseq_types.h | 28 +++++++++++++++++++++++++++-
 include/uapi/linux/rseq.h  | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index 332dc14b81c9..67e40c059b1b 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -72,13 +72,36 @@ struct rseq_ids {
 	};
 };
 
+/**
+ * union rseq_slice_state - Status information for rseq time slice extension
+ * @state:	Compound to access the overall state
+ * @enabled:	Time slice extension is enabled for the task
+ * @granted:	Time slice extension was granted to the task
+ */
+union rseq_slice_state {
+	u16		state;
+	struct {
+		u8	enabled;
+		u8	granted;
+	};
+};
+
+/**
+ * struct rseq_slice - Status information for rseq time slice extension
+ * @state:	Time slice extension state
+ */
+struct rseq_slice {
+	union rseq_slice_state	state;
+};
+
 /**
  * struct rseq_data - Storage for all rseq related data
  * @usrptr:	Pointer to the registered user space RSEQ memory
  * @len:	Length of the RSEQ region
- * @sig:	Signature of critial section abort IPs
+ * @sig:	Signature of critical section abort IPs
  * @event:	Storage for event management
  * @ids:	Storage for cached CPU ID and MM CID
+ * @slice:	Storage for time slice extension data
  */
 struct rseq_data {
 	struct rseq __user		*usrptr;
@@ -86,6 +109,9 @@ struct rseq_data {
 	u32				sig;
 	struct rseq_event		event;
 	struct rseq_ids			ids;
+#ifdef CONFIG_RSEQ_SLICE_EXTENSION
+	struct rseq_slice		slice;
+#endif
 };
 
 #else /* CONFIG_RSEQ */

diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
index 1b76d508400c..6afc219d1545 100644
--- a/include/uapi/linux/rseq.h
+++ b/include/uapi/linux/rseq.h
@@ -23,9 +23,15 @@ enum rseq_flags {
 };
 
 enum rseq_cs_flags_bit {
+	/* Historical and unsupported bits */
 	RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT	= 0,
 	RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT	= 1,
 	RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT	= 2,
+	/* (3) Intentional gap to put new bits into a separate byte */
+
+	/* User read only feature flags */
+	RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE_BIT	= 4,
+	RSEQ_CS_FLAG_SLICE_EXT_ENABLED_BIT	= 5,
 };
 
 enum rseq_cs_flags {
@@ -35,6 +41,11 @@ enum rseq_cs_flags {
 		(1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT),
 	RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE	=
 		(1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT),
+
+	RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE	=
+		(1U << RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE_BIT),
+	RSEQ_CS_FLAG_SLICE_EXT_ENABLED		=
+		(1U << RSEQ_CS_FLAG_SLICE_EXT_ENABLED_BIT),
 };
 
 /*
@@ -53,6 +64,27 @@ struct rseq_cs {
 	__u64 abort_ip;
 } __attribute__((aligned(4 * sizeof(__u64))));
 
+/**
+ * struct rseq_slice_ctrl - Time slice extension control structure
+ * @all:	Compound value
+ * @request:	Request for a time slice extension
+ * @granted:	Granted time slice extension
+ *
+ * @request is set by user space and can be cleared by user space or kernel
+ * space. @granted is set and cleared by the kernel and must only be read
+ * by user space.
+ */
+struct rseq_slice_ctrl {
+	union {
+		__u32	all;
+		struct {
+			__u8	request;
+			__u8	granted;
+			__u16	__reserved;
+		};
+	};
+};
+
 /*
  * struct rseq is aligned on 4 * 8 bytes to ensure it is always
  * contained within a single cache-line.
@@ -141,6 +173,12 @@ struct rseq {
 	 */
 	__u32 mm_cid;
 
+	/*
+	 * Time slice extension control structure. CPU local updates from
+	 * kernel and user space.
+	 */
+	struct rseq_slice_ctrl slice_ctrl;
+
 	/*
 	 * Flexible array member at end of structure, after last feature field.
 	 */
-- 
cgit v1.2.3


From f8380f976804533df4c6c3d3a0b2cd03c2d262bc Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 15 Dec 2025 17:52:06 +0100
Subject: rseq: Provide static branch for time slice extensions

Guard the time slice extension functionality with a static key, which can
be disabled on the kernel command line.

Signed-off-by: Thomas Gleixner
Signed-off-by: Peter Zijlstra (Intel)
Link: https://patch.msgid.link/20251215155708.733429292@linutronix.de
---
 include/linux/rseq_entry.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
index a36b472627de..d0ec4714ffd6 100644
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -75,6 +75,17 @@ DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
 #define rseq_inline __always_inline
 #endif
 
+#ifdef CONFIG_RSEQ_SLICE_EXTENSION
+DECLARE_STATIC_KEY_TRUE(rseq_slice_extension_key);
+
+static __always_inline bool rseq_slice_extension_enabled(void)
+{
+	return static_branch_likely(&rseq_slice_extension_key);
+}
+#else /* CONFIG_RSEQ_SLICE_EXTENSION */
+static inline bool rseq_slice_extension_enabled(void) { return false; }
+#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */
+
 bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
 bool rseq_debug_validate_ids(struct task_struct *t);
-- 
cgit v1.2.3


From b5b8282441bc4f8f1ff505e19d566dbd7b805761 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 15 Dec 2025 17:52:09 +0100
Subject: rseq: Add statistics for time slice extensions

Extend the quick statistics with time slice specific fields.

Signed-off-by: Thomas Gleixner
Signed-off-by: Peter Zijlstra (Intel)
Link: https://patch.msgid.link/20251215155708.795202254@linutronix.de
---
 include/linux/rseq_entry.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
index d0ec4714ffd6..54d8e338b26e 100644
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -15,6 +15,11 @@ struct rseq_stats {
 	unsigned long	cs;
 	unsigned long	clear;
 	unsigned long	fixup;
+	unsigned long	s_granted;
+	unsigned long	s_expired;
+	unsigned long	s_revoked;
+	unsigned long	s_yielded;
+	unsigned long	s_aborted;
 };
 
 DECLARE_PER_CPU(struct rseq_stats, rseq_stats);
-- 
cgit v1.2.3


From 28621ec2d46c6adf7d33a6facbd83e2fa566bd34 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 15 Dec 2025 17:52:12 +0100
Subject: rseq: Add prctl() to enable time slice extensions

Implement a prctl() so that tasks can enable the time slice extension
mechanism. This fails when time slice extensions are disabled at compile
time or on the kernel command line, and when no rseq pointer is
registered in the kernel.

That allows implementing a single trivial check in the exit to user mode
hotpath, to decide whether the whole mechanism needs to be invoked.
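As a usage illustration of that interface, the enable sequence from user
space could look like the sketch below. This is a sketch only: it assumes
the prctl() constants from the prctl.h hunk further down and a previously
registered rseq area, and it leaves error handling to the caller.

	#include <sys/prctl.h>

	#ifndef PR_RSEQ_SLICE_EXTENSION
	# define PR_RSEQ_SLICE_EXTENSION	79
	# define PR_RSEQ_SLICE_EXTENSION_GET	1
	# define PR_RSEQ_SLICE_EXTENSION_SET	2
	# define PR_RSEQ_SLICE_EXT_ENABLE	0x01
	#endif

	/* Returns 0 on success; fails when the mechanism is unavailable
	   or no rseq area is registered with the kernel */
	static int slice_ext_enable(void)
	{
		return prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET,
			     PR_RSEQ_SLICE_EXT_ENABLE, 0, 0);
	}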
Signed-off-by: Thomas Gleixner
Signed-off-by: Peter Zijlstra (Intel)
Link: https://patch.msgid.link/20251215155708.858717691@linutronix.de
---
 include/linux/rseq.h       |  9 +++++++++
 include/uapi/linux/prctl.h | 10 ++++++++++
 2 files changed, 19 insertions(+)

(limited to 'include')

diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index 2266f4dc77b6..3c194a02ad0a 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -163,4 +163,13 @@ void rseq_syscall(struct pt_regs *regs);
 static inline void rseq_syscall(struct pt_regs *regs) { }
 #endif /* !CONFIG_DEBUG_RSEQ */
 
+#ifdef CONFIG_RSEQ_SLICE_EXTENSION
+int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3);
+#else /* CONFIG_RSEQ_SLICE_EXTENSION */
+static inline int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
+{
+	return -ENOTSUPP;
+}
+#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */
+
 #endif /* _LINUX_RSEQ_H */

diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 51c4e8c82b1e..79944b7ae50a 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -386,4 +386,14 @@ struct prctl_mm_map {
 # define PR_FUTEX_HASH_SET_SLOTS	1
 # define PR_FUTEX_HASH_GET_SLOTS	2
 
+/* RSEQ time slice extensions */
+#define PR_RSEQ_SLICE_EXTENSION		79
+# define PR_RSEQ_SLICE_EXTENSION_GET	1
+# define PR_RSEQ_SLICE_EXTENSION_SET	2
+/*
+ * Bits for RSEQ_SLICE_EXTENSION_GET/SET
+ * PR_RSEQ_SLICE_EXT_ENABLE: Enable
+ */
+# define PR_RSEQ_SLICE_EXT_ENABLE	0x01
+
 #endif /* _LINUX_PRCTL_H */
-- 
cgit v1.2.3


From 99d2592023e5d0a31f5f5a83c694df48239a1e6c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 15 Dec 2025 17:52:15 +0100
Subject: rseq: Implement sys_rseq_slice_yield()

Provide a new syscall whose only purpose is to yield the CPU after the
kernel granted a time slice extension.

sched_yield() is not suitable for that because it unconditionally
schedules, but the end of the time slice extension is not required to
schedule when the task was already preempted.

This also allows a strict check for termination, to catch user space
invoking random syscalls, including sched_yield(), from a time slice
extension region.
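For illustration, a user-space wrapper for the new syscall might look as
follows. The syscall number is the asm-generic one from the unistd.h hunk
below; architectures with their own syscall tables assign their own
numbers, so real code should take __NR_rseq_slice_yield from the
architecture headers.

	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef __NR_rseq_slice_yield
	# define __NR_rseq_slice_yield	471	/* asm-generic number, see below */
	#endif

	/* Relinquish the CPU after a granted time slice extension */
	static inline long rseq_slice_yield(void)
	{
		return syscall(__NR_rseq_slice_yield);
	}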
Signed-off-by: Thomas Gleixner
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Mathieu Desnoyers
Acked-by: Arnd Bergmann
Link: https://patch.msgid.link/20251215155708.929634896@linutronix.de
---
 include/linux/rseq_types.h        | 2 ++
 include/linux/syscalls.h          | 1 +
 include/uapi/asm-generic/unistd.h | 5 ++++-
 3 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index 67e40c059b1b..8c540e775161 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -89,9 +89,11 @@ union rseq_slice_state {
 /**
  * struct rseq_slice - Status information for rseq time slice extension
  * @state:	Time slice extension state
+ * @yielded:	Indicator for rseq_slice_yield()
  */
 struct rseq_slice {
 	union rseq_slice_state	state;
+	u8			yielded;
 };
 
 /**

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index cf84d98964b2..6c8a570cf44a 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -961,6 +961,7 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
 			  unsigned mask, struct statx __user *buffer);
 asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len,
 			 int flags, uint32_t sig);
+asmlinkage long sys_rseq_slice_yield(void);
 asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags);
 asmlinkage long sys_open_tree_attr(int dfd, const char __user *path,
 				   unsigned flags,

diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 942370b3f5d2..a627acc8fb5f 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -860,8 +860,11 @@ __SYSCALL(__NR_file_setattr, sys_file_setattr)
 #define __NR_listns 470
 __SYSCALL(__NR_listns, sys_listns)
 
+#define __NR_rseq_slice_yield 471
+__SYSCALL(__NR_rseq_slice_yield, sys_rseq_slice_yield)
+
 #undef __NR_syscalls
-#define __NR_syscalls 471
+#define __NR_syscalls 472
 
 /*
  * 32 bit systems traditionally used different
-- 
cgit v1.2.3


From dd0a04606937af5810e9117d343ee3792635bd3d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 15 Dec 2025 17:52:19 +0100
Subject: rseq: Implement syscall entry work for time slice extensions

The kernel sets SYSCALL_WORK_RSEQ_SLICE when it grants a time slice
extension. This allows handling the rseq_slice_yield() syscall, which is
used by user space to relinquish the CPU after finishing the critical
section for which it requested an extension.

In case the kernel state is still GRANTED, the kernel resets both kernel
and user space state with a set of sanity checks. If the kernel state is
already cleared, then this raced against the timer or some other
interrupt and just clears the work bit.

Doing it in syscall entry work allows catching misbehaving user space,
which issues an arbitrary syscall, i.e. not rseq_slice_yield(), from the
critical section. Contrary to the initial strict requirement to use
rseq_slice_yield(), arbitrary syscalls are not considered a violation of
the ABI contract anymore, to allow onion architecture applications, which
cannot control the code inside a critical section, to utilize this as
well.

If the code detects inconsistent user space state, that results in a
SIGSEGV for the application.

If the grant was still active and the task was not preempted yet, the
work code reschedules immediately before continuing through the syscall.
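Putting the ABI pieces together, the intended user-space pattern is
roughly the sketch below. This is an illustration under assumptions, not
ABI documentation: rs stands for the thread's registered rseq area (with
the slice_ctrl member from this series), critical_section() is a
hypothetical placeholder, and the precise ordering and atomicity
requirements are those defined by the kernel side of this series.

	extern struct rseq *rs;			/* thread's registered rseq area (assumed) */
	extern void critical_section(void);	/* hypothetical work to protect */

	static void run_protected(void)
	{
		/* Ask the kernel to defer preemption across the critical section */
		__atomic_store_n(&rs->slice_ctrl.request, 1, __ATOMIC_RELAXED);

		critical_section();

		/* Done: withdraw the request ... */
		__atomic_store_n(&rs->slice_ctrl.request, 0, __ATOMIC_RELAXED);

		/* ... and hand the CPU back if an extension was actually granted */
		if (__atomic_load_n(&rs->slice_ctrl.granted, __ATOMIC_RELAXED))
			rseq_slice_yield();	/* wrapper from the earlier sketch */
	}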
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251215155709.005777059@linutronix.de --- include/linux/entry-common.h | 2 +- include/linux/rseq.h | 2 ++ include/linux/thread_info.h | 16 +++++++++------- 3 files changed, 12 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 87efb38b7081..026201a44aa2 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -36,8 +36,8 @@ SYSCALL_WORK_SYSCALL_EMU | \ SYSCALL_WORK_SYSCALL_AUDIT | \ SYSCALL_WORK_SYSCALL_USER_DISPATCH | \ + SYSCALL_WORK_SYSCALL_RSEQ_SLICE | \ ARCH_SYSCALL_WORK_ENTER) - #define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ SYSCALL_WORK_SYSCALL_TRACE | \ SYSCALL_WORK_SYSCALL_AUDIT | \ diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 3c194a02ad0a..7a01a0760405 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -164,8 +164,10 @@ static inline void rseq_syscall(struct pt_regs *regs) { } #endif /* !CONFIG_DEBUG_RSEQ */ #ifdef CONFIG_RSEQ_SLICE_EXTENSION +void rseq_syscall_enter_work(long syscall); int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3); #else /* CONFIG_RSEQ_SLICE_EXTENSION */ +static inline void rseq_syscall_enter_work(long syscall) { } static inline int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3) { return -ENOTSUPP; diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index b40de9bab4b7..051e42902690 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -46,15 +46,17 @@ enum syscall_work_bit { SYSCALL_WORK_BIT_SYSCALL_AUDIT, SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH, SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP, + SYSCALL_WORK_BIT_SYSCALL_RSEQ_SLICE, }; -#define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) -#define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT) -#define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE) -#define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU) -#define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT) -#define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH) -#define SYSCALL_WORK_SYSCALL_EXIT_TRAP BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP) +#define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) +#define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT) +#define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE) +#define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU) +#define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT) +#define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH) +#define SYSCALL_WORK_SYSCALL_EXIT_TRAP BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP) +#define SYSCALL_WORK_SYSCALL_RSEQ_SLICE BIT(SYSCALL_WORK_BIT_SYSCALL_RSEQ_SLICE) #endif #include -- cgit v1.2.3 From 0ac3b5c3dc45085b28a10ee730fb2860841f08ef Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 15 Dec 2025 17:52:22 +0100 Subject: rseq: Implement time slice extension enforcement timer If a time slice extension is granted and the reschedule delayed, the kernel has to ensure that user space cannot abuse the extension and exceed the maximum granted time. 
It was suggested to implement this via the existing hrtick() timer in the
scheduler, but that turned out to be problematic for several reasons:

  1) It creates a dependency on CONFIG_SCHED_HRTICK, which can be
     disabled independently of CONFIG_HIGHRES_TIMERS

  2) HRTICK usage in the scheduler can be runtime disabled or is only
     used for certain aspects of scheduling.

  3) The function is calling into the scheduler code and that might have
     unexpected consequences when this is invoked due to a time slice
     enforcement expiry. Especially when the task managed to clear the
     grant via sched_yield().

It would be possible to address #2 and #3 by storing state in the
scheduler, but that is extra complexity and fragility for no value.

Implement a dedicated per CPU hrtimer instead, which is solely used for
the purpose of time slice enforcement.

The timer is armed when an extension was granted right before actually
returning to user mode in rseq_exit_to_user_mode_restart(). It is
disarmed when the task relinquishes the CPU.

This is expensive as the timer is probably the first expiring timer on
the CPU, which means it has to reprogram the hardware. But that's less
expensive than going through a full hrtimer interrupt cycle for nothing.

Signed-off-by: Thomas Gleixner
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Mathieu Desnoyers
Link: https://patch.msgid.link/20251215155709.068329497@linutronix.de
---
 include/linux/rseq_entry.h | 38 ++++++++++++++++++++++++++++----------
 include/linux/rseq_types.h |  2 ++
 2 files changed, 30 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
index 54d8e338b26e..8d04611056aa 100644
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -87,8 +87,24 @@ static __always_inline bool rseq_slice_extension_enabled(void)
 {
 	return static_branch_likely(&rseq_slice_extension_key);
 }
+
+extern unsigned int rseq_slice_ext_nsecs;
+bool __rseq_arm_slice_extension_timer(void);
+
+static __always_inline bool rseq_arm_slice_extension_timer(void)
+{
+	if (!rseq_slice_extension_enabled())
+		return false;
+
+	if (likely(!current->rseq.slice.state.granted))
+		return false;
+
+	return __rseq_arm_slice_extension_timer();
+}
+
 #else /* CONFIG_RSEQ_SLICE_EXTENSION */
 static inline bool rseq_slice_extension_enabled(void) { return false; }
+static inline bool rseq_arm_slice_extension_timer(void) { return false; }
 #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */
 
 bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
@@ -543,17 +559,19 @@ static __always_inline void clear_tif_rseq(void) { }
 
 static __always_inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
 {
-	if (likely(!test_tif_rseq(ti_work)))
-		return false;
-
-	if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
-		current->rseq.event.slowpath = true;
-		set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
-		return true;
+	if (unlikely(test_tif_rseq(ti_work))) {
+		if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
+			current->rseq.event.slowpath = true;
+			set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
+			return true;
+		}
+		clear_tif_rseq();
 	}
-
-	clear_tif_rseq();
-	return false;
+	/*
+	 * Arm the slice extension timer if there is nothing to do anymore
+	 * and the task really goes out to user space.
+	 */
+	return rseq_arm_slice_extension_timer();
 }
 
 #else /* CONFIG_GENERIC_ENTRY */

diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index 8c540e775161..8a2e76c5d2a8 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -89,10 +89,12 @@ union rseq_slice_state {
 /**
  * struct rseq_slice - Status information for rseq time slice extension
  * @state:	Time slice extension state
+ * @expires:	The time when a grant expires
  * @yielded:	Indicator for rseq_slice_yield()
  */
 struct rseq_slice {
 	union rseq_slice_state	state;
+	u64			expires;
 	u8			yielded;
 };
-- 
cgit v1.2.3


From 7ee58f98b59b0ec32ea8a92f0bc85cb46fcd3de3 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 15 Dec 2025 17:52:26 +0100
Subject: rseq: Reset slice extension when scheduled

When a time slice extension was granted in the need_resched() check on
exit to user space, the task can still be scheduled out in one of the
other pending work items. When it gets scheduled back in and
need_resched() is not set, the stale grant would be preserved, which is
just wrong.

RSEQ already keeps track of that and sets TIF_RSEQ, which invokes the
critical section and ID update mechanisms. Utilize them and clear the
user space slice control member of struct rseq unconditionally within the
existing user access sections.

That's just one more unconditional store in that path.

Signed-off-by: Thomas Gleixner
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Mathieu Desnoyers
Link: https://patch.msgid.link/20251215155709.131081527@linutronix.de
---
 include/linux/rseq_entry.h | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
index 8d04611056aa..fc77b9dd0932 100644
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -102,9 +102,17 @@ static __always_inline bool rseq_arm_slice_extension_timer(void)
 	return __rseq_arm_slice_extension_timer();
 }
 
+static __always_inline void rseq_slice_clear_grant(struct task_struct *t)
+{
+	if (IS_ENABLED(CONFIG_RSEQ_STATS) && t->rseq.slice.state.granted)
+		rseq_stat_inc(rseq_stats.s_revoked);
+	t->rseq.slice.state.granted = false;
+}
+
 #else /* CONFIG_RSEQ_SLICE_EXTENSION */
 static inline bool rseq_slice_extension_enabled(void) { return false; }
 static inline bool rseq_arm_slice_extension_timer(void) { return false; }
+static inline void rseq_slice_clear_grant(struct task_struct *t) { }
 #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */
 
 bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
@@ -391,8 +399,15 @@ bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids,
 		unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault);
 		if (csaddr)
 			unsafe_get_user(*csaddr, &rseq->rseq_cs, efault);
+
+		/* Open coded, so it's in the same user access region */
+		if (rseq_slice_extension_enabled()) {
+			/* Unconditionally clear it, no point in conditionals */
+			unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
+		}
 	}
 
+	rseq_slice_clear_grant(t);
 	/* Cache the new values */
 	t->rseq.ids.cpu_cid = ids->cpu_cid;
 	rseq_stat_inc(rseq_stats.ids);
@@ -488,8 +503,17 @@ static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct t
 	 */
 	u64 csaddr;
 
-	if (unlikely(get_user_inline(csaddr, &rseq->rseq_cs)))
-		return false;
+	scoped_user_rw_access(rseq, efault) {
+		unsafe_get_user(csaddr, &rseq->rseq_cs, efault);
+
+		/* Open coded, so it's in the same user access region */
+		if (rseq_slice_extension_enabled()) {
+			/* Unconditionally clear it, no point in conditionals */
+			unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
+		}
+	}
+
+	rseq_slice_clear_grant(t);
 
 	if (static_branch_unlikely(&rseq_debug_enabled) || unlikely(csaddr)) {
 		if (unlikely(!rseq_update_user_cs(t, regs, csaddr)))
@@ -505,6 +529,8 @@ static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct t
 	u32 node_id = cpu_to_node(ids.cpu_id);
 
 	return rseq_update_usr(t, regs, &ids, node_id);
+efault:
+	return false;
 }
 
 static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *regs)
-- 
cgit v1.2.3


From dfb630f548a7c715efb0651c6abf334dca75cd52 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 15 Dec 2025 17:52:28 +0100
Subject: rseq: Implement rseq_grant_slice_extension()

Provide the actual decision function, which decides whether a time slice
extension is granted in the exit to user mode path when NEED_RESCHED is
evaluated.

The decision is made in two stages. First an inline quick check to avoid
going into the actual decision function. This checks whether:

 #1 the functionality is enabled

 #2 the exit is a return from interrupt to user mode

 #3 any TIF bit which causes extra work is set. That includes TIF_RSEQ,
    which means the task was already scheduled out.

The slow path, which implements the actual user space ABI, is invoked
when:

 A) #1 is true, #2 is true and #3 is false

    It checks whether user space requested a slice extension by setting
    the request bit in the rseq slice_ctrl field. If so, it grants the
    extension and stores the slice expiry time, so that the actual exit
    code can double check whether the slice is already exhausted before
    going back.

 B) #1 - #3 are true _and_ a slice extension was granted in a previous
    loop iteration

    In this case the grant is revoked.

If the user space access faults or invalid state is detected, the task is
terminated with SIGSEGV.

Signed-off-by: Thomas Gleixner
Signed-off-by: Peter Zijlstra (Intel)
Link: https://patch.msgid.link/20251215155709.195303303@linutronix.de
---
 include/linux/rseq_entry.h | 108 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 108 insertions(+)

(limited to 'include')

diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
index fc77b9dd0932..cbc4a791618b 100644
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -42,6 +42,7 @@ DECLARE_PER_CPU(struct rseq_stats, rseq_stats);
 #ifdef CONFIG_RSEQ
 #include 
 #include 
+#include 
 #include 
 #include 
@@ -109,10 +110,116 @@ static __always_inline void rseq_slice_clear_grant(struct task_struct *t)
 	t->rseq.slice.state.granted = false;
 }
 
+static __always_inline bool rseq_grant_slice_extension(bool work_pending)
+{
+	struct task_struct *curr = current;
+	struct rseq_slice_ctrl usr_ctrl;
+	union rseq_slice_state state;
+	struct rseq __user *rseq;
+
+	if (!rseq_slice_extension_enabled())
+		return false;
+
+	/* If not enabled or not a return from interrupt, nothing to do. */
+	state = curr->rseq.slice.state;
+	state.enabled &= curr->rseq.event.user_irq;
+	if (likely(!state.state))
+		return false;
+
+	rseq = curr->rseq.usrptr;
+	scoped_user_rw_access(rseq, efault) {
+
+		/*
+		 * Quick check conditions where a grant is not possible or
+		 * needs to be revoked.
+		 *
+		 * 1) Any TIF bit which needs to do extra work aside from
+		 *    rescheduling prevents a grant.
+		 *
+		 * 2) A previous rescheduling request resulted in a slice
+		 *    extension grant.
+		 */
+		if (unlikely(work_pending || state.granted)) {
+			/* Clear user control unconditionally. No point in checking. */
+			unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
+			rseq_slice_clear_grant(curr);
+			return false;
+		}
+
+		unsafe_get_user(usr_ctrl.all, &rseq->slice_ctrl.all, efault);
+		if (likely(!(usr_ctrl.request)))
+			return false;
+
+		/* Grant the slice extension */
+		usr_ctrl.request = 0;
+		usr_ctrl.granted = 1;
+		unsafe_put_user(usr_ctrl.all, &rseq->slice_ctrl.all, efault);
+	}
+
+	rseq_stat_inc(rseq_stats.s_granted);
+
+	curr->rseq.slice.state.granted = true;
+	/* Store expiry time for arming the timer on the way out */
+	curr->rseq.slice.expires = data_race(rseq_slice_ext_nsecs) + ktime_get_mono_fast_ns();
+	/*
+	 * This is racy against a remote CPU setting TIF_NEED_RESCHED in
+	 * several ways:
+	 *
+	 * 1)
+	 *	CPU0				CPU1
+	 *	clear_tsk()
+	 *					set_tsk()
+	 *	clear_preempt()
+	 *					Raise scheduler IPI on CPU0
+	 *	--> IPI
+	 *	    fold_need_resched()		-> Folds correctly
+	 * 2)
+	 *	CPU0				CPU1
+	 *					set_tsk()
+	 *	clear_tsk()
+	 *	clear_preempt()
+	 *					Raise scheduler IPI on CPU0
+	 *	--> IPI
+	 *	    fold_need_resched()		<- NOOP as TIF_NEED_RESCHED is false
+	 *
+	 * #1 is not any different from a regular remote reschedule as it
+	 * sets the previously not set bit and then raises the IPI which
+	 * folds it into the preempt counter.
+	 *
+	 * #2 is obviously incorrect from a scheduler POV, but it's not any
+	 * more incorrect than the code below clearing the reschedule
+	 * request with the safety net of the timer.
+	 *
+	 * The important part is that the clearing is protected against the
+	 * scheduler IPI and also against any other interrupt which might
+	 * end up waking up a task and setting the bits in the middle of
+	 * the operation:
+	 *
+	 *	clear_tsk()
+	 *	---> Interrupt
+	 *		wakeup_on_this_cpu()
+	 *		set_tsk()
+	 *		set_preempt()
+	 *	clear_preempt()
+	 *
+	 * which would be inconsistent state.
	 */
+	scoped_guard(irq) {
+		clear_tsk_need_resched(curr);
+		clear_preempt_need_resched();
+	}
+	return true;
+
+efault:
+	force_sig(SIGSEGV);
+	return false;
+}
+
 #else /* CONFIG_RSEQ_SLICE_EXTENSION */
 static inline bool rseq_slice_extension_enabled(void) { return false; }
 static inline bool rseq_arm_slice_extension_timer(void) { return false; }
 static inline void rseq_slice_clear_grant(struct task_struct *t) { }
+static inline bool rseq_grant_slice_extension(bool work_pending) { return false; }
 #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */
 
 bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
@@ -671,6 +778,7 @@ static inline void rseq_syscall_exit_to_user_mode(void) { }
 static inline void rseq_irqentry_exit_to_user_mode(void) { }
 static inline void rseq_exit_to_user_mode_legacy(void) { }
 static inline void rseq_debug_syscall_return(struct pt_regs *regs) { }
+static inline bool rseq_grant_slice_extension(bool work_pending) { return false; }
 #endif /* !CONFIG_RSEQ */
 
 #endif /* _LINUX_RSEQ_ENTRY_H */
-- 
cgit v1.2.3


From d6200245c75e832af2087bc60ba2e6641a90eee9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 19 Jan 2026 11:23:57 +0100
Subject: rseq: Allow registering RSEQ with slice extension

Since glibc cares about the number of syscalls required to initialize a
new thread, allow initializing rseq with slice extension on. This avoids
having to do another prctl().
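A sketch of what this single-syscall initialization might look like from a
user-space runtime — assuming the RSEQ_FLAG_SLICE_EXT_DEFAULT_ON flag from
the hunk below, a uapi header with this series applied, and the 32-byte
alignment struct rseq requires; whether sizeof(struct rseq) is an
acceptable length depends on the registered feature set:

	#include <linux/rseq.h>		/* struct rseq, RSEQ_* flags (assumed updated) */
	#include <stdint.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static __thread struct rseq rs __attribute__((aligned(4 * sizeof(uint64_t))));

	static int rseq_register_with_slice_ext(uint32_t sig)
	{
		/* One syscall: register the rseq area and enable slice extensions */
		return syscall(__NR_rseq, &rs, sizeof(rs),
			       RSEQ_FLAG_SLICE_EXT_DEFAULT_ON, sig);
	}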
Requested-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260121143207.814193010@infradead.org --- include/uapi/linux/rseq.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h index 6afc219d1545..863c4a00a66b 100644 --- a/include/uapi/linux/rseq.h +++ b/include/uapi/linux/rseq.h @@ -19,7 +19,8 @@ enum rseq_cpu_id_state { }; enum rseq_flags { - RSEQ_FLAG_UNREGISTER = (1 << 0), + RSEQ_FLAG_UNREGISTER = (1 << 0), + RSEQ_FLAG_SLICE_EXT_DEFAULT_ON = (1 << 1), }; enum rseq_cs_flags_bit { -- cgit v1.2.3 From 377521af0341083bc9d196cf021ec7265dc47c20 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Jan 2026 09:52:23 +0100 Subject: sched: remove task_struct->faults_disabled_mapping This reverts commit 2b69987be575 ("sched: Add task_struct->faults_disabled_mapping"), which added this field without review or maintainer signoff. With bcachefs removed from the tree it is also unused now. Signed-off-by: Christoph Hellwig Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260122085223.487092-1-hch@lst.de --- include/linux/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index bf96a7d595e2..71654e33bdbe 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -955,7 +955,6 @@ struct task_struct { struct mm_struct *mm; struct mm_struct *active_mm; - struct address_space *faults_disabled_mapping; int exit_state; int exit_code; -- cgit v1.2.3 From 03150a9f84b328f5c724b8ed9ff8600c2d7e2d7b Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Wed, 28 Jan 2026 11:19:21 +0800 Subject: entry: Remove unused syscall argument from syscall_trace_enter() The 'syscall' argument of syscall_trace_enter() is immediately overwritten before any real use and serves only as a local variable, so drop the parameter. No functional change intended. Signed-off-by: Jinjie Ruan Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260128031934.3906955-2-ruanjinjie@huawei.com --- include/linux/entry-common.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 87efb38b7081..e4a8287af822 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -45,7 +45,7 @@ SYSCALL_WORK_SYSCALL_EXIT_TRAP | \ ARCH_SYSCALL_WORK_EXIT) -long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work); +long syscall_trace_enter(struct pt_regs *regs, unsigned long work); /** * syscall_enter_from_user_mode_work - Check and handle work before invoking @@ -75,7 +75,7 @@ static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *re unsigned long work = READ_ONCE(current_thread_info()->syscall_work); if (work & SYSCALL_WORK_ENTER) - syscall = syscall_trace_enter(regs, syscall, work); + syscall = syscall_trace_enter(regs, work); return syscall; } -- cgit v1.2.3 From e1647100c22eb718e9833211722cbb78e339047c Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Wed, 28 Jan 2026 11:19:29 +0800 Subject: entry: Rework syscall_exit_to_user_mode_work() for architecture reuse syscall_exit_to_user_mode_work() invokes local_irq_disable_exit_to_user() and syscall_exit_to_user_mode_prepare() after handling pending syscall exit work. 
The conversion of ARM64 to the generic entry code requires this to be split up, so move the invocations of local_irq_disable_exit_to_user() and syscall_exit_to_user_mode_prepare() into the only caller. No functional change intended. [ tglx: Massaged changelog and comments ] Signed-off-by: Jinjie Ruan Signed-off-by: Thomas Gleixner Reviewed-by: Kevin Brodsky Reviewed-by: Thomas Gleixner Link: https://patch.msgid.link/20260128031934.3906955-10-ruanjinjie@huawei.com --- include/linux/entry-common.h | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index e4a8287af822..5316004940c0 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -122,17 +122,12 @@ static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, l void syscall_exit_work(struct pt_regs *regs, unsigned long work); /** - * syscall_exit_to_user_mode_work - Handle work before returning to user mode + * syscall_exit_to_user_mode_work - Handle one time work before returning to user mode * @regs: Pointer to currents pt_regs * - * Same as step 1 and 2 of syscall_exit_to_user_mode() but without calling - * exit_to_user_mode() to perform the final transition to user mode. + * Step 1 of syscall_exit_to_user_mode() with the same calling convention. * - * Calling convention is the same as for syscall_exit_to_user_mode() and it - * returns with all work handled and interrupts disabled. The caller must - * invoke exit_to_user_mode() before actually switching to user mode to - * make the final state transitions. Interrupts must stay disabled between - * return from this function and the invocation of exit_to_user_mode(). + * The caller must invoke steps 2-3 of syscall_exit_to_user_mode() afterwards. */ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs) { @@ -155,15 +150,13 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs) */ if (unlikely(work & SYSCALL_WORK_EXIT)) syscall_exit_work(regs, work); - local_irq_disable_exit_to_user(); - syscall_exit_to_user_mode_prepare(regs); } /** * syscall_exit_to_user_mode - Handle work before returning to user mode * @regs: Pointer to currents pt_regs * - * Invoked with interrupts enabled and fully valid regs. Returns with all + * Invoked with interrupts enabled and fully valid @regs. Returns with all * work handled, interrupts disabled such that the caller can immediately * switch to user mode. Called from architecture specific syscall and ret * from fork code. @@ -176,6 +169,7 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs) * - ptrace (single stepping) * * 2) Preparatory work + * - Disable interrupts * - Exit to user mode loop (common TIF handling). Invokes * arch_exit_to_user_mode_work() for architecture specific TIF work * - Architecture specific one time work arch_exit_to_user_mode_prepare() @@ -184,14 +178,17 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs) * 3) Final transition (lockdep, tracing, context tracking, RCU), i.e. the * functionality in exit_to_user_mode(). * - * This is a combination of syscall_exit_to_user_mode_work() (1,2) and - * exit_to_user_mode(). This function is preferred unless there is a - * compelling architectural reason to use the separate functions. 
+ * This is a combination of syscall_exit_to_user_mode_work() (1), disabling
+ * interrupts followed by syscall_exit_to_user_mode_prepare() (2) and
+ * exit_to_user_mode() (3). This function is preferred unless there is a
+ * compelling architectural reason to invoke the functions separately.
  */
 static __always_inline void syscall_exit_to_user_mode(struct pt_regs *regs)
 {
 	instrumentation_begin();
 	syscall_exit_to_user_mode_work(regs);
+	local_irq_disable_exit_to_user();
+	syscall_exit_to_user_mode_prepare(regs);
 	instrumentation_end();
 	exit_to_user_mode();
 }
-- 
cgit v1.2.3


From 578b21fd3ab2d9901ce40ed802e428a41a40610d Mon Sep 17 00:00:00 2001
From: Jinjie Ruan
Date: Wed, 28 Jan 2026 11:19:30 +0800
Subject: entry: Add arch_ptrace_report_syscall_entry/exit()

ARM64 requires an architecture specific ptrace wrapper as it needs to
save and restore scratch registers.

Provide arch_ptrace_report_syscall_entry/exit() wrappers which fall back
to ptrace_report_syscall_entry/exit() if the architecture does not
provide them.

No functional change intended.

[ tglx: Massaged changelog and comments ]

Suggested-by: Mark Rutland
Suggested-by: Thomas Gleixner
Signed-off-by: Jinjie Ruan
Signed-off-by: Thomas Gleixner
Reviewed-by: Kevin Brodsky
Link: https://patch.msgid.link/20260128031934.3906955-11-ruanjinjie@huawei.com
---
 include/linux/entry-common.h | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

(limited to 'include')

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 5316004940c0..bea207e32c58 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -45,6 +45,24 @@
 			 SYSCALL_WORK_SYSCALL_EXIT_TRAP |		\
 			 ARCH_SYSCALL_WORK_EXIT)
 
+/**
+ * arch_ptrace_report_syscall_entry - Architecture specific ptrace_report_syscall_entry() wrapper
+ *
+ * Invoked from syscall_trace_enter() to wrap ptrace_report_syscall_entry().
+ *
+ * This allows architecture specific ptrace_report_syscall_entry()
+ * implementations. If not defined by the architecture this falls back
+ * to ptrace_report_syscall_entry().
+ */
+static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs);
+
+#ifndef arch_ptrace_report_syscall_entry
+static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs)
+{
+	return ptrace_report_syscall_entry(regs);
+}
+#endif
+
 long syscall_trace_enter(struct pt_regs *regs, unsigned long work);
 
 /**
@@ -112,6 +130,24 @@ static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, l
 	return ret;
 }
 
+/**
+ * arch_ptrace_report_syscall_exit - Architecture specific ptrace_report_syscall_exit()
+ *
+ * This allows architecture specific ptrace_report_syscall_exit()
+ * implementations. If not defined by the architecture this falls back
+ * to ptrace_report_syscall_exit().
+ */
+static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
+							     int step);
+
+#ifndef arch_ptrace_report_syscall_exit
+static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
+							     int step)
+{
+	ptrace_report_syscall_exit(regs, step);
+}
+#endif
+
 /**
  * syscall_exit_work - Handle work before returning to user mode
  * @regs: Pointer to current pt_regs
-- 
cgit v1.2.3


From 31c9387d0d84bc1d643a0c30155b6d92d05c92fc Mon Sep 17 00:00:00 2001
From: Jinjie Ruan
Date: Wed, 28 Jan 2026 11:19:33 +0800
Subject: entry: Inline syscall_exit_work() and syscall_trace_enter()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After switching ARM64 to the generic entry code, syscall_exit_work()
appeared as a profiling hotspot because it is not inlined. Inlining both
syscall_trace_enter() and syscall_exit_work() provides a performance gain
when any of the work items is enabled.

With audit enabled this results in a ~4% performance gain for perf bench
basic syscall on a kunpeng920 system:

| Metric     | Baseline    | Inlined     | Change |
| ---------- | ----------- | ----------- | ------ |
| Total time | 2.353 [sec] | 2.264 [sec] | ↓3.8%  |
| usecs/op   | 0.235374    | 0.226472    | ↓3.8%  |
| ops/sec    | 4,248,588   | 4,415,554   | ↑3.9%  |

Small gains can be observed on x86 as well, though the generated code
optimizes for the work case, which is counterproductive for high
performance scenarios where such entry/exit work is usually avoided.

Avoid this by marking the work check in syscall_enter_from_user_mode_work()
unlikely, which is what the corresponding check in the exit path does
already.

[ tglx: Massage changelog and add the unlikely() ]

Signed-off-by: Jinjie Ruan
Signed-off-by: Thomas Gleixner
Link: https://patch.msgid.link/20260128031934.3906955-14-ruanjinjie@huawei.com
---
 include/linux/entry-common.h | 94 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 92 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index bea207e32c58..e67e3afe39ad 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -2,6 +2,7 @@
 #ifndef __LINUX_ENTRYCOMMON_H
 #define __LINUX_ENTRYCOMMON_H
 
+#include 
 #include 
 #include 
 #include 
@@ -63,7 +64,58 @@ static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs
 }
 #endif
 
-long syscall_trace_enter(struct pt_regs *regs, unsigned long work);
+bool syscall_user_dispatch(struct pt_regs *regs);
+long trace_syscall_enter(struct pt_regs *regs, long syscall);
+void trace_syscall_exit(struct pt_regs *regs, long ret);
+
+static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
+{
+	if (unlikely(audit_context())) {
+		unsigned long args[6];
+
+		syscall_get_arguments(current, regs, args);
+		audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
+	}
+}
+
+static __always_inline long syscall_trace_enter(struct pt_regs *regs, unsigned long work)
+{
+	long syscall, ret = 0;
+
+	/*
+	 * Handle Syscall User Dispatch. This must come first, since
+	 * the ABI here can be something that doesn't make sense for
+	 * other syscall_work features.
+	 */
+	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
+		if (syscall_user_dispatch(regs))
+			return -1L;
+	}
+
+	/* Handle ptrace */
+	if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
+		ret = arch_ptrace_report_syscall_entry(regs);
+		if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
+			return -1L;
+	}
+
+	/* Do seccomp after ptrace, to catch any tracer changes. */
+	if (work & SYSCALL_WORK_SECCOMP) {
+		ret = __secure_computing();
+		if (ret == -1L)
+			return ret;
+	}
+
+	/* Either of the above might have changed the syscall number */
+	syscall = syscall_get_nr(current, regs);
+
+	if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
+		syscall = trace_syscall_enter(regs, syscall);
+
+	syscall_enter_audit(regs, syscall);
+
+	return ret ? : syscall;
+}
 
 /**
  * syscall_enter_from_user_mode_work - Check and handle work before invoking
@@ -112,6 +182,19 @@ static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, l
 	return ret;
 }
 
+/*
+ * If SYSCALL_EMU is set, then the only reason to report is when
+ * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
+ * instruction has already been reported in syscall_enter_from_user_mode().
+ */
+static __always_inline bool report_single_step(unsigned long work)
+{
+	if (work & SYSCALL_WORK_SYSCALL_EMU)
+		return false;
+
+	return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
+}
+
 /**
  * arch_ptrace_report_syscall_exit - Architecture specific ptrace_report_syscall_exit()
  *
@@ -155,7 +220,32 @@ static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs
  *
  * Do one-time syscall specific work.
  */
-void syscall_exit_work(struct pt_regs *regs, unsigned long work);
+static __always_inline void syscall_exit_work(struct pt_regs *regs, unsigned long work)
+{
+	bool step;
+
+	/*
+	 * If the syscall was rolled back due to syscall user dispatching,
+	 * then the tracers below are not invoked for the same reason as
+	 * the entry side was not invoked in syscall_trace_enter(): The ABI
+	 * of these syscalls is unknown.
+	 */
+	if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
+		if (unlikely(current->syscall_dispatch.on_dispatch)) {
+			current->syscall_dispatch.on_dispatch = false;
+			return;
+		}
+	}
+
+	audit_syscall_exit(regs);
+
+	if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
+		trace_syscall_exit(regs, syscall_get_return_value(current, regs));
+
+	step = report_single_step(work);
+	if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
+		arch_ptrace_report_syscall_exit(regs, step);
+}
 
 /**
  * syscall_exit_to_user_mode_work - Handle one time work before returning to user mode
-- 
cgit v1.2.3
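Finally, for orientation, this is roughly how the inlined helpers above
compose in an architecture's syscall path. The sketch is hypothetical arch
code: invoke_syscall() stands in for the architecture's dispatch and is
not a generic-entry API.

	/* Hypothetical architecture syscall handler using the generic entry helpers */
	static void arch_syscall_handler(struct pt_regs *regs, long nr)
	{
		/* Runs the now-inlined entry work (SUD, ptrace, seccomp, audit) if enabled */
		nr = syscall_enter_from_user_mode(regs, nr);
		if (nr != -1L)
			invoke_syscall(regs, nr);	/* arch specific dispatch (assumed) */
		/* Runs exit work and the final return-to-user transition */
		syscall_exit_to_user_mode(regs);
	}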