Diffstat (limited to 'include')
-rw-r--r--   include/linux/compiler_types.h      |  19
-rw-r--r--   include/linux/entry-common.h        | 167
-rw-r--r--   include/linux/rseq.h                |  11
-rw-r--r--   include/linux/rseq_entry.h          | 192
-rw-r--r--   include/linux/rseq_types.h          |  32
-rw-r--r--   include/linux/sched.h               |  14
-rw-r--r--   include/linux/syscalls.h            |   1
-rw-r--r--   include/linux/thread_info.h         |  16
-rw-r--r--   include/uapi/asm-generic/unistd.h   |   5
-rw-r--r--   include/uapi/linux/prctl.h          |  10
-rw-r--r--   include/uapi/linux/rseq.h           |  41
11 files changed, 458 insertions, 50 deletions
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index ea96a4466d82..26d322d43224 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -620,6 +620,25 @@ struct ftrace_likely_data {
         __scalar_type_to_expr_cases(long long),     \
         default: (x)))
 
+/*
+ * __signed_scalar_typeof(x) - Declare a signed scalar type, leaving
+ * non-scalar types unchanged.
+ */
+
+#define __scalar_type_to_signed_cases(type)         \
+        unsigned type:  (signed type)0,             \
+        signed type:    (signed type)0
+
+#define __signed_scalar_typeof(x) typeof(           \
+    _Generic((x),                                   \
+         char:  (signed char)0,                     \
+         __scalar_type_to_signed_cases(char),       \
+         __scalar_type_to_signed_cases(short),      \
+         __scalar_type_to_signed_cases(int),        \
+         __scalar_type_to_signed_cases(long),       \
+         __scalar_type_to_signed_cases(long long),  \
+         default: (x)))
+
 /* Is this type a native word size -- useful for atomic operations */
 #define __native_word(t) \
     (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || \
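As a quick stand-alone illustration of what the new macro resolves to: the two macro definitions below are copied from the hunk above, while the small test program around them is only an assumed usage, not part of the patch. It builds with gcc or clang, where typeof() is available as a GNU extension or as of C23.

#include <stdio.h>

/* Copied from the hunk above */
#define __scalar_type_to_signed_cases(type) \
        unsigned type: (signed type)0, \
        signed type: (signed type)0

#define __signed_scalar_typeof(x) typeof( \
    _Generic((x), \
         char: (signed char)0, \
         __scalar_type_to_signed_cases(char), \
         __scalar_type_to_signed_cases(short), \
         __scalar_type_to_signed_cases(int), \
         __scalar_type_to_signed_cases(long), \
         __scalar_type_to_signed_cases(long long), \
         default: (x)))

int main(void)
{
    unsigned long long a = 1, b = 2;
    /* delta has type 'long long' here, so on the usual two's-complement
     * targets the wrapped difference reads back as -1.
     */
    __signed_scalar_typeof(a) delta = a - b;

    printf("%lld\n", (long long)delta);
    return 0;
}

For a scalar operand the _Generic() selection yields the matching signed type; non-scalar types fall through the default: branch unchanged.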
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 87efb38b7081..f83ca0abf2cd 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -2,6 +2,7 @@
 #ifndef __LINUX_ENTRYCOMMON_H
 #define __LINUX_ENTRYCOMMON_H
 
+#include <linux/audit.h>
 #include <linux/irq-entry-common.h>
 #include <linux/livepatch.h>
 #include <linux/ptrace.h>
@@ -36,8 +37,8 @@
                  SYSCALL_WORK_SYSCALL_EMU |             \
                  SYSCALL_WORK_SYSCALL_AUDIT |           \
                  SYSCALL_WORK_SYSCALL_USER_DISPATCH |   \
+                 SYSCALL_WORK_SYSCALL_RSEQ_SLICE |      \
                  ARCH_SYSCALL_WORK_ENTER)
-
 #define SYSCALL_WORK_EXIT   (SYSCALL_WORK_SYSCALL_TRACEPOINT |  \
                  SYSCALL_WORK_SYSCALL_TRACE |           \
                  SYSCALL_WORK_SYSCALL_AUDIT |           \
@@ -45,7 +46,84 @@
                  SYSCALL_WORK_SYSCALL_EXIT_TRAP |       \
                  ARCH_SYSCALL_WORK_EXIT)
 
-long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work);
+/**
+ * arch_ptrace_report_syscall_entry - Architecture specific ptrace_report_syscall_entry() wrapper
+ *
+ * Invoked from syscall_trace_enter() to wrap ptrace_report_syscall_entry().
+ *
+ * This allows architecture specific ptrace_report_syscall_entry()
+ * implementations. If not defined by the architecture this falls back
+ * to ptrace_report_syscall_entry().
+ */
+static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs);
+
+#ifndef arch_ptrace_report_syscall_entry
+static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs)
+{
+    return ptrace_report_syscall_entry(regs);
+}
+#endif
+
+bool syscall_user_dispatch(struct pt_regs *regs);
+long trace_syscall_enter(struct pt_regs *regs, long syscall);
+void trace_syscall_exit(struct pt_regs *regs, long ret);
+
+static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
+{
+    if (unlikely(audit_context())) {
+        unsigned long args[6];
+
+        syscall_get_arguments(current, regs, args);
+        audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
+    }
+}
+
+static __always_inline long syscall_trace_enter(struct pt_regs *regs, unsigned long work)
+{
+    long syscall, ret = 0;
+
+    /*
+     * Handle Syscall User Dispatch. This must come first, since
+     * the ABI here can be something that doesn't make sense for
+     * other syscall_work features.
+     */
+    if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
+        if (syscall_user_dispatch(regs))
+            return -1L;
+    }
+
+    /*
+     * User space got a time slice extension granted and relinquishes
+     * the CPU. The work stops the slice timer to avoid an extra round
+     * through hrtimer_interrupt().
+     */
+    if (work & SYSCALL_WORK_SYSCALL_RSEQ_SLICE)
+        rseq_syscall_enter_work(syscall_get_nr(current, regs));
+
+    /* Handle ptrace */
+    if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
+        ret = arch_ptrace_report_syscall_entry(regs);
+        if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
+            return -1L;
+    }
+
+    /* Do seccomp after ptrace, to catch any tracer changes. */
+    if (work & SYSCALL_WORK_SECCOMP) {
+        ret = __secure_computing();
+        if (ret == -1L)
+            return ret;
+    }
+
+    /* Either of the above might have changed the syscall number */
+    syscall = syscall_get_nr(current, regs);
+
+    if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
+        syscall = trace_syscall_enter(regs, syscall);
+
+    syscall_enter_audit(regs, syscall);
+
+    return ret ? : syscall;
+}
 
 /**
  * syscall_enter_from_user_mode_work - Check and handle work before invoking
@@ -75,7 +153,7 @@ static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *re
     unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
 
     if (work & SYSCALL_WORK_ENTER)
-        syscall = syscall_trace_enter(regs, syscall, work);
+        syscall = syscall_trace_enter(regs, work);
 
     return syscall;
 }
@@ -112,6 +190,37 @@ static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, l
     return ret;
 }
 
+/*
+ * If SYSCALL_EMU is set, then the only reason to report is when
+ * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
+ * instruction has already been reported in syscall_enter_from_user_mode().
+ */
+static __always_inline bool report_single_step(unsigned long work)
+{
+    if (work & SYSCALL_WORK_SYSCALL_EMU)
+        return false;
+
+    return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
+}
+
+/**
+ * arch_ptrace_report_syscall_exit - Architecture specific ptrace_report_syscall_exit()
+ *
+ * This allows architecture specific ptrace_report_syscall_exit()
+ * implementations. If not defined by the architecture this falls back
+ * to ptrace_report_syscall_exit().
+ */
+static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
+                                 int step);
+
+#ifndef arch_ptrace_report_syscall_exit
+static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
+                                 int step)
+{
+    ptrace_report_syscall_exit(regs, step);
+}
+#endif
+
 /**
  * syscall_exit_work - Handle work before returning to user mode
  * @regs:   Pointer to current pt_regs
@@ -119,20 +228,40 @@ static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, l
  *
  * Do one-time syscall specific work.
  */
-void syscall_exit_work(struct pt_regs *regs, unsigned long work);
+static __always_inline void syscall_exit_work(struct pt_regs *regs, unsigned long work)
+{
+    bool step;
+
+    /*
+     * If the syscall was rolled back due to syscall user dispatching,
+     * then the tracers below are not invoked for the same reason as
+     * the entry side was not invoked in syscall_trace_enter(): The ABI
+     * of these syscalls is unknown.
+     */
+    if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
+        if (unlikely(current->syscall_dispatch.on_dispatch)) {
+            current->syscall_dispatch.on_dispatch = false;
+            return;
+        }
+    }
+
+    audit_syscall_exit(regs);
+
+    if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
+        trace_syscall_exit(regs, syscall_get_return_value(current, regs));
+
+    step = report_single_step(work);
+    if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
+        arch_ptrace_report_syscall_exit(regs, step);
+}
 
 /**
- * syscall_exit_to_user_mode_work - Handle work before returning to user mode
+ * syscall_exit_to_user_mode_work - Handle one-time work before returning to user mode
  * @regs:   Pointer to currents pt_regs
  *
- * Same as step 1 and 2 of syscall_exit_to_user_mode() but without calling
- * exit_to_user_mode() to perform the final transition to user mode.
+ * Step 1 of syscall_exit_to_user_mode() with the same calling convention.
  *
- * Calling convention is the same as for syscall_exit_to_user_mode() and it
- * returns with all work handled and interrupts disabled. The caller must
- * invoke exit_to_user_mode() before actually switching to user mode to
- * make the final state transitions. Interrupts must stay disabled between
- * return from this function and the invocation of exit_to_user_mode().
+ * The caller must invoke steps 2-3 of syscall_exit_to_user_mode() afterwards.
  */
 static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
 {
@@ -155,15 +284,13 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
      */
     if (unlikely(work & SYSCALL_WORK_EXIT))
         syscall_exit_work(regs, work);
-    local_irq_disable_exit_to_user();
-    syscall_exit_to_user_mode_prepare(regs);
 }
 
 /**
  * syscall_exit_to_user_mode - Handle work before returning to user mode
  * @regs:   Pointer to currents pt_regs
  *
- * Invoked with interrupts enabled and fully valid regs. Returns with all
+ * Invoked with interrupts enabled and fully valid @regs. Returns with all
  * work handled, interrupts disabled such that the caller can immediately
  * switch to user mode. Called from architecture specific syscall and ret
  * from fork code.
@@ -176,6 +303,7 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
  *  - ptrace (single stepping)
  *
  * 2) Preparatory work
+ *    - Disable interrupts
  *  - Exit to user mode loop (common TIF handling). Invokes
  *    arch_exit_to_user_mode_work() for architecture specific TIF work
  *  - Architecture specific one time work arch_exit_to_user_mode_prepare()
@@ -184,14 +312,17 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
 * 3) Final transition (lockdep, tracing, context tracking, RCU), i.e. the
 *    functionality in exit_to_user_mode().
 *
- * This is a combination of syscall_exit_to_user_mode_work() (1,2) and
- * exit_to_user_mode(). This function is preferred unless there is a
- * compelling architectural reason to use the separate functions.
+ * This is a combination of syscall_exit_to_user_mode_work() (1), disabling
+ * interrupts followed by syscall_exit_to_user_mode_prepare() (2) and
+ * exit_to_user_mode() (3). This function is preferred unless there is a
+ * compelling architectural reason to invoke the functions separately.
 */
 static __always_inline void syscall_exit_to_user_mode(struct pt_regs *regs)
 {
     instrumentation_begin();
     syscall_exit_to_user_mode_work(regs);
+    local_irq_disable_exit_to_user();
+    syscall_exit_to_user_mode_prepare(regs);
     instrumentation_end();
     exit_to_user_mode();
 }
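Both arch_ptrace_report_syscall_*() hooks fall back to the generic ptrace reporting unless an architecture defines its own wrapper. Purely as a hypothetical sketch, an override in an architecture's asm/entry-common.h could look like the snippet below; arch_sanitize_syscall_regs() is a made-up placeholder, and only the define-to-override pattern is taken from the header above.

/* Hypothetical arch/<arch>/include/asm/entry-common.h snippet */
static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs)
{
    /* Made-up helper: whatever register massaging the architecture
     * needs before the generic ptrace report runs.
     */
    arch_sanitize_syscall_regs(regs);
    return ptrace_report_syscall_entry(regs);
}
#define arch_ptrace_report_syscall_entry arch_ptrace_report_syscall_entry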
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index 2266f4dc77b6..7a01a0760405 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -163,4 +163,15 @@ void rseq_syscall(struct pt_regs *regs);
 static inline void rseq_syscall(struct pt_regs *regs) { }
 #endif /* !CONFIG_DEBUG_RSEQ */
 
+#ifdef CONFIG_RSEQ_SLICE_EXTENSION
+void rseq_syscall_enter_work(long syscall);
+int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3);
+#else /* CONFIG_RSEQ_SLICE_EXTENSION */
+static inline void rseq_syscall_enter_work(long syscall) { }
+static inline int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
+{
+    return -ENOTSUPP;
+}
+#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */
+
 #endif /* _LINUX_RSEQ_H */
diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
index a36b472627de..cbc4a791618b 100644
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -15,6 +15,11 @@ struct rseq_stats {
     unsigned long   cs;
     unsigned long   clear;
     unsigned long   fixup;
+    unsigned long   s_granted;
+    unsigned long   s_expired;
+    unsigned long   s_revoked;
+    unsigned long   s_yielded;
+    unsigned long   s_aborted;
 };
 
 DECLARE_PER_CPU(struct rseq_stats, rseq_stats);
@@ -37,6 +42,7 @@ DECLARE_PER_CPU(struct rseq_stats, rseq_stats);
 #ifdef CONFIG_RSEQ
 #include <linux/jump_label.h>
 #include <linux/rseq.h>
+#include <linux/sched/signal.h>
 #include <linux/uaccess.h>
 
 #include <linux/tracepoint-defs.h>
@@ -75,6 +81,147 @@ DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
 #define rseq_inline __always_inline
 #endif
 
+#ifdef CONFIG_RSEQ_SLICE_EXTENSION
+DECLARE_STATIC_KEY_TRUE(rseq_slice_extension_key);
+
+static __always_inline bool rseq_slice_extension_enabled(void)
+{
+    return static_branch_likely(&rseq_slice_extension_key);
+}
+
+extern unsigned int rseq_slice_ext_nsecs;
+bool __rseq_arm_slice_extension_timer(void);
+
+static __always_inline bool rseq_arm_slice_extension_timer(void)
+{
+    if (!rseq_slice_extension_enabled())
+        return false;
+
+    if (likely(!current->rseq.slice.state.granted))
+        return false;
+
+    return __rseq_arm_slice_extension_timer();
+}
+
+static __always_inline void rseq_slice_clear_grant(struct task_struct *t)
+{
+    if (IS_ENABLED(CONFIG_RSEQ_STATS) && t->rseq.slice.state.granted)
+        rseq_stat_inc(rseq_stats.s_revoked);
+    t->rseq.slice.state.granted = false;
+}
+
+static __always_inline bool rseq_grant_slice_extension(bool work_pending)
+{
+    struct task_struct *curr = current;
+    struct rseq_slice_ctrl usr_ctrl;
+    union rseq_slice_state state;
+    struct rseq __user *rseq;
+
+    if (!rseq_slice_extension_enabled())
+        return false;
+
+    /* If not enabled or not a return from interrupt, nothing to do. */
+    state = curr->rseq.slice.state;
+    state.enabled &= curr->rseq.event.user_irq;
+    if (likely(!state.state))
+        return false;
+
+    rseq = curr->rseq.usrptr;
+    scoped_user_rw_access(rseq, efault) {
+
+        /*
+         * Quick check conditions where a grant is not possible or
+         * needs to be revoked.
+         *
+         * 1) Any TIF bit which needs to do extra work aside from
+         *    rescheduling prevents a grant.
+         *
+         * 2) A previous rescheduling request resulted in a slice
+         *    extension grant.
+         */
+        if (unlikely(work_pending || state.granted)) {
+            /* Clear user control unconditionally. No point in checking it. */
+            unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
+            rseq_slice_clear_grant(curr);
+            return false;
+        }
+
+        unsafe_get_user(usr_ctrl.all, &rseq->slice_ctrl.all, efault);
+        if (likely(!(usr_ctrl.request)))
+            return false;
+
+        /* Grant the slice extension */
+        usr_ctrl.request = 0;
+        usr_ctrl.granted = 1;
+        unsafe_put_user(usr_ctrl.all, &rseq->slice_ctrl.all, efault);
+    }
+
+    rseq_stat_inc(rseq_stats.s_granted);
+
+    curr->rseq.slice.state.granted = true;
+    /* Store expiry time for arming the timer on the way out */
+    curr->rseq.slice.expires = data_race(rseq_slice_ext_nsecs) + ktime_get_mono_fast_ns();
+    /*
+     * This is racy against a remote CPU setting TIF_NEED_RESCHED in
+     * several ways:
+     *
+     * 1)
+     *    CPU0                         CPU1
+     *    clear_tsk()
+     *                                 set_tsk()
+     *    clear_preempt()
+     *                                 Raise scheduler IPI on CPU0
+     *    --> IPI
+     *        fold_need_resched()      -> Folds correctly
+     * 2)
+     *    CPU0                         CPU1
+     *                                 set_tsk()
+     *    clear_tsk()
+     *    clear_preempt()
+     *                                 Raise scheduler IPI on CPU0
+     *    --> IPI
+     *        fold_need_resched()      <- NOOP as TIF_NEED_RESCHED is false
+     *
+     * #1 is not any different from a regular remote reschedule as it
+     * sets the previously not set bit and then raises the IPI which
+     * folds it into the preempt counter
+     *
+     * #2 is obviously incorrect from a scheduler POV, but it is no more
+     * incorrect than the code below clearing the reschedule request,
+     * with the timer as the safety net.
+     *
+     * The important part is that the clearing is protected against the
+     * scheduler IPI and also against any other interrupt which might
+     * end up waking up a task and setting the bits in the middle of
+     * the operation:
+     *
+     *    clear_tsk()
+     *        ---> Interrupt
+     *             wakeup_on_this_cpu()
+     *                 set_tsk()
+     *                 set_preempt()
+     *    clear_preempt()
+     *
+     * which would be inconsistent state.
+     */
+    scoped_guard(irq) {
+        clear_tsk_need_resched(curr);
+        clear_preempt_need_resched();
+    }
+    return true;
+
+efault:
+    force_sig(SIGSEGV);
+    return false;
+}
+
+#else /* CONFIG_RSEQ_SLICE_EXTENSION */
+static inline bool rseq_slice_extension_enabled(void) { return false; }
+static inline bool rseq_arm_slice_extension_timer(void) { return false; }
+static inline void rseq_slice_clear_grant(struct task_struct *t) { }
+static inline bool rseq_grant_slice_extension(bool work_pending) { return false; }
+#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */
+
 bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
 bool rseq_debug_validate_ids(struct task_struct *t);
 
@@ -359,8 +506,15 @@ bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids,
         unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault);
         if (csaddr)
             unsafe_get_user(*csaddr, &rseq->rseq_cs, efault);
+
+        /* Open coded, so it's in the same user access region */
+        if (rseq_slice_extension_enabled()) {
+            /* Unconditionally clear it, no point in conditionals */
+            unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
+        }
     }
 
+    rseq_slice_clear_grant(t);
     /* Cache the new values */
     t->rseq.ids.cpu_cid = ids->cpu_cid;
     rseq_stat_inc(rseq_stats.ids);
@@ -456,8 +610,17 @@ static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct t
      */
     u64 csaddr;
 
-    if (unlikely(get_user_inline(csaddr, &rseq->rseq_cs)))
-        return false;
+    scoped_user_rw_access(rseq, efault) {
+        unsafe_get_user(csaddr, &rseq->rseq_cs, efault);
+
+        /* Open coded, so it's in the same user access region */
+        if (rseq_slice_extension_enabled()) {
+            /* Unconditionally clear it, no point in conditionals */
+            unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
+        }
+    }
+
+    rseq_slice_clear_grant(t);
 
     if (static_branch_unlikely(&rseq_debug_enabled) || unlikely(csaddr)) {
         if (unlikely(!rseq_update_user_cs(t, regs, csaddr)))
@@ -473,6 +636,8 @@ static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct t
     u32 node_id = cpu_to_node(ids.cpu_id);
 
     return rseq_update_usr(t, regs, &ids, node_id);
+efault:
+    return false;
 }
 
 static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *regs)
@@ -527,17 +692,19 @@ static __always_inline void clear_tif_rseq(void) { }
 
 static __always_inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
 {
-    if (likely(!test_tif_rseq(ti_work)))
-        return false;
-
-    if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
-        current->rseq.event.slowpath = true;
-        set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
-        return true;
+    if (unlikely(test_tif_rseq(ti_work))) {
+        if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
+            current->rseq.event.slowpath = true;
+            set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
+            return true;
+        }
+        clear_tif_rseq();
     }
-
-    clear_tif_rseq();
-    return false;
+    /*
+     * Arm the slice extension timer if there is nothing more to do and
+     * the task really goes out to user space.
+     */
+    return rseq_arm_slice_extension_timer();
 }
 
 #else /* CONFIG_GENERIC_ENTRY */
@@ -611,6 +778,7 @@ static inline void rseq_syscall_exit_to_user_mode(void) { }
 static inline void rseq_irqentry_exit_to_user_mode(void) { }
 static inline void rseq_exit_to_user_mode_legacy(void) { }
 static inline void rseq_debug_syscall_return(struct pt_regs *regs) { }
+static inline bool rseq_grant_slice_extension(bool work_pending) { return false; }
 #endif /* !CONFIG_RSEQ */
 
 #endif /* _LINUX_RSEQ_ENTRY_H */
diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index ef0811379c54..da5fa6f40294 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -73,12 +73,39 @@ struct rseq_ids {
 };
 
 /**
+ * union rseq_slice_state - Status information for rseq time slice extension
+ * @state:      Compound to access the overall state
+ * @enabled:    Time slice extension is enabled for the task
+ * @granted:    Time slice extension was granted to the task
+ */
+union rseq_slice_state {
+    u16             state;
+    struct {
+        u8          enabled;
+        u8          granted;
+    };
+};
+
+/**
+ * struct rseq_slice - Status information for rseq time slice extension
+ * @state:      Time slice extension state
+ * @expires:    The time when a grant expires
+ * @yielded:    Indicator for rseq_slice_yield()
+ */
+struct rseq_slice {
+    union rseq_slice_state  state;
+    u64                     expires;
+    u8                      yielded;
+};
+
+/**
  * struct rseq_data - Storage for all rseq related data
 * @usrptr:     Pointer to the registered user space RSEQ memory
 * @len:        Length of the RSEQ region
- * @sig:        Signature of critial section abort IPs
+ * @sig:        Signature of critical section abort IPs
 * @event:      Storage for event management
 * @ids:        Storage for cached CPU ID and MM CID
+ * @slice:      Storage for time slice extension data
 */
 struct rseq_data {
     struct rseq __user          *usrptr;
@@ -86,6 +113,9 @@ struct rseq_data {
     u32                         sig;
     struct rseq_event           event;
     struct rseq_ids             ids;
+#ifdef CONFIG_RSEQ_SLICE_EXTENSION
+    struct rseq_slice           slice;
+#endif
 };
 
 #else /* CONFIG_RSEQ */
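The fast path in rseq_grant_slice_extension() leans on the layout of union rseq_slice_state above: masking @enabled with the user-interrupt indicator and then testing the compound @state word checks both bytes in a single branch. A minimal user-space sketch of that layout trick, with the kernel types replaced by stdint equivalents and purely illustrative names:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

union slice_state {             /* mirrors union rseq_slice_state */
    uint16_t state;
    struct {
        uint8_t enabled;
        uint8_t granted;
    };
};

int main(void)
{
    union slice_state s = { .state = 0 };

    static_assert(sizeof(s) == sizeof(uint16_t), "compound word covers both flags");

    s.enabled = 1;          /* slice extension enabled for the task */
    s.enabled &= 0;         /* ...but this was not a return from a user interrupt */

    /* One test instead of checking @enabled and @granted separately */
    printf("anything to do? %s\n", s.state ? "yes" : "no");
    return 0;
}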
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8faf653803a1..36ae08ca0c62 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -586,15 +586,10 @@ struct sched_entity {
     u64                 sum_exec_runtime;
     u64                 prev_sum_exec_runtime;
     u64                 vruntime;
-    union {
-        /*
-         * When !@on_rq this field is vlag.
-         * When cfs_rq->curr == se (which implies @on_rq)
-         * this field is vprot. See protect_slice().
-         */
-        s64             vlag;
-        u64             vprot;
-    };
+    /* Approximated virtual lag: */
+    s64                 vlag;
+    /* 'Protected' deadline, to give out minimum quantums: */
+    u64                 vprot;
 
     u64                 slice;
 
     u64                 nr_migrations;
@@ -956,7 +951,6 @@ struct task_struct {
 
     struct mm_struct        *mm;
     struct mm_struct        *active_mm;
-    struct address_space    *faults_disabled_mapping;
 
     int                     exit_state;
     int                     exit_code;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index cf84d98964b2..6c8a570cf44a 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -961,6 +961,7 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
               unsigned mask, struct statx __user *buffer);
 asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len,
              int flags, uint32_t sig);
+asmlinkage long sys_rseq_slice_yield(void);
 asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags);
 asmlinkage long sys_open_tree_attr(int dfd, const char __user *path,
                    unsigned flags,
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index b40de9bab4b7..051e42902690 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -46,15 +46,17 @@ enum syscall_work_bit {
     SYSCALL_WORK_BIT_SYSCALL_AUDIT,
     SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH,
     SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP,
+    SYSCALL_WORK_BIT_SYSCALL_RSEQ_SLICE,
 };
 
-#define SYSCALL_WORK_SECCOMP            BIT(SYSCALL_WORK_BIT_SECCOMP)
-#define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT)
-#define SYSCALL_WORK_SYSCALL_TRACE      BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE)
-#define SYSCALL_WORK_SYSCALL_EMU        BIT(SYSCALL_WORK_BIT_SYSCALL_EMU)
-#define SYSCALL_WORK_SYSCALL_AUDIT      BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT)
-#define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH)
-#define SYSCALL_WORK_SYSCALL_EXIT_TRAP  BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP)
+#define SYSCALL_WORK_SECCOMP                BIT(SYSCALL_WORK_BIT_SECCOMP)
+#define SYSCALL_WORK_SYSCALL_TRACEPOINT     BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT)
+#define SYSCALL_WORK_SYSCALL_TRACE          BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE)
+#define SYSCALL_WORK_SYSCALL_EMU            BIT(SYSCALL_WORK_BIT_SYSCALL_EMU)
+#define SYSCALL_WORK_SYSCALL_AUDIT          BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT)
+#define SYSCALL_WORK_SYSCALL_USER_DISPATCH  BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH)
+#define SYSCALL_WORK_SYSCALL_EXIT_TRAP      BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP)
+#define SYSCALL_WORK_SYSCALL_RSEQ_SLICE     BIT(SYSCALL_WORK_BIT_SYSCALL_RSEQ_SLICE)
 #endif
 
 #include <asm/thread_info.h>
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 942370b3f5d2..a627acc8fb5f 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -860,8 +860,11 @@ __SYSCALL(__NR_file_setattr, sys_file_setattr)
 #define __NR_listns 470
 __SYSCALL(__NR_listns, sys_listns)
 
+#define __NR_rseq_slice_yield 471
+__SYSCALL(__NR_rseq_slice_yield, sys_rseq_slice_yield)
+
 #undef __NR_syscalls
-#define __NR_syscalls 471
+#define __NR_syscalls 472
 
 /*
  * 32 bit systems traditionally used different
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 51c4e8c82b1e..79944b7ae50a 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -386,4 +386,14 @@ struct prctl_mm_map {
 # define PR_FUTEX_HASH_SET_SLOTS    1
 # define PR_FUTEX_HASH_GET_SLOTS    2
 
+/* RSEQ time slice extensions */
+#define PR_RSEQ_SLICE_EXTENSION         79
+# define PR_RSEQ_SLICE_EXTENSION_GET    1
+# define PR_RSEQ_SLICE_EXTENSION_SET    2
+/*
+ * Bits for RSEQ_SLICE_EXTENSION_GET/SET
+ * PR_RSEQ_SLICE_EXT_ENABLE:    Enable
+ */
+# define PR_RSEQ_SLICE_EXT_ENABLE       0x01
+
 #endif /* _LINUX_PRCTL_H */
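A minimal sketch of how a thread could opt in from user space through the new prctl() interface, assuming a kernel built with CONFIG_RSEQ_SLICE_EXTENSION. The defines are repeated so the snippet builds against unpatched headers, and it is an assumption here that the GET operation reports the state via the prctl() return value:

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_RSEQ_SLICE_EXTENSION
# define PR_RSEQ_SLICE_EXTENSION        79
# define PR_RSEQ_SLICE_EXTENSION_GET    1
# define PR_RSEQ_SLICE_EXTENSION_SET    2
# define PR_RSEQ_SLICE_EXT_ENABLE       0x01
#endif

int main(void)
{
    if (prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET,
              PR_RSEQ_SLICE_EXT_ENABLE, 0, 0) < 0)
        perror("PR_RSEQ_SLICE_EXTENSION_SET");

    printf("slice extension state: %d\n",
           prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_GET, 0, 0, 0));
    return 0;
}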
PR_RSEQ_SLICE_EXT_ENABLE: Enable + */ +# define PR_RSEQ_SLICE_EXT_ENABLE 0x01 + #endif /* _LINUX_PRCTL_H */ diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h index 1b76d508400c..863c4a00a66b 100644 --- a/include/uapi/linux/rseq.h +++ b/include/uapi/linux/rseq.h @@ -19,13 +19,20 @@ enum rseq_cpu_id_state { }; enum rseq_flags { - RSEQ_FLAG_UNREGISTER = (1 << 0), + RSEQ_FLAG_UNREGISTER = (1 << 0), + RSEQ_FLAG_SLICE_EXT_DEFAULT_ON = (1 << 1), }; enum rseq_cs_flags_bit { + /* Historical and unsupported bits */ RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0, RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1, RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2, + /* (3) Intentional gap to put new bits into a separate byte */ + + /* User read only feature flags */ + RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE_BIT = 4, + RSEQ_CS_FLAG_SLICE_EXT_ENABLED_BIT = 5, }; enum rseq_cs_flags { @@ -35,6 +42,11 @@ enum rseq_cs_flags { (1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT), RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT), + + RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE = + (1U << RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE_BIT), + RSEQ_CS_FLAG_SLICE_EXT_ENABLED = + (1U << RSEQ_CS_FLAG_SLICE_EXT_ENABLED_BIT), }; /* @@ -53,6 +65,27 @@ struct rseq_cs { __u64 abort_ip; } __attribute__((aligned(4 * sizeof(__u64)))); +/** + * rseq_slice_ctrl - Time slice extension control structure + * @all: Compound value + * @request: Request for a time slice extension + * @granted: Granted time slice extension + * + * @request is set by user space and can be cleared by user space or kernel + * space. @granted is set and cleared by the kernel and must only be read + * by user space. + */ +struct rseq_slice_ctrl { + union { + __u32 all; + struct { + __u8 request; + __u8 granted; + __u16 __reserved; + }; + }; +}; + /* * struct rseq is aligned on 4 * 8 bytes to ensure it is always * contained within a single cache-line. @@ -142,6 +175,12 @@ struct rseq { __u32 mm_cid; /* + * Time slice extension control structure. CPU local updates from + * kernel and user space. + */ + struct rseq_slice_ctrl slice_ctrl; + + /* * Flexible array member at end of structure, after last feature field. */ char end[]; |