Diffstat (limited to 'include')
-rw-r--r--  include/linux/compiler_types.h       19
-rw-r--r--  include/linux/entry-common.h        167
-rw-r--r--  include/linux/rseq.h                 11
-rw-r--r--  include/linux/rseq_entry.h          192
-rw-r--r--  include/linux/rseq_types.h           32
-rw-r--r--  include/linux/sched.h                14
-rw-r--r--  include/linux/syscalls.h              1
-rw-r--r--  include/linux/thread_info.h          16
-rw-r--r--  include/uapi/asm-generic/unistd.h     5
-rw-r--r--  include/uapi/linux/prctl.h           10
-rw-r--r--  include/uapi/linux/rseq.h            41
11 files changed, 458 insertions(+), 50 deletions(-)
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index ea96a4466d82..26d322d43224 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -620,6 +620,25 @@ struct ftrace_likely_data {
__scalar_type_to_expr_cases(long long), \
default: (x)))
+/*
+ * __signed_scalar_typeof(x) - Declare a signed scalar type, leaving
+ * non-scalar types unchanged.
+ */
+
+#define __scalar_type_to_signed_cases(type) \
+ unsigned type: (signed type)0, \
+ signed type: (signed type)0
+
+#define __signed_scalar_typeof(x) typeof( \
+ _Generic((x), \
+ char: (signed char)0, \
+ __scalar_type_to_signed_cases(char), \
+ __scalar_type_to_signed_cases(short), \
+ __scalar_type_to_signed_cases(int), \
+ __scalar_type_to_signed_cases(long), \
+ __scalar_type_to_signed_cases(long long), \
+ default: (x)))
+
/* Is this type a native word size -- useful for atomic operations */
#define __native_word(t) \
(sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || \
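The new macro resolves a scalar expression to its signed counterpart via _Generic() and leaves non-scalar types untouched. A standalone userspace sketch (assumption: built with GCC or Clang outside the kernel, relying on the typeof extension; not part of this patch) of how it behaves:

#include <stdio.h>

#define __scalar_type_to_signed_cases(type)				\
		unsigned type:	(signed type)0,				\
		signed type:	(signed type)0

#define __signed_scalar_typeof(x) typeof(				\
	_Generic((x),							\
		char: (signed char)0,					\
		__scalar_type_to_signed_cases(char),			\
		__scalar_type_to_signed_cases(short),			\
		__scalar_type_to_signed_cases(int),			\
		__scalar_type_to_signed_cases(long),			\
		__scalar_type_to_signed_cases(long long),		\
		default: (x)))

int main(void)
{
	unsigned long u = 0;
	/* 'd' has type 'signed long': the unsignedness of 'u' is dropped */
	__signed_scalar_typeof(u) d = -1;

	/* Non-scalar types hit 'default:' and keep their original type */
	struct { int x; } agg = { 1 };
	__signed_scalar_typeof(agg) copy = agg;

	printf("%ld %d\n", d, copy.x);	/* prints "-1 1" */
	return 0;
}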
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 87efb38b7081..f83ca0abf2cd 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -2,6 +2,7 @@
#ifndef __LINUX_ENTRYCOMMON_H
#define __LINUX_ENTRYCOMMON_H
+#include <linux/audit.h>
#include <linux/irq-entry-common.h>
#include <linux/livepatch.h>
#include <linux/ptrace.h>
@@ -36,8 +37,8 @@
SYSCALL_WORK_SYSCALL_EMU | \
SYSCALL_WORK_SYSCALL_AUDIT | \
SYSCALL_WORK_SYSCALL_USER_DISPATCH | \
+ SYSCALL_WORK_SYSCALL_RSEQ_SLICE | \
ARCH_SYSCALL_WORK_ENTER)
-
#define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \
SYSCALL_WORK_SYSCALL_TRACE | \
SYSCALL_WORK_SYSCALL_AUDIT | \
@@ -45,7 +46,84 @@
SYSCALL_WORK_SYSCALL_EXIT_TRAP | \
ARCH_SYSCALL_WORK_EXIT)
-long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work);
+/**
+ * arch_ptrace_report_syscall_entry - Architecture specific ptrace_report_syscall_entry() wrapper
+ *
+ * Invoked from syscall_trace_enter() to wrap ptrace_report_syscall_entry().
+ *
+ * This allows architecture specific ptrace_report_syscall_entry()
+ * implementations. If not defined by the architecture, this falls back
+ * to ptrace_report_syscall_entry().
+ */
+static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs);
+
+#ifndef arch_ptrace_report_syscall_entry
+static __always_inline int arch_ptrace_report_syscall_entry(struct pt_regs *regs)
+{
+ return ptrace_report_syscall_entry(regs);
+}
+#endif
+
+bool syscall_user_dispatch(struct pt_regs *regs);
+long trace_syscall_enter(struct pt_regs *regs, long syscall);
+void trace_syscall_exit(struct pt_regs *regs, long ret);
+
+static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
+{
+ if (unlikely(audit_context())) {
+ unsigned long args[6];
+
+ syscall_get_arguments(current, regs, args);
+ audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
+ }
+}
+
+static __always_inline long syscall_trace_enter(struct pt_regs *regs, unsigned long work)
+{
+ long syscall, ret = 0;
+
+ /*
+	 * Handle Syscall User Dispatch. This must come first, since
+ * the ABI here can be something that doesn't make sense for
+ * other syscall_work features.
+ */
+ if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
+ if (syscall_user_dispatch(regs))
+ return -1L;
+ }
+
+ /*
+ * User space got a time slice extension granted and relinquishes
+ * the CPU. The work stops the slice timer to avoid an extra round
+ * through hrtimer_interrupt().
+ */
+ if (work & SYSCALL_WORK_SYSCALL_RSEQ_SLICE)
+ rseq_syscall_enter_work(syscall_get_nr(current, regs));
+
+ /* Handle ptrace */
+ if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
+ ret = arch_ptrace_report_syscall_entry(regs);
+ if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
+ return -1L;
+ }
+
+ /* Do seccomp after ptrace, to catch any tracer changes. */
+ if (work & SYSCALL_WORK_SECCOMP) {
+ ret = __secure_computing();
+ if (ret == -1L)
+ return ret;
+ }
+
+ /* Either of the above might have changed the syscall number */
+ syscall = syscall_get_nr(current, regs);
+
+ if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
+ syscall = trace_syscall_enter(regs, syscall);
+
+ syscall_enter_audit(regs, syscall);
+
+ return ret ? : syscall;
+}
/**
* syscall_enter_from_user_mode_work - Check and handle work before invoking
@@ -75,7 +153,7 @@ static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *re
unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
if (work & SYSCALL_WORK_ENTER)
- syscall = syscall_trace_enter(regs, syscall, work);
+ syscall = syscall_trace_enter(regs, work);
return syscall;
}
@@ -112,6 +190,37 @@ static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, l
return ret;
}
+/*
+ * If SYSCALL_EMU is set, then the only reason to report is when
+ * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
+ * instruction has already been reported in syscall_enter_from_user_mode().
+ */
+static __always_inline bool report_single_step(unsigned long work)
+{
+ if (work & SYSCALL_WORK_SYSCALL_EMU)
+ return false;
+
+ return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP;
+}
+
+/**
+ * arch_ptrace_report_syscall_exit - Architecture specific ptrace_report_syscall_exit()
+ *
+ * This allows architecture specific ptrace_report_syscall_exit()
+ * implementations. If not defined by the architecture, this falls back
+ * to ptrace_report_syscall_exit().
+ */
+static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
+ int step);
+
+#ifndef arch_ptrace_report_syscall_exit
+static __always_inline void arch_ptrace_report_syscall_exit(struct pt_regs *regs,
+ int step)
+{
+ ptrace_report_syscall_exit(regs, step);
+}
+#endif
+
/**
* syscall_exit_work - Handle work before returning to user mode
* @regs: Pointer to current pt_regs
@@ -119,20 +228,40 @@ static __always_inline long syscall_enter_from_user_mode(struct pt_regs *regs, l
*
* Do one-time syscall specific work.
*/
-void syscall_exit_work(struct pt_regs *regs, unsigned long work);
+static __always_inline void syscall_exit_work(struct pt_regs *regs, unsigned long work)
+{
+ bool step;
+
+ /*
+ * If the syscall was rolled back due to syscall user dispatching,
+ * then the tracers below are not invoked for the same reason as
+ * the entry side was not invoked in syscall_trace_enter(): The ABI
+ * of these syscalls is unknown.
+ */
+ if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
+ if (unlikely(current->syscall_dispatch.on_dispatch)) {
+ current->syscall_dispatch.on_dispatch = false;
+ return;
+ }
+ }
+
+ audit_syscall_exit(regs);
+
+ if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
+ trace_syscall_exit(regs, syscall_get_return_value(current, regs));
+
+ step = report_single_step(work);
+ if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
+ arch_ptrace_report_syscall_exit(regs, step);
+}
/**
- * syscall_exit_to_user_mode_work - Handle work before returning to user mode
+ * syscall_exit_to_user_mode_work - Handle one time work before returning to user mode
* @regs: Pointer to current's pt_regs
*
- * Same as step 1 and 2 of syscall_exit_to_user_mode() but without calling
- * exit_to_user_mode() to perform the final transition to user mode.
+ * Step 1 of syscall_exit_to_user_mode() with the same calling convention.
*
- * Calling convention is the same as for syscall_exit_to_user_mode() and it
- * returns with all work handled and interrupts disabled. The caller must
- * invoke exit_to_user_mode() before actually switching to user mode to
- * make the final state transitions. Interrupts must stay disabled between
- * return from this function and the invocation of exit_to_user_mode().
+ * The caller must invoke steps 2-3 of syscall_exit_to_user_mode() afterwards.
*/
static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
@@ -155,15 +284,13 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
*/
if (unlikely(work & SYSCALL_WORK_EXIT))
syscall_exit_work(regs, work);
- local_irq_disable_exit_to_user();
- syscall_exit_to_user_mode_prepare(regs);
}
/**
* syscall_exit_to_user_mode - Handle work before returning to user mode
* @regs: Pointer to current's pt_regs
*
- * Invoked with interrupts enabled and fully valid regs. Returns with all
+ * Invoked with interrupts enabled and fully valid @regs. Returns with all
* work handled, interrupts disabled such that the caller can immediately
* switch to user mode. Called from architecture specific syscall and ret
* from fork code.
@@ -176,6 +303,7 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
* - ptrace (single stepping)
*
* 2) Preparatory work
+ * - Disable interrupts
* - Exit to user mode loop (common TIF handling). Invokes
* arch_exit_to_user_mode_work() for architecture specific TIF work
* - Architecture specific one time work arch_exit_to_user_mode_prepare()
@@ -184,14 +312,17 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
* 3) Final transition (lockdep, tracing, context tracking, RCU), i.e. the
* functionality in exit_to_user_mode().
*
- * This is a combination of syscall_exit_to_user_mode_work() (1,2) and
- * exit_to_user_mode(). This function is preferred unless there is a
- * compelling architectural reason to use the separate functions.
+ * This is a combination of syscall_exit_to_user_mode_work() (1), disabling
+ * interrupts followed by syscall_exit_to_user_mode_prepare() (2) and
+ * exit_to_user_mode() (3). This function is preferred unless there is a
+ * compelling architectural reason to invoke the functions separately.
*/
static __always_inline void syscall_exit_to_user_mode(struct pt_regs *regs)
{
instrumentation_begin();
syscall_exit_to_user_mode_work(regs);
+ local_irq_disable_exit_to_user();
+ syscall_exit_to_user_mode_prepare(regs);
instrumentation_end();
exit_to_user_mode();
}
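With syscall_exit_to_user_mode_work() now covering only step 1, an architecture that cannot use the combined helper has to perform steps 2 and 3 itself, exactly as the combined function above does. A hedged sketch (assumption: the arch_syscall_exit_*() wrappers are hypothetical names for illustration; the called helpers are the ones from this diff) of the two equivalent sequences:

/* Combined form: steps 1-3 in one call, the preferred variant */
static void arch_syscall_exit_combined(struct pt_regs *regs)
{
	syscall_exit_to_user_mode(regs);
}

/* Split form: the caller performs steps 2 and 3 explicitly */
static void arch_syscall_exit_split(struct pt_regs *regs)
{
	instrumentation_begin();
	syscall_exit_to_user_mode_work(regs);		/* 1) one-time syscall exit work */
	local_irq_disable_exit_to_user();		/* 2) disable interrupts ...     */
	syscall_exit_to_user_mode_prepare(regs);	/*    ... and run the TIF work loop */
	instrumentation_end();
	exit_to_user_mode();				/* 3) final state transition     */
}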
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index 2266f4dc77b6..7a01a0760405 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -163,4 +163,15 @@ void rseq_syscall(struct pt_regs *regs);
static inline void rseq_syscall(struct pt_regs *regs) { }
#endif /* !CONFIG_DEBUG_RSEQ */
+#ifdef CONFIG_RSEQ_SLICE_EXTENSION
+void rseq_syscall_enter_work(long syscall);
+int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3);
+#else /* CONFIG_RSEQ_SLICE_EXTENSION */
+static inline void rseq_syscall_enter_work(long syscall) { }
+static inline int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
+{
+ return -ENOTSUPP;
+}
+#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */
+
#endif /* _LINUX_RSEQ_H */
diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
index a36b472627de..cbc4a791618b 100644
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -15,6 +15,11 @@ struct rseq_stats {
unsigned long cs;
unsigned long clear;
unsigned long fixup;
+ unsigned long s_granted;
+ unsigned long s_expired;
+ unsigned long s_revoked;
+ unsigned long s_yielded;
+ unsigned long s_aborted;
};
DECLARE_PER_CPU(struct rseq_stats, rseq_stats);
@@ -37,6 +42,7 @@ DECLARE_PER_CPU(struct rseq_stats, rseq_stats);
#ifdef CONFIG_RSEQ
#include <linux/jump_label.h>
#include <linux/rseq.h>
+#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/tracepoint-defs.h>
@@ -75,6 +81,147 @@ DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
#define rseq_inline __always_inline
#endif
+#ifdef CONFIG_RSEQ_SLICE_EXTENSION
+DECLARE_STATIC_KEY_TRUE(rseq_slice_extension_key);
+
+static __always_inline bool rseq_slice_extension_enabled(void)
+{
+ return static_branch_likely(&rseq_slice_extension_key);
+}
+
+extern unsigned int rseq_slice_ext_nsecs;
+bool __rseq_arm_slice_extension_timer(void);
+
+static __always_inline bool rseq_arm_slice_extension_timer(void)
+{
+ if (!rseq_slice_extension_enabled())
+ return false;
+
+ if (likely(!current->rseq.slice.state.granted))
+ return false;
+
+ return __rseq_arm_slice_extension_timer();
+}
+
+static __always_inline void rseq_slice_clear_grant(struct task_struct *t)
+{
+ if (IS_ENABLED(CONFIG_RSEQ_STATS) && t->rseq.slice.state.granted)
+ rseq_stat_inc(rseq_stats.s_revoked);
+ t->rseq.slice.state.granted = false;
+}
+
+static __always_inline bool rseq_grant_slice_extension(bool work_pending)
+{
+ struct task_struct *curr = current;
+ struct rseq_slice_ctrl usr_ctrl;
+ union rseq_slice_state state;
+ struct rseq __user *rseq;
+
+ if (!rseq_slice_extension_enabled())
+ return false;
+
+ /* If not enabled or not a return from interrupt, nothing to do. */
+ state = curr->rseq.slice.state;
+ state.enabled &= curr->rseq.event.user_irq;
+ if (likely(!state.state))
+ return false;
+
+ rseq = curr->rseq.usrptr;
+ scoped_user_rw_access(rseq, efault) {
+
+ /*
+ * Quick check conditions where a grant is not possible or
+ * needs to be revoked.
+ *
+		 * 1) Any TIF bit which needs to do extra work aside from
+ * rescheduling prevents a grant.
+ *
+ * 2) A previous rescheduling request resulted in a slice
+ * extension grant.
+ */
+ if (unlikely(work_pending || state.granted)) {
+			/* Clear user control unconditionally. No point in checking */
+ unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
+ rseq_slice_clear_grant(curr);
+ return false;
+ }
+
+ unsafe_get_user(usr_ctrl.all, &rseq->slice_ctrl.all, efault);
+ if (likely(!(usr_ctrl.request)))
+ return false;
+
+		/* Grant the slice extension */
+ usr_ctrl.request = 0;
+ usr_ctrl.granted = 1;
+ unsafe_put_user(usr_ctrl.all, &rseq->slice_ctrl.all, efault);
+ }
+
+ rseq_stat_inc(rseq_stats.s_granted);
+
+ curr->rseq.slice.state.granted = true;
+ /* Store expiry time for arming the timer on the way out */
+ curr->rseq.slice.expires = data_race(rseq_slice_ext_nsecs) + ktime_get_mono_fast_ns();
+ /*
+ * This is racy against a remote CPU setting TIF_NEED_RESCHED in
+ * several ways:
+ *
+ * 1)
+ * CPU0 CPU1
+ * clear_tsk()
+ * set_tsk()
+ * clear_preempt()
+ * Raise scheduler IPI on CPU0
+ * --> IPI
+ * fold_need_resched() -> Folds correctly
+ * 2)
+ * CPU0 CPU1
+ * set_tsk()
+ * clear_tsk()
+ * clear_preempt()
+ * Raise scheduler IPI on CPU0
+ * --> IPI
+ * fold_need_resched() <- NOOP as TIF_NEED_RESCHED is false
+ *
+	 * #1 is no different from a regular remote reschedule: it sets the
+	 * previously unset bit and then raises the IPI, which folds it into
+	 * the preempt counter.
+ *
+	 * #2 is obviously incorrect from a scheduler POV, but it is no more
+	 * incorrect than the code below, which clears the reschedule request
+	 * with the safety net of the timer.
+ *
+ * The important part is that the clearing is protected against the
+ * scheduler IPI and also against any other interrupt which might
+ * end up waking up a task and setting the bits in the middle of
+ * the operation:
+ *
+ * clear_tsk()
+ * ---> Interrupt
+ * wakeup_on_this_cpu()
+ * set_tsk()
+ * set_preempt()
+ * clear_preempt()
+ *
+ * which would be inconsistent state.
+ */
+ scoped_guard(irq) {
+ clear_tsk_need_resched(curr);
+ clear_preempt_need_resched();
+ }
+ return true;
+
+efault:
+ force_sig(SIGSEGV);
+ return false;
+}
+
+#else /* CONFIG_RSEQ_SLICE_EXTENSION */
+static inline bool rseq_slice_extension_enabled(void) { return false; }
+static inline bool rseq_arm_slice_extension_timer(void) { return false; }
+static inline void rseq_slice_clear_grant(struct task_struct *t) { }
+static inline bool rseq_grant_slice_extension(bool work_pending) { return false; }
+#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */
+
bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
bool rseq_debug_validate_ids(struct task_struct *t);
@@ -359,8 +506,15 @@ bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids,
unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault);
if (csaddr)
unsafe_get_user(*csaddr, &rseq->rseq_cs, efault);
+
+ /* Open coded, so it's in the same user access region */
+ if (rseq_slice_extension_enabled()) {
+ /* Unconditionally clear it, no point in conditionals */
+ unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
+ }
}
+ rseq_slice_clear_grant(t);
/* Cache the new values */
t->rseq.ids.cpu_cid = ids->cpu_cid;
rseq_stat_inc(rseq_stats.ids);
@@ -456,8 +610,17 @@ static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct t
*/
u64 csaddr;
- if (unlikely(get_user_inline(csaddr, &rseq->rseq_cs)))
- return false;
+ scoped_user_rw_access(rseq, efault) {
+ unsafe_get_user(csaddr, &rseq->rseq_cs, efault);
+
+ /* Open coded, so it's in the same user access region */
+ if (rseq_slice_extension_enabled()) {
+ /* Unconditionally clear it, no point in conditionals */
+ unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
+ }
+ }
+
+ rseq_slice_clear_grant(t);
if (static_branch_unlikely(&rseq_debug_enabled) || unlikely(csaddr)) {
if (unlikely(!rseq_update_user_cs(t, regs, csaddr)))
@@ -473,6 +636,8 @@ static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct t
u32 node_id = cpu_to_node(ids.cpu_id);
return rseq_update_usr(t, regs, &ids, node_id);
+efault:
+ return false;
}
static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *regs)
@@ -527,17 +692,19 @@ static __always_inline void clear_tif_rseq(void) { }
static __always_inline bool
rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
- if (likely(!test_tif_rseq(ti_work)))
- return false;
-
- if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
- current->rseq.event.slowpath = true;
- set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
- return true;
+ if (unlikely(test_tif_rseq(ti_work))) {
+ if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
+ current->rseq.event.slowpath = true;
+ set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
+ return true;
+ }
+ clear_tif_rseq();
}
-
- clear_tif_rseq();
- return false;
+ /*
+	 * Arm the slice extension timer if there is nothing left to do and
+	 * the task really goes out to user space.
+ */
+ return rseq_arm_slice_extension_timer();
}
#else /* CONFIG_GENERIC_ENTRY */
@@ -611,6 +778,7 @@ static inline void rseq_syscall_exit_to_user_mode(void) { }
static inline void rseq_irqentry_exit_to_user_mode(void) { }
static inline void rseq_exit_to_user_mode_legacy(void) { }
static inline void rseq_debug_syscall_return(struct pt_regs *regs) { }
+static inline bool rseq_grant_slice_extension(bool work_pending) { return false; }
#endif /* !CONFIG_RSEQ */
#endif /* _LINUX_RSEQ_ENTRY_H */
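The grant helper and the expiry timer are wired into the exit-to-user-mode path; the timer side is visible above in rseq_exit_to_user_mode_restart(), the grant call site lives outside this diff. A purely illustrative sketch (assumption: the function and the surrounding work handling are hypothetical; only rseq_grant_slice_extension() and its semantics come from this patch) of the intended decision, namely granting instead of rescheduling when only a reschedule is pending:

static void exit_to_user_resched(unsigned long ti_work)
{
	/*
	 * 'other_work': any pending exit work besides the reschedule
	 * request itself. It prevents a new grant and revokes a stale one.
	 */
	bool other_work = ti_work & ~_TIF_NEED_RESCHED;

	/* Granted: TIF_NEED_RESCHED was cleared, defer the reschedule */
	if (rseq_grant_slice_extension(other_work))
		return;

	schedule();
}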
diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index ef0811379c54..da5fa6f40294 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -73,12 +73,39 @@ struct rseq_ids {
};
/**
+ * union rseq_slice_state - Status information for rseq time slice extension
+ * @state: Compound to access the overall state
+ * @enabled: Time slice extension is enabled for the task
+ * @granted: Time slice extension was granted to the task
+ */
+union rseq_slice_state {
+ u16 state;
+ struct {
+ u8 enabled;
+ u8 granted;
+ };
+};
+
+/**
+ * struct rseq_slice - Status information for rseq time slice extension
+ * @state: Time slice extension state
+ * @expires: The time when a grant expires
+ * @yielded: Indicator for rseq_slice_yield()
+ */
+struct rseq_slice {
+ union rseq_slice_state state;
+ u64 expires;
+ u8 yielded;
+};
+
+/**
* struct rseq_data - Storage for all rseq related data
* @usrptr: Pointer to the registered user space RSEQ memory
* @len: Length of the RSEQ region
- * @sig: Signature of critial section abort IPs
+ * @sig: Signature of critical section abort IPs
* @event: Storage for event management
* @ids: Storage for cached CPU ID and MM CID
+ * @slice: Storage for time slice extension data
*/
struct rseq_data {
struct rseq __user *usrptr;
@@ -86,6 +113,9 @@ struct rseq_data {
u32 sig;
struct rseq_event event;
struct rseq_ids ids;
+#ifdef CONFIG_RSEQ_SLICE_EXTENSION
+ struct rseq_slice slice;
+#endif
};
#else /* CONFIG_RSEQ */
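The u16 @state member overlays the two u8 flags so that "enabled and eligible" plus "grant pending" can be tested with a single load, which is what rseq_grant_slice_extension() does above. A standalone userspace sketch (assumption: built outside the kernel with fixed-width types standing in for the kernel's u8/u16) of that compound check:

#include <assert.h>
#include <stdint.h>

union slice_state {			/* mirrors union rseq_slice_state */
	uint16_t state;
	struct {
		uint8_t enabled;
		uint8_t granted;
	};
};

int main(void)
{
	union slice_state s = { .state = 0 };

	s.enabled = 1;
	/* Not a return from a user space interrupt: mask 'enabled' out */
	s.enabled &= 0;
	assert(s.state == 0);		/* nothing to do, fast path */

	s.enabled = 1;
	s.granted = 1;
	s.enabled &= 0;
	assert(s.state != 0);		/* a stale grant must still be handled */
	return 0;
}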
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8faf653803a1..36ae08ca0c62 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -586,15 +586,10 @@ struct sched_entity {
u64 sum_exec_runtime;
u64 prev_sum_exec_runtime;
u64 vruntime;
- union {
- /*
- * When !@on_rq this field is vlag.
- * When cfs_rq->curr == se (which implies @on_rq)
- * this field is vprot. See protect_slice().
- */
- s64 vlag;
- u64 vprot;
- };
+ /* Approximated virtual lag: */
+ s64 vlag;
+ /* 'Protected' deadline, to give out minimum quantums: */
+ u64 vprot;
u64 slice;
u64 nr_migrations;
@@ -956,7 +951,6 @@ struct task_struct {
struct mm_struct *mm;
struct mm_struct *active_mm;
- struct address_space *faults_disabled_mapping;
int exit_state;
int exit_code;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index cf84d98964b2..6c8a570cf44a 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -961,6 +961,7 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
unsigned mask, struct statx __user *buffer);
asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len,
int flags, uint32_t sig);
+asmlinkage long sys_rseq_slice_yield(void);
asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags);
asmlinkage long sys_open_tree_attr(int dfd, const char __user *path,
unsigned flags,
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index b40de9bab4b7..051e42902690 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -46,15 +46,17 @@ enum syscall_work_bit {
SYSCALL_WORK_BIT_SYSCALL_AUDIT,
SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH,
SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP,
+ SYSCALL_WORK_BIT_SYSCALL_RSEQ_SLICE,
};
-#define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP)
-#define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT)
-#define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE)
-#define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU)
-#define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT)
-#define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH)
-#define SYSCALL_WORK_SYSCALL_EXIT_TRAP BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP)
+#define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP)
+#define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT)
+#define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE)
+#define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU)
+#define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT)
+#define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH)
+#define SYSCALL_WORK_SYSCALL_EXIT_TRAP BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP)
+#define SYSCALL_WORK_SYSCALL_RSEQ_SLICE BIT(SYSCALL_WORK_BIT_SYSCALL_RSEQ_SLICE)
#endif
#include <asm/thread_info.h>
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 942370b3f5d2..a627acc8fb5f 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -860,8 +860,11 @@ __SYSCALL(__NR_file_setattr, sys_file_setattr)
#define __NR_listns 470
__SYSCALL(__NR_listns, sys_listns)
+#define __NR_rseq_slice_yield 471
+__SYSCALL(__NR_rseq_slice_yield, sys_rseq_slice_yield)
+
#undef __NR_syscalls
-#define __NR_syscalls 471
+#define __NR_syscalls 472
/*
* 32 bit systems traditionally used different
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 51c4e8c82b1e..79944b7ae50a 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -386,4 +386,14 @@ struct prctl_mm_map {
# define PR_FUTEX_HASH_SET_SLOTS 1
# define PR_FUTEX_HASH_GET_SLOTS 2
+/* RSEQ time slice extensions */
+#define PR_RSEQ_SLICE_EXTENSION 79
+# define PR_RSEQ_SLICE_EXTENSION_GET 1
+# define PR_RSEQ_SLICE_EXTENSION_SET 2
+/*
+ * Bits for PR_RSEQ_SLICE_EXTENSION_GET/SET
+ * PR_RSEQ_SLICE_EXT_ENABLE: Enable time slice extensions
+ */
+# define PR_RSEQ_SLICE_EXT_ENABLE 0x01
+
#endif /* _LINUX_PRCTL_H */
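A userspace sketch (assumption: error handling is minimal and the GET operation reporting the enable bit is inferred from the constants above, not spelled out in this hunk) of opting a thread in via the new prctl():

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_RSEQ_SLICE_EXTENSION		/* values from the hunk above */
# define PR_RSEQ_SLICE_EXTENSION	79
# define PR_RSEQ_SLICE_EXTENSION_GET	1
# define PR_RSEQ_SLICE_EXTENSION_SET	2
# define PR_RSEQ_SLICE_EXT_ENABLE	0x01
#endif

int main(void)
{
	if (prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_SET,
		  PR_RSEQ_SLICE_EXT_ENABLE, 0, 0)) {
		perror("PR_RSEQ_SLICE_EXTENSION_SET");
		return 1;
	}

	/* Read the state back; assumed to report the enable bit */
	printf("slice extension state: %d\n",
	       prctl(PR_RSEQ_SLICE_EXTENSION, PR_RSEQ_SLICE_EXTENSION_GET, 0, 0, 0));
	return 0;
}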
diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
index 1b76d508400c..863c4a00a66b 100644
--- a/include/uapi/linux/rseq.h
+++ b/include/uapi/linux/rseq.h
@@ -19,13 +19,20 @@ enum rseq_cpu_id_state {
};
enum rseq_flags {
- RSEQ_FLAG_UNREGISTER = (1 << 0),
+ RSEQ_FLAG_UNREGISTER = (1 << 0),
+ RSEQ_FLAG_SLICE_EXT_DEFAULT_ON = (1 << 1),
};
enum rseq_cs_flags_bit {
+ /* Historical and unsupported bits */
RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0,
RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1,
RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2,
+ /* (3) Intentional gap to put new bits into a separate byte */
+
+ /* User read only feature flags */
+ RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE_BIT = 4,
+ RSEQ_CS_FLAG_SLICE_EXT_ENABLED_BIT = 5,
};
enum rseq_cs_flags {
@@ -35,6 +42,11 @@ enum rseq_cs_flags {
(1U << RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT),
RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE =
(1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT),
+
+ RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE =
+ (1U << RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE_BIT),
+ RSEQ_CS_FLAG_SLICE_EXT_ENABLED =
+ (1U << RSEQ_CS_FLAG_SLICE_EXT_ENABLED_BIT),
};
/*
@@ -53,6 +65,27 @@ struct rseq_cs {
__u64 abort_ip;
} __attribute__((aligned(4 * sizeof(__u64))));
+/**
+ * struct rseq_slice_ctrl - Time slice extension control structure
+ * @all: Compound value
+ * @request: Request for a time slice extension
+ * @granted: Granted time slice extension
+ *
+ * @request is set by user space and can be cleared by user space or kernel
+ * space. @granted is set and cleared by the kernel and must only be read
+ * by user space.
+ */
+struct rseq_slice_ctrl {
+ union {
+ __u32 all;
+ struct {
+ __u8 request;
+ __u8 granted;
+ __u16 __reserved;
+ };
+ };
+};
+
/*
* struct rseq is aligned on 4 * 8 bytes to ensure it is always
* contained within a single cache-line.
@@ -142,6 +175,12 @@ struct rseq {
__u32 mm_cid;
/*
+ * Time slice extension control structure. CPU local updates from
+ * kernel and user space.
+ */
+ struct rseq_slice_ctrl slice_ctrl;
+
+ /*
* Flexible array member at end of structure, after last feature field.
*/
char end[];
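Putting the UAPI pieces together, a userspace sketch (assumption: 'rs' is the thread's registered struct rseq, the installed headers carry the new slice_ctrl field, syscall number 471 matches the unistd.h hunk above, and the exact ordering requirements of the handshake are simplified) of the request/grant protocol: set @request before the critical region, clear it afterwards, and relinquish the CPU via rseq_slice_yield() if the kernel granted an extension in the meantime:

#include <linux/rseq.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_rseq_slice_yield
# define __NR_rseq_slice_yield 471	/* from the unistd.h hunk above */
#endif

static void locked_update(struct rseq *rs)
{
	/* Ask for a slice extension before entering the critical region */
	__atomic_store_n(&rs->slice_ctrl.request, 1, __ATOMIC_RELAXED);

	/* ... short critical section, e.g. drop a user space spinlock ... */

	/* Done: withdraw the request ... */
	__atomic_store_n(&rs->slice_ctrl.request, 0, __ATOMIC_RELAXED);

	/* ... and give the CPU back if an extension was actually granted */
	if (__atomic_load_n(&rs->slice_ctrl.granted, __ATOMIC_RELAXED))
		syscall(__NR_rseq_slice_yield);
}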