author	Linus Torvalds <torvalds@linux-foundation.org>	2026-05-08 19:42:10 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2026-05-08 19:42:10 -0700
commit	7f0023215262221ca08d56be2203e8a4770be033 (patch)
tree	33c8dec5486e41d6b1e29dbaab172b3cbc0ddc41 /include/linux
parent	e5cf0260a7472b4f34a46c418c14bec272aac404 (diff)
parent	9f6d929ee2c6f0266edb564bcd2bd47fd6e884a8 (diff)
Merge tag 'sched-urgent-2026-05-09' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar:

 - Fix spurious failures in rseq self-tests (Mark Brown)

 - Fix an rseq::cpu_id_start ABI regression due to TCMalloc's creative
   use of the supposedly read-only field. The fix is to introduce a new
   ABI variant based on a new (larger) rseq area registration size, to
   keep TCMalloc's use of rseq backwards compatible on new kernels
   (Thomas Gleixner)

 - Fix wakeup_preempt_fair() failing to wake up a task (Vincent Guittot)

 - Fix an s64 multiplication overflow in vruntime_eligible() (Zhan Xusheng)

* tag 'sched-urgent-2026-05-09' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Fix wakeup_preempt_fair() for not waking up task
  sched/fair: Fix overflow in vruntime_eligible()
  selftests/rseq: Expand for optimized RSEQ ABI v2
  rseq: Reenable performance optimizations conditionally
  rseq: Implement read only ABI enforcement for optimized RSEQ V2 mode
  selftests/rseq: Validate legacy behavior
  selftests/rseq: Make registration flexible for legacy and optimized mode
  selftests/rseq: Skip tests if time slice extensions are not available
  rseq: Revert to historical performance killing behaviour
  rseq: Don't advertise time slice extensions if disabled
  rseq: Protect rseq_reset() against interrupts
  rseq: Set rseq::cpu_id_start to 0 on unregistration
  selftests/rseq: Don't run tests with runner scripts outside of the scripts
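As the message above notes, the new ABI variant is selected by a larger rseq
area registration size. A minimal userspace sketch of what opting in could
look like follows; the extended struct layout, its name, the slice_ctrl field
and the padded size are illustrative assumptions, not the actual uapi — only
the size-keyed selection is taken from the message:

#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

/*
 * Hypothetical extended rseq area. The real uapi layout may differ;
 * what matters is that the kernel selects the V2 behaviour from the
 * registered size, so legacy-sized registrations (e.g. TCMalloc's)
 * keep the historical V1 semantics.
 */
struct rseq_area_v2 {
	uint32_t cpu_id_start;
	uint32_t cpu_id;
	uint64_t rseq_cs;
	uint32_t flags;
	uint32_t node_id;
	uint32_t mm_cid;
	uint32_t slice_ctrl;	/* illustrative V2-only field */
	char     pad[32];	/* pad to a larger-than-V1 size */
} __attribute__((aligned(32)));

#define RSEQ_SIG 0x53053053	/* signature used by the rseq selftests */

int main(void)
{
	static struct rseq_area_v2 area;

	/* Registering with the larger size would select the V2 ABI. */
	long ret = syscall(__NR_rseq, &area, sizeof(area), 0, RSEQ_SIG);

	printf("rseq registration: %ld\n", ret);
	return 0;
}

Note that on a glibc which already registers a rseq area at startup this
returns -1 with EBUSY; the sketch only illustrates the size-based opt-in.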
Diffstat (limited to 'include/linux')
-rw-r--r--  include/linux/rseq.h        37
-rw-r--r--  include/linux/rseq_entry.h  122
-rw-r--r--  include/linux/rseq_types.h  13
3 files changed, 96 insertions, 76 deletions
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index b9d62fc2140d..7ef79b25e714 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -9,6 +9,11 @@
void __rseq_handle_slowpath(struct pt_regs *regs);
+static __always_inline bool rseq_v2(struct task_struct *t)
+{
+ return IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY) && likely(t->rseq.event.has_rseq > 1);
+}
+
/* Invoked from resume_user_mode_work() */
static inline void rseq_handle_slowpath(struct pt_regs *regs)
{
@@ -16,8 +21,7 @@ static inline void rseq_handle_slowpath(struct pt_regs *regs)
if (current->rseq.event.slowpath)
__rseq_handle_slowpath(regs);
} else {
- /* '&' is intentional to spare one conditional branch */
- if (current->rseq.event.sched_switch & current->rseq.event.has_rseq)
+ if (current->rseq.event.sched_switch && current->rseq.event.has_rseq)
__rseq_handle_slowpath(regs);
}
}
@@ -30,9 +34,9 @@ void __rseq_signal_deliver(int sig, struct pt_regs *regs);
*/
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs)
{
- if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
- /* '&' is intentional to spare one conditional branch */
- if (current->rseq.event.has_rseq & current->rseq.event.user_irq)
+ if (rseq_v2(current)) {
+ /* has_rseq is implied in rseq_v2() */
+ if (current->rseq.event.user_irq)
__rseq_signal_deliver(ksig->sig, regs);
} else {
if (current->rseq.event.has_rseq)
@@ -50,15 +54,22 @@ static __always_inline void rseq_sched_switch_event(struct task_struct *t)
{
struct rseq_event *ev = &t->rseq.event;
- if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
+ /*
+ * Only apply the user_irq optimization for RSEQ ABI V2 registrations.
+ * Legacy users like TCMalloc rely on the original ABI V1 behaviour
+ * which updates IDs on every context switch.
+ */
+ if (rseq_v2(t)) {
/*
- * Avoid a boat load of conditionals by using simple logic
- * to determine whether NOTIFY_RESUME needs to be raised.
+ * Avoid a boat load of conditionals by using simple logic to
+ * determine whether TIF_NOTIFY_RESUME or TIF_RSEQ needs to be
+ * raised.
*
- * It's required when the CPU or MM CID has changed or
- * the entry was from user space.
+ * It's required when the CPU or MM CID has changed or the entry
+ * was via interrupt from user space. ev->has_rseq does not have
+ * to be evaluated here because rseq_v2() implies has_rseq.
*/
- bool raise = (ev->user_irq | ev->ids_changed) & ev->has_rseq;
+ bool raise = ev->user_irq | ev->ids_changed;
if (raise) {
ev->sched_switch = true;
@@ -66,6 +77,7 @@ static __always_inline void rseq_sched_switch_event(struct task_struct *t)
}
} else {
if (ev->has_rseq) {
+ t->rseq.event.ids_changed = true;
t->rseq.event.sched_switch = true;
rseq_raise_notify_resume(t);
}
@@ -119,6 +131,8 @@ static inline void rseq_virt_userspace_exit(void)
static inline void rseq_reset(struct task_struct *t)
{
+ /* Protect against preemption and membarrier IPI */
+ guard(irqsave)();
memset(&t->rseq, 0, sizeof(t->rseq));
t->rseq.ids.cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
}
@@ -159,6 +173,7 @@ static inline unsigned int rseq_alloc_align(void)
}
#else /* CONFIG_RSEQ */
+static inline bool rseq_v2(struct task_struct *t) { return false; }
static inline void rseq_handle_slowpath(struct pt_regs *regs) { }
static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
static inline void rseq_sched_switch_event(struct task_struct *t) { }
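The gating introduced above hinges on has_rseq carrying the registered ABI
version rather than a plain boolean. A standalone illustration of that
encoding — the concrete values 1 for legacy and 2 for V2 are an assumption;
the diff itself only shows the '> 1' test:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for rseq_event::has_rseq: 0 = not registered, otherwise
 * the registered ABI version. rseq_v2() reduces to a '> 1' test. */
struct demo_event { unsigned int has_rseq; };

static bool demo_rseq_v2(const struct demo_event *ev)
{
	return ev->has_rseq > 1;	/* implies has_rseq != 0 */
}

int main(void)
{
	struct demo_event none = { 0 }, legacy = { 1 }, v2 = { 2 };

	printf("%d %d %d\n", demo_rseq_v2(&none),
	       demo_rseq_v2(&legacy), demo_rseq_v2(&v2));
	return 0;
}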
diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
index f11ebd34f8b9..2d0295df5107 100644
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -111,6 +111,20 @@ static __always_inline void rseq_slice_clear_grant(struct task_struct *t)
t->rseq.slice.state.granted = false;
}
+/*
+ * Open coded, so it can be invoked within a user access region.
+ *
+ * This clears the user space state of the time slice extensions field only when
+ * the task has registered the optimized RSEQ ABI V2. Some legacy registrations,
+ * e.g. TCMalloc, have conflicting non-ABI fields in struct rseq, which would be
+ * overwritten by an unconditional write.
+ */
+#define rseq_slice_clear_user(rseq, efault) \
+do { \
+ if (rseq_slice_extension_enabled()) \
+ unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); \
+} while (0)
+
static __always_inline bool __rseq_grant_slice_extension(bool work_pending)
{
struct task_struct *curr = current;
@@ -230,10 +244,10 @@ static __always_inline bool rseq_slice_extension_enabled(void) { return false; }
static __always_inline bool rseq_arm_slice_extension_timer(void) { return false; }
static __always_inline void rseq_slice_clear_grant(struct task_struct *t) { }
static __always_inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; }
+#define rseq_slice_clear_user(rseq, efault) do { } while (0)
#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */
bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
-bool rseq_debug_validate_ids(struct task_struct *t);
static __always_inline void rseq_note_user_irq_entry(void)
{
@@ -353,43 +367,6 @@ efault:
return false;
}
-/*
- * On debug kernels validate that user space did not mess with it if the
- * debug branch is enabled.
- */
-bool rseq_debug_validate_ids(struct task_struct *t)
-{
- struct rseq __user *rseq = t->rseq.usrptr;
- u32 cpu_id, uval, node_id;
-
- /*
- * On the first exit after registering the rseq region CPU ID is
- * RSEQ_CPU_ID_UNINITIALIZED and node_id in user space is 0!
- */
- node_id = t->rseq.ids.cpu_id != RSEQ_CPU_ID_UNINITIALIZED ?
- cpu_to_node(t->rseq.ids.cpu_id) : 0;
-
- scoped_user_read_access(rseq, efault) {
- unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault);
- if (cpu_id != t->rseq.ids.cpu_id)
- goto die;
- unsafe_get_user(uval, &rseq->cpu_id, efault);
- if (uval != cpu_id)
- goto die;
- unsafe_get_user(uval, &rseq->node_id, efault);
- if (uval != node_id)
- goto die;
- unsafe_get_user(uval, &rseq->mm_cid, efault);
- if (uval != t->rseq.ids.mm_cid)
- goto die;
- }
- return true;
-die:
- t->rseq.event.fatal = true;
-efault:
- return false;
-}
-
#endif /* RSEQ_BUILD_SLOW_PATH */
/*
@@ -499,37 +476,50 @@ efault:
* faults in task context are fatal too.
*/
static rseq_inline
-bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids,
- u32 node_id, u64 *csaddr)
+bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, u64 *csaddr)
{
struct rseq __user *rseq = t->rseq.usrptr;
- if (static_branch_unlikely(&rseq_debug_enabled)) {
- if (!rseq_debug_validate_ids(t))
- return false;
- }
-
scoped_user_rw_access(rseq, efault) {
+ /* Validate the R/O fields for debug and optimized mode */
+ if (static_branch_unlikely(&rseq_debug_enabled) || rseq_v2(t)) {
+ u32 cpu_id, uval;
+
+ unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault);
+ if (cpu_id != t->rseq.ids.cpu_id)
+ goto die;
+ unsafe_get_user(uval, &rseq->cpu_id, efault);
+ if (uval != cpu_id)
+ goto die;
+ unsafe_get_user(uval, &rseq->node_id, efault);
+ if (uval != t->rseq.ids.node_id)
+ goto die;
+ unsafe_get_user(uval, &rseq->mm_cid, efault);
+ if (uval != t->rseq.ids.mm_cid)
+ goto die;
+ }
+
unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault);
unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault);
- unsafe_put_user(node_id, &rseq->node_id, efault);
+ unsafe_put_user(ids->node_id, &rseq->node_id, efault);
unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault);
if (csaddr)
unsafe_get_user(*csaddr, &rseq->rseq_cs, efault);
- /* Open coded, so it's in the same user access region */
- if (rseq_slice_extension_enabled()) {
- /* Unconditionally clear it, no point in conditionals */
- unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
- }
+ /* RSEQ ABI V2 only operations */
+ if (rseq_v2(t))
+ rseq_slice_clear_user(rseq, efault);
}
rseq_slice_clear_grant(t);
/* Cache the new values */
- t->rseq.ids.cpu_cid = ids->cpu_cid;
+ t->rseq.ids = *ids;
rseq_stat_inc(rseq_stats.ids);
rseq_trace_update(t, ids);
return true;
+
+die:
+ t->rseq.event.fatal = true;
efault:
return false;
}
@@ -539,11 +529,11 @@ efault:
* is in a critical section.
*/
static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs,
- struct rseq_ids *ids, u32 node_id)
+ struct rseq_ids *ids)
{
u64 csaddr;
- if (!rseq_set_ids_get_csaddr(t, ids, node_id, &csaddr))
+ if (!rseq_set_ids_get_csaddr(t, ids, &csaddr))
return false;
/*
@@ -612,6 +602,14 @@ static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct t
* interrupts disabled
*/
guard(pagefault)();
+ /*
+ * This optimization is only valid when the task registered for the
+ * optimized RSEQ ABI V2 variant. Some legacy users rely on the original
+ * RSEQ implementation behaviour which unconditionally updated the IDs.
+ * rseq_sched_switch_event() ensures that legacy registrations always
+ * have both sched_switch and ids_changed set, which is compatible with
+ * the historical TIF_NOTIFY_RESUME behaviour.
+ */
if (likely(!t->rseq.event.ids_changed)) {
struct rseq __user *rseq = t->rseq.usrptr;
/*
@@ -623,11 +621,9 @@ static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct t
scoped_user_rw_access(rseq, efault) {
unsafe_get_user(csaddr, &rseq->rseq_cs, efault);
- /* Open coded, so it's in the same user access region */
- if (rseq_slice_extension_enabled()) {
- /* Unconditionally clear it, no point in conditionals */
- unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
- }
+ /* RSEQ ABI V2 only operations */
+ if (rseq_v2(t))
+ rseq_slice_clear_user(rseq, efault);
}
rseq_slice_clear_grant(t);
@@ -640,12 +636,12 @@ static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct t
}
struct rseq_ids ids = {
- .cpu_id = task_cpu(t),
- .mm_cid = task_mm_cid(t),
+ .cpu_id = task_cpu(t),
+ .mm_cid = task_mm_cid(t),
+ .node_id = cpu_to_node(task_cpu(t)),
};
- u32 node_id = cpu_to_node(ids.cpu_id);
- return rseq_update_usr(t, regs, &ids, node_id);
+ return rseq_update_usr(t, regs, &ids);
efault:
return false;
}
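The validation block added to rseq_set_ids_get_csaddr() above turns user
tampering with the read-only fields into a fatal condition, while a plain
access fault stays recoverable. A condensed userspace-style sketch of that
two-label pattern (all types and helpers here are hypothetical):

#include <stdbool.h>
#include <stdio.h>

struct ids { unsigned int cpu_id, mm_cid; bool fatal; };

/* Mirrors the die vs. efault split: a mismatch is a policy violation
 * (fatal), while a failed access would merely be retryable. */
static bool set_ids_checked(struct ids *kernel, struct ids *user)
{
	if (user->cpu_id != kernel->cpu_id || user->mm_cid != kernel->mm_cid)
		goto die;
	*user = *kernel;	/* write back the current IDs */
	return true;
die:
	kernel->fatal = true;
	return false;
}

int main(void)
{
	struct ids k = { 1, 2, false }, u = { 1, 3, false };

	printf("ok=%d fatal=%d\n", set_ids_checked(&k, &u), k.fatal);
	return 0;
}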
diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
index 0b42045988db..85739a63e85e 100644
--- a/include/linux/rseq_types.h
+++ b/include/linux/rseq_types.h
@@ -9,6 +9,12 @@
#ifdef CONFIG_RSEQ
struct rseq;
+/*
+ * rseq_event::has_rseq contains the ABI version number so preserving it
+ * in AND operations requires a mask.
+ */
+#define RSEQ_HAS_RSEQ_VERSION_MASK 0xff
+
/**
* struct rseq_event - Storage for rseq related event management
* @all: Compound to initialize and clear the data efficiently
@@ -17,7 +23,8 @@ struct rseq;
* exit to user
* @ids_changed: Indicator that IDs need to be updated
* @user_irq: True on interrupt entry from user mode
- * @has_rseq: True if the task has a rseq pointer installed
+ * @has_rseq: Greater than 0 if the task has a rseq pointer installed.
+ * Contains the RSEQ version number.
* @error: Compound error code for the slow path to analyze
* @fatal: User space data corrupted or invalid
* @slowpath: Indicator that slow path processing via TIF_NOTIFY_RESUME
@@ -59,8 +66,9 @@ struct rseq_event {
* compiler emit a single compare on 64-bit
* @cpu_id: The CPU ID which was written last to user space
* @mm_cid: The MM CID which was written last to user space
+ * @node_id: The node ID which was written last to user space
*
- * @cpu_id and @mm_cid are updated when the data is written to user space.
+ * @cpu_id, @mm_cid and @node_id are updated when the data is written to user space.
*/
struct rseq_ids {
union {
@@ -70,6 +78,7 @@ struct rseq_ids {
u32 mm_cid;
};
};
+ u32 node_id;
};
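As the kerneldoc above says, the cpu_cid overlay exists so that "did either
ID change?" compiles to a single 64-bit compare instead of two 32-bit ones.
A minimal userspace sketch of the same trick (names hypothetical):

#include <stdint.h>
#include <stdio.h>

/* Two 32-bit IDs overlaid with one 64-bit word, mirroring rseq_ids. */
struct packed_ids {
	union {
		uint64_t cpu_cid;
		struct {
			uint32_t cpu_id;
			uint32_t mm_cid;
		};
	};
};

int main(void)
{
	struct packed_ids old = { .cpu_id = 3 }, cur = { .cpu_id = 3 };

	old.mm_cid = 7;
	cur.mm_cid = 8;

	/* One 64-bit compare covers both IDs. */
	printf("ids changed: %d\n", old.cpu_cid != cur.cpu_cid);
	return 0;
}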
/**