Diffstat (limited to 'include/linux')
-rw-r--r--   include/linux/bitmap.h             |  15
-rw-r--r--   include/linux/cleanup.h            |  30
-rw-r--r--   include/linux/cpumask.h            |  26
-rw-r--r--   include/linux/entry-common.h       |  38
-rw-r--r--   include/linux/irq-entry-common.h   |  75
-rw-r--r--   include/linux/irq_work.h           |   9
-rw-r--r--   include/linux/irq_work_types.h     |  14
-rw-r--r--   include/linux/mm.h                 |  25
-rw-r--r--   include/linux/mm_types.h           | 128
-rw-r--r--   include/linux/resume_user_mode.h   |   2
-rw-r--r--   include/linux/rseq.h               | 214
-rw-r--r--   include/linux/rseq_entry.h         | 616
-rw-r--r--   include/linux/rseq_types.h         | 164
-rw-r--r--   include/linux/sched.h              |  57
-rw-r--r--   include/linux/thread_info.h        |   5
15 files changed, 1094 insertions, 324 deletions
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 595217b7a6e7..b0395e4ccf90 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -45,6 +45,7 @@ struct device;
* bitmap_copy(dst, src, nbits) *dst = *src
* bitmap_and(dst, src1, src2, nbits) *dst = *src1 & *src2
* bitmap_or(dst, src1, src2, nbits) *dst = *src1 | *src2
+ * bitmap_weighted_or(dst, src1, src2, nbits) *dst = *src1 | *src2. Returns Hamming Weight of dst
* bitmap_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2
* bitmap_andnot(dst, src1, src2, nbits) *dst = *src1 & ~(*src2)
* bitmap_complement(dst, src, nbits) *dst = ~(*src)
@@ -165,6 +166,8 @@ bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int nbits);
void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int nbits);
+unsigned int __bitmap_weighted_or(unsigned long *dst, const unsigned long *bitmap1,
+ const unsigned long *bitmap2, unsigned int nbits);
void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, unsigned int nbits);
bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
@@ -338,6 +341,18 @@ void bitmap_or(unsigned long *dst, const unsigned long *src1,
}
static __always_inline
+unsigned int bitmap_weighted_or(unsigned long *dst, const unsigned long *src1,
+ const unsigned long *src2, unsigned int nbits)
+{
+ if (small_const_nbits(nbits)) {
+ *dst = *src1 | *src2;
+ return hweight_long(*dst & BITMAP_LAST_WORD_MASK(nbits));
+ } else {
+ return __bitmap_weighted_or(dst, src1, src2, nbits);
+ }
+}
+
+static __always_inline
void bitmap_xor(unsigned long *dst, const unsigned long *src1,
const unsigned long *src2, unsigned int nbits)
{
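
The new helper replaces a bitmap_or() followed by bitmap_weight() with a single pass over the source bitmaps. A minimal usage sketch, not part of the patch; merge_and_count() is a made-up name:

#include <linux/bitmap.h>

/* Merge two masks and learn how many bits the result has set,
 * without a second bitmap_weight() pass over dst. */
static unsigned int merge_and_count(unsigned long *dst, const unsigned long *a,
				    const unsigned long *b, unsigned int nbits)
{
	return bitmap_weighted_or(dst, a, b, nbits);
}
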
diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index bacefa0f1512..0b55a8f6c59e 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -208,7 +208,7 @@
*/
#define DEFINE_FREE(_name, _type, _free) \
- static inline void __free_##_name(void *p) { _type _T = *(_type *)p; _free; }
+ static __always_inline void __free_##_name(void *p) { _type _T = *(_type *)p; _free; }
#define __free(_name) __cleanup(__free_##_name)
@@ -220,7 +220,7 @@
__val; \
})
-static inline __must_check
+static __always_inline __must_check
const volatile void * __must_check_fn(const volatile void *val)
{ return val; }
@@ -278,16 +278,16 @@ const volatile void * __must_check_fn(const volatile void *val)
#define DEFINE_CLASS(_name, _type, _exit, _init, _init_args...) \
typedef _type class_##_name##_t; \
-static inline void class_##_name##_destructor(_type *p) \
+static __always_inline void class_##_name##_destructor(_type *p) \
{ _type _T = *p; _exit; } \
-static inline _type class_##_name##_constructor(_init_args) \
+static __always_inline _type class_##_name##_constructor(_init_args) \
{ _type t = _init; return t; }
#define EXTEND_CLASS(_name, ext, _init, _init_args...) \
typedef class_##_name##_t class_##_name##ext##_t; \
-static inline void class_##_name##ext##_destructor(class_##_name##_t *p)\
+static __always_inline void class_##_name##ext##_destructor(class_##_name##_t *p) \
{ class_##_name##_destructor(p); } \
-static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \
+static __always_inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \
{ class_##_name##_t t = _init; return t; }
#define CLASS(_name, var) \
@@ -360,7 +360,7 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond
})
#define __DEFINE_GUARD_LOCK_PTR(_name, _exp) \
- static inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \
+ static __always_inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \
{ \
void *_ptr = (void *)(__force unsigned long)*(_exp); \
if (IS_ERR(_ptr)) { \
@@ -368,7 +368,7 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond
} \
return _ptr; \
} \
- static inline int class_##_name##_lock_err(class_##_name##_t *_T) \
+ static __always_inline int class_##_name##_lock_err(class_##_name##_t *_T) \
{ \
long _rc = (__force unsigned long)*(_exp); \
if (!_rc) { \
@@ -397,9 +397,9 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond
EXTEND_CLASS(_name, _ext, \
({ void *_t = _T; int _RET = (_lock); if (_T && !(_cond)) _t = ERR_PTR(_RET); _t; }), \
class_##_name##_t _T) \
- static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \
+ static __always_inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \
{ return class_##_name##_lock_ptr(_T); } \
- static inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \
+ static __always_inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \
{ return class_##_name##_lock_err(_T); }
/*
@@ -479,7 +479,7 @@ typedef struct { \
__VA_ARGS__; \
} class_##_name##_t; \
\
-static inline void class_##_name##_destructor(class_##_name##_t *_T) \
+static __always_inline void class_##_name##_destructor(class_##_name##_t *_T) \
{ \
if (!__GUARD_IS_ERR(_T->lock)) { _unlock; } \
} \
@@ -487,7 +487,7 @@ static inline void class_##_name##_destructor(class_##_name##_t *_T) \
__DEFINE_GUARD_LOCK_PTR(_name, &_T->lock)
#define __DEFINE_LOCK_GUARD_1(_name, _type, _lock) \
-static inline class_##_name##_t class_##_name##_constructor(_type *l) \
+static __always_inline class_##_name##_t class_##_name##_constructor(_type *l) \
{ \
class_##_name##_t _t = { .lock = l }, *_T = &_t; \
_lock; \
@@ -495,7 +495,7 @@ static inline class_##_name##_t class_##_name##_constructor(_type *l) \
}
#define __DEFINE_LOCK_GUARD_0(_name, _lock) \
-static inline class_##_name##_t class_##_name##_constructor(void) \
+static __always_inline class_##_name##_t class_##_name##_constructor(void) \
{ \
class_##_name##_t _t = { .lock = (void*)1 }, \
*_T __maybe_unused = &_t; \
@@ -521,9 +521,9 @@ __DEFINE_LOCK_GUARD_0(_name, _lock)
if (_T->lock && !(_cond)) _T->lock = ERR_PTR(_RET);\
_t; }), \
typeof_member(class_##_name##_t, lock) l) \
- static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \
+ static __always_inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \
{ return class_##_name##_lock_ptr(_T); } \
- static inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \
+ static __always_inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \
{ return class_##_name##_lock_err(_T); }
#define DEFINE_LOCK_GUARD_1_COND_3(_name, _ext, _lock) \
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index ff8f41ab7ce6..66694ee8d86e 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -126,6 +126,7 @@ extern struct cpumask __cpu_dying_mask;
#define cpu_dying_mask ((const struct cpumask *)&__cpu_dying_mask)
extern atomic_t __num_online_cpus;
+extern unsigned int __num_possible_cpus;
extern cpumask_t cpus_booted_once_mask;
@@ -729,6 +730,22 @@ void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p,
}
/**
+ * cpumask_weighted_or - *dstp = *src1p | *src2p and return the weight of the result
+ * @dstp: the cpumask result
+ * @src1p: the first input
+ * @src2p: the second input
+ *
+ * Return: The number of bits set in the resulting cpumask @dstp
+ */
+static __always_inline
+unsigned int cpumask_weighted_or(struct cpumask *dstp, const struct cpumask *src1p,
+ const struct cpumask *src2p)
+{
+ return bitmap_weighted_or(cpumask_bits(dstp), cpumask_bits(src1p),
+ cpumask_bits(src2p), small_cpumask_bits);
+}
+
+/**
* cpumask_xor - *dstp = *src1p ^ *src2p
* @dstp: the cpumask result
* @src1p: the first input
@@ -1136,13 +1153,13 @@ void init_cpu_possible(const struct cpumask *src);
#define __assign_cpu(cpu, mask, val) \
__assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val))
-#define set_cpu_possible(cpu, possible) assign_cpu((cpu), &__cpu_possible_mask, (possible))
#define set_cpu_enabled(cpu, enabled) assign_cpu((cpu), &__cpu_enabled_mask, (enabled))
#define set_cpu_present(cpu, present) assign_cpu((cpu), &__cpu_present_mask, (present))
#define set_cpu_active(cpu, active) assign_cpu((cpu), &__cpu_active_mask, (active))
#define set_cpu_dying(cpu, dying) assign_cpu((cpu), &__cpu_dying_mask, (dying))
void set_cpu_online(unsigned int cpu, bool online);
+void set_cpu_possible(unsigned int cpu, bool possible);
/**
* to_cpumask - convert a NR_CPUS bitmap to a struct cpumask *
@@ -1195,7 +1212,12 @@ static __always_inline unsigned int num_online_cpus(void)
{
return raw_atomic_read(&__num_online_cpus);
}
-#define num_possible_cpus() cpumask_weight(cpu_possible_mask)
+
+static __always_inline unsigned int num_possible_cpus(void)
+{
+ return __num_possible_cpus;
+}
+
#define num_enabled_cpus() cpumask_weight(cpu_enabled_mask)
#define num_present_cpus() cpumask_weight(cpu_present_mask)
#define num_active_cpus() cpumask_weight(cpu_active_mask)
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 7177436f0f9e..87efb38b7081 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -3,11 +3,11 @@
#define __LINUX_ENTRYCOMMON_H
#include <linux/irq-entry-common.h>
+#include <linux/livepatch.h>
#include <linux/ptrace.h>
+#include <linux/resume_user_mode.h>
#include <linux/seccomp.h>
#include <linux/sched.h>
-#include <linux/livepatch.h>
-#include <linux/resume_user_mode.h>
#include <asm/entry-common.h>
#include <asm/syscall.h>
@@ -37,6 +37,7 @@
SYSCALL_WORK_SYSCALL_AUDIT | \
SYSCALL_WORK_SYSCALL_USER_DISPATCH | \
ARCH_SYSCALL_WORK_ENTER)
+
#define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \
SYSCALL_WORK_SYSCALL_TRACE | \
SYSCALL_WORK_SYSCALL_AUDIT | \
@@ -44,25 +45,7 @@
SYSCALL_WORK_SYSCALL_EXIT_TRAP | \
ARCH_SYSCALL_WORK_EXIT)
-/**
- * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts
- * @regs: Pointer to currents pt_regs
- *
- * Invoked from architecture specific syscall entry code with interrupts
- * disabled. The calling code has to be non-instrumentable. When the
- * function returns all state is correct, interrupts are enabled and the
- * subsequent functions can be instrumented.
- *
- * This handles lockdep, RCU (context tracking) and tracing state, i.e.
- * the functionality provided by enter_from_user_mode().
- *
- * This is invoked when there is extra architecture specific functionality
- * to be done between establishing state and handling user mode entry work.
- */
-void syscall_enter_from_user_mode_prepare(struct pt_regs *regs);
-
-long syscall_trace_enter(struct pt_regs *regs, long syscall,
- unsigned long work);
+long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work);
/**
* syscall_enter_from_user_mode_work - Check and handle work before invoking
@@ -71,8 +54,8 @@ long syscall_trace_enter(struct pt_regs *regs, long syscall,
* @syscall: The syscall number
*
* Invoked from architecture specific syscall entry code with interrupts
- * enabled after invoking syscall_enter_from_user_mode_prepare() and extra
- * architecture specific work.
+ * enabled after invoking enter_from_user_mode(), enabling interrupts and
+ * performing extra architecture specific work.
*
* Returns: The original or a modified syscall number
*
@@ -108,8 +91,9 @@ static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *re
* function returns all state is correct, interrupts are enabled and the
* subsequent functions can be instrumented.
*
- * This is combination of syscall_enter_from_user_mode_prepare() and
- * syscall_enter_from_user_mode_work().
+ * This is the combination of enter_from_user_mode() and
+ * syscall_enter_from_user_mode_work() to be used when there is no
+ * architecture specific work to be done between the two.
*
* Returns: The original or a modified syscall number. See
* syscall_enter_from_user_mode_work() for further explanation.
@@ -162,7 +146,7 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
local_irq_enable();
}
- rseq_syscall(regs);
+ rseq_debug_syscall_return(regs);
/*
* Do one-time syscall specific work. If these work items are
@@ -172,7 +156,7 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs)
if (unlikely(work & SYSCALL_WORK_EXIT))
syscall_exit_work(regs, work);
local_irq_disable_exit_to_user();
- exit_to_user_mode_prepare(regs);
+ syscall_exit_to_user_mode_prepare(regs);
}
/**
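
With syscall_enter_from_user_mode_prepare() removed, an architecture that needs extra work between establishing state and the syscall entry work is expected to open-code the sequence. A rough sketch under that assumption; arch_prepare_foo() is a hypothetical placeholder, not a real hook:

static __always_inline long arch_syscall_enter(struct pt_regs *regs, long nr)
{
	long ret;

	enter_from_user_mode(regs);		/* lockdep, RCU, tracing state */
	instrumentation_begin();
	local_irq_enable();
	arch_prepare_foo(regs);			/* hypothetical arch specific step */
	ret = syscall_enter_from_user_mode_work(regs, nr);
	instrumentation_end();
	return ret;
}
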
diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h
index ba1ed42f8a1c..6ab913e57da0 100644
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -2,11 +2,12 @@
#ifndef __LINUX_IRQENTRYCOMMON_H
#define __LINUX_IRQENTRYCOMMON_H
+#include <linux/context_tracking.h>
+#include <linux/kmsan.h>
+#include <linux/rseq_entry.h>
#include <linux/static_call_types.h>
#include <linux/syscalls.h>
-#include <linux/context_tracking.h>
#include <linux/tick.h>
-#include <linux/kmsan.h>
#include <linux/unwind_deferred.h>
#include <asm/entry-common.h>
@@ -29,7 +30,7 @@
#define EXIT_TO_USER_MODE_WORK \
(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \
- _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \
+ _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | _TIF_RSEQ | \
ARCH_EXIT_TO_USER_MODE_WORK)
/**
@@ -67,6 +68,7 @@ static __always_inline bool arch_in_rcu_eqs(void) { return false; }
/**
* enter_from_user_mode - Establish state when coming from user mode
+ * @regs: Pointer to current's pt_regs
*
* Syscall/interrupt entry disables interrupts, but user mode is traced as
* interrupts enabled. Also with NO_HZ_FULL RCU might be idle.
@@ -195,14 +197,11 @@ static __always_inline void arch_exit_to_user_mode(void) { }
*/
void arch_do_signal_or_restart(struct pt_regs *regs);
-/**
- * exit_to_user_mode_loop - do any pending work before leaving to user space
- */
-unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
- unsigned long ti_work);
+/* Handle pending TIF work */
+unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work);
/**
- * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
+ * __exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
* @regs: Pointer to pt_regs on entry stack
*
* 1) check that interrupts are disabled
@@ -210,8 +209,10 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
* 3) call exit_to_user_mode_loop() if any flags from
* EXIT_TO_USER_MODE_WORK are set
* 4) check that interrupts are still disabled
+ *
+ * Don't invoke this directly; use the syscall_/irqentry_ prefixed variants below
*/
-static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
+static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs)
{
unsigned long ti_work;
@@ -225,13 +226,52 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
ti_work = exit_to_user_mode_loop(regs, ti_work);
arch_exit_to_user_mode_prepare(regs, ti_work);
+}
+static __always_inline void __exit_to_user_mode_validate(void)
+{
/* Ensure that kernel state is sane for a return to userspace */
kmap_assert_nomap();
lockdep_assert_irqs_disabled();
lockdep_sys_exit();
}
+/* Temporary workaround to keep ARM64 alive */
+static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *regs)
+{
+ __exit_to_user_mode_prepare(regs);
+ rseq_exit_to_user_mode_legacy();
+ __exit_to_user_mode_validate();
+}
+
+/**
+ * syscall_exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
+ * @regs: Pointer to pt_regs on entry stack
+ *
+ * Wrapper around __exit_to_user_mode_prepare() to separate the exit work for
+ * syscalls and interrupts.
+ */
+static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
+{
+ __exit_to_user_mode_prepare(regs);
+ rseq_syscall_exit_to_user_mode();
+ __exit_to_user_mode_validate();
+}
+
+/**
+ * irqentry_exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required
+ * @regs: Pointer to pt_regs on entry stack
+ *
+ * Wrapper around __exit_to_user_mode_prepare() to separate the exit work for
+ * syscalls and interrupts.
+ */
+static __always_inline void irqentry_exit_to_user_mode_prepare(struct pt_regs *regs)
+{
+ __exit_to_user_mode_prepare(regs);
+ rseq_irqentry_exit_to_user_mode();
+ __exit_to_user_mode_validate();
+}
+
/**
* exit_to_user_mode - Fixup state when exiting to user mode
*
@@ -274,7 +314,11 @@ static __always_inline void exit_to_user_mode(void)
*
* The function establishes state (lockdep, RCU (context tracking), tracing)
*/
-void irqentry_enter_from_user_mode(struct pt_regs *regs);
+static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs)
+{
+ enter_from_user_mode(regs);
+ rseq_note_user_irq_entry();
+}
/**
* irqentry_exit_to_user_mode - Interrupt exit work
@@ -289,7 +333,13 @@ void irqentry_enter_from_user_mode(struct pt_regs *regs);
* Interrupt exit is not invoking #1 which is the syscall specific one time
* work.
*/
-void irqentry_exit_to_user_mode(struct pt_regs *regs);
+static __always_inline void irqentry_exit_to_user_mode(struct pt_regs *regs)
+{
+ instrumentation_begin();
+ irqentry_exit_to_user_mode_prepare(regs);
+ instrumentation_end();
+ exit_to_user_mode();
+}
#ifndef irqentry_state
/**
@@ -354,6 +404,7 @@ irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs);
* Conditional reschedule with additional sanity checks.
*/
void raw_irqentry_exit_cond_resched(void);
+
#ifdef CONFIG_PREEMPT_DYNAMIC
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
#define irqentry_exit_cond_resched_dynamic_enabled raw_irqentry_exit_cond_resched
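
For reference, a hedged sketch of how the now-inlined pair is meant to bracket an interrupt arriving from user mode; handle_my_irq() is a placeholder:

void noinstr arch_irq_from_user(struct pt_regs *regs)
{
	irqentry_enter_from_user_mode(regs);	/* establish state + rseq_note_user_irq_entry() */
	instrumentation_begin();
	handle_my_irq(regs);			/* hypothetical interrupt dispatch */
	instrumentation_end();
	irqentry_exit_to_user_mode(regs);	/* exit work loop, RSEQ, then exit_to_user_mode() */
}
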
diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index 136f2980cba3..c5afd053ae32 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -2,8 +2,9 @@
#ifndef _LINUX_IRQ_WORK_H
#define _LINUX_IRQ_WORK_H
-#include <linux/smp_types.h>
+#include <linux/irq_work_types.h>
#include <linux/rcuwait.h>
+#include <linux/smp_types.h>
/*
* An entry can be in one of four states:
@@ -14,12 +15,6 @@
* busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
*/
-struct irq_work {
- struct __call_single_node node;
- void (*func)(struct irq_work *);
- struct rcuwait irqwait;
-};
-
#define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){ \
.node = { .u_flags = (_flags), }, \
.func = (_func), \
diff --git a/include/linux/irq_work_types.h b/include/linux/irq_work_types.h
new file mode 100644
index 000000000000..73abec5bb06e
--- /dev/null
+++ b/include/linux/irq_work_types.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_IRQ_WORK_TYPES_H
+#define _LINUX_IRQ_WORK_TYPES_H
+
+#include <linux/smp_types.h>
+#include <linux/types.h>
+
+struct irq_work {
+ struct __call_single_node node;
+ void (*func)(struct irq_work *);
+ struct rcuwait irqwait;
+};
+
+#endif
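
The split follows the usual *_types.h pattern: code that only embeds a struct irq_work (such as struct mm_mm_cid further down) can include the lightweight header instead of <linux/irq_work.h>. A hedged sketch of such a consumer; my_deferred_state is made up:

#include <linux/irq_work_types.h>

struct my_deferred_state {
	struct irq_work work;		/* only the type is needed here */
	unsigned long pending;
};
/* Callers of init_irq_work()/irq_work_queue() still include <linux/irq_work.h>. */
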
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 170594b5cb6b..8dc0a07570cc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2408,31 +2408,6 @@ struct zap_details {
/* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */
#define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1))
-#ifdef CONFIG_SCHED_MM_CID
-void sched_mm_cid_before_execve(struct task_struct *t);
-void sched_mm_cid_after_execve(struct task_struct *t);
-void sched_mm_cid_fork(struct task_struct *t);
-void sched_mm_cid_exit_signals(struct task_struct *t);
-static inline int task_mm_cid(struct task_struct *t)
-{
- return t->mm_cid;
-}
-#else
-static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
-static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
-static inline void sched_mm_cid_fork(struct task_struct *t) { }
-static inline void sched_mm_cid_exit_signals(struct task_struct *t) { }
-static inline int task_mm_cid(struct task_struct *t)
-{
- /*
- * Use the processor id as a fall-back when the mm cid feature is
- * disabled. This provides functional per-cpu data structure accesses
- * in user-space, althrough it won't provide the memory usage benefits.
- */
- return raw_smp_processor_id();
-}
-#endif
-
#ifdef CONFIG_MMU
extern bool can_do_mlock(void);
#else
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 90e5790c318f..3b7d05e7169c 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -20,6 +20,7 @@
#include <linux/seqlock.h>
#include <linux/percpu_counter.h>
#include <linux/types.h>
+#include <linux/rseq_types.h>
#include <linux/bitmap.h>
#include <asm/mmu.h>
@@ -922,14 +923,6 @@ struct vm_area_struct {
#define vma_policy(vma) NULL
#endif
-#ifdef CONFIG_SCHED_MM_CID
-struct mm_cid {
- u64 time;
- int cid;
- int recent_cid;
-};
-#endif
-
/*
* Opaque type representing current mm_struct flag state. Must be accessed via
* mm_flags_xxx() helper functions.
@@ -991,44 +984,9 @@ struct mm_struct {
*/
atomic_t mm_users;
-#ifdef CONFIG_SCHED_MM_CID
- /**
- * @pcpu_cid: Per-cpu current cid.
- *
- * Keep track of the currently allocated mm_cid for each cpu.
- * The per-cpu mm_cid values are serialized by their respective
- * runqueue locks.
- */
- struct mm_cid __percpu *pcpu_cid;
- /*
- * @mm_cid_next_scan: Next mm_cid scan (in jiffies).
- *
- * When the next mm_cid scan is due (in jiffies).
- */
- unsigned long mm_cid_next_scan;
- /**
- * @nr_cpus_allowed: Number of CPUs allowed for mm.
- *
- * Number of CPUs allowed in the union of all mm's
- * threads allowed CPUs.
- */
- unsigned int nr_cpus_allowed;
- /**
- * @max_nr_cid: Maximum number of allowed concurrency
- * IDs allocated.
- *
- * Track the highest number of allowed concurrency IDs
- * allocated for the mm.
- */
- atomic_t max_nr_cid;
- /**
- * @cpus_allowed_lock: Lock protecting mm cpus_allowed.
- *
- * Provide mutual exclusion for mm cpus_allowed and
- * mm nr_cpus_allowed updates.
- */
- raw_spinlock_t cpus_allowed_lock;
-#endif
+ /* MM CID related storage */
+ struct mm_mm_cid mm_cid;
+
#ifdef CONFIG_MMU
atomic_long_t pgtables_bytes; /* size of all page tables */
#endif
@@ -1370,37 +1328,6 @@ static inline void vma_iter_init(struct vma_iterator *vmi,
}
#ifdef CONFIG_SCHED_MM_CID
-
-enum mm_cid_state {
- MM_CID_UNSET = -1U, /* Unset state has lazy_put flag set. */
- MM_CID_LAZY_PUT = (1U << 31),
-};
-
-static inline bool mm_cid_is_unset(int cid)
-{
- return cid == MM_CID_UNSET;
-}
-
-static inline bool mm_cid_is_lazy_put(int cid)
-{
- return !mm_cid_is_unset(cid) && (cid & MM_CID_LAZY_PUT);
-}
-
-static inline bool mm_cid_is_valid(int cid)
-{
- return !(cid & MM_CID_LAZY_PUT);
-}
-
-static inline int mm_cid_set_lazy_put(int cid)
-{
- return cid | MM_CID_LAZY_PUT;
-}
-
-static inline int mm_cid_clear_lazy_put(int cid)
-{
- return cid & ~MM_CID_LAZY_PUT;
-}
-
/*
* mm_cpus_allowed: Union of all mm's threads allowed CPUs.
*/
@@ -1415,37 +1342,21 @@ static inline cpumask_t *mm_cpus_allowed(struct mm_struct *mm)
}
/* Accessor for struct mm_struct's cidmask. */
-static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
+static inline unsigned long *mm_cidmask(struct mm_struct *mm)
{
unsigned long cid_bitmap = (unsigned long)mm_cpus_allowed(mm);
/* Skip mm_cpus_allowed */
cid_bitmap += cpumask_size();
- return (struct cpumask *)cid_bitmap;
+ return (unsigned long *)cid_bitmap;
}
-static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
-{
- int i;
-
- for_each_possible_cpu(i) {
- struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i);
-
- pcpu_cid->cid = MM_CID_UNSET;
- pcpu_cid->recent_cid = MM_CID_UNSET;
- pcpu_cid->time = 0;
- }
- mm->nr_cpus_allowed = p->nr_cpus_allowed;
- atomic_set(&mm->max_nr_cid, 0);
- raw_spin_lock_init(&mm->cpus_allowed_lock);
- cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
- cpumask_clear(mm_cidmask(mm));
-}
+void mm_init_cid(struct mm_struct *mm, struct task_struct *p);
static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p)
{
- mm->pcpu_cid = alloc_percpu_noprof(struct mm_cid);
- if (!mm->pcpu_cid)
+ mm->mm_cid.pcpu = alloc_percpu_noprof(struct mm_cid_pcpu);
+ if (!mm->mm_cid.pcpu)
return -ENOMEM;
mm_init_cid(mm, p);
return 0;
@@ -1454,37 +1365,24 @@ static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *
static inline void mm_destroy_cid(struct mm_struct *mm)
{
- free_percpu(mm->pcpu_cid);
- mm->pcpu_cid = NULL;
+ free_percpu(mm->mm_cid.pcpu);
+ mm->mm_cid.pcpu = NULL;
}
static inline unsigned int mm_cid_size(void)
{
- return 2 * cpumask_size(); /* mm_cpus_allowed(), mm_cidmask(). */
+ /* mm_cpus_allowed(), mm_cidmask(). */
+ return cpumask_size() + bitmap_size(num_possible_cpus());
}
-static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask)
-{
- struct cpumask *mm_allowed = mm_cpus_allowed(mm);
-
- if (!mm)
- return;
- /* The mm_cpus_allowed is the union of each thread allowed CPUs masks. */
- raw_spin_lock(&mm->cpus_allowed_lock);
- cpumask_or(mm_allowed, mm_allowed, cpumask);
- WRITE_ONCE(mm->nr_cpus_allowed, cpumask_weight(mm_allowed));
- raw_spin_unlock(&mm->cpus_allowed_lock);
-}
#else /* CONFIG_SCHED_MM_CID */
static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { }
static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; }
static inline void mm_destroy_cid(struct mm_struct *mm) { }
-
static inline unsigned int mm_cid_size(void)
{
return 0;
}
-static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { }
#endif /* CONFIG_SCHED_MM_CID */
struct mmu_gather;
diff --git a/include/linux/resume_user_mode.h b/include/linux/resume_user_mode.h
index e0135e0adae0..bf92227c78d0 100644
--- a/include/linux/resume_user_mode.h
+++ b/include/linux/resume_user_mode.h
@@ -59,7 +59,7 @@ static inline void resume_user_mode_work(struct pt_regs *regs)
mem_cgroup_handle_over_high(GFP_KERNEL);
blkcg_maybe_throttle_current();
- rseq_handle_notify_resume(NULL, regs);
+ rseq_handle_slowpath(regs);
}
#endif /* LINUX_RESUME_USER_MODE_H */
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index 69553e7c14c1..2266f4dc77b6 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -3,134 +3,164 @@
#define _LINUX_RSEQ_H
#ifdef CONFIG_RSEQ
-
-#include <linux/preempt.h>
#include <linux/sched.h>
-#ifdef CONFIG_MEMBARRIER
-# define RSEQ_EVENT_GUARD irq
-#else
-# define RSEQ_EVENT_GUARD preempt
-#endif
-
-/*
- * Map the event mask on the user-space ABI enum rseq_cs_flags
- * for direct mask checks.
- */
-enum rseq_event_mask_bits {
- RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
- RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
- RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
-};
-
-enum rseq_event_mask {
- RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT),
- RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT),
- RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT),
-};
-
-static inline void rseq_set_notify_resume(struct task_struct *t)
-{
- if (t->rseq)
- set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
-}
+#include <uapi/linux/rseq.h>
-void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);
+void __rseq_handle_slowpath(struct pt_regs *regs);
-static inline void rseq_handle_notify_resume(struct ksignal *ksig,
- struct pt_regs *regs)
+/* Invoked from resume_user_mode_work() */
+static inline void rseq_handle_slowpath(struct pt_regs *regs)
{
- if (current->rseq)
- __rseq_handle_notify_resume(ksig, regs);
+ if (IS_ENABLED(CONFIG_GENERIC_ENTRY)) {
+ if (current->rseq.event.slowpath)
+ __rseq_handle_slowpath(regs);
+ } else {
+ /* '&' is intentional to spare one conditional branch */
+ if (current->rseq.event.sched_switch & current->rseq.event.has_rseq)
+ __rseq_handle_slowpath(regs);
+ }
}
-static inline void rseq_signal_deliver(struct ksignal *ksig,
- struct pt_regs *regs)
-{
- scoped_guard(RSEQ_EVENT_GUARD)
- __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
- rseq_handle_notify_resume(ksig, regs);
-}
+void __rseq_signal_deliver(int sig, struct pt_regs *regs);
-/* rseq_preempt() requires preemption to be disabled. */
-static inline void rseq_preempt(struct task_struct *t)
+/*
+ * Invoked from signal delivery to fixup based on the register context before
+ * switching to the signal delivery context.
+ */
+static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs)
{
- __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
- rseq_set_notify_resume(t);
+ if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
+ /* '&' is intentional to spare one conditional branch */
+ if (current->rseq.event.has_rseq & current->rseq.event.user_irq)
+ __rseq_signal_deliver(ksig->sig, regs);
+ } else {
+ if (current->rseq.event.has_rseq)
+ __rseq_signal_deliver(ksig->sig, regs);
+ }
}
-/* rseq_migrate() requires preemption to be disabled. */
-static inline void rseq_migrate(struct task_struct *t)
+static inline void rseq_raise_notify_resume(struct task_struct *t)
{
- __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
- rseq_set_notify_resume(t);
+ set_tsk_thread_flag(t, TIF_RSEQ);
}
-/*
- * If parent process has a registered restartable sequences area, the
- * child inherits. Unregister rseq for a clone with CLONE_VM set.
- */
-static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
+/* Invoked from context switch to force evaluation on exit to user */
+static __always_inline void rseq_sched_switch_event(struct task_struct *t)
{
- if (clone_flags & CLONE_VM) {
- t->rseq = NULL;
- t->rseq_len = 0;
- t->rseq_sig = 0;
- t->rseq_event_mask = 0;
+ struct rseq_event *ev = &t->rseq.event;
+
+ if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
+ /*
+ * Avoid a boat load of conditionals by using simple logic
+ * to determine whether NOTIFY_RESUME needs to be raised.
+ *
+ * It's required when the CPU or MM CID has changed or
+ * the entry was from user space.
+ */
+ bool raise = (ev->user_irq | ev->ids_changed) & ev->has_rseq;
+
+ if (raise) {
+ ev->sched_switch = true;
+ rseq_raise_notify_resume(t);
+ }
} else {
- t->rseq = current->rseq;
- t->rseq_len = current->rseq_len;
- t->rseq_sig = current->rseq_sig;
- t->rseq_event_mask = current->rseq_event_mask;
+ if (ev->has_rseq) {
+ t->rseq.event.sched_switch = true;
+ rseq_raise_notify_resume(t);
+ }
}
}
-static inline void rseq_execve(struct task_struct *t)
+/*
+ * Invoked from __set_task_cpu() when a task migrates or from
+ * mm_cid_schedin() when the CID changes to enforce an IDs update.
+ *
+ * This does not raise TIF_NOTIFY_RESUME as that happens in
+ * rseq_sched_switch_event().
+ */
+static __always_inline void rseq_sched_set_ids_changed(struct task_struct *t)
{
- t->rseq = NULL;
- t->rseq_len = 0;
- t->rseq_sig = 0;
- t->rseq_event_mask = 0;
+ t->rseq.event.ids_changed = true;
}
-#else
-
-static inline void rseq_set_notify_resume(struct task_struct *t)
-{
-}
-static inline void rseq_handle_notify_resume(struct ksignal *ksig,
- struct pt_regs *regs)
+/* Enforce a full update after RSEQ registration and when execve() failed */
+static inline void rseq_force_update(void)
{
+ if (current->rseq.event.has_rseq) {
+ current->rseq.event.ids_changed = true;
+ current->rseq.event.sched_switch = true;
+ rseq_raise_notify_resume(current);
+ }
}
-static inline void rseq_signal_deliver(struct ksignal *ksig,
- struct pt_regs *regs)
+
+/*
+ * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode,
+ * which clears TIF_NOTIFY_RESUME on architectures that don't use the
+ * generic TIF bits and therefore can't provide a separate TIF_RSEQ flag.
+ *
+ * To avoid updating user space RSEQ in that case just to do it eventually
+ * again before returning to user space, __rseq_handle_slowpath() does
+ * nothing when invoked with NULL register state.
+ *
+ * After returning from guest mode, before exiting to userspace, hypervisors
+ * must invoke this function to re-raise TIF_NOTIFY_RESUME if necessary.
+ */
+static inline void rseq_virt_userspace_exit(void)
{
+ /*
+ * The generic optimization for deferring RSEQ updates until the next
+ * exit relies on having a dedicated TIF_RSEQ.
+ */
+ if (!IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS) &&
+ current->rseq.event.sched_switch)
+ rseq_raise_notify_resume(current);
}
-static inline void rseq_preempt(struct task_struct *t)
+
+static inline void rseq_reset(struct task_struct *t)
{
+ memset(&t->rseq, 0, sizeof(t->rseq));
+ t->rseq.ids.cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
}
-static inline void rseq_migrate(struct task_struct *t)
+
+static inline void rseq_execve(struct task_struct *t)
{
+ rseq_reset(t);
}
+
+/*
+ * If parent process has a registered restartable sequences area, the
+ * child inherits. Unregister rseq for a clone with CLONE_VM set.
+ *
+ * On fork, keep the IDs (CPU, MMCID) of the parent, which avoids a fault
+ * on the COW page on exit to user space, when the child stays on the same
+ * CPU as the parent. That's obviously not guaranteed, but in overcommit
+ * scenarios it is more likely and optimizes for the fork/exec case without
+ * taking the fault.
+ */
static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
{
-}
-static inline void rseq_execve(struct task_struct *t)
-{
+ if (clone_flags & CLONE_VM)
+ rseq_reset(t);
+ else
+ t->rseq = current->rseq;
}
-#endif
+#else /* CONFIG_RSEQ */
+static inline void rseq_handle_slowpath(struct pt_regs *regs) { }
+static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
+static inline void rseq_sched_switch_event(struct task_struct *t) { }
+static inline void rseq_sched_set_ids_changed(struct task_struct *t) { }
+static inline void rseq_force_update(void) { }
+static inline void rseq_virt_userspace_exit(void) { }
+static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { }
+static inline void rseq_execve(struct task_struct *t) { }
+#endif /* !CONFIG_RSEQ */
#ifdef CONFIG_DEBUG_RSEQ
-
void rseq_syscall(struct pt_regs *regs);
-
-#else
-
-static inline void rseq_syscall(struct pt_regs *regs)
-{
-}
-
-#endif
+#else /* CONFIG_DEBUG_RSEQ */
+static inline void rseq_syscall(struct pt_regs *regs) { }
+#endif /* !CONFIG_DEBUG_RSEQ */
#endif /* _LINUX_RSEQ_H */
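
The expected ordering of the two scheduler hooks, shown as a hedged sketch; sketch_switch_rseq() and its caller are illustrative, the real call sites live in the scheduler:

/* On a context switch: record ID changes first, then let the switch
 * event decide whether TIF_RSEQ/TIF_NOTIFY_RESUME has to be raised. */
static inline void sketch_switch_rseq(struct task_struct *next, bool ids_changed)
{
	if (ids_changed)
		rseq_sched_set_ids_changed(next);	/* CPU and/or MM CID changed */
	rseq_sched_switch_event(next);			/* may raise TIF work */
}
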
diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
new file mode 100644
index 000000000000..c92167ff8a7f
--- /dev/null
+++ b/include/linux/rseq_entry.h
@@ -0,0 +1,616 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_RSEQ_ENTRY_H
+#define _LINUX_RSEQ_ENTRY_H
+
+/* Must be outside the CONFIG_RSEQ guard to resolve the stubs */
+#ifdef CONFIG_RSEQ_STATS
+#include <linux/percpu.h>
+
+struct rseq_stats {
+ unsigned long exit;
+ unsigned long signal;
+ unsigned long slowpath;
+ unsigned long fastpath;
+ unsigned long ids;
+ unsigned long cs;
+ unsigned long clear;
+ unsigned long fixup;
+};
+
+DECLARE_PER_CPU(struct rseq_stats, rseq_stats);
+
+/*
+ * Slow path has interrupts and preemption enabled, but the fast path
+ * runs with interrupts disabled so there is no point in having the
+ * preemption checks implied in __this_cpu_inc() for every operation.
+ */
+#ifdef RSEQ_BUILD_SLOW_PATH
+#define rseq_stat_inc(which) this_cpu_inc((which))
+#else
+#define rseq_stat_inc(which) raw_cpu_inc((which))
+#endif
+
+#else /* CONFIG_RSEQ_STATS */
+#define rseq_stat_inc(x) do { } while (0)
+#endif /* !CONFIG_RSEQ_STATS */
+
+#ifdef CONFIG_RSEQ
+#include <linux/jump_label.h>
+#include <linux/rseq.h>
+#include <linux/uaccess.h>
+
+#include <linux/tracepoint-defs.h>
+
+#ifdef CONFIG_TRACEPOINTS
+DECLARE_TRACEPOINT(rseq_update);
+DECLARE_TRACEPOINT(rseq_ip_fixup);
+void __rseq_trace_update(struct task_struct *t);
+void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
+ unsigned long offset, unsigned long abort_ip);
+
+static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids)
+{
+ if (tracepoint_enabled(rseq_update) && ids)
+ __rseq_trace_update(t);
+}
+
+static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
+ unsigned long offset, unsigned long abort_ip)
+{
+ if (tracepoint_enabled(rseq_ip_fixup))
+ __rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
+}
+
+#else /* CONFIG_TRACEPOINTS */
+static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) { }
+static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
+ unsigned long offset, unsigned long abort_ip) { }
+#endif /* !CONFIG_TRACEPOINTS */
+
+DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
+
+#ifdef RSEQ_BUILD_SLOW_PATH
+#define rseq_inline
+#else
+#define rseq_inline __always_inline
+#endif
+
+bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
+bool rseq_debug_validate_ids(struct task_struct *t);
+
+static __always_inline void rseq_note_user_irq_entry(void)
+{
+ if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
+ current->rseq.event.user_irq = true;
+}
+
+/*
+ * Check whether there is a valid critical section and whether the
+ * instruction pointer in @regs is inside the critical section.
+ *
+ * - If the critical section is invalid, terminate the task.
+ *
+ * - If valid and the instruction pointer is inside, set it to the abort IP.
+ *
+ * - If valid and the instruction pointer is outside, clear the critical
+ * section address.
+ *
+ * Returns true, if the section was valid and either fixup or clear was
+ * done, false otherwise.
+ *
+ * In the failure case task::rseq_event::fatal is set when an invalid
+ * section was found. It's clear when the failure was an unresolved page
+ * fault.
+ *
+ * If inlined into the exit to user path with interrupts disabled, the
+ * caller has to protect against page faults with pagefault_disable().
+ *
+ * In preemptible task context this would be counterproductive as the page
+ * faults could not be fully resolved. As a consequence unresolved page
+ * faults in task context are fatal too.
+ */
+
+#ifdef RSEQ_BUILD_SLOW_PATH
+/*
+ * The debug version is put out of line, but kept here so the code stays
+ * together.
+ *
+ * @csaddr has already been checked by the caller to be in user space
+ */
+bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs,
+ unsigned long csaddr)
+{
+ struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
+ u64 start_ip, abort_ip, offset, cs_end, head, tasksize = TASK_SIZE;
+ unsigned long ip = instruction_pointer(regs);
+ u64 __user *uc_head = (u64 __user *) ucs;
+ u32 usig, __user *uc_sig;
+
+ scoped_user_rw_access(ucs, efault) {
+ /*
+ * Evaluate the user pile and exit if one of the conditions
+ * is not fulfilled.
+ */
+ unsafe_get_user(start_ip, &ucs->start_ip, efault);
+ if (unlikely(start_ip >= tasksize))
+ goto die;
+ /* If outside, just clear the critical section. */
+ if (ip < start_ip)
+ goto clear;
+
+ unsafe_get_user(offset, &ucs->post_commit_offset, efault);
+ cs_end = start_ip + offset;
+ /* Check for overflow and wraparound */
+ if (unlikely(cs_end >= tasksize || cs_end < start_ip))
+ goto die;
+
+ /* If not inside, clear it. */
+ if (ip >= cs_end)
+ goto clear;
+
+ unsafe_get_user(abort_ip, &ucs->abort_ip, efault);
+ /* Ensure it's "valid" */
+ if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
+ goto die;
+ /* Validate that the abort IP is not in the critical section */
+ if (unlikely(abort_ip - start_ip < offset))
+ goto die;
+
+ /*
+ * Check version and flags for 0. No point in emitting
+ * deprecated warnings before dying. That could be done in
+ * the slow path eventually, but *shrug*.
+ */
+ unsafe_get_user(head, uc_head, efault);
+ if (unlikely(head))
+ goto die;
+
+ /* abort_ip - 4 is >= 0. See abort_ip check above */
+ uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
+ unsafe_get_user(usig, uc_sig, efault);
+ if (unlikely(usig != t->rseq.sig))
+ goto die;
+
+ /* rseq_event.user_irq is only valid if CONFIG_GENERIC_IRQ_ENTRY=y */
+ if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
+ /* If not in interrupt from user context, let it die */
+ if (unlikely(!t->rseq.event.user_irq))
+ goto die;
+ }
+ unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
+ instruction_pointer_set(regs, (unsigned long)abort_ip);
+ rseq_stat_inc(rseq_stats.fixup);
+ break;
+ clear:
+ unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
+ rseq_stat_inc(rseq_stats.clear);
+ abort_ip = 0ULL;
+ }
+
+ if (unlikely(abort_ip))
+ rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
+ return true;
+die:
+ t->rseq.event.fatal = true;
+efault:
+ return false;
+}
+
+/*
+ * On debug kernels validate that user space did not mess with it if the
+ * debug branch is enabled.
+ */
+bool rseq_debug_validate_ids(struct task_struct *t)
+{
+ struct rseq __user *rseq = t->rseq.usrptr;
+ u32 cpu_id, uval, node_id;
+
+ /*
+ * On the first exit after registering the rseq region CPU ID is
+ * RSEQ_CPU_ID_UNINITIALIZED and node_id in user space is 0!
+ */
+ node_id = t->rseq.ids.cpu_id != RSEQ_CPU_ID_UNINITIALIZED ?
+ cpu_to_node(t->rseq.ids.cpu_id) : 0;
+
+ scoped_user_read_access(rseq, efault) {
+ unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault);
+ if (cpu_id != t->rseq.ids.cpu_id)
+ goto die;
+ unsafe_get_user(uval, &rseq->cpu_id, efault);
+ if (uval != cpu_id)
+ goto die;
+ unsafe_get_user(uval, &rseq->node_id, efault);
+ if (uval != node_id)
+ goto die;
+ unsafe_get_user(uval, &rseq->mm_cid, efault);
+ if (uval != t->rseq.ids.mm_cid)
+ goto die;
+ }
+ return true;
+die:
+ t->rseq.event.fatal = true;
+efault:
+ return false;
+}
+
+#endif /* RSEQ_BUILD_SLOW_PATH */
+
+/*
+ * This only ensures that abort_ip is in the user address space and
+ * validates that it is preceded by the signature.
+ *
+ * No other sanity checks are done here, that's what the debug code is for.
+ */
+static rseq_inline bool
+rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr)
+{
+ struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
+ unsigned long ip = instruction_pointer(regs);
+ unsigned long tasksize = TASK_SIZE;
+ u64 start_ip, abort_ip, offset;
+ u32 usig, __user *uc_sig;
+
+ rseq_stat_inc(rseq_stats.cs);
+
+ if (unlikely(csaddr >= tasksize)) {
+ t->rseq.event.fatal = true;
+ return false;
+ }
+
+ if (static_branch_unlikely(&rseq_debug_enabled))
+ return rseq_debug_update_user_cs(t, regs, csaddr);
+
+ scoped_user_rw_access(ucs, efault) {
+ unsafe_get_user(start_ip, &ucs->start_ip, efault);
+ unsafe_get_user(offset, &ucs->post_commit_offset, efault);
+ unsafe_get_user(abort_ip, &ucs->abort_ip, efault);
+
+ /*
+ * No sanity checks. If user space screwed it up, it can
+ * keep the pieces. That's what debug code is for.
+ *
+ * If outside, just clear the critical section.
+ */
+ if (ip - start_ip >= offset)
+ goto clear;
+
+ /*
+ * Two requirements for @abort_ip:
+ * - Must be in user space as x86 IRET would happily return to
+ * the kernel.
+ * - The four bytes preceding the instruction at @abort_ip must
+ * contain the signature.
+ *
+ * The latter protects against the following attack vector:
+ *
+ * An attacker with limited abilities to write, creates a critical
+ * section descriptor, sets the abort IP to a library function or
+ * some other ROP gadget and stores the address of the descriptor
+ * in TLS::rseq::rseq_cs. An RSEQ abort would then evade ROP
+ * protection.
+ */
+ if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
+ goto die;
+
+ /* The address is guaranteed to be >= 0 and < TASK_SIZE */
+ uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
+ unsafe_get_user(usig, uc_sig, efault);
+ if (unlikely(usig != t->rseq.sig))
+ goto die;
+
+ /* Invalidate the critical section */
+ unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
+ /* Update the instruction pointer */
+ instruction_pointer_set(regs, (unsigned long)abort_ip);
+ rseq_stat_inc(rseq_stats.fixup);
+ break;
+ clear:
+ unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
+ rseq_stat_inc(rseq_stats.clear);
+ abort_ip = 0ULL;
+ }
+
+ if (unlikely(abort_ip))
+ rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
+ return true;
+die:
+ t->rseq.event.fatal = true;
+efault:
+ return false;
+}
+
+/*
+ * Updates CPU ID, Node ID and MM CID and reads the critical section
+ * address, when @csaddr != NULL. This allows the ID update and the
+ * read to share the same uaccess region, sparing a separate begin/end.
+ *
+ * As this is either invoked from a C wrapper with @csaddr = NULL or from
+ * the fast path code with a valid pointer, a clever compiler should be
+ * able to optimize the read out. Spares a duplicate implementation.
+ *
+ * Returns true, if the operation was successful, false otherwise.
+ *
+ * In the failure case task::rseq_event::fatal is set when invalid data
+ * was found on debug kernels. It's clear when the failure was an unresolved page
+ * fault.
+ *
+ * If inlined into the exit to user path with interrupts disabled, the
+ * caller has to protect against page faults with pagefault_disable().
+ *
+ * In preemptible task context this would be counterproductive as the page
+ * faults could not be fully resolved. As a consequence unresolved page
+ * faults in task context are fatal too.
+ */
+static rseq_inline
+bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids,
+ u32 node_id, u64 *csaddr)
+{
+ struct rseq __user *rseq = t->rseq.usrptr;
+
+ if (static_branch_unlikely(&rseq_debug_enabled)) {
+ if (!rseq_debug_validate_ids(t))
+ return false;
+ }
+
+ scoped_user_rw_access(rseq, efault) {
+ unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault);
+ unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault);
+ unsafe_put_user(node_id, &rseq->node_id, efault);
+ unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault);
+ if (csaddr)
+ unsafe_get_user(*csaddr, &rseq->rseq_cs, efault);
+ }
+
+ /* Cache the new values */
+ t->rseq.ids.cpu_cid = ids->cpu_cid;
+ rseq_stat_inc(rseq_stats.ids);
+ rseq_trace_update(t, ids);
+ return true;
+efault:
+ return false;
+}
+
+/*
+ * Update user space with new IDs and conditionally check whether the task
+ * is in a critical section.
+ */
+static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs,
+ struct rseq_ids *ids, u32 node_id)
+{
+ u64 csaddr;
+
+ if (!rseq_set_ids_get_csaddr(t, ids, node_id, &csaddr))
+ return false;
+
+ /*
+ * On architectures which utilize the generic entry code this
+ * allows skipping the critical section check when the entry was not from
+ * a user space interrupt, unless debug mode is enabled.
+ */
+ if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
+ if (!static_branch_unlikely(&rseq_debug_enabled)) {
+ if (likely(!t->rseq.event.user_irq))
+ return true;
+ }
+ }
+ if (likely(!csaddr))
+ return true;
+ /* Sigh, this really needs to do work */
+ return rseq_update_user_cs(t, regs, csaddr);
+}
+
+/*
+ * If you want to use this then convert your architecture to the generic
+ * entry code. I'm tired of building workarounds for people who can't be
+ * bothered to make the maintenance of generic infrastructure less
+ * burdensome. Just sucking everything into the architecture code and
+ * thereby making others chase the horrible hacks and keep them working is
+ * neither acceptable nor sustainable.
+ */
+#ifdef CONFIG_GENERIC_ENTRY
+
+/*
+ * This is inlined into the exit path because:
+ *
+ * 1) It's a one time comparison in the fast path when there is no event to
+ * handle
+ *
+ * 2) The access to the user space rseq memory (TLS) is unlikely to fault
+ * so the straight inline operation is:
+ *
+ * - Four 32-bit stores only if CPU ID/ MM CID need to be updated
+ * - One 64-bit load to retrieve the critical section address
+ *
+ * 3) In the unlikely case that the critical section address is != NULL:
+ *
+ * - One 64-bit load to retrieve the start IP
+ * - One 64-bit load to retrieve the offset for calculating the end
+ * - One 64-bit load to retrieve the abort IP
+ * - One 64-bit load to retrieve the signature
+ * - One store to clear the critical section address
+ *
+ * The non-debug case implements only the minimal required checking. It
+ * provides protection against a rogue abort IP in kernel space, which
+ * would be exploitable at least on x86, and also against a rogue CS
+ * descriptor by checking the signature at the abort IP. Any fallout from
+ * invalid critical section descriptors is a user space problem. The debug
+ * case provides the full set of checks and terminates the task if a
+ * condition is not met.
+ *
+ * In case of a fault or an invalid value, this sets TIF_NOTIFY_RESUME and
+ * tells the caller to loop back into exit_to_user_mode_loop(). The rseq
+ * slow path there will handle the failure.
+ */
+static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct task_struct *t)
+{
+ /*
+ * Page faults need to be disabled as this is called with
+ * interrupts disabled
+ */
+ guard(pagefault)();
+ if (likely(!t->rseq.event.ids_changed)) {
+ struct rseq __user *rseq = t->rseq.usrptr;
+ /*
+ * If IDs have not changed rseq_event::user_irq must be true
+ * See rseq_sched_switch_event().
+ */
+ u64 csaddr;
+
+ if (unlikely(get_user_inline(csaddr, &rseq->rseq_cs)))
+ return false;
+
+ if (static_branch_unlikely(&rseq_debug_enabled) || unlikely(csaddr)) {
+ if (unlikely(!rseq_update_user_cs(t, regs, csaddr)))
+ return false;
+ }
+ return true;
+ }
+
+ struct rseq_ids ids = {
+ .cpu_id = task_cpu(t),
+ .mm_cid = task_mm_cid(t),
+ };
+ u32 node_id = cpu_to_node(ids.cpu_id);
+
+ return rseq_update_usr(t, regs, &ids, node_id);
+}
+
+static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *regs)
+{
+ struct task_struct *t = current;
+
+ /*
+ * If the task did not go through schedule or got the flag enforced
+ * by the rseq syscall or execve, then nothing to do here.
+ *
+ * CPU ID and MM CID can only change when going through a context
+ * switch.
+ *
+ * rseq_sched_switch_event() sets the rseq_event::sched_switch bit
+ * only when rseq_event::has_rseq is true. That conditional is
+ * required to avoid setting the TIF bit if RSEQ is not registered
+ * for a task. rseq_event::sched_switch is cleared when RSEQ is
+ * unregistered by a task so it's sufficient to check for the
+ * sched_switch bit alone.
+ *
+ * A sane compiler requires three instructions for the nothing to do
+ * case including clearing the events, but your mileage might vary.
+ */
+ if (unlikely((t->rseq.event.sched_switch))) {
+ rseq_stat_inc(rseq_stats.fastpath);
+
+ if (unlikely(!rseq_exit_user_update(regs, t)))
+ return true;
+ }
+ /* Clear state so next entry starts from a clean slate */
+ t->rseq.event.events = 0;
+ return false;
+}
+
+/* Required to allow conversion to GENERIC_ENTRY w/o GENERIC_TIF_BITS */
+#ifdef CONFIG_HAVE_GENERIC_TIF_BITS
+static __always_inline bool test_tif_rseq(unsigned long ti_work)
+{
+ return ti_work & _TIF_RSEQ;
+}
+
+static __always_inline void clear_tif_rseq(void)
+{
+ static_assert(TIF_RSEQ != TIF_NOTIFY_RESUME);
+ clear_thread_flag(TIF_RSEQ);
+}
+#else
+static __always_inline bool test_tif_rseq(unsigned long ti_work) { return true; }
+static __always_inline void clear_tif_rseq(void) { }
+#endif
+
+static __always_inline bool
+rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
+{
+ if (likely(!test_tif_rseq(ti_work)))
+ return false;
+
+ if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
+ current->rseq.event.slowpath = true;
+ set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
+ return true;
+ }
+
+ clear_tif_rseq();
+ return false;
+}
+
+#else /* CONFIG_GENERIC_ENTRY */
+static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
+{
+ return false;
+}
+#endif /* !CONFIG_GENERIC_ENTRY */
+
+static __always_inline void rseq_syscall_exit_to_user_mode(void)
+{
+ struct rseq_event *ev = &current->rseq.event;
+
+ rseq_stat_inc(rseq_stats.exit);
+
+ /* Needed to remove the store for the !lockdep case */
+ if (IS_ENABLED(CONFIG_LOCKDEP)) {
+ WARN_ON_ONCE(ev->sched_switch);
+ ev->events = 0;
+ }
+}
+
+static __always_inline void rseq_irqentry_exit_to_user_mode(void)
+{
+ struct rseq_event *ev = &current->rseq.event;
+
+ rseq_stat_inc(rseq_stats.exit);
+
+ lockdep_assert_once(!ev->sched_switch);
+
+ /*
+ * Ensure that event (especially user_irq) is cleared when the
+ * interrupt did not result in a schedule and therefore the
+ * rseq processing could not clear it.
+ */
+ ev->events = 0;
+}
+
+/* Required to keep ARM64 working */
+static __always_inline void rseq_exit_to_user_mode_legacy(void)
+{
+ struct rseq_event *ev = &current->rseq.event;
+
+ rseq_stat_inc(rseq_stats.exit);
+
+ if (static_branch_unlikely(&rseq_debug_enabled))
+ WARN_ON_ONCE(ev->sched_switch);
+
+ /*
+ * Ensure that event (especially user_irq) is cleared when the
+ * interrupt did not result in a schedule and therefore the
+ * rseq processing did not clear it.
+ */
+ ev->events = 0;
+}
+
+void __rseq_debug_syscall_return(struct pt_regs *regs);
+
+static inline void rseq_debug_syscall_return(struct pt_regs *regs)
+{
+ if (static_branch_unlikely(&rseq_debug_enabled))
+ __rseq_debug_syscall_return(regs);
+}
+#else /* CONFIG_RSEQ */
+static inline void rseq_note_user_irq_entry(void) { }
+static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
+{
+ return false;
+}
+static inline void rseq_syscall_exit_to_user_mode(void) { }
+static inline void rseq_irqentry_exit_to_user_mode(void) { }
+static inline void rseq_exit_to_user_mode_legacy(void) { }
+static inline void rseq_debug_syscall_return(struct pt_regs *regs) { }
+#endif /* !CONFIG_RSEQ */
+
+#endif /* _LINUX_RSEQ_ENTRY_H */
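
A hedged sketch of the intended consumer of rseq_exit_to_user_mode_restart() in a generic-entry exit path; the real call site lives in the generic entry code, which is not part of this diff:

/* After the TIF work loop believes it is done, let RSEQ update user
 * space directly, or punt back to the loop on failure. */
static __always_inline unsigned long
sketch_exit_work_tail(struct pt_regs *regs, unsigned long ti_work)
{
	if (rseq_exit_to_user_mode_restart(regs, ti_work)) {
		/* TIF_NOTIFY_RESUME was raised, loop again so the slow
		 * path can resolve the fault or terminate the task. */
		return read_thread_flags();
	}
	return ti_work;
}
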
diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h
new file mode 100644
index 000000000000..332dc14b81c9
--- /dev/null
+++ b/include/linux/rseq_types.h
@@ -0,0 +1,164 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_RSEQ_TYPES_H
+#define _LINUX_RSEQ_TYPES_H
+
+#include <linux/irq_work_types.h>
+#include <linux/types.h>
+#include <linux/workqueue_types.h>
+
+#ifdef CONFIG_RSEQ
+struct rseq;
+
+/**
+ * struct rseq_event - Storage for rseq related event management
+ * @all: Compound to initialize and clear the data efficiently
+ * @events: Compound to access events with a single load/store
+ * @sched_switch: True if the task was scheduled and needs update on
+ * exit to user
+ * @ids_changed: Indicator that IDs need to be updated
+ * @user_irq: True on interrupt entry from user mode
+ * @has_rseq: True if the task has a rseq pointer installed
+ * @error: Compound error code for the slow path to analyze
+ * @fatal: User space data corrupted or invalid
+ * @slowpath: Indicator that slow path processing via TIF_NOTIFY_RESUME
+ * is required
+ *
+ * @sched_switch and @ids_changed must be adjacent and the combo must be
+ * 16bit aligned to allow a single store, when both are set at the same
+ * time in the scheduler.
+ */
+struct rseq_event {
+ union {
+ u64 all;
+ struct {
+ union {
+ u32 events;
+ struct {
+ u8 sched_switch;
+ u8 ids_changed;
+ u8 user_irq;
+ };
+ };
+
+ u8 has_rseq;
+ u8 __pad;
+ union {
+ u16 error;
+ struct {
+ u8 fatal;
+ u8 slowpath;
+ };
+ };
+ };
+ };
+};
+
+/**
+ * struct rseq_ids - Cache for ids, which need to be updated
+ * @cpu_cid: Compound of @cpu_id and @mm_cid to make the
+ * compiler emit a single compare on 64-bit
+ * @cpu_id: The CPU ID which was written last to user space
+ * @mm_cid: The MM CID which was written last to user space
+ *
+ * @cpu_id and @mm_cid are updated when the data is written to user space.
+ */
+struct rseq_ids {
+ union {
+ u64 cpu_cid;
+ struct {
+ u32 cpu_id;
+ u32 mm_cid;
+ };
+ };
+};
+
+/**
+ * struct rseq_data - Storage for all rseq related data
+ * @usrptr: Pointer to the registered user space RSEQ memory
+ * @len: Length of the RSEQ region
+ * @sig:		Signature of critical section abort IPs
+ * @event: Storage for event management
+ * @ids: Storage for cached CPU ID and MM CID
+ */
+struct rseq_data {
+ struct rseq __user *usrptr;
+ u32 len;
+ u32 sig;
+ struct rseq_event event;
+ struct rseq_ids ids;
+};
+
+#else /* CONFIG_RSEQ */
+struct rseq_data { };
+#endif /* !CONFIG_RSEQ */
+
+#ifdef CONFIG_SCHED_MM_CID
+
+#define MM_CID_UNSET BIT(31)
+#define MM_CID_ONCPU BIT(30)
+#define MM_CID_TRANSIT BIT(29)
+
+/**
+ * struct sched_mm_cid - Storage for per task MM CID data
+ * @active: MM CID is active for the task
+ * @cid: The CID associated to the task either permanently or
+ * borrowed from the CPU
+ */
+struct sched_mm_cid {
+ unsigned int active;
+ unsigned int cid;
+};
+
+/**
+ * struct mm_cid_pcpu - Storage for per CPU MM_CID data
+ * @cid: The CID associated to the CPU either permanently or
+ * while a task with a CID is running
+ */
+struct mm_cid_pcpu {
+ unsigned int cid;
+} ____cacheline_aligned_in_smp;
+
+/**
+ * struct mm_mm_cid - Storage for per MM CID data
+ * @pcpu: Per CPU storage for CIDs associated to a CPU
+ * @percpu: Set, when CIDs are in per CPU mode
+ * @transit: Set to MM_CID_TRANSIT during a mode change transition phase
+ * @max_cids: The exclusive maximum CID value for allocation and convergence
+ * @irq_work: irq_work to handle the affinity mode change case
+ * @work: Regular work to handle the affinity mode change case
+ * @lock: Spinlock to protect against affinity setting which can't take @mutex
+ * @mutex: Mutex to serialize forks and exits related to this mm
+ * @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map
+ * only ever grows.
+ * @users: The number of tasks sharing this MM. Separate from mm::mm_users
+ * as that is modified by mmget()/mmput() by other entities which
+ * do not actually share the MM.
+ * @pcpu_thrs: Threshold for switching back from per CPU mode
+ * @update_deferred: A deferred switch back to per task mode is pending.
+ */
+struct mm_mm_cid {
+ /* Hotpath read mostly members */
+ struct mm_cid_pcpu __percpu *pcpu;
+ unsigned int percpu;
+ unsigned int transit;
+ unsigned int max_cids;
+
+ /* Rarely used. Moves @lock and @mutex into the second cacheline */
+ struct irq_work irq_work;
+ struct work_struct work;
+
+ raw_spinlock_t lock;
+ struct mutex mutex;
+
+ /* Low frequency modified */
+ unsigned int nr_cpus_allowed;
+ unsigned int users;
+ unsigned int pcpu_thrs;
+ unsigned int update_deferred;
+} ____cacheline_aligned_in_smp;
+#else /* CONFIG_SCHED_MM_CID */
+struct mm_mm_cid { };
+struct sched_mm_cid { };
+#endif /* !CONFIG_SCHED_MM_CID */
+
+#endif
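
The layout guarantees documented for struct rseq_event can be checked at build time. A hedged sketch of such assertions, assuming CONFIG_RSEQ=y; they are not part of the header:

#include <linux/build_bug.h>
#include <linux/rseq_types.h>
#include <linux/stddef.h>

/* Everything fits into a single u64 store and the sched_switch/ids_changed
 * pair is adjacent and 16-bit aligned as documented above. */
static_assert(sizeof(struct rseq_event) == sizeof(u64));
static_assert(offsetof(struct rseq_event, ids_changed) ==
	      offsetof(struct rseq_event, sched_switch) + 1);
static_assert(offsetof(struct rseq_event, sched_switch) % 2 == 0);
static_assert(sizeof(struct rseq_ids) == sizeof(u64));
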
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e84bc5bce816..93af4d10ba88 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -41,7 +41,7 @@
#include <linux/task_io_accounting.h>
#include <linux/posix-timers_types.h>
#include <linux/restart_block.h>
-#include <uapi/linux/rseq.h>
+#include <linux/rseq_types.h>
#include <linux/seqlock_types.h>
#include <linux/kcsan.h>
#include <linux/rv.h>
@@ -1406,33 +1406,8 @@ struct task_struct {
unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */
-#ifdef CONFIG_RSEQ
- struct rseq __user *rseq;
- u32 rseq_len;
- u32 rseq_sig;
- /*
- * RmW on rseq_event_mask must be performed atomically
- * with respect to preemption.
- */
- unsigned long rseq_event_mask;
-# ifdef CONFIG_DEBUG_RSEQ
- /*
- * This is a place holder to save a copy of the rseq fields for
- * validation of read-only fields. The struct rseq has a
- * variable-length array at the end, so it cannot be used
- * directly. Reserve a size large enough for the known fields.
- */
- char rseq_fields[sizeof(struct rseq)];
-# endif
-#endif
-
-#ifdef CONFIG_SCHED_MM_CID
- int mm_cid; /* Current cid in mm */
- int last_mm_cid; /* Most recent cid in mm */
- int migrate_from_cpu;
- int mm_cid_active; /* Whether cid bitmap is active */
- struct callback_head cid_work;
-#endif
+ struct rseq_data rseq;
+ struct sched_mm_cid mm_cid;
struct tlbflush_unmap_batch tlb_ubc;
@@ -2325,6 +2300,32 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo
#define alloc_tag_restore(_tag, _old) do {} while (0)
#endif
+/* Avoids recursive inclusion hell */
+#ifdef CONFIG_SCHED_MM_CID
+void sched_mm_cid_before_execve(struct task_struct *t);
+void sched_mm_cid_after_execve(struct task_struct *t);
+void sched_mm_cid_fork(struct task_struct *t);
+void sched_mm_cid_exit(struct task_struct *t);
+static __always_inline int task_mm_cid(struct task_struct *t)
+{
+ return t->mm_cid.cid & ~(MM_CID_ONCPU | MM_CID_TRANSIT);
+}
+#else
+static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
+static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
+static inline void sched_mm_cid_fork(struct task_struct *t) { }
+static inline void sched_mm_cid_exit(struct task_struct *t) { }
+static __always_inline int task_mm_cid(struct task_struct *t)
+{
+ /*
+ * Use the processor id as a fall-back when the mm cid feature is
+ * disabled. This provides functional per-cpu data structure accesses
+ * in user-space, although it won't provide the memory usage benefits.
+ */
+ return task_cpu(t);
+}
+#endif
+
#ifndef MODULE
#ifndef COMPILE_OFFSETS
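
task_mm_cid() strips the internal flag bits before the value reaches user space; a small hedged illustration, not part of the patch:

/* A cid tagged as on-CPU still reads back as the plain index. */
static inline void sketch_cid_flags(void)
{
	unsigned int raw = MM_CID_ONCPU | 3;

	WARN_ON_ONCE((raw & ~(MM_CID_ONCPU | MM_CID_TRANSIT)) != 3);
}
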
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index dd925d84fa46..b40de9bab4b7 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -67,6 +67,11 @@ enum syscall_work_bit {
#define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED
#endif
+#ifndef TIF_RSEQ
+# define TIF_RSEQ TIF_NOTIFY_RESUME
+# define _TIF_RSEQ _TIF_NOTIFY_RESUME
+#endif
+
#ifdef __KERNEL__
#ifndef arch_set_restart_data