summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/linux/irq-entry-common.h7
-rw-r--r--include/linux/rseq.h10
-rw-r--r--kernel/rseq.c66
3 files changed, 58 insertions, 25 deletions
diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h
index d643c7c87822..e5941df13901 100644
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -2,11 +2,12 @@
#ifndef __LINUX_IRQENTRYCOMMON_H
#define __LINUX_IRQENTRYCOMMON_H
+#include <linux/context_tracking.h>
+#include <linux/kmsan.h>
+#include <linux/rseq.h>
#include <linux/static_call_types.h>
#include <linux/syscalls.h>
-#include <linux/context_tracking.h>
#include <linux/tick.h>
-#include <linux/kmsan.h>
#include <linux/unwind_deferred.h>
#include <asm/entry-common.h>
@@ -226,6 +227,8 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs)
arch_exit_to_user_mode_prepare(regs, ti_work);
+ rseq_exit_to_user_mode();
+
/* Ensure that kernel state is sane for a return to userspace */
kmap_assert_nomap();
lockdep_assert_irqs_disabled();
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index 69553e7c14c1..7622b733a508 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -66,6 +66,14 @@ static inline void rseq_migrate(struct task_struct *t)
rseq_set_notify_resume(t);
}
+static __always_inline void rseq_exit_to_user_mode(void)
+{
+ if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) { /* check only active with CONFIG_DEBUG_RSEQ */
+ if (WARN_ON_ONCE(current->rseq && current->rseq_event_mask)) /* rseq set, yet event bits still pending */
+ current->rseq_event_mask = 0; /* reset the leftover event bits after warning once */
+ }
+}
+
/*
* If parent process has a registered restartable sequences area, the
* child inherits. Unregister rseq for a clone with CLONE_VM set.
@@ -118,7 +126,7 @@ static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
static inline void rseq_execve(struct task_struct *t)
{
}
-
+static inline void rseq_exit_to_user_mode(void) { }
#endif
#ifdef CONFIG_DEBUG_RSEQ
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 2452b7366b00..246319d7cb0c 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -324,9 +324,9 @@ static bool rseq_warn_flags(const char *str, u32 flags)
return true;
}
-static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
+static int rseq_check_flags(struct task_struct *t, u32 cs_flags)
{
- u32 flags, event_mask;
+ u32 flags;
int ret;
if (rseq_warn_flags("rseq_cs", cs_flags))
@@ -339,17 +339,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
if (rseq_warn_flags("rseq", flags))
return -EINVAL;
-
- /*
- * Load and clear event mask atomically with respect to
- * scheduler preemption and membarrier IPIs.
- */
- scoped_guard(RSEQ_EVENT_GUARD) {
- event_mask = t->rseq_event_mask;
- t->rseq_event_mask = 0;
- }
-
- return !!event_mask;
+ return 0;
}
static int clear_rseq_cs(struct rseq __user *rseq)
@@ -380,7 +370,7 @@ static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs)
return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset;
}
-static int rseq_ip_fixup(struct pt_regs *regs)
+static int rseq_ip_fixup(struct pt_regs *regs, bool abort)
{
unsigned long ip = instruction_pointer(regs);
struct task_struct *t = current;
@@ -398,9 +388,11 @@ static int rseq_ip_fixup(struct pt_regs *regs)
*/
if (!in_rseq_cs(ip, &rseq_cs))
return clear_rseq_cs(t->rseq);
- ret = rseq_need_restart(t, rseq_cs.flags);
- if (ret <= 0)
+ ret = rseq_check_flags(t, rseq_cs.flags);
+ if (ret < 0)
return ret;
+ if (!abort)
+ return 0;
ret = clear_rseq_cs(t->rseq);
if (ret)
return ret;
@@ -430,14 +422,44 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
return;
/*
- * regs is NULL if and only if the caller is in a syscall path. Skip
- * fixup and leave rseq_cs as is so that rseq_sycall() will detect and
- * kill a misbehaving userspace on debug kernels.
+ * If invoked from hypervisors or io_uring, @regs is a NULL pointer,
+ * so no fixup can be done. If the syscall which led to this
+ * invocation was issued inside a critical section, then the task
+ * either ends up in this code again, or the violation (a syscall
+ * inside a critical section) can only be detected by the debug
+ * code in rseq_syscall() on a debug enabled kernel.
*/
if (regs) {
- ret = rseq_ip_fixup(regs);
- if (unlikely(ret < 0))
- goto error;
+ /*
+ * Read and clear the event mask first. If the task was neither
+ * preempted nor migrated and no signal is on the way, there
+ * is no point in doing any of the heavy lifting here on
+ * production kernels. In that case TIF_NOTIFY_RESUME was
+ * raised by some other functionality.
+ *
+ * This is correct because the read/clear operation is
+ * guarded against scheduler preemption, which makes it CPU
+ * local atomic. If the task is preempted right after
+ * re-enabling preemption then TIF_NOTIFY_RESUME is set
+ * again and this function is invoked another time _before_
+ * the task is able to return to user mode.
+ *
+ * On a debug kernel, invoke the fixup code unconditionally
+ * and hand in whether an event was pending, which allows the
+ * detection of inconsistencies.
+ */
+ u32 event_mask;
+
+ scoped_guard(RSEQ_EVENT_GUARD) {
+ event_mask = t->rseq_event_mask;
+ t->rseq_event_mask = 0;
+ }
+
+ if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event_mask) {
+ ret = rseq_ip_fixup(regs, !!event_mask);
+ if (unlikely(ret < 0))
+ goto error;
+ }
}
if (unlikely(rseq_update_cpu_node_id(t)))
goto error;