diff options
Diffstat (limited to 'kernel/rseq.c')
| -rw-r--r-- | kernel/rseq.c | 66 |
1 file changed, 44 insertions, 22 deletions
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 2452b7366b00..246319d7cb0c 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -324,9 +324,9 @@ static bool rseq_warn_flags(const char *str, u32 flags)
 	return true;
 }
 
-static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
+static int rseq_check_flags(struct task_struct *t, u32 cs_flags)
 {
-	u32 flags, event_mask;
+	u32 flags;
 	int ret;
 
 	if (rseq_warn_flags("rseq_cs", cs_flags))
@@ -339,17 +339,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
 
 	if (rseq_warn_flags("rseq", flags))
 		return -EINVAL;
-
-	/*
-	 * Load and clear event mask atomically with respect to
-	 * scheduler preemption and membarrier IPIs.
-	 */
-	scoped_guard(RSEQ_EVENT_GUARD) {
-		event_mask = t->rseq_event_mask;
-		t->rseq_event_mask = 0;
-	}
-
-	return !!event_mask;
+	return 0;
 }
 
 static int clear_rseq_cs(struct rseq __user *rseq)
@@ -380,7 +370,7 @@ static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs)
 	return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset;
 }
 
-static int rseq_ip_fixup(struct pt_regs *regs)
+static int rseq_ip_fixup(struct pt_regs *regs, bool abort)
 {
 	unsigned long ip = instruction_pointer(regs);
 	struct task_struct *t = current;
@@ -398,9 +388,11 @@ static int rseq_ip_fixup(struct pt_regs *regs)
 	 */
 	if (!in_rseq_cs(ip, &rseq_cs))
 		return clear_rseq_cs(t->rseq);
-	ret = rseq_need_restart(t, rseq_cs.flags);
-	if (ret <= 0)
+	ret = rseq_check_flags(t, rseq_cs.flags);
+	if (ret < 0)
 		return ret;
+	if (!abort)
+		return 0;
 	ret = clear_rseq_cs(t->rseq);
 	if (ret)
 		return ret;
@@ -430,14 +422,44 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
 		return;
 
 	/*
-	 * regs is NULL if and only if the caller is in a syscall path. Skip
-	 * fixup and leave rseq_cs as is so that rseq_sycall() will detect and
-	 * kill a misbehaving userspace on debug kernels.
+	 * If invoked from hypervisors or IO-URING, then @regs is a NULL
+	 * pointer, so fixup cannot be done. If the syscall which led to
+	 * this invocation was invoked inside a critical section, then it
+	 * will either end up in this code again or a possible violation of
+	 * a syscall inside a critical region can only be detected by the
+	 * debug code in rseq_syscall() in a debug enabled kernel.
 	 */
 	if (regs) {
-		ret = rseq_ip_fixup(regs);
-		if (unlikely(ret < 0))
-			goto error;
+		/*
+		 * Read and clear the event mask first. If the task was not
+		 * preempted or migrated or a signal is on the way, there
+		 * is no point in doing any of the heavy lifting here on
+		 * production kernels. In that case TIF_NOTIFY_RESUME was
+		 * raised by some other functionality.
+		 *
+		 * This is correct because the read/clear operation is
+		 * guarded against scheduler preemption, which makes it CPU
+		 * local atomic. If the task is preempted right after
+		 * re-enabling preemption then TIF_NOTIFY_RESUME is set
+		 * again and this function is invoked another time _before_
+		 * the task is able to return to user mode.
+		 *
+		 * On a debug kernel, invoke the fixup code unconditionally
+		 * with the result handed in to allow the detection of
+		 * inconsistencies.
+		 */
+		u32 event_mask;
+
+		scoped_guard(RSEQ_EVENT_GUARD) {
+			event_mask = t->rseq_event_mask;
+			t->rseq_event_mask = 0;
+		}
+
+		if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event_mask) {
+			ret = rseq_ip_fixup(regs, !!event_mask);
+			if (unlikely(ret < 0))
+				goto error;
+		}
 	}
 	if (unlikely(rseq_update_cpu_node_id(t)))
 		goto error;
