 fs/binfmt_elf.c            |   2
 include/linux/rseq.h       |  16
 include/linux/rseq_entry.h |  89
 include/linux/sched.h      |  10
 kernel/rseq.c              | 236
5 files changed, 151 insertions, 202 deletions
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index e4653bb99946..3eb734c192e9 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -46,7 +46,7 @@
 #include <linux/cred.h>
 #include <linux/dax.h>
 #include <linux/uaccess.h>
-#include <linux/rseq.h>
+#include <uapi/linux/rseq.h>
 #include <asm/param.h>
 #include <asm/page.h>
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index 7f347c3a4af8..92f9cd49489b 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -5,6 +5,8 @@
 #ifdef CONFIG_RSEQ
 #include <linux/sched.h>
 
+#include <uapi/linux/rseq.h>
+
 void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);
 
 static inline void rseq_handle_notify_resume(struct pt_regs *regs)
@@ -48,7 +50,7 @@ static inline void rseq_virt_userspace_exit(void)
 static inline void rseq_reset(struct task_struct *t)
 {
 	memset(&t->rseq, 0, sizeof(t->rseq));
-	t->rseq.ids.cpu_cid = ~0ULL;
+	t->rseq.ids.cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
 }
 
 static inline void rseq_execve(struct task_struct *t)
@@ -59,15 +61,19 @@ static inline void rseq_execve(struct task_struct *t)
 /*
  * If parent process has a registered restartable sequences area, the
  * child inherits. Unregister rseq for a clone with CLONE_VM set.
+ *
+ * On fork, keep the IDs (CPU, MMCID) of the parent, which avoids a fault
+ * on the COW page on exit to user space, when the child stays on the same
+ * CPU as the parent. That's obviously not guaranteed, but in overcommit
+ * scenarios it is more likely and optimizes for the fork/exec case without
+ * taking the fault.
  */
 static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
 {
-	if (clone_flags & CLONE_VM) {
+	if (clone_flags & CLONE_VM)
 		rseq_reset(t);
-	} else {
+	else
 		t->rseq = current->rseq;
-		t->rseq.ids.cpu_cid = ~0ULL;
-	}
 }
 
 #else /* CONFIG_RSEQ */
diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h
index fb53a6ff05d7..37444e80fd45 100644
--- a/include/linux/rseq_entry.h
+++ b/include/linux/rseq_entry.h
@@ -75,6 +75,7 @@ DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
 #endif
 
 bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
+bool rseq_debug_validate_ids(struct task_struct *t);
 
 static __always_inline void rseq_note_user_irq_entry(void)
 {
@@ -194,6 +195,43 @@ efault:
 	return false;
 }
 
+/*
+ * On debug kernels validate that user space did not mess with it if the
+ * debug branch is enabled.
+ */
+bool rseq_debug_validate_ids(struct task_struct *t)
+{
+	struct rseq __user *rseq = t->rseq.usrptr;
+	u32 cpu_id, uval, node_id;
+
+	/*
+	 * On the first exit after registering the rseq region CPU ID is
+	 * RSEQ_CPU_ID_UNINITIALIZED and node_id in user space is 0!
+	 */
+	node_id = t->rseq.ids.cpu_id != RSEQ_CPU_ID_UNINITIALIZED ?
+		  cpu_to_node(t->rseq.ids.cpu_id) : 0;
+
+	scoped_user_read_access(rseq, efault) {
+		unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault);
+		if (cpu_id != t->rseq.ids.cpu_id)
+			goto die;
+		unsafe_get_user(uval, &rseq->cpu_id, efault);
+		if (uval != cpu_id)
+			goto die;
+		unsafe_get_user(uval, &rseq->node_id, efault);
+		if (uval != node_id)
+			goto die;
+		unsafe_get_user(uval, &rseq->mm_cid, efault);
+		if (uval != t->rseq.ids.mm_cid)
+			goto die;
+	}
+	return true;
+die:
+	t->rseq.event.fatal = true;
+efault:
+	return false;
+}
+
 #endif /* RSEQ_BUILD_SLOW_PATH */
 
 /*
@@ -279,6 +317,57 @@ efault:
 	return false;
 }
 
+/*
+ * Updates CPU ID, Node ID and MM CID and reads the critical section
+ * address, when @csaddr != NULL. This allows to put the ID update and the
+ * read under the same uaccess region to spare a separate begin/end.
+ *
+ * As this is either invoked from a C wrapper with @csaddr = NULL or from
+ * the fast path code with a valid pointer, a clever compiler should be
+ * able to optimize the read out. Spares a duplicate implementation.
+ *
+ * Returns true, if the operation was successful, false otherwise.
+ *
+ * In the failure case task::rseq_event::fatal is set when invalid data
+ * was found on debug kernels. It's clear when the failure was an
+ * unresolved page fault.
+ *
+ * If inlined into the exit to user path with interrupts disabled, the
+ * caller has to protect against page faults with pagefault_disable().
+ *
+ * In preemptible task context this would be counterproductive as the page
+ * faults could not be fully resolved. As a consequence unresolved page
+ * faults in task context are fatal too.
+ */
+static rseq_inline
+bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids,
+			     u32 node_id, u64 *csaddr)
+{
+	struct rseq __user *rseq = t->rseq.usrptr;
+
+	if (static_branch_unlikely(&rseq_debug_enabled)) {
+		if (!rseq_debug_validate_ids(t))
+			return false;
+	}
+
+	scoped_user_rw_access(rseq, efault) {
+		unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault);
+		unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault);
+		unsafe_put_user(node_id, &rseq->node_id, efault);
+		unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault);
+		if (csaddr)
+			unsafe_get_user(*csaddr, &rseq->rseq_cs, efault);
+	}
+
+	/* Cache the new values */
+	t->rseq.ids.cpu_cid = ids->cpu_cid;
+	rseq_stat_inc(rseq_stats.ids);
+	rseq_trace_update(t, ids);
+	return true;
+efault:
+	return false;
+}
+
 static __always_inline void rseq_exit_to_user_mode(void)
 {
 	struct rseq_event *ev = &current->rseq.event;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 24a9da7ca3e7..e47abc8685d7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -42,7 +42,6 @@
 #include <linux/posix-timers_types.h>
 #include <linux/restart_block.h>
 #include <linux/rseq_types.h>
-#include <uapi/linux/rseq.h>
 #include <linux/seqlock_types.h>
 #include <linux/kcsan.h>
 #include <linux/rv.h>
@@ -1408,15 +1407,6 @@ struct task_struct {
 #endif /* CONFIG_NUMA_BALANCING */
 
 	struct rseq_data		rseq;
-#ifdef CONFIG_DEBUG_RSEQ
-	/*
-	 * This is a place holder to save a copy of the rseq fields for
-	 * validation of read-only fields. The struct rseq has a
-	 * variable-length array at the end, so it cannot be used
-	 * directly. Reserve a size large enough for the known fields.
-	 */
-	char				rseq_fields[sizeof(struct rseq)];
-#endif
 
 #ifdef CONFIG_SCHED_MM_CID
 	int				mm_cid;		/* Current cid in mm */
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 97631554ae96..1e4f1d2cdfe5 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -88,13 +88,6 @@
 # define RSEQ_EVENT_GUARD	preempt
 #endif
 
-/* The original rseq structure size (including padding) is 32 bytes. */
-#define ORIG_RSEQ_SIZE		32
-
-#define RSEQ_CS_NO_RESTART_FLAGS	(RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \
-					 RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \
-					 RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)
-
 DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
 
 static inline void rseq_control_debug(bool on)
@@ -227,159 +220,9 @@ static int __init rseq_debugfs_init(void)
 __initcall(rseq_debugfs_init);
 #endif /* CONFIG_DEBUG_FS */
 
-#ifdef CONFIG_DEBUG_RSEQ
-static struct rseq *rseq_kernel_fields(struct task_struct *t)
-{
-	return (struct rseq *) t->rseq_fields;
-}
-
-static int rseq_validate_ro_fields(struct task_struct *t)
-{
-	static DEFINE_RATELIMIT_STATE(_rs,
-				      DEFAULT_RATELIMIT_INTERVAL,
-				      DEFAULT_RATELIMIT_BURST);
-	u32 cpu_id_start, cpu_id, node_id, mm_cid;
-	struct rseq __user *rseq = t->rseq.usrptr;
-
-	/*
-	 * Validate fields which are required to be read-only by
-	 * user-space.
-	 */
-	if (!user_read_access_begin(rseq, t->rseq.len))
-		goto efault;
-	unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end);
-	unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end);
-	unsafe_get_user(node_id, &rseq->node_id, efault_end);
-	unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end);
-	user_read_access_end();
-
-	if ((cpu_id_start != rseq_kernel_fields(t)->cpu_id_start ||
-	     cpu_id != rseq_kernel_fields(t)->cpu_id ||
-	     node_id != rseq_kernel_fields(t)->node_id ||
-	     mm_cid != rseq_kernel_fields(t)->mm_cid) && __ratelimit(&_rs)) {
-
-		pr_warn("Detected rseq corruption for pid: %d, name: %s\n"
-			"\tcpu_id_start: %u ?= %u\n"
-			"\tcpu_id: %u ?= %u\n"
-			"\tnode_id: %u ?= %u\n"
-			"\tmm_cid: %u ?= %u\n",
-			t->pid, t->comm,
-			cpu_id_start, rseq_kernel_fields(t)->cpu_id_start,
-			cpu_id, rseq_kernel_fields(t)->cpu_id,
-			node_id, rseq_kernel_fields(t)->node_id,
-			mm_cid, rseq_kernel_fields(t)->mm_cid);
-	}
-
-	/* For now, only print a console warning on mismatch. */
-	return 0;
-
-efault_end:
-	user_read_access_end();
-efault:
-	return -EFAULT;
-}
-
-/*
- * Update an rseq field and its in-kernel copy in lock-step to keep a coherent
- * state.
- */
-#define rseq_unsafe_put_user(t, value, field, error_label)		\
-	do {								\
-		unsafe_put_user(value, &t->rseq.usrptr->field, error_label); \
-		rseq_kernel_fields(t)->field = value;			\
-	} while (0)
-
-#else
-static int rseq_validate_ro_fields(struct task_struct *t)
-{
-	return 0;
-}
-
-#define rseq_unsafe_put_user(t, value, field, error_label)		\
-	unsafe_put_user(value, &t->rseq.usrptr->field, error_label)
-#endif
-
-static int rseq_update_cpu_node_id(struct task_struct *t)
-{
-	struct rseq __user *rseq = t->rseq.usrptr;
-	u32 cpu_id = raw_smp_processor_id();
-	u32 node_id = cpu_to_node(cpu_id);
-	u32 mm_cid = task_mm_cid(t);
-
-	rseq_stat_inc(rseq_stats.ids);
-
-	/* Validate read-only rseq fields on debug kernels */
-	if (rseq_validate_ro_fields(t))
-		goto efault;
-	WARN_ON_ONCE((int) mm_cid < 0);
-
-	if (!user_write_access_begin(rseq, t->rseq.len))
-		goto efault;
-
-	rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end);
-	rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end);
-	rseq_unsafe_put_user(t, node_id, node_id, efault_end);
-	rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end);
-
-	/* Cache the user space values */
-	t->rseq.ids.cpu_id = cpu_id;
-	t->rseq.ids.mm_cid = mm_cid;
-
-	/*
-	 * Additional feature fields added after ORIG_RSEQ_SIZE
-	 * need to be conditionally updated only if
-	 * t->rseq_len != ORIG_RSEQ_SIZE.
-	 */
-	user_write_access_end();
-	trace_rseq_update(t);
-	return 0;
-
-efault_end:
-	user_write_access_end();
-efault:
-	return -EFAULT;
-}
-
-static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
+static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id)
 {
-	struct rseq __user *rseq = t->rseq.usrptr;
-	u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0,
-	    mm_cid = 0;
-
-	/*
-	 * Validate read-only rseq fields.
-	 */
-	if (rseq_validate_ro_fields(t))
-		goto efault;
-
-	if (!user_write_access_begin(rseq, t->rseq.len))
-		goto efault;
-
-	/*
-	 * Reset all fields to their initial state.
-	 *
-	 * All fields have an initial state of 0 except cpu_id which is set to
-	 * RSEQ_CPU_ID_UNINITIALIZED, so that any user coming in after
-	 * unregistration can figure out that rseq needs to be registered
-	 * again.
-	 */
-	rseq_unsafe_put_user(t, cpu_id_start, cpu_id_start, efault_end);
-	rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end);
-	rseq_unsafe_put_user(t, node_id, node_id, efault_end);
-	rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end);
-
-	/*
-	 * Additional feature fields added after ORIG_RSEQ_SIZE
-	 * need to be conditionally reset only if
-	 * t->rseq_len != ORIG_RSEQ_SIZE.
-	 */
-	user_write_access_end();
-	return 0;
-
-efault_end:
-	user_write_access_end();
-efault:
-	return -EFAULT;
+	return rseq_set_ids_get_csaddr(t, ids, node_id, NULL);
 }
 
 static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs)
@@ -410,6 +253,8 @@ efault:
 	return false;
 }
 
 void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
 {
 	struct task_struct *t = current;
+	struct rseq_ids ids;
+	u32 node_id;
 	bool event;
 	int sig;
@@ -456,6 +301,8 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
 	scoped_guard(RSEQ_EVENT_GUARD) {
 		event = t->rseq.event.sched_switch;
 		t->rseq.event.sched_switch = false;
+		ids.cpu_id = task_cpu(t);
+		ids.mm_cid = task_mm_cid(t);
 	}
 
 	if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event)
@@ -464,7 +311,8 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
 	if (!rseq_handle_cs(t, regs))
 		goto error;
 
-	if (unlikely(rseq_update_cpu_node_id(t)))
+	node_id = cpu_to_node(ids.cpu_id);
+	if (!rseq_set_ids(t, &ids, node_id))
 		goto error;
 	return;
 
@@ -504,13 +352,33 @@ void rseq_syscall(struct pt_regs *regs)
 }
 #endif
 
+static bool rseq_reset_ids(void)
+{
+	struct rseq_ids ids = {
+		.cpu_id		= RSEQ_CPU_ID_UNINITIALIZED,
+		.mm_cid		= 0,
+	};
+
+	/*
+	 * If this fails, terminate it because this leaves the kernel in
+	 * stupid state as exit to user space will try to fixup the ids
+	 * again.
+	 */
+	if (rseq_set_ids(current, &ids, 0))
+		return true;
+
+	force_sig(SIGSEGV);
+	return false;
+}
+
+/* The original rseq structure size (including padding) is 32 bytes. */
+#define ORIG_RSEQ_SIZE		32
+
 /*
  * sys_rseq - setup restartable sequences for caller thread.
  */
 SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig)
 {
-	int ret;
-
 	if (flags & RSEQ_FLAG_UNREGISTER) {
 		if (flags & ~RSEQ_FLAG_UNREGISTER)
 			return -EINVAL;
@@ -521,9 +389,8 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
 			return -EINVAL;
 		if (current->rseq.sig != sig)
 			return -EPERM;
-		ret = rseq_reset_rseq_cpu_node_id(current);
-		if (ret)
-			return ret;
+		if (!rseq_reset_ids())
+			return -EFAULT;
 		rseq_reset(current);
 		return 0;
 	}
@@ -563,27 +430,22 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
 	if (!access_ok(rseq, rseq_len))
 		return -EFAULT;
 
-	/*
-	 * If the rseq_cs pointer is non-NULL on registration, clear it to
-	 * avoid a potential segfault on return to user-space. The proper thing
-	 * to do would have been to fail the registration but this would break
-	 * older libcs that reuse the rseq area for new threads without
-	 * clearing the fields. Don't bother reading it, just reset it.
-	 */
-	if (put_user(0UL, &rseq->rseq_cs))
-		return -EFAULT;
+	scoped_user_write_access(rseq, efault) {
+		/*
+		 * If the rseq_cs pointer is non-NULL on registration, clear it to
+		 * avoid a potential segfault on return to user-space. The proper thing
+		 * to do would have been to fail the registration but this would break
+		 * older libcs that reuse the rseq area for new threads without
+		 * clearing the fields. Don't bother reading it, just reset it.
+		 */
+		unsafe_put_user(0UL, &rseq->rseq_cs, efault);
+		/* Initialize IDs in user space */
+		unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault);
+		unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault);
+		unsafe_put_user(0U, &rseq->node_id, efault);
+		unsafe_put_user(0U, &rseq->mm_cid, efault);
+	}
 
-#ifdef CONFIG_DEBUG_RSEQ
-	/*
-	 * Initialize the in-kernel rseq fields copy for validation of
-	 * read-only fields.
-	 */
-	if (get_user(rseq_kernel_fields(current)->cpu_id_start, &rseq->cpu_id_start) ||
-	    get_user(rseq_kernel_fields(current)->cpu_id, &rseq->cpu_id) ||
-	    get_user(rseq_kernel_fields(current)->node_id, &rseq->node_id) ||
-	    get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid))
-		return -EFAULT;
-#endif
 	/*
 	 * Activate the registration by setting the rseq area address, length
 	 * and signature in the task struct.
@@ -599,6 +461,8 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32
 	 */
 	current->rseq.event.has_rseq = true;
 	rseq_sched_switch_event(current);
-	return 0;
+	return 0;
+
+efault:
+	return -EFAULT;
 }
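For reference, the registration ABI that the syscall path above now initializes eagerly can be exercised from user space with a minimal sketch along the following lines. This is not part of the patch: it assumes a uapi header new enough to carry node_id/mm_cid, that libc has not already registered an rseq area for the thread (glibc does this by default since 2.35, so run with GLIBC_TUNABLES=glibc.pthread.rseq=0 or the raw syscall will fail), and RSEQ_SIG is an arbitrary example signature.

#define _GNU_SOURCE
#include <linux/rseq.h>		/* UAPI: struct rseq, RSEQ_CPU_ID_UNINITIALIZED */
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

#define RSEQ_SIG	0x53053053	/* example signature; any fixed per-process value */

/* The rseq ABI requires the area to be at least 32-byte aligned. */
static __thread struct rseq rs __attribute__((aligned(32)));

int main(void)
{
	/* sys_rseq(rseq, rseq_len, flags, sig) */
	if (syscall(__NR_rseq, &rs, sizeof(rs), 0, RSEQ_SIG)) {
		perror("rseq register");
		return 1;
	}

	/*
	 * Registration clears rseq_cs, writes RSEQ_CPU_ID_UNINITIALIZED
	 * into cpu_id_start/cpu_id, zeroes node_id/mm_cid and raises the
	 * sched_switch event, so the real CPU, node and mm_cid values are
	 * filled in on the return to user space from this very syscall.
	 */
	printf("cpu_id=%d node_id=%u mm_cid=%u\n",
	       (int)rs.cpu_id, rs.node_id, rs.mm_cid);
	return 0;
}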

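The unregister branch now funnels the ID reset through rseq_reset_ids() before the task state is cleared. A hypothetical continuation of the sketch above (same headers; the helper name and error handling are illustrative, not part of the patch):

static int rseq_unregister(struct rseq *rs, unsigned int sig)
{
	/* Unregistering requires the same area, length and signature. */
	if (syscall(__NR_rseq, rs, sizeof(*rs), RSEQ_FLAG_UNREGISTER, sig))
		return -1;

	/*
	 * rseq_reset_ids() wrote the initial state back before the kernel
	 * dropped the registration: cpu_id reads RSEQ_CPU_ID_UNINITIALIZED
	 * again (and, since the reset is routed through the common
	 * rseq_set_ids() helper, cpu_id_start receives the same value),
	 * while node_id and mm_cid read 0. Consumers should treat
	 * RSEQ_CPU_ID_UNINITIALIZED as "fall back to sched_getcpu()".
	 */
	return 0;
}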