summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDmitry Vyukov <dvyukov@google.com>2025-05-21 17:04:29 +0200
committerThomas Gleixner <tglx@linutronix.de>2025-06-13 18:36:39 +0200
commita2fc422ed75748eef2985454e97847fb22f873c2 (patch)
tree811388d91b3d50b1b65b3b7230174de8a36448ce
parentb89732c8c8357487185f260a723a060b3476144e (diff)
syscall_user_dispatch: Add PR_SYS_DISPATCH_INCLUSIVE_ON
There are two possible scenarios for syscall filtering: - having a trusted/allowed range of PCs, and intercepting everything else - or the opposite: a single untrusted/intercepted range and allowing everything else (this is relevant for any kind of sandboxing scenario, or monitoring behavior of a single library) The current API only allows the former use case due to allowed range wrap-around check. Add PR_SYS_DISPATCH_INCLUSIVE_ON that enables the second use case. Add PR_SYS_DISPATCH_EXCLUSIVE_ON alias for PR_SYS_DISPATCH_ON to make it clear how it's different from the new PR_SYS_DISPATCH_INCLUSIVE_ON. Signed-off-by: Dmitry Vyukov <dvyukov@google.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Link: https://lore.kernel.org/all/97947cc8e205ff49675826d7b0327ef2e2c66eea.1747839857.git.dvyukov@google.com
-rw-r--r--Documentation/admin-guide/syscall-user-dispatch.rst23
-rw-r--r--include/uapi/linux/prctl.h7
-rw-r--r--kernel/entry/syscall_user_dispatch.c36
-rw-r--r--tools/include/uapi/linux/prctl.h7
4 files changed, 49 insertions, 24 deletions
diff --git a/Documentation/admin-guide/syscall-user-dispatch.rst b/Documentation/admin-guide/syscall-user-dispatch.rst
index e3cfffef5a63..c1768d9e80fa 100644
--- a/Documentation/admin-guide/syscall-user-dispatch.rst
+++ b/Documentation/admin-guide/syscall-user-dispatch.rst
@@ -53,20 +53,25 @@ following prctl:
prctl(PR_SET_SYSCALL_USER_DISPATCH, <op>, <offset>, <length>, [selector])
-<op> is either PR_SYS_DISPATCH_ON or PR_SYS_DISPATCH_OFF, to enable and
-disable the mechanism globally for that thread. When
-PR_SYS_DISPATCH_OFF is used, the other fields must be zero.
-
-[<offset>, <offset>+<length>) delimit a memory region interval
-from which syscalls are always executed directly, regardless of the
-userspace selector. This provides a fast path for the C library, which
-includes the most common syscall dispatchers in the native code
-applications, and also provides a way for the signal handler to return
+<op> is either PR_SYS_DISPATCH_EXCLUSIVE_ON/PR_SYS_DISPATCH_INCLUSIVE_ON
+or PR_SYS_DISPATCH_OFF, to enable and disable the mechanism globally for
+that thread. When PR_SYS_DISPATCH_OFF is used, the other fields must be zero.
+
+For PR_SYS_DISPATCH_EXCLUSIVE_ON [<offset>, <offset>+<length>) delimit
+a memory region interval from which syscalls are always executed directly,
+regardless of the userspace selector. This provides a fast path for the
+C library, which includes the most common syscall dispatchers in the native
+code applications, and also provides a way for the signal handler to return
without triggering a nested SIGSYS on (rt\_)sigreturn. Users of this
interface should make sure that at least the signal trampoline code is
included in this region. In addition, for syscalls that implement the
trampoline code on the vDSO, that trampoline is never intercepted.
+For PR_SYS_DISPATCH_INCLUSIVE_ON [<offset>, <offset>+<length>) delimit
+a memory region interval from which syscalls are dispatched based on
+the userspace selector. Syscalls from outside of the range are always
+executed directly.
+
[selector] is a pointer to a char-sized region in the process memory
region, that provides a quick way to enable disable syscall redirection
thread-wide, without the need to invoke the kernel directly. selector
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 43dec6eed559..9785c1d49f05 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -255,7 +255,12 @@ struct prctl_mm_map {
/* Dispatch syscalls to a userspace handler */
#define PR_SET_SYSCALL_USER_DISPATCH 59
# define PR_SYS_DISPATCH_OFF 0
-# define PR_SYS_DISPATCH_ON 1
+/* Enable dispatch except for the specified range */
+# define PR_SYS_DISPATCH_EXCLUSIVE_ON 1
+/* Enable dispatch for the specified range */
+# define PR_SYS_DISPATCH_INCLUSIVE_ON 2
+/* Legacy name for backwards compatibility */
+# define PR_SYS_DISPATCH_ON PR_SYS_DISPATCH_EXCLUSIVE_ON
/* The control values for the user space selector when dispatch is enabled */
# define SYSCALL_DISPATCH_FILTER_ALLOW 0
# define SYSCALL_DISPATCH_FILTER_BLOCK 1
diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c
index 5340c5aa89e7..a9055eccb27e 100644
--- a/kernel/entry/syscall_user_dispatch.c
+++ b/kernel/entry/syscall_user_dispatch.c
@@ -78,7 +78,7 @@ static int task_set_syscall_user_dispatch(struct task_struct *task, unsigned lon
if (offset || len || selector)
return -EINVAL;
break;
- case PR_SYS_DISPATCH_ON:
+ case PR_SYS_DISPATCH_EXCLUSIVE_ON:
/*
* Validate the direct dispatcher region just for basic
* sanity against overflow and a 0-sized dispatcher
@@ -87,30 +87,40 @@ static int task_set_syscall_user_dispatch(struct task_struct *task, unsigned lon
*/
if (offset && offset + len <= offset)
return -EINVAL;
-
+ break;
+ case PR_SYS_DISPATCH_INCLUSIVE_ON:
+ if (len == 0 || offset + len <= offset)
+ return -EINVAL;
/*
- * access_ok() will clear memory tags for tagged addresses
- * if current has memory tagging enabled.
-
- * To enable a tracer to set a tracees selector the
- * selector address must be untagged for access_ok(),
- * otherwise an untagged tracer will always fail to set a
- * tagged tracees selector.
+ * Invert the range, the check in syscall_user_dispatch()
+ * supports wrap-around.
*/
- if (selector && !access_ok(untagged_addr(selector), sizeof(*selector)))
- return -EFAULT;
-
+ offset = offset + len;
+ len = -len;
break;
default:
return -EINVAL;
}
+ /*
+ * access_ok() will clear memory tags for tagged addresses
+ * if current has memory tagging enabled.
+ *
+ * To enable a tracer to set a tracees selector the
+ * selector address must be untagged for access_ok(),
+ * otherwise an untagged tracer will always fail to set a
+ * tagged tracees selector.
+ */
+ if (mode != PR_SYS_DISPATCH_OFF && selector &&
+ !access_ok(untagged_addr(selector), sizeof(*selector)))
+ return -EFAULT;
+
task->syscall_dispatch.selector = selector;
task->syscall_dispatch.offset = offset;
task->syscall_dispatch.len = len;
task->syscall_dispatch.on_dispatch = false;
- if (mode == PR_SYS_DISPATCH_ON)
+ if (mode != PR_SYS_DISPATCH_OFF)
set_task_syscall_work(task, SYSCALL_USER_DISPATCH);
else
clear_task_syscall_work(task, SYSCALL_USER_DISPATCH);
diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h
index 43dec6eed559..9785c1d49f05 100644
--- a/tools/include/uapi/linux/prctl.h
+++ b/tools/include/uapi/linux/prctl.h
@@ -255,7 +255,12 @@ struct prctl_mm_map {
/* Dispatch syscalls to a userspace handler */
#define PR_SET_SYSCALL_USER_DISPATCH 59
# define PR_SYS_DISPATCH_OFF 0
-# define PR_SYS_DISPATCH_ON 1
+/* Enable dispatch except for the specified range */
+# define PR_SYS_DISPATCH_EXCLUSIVE_ON 1
+/* Enable dispatch for the specified range */
+# define PR_SYS_DISPATCH_INCLUSIVE_ON 2
+/* Legacy name for backwards compatibility */
+# define PR_SYS_DISPATCH_ON PR_SYS_DISPATCH_EXCLUSIVE_ON
/* The control values for the user space selector when dispatch is enabled */
# define SYSCALL_DISPATCH_FILTER_ALLOW 0
# define SYSCALL_DISPATCH_FILTER_BLOCK 1