From ce623f89872df4253719be71531116751eeab85f Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Sat, 7 Dec 2019 01:13:27 +1100 Subject: nsfs: clean-up ns_get_path() signature to return int ns_get_path() and ns_get_path_cb() only ever return either NULL or an ERR_PTR. It is far more idiomatic to simply return an integer, and it makes all of the callers of ns_get_path() more straightforward to read. Fixes: e149ed2b805f ("take the targets of /proc/*/ns/* symlinks to separate fs") Signed-off-by: Aleksa Sarai Signed-off-by: Al Viro --- include/linux/proc_ns.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index d31cb6215905..aed366b4795c 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -76,10 +76,10 @@ static inline int ns_alloc_inum(struct ns_common *ns) extern struct file *proc_ns_fget(int fd); #define get_proc_ns(inode) ((struct ns_common *)(inode)->i_private) -extern void *ns_get_path(struct path *path, struct task_struct *task, +extern int ns_get_path(struct path *path, struct task_struct *task, const struct proc_ns_operations *ns_ops); typedef struct ns_common *ns_get_path_helper_t(void *); -extern void *ns_get_path_cb(struct path *path, ns_get_path_helper_t ns_get_cb, +extern int ns_get_path_cb(struct path *path, ns_get_path_helper_t ns_get_cb, void *private_data); extern int ns_get_name(char *buf, size_t size, struct task_struct *task, -- cgit v1.2.3 From 1bc82070fa2763bdca626fa8bde72b35f11e8960 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Sat, 7 Dec 2019 01:13:28 +1100 Subject: namei: allow nd_jump_link() to produce errors In preparation for LOOKUP_NO_MAGICLINKS, it's necessary to add the ability for nd_jump_link() to return an error which the corresponding get_link() caller must propogate back up to the VFS. Suggested-by: Al Viro Signed-off-by: Aleksa Sarai Signed-off-by: Al Viro --- include/linux/namei.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/namei.h b/include/linux/namei.h index 7fe7b87a3ded..b2479cc119c6 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -69,7 +69,7 @@ extern int follow_up(struct path *); extern struct dentry *lock_rename(struct dentry *, struct dentry *); extern void unlock_rename(struct dentry *, struct dentry *); -extern void nd_jump_link(struct path *path); +extern int __must_check nd_jump_link(struct path *path); static inline void nd_terminate_link(void *name, size_t len, size_t maxlen) { -- cgit v1.2.3 From 278121417a72d87fb29dd8c48801f80821e8f75a Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Sat, 7 Dec 2019 01:13:30 +1100 Subject: namei: LOOKUP_NO_SYMLINKS: block symlink resolution /* Background. */ Userspace cannot easily resolve a path without resolving symlinks, and would have to manually resolve each path component with O_PATH and O_NOFOLLOW. This is clearly inefficient, and can be fairly easy to screw up (resulting in possible security bugs). Linus has mentioned that Git has a particular need for this kind of flag[1]. It also resolves a fairly long-standing perceived deficiency in O_NOFOLLOw -- that it only blocks the opening of trailing symlinks. This is part of a refresh of Al's AT_NO_JUMPS patchset[2] (which was a variation on David Drysdale's O_BENEATH patchset[3], which in turn was based on the Capsicum project[4]). /* Userspace API. */ LOOKUP_NO_SYMLINKS will be exposed to userspace through openat2(2). /* Semantics. */ Unlike most other LOOKUP flags (most notably LOOKUP_FOLLOW), LOOKUP_NO_SYMLINKS applies to all components of the path. With LOOKUP_NO_SYMLINKS, any symlink path component encountered during path resolution will yield -ELOOP. If the trailing component is a symlink (and no other components were symlinks), then O_PATH|O_NOFOLLOW will not error out and will instead provide a handle to the trailing symlink -- without resolving it. /* Testing. */ LOOKUP_NO_SYMLINKS is tested as part of the openat2(2) selftests. [1]: https://lore.kernel.org/lkml/CA+55aFyOKM7DW7+0sdDFKdZFXgptb5r1id9=Wvhd8AgSP7qjwQ@mail.gmail.com/ [2]: https://lore.kernel.org/lkml/20170429220414.GT29622@ZenIV.linux.org.uk/ [3]: https://lore.kernel.org/lkml/1415094884-18349-1-git-send-email-drysdale@google.com/ [4]: https://lore.kernel.org/lkml/1404124096-21445-1-git-send-email-drysdale@google.com/ Cc: Christian Brauner Suggested-by: Al Viro Suggested-by: Linus Torvalds Signed-off-by: Aleksa Sarai Signed-off-by: Al Viro --- include/linux/namei.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/namei.h b/include/linux/namei.h index b2479cc119c6..c42a1924ad67 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -39,6 +39,9 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; #define LOOKUP_ROOT 0x2000 #define LOOKUP_ROOT_GRABBED 0x0008 +/* Scoping flags for lookup. */ +#define LOOKUP_NO_SYMLINKS 0x010000 /* No symlink crossing. */ + extern int path_pts(struct path *path); extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty); -- cgit v1.2.3 From 4b99d4996979d582859c5a49072e92de124bf691 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Sat, 7 Dec 2019 01:13:31 +1100 Subject: namei: LOOKUP_NO_MAGICLINKS: block magic-link resolution /* Background. */ There has always been a special class of symlink-like objects in procfs (and a few other pseudo-filesystems) which allow for non-lexical resolution of paths using nd_jump_link(). These "magic-links" do not follow traditional mount namespace boundaries, and have been used consistently in container escape attacks because they can be used to trick unsuspecting privileged processes into resolving unexpected paths. It is also non-trivial for userspace to unambiguously avoid resolving magic-links, because they do not have a reliable indication that they are a magic-link (in order to verify them you'd have to manually open the path given by readlink(2) and then verify that the two file descriptors reference the same underlying file, which is plagued with possible race conditions or supplementary attack scenarios). It would therefore be very helpful for userspace to be able to avoid these symlinks easily, thus hopefully removing a tool from attackers' toolboxes. This is part of a refresh of Al's AT_NO_JUMPS patchset[1] (which was a variation on David Drysdale's O_BENEATH patchset[2], which in turn was based on the Capsicum project[3]). /* Userspace API. */ LOOKUP_NO_MAGICLINKS will be exposed to userspace through openat2(2). /* Semantics. */ Unlike most other LOOKUP flags (most notably LOOKUP_FOLLOW), LOOKUP_NO_MAGICLINKS applies to all components of the path. With LOOKUP_NO_MAGICLINKS, any magic-link path component encountered during path resolution will yield -ELOOP. The handling of ~LOOKUP_FOLLOW for a trailing magic-link is identical to LOOKUP_NO_SYMLINKS. LOOKUP_NO_SYMLINKS implies LOOKUP_NO_MAGICLINKS. /* Testing. */ LOOKUP_NO_MAGICLINKS is tested as part of the openat2(2) selftests. [1]: https://lore.kernel.org/lkml/20170429220414.GT29622@ZenIV.linux.org.uk/ [2]: https://lore.kernel.org/lkml/1415094884-18349-1-git-send-email-drysdale@google.com/ [3]: https://lore.kernel.org/lkml/1404124096-21445-1-git-send-email-drysdale@google.com/ Cc: Christian Brauner Suggested-by: David Drysdale Suggested-by: Al Viro Suggested-by: Andy Lutomirski Suggested-by: Linus Torvalds Signed-off-by: Aleksa Sarai Signed-off-by: Al Viro --- include/linux/namei.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/namei.h b/include/linux/namei.h index c42a1924ad67..f8acec81cf03 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -41,6 +41,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; /* Scoping flags for lookup. */ #define LOOKUP_NO_SYMLINKS 0x010000 /* No symlink crossing. */ +#define LOOKUP_NO_MAGICLINKS 0x020000 /* No nd_jump_link() crossing. */ extern int path_pts(struct path *path); -- cgit v1.2.3 From 72ba29297e1439efaa54d9125b866ae9d15df339 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Sat, 7 Dec 2019 01:13:32 +1100 Subject: namei: LOOKUP_NO_XDEV: block mountpoint crossing /* Background. */ The need to contain path operations within a mountpoint has been a long-standing usecase that userspace has historically implemented manually with liberal usage of stat(). find, rsync, tar and many other programs implement these semantics -- but it'd be much simpler to have a fool-proof way of refusing to open a path if it crosses a mountpoint. This is part of a refresh of Al's AT_NO_JUMPS patchset[1] (which was a variation on David Drysdale's O_BENEATH patchset[2], which in turn was based on the Capsicum project[3]). /* Userspace API. */ LOOKUP_NO_XDEV will be exposed to userspace through openat2(2). /* Semantics. */ Unlike most other LOOKUP flags (most notably LOOKUP_FOLLOW), LOOKUP_NO_XDEV applies to all components of the path. With LOOKUP_NO_XDEV, any path component which crosses a mount-point during path resolution (including "..") will yield an -EXDEV. Absolute paths, absolute symlinks, and magic-links will only yield an -EXDEV if the jump involved changing mount-points. /* Testing. */ LOOKUP_NO_XDEV is tested as part of the openat2(2) selftests. [1]: https://lore.kernel.org/lkml/20170429220414.GT29622@ZenIV.linux.org.uk/ [2]: https://lore.kernel.org/lkml/1415094884-18349-1-git-send-email-drysdale@google.com/ [3]: https://lore.kernel.org/lkml/1404124096-21445-1-git-send-email-drysdale@google.com/ Cc: Christian Brauner Suggested-by: David Drysdale Suggested-by: Al Viro Suggested-by: Andy Lutomirski Suggested-by: Linus Torvalds Signed-off-by: Aleksa Sarai Signed-off-by: Al Viro --- include/linux/namei.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/namei.h b/include/linux/namei.h index f8acec81cf03..edb4562b5c4e 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -42,6 +42,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; /* Scoping flags for lookup. */ #define LOOKUP_NO_SYMLINKS 0x010000 /* No symlink crossing. */ #define LOOKUP_NO_MAGICLINKS 0x020000 /* No nd_jump_link() crossing. */ +#define LOOKUP_NO_XDEV 0x040000 /* No mountpoint crossing. */ extern int path_pts(struct path *path); -- cgit v1.2.3 From adb21d2b526f7f196b2f3fdca97d80ba05dd14a0 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Sat, 7 Dec 2019 01:13:33 +1100 Subject: namei: LOOKUP_BENEATH: O_BENEATH-like scoped resolution /* Background. */ There are many circumstances when userspace wants to resolve a path and ensure that it doesn't go outside of a particular root directory during resolution. Obvious examples include archive extraction tools, as well as other security-conscious userspace programs. FreeBSD spun out O_BENEATH from their Capsicum project[1,2], so it also seems reasonable to implement similar functionality for Linux. This is part of a refresh of Al's AT_NO_JUMPS patchset[3] (which was a variation on David Drysdale's O_BENEATH patchset[4], which in turn was based on the Capsicum project[5]). /* Userspace API. */ LOOKUP_BENEATH will be exposed to userspace through openat2(2). /* Semantics. */ Unlike most other LOOKUP flags (most notably LOOKUP_FOLLOW), LOOKUP_BENEATH applies to all components of the path. With LOOKUP_BENEATH, any path component which attempts to "escape" the starting point of the filesystem lookup (the dirfd passed to openat) will yield -EXDEV. Thus, all absolute paths and symlinks are disallowed. Due to a security concern brought up by Jann[6], any ".." path components are also blocked. This restriction will be lifted in a future patch, but requires more work to ensure that permitting ".." is done safely. Magic-link jumps are also blocked, because they can beam the path lookup across the starting point. It would be possible to detect and block only the "bad" crossings with path_is_under() checks, but it's unclear whether it makes sense to permit magic-links at all. However, userspace is recommended to pass LOOKUP_NO_MAGICLINKS if they want to ensure that magic-link crossing is entirely disabled. /* Testing. */ LOOKUP_BENEATH is tested as part of the openat2(2) selftests. [1]: https://reviews.freebsd.org/D2808 [2]: https://reviews.freebsd.org/D17547 [3]: https://lore.kernel.org/lkml/20170429220414.GT29622@ZenIV.linux.org.uk/ [4]: https://lore.kernel.org/lkml/1415094884-18349-1-git-send-email-drysdale@google.com/ [5]: https://lore.kernel.org/lkml/1404124096-21445-1-git-send-email-drysdale@google.com/ [6]: https://lore.kernel.org/lkml/CAG48ez1jzNvxB+bfOBnERFGp=oMM0vHWuLD6EULmne3R6xa53w@mail.gmail.com/ Cc: Christian Brauner Suggested-by: David Drysdale Suggested-by: Al Viro Suggested-by: Andy Lutomirski Suggested-by: Linus Torvalds Signed-off-by: Aleksa Sarai Signed-off-by: Al Viro --- include/linux/namei.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/namei.h b/include/linux/namei.h index edb4562b5c4e..d2a98083be77 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -2,6 +2,7 @@ #ifndef _LINUX_NAMEI_H #define _LINUX_NAMEI_H +#include #include #include #include @@ -43,6 +44,9 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; #define LOOKUP_NO_SYMLINKS 0x010000 /* No symlink crossing. */ #define LOOKUP_NO_MAGICLINKS 0x020000 /* No nd_jump_link() crossing. */ #define LOOKUP_NO_XDEV 0x040000 /* No mountpoint crossing. */ +#define LOOKUP_BENEATH 0x080000 /* No escaping from starting point. */ +/* LOOKUP_* flags which do scope-related checks based on the dirfd. */ +#define LOOKUP_IS_SCOPED LOOKUP_BENEATH extern int path_pts(struct path *path); -- cgit v1.2.3 From 8db52c7e7ee1bd861b6096fcafc0fe7d0f24a994 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Sat, 7 Dec 2019 01:13:34 +1100 Subject: namei: LOOKUP_IN_ROOT: chroot-like scoped resolution /* Background. */ Container runtimes or other administrative management processes will often interact with root filesystems while in the host mount namespace, because the cost of doing a chroot(2) on every operation is too prohibitive (especially in Go, which cannot safely use vfork). However, a malicious program can trick the management process into doing operations on files outside of the root filesystem through careful crafting of symlinks. Most programs that need this feature have attempted to make this process safe, by doing all of the path resolution in userspace (with symlinks being scoped to the root of the malicious root filesystem). Unfortunately, this method is prone to foot-guns and usually such implementations have subtle security bugs. Thus, what userspace needs is a way to resolve a path as though it were in a chroot(2) -- with all absolute symlinks being resolved relative to the dirfd root (and ".." components being stuck under the dirfd root). It is much simpler and more straight-forward to provide this functionality in-kernel (because it can be done far more cheaply and correctly). More classical applications that also have this problem (which have their own potentially buggy userspace path sanitisation code) include web servers, archive extraction tools, network file servers, and so on. /* Userspace API. */ LOOKUP_IN_ROOT will be exposed to userspace through openat2(2). /* Semantics. */ Unlike most other LOOKUP flags (most notably LOOKUP_FOLLOW), LOOKUP_IN_ROOT applies to all components of the path. With LOOKUP_IN_ROOT, any path component which attempts to cross the starting point of the pathname lookup (the dirfd passed to openat) will remain at the starting point. Thus, all absolute paths and symlinks will be scoped within the starting point. There is a slight change in behaviour regarding pathnames -- if the pathname is absolute then the dirfd is still used as the root of resolution of LOOKUP_IN_ROOT is specified (this is to avoid obvious foot-guns, at the cost of a minor API inconsistency). As with LOOKUP_BENEATH, Jann's security concern about ".."[1] applies to LOOKUP_IN_ROOT -- therefore ".." resolution is blocked. This restriction will be lifted in a future patch, but requires more work to ensure that permitting ".." is done safely. Magic-link jumps are also blocked, because they can beam the path lookup across the starting point. It would be possible to detect and block only the "bad" crossings with path_is_under() checks, but it's unclear whether it makes sense to permit magic-links at all. However, userspace is recommended to pass LOOKUP_NO_MAGICLINKS if they want to ensure that magic-link crossing is entirely disabled. /* Testing. */ LOOKUP_IN_ROOT is tested as part of the openat2(2) selftests. [1]: https://lore.kernel.org/lkml/CAG48ez1jzNvxB+bfOBnERFGp=oMM0vHWuLD6EULmne3R6xa53w@mail.gmail.com/ Cc: Christian Brauner Signed-off-by: Aleksa Sarai Signed-off-by: Al Viro --- include/linux/namei.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/namei.h b/include/linux/namei.h index d2a98083be77..4e77068f7a1a 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -45,8 +45,9 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; #define LOOKUP_NO_MAGICLINKS 0x020000 /* No nd_jump_link() crossing. */ #define LOOKUP_NO_XDEV 0x040000 /* No mountpoint crossing. */ #define LOOKUP_BENEATH 0x080000 /* No escaping from starting point. */ +#define LOOKUP_IN_ROOT 0x100000 /* Treat dirfd as fs root. */ /* LOOKUP_* flags which do scope-related checks based on the dirfd. */ -#define LOOKUP_IS_SCOPED LOOKUP_BENEATH +#define LOOKUP_IS_SCOPED (LOOKUP_BENEATH | LOOKUP_IN_ROOT) extern int path_pts(struct path *path); -- cgit v1.2.3 From fddb5d430ad9fa91b49b1d34d0202ffe2fa0e179 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Sat, 18 Jan 2020 23:07:59 +1100 Subject: open: introduce openat2(2) syscall /* Background. */ For a very long time, extending openat(2) with new features has been incredibly frustrating. This stems from the fact that openat(2) is possibly the most famous counter-example to the mantra "don't silently accept garbage from userspace" -- it doesn't check whether unknown flags are present[1]. This means that (generally) the addition of new flags to openat(2) has been fraught with backwards-compatibility issues (O_TMPFILE has to be defined as __O_TMPFILE|O_DIRECTORY|[O_RDWR or O_WRONLY] to ensure old kernels gave errors, since it's insecure to silently ignore the flag[2]). All new security-related flags therefore have a tough road to being added to openat(2). Userspace also has a hard time figuring out whether a particular flag is supported on a particular kernel. While it is now possible with contemporary kernels (thanks to [3]), older kernels will expose unknown flag bits through fcntl(F_GETFL). Giving a clear -EINVAL during openat(2) time matches modern syscall designs and is far more fool-proof. In addition, the newly-added path resolution restriction LOOKUP flags (which we would like to expose to user-space) don't feel related to the pre-existing O_* flag set -- they affect all components of path lookup. We'd therefore like to add a new flag argument. Adding a new syscall allows us to finally fix the flag-ignoring problem, and we can make it extensible enough so that we will hopefully never need an openat3(2). /* Syscall Prototype. */ /* * open_how is an extensible structure (similar in interface to * clone3(2) or sched_setattr(2)). The size parameter must be set to * sizeof(struct open_how), to allow for future extensions. All future * extensions will be appended to open_how, with their zero value * acting as a no-op default. */ struct open_how { /* ... */ }; int openat2(int dfd, const char *pathname, struct open_how *how, size_t size); /* Description. */ The initial version of 'struct open_how' contains the following fields: flags Used to specify openat(2)-style flags. However, any unknown flag bits or otherwise incorrect flag combinations (like O_PATH|O_RDWR) will result in -EINVAL. In addition, this field is 64-bits wide to allow for more O_ flags than currently permitted with openat(2). mode The file mode for O_CREAT or O_TMPFILE. Must be set to zero if flags does not contain O_CREAT or O_TMPFILE. resolve Restrict path resolution (in contrast to O_* flags they affect all path components). The current set of flags are as follows (at the moment, all of the RESOLVE_ flags are implemented as just passing the corresponding LOOKUP_ flag). RESOLVE_NO_XDEV => LOOKUP_NO_XDEV RESOLVE_NO_SYMLINKS => LOOKUP_NO_SYMLINKS RESOLVE_NO_MAGICLINKS => LOOKUP_NO_MAGICLINKS RESOLVE_BENEATH => LOOKUP_BENEATH RESOLVE_IN_ROOT => LOOKUP_IN_ROOT open_how does not contain an embedded size field, because it is of little benefit (userspace can figure out the kernel open_how size at runtime fairly easily without it). It also only contains u64s (even though ->mode arguably should be a u16) to avoid having padding fields which are never used in the future. Note that as a result of the new how->flags handling, O_PATH|O_TMPFILE is no longer permitted for openat(2). As far as I can tell, this has always been a bug and appears to not be used by userspace (and I've not seen any problems on my machines by disallowing it). If it turns out this breaks something, we can special-case it and only permit it for openat(2) but not openat2(2). After input from Florian Weimer, the new open_how and flag definitions are inside a separate header from uapi/linux/fcntl.h, to avoid problems that glibc has with importing that header. /* Testing. */ In a follow-up patch there are over 200 selftests which ensure that this syscall has the correct semantics and will correctly handle several attack scenarios. In addition, I've written a userspace library[4] which provides convenient wrappers around openat2(RESOLVE_IN_ROOT) (this is necessary because no other syscalls support RESOLVE_IN_ROOT, and thus lots of care must be taken when using RESOLVE_IN_ROOT'd file descriptors with other syscalls). During the development of this patch, I've run numerous verification tests using libpathrs (showing that the API is reasonably usable by userspace). /* Future Work. */ Additional RESOLVE_ flags have been suggested during the review period. These can be easily implemented separately (such as blocking auto-mount during resolution). Furthermore, there are some other proposed changes to the openat(2) interface (the most obvious example is magic-link hardening[5]) which would be a good opportunity to add a way for userspace to restrict how O_PATH file descriptors can be re-opened. Another possible avenue of future work would be some kind of CHECK_FIELDS[6] flag which causes the kernel to indicate to userspace which openat2(2) flags and fields are supported by the current kernel (to avoid userspace having to go through several guesses to figure it out). [1]: https://lwn.net/Articles/588444/ [2]: https://lore.kernel.org/lkml/CA+55aFyyxJL1LyXZeBsf2ypriraj5ut1XkNDsunRBqgVjZU_6Q@mail.gmail.com [3]: commit 629e014bb834 ("fs: completely ignore unknown open flags") [4]: https://sourceware.org/bugzilla/show_bug.cgi?id=17523 [5]: https://lore.kernel.org/lkml/20190930183316.10190-2-cyphar@cyphar.com/ [6]: https://youtu.be/ggD-eb3yPVs Suggested-by: Christian Brauner Signed-off-by: Aleksa Sarai Signed-off-by: Al Viro --- include/linux/fcntl.h | 16 +++++++++++++++- include/linux/syscalls.h | 3 +++ include/uapi/asm-generic/unistd.h | 5 ++++- include/uapi/linux/fcntl.h | 2 +- include/uapi/linux/openat2.h | 39 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 62 insertions(+), 3 deletions(-) create mode 100644 include/uapi/linux/openat2.h (limited to 'include') diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h index d019df946cb2..7bcdcf4f6ab2 100644 --- a/include/linux/fcntl.h +++ b/include/linux/fcntl.h @@ -2,15 +2,29 @@ #ifndef _LINUX_FCNTL_H #define _LINUX_FCNTL_H +#include #include -/* list of all valid flags for the open/openat flags argument: */ +/* List of all valid flags for the open/openat flags argument: */ #define VALID_OPEN_FLAGS \ (O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | \ O_APPEND | O_NDELAY | O_NONBLOCK | O_NDELAY | __O_SYNC | O_DSYNC | \ FASYNC | O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | \ O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE) +/* List of all valid flags for the how->upgrade_mask argument: */ +#define VALID_UPGRADE_FLAGS \ + (UPGRADE_NOWRITE | UPGRADE_NOREAD) + +/* List of all valid flags for the how->resolve argument: */ +#define VALID_RESOLVE_FLAGS \ + (RESOLVE_NO_XDEV | RESOLVE_NO_MAGICLINKS | RESOLVE_NO_SYMLINKS | \ + RESOLVE_BENEATH | RESOLVE_IN_ROOT) + +/* List of all open_how "versions". */ +#define OPEN_HOW_SIZE_VER0 24 /* sizeof first published struct */ +#define OPEN_HOW_SIZE_LATEST OPEN_HOW_SIZE_VER0 + #ifndef force_o_largefile #define force_o_largefile() (!IS_ENABLED(CONFIG_ARCH_32BIT_OFF_T)) #endif diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index d0391cc2dae9..cd9f27cbc567 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -69,6 +69,7 @@ struct rseq; union bpf_attr; struct io_uring_params; struct clone_args; +struct open_how; #include #include @@ -439,6 +440,8 @@ asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user, asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group); asmlinkage long sys_openat(int dfd, const char __user *filename, int flags, umode_t mode); +asmlinkage long sys_openat2(int dfd, const char __user *filename, + struct open_how *how, size_t size); asmlinkage long sys_close(unsigned int fd); asmlinkage long sys_vhangup(void); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 1fc8faa6e973..d4122c091472 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -851,8 +851,11 @@ __SYSCALL(__NR_pidfd_open, sys_pidfd_open) __SYSCALL(__NR_clone3, sys_clone3) #endif +#define __NR_openat2 437 +__SYSCALL(__NR_openat2, sys_openat2) + #undef __NR_syscalls -#define __NR_syscalls 436 +#define __NR_syscalls 438 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 1f97b33c840e..ca88b7bce553 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -3,6 +3,7 @@ #define _UAPI_LINUX_FCNTL_H #include +#include #define F_SETLEASE (F_LINUX_SPECIFIC_BASE + 0) #define F_GETLEASE (F_LINUX_SPECIFIC_BASE + 1) @@ -100,5 +101,4 @@ #define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */ - #endif /* _UAPI_LINUX_FCNTL_H */ diff --git a/include/uapi/linux/openat2.h b/include/uapi/linux/openat2.h new file mode 100644 index 000000000000..58b1eb711360 --- /dev/null +++ b/include/uapi/linux/openat2.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_OPENAT2_H +#define _UAPI_LINUX_OPENAT2_H + +#include + +/* + * Arguments for how openat2(2) should open the target path. If only @flags and + * @mode are non-zero, then openat2(2) operates very similarly to openat(2). + * + * However, unlike openat(2), unknown or invalid bits in @flags result in + * -EINVAL rather than being silently ignored. @mode must be zero unless one of + * {O_CREAT, O_TMPFILE} are set. + * + * @flags: O_* flags. + * @mode: O_CREAT/O_TMPFILE file mode. + * @resolve: RESOLVE_* flags. + */ +struct open_how { + __u64 flags; + __u64 mode; + __u64 resolve; +}; + +/* how->resolve flags for openat2(2). */ +#define RESOLVE_NO_XDEV 0x01 /* Block mount-point crossings + (includes bind-mounts). */ +#define RESOLVE_NO_MAGICLINKS 0x02 /* Block traversal through procfs-style + "magic-links". */ +#define RESOLVE_NO_SYMLINKS 0x04 /* Block traversal through all symlinks + (implies OEXT_NO_MAGICLINKS) */ +#define RESOLVE_BENEATH 0x08 /* Block "lexical" trickery like + "..", symlinks, and absolute + paths which escape the dirfd. */ +#define RESOLVE_IN_ROOT 0x10 /* Make all jumps to "/" and ".." + be scoped inside the dirfd + (similar to chroot(2)). */ + +#endif /* _UAPI_LINUX_OPENAT2_H */ -- cgit v1.2.3