From 9698d5a4836549d394e6efd858b5200878c9f255 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 29 Nov 2024 14:02:23 +0100 Subject: pidfs: rework inode number allocation Recently we received a patchset that aims to enable file handle encoding and decoding via name_to_handle_at(2) and open_by_handle_at(2). A crucical step in the patch series is how to go from inode number to struct pid without leaking information into unprivileged contexts. The issue is that in order to find a struct pid the pid number in the initial pid namespace must be encoded into the file handle via name_to_handle_at(2). This can be used by containers using a separate pid namespace to learn what the pid number of a given process in the initial pid namespace is. While this is a weak information leak it could be used in various exploits and in general is an ugly wart in the design. To solve this problem a new way is needed to lookup a struct pid based on the inode number allocated for that struct pid. The other part is to remove the custom inode number allocation on 32bit systems that is also an ugly wart that should go away. So, a new scheme is used that I was discusssing with Tejun some time back. A cyclic ida is used for the lower 32 bits and a the high 32 bits are used for the generation number. This gives a 64 bit inode number that is unique on both 32 bit and 64 bit. The lower 32 bit number is recycled slowly and can be used to lookup struct pids. Link: https://lore.kernel.org/r/20241129-work-pidfs-v2-1-61043d66fbce@kernel.org Reviewed-by: Jeff Layton Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/pidfs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index 75bdf9807802..2958652bb108 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -4,5 +4,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); void __init pidfs_init(void); +int pidfs_add_pid(struct pid *pid); +void pidfs_remove_pid(struct pid *pid); #endif /* _LINUX_PID_FS_H */ -- cgit v1.2.3 From d2ab36bb115b720c9c738184d4007e1ca01c53da Mon Sep 17 00:00:00 2001 From: Erin Shepherd Date: Fri, 29 Nov 2024 14:38:00 +0100 Subject: pseudofs: add support for export_ops Pseudo-filesystems might reasonably wish to implement the export ops (particularly for name_to_handle_at/open_by_handle_at); plumb this through pseudo_fs_context Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Erin Shepherd Link: https://lore.kernel.org/r/20241113-pidfs_fh-v2-1-9a4d28155a37@e43.eu Link: https://lore.kernel.org/r/20241129-work-pidfs-file_handle-v1-1-87d803a42495@kernel.org Signed-off-by: Christian Brauner --- include/linux/pseudo_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pseudo_fs.h b/include/linux/pseudo_fs.h index 730f77381d55..2503f7625d65 100644 --- a/include/linux/pseudo_fs.h +++ b/include/linux/pseudo_fs.h @@ -5,6 +5,7 @@ struct pseudo_fs_context { const struct super_operations *ops; + const struct export_operations *eops; const struct xattr_handler * const *xattr; const struct dentry_operations *dops; unsigned long magic; -- cgit v1.2.3 From 50166d57ea8c5042ecba0ee22532617d72ed085a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 29 Nov 2024 14:38:02 +0100 Subject: exportfs: add open method This allows filesystems such as pidfs to provide their custom open. Link: https://lore.kernel.org/r/20241129-work-pidfs-file_handle-v1-3-87d803a42495@kernel.org Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/exportfs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 4cc8801e50e3..c69b79b64466 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -10,6 +10,7 @@ struct inode; struct iomap; struct super_block; struct vfsmount; +struct path; /* limit the handle size to NFSv4 handle size now */ #define MAX_HANDLE_SZ 128 @@ -225,6 +226,9 @@ struct fid { * is also a directory. In the event that it cannot be found, or storage * space cannot be allocated, a %ERR_PTR should be returned. * + * open: + * Allow filesystems to specify a custom open function. + * * commit_metadata: * @commit_metadata should commit metadata changes to stable storage. * @@ -251,6 +255,7 @@ struct export_operations { bool write, u32 *device_generation); int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, int nr_iomaps, struct iattr *iattr); + struct file * (*open)(struct path *path, unsigned int oflags); #define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */ #define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */ #define EXPORT_OP_CLOSE_BEFORE_UNLINK (0x4) /* close files before unlink */ -- cgit v1.2.3 From c220e216d6bcd52cc7333e38edf43dc66ba0dd13 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 29 Nov 2024 14:38:04 +0100 Subject: exportfs: add permission method This allows filesystems such as pidfs to provide their custom permission checks. Link: https://lore.kernel.org/r/20241129-work-pidfs-file_handle-v1-5-87d803a42495@kernel.org Reviewed-by: Amir Goldstein Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/exportfs.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index c69b79b64466..a087606ace19 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -3,6 +3,7 @@ #define LINUX_EXPORTFS_H 1 #include +#include struct dentry; struct iattr; @@ -10,7 +11,6 @@ struct inode; struct iomap; struct super_block; struct vfsmount; -struct path; /* limit the handle size to NFSv4 handle size now */ #define MAX_HANDLE_SZ 128 @@ -157,6 +157,17 @@ struct fid { }; }; +enum handle_to_path_flags { + HANDLE_CHECK_PERMS = (1 << 0), + HANDLE_CHECK_SUBTREE = (1 << 1), +}; + +struct handle_to_path_ctx { + struct path root; + enum handle_to_path_flags flags; + unsigned int fh_flags; +}; + #define EXPORT_FH_CONNECTABLE 0x1 /* Encode file handle with parent */ #define EXPORT_FH_FID 0x2 /* File handle may be non-decodeable */ #define EXPORT_FH_DIR_ONLY 0x4 /* Only decode file handle for a directory */ @@ -226,6 +237,9 @@ struct fid { * is also a directory. In the event that it cannot be found, or storage * space cannot be allocated, a %ERR_PTR should be returned. * + * permission: + * Allow filesystems to specify a custom permission function. + * * open: * Allow filesystems to specify a custom open function. * @@ -255,6 +269,7 @@ struct export_operations { bool write, u32 *device_generation); int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, int nr_iomaps, struct iattr *iattr); + int (*permission)(struct handle_to_path_ctx *ctx, unsigned int oflags); struct file * (*open)(struct path *path, unsigned int oflags); #define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */ #define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */ -- cgit v1.2.3 From 16ecd47cb0cd895c7c2f5dd5db50f6c005c51639 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 14 Dec 2024 22:01:28 +0100 Subject: pidfs: lookup pid through rbtree The new pid inode number allocation scheme is neat but I overlooked a possible, even though unlikely, attack that can be used to trigger an overflow on both 32bit and 64bit. An unique 64 bit identifier was constructed for each struct pid by two combining a 32 bit idr with a 32 bit generation number. A 32bit number was allocated using the idr_alloc_cyclic() infrastructure. When the idr wrapped around a 32 bit wraparound counter was incremented. The 32 bit wraparound counter served as the upper 32 bits and the allocated idr number as the lower 32 bits. Since the idr can only allocate up to INT_MAX entries everytime a wraparound happens INT_MAX - 1 entries are lost (Ignoring that numbering always starts at 2 to avoid theoretical collisions with the root inode number.). If userspace fully populates the idr such that and puts itself into control of two entries such that one entry is somewhere in the middle and the other entry is the INT_MAX entry then it is possible to overflow the wraparound counter. That is probably difficult to pull off but the mere possibility is annoying. The problem could be contained to 32 bit by switching to a data structure such as the maple tree that allows allocating 64 bit numbers on 64 bit machines. That would leave 32 bit in a lurch but that probably doesn't matter that much. The other problem is that removing entries form the maple tree is somewhat non-trivial because the removal code can be called under the irq write lock of tasklist_lock and irq{save,restore} code. Instead, allocate unique identifiers for struct pid by simply incrementing a 64 bit counter and insert each struct pid into the rbtree so it can be looked up to decode file handles avoiding to leak actual pids across pid namespaces in file handles. On both 64 bit and 32 bit the same 64 bit identifier is used to lookup struct pid in the rbtree. On 64 bit the unique identifier for struct pid simply becomes the inode number. Comparing two pidfds continues to be as simple as comparing inode numbers. On 32 bit the 64 bit number assigned to struct pid is split into two 32 bit numbers. The lower 32 bits are used as the inode number and the upper 32 bits are used as the inode generation number. Whenever a wraparound happens on 32 bit the 64 bit number will be incremented by 2 so inode numbering starts at 2 again. When a wraparound happens on 32 bit multiple pidfds with the same inode number are likely to exist. This isn't a problem since before pidfs pidfds used the anonymous inode meaning all pidfds had the same inode number. On 32 bit sserspace can thus reconstruct the 64 bit identifier by retrieving both the inode number and the inode generation number to compare, or use file handles. This gives the same guarantees on both 32 bit and 64 bit. Link: https://lore.kernel.org/r/20241214-gekoppelt-erdarbeiten-a1f9a982a5a6@brauner Signed-off-by: Christian Brauner --- include/linux/pid.h | 2 ++ include/linux/pidfs.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pid.h b/include/linux/pid.h index a3aad9b4074c..fe575fcdb4af 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -59,6 +59,7 @@ struct pid spinlock_t lock; struct dentry *stashed; u64 ino; + struct rb_node pidfs_node; /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; struct hlist_head inodes; @@ -68,6 +69,7 @@ struct pid struct upid numbers[]; }; +extern seqcount_spinlock_t pidmap_lock_seq; extern struct pid init_struct_pid; struct file; diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index 2958652bb108..df574d6708d4 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -4,7 +4,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); void __init pidfs_init(void); -int pidfs_add_pid(struct pid *pid); +void pidfs_add_pid(struct pid *pid); void pidfs_remove_pid(struct pid *pid); #endif /* _LINUX_PID_FS_H */ -- cgit v1.2.3 From ef4144ac2dec35d47de666f35cd873eb1be4172e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 19 Dec 2024 18:01:32 +0100 Subject: pidfs: allow bind-mounts Allow bind-mounting pidfds. Similar to nsfs let's allow bind-mounts for pidfds. This allows pidfds to be safely recovered and checked for process recycling. Link: https://lore.kernel.org/r/20241219-work-pidfs-mount-v1-1-dbc56198b839@kernel.org Signed-off-by: Christian Brauner --- include/linux/pidfs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h index df574d6708d4..7c830d0dec9a 100644 --- a/include/linux/pidfs.h +++ b/include/linux/pidfs.h @@ -6,5 +6,6 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); void __init pidfs_init(void); void pidfs_add_pid(struct pid *pid); void pidfs_remove_pid(struct pid *pid); +extern const struct dentry_operations pidfs_dentry_operations; #endif /* _LINUX_PID_FS_H */ -- cgit v1.2.3