From 6f73171e192366ff7c98af9fb50615ef9615f8a7 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 4 Mar 2021 12:48:22 +0200 Subject: fsnotify: allow fsnotify_{peek,remove}_first_event with empty queue Current code has an assumtion that fsnotify_notify_queue_is_empty() is called to verify that queue is not empty before trying to peek or remove an event from queue. Remove this assumption by moving the fsnotify_notify_queue_is_empty() into the functions, allow them to return NULL value and check return value by all callers. This is a prep patch for multi event queues. Link: https://lore.kernel.org/r/20210304104826.3993892-2-amir73il@gmail.com Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- include/linux/fsnotify_backend.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index e5409b83e731..7eb979bfc141 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -495,7 +495,13 @@ static inline void fsnotify_queue_overflow(struct fsnotify_group *group) fsnotify_add_event(group, group->overflow_event, NULL); } -/* true if the group notification queue is empty */ +static inline bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) +{ + assert_spin_locked(&group->notification_lock); + + return list_empty(&group->notification_list); +} + extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group); /* return, but do not dequeue the first event on the notification queue */ extern struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group); -- cgit v1.2.3 From 8988f11abb820bacfcc53d498370bfb30f792ec4 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 4 Mar 2021 12:48:23 +0200 Subject: fanotify: reduce event objectid to 29-bit hash objectid is only used by fanotify backend and it is just an optimization for event merge before comparing all fields in event. Move the objectid member from common struct fsnotify_event into struct fanotify_event and reduce it to 29-bit hash to cram it together with the 3-bit event type. Events of different types are never merged, so the combination of event type and hash form a 32-bit key for fast compare of events. This reduces the size of events by one pointer and paves the way for adding hashed queue support for fanotify. Link: https://lore.kernel.org/r/20210304104826.3993892-3-amir73il@gmail.com Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- include/linux/fsnotify_backend.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 7eb979bfc141..fc98f9f88d12 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -167,7 +167,6 @@ struct fsnotify_ops { */ struct fsnotify_event { struct list_head list; - unsigned long objectid; /* identifier for queue merges */ }; /* @@ -582,11 +581,9 @@ extern void fsnotify_put_mark(struct fsnotify_mark *mark); extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info); extern bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info); -static inline void fsnotify_init_event(struct fsnotify_event *event, - unsigned long objectid) +static inline void fsnotify_init_event(struct fsnotify_event *event) { INIT_LIST_HEAD(&event->list); - event->objectid = objectid; } #else -- cgit v1.2.3 From 94e00d28a680dff18805ca472b191364347d2234 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 4 Mar 2021 12:48:25 +0200 Subject: fsnotify: use hash table for faster events merge In order to improve event merge performance, hash events in a 128 size hash table by the event merge key. The fanotify_event size grows by two pointers, but we just reduced its size by removing the objectid member, so overall its size is increased by one pointer. Permission events and overflow event are not merged so they are also not hashed. Link: https://lore.kernel.org/r/20210304104826.3993892-5-amir73il@gmail.com Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- include/linux/fsnotify_backend.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index fc98f9f88d12..63fb766f0f3e 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -233,6 +233,8 @@ struct fsnotify_group { #endif #ifdef CONFIG_FANOTIFY struct fanotify_group_private_data { + /* Hash table of events for merge */ + struct hlist_head *merge_hash; /* allows a group to block waiting for a userspace response */ struct list_head access_list; wait_queue_head_t access_waitq; @@ -486,12 +488,14 @@ extern void fsnotify_destroy_event(struct fsnotify_group *group, /* attach the event to the group notification queue */ extern int fsnotify_add_event(struct fsnotify_group *group, struct fsnotify_event *event, - int (*merge)(struct list_head *, - struct fsnotify_event *)); + int (*merge)(struct fsnotify_group *, + struct fsnotify_event *), + void (*insert)(struct fsnotify_group *, + struct fsnotify_event *)); /* Queue overflow event to a notification group */ static inline void fsnotify_queue_overflow(struct fsnotify_group *group) { - fsnotify_add_event(group, group->overflow_event, NULL); + fsnotify_add_event(group, group->overflow_event, NULL, NULL); } static inline bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) -- cgit v1.2.3 From 5b8fea65d197f408bb00b251c70d842826d6b70b Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 4 Mar 2021 13:29:20 +0200 Subject: fanotify: configurable limits via sysfs fanotify has some hardcoded limits. The only APIs to escape those limits are FAN_UNLIMITED_QUEUE and FAN_UNLIMITED_MARKS. Allow finer grained tuning of the system limits via sysfs tunables under /proc/sys/fs/fanotify, similar to tunables under /proc/sys/fs/inotify, with some minor differences. - max_queued_events - global system tunable for group queue size limit. Like the inotify tunable with the same name, it defaults to 16384 and applies on initialization of a new group. - max_user_marks - user ns tunable for marks limit per user. Like the inotify tunable named max_user_watches, on a machine with sufficient RAM and it defaults to 1048576 in init userns and can be further limited per containing user ns. - max_user_groups - user ns tunable for number of groups per user. Like the inotify tunable named max_user_instances, it defaults to 128 in init userns and can be further limited per containing user ns. The slightly different tunable names used for fanotify are derived from the "group" and "mark" terminology used in the fanotify man pages and throughout the code. Considering the fact that the default value for max_user_instances was increased in kernel v5.10 from 8192 to 1048576, leaving the legacy fanotify limit of 8192 marks per group in addition to the max_user_marks limit makes little sense, so the per group marks limit has been removed. Note that when a group is initialized with FAN_UNLIMITED_MARKS, its own marks are not accounted in the per user marks account, so in effect the limit of max_user_marks is only for the collection of groups that are not initialized with FAN_UNLIMITED_MARKS. Link: https://lore.kernel.org/r/20210304112921.3996419-2-amir73il@gmail.com Suggested-by: Jan Kara Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- include/linux/fanotify.h | 3 +++ include/linux/fsnotify_backend.h | 6 +----- include/linux/sched/user.h | 3 --- include/linux/user_namespace.h | 4 ++++ 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index 3e9c56ee651f..031a97d8369a 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -2,8 +2,11 @@ #ifndef _LINUX_FANOTIFY_H #define _LINUX_FANOTIFY_H +#include #include +extern struct ctl_table fanotify_table[]; /* for sysctl */ + #define FAN_GROUP_FLAG(group, flag) \ ((group)->fanotify_data.flags & (flag)) diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 63fb766f0f3e..1ce66748a2d2 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -206,9 +206,6 @@ struct fsnotify_group { /* stores all fastpath marks assoc with this group so they can be cleaned on unregister */ struct mutex mark_mutex; /* protect marks_list */ - atomic_t num_marks; /* 1 for each mark and 1 for not being - * past the point of no return when freeing - * a group */ atomic_t user_waits; /* Number of tasks waiting for user * response */ struct list_head marks_list; /* all inode marks for this group */ @@ -240,8 +237,7 @@ struct fsnotify_group { wait_queue_head_t access_waitq; int flags; /* flags from fanotify_init() */ int f_flags; /* event_f_flags from fanotify_init() */ - unsigned int max_marks; - struct user_struct *user; + struct ucounts *ucounts; } fanotify_data; #endif /* CONFIG_FANOTIFY */ }; diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index a8ec3b6093fc..3632c5d6ec55 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -14,9 +14,6 @@ struct user_struct { refcount_t __count; /* reference count */ atomic_t processes; /* How many processes does this user have? */ atomic_t sigpending; /* How many pending signals does this user have? */ -#ifdef CONFIG_FANOTIFY - atomic_t fanotify_listeners; -#endif #ifdef CONFIG_EPOLL atomic_long_t epoll_watches; /* The number of file descriptors currently watched */ #endif diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 64cf8ebdc4ec..f0d961a15fba 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -49,6 +49,10 @@ enum ucount_type { #ifdef CONFIG_INOTIFY_USER UCOUNT_INOTIFY_INSTANCES, UCOUNT_INOTIFY_WATCHES, +#endif +#ifdef CONFIG_FANOTIFY + UCOUNT_FANOTIFY_GROUPS, + UCOUNT_FANOTIFY_MARKS, #endif UCOUNT_COUNTS, }; -- cgit v1.2.3 From 7cea2a3c505e87a9d6afc78be4a7f7be636a73a7 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 4 Mar 2021 13:29:21 +0200 Subject: fanotify: support limited functionality for unprivileged users Add limited support for unprivileged fanotify groups. An unprivileged users is not allowed to get an open file descriptor in the event nor the process pid of another process. An unprivileged user cannot request permission events, cannot set mount/filesystem marks and cannot request unlimited queue/marks. This enables the limited functionality similar to inotify when watching a set of files and directories for OPEN/ACCESS/MODIFY/CLOSE events, without requiring SYS_CAP_ADMIN privileges. The FAN_REPORT_DFID_NAME init flag, provide a method for an unprivileged listener watching a set of directories (with FAN_EVENT_ON_CHILD) to monitor all changes inside those directories. This typically requires that the listener keeps a map of watched directory fid to dirfd (O_PATH), where fid is obtained with name_to_handle_at() before starting to watch for changes. When getting an event, the reported fid of the parent should be resolved to dirfd and fstatsat(2) with dirfd and name should be used to query the state of the filesystem entry. Link: https://lore.kernel.org/r/20210304112921.3996419-3-amir73il@gmail.com Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- include/linux/fanotify.h | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index 031a97d8369a..bad41bcb25df 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -18,15 +18,38 @@ extern struct ctl_table fanotify_table[]; /* for sysctl */ * these constant, the programs may break if re-compiled with new uapi headers * and then run on an old kernel. */ -#define FANOTIFY_CLASS_BITS (FAN_CLASS_NOTIF | FAN_CLASS_CONTENT | \ + +/* Group classes where permission events are allowed */ +#define FANOTIFY_PERM_CLASSES (FAN_CLASS_CONTENT | \ FAN_CLASS_PRE_CONTENT) +#define FANOTIFY_CLASS_BITS (FAN_CLASS_NOTIF | FANOTIFY_PERM_CLASSES) + #define FANOTIFY_FID_BITS (FAN_REPORT_FID | FAN_REPORT_DFID_NAME) -#define FANOTIFY_INIT_FLAGS (FANOTIFY_CLASS_BITS | FANOTIFY_FID_BITS | \ - FAN_REPORT_TID | \ - FAN_CLOEXEC | FAN_NONBLOCK | \ - FAN_UNLIMITED_QUEUE | FAN_UNLIMITED_MARKS) +/* + * fanotify_init() flags that require CAP_SYS_ADMIN. + * We do not allow unprivileged groups to request permission events. + * We do not allow unprivileged groups to get other process pid in events. + * We do not allow unprivileged groups to use unlimited resources. + */ +#define FANOTIFY_ADMIN_INIT_FLAGS (FANOTIFY_PERM_CLASSES | \ + FAN_REPORT_TID | \ + FAN_UNLIMITED_QUEUE | \ + FAN_UNLIMITED_MARKS) + +/* + * fanotify_init() flags that are allowed for user without CAP_SYS_ADMIN. + * FAN_CLASS_NOTIF is the only class we allow for unprivileged group. + * We do not allow unprivileged groups to get file descriptors in events, + * so one of the flags for reporting file handles is required. + */ +#define FANOTIFY_USER_INIT_FLAGS (FAN_CLASS_NOTIF | \ + FANOTIFY_FID_BITS | \ + FAN_CLOEXEC | FAN_NONBLOCK) + +#define FANOTIFY_INIT_FLAGS (FANOTIFY_ADMIN_INIT_FLAGS | \ + FANOTIFY_USER_INIT_FLAGS) #define FANOTIFY_MARK_TYPE_BITS (FAN_MARK_INODE | FAN_MARK_MOUNT | \ FAN_MARK_FILESYSTEM) -- cgit v1.2.3 From 9591c3a34f7722bd77f42c98d76fd5a5bad465f0 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 22 Mar 2021 19:39:43 +0200 Subject: fs: introduce a wrapper uuid_to_fsid() Some filesystem's use a digest of their uuid for f_fsid. Create a simple wrapper for this open coded folding. Filesystems that have a non null uuid but use the block device number for f_fsid may also consider using this helper. [JK: Added missing asm/byteorder.h include] Link: https://lore.kernel.org/r/20210322173944.449469-2-amir73il@gmail.com Acked-by: Damien Le Moal Reviewed-by: Christian Brauner Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- include/linux/statfs.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/statfs.h b/include/linux/statfs.h index 20f695b90aab..02c862686ea3 100644 --- a/include/linux/statfs.h +++ b/include/linux/statfs.h @@ -4,6 +4,7 @@ #include #include +#include struct kstatfs { long f_type; @@ -50,4 +51,11 @@ static inline __kernel_fsid_t u64_to_fsid(u64 v) return (__kernel_fsid_t){.val = {(u32)v, (u32)(v>>32)}}; } +/* Fold 16 bytes uuid to 64 bit fsid */ +static inline __kernel_fsid_t uuid_to_fsid(__u8 *uuid) +{ + return u64_to_fsid(le64_to_cpup((void *)uuid) ^ + le64_to_cpup((void *)(uuid + sizeof(u64)))); +} + #endif -- cgit v1.2.3