diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-02-23 20:33:51 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-02-23 20:33:51 -0800 |
commit | f1ef09fde17f9b77ca1435a5b53a28b203afb81c (patch) | |
tree | 0efcd2c5b5da451a7ca780c8aa5e26d7ec712b85 /fs/notify | |
parent | ef96152e6a36e0510387cb174178b7982c1ae879 (diff) | |
parent | ace0c791e6c3cf5ef37cad2df69f0d90ccc40ffb (diff) |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace
Pull namespace updates from Eric Biederman:
"There is a lot here. A lot of these changes result in subtle user
visible differences in kernel behavior. I don't expect anything will
care but I will revert/fix things immediately if any regressions show
up.
From Seth Forshee there is a continuation of the work to make the vfs
ready for unpriviled mounts. We had thought the previous changes
prevented the creation of files outside of s_user_ns of a filesystem,
but it turns we missed the O_CREAT path. Ooops.
Pavel Tikhomirov and Oleg Nesterov worked together to fix a long
standing bug in the implemenation of PR_SET_CHILD_SUBREAPER where only
children that are forked after the prctl are considered and not
children forked before the prctl. The only known user of this prctl
systemd forks all children after the prctl. So no userspace
regressions will occur. Holding earlier forked children to the same
rules as later forked children creates a semantic that is sane enough
to allow checkpoing of processes that use this feature.
There is a long delayed change by Nikolay Borisov to limit inotify
instances inside a user namespace.
Michael Kerrisk extends the API for files used to maniuplate
namespaces with two new trivial ioctls to allow discovery of the
hierachy and properties of namespaces.
Konstantin Khlebnikov with the help of Al Viro adds code that when a
network namespace exits purges it's sysctl entries from the dcache. As
in some circumstances this could use a lot of memory.
Vivek Goyal fixed a bug with stacked filesystems where the permissions
on the wrong inode were being checked.
I continue previous work on ptracing across exec. Allowing a file to
be setuid across exec while being ptraced if the tracer has enough
credentials in the user namespace, and if the process has CAP_SETUID
in it's own namespace. Proc files for setuid or otherwise undumpable
executables are now owned by the root in the user namespace of their
mm. Allowing debugging of setuid applications in containers to work
better.
A bug I introduced with permission checking and automount is now
fixed. The big change is to mark the mounts that the kernel initiates
as a result of an automount. This allows the permission checks in sget
to be safely suppressed for this kind of mount. As the permission
check happened when the original filesystem was mounted.
Finally a special case in the mount namespace is removed preventing
unbounded chains in the mount hash table, and making the semantics
simpler which benefits CRIU.
The vfs fix along with related work in ima and evm I believe makes us
ready to finish developing and merge fully unprivileged mounts of the
fuse filesystem. The cleanups of the mount namespace makes discussing
how to fix the worst case complexity of umount. The stacked filesystem
fixes pave the way for adding multiple mappings for the filesystem
uids so that efficient and safer containers can be implemented"
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace:
proc/sysctl: Don't grab i_lock under sysctl_lock.
vfs: Use upper filesystem inode in bprm_fill_uid()
proc/sysctl: prune stale dentries during unregistering
mnt: Tuck mounts under others instead of creating shadow/side mounts.
prctl: propagate has_child_subreaper flag to every descendant
introduce the walk_process_tree() helper
nsfs: Add an ioctl() to return owner UID of a userns
fs: Better permission checking for submounts
exit: fix the setns() && PR_SET_CHILD_SUBREAPER interaction
vfs: open() with O_CREAT should not create inodes with unknown ids
nsfs: Add an ioctl() to return the namespace type
proc: Better ownership of files for non-dumpable tasks in user namespaces
exec: Remove LSM_UNSAFE_PTRACE_CAP
exec: Test the ptracer's saved cred to see if the tracee can gain caps
exec: Don't reset euid and egid when the tracee has CAP_SETUID
inotify: Convert to using per-namespace limits
Diffstat (limited to 'fs/notify')
-rw-r--r-- | fs/notify/inotify/inotify.h | 17 | ||||
-rw-r--r-- | fs/notify/inotify/inotify_fsnotify.c | 6 | ||||
-rw-r--r-- | fs/notify/inotify/inotify_user.c | 34 |
3 files changed, 36 insertions, 21 deletions
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index a6f5907a3fee..7c461fd49c4c 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -30,3 +30,20 @@ extern int inotify_handle_event(struct fsnotify_group *group, const unsigned char *file_name, u32 cookie); extern const struct fsnotify_ops inotify_fsnotify_ops; + +#ifdef CONFIG_INOTIFY_USER +static inline void dec_inotify_instances(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_INOTIFY_INSTANCES); +} + +static inline struct ucounts *inc_inotify_watches(struct ucounts *ucounts) +{ + return inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_INOTIFY_WATCHES); +} + +static inline void dec_inotify_watches(struct ucounts *ucounts) +{ + dec_ucount(ucounts, UCOUNT_INOTIFY_WATCHES); +} +#endif diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 19e7ec109a75..f36c29398de3 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -165,10 +165,8 @@ static void inotify_free_group_priv(struct fsnotify_group *group) /* ideally the idr is empty and we won't hit the BUG in the callback */ idr_for_each(&group->inotify_data.idr, idr_callback, group); idr_destroy(&group->inotify_data.idr); - if (group->inotify_data.user) { - atomic_dec(&group->inotify_data.user->inotify_devs); - free_uid(group->inotify_data.user); - } + if (group->inotify_data.ucounts) + dec_inotify_instances(group->inotify_data.ucounts); } static void inotify_free_event(struct fsnotify_event *fsn_event) diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 69d1ea3d292a..1cf41c623be1 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -44,10 +44,8 @@ #include <asm/ioctls.h> -/* these are configurable via /proc/sys/fs/inotify/ */ -static int inotify_max_user_instances __read_mostly; +/* configurable via /proc/sys/fs/inotify/ */ static int inotify_max_queued_events __read_mostly; -static int inotify_max_user_watches __read_mostly; static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; @@ -60,7 +58,7 @@ static int zero; struct ctl_table inotify_table[] = { { .procname = "max_user_instances", - .data = &inotify_max_user_instances, + .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES], .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -68,7 +66,7 @@ struct ctl_table inotify_table[] = { }, { .procname = "max_user_watches", - .data = &inotify_max_user_watches, + .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES], .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, @@ -500,7 +498,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, /* remove this mark from the idr */ inotify_remove_from_idr(group, i_mark); - atomic_dec(&group->inotify_data.user->inotify_watches); + dec_inotify_watches(group->inotify_data.ucounts); } /* ding dong the mark is dead */ @@ -584,14 +582,17 @@ static int inotify_new_watch(struct fsnotify_group *group, tmp_i_mark->fsn_mark.mask = mask; tmp_i_mark->wd = -1; - ret = -ENOSPC; - if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) - goto out_err; - ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark); if (ret) goto out_err; + /* increment the number of watches the user has */ + if (!inc_inotify_watches(group->inotify_data.ucounts)) { + inotify_remove_from_idr(group, tmp_i_mark); + ret = -ENOSPC; + goto out_err; + } + /* we are on the idr, now get on the inode */ ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode, NULL, 0); @@ -601,8 +602,6 @@ static int inotify_new_watch(struct fsnotify_group *group, goto out_err; } - /* increment the number of watches the user has */ - atomic_inc(&group->inotify_data.user->inotify_watches); /* return the watch descriptor for this new mark */ ret = tmp_i_mark->wd; @@ -653,10 +652,11 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events) spin_lock_init(&group->inotify_data.idr_lock); idr_init(&group->inotify_data.idr); - group->inotify_data.user = get_current_user(); + group->inotify_data.ucounts = inc_ucount(current_user_ns(), + current_euid(), + UCOUNT_INOTIFY_INSTANCES); - if (atomic_inc_return(&group->inotify_data.user->inotify_devs) > - inotify_max_user_instances) { + if (!group->inotify_data.ucounts) { fsnotify_destroy_group(group); return ERR_PTR(-EMFILE); } @@ -819,8 +819,8 @@ static int __init inotify_user_setup(void) inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); inotify_max_queued_events = 16384; - inotify_max_user_instances = 128; - inotify_max_user_watches = 8192; + init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128; + init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = 8192; return 0; } |