From 0afa5ca82212247456f9de1468b595a111fee633 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 19 Feb 2020 17:17:34 -0600 Subject: proc: Rename in proc_inode rename sysctl_inodes sibling_inodes I about to need and use the same functionality for pid based inodes and there is no point in adding a second field when this field is already here and serving the same purporse. Just give the field a generic name so it is clear that it is no longer sysctl specific. Also for good measure initialize sibling_inodes when proc_inode is initialized. Signed-off-by: Eric W. Biederman --- fs/proc/inode.c | 1 + fs/proc/internal.h | 2 +- fs/proc/proc_sysctl.c | 8 ++++---- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 6da18316d209..bdae442d5262 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -68,6 +68,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb) ei->pde = NULL; ei->sysctl = NULL; ei->sysctl_entry = NULL; + INIT_HLIST_NODE(&ei->sibling_inodes); ei->ns_ops = NULL; return &ei->vfs_inode; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 41587276798e..366cd3aa690b 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -91,7 +91,7 @@ struct proc_inode { struct proc_dir_entry *pde; struct ctl_table_header *sysctl; struct ctl_table *sysctl_entry; - struct hlist_node sysctl_inodes; + struct hlist_node sibling_inodes; const struct proc_ns_operations *ns_ops; struct inode vfs_inode; } __randomize_layout; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index c75bb4632ed1..42fbb7f3c587 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -279,9 +279,9 @@ static void proc_sys_prune_dcache(struct ctl_table_header *head) node = hlist_first_rcu(&head->inodes); if (!node) break; - ei = hlist_entry(node, struct proc_inode, sysctl_inodes); + ei = hlist_entry(node, struct proc_inode, sibling_inodes); spin_lock(&sysctl_lock); - hlist_del_init_rcu(&ei->sysctl_inodes); + hlist_del_init_rcu(&ei->sibling_inodes); spin_unlock(&sysctl_lock); inode = &ei->vfs_inode; @@ -483,7 +483,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, } ei->sysctl = head; ei->sysctl_entry = table; - hlist_add_head_rcu(&ei->sysctl_inodes, &head->inodes); + hlist_add_head_rcu(&ei->sibling_inodes, &head->inodes); head->count++; spin_unlock(&sysctl_lock); @@ -514,7 +514,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head) { spin_lock(&sysctl_lock); - hlist_del_init_rcu(&PROC_I(inode)->sysctl_inodes); + hlist_del_init_rcu(&PROC_I(inode)->sibling_inodes); if (!--head->count) kfree_rcu(head, rcu); spin_unlock(&sysctl_lock); -- cgit v1.2.3 From 26dbc60f385ff9cff475ea2a3bad02e80fd6fa43 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 20 Feb 2020 08:34:44 -0600 Subject: proc: Generalize proc_sys_prune_dcache into proc_prune_siblings_dcache This prepares the way for allowing the pid part of proc to use this dcache pruning code as well. Signed-off-by: Eric W. Biederman --- fs/proc/inode.c | 38 ++++++++++++++++++++++++++++++++++++++ fs/proc/internal.h | 1 + fs/proc/proc_sysctl.c | 35 +---------------------------------- 3 files changed, 40 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index bdae442d5262..74ce4a8d05eb 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -103,6 +103,44 @@ void __init proc_init_kmemcache(void) BUILD_BUG_ON(sizeof(struct proc_dir_entry) >= SIZEOF_PDE); } +void proc_prune_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock) +{ + struct inode *inode; + struct proc_inode *ei; + struct hlist_node *node; + struct super_block *sb; + + rcu_read_lock(); + for (;;) { + node = hlist_first_rcu(inodes); + if (!node) + break; + ei = hlist_entry(node, struct proc_inode, sibling_inodes); + spin_lock(lock); + hlist_del_init_rcu(&ei->sibling_inodes); + spin_unlock(lock); + + inode = &ei->vfs_inode; + sb = inode->i_sb; + if (!atomic_inc_not_zero(&sb->s_active)) + continue; + inode = igrab(inode); + rcu_read_unlock(); + if (unlikely(!inode)) { + deactivate_super(sb); + rcu_read_lock(); + continue; + } + + d_prune_aliases(inode); + iput(inode); + deactivate_super(sb); + + rcu_read_lock(); + } + rcu_read_unlock(); +} + static int proc_show_options(struct seq_file *seq, struct dentry *root) { struct super_block *sb = root->d_sb; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 366cd3aa690b..ba9a991824a5 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -210,6 +210,7 @@ extern const struct inode_operations proc_pid_link_inode_operations; extern const struct super_operations proc_sops; void proc_init_kmemcache(void); +void proc_prune_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock); void set_proc_pid_nlink(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); extern void proc_entry_rundown(struct proc_dir_entry *); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 42fbb7f3c587..5da9d7f7ae34 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -269,40 +269,7 @@ static void unuse_table(struct ctl_table_header *p) static void proc_sys_prune_dcache(struct ctl_table_header *head) { - struct inode *inode; - struct proc_inode *ei; - struct hlist_node *node; - struct super_block *sb; - - rcu_read_lock(); - for (;;) { - node = hlist_first_rcu(&head->inodes); - if (!node) - break; - ei = hlist_entry(node, struct proc_inode, sibling_inodes); - spin_lock(&sysctl_lock); - hlist_del_init_rcu(&ei->sibling_inodes); - spin_unlock(&sysctl_lock); - - inode = &ei->vfs_inode; - sb = inode->i_sb; - if (!atomic_inc_not_zero(&sb->s_active)) - continue; - inode = igrab(inode); - rcu_read_unlock(); - if (unlikely(!inode)) { - deactivate_super(sb); - rcu_read_lock(); - continue; - } - - d_prune_aliases(inode); - iput(inode); - deactivate_super(sb); - - rcu_read_lock(); - } - rcu_read_unlock(); + proc_prune_siblings_dcache(&head->inodes, &sysctl_lock); } /* called under sysctl_lock, will reacquire if has to wait */ -- cgit v1.2.3 From 080f6276fccfb3923691e57c1b44a627eabd1a25 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 21 Feb 2020 08:33:57 -0600 Subject: proc: In proc_prune_siblings_dcache cache an aquired super block Because there are likely to be several sysctls in a row on the same superblock cache the super_block after the count has been raised and don't deactivate it until we are processing another super_block. Signed-off-by: "Eric W. Biederman" --- fs/proc/inode.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 74ce4a8d05eb..fa2dc732cd77 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -108,10 +108,11 @@ void proc_prune_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock) struct inode *inode; struct proc_inode *ei; struct hlist_node *node; - struct super_block *sb; + struct super_block *old_sb = NULL; rcu_read_lock(); for (;;) { + struct super_block *sb; node = hlist_first_rcu(inodes); if (!node) break; @@ -122,23 +123,28 @@ void proc_prune_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock) inode = &ei->vfs_inode; sb = inode->i_sb; - if (!atomic_inc_not_zero(&sb->s_active)) + if ((sb != old_sb) && !atomic_inc_not_zero(&sb->s_active)) continue; inode = igrab(inode); rcu_read_unlock(); + if (sb != old_sb) { + if (old_sb) + deactivate_super(old_sb); + old_sb = sb; + } if (unlikely(!inode)) { - deactivate_super(sb); rcu_read_lock(); continue; } d_prune_aliases(inode); iput(inode); - deactivate_super(sb); rcu_read_lock(); } rcu_read_unlock(); + if (old_sb) + deactivate_super(old_sb); } static int proc_show_options(struct seq_file *seq, struct dentry *root) -- cgit v1.2.3 From f90f3cafe8d56d593fc509a4185da1d5800efea4 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 21 Feb 2020 08:43:23 -0600 Subject: proc: Use d_invalidate in proc_prune_siblings_dcache The function d_prune_aliases has the problem that it will only prune aliases thare are completely unused. It will not remove aliases for the dcache or even think of removing mounts from the dcache. For that behavior d_invalidate is needed. To use d_invalidate replace d_prune_aliases with d_find_alias followed by d_invalidate and dput. For completeness the directory and the non-directory cases are separated because in theory (although not in currently in practice for proc) directories can only ever have a single dentry while non-directories can have hardlinks and thus multiple dentries. As part of this separation use d_find_any_alias for directories to spare d_find_alias the extra work of doing that. Plus the differences between d_find_any_alias and d_find_alias makes it clear why the directory and non-directory code and not share code. To make it clear these routines now invalidate dentries rename proc_prune_siblings_dache to proc_invalidate_siblings_dcache, and rename proc_sys_prune_dcache proc_sys_invalidate_dcache. V2: Split the directory and non-directory cases. To make this code robust to future changes in proc. Signed-off-by: "Eric W. Biederman" --- fs/proc/inode.c | 16 ++++++++++++++-- fs/proc/internal.h | 2 +- fs/proc/proc_sysctl.c | 8 ++++---- 3 files changed, 19 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index fa2dc732cd77..ba6acd300ce1 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -103,7 +103,7 @@ void __init proc_init_kmemcache(void) BUILD_BUG_ON(sizeof(struct proc_dir_entry) >= SIZEOF_PDE); } -void proc_prune_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock) +void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock) { struct inode *inode; struct proc_inode *ei; @@ -137,7 +137,19 @@ void proc_prune_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock) continue; } - d_prune_aliases(inode); + if (S_ISDIR(inode->i_mode)) { + struct dentry *dir = d_find_any_alias(inode); + if (dir) { + d_invalidate(dir); + dput(dir); + } + } else { + struct dentry *dentry; + while ((dentry = d_find_alias(inode))) { + d_invalidate(dentry); + dput(dentry); + } + } iput(inode); rcu_read_lock(); diff --git a/fs/proc/internal.h b/fs/proc/internal.h index ba9a991824a5..fd470172675f 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -210,7 +210,7 @@ extern const struct inode_operations proc_pid_link_inode_operations; extern const struct super_operations proc_sops; void proc_init_kmemcache(void); -void proc_prune_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock); +void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock); void set_proc_pid_nlink(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); extern void proc_entry_rundown(struct proc_dir_entry *); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 5da9d7f7ae34..b6f5d459b087 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -267,9 +267,9 @@ static void unuse_table(struct ctl_table_header *p) complete(p->unregistering); } -static void proc_sys_prune_dcache(struct ctl_table_header *head) +static void proc_sys_invalidate_dcache(struct ctl_table_header *head) { - proc_prune_siblings_dcache(&head->inodes, &sysctl_lock); + proc_invalidate_siblings_dcache(&head->inodes, &sysctl_lock); } /* called under sysctl_lock, will reacquire if has to wait */ @@ -291,10 +291,10 @@ static void start_unregistering(struct ctl_table_header *p) spin_unlock(&sysctl_lock); } /* - * Prune dentries for unregistered sysctls: namespaced sysctls + * Invalidate dentries for unregistered sysctls: namespaced sysctls * can have duplicate names and contaminate dcache very badly. */ - proc_sys_prune_dcache(p); + proc_sys_invalidate_dcache(p); /* * do not remove from the list until nobody holds it; walking the * list in do_sysctl() relies on that. -- cgit v1.2.3 From 71448011ea2a1cd36d8f5cbdab0ed716c454d565 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 20 Feb 2020 11:17:28 -0600 Subject: proc: Clear the pieces of proc_inode that proc_evict_inode cares about This just keeps everything tidier, and allows for using flags like SLAB_TYPESAFE_BY_RCU where slabs are not always cleared before reuse. I don't see reuse without reinitializing happening with the proc_inode but I had a false alarm while reworking flushing of proc dentries and indoes when a process dies that caused me to tidy this up. The code is a little easier to follow and reason about this way so I figured the changes might as well be kept. Signed-off-by: "Eric W. Biederman" --- fs/proc/inode.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index ba6acd300ce1..d9243b24554a 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -33,21 +33,27 @@ static void proc_evict_inode(struct inode *inode) { struct proc_dir_entry *de; struct ctl_table_header *head; + struct proc_inode *ei = PROC_I(inode); truncate_inode_pages_final(&inode->i_data); clear_inode(inode); /* Stop tracking associated processes */ - put_pid(PROC_I(inode)->pid); + if (ei->pid) { + put_pid(ei->pid); + ei->pid = NULL; + } /* Let go of any associated proc directory entry */ - de = PDE(inode); - if (de) + de = ei->pde; + if (de) { pde_put(de); + ei->pde = NULL; + } - head = PROC_I(inode)->sysctl; + head = ei->sysctl; if (head) { - RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL); + RCU_INIT_POINTER(ei->sysctl, NULL); proc_sys_evict_inode(inode, head); } } -- cgit v1.2.3 From 7bc3e6e55acf065500a24621f3b313e7e5998acf Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 19 Feb 2020 18:22:26 -0600 Subject: proc: Use a list of inodes to flush from proc Rework the flushing of proc to use a list of directory inodes that need to be flushed. The list is kept on struct pid not on struct task_struct, as there is a fixed connection between proc inodes and pids but at least for the case of de_thread the pid of a task_struct changes. This removes the dependency on proc_mnt which allows for different mounts of proc having different mount options even in the same pid namespace and this allows for the removal of proc_mnt which will trivially the first mount of proc to honor it's mount options. This flushing remains an optimization. The functions pid_delete_dentry and pid_revalidate ensure that ordinary dcache management will not attempt to use dentries past the point their respective task has died. When unused the shrinker will eventually be able to remove these dentries. There is a case in de_thread where proc_flush_pid can be called early for a given pid. Which winds up being safe (if suboptimal) as this is just an optiimization. Only pid directories are put on the list as the other per pid files are children of those directories and d_invalidate on the directory will get them as well. So that the pid can be used during flushing it's reference count is taken in release_task and dropped in proc_flush_pid. Further the call of proc_flush_pid is moved after the tasklist_lock is released in release_task so that it is certain that the pid has already been unhashed when flushing it taking place. This removes a small race where a dentry could recreated. As struct pid is supposed to be small and I need a per pid lock I reuse the only lock that currently exists in struct pid the the wait_pidfd.lock. The net result is that this adds all of this functionality with just a little extra list management overhead and a single extra pointer in struct pid. v2: Initialize pid->inodes. I somehow failed to get that initialization into the initial version of the patch. A boot failure was reported by "kernel test robot ", and failure to initialize that pid->inodes matches all of the reported symptoms. Signed-off-by: Eric W. Biederman --- fs/proc/base.c | 111 +++++++++++++++++------------------------------------ fs/proc/inode.c | 2 +- fs/proc/internal.h | 1 + 3 files changed, 38 insertions(+), 76 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index c7c64272b0fa..e7efe9d6f3d6 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1834,11 +1834,25 @@ void task_dump_owner(struct task_struct *task, umode_t mode, *rgid = gid; } +void proc_pid_evict_inode(struct proc_inode *ei) +{ + struct pid *pid = ei->pid; + + if (S_ISDIR(ei->vfs_inode.i_mode)) { + spin_lock(&pid->wait_pidfd.lock); + hlist_del_init_rcu(&ei->sibling_inodes); + spin_unlock(&pid->wait_pidfd.lock); + } + + put_pid(pid); +} + struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task, umode_t mode) { struct inode * inode; struct proc_inode *ei; + struct pid *pid; /* We need a new inode */ @@ -1856,10 +1870,18 @@ struct inode *proc_pid_make_inode(struct super_block * sb, /* * grab the reference to task. */ - ei->pid = get_task_pid(task, PIDTYPE_PID); - if (!ei->pid) + pid = get_task_pid(task, PIDTYPE_PID); + if (!pid) goto out_unlock; + /* Let the pid remember us for quick removal */ + ei->pid = pid; + if (S_ISDIR(mode)) { + spin_lock(&pid->wait_pidfd.lock); + hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes); + spin_unlock(&pid->wait_pidfd.lock); + } + task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid); security_task_to_inode(task, inode); @@ -3230,90 +3252,29 @@ static const struct inode_operations proc_tgid_base_inode_operations = { .permission = proc_pid_permission, }; -static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) -{ - struct dentry *dentry, *leader, *dir; - char buf[10 + 1]; - struct qstr name; - - name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%u", pid); - /* no ->d_hash() rejects on procfs */ - dentry = d_hash_and_lookup(mnt->mnt_root, &name); - if (dentry) { - d_invalidate(dentry); - dput(dentry); - } - - if (pid == tgid) - return; - - name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%u", tgid); - leader = d_hash_and_lookup(mnt->mnt_root, &name); - if (!leader) - goto out; - - name.name = "task"; - name.len = strlen(name.name); - dir = d_hash_and_lookup(leader, &name); - if (!dir) - goto out_put_leader; - - name.name = buf; - name.len = snprintf(buf, sizeof(buf), "%u", pid); - dentry = d_hash_and_lookup(dir, &name); - if (dentry) { - d_invalidate(dentry); - dput(dentry); - } - - dput(dir); -out_put_leader: - dput(leader); -out: - return; -} - /** - * proc_flush_task - Remove dcache entries for @task from the /proc dcache. - * @task: task that should be flushed. + * proc_flush_pid - Remove dcache entries for @pid from the /proc dcache. + * @pid: pid that should be flushed. * - * When flushing dentries from proc, one needs to flush them from global - * proc (proc_mnt) and from all the namespaces' procs this task was seen - * in. This call is supposed to do all of this job. - * - * Looks in the dcache for - * /proc/@pid - * /proc/@tgid/task/@pid - * if either directory is present flushes it and all of it'ts children - * from the dcache. + * This function walks a list of inodes (that belong to any proc + * filesystem) that are attached to the pid and flushes them from + * the dentry cache. * * It is safe and reasonable to cache /proc entries for a task until * that task exits. After that they just clog up the dcache with * useless entries, possibly causing useful dcache entries to be - * flushed instead. This routine is proved to flush those useless - * dcache entries at process exit time. + * flushed instead. This routine is provided to flush those useless + * dcache entries when a process is reaped. * * NOTE: This routine is just an optimization so it does not guarantee - * that no dcache entries will exist at process exit time it - * just makes it very unlikely that any will persist. + * that no dcache entries will exist after a process is reaped + * it just makes it very unlikely that any will persist. */ -void proc_flush_task(struct task_struct *task) +void proc_flush_pid(struct pid *pid) { - int i; - struct pid *pid, *tgid; - struct upid *upid; - - pid = task_pid(task); - tgid = task_tgid(task); - - for (i = 0; i <= pid->level; i++) { - upid = &pid->numbers[i]; - proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, - tgid->numbers[i].nr); - } + proc_invalidate_siblings_dcache(&pid->inodes, &pid->wait_pidfd.lock); + put_pid(pid); } static struct dentry *proc_pid_instantiate(struct dentry * dentry, diff --git a/fs/proc/inode.c b/fs/proc/inode.c index d9243b24554a..1e730ea1dcd6 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -40,7 +40,7 @@ static void proc_evict_inode(struct inode *inode) /* Stop tracking associated processes */ if (ei->pid) { - put_pid(ei->pid); + proc_pid_evict_inode(ei); ei->pid = NULL; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index fd470172675f..9e294f0290e5 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -158,6 +158,7 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, extern const struct dentry_operations pid_dentry_operations; extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int); extern int proc_setattr(struct dentry *, struct iattr *); +extern void proc_pid_evict_inode(struct proc_inode *); extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t); extern void pid_update_inode(struct task_struct *, struct inode *); extern int pid_delete_dentry(const struct dentry *); -- cgit v1.2.3 From 69879c01a0c3f70e0887cfb4d9ff439814361e46 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 20 Feb 2020 08:08:20 -0600 Subject: proc: Remove the now unnecessary internal mount of proc There remains no more code in the kernel using pids_ns->proc_mnt, therefore remove it from the kernel. The big benefit of this change is that one of the most error prone and tricky parts of the pid namespace implementation, maintaining kernel mounts of proc is removed. In addition removing the unnecessary complexity of the kernel mount fixes a regression that caused the proc mount options to be ignored. Now that the initial mount of proc comes from userspace, those mount options are again honored. This fixes Android's usage of the proc hidepid option. Reported-by: Alistair Strachan Fixes: e94591d0d90c ("proc: Convert proc_mount to use mount_ns.") Signed-off-by: "Eric W. Biederman" --- fs/proc/root.c | 36 ------------------------------------ 1 file changed, 36 deletions(-) (limited to 'fs') diff --git a/fs/proc/root.c b/fs/proc/root.c index 608233dfd29c..2633f10446c3 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -292,39 +292,3 @@ struct proc_dir_entry proc_root = { .subdir = RB_ROOT, .name = "/proc", }; - -int pid_ns_prepare_proc(struct pid_namespace *ns) -{ - struct proc_fs_context *ctx; - struct fs_context *fc; - struct vfsmount *mnt; - - fc = fs_context_for_mount(&proc_fs_type, SB_KERNMOUNT); - if (IS_ERR(fc)) - return PTR_ERR(fc); - - if (fc->user_ns != ns->user_ns) { - put_user_ns(fc->user_ns); - fc->user_ns = get_user_ns(ns->user_ns); - } - - ctx = fc->fs_private; - if (ctx->pid_ns != ns) { - put_pid_ns(ctx->pid_ns); - get_pid_ns(ns); - ctx->pid_ns = ns; - } - - mnt = fc_mount(fc); - put_fs_context(fc); - if (IS_ERR(mnt)) - return PTR_ERR(mnt); - - ns->proc_mnt = mnt; - return 0; -} - -void pid_ns_release_proc(struct pid_namespace *ns) -{ - kern_unmount(ns->proc_mnt); -} -- cgit v1.2.3 From 2ca7be7d55ad84fb0f7c2a23fb700a28fd76b19a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 25 Mar 2020 10:00:07 -0500 Subject: exec: Only compute current once in flush_old_exec Make it clear that current only needs to be computed once in flush_old_exec. This may have some efficiency improvements and it makes the code easier to change. Signed-off-by: "Eric W. Biederman" Reviewed-by: Bernd Edlinger Reviewed-by: Kees Cook Acked-by: Christian Brauner Reviewed-by: Kirill Tkhai Signed-off-by: Bernd Edlinger Signed-off-by: Eric W. Biederman --- fs/exec.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index db17be51b112..c3f34791f2f0 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1260,13 +1260,14 @@ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) */ int flush_old_exec(struct linux_binprm * bprm) { + struct task_struct *me = current; int retval; /* * Make sure we have a private signal table and that * we are unassociated from the previous thread group. */ - retval = de_thread(current); + retval = de_thread(me); if (retval) goto out; @@ -1294,10 +1295,10 @@ int flush_old_exec(struct linux_binprm * bprm) bprm->mm = NULL; set_fs(USER_DS); - current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | + me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | PF_NOFREEZE | PF_NO_SETAFFINITY); flush_thread(); - current->personality &= ~bprm->per_clear; + me->personality &= ~bprm->per_clear; /* * We have to apply CLOEXEC before we change whether the process is @@ -1305,7 +1306,7 @@ int flush_old_exec(struct linux_binprm * bprm) * trying to access the should-be-closed file descriptors of a process * undergoing exec(2). */ - do_close_on_exec(current->files); + do_close_on_exec(me->files); return 0; out: -- cgit v1.2.3 From 021691559245498dfa15454c9fc4351f367d0b7f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 25 Mar 2020 10:00:21 -0500 Subject: exec: Factor unshare_sighand out of de_thread and call it separately This makes the code clearer and makes it easier to implement a mutex that is not taken over any locations that may block indefinitely waiting for userspace. Signed-off-by: "Eric W. Biederman" Reviewed-by: Bernd Edlinger Reviewed-by: Kees Cook Acked-by: Christian Brauner Reviewed-by: Kirill Tkhai Signed-off-by: Bernd Edlinger Signed-off-by: Eric W. Biederman --- fs/exec.c | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index c3f34791f2f0..ff74b9a74d34 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1194,6 +1194,23 @@ no_thread_group: flush_itimer_signals(); #endif + BUG_ON(!thread_group_leader(tsk)); + return 0; + +killed: + /* protects against exit_notify() and __exit_signal() */ + read_lock(&tasklist_lock); + sig->group_exit_task = NULL; + sig->notify_count = 0; + read_unlock(&tasklist_lock); + return -EAGAIN; +} + + +static int unshare_sighand(struct task_struct *me) +{ + struct sighand_struct *oldsighand = me->sighand; + if (refcount_read(&oldsighand->count) != 1) { struct sighand_struct *newsighand; /* @@ -1210,23 +1227,13 @@ no_thread_group: write_lock_irq(&tasklist_lock); spin_lock(&oldsighand->siglock); - rcu_assign_pointer(tsk->sighand, newsighand); + rcu_assign_pointer(me->sighand, newsighand); spin_unlock(&oldsighand->siglock); write_unlock_irq(&tasklist_lock); __cleanup_sighand(oldsighand); } - - BUG_ON(!thread_group_leader(tsk)); return 0; - -killed: - /* protects against exit_notify() and __exit_signal() */ - read_lock(&tasklist_lock); - sig->group_exit_task = NULL; - sig->notify_count = 0; - read_unlock(&tasklist_lock); - return -EAGAIN; } char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk) @@ -1264,13 +1271,19 @@ int flush_old_exec(struct linux_binprm * bprm) int retval; /* - * Make sure we have a private signal table and that - * we are unassociated from the previous thread group. + * Make this the only thread in the thread group. */ retval = de_thread(me); if (retval) goto out; + /* + * Make the signal table private. + */ + retval = unshare_sighand(me); + if (retval) + goto out; + /* * Must be called _before_ exec_mmap() as bprm->mm is * not visibile until then. This also enables the update -- cgit v1.2.3 From 153ffb6ba49fd80dc607a9f230415af02b728d70 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 25 Mar 2020 10:00:38 -0500 Subject: exec: Move cleanup of posix timers on exec out of de_thread These functions have very little to do with de_thread move them out of de_thread an into flush_old_exec proper so it can be more clearly seen what flush_old_exec is doing. Signed-off-by: "Eric W. Biederman" Reviewed-by: Bernd Edlinger Reviewed-by: Kees Cook Acked-by: Christian Brauner Reviewed-by: Kirill Tkhai Signed-off-by: Bernd Edlinger Signed-off-by: Eric W. Biederman --- fs/exec.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index ff74b9a74d34..215d86f77b63 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1189,11 +1189,6 @@ no_thread_group: /* we have changed execution domain */ tsk->exit_signal = SIGCHLD; -#ifdef CONFIG_POSIX_TIMERS - exit_itimers(sig); - flush_itimer_signals(); -#endif - BUG_ON(!thread_group_leader(tsk)); return 0; @@ -1277,6 +1272,11 @@ int flush_old_exec(struct linux_binprm * bprm) if (retval) goto out; +#ifdef CONFIG_POSIX_TIMERS + exit_itimers(me->signal); + flush_itimer_signals(); +#endif + /* * Make the signal table private. */ -- cgit v1.2.3 From ccf0fa6be02687868006fa0c85af32c40cae6fb6 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 25 Mar 2020 10:03:21 -0500 Subject: exec: Move exec_mmap right after de_thread in flush_old_exec I have read through the code in exec_mmap and I do not see anything that depends on sighand or the sighand lock, or on signals in anyway so this should be safe. This rearrangement of code has two significant benefits. It makes the determination of passing the point of no return by testing bprm->mm accurate. All failures prior to that point in flush_old_exec are either truly recoverable or they are fatal. Further this consolidates all of the possible indefinite waits for userspace together at the top of flush_old_exec. The possible wait for a ptracer on PTRACE_EVENT_EXIT, the possible wait for a page fault to be resolved in clear_child_tid, and the possible wait for a page fault in exit_robust_list. This consolidation allows the creation of a mutex to replace cred_guard_mutex that is not held over possible indefinite userspace waits. Which will allow removing deadlock scenarios from the kernel. Signed-off-by: "Eric W. Biederman" Reviewed-by: Bernd Edlinger Reviewed-by: Kees Cook Reviewed-by: Kirill Tkhai Signed-off-by: Bernd Edlinger Signed-off-by: Eric W. Biederman --- fs/exec.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 215d86f77b63..d820a7272a76 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1272,18 +1272,6 @@ int flush_old_exec(struct linux_binprm * bprm) if (retval) goto out; -#ifdef CONFIG_POSIX_TIMERS - exit_itimers(me->signal); - flush_itimer_signals(); -#endif - - /* - * Make the signal table private. - */ - retval = unshare_sighand(me); - if (retval) - goto out; - /* * Must be called _before_ exec_mmap() as bprm->mm is * not visibile until then. This also enables the update @@ -1307,6 +1295,18 @@ int flush_old_exec(struct linux_binprm * bprm) */ bprm->mm = NULL; +#ifdef CONFIG_POSIX_TIMERS + exit_itimers(me->signal); + flush_itimer_signals(); +#endif + + /* + * Make the signal table private. + */ + retval = unshare_sighand(me); + if (retval) + goto out; + set_fs(USER_DS); me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | PF_NOFREEZE | PF_NO_SETAFFINITY); -- cgit v1.2.3 From eea9673250db4e854e9998ef9da6d4584857f0ea Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 25 Mar 2020 10:03:36 -0500 Subject: exec: Add exec_update_mutex to replace cred_guard_mutex The cred_guard_mutex is problematic as it is held over possibly indefinite waits for userspace. The possible indefinite waits for userspace that I have identified are: The cred_guard_mutex is held in PTRACE_EVENT_EXIT waiting for the tracer. The cred_guard_mutex is held over "put_user(0, tsk->clear_child_tid)" in exit_mm(). The cred_guard_mutex is held over "get_user(futex_offset, ...") in exit_robust_list. The cred_guard_mutex held over copy_strings. The functions get_user and put_user can trigger a page fault which can potentially wait indefinitely in the case of userfaultfd or if userspace implements part of the page fault path. In any of those cases the userspace process that the kernel is waiting for might make a different system call that winds up taking the cred_guard_mutex and result in deadlock. Holding a mutex over any of those possibly indefinite waits for userspace does not appear necessary. Add exec_update_mutex that will just cover updating the process during exec where the permissions and the objects pointed to by the task struct may be out of sync. The plan is to switch the users of cred_guard_mutex to exec_update_mutex one by one. This lets us move forward while still being careful and not introducing any regressions. Link: https://lore.kernel.org/lkml/20160921152946.GA24210@dhcp22.suse.cz/ Link: https://lore.kernel.org/lkml/AM6PR03MB5170B06F3A2B75EFB98D071AE4E60@AM6PR03MB5170.eurprd03.prod.outlook.com/ Link: https://lore.kernel.org/linux-fsdevel/20161102181806.GB1112@redhat.com/ Link: https://lore.kernel.org/lkml/20160923095031.GA14923@redhat.com/ Link: https://lore.kernel.org/lkml/20170213141452.GA30203@redhat.com/ Ref: 45c1a159b85b ("Add PTRACE_O_TRACEVFORKDONE and PTRACE_O_TRACEEXIT facilities.") Ref: 456f17cd1a28 ("[PATCH] user-vm-unlock-2.5.31-A2") Reviewed-by: Kirill Tkhai Signed-off-by: "Eric W. Biederman" Signed-off-by: Bernd Edlinger Signed-off-by: Eric W. Biederman --- fs/exec.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index d820a7272a76..0e46ec57fe0a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1010,16 +1010,26 @@ ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) } EXPORT_SYMBOL(read_code); +/* + * Maps the mm_struct mm into the current task struct. + * On success, this function returns with the mutex + * exec_update_mutex locked. + */ static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; struct mm_struct *old_mm, *active_mm; + int ret; /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; exec_mm_release(tsk, old_mm); + ret = mutex_lock_killable(&tsk->signal->exec_update_mutex); + if (ret) + return ret; + if (old_mm) { sync_mm_rss(old_mm); /* @@ -1031,9 +1041,11 @@ static int exec_mmap(struct mm_struct *mm) down_read(&old_mm->mmap_sem); if (unlikely(old_mm->core_state)) { up_read(&old_mm->mmap_sem); + mutex_unlock(&tsk->signal->exec_update_mutex); return -EINTR; } } + task_lock(tsk); active_mm = tsk->active_mm; membarrier_exec_mmap(mm); @@ -1288,11 +1300,12 @@ int flush_old_exec(struct linux_binprm * bprm) goto out; /* - * After clearing bprm->mm (to mark that current is using the - * prepared mm now), we have nothing left of the original + * After setting bprm->called_exec_mmap (to mark that current is + * using the prepared mm now), we have nothing left of the original * process. If anything from here on returns an error, the check * in search_binary_handler() will SEGV current. */ + bprm->called_exec_mmap = 1; bprm->mm = NULL; #ifdef CONFIG_POSIX_TIMERS @@ -1438,6 +1451,8 @@ static void free_bprm(struct linux_binprm *bprm) { free_arg_pages(bprm); if (bprm->cred) { + if (bprm->called_exec_mmap) + mutex_unlock(¤t->signal->exec_update_mutex); mutex_unlock(¤t->signal->cred_guard_mutex); abort_creds(bprm->cred); } @@ -1487,6 +1502,7 @@ void install_exec_creds(struct linux_binprm *bprm) * credentials; any time after this it may be unlocked. */ security_bprm_committed_creds(bprm); + mutex_unlock(¤t->signal->exec_update_mutex); mutex_unlock(¤t->signal->cred_guard_mutex); } EXPORT_SYMBOL(install_exec_creds); @@ -1678,7 +1694,7 @@ int search_binary_handler(struct linux_binprm *bprm) read_lock(&binfmt_lock); put_binfmt(fmt); - if (retval < 0 && !bprm->mm) { + if (retval < 0 && bprm->called_exec_mmap) { /* we got to flush_old_exec() and failed after it */ read_unlock(&binfmt_lock); force_sigsegv(SIGSEGV); -- cgit v1.2.3 From 2db9dbf71bf98d02a0bf33e798e5bfd2a9944696 Mon Sep 17 00:00:00 2001 From: Bernd Edlinger Date: Fri, 20 Mar 2020 21:27:24 +0100 Subject: proc: Use new infrastructure to fix deadlocks in execve This changes lock_trace to use the new exec_update_mutex instead of cred_guard_mutex. This fixes possible deadlocks when the trace is accessing /proc/$pid/stack for instance. This should be safe, as the credentials are only used for reading, and task->mm is updated on execve under the new exec_update_mutex. Signed-off-by: Bernd Edlinger Signed-off-by: Eric W. Biederman --- fs/proc/base.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index e7efe9d6f3d6..a278fda0458b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -405,11 +405,11 @@ print0: static int lock_trace(struct task_struct *task) { - int err = mutex_lock_killable(&task->signal->cred_guard_mutex); + int err = mutex_lock_killable(&task->signal->exec_update_mutex); if (err) return err; if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) { - mutex_unlock(&task->signal->cred_guard_mutex); + mutex_unlock(&task->signal->exec_update_mutex); return -EPERM; } return 0; @@ -417,7 +417,7 @@ static int lock_trace(struct task_struct *task) static void unlock_trace(struct task_struct *task) { - mutex_unlock(&task->signal->cred_guard_mutex); + mutex_unlock(&task->signal->exec_update_mutex); } #ifdef CONFIG_STACKTRACE -- cgit v1.2.3 From 76518d3798855242817e8a8ed76b2d72f4415624 Mon Sep 17 00:00:00 2001 From: Bernd Edlinger Date: Fri, 20 Mar 2020 21:27:41 +0100 Subject: proc: io_accounting: Use new infrastructure to fix deadlocks in execve This changes do_io_accounting to use the new exec_update_mutex instead of cred_guard_mutex. This fixes possible deadlocks when the trace is accessing /proc/$pid/io for instance. This should be safe, as the credentials are only used for reading. Signed-off-by: Bernd Edlinger Signed-off-by: Eric W. Biederman --- fs/proc/base.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index a278fda0458b..74f948a6b621 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2883,7 +2883,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh unsigned long flags; int result; - result = mutex_lock_killable(&task->signal->cred_guard_mutex); + result = mutex_lock_killable(&task->signal->exec_update_mutex); if (result) return result; @@ -2919,7 +2919,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh result = 0; out_unlock: - mutex_unlock(&task->signal->cred_guard_mutex); + mutex_unlock(&task->signal->exec_update_mutex); return result; } -- cgit v1.2.3 From d1e7fd6462ca9fc76650fbe6ca800e35b24267da Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 30 Mar 2020 19:01:04 -0500 Subject: signal: Extend exec_id to 64bits Replace the 32bit exec_id with a 64bit exec_id to make it impossible to wrap the exec_id counter. With care an attacker can cause exec_id wrap and send arbitrary signals to a newly exec'd parent. This bypasses the signal sending checks if the parent changes their credentials during exec. The severity of this problem can been seen that in my limited testing of a 32bit exec_id it can take as little as 19s to exec 65536 times. Which means that it can take as little as 14 days to wrap a 32bit exec_id. Adam Zabrocki has succeeded wrapping the self_exe_id in 7 days. Even my slower timing is in the uptime of a typical server. Which means self_exec_id is simply a speed bump today, and if exec gets noticably faster self_exec_id won't even be a speed bump. Extending self_exec_id to 64bits introduces a problem on 32bit architectures where reading self_exec_id is no longer atomic and can take two read instructions. Which means that is is possible to hit a window where the read value of exec_id does not match the written value. So with very lucky timing after this change this still remains expoiltable. I have updated the update of exec_id on exec to use WRITE_ONCE and the read of exec_id in do_notify_parent to use READ_ONCE to make it clear that there is no locking between these two locations. Link: https://lore.kernel.org/kernel-hardening/20200324215049.GA3710@pi3.com.pl Fixes: 2.3.23pre2 Cc: stable@vger.kernel.org Signed-off-by: "Eric W. Biederman" --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 0e46ec57fe0a..d55710a36056 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1413,7 +1413,7 @@ void setup_new_exec(struct linux_binprm * bprm) /* An exec changes our domain. We are no longer part of the thread group */ - current->self_exec_id++; + WRITE_ONCE(current->self_exec_id, current->self_exec_id + 1); flush_signal_handlers(current, 0); } EXPORT_SYMBOL(setup_new_exec); -- cgit v1.2.3