From 6b4e306aa3dc94a0545eb9279475b1ab6209a31f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 7 Mar 2010 16:41:34 -0800 Subject: ns: proc files for namespace naming policy. Create files under /proc/PID/ns/ to allow controlling the namespaces of a process. This addresses three specific problems that can make namespaces hard to work with. - Namespaces require a dedicated process to pin them in memory. - It is not possible to use a namespace unless you are the child of the original creator. - Namespaces don't have names that userspace can use to talk about them. The namespace files under /proc/PID/ns/ can be opened and the file descriptor can be used to talk about a specific namespace, and to keep the specified namespace alive. A namespace can be kept alive by either holding the file descriptor open or bind mounting the file someplace else. aka: mount --bind /proc/self/ns/net /some/filesystem/path mount --bind /proc/self/fd/N /some/filesystem/path This allows namespaces to be named with userspace policy. It requires additional support to make use of these file descriptors and that will be coming in the following patches. Acked-by: Daniel Lezcano Signed-off-by: Eric W. 
Biederman --- fs/proc/base.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'fs/proc/base.c') diff --git a/fs/proc/base.c b/fs/proc/base.c index dfa532730e55..dc8bca72b002 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -600,7 +600,7 @@ static int proc_fd_access_allowed(struct inode *inode) return allowed; } -static int proc_setattr(struct dentry *dentry, struct iattr *attr) +int proc_setattr(struct dentry *dentry, struct iattr *attr) { int error; struct inode *inode = dentry->d_inode; @@ -1736,8 +1736,7 @@ static int task_dumpable(struct task_struct *task) return 0; } - -static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task) +struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task) { struct inode * inode; struct proc_inode *ei; @@ -1779,7 +1778,7 @@ out_unlock: return NULL; } -static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; struct task_struct *task; @@ -1820,7 +1819,7 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat * made this apply to all per process world readable and executable * directories. 
*/ -static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) +int pid_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *inode; struct task_struct *task; @@ -1862,7 +1861,7 @@ static int pid_delete_dentry(const struct dentry * dentry) return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; } -static const struct dentry_operations pid_dentry_operations = +const struct dentry_operations pid_dentry_operations = { .d_revalidate = pid_revalidate, .d_delete = pid_delete_dentry, @@ -1870,9 +1869,6 @@ static const struct dentry_operations pid_dentry_operations = /* Lookups */ -typedef struct dentry *instantiate_t(struct inode *, struct dentry *, - struct task_struct *, const void *); - /* * Fill a directory entry. * @@ -1885,8 +1881,8 @@ typedef struct dentry *instantiate_t(struct inode *, struct dentry *, * reported by readdir in sync with the inode numbers reported * by stat. */ -static int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, - char *name, int len, +int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + const char *name, int len, instantiate_t instantiate, struct task_struct *task, const void *ptr) { struct dentry *child, *dir = filp->f_path.dentry; @@ -2820,6 +2816,7 @@ static const struct pid_entry tgid_base_stuff[] = { DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), + DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), #endif @@ -3168,6 +3165,7 @@ out_no_task: static const struct pid_entry tid_base_stuff[] = { DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), 
+ DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), REG("environ", S_IRUSR, proc_environ_operations), INF("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), -- cgit v1.2.3 From 3864601387cf4196371e3c1897fdffa5228296f9 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 26 May 2011 16:25:46 -0700 Subject: mm: extract exe_file handling from procfs Setup and cleanup of mm_struct->exe_file is currently done in fs/proc/. This was because exe_file was needed only for /proc//exe. Since we will need the exe_file functionality also for core dumps (so core name can contain full binary path), built this functionality always into the kernel. To achieve that move that out of proc FS to the kernel/ where in fact it should belong. By doing that we can make dup_mm_exe_file static. Also we can drop linux/proc_fs.h inclusion in fs/exec.c and kernel/fork.c. Signed-off-by: Jiri Slaby Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 51 --------------------------------------------------- 1 file changed, 51 deletions(-) (limited to 'fs/proc/base.c') diff --git a/fs/proc/base.c b/fs/proc/base.c index dc8bca72b002..c2ac2fb123c8 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1576,57 +1576,6 @@ static const struct file_operations proc_pid_set_comm_operations = { .release = single_release, }; -/* - * We added or removed a vma mapping the executable. The vmas are only mapped - * during exec and are not mapped with the mmap system call. 
- * Callers must hold down_write() on the mm's mmap_sem for these - */ -void added_exe_file_vma(struct mm_struct *mm) -{ - mm->num_exe_file_vmas++; -} - -void removed_exe_file_vma(struct mm_struct *mm) -{ - mm->num_exe_file_vmas--; - if ((mm->num_exe_file_vmas == 0) && mm->exe_file){ - fput(mm->exe_file); - mm->exe_file = NULL; - } - -} - -void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) -{ - if (new_exe_file) - get_file(new_exe_file); - if (mm->exe_file) - fput(mm->exe_file); - mm->exe_file = new_exe_file; - mm->num_exe_file_vmas = 0; -} - -struct file *get_mm_exe_file(struct mm_struct *mm) -{ - struct file *exe_file; - - /* We need mmap_sem to protect against races with removal of - * VM_EXECUTABLE vmas */ - down_read(&mm->mmap_sem); - exe_file = mm->exe_file; - if (exe_file) - get_file(exe_file); - up_read(&mm->mmap_sem); - return exe_file; -} - -void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm) -{ - /* It's safe to write the exe_file pointer without exe_file_lock because - * this is called during fork when the task is not yet in /proc */ - newmm->exe_file = get_mm_exe_file(oldmm); -} - static int proc_exe_link(struct inode *inode, struct path *exe_path) { struct task_struct *task; -- cgit v1.2.3 From 0a8cb8e34149251ad1f280fe099a4f971554639a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 26 May 2011 16:25:50 -0700 Subject: fs/proc: convert to kstrtoX() Convert fs/proc/ from strict_strto*() to kstrto*() functions. 
Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs/proc/base.c') diff --git a/fs/proc/base.c b/fs/proc/base.c index c2ac2fb123c8..0c2c50cc2cca 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1059,7 +1059,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, { struct task_struct *task; char buffer[PROC_NUMBUF]; - long oom_adjust; + int oom_adjust; unsigned long flags; int err; @@ -1071,7 +1071,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, goto out; } - err = strict_strtol(strstrip(buffer), 0, &oom_adjust); + err = kstrtoint(strstrip(buffer), 0, &oom_adjust); if (err) goto out; if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) && @@ -1168,7 +1168,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, struct task_struct *task; char buffer[PROC_NUMBUF]; unsigned long flags; - long oom_score_adj; + int oom_score_adj; int err; memset(buffer, 0, sizeof(buffer)); @@ -1179,7 +1179,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, goto out; } - err = strict_strtol(strstrip(buffer), 0, &oom_score_adj); + err = kstrtoint(strstrip(buffer), 0, &oom_score_adj); if (err) goto out; if (oom_score_adj < OOM_SCORE_ADJ_MIN || @@ -1468,7 +1468,7 @@ sched_autogroup_write(struct file *file, const char __user *buf, struct inode *inode = file->f_path.dentry->d_inode; struct task_struct *p; char buffer[PROC_NUMBUF]; - long nice; + int nice; int err; memset(buffer, 0, sizeof(buffer)); @@ -1477,9 +1477,9 @@ sched_autogroup_write(struct file *file, const char __user *buf, if (copy_from_user(buffer, buf, count)) return -EFAULT; - err = strict_strtol(strstrip(buffer), 0, &nice); - if (err) - return -EINVAL; + err = kstrtoint(strstrip(buffer), 0, &nice); + if (err < 0) + return err; p = get_proc_task(inode); if (!p) 
-- cgit v1.2.3 From 30cd8903913dac7b0918807cac46be3ecde5a5a7 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 26 May 2011 16:25:52 -0700 Subject: proc: put check_mem_permission after __get_free_page in mem_write It would be better to put check_mem_permission after __get_free_page in mem_write, to be the same as function mem_read. Hugh Dickins explained the reason. check_mem_permission gets a reference to the mm. If we __get_free_page after check_mem_permission, imagine what happens if the system is out of memory, and the mm we're looking at is selected for killing by the OOM killer: while we wait in __get_free_page for more memory, no memory is freed from the selected mm because it cannot reach exit_mmap while we hold that reference. Reported-by: Jovi Zhang Signed-off-by: KOSAKI Motohiro Acked-by: Hugh Dickins Reviewed-by: Stephen Wilson Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'fs/proc/base.c') diff --git a/fs/proc/base.c b/fs/proc/base.c index 0c2c50cc2cca..4ede550517a6 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -894,20 +894,20 @@ static ssize_t mem_write(struct file * file, const char __user *buf, if (!task) goto out_no_task; + copied = -ENOMEM; + page = (char *)__get_free_page(GFP_TEMPORARY); + if (!page) + goto out_task; + mm = check_mem_permission(task); copied = PTR_ERR(mm); if (IS_ERR(mm)) - goto out_task; + goto out_free; copied = -EIO; if (file->private_data != (void *)((long)current->self_exec_id)) goto out_mm; - copied = -ENOMEM; - page = (char *)__get_free_page(GFP_TEMPORARY); - if (!page) - goto out_mm; - copied = 0; while (count > 0) { int this_len, retval; @@ -929,9 +929,11 @@ static ssize_t mem_write(struct file * file, const char __user *buf, count -= retval; } *ppos = dst; - free_page((unsigned long) page); + out_mm: mmput(mm); +out_free: + free_page((unsigned long) page); out_task: 
put_task_struct(task); out_no_task: -- cgit v1.2.3 From f133ecca9cbb31b5e6e9bda27cbe3034fbf656df Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Thu, 26 May 2011 12:40:09 -0400 Subject: arch/tile: more /proc and /sys file support This change introduces a few of the less controversial /proc and /proc/sys interfaces for tile, along with sysfs attributes for various things that were originally proposed as /proc/tile files. It also adjusts the "hardwall" proc API. Arnd Bergmann reviewed the initial arch/tile submission, which included a complete set of all the /proc/tile and /proc/sys/tile knobs that we had added in a somewhat ad hoc way during initial development, and provided feedback on where most of them should go. One knob turned out to be similar enough to the existing /proc/sys/debug/exception-trace that it was re-implemented to use that model instead. Another knob was /proc/tile/grid, which reported the "grid" dimensions of a tile chip (e.g. 8x8 processors = 64-core chip). Arnd suggested looking at sysfs for that, so this change moves that information to a pair of sysfs attributes (chip_width and chip_height) in the /sys/devices/system/cpu directory. We also put the "chip_serial" and "chip_revision" information from our old /proc/tile/board file as attributes in /sys/devices/system/cpu. Other information collected via hypervisor APIs is now placed in /sys/hypervisor. We create a /sys/hypervisor/type file (holding the constant string "tilera") to be parallel with the Xen use of /sys/hypervisor/type holding "xen". We create three top-level files, "version" (the hypervisor's own version), "config_version" (the version of the configuration file), and "hvconfig" (the contents of the configuration file). The remaining information from our old /proc/tile/board and /proc/tile/switch files becomes an attribute group appearing under /sys/hypervisor/board/. 
Finally, after some feedback from Arnd Bergmann for the previous version of this patch, the /proc/tile/hardwall file is split up into two conceptual parts. First, a directory /proc/tile/hardwall/ which contains one file per active hardwall, each file named after the hardwall's ID and holding a cpulist that says which cpus are enclosed by the hardwall. Second, a /proc/PID file "hardwall" that is either empty (for non-hardwall-using processes) or contains the hardwall ID. Finally, this change pushes the /proc/sys/tile/unaligned_fixup/ directory, with knobs controlling the kernel code for handling the fixup of unaligned exceptions. Reviewed-by: Arnd Bergmann Signed-off-by: Chris Metcalf --- fs/proc/base.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs/proc/base.c') diff --git a/fs/proc/base.c b/fs/proc/base.c index dfa532730e55..3ad615fb8656 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -83,6 +83,9 @@ #include #include #include +#ifdef CONFIG_HARDWALL +#include +#endif #include "internal.h" /* NOTE: @@ -2894,6 +2897,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_TASK_IO_ACCOUNTING INF("io", S_IRUGO, proc_tgid_io_accounting), #endif +#ifdef CONFIG_HARDWALL + INF("hardwall", S_IRUGO, proc_pid_hardwall), +#endif }; static int proc_tgid_base_readdir(struct file * filp, @@ -3232,6 +3238,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_TASK_IO_ACCOUNTING INF("io", S_IRUGO, proc_tid_io_accounting), #endif +#ifdef CONFIG_HARDWALL + INF("hardwall", S_IRUGO, proc_pid_hardwall), +#endif }; static int proc_tid_base_readdir(struct file * filp, -- cgit v1.2.3 From cf1279111686d9742cbc4145bc9d526c83f59fea Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Jun 2011 20:35:23 -0400 Subject: proc_fd_permission() doesn't need to bail out in RCU mode nothing blocking except generic_permission() Signed-off-by: Al Viro --- fs/proc/base.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 
'fs/proc/base.c') diff --git a/fs/proc/base.c b/fs/proc/base.c index 14def991d9dd..8a84210ca080 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2169,11 +2169,7 @@ static const struct file_operations proc_fd_operations = { */ static int proc_fd_permission(struct inode *inode, int mask, unsigned int flags) { - int rv; - - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - rv = generic_permission(inode, mask, flags, NULL); + int rv = generic_permission(inode, mask, flags, NULL); if (rv == 0) return 0; if (task_pid(current) == proc_pid(inode)) -- cgit v1.2.3