From a3860c1c5dd1137db23d7786d284939c5761d517 Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Thu, 31 May 2012 16:26:04 -0700 Subject: introduce SIZE_MAX ULONG_MAX is often used to check for integer overflow when calculating allocation size. While ULONG_MAX happens to work on most systems, there is no guarantee that `size_t' must be the same size as `long'. This patch introduces SIZE_MAX, the maximum value of `size_t', to improve portability and readability for allocation size validation. Signed-off-by: Xi Wang Acked-by: Alex Elder Cc: David Airlie Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 1 + include/linux/slab.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index ec55a3c8ba77..e07f5e0c5df4 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -35,6 +35,7 @@ #define LLONG_MAX ((long long)(~0ULL>>1)) #define LLONG_MIN (-LLONG_MAX - 1) #define ULLONG_MAX (~0ULL) +#define SIZE_MAX (~(size_t)0) #define STACK_MAGIC 0xdeadbeef diff --git a/include/linux/slab.h b/include/linux/slab.h index a595dce6b0c7..67d5d94b783a 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -242,7 +242,7 @@ size_t ksize(const void *); */ static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags) { - if (size != 0 && n > ULONG_MAX / size) + if (size != 0 && n > SIZE_MAX / size) return NULL; return __kmalloc(n * size, flags); } -- cgit v1.2.3 From 020ac5b6bef15785f9dde9de89d2734ff97da733 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Thu, 31 May 2012 16:26:12 -0700 Subject: fat: introduce special inode for managing the FSINFO block This is patchset makes fatfs stop using the VFS '->write_super()' method for writing out the FSINFO block. The final goal is to get rid of the 'sync_supers()' kernel thread. This kernel thread wakes up every 5 seconds (by default) and calls '->write_super()' for all mounted file-systems. And the bad thing is that this is done even if all the superblocks are clean. Moreover, some file-systems do not even need this end they do not register the '->write_super()' method at all (e.g., btrfs). So 'sync_supers()' most often just generates useless wake-ups and wastes power. I am trying to make all file-systems independent of '->write_super()' and plan to remove 'sync_supers()' and '->write_super' completely once there are no more users. The '->write_supers()' method is mostly used by baroque file-systems like hfs, udf, etc. Modern file-systems like btrfs and xfs do not use it. This justifies removing this stuff from VFS completely and make every FS self-manage own superblock. Tested with xfstests. This patch: Preparation for further changes. It introduces a special inode ('fsinfo_inode') in FAT file-system which we'll later use for managing the FSINFO block. Note, this there is already one special inode ('fat_inode') which is used for managing the FAT tables. Introduce new 'MSDOS_FSINFO_INO' constant for this special inode. It is safe to do because FAT file-system does not store inode numbers on the media but generates them run-time. I've also cleaned up the comment to existing 'MSDOS_ROOT_INO' constant, while on it. Signed-off-by: Artem Bityutskiy Cc: OGAWA Hirofumi Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/msdos_fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/msdos_fs.h b/include/linux/msdos_fs.h index 34066e65fdeb..11cc2ac67e75 100644 --- a/include/linux/msdos_fs.h +++ b/include/linux/msdos_fs.h @@ -21,8 +21,9 @@ #define CT_LE_W(v) cpu_to_le16(v) #define CT_LE_L(v) cpu_to_le32(v) +#define MSDOS_ROOT_INO 1 /* The root inode number */ +#define MSDOS_FSINFO_INO 2 /* Used for managing the FSINFO block */ -#define MSDOS_ROOT_INO 1 /* == MINIX_ROOT_INO */ #define MSDOS_DIR_BITS 5 /* log2(sizeof(struct msdos_dir_entry)) */ /* directory limit */ -- cgit v1.2.3 From ae3cef7300e9fddc35ad251dd5f27c5b88c8594a Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 31 May 2012 16:26:14 -0700 Subject: kmod: unexport call_usermodehelper_freeinfo() call_usermodehelper_freeinfo() is not used outside of kmod.c. So unexport it, and make it static to kmod.c Signed-off-by: Boaz Harrosh Cc: Oleg Nesterov Cc: Tetsuo Handa Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kmod.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kmod.h b/include/linux/kmod.h index dd99c329e161..f07f9a4e10ff 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -79,10 +79,6 @@ void call_usermodehelper_setfns(struct subprocess_info *info, /* Actually execute the sub-process */ int call_usermodehelper_exec(struct subprocess_info *info, int wait); -/* Free the subprocess_info. This is only needed if you're not going - to call call_usermodehelper_exec */ -void call_usermodehelper_freeinfo(struct subprocess_info *info); - static inline int call_usermodehelper_fns(char *path, char **argv, char **envp, int wait, int (*init)(struct subprocess_info *info, struct cred *new), -- cgit v1.2.3 From 785042f2e275089e22c36b462f6495ce8d91732d Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 31 May 2012 16:26:15 -0700 Subject: kmod: move call_usermodehelper_fns() to .c file and unexport all it's helpers If we move call_usermodehelper_fns() to kmod.c file and EXPORT_SYMBOL it we can avoid exporting all it's helper functions: call_usermodehelper_setup call_usermodehelper_setfns call_usermodehelper_exec And make all of them static to kmod.c Since the optimizer will see all these as a single call site it will inline them inside call_usermodehelper_fns(). So we loose the call to _fns but gain 3 calls to the helpers. (Not that it matters) Signed-off-by: Boaz Harrosh Cc: Oleg Nesterov Cc: Tetsuo Handa Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kmod.h | 30 ++---------------------------- 1 file changed, 2 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kmod.h b/include/linux/kmod.h index f07f9a4e10ff..5398d5807075 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -66,36 +66,10 @@ struct subprocess_info { void *data; }; -/* Allocate a subprocess_info structure */ -struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, - char **envp, gfp_t gfp_mask); - -/* Set various pieces of state into the subprocess_info structure */ -void call_usermodehelper_setfns(struct subprocess_info *info, - int (*init)(struct subprocess_info *info, struct cred *new), - void (*cleanup)(struct subprocess_info *info), - void *data); - -/* Actually execute the sub-process */ -int call_usermodehelper_exec(struct subprocess_info *info, int wait); - -static inline int +extern int call_usermodehelper_fns(char *path, char **argv, char **envp, int wait, int (*init)(struct subprocess_info *info, struct cred *new), - void (*cleanup)(struct subprocess_info *), void *data) -{ - struct subprocess_info *info; - gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; - - info = call_usermodehelper_setup(path, argv, envp, gfp_mask); - - if (info == NULL) - return -ENOMEM; - - call_usermodehelper_setfns(info, init, cleanup, data); - - return call_usermodehelper_exec(info, wait); -} + void (*cleanup)(struct subprocess_info *), void *data); static inline int call_usermodehelper(char *path, char **argv, char **envp, int wait) -- cgit v1.2.3 From 43e13cc107cf6cd3c15fbe1cef849435c2223d50 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 31 May 2012 16:26:16 -0700 Subject: cred: remove task_is_dead() from __task_cred() validation Commit 8f92054e7ca1 ("CRED: Fix __task_cred()'s lockdep check and banner comment"): add the following validation condition: task->exit_state >= 0 to permit the access if the target task is dead and therefore unable to change its own credentials. OK, but afaics currently this can only help wait_task_zombie() which calls __task_cred() without rcu lock. Remove this validation and change wait_task_zombie() to use task_uid() instead. This means we do rcu_read_lock() only to shut up the lockdep, but we already do the same in, say, wait_task_stopped(). task_is_dead() should die, task->exit_state != 0 means that this task has passed exit_notify(), only do_wait-like code paths should use this. Unfortunately, we can't kill task_is_dead() right now, it has already acquired buggy users in drivers/staging. The fix already exists. Signed-off-by: Oleg Nesterov Reviewed-by: "Eric W. Biederman" Acked-by: David Howells Cc: Jiri Olsa Cc: Paul E. McKenney Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cred.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index 917dc5aeb1d4..ebbed2ce6637 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -277,17 +277,13 @@ static inline void put_cred(const struct cred *_cred) * @task: The task to query * * Access the objective credentials of a task. The caller must hold the RCU - * readlock or the task must be dead and unable to change its own credentials. + * readlock. * * The result of this function should not be passed directly to get_cred(); * rather get_task_cred() should be used instead. */ -#define __task_cred(task) \ - ({ \ - const struct task_struct *__t = (task); \ - rcu_dereference_check(__t->real_cred, \ - task_is_dead(__t)); \ - }) +#define __task_cred(task) \ + rcu_dereference((task)->real_cred) /** * get_current_cred - Get the current task's subjective credentials -- cgit v1.2.3 From cb79295e20a8088a2fd6a9b3cb5f2d889ec36b4d Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Thu, 31 May 2012 16:26:22 -0700 Subject: cpu: introduce clear_tasks_mm_cpumask() helper Many architectures clear tasks' mm_cpumask like this: read_lock(&tasklist_lock); for_each_process(p) { if (p->mm) cpumask_clear_cpu(cpu, mm_cpumask(p->mm)); } read_unlock(&tasklist_lock); Depending on the context, the code above may have several problems, such as: 1. Working with task->mm w/o getting mm or grabing the task lock is dangerous as ->mm might disappear (exit_mm() assigns NULL under task_lock(), so tasklist lock is not enough). 2. Checking for process->mm is not enough because process' main thread may exit or detach its mm via use_mm(), but other threads may still have a valid mm. This patch implements a small helper function that does things correctly, i.e.: 1. We take the task's lock while whe handle its mm (we can't use get_task_mm()/mmput() pair as mmput() might sleep); 2. To catch exited main thread case, we use find_lock_task_mm(), which walks up all threads and returns an appropriate task (with task lock held). Also, Per Peter Zijlstra's idea, now we don't grab tasklist_lock in the new helper, instead we take the rcu read lock. We can do this because the function is called after the cpu is taken down and marked offline, so no new tasks will get this cpu set in their mm mask. Signed-off-by: Anton Vorontsov Cc: Richard Weinberger Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Russell King Cc: Benjamin Herrenschmidt Cc: Mike Frysinger Cc: Paul Mundt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpu.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 7230bb59a06f..2e9b9ebbeb78 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -177,6 +177,7 @@ extern void put_online_cpus(void); #define hotcpu_notifier(fn, pri) cpu_notifier(fn, pri) #define register_hotcpu_notifier(nb) register_cpu_notifier(nb) #define unregister_hotcpu_notifier(nb) unregister_cpu_notifier(nb) +void clear_tasks_mm_cpumask(int cpu); int cpu_down(unsigned int cpu); #ifdef CONFIG_ARCH_CPU_PROBE_RELEASE -- cgit v1.2.3 From 29a5c67e7a78815fda0567a867adce467f6e6e5a Mon Sep 17 00:00:00 2001 From: maximilian attems Date: Thu, 31 May 2012 16:26:27 -0700 Subject: kexec: export kexec.h to user space Add userspace definitions, guard all relevant kernel structures. While at it document stuff and remove now useless userspace hint. It is easy to add the relevant system call to respective libc's, but it seems pointless to have to duplicate the data structures. This is based on the kexec-tools headers, with the exception of just using int on return (succes or failure) and using size_t instead of 'unsigned long int' for the number of segments argument of kexec_load(). Signed-off-by: maximilian attems Cc: Simon Horman Cc: Vivek Goyal Cc: Haren Myneni Cc: "Eric W. Biederman" Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/Kbuild | 1 + include/linux/kexec.h | 75 ++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 54 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/Kbuild b/include/linux/Kbuild index 7185b8f15ced..8760be30b375 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -226,6 +226,7 @@ header-y += kdev_t.h header-y += kernel.h header-y += kernelcapi.h header-y += kernel-page-flags.h +header-y += kexec.h header-y += keyboard.h header-y += keyctl.h header-y += l2tp.h diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 0d7d6a1b172f..37c5f7261142 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -1,8 +1,58 @@ #ifndef LINUX_KEXEC_H #define LINUX_KEXEC_H -#ifdef CONFIG_KEXEC +/* kexec system call - It loads the new kernel to boot into. + * kexec does not sync, or unmount filesystems so if you need + * that to happen you need to do that yourself. + */ + #include + +/* kexec flags for different usage scenarios */ +#define KEXEC_ON_CRASH 0x00000001 +#define KEXEC_PRESERVE_CONTEXT 0x00000002 +#define KEXEC_ARCH_MASK 0xffff0000 + +/* These values match the ELF architecture values. + * Unless there is a good reason that should continue to be the case. + */ +#define KEXEC_ARCH_DEFAULT ( 0 << 16) +#define KEXEC_ARCH_386 ( 3 << 16) +#define KEXEC_ARCH_X86_64 (62 << 16) +#define KEXEC_ARCH_PPC (20 << 16) +#define KEXEC_ARCH_PPC64 (21 << 16) +#define KEXEC_ARCH_IA_64 (50 << 16) +#define KEXEC_ARCH_ARM (40 << 16) +#define KEXEC_ARCH_S390 (22 << 16) +#define KEXEC_ARCH_SH (42 << 16) +#define KEXEC_ARCH_MIPS_LE (10 << 16) +#define KEXEC_ARCH_MIPS ( 8 << 16) + +/* The artificial cap on the number of segments passed to kexec_load. */ +#define KEXEC_SEGMENT_MAX 16 + +#ifndef __KERNEL__ +/* + * This structure is used to hold the arguments that are used when + * loading kernel binaries. + */ +struct kexec_segment { + const void *buf; + size_t bufsz; + const void *mem; + size_t memsz; +}; + +/* Load a new kernel image as described by the kexec_segment array + * consisting of passed number of segments at the entry-point address. + * The flags allow different useage types. + */ +extern int kexec_load(void *, size_t, struct kexec_segment *, + unsigned long int); +#endif /* __KERNEL__ */ + +#ifdef __KERNEL__ +#ifdef CONFIG_KEXEC #include #include #include @@ -67,11 +117,10 @@ typedef unsigned long kimage_entry_t; #define IND_DONE 0x4 #define IND_SOURCE 0x8 -#define KEXEC_SEGMENT_MAX 16 struct kexec_segment { void __user *buf; size_t bufsz; - unsigned long mem; /* User space sees this as a (void *) ... */ + unsigned long mem; size_t memsz; }; @@ -175,25 +224,6 @@ extern struct kimage *kexec_crash_image; #define kexec_flush_icache_page(page) #endif -#define KEXEC_ON_CRASH 0x00000001 -#define KEXEC_PRESERVE_CONTEXT 0x00000002 -#define KEXEC_ARCH_MASK 0xffff0000 - -/* These values match the ELF architecture values. - * Unless there is a good reason that should continue to be the case. - */ -#define KEXEC_ARCH_DEFAULT ( 0 << 16) -#define KEXEC_ARCH_386 ( 3 << 16) -#define KEXEC_ARCH_X86_64 (62 << 16) -#define KEXEC_ARCH_PPC (20 << 16) -#define KEXEC_ARCH_PPC64 (21 << 16) -#define KEXEC_ARCH_IA_64 (50 << 16) -#define KEXEC_ARCH_ARM (40 << 16) -#define KEXEC_ARCH_S390 (22 << 16) -#define KEXEC_ARCH_SH (42 << 16) -#define KEXEC_ARCH_MIPS_LE (10 << 16) -#define KEXEC_ARCH_MIPS ( 8 << 16) - /* List of defined/legal kexec flags */ #ifndef CONFIG_KEXEC_JUMP #define KEXEC_FLAGS KEXEC_ON_CRASH @@ -228,4 +258,5 @@ struct task_struct; static inline void crash_kexec(struct pt_regs *regs) { } static inline int kexec_should_crash(struct task_struct *p) { return 0; } #endif /* CONFIG_KEXEC */ +#endif /* __KERNEL__ */ #endif /* LINUX_KEXEC_H */ -- cgit v1.2.3 From 93e6f119c0ce8a1bba6e81dc8dd97d67be360844 Mon Sep 17 00:00:00 2001 From: Doug Ledford Date: Thu, 31 May 2012 16:26:28 -0700 Subject: ipc/mqueue: cleanup definition names and locations Since commit b231cca4381e ("message queues: increase range limits") on Oct 18, 2008, calls to mq_open() that did not pass in an attribute struct and expected to get default values for the size of the queue and the max message size now get the system wide maximums instead of hardwired defaults like they used to get. This was uncovered when one of the earlier patches in this patch set increased the default system wide maximums at the same time it increased the hard ceiling on the system wide maximums (a customer specifically needed the hard ceiling brought back up, the new ceiling that commit b231cca4381e introduced was too low for their production systems). By increasing the default maximums and not realising they were tied to any attempt to create a message queue without an attribute struct, I had inadvertently made it such that all message queue creation attempts without an attribute struct were failing because the new default maximums would create a queue that exceeded the default rlimit for message queue bytes. As a result, the system wide defaults were brought back down to their previous levels, and the system wide ceilings on the maximums were raised to meet the customer's needs. However, the fact that the no attribute struct behavior of mq_open() could be broken by changing the system wide maximums for message queues was seen as fundamentally broken itself. So we hardwired the no attribute case back like it used to be. But, then we realized that on the very off chance that some piece of software in the wild depended on that behavior, we could work around that issue by adding two new knobs to /proc that allowed setting the defaults for message queues created without an attr struct separately from the system wide maximums. What is not an option IMO is to leave the current behavior in place. No piece of software should ever rely on setting the system wide maximums in order to get a desired message queue. Such a reliance would be so fundamentally multitasking OS unfriendly as to not really be tolerable. Fortunately, we don't know of any software in the wild that uses this except for a regression test program that caught the issue in the first place. If there is though, we have made accommodations with the two new /proc knobs (and that's all the accommodations such fundamentally broken software can be allowed).. This patch: The various defines for minimums and maximums of the sysctl controllable mqueue values are scattered amongst different files and named inconsistently. Move them all into ipc_namespace.h and make them have consistent names. Additionally, make the number of queues per namespace also have a minimum and maximum and use the same sysctl function as the other two settable variables. Signed-off-by: Doug Ledford Acked-by: Serge E. Hallyn Cc: Amerigo Wang Cc: Joe Korty Cc: Jiri Slaby Acked-by: KOSAKI Motohiro Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 8a297a5e794c..1372b566e1e1 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -91,10 +91,15 @@ static inline void shm_destroy_orphaned(struct ipc_namespace *ns) {} #ifdef CONFIG_POSIX_MQUEUE extern int mq_init_ns(struct ipc_namespace *ns); /* default values */ +#define MIN_QUEUESMAX 1 #define DFLT_QUEUESMAX 256 /* max number of message queues */ +#define HARD_QUEUESMAX 1024 +#define MIN_MSGMAX 1 #define DFLT_MSGMAX 10 /* max number of messages in each queue */ #define HARD_MSGMAX (32768*sizeof(void *)/4) +#define MIN_MSGSIZEMAX 128 #define DFLT_MSGSIZEMAX 8192 /* max message size */ +#define HARD_MSGSIZEMAX (8192*128) #else static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; } #endif -- cgit v1.2.3 From 858ee3784e8105467f1f3017f4ece51cb51d4830 Mon Sep 17 00:00:00 2001 From: Doug Ledford Date: Thu, 31 May 2012 16:26:29 -0700 Subject: ipc/mqueue: switch back to using non-max values on create Commit b231cca4381e ("message queues: increase range limits") changed how we create a queue that does not include an attr struct passed to open so that it creates the queue with whatever the maximum values are. However, if the admin has set the maximums to allow flexibility in creating a queue (aka, both a large size and large queue are allowed, but combined they create a queue too large for the RLIMIT_MSGQUEUE of the user), then attempts to create a queue without an attr struct will fail. Switch back to using acceptable defaults regardless of what the maximums are. Note: so far, we only know of a few applications that rely on this behavior (specifically, set the maximums in /proc, then run the application which calls mq_open() without passing in an attr struct, and the application expects the newly created message queue to have the maximum sizes that were set in /proc used on the mq_open() call, and all of those applications that we know of are actually part of regression test suites that were coded to do something like this: for size in 4096 65536 $((1024 * 1024)) $((16 * 1024 * 1024)); do echo $size > /proc/sys/fs/mqueue/msgsize_max mq_open || echo "Error opening mq with size $size" done These test suites that depend on any behavior like this are broken. The concept that programs should rely upon the system wide maximum in order to get their desired results instead of simply using a attr struct to specify what they want is fundamentally unfriendly programming practice for any multi-tasking OS. Fixing this will break those few apps that we know of (and those app authors recognize the brokenness of their code and the need to fix it). However, the following patch "mqueue: separate mqueue default value" allows a workaround in the form of new knobs for the default msg queue creation parameters for any software out there that we don't already know about that might rely on this behavior at the moment. Signed-off-by: Doug Ledford Cc: Serge E. Hallyn Cc: Amerigo Wang Cc: Joe Korty Cc: Jiri Slaby Acked-by: KOSAKI Motohiro Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 1372b566e1e1..bde094ee7b0e 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -95,9 +95,11 @@ extern int mq_init_ns(struct ipc_namespace *ns); #define DFLT_QUEUESMAX 256 /* max number of message queues */ #define HARD_QUEUESMAX 1024 #define MIN_MSGMAX 1 +#define DFLT_MSG 10U #define DFLT_MSGMAX 10 /* max number of messages in each queue */ #define HARD_MSGMAX (32768*sizeof(void *)/4) #define MIN_MSGSIZEMAX 128 +#define DFLT_MSGSIZE 8192U #define DFLT_MSGSIZEMAX 8192 /* max message size */ #define HARD_MSGSIZEMAX (8192*128) #else -- cgit v1.2.3 From 5b5c4d1a1440e94994c73dddbad7be0676cd8b9a Mon Sep 17 00:00:00 2001 From: Doug Ledford Date: Thu, 31 May 2012 16:26:30 -0700 Subject: ipc/mqueue: update maximums for the mqueue subsystem Commit b231cca4381e ("message queues: increase range limits") changed the maximum size of a message in a message queue from INT_MAX to 8192*128. Unfortunately, we had customers that relied on a size much larger than 8192*128 on their production systems. After reviewing POSIX, we found that it is silent on the maximum message size. We did find a couple other areas in which it was not silent. Fix up the mqueue maximums so that the customer's system can continue to work, and document both the POSIX and real world requirements in ipc_namespace.h so that we don't have this issue crop back up. Also, commit 9cf18e1dd74cd0 ("ipc: HARD_MSGMAX should be higher not lower on 64bit") fiddled with HARD_MSGMAX without realizing that the number was intentionally in place to limit the msg queue depth to one that was small enough to kmalloc an array of pointers (hence why we divided 128k by sizeof(long)). If we wish to meet POSIX requirements, we have no choice but to change our allocation to a vmalloc instead (at least for the large queue size case). With that, it's possible to increase our allowed maximum to the POSIX requirements (or more if we choose). [sfr@canb.auug.org.au: using vmalloc requires including vmalloc.h] Signed-off-by: Doug Ledford Cc: Serge E. Hallyn Cc: Amerigo Wang Cc: Joe Korty Cc: Jiri Slaby Acked-by: KOSAKI Motohiro Cc: Manfred Spraul Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 47 ++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index bde094ee7b0e..6e1dd08194fd 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -90,18 +90,41 @@ static inline void shm_destroy_orphaned(struct ipc_namespace *ns) {} #ifdef CONFIG_POSIX_MQUEUE extern int mq_init_ns(struct ipc_namespace *ns); -/* default values */ -#define MIN_QUEUESMAX 1 -#define DFLT_QUEUESMAX 256 /* max number of message queues */ -#define HARD_QUEUESMAX 1024 -#define MIN_MSGMAX 1 -#define DFLT_MSG 10U -#define DFLT_MSGMAX 10 /* max number of messages in each queue */ -#define HARD_MSGMAX (32768*sizeof(void *)/4) -#define MIN_MSGSIZEMAX 128 -#define DFLT_MSGSIZE 8192U -#define DFLT_MSGSIZEMAX 8192 /* max message size */ -#define HARD_MSGSIZEMAX (8192*128) +/* + * POSIX Message Queue default values: + * + * MIN_*: Lowest value an admin can set the maximum unprivileged limit to + * DFLT_*MAX: Default values for the maximum unprivileged limits + * DFLT_{MSG,MSGSIZE}: Default values used when the user doesn't supply + * an attribute to the open call and the queue must be created + * HARD_*: Highest value the maximums can be set to. These are enforced + * on CAP_SYS_RESOURCE apps as well making them inviolate (so make them + * suitably high) + * + * POSIX Requirements: + * Per app minimum openable message queues - 8. This does not map well + * to the fact that we limit the number of queues on a per namespace + * basis instead of a per app basis. So, make the default high enough + * that no given app should have a hard time opening 8 queues. + * Minimum maximum for HARD_MSGMAX - 32767. I bumped this to 65536. + * Minimum maximum for HARD_MSGSIZEMAX - POSIX is silent on this. However, + * we have run into a situation where running applications in the wild + * require this to be at least 5MB, and preferably 10MB, so I set the + * value to 16MB in hopes that this user is the worst of the bunch and + * the new maximum will handle anyone else. I may have to revisit this + * in the future. + */ +#define MIN_QUEUESMAX 1 +#define DFLT_QUEUESMAX 256 +#define HARD_QUEUESMAX 1024 +#define MIN_MSGMAX 1 +#define DFLT_MSG 64U +#define DFLT_MSGMAX 1024 +#define HARD_MSGMAX 65536 +#define MIN_MSGSIZEMAX 128 +#define DFLT_MSGSIZE 8192U +#define DFLT_MSGSIZEMAX (1024*1024) +#define HARD_MSGSIZEMAX (16*1024*1024) #else static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; } #endif -- cgit v1.2.3 From e6315bb154e778391ce64b194756bd3d108dadf6 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 31 May 2012 16:26:31 -0700 Subject: mqueue: revert bump up DFLT_*MAX Mqueue limitation is slightly naieve parameter likes other ipcs because unprivileged user can consume kernel memory by using ipcs. Thus, too aggressive raise bring us security issue. Example, current setting allow evil unprivileged user use 256GB (= 256 * 1024 * 1024*1024) and it's enough large to system will belome unresponsive. Don't do that. Instead, every admin should adjust the knobs for their own systems. Signed-off-by: KOSAKI Motohiro Acked-by: Doug Ledford Acked-by: Joe Korty Cc: Amerigo Wang Acked-by: Serge E. Hallyn Cc: Jiri Slaby Cc: Manfred Spraul Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 6e1dd08194fd..2488535a32a3 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -118,12 +118,12 @@ extern int mq_init_ns(struct ipc_namespace *ns); #define DFLT_QUEUESMAX 256 #define HARD_QUEUESMAX 1024 #define MIN_MSGMAX 1 -#define DFLT_MSG 64U -#define DFLT_MSGMAX 1024 +#define DFLT_MSG 10U +#define DFLT_MSGMAX 10 #define HARD_MSGMAX 65536 #define MIN_MSGSIZEMAX 128 #define DFLT_MSGSIZE 8192U -#define DFLT_MSGSIZEMAX (1024*1024) +#define DFLT_MSGSIZEMAX 8192 #define HARD_MSGSIZEMAX (16*1024*1024) #else static inline int mq_init_ns(struct ipc_namespace *ns) { return 0; } -- cgit v1.2.3 From cef0184c115e5e4e10498f6548d9526465e72478 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 31 May 2012 16:26:33 -0700 Subject: mqueue: separate mqueue default value from maximum value Commit b231cca4381e ("message queues: increase range limits") changed mqueue default value when attr parameter is specified NULL from hard coded value to fs.mqueue.{msg,msgsize}_max sysctl value. This made large side effect. When user need to use two mqueue applications 1) using !NULL attr parameter and it require big message size and 2) using NULL attr parameter and only need small size message, app (1) require to raise fs.mqueue.msgsize_max and app (2) consume large memory size even though it doesn't need. Doug Ledford propsed to switch back it to static hard coded value. However it also has a compatibility problem. Some applications might started depend on the default value is tunable. The solution is to separate default value from maximum value. Signed-off-by: KOSAKI Motohiro Signed-off-by: Doug Ledford Acked-by: Doug Ledford Acked-by: Joe Korty Cc: Amerigo Wang Acked-by: Serge E. Hallyn Cc: Jiri Slaby Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 2488535a32a3..5499c92a9153 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -62,6 +62,8 @@ struct ipc_namespace { unsigned int mq_queues_max; /* initialized to DFLT_QUEUESMAX */ unsigned int mq_msg_max; /* initialized to DFLT_MSGMAX */ unsigned int mq_msgsize_max; /* initialized to DFLT_MSGSIZEMAX */ + unsigned int mq_msg_default; + unsigned int mq_msgsize_default; /* user_ns which owns the ipc ns */ struct user_namespace *user_ns; -- cgit v1.2.3 From e42d98ebe7d754a2c9fbccd6186721d3ca8679f6 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Thu, 31 May 2012 16:26:38 -0700 Subject: rapidio: add DMA engine support for RIO data transfers Adds DMA Engine framework support into RapidIO subsystem. Uses DMA Engine DMA_SLAVE interface to generate data transfers to/from remote RapidIO target devices. Introduces RapidIO-specific wrapper for prep_slave_sg() interface with an extra parameter to pass target specific information. Uses scatterlist to describe local data buffer. Address flat data buffer on a remote side. Signed-off-by: Alexandre Bounine Cc: Dan Williams Acked-by: Vinod Koul Cc: Li Yang Cc: Matt Porter Cc: Paul Gortmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dmaengine.h | 12 ++++++++++++ include/linux/rio.h | 47 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/rio_drv.h | 9 +++++++++ 3 files changed, 68 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index d3fec584e8c3..56377df39124 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -635,6 +635,18 @@ static inline struct dma_async_tx_descriptor *dmaengine_prep_slave_sg( dir, flags, NULL); } +#ifdef CONFIG_RAPIDIO_DMA_ENGINE +struct rio_dma_ext; +static inline struct dma_async_tx_descriptor *dmaengine_prep_rio_sg( + struct dma_chan *chan, struct scatterlist *sgl, unsigned int sg_len, + enum dma_transfer_direction dir, unsigned long flags, + struct rio_dma_ext *rio_ext) +{ + return chan->device->device_prep_slave_sg(chan, sgl, sg_len, + dir, flags, rio_ext); +} +#endif + static inline struct dma_async_tx_descriptor *dmaengine_prep_dma_cyclic( struct dma_chan *chan, dma_addr_t buf_addr, size_t buf_len, size_t period_len, enum dma_transfer_direction dir) diff --git a/include/linux/rio.h b/include/linux/rio.h index 4d50611112ba..a90ebadd9da0 100644 --- a/include/linux/rio.h +++ b/include/linux/rio.h @@ -20,6 +20,9 @@ #include #include #include +#ifdef CONFIG_RAPIDIO_DMA_ENGINE +#include +#endif #define RIO_NO_HOPCOUNT -1 #define RIO_INVALID_DESTID 0xffff @@ -254,6 +257,9 @@ struct rio_mport { u32 phys_efptr; unsigned char name[40]; void *priv; /* Master port private data */ +#ifdef CONFIG_RAPIDIO_DMA_ENGINE + struct dma_device dma; +#endif }; /** @@ -395,6 +401,47 @@ union rio_pw_msg { u32 raw[RIO_PW_MSG_SIZE/sizeof(u32)]; }; +#ifdef CONFIG_RAPIDIO_DMA_ENGINE + +/** + * enum rio_write_type - RIO write transaction types used in DMA transfers + * + * Note: RapidIO specification defines write (NWRITE) and + * write-with-response (NWRITE_R) data transfer operations. + * Existing DMA controllers that service RapidIO may use one of these operations + * for entire data transfer or their combination with only the last data packet + * requires response. + */ +enum rio_write_type { + RDW_DEFAULT, /* default method used by DMA driver */ + RDW_ALL_NWRITE, /* all packets use NWRITE */ + RDW_ALL_NWRITE_R, /* all packets use NWRITE_R */ + RDW_LAST_NWRITE_R, /* last packet uses NWRITE_R, others - NWRITE */ +}; + +struct rio_dma_ext { + u16 destid; + u64 rio_addr; /* low 64-bits of 66-bit RapidIO address */ + u8 rio_addr_u; /* upper 2-bits of 66-bit RapidIO address */ + enum rio_write_type wr_type; /* preferred RIO write operation type */ +}; + +struct rio_dma_data { + /* Local data (as scatterlist) */ + struct scatterlist *sg; /* I/O scatter list */ + unsigned int sg_len; /* size of scatter list */ + /* Remote device address (flat buffer) */ + u64 rio_addr; /* low 64-bits of 66-bit RapidIO address */ + u8 rio_addr_u; /* upper 2-bits of 66-bit RapidIO address */ + enum rio_write_type wr_type; /* preferred RIO write operation type */ +}; + +static inline struct rio_mport *dma_to_mport(struct dma_device *ddev) +{ + return container_of(ddev, struct rio_mport, dma); +} +#endif /* CONFIG_RAPIDIO_DMA_ENGINE */ + /* Architecture and hardware-specific functions */ extern int rio_register_mport(struct rio_mport *); extern int rio_open_inb_mbox(struct rio_mport *, void *, int, int); diff --git a/include/linux/rio_drv.h b/include/linux/rio_drv.h index 7f07470e1ed9..31ad146be316 100644 --- a/include/linux/rio_drv.h +++ b/include/linux/rio_drv.h @@ -377,6 +377,15 @@ void rio_unregister_driver(struct rio_driver *); struct rio_dev *rio_dev_get(struct rio_dev *); void rio_dev_put(struct rio_dev *); +#ifdef CONFIG_RAPIDIO_DMA_ENGINE +extern struct dma_chan *rio_request_dma(struct rio_dev *rdev); +extern void rio_release_dma(struct dma_chan *dchan); +extern struct dma_async_tx_descriptor *rio_dma_prep_slave_sg( + struct rio_dev *rdev, struct dma_chan *dchan, + struct rio_dma_data *data, + enum dma_transfer_direction direction, unsigned long flags); +#endif + /** * rio_name - Get the unique RIO device identifier * @rdev: RIO device -- cgit v1.2.3 From ee62c6b2dc93c09585b51fad18449dc5edb9977f Mon Sep 17 00:00:00 2001 From: Sha Zhengju Date: Thu, 31 May 2012 16:26:41 -0700 Subject: eventfd: change int to __u64 in eventfd_signal() eventfd_ctx->count is an __u64 counter which is allowed to reach ULLONG_MAX. eventfd_write() adds a __u64 value to "count", but the kernel side eventfd_signal() only adds an int value to it. Make them consistent. [akpm@linux-foundation.org: update interface documentation] Signed-off-by: Sha Zhengju Cc: Davide Libenzi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/eventfd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index 91bb4f27238c..3c3ef19a625a 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -34,7 +34,7 @@ void eventfd_ctx_put(struct eventfd_ctx *ctx); struct file *eventfd_fget(int fd); struct eventfd_ctx *eventfd_ctx_fdget(int fd); struct eventfd_ctx *eventfd_ctx_fileget(struct file *file); -int eventfd_signal(struct eventfd_ctx *ctx, int n); +__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n); ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt); int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait, __u64 *cnt); -- cgit v1.2.3 From ac34ebb3a67e699edcb5ac72f19d31679369dfaa Mon Sep 17 00:00:00 2001 From: Christopher Yeoh Date: Thu, 31 May 2012 16:26:42 -0700 Subject: aio/vfs: cleanup of rw_copy_check_uvector() and compat_rw_copy_check_uvector() A cleanup of rw_copy_check_uvector and compat_rw_copy_check_uvector after changes made to support CMA in an earlier patch. Rather than having an additional check_access parameter to these functions, the first paramater type is overloaded to allow the caller to specify CHECK_IOVEC_ONLY which means check that the contents of the iovec are valid, but do not check the memory that they point to. This is used by process_vm_readv/writev where we need to validate that a iovec passed to the syscall is valid but do not want to check the memory that it points to at this point because it refers to an address space in another process. Signed-off-by: Chris Yeoh Reviewed-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compat.h | 3 +-- include/linux/fs.h | 12 ++++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index 5d46217f84ad..4e890394ef99 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -577,8 +577,7 @@ extern ssize_t compat_rw_copy_check_uvector(int type, const struct compat_iovec __user *uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_pointer, - struct iovec **ret_pointer, - int check_access); + struct iovec **ret_pointer); extern void __user *compat_alloc_user_space(unsigned long len); diff --git a/include/linux/fs.h b/include/linux/fs.h index 038076b27ea4..cf2c5611b19b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -173,6 +173,15 @@ struct inodes_stat_t { #define WRITE_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FUA) #define WRITE_FLUSH_FUA (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH | REQ_FUA) + +/* + * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector + * that indicates that they should check the contents of the iovec are + * valid, but not check the memory that the iovec elements + * points too. + */ +#define CHECK_IOVEC_ONLY -1 + #define SEL_IN 1 #define SEL_OUT 2 #define SEL_EX 4 @@ -1690,8 +1699,7 @@ struct seq_file; ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_pointer, - struct iovec **ret_pointer, - int check_access); + struct iovec **ret_pointer); extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); -- cgit v1.2.3 From d97b46a64674a267bc41c9e16132ee2a98c3347d Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 31 May 2012 16:26:44 -0700 Subject: syscalls, x86: add __NR_kcmp syscall While doing the checkpoint-restore in the user space one need to determine whether various kernel objects (like mm_struct-s of file_struct-s) are shared between tasks and restore this state. The 2nd step can be solved by using appropriate CLONE_ flags and the unshare syscall, while there's currently no ways for solving the 1st one. One of the ways for checking whether two tasks share e.g. mm_struct is to provide some mm_struct ID of a task to its proc file, but showing such info considered to be not that good for security reasons. Thus after some debates we end up in conclusion that using that named 'comparison' syscall might be the best candidate. So here is it -- __NR_kcmp. It takes up to 5 arguments - the pids of the two tasks (which characteristics should be compared), the comparison type and (in case of comparison of files) two file descriptors. Lookups for pids are done in the caller's PID namespace only. At moment only x86 is supported and tested. [akpm@linux-foundation.org: fix up selftests, warnings] [akpm@linux-foundation.org: include errno.h] [akpm@linux-foundation.org: tweak comment text] Signed-off-by: Cyrill Gorcunov Acked-by: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Andrey Vagin Cc: KOSAKI Motohiro Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: Glauber Costa Cc: Andi Kleen Cc: Tejun Heo Cc: Matt Helsley Cc: Pekka Enberg Cc: Eric Dumazet Cc: Vasiliy Kulikov Cc: Alexey Dobriyan Cc: Valdis.Kletnieks@vt.edu Cc: Michal Marek Cc: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kcmp.h | 17 +++++++++++++++++ include/linux/syscalls.h | 2 ++ 2 files changed, 19 insertions(+) create mode 100644 include/linux/kcmp.h (limited to 'include/linux') diff --git a/include/linux/kcmp.h b/include/linux/kcmp.h new file mode 100644 index 000000000000..2dcd1b3aafc8 --- /dev/null +++ b/include/linux/kcmp.h @@ -0,0 +1,17 @@ +#ifndef _LINUX_KCMP_H +#define _LINUX_KCMP_H + +/* Comparison type */ +enum kcmp_type { + KCMP_FILE, + KCMP_VM, + KCMP_FILES, + KCMP_FS, + KCMP_SIGHAND, + KCMP_IO, + KCMP_SYSVSEM, + + KCMP_TYPES, +}; + +#endif /* _LINUX_KCMP_H */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 3de3acb84a95..19439c75c5b2 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -858,4 +858,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid, unsigned long riovcnt, unsigned long flags); +asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type, + unsigned long idx1, unsigned long idx2); #endif -- cgit v1.2.3 From fe8c7f5cbf91124987106faa3bdf0c8b955c4cf7 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 31 May 2012 16:26:45 -0700 Subject: c/r: prctl: extend PR_SET_MM to set up more mm_struct entries During checkpoint we dump whole process memory to a file and the dump includes process stack memory. But among stack data itself, the stack carries additional parameters such as command line arguments, environment data and auxiliary vector. So when we do restore procedure and once we've restored stack data itself we need to setup mm_struct::arg_start/end, env_start/end, so restored process would be able to find command line arguments and environment data it had at checkpoint time. The same applies to auxiliary vector. For this reason additional PR_SET_MM_(ARG_START | ARG_END | ENV_START | ENV_END | AUXV) codes are introduced. Signed-off-by: Cyrill Gorcunov Acked-by: Kees Cook Cc: Tejun Heo Cc: Andrew Vagin Cc: Serge Hallyn Cc: Pavel Emelyanov Cc: Vasiliy Kulikov Cc: KAMEZAWA Hiroyuki Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/prctl.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/prctl.h b/include/linux/prctl.h index 78b76e24cc7e..18d84c4b42d8 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -113,6 +113,11 @@ # define PR_SET_MM_START_STACK 5 # define PR_SET_MM_START_BRK 6 # define PR_SET_MM_BRK 7 +# define PR_SET_MM_ARG_START 8 +# define PR_SET_MM_ARG_END 9 +# define PR_SET_MM_ENV_START 10 +# define PR_SET_MM_ENV_END 11 +# define PR_SET_MM_AUXV 12 /* * Set specific pid that is allowed to ptrace the current task. -- cgit v1.2.3 From b32dfe377102ce668775f8b6b1461f7ad428f8b6 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 31 May 2012 16:26:46 -0700 Subject: c/r: prctl: add ability to set new mm_struct::exe_file When we do restore we would like to have a way to setup a former mm_struct::exe_file so that /proc/pid/exe would point to the original executable file a process had at checkpoint time. For this the PR_SET_MM_EXE_FILE code is introduced. This option takes a file descriptor which will be set as a source for new /proc/$pid/exe symlink. Note it allows to change /proc/$pid/exe if there are no VM_EXECUTABLE vmas present for current process, simply because this feature is a special to C/R and mm::num_exe_file_vmas become meaningless after that. To minimize the amount of transition the /proc/pid/exe symlink might have, this feature is implemented in one-shot manner. Thus once changed the symlink can't be changed again. This should help sysadmins to monitor the symlinks over all process running in a system. In particular one could make a snapshot of processes and ring alarm if there unexpected changes of /proc/pid/exe's in a system. Note -- this feature is available iif CONFIG_CHECKPOINT_RESTORE is set and the caller must have CAP_SYS_RESOURCE capability granted, otherwise the request to change symlink will be rejected. Signed-off-by: Cyrill Gorcunov Reviewed-by: Oleg Nesterov Cc: KOSAKI Motohiro Cc: Pavel Emelyanov Cc: Kees Cook Cc: Tejun Heo Cc: Matt Helsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/prctl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/prctl.h b/include/linux/prctl.h index 18d84c4b42d8..711e0a30aacc 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -118,6 +118,7 @@ # define PR_SET_MM_ENV_START 10 # define PR_SET_MM_ENV_END 11 # define PR_SET_MM_AUXV 12 +# define PR_SET_MM_EXE_FILE 13 /* * Set specific pid that is allowed to ptrace the current task. -- cgit v1.2.3