From b3a084b9b684622b149e8dcf03855bf0d5fb588b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Sep 2010 08:38:44 +0200 Subject: rcu: rcu_read_lock_bh_held(): disabling irqs also disables bh rcu_dereference_bh() doesnt know yet about hard irq being disabled, so lockdep can trigger in netpoll_rx() after commit f0f9deae9e7c4 (netpoll: Disable IRQ around RCU dereference in netpoll_rx) Reported-by: Miles Lane Signed-off-by: Eric Dumazet Tested-by: Miles Lane Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 9fbc54a2585d..83af1f8d8b74 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -454,7 +454,7 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) * Makes rcu_dereference_check() do the dirty work. */ #define rcu_dereference_bh(p) \ - rcu_dereference_check(p, rcu_read_lock_bh_held()) + rcu_dereference_check(p, rcu_read_lock_bh_held() || irqs_disabled()) /** * rcu_dereference_sched - fetch RCU-protected pointer, checking for RCU-sched -- cgit v1.2.3 From 5336377d6225959624146629ce3fc88ee8ecda3d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 5 Oct 2010 11:29:27 -0700 Subject: modules: Fix module_bug_list list corruption race With all the recent module loading cleanups, we've minimized the code that sits under module_mutex, fixing various deadlocks and making it possible to do most of the module loading in parallel. However, that whole conversion totally missed the rather obscure code that adds a new module to the list for BUG() handling. That code was doubly obscure because (a) the code itself lives in lib/bugs.c (for dubious reasons) and (b) it gets called from the architecture-specific "module_finalize()" rather than from generic code. Calling it from arch-specific code makes no sense what-so-ever to begin with, and is now actively wrong since that code isn't protected by the module loading lock any more. So this commit moves the "module_bug_{finalize,cleanup}()" calls away from the arch-specific code, and into the generic code - and in the process protects it with the module_mutex so that the list operations are now safe. Future fixups: - move the module list handling code into kernel/module.c where it belongs. - get rid of 'module_bug_list' and just use the regular list of modules (called 'modules' - imagine that) that we already create and maintain for other reasons. Reported-and-tested-by: Thomas Gleixner Cc: Rusty Russell Cc: Adrian Bunk Cc: Andrew Morton Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- include/linux/module.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 8a6b9fdc7ffa..aace066bad8f 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -686,17 +686,16 @@ extern int module_sysfs_initialized; #ifdef CONFIG_GENERIC_BUG -int module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *, +void module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *, struct module *); void module_bug_cleanup(struct module *); #else /* !CONFIG_GENERIC_BUG */ -static inline int module_bug_finalize(const Elf_Ehdr *hdr, +static inline void module_bug_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mod) { - return 0; } static inline void module_bug_cleanup(struct module *mod) {} #endif /* CONFIG_GENERIC_BUG */ -- cgit v1.2.3 From 231d0aefd88e94129cb8fb84794f9bb788c6366e Mon Sep 17 00:00:00 2001 From: Evgeny Kuznetsov Date: Tue, 5 Oct 2010 12:47:57 +0400 Subject: wait: using uninitialized member of wait queue The "flags" member of "struct wait_queue_t" is used in several places in the kernel code without beeing initialized by init_wait(). "flags" is used in bitwise operations. If "flags" not initialized then unexpected behaviour may take place. Incorrect flags might used later in code. Added initialization of "wait_queue_t.flags" with zero value into "init_wait". Signed-off-by: Evgeny Kuznetsov [ The bit we care about does end up being initialized by both prepare_to_wait() and add_to_wait_queue(), so this doesn't seem to cause actual bugs, but is definitely the right thing to do -Linus ] Signed-off-by: Linus Torvalds --- include/linux/wait.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 0836ccc57121..3efc9f3f43a0 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -614,6 +614,7 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); (wait)->private = current; \ (wait)->func = autoremove_wake_function; \ INIT_LIST_HEAD(&(wait)->task_list); \ + (wait)->flags = 0; \ } while (0) /** -- cgit v1.2.3 From 430c62fb2948d964cf8dc7f3e2f69623c04ef62f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Oct 2010 09:35:16 +0200 Subject: elevator: fix oops on early call to elevator_change() 2.6.36 introduces an API for drivers to switch the IO scheduler instead of manually calling the elevator exit and init functions. This API was added since q->elevator must be cleared in between those two calls. And since we already have this functionality directly from use by the sysfs interface to switch schedulers online, it was prudent to reuse it internally too. But this API needs the queue to be in a fully initialized state before it is called, or it will attempt to unregister elevator kobjects before they have been added. This results in an oops like this: BUG: unable to handle kernel NULL pointer dereference at 0000000000000051 IP: [] sysfs_create_dir+0x2e/0xc0 PGD 47ddfc067 PUD 47c6a1067 PMD 0 Oops: 0000 [#1] PREEMPT SMP last sysfs file: /sys/devices/pci0000:00/0000:00:02.0/0000:04:00.1/irq CPU 2 Modules linked in: t(+) loop hid_apple usbhid ahci ehci_hcd uhci_hcd libahci usbcore nls_base igb Pid: 7319, comm: modprobe Not tainted 2.6.36-rc6+ #132 QSSC-S4R/QSSC-S4R RIP: 0010:[] [] sysfs_create_dir+0x2e/0xc0 RSP: 0018:ffff88027da25d08 EFLAGS: 00010246 RAX: ffff88047c68c528 RBX: 00000000fffffffe RCX: 0000000000000000 RDX: 000000000000002f RSI: 000000000000002f RDI: ffff88047e196c88 RBP: ffff88027da25d38 R08: 0000000000000000 R09: d84156c5635688c0 R10: d84156c5635688c0 R11: 0000000000000000 R12: ffff88047e196c88 R13: 0000000000000000 R14: 0000000000000000 R15: ffff88047c68c528 FS: 00007fcb0b26f6e0(0000) GS:ffff880287400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000051 CR3: 000000047e76e000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process modprobe (pid: 7319, threadinfo ffff88027da24000, task ffff88027d377090) Stack: ffff88027da25d58 ffff88047c68c528 00000000fffffffe ffff88047e196c88 <0> ffff88047c68c528 ffff88047e05bd90 ffff88027da25d78 ffffffff8123fb77 <0> ffff88047e05bd90 0000000000000000 ffff88047e196c88 ffff88047c68c528 Call Trace: [] kobject_add_internal+0xe7/0x1f0 [] kobject_add_varg+0x38/0x60 [] kobject_add+0x69/0x90 [] ? sysfs_remove_dir+0x20/0xa0 [] ? sub_preempt_count+0x9d/0xe0 [] ? _raw_spin_unlock+0x30/0x50 [] ? sysfs_remove_dir+0x20/0xa0 [] ? sysfs_remove_dir+0x34/0xa0 [] elv_register_queue+0x34/0xa0 [] elevator_change+0xfd/0x250 [] ? t_init+0x0/0x361 [t] [] ? t_init+0x0/0x361 [t] [] t_init+0xa8/0x361 [t] [] do_one_initcall+0x3e/0x170 [] sys_init_module+0xbd/0x220 [] system_call_fastpath+0x16/0x1b Code: e5 41 56 41 55 41 54 49 89 fc 53 48 83 ec 10 48 85 ff 74 52 48 8b 47 18 49 c7 c5 00 46 61 81 48 85 c0 74 04 4c 8b 68 30 45 31 f6 <41> 80 7d 51 00 74 0e 49 8b 44 24 28 4c 89 e7 ff 50 20 49 89 c6 RIP [] sysfs_create_dir+0x2e/0xc0 RSP CR2: 0000000000000051 ---[ end trace a6541d3bf07945df ]--- Fix this by adding a registered bit to the elevator queue, which is set when the sysfs kobjects have been registered. Signed-off-by: Jens Axboe --- include/linux/elevator.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 926b50322a46..4fd978e7eb83 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -93,6 +93,7 @@ struct elevator_queue struct elevator_type *elevator_type; struct mutex sysfs_lock; struct hlist_head *hash; + unsigned int registered:1; }; /* -- cgit v1.2.3 From 7c5347733dcc4ba0bac0baf86d99fae0561f33b7 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 11 Oct 2010 18:13:31 -0400 Subject: fanotify: disable fanotify syscalls This patch disables the fanotify syscalls by just not building them and letting the cond_syscall() statements in kernel/sys_ni.c redirect them to sys_ni_syscall(). It was pointed out by Tvrtko Ursulin that the fanotify interface did not include an explicit prioritization between groups. This is necessary for fanotify to be usable for hierarchical storage management software, as they must get first access to the file, before inotify-like notifiers see the file. This feature can be added in an ABI compatible way in the next release (by using a number of bits in the flags field to carry the info) but it was suggested by Alan that maybe we should just hold off and do it in the next cycle, likely with an (new) explicit argument to the syscall. I don't like this approach best as I know people are already starting to use the current interface, but Alan is all wise and noone on list backed me up with just using what we have. I feel this is needlessly ripping the rug out from under people at the last minute, but if others think it needs to be a new argument it might be the best way forward. Three choices: Go with what we got (and implement the new feature next cycle). Add a new field right now (and implement the new feature next cycle). Wait till next cycle to release the ABI (and implement the new feature next cycle). This is number 3. Signed-off-by: Eric Paris Signed-off-by: Linus Torvalds --- include/linux/Kbuild | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/Kbuild b/include/linux/Kbuild index 626b629429ff..4e8ea8c8ec1e 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -118,7 +118,6 @@ header-y += eventpoll.h header-y += ext2_fs.h header-y += fadvise.h header-y += falloc.h -header-y += fanotify.h header-y += fb.h header-y += fcntl.h header-y += fd.h -- cgit v1.2.3 From 0eead9ab41da33644ae2c97c57ad03da636a0422 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 14 Oct 2010 10:57:40 -0700 Subject: Don't dump task struct in a.out core-dumps akiphie points out that a.out core-dumps have that odd task struct dumping that was never used and was never really a good idea (it goes back into the mists of history, probably the original core-dumping code). Just remove it. Also do the access_ok() check on dump_write(). It probably doesn't matter (since normal filesystems all seem to do it anyway), but he points out that it's normally done by the VFS layer, so ... [ I suspect that we should possibly do "vfs_write()" instead of calling ->write directly. That also does the whole fsnotify and write statistics thing, which may or may not be a good idea. ] And just to be anal, do this all for the x86-64 32-bit a.out emulation code too, even though it's not enabled (and won't currently even compile) Reported-by: akiphie Signed-off-by: Linus Torvalds --- include/linux/coredump.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 8ba66a9d9022..59579cfee6a0 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -11,7 +11,7 @@ */ static inline int dump_write(struct file *file, const void *addr, int nr) { - return file->f_op->write(file, addr, nr, &file->f_pos) == nr; + return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr; } static inline int dump_seek(struct file *file, loff_t off) -- cgit v1.2.3 From 3aa0ce825ade0cf5506e32ccf51d01fc8d22a9cf Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 14 Oct 2010 14:32:06 -0700 Subject: Un-inline the core-dump helper functions Tony Luck reports that the addition of the access_ok() check in commit 0eead9ab41da ("Don't dump task struct in a.out core-dumps") broke the ia64 compile due to missing the necessary header file includes. Rather than add yet another include () to make everything happy, just uninline the silly core dump helper functions and move the bodies to fs/exec.c where they make a lot more sense. dump_seek() in particular was too big to be an inline function anyway, and none of them are in any way performance-critical. And we really don't need to mess up our include file headers more than they already are. Reported-and-tested-by: Tony Luck Signed-off-by: Linus Torvalds --- include/linux/coredump.h | 34 ++-------------------------------- 1 file changed, 2 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 59579cfee6a0..ba4b85a6d9b8 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -9,37 +9,7 @@ * These are the only things you should do on a core-file: use only these * functions to write out all the necessary info. */ -static inline int dump_write(struct file *file, const void *addr, int nr) -{ - return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr; -} - -static inline int dump_seek(struct file *file, loff_t off) -{ - int ret = 1; - - if (file->f_op->llseek && file->f_op->llseek != no_llseek) { - if (file->f_op->llseek(file, off, SEEK_CUR) < 0) - return 0; - } else { - char *buf = (char *)get_zeroed_page(GFP_KERNEL); - - if (!buf) - return 0; - while (off > 0) { - unsigned long n = off; - - if (n > PAGE_SIZE) - n = PAGE_SIZE; - if (!dump_write(file, buf, n)) { - ret = 0; - break; - } - off -= n; - } - free_page((unsigned long)buf); - } - return ret; -} +extern int dump_write(struct file *file, const void *addr, int nr); +extern int dump_seek(struct file *file, loff_t off); #endif /* _LINUX_COREDUMP_H */ -- cgit v1.2.3 From 79b5dc0c64d88cda3da23b2e22a5cec0964372ac Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 15 Oct 2010 14:34:14 -0700 Subject: types.h: define __aligned_u64 and expose to userspace We currently have a kernel internal type called aligned_u64 which aligns __u64's on 8 bytes boundaries even on systems which would normally align them on 4 byte boundaries. This patch creates a new type __aligned_u64 which does the same thing but which is exposed to userspace rather than being kernel internal. [akpm: merge early as both the net and audit trees want this] [akpm@linux-foundation.org: enhance the comment describing the reasons for using aligned_u64. Via Andreas and Andi.] Based-on-patch-by: Andreas Gruenbacher Signed-off-by: Eric Paris Cc: Jan Engelhardt Cc: David Miller Cc: Andi Kleen Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/types.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/types.h b/include/linux/types.h index 01a082f56ef4..357dbc19606f 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -121,7 +121,15 @@ typedef __u64 u_int64_t; typedef __s64 int64_t; #endif -/* this is a special 64bit data type that is 8-byte aligned */ +/* + * aligned_u64 should be used in defining kernel<->userspace ABIs to avoid + * common 32/64-bit compat problems. + * 64-bit values align to 4-byte boundaries on x86_32 (and possibly other + * architectures) and to 8-byte boundaries on 64-bit architetures. The new + * aligned_64 type enforces 8-byte alignment so that structs containing + * aligned_64 values have the same alignment on 32-bit and 64-bit architectures. + * No conversions are necessary between 32-bit user-space and a 64-bit kernel. + */ #define aligned_u64 __u64 __attribute__((aligned(8))) #define aligned_be64 __be64 __attribute__((aligned(8))) #define aligned_le64 __le64 __attribute__((aligned(8))) @@ -178,6 +186,11 @@ typedef __u64 __bitwise __be64; typedef __u16 __bitwise __sum16; typedef __u32 __bitwise __wsum; +/* this is a special 64bit data type that is 8-byte aligned */ +#define __aligned_u64 __u64 __attribute__((aligned(8))) +#define __aligned_be64 __be64 __attribute__((aligned(8))) +#define __aligned_le64 __le64 __attribute__((aligned(8))) + #ifdef __KERNEL__ typedef unsigned __bitwise__ gfp_t; typedef unsigned __bitwise__ fmode_t; -- cgit v1.2.3