From 88a2a4ac6b671a4b0dd5d2d762418904c05f4104 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sat, 4 Feb 2006 23:27:36 -0800
Subject: [PATCH] percpu data: only iterate over possible CPUs

percpu_data blindly allocates bootmem memory to store NR_CPUS instances
of cpudata, instead of allocating memory only for possible cpus.

As a preparation for changing that, we need to convert various
0 -> NR_CPUS loops to use for_each_cpu().

(The above only applies to users of asm-generic/percpu.h.  powerpc has
gone it alone and is presently only allocating memory for present CPUs,
so it's currently corrupting memory).

Signed-off-by: Eric Dumazet
Cc: "David S. Miller"
Cc: James Bottomley
Acked-by: Ingo Molnar
Cc: Jens Axboe
Cc: Anton Blanchard
Acked-by: William Irwin
Cc: Andi Kleen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index f77f23f8f479..839466fdfb4c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6109,7 +6109,7 @@ void __init sched_init(void)
 	runqueue_t *rq;
 	int i, j, k;
 
-	for (i = 0; i < NR_CPUS; i++) {
+	for_each_cpu(i) {
 		prio_array_t *array;
 
 		rq = cpu_rq(i);
--
cgit v1.2.3

From bd576c9523fbf23e94fb7dbe05d2ae1cf96864e4 Mon Sep 17 00:00:00 2001
From: Chuck Ebbert <76306.1226@compuserve.com>
Date: Sat, 4 Feb 2006 23:27:42 -0800
Subject: [PATCH] sched: only print migration_cost once per boot

migration_cost prints after every CPU hotplug event.  Make it print
only once, at boot.

Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com>
Acked-by: Ingo Molnar
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/sched.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 839466fdfb4c..bc38804e40dd 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5551,13 +5551,15 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map)
 		-1
 #endif
 	);
-	printk("migration_cost=");
-	for (distance = 0; distance <= max_distance; distance++) {
-		if (distance)
-			printk(",");
-		printk("%ld", (long)migration_cost[distance] / 1000);
+	if (system_state == SYSTEM_BOOTING) {
+		printk("migration_cost=");
+		for (distance = 0; distance <= max_distance; distance++) {
+			if (distance)
+				printk(",");
+			printk("%ld", (long)migration_cost[distance] / 1000);
+		}
+		printk("\n");
 	}
-	printk("\n");
 	j1 = jiffies;
 	if (migration_debug)
 		printk("migration: %ld seconds\n", (j1-j0)/HZ);
--
cgit v1.2.3
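[Editor's note] The conversion in the first patch above is mechanical: stop assuming all NR_CPUS slots are valid and walk only CPUs that can ever exist. A minimal sketch of the pattern, assuming this era's cpumask API (later kernels spell the iterator for_each_possible_cpu()); the init_one_cpu_data() helper here is hypothetical:

	int cpu;

	/* Touch only per-cpu slots for CPUs that can ever come online,
	 * instead of looping 0..NR_CPUS-1 over mostly-nonexistent CPUs. */
	for_each_cpu(cpu) {		/* i.e. for_each_cpu_mask(cpu, cpu_possible_map) */
		init_one_cpu_data(cpu);	/* hypothetical per-cpu initializer */
	}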
From 5c0d5d262aa4c5e93f9f5de298cf25d6d8b558c4 Mon Sep 17 00:00:00 2001
From: Dave Jones
Date: Sat, 4 Feb 2006 23:27:49 -0800
Subject: [PATCH] missing license tag in intermodule

It may suck something awful, but it shouldn't taint the kernel.

Signed-off-by: Dave Jones
Signed-off-by: Adrian Bunk
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/intermodule.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/intermodule.c b/kernel/intermodule.c
index 0cbe633420fb..55b1e5b85db9 100644
--- a/kernel/intermodule.c
+++ b/kernel/intermodule.c
@@ -179,3 +179,6 @@ EXPORT_SYMBOL(inter_module_register);
 EXPORT_SYMBOL(inter_module_unregister);
 EXPORT_SYMBOL(inter_module_get_request);
 EXPORT_SYMBOL(inter_module_put);
+
+MODULE_LICENSE("GPL");
+
--
cgit v1.2.3

From 7714d5985bb7101a90fb427dc29dc592cf1b960e Mon Sep 17 00:00:00 2001
From: Pavel Machek
Date: Tue, 7 Feb 2006 12:58:22 -0800
Subject: [PATCH] swsusp: kill unneeded/unbalanced bio_get

- Remove unneeded bio_get() which would cause a bio leak
- Writing doesn't dirty pages.  Reading dirties pages.
- We should dirty the pages after the IO completion, not before
  (Busy-waiting for disk I/O completion isn't very polite.)

Signed-off-by: Pavel Machek
Cc: "Rafael J. Wysocki"
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/power/swsusp.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 59c91c148e82..4e90905f0e87 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -743,7 +743,6 @@ static int submit(int rw, pgoff_t page_off, void *page)
 	if (!bio)
 		return -ENOMEM;
 	bio->bi_sector = page_off * (PAGE_SIZE >> 9);
-	bio_get(bio);
 	bio->bi_bdev = resume_bdev;
 	bio->bi_end_io = end_io;
 
@@ -753,14 +752,13 @@ static int submit(int rw, pgoff_t page_off, void *page)
 		goto Done;
 	}
 
-	if (rw == WRITE)
-		bio_set_pages_dirty(bio);
 	atomic_set(&io_done, 1);
 	submit_bio(rw | (1 << BIO_RW_SYNC), bio);
 	while (atomic_read(&io_done))
 		yield();
-
+	if (rw == READ)
+		bio_set_pages_dirty(bio);
 Done:
 	bio_put(bio);
 	return error;
--
cgit v1.2.3

From 8e08b756869eeb08ace17ad64c2a8cb97b18e856 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Tue, 7 Feb 2006 12:58:45 -0800
Subject: [PATCH] module: strlen_user() race fix

Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/module.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index e058aedf6b93..5aad477ddc79 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1670,6 +1670,9 @@ static struct module *load_module(void __user *umod,
 		goto free_mod;
 	}
 
+	/* Userspace could have altered the string after the strlen_user() */
+	args[arglen - 1] = '\0';
+
 	if (find_module(mod->name)) {
 		err = -EEXIST;
 		goto free_mod;
--
cgit v1.2.3
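[Editor's note] The strlen_user() fix above closes a classic time-of-check/time-of-use hole: userspace can rewrite the string between the length check and the copy, so the kernel must re-terminate the buffer itself. A hedged sketch of the defensive idiom (a fragment, not the module loader's actual code; error paths simplified):

	long len = strlen_user(uarg);	/* length including the trailing NUL; 0 on fault */
	char *buf;

	if (!len)
		return -EFAULT;
	buf = kmalloc(len, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;
	if (copy_from_user(buf, uarg, len)) {
		kfree(buf);
		return -EFAULT;
	}
	buf[len - 1] = '\0';	/* the race fix: re-terminate after copying */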
From 46cd2f32baf181b74b16cceb123bab6fe1f61f85 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki"
Date: Tue, 7 Feb 2006 12:58:50 -0800
Subject: [PATCH] Fix build failure in recent pm_prepare_* changes.

Fix compilation problem in PM headers.

Signed-off-by: Rafael J. Wysocki
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/power/console.c |  4 +++-
 kernel/power/power.h   | 16 ----------------
 2 files changed, 3 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/console.c b/kernel/power/console.c
index 579d239d129f..623786d44159 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -9,7 +9,9 @@
 #include
 #include "power.h"
 
-#ifdef SUSPEND_CONSOLE
+#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE)
+#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
+
 static int orig_fgconsole, orig_kmsg;
 
 int pm_prepare_console(void)
diff --git a/kernel/power/power.h b/kernel/power/power.h
index d8f0d1a76bae..388dba680841 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -1,14 +1,6 @@
 #include
 #include
 
-/* With SUSPEND_CONSOLE defined suspend looks *really* cool, but
-   we probably do not take enough locks for switching consoles, etc,
-   so bad things might happen.
-*/
-#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE)
-#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
-#endif
-
 struct swsusp_info {
 	struct new_utsname	uts;
 	u32			version_code;
@@ -42,14 +34,6 @@ static struct subsys_attribute _name##_attr = {	\
 
 extern struct subsystem power_subsys;
 
-#ifdef SUSPEND_CONSOLE
-extern int pm_prepare_console(void);
-extern void pm_restore_console(void);
-#else
-static int pm_prepare_console(void) { return 0; }
-static void pm_restore_console(void) {}
-#endif
-
 /* References to section boundaries */
 extern const void __nosave_begin, __nosave_end;
--
cgit v1.2.3
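[Editor's note] The build failure fixed above came from plain "static" stub definitions living in a shared header; every includer got its own (possibly unused) copy. The conventional safe form of that header-stub idiom uses static inline, as in this illustrative sketch (the config symbol is hypothetical):

	#ifdef CONFIG_PM_CONSOLE_SWITCH	/* hypothetical config symbol */
	extern int pm_prepare_console(void);
	extern void pm_restore_console(void);
	#else
	/* static inline bodies compile away cleanly in every includer,
	 * avoiding defined-but-unused warnings and duplicated copies. */
	static inline int pm_prepare_console(void) { return 0; }
	static inline void pm_restore_console(void) {}
	#endif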
From cf2e340f4249b781b3d2beb41e891d08581f0e10 Mon Sep 17 00:00:00 2001
From: JANAK DESAI
Date: Tue, 7 Feb 2006 12:58:58 -0800
Subject: [PATCH] unshare system call -v5: system call handler function

The sys_unshare system call handler function accepts the same flags as
the clone system call, checks constraints on each of the flags, and
invokes the corresponding unshare functions to disassociate the
respective process context if it was being shared with another task.

Here is the link to a program for testing the unshare system call:

http://prdownloads.sourceforge.net/audit/unshare_test.c?download

Please note that because of a problem in rmdir associated with bind
mounts and clone with CLONE_NEWNS, the test fails while trying to remove
the temporary test directory.  You can remove that temporary directory
by doing rmdir, twice, from the command line.  The first will fail with
EBUSY, but the second will succeed.  I have reported the problem to Ram
Pai and Al Viro with a small program which reproduces the problem.  Al
told us yesterday that he will be looking at the problem soon.  I have
tried multiple rmdirs from the unshare_test program itself, but for some
reason that is not working.  Doing two rmdirs from the command line does
seem to remove the directory.

Signed-off-by: Janak Desai
Cc: Al Viro
Cc: Christoph Hellwig
Cc: Michael Kerrisk
Cc: Andi Kleen
Cc: Paul Mackerras
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 232 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 232 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 7f0ab5ee948c..6eb9362775f9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1323,3 +1323,235 @@ void __init proc_caches_init(void)
 			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
 }
+
+
+/*
+ * Check constraints on flags passed to the unshare system call and
+ * force unsharing of additional process context as appropriate.
+ */
+static inline void check_unshare_flags(unsigned long *flags_ptr)
+{
+	/*
+	 * If unsharing a thread from a thread group, must also
+	 * unshare vm.
+	 */
+	if (*flags_ptr & CLONE_THREAD)
+		*flags_ptr |= CLONE_VM;
+
+	/*
+	 * If unsharing vm, must also unshare signal handlers.
+	 */
+	if (*flags_ptr & CLONE_VM)
+		*flags_ptr |= CLONE_SIGHAND;
+
+	/*
+	 * If unsharing signal handlers and the task was created
+	 * using CLONE_THREAD, then must unshare the thread
+	 */
+	if ((*flags_ptr & CLONE_SIGHAND) &&
+	    (atomic_read(&current->signal->count) > 1))
+		*flags_ptr |= CLONE_THREAD;
+
+	/*
+	 * If unsharing namespace, must also unshare filesystem information.
+	 */
+	if (*flags_ptr & CLONE_NEWNS)
+		*flags_ptr |= CLONE_FS;
+}
+
+/*
+ * Unsharing of tasks created with CLONE_THREAD is not supported yet
+ */
+static int unshare_thread(unsigned long unshare_flags)
+{
+	if (unshare_flags & CLONE_THREAD)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Unsharing of fs info for tasks created with CLONE_FS is not supported yet
+ */
+static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
+{
+	struct fs_struct *fs = current->fs;
+
+	if ((unshare_flags & CLONE_FS) &&
+	    (fs && atomic_read(&fs->count) > 1))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Unsharing of namespace for tasks created without CLONE_NEWNS is not
+ * supported yet
+ */
+static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp)
+{
+	struct namespace *ns = current->namespace;
+
+	if ((unshare_flags & CLONE_NEWNS) &&
+	    (ns && atomic_read(&ns->count) > 1))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Unsharing of sighand for tasks created with CLONE_SIGHAND is not
+ * supported yet
+ */
+static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
+{
+	struct sighand_struct *sigh = current->sighand;
+
+	if ((unshare_flags & CLONE_SIGHAND) &&
+	    (sigh && atomic_read(&sigh->count) > 1))
+		return -EINVAL;
+	else
+		return 0;
+}
+
+/*
+ * Unsharing of vm for tasks created with CLONE_VM is not supported yet
+ */
+static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
+{
+	struct mm_struct *mm = current->mm;
+
+	if ((unshare_flags & CLONE_VM) &&
+	    (mm && atomic_read(&mm->mm_users) > 1))
+		return -EINVAL;
+
+	return 0;
+
+}
+
+/*
+ * Unsharing of files for tasks created with CLONE_FILES is not supported yet
+ */
+static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
+{
+	struct files_struct *fd = current->files;
+
+	if ((unshare_flags & CLONE_FILES) &&
+	    (fd && atomic_read(&fd->count) > 1))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Unsharing of semundo for tasks created with CLONE_SYSVSEM is not
+ * supported yet
+ */
+static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **new_ulistp)
+{
+	if (unshare_flags & CLONE_SYSVSEM)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * unshare allows a process to 'unshare' part of the process
+ * context which was originally shared using clone.  copy_*
+ * functions used by do_fork() cannot be used here directly
+ * because they modify an inactive task_struct that is being
+ * constructed. Here we are modifying the current, active,
+ * task_struct.
+ */
+asmlinkage long sys_unshare(unsigned long unshare_flags)
+{
+	int err = 0;
+	struct fs_struct *fs, *new_fs = NULL;
+	struct namespace *ns, *new_ns = NULL;
+	struct sighand_struct *sigh, *new_sigh = NULL;
+	struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
+	struct files_struct *fd, *new_fd = NULL;
+	struct sem_undo_list *new_ulist = NULL;
+
+	check_unshare_flags(&unshare_flags);
+
+	if ((err = unshare_thread(unshare_flags)))
+		goto bad_unshare_out;
+	if ((err = unshare_fs(unshare_flags, &new_fs)))
+		goto bad_unshare_cleanup_thread;
+	if ((err = unshare_namespace(unshare_flags, &new_ns)))
+		goto bad_unshare_cleanup_fs;
+	if ((err = unshare_sighand(unshare_flags, &new_sigh)))
+		goto bad_unshare_cleanup_ns;
+	if ((err = unshare_vm(unshare_flags, &new_mm)))
+		goto bad_unshare_cleanup_sigh;
+	if ((err = unshare_fd(unshare_flags, &new_fd)))
+		goto bad_unshare_cleanup_vm;
+	if ((err = unshare_semundo(unshare_flags, &new_ulist)))
+		goto bad_unshare_cleanup_fd;
+
+	if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist) {
+
+		task_lock(current);
+
+		if (new_fs) {
+			fs = current->fs;
+			current->fs = new_fs;
+			new_fs = fs;
+		}
+
+		if (new_ns) {
+			ns = current->namespace;
+			current->namespace = new_ns;
+			new_ns = ns;
+		}
+
+		if (new_sigh) {
+			sigh = current->sighand;
+			current->sighand = new_sigh;
+			new_sigh = sigh;
+		}
+
+		if (new_mm) {
+			mm = current->mm;
+			active_mm = current->active_mm;
+			current->mm = new_mm;
+			current->active_mm = new_mm;
+			activate_mm(active_mm, new_mm);
+			new_mm = mm;
+		}
+
+		if (new_fd) {
+			fd = current->files;
+			current->files = new_fd;
+			new_fd = fd;
+		}
+
+		task_unlock(current);
+	}
+
+bad_unshare_cleanup_fd:
+	if (new_fd)
+		put_files_struct(new_fd);
+
+bad_unshare_cleanup_vm:
+	if (new_mm)
+		mmput(new_mm);
+
+bad_unshare_cleanup_sigh:
+	if (new_sigh)
+		if (atomic_dec_and_test(&new_sigh->count))
+			kmem_cache_free(sighand_cachep, new_sigh);
+
+bad_unshare_cleanup_ns:
+	if (new_ns)
+		put_namespace(new_ns);
+
+bad_unshare_cleanup_fs:
+	if (new_fs)
+		put_fs_struct(new_fs);
+
+bad_unshare_cleanup_thread:
+bad_unshare_out:
+	return err;
+}
--
cgit v1.2.3
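[Editor's note] From userspace the semantics above are easiest to see with a small test. A minimal sketch (error handling trimmed; on 2.6.16-era systems the call may need to be made via syscall(__NR_unshare, ...) if libc lacks the unshare() wrapper):

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		/* Give this process a private mount namespace.  Per the
		 * handler above this implies CLONE_FS, and it requires
		 * CAP_SYS_ADMIN once the follow-on namespace patch below
		 * adds the capability check. */
		if (unshare(CLONE_NEWNS) == -1) {
			perror("unshare(CLONE_NEWNS)");
			exit(1);
		}
		puts("mount namespace unshared; mounts are now private");
		return 0;
	}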
From 99d1419d96d7df9cfa56bc977810be831bd5ef64 Mon Sep 17 00:00:00 2001
From: JANAK DESAI
Date: Tue, 7 Feb 2006 12:58:59 -0800
Subject: [PATCH] unshare system call -v5: unshare filesystem info

If the filesystem structure is being shared, allocate a new one and copy
information from the current, shared, structure.

Signed-off-by: Janak Desai
Cc: Al Viro
Cc: Christoph Hellwig
Cc: Michael Kerrisk
Cc: Andi Kleen
Cc: Paul Mackerras
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 6eb9362775f9..598e5c27242c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1371,15 +1371,18 @@ static int unshare_thread(unsigned long unshare_flags)
 }
 
 /*
- * Unsharing of fs info for tasks created with CLONE_FS is not supported yet
+ * Unshare the filesystem structure if it is being shared
 */
 static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
 {
 	struct fs_struct *fs = current->fs;
 
 	if ((unshare_flags & CLONE_FS) &&
-	    (fs && atomic_read(&fs->count) > 1))
-		return -EINVAL;
+	    (fs && atomic_read(&fs->count) > 1)) {
+		*new_fsp = __copy_fs_struct(current->fs);
+		if (!*new_fsp)
+			return -ENOMEM;
+	}
 
 	return 0;
 }
--
cgit v1.2.3
From 741a295130606143edbf9fc740f633dbc1e6225f Mon Sep 17 00:00:00 2001
From: JANAK DESAI
Date: Tue, 7 Feb 2006 12:59:00 -0800
Subject: [PATCH] unshare system call -v5: unshare namespace

If the namespace structure is being shared, allocate a new one and copy
information from the current, shared, structure.

Signed-off-by: Janak Desai
Cc: Al Viro
Cc: Christoph Hellwig
Cc: Michael Kerrisk
Cc: Andi Kleen
Cc: Paul Mackerras
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 598e5c27242c..07dd241aa1e0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1388,16 +1388,21 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
 }
 
 /*
- * Unsharing of namespace for tasks created without CLONE_NEWNS is not
- * supported yet
+ * Unshare the namespace structure if it is being shared
 */
-static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp)
+static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs)
 {
 	struct namespace *ns = current->namespace;
 
 	if ((unshare_flags & CLONE_NEWNS) &&
-	    (ns && atomic_read(&ns->count) > 1))
-		return -EINVAL;
+	    (ns && atomic_read(&ns->count) > 1)) {
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		*new_nsp = dup_namespace(current, new_fs ? new_fs : current->fs);
+		if (!*new_nsp)
+			return -ENOMEM;
+	}
 
 	return 0;
 }
@@ -1482,7 +1487,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 		goto bad_unshare_out;
 	if ((err = unshare_fs(unshare_flags, &new_fs)))
 		goto bad_unshare_cleanup_thread;
-	if ((err = unshare_namespace(unshare_flags, &new_ns)))
+	if ((err = unshare_namespace(unshare_flags, &new_ns, new_fs)))
 		goto bad_unshare_cleanup_fs;
 	if ((err = unshare_sighand(unshare_flags, &new_sigh)))
 		goto bad_unshare_cleanup_ns;
--
cgit v1.2.3
From a0a7ec308f1be5957b20a1a535d21f683dfd83f0 Mon Sep 17 00:00:00 2001
From: JANAK DESAI
Date: Tue, 7 Feb 2006 12:59:01 -0800
Subject: [PATCH] unshare system call -v5: unshare vm

If the vm structure is being shared, allocate a new one and copy
information from the current, shared, structure.

Signed-off-by: Janak Desai
Cc: Al Viro
Cc: Christoph Hellwig
Cc: Michael Kerrisk
Cc: Andi Kleen
Cc: Paul Mackerras
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 87 ++++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 56 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 07dd241aa1e0..d1aceaea3f33 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -446,6 +446,55 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 	}
 }
 
+/*
+ * Allocate a new mm structure and copy contents from the
+ * mm structure of the passed in task structure.
+ */
+static struct mm_struct *dup_mm(struct task_struct *tsk)
+{
+	struct mm_struct *mm, *oldmm = current->mm;
+	int err;
+
+	if (!oldmm)
+		return NULL;
+
+	mm = allocate_mm();
+	if (!mm)
+		goto fail_nomem;
+
+	memcpy(mm, oldmm, sizeof(*mm));
+
+	if (!mm_init(mm))
+		goto fail_nomem;
+
+	if (init_new_context(tsk, mm))
+		goto fail_nocontext;
+
+	err = dup_mmap(mm, oldmm);
+	if (err)
+		goto free_pt;
+
+	mm->hiwater_rss = get_mm_rss(mm);
+	mm->hiwater_vm = mm->total_vm;
+
+	return mm;
+
+free_pt:
+	mmput(mm);
+
+fail_nomem:
+	return NULL;
+
+fail_nocontext:
+	/*
+	 * If init_new_context() failed, we cannot use mmput() to free the mm
+	 * because it calls destroy_context()
+	 */
+	mm_free_pgd(mm);
+	free_mm(mm);
+	return NULL;
+}
+
 static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 {
 	struct mm_struct * mm, *oldmm;
@@ -473,43 +522,17 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 	}
 
 	retval = -ENOMEM;
-	mm = allocate_mm();
+	mm = dup_mm(tsk);
 	if (!mm)
 		goto fail_nomem;
 
-	/* Copy the current MM stuff.. */
-	memcpy(mm, oldmm, sizeof(*mm));
-	if (!mm_init(mm))
-		goto fail_nomem;
-
-	if (init_new_context(tsk,mm))
-		goto fail_nocontext;
-
-	retval = dup_mmap(mm, oldmm);
-	if (retval)
-		goto free_pt;
-
-	mm->hiwater_rss = get_mm_rss(mm);
-	mm->hiwater_vm = mm->total_vm;
-
 good_mm:
 	tsk->mm = mm;
 	tsk->active_mm = mm;
 	return 0;
 
-free_pt:
-	mmput(mm);
 fail_nomem:
 	return retval;
-
-fail_nocontext:
-	/*
-	 * If init_new_context() failed, we cannot use mmput() to free the mm
-	 * because it calls destroy_context()
-	 */
-	mm_free_pgd(mm);
-	free_mm(mm);
-	return retval;
 }
 
 static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old)
@@ -1423,18 +1446,20 @@ static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **
 }
 
 /*
- * Unsharing of vm for tasks created with CLONE_VM is not supported yet
+ * Unshare vm if it is being shared
 */
 static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
 {
 	struct mm_struct *mm = current->mm;
 
 	if ((unshare_flags & CLONE_VM) &&
-	    (mm && atomic_read(&mm->mm_users) > 1))
-		return -EINVAL;
+	    (mm && atomic_read(&mm->mm_users) > 1)) {
+		*new_mmp = dup_mm(current);
+		if (!*new_mmp)
+			return -ENOMEM;
+	}
 
 	return 0;
-
 }
 
 /*
--
cgit v1.2.3
From a016f3389c06606dd80e687942ff3c71d41823c4 Mon Sep 17 00:00:00 2001
From: JANAK DESAI
Date: Tue, 7 Feb 2006 12:59:02 -0800
Subject: [PATCH] unshare system call -v5: unshare files

If the file descriptor structure is being shared, allocate a new one and
copy information from the current, shared, structure.

Signed-off-by: Janak Desai
Cc: Al Viro
Cc: Christoph Hellwig
Cc: Michael Kerrisk
Cc: Andi Kleen
Cc: Paul Mackerras
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 81 +++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 51 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index d1aceaea3f33..8e88b374cee9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -620,32 +620,17 @@ out:
 	return newf;
 }
 
-static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
+/*
+ * Allocate a new files structure and copy contents from the
+ * passed in files structure.
+ */
+static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 {
-	struct files_struct *oldf, *newf;
+	struct files_struct *newf;
 	struct file **old_fds, **new_fds;
-	int open_files, size, i, error = 0, expand;
+	int open_files, size, i, expand;
 	struct fdtable *old_fdt, *new_fdt;
 
-	/*
-	 * A background process may not have any files ...
-	 */
-	oldf = current->files;
-	if (!oldf)
-		goto out;
-
-	if (clone_flags & CLONE_FILES) {
-		atomic_inc(&oldf->count);
-		goto out;
-	}
-
-	/*
-	 * Note: we may be using current for both targets (See exec.c)
-	 * This works because we cache current->files (old) as oldf. Don't
-	 * break this.
-	 */
-	tsk->files = NULL;
-	error = -ENOMEM;
 	newf = alloc_files();
 	if (!newf)
 		goto out;
@@ -674,9 +659,9 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
 		if (expand) {
 			spin_unlock(&oldf->file_lock);
 			spin_lock(&newf->file_lock);
-			error = expand_files(newf, open_files-1);
+			*errorp = expand_files(newf, open_files-1);
 			spin_unlock(&newf->file_lock);
-			if (error < 0)
+			if (*errorp < 0)
 				goto out_release;
 			new_fdt = files_fdtable(newf);
 			/*
@@ -725,10 +710,8 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
 		memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
 	}
 
-	tsk->files = newf;
-	error = 0;
 out:
-	return error;
+	return newf;
 
 out_release:
 	free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset);
@@ -738,6 +721,40 @@ out_release:
 	goto out;
 }
 
+static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
+{
+	struct files_struct *oldf, *newf;
+	int error = 0;
+
+	/*
+	 * A background process may not have any files ...
+	 */
+	oldf = current->files;
+	if (!oldf)
+		goto out;
+
+	if (clone_flags & CLONE_FILES) {
+		atomic_inc(&oldf->count);
+		goto out;
+	}
+
+	/*
+	 * Note: we may be using current for both targets (See exec.c)
+	 * This works because we cache current->files (old) as oldf. Don't
+	 * break this.
+	 */
+	tsk->files = NULL;
+	error = -ENOMEM;
+	newf = dup_fd(oldf, &error);
+	if (!newf)
+		goto out;
+
+	tsk->files = newf;
+	error = 0;
+out:
+	return error;
+}
+
 /*
 * Helper to unshare the files of the current task.
 * We don't want to expose copy_files internals to
@@ -1463,15 +1480,19 @@ static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
 }
 
 /*
- * Unsharing of files for tasks created with CLONE_FILES is not supported yet
+ * Unshare file descriptor table if it is being shared
 */
 static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
 {
 	struct files_struct *fd = current->files;
	int error = 0;
 
 	if ((unshare_flags & CLONE_FILES) &&
-	    (fd && atomic_read(&fd->count) > 1))
-		return -EINVAL;
+	    (fd && atomic_read(&fd->count) > 1)) {
+		*new_fdp = dup_fd(fd, &error);
+		if (!*new_fdp)
+			return error;
+	}
 
 	return 0;
 }
--
cgit v1.2.3
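[Editor's note] dup_fd() above returns either the new table or NULL, and reports the precise errno through an int out-parameter, since a bare NULL cannot say why the copy failed. A hedged, userspace-sized miniature of the same calling convention (names here are illustrative, not kernel code):

	#include <errno.h>
	#include <stdlib.h>

	struct table { int nr; };

	/* Returns the new object, or NULL with *errorp holding -errno
	 * (mirroring dup_fd()'s convention). */
	static struct table *dup_table(const struct table *old, int *errorp)
	{
		struct table *t = malloc(sizeof(*t));
		if (!t) {
			*errorp = -ENOMEM;
			return NULL;
		}
		*t = *old;	/* copy contents, as dup_fd copies the fd array */
		return t;
	}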
From 1b8623545b42c03eb92e51b28c84acf4b8ba00a3 Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Thu, 15 Dec 2005 01:07:03 -0500
Subject: [PATCH] remove bogus asm/bug.h includes.

A bunch of asm/bug.h includes are both not needed (since it will get
pulled anyway) and bogus (since they are done too early).  Removed.

Signed-off-by: Al Viro
---
 kernel/compat.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/compat.c b/kernel/compat.c
index 1867290c37e3..8c9cd88b6785 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -23,7 +23,6 @@
 #include
 #include
 
-#include
 
 int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
 {
--
cgit v1.2.3

From 53f087febfd12e74ba9f1082e71e9a45adc039ad Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Wed, 1 Feb 2006 05:56:41 -0500
Subject: [PATCH] timer.c NULL noise removal

Signed-off-by: Al Viro
---
 kernel/timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index 4f1cb0ab5251..b9dad3994676 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -495,7 +495,7 @@ unsigned long next_timer_interrupt(void)
 	base = &__get_cpu_var(tvec_bases);
 	spin_lock(&base->t_base.lock);
 	expires = base->timer_jiffies + (LONG_MAX >> 1);
-	list = 0;
+	list = NULL;
 
 	/* Look for timer events in tv1. */
 	j = base->timer_jiffies & TVR_MASK;
--
cgit v1.2.3

From 4bb8089c86b95b4f6bbd839cb83ca4556b06a031 Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Wed, 1 Feb 2006 05:57:32 -0500
Subject: [PATCH] kernel/sys.c NULL noise removal

Signed-off-by: Al Viro
---
 kernel/sys.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 0929c698affc..f91218a5463e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -428,7 +428,7 @@ void kernel_kexec(void)
 {
 #ifdef CONFIG_KEXEC
 	struct kimage *image;
-	image = xchg(&kexec_image, 0);
+	image = xchg(&kexec_image, NULL);
 	if (!image) {
 		return;
 	}
--
cgit v1.2.3

From c70d3d703ad94727dab2a3664aeee33d71e00715 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Thu, 9 Feb 2006 22:41:41 +0300
Subject: [PATCH] sys_signal: initialize ->sa_mask

Pointed out by Linus Torvalds.

sys_signal() forgets to initialize ->sa_mask.

(I suspect arch/ia64/ia32/ia32_signal.c:sys32_signal() also needs this fix)

Signed-off-by: Oleg Nesterov
Signed-off-by: Linus Torvalds
---
 kernel/signal.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index b373fc2420da..01a1e7f7acf7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2702,6 +2702,7 @@ sys_signal(int sig, __sighandler_t handler)
 
 	new_sa.sa.sa_handler = handler;
 	new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK;
+	sigemptyset(&new_sa.sa.sa_mask);
 
 	ret = do_sigaction(sig, &new_sa, &old_sa);
--
cgit v1.2.3
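[Editor's note] The same bug class bites user programs: an uninitialized sa_mask in struct sigaction is exactly the kernel-side omission fixed above. A minimal, hedged userspace sketch (SA_RESETHAND/SA_NODEFER are the POSIX spellings of the SA_ONESHOT/SA_NOMASK flags in the patch):

	#include <signal.h>
	#include <stdio.h>

	static void handler(int sig) { (void)sig; }

	int main(void)
	{
		struct sigaction sa = { 0 };	/* initialize every field */
		sa.sa_handler = handler;
		sigemptyset(&sa.sa_mask);	/* the step sys_signal() forgot */
		sa.sa_flags = SA_RESETHAND | SA_NODEFER;
		if (sigaction(SIGINT, &sa, NULL) == -1)
			perror("sigaction");
		return 0;
	}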
From 9ac95f2f90e022c16d293d7978faddf7e779a1a9 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Thu, 9 Feb 2006 22:41:50 +0300
Subject: [PATCH] do_sigaction: cleanup ->sa_mask manipulation

Clear unblockable signals beforehand.

Signed-off-by: Oleg Nesterov
Signed-off-by: Linus Torvalds
---
 kernel/signal.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 01a1e7f7acf7..ea154104a00b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2430,7 +2430,7 @@ sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo)
 }
 
 int
-do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact)
+do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
 {
 	struct k_sigaction *k;
 	sigset_t mask;
@@ -2454,6 +2454,8 @@ do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
 		*oact = *k;
 
 	if (act) {
+		sigdelsetmask(&act->sa.sa_mask,
+			      sigmask(SIGKILL) | sigmask(SIGSTOP));
 		/*
 		 * POSIX 3.3.1.3:
 		 *  "Setting a signal action to SIG_IGN for a signal that is
@@ -2479,8 +2481,6 @@ do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
 				read_lock(&tasklist_lock);
 				spin_lock_irq(&t->sighand->siglock);
 				*k = *act;
-				sigdelsetmask(&k->sa.sa_mask,
-					      sigmask(SIGKILL) | sigmask(SIGSTOP));
 				sigemptyset(&mask);
 				sigaddset(&mask, sig);
 				rm_from_queue_full(&mask, &t->signal->shared_pending);
@@ -2495,8 +2495,6 @@ do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
 		}
 
 		*k = *act;
-		sigdelsetmask(&k->sa.sa_mask,
-			      sigmask(SIGKILL) | sigmask(SIGSTOP));
 	}
 
 	spin_unlock_irq(&current->sighand->siglock);
--
cgit v1.2.3
From a2000572ad511f5f43091ed7bd2cc3b913104a1e Mon Sep 17 00:00:00 2001
From: Nick Piggin
Date: Fri, 10 Feb 2006 01:51:02 -0800
Subject: [PATCH] sched: remove smpnice

I don't think the code is quite ready, which is why I asked for Peter's
additions to also be merged before I acked it (although it turned out that
it still isn't quite ready with his additions either).

Basically I have had similar observations to Suresh in that it does not
play nicely with the rest of the balancing infrastructure (and raised
similar concerns in my review).

The samples (group of 4) I got for "maximum recorded imbalance" on a 2x2
SMP+HT Xeon are as follows:

            | Following boot | hackbench 20        | hackbench 40
 -----------+----------------+---------------------+---------------------
 2.6.16-rc2 | 30,37,100,112  | 5600,5530,6020,6090 | 6390,7090,8760,8470
 +nosmpnice |  3, 2, 4, 2    |  28, 150, 294, 132  |  348, 348, 294, 347

Hackbench raw performance is down around 15% with smpnice (but that in
itself isn't a huge deal because it is just a benchmark).  However, the
samples show that the imbalance passed into move_tasks is increased by
about a factor of 10-30.  I think this would also go some way to
explaining latency blips turning up in the balancing code (though I
haven't actually measured that).

We'll probably have to revert this in the SUSE kernel.

Cc: "Siddha, Suresh B"
Acked-by: Ingo Molnar
Cc: Peter Williams
Cc: "Martin J. Bligh"
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/sched.c | 129 ++++++++------------------------------------------------
 1 file changed, 18 insertions(+), 111 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index bc38804e40dd..87d93be336a1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -215,7 +215,6 @@ struct runqueue {
 	 */
 	unsigned long nr_running;
 #ifdef CONFIG_SMP
-	unsigned long prio_bias;
 	unsigned long cpu_load[3];
 #endif
 	unsigned long long nr_switches;
@@ -669,68 +668,13 @@ static int effective_prio(task_t *p)
 	return prio;
 }
 
-#ifdef CONFIG_SMP
-static inline void inc_prio_bias(runqueue_t *rq, int prio)
-{
-	rq->prio_bias += MAX_PRIO - prio;
-}
-
-static inline void dec_prio_bias(runqueue_t *rq, int prio)
-{
-	rq->prio_bias -= MAX_PRIO - prio;
-}
-
-static inline void inc_nr_running(task_t *p, runqueue_t *rq)
-{
-	rq->nr_running++;
-	if (rt_task(p)) {
-		if (p != rq->migration_thread)
-			/*
-			 * The migration thread does the actual balancing. Do
-			 * not bias by its priority as the ultra high priority
-			 * will skew balancing adversely.
-			 */
-			inc_prio_bias(rq, p->prio);
-	} else
-		inc_prio_bias(rq, p->static_prio);
-}
-
-static inline void dec_nr_running(task_t *p, runqueue_t *rq)
-{
-	rq->nr_running--;
-	if (rt_task(p)) {
-		if (p != rq->migration_thread)
-			dec_prio_bias(rq, p->prio);
-	} else
-		dec_prio_bias(rq, p->static_prio);
-}
-#else
-static inline void inc_prio_bias(runqueue_t *rq, int prio)
-{
-}
-
-static inline void dec_prio_bias(runqueue_t *rq, int prio)
-{
-}
-
-static inline void inc_nr_running(task_t *p, runqueue_t *rq)
-{
-	rq->nr_running++;
-}
-
-static inline void dec_nr_running(task_t *p, runqueue_t *rq)
-{
-	rq->nr_running--;
-}
-#endif
-
 /*
 * __activate_task - move a task to the runqueue.
 */
 static inline void __activate_task(task_t *p, runqueue_t *rq)
 {
 	enqueue_task(p, rq->active);
-	inc_nr_running(p, rq);
+	rq->nr_running++;
 }
 
 /*
@@ -739,7 +683,7 @@ static inline void __activate_task(task_t *p, runqueue_t *rq)
 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 {
 	enqueue_task_head(p, rq->active);
-	inc_nr_running(p, rq);
+	rq->nr_running++;
 }
 
 static int recalc_task_prio(task_t *p, unsigned long long now)
@@ -863,7 +807,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
 */
 static void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
-	dec_nr_running(p, rq);
+	rq->nr_running--;
 	dequeue_task(p, p->array);
 	p->array = NULL;
 }
@@ -1007,61 +951,27 @@ void kick_process(task_t *p)
 * We want to under-estimate the load of migration sources, to
 * balance conservatively.
 */
-static unsigned long __source_load(int cpu, int type, enum idle_type idle)
+static inline unsigned long source_load(int cpu, int type)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long running = rq->nr_running;
-	unsigned long source_load, cpu_load = rq->cpu_load[type-1],
-		load_now = running * SCHED_LOAD_SCALE;
-
+	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
 	if (type == 0)
-		source_load = load_now;
-	else
-		source_load = min(cpu_load, load_now);
-
-	if (running > 1 || (idle == NOT_IDLE && running))
-		/*
-		 * If we are busy rebalancing the load is biased by
-		 * priority to create 'nice' support across cpus. When
When - * idle rebalancing we should only bias the source_load if - * there is more than one task running on that queue to - * prevent idle rebalance from trying to pull tasks from a - * queue with only one running task. - */ - source_load = source_load * rq->prio_bias / running; + return load_now; - return source_load; -} - -static inline unsigned long source_load(int cpu, int type) -{ - return __source_load(cpu, type, NOT_IDLE); + return min(rq->cpu_load[type-1], load_now); } /* * Return a high guess at the load of a migration-target cpu */ -static inline unsigned long __target_load(int cpu, int type, enum idle_type idle) +static inline unsigned long target_load(int cpu, int type) { runqueue_t *rq = cpu_rq(cpu); - unsigned long running = rq->nr_running; - unsigned long target_load, cpu_load = rq->cpu_load[type-1], - load_now = running * SCHED_LOAD_SCALE; - + unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; if (type == 0) - target_load = load_now; - else - target_load = max(cpu_load, load_now); + return load_now; - if (running > 1 || (idle == NOT_IDLE && running)) - target_load = target_load * rq->prio_bias / running; - - return target_load; -} - -static inline unsigned long target_load(int cpu, int type) -{ - return __target_load(cpu, type, NOT_IDLE); + return max(rq->cpu_load[type-1], load_now); } /* @@ -1530,7 +1440,7 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) list_add_tail(&p->run_list, ¤t->run_list); p->array = current->array; p->array->nr_active++; - inc_nr_running(p, rq); + rq->nr_running++; } set_need_resched(); } else @@ -1875,9 +1785,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) { dequeue_task(p, src_array); - dec_nr_running(p, src_rq); + src_rq->nr_running--; set_task_cpu(p, this_cpu); - inc_nr_running(p, this_rq); + this_rq->nr_running++; enqueue_task(p, this_array); p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + this_rq->timestamp_last_tick; @@ -2056,9 +1966,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, /* Bias balancing toward cpus of our domain */ if (local_group) - load = __target_load(i, load_idx, idle); + load = target_load(i, load_idx); else - load = __source_load(i, load_idx, idle); + load = source_load(i, load_idx); avg_load += load; } @@ -2171,7 +2081,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group, int i; for_each_cpu_mask(i, group->cpumask) { - load = __source_load(i, 0, idle); + load = source_load(i, 0); if (load > max_load) { max_load = load; @@ -3571,10 +3481,8 @@ void set_user_nice(task_t *p, long nice) goto out_unlock; } array = p->array; - if (array) { + if (array) dequeue_task(p, array); - dec_prio_bias(rq, p->static_prio); - } old_prio = p->prio; new_prio = NICE_TO_PRIO(nice); @@ -3584,7 +3492,6 @@ void set_user_nice(task_t *p, long nice) if (array) { enqueue_task(p, array); - inc_prio_bias(rq, p->static_prio); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: -- cgit v1.2.3 From c22db9412736204b25aeba19d18e5ea922f7d632 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 10 Feb 2006 01:51:11 -0800 Subject: [PATCH] prevent recursive panic from softlockup watchdog When panic_timeout is zero, suppress triggering a nested panic due to soft lockup detection. 
From 16bf134840da3920ded1290973c56ec214636f12 Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Tue, 14 Feb 2006 13:52:59 -0800
Subject: [PATCH] compound page: no access_process_vm check

The PageCompound check before access_process_vm's set_page_dirty_lock is
no longer necessary, so remove it.  But leave the PageCompound checks in
bio_set_pages_dirty, dio_bio_complete and nfs_free_user_pages: at least
some of those were introduced as a little optimization on hugetlb pages.

Signed-off-by: Hugh Dickins
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/ptrace.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 5f33cdb6fff5..d2cf144d0af5 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -242,8 +242,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
 		if (write) {
 			copy_to_user_page(vma, page, addr,
 					  maddr + offset, buf, bytes);
-			if (!PageCompound(page))
-				set_page_dirty_lock(page);
+			set_page_dirty_lock(page);
 		} else {
 			copy_from_user_page(vma, page, addr,
 					    buf, maddr + offset, bytes);
--
cgit v1.2.3

From d6077cb80cde4506720f9165eba99ee07438513f Mon Sep 17 00:00:00 2001
From: "Chen, Kenneth W"
Date: Tue, 14 Feb 2006 13:53:10 -0800
Subject: [PATCH] sched: revert "filter affine wakeups"

Revert commit d7102e95b7b9c00277562c29aad421d2d521c5f6:

    [PATCH] sched: filter affine wakeups

Apparently it caused a more than 10% performance regression for the aim7
benchmark.  The setup in use is a 16-cpu HP rx8620, 64Gb of memory and
12 MSA1000s with 144 disks.  Each disk is 72Gb with a single ext3
filesystem (courtesy of HP, who supplied benchmark results).

The problem is, for aim7, the wake-up pattern is random, but it still
needs load balancing action in the wake-up path to achieve best
performance.  With the above commit, lack of load balancing hurts that
workload.

However, for workloads like database transaction processing, the
requirement is exactly opposite.  In the wake up path, best performance
is achieved with absolutely zero load balancing.  We simply wake up the
process on the CPU that it was previously run.  Worst performance is
obtained when we do load balancing at wake up.

There isn't an easy way to auto detect the workload characteristics.
Ingo's earlier patch that detects idle CPU and decides whether to load
balance or not doesn't perform with aim7 either, since all CPUs are busy
(it causes an even bigger perf. regression).

Revert commit d7102e95b7b9c00277562c29aad421d2d521c5f6, which causes more
than 10% performance regression with aim7.

Signed-off-by: Ken Chen
Acked-by: Ingo Molnar
Cc: Nick Piggin
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/sched.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 87d93be336a1..66d957227de9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1204,9 +1204,6 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync)
 		}
 	}
 
-	if (p->last_waker_cpu != this_cpu)
-		goto out_set_cpu;
-
 	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
 		goto out_set_cpu;
 
@@ -1277,8 +1274,6 @@ out_set_cpu:
 		cpu = task_cpu(p);
 	}
 
-	p->last_waker_cpu = this_cpu;
-
 out_activate:
 #endif /* CONFIG_SMP */
 	if (old_state == TASK_UNINTERRUPTIBLE) {
@@ -1360,12 +1355,9 @@ void fastcall sched_fork(task_t *p, int clone_flags)
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
-#if defined(CONFIG_SMP)
-	p->last_waker_cpu = cpu;
-#if defined(__ARCH_WANT_UNLOCKED_CTXSW)
+#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
 	p->oncpu = 0;
 #endif
-#endif
 #ifdef CONFIG_PREEMPT
 	/* Want to start with kernel preemption disabled. */
 	task_thread_info(p)->preempt_count = 1;
--
cgit v1.2.3
From 06027bdd278a32a84b273e41db68a5db8ffd2bb6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 14 Feb 2006 13:53:15 -0800
Subject: [PATCH] hrtimer: round up relative start time on low-res arches

CONFIG_TIME_LOW_RES is a temporary way for architectures to signal that
they simply return xtime in do_gettimeoffset().  In this corner-case we
want to round up by resolution when starting a relative timer, to avoid
short timeouts.  This will go away with the GTOD framework.

Signed-off-by: Ingo Molnar
Cc: Roman Zippel
Cc: Thomas Gleixner
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/hrtimer.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 2b6e1757aedd..5ae51f1bc7c8 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -418,8 +418,19 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
 	/* Switch the timer base, if necessary: */
 	new_base = switch_hrtimer_base(timer, base);
 
-	if (mode == HRTIMER_REL)
+	if (mode == HRTIMER_REL) {
 		tim = ktime_add(tim, new_base->get_time());
+		/*
+		 * CONFIG_TIME_LOW_RES is a temporary way for architectures
+		 * to signal that they simply return xtime in
+		 * do_gettimeoffset(). In this case we want to round up by
+		 * resolution when starting a relative timer, to avoid short
+		 * timeouts. This will go away with the GTOD framework.
+		 */
+#ifdef CONFIG_TIME_LOW_RES
+		tim = ktime_add(tim, base->resolution);
+#endif
+	}
 	timer->expires = tim;
 
 	enqueue_hrtimer(timer, new_base);
--
cgit v1.2.3
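[Editor's note] The rounding above can be justified with one inequality; treat this as a sketch, with R the clock resolution and d the requested relative delay. On a low-resolution clock the reported time lags the true time by up to R:

	observed_now <= true_now <= observed_now + R

	expires = observed_now + d + R  >=  true_now + d

so with the "+ R" term the timer can never fire earlier than d after the true current time; without it, a relative timer could expire up to R too early (a 1 ms timeout on a 10 ms clock may fire immediately).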
From 3f17da699431ec48540beabc55c54d4b5e66c8e7 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Wed, 15 Feb 2006 22:13:24 +0300
Subject: [PATCH] fix kill_proc_info() vs CLONE_THREAD race

There is a window after copy_process() unlocks ->sighand.siglock and
before it adds the new thread to the thread list.  In that window
__group_complete_signal(SIGKILL) will not see the new thread yet, so this
thread will start running while the whole thread group was supposed to
exit.

I believe we have another good reason to place attach_pid(PID/TGID) under
->sighand.siglock.  We can do the same for

	release_task()->__unhash_process()
	de_thread()->switch_exec_pids()

After that we don't need tasklist_lock to iterate over the thread list,
and we can simplify things, see for example do_sigaction() or sys_times().

Signed-off-by: Oleg Nesterov
Cc: Roland McGrath
Cc: Ingo Molnar
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 8e88b374cee9..3683ce10f4a9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1123,8 +1123,8 @@ static task_t *copy_process(unsigned long clone_flags,
 	p->real_parent = current;
 	p->parent = p->real_parent;
 
+	spin_lock(&current->sighand->siglock);
 	if (clone_flags & CLONE_THREAD) {
-		spin_lock(&current->sighand->siglock);
 		/*
 		 * Important: if an exit-all has been started then
 		 * do not create this new thread - the whole thread
@@ -1162,8 +1162,6 @@ static task_t *copy_process(unsigned long clone_flags,
 			 */
 			p->it_prof_expires = jiffies_to_cputime(1);
 		}
-
-		spin_unlock(&current->sighand->siglock);
 	}
 
 	/*
@@ -1189,6 +1187,7 @@ static task_t *copy_process(unsigned long clone_flags,
 
 	nr_threads++;
 	total_forks++;
+	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
 	proc_fork_connector(p);
 	return p;
--
cgit v1.2.3

From dadac81b1b86196fcc48fb87620403c4a7174f06 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Wed, 15 Feb 2006 22:13:26 +0300
Subject: [PATCH] fix kill_proc_info() vs fork() theoretical race

copy_process:

	attach_pid(p, PIDTYPE_PID, p->pid);
	attach_pid(p, PIDTYPE_TGID, p->tgid);

What if kill_proc_info(p->pid) happens in between?

copy_process() holds current->sighand.siglock, so we are safe in the
CLONE_THREAD case, because current->sighand == p->sighand.

Otherwise, p->sighand is unlocked, the new process is already visible to
find_task_by_pid(), but has a copy of the parent's 'struct pid' in
->pids[PIDTYPE_TGID].

This means that __group_complete_signal() may hang while doing

	do ... while (next_thread() != p)

We can solve this problem if we reverse these 2 attach_pid()s:

	attach_pid() does wmb()

	group_send_sig_info() calls spin_lock(), which provides a read
	barrier.  // Yes ?

I don't think we can hit this race in practice, but still.

Signed-off-by: Oleg Nesterov
Cc: Roland McGrath
Cc: Ingo Molnar
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 3683ce10f4a9..fbea12d7a943 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1173,8 +1173,6 @@ static task_t *copy_process(unsigned long clone_flags,
 	if (unlikely(p->ptrace & PT_PTRACED))
 		__ptrace_link(p, current->parent);
 
-	attach_pid(p, PIDTYPE_PID, p->pid);
-	attach_pid(p, PIDTYPE_TGID, p->tgid);
 	if (thread_group_leader(p)) {
 		p->signal->tty = current->signal->tty;
 		p->signal->pgrp = process_group(current);
@@ -1184,6 +1182,8 @@ static task_t *copy_process(unsigned long clone_flags,
 		if (p->pid)
 			__get_cpu_var(process_counts)++;
 	}
+	attach_pid(p, PIDTYPE_TGID, p->tgid);
+	attach_pid(p, PIDTYPE_PID, p->pid);
 
 	nr_threads++;
 	total_forks++;
--
cgit v1.2.3

From 5ecfbae093f0c37311e89b29bfc0c9d586eace87 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Wed, 15 Feb 2006 22:50:10 +0300
Subject: [PATCH] fix zap_thread's ptrace related problems

1. The tracee can go from ptrace_stop() to do_signal_stop()
   after __ptrace_unlink(p).

2. It is unsafe to __ptrace_unlink(p) while p->parent may wait
   for tasklist_lock in ptrace_detach().

Signed-off-by: Oleg Nesterov
Cc: Roland McGrath
Cc: Ingo Molnar
Cc: Christoph Hellwig
Cc: Eric W. Biederman
Signed-off-by: Linus Torvalds
---
 kernel/ptrace.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index d2cf144d0af5..d95a72c9279d 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -72,8 +72,8 @@ void ptrace_untrace(task_t *child)
 */
 void __ptrace_unlink(task_t *child)
 {
-	if (!child->ptrace)
-		BUG();
+	BUG_ON(!child->ptrace);
+
 	child->ptrace = 0;
 	if (!list_empty(&child->ptrace_list)) {
 		list_del_init(&child->ptrace_list);
@@ -184,22 +184,27 @@ bad:
 	return retval;
 }
 
+void __ptrace_detach(struct task_struct *child, unsigned int data)
+{
+	child->exit_code = data;
+	/* .. re-parent .. */
+	__ptrace_unlink(child);
+	/* .. and wake it up. */
+	if (child->exit_state != EXIT_ZOMBIE)
+		wake_up_process(child);
+}
+
 int ptrace_detach(struct task_struct *child, unsigned int data)
 {
 	if (!valid_signal(data))
-		return	-EIO;
+		return -EIO;
 
 	/* Architecture-specific hardware disable .. */
 	ptrace_disable(child);
 
-	/* .. re-parent .. */
-	child->exit_code = data;
-
-	write_lock_irq(&tasklist_lock);
-	__ptrace_unlink(child);
-	/* .. and wake it up. */
-	if (child->exit_state != EXIT_ZOMBIE)
-		wake_up_process(child);
+	write_lock_irq(&tasklist_lock);
+	if (child->ptrace)
+		__ptrace_detach(child, data);
 	write_unlock_irq(&tasklist_lock);
 
 	return 0;
--
cgit v1.2.3
From 06fed33849c13af637c4d09e9ba27828fac9edd5 Mon Sep 17 00:00:00 2001
From: Paul Jackson
Date: Wed, 15 Feb 2006 15:17:38 -0800
Subject: [PATCH] cpuset: oops in exit on null cpuset fix

Fix a latent bug in cpuset_exit() handling.  If a task tried to allocate
memory after calling cpuset_exit(), it oops'd in
cpuset_update_task_memory_state() on a NULL cpuset pointer.

So set the exiting task's cpuset to the root cpuset instead of to NULL.

A distro kernel hit this with an added kernel package that had just such
a hook (allocating memory) in the exit code path.

Signed-off-by: Paul Jackson
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/cpuset.c | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ba42b0a76961..12815d3f1a05 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1977,6 +1977,39 @@ void cpuset_fork(struct task_struct *child)
 * We don't need to task_lock() this reference to tsk->cpuset,
 * because tsk is already marked PF_EXITING, so attach_task() won't
 * mess with it, or task is a failed fork, never visible to attach_task.
+ *
+ * Hack:
+ *
+ *    Set the exiting tasks cpuset to the root cpuset (top_cpuset).
+ *
+ *    Don't leave a task unable to allocate memory, as that is an
+ *    accident waiting to happen should someone add a callout in
+ *    do_exit() after the cpuset_exit() call that might allocate.
+ *    If a task tries to allocate memory with an invalid cpuset,
+ *    it will oops in cpuset_update_task_memory_state().
+ *
+ *    We call cpuset_exit() while the task is still competent to
+ *    handle notify_on_release(), then leave the task attached to
+ *    the root cpuset (top_cpuset) for the remainder of its exit.
+ *
+ *    To do this properly, we would increment the reference count on
+ *    top_cpuset, and near the very end of the kernel/exit.c do_exit()
+ *    code we would add a second cpuset function call, to drop that
+ *    reference.  This would just create an unnecessary hot spot on
+ *    the top_cpuset reference count, to no avail.
+ *
+ *    Normally, holding a reference to a cpuset without bumping its
+ *    count is unsafe.   The cpuset could go away, or someone could
+ *    attach us to a different cpuset, decrementing the count on
+ *    the first cpuset that we never incremented.  But in this case,
+ *    top_cpuset isn't going away, and either task has PF_EXITING set,
+ *    which wards off any attach_task() attempts, or task is a failed
+ *    fork, never visible to attach_task.
+ *
+ *    Another way to do this would be to set the cpuset pointer
+ *    to NULL here, and check in cpuset_update_task_memory_state()
+ *    for a NULL pointer.  This hack avoids that NULL check, for no
+ *    cost (other than this way too long comment ;).
 **/
 
 void cpuset_exit(struct task_struct *tsk)
@@ -1984,7 +2017,7 @@ void cpuset_exit(struct task_struct *tsk)
 	struct cpuset *cs;
 
 	cs = tsk->cpuset;
-	tsk->cpuset = NULL;
+	tsk->cpuset = &top_cpuset;	/* Hack - see comment above */
 
 	if (notify_on_release(cs)) {
 		char *pathbuf = NULL;
--
cgit v1.2.3
From c8adb494a6df6b2be8e50a8dafd5bab231df3505 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Wed, 15 Feb 2006 15:17:42 -0800
Subject: [PATCH] swsusp: nuke noisy message

I get about 88 squillion of these when suspending an old ad450nx server.

Cc: Pavel Roskin
Cc: "Rafael J. Wysocki"
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/power/snapshot.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 41f66365f0d8..8d5a5986d621 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -91,10 +91,8 @@ static int save_highmem_zone(struct zone *zone)
 		 * corrected eventually when the cases giving rise to this
 		 * are better understood.
 		 */
-		if (PageReserved(page)) {
-			printk("highmem reserved page?!\n");
+		if (PageReserved(page))
 			continue;
-		}
 		BUG_ON(PageNosave(page));
 		if (PageNosaveFree(page))
 			continue;
--
cgit v1.2.3

From a62eaf151d9cb478d127cfbc2e93c498869785b0 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Thu, 16 Feb 2006 23:41:58 +0100
Subject: [PATCH] x86_64: Add boot option to disable randomized mappings and cleanup

AMD SimNow!'s JIT doesn't like them at all in the guest.  For
distribution installation it's easiest if it's a boot time option.

Also I moved the variable to a more appropriate place, made it
independent from sysctl, and marked it __read_mostly, which it is.

Signed-off-by: Andi Kleen
Signed-off-by: Linus Torvalds
---
 kernel/sysctl.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 71dd6f62efec..7654d55c47f5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -126,8 +126,6 @@ extern int sysctl_hz_timer;
 extern int acct_parm[];
 #endif
 
-int randomize_va_space = 1;
-
 static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, ctl_table *, void **);
 static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
--
cgit v1.2.3
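[Editor's note] The randomize_va_space variable above backs both the boot option and the kernel.randomize_va_space sysctl. A hedged userspace check (assuming the mainline proc path; 0 means randomized mappings are disabled):

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/sys/kernel/randomize_va_space", "r");
		int val;

		if (!f || fscanf(f, "%d", &val) != 1) {
			perror("randomize_va_space");
			return 1;
		}
		printf("randomize_va_space = %d\n", val);
		fclose(f);
		return 0;
	}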
From 726c14bf499e91e7ede4f1728830aba05c675061 Mon Sep 17 00:00:00 2001
From: Paul Mackerras
Date: Fri, 17 Feb 2006 10:30:23 +1100
Subject: [PATCH] Provide an interface for getting the current tick length

This provides an interface for arch code to find out how many
nanoseconds are going to be added on to xtime by the next call to
do_timer.  The value returned is a fixed-point number in 52.12 format in
nanoseconds.  The reason for this format is that it gives the full
precision that the timekeeping code is using internally.

The motivation for this is to fix a problem that has arisen on 32-bit
powerpc in that the value returned by do_gettimeofday drifts apart from
xtime if NTP is being used.  PowerPC is now using a lockless
do_gettimeofday based on reading the timebase register and performing
some simple arithmetic.  (This method of getting the time is also
exported to userspace via the VDSO.)  However, the factor and offset it
uses were calculated based on the nominal tick length and weren't being
adjusted when NTP varied the tick length.

Note that 64-bit powerpc has had the lockless do_gettimeofday for a long
time now.  It also had an extremely hairy routine that got called from
the 32-bit compat routine for adjtimex, which adjusted the factor and
offset according to what it thought the timekeeping code was going to
do.  Not only was this only called if a 32-bit task did adjtimex (i.e.
not if a 64-bit task did adjtimex), it was also duplicating computations
from kernel/timer.c and it wasn't clear that it was (still) correct.

The simple solution is to ask the timekeeping code how long the current
jiffy will be on each timer interrupt, after calling do_timer.  If this
jiffy will be a different length from the last one, we then need to
compute new values for the factor and offset used in the lockless
do_gettimeofday.  In this way we can keep xtime and do_gettimeofday in
sync, even when NTP is varying the tick length.

Note that when adjtimex varies the tick length, it almost always
introduces the variation from the next tick on.  The only case I could
see where adjtimex would vary the length of the current tick is when an
old-style adjtime adjustment is being cancelled.  (It's not clear to me
why the adjustment has to be cancelled immediately rather than from the
next tick on.)  Thus I don't see any real need for a hook in adjtimex;
the rare case of an old-style adjustment being cancelled can be fixed up
at the next tick.

Signed-off-by: Paul Mackerras
Acked-by: john stultz
Signed-off-by: Linus Torvalds
---
 kernel/timer.c | 39 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index b9dad3994676..fe3a9a9f8328 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -717,12 +717,16 @@ static void second_overflow(void)
 #endif
 }
 
-/* in the NTP reference this is called "hardclock()" */
-static void update_wall_time_one_tick(void)
+/*
+ * Returns how many microseconds we need to add to xtime this tick
+ * in doing an adjustment requested with adjtime.
+ */
+static long adjtime_adjustment(void)
 {
-	long time_adjust_step, delta_nsec;
+	long time_adjust_step;
 
-	if ((time_adjust_step = time_adjust) != 0 ) {
+	time_adjust_step = time_adjust;
+	if (time_adjust_step) {
 		/*
 		 * We are doing an adjtime thing.  Prepare time_adjust_step to
 		 * be within bounds.  Note that a positive time_adjust means we
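[Editor's note] Interpreting the 52.12 fixed-point value is a two-line shift. A hedged sketch (this assumes SHIFT_SCALE is 22 in this era's NTP code, so SHIFT_SCALE - 10 leaves 12 fractional bits, consistent with the "52.12" wording above):

	u64 tick = current_tick_length();
	u64 whole_ns = tick >> 12;			/* integer nanoseconds this tick */
	u32 frac = (u32)(tick & ((1 << 12) - 1));	/* remainder, in 1/4096ths of a ns */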
Note that a positive time_adjust means we @@ -733,10 +737,19 @@ static void update_wall_time_one_tick(void) */ time_adjust_step = min(time_adjust_step, (long)tickadj); time_adjust_step = max(time_adjust_step, (long)-tickadj); + } + return time_adjust_step; +} +/* in the NTP reference this is called "hardclock()" */ +static void update_wall_time_one_tick(void) +{ + long time_adjust_step, delta_nsec; + + time_adjust_step = adjtime_adjustment(); + if (time_adjust_step) /* Reduce by this step the amount of time left */ time_adjust -= time_adjust_step; - } delta_nsec = tick_nsec + time_adjust_step * 1000; /* * Advance the phase, once it gets to one microsecond, then @@ -758,6 +771,22 @@ static void update_wall_time_one_tick(void) } } +/* + * Return how long ticks are at the moment, that is, how much time + * update_wall_time_one_tick will add to xtime next time we call it + * (assuming no calls to do_adjtimex in the meantime). + * The return value is in fixed-point nanoseconds with SHIFT_SCALE-10 + * bits to the right of the binary point. + * This function has no side-effects. + */ +u64 current_tick_length(void) +{ + long delta_nsec; + + delta_nsec = tick_nsec + adjtime_adjustment() * 1000; + return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj; +} + /* * Using a loop looks inefficient, but "ticks" is * usually just one (we shouldn't be losing ticks, -- cgit v1.2.3 From 4bbf39c29bc3409d6454faf0dfa1b3b0aa2ac2af Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 17 Feb 2006 13:52:44 -0800 Subject: [PATCH] Introduce CONFIG_DEFAULT_MIGRATION_COST Heiko Carstens wrote: The boot sequence on s390 sometimes takes ages and we spend a very long time (up to one or two minutes) in calibrate_migration_costs. The time spent there differs from boot to boot. Also the calculated costs differ a lot. I've seen differences by up to a factor of 15 (yes, factor not percent). Also I doubt that making these measurements make much sense on a completely virtualized architecture where you cannot tell how much cpu time you will get anyway. So introduce the CONFIG_DEFAULT_MIGRATION_COST method for an architecture to set the scheduler migration costs. This turns off automatic detection of migration costs. Makes sense on virtual platforms, where migration costs are hard to measure accurately. Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 66d957227de9..12d291bf3379 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5058,7 +5058,18 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, #define MAX_DOMAIN_DISTANCE 32 static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] = - { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = -1LL }; + { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = +/* + * Architectures may override the migration cost and thus avoid + * boot-time calibration. Unit is nanoseconds. Mostly useful for + * virtualized hardware: + */ +#ifdef CONFIG_DEFAULT_MIGRATION_COST + CONFIG_DEFAULT_MIGRATION_COST +#else + -1LL +#endif +}; /* * Allow override of migration cost - in units of microseconds. -- cgit v1.2.3 From a8534adb74e23374889b84b3d97eb18da542a1b5 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Fri, 17 Feb 2006 13:52:51 -0800 Subject: [PATCH] swsusp: fix breakage with swap on LVM Restore the compatibility with the older code and make it possible to suspend if the kernel command line doesn't contain the "resume=" argument Signed-off-by: Rafael J. Wysocki Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swsusp.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 4e90905f0e87..2d9d08f72f76 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -153,13 +153,11 @@ static int swsusp_swap_check(void) /* This is called before saving image */ { int i; - if (!swsusp_resume_device) - return -ENODEV; spin_lock(&swap_lock); for (i = 0; i < MAX_SWAPFILES; i++) { if (!(swap_info[i].flags & SWP_WRITEOK)) continue; - if (is_resume_device(swap_info + i)) { + if (!swsusp_resume_device || is_resume_device(swap_info + i)) { spin_unlock(&swap_lock); root_swap = i; return 0; -- cgit v1.2.3 From ef20c8c197df9b8d5bd4af0679123826da028861 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Feb 2006 15:41:50 -0500 Subject: [PATCH] GFP_KERNEL allocations in atomic (auditsc) audit_log_exit() is called from atomic contexts and gets explicit gfp_mask argument; it should use it for all allocations rather than doing some with gfp_mask and some with GFP_KERNEL. Signed-off-by: Al Viro --- kernel/auditsc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 685c25175d96..d7e7e637b92a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -841,7 +841,7 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) for (aux = context->aux; aux; aux = aux->next) { - ab = audit_log_start(context, GFP_KERNEL, aux->type); + ab = audit_log_start(context, gfp_mask, aux->type); if (!ab) continue; /* audit_panic has been called */ @@ -878,14 +878,14 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) } if (context->pwd && context->pwdmnt) { - ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); + ab = audit_log_start(context, gfp_mask, AUDIT_CWD); if (ab) { audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt); audit_log_end(ab); } } for (i = 0; i < context->name_count; i++) { - ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); + ab = audit_log_start(context, gfp_mask, AUDIT_PATH); if (!ab) continue; /* audit_panic has been called */ -- cgit v1.2.3 From c255d844dd73616f23e4b4733edcc2e5fa4042b2 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Mon, 20 Feb 2006 18:27:58 -0800 Subject: [PATCH] suspend-to-ram: allow video options to be set at runtime Currently, acpi video options can only be set on kernel command line. That's little inflexible; I'd like userland s2ram application that just works, and modifying kernel command line according to whitelist is not fun. It is better to just allow s2ram application to set video options just before suspend (according to the whitelist). This implements sysctl to allow setting suspend video options without reboot. (akpm: Documentation updates for this new sysctl are pending..) Signed-off-by: Pavel Machek Cc: "Brown, Len" Cc: "Antonino A. 
Daplas" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7654d55c47f5..ebc41bf22f1e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -44,14 +44,12 @@ #include #include #include +#include +#include #include #include -#ifdef CONFIG_ROOT_NFS -#include -#endif - #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ @@ -655,6 +653,16 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#endif +#ifdef CONFIG_ACPI_SLEEP + { + .ctl_name = KERN_ACPI_VIDEO_FLAGS, + .procname = "acpi_video_flags", + .data = &acpi_video_flags, + .maxlen = sizeof (unsigned long), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #endif { .ctl_name = 0 } }; -- cgit v1.2.3 From 7a9166e3b037296366cea6f3c97f705d33e209e6 Mon Sep 17 00:00:00 2001 From: Luke Yang Date: Mon, 20 Feb 2006 18:28:07 -0800 Subject: [PATCH] Fix undefined symbols for nommu architecture Signed-off-by: Luke Yang Acked-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ebc41bf22f1e..c05a2b7125e1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -636,6 +636,7 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#if defined(CONFIG_MMU) { .ctl_name = KERN_RANDOMIZE, .procname = "randomize_va_space", @@ -644,6 +645,7 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#endif #if defined(CONFIG_S390) && defined(CONFIG_SMP) { .ctl_name = KERN_SPIN_RETRY, -- cgit v1.2.3 From 7fd105e758c8d746d57ab7e77f100e096bf153c8 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 20 Feb 2006 18:28:08 -0800 Subject: [PATCH] Fix compile for CONFIG_SYSVIPC=n or CONFIG_SYSCTL=n The compat syscalls are added to sys_ni.c since they are not defined if the above CONFIG options are off. Also, nfs would not build with CONFIG_SYSCTL off. Noticed by Arthur Othieno. Signed-off-by: Stephen Rothwell Cc: "David S. Miller" Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 17313b99e53d..1067090db6b1 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -104,6 +104,8 @@ cond_syscall(sys_setreuid16); cond_syscall(sys_setuid16); cond_syscall(sys_vm86old); cond_syscall(sys_vm86); +cond_syscall(compat_sys_ipc); +cond_syscall(compat_sys_sysctl); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); -- cgit v1.2.3 From 5914811acf36c3ff091f860a6964808f668f27d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=F6rn=20Steinbrink?= Date: Sat, 18 Feb 2006 18:12:43 +0100 Subject: [PATCH] kjournald keeps reference to namespace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In daemonize() a new thread gets cleaned up and 'merged' with init_task. The current fs_struct is handled there, but not the current namespace. This adds the namespace part. [ Eric Biederman pointed out the namespace wrappers, and also notes that we can't ever count on using our parents namespace because we already have called exit_fs(), which is the only way to the namespace from a process. 
] Signed-off-by: Björn Steinbrink Acked-by: Eric Biederman Signed-off-by: Linus Torvalds --- kernel/exit.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 93cee3671332..531aadca5530 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -360,6 +360,9 @@ void daemonize(const char *name, ...) fs = init_task.fs; current->fs = fs; atomic_inc(&fs->count); + exit_namespace(current); + current->namespace = init_task.namespace; + get_namespace(current->namespace); exit_files(current); current->files = init_task.files; atomic_inc(&current->files->count); -- cgit v1.2.3 From d2b176ed878d4d5fcc0bd35656dfd373f3702af9 Mon Sep 17 00:00:00 2001 From: Jes Sorensen Date: Tue, 28 Feb 2006 09:42:23 -0800 Subject: [IA64] sysctl option to silence unaligned trap warnings Allow the sysadmin to disable all warnings about userland apps making unaligned accesses by using: # echo 1 > /proc/sys/kernel/ignore-unaligned-usertrap rather than having to use prctl on a process-by-process basis. Default behaviour leaves the warnings enabled. Signed-off-by: Jes Sorensen Signed-off-by: Tony Luck --- kernel/sysctl.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c05a2b7125e1..acf6c1550f27 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -124,6 +124,10 @@ extern int sysctl_hz_timer; extern int acct_parm[]; #endif +#ifdef CONFIG_IA64 +extern int no_unaligned_warning; +#endif + static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, ctl_table *, void **); static int proc_doutsstring(ctl_table *table, int write, struct file *filp, @@ -665,6 +669,16 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#endif +#ifdef CONFIG_IA64 + { + .ctl_name = KERN_IA64_UNALIGNED, + .procname = "ignore-unaligned-usertrap", + .data = &no_unaligned_warning, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #endif { .ctl_name = 0 } }; -- cgit v1.2.3 From 7f99f06f01aa9460b5a18f1b0e0900c90d0a84fc Mon Sep 17 00:00:00 2001 From: Stefan Seyfried Date: Thu, 2 Mar 2006 02:54:34 -0800 Subject: [PATCH] fix acpi_video_flags on x86-64 The acpi_video_flags variable is unsigned long, so it should be set as such. This actually matters on x86-64. Signed-off-by: Stefan Seyfried Signed-off-by: Pavel Machek Cc: "Brown, Len" Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index acf6c1550f27..de2d9109194e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -667,7 +667,7 @@ static ctl_table kern_table[] = { .data = &acpi_video_flags, .maxlen = sizeof (unsigned long), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_doulongvec_minmax, }, #endif #ifdef CONFIG_IA64 -- cgit v1.2.3 From 685db65e422bfa523b8a9dacb5a658b42b254f05 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 2 Mar 2006 02:54:35 -0800 Subject: [PATCH] time_interpolator: Use readq_relaxed() instead of readq(). On some platforms readq performs additional work to make sure I/O is done in a coherent way. This is not needed for time retrieval as done by the time interpolator. So we can use readq_relaxed() instead, which will improve performance. It affects sparc64 and ia64 only. Apparently it makes a significant difference on ia64.
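A minimal sketch of the distinction (illustrative only, not from the patch; the function name is hypothetical):

	static u64 read_time_counter(void __iomem *addr)
	{
		/*
		 * readq() may include ordering work (e.g. a memory barrier)
		 * so the load is coherent with surrounding I/O accesses. A
		 * bare counter read has no dependent device access, so the
		 * relaxed variant is sufficient and cheaper.
		 */
		return readq_relaxed(addr);
	}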
Signed-off-by: Christoph Lameter Cc: john stultz Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index fe3a9a9f8328..fc6646fd5aab 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1351,10 +1351,10 @@ static inline u64 time_interpolator_get_cycles(unsigned int src) return x(); case TIME_SOURCE_MMIO64 : - return readq((void __iomem *) time_interpolator->addr); + return readq_relaxed((void __iomem *)time_interpolator->addr); case TIME_SOURCE_MMIO32 : - return readl((void __iomem *) time_interpolator->addr); + return readl_relaxed((void __iomem *)time_interpolator->addr); default: return get_cycles(); } -- cgit v1.2.3 From 8ba7b0a14b2ec19583bedbcdbea7f1c5008fc922 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 6 Mar 2006 17:38:49 -0800 Subject: Add early-boot-safety check to cond_resched() Just to be safe, we should not trigger a conditional reschedule during the early boot sequence. We've historically done some questionable things early on, and the safety warnings in __might_sleep() are generally turned off during that period, so there might be problems lurking. This affects CONFIG_PREEMPT_VOLUNTARY, which takes over might_sleep() to cause a voluntary conditional reschedule. Acked-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 12d291bf3379..3454bb869fd0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4028,6 +4028,8 @@ static inline void __cond_resched(void) */ if (unlikely(preempt_count())) return; + if (unlikely(system_state != SYSTEM_RUNNING)) + return; do { add_preempt_count(PREEMPT_ACTIVE); schedule(); -- cgit v1.2.3 From 69239749e1ac4f3496906aa4267cb9f61ce52c9c Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 6 Mar 2006 15:42:45 -0800 Subject: [PATCH] fix next_timer_interrupt() for hrtimer Also from Thomas Gleixner: Function next_timer_interrupt() got broken with a recent patch 6ba1b91213e81aa92b5cf7539f7d2a94ff54947c as sys_nanosleep() was moved to hrtimer. This broke things as next_timer_interrupt() did not check the hrtimer tree for the next event. Function next_timer_interrupt() is needed with dyntick (CONFIG_NO_IDLE_HZ, VST) implementations, as the system can be idle when the next hrtimer event was supposed to happen. At least ARM and S390 currently use next_timer_interrupt(). Signed-off-by: Thomas Gleixner Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hrtimer.c | 35 +++++++++++++++++++++++++++++++++++ kernel/timer.c | 16 ++++++++++++++++ 2 files changed, 51 insertions(+) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 5ae51f1bc7c8..14bc9cfa6399 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -505,6 +505,41 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) return rem; } +#ifdef CONFIG_NO_IDLE_HZ +/** + * hrtimer_get_next_event - get the time until next expiry event + * + * Returns the delta to the next expiry event or KTIME_MAX if no timer + * is pending.
+ */ +ktime_t hrtimer_get_next_event(void) +{ + struct hrtimer_base *base = __get_cpu_var(hrtimer_bases); + ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; + unsigned long flags; + int i; + + for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) { + struct hrtimer *timer; + + spin_lock_irqsave(&base->lock, flags); + if (!base->first) { + spin_unlock_irqrestore(&base->lock, flags); + continue; + } + timer = rb_entry(base->first, struct hrtimer, node); + delta.tv64 = timer->expires.tv64; + spin_unlock_irqrestore(&base->lock, flags); + delta = ktime_sub(delta, base->get_time()); + if (delta.tv64 < mindelta.tv64) + mindelta.tv64 = delta.tv64; + } + if (mindelta.tv64 < 0) + mindelta.tv64 = 0; + return mindelta; +} +#endif + /** * hrtimer_init - initialize a timer to the given clock * diff --git a/kernel/timer.c b/kernel/timer.c index fc6646fd5aab..8256f3f5ec0d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -489,9 +489,21 @@ unsigned long next_timer_interrupt(void) struct list_head *list; struct timer_list *nte; unsigned long expires; + unsigned long hr_expires = MAX_JIFFY_OFFSET; + ktime_t hr_delta; tvec_t *varray[4]; int i, j; + hr_delta = hrtimer_get_next_event(); + if (hr_delta.tv64 != KTIME_MAX) { + struct timespec tsdelta; + tsdelta = ktime_to_timespec(hr_delta); + hr_expires = timespec_to_jiffies(&tsdelta); + if (hr_expires < 3) + return hr_expires + jiffies; + } + hr_expires += jiffies; + base = &__get_cpu_var(tvec_bases); spin_lock(&base->t_base.lock); expires = base->timer_jiffies + (LONG_MAX >> 1); @@ -542,6 +554,10 @@ found: } } spin_unlock(&base->t_base.lock); + + if (time_before(hr_expires, expires)) + return hr_expires; + return expires; } #endif -- cgit v1.2.3 From 5aee405c662ca644980c184774277fc6d0769a84 Mon Sep 17 00:00:00 2001 From: Atsushi Nemoto Date: Mon, 6 Mar 2006 15:42:51 -0800 Subject: [PATCH] time: add barrier after updating jiffies_64 Add a compiler barrier so that we don't read jiffies before updating jiffies_64. Signed-off-by: Atsushi Nemoto Cc: Ralf Baechle Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 8256f3f5ec0d..bf7c4193b936 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -941,6 +941,8 @@ static inline void update_times(void) void do_timer(struct pt_regs *regs) { jiffies_64++; + /* prevent loading jiffies before storing new jiffies_64 value. */ + barrier(); update_times(); softlockup_tick(regs); } -- cgit v1.2.3 From 81c29a857d3c8d6ea9c4f20d196c36bf0a07c615 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 7 Mar 2006 21:55:27 -0800 Subject: [PATCH] idle threads should have a sane ->timestamp value Idle threads should have a sane ->timestamp value, to avoid init kernel thread(s) from inheriting it and causing miscalculations in try_to_wake_up(). Reported-by: Mike Galbraith . 
Signed-off-by: Ingo Molnar Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3454bb869fd0..e82c99f1db64 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4335,6 +4335,7 @@ void __devinit init_idle(task_t *idle, int cpu) runqueue_t *rq = cpu_rq(cpu); unsigned long flags; + idle->timestamp = sched_clock(); idle->sleep_avg = 0; idle->array = NULL; idle->prio = MAX_PRIO; -- cgit v1.2.3 From 21a1ea9eb40411d4ee29448c53b9e4c0654d6ceb Mon Sep 17 00:00:00 2001 From: Dipankar Sarma Date: Tue, 7 Mar 2006 21:55:33 -0800 Subject: [PATCH] rcu batch tuning This patch adds new tunables for RCU queue and finished batches. There are two types of controls - number of completed RCU updates invoked in a batch (blimit) and monitoring for high rate of incoming RCUs on a cpu (qhimark, qlowmark). By default, the per-cpu batch limit is set to a small value. If the input RCU rate exceeds the high watermark, we do two things - force quiescent state on all cpus and set the batch limit of the CPU to INTMAX. Setting batch limit to INTMAX forces all finished RCUs to be processed in one shot. If we have more than INTMAX RCUs queued up, then we have bigger problems anyway. Once the incoming queued RCUs fall below the low watermark, the batch limit is set to the default. Signed-off-by: Dipankar Sarma Cc: "Paul E. McKenney" Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcupdate.c | 76 ++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 58 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 0cf8146bd585..8cf15a569fcd 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -67,7 +67,43 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; /* Fake initialization required by compiler */ static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; -static int maxbatch = 10000; +static int blimit = 10; +static int qhimark = 10000; +static int qlowmark = 100; +#ifdef CONFIG_SMP +static int rsinterval = 1000; +#endif + +static atomic_t rcu_barrier_cpu_count; +static struct semaphore rcu_barrier_sema; +static struct completion rcu_barrier_completion; + +#ifdef CONFIG_SMP +static void force_quiescent_state(struct rcu_data *rdp, + struct rcu_ctrlblk *rcp) +{ + int cpu; + cpumask_t cpumask; + set_need_resched(); + if (unlikely(rdp->qlen - rdp->last_rs_qlen > rsinterval)) { + rdp->last_rs_qlen = rdp->qlen; + /* + * Don't send IPI to itself. With irqs disabled, + * rdp->cpu is the current cpu. + */ + cpumask = rcp->cpumask; + cpu_clear(rdp->cpu, cpumask); + for_each_cpu_mask(cpu, cpumask) + smp_send_reschedule(cpu); + } +} +#else +static inline void force_quiescent_state(struct rcu_data *rdp, + struct rcu_ctrlblk *rcp) +{ + set_need_resched(); +} +#endif /** * call_rcu - Queue an RCU callback for invocation after a grace period. 
@@ -92,17 +128,13 @@ void fastcall call_rcu(struct rcu_head *head, rdp = &__get_cpu_var(rcu_data); *rdp->nxttail = head; rdp->nxttail = &head->next; - - if (unlikely(++rdp->count > 10000)) - set_need_resched(); - + if (unlikely(++rdp->qlen > qhimark)) { + rdp->blimit = INT_MAX; + force_quiescent_state(rdp, &rcu_ctrlblk); + } local_irq_restore(flags); } -static atomic_t rcu_barrier_cpu_count; -static struct semaphore rcu_barrier_sema; -static struct completion rcu_barrier_completion; - /** * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. * @head: structure to be used for queueing the RCU updates. @@ -131,12 +163,12 @@ void fastcall call_rcu_bh(struct rcu_head *head, rdp = &__get_cpu_var(rcu_bh_data); *rdp->nxttail = head; rdp->nxttail = &head->next; - rdp->count++; -/* - * Should we directly call rcu_do_batch() here ? - * if (unlikely(rdp->count > 10000)) - * rcu_do_batch(rdp); - */ + + if (unlikely(++rdp->qlen > qhimark)) { + rdp->blimit = INT_MAX; + force_quiescent_state(rdp, &rcu_bh_ctrlblk); + } + local_irq_restore(flags); } @@ -199,10 +231,12 @@ static void rcu_do_batch(struct rcu_data *rdp) next = rdp->donelist = list->next; list->func(list); list = next; - rdp->count--; - if (++count >= maxbatch) + rdp->qlen--; + if (++count >= rdp->blimit) break; } + if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) + rdp->blimit = blimit; if (!rdp->donelist) rdp->donetail = &rdp->donelist; else @@ -473,6 +507,7 @@ static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, rdp->quiescbatch = rcp->completed; rdp->qs_pending = 0; rdp->cpu = cpu; + rdp->blimit = blimit; } static void __devinit rcu_online_cpu(int cpu) @@ -567,7 +602,12 @@ void synchronize_kernel(void) synchronize_rcu(); } -module_param(maxbatch, int, 0); +module_param(blimit, int, 0); +module_param(qhimark, int, 0); +module_param(qlowmark, int, 0); +#ifdef CONFIG_SMP +module_param(rsinterval, int, 0); +#endif EXPORT_SYMBOL_GPL(rcu_batches_completed); EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */ EXPORT_SYMBOL(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ -- cgit v1.2.3 From 529bf6be5c04f2e869d07bfdb122e9fd98ade714 Mon Sep 17 00:00:00 2001 From: Dipankar Sarma Date: Tue, 7 Mar 2006 21:55:35 -0800 Subject: [PATCH] fix file counting I have benchmarked this on an x86_64 NUMA system and see no significant performance difference on kernbench. Tested on both x86_64 and powerpc. The way we do file struct accounting is not very suitable for batched freeing. For scalability reasons, file accounting was constructor/destructor based. This meant that nr_files was decremented only when the object was removed from the slab cache. This is susceptible to slab fragmentation. With an RCU-based file structure and consequent batched freeing, a test program like Serge's just speeds this up, and we end up with a very fragmented slab - llm22:~ # cat /proc/sys/fs/file-nr 587730 0 758844 At the same time, I see only 2000+ objects in the filp cache. The following patch fixes this problem. This patch changes the file counting by removing the filp_count_lock. Instead we use a separate percpu counter, nr_files, for now, and all accesses to it are through the get_nr_files() API. In the sysctl handler for nr_files, we populate files_stat.nr_files before returning to userspace. Counting files as and when they are created and destroyed (as opposed to inside slab) allows us to correctly count open files with RCU. Signed-off-by: Dipankar Sarma Cc: "Paul E. McKenney" Cc: "David S.
Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index de2d9109194e..32b48e8ee36e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -50,6 +50,9 @@ #include #include +extern int proc_nr_files(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); + #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ @@ -943,7 +946,7 @@ static ctl_table fs_table[] = { .data = &files_stat, .maxlen = 3*sizeof(int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_nr_files, }, { .ctl_name = FS_MAXFILE, -- cgit v1.2.3 From 7cd9013be6c22f3ff6f777354f766c8c0b955e17 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 11 Mar 2006 03:27:18 -0800 Subject: [PATCH] remove __put_task_struct_cb export again The patch '[PATCH] RCU signal handling' [1] added an export for __put_task_struct_cb, a put_task_struct helper newly introduced in that patch. But the put_task_struct couldn't be used modular previously as __put_task_struct wasn't exported. There are not callers of it in modular code, and it shouldn't be exported because we don't want drivers to hold references to task_structs. This patch removes the export and folds __put_task_struct into __put_task_struct_cb as there's no other caller. [1] http://www2.kernel.org/git/gitweb.cgi?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=e56d090310d7625ecb43a1eeebd479f04affb48b Signed-off-by: Christoph Hellwig Acked-by: Paul E. McKenney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 +++- kernel/sched.c | 7 ------- 2 files changed, 3 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index fbea12d7a943..a8eab86de7f1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -108,8 +108,10 @@ void free_task(struct task_struct *tsk) } EXPORT_SYMBOL(free_task); -void __put_task_struct(struct task_struct *tsk) +void __put_task_struct_cb(struct rcu_head *rhp) { + struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE))); WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); diff --git a/kernel/sched.c b/kernel/sched.c index e82c99f1db64..4d46e90f59c3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -178,13 +178,6 @@ static unsigned int task_timeslice(task_t *p) #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ < (long long) (sd)->cache_hot_time) -void __put_task_struct_cb(struct rcu_head *rhp) -{ - __put_task_struct(container_of(rhp, struct task_struct, rcu)); -} - -EXPORT_SYMBOL_GPL(__put_task_struct_cb); - /* * These are the runqueue data structures: */ -- cgit v1.2.3 From f9a3879abf2f1a27c39915e6074b8ff15a24cb55 Mon Sep 17 00:00:00 2001 From: GOTO Masanori Date: Mon, 13 Mar 2006 21:20:44 -0800 Subject: [PATCH] Fix sigaltstack corruption among cloned threads This patch fixes alternate signal stack corruption among cloned threads with CLONE_SIGHAND (and CLONE_VM) for linux-2.6.16-rc6. The value of alternate signal stack is currently inherited after a call of clone(... CLONE_SIGHAND | CLONE_VM). But if sigaltstack is set by a parent thread, and then if multiple cloned child threads (+ parent threads) call signal handler at the same time, some threads may be conflicted - because they share to use the same alternative signal stack region. 
Finally they get sigsegv. It's an undesirable race condition. Note that child threads created from NPTL pthread_create() also hit this conflict when the parent thread uses sigaltstack, without my patch. To fix this problem, this patch clears the child threads' sigaltstack information like exec(). This behavior follows the SUSv3 specification. In SUSv3, pthread_create() says "The alternate stack shall not be inherited (when new threads are initialized)". It means that sigaltstack should be cleared when sigaltstack memory space is shared by cloned threads with CLONE_SIGHAND. Note that I chose "if (clone_flags & CLONE_SIGHAND)" line because: - If clone_flags line is not existed, fork() does not inherit sigaltstack. - CLONE_VM is another choice, but vfork() does not inherit sigaltstack. - CLONE_SIGHAND implies CLONE_VM, and it looks suitable. - CLONE_THREAD is another candidate, and includes CLONE_SIGHAND + CLONE_VM, but this flag has a bit different semantics. I decided to use CLONE_SIGHAND. [ Changed to test for CLONE_VM && !CLONE_VFORK after discussion --Linus ] Signed-off-by: GOTO Masanori Cc: Roland McGrath Cc: Ingo Molnar Acked-by: Linus Torvalds Cc: Ulrich Drepper Cc: Jakub Jelinek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a8eab86de7f1..ccdfbb16c86d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1061,6 +1061,12 @@ static task_t *copy_process(unsigned long clone_flags, */ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; + /* + * sigaltstack should be cleared when sharing the same VM + */ + if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM) + p->sas_ss_sp = p->sas_ss_size = 0; + /* * Syscall tracing should be turned off in the child regardless * of CLONE_PTRACE. -- cgit v1.2.3 From e0e8eb54d8ae0c4cfd1d297f6351b08a7f635c5f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 16 Mar 2006 10:31:38 -0700 Subject: [PATCH] unshare: Use rcu_assign_pointer when setting sighand The sighand pointer only needs the rcu_read_lock on the read side. So only depending on task_lock protection when setting this pointer is not enough. We also need a memory barrier to ensure the initialization is seen first. Use rcu_assign_pointer as it does this for us, and clearly documents that we are setting an rcu readable pointer. Signed-off-by: Eric W. Biederman Acked-by: Paul E. McKenney Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index ccdfbb16c86d..46060cb24af0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1569,7 +1569,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) if (new_sigh) { sigh = current->sighand; - current->sighand = new_sigh; + rcu_assign_pointer(current->sighand, new_sigh); new_sigh = sigh; } -- cgit v1.2.3 From 67890d7084085e29c51afa2514036d42643fd3cf Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 16 Mar 2006 23:04:00 -0800 Subject: [PATCH] time_interpolator: add __read_mostly The pointer to the current time interpolator and the current list of time interpolators are typically only changed during bootup. Adding __read_mostly takes them away from possibly hot cachelines. 
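A sketch of the annotation's effect (illustrative only; both variables are hypothetical): __read_mostly places the object in a separate linker section, so rarely written data does not share cachelines with frequently written data.

	static int example_tunable __read_mostly = 1;	/* grouped with other read-mostly data */
	static atomic_t example_hot_counter;		/* frequently written; stays in normal .data/.bss */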
Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index bf7c4193b936..2410c18dbeb1 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1354,8 +1354,8 @@ void __init init_timers(void) #ifdef CONFIG_TIME_INTERPOLATION -struct time_interpolator *time_interpolator; -static struct time_interpolator *time_interpolator_list; +struct time_interpolator *time_interpolator __read_mostly; +static struct time_interpolator *time_interpolator_list __read_mostly; static DEFINE_SPINLOCK(time_interpolator_lock); static inline u64 time_interpolator_get_cycles(unsigned int src) -- cgit v1.2.3 From a0a0c28c1a7109d7955815074c52cac079ab3ba5 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Thu, 16 Mar 2006 23:04:01 -0800 Subject: [PATCH] posix-timers: fix requeue accounting when signal is ignored When the posix-timer signal is ignored then the timer is rearmed by the callback function. The requeue pending accounting has to be fixed up else the state might be wrong. Signed-off-by: Roman Zippel Signed-off-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-timers.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 216f574b5ffb..fa895fc2ecf5 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -353,6 +353,7 @@ static int posix_timer_fn(void *data) hrtimer_forward(&timr->it.real.timer, timr->it.real.interval); ret = HRTIMER_RESTART; + ++timr->it_requeue_pending; } } -- cgit v1.2.3 From 2d61b86775a5676a8fba2ba2f0f869564e35c630 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 18 Mar 2006 20:41:10 +0300 Subject: [PATCH] disable unshare(CLONE_VM) for now sys_unshare() does mmput(new_mm). This is not enough if we have mm->core_waiters. This patch is a temporary fix for soon to be released 2.6.16. Signed-off-by: Oleg Nesterov [ Checked with Uli: "I'm not planning to use unshare(CLONE_VM). It's not needed for any functionality planned so far. What we (as in Red Hat) need unshare() for now is the filesystem side." ] Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 46060cb24af0..b373322ca497 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1478,9 +1478,7 @@ static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp) if ((unshare_flags & CLONE_VM) && (mm && atomic_read(&mm->mm_users) > 1)) { - *new_mmp = dup_mm(current); - if (!*new_mmp) - return -ENOMEM; + return -EINVAL; } return 0; -- cgit v1.2.3
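As a closing illustration of the unshare_vm() change above (assumed userspace probe, not from any patch here; uses the raw syscall since contemporary libcs lacked an unshare() wrapper):

	#define _GNU_SOURCE		/* for CLONE_VM in <sched.h> */
	#include <errno.h>
	#include <sched.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		long ret = syscall(__NR_unshare, CLONE_VM);

		/*
		 * With the patch above, a caller whose mm is shared
		 * (mm_users > 1) gets EINVAL instead of a duplicated mm;
		 * a single-threaded caller still succeeds.
		 */
		return (ret == -1 && errno == EINVAL) ? 0 : 1;
	}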