From 4b5d37ac02954572e80e09255bb5737277aaee8e Mon Sep 17 00:00:00 2001 From: Giancarlo Formicuccia Date: Fri, 9 Sep 2005 13:01:22 -0700 Subject: [PATCH] Clear task_struct->fs_excl on fork() An oversight. We don't want to carry the IO scheduler's "we hold exclusive fs resources" hint over to the child across fork(). Acked-by: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 7e1ead9a6ba4..dfeadf466f18 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -176,6 +176,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) /* One for us, one for whoever does the "release_task()" (usually parent) */ atomic_set(&tsk->usage,2); + atomic_set(&tsk->fs_excl, 0); return tsk; } -- cgit v1.2.3 From b0d62e6d5b3318b6b722121d945afa295f7201b5 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 9 Sep 2005 13:02:01 -0700 Subject: [PATCH] fix disassociate_ctty vs. fork race Race is as follows. Process A forks process B, both being part of the same session. Then, A calls disassociate_ctty while B forks C: A B ==== ==== fork() copy_signal() dissasociate_ctty() .... attach_pid(p, PIDTYPE_SID, p->signal->session); Now, C can have current->signal->tty pointing to a freed tty structure, as it hasn't yet been added to the session group (to have its controlling tty cleared on the diassociate_ctty() call). This has shown up as an oops but could be even more serious. I haven't tried to create a test case, but a customer has verified that the patch below resolves the issue, which was occuring quite frequently. I'll try and post the test case if i can. The patch simply checks for a NULL tty *after* it has been attached to the proper session group and clears it as necessary. Alternatively, we could simply do the tty assignment after the the process is added to the proper session group. Signed-off-by: Jason Baron Cc: Roland McGrath Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index dfeadf466f18..b25802065031 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1116,6 +1116,9 @@ static task_t *copy_process(unsigned long clone_flags, __get_cpu_var(process_counts)++; } + if (!current->signal->tty && p->signal->tty) + p->signal->tty = NULL; + nr_threads++; total_forks++; write_unlock_irq(&tasklist_lock); -- cgit v1.2.3 From 383f2835eb9afb723af71850037b2f074ac9db60 Mon Sep 17 00:00:00 2001 From: "Chen, Kenneth W" Date: Fri, 9 Sep 2005 13:02:02 -0700 Subject: [PATCH] Prefetch kernel stacks to speed up context switch For architecture like ia64, the switch stack structure is fairly large (currently 528 bytes). For context switch intensive application, we found that significant amount of cache misses occurs in switch_to() function. The following patch adds a hook in the schedule() function to prefetch switch stack structure as soon as 'next' task is determined. This allows maximum overlap in prefetch cache lines for that structure. Signed-off-by: Ken Chen Cc: Ingo Molnar Cc: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 18b95520a2e2..2632b812cf24 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2888,6 +2888,7 @@ switch_tasks: if (next == rq->idle) schedstat_inc(rq, sched_goidle); prefetch(next); + prefetch_stack(next); clear_tsk_need_resched(prev); rcu_qsctr_inc(task_cpu(prev)); -- cgit v1.2.3 From 73a358d1892a8233801e3fd54668075b52ec42da Mon Sep 17 00:00:00 2001 From: KUROSAWA Takahiro Date: Fri, 9 Sep 2005 13:02:10 -0700 Subject: [PATCH] fix for cpusets minor problem This patch fixes minor problem that the CPUSETS have when files in the cpuset filesystem are read after lseek()-ed beyond the EOF. Signed-off-by: KUROSAWA Takahiro Acked-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1f06e7690106..712d02029971 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -972,6 +972,10 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, *s++ = '\n'; *s = '\0'; + /* Do nothing if *ppos is at the eof or beyond the eof. */ + if (s - page <= *ppos) + return 0; + start = page + *ppos; n = s - start; retval = n - copy_to_user(buf, start, min(n, nbytes)); -- cgit v1.2.3 From c0dfb2905126e9e94edebbce8d3e05001301f52d Mon Sep 17 00:00:00 2001 From: Dipankar Sarma Date: Fri, 9 Sep 2005 13:04:09 -0700 Subject: [PATCH] files: rcuref APIs Adds a set of primitives to do reference counting for objects that are looked up without locks using RCU. Signed-off-by: Ravikiran Thirumalai Signed-off-by: Dipankar Sarma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcupdate.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index f436993bd590..bef3b6901b76 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -45,6 +45,7 @@ #include #include #include +#include #include /* Definition for rcupdate control block. */ @@ -72,6 +73,19 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; static int maxbatch = 10; +#ifndef __HAVE_ARCH_CMPXCHG +/* + * We use an array of spinlocks for the rcurefs -- similar to ones in sparc + * 32 bit atomic_t implementations, and a hash function similar to that + * for our refcounting needs. + * Can't help multiprocessors which donot have cmpxchg :( + */ + +spinlock_t __rcuref_hash[RCUREF_HASH_SIZE] = { + [0 ... (RCUREF_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED +}; +#endif + /** * call_rcu - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. -- cgit v1.2.3 From badf16621c1f9d1ac753be056fce11b43d6e0be5 Mon Sep 17 00:00:00 2001 From: Dipankar Sarma Date: Fri, 9 Sep 2005 13:04:10 -0700 Subject: [PATCH] files: break up files struct In order for the RCU to work, the file table array, sets and their sizes must be updated atomically. Instead of ensuring this through too many memory barriers, we put the arrays and their sizes in a separate structure. This patch takes the first step of putting the file table elements in a separate structure fdtable that is embedded withing files_struct. It also changes all the users to refer to the file table using files_fdtable() macro. Subsequent applciation of RCU becomes easier after this. Signed-off-by: Dipankar Sarma Signed-Off-By: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 21 +++++++++------ kernel/fork.c | 82 ++++++++++++++++++++++++++++++++++++----------------------- 2 files changed, 63 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 5b0fb9f09f21..83beb1e93b18 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -368,17 +368,19 @@ EXPORT_SYMBOL(daemonize); static inline void close_files(struct files_struct * files) { int i, j; + struct fdtable *fdt; j = 0; + fdt = files_fdtable(files); for (;;) { unsigned long set; i = j * __NFDBITS; - if (i >= files->max_fdset || i >= files->max_fds) + if (i >= fdt->max_fdset || i >= fdt->max_fds) break; - set = files->open_fds->fds_bits[j++]; + set = fdt->open_fds->fds_bits[j++]; while (set) { if (set & 1) { - struct file * file = xchg(&files->fd[i], NULL); + struct file * file = xchg(&fdt->fd[i], NULL); if (file) filp_close(file, files); } @@ -403,16 +405,19 @@ struct files_struct *get_files_struct(struct task_struct *task) void fastcall put_files_struct(struct files_struct *files) { + struct fdtable *fdt; + if (atomic_dec_and_test(&files->count)) { close_files(files); /* * Free the fd and fdset arrays if we expanded them. */ - if (files->fd != &files->fd_array[0]) - free_fd_array(files->fd, files->max_fds); - if (files->max_fdset > __FD_SETSIZE) { - free_fdset(files->open_fds, files->max_fdset); - free_fdset(files->close_on_exec, files->max_fdset); + fdt = files_fdtable(files); + if (fdt->fd != &files->fd_array[0]) + free_fd_array(fdt->fd, fdt->max_fds); + if (fdt->max_fdset > __FD_SETSIZE) { + free_fdset(fdt->open_fds, fdt->max_fdset); + free_fdset(fdt->close_on_exec, fdt->max_fdset); } kmem_cache_free(files_cachep, files); } diff --git a/kernel/fork.c b/kernel/fork.c index b25802065031..ecc694debb50 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -568,21 +568,47 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) static int count_open_files(struct files_struct *files, int size) { int i; + struct fdtable *fdt; /* Find the last open fd */ + fdt = files_fdtable(files); for (i = size/(8*sizeof(long)); i > 0; ) { - if (files->open_fds->fds_bits[--i]) + if (fdt->open_fds->fds_bits[--i]) break; } i = (i+1) * 8 * sizeof(long); return i; } +static struct files_struct *alloc_files(void) +{ + struct files_struct *newf; + struct fdtable *fdt; + + newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); + if (!newf) + goto out; + + atomic_set(&newf->count, 1); + + spin_lock_init(&newf->file_lock); + fdt = files_fdtable(newf); + fdt->next_fd = 0; + fdt->max_fds = NR_OPEN_DEFAULT; + fdt->max_fdset = __FD_SETSIZE; + fdt->close_on_exec = &newf->close_on_exec_init; + fdt->open_fds = &newf->open_fds_init; + fdt->fd = &newf->fd_array[0]; +out: + return newf; +} + static int copy_files(unsigned long clone_flags, struct task_struct * tsk) { struct files_struct *oldf, *newf; struct file **old_fds, **new_fds; int open_files, size, i, error = 0, expand; + struct fdtable *old_fdt, *new_fdt; /* * A background process may not have any files ... @@ -603,35 +629,27 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) */ tsk->files = NULL; error = -ENOMEM; - newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); - if (!newf) + newf = alloc_files(); + if (!newf) goto out; - atomic_set(&newf->count, 1); - - spin_lock_init(&newf->file_lock); - newf->next_fd = 0; - newf->max_fds = NR_OPEN_DEFAULT; - newf->max_fdset = __FD_SETSIZE; - newf->close_on_exec = &newf->close_on_exec_init; - newf->open_fds = &newf->open_fds_init; - newf->fd = &newf->fd_array[0]; - spin_lock(&oldf->file_lock); - - open_files = count_open_files(oldf, oldf->max_fdset); + old_fdt = files_fdtable(oldf); + new_fdt = files_fdtable(newf); + size = old_fdt->max_fdset; + open_files = count_open_files(oldf, old_fdt->max_fdset); expand = 0; /* * Check whether we need to allocate a larger fd array or fd set. * Note: we're not a clone task, so the open count won't change. */ - if (open_files > newf->max_fdset) { - newf->max_fdset = 0; + if (open_files > new_fdt->max_fdset) { + new_fdt->max_fdset = 0; expand = 1; } - if (open_files > newf->max_fds) { - newf->max_fds = 0; + if (open_files > new_fdt->max_fds) { + new_fdt->max_fds = 0; expand = 1; } @@ -646,11 +664,11 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) spin_lock(&oldf->file_lock); } - old_fds = oldf->fd; - new_fds = newf->fd; + old_fds = old_fdt->fd; + new_fds = new_fdt->fd; - memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8); - memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8); + memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8); + memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8); for (i = open_files; i != 0; i--) { struct file *f = *old_fds++; @@ -663,24 +681,24 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) * is partway through open(). So make sure that this * fd is available to the new process. */ - FD_CLR(open_files - i, newf->open_fds); + FD_CLR(open_files - i, new_fdt->open_fds); } *new_fds++ = f; } spin_unlock(&oldf->file_lock); /* compute the remainder to be cleared */ - size = (newf->max_fds - open_files) * sizeof(struct file *); + size = (new_fdt->max_fds - open_files) * sizeof(struct file *); /* This is long word aligned thus could use a optimized version */ memset(new_fds, 0, size); - if (newf->max_fdset > open_files) { - int left = (newf->max_fdset-open_files)/8; + if (new_fdt->max_fdset > open_files) { + int left = (new_fdt->max_fdset-open_files)/8; int start = open_files / (8 * sizeof(unsigned long)); - memset(&newf->open_fds->fds_bits[start], 0, left); - memset(&newf->close_on_exec->fds_bits[start], 0, left); + memset(&new_fdt->open_fds->fds_bits[start], 0, left); + memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); } tsk->files = newf; @@ -689,9 +707,9 @@ out: return error; out_release: - free_fdset (newf->close_on_exec, newf->max_fdset); - free_fdset (newf->open_fds, newf->max_fdset); - free_fd_array(newf->fd, newf->max_fds); + free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset); + free_fdset (new_fdt->open_fds, new_fdt->max_fdset); + free_fd_array(new_fdt->fd, new_fdt->max_fds); kmem_cache_free(files_cachep, newf); goto out; } -- cgit v1.2.3 From ab2af1f5005069321c5d130f09cce577b03f43ef Mon Sep 17 00:00:00 2001 From: Dipankar Sarma Date: Fri, 9 Sep 2005 13:04:13 -0700 Subject: [PATCH] files: files struct with RCU Patch to eliminate struct files_struct.file_lock spinlock on the reader side and use rcu refcounting rcuref_xxx api for the f_count refcounter. The updates to the fdtable are done by allocating a new fdtable structure and setting files->fdt to point to the new structure. The fdtable structure is protected by RCU thereby allowing lock-free lookup. For fd arrays/sets that are vmalloced, we use keventd to free them since RCU callbacks can't sleep. A global list of fdtable to be freed is not scalable, so we use a per-cpu list. If keventd is already handling the current cpu's work, we use a timer to defer queueing of that work. Since the last publication, this patch has been re-written to avoid using explicit memory barriers and use rcu_assign_pointer(), rcu_dereference() premitives instead. This required that the fd information is kept in a separate structure (fdtable) and updated atomically. Signed-off-by: Dipankar Sarma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 15 ++++++++------- kernel/fork.c | 23 +++++++++++++++++------ 2 files changed, 25 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 83beb1e93b18..6d2089a1bce7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -411,15 +411,16 @@ void fastcall put_files_struct(struct files_struct *files) close_files(files); /* * Free the fd and fdset arrays if we expanded them. + * If the fdtable was embedded, pass files for freeing + * at the end of the RCU grace period. Otherwise, + * you can free files immediately. */ fdt = files_fdtable(files); - if (fdt->fd != &files->fd_array[0]) - free_fd_array(fdt->fd, fdt->max_fds); - if (fdt->max_fdset > __FD_SETSIZE) { - free_fdset(fdt->open_fds, fdt->max_fdset); - free_fdset(fdt->close_on_exec, fdt->max_fdset); - } - kmem_cache_free(files_cachep, files); + if (fdt == &files->fdtab) + fdt->free_files = files; + else + kmem_cache_free(files_cachep, files); + free_fdtable(fdt); } } diff --git a/kernel/fork.c b/kernel/fork.c index ecc694debb50..8149f3602881 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -565,13 +566,12 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) return 0; } -static int count_open_files(struct files_struct *files, int size) +static int count_open_files(struct fdtable *fdt) { + int size = fdt->max_fdset; int i; - struct fdtable *fdt; /* Find the last open fd */ - fdt = files_fdtable(files); for (i = size/(8*sizeof(long)); i > 0; ) { if (fdt->open_fds->fds_bits[--i]) break; @@ -592,13 +592,17 @@ static struct files_struct *alloc_files(void) atomic_set(&newf->count, 1); spin_lock_init(&newf->file_lock); - fdt = files_fdtable(newf); + fdt = &newf->fdtab; fdt->next_fd = 0; fdt->max_fds = NR_OPEN_DEFAULT; fdt->max_fdset = __FD_SETSIZE; fdt->close_on_exec = &newf->close_on_exec_init; fdt->open_fds = &newf->open_fds_init; fdt->fd = &newf->fd_array[0]; + INIT_RCU_HEAD(&fdt->rcu); + fdt->free_files = NULL; + fdt->next = NULL; + rcu_assign_pointer(newf->fdt, fdt); out: return newf; } @@ -637,7 +641,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) old_fdt = files_fdtable(oldf); new_fdt = files_fdtable(newf); size = old_fdt->max_fdset; - open_files = count_open_files(oldf, old_fdt->max_fdset); + open_files = count_open_files(old_fdt); expand = 0; /* @@ -661,7 +665,14 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) spin_unlock(&newf->file_lock); if (error < 0) goto out_release; + new_fdt = files_fdtable(newf); + /* + * Reacquire the oldf lock and a pointer to its fd table + * who knows it may have a new bigger fd table. We need + * the latest pointer. + */ spin_lock(&oldf->file_lock); + old_fdt = files_fdtable(oldf); } old_fds = old_fdt->fd; @@ -683,7 +694,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) */ FD_CLR(open_files - i, new_fdt->open_fds); } - *new_fds++ = f; + rcu_assign_pointer(*new_fds++, f); } spin_unlock(&oldf->file_lock); -- cgit v1.2.3 From fb1c8f93d869b34cacb8b8932e2b83d96a19d720 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 10 Sep 2005 00:25:56 -0700 Subject: [PATCH] spinlock consolidation This patch (written by me and also containing many suggestions of Arjan van de Ven) does a major cleanup of the spinlock code. It does the following things: - consolidates and enhances the spinlock/rwlock debugging code - simplifies the asm/spinlock.h files - encapsulates the raw spinlock type and moves generic spinlock features (such as ->break_lock) into the generic code. - cleans up the spinlock code hierarchy to get rid of the spaghetti. Most notably there's now only a single variant of the debugging code, located in lib/spinlock_debug.c. (previously we had one SMP debugging variant per architecture, plus a separate generic one for UP builds) Also, i've enhanced the rwlock debugging facility, it will now track write-owners. There is new spinlock-owner/CPU-tracking on SMP builds too. All locks have lockup detection now, which will work for both soft and hard spin/rwlock lockups. The arch-level include files now only contain the minimally necessary subset of the spinlock code - all the rest that can be generalized now lives in the generic headers: include/asm-i386/spinlock_types.h | 16 include/asm-x86_64/spinlock_types.h | 16 I have also split up the various spinlock variants into separate files, making it easier to see which does what. The new layout is: SMP | UP ----------------------------|----------------------------------- asm/spinlock_types_smp.h | linux/spinlock_types_up.h linux/spinlock_types.h | linux/spinlock_types.h asm/spinlock_smp.h | linux/spinlock_up.h linux/spinlock_api_smp.h | linux/spinlock_api_up.h linux/spinlock.h | linux/spinlock.h /* * here's the role of the various spinlock/rwlock related include files: * * on SMP builds: * * asm/spinlock_types.h: contains the raw_spinlock_t/raw_rwlock_t and the * initializers * * linux/spinlock_types.h: * defines the generic type and initializers * * asm/spinlock.h: contains the __raw_spin_*()/etc. lowlevel * implementations, mostly inline assembly code * * (also included on UP-debug builds:) * * linux/spinlock_api_smp.h: * contains the prototypes for the _spin_*() APIs. * * linux/spinlock.h: builds the final spin_*() APIs. * * on UP builds: * * linux/spinlock_type_up.h: * contains the generic, simplified UP spinlock type. * (which is an empty structure on non-debug builds) * * linux/spinlock_types.h: * defines the generic type and initializers * * linux/spinlock_up.h: * contains the __raw_spin_*()/etc. version of UP * builds. (which are NOPs on non-debug, non-preempt * builds) * * (included on UP-non-debug builds:) * * linux/spinlock_api_up.h: * builds the _spin_*() APIs. * * linux/spinlock.h: builds the final spin_*() APIs. */ All SMP and UP architectures are converted by this patch. arm, i386, ia64, ppc, ppc64, s390/s390x, x64 was build-tested via crosscompilers. m32r, mips, sh, sparc, have not been tested yet, but should be mostly fine. From: Grant Grundler Booted and lightly tested on a500-44 (64-bit, SMP kernel, dual CPU). Builds 32-bit SMP kernel (not booted or tested). I did not try to build non-SMP kernels. That should be trivial to fix up later if necessary. I converted bit ops atomic_hash lock to raw_spinlock_t. Doing so avoids some ugly nesting of linux/*.h and asm/*.h files. Those particular locks are well tested and contained entirely inside arch specific code. I do NOT expect any new issues to arise with them. If someone does ever need to use debug/metrics with them, then they will need to unravel this hairball between spinlocks, atomic ops, and bit ops that exist only because parisc has exactly one atomic instruction: LDCW (load and clear word). From: "Luck, Tony" ia64 fix Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Grant Grundler Cc: Matthew Wilcox Signed-off-by: Hirokazu Takata Signed-off-by: Mikael Pettersson Signed-off-by: Benoit Boissinot Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 1 + kernel/sched.c | 4 ++++ kernel/spinlock.c | 15 +++++++++------ 3 files changed, 14 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 8d57a2f1226b..ff4dc02ce170 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -12,6 +12,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o +obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_KALLSYMS) += kallsyms.o diff --git a/kernel/sched.c b/kernel/sched.c index 2632b812cf24..15db82116e19 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1511,6 +1511,10 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev) * Manfred Spraul */ prev_task_flags = prev->flags; +#ifdef CONFIG_DEBUG_SPINLOCK + /* this is a valid case when another task releases the spinlock */ + rq->lock.owner = current; +#endif finish_arch_switch(prev); finish_lock_switch(rq, prev); if (mm) diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 0c3f9d8bbe17..0375fcd5921d 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -3,7 +3,10 @@ * * Author: Zwane Mwaikambo * - * Copyright (2004) Ingo Molnar + * Copyright (2004, 2005) Ingo Molnar + * + * This file contains the spinlock/rwlock implementations for the + * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) */ #include @@ -17,12 +20,12 @@ * Generic declaration of the raw read_trylock() function, * architectures are supposed to optimize this: */ -int __lockfunc generic_raw_read_trylock(rwlock_t *lock) +int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock) { - _raw_read_lock(lock); + __raw_read_lock(lock); return 1; } -EXPORT_SYMBOL(generic_raw_read_trylock); +EXPORT_SYMBOL(generic__raw_read_trylock); int __lockfunc _spin_trylock(spinlock_t *lock) { @@ -57,7 +60,7 @@ int __lockfunc _write_trylock(rwlock_t *lock) } EXPORT_SYMBOL(_write_trylock); -#ifndef CONFIG_PREEMPT +#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) void __lockfunc _read_lock(rwlock_t *lock) { @@ -72,7 +75,7 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) local_irq_save(flags); preempt_disable(); - _raw_spin_lock_flags(lock, flags); + _raw_spin_lock_flags(lock, &flags); return flags; } EXPORT_SYMBOL(_spin_lock_irqsave); -- cgit v1.2.3 From 4247bdc60048018b98f71228b45cfbc5f5270c86 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Sat, 10 Sep 2005 00:26:06 -0700 Subject: [PATCH] cpuset semaphore depth check deadlock fix The cpusets-formalize-intermediate-gfp_kernel-containment patch has a deadlock problem. This patch was part of a set of four patches to make more extensive use of the cpuset 'mem_exclusive' attribute to manage kernel GFP_KERNEL memory allocations and to constrain the out-of-memory (oom) killer. A task that is changing cpusets in particular ways on a system when it is very short of free memory could double trip over the global cpuset_sem semaphore (get the lock and then deadlock trying to get it again). The second attempt to get cpuset_sem would be in the routine cpuset_zone_allowed(). This was discovered by code inspection. I can not reproduce the problem except with an artifically hacked kernel and a specialized stress test. In real life you cannot hit this unless you are manipulating cpusets, and are very unlikely to hit it unless you are rapidly modifying cpusets on a memory tight system. Even then it would be a rare occurence. If you did hit it, the task double tripping over cpuset_sem would deadlock in the kernel, and any other task also trying to manipulate cpusets would deadlock there too, on cpuset_sem. Your batch manager would be wedged solid (if it was cpuset savvy), but classic Unix shells and utilities would work well enough to reboot the system. The unusual condition that led to this bug is that unlike most semaphores, cpuset_sem _can_ be acquired while in the page allocation code, when __alloc_pages() calls cpuset_zone_allowed. So it easy to mistakenly perform the following sequence: 1) task makes system call to alter a cpuset 2) take cpuset_sem 3) try to allocate memory 4) memory allocator, via cpuset_zone_allowed, trys to take cpuset_sem 5) deadlock The reason that this is not a serious bug for most users is that almost all calls to allocate memory don't require taking cpuset_sem. Only some code paths off the beaten track require taking cpuset_sem -- which is good. Taking a global semaphore on the main code path for allocating memory would not scale well. This patch fixes this deadlock by wrapping the up() and down() calls on cpuset_sem in kernel/cpuset.c with code that tracks the nesting depth of the current task on that semaphore, and only does the real down() if the task doesn't hold the lock already, and only does the real up() if the nesting depth (number of unmatched downs) is exactly one. The previous required use of refresh_mems(), anytime that the cpuset_sem semaphore was acquired and the code executed while holding that semaphore might try to allocate memory, is no longer required. Two refresh_mems() calls were removed thanks to this. This is a good change, as failing to get all the necessary refresh_mems() calls placed was a primary source of bugs in this cpuset code. The only remaining call to refresh_mems() is made while doing a memory allocation, if certain task memory placement data needs to be updated from its cpuset, due to the cpuset having been changed behind the tasks back. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 100 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 60 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 712d02029971..407b5f0a8c8e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -181,6 +181,37 @@ static struct super_block *cpuset_sb = NULL; static DECLARE_MUTEX(cpuset_sem); +/* + * The global cpuset semaphore cpuset_sem can be needed by the + * memory allocator to update a tasks mems_allowed (see the calls + * to cpuset_update_current_mems_allowed()) or to walk up the + * cpuset hierarchy to find a mem_exclusive cpuset see the calls + * to cpuset_excl_nodes_overlap()). + * + * But if the memory allocation is being done by cpuset.c code, it + * usually already holds cpuset_sem. Double tripping on a kernel + * semaphore deadlocks the current task, and any other task that + * subsequently tries to obtain the lock. + * + * Run all up's and down's on cpuset_sem through the following + * wrappers, which will detect this nested locking, and avoid + * deadlocking. + */ + +static inline void cpuset_down(struct semaphore *psem) +{ + if (current->cpuset_sem_nest_depth == 0) + down(psem); + current->cpuset_sem_nest_depth++; +} + +static inline void cpuset_up(struct semaphore *psem) +{ + current->cpuset_sem_nest_depth--; + if (current->cpuset_sem_nest_depth == 0) + up(psem); +} + /* * A couple of forward declarations required, due to cyclic reference loop: * cpuset_mkdir -> cpuset_create -> cpuset_populate_dir -> cpuset_add_file @@ -522,19 +553,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) * Refresh current tasks mems_allowed and mems_generation from * current tasks cpuset. Call with cpuset_sem held. * - * Be sure to call refresh_mems() on any cpuset operation which - * (1) holds cpuset_sem, and (2) might possibly alloc memory. - * Call after obtaining cpuset_sem lock, before any possible - * allocation. Otherwise one risks trying to allocate memory - * while the task cpuset_mems_generation is not the same as - * the mems_generation in its cpuset, which would deadlock on - * cpuset_sem in cpuset_update_current_mems_allowed(). - * - * Since we hold cpuset_sem, once refresh_mems() is called, the - * test (current->cpuset_mems_generation != cs->mems_generation) - * in cpuset_update_current_mems_allowed() will remain false, - * until we drop cpuset_sem. Anyone else who would change our - * cpusets mems_generation needs to lock cpuset_sem first. + * This routine is needed to update the per-task mems_allowed + * data, within the tasks context, when it is trying to allocate + * memory (in various mm/mempolicy.c routines) and notices + * that some other task has been modifying its cpuset. */ static void refresh_mems(void) @@ -840,7 +862,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us } buffer[nbytes] = 0; /* nul-terminate */ - down(&cpuset_sem); + cpuset_down(&cpuset_sem); if (is_removed(cs)) { retval = -ENODEV; @@ -874,7 +896,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us if (retval == 0) retval = nbytes; out2: - up(&cpuset_sem); + cpuset_up(&cpuset_sem); cpuset_release_agent(pathbuf); out1: kfree(buffer); @@ -914,9 +936,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) { cpumask_t mask; - down(&cpuset_sem); + cpuset_down(&cpuset_sem); mask = cs->cpus_allowed; - up(&cpuset_sem); + cpuset_up(&cpuset_sem); return cpulist_scnprintf(page, PAGE_SIZE, mask); } @@ -925,9 +947,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) { nodemask_t mask; - down(&cpuset_sem); + cpuset_down(&cpuset_sem); mask = cs->mems_allowed; - up(&cpuset_sem); + cpuset_up(&cpuset_sem); return nodelist_scnprintf(page, PAGE_SIZE, mask); } @@ -1334,8 +1356,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) if (!cs) return -ENOMEM; - down(&cpuset_sem); - refresh_mems(); + cpuset_down(&cpuset_sem); cs->flags = 0; if (notify_on_release(parent)) set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); @@ -1360,14 +1381,14 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) * will down() this new directory's i_sem and if we race with * another mkdir, we might deadlock. */ - up(&cpuset_sem); + cpuset_up(&cpuset_sem); err = cpuset_populate_dir(cs->dentry); /* If err < 0, we have a half-filled directory - oh well ;) */ return 0; err: list_del(&cs->sibling); - up(&cpuset_sem); + cpuset_up(&cpuset_sem); kfree(cs); return err; } @@ -1389,14 +1410,13 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) /* the vfs holds both inode->i_sem already */ - down(&cpuset_sem); - refresh_mems(); + cpuset_down(&cpuset_sem); if (atomic_read(&cs->count) > 0) { - up(&cpuset_sem); + cpuset_up(&cpuset_sem); return -EBUSY; } if (!list_empty(&cs->children)) { - up(&cpuset_sem); + cpuset_up(&cpuset_sem); return -EBUSY; } parent = cs->parent; @@ -1412,7 +1432,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) spin_unlock(&d->d_lock); cpuset_d_remove_dir(d); dput(d); - up(&cpuset_sem); + cpuset_up(&cpuset_sem); cpuset_release_agent(pathbuf); return 0; } @@ -1515,10 +1535,10 @@ void cpuset_exit(struct task_struct *tsk) if (notify_on_release(cs)) { char *pathbuf = NULL; - down(&cpuset_sem); + cpuset_down(&cpuset_sem); if (atomic_dec_and_test(&cs->count)) check_for_release(cs, &pathbuf); - up(&cpuset_sem); + cpuset_up(&cpuset_sem); cpuset_release_agent(pathbuf); } else { atomic_dec(&cs->count); @@ -1539,11 +1559,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) { cpumask_t mask; - down(&cpuset_sem); + cpuset_down(&cpuset_sem); task_lock((struct task_struct *)tsk); guarantee_online_cpus(tsk->cpuset, &mask); task_unlock((struct task_struct *)tsk); - up(&cpuset_sem); + cpuset_up(&cpuset_sem); return mask; } @@ -1568,9 +1588,9 @@ void cpuset_update_current_mems_allowed(void) if (!cs) return; /* task is exiting */ if (current->cpuset_mems_generation != cs->mems_generation) { - down(&cpuset_sem); + cpuset_down(&cpuset_sem); refresh_mems(); - up(&cpuset_sem); + cpuset_up(&cpuset_sem); } } @@ -1669,14 +1689,14 @@ int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask) return 0; /* Not hardwall and node outside mems_allowed: scan up cpusets */ - down(&cpuset_sem); + cpuset_down(&cpuset_sem); cs = current->cpuset; if (!cs) goto done; /* current task exiting */ cs = nearest_exclusive_ancestor(cs); allowed = node_isset(node, cs->mems_allowed); done: - up(&cpuset_sem); + cpuset_up(&cpuset_sem); return allowed; } @@ -1697,7 +1717,7 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p) const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ int overlap = 0; /* do cpusets overlap? */ - down(&cpuset_sem); + cpuset_down(&cpuset_sem); cs1 = current->cpuset; if (!cs1) goto done; /* current task exiting */ @@ -1708,7 +1728,7 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p) cs2 = nearest_exclusive_ancestor(cs2); overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); done: - up(&cpuset_sem); + cpuset_up(&cpuset_sem); return overlap; } @@ -1731,7 +1751,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) return -ENOMEM; tsk = m->private; - down(&cpuset_sem); + cpuset_down(&cpuset_sem); task_lock(tsk); cs = tsk->cpuset; task_unlock(tsk); @@ -1746,7 +1766,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) seq_puts(m, buf); seq_putc(m, '\n'); out: - up(&cpuset_sem); + cpuset_up(&cpuset_sem); kfree(buf); return retval; } -- cgit v1.2.3 From fc38ed7531eefa332c8c69ee288487860cd6b426 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Sat, 10 Sep 2005 00:26:08 -0700 Subject: [PATCH] sched: run SCHED_NORMAL tasks with real time tasks on SMT siblings The hyperthread aware nice handling currently puts to sleep any non real time task when a real time task is running on its sibling cpu. This can lead to prolonged starvation by having the non real time task pegged to the cpu with load balancing not pulling that task away. Currently we force lower priority hyperthread tasks to run a percentage of time difference based on timeslice differences which is meaningless when comparing real time tasks to SCHED_NORMAL tasks. We can allow non real time tasks to run with real time tasks on the sibling up to per_cpu_gain% if we use jiffies as a counter. Cleanups and micro-optimisations to the relevant code section should make it more understandable as well. Signed-off-by: Con Kolivas Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 65 ++++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 47 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 15db82116e19..ef748e691608 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2580,6 +2580,13 @@ out: } #ifdef CONFIG_SCHED_SMT +static inline void wakeup_busy_runqueue(runqueue_t *rq) +{ + /* If an SMT runqueue is sleeping due to priority reasons wake it up */ + if (rq->curr == rq->idle && rq->nr_running) + resched_task(rq->idle); +} + static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) { struct sched_domain *tmp, *sd = NULL; @@ -2613,12 +2620,7 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) for_each_cpu_mask(i, sibling_map) { runqueue_t *smt_rq = cpu_rq(i); - /* - * If an SMT sibling task is sleeping due to priority - * reasons wake it up now. - */ - if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running) - resched_task(smt_rq->idle); + wakeup_busy_runqueue(smt_rq); } for_each_cpu_mask(i, sibling_map) @@ -2672,6 +2674,10 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) runqueue_t *smt_rq = cpu_rq(i); task_t *smt_curr = smt_rq->curr; + /* Kernel threads do not participate in dependent sleeping */ + if (!p->mm || !smt_curr->mm || rt_task(p)) + goto check_smt_task; + /* * If a user task with lower static priority than the * running task on the SMT sibling is trying to schedule, @@ -2680,21 +2686,44 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) * task from using an unfair proportion of the * physical cpu's resources. -ck */ - if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > - task_timeslice(p) || rt_task(smt_curr)) && - p->mm && smt_curr->mm && !rt_task(p)) - ret = 1; + if (rt_task(smt_curr)) { + /* + * With real time tasks we run non-rt tasks only + * per_cpu_gain% of the time. + */ + if ((jiffies % DEF_TIMESLICE) > + (sd->per_cpu_gain * DEF_TIMESLICE / 100)) + ret = 1; + } else + if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / + 100) > task_timeslice(p))) + ret = 1; + +check_smt_task: + if ((!smt_curr->mm && smt_curr != smt_rq->idle) || + rt_task(smt_curr)) + continue; + if (!p->mm) { + wakeup_busy_runqueue(smt_rq); + continue; + } /* - * Reschedule a lower priority task on the SMT sibling, - * or wake it up if it has been put to sleep for priority - * reasons. + * Reschedule a lower priority task on the SMT sibling for + * it to be put to sleep, or wake it up if it has been put to + * sleep for priority reasons to see if it should run now. */ - if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > - task_timeslice(smt_curr) || rt_task(p)) && - smt_curr->mm && p->mm && !rt_task(smt_curr)) || - (smt_curr == smt_rq->idle && smt_rq->nr_running)) - resched_task(smt_curr); + if (rt_task(p)) { + if ((jiffies % DEF_TIMESLICE) > + (sd->per_cpu_gain * DEF_TIMESLICE / 100)) + resched_task(smt_curr); + } else { + if ((p->time_slice * (100 - sd->per_cpu_gain) / 100) > + task_timeslice(smt_curr)) + resched_task(smt_curr); + else + wakeup_busy_runqueue(smt_rq); + } } out_unlock: for_each_cpu_mask(i, sibling_map) -- cgit v1.2.3 From da5a5522709a030da91932d4d4c2b179a481a8c0 Mon Sep 17 00:00:00 2001 From: "M.Baris Demiray" Date: Sat, 10 Sep 2005 00:26:09 -0700 Subject: [PATCH] sched: make idlest_group/cpu cpus_allowed-aware Add relevant checks into find_idlest_group() and find_idlest_cpu() to make them return only the groups that have allowed CPUs and allowed CPUs respectively. Signed-off-by: M.Baris Demiray Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ef748e691608..bac23fb418f6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -966,8 +966,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) int local_group; int i; + /* Skip over this group if it has no CPUs allowed */ + if (!cpus_intersects(group->cpumask, p->cpus_allowed)) + goto nextgroup; + local_group = cpu_isset(this_cpu, group->cpumask); - /* XXX: put a cpus allowed check */ /* Tally up the load of all CPUs in the group */ avg_load = 0; @@ -992,6 +995,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) min_load = avg_load; idlest = group; } +nextgroup: group = group->next; } while (group != sd->groups); @@ -1003,13 +1007,18 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) /* * find_idlest_queue - find the idlest runqueue among the cpus in group. */ -static int find_idlest_cpu(struct sched_group *group, int this_cpu) +static int find_idlest_cpu(struct sched_group *group, + struct task_struct *p, int this_cpu) { + cpumask_t tmp; unsigned long load, min_load = ULONG_MAX; int idlest = -1; int i; - for_each_cpu_mask(i, group->cpumask) { + /* Traverse only the allowed CPUs */ + cpus_and(tmp, group->cpumask, p->cpus_allowed); + + for_each_cpu_mask(i, tmp) { load = source_load(i, 0); if (load < min_load || (load == min_load && i == this_cpu)) { @@ -1052,7 +1061,7 @@ static int sched_balance_self(int cpu, int flag) if (!group) goto nextlevel; - new_cpu = find_idlest_cpu(group, cpu); + new_cpu = find_idlest_cpu(group, t, cpu); if (new_cpu == -1 || new_cpu == cpu) goto nextlevel; -- cgit v1.2.3 From 95cdf3b799a481969a48d69a1a52916ad5da6694 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 10 Sep 2005 00:26:11 -0700 Subject: [PATCH] sched cleanups whitespace cleanups. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index bac23fb418f6..24eed372d280 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -875,7 +875,7 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) * smp_call_function() if an IPI is sent by the same process we are * waiting to become inactive. */ -void wait_task_inactive(task_t * p) +void wait_task_inactive(task_t *p) { unsigned long flags; runqueue_t *rq; @@ -1007,8 +1007,8 @@ nextgroup: /* * find_idlest_queue - find the idlest runqueue among the cpus in group. */ -static int find_idlest_cpu(struct sched_group *group, - struct task_struct *p, int this_cpu) +static int +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { cpumask_t tmp; unsigned long load, min_load = ULONG_MAX; @@ -1136,7 +1136,7 @@ static inline int wake_idle(int cpu, task_t *p) * * returns failure only if the task is already active. */ -static int try_to_wake_up(task_t * p, unsigned int state, int sync) +static int try_to_wake_up(task_t *p, unsigned int state, int sync) { int cpu, this_cpu, success = 0; unsigned long flags; @@ -1283,7 +1283,7 @@ out: return success; } -int fastcall wake_up_process(task_t * p) +int fastcall wake_up_process(task_t *p) { return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); @@ -1362,7 +1362,7 @@ void fastcall sched_fork(task_t *p, int clone_flags) * that must be done for every newly created context, then puts the task * on the runqueue and wakes it. */ -void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) +void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) { unsigned long flags; int this_cpu, cpu; @@ -1445,7 +1445,7 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) * artificially, because any timeslice recovered here * was given away by the parent in the first place.) */ -void fastcall sched_exit(task_t * p) +void fastcall sched_exit(task_t *p) { unsigned long flags; runqueue_t *rq; @@ -1766,7 +1766,8 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, */ static inline int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, - struct sched_domain *sd, enum idle_type idle, int *all_pinned) + struct sched_domain *sd, enum idle_type idle, + int *all_pinned) { /* * We do not migrate tasks that are: @@ -3058,7 +3059,8 @@ need_resched: #endif /* CONFIG_PREEMPT */ -int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) +int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, + void *key) { task_t *p = curr->private; return try_to_wake_up(p, mode, sync); @@ -3100,7 +3102,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, * @key: is directly passed to the wakeup function */ void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, void *key) + int nr_exclusive, void *key) { unsigned long flags; @@ -3132,7 +3134,8 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) * * On UP it can prevent extra preemption. */ -void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +void fastcall +__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) { unsigned long flags; int sync = 1; @@ -3323,7 +3326,8 @@ void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) EXPORT_SYMBOL(interruptible_sleep_on); -long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) +long fastcall __sched +interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) { SLEEP_ON_VAR @@ -3542,7 +3546,8 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) * @policy: new policy. * @param: structure containing the new RT priority. */ -int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) +int sched_setscheduler(struct task_struct *p, int policy, + struct sched_param *param) { int retval; int oldprio, oldpolicy = -1; @@ -3562,7 +3567,7 @@ recheck: * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. */ if (param->sched_priority < 0 || - (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || + (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) return -EINVAL; if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) @@ -3625,7 +3630,8 @@ recheck: } EXPORT_SYMBOL_GPL(sched_setscheduler); -static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) +static int +do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { int retval; struct sched_param lparam; @@ -3956,7 +3962,7 @@ EXPORT_SYMBOL(cond_resched); * operations here to prevent schedule() from being called twice (once via * spin_unlock(), once by hand). */ -int cond_resched_lock(spinlock_t * lock) +int cond_resched_lock(spinlock_t *lock) { int ret = 0; @@ -4139,7 +4145,7 @@ static inline struct task_struct *younger_sibling(struct task_struct *p) return list_entry(p->sibling.next,struct task_struct,sibling); } -static void show_task(task_t * p) +static void show_task(task_t *p) { task_t *relative; unsigned state; @@ -4165,7 +4171,7 @@ static void show_task(task_t * p) #endif #ifdef CONFIG_DEBUG_STACK_USAGE { - unsigned long * n = (unsigned long *) (p->thread_info+1); + unsigned long *n = (unsigned long *) (p->thread_info+1); while (!*n) n++; free = (unsigned long) n - (unsigned long)(p->thread_info+1); @@ -4374,7 +4380,7 @@ out: * thread migration by bumping thread off CPU then 'pushing' onto * another runqueue. */ -static int migration_thread(void * data) +static int migration_thread(void *data) { runqueue_t *rq; int cpu = (long)data; -- cgit v1.2.3 From d79fc0fc6645b0cf5cd980da76942ca6d6300fa4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 10 Sep 2005 00:26:12 -0700 Subject: [PATCH] sched: TASK_NONINTERACTIVE This patch implements a task state bit (TASK_NONINTERACTIVE), which can be used by blocking points to mark the task's wait as "non-interactive". This does not mean the task will be considered a CPU-hog - the wait will simply not have an effect on the waiting task's priority - positive or negative alike. Right now only pipe_wait() will make use of it, because it's a common source of not-so-interactive waits (kernel compilation jobs, etc.). Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 24eed372d280..6da13bba3e23 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1260,6 +1260,16 @@ out_activate: p->activated = -1; } + /* + * Tasks that have marked their sleep as noninteractive get + * woken up without updating their sleep average. (i.e. their + * sleep is handled in a priority-neutral manner, no priority + * boost and no penalty.) + */ + if (old_state & TASK_NONINTERACTIVE) + __activate_task(p, rq); + else + activate_task(p, rq, cpu == this_cpu); /* * Sync wakeups (i.e. those types of wakeups where the waker * has indicated that it will leave the CPU in short order) @@ -1268,7 +1278,6 @@ out_activate: * the waker guarantees that the freshly woken up task is going * to be considered on this CPU.) */ - activate_task(p, rq, cpu == this_cpu); if (!sync || cpu != this_cpu) { if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); -- cgit v1.2.3 From 67f9a619e7460b7d07284a9d0745727a77d3ade6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 10 Sep 2005 00:26:16 -0700 Subject: [PATCH] sched: fix SMT scheduler latency bug William Weston reported unusually high scheduling latencies on his x86 HT box, on the -RT kernel. I managed to reproduce it on my HT box and the latency tracer shows the incident in action: _------=> CPU# / _-----=> irqs-off | / _----=> need-resched || / _---=> hardirq/softirq ||| / _--=> preempt-depth |||| / ||||| delay cmd pid ||||| time | caller \ / ||||| \ | / du-2803 3Dnh2 0us : __trace_start_sched_wakeup (try_to_wake_up) .............................................................. ... we are running on CPU#3, PID 2778 gets woken to CPU#1: ... .............................................................. du-2803 3Dnh2 0us : __trace_start_sched_wakeup <<...>-2778> (73 1) du-2803 3Dnh2 0us : _raw_spin_unlock (try_to_wake_up) ................................................ ... still on CPU#3, we send an IPI to CPU#1: ... ................................................ du-2803 3Dnh1 0us : resched_task (try_to_wake_up) du-2803 3Dnh1 1us : smp_send_reschedule (try_to_wake_up) du-2803 3Dnh1 1us : send_IPI_mask_bitmask (smp_send_reschedule) du-2803 3Dnh1 2us : _raw_spin_unlock_irqrestore (try_to_wake_up) ............................................... ... 1 usec later, the IPI arrives on CPU#1: ... ............................................... -0 1Dnh. 2us : smp_reschedule_interrupt (c0100c5a 0 0) So far so good, this is the normal wakeup/preemption mechanism. But here comes the scheduler anomaly on CPU#1: -0 1Dnh. 2us : preempt_schedule_irq (need_resched) -0 1Dnh. 2us : preempt_schedule_irq (need_resched) -0 1Dnh. 3us : __schedule (preempt_schedule_irq) -0 1Dnh. 3us : profile_hit (__schedule) -0 1Dnh1 3us : sched_clock (__schedule) -0 1Dnh1 4us : _raw_spin_lock_irq (__schedule) -0 1Dnh1 4us : _raw_spin_lock_irqsave (__schedule) -0 1Dnh2 5us : _raw_spin_unlock (__schedule) -0 1Dnh1 5us : preempt_schedule (__schedule) -0 1Dnh1 6us : _raw_spin_lock (__schedule) -0 1Dnh2 6us : find_next_bit (__schedule) -0 1Dnh2 6us : _raw_spin_lock (__schedule) -0 1Dnh3 7us : find_next_bit (__schedule) -0 1Dnh3 7us : find_next_bit (__schedule) -0 1Dnh3 8us : _raw_spin_unlock (__schedule) -0 1Dnh2 8us : preempt_schedule (__schedule) -0 1Dnh2 8us : find_next_bit (__schedule) -0 1Dnh2 9us : trace_stop_sched_switched (__schedule) -0 1Dnh2 9us : _raw_spin_lock (trace_stop_sched_switched) -0 1Dnh3 10us : trace_stop_sched_switched <<...>-2778> (73 8c) -0 1Dnh3 10us : _raw_spin_unlock (trace_stop_sched_switched) -0 1Dnh1 10us : _raw_spin_unlock (__schedule) -0 1Dnh. 11us : local_irq_enable_noresched (preempt_schedule_irq) -0 1Dnh. 11us < (0) we didnt pick up pid 2778! It only gets scheduled much later: <...>-2778 1Dnh2 412us : __switch_to (__schedule) <...>-2778 1Dnh2 413us : __schedule <-0> (8c 73) <...>-2778 1Dnh2 413us : _raw_spin_unlock (__schedule) <...>-2778 1Dnh1 413us : trace_stop_sched_switched (__schedule) <...>-2778 1Dnh1 414us : _raw_spin_lock (trace_stop_sched_switched) <...>-2778 1Dnh2 414us : trace_stop_sched_switched <<...>-2778> (73 1) <...>-2778 1Dnh2 414us : _raw_spin_unlock (trace_stop_sched_switched) <...>-2778 1Dnh1 415us : trace_stop_sched_switched (__schedule) the reason for this anomaly is the following code in dependent_sleeper(): /* * If a user task with lower static priority than the * running task on the SMT sibling is trying to schedule, * delay it till there is proportionately less timeslice * left of the sibling task to prevent a lower priority * task from using an unfair proportion of the * physical cpu's resources. -ck */ [...] if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > task_timeslice(p))) ret = 1; Note that in contrast to the comment above, we dont actually do the check based on static priority, we do the check based on timeslices. But timeslices go up and down, and even highprio tasks can randomly have very low timeslices (just before their next refill) and can thus be judged as 'lowprio' by the above piece of code. This condition is clearly buggy. The correct test is to check for static_prio _and_ to check for the preemption priority. Even on different static priority levels, a higher-prio interactive task should not be delayed due to a higher-static-prio CPU hog. There is a symmetric bug in the 'kick SMT sibling' code of this function as well, which can be solved in a similar way. The patch below (against the current scheduler queue in -mm) fixes both bugs. I have build and boot-tested this on x86 SMT, and nice +20 tasks still get properly throttled - so the dependent-sleeper logic is still in action. btw., these bugs pessimised the SMT scheduler because the 'delay wakeup' property was applied too liberally, so this fix is likely a throughput improvement as well. I separated out a smt_slice() function to make the code easier to read. Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6da13bba3e23..c61ee3451a04 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2650,6 +2650,16 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) */ } +/* + * number of 'lost' timeslices this task wont be able to fully + * utilize, if another task runs on a sibling. This models the + * slowdown effect of other tasks running on siblings: + */ +static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) +{ + return p->time_slice * (100 - sd->per_cpu_gain) / 100; +} + static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) { struct sched_domain *tmp, *sd = NULL; @@ -2714,8 +2724,9 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) (sd->per_cpu_gain * DEF_TIMESLICE / 100)) ret = 1; } else - if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / - 100) > task_timeslice(p))) + if (smt_curr->static_prio < p->static_prio && + !TASK_PREEMPTS_CURR(p, smt_rq) && + smt_slice(smt_curr, sd) > task_timeslice(p)) ret = 1; check_smt_task: @@ -2737,8 +2748,8 @@ check_smt_task: (sd->per_cpu_gain * DEF_TIMESLICE / 100)) resched_task(smt_curr); } else { - if ((p->time_slice * (100 - sd->per_cpu_gain) / 100) > - task_timeslice(smt_curr)) + if (TASK_PREEMPTS_CURR(p, smt_rq) && + smt_slice(p, sd) > task_timeslice(smt_curr)) resched_task(smt_curr); else wakeup_busy_runqueue(smt_rq); -- cgit v1.2.3 From d6d5cfaf4551aa7713ca6ab73bb77e832602204b Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 10 Sep 2005 00:26:16 -0700 Subject: [PATCH] sched: less newidle locking Similarly to the earlier change in load_balance, only lock the runqueue in load_balance_newidle if the busiest queue found has a nr_running > 1. This will reduce frequency of expensive remote runqueue lock aquisitions in the schedule() path on some workloads. Signed-off-by: Nick Piggin Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c61ee3451a04..930189540f3b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2104,8 +2104,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, */ double_lock_balance(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, idle, - &all_pinned); + imbalance, sd, idle, &all_pinned); spin_unlock(&busiest->lock); /* All tasks on this runqueue were pinned by CPU affinity */ @@ -2200,18 +2199,22 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, BUG_ON(busiest == this_rq); - /* Attempt to move tasks */ - double_lock_balance(this_rq, busiest); - schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); - nr_moved = move_tasks(this_rq, this_cpu, busiest, + + nr_moved = 0; + if (busiest->nr_running > 1) { + /* Attempt to move tasks */ + double_lock_balance(this_rq, busiest); + nr_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, NEWLY_IDLE, NULL); + spin_unlock(&busiest->lock); + } + if (!nr_moved) schedstat_inc(sd, lb_failed[NEWLY_IDLE]); else sd->nr_balance_failed = 0; - spin_unlock(&busiest->lock); return nr_moved; out_balanced: -- cgit v1.2.3 From e17224bf1d01b461ec02a60f5a9b7657a89bdd23 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 10 Sep 2005 00:26:18 -0700 Subject: [PATCH] sched: less locking During periodic load balancing, don't hold this runqueue's lock while scanning remote runqueues, which can take a non trivial amount of time especially on very large systems. Holding the runqueue lock will only help to stabilise ->nr_running, however this doesn't do much to help because tasks being woken will simply get held up on the runqueue lock, so ->nr_running would not provide a really accurate picture of runqueue load in that case anyway. What's more, ->nr_running (and possibly the cpu_load averages) of remote runqueues won't be stable anyway, so load balancing is always an inexact operation. Signed-off-by: Nick Piggin Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 930189540f3b..8535e5c68f5b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2075,7 +2075,6 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, int nr_moved, all_pinned = 0; int active_balance = 0; - spin_lock(&this_rq->lock); schedstat_inc(sd, lb_cnt[idle]); group = find_busiest_group(sd, this_cpu, &imbalance, idle); @@ -2102,18 +2101,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, * still unbalanced. nr_moved simply stays zero, so it is * correctly treated as an imbalance. */ - double_lock_balance(this_rq, busiest); + double_rq_lock(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, idle, &all_pinned); - spin_unlock(&busiest->lock); + double_rq_unlock(this_rq, busiest); /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(all_pinned)) goto out_balanced; } - spin_unlock(&this_rq->lock); - if (!nr_moved) { schedstat_inc(sd, lb_failed[idle]); sd->nr_balance_failed++; @@ -2156,8 +2153,6 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, return nr_moved; out_balanced: - spin_unlock(&this_rq->lock); - schedstat_inc(sd, lb_balanced[idle]); sd->nr_balance_failed = 0; -- cgit v1.2.3 From 5969fe0618051e8577316555a81a6e44b7b7d640 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 10 Sep 2005 00:26:19 -0700 Subject: [PATCH] sched: HT optimisation If an idle sibling of an HT queue encounters a busy sibling, then make higher level load balancing of the non-idle variety. Performance of multiprocessor HT systems with low numbers of tasks (generally < number of virtual CPUs) can be significantly worse than the exact same workloads when running in non-HT mode. The reason is largely due to poor scheduling behaviour. This patch improves the situation, making the performance gap far less significant on one problematic test case (tbench). Signed-off-by: Nick Piggin Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8535e5c68f5b..46fdd0bb1ed6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1906,7 +1906,7 @@ out: */ static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, - unsigned long *imbalance, enum idle_type idle) + unsigned long *imbalance, enum idle_type idle, int *sd_idle) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; @@ -1931,6 +1931,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, avg_load = 0; for_each_cpu_mask(i, group->cpumask) { + if (*sd_idle && !idle_cpu(i)) + *sd_idle = 0; + /* Bias balancing toward cpus of our domain */ if (local_group) load = target_load(i, load_idx); @@ -2074,10 +2077,14 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, unsigned long imbalance; int nr_moved, all_pinned = 0; int active_balance = 0; + int sd_idle = 0; + + if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER) + sd_idle = 1; schedstat_inc(sd, lb_cnt[idle]); - group = find_busiest_group(sd, this_cpu, &imbalance, idle); + group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle); if (!group) { schedstat_inc(sd, lb_nobusyg[idle]); goto out_balanced; @@ -2150,6 +2157,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, sd->balance_interval *= 2; } + if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER) + return -1; return nr_moved; out_balanced: @@ -2161,6 +2170,8 @@ out_balanced: (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) + return -1; return 0; } @@ -2178,9 +2189,13 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, runqueue_t *busiest = NULL; unsigned long imbalance; int nr_moved = 0; + int sd_idle = 0; + + if (sd->flags & SD_SHARE_CPUPOWER) + sd_idle = 1; schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); - group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); + group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle); if (!group) { schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); goto out_balanced; @@ -2205,15 +2220,19 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, spin_unlock(&busiest->lock); } - if (!nr_moved) + if (!nr_moved) { schedstat_inc(sd, lb_failed[NEWLY_IDLE]); - else + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) + return -1; + } else sd->nr_balance_failed = 0; return nr_moved; out_balanced: schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) + return -1; sd->nr_balance_failed = 0; return 0; } @@ -2338,7 +2357,10 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, if (j - sd->last_balance >= interval) { if (load_balance(this_cpu, this_rq, sd, idle)) { - /* We've pulled tasks over so no longer idle */ + /* We've pulled tasks over so either we're no + * longer idle, or one of our SMT siblings is + * not idle. + */ idle = NOT_IDLE; } sd->last_balance += interval; -- cgit v1.2.3 From 5927ad78ec75870b1bdfa65a10ad1300cd664d36 Mon Sep 17 00:00:00 2001 From: Renaud Lienhart Date: Sat, 10 Sep 2005 00:26:20 -0700 Subject: [PATCH] sched: use cached variable in sys_sched_yield() In sys_sched_yield(), we cache current->array in the "array" variable, thus there's no need to dereference "current" again later. Signed-Off-By: Renaud Lienhart Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 46fdd0bb1ed6..103f705b245c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3938,7 +3938,7 @@ asmlinkage long sys_sched_yield(void) if (rt_task(current)) target = rq->active; - if (current->array->nr_active == 1) { + if (array->nr_active == 1) { schedstat_inc(rq, yld_act_empty); if (!rq->expired->nr_active) schedstat_inc(rq, yld_both_empty); -- cgit v1.2.3 From fa3b6ddc3f4a8eadba52234134cdb59c28b5332d Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Sat, 10 Sep 2005 00:26:21 -0700 Subject: [PATCH] sched: don't kick ALB in the presence of pinned task Jack Steiner brought this issue at my OLS talk. Take a scenario where two tasks are pinned to two HT threads in a physical package. Idle packages in the system will keep kicking migration_thread on the busy package with out any success. We will run into similar scenarios in the presence of CMP/NUMA. Signed-off-by: Suresh Siddha Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 103f705b245c..1dc29dec38a9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2125,6 +2125,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { spin_lock(&busiest->lock); + + /* don't kick the migration_thread, if the curr + * task on busiest cpu can't be moved to this_cpu + */ + if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { + spin_unlock(&busiest->lock); + all_pinned = 1; + goto out_one_pinned; + } + if (!busiest->active_balance) { busiest->active_balance = 1; busiest->push_cpu = this_cpu; @@ -2165,6 +2175,8 @@ out_balanced: schedstat_inc(sd, lb_balanced[idle]); sd->nr_balance_failed = 0; + +out_one_pinned: /* tune up the balancing interval */ if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || (sd->balance_interval < sd->max_interval)) @@ -2357,7 +2369,8 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, if (j - sd->last_balance >= interval) { if (load_balance(this_cpu, this_rq, sd, idle)) { - /* We've pulled tasks over so either we're no + /* + * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is * not idle. */ -- cgit v1.2.3 From 0c117f1b4d14380baeed9c883f765ee023da8761 Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Sat, 10 Sep 2005 00:26:21 -0700 Subject: [PATCH] sched: allow the load to grow upto its cpu_power Don't pull tasks from a group if that would cause the group's total load to drop below its total cpu_power (ie. cause the group to start going idle). Signed-off-by: Suresh Siddha Signed-off-by: Nick Piggin Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1dc29dec38a9..dbd4490afec1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1910,6 +1910,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; + unsigned long max_pull; int load_idx; max_load = this_load = total_load = total_pwr = 0; @@ -1959,7 +1960,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, group = group->next; } while (group != sd->groups); - if (!busiest || this_load >= max_load) + if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE) goto out_balanced; avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; @@ -1979,8 +1980,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * by pulling tasks to us. Be careful of negative numbers as they'll * appear as very large values with unsigned longs. */ + + /* Don't want to pull so many tasks that a group would go idle */ + max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE); + /* How much load to actually move to equalise the imbalance */ - *imbalance = min((max_load - avg_load) * busiest->cpu_power, + *imbalance = min(max_pull * busiest->cpu_power, (avg_load - this_load) * this->cpu_power) / SCHED_LOAD_SCALE; -- cgit v1.2.3 From 417ef531415c070926b071b75fd1c1ac4b6e2f7e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 10 Sep 2005 00:26:39 -0700 Subject: [PATCH] kernel/acct: add kerneldoc for kernel/acct.c: - fix typos - add kerneldoc for non-static functions Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 43 +++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index f70e6027cca9..b756f527497e 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -165,7 +165,7 @@ out: } /* - * Close the old accouting file (if currently open) and then replace + * Close the old accounting file (if currently open) and then replace * it with file (if non-NULL). * * NOTE: acct_globals.lock MUST be held on entry and exit. @@ -199,11 +199,16 @@ static void acct_file_reopen(struct file *file) } } -/* - * sys_acct() is the only system call needed to implement process - * accounting. It takes the name of the file where accounting records - * should be written. If the filename is NULL, accounting will be - * shutdown. +/** + * sys_acct - enable/disable process accounting + * @name: file name for accounting records or NULL to shutdown accounting + * + * Returns 0 for success or negative errno values for failure. + * + * sys_acct() is the only system call needed to implement process + * accounting. It takes the name of the file where accounting records + * should be written. If the filename is NULL, accounting will be + * shutdown. */ asmlinkage long sys_acct(const char __user *name) { @@ -250,9 +255,12 @@ asmlinkage long sys_acct(const char __user *name) return (0); } -/* - * If the accouting is turned on for a file in the filesystem pointed - * to by sb, turn accouting off. +/** + * acct_auto_close - turn off a filesystem's accounting if it is on + * @sb: super block for the filesystem + * + * If the accounting is turned on for a file in the filesystem pointed + * to by sb, turn accounting off. */ void acct_auto_close(struct super_block *sb) { @@ -503,8 +511,11 @@ static void do_acct_process(long exitcode, struct file *file) set_fs(fs); } -/* +/** * acct_process - now just a wrapper around do_acct_process + * @exitcode: task exit code + * + * handles process accounting for an exiting task */ void acct_process(long exitcode) { @@ -530,9 +541,9 @@ void acct_process(long exitcode) } -/* - * acct_update_integrals - * - update mm integral fields in task_struct +/** + * acct_update_integrals - update mm integral fields in task_struct + * @tsk: task_struct for accounting */ void acct_update_integrals(struct task_struct *tsk) { @@ -547,9 +558,9 @@ void acct_update_integrals(struct task_struct *tsk) } } -/* - * acct_clear_integrals - * - clear the mm integral fields in task_struct +/** + * acct_clear_integrals - clear the mm integral fields in task_struct + * @tsk: task_struct whose accounting fields are cleared */ void acct_clear_integrals(struct task_struct *tsk) { -- cgit v1.2.3 From 64ed93a268bc18fa6f72f61420d0e0022c5e38d1 Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Sat, 10 Sep 2005 00:27:21 -0700 Subject: [PATCH] add schedule_timeout_{,un}interruptible() interfaces Add schedule_timeout_{,un}interruptible() interfaces so that schedule_timeout() callers don't have to worry about forgetting to add the set_current_state() call beforehand. Signed-off-by: Nishanth Aravamudan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 13e2b513be01..a0b716d62701 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1154,6 +1154,20 @@ fastcall signed long __sched schedule_timeout(signed long timeout) EXPORT_SYMBOL(schedule_timeout); +signed long __sched schedule_timeout_interruptible(signed long timeout) +{ + set_current_state(TASK_INTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_interruptible); + +signed long __sched schedule_timeout_uninterruptible(signed long timeout) +{ + set_current_state(TASK_UNINTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_uninterruptible); + /* Thread ID - the internal kernel "pid" */ asmlinkage long sys_gettid(void) { -- cgit v1.2.3 From 75bcc8c5e1de78616b04ef9f317a293a7c1c163c Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Sat, 10 Sep 2005 00:27:24 -0700 Subject: [PATCH] kernel: fix-up schedule_timeout() usage Use schedule_timeout_{,un}interruptible() instead of set_current_state()/schedule_timeout() to reduce kernel size. Signed-off-by: Nishanth Aravamudan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/compat.c | 9 +++------ kernel/signal.c | 3 +-- kernel/timer.c | 18 ++++++------------ 3 files changed, 10 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index ddfcaaa86623..102296e21ea8 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -48,8 +48,7 @@ static long compat_nanosleep_restart(struct restart_block *restart) if (!time_after(expire, now)) return 0; - current->state = TASK_INTERRUPTIBLE; - expire = schedule_timeout(expire - now); + expire = schedule_timeout_interruptible(expire - now); if (expire == 0) return 0; @@ -82,8 +81,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, return -EINVAL; expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); - current->state = TASK_INTERRUPTIBLE; - expire = schedule_timeout(expire); + expire = schedule_timeout_interruptible(expire); if (expire == 0) return 0; @@ -795,8 +793,7 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - current->state = TASK_INTERRUPTIBLE; - timeout = schedule_timeout(timeout); + timeout = schedule_timeout_interruptible(timeout); spin_lock_irq(¤t->sighand->siglock); sig = dequeue_signal(current, &s, &info); diff --git a/kernel/signal.c b/kernel/signal.c index 4980a073237f..b92c3c9f8b9a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2221,8 +2221,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese, recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - current->state = TASK_INTERRUPTIBLE; - timeout = schedule_timeout(timeout); + timeout = schedule_timeout_interruptible(timeout); try_to_freeze(); spin_lock_irq(¤t->sighand->siglock); diff --git a/kernel/timer.c b/kernel/timer.c index a0b716d62701..f4152fcd9f8e 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1184,8 +1184,7 @@ static long __sched nanosleep_restart(struct restart_block *restart) if (!time_after(expire, now)) return 0; - current->state = TASK_INTERRUPTIBLE; - expire = schedule_timeout(expire - now); + expire = schedule_timeout_interruptible(expire - now); ret = 0; if (expire) { @@ -1213,8 +1212,7 @@ asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __us return -EINVAL; expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); - current->state = TASK_INTERRUPTIBLE; - expire = schedule_timeout(expire); + expire = schedule_timeout_interruptible(expire); ret = 0; if (expire) { @@ -1612,10 +1610,8 @@ void msleep(unsigned int msecs) { unsigned long timeout = msecs_to_jiffies(msecs) + 1; - while (timeout) { - set_current_state(TASK_UNINTERRUPTIBLE); - timeout = schedule_timeout(timeout); - } + while (timeout) + timeout = schedule_timeout_uninterruptible(timeout); } EXPORT_SYMBOL(msleep); @@ -1628,10 +1624,8 @@ unsigned long msleep_interruptible(unsigned int msecs) { unsigned long timeout = msecs_to_jiffies(msecs) + 1; - while (timeout && !signal_pending(current)) { - set_current_state(TASK_INTERRUPTIBLE); - timeout = schedule_timeout(timeout); - } + while (timeout && !signal_pending(current)) + timeout = schedule_timeout_interruptible(timeout); return jiffies_to_msecs(timeout); } -- cgit v1.2.3 From a2a979821b6ab75a4f143cfaa1c4672cc259ec10 Mon Sep 17 00:00:00 2001 From: Keith Owens Date: Sun, 11 Sep 2005 17:19:06 +1000 Subject: [PATCH] MCA/INIT: scheduler hooks Scheduler hooks to see/change which process is deemed to be on a cpu. Signed-off-by: Keith Owens Signed-off-by: Tony Luck --- kernel/sched.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index dbd4490afec1..e9ff04a9b56d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3576,6 +3576,32 @@ task_t *idle_task(int cpu) return cpu_rq(cpu)->idle; } +/** + * curr_task - return the current task for a given cpu. + * @cpu: the processor in question. + */ +task_t *curr_task(int cpu) +{ + return cpu_curr(cpu); +} + +/** + * set_curr_task - set the current task for a given cpu. + * @cpu: the processor in question. + * @p: the task pointer to set. + * + * Description: This function must only be used when non-maskable interrupts + * are serviced on a separate stack. It allows the architecture to switch the + * notion of the current task on a cpu in a non-blocking manner. This function + * must be called with interrupts disabled, the caller must save the original + * value of the current task (see curr_task() above) and restore that value + * before reenabling interrupts. + */ +void set_curr_task(int cpu, task_t *p) +{ + cpu_curr(cpu) = p; +} + /** * find_process_by_pid - find a process with a matching PID value. * @pid: the pid in question. -- cgit v1.2.3 From 1df5c10a5b40d1ad747e3de3caf28764153c5c8e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 12 Sep 2005 07:59:21 -0700 Subject: Mark ia64-specific MCA/INIT scheduler hooks as dangerous ..and only enable them for ia64. The functions are only valid when the whole system has been totally stopped and no scheduler activity is ongoing on any CPU, and interrupts are globally disabled. In other words, they aren't useful for anything else. So make sure that nobody can use them by mistake. Signed-off-by: Linus Torvalds --- kernel/sched.c | 70 ++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 44 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e9ff04a9b56d..81b3a96ed2d0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3576,32 +3576,6 @@ task_t *idle_task(int cpu) return cpu_rq(cpu)->idle; } -/** - * curr_task - return the current task for a given cpu. - * @cpu: the processor in question. - */ -task_t *curr_task(int cpu) -{ - return cpu_curr(cpu); -} - -/** - * set_curr_task - set the current task for a given cpu. - * @cpu: the processor in question. - * @p: the task pointer to set. - * - * Description: This function must only be used when non-maskable interrupts - * are serviced on a separate stack. It allows the architecture to switch the - * notion of the current task on a cpu in a non-blocking manner. This function - * must be called with interrupts disabled, the caller must save the original - * value of the current task (see curr_task() above) and restore that value - * before reenabling interrupts. - */ -void set_curr_task(int cpu, task_t *p) -{ - cpu_curr(cpu) = p; -} - /** * find_process_by_pid - find a process with a matching PID value. * @pid: the pid in question. @@ -5628,3 +5602,47 @@ void normalize_rt_tasks(void) } #endif /* CONFIG_MAGIC_SYSRQ */ + +#ifdef CONFIG_IA64 +/* + * These functions are only useful for the IA64 MCA handling. + * + * They can only be called when the whole system has been + * stopped - every CPU needs to be quiescent, and no scheduling + * activity can take place. Using them for anything else would + * be a serious bug, and as a result, they aren't even visible + * under any other configuration. + */ + +/** + * curr_task - return the current task for a given cpu. + * @cpu: the processor in question. + * + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! + */ +task_t *curr_task(int cpu) +{ + return cpu_curr(cpu); +} + +/** + * set_curr_task - set the current task for a given cpu. + * @cpu: the processor in question. + * @p: the task pointer to set. + * + * Description: This function must only be used when non-maskable interrupts + * are serviced on a separate stack. It allows the architecture to switch the + * notion of the current task on a cpu in a non-blocking manner. This function + * must be called with all CPU's synchronized, and interrupts disabled, the + * and caller must save the original value of the current task (see + * curr_task() above) and restore that value before reenabling interrupts and + * re-starting the system. + * + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! + */ +void set_curr_task(int cpu, task_t *p) +{ + cpu_curr(cpu) = p; +} + +#endif -- cgit v1.2.3 From b3426599af9524104be6938bcb1fcaab314781c7 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Mon, 12 Sep 2005 04:30:30 -0700 Subject: [PATCH] cpuset semaphore depth check optimize Optimize the deadlock avoidance check on the global cpuset semaphore cpuset_sem. Instead of adding a depth counter to the task struct of each task, rather just two words are enough, one to store the depth and the other the current cpuset_sem holder. Thanks to Nikita Danilov for the idea. Signed-off-by: Paul Jackson [ We may want to change this further, but at least it's now a totally internal decision to the cpusets code ] Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 407b5f0a8c8e..79866bc6b3a1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -180,6 +180,8 @@ static struct super_block *cpuset_sb = NULL; */ static DECLARE_MUTEX(cpuset_sem); +static struct task_struct *cpuset_sem_owner; +static int cpuset_sem_depth; /* * The global cpuset semaphore cpuset_sem can be needed by the @@ -200,16 +202,19 @@ static DECLARE_MUTEX(cpuset_sem); static inline void cpuset_down(struct semaphore *psem) { - if (current->cpuset_sem_nest_depth == 0) + if (cpuset_sem_owner != current) { down(psem); - current->cpuset_sem_nest_depth++; + cpuset_sem_owner = current; + } + cpuset_sem_depth++; } static inline void cpuset_up(struct semaphore *psem) { - current->cpuset_sem_nest_depth--; - if (current->cpuset_sem_nest_depth == 0) + if (--cpuset_sem_depth == 0) { + cpuset_sem_owner = NULL; up(psem); + } } /* -- cgit v1.2.3 From 3f74478b5fd7263e9311cdb320923d599c73a792 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 12 Sep 2005 18:49:24 +0200 Subject: [PATCH] x86-64: Some cleanup and optimization to the processor data area. - Remove unused irqrsp field - Remove pda->me - Optimize set_softirq_pending slightly Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- kernel/softirq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index b4ab6af1dea8..f766b2fc48be 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -84,7 +84,7 @@ asmlinkage void __do_softirq(void) cpu = smp_processor_id(); restart: /* Reset the pending bitmask before enabling irqs */ - local_softirq_pending() = 0; + set_softirq_pending(0); local_irq_enable(); -- cgit v1.2.3