From a82f7119fd940c1505fc9fdf93d835fa52bc075d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 12 Mar 2008 00:34:57 +0100 Subject: Hibernation: Fix mark_nosave_pages() There is a problem in the hibernation code that triggers on some NUMA systems on which pfn_valid() returns 'true' for some PFNs that don't belong to any zone. Namely, there is a BUG_ON() in memory_bm_find_bit() that triggers for PFNs not belonging to any zone and passing the pfn_valid() test. On the affected systems it triggers when we mark PFNs reported by the platform as not saveable, because the PFNs in question belong to a region mapped directly using iorepam() (i.e. the ACPI data area) and they pass the pfn_valid() test. Modify memory_bm_find_bit() so that it returns an error if given PFN doesn't belong to any zone instead of crashing the kernel and ignore the result returned by it in mark_nosave_pages(), while marking the "nosave" memory regions. This doesn't affect the hibernation functionality, as we won't touch the PFNs in question anyway. http://bugzilla.kernel.org/show_bug.cgi?id=9966 . Signed-off-by: Rafael J. Wysocki Signed-off-by: Len Brown --- kernel/power/snapshot.c | 41 ++++++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 72a020cabb4c..5f91a07c4eac 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -447,7 +447,7 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) * of @bm->cur_zone_bm are updated. */ -static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, +static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, void **addr, unsigned int *bit_nr) { struct zone_bitmap *zone_bm; @@ -461,7 +461,8 @@ static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { zone_bm = zone_bm->next; - BUG_ON(!zone_bm); + if (!zone_bm) + return -EFAULT; } bm->cur.zone_bm = zone_bm; } @@ -479,23 +480,40 @@ static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, pfn -= bb->start_pfn; *bit_nr = pfn % BM_BITS_PER_CHUNK; *addr = bb->data + pfn / BM_BITS_PER_CHUNK; + return 0; } static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; + int error; - memory_bm_find_bit(bm, pfn, &addr, &bit); + error = memory_bm_find_bit(bm, pfn, &addr, &bit); + BUG_ON(error); set_bit(bit, addr); } +static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + int error; + + error = memory_bm_find_bit(bm, pfn, &addr, &bit); + if (!error) + set_bit(bit, addr); + return error; +} + static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; + int error; - memory_bm_find_bit(bm, pfn, &addr, &bit); + error = memory_bm_find_bit(bm, pfn, &addr, &bit); + BUG_ON(error); clear_bit(bit, addr); } @@ -503,8 +521,10 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; + int error; - memory_bm_find_bit(bm, pfn, &addr, &bit); + error = memory_bm_find_bit(bm, pfn, &addr, &bit); + BUG_ON(error); return test_bit(bit, addr); } @@ -709,8 +729,15 @@ static void mark_nosave_pages(struct memory_bitmap *bm) region->end_pfn << PAGE_SHIFT); for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) - if (pfn_valid(pfn)) - memory_bm_set_bit(bm, pfn); + if (pfn_valid(pfn)) { + /* + * It is safe to ignore the result of + * mem_bm_set_bit_check() here, since we won't + * touch the PFNs for which the error is + * returned anyway. + */ + mem_bm_set_bit_check(bm, pfn); + } } } -- cgit v1.2.3 From 53471121a8aad3f0b590bfce7c95a1f5f52150f3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 12 Mar 2008 18:10:51 -0400 Subject: documentation: Move power-related files to Documentation/power/ Move 00-INDEX entries to power/00-INDEX (and add entry for pm_qos_interface.txt). Update references to moved filenames. Fix some trailing whitespace. Signed-off-by: Randy Dunlap Signed-off-by: Len Brown --- kernel/power/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 79833170bb9c..6233f3b4ae66 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -190,7 +190,7 @@ config APM_EMULATION notification of APM "events" (e.g. battery status change). In order to use APM, you will need supporting software. For location - and more information, read and the + and more information, read and the Battery Powered Linux mini-HOWTO, available from . -- cgit v1.2.3 From 0e1f34833bd9170ccc93ab759e48e695917fa48f Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 10 Mar 2008 11:01:20 -0700 Subject: sched: fix race in schedule() Fix a hard to trigger crash seen in the -rt kernel that also affects the vanilla scheduler. There is a race condition between schedule() and some dequeue/enqueue functions; rt_mutex_setprio(), __setscheduler() and sched_move_task(). When scheduling to idle, idle_balance() is called to pull tasks from other busy processor. It might drop the rq lock. It means that those 3 functions encounter on_rq=0 and running=1. The current task should be put when running. Here is a possible scenario: CPU0 CPU1 | schedule() | ->deactivate_task() | ->idle_balance() | -->load_balance_newidle() rt_mutex_setprio() | | --->double_lock_balance() *get lock *rel lock * on_rq=0, ruuning=1 | * sched_class is changed | *rel lock *get lock : | : ->put_prev_task_rt() ->pick_next_task_fair() => panic The current process of CPU1(P1) is scheduling. Deactivated P1, and the scheduler looks for another process on other CPU's runqueue because CPU1 will be idle. idle_balance(), load_balance_newidle() and double_lock_balance() are called and double_lock_balance() could drop the rq lock. On the other hand, CPU0 is trying to boost the priority of P1. The result of boosting only P1's prio and sched_class are changed to RT. The sched entities of P1 and P1's group are never put. It makes cfs_rq invalid, because the cfs_rq has curr and no leaf, but pick_next_task_fair() is called, then the kernel panics. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1cb53fb1fe3d..9df9ba73cb7a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4268,11 +4268,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) oldprio = p->prio; on_rq = p->se.on_rq; running = task_current(rq, p); - if (on_rq) { + if (on_rq) dequeue_task(rq, p, 0); - if (running) - p->sched_class->put_prev_task(rq, p); - } + if (running) + p->sched_class->put_prev_task(rq, p); if (rt_prio(prio)) p->sched_class = &rt_sched_class; @@ -4281,10 +4280,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio) p->prio = prio; + if (running) + p->sched_class->set_curr_task(rq); if (on_rq) { - if (running) - p->sched_class->set_curr_task(rq); - enqueue_task(rq, p, 0); check_class_changed(rq, p, prev_class, oldprio, running); @@ -4581,19 +4579,17 @@ recheck: update_rq_clock(rq); on_rq = p->se.on_rq; running = task_current(rq, p); - if (on_rq) { + if (on_rq) deactivate_task(rq, p, 0); - if (running) - p->sched_class->put_prev_task(rq, p); - } + if (running) + p->sched_class->put_prev_task(rq, p); oldprio = p->prio; __setscheduler(rq, p, policy, param->sched_priority); + if (running) + p->sched_class->set_curr_task(rq); if (on_rq) { - if (running) - p->sched_class->set_curr_task(rq); - activate_task(rq, p, 0); check_class_changed(rq, p, prev_class, oldprio, running); @@ -7618,11 +7614,10 @@ void sched_move_task(struct task_struct *tsk) running = task_current(rq, tsk); on_rq = tsk->se.on_rq; - if (on_rq) { + if (on_rq) dequeue_task(rq, tsk, 0); - if (unlikely(running)) - tsk->sched_class->put_prev_task(rq, tsk); - } + if (unlikely(running)) + tsk->sched_class->put_prev_task(rq, tsk); set_task_rq(tsk, task_cpu(tsk)); @@ -7631,11 +7626,10 @@ void sched_move_task(struct task_struct *tsk) tsk->sched_class->moved_group(tsk); #endif - if (on_rq) { - if (unlikely(running)) - tsk->sched_class->set_curr_task(rq); + if (unlikely(running)) + tsk->sched_class->set_curr_task(rq); + if (on_rq) enqueue_task(rq, tsk, 0); - } task_rq_unlock(rq, &flags); } -- cgit v1.2.3 From 3fe69747dab906cd6a8523230276a9820d6a514f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 14 Mar 2008 20:55:51 +0100 Subject: sched: min_vruntime fix Current min_vruntime tracking is incorrect and will cause serious problems when we don't run the leftmost task for some reason. min_vruntime does two things; 1) it's used to determine a forward direction when the u64 vruntime wraps, 2) it's used to track the leftmost vruntime to position newly enqueued tasks from. The current logic advances min_vruntime whenever the current task's vruntime advance. Because the current task may pass the leftmost task still waiting we're failing the second goal. This causes new tasks to be placed too far ahead and thus penalizes their runtime. Fix this by making min_vruntime the min_vruntime of the waiting tasks by tracking it in enqueue/dequeue, and compare against current's vruntime to obtain the absolute minimum when placing new tasks. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 46 ++++++++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e2a530515619..9d003c9d2a48 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -175,8 +175,15 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * Maintain a cache of leftmost tree entries (it is frequently * used): */ - if (leftmost) + if (leftmost) { cfs_rq->rb_leftmost = &se->run_node; + /* + * maintain cfs_rq->min_vruntime to be a monotonic increasing + * value tracking the leftmost vruntime in the tree. + */ + cfs_rq->min_vruntime = + max_vruntime(cfs_rq->min_vruntime, se->vruntime); + } rb_link_node(&se->run_node, parent, link); rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); @@ -184,8 +191,21 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (cfs_rq->rb_leftmost == &se->run_node) - cfs_rq->rb_leftmost = rb_next(&se->run_node); + if (cfs_rq->rb_leftmost == &se->run_node) { + struct rb_node *next_node; + struct sched_entity *next; + + next_node = rb_next(&se->run_node); + cfs_rq->rb_leftmost = next_node; + + if (next_node) { + next = rb_entry(next_node, + struct sched_entity, run_node); + cfs_rq->min_vruntime = + max_vruntime(cfs_rq->min_vruntime, + next->vruntime); + } + } rb_erase(&se->run_node, &cfs_rq->tasks_timeline); } @@ -303,7 +323,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, unsigned long delta_exec) { unsigned long delta_exec_weighted; - u64 vruntime; schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); @@ -315,19 +334,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, &curr->load); } curr->vruntime += delta_exec_weighted; - - /* - * maintain cfs_rq->min_vruntime to be a monotonic increasing - * value tracking the leftmost vruntime in the tree. - */ - if (first_fair(cfs_rq)) { - vruntime = min_vruntime(curr->vruntime, - __pick_next_entity(cfs_rq)->vruntime); - } else - vruntime = curr->vruntime; - - cfs_rq->min_vruntime = - max_vruntime(cfs_rq->min_vruntime, vruntime); } static void update_curr(struct cfs_rq *cfs_rq) @@ -493,7 +499,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { u64 vruntime; - vruntime = cfs_rq->min_vruntime; + if (first_fair(cfs_rq)) { + vruntime = min_vruntime(cfs_rq->min_vruntime, + __pick_next_entity(cfs_rq)->vruntime); + } else + vruntime = cfs_rq->min_vruntime; if (sched_feat(TREE_AVG)) { struct sched_entity *last = __pick_last_entity(cfs_rq); -- cgit v1.2.3 From e89996ae3f9e88d4fd75751a15c10b19d197e702 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 14 Mar 2008 23:48:28 +0100 Subject: sched: fix update_load_add()/sub() Clear the cached inverse value when updating load. This is needed for calc_delta_mine() to work correctly when using the rq load. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9df9ba73cb7a..3a4ba3dc0f49 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1108,11 +1108,13 @@ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; + lw->inv_weight = 0; } static inline void update_load_sub(struct load_weight *lw, unsigned long dec) { lw->weight -= dec; + lw->inv_weight = 0; } /* -- cgit v1.2.3 From 27d117266097101dcf79c4576903cdcdd0eabffc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 14 Mar 2008 22:20:01 +0100 Subject: sched: fix calc_delta_mine() lw->weight can be 0 for a short time during bootup. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3a4ba3dc0f49..6b06f23261c0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1084,7 +1084,7 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, u64 tmp; if (unlikely(!lw->inv_weight)) - lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1; + lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1); tmp = (u64)delta_exec * weight; /* -- cgit v1.2.3 From aa2ac25229cd4d0280f6174c42712744ad61b140 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 14 Mar 2008 21:12:12 +0100 Subject: sched: fix overload performance: buddy wakeups Currently we schedule to the leftmost task in the runqueue. When the runtimes are very short because of some server/client ping-pong, especially in over-saturated workloads, this will cycle through all tasks trashing the cache. Reduce cache trashing by keeping dependent tasks together by running newly woken tasks first. However, by not running the leftmost task first we could starve tasks because the wakee can gain unlimited runtime. Therefore we only run the wakee if its within a small (wakeup_granularity) window of the leftmost task. This preserves fairness, but does alternate server/client task groups. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_fair.c | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6b06f23261c0..d1ad69b270ca 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -301,7 +301,7 @@ struct cfs_rq { /* 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). */ - struct sched_entity *curr; + struct sched_entity *curr, *next; unsigned long nr_spread_over; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 9d003c9d2a48..31c4a2988b64 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -207,6 +207,9 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) } } + if (cfs_rq->next == se) + cfs_rq->next = NULL; + rb_erase(&se->run_node, &cfs_rq->tasks_timeline); } @@ -626,12 +629,32 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } +static struct sched_entity * +pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + s64 diff, gran; + + if (!cfs_rq->next) + return se; + + diff = cfs_rq->next->vruntime - se->vruntime; + if (diff < 0) + return se; + + gran = calc_delta_fair(sysctl_sched_wakeup_granularity, &cfs_rq->load); + if (diff > gran) + return se; + + return cfs_rq->next; +} + static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { struct sched_entity *se = NULL; if (first_fair(cfs_rq)) { se = __pick_next_entity(cfs_rq); + se = pick_next(cfs_rq, se); set_next_entity(cfs_rq, se); } @@ -1070,6 +1093,9 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) resched_task(curr); return; } + + cfs_rq_of(pse)->next = pse; + /* * Batch tasks do not preempt (their preemption is driven by * the tick): -- cgit v1.2.3 From e22ecef1d2658ba54ed7d3fdb5d60829fb434c23 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 14 Mar 2008 22:16:08 +0100 Subject: sched: fix fair sleepers Fair sleepers need to scale their latency target down by runqueue weight. Otherwise busy systems will gain ever larger sleep bonus. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- kernel/sched_fair.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 31c4a2988b64..31aa1b9fa762 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -528,8 +528,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) if (!initial) { /* sleeps upto a single latency don't count. */ - if (sched_feat(NEW_FAIR_SLEEPERS)) - vruntime -= sysctl_sched_latency; + if (sched_feat(NEW_FAIR_SLEEPERS)) { + vruntime -= calc_delta_fair(sysctl_sched_latency, + &cfs_rq->load); + } /* ensure we never gain time by being placed backwards. */ vruntime = max_vruntime(se->vruntime, vruntime); -- cgit v1.2.3 From 6a6029b8cefe0ca7e82f27f3904dbedba3de4e06 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 14 Mar 2008 22:17:08 +0100 Subject: sched: simplify sched_slice() Use the existing calc_delta_mine() calculation for sched_slice(). This saves a divide and simplifies the code because we share it with the other /cfs_rq->load users. It also improves code size: text data bss dec hex filename 42659 2740 144 45543 b1e7 sched.o.before 42093 2740 144 44977 afb1 sched.o.after Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- kernel/sched_fair.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 31aa1b9fa762..f2cc59080efa 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -283,12 +283,8 @@ static u64 __sched_period(unsigned long nr_running) */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 slice = __sched_period(cfs_rq->nr_running); - - slice *= se->load.weight; - do_div(slice, cfs_rq->load.weight); - - return slice; + return calc_delta_mine(__sched_period(cfs_rq->nr_running), + se->load.weight, &cfs_rq->load); } /* -- cgit v1.2.3 From 16d54669427069ef2823752c365d695b0cc4748f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 17 Mar 2008 09:04:59 +0100 Subject: relay: fix subbuf_splice_actor() adding too many pages If subbuf_pages was larger than the max number of pages the pipe buffer will hold, subbuf_splice_actor() would happily go beyond the array size. Signed-off-by: Jens Axboe --- kernel/relay.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index d080b9d161a7..4c035a8a248c 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1066,7 +1066,7 @@ static int subbuf_splice_actor(struct file *in, unsigned int flags, int *nonpad_ret) { - unsigned int pidx, poff, total_len, subbuf_pages, ret; + unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret; struct rchan_buf *rbuf = in->private_data; unsigned int subbuf_size = rbuf->chan->subbuf_size; uint64_t pos = (uint64_t) *ppos; @@ -1097,8 +1097,9 @@ static int subbuf_splice_actor(struct file *in, subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; pidx = (read_start / PAGE_SIZE) % subbuf_pages; poff = read_start & ~PAGE_MASK; + nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS); - for (total_len = 0; spd.nr_pages < subbuf_pages; spd.nr_pages++) { + for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { unsigned int this_len, this_end, private; unsigned int cur_pos = read_start + total_len; -- cgit v1.2.3 From 098fb9db2c74cfd6ffdbf61eb026a0c21abc5f75 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 16 Mar 2008 20:36:10 +0100 Subject: sched: clean up wakeup balancing, move wake_affine() split out the affine-wakeup bits. No code changed: kernel/sched.o: text data bss dec hex filename 42521 2858 232 45611 b22b sched.o.before 42521 2858 232 45611 b22b sched.o.after md5: 9d76738f1272aa82f0b7affd2f51df6b sched.o.before.asm 09b31c44e9aff8666f72773dc433e2df sched.o.after.asm (the md5's changed because stack slots changed and some registers get scheduled by gcc in a different order - but otherwise the before and after assembly is instruction for instruction equivalent.) Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 134 +++++++++++++++++++++++++++++----------------------- 1 file changed, 75 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f2cc59080efa..70679b266693 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -980,12 +980,59 @@ static inline int wake_idle(int cpu, struct task_struct *p) #endif #ifdef CONFIG_SMP + +static int +wake_affine(struct rq *rq, struct sched_domain *this_sd, struct task_struct *p, + int cpu, int this_cpu, int sync, int idx, + unsigned long load, unsigned long this_load, + unsigned int imbalance) +{ + unsigned long tl = this_load; + unsigned long tl_per_task; + + if (!(this_sd->flags & SD_WAKE_AFFINE)) + return 0; + + /* + * Attract cache-cold tasks on sync wakeups: + */ + if (sync && !task_hot(p, rq->clock, this_sd)) + return 1; + + schedstat_inc(p, se.nr_wakeups_affine_attempts); + tl_per_task = cpu_avg_load_per_task(this_cpu); + + /* + * If sync wakeup then subtract the (maximum possible) + * effect of the currently running task from the load + * of the current CPU: + */ + if (sync) + tl -= current->se.load.weight; + + if ((tl <= load && tl + target_load(cpu, idx) <= tl_per_task) || + 100*(tl + p->se.load.weight) <= imbalance*load) { + /* + * This domain has SD_WAKE_AFFINE and + * p is cache cold in this domain, and + * there is no bad imbalance. + */ + schedstat_inc(this_sd, ttwu_move_affine); + schedstat_inc(p, se.nr_wakeups_affine); + + return 1; + } + return 0; +} + static int select_task_rq_fair(struct task_struct *p, int sync) { - int cpu, this_cpu; - struct rq *rq; struct sched_domain *sd, *this_sd = NULL; - int new_cpu; + unsigned long load, this_load; + int cpu, this_cpu, new_cpu; + unsigned int imbalance; + struct rq *rq; + int idx; cpu = task_cpu(p); rq = task_rq(p); @@ -1008,66 +1055,35 @@ static int select_task_rq_fair(struct task_struct *p, int sync) /* * Check for affine wakeup and passive balancing possibilities. */ - if (this_sd) { - int idx = this_sd->wake_idx; - unsigned int imbalance; - unsigned long load, this_load; - - imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; - - load = source_load(cpu, idx); - this_load = target_load(this_cpu, idx); - - new_cpu = this_cpu; /* Wake to this CPU if we can */ - - if (this_sd->flags & SD_WAKE_AFFINE) { - unsigned long tl = this_load; - unsigned long tl_per_task; - - /* - * Attract cache-cold tasks on sync wakeups: - */ - if (sync && !task_hot(p, rq->clock, this_sd)) - goto out_set_cpu; - - schedstat_inc(p, se.nr_wakeups_affine_attempts); - tl_per_task = cpu_avg_load_per_task(this_cpu); - - /* - * If sync wakeup then subtract the (maximum possible) - * effect of the currently running task from the load - * of the current CPU: - */ - if (sync) - tl -= current->se.load.weight; - - if ((tl <= load && - tl + target_load(cpu, idx) <= tl_per_task) || - 100*(tl + p->se.load.weight) <= imbalance*load) { - /* - * This domain has SD_WAKE_AFFINE and - * p is cache cold in this domain, and - * there is no bad imbalance. - */ - schedstat_inc(this_sd, ttwu_move_affine); - schedstat_inc(p, se.nr_wakeups_affine); - goto out_set_cpu; - } - } + if (!this_sd) + goto out_keep_cpu; - /* - * Start passive balancing when half the imbalance_pct - * limit is reached. - */ - if (this_sd->flags & SD_WAKE_BALANCE) { - if (imbalance*this_load <= 100*load) { - schedstat_inc(this_sd, ttwu_move_balance); - schedstat_inc(p, se.nr_wakeups_passive); - goto out_set_cpu; - } + idx = this_sd->wake_idx; + + imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; + + load = source_load(cpu, idx); + this_load = target_load(this_cpu, idx); + + new_cpu = this_cpu; /* Wake to this CPU if we can */ + + if (wake_affine(rq, this_sd, p, cpu, this_cpu, sync, idx, + load, this_load, imbalance)) + goto out_set_cpu; + + /* + * Start passive balancing when half the imbalance_pct + * limit is reached. + */ + if (this_sd->flags & SD_WAKE_BALANCE) { + if (imbalance*this_load <= 100*load) { + schedstat_inc(this_sd, ttwu_move_balance); + schedstat_inc(p, se.nr_wakeups_passive); + goto out_set_cpu; } } +out_keep_cpu: new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ out_set_cpu: return wake_idle(new_cpu, p); -- cgit v1.2.3 From ac192d3921a14e2c9080799e16959b4bd56f49d6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 16 Mar 2008 20:56:26 +0100 Subject: sched: clean up wakeup balancing, rename variables rename 'cpu' to 'prev_cpu'. No code changed: kernel/sched.o: text data bss dec hex filename 42521 2858 232 45611 b22b sched.o.before 42521 2858 232 45611 b22b sched.o.after md5: 09b31c44e9aff8666f72773dc433e2df sched.o.before.asm 09b31c44e9aff8666f72773dc433e2df sched.o.after.asm Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 70679b266693..2d2be02b8e3b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -983,7 +983,7 @@ static inline int wake_idle(int cpu, struct task_struct *p) static int wake_affine(struct rq *rq, struct sched_domain *this_sd, struct task_struct *p, - int cpu, int this_cpu, int sync, int idx, + int prev_cpu, int this_cpu, int sync, int idx, unsigned long load, unsigned long this_load, unsigned int imbalance) { @@ -1010,7 +1010,7 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct task_struct *p, if (sync) tl -= current->se.load.weight; - if ((tl <= load && tl + target_load(cpu, idx) <= tl_per_task) || + if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || 100*(tl + p->se.load.weight) <= imbalance*load) { /* * This domain has SD_WAKE_AFFINE and @@ -1028,22 +1028,26 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct task_struct *p, static int select_task_rq_fair(struct task_struct *p, int sync) { struct sched_domain *sd, *this_sd = NULL; + int prev_cpu, this_cpu, new_cpu; unsigned long load, this_load; - int cpu, this_cpu, new_cpu; unsigned int imbalance; struct rq *rq; int idx; - cpu = task_cpu(p); - rq = task_rq(p); - this_cpu = smp_processor_id(); - new_cpu = cpu; + prev_cpu = task_cpu(p); + rq = task_rq(p); + this_cpu = smp_processor_id(); + new_cpu = prev_cpu; - if (cpu == this_cpu) + if (prev_cpu == this_cpu) goto out_set_cpu; + /* + * 'this_sd' is the first domain that both + * this_cpu and prev_cpu are present in: + */ for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { + if (cpu_isset(prev_cpu, sd->span)) { this_sd = sd; break; } @@ -1062,12 +1066,12 @@ static int select_task_rq_fair(struct task_struct *p, int sync) imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; - load = source_load(cpu, idx); + load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); new_cpu = this_cpu; /* Wake to this CPU if we can */ - if (wake_affine(rq, this_sd, p, cpu, this_cpu, sync, idx, + if (wake_affine(rq, this_sd, p, prev_cpu, this_cpu, sync, idx, load, this_load, imbalance)) goto out_set_cpu; @@ -1084,7 +1088,11 @@ static int select_task_rq_fair(struct task_struct *p, int sync) } out_keep_cpu: - new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ + /* + * Could not wake to this_cpu. + * Wake to the previous cpu instead: + */ + new_cpu = prev_cpu; out_set_cpu: return wake_idle(new_cpu, p); } -- cgit v1.2.3 From f48273860edfca2306236d0f0de609aab3f773d4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 16 Mar 2008 21:21:47 +0100 Subject: sched: clean up wakeup balancing, code flow Clean up the code flow. No code changed: kernel/sched.o: text data bss dec hex filename 42521 2858 232 45611 b22b sched.o.before 42521 2858 232 45611 b22b sched.o.after md5: 09b31c44e9aff8666f72773dc433e2df sched.o.before.asm 09b31c44e9aff8666f72773dc433e2df sched.o.after.asm Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 2d2be02b8e3b..b5a357396b49 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1040,7 +1040,7 @@ static int select_task_rq_fair(struct task_struct *p, int sync) new_cpu = prev_cpu; if (prev_cpu == this_cpu) - goto out_set_cpu; + goto out; /* * 'this_sd' is the first domain that both @@ -1054,13 +1054,13 @@ static int select_task_rq_fair(struct task_struct *p, int sync) } if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) - goto out_set_cpu; + goto out; /* * Check for affine wakeup and passive balancing possibilities. */ if (!this_sd) - goto out_keep_cpu; + goto out; idx = this_sd->wake_idx; @@ -1069,11 +1069,11 @@ static int select_task_rq_fair(struct task_struct *p, int sync) load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); - new_cpu = this_cpu; /* Wake to this CPU if we can */ - if (wake_affine(rq, this_sd, p, prev_cpu, this_cpu, sync, idx, - load, this_load, imbalance)) - goto out_set_cpu; + load, this_load, imbalance)) { + new_cpu = this_cpu; + goto out; + } /* * Start passive balancing when half the imbalance_pct @@ -1083,17 +1083,12 @@ static int select_task_rq_fair(struct task_struct *p, int sync) if (imbalance*this_load <= 100*load) { schedstat_inc(this_sd, ttwu_move_balance); schedstat_inc(p, se.nr_wakeups_passive); - goto out_set_cpu; + new_cpu = this_cpu; + goto out; } } -out_keep_cpu: - /* - * Could not wake to this_cpu. - * Wake to the previous cpu instead: - */ - new_cpu = prev_cpu; -out_set_cpu: +out: return wake_idle(new_cpu, p); } #endif /* CONFIG_SMP */ -- cgit v1.2.3 From 4ae7d5cefd4aa3560e359a3b0f03e12adc8b5c86 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 19 Mar 2008 01:42:00 +0100 Subject: sched: improve affine wakeups improve affine wakeups. Maintain the 'overlap' metric based on CFS's sum_exec_runtime - which means the amount of time a task executes after it wakes up some other task. Use the 'overlap' for the wakeup decisions: if the 'overlap' is short, it means there's strong workload coupling between this task and the woken up task. If the 'overlap' is large then the workload is decoupled and the scheduler will move them to separate CPUs more easily. ( Also slightly move the preempt_check within try_to_wake_up() - this has no effect on functionality but allows 'early wakeups' (for still-on-rq tasks) to be correctly accounted as well.) Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 ++++- kernel/sched_debug.c | 1 + kernel/sched_fair.c | 58 +++++++++++++++++++++++++++++++++++++--------------- 3 files changed, 47 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index d1ad69b270ca..adbd475cfd25 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1855,10 +1855,11 @@ out_activate: schedstat_inc(p, se.nr_wakeups_remote); update_rq_clock(rq); activate_task(rq, p, 1); - check_preempt_curr(rq, p); success = 1; out_running: + check_preempt_curr(rq, p); + p->state = TASK_RUNNING; #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) @@ -1892,6 +1893,8 @@ static void __sched_fork(struct task_struct *p) p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; + p->se.last_wakeup = 0; + p->se.avg_overlap = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 4b5e24cf2f4a..ef358ba07683 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -288,6 +288,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.exec_start); PN(se.vruntime); PN(se.sum_exec_runtime); + PN(se.avg_overlap); nr_switches = p->nvcsw + p->nivcsw; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index b5a357396b49..87c9d3a2aafa 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -556,6 +556,21 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) account_entity_enqueue(cfs_rq, se); } +static void update_avg(u64 *avg, u64 sample) +{ + s64 diff = sample - *avg; + *avg += diff >> 3; +} + +static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + if (!se->last_wakeup) + return; + + update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup); + se->last_wakeup = 0; +} + static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) { @@ -566,6 +581,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) update_stats_dequeue(cfs_rq, se); if (sleep) { + update_avg_stats(cfs_rq, se); #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); @@ -981,12 +997,15 @@ static inline int wake_idle(int cpu, struct task_struct *p) #ifdef CONFIG_SMP +static const struct sched_class fair_sched_class; + static int -wake_affine(struct rq *rq, struct sched_domain *this_sd, struct task_struct *p, - int prev_cpu, int this_cpu, int sync, int idx, - unsigned long load, unsigned long this_load, +wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, + struct task_struct *p, int prev_cpu, int this_cpu, int sync, + int idx, unsigned long load, unsigned long this_load, unsigned int imbalance) { + struct task_struct *curr = this_rq->curr; unsigned long tl = this_load; unsigned long tl_per_task; @@ -994,10 +1013,15 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct task_struct *p, return 0; /* - * Attract cache-cold tasks on sync wakeups: + * If the currently running task will sleep within + * a reasonable amount of time then attract this newly + * woken task: */ - if (sync && !task_hot(p, rq->clock, this_sd)) - return 1; + if (sync && curr->sched_class == &fair_sched_class) { + if (curr->se.avg_overlap < sysctl_sched_migration_cost && + p->se.avg_overlap < sysctl_sched_migration_cost) + return 1; + } schedstat_inc(p, se.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); @@ -1030,18 +1054,16 @@ static int select_task_rq_fair(struct task_struct *p, int sync) struct sched_domain *sd, *this_sd = NULL; int prev_cpu, this_cpu, new_cpu; unsigned long load, this_load; + struct rq *rq, *this_rq; unsigned int imbalance; - struct rq *rq; int idx; prev_cpu = task_cpu(p); rq = task_rq(p); this_cpu = smp_processor_id(); + this_rq = cpu_rq(this_cpu); new_cpu = prev_cpu; - if (prev_cpu == this_cpu) - goto out; - /* * 'this_sd' is the first domain that both * this_cpu and prev_cpu are present in: @@ -1069,11 +1091,12 @@ static int select_task_rq_fair(struct task_struct *p, int sync) load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); - if (wake_affine(rq, this_sd, p, prev_cpu, this_cpu, sync, idx, - load, this_load, imbalance)) { - new_cpu = this_cpu; + if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, + load, this_load, imbalance)) + return this_cpu; + + if (prev_cpu == this_cpu) goto out; - } /* * Start passive balancing when half the imbalance_pct @@ -1083,8 +1106,7 @@ static int select_task_rq_fair(struct task_struct *p, int sync) if (imbalance*this_load <= 100*load) { schedstat_inc(this_sd, ttwu_move_balance); schedstat_inc(p, se.nr_wakeups_passive); - new_cpu = this_cpu; - goto out; + return this_cpu; } } @@ -1111,6 +1133,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) return; } + se->last_wakeup = se->sum_exec_runtime; + if (unlikely(se == pse)) + return; + cfs_rq_of(pse)->next = pse; /* -- cgit v1.2.3 From f540a6080a092e2ab69fd146c308022db7347b0a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 15 Mar 2008 17:10:34 +0100 Subject: sched: wakeup-buddy tasks are cache-hot Wakeup-buddy tasks are cache-hot - this makes it a bit harder for the load-balancer to tear them apart. (but it's still possible, if the load is sufficiently assymetric) Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index adbd475cfd25..3f7c5eb254e2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1396,6 +1396,12 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) { s64 delta; + /* + * Buddy candidates are cache hot: + */ + if (&p->se == cfs_rq_of(&p->se)->next) + return 1; + if (p->sched_class != &fair_sched_class) return 0; -- cgit v1.2.3 From 74e3cd7f480ae1888b7cd196bf8125a1d3bfee05 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 18 Mar 2008 18:47:57 +0100 Subject: sched: retune wake granularity reduce wake-up granularity for better interactivity. Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 87c9d3a2aafa..b85cac4b5e25 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -73,13 +73,13 @@ unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; /* * SCHED_OTHER wake-up granularity. - * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -unsigned int sysctl_sched_wakeup_granularity = 10000000UL; +unsigned int sysctl_sched_wakeup_granularity = 5000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; -- cgit v1.2.3 From 3150e63df41450a795bbd0bd98a8e70da74e0285 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 19 Mar 2008 17:01:08 -0700 Subject: revert "clocksource: make clocksource watchdog cycle through online CPUs" Revert commit 1ada5cba6a0318f90e45b38557e7b5206a9cba38 ("clocksource: make clocksource watchdog cycle through online CPUs") due to the regression reported by Gabriel C at http://lkml.org/lkml/2008/2/24/281 (short vesion: it makes TSC be marked as always unstable on his machine). Cc: Andi Kleen Acked-by: Ingo Molnar Cc: Thomas Gleixner Cc: Robert Hancock Acked-by: Linus Torvalds Cc: "Rafael J. Wysocki" Cc: Gabriel C Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/clocksource.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 548c436a776b..278534bbca95 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -141,13 +141,8 @@ static void clocksource_watchdog(unsigned long data) } if (!list_empty(&watchdog_list)) { - /* Cycle through CPUs to check if the CPUs stay synchronized to - * each other. */ - int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map); - if (next_cpu >= NR_CPUS) - next_cpu = first_cpu(cpu_online_map); - watchdog_timer.expires += WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, next_cpu); + __mod_timer(&watchdog_timer, + watchdog_timer.expires + WATCHDOG_INTERVAL); } spin_unlock(&watchdog_lock); } @@ -169,7 +164,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) if (!started && watchdog) { watchdog_last = watchdog->read(); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, first_cpu(cpu_online_map)); + add_timer(&watchdog_timer); } } else { if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) @@ -190,8 +185,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) watchdog_last = watchdog->read(); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, - first_cpu(cpu_online_map)); + add_timer(&watchdog_timer); } } } -- cgit v1.2.3 From 75c0371a2d385ecbd6e1f854d9dce20889f06736 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 20 Mar 2008 15:39:41 -0700 Subject: audit: netlink socket can be auto-bound to pid other than current->pid (v2) From: Pavel Emelyanov This patch is based on the one from Thomas. The kauditd_thread() calls the netlink_unicast() and passes the audit_pid to it. The audit_pid, in turn, is received from the user space and the tool (I've checked the audit v1.6.9) uses getpid() to pass one in the kernel. Besides, this tool doesn't bind the netlink socket to this id, but simply creates it allowing the kernel to auto-bind one. That's the preamble. The problem is that netlink_autobind() _does_not_ guarantees that the socket will be auto-bound to the current pid. Instead it uses the current pid as a hint to start looking for a free id. So, in case of conflict, the audit messages can be sent to a wrong socket. This can happen (it's unlikely, but can be) in case some task opens more than one netlink sockets and then the audit one starts - in this case the audit's pid can be busy and its socket will be bound to another id. The proposal is to introduce an audit_nlk_pid in audit subsys, that will point to the netlink socket to send packets to. It will most often be equal to audit_pid. The socket id can be got from the skb's netlink CB right in the audit_receive_msg. The audit_nlk_pid reset to 0 is not required, since all the decisions are taken based on audit_pid value only. Later, if the audit tools will bind the socket themselves, the kernel will have to provide a way to setup the audit_nlk_pid as well. A good side effect of this patch is that audit_pid can later be converted to struct pid, as it is not longer safe to use pid_t-s in the presence of pid namespaces. But audit code still uses the tgid from task_struct in the audit_signal_info and in the audit_filter_syscall. Signed-off-by: Thomas Graf Signed-off-by: Pavel Emelyanov Acked-by: Eric Paris Signed-off-by: David S. Miller --- kernel/audit.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 10c4930c2bbf..be55cb503633 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -78,9 +78,13 @@ static int audit_default; /* If auditing cannot proceed, audit_failure selects what happens. */ static int audit_failure = AUDIT_FAIL_PRINTK; -/* If audit records are to be written to the netlink socket, audit_pid - * contains the (non-zero) pid. */ +/* + * If audit records are to be written to the netlink socket, audit_pid + * contains the pid of the auditd process and audit_nlk_pid contains + * the pid to use to send netlink messages to that process. + */ int audit_pid; +static int audit_nlk_pid; /* If audit_rate_limit is non-zero, limit the rate of sending audit records * to that number per second. This prevents DoS attacks, but results in @@ -350,7 +354,7 @@ static int kauditd_thread(void *dummy) wake_up(&audit_backlog_wait); if (skb) { if (audit_pid) { - int err = netlink_unicast(audit_sock, skb, audit_pid, 0); + int err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); if (err < 0) { BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); @@ -626,6 +630,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) sid, 1); audit_pid = new_pid; + audit_nlk_pid = NETLINK_CB(skb).pid; } if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) err = audit_set_rate_limit(status_get->rate_limit, -- cgit v1.2.3 From 2070ee01d314ecec8a570c07647ccf4ced6340bb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 Mar 2008 16:43:47 +0100 Subject: sched: cleanup old and rarely used 'debug' features. TREE_AVG and APPROX_AVG are initial task placement policies that have been disabled for a long while.. time to remove them. Signed-off-by: Peter Zijlstra CC: Srivatsa Vaddagiri Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++------ kernel/sched_fair.c | 14 -------------- 2 files changed, 2 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3f7c5eb254e2..366a90923a3b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -594,18 +594,14 @@ enum { SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, SCHED_FEAT_WAKEUP_PREEMPT = 2, SCHED_FEAT_START_DEBIT = 4, - SCHED_FEAT_TREE_AVG = 8, - SCHED_FEAT_APPROX_AVG = 16, - SCHED_FEAT_HRTICK = 32, - SCHED_FEAT_DOUBLE_TICK = 64, + SCHED_FEAT_HRTICK = 8, + SCHED_FEAT_DOUBLE_TICK = 16, }; const_debug unsigned int sysctl_sched_features = SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 | SCHED_FEAT_WAKEUP_PREEMPT * 1 | SCHED_FEAT_START_DEBIT * 1 | - SCHED_FEAT_TREE_AVG * 0 | - SCHED_FEAT_APPROX_AVG * 0 | SCHED_FEAT_HRTICK * 1 | SCHED_FEAT_DOUBLE_TICK * 0; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index b85cac4b5e25..86a93376282c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -302,11 +302,6 @@ static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) return vslice; } -static u64 sched_vslice(struct cfs_rq *cfs_rq) -{ - return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running); -} - static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) { return __sched_vslice(cfs_rq->load.weight + se->load.weight, @@ -504,15 +499,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) } else vruntime = cfs_rq->min_vruntime; - if (sched_feat(TREE_AVG)) { - struct sched_entity *last = __pick_last_entity(cfs_rq); - if (last) { - vruntime += last->vruntime; - vruntime >>= 1; - } - } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running) - vruntime += sched_vslice(cfs_rq)/2; - /* * The 'current' period is already promised to the current tasks, * however the extra weight of the new task will slow them down a -- cgit v1.2.3 From 23e3c3cd2e39a3c9d07ee07d882c8cf6ddd61c86 Mon Sep 17 00:00:00 2001 From: Roel Kluin <12o3l@tiscali.nl> Date: Thu, 13 Mar 2008 17:41:59 +0100 Subject: sched: remove double unlikely from schedule() Combine two unlikely's Signed-off-by: Roel Kluin <12o3l@tiscali.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 366a90923a3b..573179eb553e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3882,7 +3882,7 @@ need_resched_nonpreemptible: if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely((prev->state & TASK_INTERRUPTIBLE) && - unlikely(signal_pending(prev)))) { + signal_pending(prev))) { prev->state = TASK_RUNNING; } else { deactivate_task(rq, prev, 1); -- cgit v1.2.3 From 9aefd0abd8610e8f3bb097debf3afb73f8b7b210 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 12 Mar 2008 18:31:58 +0100 Subject: sched: add exported arch_reinit_sched_domains() to header file. Needed so it can be called from outside of sched.c. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 573179eb553e..78482e51b583 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6920,7 +6920,7 @@ match2: } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -static int arch_reinit_sched_domains(void) +int arch_reinit_sched_domains(void) { int err; -- cgit v1.2.3 From 22e52b072dd87faa9b2559fe89d4e8f2370f81ca Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 12 Mar 2008 18:31:59 +0100 Subject: sched: add arch_update_cpu_topology hook. Will be called each time the scheduling domains are rebuild. Needed for architectures that don't have a static cpu topology. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 78482e51b583..28c73f07efb2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6807,6 +6807,10 @@ static int ndoms_cur; /* number of sched domains in 'doms_cur' */ */ static cpumask_t fallback_doms; +void __attribute__((weak)) arch_update_cpu_topology(void) +{ +} + /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. * For now this just excludes isolated cpus, but could be used to @@ -6816,6 +6820,7 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) { int err; + arch_update_cpu_topology(); ndoms_cur = 1; doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); if (!doms_cur) -- cgit v1.2.3 From 92896bd9fd75b1c993b92874d339a8088bb75560 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 24 Mar 2008 11:07:15 -0700 Subject: Don't 'printk()' while holding xtime lock for writing The printk() can deadlock because it can wake up klogd(), and task enqueueing will try to read the time in order to set a hrtimer. Reported-by: Marcin Slusarz Debugged-by: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Linus Torvalds --- kernel/time/timekeeping.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 671af612b768..a3fa587c350c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -191,8 +191,12 @@ static void change_clocksource(void) tick_clock_notify(); + /* + * We're holding xtime lock and waking up klogd would deadlock + * us on enqueue. So no printing! printk(KERN_INFO "Time: %s clocksource has been installed.\n", clock->name); + */ } #else static inline void change_clocksource(void) { } -- cgit v1.2.3 From fd3c36f8b527d13cf311d15e0702bc0390956970 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 24 Mar 2008 12:29:47 -0700 Subject: markers: update preempt_disable. call_rcu, rcu_barrier comments Add comments requested by Andrew. Updated comments about synchronize_sched(). Since we use call_rcu and rcu_barrier now, these comments were out of sync with the code. Signed-off-by: Mathieu Desnoyers Cc: Christoph Hellwig Cc: Mike Mason Cc: Dipankar Sarma Cc: David Smith Cc: "Paul E. McKenney" Cc: Steven Rostedt Cc: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/marker.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/marker.c b/kernel/marker.c index 48a4ea5afffd..ccae1d10e32f 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -104,9 +104,9 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, char ptype; /* - * disabling preemption to make sure the teardown of the callbacks can - * be done correctly when they are in modules and they insure RCU read - * coherency. + * preempt_disable does two things : disabling preemption to make sure + * the teardown of the callbacks can be done correctly when they are in + * modules and they insure RCU read coherency. */ preempt_disable(); ptype = ACCESS_ONCE(mdata->ptype); @@ -551,9 +551,9 @@ static int set_marker(struct marker_entry **entry, struct marker *elem, /* * Disable a marker and its probe callback. - * Note: only after a synchronize_sched() issued after setting elem->call to the - * empty function insures that the original callback is not used anymore. This - * insured by preemption disabling around the call site. + * Note: only waiting an RCU period after setting elem->call to the empty + * function insures that the original callback is not used anymore. This insured + * by preempt_disable around the call site. */ static void disable_marker(struct marker *elem) { @@ -565,8 +565,8 @@ static void disable_marker(struct marker *elem) elem->ptype = 0; /* single probe */ /* * Leave the private data and id there, because removal is racy and - * should be done only after a synchronize_sched(). These are never used - * until the next initialization anyway. + * should be done only after an RCU period. These are never used until + * the next initialization anyway. */ } @@ -601,9 +601,6 @@ void marker_update_probe_range(struct marker *begin, /* * Update probes, removing the faulty probes. - * Issues a synchronize_sched() when no reference to the module passed - * as parameter is found in the probes so the probe module can be - * safely unloaded from now on. * * Internal callback only changed before the first probe is connected to it. * Single probe private data can only be changed on 0 -> 1 and 2 -> 1 -- cgit v1.2.3 From 58336114af4d2cce830201aae49e50b93ede6c5c Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 24 Mar 2008 12:29:49 -0700 Subject: markers: remove ACCESS_ONCE As Paul pointed out, the ACCESS_ONCE are not needed because we already have the explicit surrounding memory barriers. Signed-off-by: Mathieu Desnoyers Cc: Christoph Hellwig Cc: Mike Mason Cc: Dipankar Sarma Cc: David Smith Cc: "Paul E. McKenney" Cc: Steven Rostedt Cc: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/marker.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/marker.c b/kernel/marker.c index ccae1d10e32f..041c33e3e95c 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -109,13 +109,13 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, * modules and they insure RCU read coherency. */ preempt_disable(); - ptype = ACCESS_ONCE(mdata->ptype); + ptype = mdata->ptype; if (likely(!ptype)) { marker_probe_func *func; /* Must read the ptype before ptr. They are not data dependant, * so we put an explicit smp_rmb() here. */ smp_rmb(); - func = ACCESS_ONCE(mdata->single.func); + func = mdata->single.func; /* Must read the ptr before private data. They are not data * dependant, so we put an explicit smp_rmb() here. */ smp_rmb(); @@ -133,7 +133,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, * in the fast path, so put the explicit barrier here. */ smp_read_barrier_depends(); - multi = ACCESS_ONCE(mdata->multi); + multi = mdata->multi; for (i = 0; multi[i].func; i++) { va_start(args, fmt); multi[i].func(multi[i].probe_private, call_private, fmt, @@ -161,13 +161,13 @@ void marker_probe_cb_noarg(const struct marker *mdata, char ptype; preempt_disable(); - ptype = ACCESS_ONCE(mdata->ptype); + ptype = mdata->ptype; if (likely(!ptype)) { marker_probe_func *func; /* Must read the ptype before ptr. They are not data dependant, * so we put an explicit smp_rmb() here. */ smp_rmb(); - func = ACCESS_ONCE(mdata->single.func); + func = mdata->single.func; /* Must read the ptr before private data. They are not data * dependant, so we put an explicit smp_rmb() here. */ smp_rmb(); @@ -183,7 +183,7 @@ void marker_probe_cb_noarg(const struct marker *mdata, * in the fast path, so put the explicit barrier here. */ smp_read_barrier_depends(); - multi = ACCESS_ONCE(mdata->multi); + multi = mdata->multi; for (i = 0; multi[i].func; i++) multi[i].func(multi[i].probe_private, call_private, fmt, &args); -- cgit v1.2.3 From a846a1954b6397e844fe1e258af7598897ec6159 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 24 Mar 2008 12:29:52 -0700 Subject: bsd_acct: plain current->real_parent access is not always safe This is minor, but dereferencing even current real_parent is not safe on debug kernels, since the memory, this points to, can be unmapped - RCU protection is required. Besides, the tgid field is deprecated and is to be replaced with task_tgid_xxx call (the 2nd patch), so RCU will be required anyway. Signed-off-by: Pavel Emelyanov Cc: Oleg Nesterov Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 521dfa53cb99..7ff5339a3f05 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -482,7 +482,9 @@ static void do_acct_process(struct file *file) #endif #if ACCT_VERSION==3 ac.ac_pid = current->tgid; - ac.ac_ppid = current->real_parent->tgid; + rcu_read_lock(); + ac.ac_ppid = rcu_dereference(current->real_parent)->tgid; + rcu_read_unlock(); #endif spin_lock_irq(¤t->sighand->siglock); -- cgit v1.2.3 From 5f7b703fe2be40db5a2bf136ac9e44cf5db267cc Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 24 Mar 2008 12:29:53 -0700 Subject: bsd_acct: using task_struct->tgid is not right in pid-namespaces In case we're accounting from a sub-namespace, the tgids reported will not refer to the right namespace. Save the pid_namespace we're accounting in on the acct_glbs and use it in do_acct_process. Two less :) places using the task_struct.tgid member. Signed-off-by: Pavel Emelyanov Cc: Oleg Nesterov Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 7ff5339a3f05..91e1cfd734d2 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -58,6 +58,7 @@ #include #include #include /* sector_div */ +#include /* * These constants control the amount of freespace that suspend and @@ -74,7 +75,7 @@ int acct_parm[3] = {4, 2, 30}; /* * External references and all of the globals. */ -static void do_acct_process(struct file *); +static void do_acct_process(struct pid_namespace *ns, struct file *); /* * This structure is used so that all the data protected by lock @@ -86,6 +87,7 @@ struct acct_glbs { volatile int active; volatile int needcheck; struct file *file; + struct pid_namespace *ns; struct timer_list timer; }; @@ -175,9 +177,11 @@ out: static void acct_file_reopen(struct file *file) { struct file *old_acct = NULL; + struct pid_namespace *old_ns = NULL; if (acct_globals.file) { old_acct = acct_globals.file; + old_ns = acct_globals.ns; del_timer(&acct_globals.timer); acct_globals.active = 0; acct_globals.needcheck = 0; @@ -185,6 +189,7 @@ static void acct_file_reopen(struct file *file) } if (file) { acct_globals.file = file; + acct_globals.ns = get_pid_ns(task_active_pid_ns(current)); acct_globals.needcheck = 0; acct_globals.active = 1; /* It's been deleted if it was used before so this is safe */ @@ -196,8 +201,9 @@ static void acct_file_reopen(struct file *file) if (old_acct) { mnt_unpin(old_acct->f_path.mnt); spin_unlock(&acct_globals.lock); - do_acct_process(old_acct); + do_acct_process(old_ns, old_acct); filp_close(old_acct, NULL); + put_pid_ns(old_ns); spin_lock(&acct_globals.lock); } } @@ -419,7 +425,7 @@ static u32 encode_float(u64 value) /* * do_acct_process does all actual work. Caller holds the reference to file. */ -static void do_acct_process(struct file *file) +static void do_acct_process(struct pid_namespace *ns, struct file *file) { struct pacct_struct *pacct = ¤t->signal->pacct; acct_t ac; @@ -481,9 +487,9 @@ static void do_acct_process(struct file *file) ac.ac_gid16 = current->gid; #endif #if ACCT_VERSION==3 - ac.ac_pid = current->tgid; + ac.ac_pid = task_tgid_nr_ns(current, ns); rcu_read_lock(); - ac.ac_ppid = rcu_dereference(current->real_parent)->tgid; + ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); rcu_read_unlock(); #endif @@ -580,6 +586,7 @@ void acct_collect(long exitcode, int group_dead) void acct_process(void) { struct file *file = NULL; + struct pid_namespace *ns; /* * accelerate the common fastpath: @@ -594,8 +601,10 @@ void acct_process(void) return; } get_file(file); + ns = get_pid_ns(acct_globals.ns); spin_unlock(&acct_globals.lock); - do_acct_process(file); + do_acct_process(ns, file); fput(file); + put_pid_ns(ns); } -- cgit v1.2.3 From 266c2e0abeca649fa6667a1a427ad1da507c6375 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 24 Mar 2008 19:25:08 -0700 Subject: Make printk() console semaphore accesses sensible The printk() logic on when/how to get the console semaphore was unreadable, this splits the code up into a few helper functions and makes it easier to follow what is going on. Signed-off-by: Linus Torvalds --- kernel/printk.c | 83 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 48 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 9adc2a473e6e..c46a20a19a15 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -616,6 +616,40 @@ asmlinkage int printk(const char *fmt, ...) /* cpu currently holding logbuf_lock */ static volatile unsigned int printk_cpu = UINT_MAX; +/* + * Can we actually use the console at this time on this cpu? + * + * Console drivers may assume that per-cpu resources have + * been allocated. So unless they're explicitly marked as + * being able to cope (CON_ANYTIME) don't call them until + * this CPU is officially up. + */ +static inline int can_use_console(unsigned int cpu) +{ + return cpu_online(cpu) || have_callable_console(); +} + +/* + * Try to get console ownership to actually show the kernel + * messages from a 'printk'. Return true (and with the + * console_semaphore held, and 'console_locked' set) if it + * is successful, false otherwise. + * + * This gets called with the 'logbuf_lock' spinlock held and + * interrupts disabled. It should return with 'lockbuf_lock' + * released but interrupts still disabled. + */ +static int acquire_console_semaphore_for_printk(unsigned int cpu) +{ + int retval = 0; + + if (can_use_console(cpu)) + retval = !try_acquire_console_sem(); + printk_cpu = UINT_MAX; + spin_unlock(&logbuf_lock); + return retval; +} + const char printk_recursion_bug_msg [] = KERN_CRIT "BUG: recent printk recursion!\n"; static int printk_recursion_bug; @@ -725,43 +759,22 @@ asmlinkage int vprintk(const char *fmt, va_list args) log_level_unknown = 1; } - if (!down_trylock(&console_sem)) { - /* - * We own the drivers. We can drop the spinlock and - * let release_console_sem() print the text, maybe ... - */ - console_locked = 1; - printk_cpu = UINT_MAX; - spin_unlock(&logbuf_lock); + /* + * Try to acquire and then immediately release the + * console semaphore. The release will do all the + * actual magic (print out buffers, wake up klogd, + * etc). + * + * The acquire_console_semaphore_for_printk() function + * will release 'logbuf_lock' regardless of whether it + * actually gets the semaphore or not. + */ + if (acquire_console_semaphore_for_printk(this_cpu)) + release_console_sem(); - /* - * Console drivers may assume that per-cpu resources have - * been allocated. So unless they're explicitly marked as - * being able to cope (CON_ANYTIME) don't call them until - * this CPU is officially up. - */ - if (cpu_online(smp_processor_id()) || have_callable_console()) { - console_may_schedule = 0; - release_console_sem(); - } else { - /* Release by hand to avoid flushing the buffer. */ - console_locked = 0; - up(&console_sem); - } - lockdep_on(); - raw_local_irq_restore(flags); - } else { - /* - * Someone else owns the drivers. We drop the spinlock, which - * allows the semaphore holder to proceed and to call the - * console drivers with the output which we just produced. - */ - printk_cpu = UINT_MAX; - spin_unlock(&logbuf_lock); - lockdep_on(); + lockdep_on(); out_restore_irqs: - raw_local_irq_restore(flags); - } + raw_local_irq_restore(flags); preempt_enable(); return printed_len; -- cgit v1.2.3 From 898a19de1502649877091b398229026b4142c0e2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Mar 2008 09:01:51 +0100 Subject: clocksource: revert: use init_timer_deferrable for clocksource_watchdog Revert commit 1077f5a917b7c630231037826b344b2f7f5b903f Author: Parag Warudkar Date: Wed Jan 30 13:30:01 2008 +0100 clocksource.c: use init_timer_deferrable for clocksource_watchdog clocksource_watchdog can use a deferrable timer - reduces wakeups from idle per second. The watchdog timer needs to run with the specified interval. Otherwise it will miss the possible wrap of the watchdog clocksource. Signed-off-by: Thomas Gleixner Cc: stable@kernel.org --- kernel/time/clocksource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 278534bbca95..7f60097d443a 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -174,7 +174,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) if (watchdog) del_timer(&watchdog_timer); watchdog = cs; - init_timer_deferrable(&watchdog_timer); + init_timer(&watchdog_timer); watchdog_timer.function = clocksource_watchdog; /* Reset watchdog cycles */ -- cgit v1.2.3 From 06d8308c61e54346585b2691c13ee3f90cb6fb2f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 22 Mar 2008 09:20:24 +0100 Subject: NOHZ: reevaluate idle sleep length after add_timer_on() add_timer_on() can add a timer on a CPU which is currently in a long idle sleep, but the timer wheel is not reevaluated by the nohz code on that CPU. So a timer can be delayed for quite a long time. This triggered a false positive in the clocksource watchdog code. To avoid this we need to wake up the idle CPU and enforce the reevaluation of the timer wheel for the next timer event. Add a function, which checks a given CPU for idle state, marks the idle task with NEED_RESCHED and sends a reschedule IPI to notify the other CPU of the change in the timer wheel. Call this function from add_timer_on(). Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: Ingo Molnar Cc: stable@kernel.org -- include/linux/sched.h | 6 ++++++ kernel/sched.c | 43 +++++++++++++++++++++++++++++++++++++++++++ kernel/timer.c | 10 +++++++++- 3 files changed, 58 insertions(+), 1 deletion(-) --- kernel/sched.c | 43 +++++++++++++++++++++++++++++++++++++++++++ kernel/timer.c | 10 +++++++++- 2 files changed, 52 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 28c73f07efb2..8dcdec6fe0fe 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1052,6 +1052,49 @@ static void resched_cpu(int cpu) resched_task(cpu_curr(cpu)); spin_unlock_irqrestore(&rq->lock, flags); } + +#ifdef CONFIG_NO_HZ +/* + * When add_timer_on() enqueues a timer into the timer wheel of an + * idle CPU then this timer might expire before the next timer event + * which is scheduled to wake up that CPU. In case of a completely + * idle system the next event might even be infinite time into the + * future. wake_up_idle_cpu() ensures that the CPU is woken up and + * leaves the inner idle loop so the newly added timer is taken into + * account when the CPU goes back to idle and evaluates the timer + * wheel for the next timer event. + */ +void wake_up_idle_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (cpu == smp_processor_id()) + return; + + /* + * This is safe, as this function is called with the timer + * wheel base lock of (cpu) held. When the CPU is on the way + * to idle and has not yet set rq->curr to idle then it will + * be serialized on the timer wheel base lock and take the new + * timer into account automatically. + */ + if (rq->curr != rq->idle) + return; + + /* + * We can set TIF_RESCHED on the idle task of the other CPU + * lockless. The worst case is that the other CPU runs the + * idle task through an additional NOOP schedule() + */ + set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED); + + /* NEED_RESCHED must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(rq->idle)) + smp_send_reschedule(cpu); +} +#endif + #else static void __resched_task(struct task_struct *p, int tif_bit) { diff --git a/kernel/timer.c b/kernel/timer.c index 99b00a25f88b..b024106daa70 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -451,10 +451,18 @@ void add_timer_on(struct timer_list *timer, int cpu) spin_lock_irqsave(&base->lock, flags); timer_set_base(timer, base); internal_add_timer(base, timer); + /* + * Check whether the other CPU is idle and needs to be + * triggered to reevaluate the timer wheel when nohz is + * active. We are protected against the other CPU fiddling + * with the timer by holding the timer base lock. This also + * makes sure that a CPU on the way to idle can not evaluate + * the timer wheel. + */ + wake_up_idle_cpu(cpu); spin_unlock_irqrestore(&base->lock, flags); } - /** * mod_timer - modify a timer's timeout * @timer: the timer to be modified -- cgit v1.2.3 From 37529fe9f62835e1c11895a1895064748b032dc1 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 26 Mar 2008 12:01:28 +0100 Subject: set relay file can not be read by pread(2) I found that relay files can be read by pread(2). I fix it, for relay files are not capable of seeking. Signed-off-by: Lai Jiangshan Signed-off-by: Jens Axboe --- kernel/relay.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 4c035a8a248c..ed3f6cf2db86 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -736,7 +736,7 @@ static int relay_file_open(struct inode *inode, struct file *filp) kref_get(&buf->kref); filp->private_data = buf; - return 0; + return nonseekable_open(inode, filp); } /** -- cgit v1.2.3 From 5eb7f9fa847b8ab6e4864bfb8cb45f370844a47c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 26 Mar 2008 12:04:09 +0100 Subject: relay: set an spd_release() hook for splice relay doesn't reference the pages it adds, however we need a non-NULL hook or splice_to_pipe() can oops. Signed-off-by: Jens Axboe --- kernel/relay.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index ed3f6cf2db86..d6204a485818 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1056,6 +1056,10 @@ static struct pipe_buf_operations relay_pipe_buf_ops = { .get = generic_pipe_buf_get, }; +static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i) +{ +} + /* * subbuf_splice_actor - splice up to one subbuf's worth of data */ @@ -1083,6 +1087,7 @@ static int subbuf_splice_actor(struct file *in, .partial = partial, .flags = flags, .ops = &relay_pipe_buf_ops, + .spd_release = relay_page_release, }; if (rbuf->subbufs_produced == rbuf->subbufs_consumed) -- cgit v1.2.3 From f6d107fb10def502522b10bfb7af9533afbb8274 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 27 Mar 2008 14:52:15 +1100 Subject: Give futex init a proper name The futex init function is called init(). This is a pain in the neck when debugging when you code dies in ... init :-) This renames it to futex_init(). Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Linus Torvalds --- kernel/futex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 06968cd79200..87a6428cb5b6 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2158,7 +2158,7 @@ static struct file_system_type futex_fs_type = { .kill_sb = kill_anon_super, }; -static int __init init(void) +static int __init futex_init(void) { u32 curval; int i; @@ -2194,4 +2194,4 @@ static int __init init(void) return 0; } -__initcall(init); +__initcall(futex_init); -- cgit v1.2.3 From 1d4a788f15302877ff2cb08f22009e290a36a209 Mon Sep 17 00:00:00 2001 From: YAMAMOTO Takashi Date: Fri, 28 Mar 2008 14:15:50 -0700 Subject: memcgroup: fix spurious EBUSY on memory cgroup removal Call mm_free_cgroup earlier. Otherwise a reference due to lazy mm switching can prevent cgroup removal. Signed-off-by: YAMAMOTO Takashi Acked-by: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index dd249c37b3a3..9c042f901570 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -394,7 +394,6 @@ void __mmdrop(struct mm_struct *mm) { BUG_ON(mm == &init_mm); mm_free_pgd(mm); - mm_free_cgroup(mm); destroy_context(mm); free_mm(mm); } @@ -416,6 +415,7 @@ void mmput(struct mm_struct *mm) spin_unlock(&mmlist_lock); } put_swap_token(mm); + mm_free_cgroup(mm); mmdrop(mm); } } -- cgit v1.2.3 From f706d5d22c35e18ed13a4b2b4991aac75bf39df5 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Fri, 28 Mar 2008 14:15:56 -0700 Subject: audit: silence two kerneldoc warnings in kernel/audit.c Silence two kerneldoc warnings. Warning(kernel/audit.c:1276): No description found for parameter 'string' Warning(kernel/audit.c:1276): No description found for parameter 'len' [also fix a typo for bonus points] Signed-off-by: Dave Jones Acked-by: Randy Dunlap Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index be55cb503633..b782b046543d 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1269,8 +1269,8 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen, /** * audit_string_contains_control - does a string need to be logged in hex - * @string - string to be checked - * @len - max length of the string to check + * @string: string to be checked + * @len: max length of the string to check */ int audit_string_contains_control(const char *string, size_t len) { @@ -1285,7 +1285,7 @@ int audit_string_contains_control(const char *string, size_t len) /** * audit_log_n_untrustedstring - log a string that may contain random characters * @ab: audit_buffer - * @len: lenth of string (not including trailing null) + * @len: length of string (not including trailing null) * @string: string to be logged * * This code will escape a string that is passed to it if the string -- cgit v1.2.3 From 9dce07f1a441b77a15631cf0ed0238e0baa7ed64 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 29 Mar 2008 03:07:28 +0000 Subject: NULL noise: fs/*, mm/*, kernel/* Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 4 ++-- kernel/futex.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e9c2fb01e89b..53d86b4b0ce0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2082,7 +2082,7 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file) kfree(pidarray); } else { - ctr->buf = 0; + ctr->buf = NULL; ctr->bufsz = 0; } file->private_data = ctr; @@ -2614,7 +2614,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) static int cgroupstats_open(struct inode *inode, struct file *file) { - return single_open(file, proc_cgroupstats_show, 0); + return single_open(file, proc_cgroupstats_show, NULL); } static struct file_operations proc_cgroupstats_operations = { diff --git a/kernel/futex.c b/kernel/futex.c index 87a6428cb5b6..e43945e995f5 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -281,7 +281,7 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, */ static void get_futex_key_refs(union futex_key *key) { - if (key->both.ptr == 0) + if (key->both.ptr == NULL) return; switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: -- cgit v1.2.3 From 8481664d373e7e2cea3ea0c2d7a06c9e939b19ee Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 29 Mar 2008 03:07:58 +0000 Subject: futex_compat __user annotation Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- kernel/futex_compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index ff90f049f8f6..04ac3a9e42cf 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -30,7 +30,7 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, return 0; } -static void __user *futex_uaddr(struct robust_list *entry, +static void __user *futex_uaddr(struct robust_list __user *entry, compat_long_t futex_offset) { compat_uptr_t base = ptr_to_compat(entry); -- cgit v1.2.3