From 8ed92e51f99c2199c64cb33b4ba95ab12940a94c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Oct 2012 14:28:50 +0200 Subject: sched: Add WAKEUP_PREEMPTION feature flag, on by default As per the recent discussion with Mike and Linus, make it easier to test with/without this feature. No change in default behavior. Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-izoxq4haeg4mTognnDbwcevt@git.kernel.org --- kernel/sched/fair.c | 2 +- kernel/sched/features.h | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6b800a14b990..f936552b3db1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2907,7 +2907,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ * Batch and idle tasks do not preempt non-idle tasks (their preemption * is driven by the tick): */ - if (unlikely(p->policy != SCHED_NORMAL)) + if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION)) return; find_matching_se(&se, &pse); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index eebefcad7027..e68e69ab917d 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -31,6 +31,11 @@ SCHED_FEAT(LAST_BUDDY, true) */ SCHED_FEAT(CACHE_HOT_BUDDY, true) +/* + * Allow wakeup-time preemption of the current task: + */ +SCHED_FEAT(WAKEUP_PREEMPTION, true) + /* * Use arch dependent cpu power functions */ -- cgit v1.2.3 From 5258f386ea4e8454bc801fb443e8a4217da1947c Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Sun, 28 Oct 2012 12:19:23 -0700 Subject: sched/autogroup: Fix crash on reboot when autogroup is disabled Due to these two commits: 8323f26ce342 sched: Fix race in task_group() 800d4d30c8f2 sched, autogroup: Stop going ahead if autogroup is disabled ... autogroup scheduling's dynamic knobs are wrecked. With both patches applied, all you have to do to crash a box is disable autogroup during boot up, then reboot.. boom, NULL pointer dereference due to 800d4d30 not allowing autogroup to move things, and 8323f26ce making that the only way to switch runqueues. Remove most of the (dysfunctional) knobs and turn the remaining sched_autogroup_enabled knob readonly. If the user fiddles with cgroups hereafter, once tasks are moved, autogroup won't mess with them again unless they call setsid(). No knobs, no glitz, nada, just a cute little thing folks can turn on if they don't want to muck about with cgroups and/or systemd. Signed-off-by: Mike Galbraith Cc: Xiaotian Feng Cc: Peter Zijlstra Cc: Xiaotian Feng Cc: Linus Torvalds Cc: Andrew Morton Cc: Oleg Nesterov Cc: # v3.6 Link: http://lkml.kernel.org/r/1351451963.4999.8.camel@maggy.simpson.net Signed-off-by: Ingo Molnar --- kernel/sched/auto_group.c | 68 ++++++++--------------------------------------- kernel/sched/auto_group.h | 9 +------ kernel/sysctl.c | 6 ++--- 3 files changed, 14 insertions(+), 69 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 0984a21076a3..0f1bacb005a4 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -110,6 +110,9 @@ out_fail: bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) { + if (!sysctl_sched_autogroup_enabled) + return false; + if (tg != &root_task_group) return false; @@ -143,15 +146,11 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) p->signal->autogroup = autogroup_kref_get(ag); - if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) - goto out; - t = p; do { sched_move_task(t); } while_each_thread(p, t); -out: unlock_task_sighand(p, &flags); autogroup_kref_put(prev); } @@ -159,8 +158,11 @@ out: /* Allocates GFP_KERNEL, cannot be called under any spinlock */ void sched_autogroup_create_attach(struct task_struct *p) { - struct autogroup *ag = autogroup_create(); + struct autogroup *ag; + if (!sysctl_sched_autogroup_enabled) + return; + ag = autogroup_create(); autogroup_move_group(p, ag); /* drop extra reference added by autogroup_create() */ autogroup_kref_put(ag); @@ -176,11 +178,15 @@ EXPORT_SYMBOL(sched_autogroup_detach); void sched_autogroup_fork(struct signal_struct *sig) { + if (!sysctl_sched_autogroup_enabled) + return; sig->autogroup = autogroup_task_get(current); } void sched_autogroup_exit(struct signal_struct *sig) { + if (!sysctl_sched_autogroup_enabled) + return; autogroup_kref_put(sig->autogroup); } @@ -193,58 +199,6 @@ static int __init setup_autogroup(char *str) __setup("noautogroup", setup_autogroup); -#ifdef CONFIG_PROC_FS - -int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) -{ - static unsigned long next = INITIAL_JIFFIES; - struct autogroup *ag; - int err; - - if (nice < -20 || nice > 19) - return -EINVAL; - - err = security_task_setnice(current, nice); - if (err) - return err; - - if (nice < 0 && !can_nice(current, nice)) - return -EPERM; - - /* this is a heavy operation taking global locks.. */ - if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) - return -EAGAIN; - - next = HZ / 10 + jiffies; - ag = autogroup_task_get(p); - - down_write(&ag->lock); - err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]); - if (!err) - ag->nice = nice; - up_write(&ag->lock); - - autogroup_kref_put(ag); - - return err; -} - -void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) -{ - struct autogroup *ag = autogroup_task_get(p); - - if (!task_group_is_autogroup(ag->tg)) - goto out; - - down_read(&ag->lock); - seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); - up_read(&ag->lock); - -out: - autogroup_kref_put(ag); -} -#endif /* CONFIG_PROC_FS */ - #ifdef CONFIG_SCHED_DEBUG int autogroup_path(struct task_group *tg, char *buf, int buflen) { diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h index 8bd047142816..4552c6bf79d2 100644 --- a/kernel/sched/auto_group.h +++ b/kernel/sched/auto_group.h @@ -4,11 +4,6 @@ #include struct autogroup { - /* - * reference doesn't mean how many thread attach to this - * autogroup now. It just stands for the number of task - * could use this autogroup. - */ struct kref kref; struct task_group *tg; struct rw_semaphore lock; @@ -29,9 +24,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); static inline struct task_group * autogroup_task_group(struct task_struct *p, struct task_group *tg) { - int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); - - if (enabled && task_wants_autogroup(p, tg)) + if (task_wants_autogroup(p, tg)) return p->signal->autogroup->tg; return tg; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 81c7b1a1a307..2914d0f752cf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -363,10 +363,8 @@ static struct ctl_table kern_table[] = { .procname = "sched_autogroup_enabled", .data = &sysctl_sched_autogroup_enabled, .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .mode = 0444, + .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_CFS_BANDWIDTH -- cgit v1.2.3 From a634f93335daa8f38180a0e576ccd68a73c36eaf Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 21 Nov 2012 15:55:59 +0100 Subject: cputime: Move thread_group_cputime() to sched code thread_group_cputime() is a general cputime API that is not only used by posix cpu timer. Let's move this helper to sched code. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker --- kernel/posix-cpu-timers.c | 24 ------------------------ kernel/sched/cputime.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 125cb67daa21..d73840271dce 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -217,30 +217,6 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, return 0; } -void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) -{ - struct signal_struct *sig = tsk->signal; - struct task_struct *t; - - times->utime = sig->utime; - times->stime = sig->stime; - times->sum_exec_runtime = sig->sum_sched_runtime; - - rcu_read_lock(); - /* make sure we can trust tsk->thread_group list */ - if (!likely(pid_alive(tsk))) - goto out; - - t = tsk; - do { - times->utime += t->utime; - times->stime += t->stime; - times->sum_exec_runtime += task_sched_runtime(t); - } while_each_thread(tsk, t); -out: - rcu_read_unlock(); -} - static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) { if (b->utime > a->utime) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8d859dae5bed..e56f138a23c7 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -288,6 +288,34 @@ static __always_inline bool steal_account_process_tick(void) return false; } +/* + * Accumulate raw cputime values of dead tasks (sig->[us]time) and live + * tasks (sum on group iteration) belonging to @tsk's group. + */ +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) +{ + struct signal_struct *sig = tsk->signal; + struct task_struct *t; + + times->utime = sig->utime; + times->stime = sig->stime; + times->sum_exec_runtime = sig->sum_sched_runtime; + + rcu_read_lock(); + /* make sure we can trust tsk->thread_group list */ + if (!likely(pid_alive(tsk))) + goto out; + + t = tsk; + do { + times->utime += t->utime; + times->stime += t->stime; + times->sum_exec_runtime += task_sched_runtime(t); + } while_each_thread(tsk, t); +out: + rcu_read_unlock(); +} + #ifndef CONFIG_VIRT_CPU_ACCOUNTING #ifdef CONFIG_IRQ_TIME_ACCOUNTING -- cgit v1.2.3 From e80d0a1ae8bb8fee0edd37427836f108b30f596b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 21 Nov 2012 16:26:44 +0100 Subject: cputime: Rename thread_group_times to thread_group_cputime_adjusted We have thread_group_cputime() and thread_group_times(). The naming doesn't provide enough information about the difference between these two APIs. To lower the confusion, rename thread_group_times() to thread_group_cputime_adjusted(). This name better suggests that it's a version of thread_group_cputime() that does some stabilization on the raw cputime values. ie here: scale on top of CFS runtime stats and bound lower value for monotonicity. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker --- kernel/exit.c | 4 ++-- kernel/sched/cputime.c | 8 ++++---- kernel/sys.c | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 346616c0092c..618f7ee56003 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1186,11 +1186,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * as other threads in the parent group can be right * here reaping other children at the same time. * - * We use thread_group_times() to get times for the thread + * We use thread_group_cputime_adjusted() to get times for the thread * group, which consolidates times for all threads in the * group including the group leader. */ - thread_group_times(p, &tgutime, &tgstime); + thread_group_cputime_adjusted(p, &tgutime, &tgstime); spin_lock_irq(&p->real_parent->sighand->siglock); psig = p->real_parent->signal; sig = p->signal; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index e56f138a23c7..7dc155371b95 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -445,13 +445,13 @@ void account_idle_ticks(unsigned long ticks) * Use precise platform statistics if available: */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING -void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { *ut = p->utime; *st = p->stime; } -void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { struct task_cputime cputime; @@ -516,7 +516,7 @@ static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) return (__force cputime_t) temp; } -void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { cputime_t rtime, utime = p->utime, total = utime + p->stime; @@ -543,7 +543,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) /* * Must be called with siglock held. */ -void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { struct signal_struct *sig = p->signal; struct task_cputime cputime; diff --git a/kernel/sys.c b/kernel/sys.c index e6e0ece5f6a0..265b37690421 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1046,7 +1046,7 @@ void do_sys_times(struct tms *tms) cputime_t tgutime, tgstime, cutime, cstime; spin_lock_irq(¤t->sighand->siglock); - thread_group_times(current, &tgutime, &tgstime); + thread_group_cputime_adjusted(current, &tgutime, &tgstime); cutime = current->signal->cutime; cstime = current->signal->cstime; spin_unlock_irq(¤t->sighand->siglock); @@ -1704,7 +1704,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) utime = stime = 0; if (who == RUSAGE_THREAD) { - task_times(current, &utime, &stime); + task_cputime_adjusted(current, &utime, &stime); accumulate_thread_rusage(p, r); maxrss = p->signal->maxrss; goto out; @@ -1730,7 +1730,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) break; case RUSAGE_SELF: - thread_group_times(p, &tgutime, &tgstime); + thread_group_cputime_adjusted(p, &tgutime, &tgstime); utime += tgutime; stime += tgstime; r->ru_nvcsw += p->signal->nvcsw; -- cgit v1.2.3 From d37f761dbd276790f70dcf73a287fde2c3464482 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 22 Nov 2012 00:58:35 +0100 Subject: cputime: Consolidate cputime adjustment code task_cputime_adjusted() and thread_group_cputime_adjusted() essentially share the same code. They just don't use the same source: * The first function uses the cputime in the task struct and the previous adjusted snapshot that ensures monotonicity. * The second adds the cputime of all tasks in the group and the previous adjusted snapshot of the whole group from the signal structure. Just consolidate the common code that does the adjustment. These functions just need to fetch the values from the appropriate source. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker --- kernel/fork.c | 2 +- kernel/sched/cputime.c | 46 +++++++++++++++++++++++----------------------- 2 files changed, 24 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8b20ab7d3aa2..0e7cdb90476f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1222,7 +1222,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->utime = p->stime = p->gtime = 0; p->utimescaled = p->stimescaled = 0; #ifndef CONFIG_VIRT_CPU_ACCOUNTING - p->prev_utime = p->prev_stime = 0; + p->prev_cputime.utime = p->prev_cputime.stime = 0; #endif #if defined(SPLIT_RSS_COUNTING) memset(&p->rss_stat, 0, sizeof(p->rss_stat)); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 7dc155371b95..220fdc4db770 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -516,14 +516,18 @@ static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) return (__force cputime_t) temp; } -void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +static void cputime_adjust(struct task_cputime *curr, + struct cputime *prev, + cputime_t *ut, cputime_t *st) { - cputime_t rtime, utime = p->utime, total = utime + p->stime; + cputime_t rtime, utime, total; + utime = curr->utime; + total = utime + curr->stime; /* * Use CFS's precise accounting: */ - rtime = nsecs_to_cputime(p->se.sum_exec_runtime); + rtime = nsecs_to_cputime(curr->sum_exec_runtime); if (total) utime = scale_utime(utime, rtime, total); @@ -533,11 +537,22 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) /* * Compare with previous values, to keep monotonicity: */ - p->prev_utime = max(p->prev_utime, utime); - p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); + prev->utime = max(prev->utime, utime); + prev->stime = max(prev->stime, rtime - prev->utime); + + *ut = prev->utime; + *st = prev->stime; +} - *ut = p->prev_utime; - *st = p->prev_stime; +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime = { + .utime = p->utime, + .stime = p->stime, + .sum_exec_runtime = p->se.sum_exec_runtime, + }; + + cputime_adjust(&cputime, &p->prev_cputime, ut, st); } /* @@ -545,24 +560,9 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) */ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { - struct signal_struct *sig = p->signal; struct task_cputime cputime; - cputime_t rtime, utime, total; thread_group_cputime(p, &cputime); - - total = cputime.utime + cputime.stime; - rtime = nsecs_to_cputime(cputime.sum_exec_runtime); - - if (total) - utime = scale_utime(cputime.utime, rtime, total); - else - utime = rtime; - - sig->prev_utime = max(sig->prev_utime, utime); - sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); - - *ut = sig->prev_utime; - *st = sig->prev_stime; + cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); } #endif -- cgit v1.2.3 From fa09205783d11cc05122ad6e4ce06074624b2c0c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 28 Nov 2012 17:00:57 +0100 Subject: cputime: Comment cputime's adjusting code The reason for the scaling and monotonicity correction performed by cputime_adjust() may not be immediately clear to the reviewer. Add some comments to explain what happens there. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Paul Gortmaker --- kernel/sched/cputime.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 220fdc4db770..b7f731768625 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -516,6 +516,10 @@ static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) return (__force cputime_t) temp; } +/* + * Adjust tick based cputime random precision against scheduler + * runtime accounting. + */ static void cputime_adjust(struct task_cputime *curr, struct cputime *prev, cputime_t *ut, cputime_t *st) @@ -524,8 +528,16 @@ static void cputime_adjust(struct task_cputime *curr, utime = curr->utime; total = utime + curr->stime; + /* - * Use CFS's precise accounting: + * Tick based cputime accounting depend on random scheduling + * timeslices of a task to be interrupted or not by the timer. + * Depending on these circumstances, the number of these interrupts + * may be over or under-optimistic, matching the real user and system + * cputime with a variable precision. + * + * Fix this by scaling these tick based values against the total + * runtime accounted by the CFS scheduler. */ rtime = nsecs_to_cputime(curr->sum_exec_runtime); @@ -535,7 +547,9 @@ static void cputime_adjust(struct task_cputime *curr, utime = rtime; /* - * Compare with previous values, to keep monotonicity: + * If the tick based count grows faster than the scheduler one, + * the result of the scaling may go backward. + * Let's enforce monotonicity. */ prev->utime = max(prev->utime, utime); prev->stime = max(prev->stime, rtime - prev->utime); -- cgit v1.2.3