Diffstat (limited to 'kernel')
-rw-r--r--	kernel/auditsc.c      |   2
-rw-r--r--	kernel/futex.c        |   1
-rw-r--r--	kernel/irq/manage.c   |   2
-rw-r--r--	kernel/module.c       |   3
-rw-r--r--	kernel/posix-timers.c |   9
-rw-r--r--	kernel/printk.c       |  13
-rw-r--r--	kernel/sched.c        |  88
-rw-r--r--	kernel/sched_debug.c  |   3
-rw-r--r--	kernel/sched_fair.c   | 106
-rw-r--r--	kernel/sched_rt.c     |  11
-rw-r--r--	kernel/signal.c       |   4
-rw-r--r--	kernel/sysctl.c       |  34
-rw-r--r--	kernel/workqueue.c    |   2
13 files changed, 199 insertions(+), 79 deletions(-)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3401293359e8..04f3ffb8d9d4 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2023,7 +2023,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
                 axp->d.next = ctx->aux_pids;
                 ctx->aux_pids = (void *)axp;
         }
-        BUG_ON(axp->pid_count > AUDIT_AUX_PIDS);
+        BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS);
 
         axp->target_pid[axp->pid_count] = t->tgid;
         selinux_get_task_sid(t, &axp->target_sid[axp->pid_count]);
diff --git a/kernel/futex.c b/kernel/futex.c
index 3415e9ad1391..e8935b195e88 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1670,6 +1670,7 @@ pi_faulted:
                                 attempt);
                 if (ret)
                         goto out;
+                uval = 0;
                 goto retry_unlocked;
         }
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 203a518b6f14..853aefbd184b 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -462,7 +462,9 @@ void free_irq(unsigned int irq, void *dev_id)
                  * We do this after actually deregistering it, to make sure that
                  * a 'real' IRQ doesn't run in parallel with our fake
                  */
+                local_irq_save(flags);
                 handler(irq, dev_id);
+                local_irq_restore(flags);
         }
 #endif
 }
diff --git a/kernel/module.c b/kernel/module.c
index 33c04ad51175..db0ead0363e2 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -784,8 +784,7 @@ EXPORT_SYMBOL_GPL(symbol_put_addr);
 static ssize_t show_refcnt(struct module_attribute *mattr,
                            struct module *mod, char *buffer)
 {
-        /* sysfs holds a reference */
-        return sprintf(buffer, "%u\n", module_refcount(mod)-1);
+        return sprintf(buffer, "%u\n", module_refcount(mod));
 }
 
 static struct module_attribute refcnt = {
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 55b3761edaa9..7a15afb73ed0 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -547,9 +547,9 @@ sys_timer_create(const clockid_t which_clock,
                         new_timer->it_process = process;
                         list_add(&new_timer->list,
                                  &process->signal->posix_timers);
-                        spin_unlock_irqrestore(&process->sighand->siglock, flags);
                         if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
                                 get_task_struct(process);
+                        spin_unlock_irqrestore(&process->sighand->siglock, flags);
                 } else {
                         spin_unlock_irqrestore(&process->sighand->siglock, flags);
                         process = NULL;
@@ -605,13 +605,14 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)
         timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id);
         if (timr) {
                 spin_lock(&timr->it_lock);
-                spin_unlock(&idr_lock);
 
                 if ((timr->it_id != timer_id) || !(timr->it_process) ||
                                 timr->it_process->tgid != current->tgid) {
-                        unlock_timer(timr, *flags);
+                        spin_unlock(&timr->it_lock);
+                        spin_unlock_irqrestore(&idr_lock, *flags);
                         timr = NULL;
-                }
+                } else
+                        spin_unlock(&idr_lock);
         } else
                 spin_unlock_irqrestore(&idr_lock, *flags);
 
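The lock_timer() change above is a lock-ordering fix worth calling out: the per-timer it_lock is now taken while idr_lock is still held, and on validation failure both locks are dropped explicitly in reverse order, rather than through unlock_timer(), which would also restore the IRQ flags that still belong to idr_lock. A user-space sketch of that lookup-then-lock pattern, with hypothetical names (table_lock and struct obj stand in for idr_lock and struct k_itimer; this is an illustration, not the kernel code):

#include <pthread.h>
#include <stddef.h>

struct obj {
        pthread_mutex_t lock;
        int id;
        int live;
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj table[16];    /* hypothetical stand-in for the idr */

static void table_init(void)
{
        for (int i = 0; i < 16; i++) {
                pthread_mutex_init(&table[i].lock, NULL);
                table[i].id = i;
                table[i].live = 1;
        }
}

/* Return the object locked, or NULL. */
static struct obj *lookup_locked(int id)
{
        struct obj *o = NULL;

        pthread_mutex_lock(&table_lock);
        if (id >= 0 && id < 16)
                o = &table[id];
        if (o) {
                /* take the object lock before dropping the lookup lock,
                 * so the object cannot be torn down in between */
                pthread_mutex_lock(&o->lock);
                if (o->id != id || !o->live) {
                        /* failure: unlock in reverse order, innermost first */
                        pthread_mutex_unlock(&o->lock);
                        pthread_mutex_unlock(&table_lock);
                        return NULL;
                }
                /* success: keep o->lock, release only the lookup lock */
                pthread_mutex_unlock(&table_lock);
                return o;
        }
        pthread_mutex_unlock(&table_lock);
        return NULL;
}

int main(void)
{
        struct obj *o;

        table_init();
        o = lookup_locked(3);
        if (o)
                pthread_mutex_unlock(&o->lock); /* caller drops the object lock */
        return 0;
}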
diff --git a/kernel/printk.c b/kernel/printk.c
index bd2cd062878d..8451dfc31d25 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1083,6 +1083,19 @@ int unregister_console(struct console *console)
 }
 EXPORT_SYMBOL(unregister_console);
 
+static int __init disable_boot_consoles(void)
+{
+        if (console_drivers != NULL) {
+                if (console_drivers->flags & CON_BOOT) {
+                        printk(KERN_INFO "turn off boot console %s%d\n",
+                                console_drivers->name, console_drivers->index);
+                        return unregister_console(console_drivers);
+                }
+        }
+        return 0;
+}
+late_initcall(disable_boot_consoles);
+
 /**
  * tty_write_message - write a message to a certain tty, not just the console.
  * @tty: the destination tty_struct
diff --git a/kernel/sched.c b/kernel/sched.c
index 45e17b83b7f1..9fe473a190de 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -262,7 +262,8 @@ struct rq {
         s64 clock_max_delta;
 
         unsigned int clock_warps, clock_overflows;
-        unsigned int clock_unstable_events;
+        u64 idle_clock;
+        unsigned int clock_deep_idle_events;
         u64 tick_timestamp;
 
         atomic_t nr_iowait;
@@ -556,18 +557,40 @@ static inline struct rq *this_rq_lock(void)
 }
 
 /*
- * CPU frequency is/was unstable - start new by setting prev_clock_raw:
+ * We are going deep-idle (irqs are disabled):
  */
-void sched_clock_unstable_event(void)
+void sched_clock_idle_sleep_event(void)
 {
-        unsigned long flags;
-        struct rq *rq;
+        struct rq *rq = cpu_rq(smp_processor_id());
 
-        rq = task_rq_lock(current, &flags);
-        rq->prev_clock_raw = sched_clock();
-        rq->clock_unstable_events++;
-        task_rq_unlock(rq, &flags);
+        spin_lock(&rq->lock);
+        __update_rq_clock(rq);
+        spin_unlock(&rq->lock);
+        rq->clock_deep_idle_events++;
+}
+EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
+
+/*
+ * We just idled delta nanoseconds (called with irqs disabled):
+ */
+void sched_clock_idle_wakeup_event(u64 delta_ns)
+{
+        struct rq *rq = cpu_rq(smp_processor_id());
+        u64 now = sched_clock();
+
+        rq->idle_clock += delta_ns;
+        /*
+         * Override the previous timestamp and ignore all
+         * sched_clock() deltas that occured while we idled,
+         * and use the PM-provided delta_ns to advance the
+         * rq clock:
+         */
+        spin_lock(&rq->lock);
+        rq->prev_clock_raw = now;
+        rq->clock += delta_ns;
+        spin_unlock(&rq->lock);
 }
+EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
 /*
  * resched_task - mark a task 'to be rescheduled now'.
@@ -2157,12 +2180,6 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
         if (task_running(rq, p))
                 return 0;
 
-        /*
-         * Aggressive migration if too many balance attempts have failed:
-         */
-        if (sd->nr_balance_failed > sd->cache_nice_tries)
-                return 1;
-
         return 1;
 }
 
@@ -2494,7 +2511,7 @@ group_next:
          * a think about bumping its value to force at least one task to be
          * moved
          */
-        if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
+        if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task) {
                 unsigned long tmp, pwr_now, pwr_move;
                 unsigned int imbn;
 
@@ -3020,6 +3037,7 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
         struct sched_domain *sd;
         /* Earliest time when we have to do rebalance again */
         unsigned long next_balance = jiffies + 60*HZ;
+        int update_next_balance = 0;
 
         for_each_domain(cpu, sd) {
                 if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3056,8 +3074,10 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
                 if (sd->flags & SD_SERIALIZE)
                         spin_unlock(&balancing);
 out:
-                if (time_after(next_balance, sd->last_balance + interval))
+                if (time_after(next_balance, sd->last_balance + interval)) {
                         next_balance = sd->last_balance + interval;
+                        update_next_balance = 1;
+                }
 
                 /*
                  * Stop the load balance at this level. There is another
@@ -3067,7 +3087,14 @@ out:
                 if (!balance)
                         break;
         }
-        rq->next_balance = next_balance;
+
+        /*
+         * next_balance will be updated only when there is a need.
+         * When the cpu is attached to null domain for ex, it will not be
+         * updated.
+         */
+        if (likely(update_next_balance))
+                rq->next_balance = next_balance;
 }
 
 /*
@@ -4884,14 +4911,18 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
 static inline void sched_init_granularity(void)
 {
         unsigned int factor = 1 + ilog2(num_online_cpus());
-        const unsigned long gran_limit = 100000000;
+        const unsigned long limit = 100000000;
+
+        sysctl_sched_min_granularity *= factor;
+        if (sysctl_sched_min_granularity > limit)
+                sysctl_sched_min_granularity = limit;
 
-        sysctl_sched_granularity *= factor;
-        if (sysctl_sched_granularity > gran_limit)
-                sysctl_sched_granularity = gran_limit;
+        sysctl_sched_latency *= factor;
+        if (sysctl_sched_latency > limit)
+                sysctl_sched_latency = limit;
 
-        sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
-        sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
+        sysctl_sched_runtime_limit = sysctl_sched_latency;
+        sysctl_sched_wakeup_granularity = sysctl_sched_min_granularity / 2;
 }
 
 #ifdef CONFIG_SMP
@@ -5234,15 +5265,16 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 static struct ctl_table sd_ctl_dir[] = {
         {
                 .procname       = "sched_domain",
-                .mode           = 0755,
+                .mode           = 0555,
         },
         {0,},
 };
 
 static struct ctl_table sd_ctl_root[] = {
         {
+                .ctl_name       = CTL_KERN,
                 .procname       = "kernel",
-                .mode           = 0755,
+                .mode           = 0555,
                 .child          = sd_ctl_dir,
         },
         {0,},
@@ -5318,7 +5350,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
         for_each_domain(cpu, sd) {
                 snprintf(buf, 32, "domain%d", i);
                 entry->procname = kstrdup(buf, GFP_KERNEL);
-                entry->mode = 0755;
+                entry->mode = 0555;
                 entry->child = sd_alloc_ctl_domain_table(sd);
                 entry++;
                 i++;
@@ -5338,7 +5370,7 @@ static void init_sched_domain_sysctl(void)
         for (i = 0; i < cpu_num; i++, entry++) {
                 snprintf(buf, 32, "cpu%d", i);
                 entry->procname = kstrdup(buf, GFP_KERNEL);
-                entry->mode = 0755;
+                entry->mode = 0555;
                 entry->child = sd_alloc_ctl_cpu_table(i);
         }
         sd_sysctl_header = register_sysctl_table(sd_ctl_root);
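The sched_clock_idle_sleep_event()/sched_clock_idle_wakeup_event() pair above replaces the old "clock unstable" event: rather than resynchronizing whenever the TSC misbehaves, the rq clock is brought up to date before entering deep idle, and on wakeup it is advanced by the PM-reported idle duration while whatever the raw clock did in between is discarded. A stand-alone sketch of that bookkeeping, with simplified names (raw_clock() and struct rq_clock are illustrative stand-ins, not the kernel API):

#include <stdio.h>
#include <stdint.h>

/* toy stand-in for the per-runqueue clock state */
struct rq_clock {
        uint64_t prev_raw;      /* last raw timestamp we consumed */
        uint64_t clock;         /* derived, monotonic scheduler clock */
        uint64_t idle_clock;    /* total nanoseconds spent in deep idle */
};

/* raw_clock() stands in for sched_clock(); imagine it is TSC-based
 * and may have stopped or drifted while the CPU was in deep idle. */
static uint64_t raw_now;
static uint64_t raw_clock(void) { return raw_now; }

static void update_clock(struct rq_clock *rq)
{
        uint64_t now = raw_clock();

        rq->clock += now - rq->prev_raw;        /* normal path: consume raw delta */
        rq->prev_raw = now;
}

/* going deep-idle: fold in everything up to this instant */
static void idle_sleep_event(struct rq_clock *rq)
{
        update_clock(rq);
}

/* woke up: trust the PM-provided duration, not the raw clock delta */
static void idle_wakeup_event(struct rq_clock *rq, uint64_t delta_ns)
{
        rq->idle_clock += delta_ns;
        rq->prev_raw = raw_clock();     /* discard raw deltas accrued while idle */
        rq->clock += delta_ns;          /* advance by what PM says we slept */
}

int main(void)
{
        struct rq_clock rq = { 0, 0, 0 };

        raw_now = 1000;
        update_clock(&rq);
        idle_sleep_event(&rq);
        raw_now = 1100;                         /* raw clock barely moved during idle... */
        idle_wakeup_event(&rq, 5000000);        /* ...but PM says we idled 5 ms */
        printf("clock=%llu idle=%llu\n",
               (unsigned long long)rq.clock,
               (unsigned long long)rq.idle_clock);
        return 0;
}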
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 87e524762b85..ab18f45f2ab2 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -154,10 +154,11 @@ static void print_cpu(struct seq_file *m, int cpu)
         P(next_balance);
         P(curr->pid);
         P(clock);
+        P(idle_clock);
         P(prev_clock_raw);
         P(clock_warps);
         P(clock_overflows);
-        P(clock_unstable_events);
+        P(clock_deep_idle_events);
         P(clock_max_delta);
         P(cpu_load[0]);
         P(cpu_load[1]);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index fedbb51bba96..ee3771850aaf 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -15,34 +15,42 @@
  *
  *  Scaled math optimizations by Thomas Gleixner
  *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
+ *
+ *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
+ *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  */
 
 /*
- * Preemption granularity:
- * (default: 2 msec, units: nanoseconds)
+ * Targeted preemption latency for CPU-bound tasks:
+ * (default: 20ms, units: nanoseconds)
  *
- * NOTE: this granularity value is not the same as the concept of
- * 'timeslice length' - timeslices in CFS will typically be somewhat
- * larger than this value. (to see the precise effective timeslice
- * length of your workload, run vmstat and monitor the context-switches
- * field)
+ * NOTE: this latency value is not the same as the concept of
+ * 'timeslice length' - timeslices in CFS are of variable length.
+ * (to see the precise effective timeslice length of your workload,
+ * run vmstat and monitor the context-switches field)
  *
  * On SMP systems the value of this is multiplied by the log2 of the
  * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way
  * systems, 4x on 8-way systems, 5x on 16-way systems, etc.)
+ * Targeted preemption latency for CPU-bound tasks:
+ */
+unsigned int sysctl_sched_latency __read_mostly = 20000000ULL;
+
+/*
+ * Minimal preemption granularity for CPU-bound tasks:
+ * (default: 2 msec, units: nanoseconds)
  */
-unsigned int sysctl_sched_granularity __read_mostly = 2000000000ULL/HZ;
+unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL;
 
 /*
  * SCHED_BATCH wake-up granularity.
- * (default: 10 msec, units: nanoseconds)
+ * (default: 25 msec, units: nanoseconds)
  *
  * This option delays the preemption effects of decoupled workloads
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
  */
-unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly =
-                                                        10000000000ULL/HZ;
+unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = 25000000UL;
 
 /*
  * SCHED_OTHER wake-up granularity.
@@ -52,12 +60,12 @@ unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly =
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
  */
-unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000000ULL/HZ;
+unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000UL;
 
 unsigned int sysctl_sched_stat_granularity __read_mostly;
 
 /*
- * Initialized in sched_init_granularity():
+ * Initialized in sched_init_granularity() [to 5 times the base granularity]:
  */
 unsigned int sysctl_sched_runtime_limit __read_mostly;
@@ -214,6 +222,49 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
  */
 
 /*
+ * Calculate the preemption granularity needed to schedule every
+ * runnable task once per sysctl_sched_latency amount of time.
+ * (down to a sensible low limit on granularity)
+ *
+ * For example, if there are 2 tasks running and latency is 10 msecs,
+ * we switch tasks every 5 msecs. If we have 3 tasks running, we have
+ * to switch tasks every 3.33 msecs to get a 10 msecs observed latency
+ * for each task. We do finer and finer scheduling up to until we
+ * reach the minimum granularity value.
+ *
+ * To achieve this we use the following dynamic-granularity rule:
+ *
+ *      gran = lat/nr - lat/nr/nr
+ *
+ * This comes out of the following equations:
+ *
+ *      kA1 + gran = kB1
+ *      kB2 + gran = kA2
+ *      kA2 = kA1
+ *      kB2 = kB1 - d + d/nr
+ *      lat = d * nr
+ *
+ * Where 'k' is key, 'A' is task A (waiting), 'B' is task B (running),
+ * '1' is start of time, '2' is end of time, 'd' is delay between
+ * 1 and 2 (during which task B was running), 'nr' is number of tasks
+ * running, 'lat' is the the period of each task. ('lat' is the
+ * sched_latency that we aim for.)
+ */
+static long
+sched_granularity(struct cfs_rq *cfs_rq)
+{
+        unsigned int gran = sysctl_sched_latency;
+        unsigned int nr = cfs_rq->nr_running;
+
+        if (nr > 1) {
+                gran = gran/nr - gran/nr/nr;
+                gran = max(gran, sysctl_sched_min_granularity);
+        }
+
+        return gran;
+}
+
+/*
  * We rescale the rescheduling granularity of tasks according to their
  * nice level, but only linearly, not exponentially:
  */
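Plugging numbers into the rule above: with the default 20 ms latency target and the 2 ms minimum, two runnable tasks yield gran = 20/2 - 20/4 = 5 ms, four tasks yield 3.75 ms, and the floor takes over once roughly nine tasks are runnable. A quick stand-alone check of the arithmetic (a re-implementation for illustration, not the kernel function itself):

#include <stdio.h>

/* mirrors the rule gran = lat/nr - lat/nr/nr, clamped below
 * by the minimum granularity */
static unsigned int gran(unsigned int lat, unsigned int min_gran,
                          unsigned int nr)
{
        unsigned int g = lat;

        if (nr > 1) {
                g = g/nr - g/nr/nr;
                if (g < min_gran)
                        g = min_gran;
        }
        return g;
}

int main(void)
{
        const unsigned int lat = 20000000;      /* 20 ms default latency */
        const unsigned int min_gran = 2000000;  /* 2 ms default minimum */
        unsigned int nr;

        for (nr = 1; nr <= 8; nr *= 2)
                printf("nr=%u -> gran=%u ns\n", nr, gran(lat, min_gran, nr));
        return 0;
}

This prints 20000000, 5000000, 3750000 and 2187500 ns for 1, 2, 4 and 8 tasks respectively: granularity shrinks as the runqueue fills, so every task still gets on the CPU within the latency target, until the minimum granularity caps how fine the slicing gets.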
@@ -303,10 +354,10 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr)
         delta_fair = calc_delta_fair(delta_exec, lw);
         delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw);
 
-        if (cfs_rq->sleeper_bonus > sysctl_sched_granularity) {
-                delta = min(cfs_rq->sleeper_bonus, (u64)delta_exec);
-                delta = calc_delta_mine(delta, curr->load.weight, lw);
-                delta = min((u64)delta, cfs_rq->sleeper_bonus);
+        if (cfs_rq->sleeper_bonus > sysctl_sched_latency) {
+                delta = min((u64)delta_mine, cfs_rq->sleeper_bonus);
+                delta = min(delta, (unsigned long)(
+                        (long)sysctl_sched_runtime_limit - curr->wait_runtime));
                 cfs_rq->sleeper_bonus -= delta;
                 delta_mine -= delta;
         }
@@ -494,6 +545,13 @@ static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
         unsigned long load = cfs_rq->load.weight, delta_fair;
         long prev_runtime;
 
+        /*
+         * Do not boost sleepers if there's too much bonus 'in flight'
+         * already:
+         */
+        if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit))
+                return;
+
         if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG)
                 load = rq_of(cfs_rq)->cpu_load[2];
 
@@ -513,16 +571,13 @@ static void __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
         prev_runtime = se->wait_runtime;
         __add_wait_runtime(cfs_rq, se, delta_fair);
+        schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
         delta_fair = se->wait_runtime - prev_runtime;
 
         /*
          * Track the amount of bonus we've given to sleepers:
          */
         cfs_rq->sleeper_bonus += delta_fair;
-        if (unlikely(cfs_rq->sleeper_bonus > sysctl_sched_runtime_limit))
-                cfs_rq->sleeper_bonus = sysctl_sched_runtime_limit;
-
-        schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
 }
 
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -686,7 +741,8 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
         if (next == curr)
                 return;
 
-        __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity);
+        __check_preempt_curr_fair(cfs_rq, next, curr,
+                                  sched_granularity(cfs_rq));
 }
 
 /**************************************************
@@ -1031,7 +1087,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
          * it will preempt the parent:
          */
         p->se.fair_key = current->se.fair_key -
-                niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1;
+                niced_granularity(&rq->curr->se, sched_granularity(cfs_rq)) - 1;
         /*
          * The first wait is dominated by the child-runs-first logic,
          * so do not credit it with that waiting time yet:
@@ -1044,7 +1100,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
          * -granularity/2, so initialize the task with that:
          */
         if (sysctl_sched_features & SCHED_FEAT_START_DEBIT)
-                p->se.wait_runtime = -(sysctl_sched_granularity / 2);
+                p->se.wait_runtime = -(sched_granularity(cfs_rq) / 2);
 
         __enqueue_entity(cfs_rq, se);
 }
@@ -1057,7 +1113,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
  */
 static void set_curr_task_fair(struct rq *rq)
 {
-        struct sched_entity *se = &rq->curr.se;
+        struct sched_entity *se = &rq->curr->se;
 
         for_each_sched_entity(se)
                 set_next_entity(cfs_rq_of(se), se);
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index dcdcad632fd9..4b87476a02d0 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -207,10 +207,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
                 return;
 
         p->time_slice = static_prio_timeslice(p->static_prio);
-        set_tsk_need_resched(p);
 
-        /* put it at the end of the queue: */
-        requeue_task_rt(rq, p);
+        /*
+         * Requeue to the end of queue if we are not the only element
+         * on the queue:
+         */
+        if (p->run_list.prev != p->run_list.next) {
+                requeue_task_rt(rq, p);
+                set_tsk_need_resched(p);
+        }
 }
 
 static struct sched_class rt_sched_class __read_mostly = {
diff --git a/kernel/signal.c b/kernel/signal.c
index b27c01a66448..ad63109e413c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -378,7 +378,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
         /* We only dequeue private signals from ourselves, we don't let
          * signalfd steal them
          */
-        if (tsk == current)
+        if (likely(tsk == current))
                 signr = __dequeue_signal(&tsk->pending, mask, info);
         if (!signr) {
                 signr = __dequeue_signal(&tsk->signal->shared_pending,
@@ -425,7 +425,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
                 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT))
                         tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
         }
-        if ( signr &&
+        if (signr && likely(tsk == current) &&
             ((info->si_code & __SI_MASK) == __SI_TIMER) &&
             info->si_sys_private){
                 /*
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8bdb8c07e04f..6ace893c17c9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -27,7 +27,6 @@
 #include <linux/capability.h>
 #include <linux/ctype.h>
 #include <linux/utsname.h>
-#include <linux/capability.h>
 #include <linux/smp_lock.h>
 #include <linux/fs.h>
 #include <linux/init.h>
@@ -223,8 +222,19 @@ static ctl_table kern_table[] = {
 #ifdef CONFIG_SCHED_DEBUG
         {
                 .ctl_name       = CTL_UNNUMBERED,
-                .procname       = "sched_granularity_ns",
-                .data           = &sysctl_sched_granularity,
+                .procname       = "sched_min_granularity_ns",
+                .data           = &sysctl_sched_min_granularity,
+                .maxlen         = sizeof(unsigned int),
+                .mode           = 0644,
+                .proc_handler   = &proc_dointvec_minmax,
+                .strategy       = &sysctl_intvec,
+                .extra1         = &min_sched_granularity_ns,
+                .extra2         = &max_sched_granularity_ns,
+        },
+        {
+                .ctl_name       = CTL_UNNUMBERED,
+                .procname       = "sched_latency_ns",
+                .data           = &sysctl_sched_latency,
                 .maxlen         = sizeof(unsigned int),
                 .mode           = 0644,
                 .proc_handler   = &proc_dointvec_minmax,
@@ -284,6 +294,15 @@ static ctl_table kern_table[] = {
                 .mode           = 0644,
                 .proc_handler   = &proc_dointvec,
         },
+        {
+                .ctl_name       = CTL_UNNUMBERED,
+                .procname       = "sched_features",
+                .data           = &sysctl_sched_features,
+                .maxlen         = sizeof(unsigned int),
+                .mode           = 0644,
+                .proc_handler   = &proc_dointvec,
+        },
+#endif
 #ifdef CONFIG_PROVE_LOCKING
         {
                 .ctl_name       = CTL_UNNUMBERED,
@@ -305,15 +324,6 @@ static ctl_table kern_table[] = {
         },
 #endif
         {
-                .ctl_name       = CTL_UNNUMBERED,
-                .procname       = "sched_features",
-                .data           = &sysctl_sched_features,
-                .maxlen         = sizeof(unsigned int),
-                .mode           = 0644,
-                .proc_handler   = &proc_dointvec,
-        },
-#endif
-        {
                 .ctl_name       = KERN_PANIC,
                 .procname       = "panic",
                 .data           = &panic_timeout,
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 58e5c152a6bb..e080d1d744cc 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -635,7 +635,7 @@ int keventd_up(void)
 int current_is_keventd(void)
 {
         struct cpu_workqueue_struct *cwq;
-        int cpu = smp_processor_id(); /* preempt-safe: keventd is per-cpu */
+        int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */
         int ret = 0;
 
         BUG_ON(!keventd_wq);
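The task_tick_rt() change near the top of this last group of diffs leans on a property of the kernel's circular doubly-linked lists: a node that is the only element on its list has prev == next (both point at the list head), so p->run_list.prev != p->run_list.next is a cheap "is anyone else queued at this priority?" test that avoids a pointless requeue and reschedule. A minimal demonstration of that property with a simplified node type (not the kernel's <linux/list.h>):

#include <stdio.h>

/* simplified circular doubly-linked list node, in the style of list_head */
struct list_node {
        struct list_node *prev, *next;
};

static void list_init(struct list_node *head)
{
        head->prev = head->next = head; /* empty list points at itself */
}

static void list_add_tail(struct list_node *n, struct list_node *head)
{
        n->prev = head->prev;
        n->next = head;
        head->prev->next = n;
        head->prev = n;
}

int main(void)
{
        struct list_node queue, a, b;

        list_init(&queue);
        list_add_tail(&a, &queue);
        /* sole element: both neighbours are the list head, prev == next */
        printf("alone: prev!=next -> %d\n", a.prev != a.next);  /* prints 0 */

        list_add_tail(&b, &queue);
        /* two elements: the neighbours now differ */
        printf("shared: prev!=next -> %d\n", a.prev != a.next); /* prints 1 */
        return 0;
}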