author     David S. Miller <davem@davemloft.net>   2008-04-03 14:33:42 -0700
committer  David S. Miller <davem@davemloft.net>   2008-04-03 14:33:42 -0700
commit     3bb5da3837cc1aa17736b05139c9a22c3794851a
tree       c92d5684a866542b1cb20641607ac1643ce03a47 /kernel
parent     7feb49c82a74bc7c091b8ab2a3f96baa33d08ece
parent     9597362d354f8655ece324b01d0c640a0e99c077

Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-2.6
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/acct.c               23
-rw-r--r--  kernel/audit.c               6
-rw-r--r--  kernel/cgroup.c              4
-rw-r--r--  kernel/fork.c                2
-rw-r--r--  kernel/futex.c               6
-rw-r--r--  kernel/futex_compat.c        2
-rw-r--r--  kernel/marker.c             40
-rw-r--r--  kernel/printk.c             83
-rw-r--r--  kernel/relay.c              12
-rw-r--r--  kernel/sched.c              71
-rw-r--r--  kernel/sched_debug.c         1
-rw-r--r--  kernel/sched_fair.c        205
-rw-r--r--  kernel/time/clocksource.c   16
-rw-r--r--  kernel/time/timekeeping.c    4
-rw-r--r--  kernel/timer.c              10
15 files changed, 306 insertions, 179 deletions
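
The heaviest entry in the diffstat above is kernel/sched_fair.c, whose hunks below move the affine-wakeup decision into a new wake_affine() helper keyed on a per-entity avg_overlap running average. As a rough, self-contained sketch of just that averaging step (the userspace wrapper, the sample values, and the printout are illustrative assumptions, not part of the patch):

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

/* Same 1/8-weight running average the patch adds as update_avg(). */
static void update_avg(uint64_t *avg, uint64_t sample)
{
	int64_t diff = sample - *avg;	/* s64 diff, as in the patch */

	*avg += diff >> 3;		/* each new sample carries 1/8 weight */
}

int main(void)
{
	/* Hypothetical wakeup-overlap samples, in nanoseconds. */
	uint64_t samples[] = { 800000, 200000, 150000, 120000 };
	uint64_t avg_overlap = 0;	/* __sched_fork() starts tasks at 0 */
	size_t i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		update_avg(&avg_overlap, samples[i]);
		printf("sample %" PRIu64 " ns -> avg_overlap %" PRIu64 " ns\n",
		       samples[i], avg_overlap);
	}
	return 0;
}

The shift by three makes avg_overlap an exponentially weighted average of how long a task recently kept running after waking its partner; on sync wakeups, wake_affine() below treats the pair as affine when both the waker's and the wakee's avg_overlap stay under sysctl_sched_migration_cost (500000 ns), and otherwise falls back to the load-based imbalance check.
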
diff --git a/kernel/acct.c b/kernel/acct.c index 521dfa53cb99..91e1cfd734d2 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -58,6 +58,7 @@ #include <asm/uaccess.h> #include <asm/div64.h> #include <linux/blkdev.h> /* sector_div */ +#include <linux/pid_namespace.h> /* * These constants control the amount of freespace that suspend and @@ -74,7 +75,7 @@ int acct_parm[3] = {4, 2, 30}; /* * External references and all of the globals. */ -static void do_acct_process(struct file *); +static void do_acct_process(struct pid_namespace *ns, struct file *); /* * This structure is used so that all the data protected by lock @@ -86,6 +87,7 @@ struct acct_glbs { volatile int active; volatile int needcheck; struct file *file; + struct pid_namespace *ns; struct timer_list timer; }; @@ -175,9 +177,11 @@ out: static void acct_file_reopen(struct file *file) { struct file *old_acct = NULL; + struct pid_namespace *old_ns = NULL; if (acct_globals.file) { old_acct = acct_globals.file; + old_ns = acct_globals.ns; del_timer(&acct_globals.timer); acct_globals.active = 0; acct_globals.needcheck = 0; @@ -185,6 +189,7 @@ static void acct_file_reopen(struct file *file) } if (file) { acct_globals.file = file; + acct_globals.ns = get_pid_ns(task_active_pid_ns(current)); acct_globals.needcheck = 0; acct_globals.active = 1; /* It's been deleted if it was used before so this is safe */ @@ -196,8 +201,9 @@ static void acct_file_reopen(struct file *file) if (old_acct) { mnt_unpin(old_acct->f_path.mnt); spin_unlock(&acct_globals.lock); - do_acct_process(old_acct); + do_acct_process(old_ns, old_acct); filp_close(old_acct, NULL); + put_pid_ns(old_ns); spin_lock(&acct_globals.lock); } } @@ -419,7 +425,7 @@ static u32 encode_float(u64 value) /* * do_acct_process does all actual work. Caller holds the reference to file. 
*/ -static void do_acct_process(struct file *file) +static void do_acct_process(struct pid_namespace *ns, struct file *file) { struct pacct_struct *pacct = &current->signal->pacct; acct_t ac; @@ -481,8 +487,10 @@ static void do_acct_process(struct file *file) ac.ac_gid16 = current->gid; #endif #if ACCT_VERSION==3 - ac.ac_pid = current->tgid; - ac.ac_ppid = current->real_parent->tgid; + ac.ac_pid = task_tgid_nr_ns(current, ns); + rcu_read_lock(); + ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); + rcu_read_unlock(); #endif spin_lock_irq(&current->sighand->siglock); @@ -578,6 +586,7 @@ void acct_collect(long exitcode, int group_dead) void acct_process(void) { struct file *file = NULL; + struct pid_namespace *ns; /* * accelerate the common fastpath: */ @@ -592,8 +601,10 @@ void acct_process(void) return; } get_file(file); + ns = get_pid_ns(acct_globals.ns); spin_unlock(&acct_globals.lock); - do_acct_process(file); + do_acct_process(ns, file); fput(file); + put_pid_ns(ns); } diff --git a/kernel/audit.c b/kernel/audit.c index be55cb503633..b782b046543d 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1269,8 +1269,8 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen, /** * audit_string_contains_control - does a string need to be logged in hex - * @string - string to be checked - * @len - max length of the string to check + * @string: string to be checked + * @len: max length of the string to check */ int audit_string_contains_control(const char *string, size_t len) { @@ -1285,7 +1285,7 @@ int audit_string_contains_control(const char *string, size_t len) /** * audit_log_n_untrustedstring - log a string that may contain random characters * @ab: audit_buffer - * @len: lenth of string (not including trailing null) + * @len: length of string (not including trailing null) * @string: string to be logged * * This code will escape a string that is passed to it if the string diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e9c2fb01e89b..53d86b4b0ce0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2082,7 +2082,7 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file) kfree(pidarray); } else { - ctr->buf = 0; + ctr->buf = NULL; ctr->bufsz = 0; } file->private_data = ctr; @@ -2614,7 +2614,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) static int cgroupstats_open(struct inode *inode, struct file *file) { - return single_open(file, proc_cgroupstats_show, 0); + return single_open(file, proc_cgroupstats_show, NULL); } static struct file_operations proc_cgroupstats_operations = { diff --git a/kernel/fork.c b/kernel/fork.c index dd249c37b3a3..9c042f901570 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -394,7 +394,6 @@ void __mmdrop(struct mm_struct *mm) { BUG_ON(mm == &init_mm); mm_free_pgd(mm); - mm_free_cgroup(mm); destroy_context(mm); free_mm(mm); } @@ -416,6 +415,7 @@ void mmput(struct mm_struct *mm) spin_unlock(&mmlist_lock); } put_swap_token(mm); + mm_free_cgroup(mm); mmdrop(mm); } } diff --git a/kernel/futex.c b/kernel/futex.c index 06968cd79200..e43945e995f5 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -281,7 +281,7 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, */ static void get_futex_key_refs(union futex_key *key) { - if (key->both.ptr == 0) + if (key->both.ptr == NULL) return; switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: @@ -2158,7 +2158,7 @@ static struct file_system_type futex_fs_type = { .kill_sb = kill_anon_super, }; -static int __init
init(void) +static int __init futex_init(void) { u32 curval; int i; @@ -2194,4 +2194,4 @@ static int __init init(void) return 0; } -__initcall(init); +__initcall(futex_init); diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index ff90f049f8f6..04ac3a9e42cf 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -30,7 +30,7 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, return 0; } -static void __user *futex_uaddr(struct robust_list *entry, +static void __user *futex_uaddr(struct robust_list __user *entry, compat_long_t futex_offset) { compat_uptr_t base = ptr_to_compat(entry); diff --git a/kernel/marker.c b/kernel/marker.c index 48a4ea5afffd..005b95954593 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -104,18 +104,18 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, char ptype; /* - * disabling preemption to make sure the teardown of the callbacks can - * be done correctly when they are in modules and they insure RCU read - * coherency. + * preempt_disable does two things : disabling preemption to make sure + * the teardown of the callbacks can be done correctly when they are in + * modules and they insure RCU read coherency. */ preempt_disable(); - ptype = ACCESS_ONCE(mdata->ptype); + ptype = mdata->ptype; if (likely(!ptype)) { marker_probe_func *func; /* Must read the ptype before ptr. They are not data dependant, * so we put an explicit smp_rmb() here. */ smp_rmb(); - func = ACCESS_ONCE(mdata->single.func); + func = mdata->single.func; /* Must read the ptr before private data. They are not data * dependant, so we put an explicit smp_rmb() here. */ smp_rmb(); @@ -133,7 +133,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, * in the fast path, so put the explicit barrier here. */ smp_read_barrier_depends(); - multi = ACCESS_ONCE(mdata->multi); + multi = mdata->multi; for (i = 0; multi[i].func; i++) { va_start(args, fmt); multi[i].func(multi[i].probe_private, call_private, fmt, @@ -161,13 +161,13 @@ void marker_probe_cb_noarg(const struct marker *mdata, char ptype; preempt_disable(); - ptype = ACCESS_ONCE(mdata->ptype); + ptype = mdata->ptype; if (likely(!ptype)) { marker_probe_func *func; /* Must read the ptype before ptr. They are not data dependant, * so we put an explicit smp_rmb() here. */ smp_rmb(); - func = ACCESS_ONCE(mdata->single.func); + func = mdata->single.func; /* Must read the ptr before private data. They are not data * dependant, so we put an explicit smp_rmb() here. */ smp_rmb(); @@ -183,7 +183,7 @@ void marker_probe_cb_noarg(const struct marker *mdata, * in the fast path, so put the explicit barrier here. */ smp_read_barrier_depends(); - multi = ACCESS_ONCE(mdata->multi); + multi = mdata->multi; for (i = 0; multi[i].func; i++) multi[i].func(multi[i].probe_private, call_private, fmt, &args); @@ -551,9 +551,9 @@ static int set_marker(struct marker_entry **entry, struct marker *elem, /* * Disable a marker and its probe callback. - * Note: only after a synchronize_sched() issued after setting elem->call to the - * empty function insures that the original callback is not used anymore. This - * insured by preemption disabling around the call site. + * Note: only waiting an RCU period after setting elem->call to the empty + * function insures that the original callback is not used anymore. This insured + * by preempt_disable around the call site. 
*/ static void disable_marker(struct marker *elem) { @@ -565,8 +565,8 @@ static void disable_marker(struct marker *elem) elem->ptype = 0; /* single probe */ /* * Leave the private data and id there, because removal is racy and - * should be done only after a synchronize_sched(). These are never used - * until the next initialization anyway. + * should be done only after an RCU period. These are never used until + * the next initialization anyway. */ } @@ -601,9 +601,6 @@ void marker_update_probe_range(struct marker *begin, /* * Update probes, removing the faulty probes. - * Issues a synchronize_sched() when no reference to the module passed - * as parameter is found in the probes so the probe module can be - * safely unloaded from now on. * * Internal callback only changed before the first probe is connected to it. * Single probe private data can only be changed on 0 -> 1 and 2 -> 1 @@ -674,6 +671,9 @@ int marker_probe_register(const char *name, const char *format, entry->rcu_pending = 1; /* write rcu_pending before calling the RCU callback */ smp_wmb(); +#ifdef CONFIG_PREEMPT_RCU + synchronize_sched(); /* Until we have the call_rcu_sched() */ +#endif call_rcu(&entry->rcu, free_old_closure); end: mutex_unlock(&markers_mutex); @@ -717,6 +717,9 @@ int marker_probe_unregister(const char *name, entry->rcu_pending = 1; /* write rcu_pending before calling the RCU callback */ smp_wmb(); +#ifdef CONFIG_PREEMPT_RCU + synchronize_sched(); /* Until we have the call_rcu_sched() */ +#endif call_rcu(&entry->rcu, free_old_closure); remove_marker(name); /* Ignore busy error message */ ret = 0; @@ -795,6 +798,9 @@ int marker_probe_unregister_private_data(marker_probe_func *probe, entry->rcu_pending = 1; /* write rcu_pending before calling the RCU callback */ smp_wmb(); +#ifdef CONFIG_PREEMPT_RCU + synchronize_sched(); /* Until we have the call_rcu_sched() */ +#endif call_rcu(&entry->rcu, free_old_closure); remove_marker(entry->name); /* Ignore busy error message */ end: diff --git a/kernel/printk.c b/kernel/printk.c index 9adc2a473e6e..c46a20a19a15 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -616,6 +616,40 @@ asmlinkage int printk(const char *fmt, ...) /* cpu currently holding logbuf_lock */ static volatile unsigned int printk_cpu = UINT_MAX; +/* + * Can we actually use the console at this time on this cpu? + * + * Console drivers may assume that per-cpu resources have + * been allocated. So unless they're explicitly marked as + * being able to cope (CON_ANYTIME) don't call them until + * this CPU is officially up. + */ +static inline int can_use_console(unsigned int cpu) +{ + return cpu_online(cpu) || have_callable_console(); +} + +/* + * Try to get console ownership to actually show the kernel + * messages from a 'printk'. Return true (and with the + * console_semaphore held, and 'console_locked' set) if it + * is successful, false otherwise. + * + * This gets called with the 'logbuf_lock' spinlock held and + * interrupts disabled. It should return with 'lockbuf_lock' + * released but interrupts still disabled. 
+ */ +static int acquire_console_semaphore_for_printk(unsigned int cpu) +{ + int retval = 0; + + if (can_use_console(cpu)) + retval = !try_acquire_console_sem(); + printk_cpu = UINT_MAX; + spin_unlock(&logbuf_lock); + return retval; +} + const char printk_recursion_bug_msg [] = KERN_CRIT "BUG: recent printk recursion!\n"; static int printk_recursion_bug; @@ -725,43 +759,22 @@ asmlinkage int vprintk(const char *fmt, va_list args) log_level_unknown = 1; } - if (!down_trylock(&console_sem)) { - /* - * We own the drivers. We can drop the spinlock and - * let release_console_sem() print the text, maybe ... - */ - console_locked = 1; - printk_cpu = UINT_MAX; - spin_unlock(&logbuf_lock); + /* + * Try to acquire and then immediately release the + * console semaphore. The release will do all the + * actual magic (print out buffers, wake up klogd, + * etc). + * + * The acquire_console_semaphore_for_printk() function + * will release 'logbuf_lock' regardless of whether it + * actually gets the semaphore or not. + */ + if (acquire_console_semaphore_for_printk(this_cpu)) + release_console_sem(); - /* - * Console drivers may assume that per-cpu resources have - * been allocated. So unless they're explicitly marked as - * being able to cope (CON_ANYTIME) don't call them until - * this CPU is officially up. - */ - if (cpu_online(smp_processor_id()) || have_callable_console()) { - console_may_schedule = 0; - release_console_sem(); - } else { - /* Release by hand to avoid flushing the buffer. */ - console_locked = 0; - up(&console_sem); - } - lockdep_on(); - raw_local_irq_restore(flags); - } else { - /* - * Someone else owns the drivers. We drop the spinlock, which - * allows the semaphore holder to proceed and to call the - * console drivers with the output which we just produced. 
- */ - printk_cpu = UINT_MAX; - spin_unlock(&logbuf_lock); - lockdep_on(); + lockdep_on(); out_restore_irqs: - raw_local_irq_restore(flags); - } + raw_local_irq_restore(flags); preempt_enable(); return printed_len; diff --git a/kernel/relay.c b/kernel/relay.c index d080b9d161a7..d6204a485818 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -736,7 +736,7 @@ static int relay_file_open(struct inode *inode, struct file *filp) kref_get(&buf->kref); filp->private_data = buf; - return 0; + return nonseekable_open(inode, filp); } /** @@ -1056,6 +1056,10 @@ static struct pipe_buf_operations relay_pipe_buf_ops = { .get = generic_pipe_buf_get, }; +static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i) +{ +} + /* * subbuf_splice_actor - splice up to one subbuf's worth of data */ @@ -1066,7 +1070,7 @@ static int subbuf_splice_actor(struct file *in, unsigned int flags, int *nonpad_ret) { - unsigned int pidx, poff, total_len, subbuf_pages, ret; + unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret; struct rchan_buf *rbuf = in->private_data; unsigned int subbuf_size = rbuf->chan->subbuf_size; uint64_t pos = (uint64_t) *ppos; @@ -1083,6 +1087,7 @@ static int subbuf_splice_actor(struct file *in, .partial = partial, .flags = flags, .ops = &relay_pipe_buf_ops, + .spd_release = relay_page_release, }; if (rbuf->subbufs_produced == rbuf->subbufs_consumed) @@ -1097,8 +1102,9 @@ static int subbuf_splice_actor(struct file *in, subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; pidx = (read_start / PAGE_SIZE) % subbuf_pages; poff = read_start & ~PAGE_MASK; + nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS); - for (total_len = 0; spd.nr_pages < subbuf_pages; spd.nr_pages++) { + for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { unsigned int this_len, this_end, private; unsigned int cur_pos = read_start + total_len; diff --git a/kernel/sched.c b/kernel/sched.c index d1ad69b270ca..8dcdec6fe0fe 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -594,18 +594,14 @@ enum { SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, SCHED_FEAT_WAKEUP_PREEMPT = 2, SCHED_FEAT_START_DEBIT = 4, - SCHED_FEAT_TREE_AVG = 8, - SCHED_FEAT_APPROX_AVG = 16, - SCHED_FEAT_HRTICK = 32, - SCHED_FEAT_DOUBLE_TICK = 64, + SCHED_FEAT_HRTICK = 8, + SCHED_FEAT_DOUBLE_TICK = 16, }; const_debug unsigned int sysctl_sched_features = SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 | SCHED_FEAT_WAKEUP_PREEMPT * 1 | SCHED_FEAT_START_DEBIT * 1 | - SCHED_FEAT_TREE_AVG * 0 | - SCHED_FEAT_APPROX_AVG * 0 | SCHED_FEAT_HRTICK * 1 | SCHED_FEAT_DOUBLE_TICK * 0; @@ -1056,6 +1052,49 @@ static void resched_cpu(int cpu) resched_task(cpu_curr(cpu)); spin_unlock_irqrestore(&rq->lock, flags); } + +#ifdef CONFIG_NO_HZ +/* + * When add_timer_on() enqueues a timer into the timer wheel of an + * idle CPU then this timer might expire before the next timer event + * which is scheduled to wake up that CPU. In case of a completely + * idle system the next event might even be infinite time into the + * future. wake_up_idle_cpu() ensures that the CPU is woken up and + * leaves the inner idle loop so the newly added timer is taken into + * account when the CPU goes back to idle and evaluates the timer + * wheel for the next timer event. + */ +void wake_up_idle_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (cpu == smp_processor_id()) + return; + + /* + * This is safe, as this function is called with the timer + * wheel base lock of (cpu) held. 
When the CPU is on the way + * to idle and has not yet set rq->curr to idle then it will + * be serialized on the timer wheel base lock and take the new + * timer into account automatically. + */ + if (rq->curr != rq->idle) + return; + + /* + * We can set TIF_RESCHED on the idle task of the other CPU + * lockless. The worst case is that the other CPU runs the + * idle task through an additional NOOP schedule() + */ + set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED); + + /* NEED_RESCHED must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(rq->idle)) + smp_send_reschedule(cpu); +} +#endif + #else static void __resched_task(struct task_struct *p, int tif_bit) { @@ -1396,6 +1435,12 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) { s64 delta; + /* + * Buddy candidates are cache hot: + */ + if (&p->se == cfs_rq_of(&p->se)->next) + return 1; + if (p->sched_class != &fair_sched_class) return 0; @@ -1855,10 +1900,11 @@ out_activate: schedstat_inc(p, se.nr_wakeups_remote); update_rq_clock(rq); activate_task(rq, p, 1); - check_preempt_curr(rq, p); success = 1; out_running: + check_preempt_curr(rq, p); + p->state = TASK_RUNNING; #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) @@ -1892,6 +1938,8 @@ static void __sched_fork(struct task_struct *p) p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; + p->se.last_wakeup = 0; + p->se.avg_overlap = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; @@ -3877,7 +3925,7 @@ need_resched_nonpreemptible: if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely((prev->state & TASK_INTERRUPTIBLE) && - unlikely(signal_pending(prev)))) { + signal_pending(prev))) { prev->state = TASK_RUNNING; } else { deactivate_task(rq, prev, 1); @@ -6802,6 +6850,10 @@ static int ndoms_cur; /* number of sched domains in 'doms_cur' */ */ static cpumask_t fallback_doms; +void __attribute__((weak)) arch_update_cpu_topology(void) +{ +} + /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. * For now this just excludes isolated cpus, but could be used to @@ -6811,6 +6863,7 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) { int err; + arch_update_cpu_topology(); ndoms_cur = 1; doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); if (!doms_cur) @@ -6915,7 +6968,7 @@ match2: } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -static int arch_reinit_sched_domains(void) +int arch_reinit_sched_domains(void) { int err; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 4b5e24cf2f4a..ef358ba07683 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -288,6 +288,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.exec_start); PN(se.vruntime); PN(se.sum_exec_runtime); + PN(se.avg_overlap); nr_switches = p->nvcsw + p->nivcsw; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f2cc59080efa..86a93376282c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -73,13 +73,13 @@ unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; /* * SCHED_OTHER wake-up granularity. - * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. 
*/ -unsigned int sysctl_sched_wakeup_granularity = 10000000UL; +unsigned int sysctl_sched_wakeup_granularity = 5000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; @@ -302,11 +302,6 @@ static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) return vslice; } -static u64 sched_vslice(struct cfs_rq *cfs_rq) -{ - return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running); -} - static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) { return __sched_vslice(cfs_rq->load.weight + se->load.weight, @@ -504,15 +499,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) } else vruntime = cfs_rq->min_vruntime; - if (sched_feat(TREE_AVG)) { - struct sched_entity *last = __pick_last_entity(cfs_rq); - if (last) { - vruntime += last->vruntime; - vruntime >>= 1; - } - } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running) - vruntime += sched_vslice(cfs_rq)/2; - /* * The 'current' period is already promised to the current tasks, * however the extra weight of the new task will slow them down a @@ -556,6 +542,21 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) account_entity_enqueue(cfs_rq, se); } +static void update_avg(u64 *avg, u64 sample) +{ + s64 diff = sample - *avg; + *avg += diff >> 3; +} + +static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + if (!se->last_wakeup) + return; + + update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup); + se->last_wakeup = 0; +} + static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) { @@ -566,6 +567,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) update_stats_dequeue(cfs_rq, se); if (sleep) { + update_avg_stats(cfs_rq, se); #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); @@ -980,96 +982,121 @@ static inline int wake_idle(int cpu, struct task_struct *p) #endif #ifdef CONFIG_SMP -static int select_task_rq_fair(struct task_struct *p, int sync) + +static const struct sched_class fair_sched_class; + +static int +wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, + struct task_struct *p, int prev_cpu, int this_cpu, int sync, + int idx, unsigned long load, unsigned long this_load, + unsigned int imbalance) { - int cpu, this_cpu; - struct rq *rq; - struct sched_domain *sd, *this_sd = NULL; - int new_cpu; + struct task_struct *curr = this_rq->curr; + unsigned long tl = this_load; + unsigned long tl_per_task; + + if (!(this_sd->flags & SD_WAKE_AFFINE)) + return 0; + + /* + * If the currently running task will sleep within + * a reasonable amount of time then attract this newly + * woken task: + */ + if (sync && curr->sched_class == &fair_sched_class) { + if (curr->se.avg_overlap < sysctl_sched_migration_cost && + p->se.avg_overlap < sysctl_sched_migration_cost) + return 1; + } + + schedstat_inc(p, se.nr_wakeups_affine_attempts); + tl_per_task = cpu_avg_load_per_task(this_cpu); + + /* + * If sync wakeup then subtract the (maximum possible) + * effect of the currently running task from the load + * of the current CPU: + */ + if (sync) + tl -= current->se.load.weight; + + if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || + 100*(tl + p->se.load.weight) <= imbalance*load) { + /* + * This domain has SD_WAKE_AFFINE and + * p is cache cold in this domain, and + * there is no bad imbalance. 
+ */ + schedstat_inc(this_sd, ttwu_move_affine); + schedstat_inc(p, se.nr_wakeups_affine); - cpu = task_cpu(p); - rq = task_rq(p); - this_cpu = smp_processor_id(); - new_cpu = cpu; + return 1; + } + return 0; +} - if (cpu == this_cpu) - goto out_set_cpu; +static int select_task_rq_fair(struct task_struct *p, int sync) +{ + struct sched_domain *sd, *this_sd = NULL; + int prev_cpu, this_cpu, new_cpu; + unsigned long load, this_load; + struct rq *rq, *this_rq; + unsigned int imbalance; + int idx; + + prev_cpu = task_cpu(p); + rq = task_rq(p); + this_cpu = smp_processor_id(); + this_rq = cpu_rq(this_cpu); + new_cpu = prev_cpu; + /* + * 'this_sd' is the first domain that both + * this_cpu and prev_cpu are present in: + */ for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { + if (cpu_isset(prev_cpu, sd->span)) { this_sd = sd; break; } } if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) - goto out_set_cpu; + goto out; /* * Check for affine wakeup and passive balancing possibilities. */ - if (this_sd) { - int idx = this_sd->wake_idx; - unsigned int imbalance; - unsigned long load, this_load; - - imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; - - load = source_load(cpu, idx); - this_load = target_load(this_cpu, idx); - - new_cpu = this_cpu; /* Wake to this CPU if we can */ - - if (this_sd->flags & SD_WAKE_AFFINE) { - unsigned long tl = this_load; - unsigned long tl_per_task; - - /* - * Attract cache-cold tasks on sync wakeups: - */ - if (sync && !task_hot(p, rq->clock, this_sd)) - goto out_set_cpu; - - schedstat_inc(p, se.nr_wakeups_affine_attempts); - tl_per_task = cpu_avg_load_per_task(this_cpu); - - /* - * If sync wakeup then subtract the (maximum possible) - * effect of the currently running task from the load - * of the current CPU: - */ - if (sync) - tl -= current->se.load.weight; - - if ((tl <= load && - tl + target_load(cpu, idx) <= tl_per_task) || - 100*(tl + p->se.load.weight) <= imbalance*load) { - /* - * This domain has SD_WAKE_AFFINE and - * p is cache cold in this domain, and - * there is no bad imbalance. - */ - schedstat_inc(this_sd, ttwu_move_affine); - schedstat_inc(p, se.nr_wakeups_affine); - goto out_set_cpu; - } - } + if (!this_sd) + goto out; - /* - * Start passive balancing when half the imbalance_pct - * limit is reached. - */ - if (this_sd->flags & SD_WAKE_BALANCE) { - if (imbalance*this_load <= 100*load) { - schedstat_inc(this_sd, ttwu_move_balance); - schedstat_inc(p, se.nr_wakeups_passive); - goto out_set_cpu; - } + idx = this_sd->wake_idx; + + imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; + + load = source_load(prev_cpu, idx); + this_load = target_load(this_cpu, idx); + + if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, + load, this_load, imbalance)) + return this_cpu; + + if (prev_cpu == this_cpu) + goto out; + + /* + * Start passive balancing when half the imbalance_pct + * limit is reached. + */ + if (this_sd->flags & SD_WAKE_BALANCE) { + if (imbalance*this_load <= 100*load) { + schedstat_inc(this_sd, ttwu_move_balance); + schedstat_inc(p, se.nr_wakeups_passive); + return this_cpu; } } - new_cpu = cpu; /* Could not wake to this_cpu. 
Wake to cpu instead */ -out_set_cpu: +out: return wake_idle(new_cpu, p); } #endif /* CONFIG_SMP */ @@ -1092,6 +1119,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) return; } + se->last_wakeup = se->sum_exec_runtime; + if (unlikely(se == pse)) + return; + cfs_rq_of(pse)->next = pse; /* diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 548c436a776b..7f60097d443a 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -141,13 +141,8 @@ static void clocksource_watchdog(unsigned long data) } if (!list_empty(&watchdog_list)) { - /* Cycle through CPUs to check if the CPUs stay synchronized to - * each other. */ - int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map); - if (next_cpu >= NR_CPUS) - next_cpu = first_cpu(cpu_online_map); - watchdog_timer.expires += WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, next_cpu); + __mod_timer(&watchdog_timer, + watchdog_timer.expires + WATCHDOG_INTERVAL); } spin_unlock(&watchdog_lock); } @@ -169,7 +164,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) if (!started && watchdog) { watchdog_last = watchdog->read(); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, first_cpu(cpu_online_map)); + add_timer(&watchdog_timer); } } else { if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) @@ -179,7 +174,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) if (watchdog) del_timer(&watchdog_timer); watchdog = cs; - init_timer_deferrable(&watchdog_timer); + init_timer(&watchdog_timer); watchdog_timer.function = clocksource_watchdog; /* Reset watchdog cycles */ @@ -190,8 +185,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) watchdog_last = watchdog->read(); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, - first_cpu(cpu_online_map)); + add_timer(&watchdog_timer); } } } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 671af612b768..a3fa587c350c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -191,8 +191,12 @@ static void change_clocksource(void) tick_clock_notify(); + /* + * We're holding xtime lock and waking up klogd would deadlock + * us on enqueue. So no printing! printk(KERN_INFO "Time: %s clocksource has been installed.\n", clock->name); + */ } #else static inline void change_clocksource(void) { } diff --git a/kernel/timer.c b/kernel/timer.c index 99b00a25f88b..b024106daa70 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -451,10 +451,18 @@ void add_timer_on(struct timer_list *timer, int cpu) spin_lock_irqsave(&base->lock, flags); timer_set_base(timer, base); internal_add_timer(base, timer); + /* + * Check whether the other CPU is idle and needs to be + * triggered to reevaluate the timer wheel when nohz is + * active. We are protected against the other CPU fiddling + * with the timer by holding the timer base lock. This also + * makes sure that a CPU on the way to idle can not evaluate + * the timer wheel. + */ + wake_up_idle_cpu(cpu); spin_unlock_irqrestore(&base->lock, flags); } - /** * mod_timer - modify a timer's timeout * @timer: the timer to be modified |