diff options
Diffstat (limited to 'kernel')
64 files changed, 2542 insertions, 1512 deletions
diff --git a/kernel/acct.c b/kernel/acct.c index dd68b9059418..f6006a60df5d 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -548,7 +548,7 @@ static void do_acct_process(struct bsd_acct_struct *acct, #endif spin_lock_irq(¤t->sighand->siglock); - tty = current->signal->tty; + tty = current->signal->tty; /* Safe as we hold the siglock */ ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 59cedfb040e7..cf5bc2f5f9c3 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -246,8 +246,8 @@ static int audit_match_perm(struct audit_context *ctx, int mask) unsigned n; if (unlikely(!ctx)) return 0; - n = ctx->major; + switch (audit_classify_syscall(ctx->arch, n)) { case 0: /* native */ if ((mask & AUDIT_PERM_WRITE) && @@ -1204,13 +1204,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", context->return_code); - mutex_lock(&tty_mutex); - read_lock(&tasklist_lock); + spin_lock_irq(&tsk->sighand->siglock); if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) tty = tsk->signal->tty->name; else tty = "(none)"; - read_unlock(&tasklist_lock); + spin_unlock_irq(&tsk->sighand->siglock); + audit_log_format(ab, " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" " ppid=%d pid=%d auid=%u uid=%u gid=%u" @@ -1230,7 +1230,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->egid, context->sgid, context->fsgid, tty, tsk->sessionid); - mutex_unlock(&tty_mutex); audit_log_task_info(ab, tsk); if (context->filterkey) { diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 13932abde159..8c6e1c17e6d3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2735,21 +2735,24 @@ void cgroup_fork_callbacks(struct task_struct *child) * Called on every change to mm->owner. mm_init_owner() does not * invoke this routine, since it assigns the mm->owner the first time * and does not change it. + * + * The callbacks are invoked with mmap_sem held in read mode. */ void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) { - struct cgroup *oldcgrp, *newcgrp; + struct cgroup *oldcgrp, *newcgrp = NULL; if (need_mm_owner_callback) { int i; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; oldcgrp = task_cgroup(old, ss->subsys_id); - newcgrp = task_cgroup(new, ss->subsys_id); + if (new) + newcgrp = task_cgroup(new, ss->subsys_id); if (oldcgrp == newcgrp) continue; if (ss->mm_owner_changed) - ss->mm_owner_changed(ss, oldcgrp, newcgrp); + ss->mm_owner_changed(ss, oldcgrp, newcgrp, new); } } } diff --git a/kernel/compat.c b/kernel/compat.c index 32c254a8ab9a..8eafe3eb50d9 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -23,9 +23,68 @@ #include <linux/timex.h> #include <linux/migrate.h> #include <linux/posix-timers.h> +#include <linux/times.h> #include <asm/uaccess.h> +/* + * Note that the native side is already converted to a timespec, because + * that's what we want anyway. + */ +static int compat_get_timeval(struct timespec *o, + struct compat_timeval __user *i) +{ + long usec; + + if (get_user(o->tv_sec, &i->tv_sec) || + get_user(usec, &i->tv_usec)) + return -EFAULT; + o->tv_nsec = usec * 1000; + return 0; +} + +static int compat_put_timeval(struct compat_timeval __user *o, + struct timeval *i) +{ + return (put_user(i->tv_sec, &o->tv_sec) || + put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; +} + +asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, + struct timezone __user *tz) +{ + if (tv) { + struct timeval ktv; + do_gettimeofday(&ktv); + if (compat_put_timeval(tv, &ktv)) + return -EFAULT; + } + if (tz) { + if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) + return -EFAULT; + } + + return 0; +} + +asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, + struct timezone __user *tz) +{ + struct timespec kts; + struct timezone ktz; + + if (tv) { + if (compat_get_timeval(&kts, tv)) + return -EFAULT; + } + if (tz) { + if (copy_from_user(&ktz, tz, sizeof(ktz))) + return -EFAULT; + } + + return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); +} + int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) { return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || @@ -150,49 +209,23 @@ asmlinkage long compat_sys_setitimer(int which, return 0; } +static compat_clock_t clock_t_to_compat_clock_t(clock_t x) +{ + return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); +} + asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) { - /* - * In the SMP world we might just be unlucky and have one of - * the times increment as we use it. Since the value is an - * atomically safe type this is just fine. Conceptually its - * as if the syscall took an instant longer to occur. - */ if (tbuf) { + struct tms tms; struct compat_tms tmp; - struct task_struct *tsk = current; - struct task_struct *t; - cputime_t utime, stime, cutime, cstime; - - read_lock(&tasklist_lock); - utime = tsk->signal->utime; - stime = tsk->signal->stime; - t = tsk; - do { - utime = cputime_add(utime, t->utime); - stime = cputime_add(stime, t->stime); - t = next_thread(t); - } while (t != tsk); - - /* - * While we have tasklist_lock read-locked, no dying thread - * can be updating current->signal->[us]time. Instead, - * we got their counts included in the live thread loop. - * However, another thread can come in right now and - * do a wait call that updates current->signal->c[us]time. - * To make sure we always see that pair updated atomically, - * we take the siglock around fetching them. - */ - spin_lock_irq(&tsk->sighand->siglock); - cutime = tsk->signal->cutime; - cstime = tsk->signal->cstime; - spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); - - tmp.tms_utime = compat_jiffies_to_clock_t(cputime_to_jiffies(utime)); - tmp.tms_stime = compat_jiffies_to_clock_t(cputime_to_jiffies(stime)); - tmp.tms_cutime = compat_jiffies_to_clock_t(cputime_to_jiffies(cutime)); - tmp.tms_cstime = compat_jiffies_to_clock_t(cputime_to_jiffies(cstime)); + + do_sys_times(&tms); + /* Convert our struct tms to the compat version. */ + tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime); + tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime); + tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime); + tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime); if (copy_to_user(tbuf, &tmp, sizeof(tmp))) return -EFAULT; } diff --git a/kernel/cpu.c b/kernel/cpu.c index f17e9854c246..86d49045daed 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -199,13 +199,14 @@ static int __ref take_cpu_down(void *_param) struct take_cpu_down_param *param = _param; int err; - raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, - param->hcpu); /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); if (err < 0) return err; + raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, + param->hcpu); + /* Force idle task to run as soon as we yield: it should immediately notice cpu is offline and die quickly. */ sched_idle_next(); @@ -453,6 +454,25 @@ out: } #endif /* CONFIG_PM_SLEEP_SMP */ +/** + * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers + * @cpu: cpu that just started + * + * This function calls the cpu_chain notifiers with CPU_STARTING. + * It must be called by the arch code on the new cpu, before the new cpu + * enables interrupts and before the "boot" cpu returns from __cpu_up(). + */ +void notify_cpu_starting(unsigned int cpu) +{ + unsigned long val = CPU_STARTING; + +#ifdef CONFIG_PM_SLEEP_SMP + if (cpu_isset(cpu, frozen_cpus)) + val = CPU_STARTING_FROZEN; +#endif /* CONFIG_PM_SLEEP_SMP */ + raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu); +} + #endif /* CONFIG_SMP */ /* diff --git a/kernel/cpuset.c b/kernel/cpuset.c index d5ab79cf516d..eab7bd6628e0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -14,6 +14,8 @@ * 2003-10-22 Updates by Stephen Hemminger. * 2004 May-July Rework by Paul Jackson. * 2006 Rework by Paul Menage to use generic cgroups + * 2008 Rework of the scheduler domains and CPU hotplug handling + * by Max Krasnyansky * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of the Linux @@ -236,9 +238,11 @@ static struct cpuset top_cpuset = { static DEFINE_MUTEX(callback_mutex); -/* This is ugly, but preserves the userspace API for existing cpuset +/* + * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we - * silently switch it to mount "cgroup" instead */ + * silently switch it to mount "cgroup" instead + */ static int cpuset_get_sb(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data, struct vfsmount *mnt) @@ -473,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) } /* - * Helper routine for rebuild_sched_domains(). + * Helper routine for generate_sched_domains(). * Do cpusets a, b have overlapping cpus_allowed masks? */ - static int cpusets_overlap(struct cpuset *a, struct cpuset *b) { return cpus_intersects(a->cpus_allowed, b->cpus_allowed); @@ -518,26 +521,15 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) } /* - * rebuild_sched_domains() - * - * This routine will be called to rebuild the scheduler's dynamic - * sched domains: - * - if the flag 'sched_load_balance' of any cpuset with non-empty - * 'cpus' changes, - * - or if the 'cpus' allowed changes in any cpuset which has that - * flag enabled, - * - or if the 'sched_relax_domain_level' of any cpuset which has - * that flag enabled and with non-empty 'cpus' changes, - * - or if any cpuset with non-empty 'cpus' is removed, - * - or if a cpu gets offlined. - * - * This routine builds a partial partition of the systems CPUs - * (the set of non-overlappping cpumask_t's in the array 'part' - * below), and passes that partial partition to the kernel/sched.c - * partition_sched_domains() routine, which will rebuild the - * schedulers load balancing domains (sched domains) as specified - * by that partial partition. A 'partial partition' is a set of - * non-overlapping subsets whose union is a subset of that set. + * generate_sched_domains() + * + * This function builds a partial partition of the systems CPUs + * A 'partial partition' is a set of non-overlapping subsets whose + * union is a subset of that set. + * The output of this function needs to be passed to kernel/sched.c + * partition_sched_domains() routine, which will rebuild the scheduler's + * load balancing domains (sched domains) as specified by that partial + * partition. * * See "What is sched_load_balance" in Documentation/cpusets.txt * for a background explanation of this. @@ -547,13 +539,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * domains when operating in the severe memory shortage situations * that could cause allocation failures below. * - * Call with cgroup_mutex held. May take callback_mutex during - * call due to the kfifo_alloc() and kmalloc() calls. May nest - * a call to the get_online_cpus()/put_online_cpus() pair. - * Must not be called holding callback_mutex, because we must not - * call get_online_cpus() while holding callback_mutex. Elsewhere - * the kernel nests callback_mutex inside get_online_cpus() calls. - * So the reverse nesting would risk an ABBA deadlock. + * Must be called with cgroup_lock held. * * The three key local variables below are: * q - a linked-list queue of cpuset pointers, used to implement a @@ -588,10 +574,10 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * element of the partition (one sched domain) to be passed to * partition_sched_domains(). */ - -void rebuild_sched_domains(void) +static int generate_sched_domains(cpumask_t **domains, + struct sched_domain_attr **attributes) { - LIST_HEAD(q); /* queue of cpusets to be scanned*/ + LIST_HEAD(q); /* queue of cpusets to be scanned */ struct cpuset *cp; /* scans q */ struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ @@ -601,23 +587,26 @@ void rebuild_sched_domains(void) int ndoms; /* number of sched domains in result */ int nslot; /* next empty doms[] cpumask_t slot */ - csa = NULL; + ndoms = 0; doms = NULL; dattr = NULL; + csa = NULL; /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { - ndoms = 1; doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); if (!doms) - goto rebuild; + goto done; + dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); if (dattr) { *dattr = SD_ATTR_INIT; update_domain_attr_tree(dattr, &top_cpuset); } *doms = top_cpuset.cpus_allowed; - goto rebuild; + + ndoms = 1; + goto done; } csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); @@ -680,61 +669,141 @@ restart: } } - /* Convert <csn, csa> to <ndoms, doms> */ + /* + * Now we know how many domains to create. + * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. + */ doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); - if (!doms) - goto rebuild; + if (!doms) { + ndoms = 0; + goto done; + } + + /* + * The rest of the code, including the scheduler, can deal with + * dattr==NULL case. No need to abort if alloc fails. + */ dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); for (nslot = 0, i = 0; i < csn; i++) { struct cpuset *a = csa[i]; + cpumask_t *dp; int apn = a->pn; - if (apn >= 0) { - cpumask_t *dp = doms + nslot; - - if (nslot == ndoms) { - static int warnings = 10; - if (warnings) { - printk(KERN_WARNING - "rebuild_sched_domains confused:" - " nslot %d, ndoms %d, csn %d, i %d," - " apn %d\n", - nslot, ndoms, csn, i, apn); - warnings--; - } - continue; + if (apn < 0) { + /* Skip completed partitions */ + continue; + } + + dp = doms + nslot; + + if (nslot == ndoms) { + static int warnings = 10; + if (warnings) { + printk(KERN_WARNING + "rebuild_sched_domains confused:" + " nslot %d, ndoms %d, csn %d, i %d," + " apn %d\n", + nslot, ndoms, csn, i, apn); + warnings--; } + continue; + } - cpus_clear(*dp); - if (dattr) - *(dattr + nslot) = SD_ATTR_INIT; - for (j = i; j < csn; j++) { - struct cpuset *b = csa[j]; - - if (apn == b->pn) { - cpus_or(*dp, *dp, b->cpus_allowed); - b->pn = -1; - if (dattr) - update_domain_attr_tree(dattr - + nslot, b); - } + cpus_clear(*dp); + if (dattr) + *(dattr + nslot) = SD_ATTR_INIT; + for (j = i; j < csn; j++) { + struct cpuset *b = csa[j]; + + if (apn == b->pn) { + cpus_or(*dp, *dp, b->cpus_allowed); + if (dattr) + update_domain_attr_tree(dattr + nslot, b); + + /* Done with this partition */ + b->pn = -1; } - nslot++; } + nslot++; } BUG_ON(nslot != ndoms); -rebuild: - /* Have scheduler rebuild sched domains */ +done: + kfree(csa); + + *domains = doms; + *attributes = dattr; + return ndoms; +} + +/* + * Rebuild scheduler domains. + * + * Call with neither cgroup_mutex held nor within get_online_cpus(). + * Takes both cgroup_mutex and get_online_cpus(). + * + * Cannot be directly called from cpuset code handling changes + * to the cpuset pseudo-filesystem, because it cannot be called + * from code that already holds cgroup_mutex. + */ +static void do_rebuild_sched_domains(struct work_struct *unused) +{ + struct sched_domain_attr *attr; + cpumask_t *doms; + int ndoms; + get_online_cpus(); - partition_sched_domains(ndoms, doms, dattr); + + /* Generate domain masks and attrs */ + cgroup_lock(); + ndoms = generate_sched_domains(&doms, &attr); + cgroup_unlock(); + + /* Have scheduler rebuild the domains */ + partition_sched_domains(ndoms, doms, attr); + put_online_cpus(); +} -done: - kfree(csa); - /* Don't kfree(doms) -- partition_sched_domains() does that. */ - /* Don't kfree(dattr) -- partition_sched_domains() does that. */ +static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); + +/* + * Rebuild scheduler domains, asynchronously via workqueue. + * + * If the flag 'sched_load_balance' of any cpuset with non-empty + * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset + * which has that flag enabled, or if any cpuset with a non-empty + * 'cpus' is removed, then call this routine to rebuild the + * scheduler's dynamic sched domains. + * + * The rebuild_sched_domains() and partition_sched_domains() + * routines must nest cgroup_lock() inside get_online_cpus(), + * but such cpuset changes as these must nest that locking the + * other way, holding cgroup_lock() for much of the code. + * + * So in order to avoid an ABBA deadlock, the cpuset code handling + * these user changes delegates the actual sched domain rebuilding + * to a separate workqueue thread, which ends up processing the + * above do_rebuild_sched_domains() function. + */ +static void async_rebuild_sched_domains(void) +{ + schedule_work(&rebuild_sched_domains_work); +} + +/* + * Accomplishes the same scheduler domain rebuild as the above + * async_rebuild_sched_domains(), however it directly calls the + * rebuild routine synchronously rather than calling it via an + * asynchronous work thread. + * + * This can only be called from code that is not holding + * cgroup_mutex (not nested in a cgroup_lock() call.) + */ +void rebuild_sched_domains(void) +{ + do_rebuild_sched_domains(NULL); } /** @@ -774,37 +843,25 @@ static void cpuset_change_cpumask(struct task_struct *tsk, /** * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed + * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() * * Called with cgroup_mutex held * * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, * calling callback functions for each. * - * Return 0 if successful, -errno if not. + * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 + * if @heap != NULL. */ -static int update_tasks_cpumask(struct cpuset *cs) +static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) { struct cgroup_scanner scan; - struct ptr_heap heap; - int retval; - - /* - * cgroup_scan_tasks() will initialize heap->gt for us. - * heap_init() is still needed here for we should not change - * cs->cpus_allowed when heap_init() fails. - */ - retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); - if (retval) - return retval; scan.cg = cs->css.cgroup; scan.test_task = cpuset_test_cpumask; scan.process_task = cpuset_change_cpumask; - scan.heap = &heap; - retval = cgroup_scan_tasks(&scan); - - heap_free(&heap); - return retval; + scan.heap = heap; + cgroup_scan_tasks(&scan); } /** @@ -814,6 +871,7 @@ static int update_tasks_cpumask(struct cpuset *cs) */ static int update_cpumask(struct cpuset *cs, const char *buf) { + struct ptr_heap heap; struct cpuset trialcs; int retval; int is_load_balanced; @@ -848,6 +906,10 @@ static int update_cpumask(struct cpuset *cs, const char *buf) if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) return 0; + retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); + if (retval) + return retval; + is_load_balanced = is_sched_load_balance(&trialcs); mutex_lock(&callback_mutex); @@ -858,12 +920,12 @@ static int update_cpumask(struct cpuset *cs, const char *buf) * Scan tasks in the cpuset, and update the cpumasks of any * that need an update. */ - retval = update_tasks_cpumask(cs); - if (retval < 0) - return retval; + update_tasks_cpumask(cs, &heap); + + heap_free(&heap); if (is_load_balanced) - rebuild_sched_domains(); + async_rebuild_sched_domains(); return 0; } @@ -1090,7 +1152,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) if (val != cs->relax_domain_level) { cs->relax_domain_level = val; if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) - rebuild_sched_domains(); + async_rebuild_sched_domains(); } return 0; @@ -1131,7 +1193,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, mutex_unlock(&callback_mutex); if (cpus_nonempty && balance_flag_changed) - rebuild_sched_domains(); + async_rebuild_sched_domains(); return 0; } @@ -1492,6 +1554,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) default: BUG(); } + + /* Unreachable but makes gcc happy */ + return 0; } static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) @@ -1504,6 +1569,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) default: BUG(); } + + /* Unrechable but makes gcc happy */ + return 0; } @@ -1692,15 +1760,9 @@ static struct cgroup_subsys_state *cpuset_create( } /* - * Locking note on the strange update_flag() call below: - * * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which - * will call rebuild_sched_domains(). The get_online_cpus() - * call in rebuild_sched_domains() must not be made while holding - * callback_mutex. Elsewhere the kernel nests callback_mutex inside - * get_online_cpus() calls. So the reverse nesting would risk an - * ABBA deadlock. + * will call async_rebuild_sched_domains(). */ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) @@ -1719,7 +1781,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) struct cgroup_subsys cpuset_subsys = { .name = "cpuset", .create = cpuset_create, - .destroy = cpuset_destroy, + .destroy = cpuset_destroy, .can_attach = cpuset_can_attach, .attach = cpuset_attach, .populate = cpuset_populate, @@ -1811,7 +1873,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) } /* - * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs + * If CPU and/or memory hotplug handlers, below, unplug any CPUs * or memory nodes, we need to walk over the cpuset hierarchy, * removing that CPU or node from all cpusets. If this removes the * last CPU or node from a cpuset, then move the tasks in the empty @@ -1859,7 +1921,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) * that has tasks along with an empty 'mems'. But if we did see such * a cpuset, we'd handle it just like we do if its 'cpus' was empty. */ -static void scan_for_empty_cpusets(const struct cpuset *root) +static void scan_for_empty_cpusets(struct cpuset *root) { LIST_HEAD(queue); struct cpuset *cp; /* scans cpusets being updated */ @@ -1896,42 +1958,13 @@ static void scan_for_empty_cpusets(const struct cpuset *root) nodes_empty(cp->mems_allowed)) remove_tasks_in_empty_cpuset(cp); else { - update_tasks_cpumask(cp); + update_tasks_cpumask(cp, NULL); update_tasks_nodemask(cp, &oldmems); } } } /* - * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track - * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to - * track what's online after any CPU or memory node hotplug or unplug event. - * - * Since there are two callers of this routine, one for CPU hotplug - * events and one for memory node hotplug events, we could have coded - * two separate routines here. We code it as a single common routine - * in order to minimize text size. - */ - -static void common_cpu_mem_hotplug_unplug(int rebuild_sd) -{ - cgroup_lock(); - - top_cpuset.cpus_allowed = cpu_online_map; - top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; - scan_for_empty_cpusets(&top_cpuset); - - /* - * Scheduler destroys domains on hotplug events. - * Rebuild them based on the current settings. - */ - if (rebuild_sd) - rebuild_sched_domains(); - - cgroup_unlock(); -} - -/* * The top_cpuset tracks what CPUs and Memory Nodes are online, * period. This is necessary in order to make cpusets transparent * (of no affect) on systems that are actively using CPU hotplug @@ -1939,40 +1972,52 @@ static void common_cpu_mem_hotplug_unplug(int rebuild_sd) * * This routine ensures that top_cpuset.cpus_allowed tracks * cpu_online_map on each CPU hotplug (cpuhp) event. + * + * Called within get_online_cpus(). Needs to call cgroup_lock() + * before calling generate_sched_domains(). */ - -static int cpuset_handle_cpuhp(struct notifier_block *unused_nb, +static int cpuset_track_online_cpus(struct notifier_block *unused_nb, unsigned long phase, void *unused_cpu) { + struct sched_domain_attr *attr; + cpumask_t *doms; + int ndoms; + switch (phase) { - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: case CPU_ONLINE: case CPU_ONLINE_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - common_cpu_mem_hotplug_unplug(1); break; + default: return NOTIFY_DONE; } + cgroup_lock(); + top_cpuset.cpus_allowed = cpu_online_map; + scan_for_empty_cpusets(&top_cpuset); + ndoms = generate_sched_domains(&doms, &attr); + cgroup_unlock(); + + /* Have scheduler rebuild the domains */ + partition_sched_domains(ndoms, doms, attr); + return NOTIFY_OK; } #ifdef CONFIG_MEMORY_HOTPLUG /* * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. - * Call this routine anytime after you change - * node_states[N_HIGH_MEMORY]. - * See also the previous routine cpuset_handle_cpuhp(). + * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. + * See also the previous routine cpuset_track_online_cpus(). */ - void cpuset_track_online_nodes(void) { - common_cpu_mem_hotplug_unplug(0); + cgroup_lock(); + top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; + scan_for_empty_cpusets(&top_cpuset); + cgroup_unlock(); } #endif @@ -1987,7 +2032,7 @@ void __init cpuset_init_smp(void) top_cpuset.cpus_allowed = cpu_online_map; top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; - hotcpu_notifier(cpuset_handle_cpuhp, 0); + hotcpu_notifier(cpuset_track_online_cpus, 0); } /** diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c index c1d4d5b4c61c..f013a0c2e111 100644 --- a/kernel/dma-coherent.c +++ b/kernel/dma-coherent.c @@ -124,6 +124,7 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size, } return (mem != NULL); } +EXPORT_SYMBOL(dma_alloc_from_coherent); /** * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool @@ -151,3 +152,4 @@ int dma_release_from_coherent(struct device *dev, int order, void *vaddr) } return 0; } +EXPORT_SYMBOL(dma_release_from_coherent); diff --git a/kernel/dma.c b/kernel/dma.c index d2c60a822790..f903189c5304 100644 --- a/kernel/dma.c +++ b/kernel/dma.c @@ -1,4 +1,4 @@ -/* $Id: dma.c,v 1.7 1994/12/28 03:35:33 root Exp root $ +/* * linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c. * * Written by Hennus Bergman, 1992. diff --git a/kernel/exit.c b/kernel/exit.c index 16395644a98f..059b38cae384 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -112,8 +112,6 @@ static void __exit_signal(struct task_struct *tsk) * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. */ - sig->utime = cputime_add(sig->utime, task_utime(tsk)); - sig->stime = cputime_add(sig->stime, task_stime(tsk)); sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; @@ -122,7 +120,6 @@ static void __exit_signal(struct task_struct *tsk) sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); - sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. */ } @@ -583,8 +580,6 @@ mm_need_new_owner(struct mm_struct *mm, struct task_struct *p) * If there are other users of the mm and the owner (us) is exiting * we need to find a new owner to take on the responsibility. */ - if (!mm) - return 0; if (atomic_read(&mm->mm_users) <= 1) return 0; if (mm->owner != p) @@ -627,29 +622,38 @@ retry: } while_each_thread(g, c); read_unlock(&tasklist_lock); + /* + * We found no owner yet mm_users > 1: this implies that we are + * most likely racing with swapoff (try_to_unuse()) or /proc or + * ptrace or page migration (get_task_mm()). Mark owner as NULL, + * so that subsystems can understand the callback and take action. + */ + down_write(&mm->mmap_sem); + cgroup_mm_owner_callbacks(mm->owner, NULL); + mm->owner = NULL; + up_write(&mm->mmap_sem); return; assign_new_owner: BUG_ON(c == p); get_task_struct(c); + read_unlock(&tasklist_lock); + down_write(&mm->mmap_sem); /* * The task_lock protects c->mm from changing. * We always want mm->owner->mm == mm */ task_lock(c); - /* - * Delay read_unlock() till we have the task_lock() - * to ensure that c does not slip away underneath us - */ - read_unlock(&tasklist_lock); if (c->mm != mm) { task_unlock(c); + up_write(&mm->mmap_sem); put_task_struct(c); goto retry; } cgroup_mm_owner_callbacks(mm->owner, c); mm->owner = c; task_unlock(c); + up_write(&mm->mmap_sem); put_task_struct(c); } #endif /* CONFIG_MM_OWNER */ @@ -1294,6 +1298,7 @@ static int wait_task_zombie(struct task_struct *p, int options, if (likely(!traced)) { struct signal_struct *psig; struct signal_struct *sig; + struct task_cputime cputime; /* * The resource counters for the group leader are in its @@ -1309,20 +1314,23 @@ static int wait_task_zombie(struct task_struct *p, int options, * need to protect the access to p->parent->signal fields, * as other threads in the parent group can be right * here reaping other children at the same time. + * + * We use thread_group_cputime() to get times for the thread + * group, which consolidates times for all threads in the + * group including the group leader. */ spin_lock_irq(&p->parent->sighand->siglock); psig = p->parent->signal; sig = p->signal; + thread_group_cputime(p, &cputime); psig->cutime = cputime_add(psig->cutime, - cputime_add(p->utime, - cputime_add(sig->utime, - sig->cutime))); + cputime_add(cputime.utime, + sig->cutime)); psig->cstime = cputime_add(psig->cstime, - cputime_add(p->stime, - cputime_add(sig->stime, - sig->cstime))); + cputime_add(cputime.stime, + sig->cstime)); psig->cgtime = cputime_add(psig->cgtime, cputime_add(p->gtime, diff --git a/kernel/fork.c b/kernel/fork.c index 7ce2ebe84796..44e64d7ba29b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -759,15 +759,44 @@ void __cleanup_sighand(struct sighand_struct *sighand) kmem_cache_free(sighand_cachep, sighand); } + +/* + * Initialize POSIX timer handling for a thread group. + */ +static void posix_cpu_timers_init_group(struct signal_struct *sig) +{ + /* Thread group counters. */ + thread_group_cputime_init(sig); + + /* Expiration times and increments. */ + sig->it_virt_expires = cputime_zero; + sig->it_virt_incr = cputime_zero; + sig->it_prof_expires = cputime_zero; + sig->it_prof_incr = cputime_zero; + + /* Cached expiration times. */ + sig->cputime_expires.prof_exp = cputime_zero; + sig->cputime_expires.virt_exp = cputime_zero; + sig->cputime_expires.sched_exp = 0; + + /* The timer lists. */ + INIT_LIST_HEAD(&sig->cpu_timers[0]); + INIT_LIST_HEAD(&sig->cpu_timers[1]); + INIT_LIST_HEAD(&sig->cpu_timers[2]); +} + static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) { struct signal_struct *sig; int ret; if (clone_flags & CLONE_THREAD) { - atomic_inc(¤t->signal->count); - atomic_inc(¤t->signal->live); - return 0; + ret = thread_group_cputime_clone_thread(current); + if (likely(!ret)) { + atomic_inc(¤t->signal->count); + atomic_inc(¤t->signal->live); + } + return ret; } sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); tsk->signal = sig; @@ -795,39 +824,25 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->it_real_incr.tv64 = 0; sig->real_timer.function = it_real_fn; - sig->it_virt_expires = cputime_zero; - sig->it_virt_incr = cputime_zero; - sig->it_prof_expires = cputime_zero; - sig->it_prof_incr = cputime_zero; - sig->leader = 0; /* session leadership doesn't inherit */ sig->tty_old_pgrp = NULL; + sig->tty = NULL; - sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; + sig->cutime = sig->cstime = cputime_zero; sig->gtime = cputime_zero; sig->cgtime = cputime_zero; sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; task_io_accounting_init(&sig->ioac); - sig->sum_sched_runtime = 0; - INIT_LIST_HEAD(&sig->cpu_timers[0]); - INIT_LIST_HEAD(&sig->cpu_timers[1]); - INIT_LIST_HEAD(&sig->cpu_timers[2]); taskstats_tgid_init(sig); task_lock(current->group_leader); memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); task_unlock(current->group_leader); - if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { - /* - * New sole thread in the process gets an expiry time - * of the whole CPU time limit. - */ - tsk->it_prof_expires = - secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); - } + posix_cpu_timers_init_group(sig); + acct_init_pacct(&sig->pacct); tty_audit_fork(sig); @@ -837,7 +852,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) void __cleanup_signal(struct signal_struct *sig) { + thread_group_cputime_free(sig); exit_thread_group_keys(sig); + tty_kref_put(sig->tty); kmem_cache_free(signal_cachep, sig); } @@ -886,6 +903,19 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p) #endif /* CONFIG_MM_OWNER */ /* + * Initialize POSIX timer handling for a single task. + */ +static void posix_cpu_timers_init(struct task_struct *tsk) +{ + tsk->cputime_expires.prof_exp = cputime_zero; + tsk->cputime_expires.virt_exp = cputime_zero; + tsk->cputime_expires.sched_exp = 0; + INIT_LIST_HEAD(&tsk->cpu_timers[0]); + INIT_LIST_HEAD(&tsk->cpu_timers[1]); + INIT_LIST_HEAD(&tsk->cpu_timers[2]); +} + +/* * This creates a new process as a copy of the old one, * but does not actually start it yet. * @@ -995,12 +1025,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, task_io_accounting_init(&p->ioac); acct_clear_integrals(p); - p->it_virt_expires = cputime_zero; - p->it_prof_expires = cputime_zero; - p->it_sched_expires = 0; - INIT_LIST_HEAD(&p->cpu_timers[0]); - INIT_LIST_HEAD(&p->cpu_timers[1]); - INIT_LIST_HEAD(&p->cpu_timers[2]); + posix_cpu_timers_init(p); p->lock_depth = -1; /* -1 = no lock */ do_posix_clock_monotonic_gettime(&p->start_time); @@ -1201,21 +1226,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (clone_flags & CLONE_THREAD) { p->group_leader = current->group_leader; list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); - - if (!cputime_eq(current->signal->it_virt_expires, - cputime_zero) || - !cputime_eq(current->signal->it_prof_expires, - cputime_zero) || - current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY || - !list_empty(¤t->signal->cpu_timers[0]) || - !list_empty(¤t->signal->cpu_timers[1]) || - !list_empty(¤t->signal->cpu_timers[2])) { - /* - * Have child wake up on its first tick to check - * for process CPU timers. - */ - p->it_prof_expires = jiffies_to_cputime(1); - } } if (likely(p->pid)) { @@ -1227,7 +1237,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->nsproxy->pid_ns->child_reaper = p; p->signal->leader_pid = pid; - p->signal->tty = current->signal->tty; + tty_kref_put(p->signal->tty); + p->signal->tty = tty_kref_get(current->signal->tty); set_task_pgrp(p, task_pgrp_nr(current)); set_task_session(p, task_session_nr(current)); attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index b8e4dce80a74..95978f48e039 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -672,13 +672,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, */ BUG_ON(timer->function(timer) != HRTIMER_NORESTART); return 1; - case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ: + case HRTIMER_CB_IRQSAFE_PERCPU: + case HRTIMER_CB_IRQSAFE_UNLOCKED: /* * This is solely for the sched tick emulation with * dynamic tick support to ensure that we do not * restart the tick right on the edge and end up with * the tick timer in the softirq ! The calling site - * takes care of this. + * takes care of this. Also used for hrtimer sleeper ! */ debug_hrtimer_deactivate(timer); return 1; @@ -1245,7 +1246,8 @@ static void __run_hrtimer(struct hrtimer *timer) timer_stats_account_hrtimer(timer); fn = timer->function; - if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) { + if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU || + timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) { /* * Used for scheduler timers, avoid lock inversion with * rq->lock and tasklist_lock. @@ -1401,9 +1403,7 @@ void hrtimer_run_queues(void) if (!base->first) continue; - if (base->get_softirq_time) - base->softirq_time = base->get_softirq_time(); - else if (gettime) { + if (gettime) { hrtimer_get_softirq_time(cpu_base); gettime = 0; } @@ -1452,7 +1452,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) sl->timer.function = hrtimer_wakeup; sl->task = task; #ifdef CONFIG_HIGH_RES_TIMERS - sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; #endif } @@ -1591,49 +1591,123 @@ static void __cpuinit init_hrtimers_cpu(int cpu) #ifdef CONFIG_HOTPLUG_CPU -static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, - struct hrtimer_clock_base *new_base) +static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, + struct hrtimer_clock_base *new_base, int dcpu) { struct hrtimer *timer; struct rb_node *node; + int raise = 0; while ((node = rb_first(&old_base->active))) { timer = rb_entry(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); debug_hrtimer_deactivate(timer); - __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0); + + /* + * Should not happen. Per CPU timers should be + * canceled _before_ the migration code is called + */ + if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) { + __remove_hrtimer(timer, old_base, + HRTIMER_STATE_INACTIVE, 0); + WARN(1, "hrtimer (%p %p)active but cpu %d dead\n", + timer, timer->function, dcpu); + continue; + } + + /* + * Mark it as STATE_MIGRATE not INACTIVE otherwise the + * timer could be seen as !active and just vanish away + * under us on another CPU + */ + __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); timer->base = new_base; /* * Enqueue the timer. Allow reprogramming of the event device */ enqueue_hrtimer(timer, new_base, 1); + +#ifdef CONFIG_HIGH_RES_TIMERS + /* + * Happens with high res enabled when the timer was + * already expired and the callback mode is + * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The + * enqueue code does not move them to the soft irq + * pending list for performance/latency reasons, but + * in the migration state, we need to do that + * otherwise we end up with a stale timer. + */ + if (timer->state == HRTIMER_STATE_MIGRATE) { + timer->state = HRTIMER_STATE_PENDING; + list_add_tail(&timer->cb_entry, + &new_base->cpu_base->cb_pending); + raise = 1; + } +#endif + /* Clear the migration state bit */ + timer->state &= ~HRTIMER_STATE_MIGRATE; } + return raise; } +#ifdef CONFIG_HIGH_RES_TIMERS +static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base, + struct hrtimer_cpu_base *new_base) +{ + struct hrtimer *timer; + int raise = 0; + + while (!list_empty(&old_base->cb_pending)) { + timer = list_entry(old_base->cb_pending.next, + struct hrtimer, cb_entry); + + __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0); + timer->base = &new_base->clock_base[timer->base->index]; + list_add_tail(&timer->cb_entry, &new_base->cb_pending); + raise = 1; + } + return raise; +} +#else +static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base, + struct hrtimer_cpu_base *new_base) +{ + return 0; +} +#endif + static void migrate_hrtimers(int cpu) { struct hrtimer_cpu_base *old_base, *new_base; - int i; + int i, raise = 0; BUG_ON(cpu_online(cpu)); old_base = &per_cpu(hrtimer_bases, cpu); new_base = &get_cpu_var(hrtimer_bases); tick_cancel_sched_timer(cpu); - - local_irq_disable(); - spin_lock(&new_base->lock); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. + */ + spin_lock_irq(&new_base->lock); spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - migrate_hrtimer_list(&old_base->clock_base[i], - &new_base->clock_base[i]); + if (migrate_hrtimer_list(&old_base->clock_base[i], + &new_base->clock_base[i], cpu)) + raise = 1; } + if (migrate_hrtimer_pending(old_base, new_base)) + raise = 1; + spin_unlock(&old_base->lock); - spin_unlock(&new_base->lock); - local_irq_enable(); + spin_unlock_irq(&new_base->lock); put_cpu_var(hrtimer_bases); + + if (raise) + hrtimer_raise_softirq(); } #endif /* CONFIG_HOTPLUG_CPU */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0314074fa232..60c49e324390 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -89,7 +89,14 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) set_balance_irq_affinity(irq, cpumask); #ifdef CONFIG_GENERIC_PENDING_IRQ - set_pending_irq(irq, cpumask); + if (desc->status & IRQ_MOVE_PCNTXT) { + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + desc->chip->set_affinity(irq, cpumask); + spin_unlock_irqrestore(&desc->lock, flags); + } else + set_pending_irq(irq, cpumask); #else desc->affinity = cpumask; desc->chip->set_affinity(irq, cpumask); diff --git a/kernel/itimer.c b/kernel/itimer.c index ab982747d9bd..db7c358b9a02 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -55,17 +55,15 @@ int do_getitimer(int which, struct itimerval *value) spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: - read_lock(&tasklist_lock); spin_lock_irq(&tsk->sighand->siglock); cval = tsk->signal->it_virt_expires; cinterval = tsk->signal->it_virt_incr; if (!cputime_eq(cval, cputime_zero)) { - struct task_struct *t = tsk; - cputime_t utime = tsk->signal->utime; - do { - utime = cputime_add(utime, t->utime); - t = next_thread(t); - } while (t != tsk); + struct task_cputime cputime; + cputime_t utime; + + thread_group_cputime(tsk, &cputime); + utime = cputime.utime; if (cputime_le(cval, utime)) { /* about to fire */ cval = jiffies_to_cputime(1); } else { @@ -73,25 +71,19 @@ int do_getitimer(int which, struct itimerval *value) } } spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); cputime_to_timeval(cval, &value->it_value); cputime_to_timeval(cinterval, &value->it_interval); break; case ITIMER_PROF: - read_lock(&tasklist_lock); spin_lock_irq(&tsk->sighand->siglock); cval = tsk->signal->it_prof_expires; cinterval = tsk->signal->it_prof_incr; if (!cputime_eq(cval, cputime_zero)) { - struct task_struct *t = tsk; - cputime_t ptime = cputime_add(tsk->signal->utime, - tsk->signal->stime); - do { - ptime = cputime_add(ptime, - cputime_add(t->utime, - t->stime)); - t = next_thread(t); - } while (t != tsk); + struct task_cputime times; + cputime_t ptime; + + thread_group_cputime(tsk, ×); + ptime = cputime_add(times.utime, times.stime); if (cputime_le(cval, ptime)) { /* about to fire */ cval = jiffies_to_cputime(1); } else { @@ -99,7 +91,6 @@ int do_getitimer(int which, struct itimerval *value) } } spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); cputime_to_timeval(cval, &value->it_value); cputime_to_timeval(cinterval, &value->it_interval); break; @@ -185,7 +176,6 @@ again: case ITIMER_VIRTUAL: nval = timeval_to_cputime(&value->it_value); ninterval = timeval_to_cputime(&value->it_interval); - read_lock(&tasklist_lock); spin_lock_irq(&tsk->sighand->siglock); cval = tsk->signal->it_virt_expires; cinterval = tsk->signal->it_virt_incr; @@ -200,7 +190,6 @@ again: tsk->signal->it_virt_expires = nval; tsk->signal->it_virt_incr = ninterval; spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); if (ovalue) { cputime_to_timeval(cval, &ovalue->it_value); cputime_to_timeval(cinterval, &ovalue->it_interval); @@ -209,7 +198,6 @@ again: case ITIMER_PROF: nval = timeval_to_cputime(&value->it_value); ninterval = timeval_to_cputime(&value->it_interval); - read_lock(&tasklist_lock); spin_lock_irq(&tsk->sighand->siglock); cval = tsk->signal->it_prof_expires; cinterval = tsk->signal->it_prof_incr; @@ -224,7 +212,6 @@ again: tsk->signal->it_prof_expires = nval; tsk->signal->it_prof_incr = ninterval; spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); if (ovalue) { cputime_to_timeval(cval, &ovalue->it_value); cputime_to_timeval(cinterval, &ovalue->it_interval); diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 38fc10ac7541..5072cf1685a2 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -260,7 +260,6 @@ const char *kallsyms_lookup(unsigned long addr, /* see if it's in a module */ return module_address_lookup(addr, symbolsize, offset, modname, namebuf); - return NULL; } int lookup_symbol_name(unsigned long addr, char *symname) diff --git a/kernel/kexec.c b/kernel/kexec.c index 59f3f0df35d4..aef265325cd3 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -753,8 +753,14 @@ static struct page *kimage_alloc_page(struct kimage *image, *old = addr | (*old & ~PAGE_MASK); /* The old page I have found cannot be a - * destination page, so return it. + * destination page, so return it if it's + * gfp_flags honor the ones passed in. */ + if (!(gfp_mask & __GFP_HIGHMEM) && + PageHighMem(old_page)) { + kimage_free_pages(old_page); + continue; + } addr = old_addr; page = old_page; break; diff --git a/kernel/kgdb.c b/kernel/kgdb.c index eaa21fc9ad1d..e4dcfb2272a4 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -488,7 +488,7 @@ static int write_mem_msg(int binary) if (err) return err; if (CACHE_FLUSH_IS_SAFE) - flush_icache_range(addr, addr + length + 1); + flush_icache_range(addr, addr + length); return 0; } @@ -590,6 +590,7 @@ static void kgdb_wait(struct pt_regs *regs) /* Signal the primary CPU that we are done: */ atomic_set(&cpu_in_kgdb[cpu], 0); + touch_softlockup_watchdog(); clocksource_touch_watchdog(); local_irq_restore(flags); } @@ -1432,6 +1433,7 @@ acquirelock: atomic_read(&kgdb_cpu_doing_single_step) != cpu) { atomic_set(&kgdb_active, -1); + touch_softlockup_watchdog(); clocksource_touch_watchdog(); local_irq_restore(flags); @@ -1462,7 +1464,7 @@ acquirelock: * Get the passive CPU lock which will hold all the non-primary * CPU in a spin state while the debugger is active */ - if (!kgdb_single_step || !kgdb_contthread) { + if (!kgdb_single_step) { for (i = 0; i < NR_CPUS; i++) atomic_set(&passive_cpu_wait[i], 1); } @@ -1475,7 +1477,7 @@ acquirelock: #ifdef CONFIG_SMP /* Signal the other CPUs to enter kgdb_wait() */ - if ((!kgdb_single_step || !kgdb_contthread) && kgdb_do_roundup) + if ((!kgdb_single_step) && kgdb_do_roundup) kgdb_roundup_cpus(flags); #endif @@ -1494,7 +1496,7 @@ acquirelock: kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code); kgdb_deactivate_sw_breakpoints(); kgdb_single_step = 0; - kgdb_contthread = NULL; + kgdb_contthread = current; exception_level = 0; /* Talk to debugger with gdbserial protocol */ @@ -1508,7 +1510,7 @@ acquirelock: kgdb_info[ks->cpu].task = NULL; atomic_set(&cpu_in_kgdb[ks->cpu], 0); - if (!kgdb_single_step || !kgdb_contthread) { + if (!kgdb_single_step) { for (i = NR_CPUS-1; i >= 0; i--) atomic_set(&passive_cpu_wait[i], 0); /* @@ -1524,6 +1526,7 @@ acquirelock: kgdb_restore: /* Free kgdb_active */ atomic_set(&kgdb_active, -1); + touch_softlockup_watchdog(); clocksource_touch_watchdog(); local_irq_restore(flags); diff --git a/kernel/kmod.c b/kernel/kmod.c index 2456d1a0befb..3d3c3ea3a023 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -113,7 +113,7 @@ int request_module(const char *fmt, ...) return ret; } EXPORT_SYMBOL(request_module); -#endif /* CONFIG_KMOD */ +#endif /* CONFIG_MODULES */ struct subprocess_info { struct work_struct work; @@ -265,7 +265,7 @@ static void __call_usermodehelper(struct work_struct *work) } } -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP /* * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY * (used for preventing user land processes from being created after the user @@ -288,39 +288,37 @@ static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); */ #define RUNNING_HELPERS_TIMEOUT (5 * HZ) -static int usermodehelper_pm_callback(struct notifier_block *nfb, - unsigned long action, - void *ignored) +/** + * usermodehelper_disable - prevent new helpers from being started + */ +int usermodehelper_disable(void) { long retval; - switch (action) { - case PM_HIBERNATION_PREPARE: - case PM_SUSPEND_PREPARE: - usermodehelper_disabled = 1; - smp_mb(); - /* - * From now on call_usermodehelper_exec() won't start any new - * helpers, so it is sufficient if running_helpers turns out to - * be zero at one point (it may be increased later, but that - * doesn't matter). - */ - retval = wait_event_timeout(running_helpers_waitq, + usermodehelper_disabled = 1; + smp_mb(); + /* + * From now on call_usermodehelper_exec() won't start any new + * helpers, so it is sufficient if running_helpers turns out to + * be zero at one point (it may be increased later, but that + * doesn't matter). + */ + retval = wait_event_timeout(running_helpers_waitq, atomic_read(&running_helpers) == 0, RUNNING_HELPERS_TIMEOUT); - if (retval) { - return NOTIFY_OK; - } else { - usermodehelper_disabled = 0; - return NOTIFY_BAD; - } - case PM_POST_HIBERNATION: - case PM_POST_SUSPEND: - usermodehelper_disabled = 0; - return NOTIFY_OK; - } + if (retval) + return 0; - return NOTIFY_DONE; + usermodehelper_disabled = 0; + return -EAGAIN; +} + +/** + * usermodehelper_enable - allow new helpers to be started again + */ +void usermodehelper_enable(void) +{ + usermodehelper_disabled = 0; } static void helper_lock(void) @@ -334,18 +332,12 @@ static void helper_unlock(void) if (atomic_dec_and_test(&running_helpers)) wake_up(&running_helpers_waitq); } - -static void register_pm_notifier_callback(void) -{ - pm_notifier(usermodehelper_pm_callback, 0); -} -#else /* CONFIG_PM */ +#else /* CONFIG_PM_SLEEP */ #define usermodehelper_disabled 0 static inline void helper_lock(void) {} static inline void helper_unlock(void) {} -static inline void register_pm_notifier_callback(void) {} -#endif /* CONFIG_PM */ +#endif /* CONFIG_PM_SLEEP */ /** * call_usermodehelper_setup - prepare to call a usermode helper @@ -515,5 +507,4 @@ void __init usermodehelper_init(void) { khelper_wq = create_singlethread_workqueue("khelper"); BUG_ON(!khelper_wq); - register_pm_notifier_callback(); } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 75bc2cd9ebc6..8b57a2597f21 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -404,7 +404,7 @@ void kretprobe_hash_lock(struct task_struct *tsk, spin_lock_irqsave(hlist_lock, *flags); } -void kretprobe_table_lock(unsigned long hash, unsigned long *flags) +static void kretprobe_table_lock(unsigned long hash, unsigned long *flags) { spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); spin_lock_irqsave(hlist_lock, *flags); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index e53bc30e9ba5..08dd8ed86c77 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -14,6 +14,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/kexec.h> +#include <linux/profile.h> #include <linux/sched.h> #define KERNEL_ATTR_RO(_name) \ @@ -53,6 +54,37 @@ static ssize_t uevent_helper_store(struct kobject *kobj, KERNEL_ATTR_RW(uevent_helper); #endif +#ifdef CONFIG_PROFILING +static ssize_t profiling_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", prof_on); +} +static ssize_t profiling_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int ret; + + if (prof_on) + return -EEXIST; + /* + * This eventually calls into get_option() which + * has a ton of callers and is not const. It is + * easiest to cast it away here. + */ + profile_setup((char *)buf); + ret = profile_init(); + if (ret) + return ret; + ret = create_proc_profile(); + if (ret) + return ret; + return count; +} +KERNEL_ATTR_RW(profiling); +#endif + #ifdef CONFIG_KEXEC static ssize_t kexec_loaded_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -109,6 +141,9 @@ static struct attribute * kernel_attrs[] = { &uevent_seqnum_attr.attr, &uevent_helper_attr.attr, #endif +#ifdef CONFIG_PROFILING + &profiling_attr.attr, +#endif #ifdef CONFIG_KEXEC &kexec_loaded_attr.attr, &kexec_crash_loaded_attr.attr, diff --git a/kernel/module.c b/kernel/module.c index 9db11911e04b..25bc9ac9e226 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -100,7 +100,7 @@ static inline int strong_try_module_get(struct module *mod) static inline void add_taint_module(struct module *mod, unsigned flag) { add_taint(flag); - mod->taints |= flag; + mod->taints |= (1U << flag); } /* @@ -784,6 +784,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags) mutex_lock(&module_mutex); /* Store the name of the last unloaded module for diagnostic purposes */ strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); + unregister_dynamic_debug_module(mod->name); free_module(mod); out: @@ -923,7 +924,7 @@ static const char vermagic[] = VERMAGIC_STRING; static int try_to_force_load(struct module *mod, const char *symname) { #ifdef CONFIG_MODULE_FORCE_LOAD - if (!(tainted & TAINT_FORCED_MODULE)) + if (!test_taint(TAINT_FORCED_MODULE)) printk("%s: no version for \"%s\" found: kernel tainted.\n", mod->name, symname); add_taint_module(mod, TAINT_FORCED_MODULE); @@ -1033,7 +1034,7 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, const unsigned long *crc; ret = find_symbol(name, &owner, &crc, - !(mod->taints & TAINT_PROPRIETARY_MODULE), true); + !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); if (!IS_ERR_VALUE(ret)) { /* use_module can fail due to OOM, or module initialization or unloading */ @@ -1173,7 +1174,7 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs, while (i-- > 0) sysfs_remove_bin_file(notes_attrs->dir, ¬es_attrs->attrs[i]); - kobject_del(notes_attrs->dir); + kobject_put(notes_attrs->dir); } kfree(notes_attrs); } @@ -1634,7 +1635,7 @@ static void set_license(struct module *mod, const char *license) license = "unspecified"; if (!license_is_gpl_compatible(license)) { - if (!(tainted & TAINT_PROPRIETARY_MODULE)) + if (!test_taint(TAINT_PROPRIETARY_MODULE)) printk(KERN_WARNING "%s: module license '%s' taints " "kernel.\n", mod->name, license); add_taint_module(mod, TAINT_PROPRIETARY_MODULE); @@ -1783,6 +1784,33 @@ static inline void add_kallsyms(struct module *mod, } #endif /* CONFIG_KALLSYMS */ +#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG +static void dynamic_printk_setup(Elf_Shdr *sechdrs, unsigned int verboseindex) +{ + struct mod_debug *debug_info; + unsigned long pos, end; + unsigned int num_verbose; + + pos = sechdrs[verboseindex].sh_addr; + num_verbose = sechdrs[verboseindex].sh_size / + sizeof(struct mod_debug); + end = pos + (num_verbose * sizeof(struct mod_debug)); + + for (; pos < end; pos += sizeof(struct mod_debug)) { + debug_info = (struct mod_debug *)pos; + register_dynamic_debug_module(debug_info->modname, + debug_info->type, debug_info->logical_modname, + debug_info->flag_names, debug_info->hash, + debug_info->hash2); + } +} +#else +static inline void dynamic_printk_setup(Elf_Shdr *sechdrs, + unsigned int verboseindex) +{ +} +#endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */ + static void *module_alloc_update_bounds(unsigned long size) { void *ret = module_alloc(size); @@ -1806,6 +1834,7 @@ static noinline struct module *load_module(void __user *umod, Elf_Ehdr *hdr; Elf_Shdr *sechdrs; char *secstrings, *args, *modmagic, *strtab = NULL; + char *staging; unsigned int i; unsigned int symindex = 0; unsigned int strindex = 0; @@ -1831,6 +1860,7 @@ static noinline struct module *load_module(void __user *umod, #endif unsigned int markersindex; unsigned int markersstringsindex; + unsigned int verboseindex; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ @@ -1960,6 +1990,14 @@ static noinline struct module *load_module(void __user *umod, goto free_hdr; } + staging = get_modinfo(sechdrs, infoindex, "staging"); + if (staging) { + add_taint_module(mod, TAINT_CRAP); + printk(KERN_WARNING "%s: module is from the staging directory," + " the quality is unknown, you have been warned.\n", + mod->name); + } + /* Now copy in args */ args = strndup_user(uargs, ~0UL >> 1); if (IS_ERR(args)) { @@ -2117,6 +2155,7 @@ static noinline struct module *load_module(void __user *umod, markersindex = find_sec(hdr, sechdrs, secstrings, "__markers"); markersstringsindex = find_sec(hdr, sechdrs, secstrings, "__markers_strings"); + verboseindex = find_sec(hdr, sechdrs, secstrings, "__verbose"); /* Now do relocations. */ for (i = 1; i < hdr->e_shnum; i++) { @@ -2167,6 +2206,7 @@ static noinline struct module *load_module(void __user *umod, marker_update_probe_range(mod->markers, mod->markers + mod->num_markers); #endif + dynamic_printk_setup(sechdrs, verboseindex); err = module_finalize(hdr, sechdrs, mod); if (err < 0) goto cleanup; @@ -2552,10 +2592,12 @@ static char *module_flags(struct module *mod, char *buf) mod->state == MODULE_STATE_GOING || mod->state == MODULE_STATE_COMING) { buf[bx++] = '('; - if (mod->taints & TAINT_PROPRIETARY_MODULE) + if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) buf[bx++] = 'P'; - if (mod->taints & TAINT_FORCED_MODULE) + if (mod->taints & (1 << TAINT_FORCED_MODULE)) buf[bx++] = 'F'; + if (mod->taints & (1 << TAINT_CRAP)) + buf[bx++] = 'C'; /* * TAINT_FORCED_RMMOD: could be added. * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't diff --git a/kernel/panic.c b/kernel/panic.c index 12c5a0a6c89b..bda561ef3cdf 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -23,7 +23,7 @@ #include <linux/kallsyms.h> int panic_on_oops; -int tainted; +static unsigned long tainted_mask; static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); @@ -143,6 +143,27 @@ NORET_TYPE void panic(const char * fmt, ...) EXPORT_SYMBOL(panic); + +struct tnt { + u8 bit; + char true; + char false; +}; + +static const struct tnt tnts[] = { + { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, + { TAINT_FORCED_MODULE, 'F', ' ' }, + { TAINT_UNSAFE_SMP, 'S', ' ' }, + { TAINT_FORCED_RMMOD, 'R', ' ' }, + { TAINT_MACHINE_CHECK, 'M', ' ' }, + { TAINT_BAD_PAGE, 'B', ' ' }, + { TAINT_USER, 'U', ' ' }, + { TAINT_DIE, 'D', ' ' }, + { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, + { TAINT_WARN, 'W', ' ' }, + { TAINT_CRAP, 'C', ' ' }, +}; + /** * print_tainted - return a string to represent the kernel taint state. * @@ -155,35 +176,45 @@ EXPORT_SYMBOL(panic); * 'U' - Userspace-defined naughtiness. * 'A' - ACPI table overridden. * 'W' - Taint on warning. + * 'C' - modules from drivers/staging are loaded. * * The string is overwritten by the next call to print_taint(). */ - const char *print_tainted(void) { - static char buf[20]; - if (tainted) { - snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c%c", - tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', - tainted & TAINT_FORCED_MODULE ? 'F' : ' ', - tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', - tainted & TAINT_FORCED_RMMOD ? 'R' : ' ', - tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', - tainted & TAINT_BAD_PAGE ? 'B' : ' ', - tainted & TAINT_USER ? 'U' : ' ', - tainted & TAINT_DIE ? 'D' : ' ', - tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ', - tainted & TAINT_WARN ? 'W' : ' '); - } - else + static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ") + 1]; + + if (tainted_mask) { + char *s; + int i; + + s = buf + sprintf(buf, "Tainted: "); + for (i = 0; i < ARRAY_SIZE(tnts); i++) { + const struct tnt *t = &tnts[i]; + *s++ = test_bit(t->bit, &tainted_mask) ? + t->true : t->false; + } + *s = 0; + } else snprintf(buf, sizeof(buf), "Not tainted"); return(buf); } +int test_taint(unsigned flag) +{ + return test_bit(flag, &tainted_mask); +} +EXPORT_SYMBOL(test_taint); + +unsigned long get_taint(void) +{ + return tainted_mask; +} + void add_taint(unsigned flag) { debug_locks = 0; /* can't trust the integrity of the kernel anymore */ - tainted |= flag; + set_bit(flag, &tainted_mask); } EXPORT_SYMBOL(add_taint); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index c42a03aef36f..153dcb2639c3 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -7,6 +7,93 @@ #include <linux/errno.h> #include <linux/math64.h> #include <asm/uaccess.h> +#include <linux/kernel_stat.h> + +/* + * Allocate the thread_group_cputime structure appropriately and fill in the + * current values of the fields. Called from copy_signal() via + * thread_group_cputime_clone_thread() when adding a second or subsequent + * thread to a thread group. Assumes interrupts are enabled when called. + */ +int thread_group_cputime_alloc(struct task_struct *tsk) +{ + struct signal_struct *sig = tsk->signal; + struct task_cputime *cputime; + + /* + * If we have multiple threads and we don't already have a + * per-CPU task_cputime struct (checked in the caller), allocate + * one and fill it in with the times accumulated so far. We may + * race with another thread so recheck after we pick up the sighand + * lock. + */ + cputime = alloc_percpu(struct task_cputime); + if (cputime == NULL) + return -ENOMEM; + spin_lock_irq(&tsk->sighand->siglock); + if (sig->cputime.totals) { + spin_unlock_irq(&tsk->sighand->siglock); + free_percpu(cputime); + return 0; + } + sig->cputime.totals = cputime; + cputime = per_cpu_ptr(sig->cputime.totals, smp_processor_id()); + cputime->utime = tsk->utime; + cputime->stime = tsk->stime; + cputime->sum_exec_runtime = tsk->se.sum_exec_runtime; + spin_unlock_irq(&tsk->sighand->siglock); + return 0; +} + +/** + * thread_group_cputime - Sum the thread group time fields across all CPUs. + * + * @tsk: The task we use to identify the thread group. + * @times: task_cputime structure in which we return the summed fields. + * + * Walk the list of CPUs to sum the per-CPU time fields in the thread group + * time structure. + */ +void thread_group_cputime( + struct task_struct *tsk, + struct task_cputime *times) +{ + struct signal_struct *sig; + int i; + struct task_cputime *tot; + + sig = tsk->signal; + if (unlikely(!sig) || !sig->cputime.totals) { + times->utime = tsk->utime; + times->stime = tsk->stime; + times->sum_exec_runtime = tsk->se.sum_exec_runtime; + return; + } + times->stime = times->utime = cputime_zero; + times->sum_exec_runtime = 0; + for_each_possible_cpu(i) { + tot = per_cpu_ptr(tsk->signal->cputime.totals, i); + times->utime = cputime_add(times->utime, tot->utime); + times->stime = cputime_add(times->stime, tot->stime); + times->sum_exec_runtime += tot->sum_exec_runtime; + } +} + +/* + * Called after updating RLIMIT_CPU to set timer expiration if necessary. + */ +void update_rlimit_cpu(unsigned long rlim_new) +{ + cputime_t cputime; + + cputime = secs_to_cputime(rlim_new); + if (cputime_eq(current->signal->it_prof_expires, cputime_zero) || + cputime_lt(current->signal->it_prof_expires, cputime)) { + spin_lock_irq(¤t->sighand->siglock); + set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); + spin_unlock_irq(¤t->sighand->siglock); + } +} static int check_clock(const clockid_t which_clock) { @@ -158,10 +245,6 @@ static inline cputime_t virt_ticks(struct task_struct *p) { return p->utime; } -static inline unsigned long long sched_ns(struct task_struct *p) -{ - return task_sched_runtime(p); -} int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) { @@ -211,7 +294,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, cpu->cpu = virt_ticks(p); break; case CPUCLOCK_SCHED: - cpu->sched = sched_ns(p); + cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p); break; } return 0; @@ -220,59 +303,30 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, /* * Sample a process (thread group) clock for the given group_leader task. * Must be called with tasklist_lock held for reading. - * Must be called with tasklist_lock held for reading, and p->sighand->siglock. */ -static int cpu_clock_sample_group_locked(unsigned int clock_idx, - struct task_struct *p, - union cpu_time_count *cpu) +static int cpu_clock_sample_group(const clockid_t which_clock, + struct task_struct *p, + union cpu_time_count *cpu) { - struct task_struct *t = p; - switch (clock_idx) { + struct task_cputime cputime; + + thread_group_cputime(p, &cputime); + switch (which_clock) { default: return -EINVAL; case CPUCLOCK_PROF: - cpu->cpu = cputime_add(p->signal->utime, p->signal->stime); - do { - cpu->cpu = cputime_add(cpu->cpu, prof_ticks(t)); - t = next_thread(t); - } while (t != p); + cpu->cpu = cputime_add(cputime.utime, cputime.stime); break; case CPUCLOCK_VIRT: - cpu->cpu = p->signal->utime; - do { - cpu->cpu = cputime_add(cpu->cpu, virt_ticks(t)); - t = next_thread(t); - } while (t != p); + cpu->cpu = cputime.utime; break; case CPUCLOCK_SCHED: - cpu->sched = p->signal->sum_sched_runtime; - /* Add in each other live thread. */ - while ((t = next_thread(t)) != p) { - cpu->sched += t->se.sum_exec_runtime; - } - cpu->sched += sched_ns(p); + cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); break; } return 0; } -/* - * Sample a process (thread group) clock for the given group_leader task. - * Must be called with tasklist_lock held for reading. - */ -static int cpu_clock_sample_group(const clockid_t which_clock, - struct task_struct *p, - union cpu_time_count *cpu) -{ - int ret; - unsigned long flags; - spin_lock_irqsave(&p->sighand->siglock, flags); - ret = cpu_clock_sample_group_locked(CPUCLOCK_WHICH(which_clock), p, - cpu); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - return ret; -} - int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) { @@ -471,80 +525,11 @@ void posix_cpu_timers_exit(struct task_struct *tsk) } void posix_cpu_timers_exit_group(struct task_struct *tsk) { - cleanup_timers(tsk->signal->cpu_timers, - cputime_add(tsk->utime, tsk->signal->utime), - cputime_add(tsk->stime, tsk->signal->stime), - tsk->se.sum_exec_runtime + tsk->signal->sum_sched_runtime); -} + struct task_cputime cputime; - -/* - * Set the expiry times of all the threads in the process so one of them - * will go off before the process cumulative expiry total is reached. - */ -static void process_timer_rebalance(struct task_struct *p, - unsigned int clock_idx, - union cpu_time_count expires, - union cpu_time_count val) -{ - cputime_t ticks, left; - unsigned long long ns, nsleft; - struct task_struct *t = p; - unsigned int nthreads = atomic_read(&p->signal->live); - - if (!nthreads) - return; - - switch (clock_idx) { - default: - BUG(); - break; - case CPUCLOCK_PROF: - left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu), - nthreads); - do { - if (likely(!(t->flags & PF_EXITING))) { - ticks = cputime_add(prof_ticks(t), left); - if (cputime_eq(t->it_prof_expires, - cputime_zero) || - cputime_gt(t->it_prof_expires, ticks)) { - t->it_prof_expires = ticks; - } - } - t = next_thread(t); - } while (t != p); - break; - case CPUCLOCK_VIRT: - left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu), - nthreads); - do { - if (likely(!(t->flags & PF_EXITING))) { - ticks = cputime_add(virt_ticks(t), left); - if (cputime_eq(t->it_virt_expires, - cputime_zero) || - cputime_gt(t->it_virt_expires, ticks)) { - t->it_virt_expires = ticks; - } - } - t = next_thread(t); - } while (t != p); - break; - case CPUCLOCK_SCHED: - nsleft = expires.sched - val.sched; - do_div(nsleft, nthreads); - nsleft = max_t(unsigned long long, nsleft, 1); - do { - if (likely(!(t->flags & PF_EXITING))) { - ns = t->se.sum_exec_runtime + nsleft; - if (t->it_sched_expires == 0 || - t->it_sched_expires > ns) { - t->it_sched_expires = ns; - } - } - t = next_thread(t); - } while (t != p); - break; - } + thread_group_cputime(tsk, &cputime); + cleanup_timers(tsk->signal->cpu_timers, + cputime.utime, cputime.stime, cputime.sum_exec_runtime); } static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) @@ -608,29 +593,32 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) default: BUG(); case CPUCLOCK_PROF: - if (cputime_eq(p->it_prof_expires, + if (cputime_eq(p->cputime_expires.prof_exp, cputime_zero) || - cputime_gt(p->it_prof_expires, + cputime_gt(p->cputime_expires.prof_exp, nt->expires.cpu)) - p->it_prof_expires = nt->expires.cpu; + p->cputime_expires.prof_exp = + nt->expires.cpu; break; case CPUCLOCK_VIRT: - if (cputime_eq(p->it_virt_expires, + if (cputime_eq(p->cputime_expires.virt_exp, cputime_zero) || - cputime_gt(p->it_virt_expires, + cputime_gt(p->cputime_expires.virt_exp, nt->expires.cpu)) - p->it_virt_expires = nt->expires.cpu; + p->cputime_expires.virt_exp = + nt->expires.cpu; break; case CPUCLOCK_SCHED: - if (p->it_sched_expires == 0 || - p->it_sched_expires > nt->expires.sched) - p->it_sched_expires = nt->expires.sched; + if (p->cputime_expires.sched_exp == 0 || + p->cputime_expires.sched_exp > + nt->expires.sched) + p->cputime_expires.sched_exp = + nt->expires.sched; break; } } else { /* - * For a process timer, we must balance - * all the live threads' expirations. + * For a process timer, set the cached expiration time. */ switch (CPUCLOCK_WHICH(timer->it_clock)) { default: @@ -641,7 +629,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) cputime_lt(p->signal->it_virt_expires, timer->it.cpu.expires.cpu)) break; - goto rebalance; + p->signal->cputime_expires.virt_exp = + timer->it.cpu.expires.cpu; + break; case CPUCLOCK_PROF: if (!cputime_eq(p->signal->it_prof_expires, cputime_zero) && @@ -652,13 +642,12 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) if (i != RLIM_INFINITY && i <= cputime_to_secs(timer->it.cpu.expires.cpu)) break; - goto rebalance; + p->signal->cputime_expires.prof_exp = + timer->it.cpu.expires.cpu; + break; case CPUCLOCK_SCHED: - rebalance: - process_timer_rebalance( - timer->it.cpu.task, - CPUCLOCK_WHICH(timer->it_clock), - timer->it.cpu.expires, now); + p->signal->cputime_expires.sched_exp = + timer->it.cpu.expires.sched; break; } } @@ -969,13 +958,13 @@ static void check_thread_timers(struct task_struct *tsk, struct signal_struct *const sig = tsk->signal; maxfire = 20; - tsk->it_prof_expires = cputime_zero; + tsk->cputime_expires.prof_exp = cputime_zero; while (!list_empty(timers)) { struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { - tsk->it_prof_expires = t->expires.cpu; + tsk->cputime_expires.prof_exp = t->expires.cpu; break; } t->firing = 1; @@ -984,13 +973,13 @@ static void check_thread_timers(struct task_struct *tsk, ++timers; maxfire = 20; - tsk->it_virt_expires = cputime_zero; + tsk->cputime_expires.virt_exp = cputime_zero; while (!list_empty(timers)) { struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { - tsk->it_virt_expires = t->expires.cpu; + tsk->cputime_expires.virt_exp = t->expires.cpu; break; } t->firing = 1; @@ -999,13 +988,13 @@ static void check_thread_timers(struct task_struct *tsk, ++timers; maxfire = 20; - tsk->it_sched_expires = 0; + tsk->cputime_expires.sched_exp = 0; while (!list_empty(timers)) { struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { - tsk->it_sched_expires = t->expires.sched; + tsk->cputime_expires.sched_exp = t->expires.sched; break; } t->firing = 1; @@ -1055,10 +1044,10 @@ static void check_process_timers(struct task_struct *tsk, { int maxfire; struct signal_struct *const sig = tsk->signal; - cputime_t utime, stime, ptime, virt_expires, prof_expires; + cputime_t utime, ptime, virt_expires, prof_expires; unsigned long long sum_sched_runtime, sched_expires; - struct task_struct *t; struct list_head *timers = sig->cpu_timers; + struct task_cputime cputime; /* * Don't sample the current process CPU clocks if there are no timers. @@ -1074,18 +1063,10 @@ static void check_process_timers(struct task_struct *tsk, /* * Collect the current process totals. */ - utime = sig->utime; - stime = sig->stime; - sum_sched_runtime = sig->sum_sched_runtime; - t = tsk; - do { - utime = cputime_add(utime, t->utime); - stime = cputime_add(stime, t->stime); - sum_sched_runtime += t->se.sum_exec_runtime; - t = next_thread(t); - } while (t != tsk); - ptime = cputime_add(utime, stime); - + thread_group_cputime(tsk, &cputime); + utime = cputime.utime; + ptime = cputime_add(utime, cputime.stime); + sum_sched_runtime = cputime.sum_exec_runtime; maxfire = 20; prof_expires = cputime_zero; while (!list_empty(timers)) { @@ -1193,60 +1174,18 @@ static void check_process_timers(struct task_struct *tsk, } } - if (!cputime_eq(prof_expires, cputime_zero) || - !cputime_eq(virt_expires, cputime_zero) || - sched_expires != 0) { - /* - * Rebalance the threads' expiry times for the remaining - * process CPU timers. - */ - - cputime_t prof_left, virt_left, ticks; - unsigned long long sched_left, sched; - const unsigned int nthreads = atomic_read(&sig->live); - - if (!nthreads) - return; - - prof_left = cputime_sub(prof_expires, utime); - prof_left = cputime_sub(prof_left, stime); - prof_left = cputime_div_non_zero(prof_left, nthreads); - virt_left = cputime_sub(virt_expires, utime); - virt_left = cputime_div_non_zero(virt_left, nthreads); - if (sched_expires) { - sched_left = sched_expires - sum_sched_runtime; - do_div(sched_left, nthreads); - sched_left = max_t(unsigned long long, sched_left, 1); - } else { - sched_left = 0; - } - t = tsk; - do { - if (unlikely(t->flags & PF_EXITING)) - continue; - - ticks = cputime_add(cputime_add(t->utime, t->stime), - prof_left); - if (!cputime_eq(prof_expires, cputime_zero) && - (cputime_eq(t->it_prof_expires, cputime_zero) || - cputime_gt(t->it_prof_expires, ticks))) { - t->it_prof_expires = ticks; - } - - ticks = cputime_add(t->utime, virt_left); - if (!cputime_eq(virt_expires, cputime_zero) && - (cputime_eq(t->it_virt_expires, cputime_zero) || - cputime_gt(t->it_virt_expires, ticks))) { - t->it_virt_expires = ticks; - } - - sched = t->se.sum_exec_runtime + sched_left; - if (sched_expires && (t->it_sched_expires == 0 || - t->it_sched_expires > sched)) { - t->it_sched_expires = sched; - } - } while ((t = next_thread(t)) != tsk); - } + if (!cputime_eq(prof_expires, cputime_zero) && + (cputime_eq(sig->cputime_expires.prof_exp, cputime_zero) || + cputime_gt(sig->cputime_expires.prof_exp, prof_expires))) + sig->cputime_expires.prof_exp = prof_expires; + if (!cputime_eq(virt_expires, cputime_zero) && + (cputime_eq(sig->cputime_expires.virt_exp, cputime_zero) || + cputime_gt(sig->cputime_expires.virt_exp, virt_expires))) + sig->cputime_expires.virt_exp = virt_expires; + if (sched_expires != 0 && + (sig->cputime_expires.sched_exp == 0 || + sig->cputime_expires.sched_exp > sched_expires)) + sig->cputime_expires.sched_exp = sched_expires; } /* @@ -1314,6 +1253,86 @@ out: ++timer->it_requeue_pending; } +/** + * task_cputime_zero - Check a task_cputime struct for all zero fields. + * + * @cputime: The struct to compare. + * + * Checks @cputime to see if all fields are zero. Returns true if all fields + * are zero, false if any field is nonzero. + */ +static inline int task_cputime_zero(const struct task_cputime *cputime) +{ + if (cputime_eq(cputime->utime, cputime_zero) && + cputime_eq(cputime->stime, cputime_zero) && + cputime->sum_exec_runtime == 0) + return 1; + return 0; +} + +/** + * task_cputime_expired - Compare two task_cputime entities. + * + * @sample: The task_cputime structure to be checked for expiration. + * @expires: Expiration times, against which @sample will be checked. + * + * Checks @sample against @expires to see if any field of @sample has expired. + * Returns true if any field of the former is greater than the corresponding + * field of the latter if the latter field is set. Otherwise returns false. + */ +static inline int task_cputime_expired(const struct task_cputime *sample, + const struct task_cputime *expires) +{ + if (!cputime_eq(expires->utime, cputime_zero) && + cputime_ge(sample->utime, expires->utime)) + return 1; + if (!cputime_eq(expires->stime, cputime_zero) && + cputime_ge(cputime_add(sample->utime, sample->stime), + expires->stime)) + return 1; + if (expires->sum_exec_runtime != 0 && + sample->sum_exec_runtime >= expires->sum_exec_runtime) + return 1; + return 0; +} + +/** + * fastpath_timer_check - POSIX CPU timers fast path. + * + * @tsk: The task (thread) being checked. + * + * Check the task and thread group timers. If both are zero (there are no + * timers set) return false. Otherwise snapshot the task and thread group + * timers and compare them with the corresponding expiration times. Return + * true if a timer has expired, else return false. + */ +static inline int fastpath_timer_check(struct task_struct *tsk) +{ + struct signal_struct *sig = tsk->signal; + + if (unlikely(!sig)) + return 0; + + if (!task_cputime_zero(&tsk->cputime_expires)) { + struct task_cputime task_sample = { + .utime = tsk->utime, + .stime = tsk->stime, + .sum_exec_runtime = tsk->se.sum_exec_runtime + }; + + if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) + return 1; + } + if (!task_cputime_zero(&sig->cputime_expires)) { + struct task_cputime group_sample; + + thread_group_cputime(tsk, &group_sample); + if (task_cputime_expired(&group_sample, &sig->cputime_expires)) + return 1; + } + return 0; +} + /* * This is called from the timer interrupt handler. The irq handler has * already updated our counts. We need to check if any timers fire now. @@ -1326,42 +1345,31 @@ void run_posix_cpu_timers(struct task_struct *tsk) BUG_ON(!irqs_disabled()); -#define UNEXPIRED(clock) \ - (cputime_eq(tsk->it_##clock##_expires, cputime_zero) || \ - cputime_lt(clock##_ticks(tsk), tsk->it_##clock##_expires)) - - if (UNEXPIRED(prof) && UNEXPIRED(virt) && - (tsk->it_sched_expires == 0 || - tsk->se.sum_exec_runtime < tsk->it_sched_expires)) + /* + * The fast path checks that there are no expired thread or thread + * group timers. If that's so, just return. + */ + if (!fastpath_timer_check(tsk)) return; -#undef UNEXPIRED - + spin_lock(&tsk->sighand->siglock); /* - * Double-check with locks held. + * Here we take off tsk->signal->cpu_timers[N] and + * tsk->cpu_timers[N] all the timers that are firing, and + * put them on the firing list. */ - read_lock(&tasklist_lock); - if (likely(tsk->signal != NULL)) { - spin_lock(&tsk->sighand->siglock); + check_thread_timers(tsk, &firing); + check_process_timers(tsk, &firing); - /* - * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] - * all the timers that are firing, and put them on the firing list. - */ - check_thread_timers(tsk, &firing); - check_process_timers(tsk, &firing); - - /* - * We must release these locks before taking any timer's lock. - * There is a potential race with timer deletion here, as the - * siglock now protects our private firing list. We have set - * the firing flag in each timer, so that a deletion attempt - * that gets the timer lock before we do will give it up and - * spin until we've taken care of that timer below. - */ - spin_unlock(&tsk->sighand->siglock); - } - read_unlock(&tasklist_lock); + /* + * We must release these locks before taking any timer's lock. + * There is a potential race with timer deletion here, as the + * siglock now protects our private firing list. We have set + * the firing flag in each timer, so that a deletion attempt + * that gets the timer lock before we do will give it up and + * spin until we've taken care of that timer below. + */ + spin_unlock(&tsk->sighand->siglock); /* * Now that all the timers on our list have the firing flag, @@ -1389,10 +1397,9 @@ void run_posix_cpu_timers(struct task_struct *tsk) /* * Set one of the process-wide special case CPU timers. - * The tasklist_lock and tsk->sighand->siglock must be held by the caller. - * The oldval argument is null for the RLIMIT_CPU timer, where *newval is - * absolute; non-null for ITIMER_*, where *newval is relative and we update - * it to be absolute, *oldval is absolute and we update it to be relative. + * The tsk->sighand->siglock must be held by the caller. + * The *newval argument is relative and we update it to be absolute, *oldval + * is absolute and we update it to be relative. */ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, cputime_t *newval, cputime_t *oldval) @@ -1401,7 +1408,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, struct list_head *head; BUG_ON(clock_idx == CPUCLOCK_SCHED); - cpu_clock_sample_group_locked(clock_idx, tsk, &now); + cpu_clock_sample_group(clock_idx, tsk, &now); if (oldval) { if (!cputime_eq(*oldval, cputime_zero)) { @@ -1435,13 +1442,14 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, cputime_ge(list_first_entry(head, struct cpu_timer_list, entry)->expires.cpu, *newval)) { - /* - * Rejigger each thread's expiry time so that one will - * notice before we hit the process-cumulative expiry time. - */ - union cpu_time_count expires = { .sched = 0 }; - expires.cpu = *newval; - process_timer_rebalance(tsk, clock_idx, expires, now); + switch (clock_idx) { + case CPUCLOCK_PROF: + tsk->signal->cputime_expires.prof_exp = *newval; + break; + case CPUCLOCK_VIRT: + tsk->signal->cputime_expires.virt_exp = *newval; + break; + } } } diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index e36d5798cbff..b931d7cedbfa 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -223,6 +223,15 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) } /* + * Get monotonic time for posix timers + */ +static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) +{ + getrawmonotonic(tp); + return 0; +} + +/* * Initialize everything, well, just everything in Posix clocks/timers ;) */ static __init int init_posix_timers(void) @@ -235,9 +244,15 @@ static __init int init_posix_timers(void) .clock_get = posix_ktime_get_ts, .clock_set = do_posix_clock_nosettime, }; + struct k_clock clock_monotonic_raw = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_monotonic_raw, + .clock_set = do_posix_clock_nosettime, + }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); + register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, @@ -298,6 +313,7 @@ void do_schedule_next_timer(struct siginfo *info) int posix_timer_event(struct k_itimer *timr, int si_private) { + int shared, ret; /* * FIXME: if ->sigq is queued we can race with * dequeue_signal()->do_schedule_next_timer(). @@ -311,25 +327,10 @@ int posix_timer_event(struct k_itimer *timr, int si_private) */ timr->sigq->info.si_sys_private = si_private; - timr->sigq->info.si_signo = timr->it_sigev_signo; - timr->sigq->info.si_code = SI_TIMER; - timr->sigq->info.si_tid = timr->it_id; - timr->sigq->info.si_value = timr->it_sigev_value; - - if (timr->it_sigev_notify & SIGEV_THREAD_ID) { - struct task_struct *leader; - int ret = send_sigqueue(timr->sigq, timr->it_process, 0); - - if (likely(ret >= 0)) - return ret; - - timr->it_sigev_notify = SIGEV_SIGNAL; - leader = timr->it_process->group_leader; - put_task_struct(timr->it_process); - timr->it_process = leader; - } - - return send_sigqueue(timr->sigq, timr->it_process, 1); + shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); + ret = send_sigqueue(timr->sigq, timr->it_process, shared); + /* If we failed to send the signal the timer stops. */ + return ret > 0; } EXPORT_SYMBOL_GPL(posix_timer_event); @@ -441,7 +442,7 @@ static struct k_itimer * alloc_posix_timer(void) return tmr; if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { kmem_cache_free(posix_timers_cache, tmr); - tmr = NULL; + return NULL; } memset(&tmr->sigq->info, 0, sizeof(siginfo_t)); return tmr; @@ -468,11 +469,9 @@ sys_timer_create(const clockid_t which_clock, struct sigevent __user *timer_event_spec, timer_t __user * created_timer_id) { - int error = 0; - struct k_itimer *new_timer = NULL; - int new_timer_id; - struct task_struct *process = NULL; - unsigned long flags; + struct k_itimer *new_timer; + int error, new_timer_id; + struct task_struct *process; sigevent_t event; int it_id_set = IT_ID_NOT_SET; @@ -490,12 +489,11 @@ sys_timer_create(const clockid_t which_clock, goto out; } spin_lock_irq(&idr_lock); - error = idr_get_new(&posix_timers_id, (void *) new_timer, - &new_timer_id); + error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id); spin_unlock_irq(&idr_lock); - if (error == -EAGAIN) - goto retry; - else if (error) { + if (error) { + if (error == -EAGAIN) + goto retry; /* * Weird looking, but we return EAGAIN if the IDR is * full (proper POSIX return value for this) @@ -526,67 +524,43 @@ sys_timer_create(const clockid_t which_clock, error = -EFAULT; goto out; } - new_timer->it_sigev_notify = event.sigev_notify; - new_timer->it_sigev_signo = event.sigev_signo; - new_timer->it_sigev_value = event.sigev_value; - - read_lock(&tasklist_lock); - if ((process = good_sigevent(&event))) { - /* - * We may be setting up this process for another - * thread. It may be exiting. To catch this - * case the we check the PF_EXITING flag. If - * the flag is not set, the siglock will catch - * him before it is too late (in exit_itimers). - * - * The exec case is a bit more invloved but easy - * to code. If the process is in our thread - * group (and it must be or we would not allow - * it here) and is doing an exec, it will cause - * us to be killed. In this case it will wait - * for us to die which means we can finish this - * linkage with our last gasp. I.e. no code :) - */ - spin_lock_irqsave(&process->sighand->siglock, flags); - if (!(process->flags & PF_EXITING)) { - new_timer->it_process = process; - list_add(&new_timer->list, - &process->signal->posix_timers); - if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) - get_task_struct(process); - spin_unlock_irqrestore(&process->sighand->siglock, flags); - } else { - spin_unlock_irqrestore(&process->sighand->siglock, flags); - process = NULL; - } - } - read_unlock(&tasklist_lock); + rcu_read_lock(); + process = good_sigevent(&event); + if (process) + get_task_struct(process); + rcu_read_unlock(); if (!process) { error = -EINVAL; goto out; } } else { - new_timer->it_sigev_notify = SIGEV_SIGNAL; - new_timer->it_sigev_signo = SIGALRM; - new_timer->it_sigev_value.sival_int = new_timer->it_id; + event.sigev_notify = SIGEV_SIGNAL; + event.sigev_signo = SIGALRM; + event.sigev_value.sival_int = new_timer->it_id; process = current->group_leader; - spin_lock_irqsave(&process->sighand->siglock, flags); - new_timer->it_process = process; - list_add(&new_timer->list, &process->signal->posix_timers); - spin_unlock_irqrestore(&process->sighand->siglock, flags); + get_task_struct(process); } + new_timer->it_sigev_notify = event.sigev_notify; + new_timer->sigq->info.si_signo = event.sigev_signo; + new_timer->sigq->info.si_value = event.sigev_value; + new_timer->sigq->info.si_tid = new_timer->it_id; + new_timer->sigq->info.si_code = SI_TIMER; + + spin_lock_irq(¤t->sighand->siglock); + new_timer->it_process = process; + list_add(&new_timer->list, ¤t->signal->posix_timers); + spin_unlock_irq(¤t->sighand->siglock); + + return 0; /* * In the case of the timer belonging to another task, after * the task is unlocked, the timer is owned by the other task * and may cease to exist at any time. Don't use or modify * new_timer after the unlock call. */ - out: - if (error) - release_posix_timer(new_timer, it_id_set); - + release_posix_timer(new_timer, it_id_set); return error; } @@ -597,7 +571,7 @@ out: * the find to the timer lock. To avoid a dead lock, the timer id MUST * be release with out holding the timer lock. */ -static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) +static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags) { struct k_itimer *timr; /* @@ -605,23 +579,20 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) * flags part over to the timer lock. Must not let interrupts in * while we are moving the lock. */ - spin_lock_irqsave(&idr_lock, *flags); - timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id); + timr = idr_find(&posix_timers_id, (int)timer_id); if (timr) { spin_lock(&timr->it_lock); - - if ((timr->it_id != timer_id) || !(timr->it_process) || - !same_thread_group(timr->it_process, current)) { - spin_unlock(&timr->it_lock); - spin_unlock_irqrestore(&idr_lock, *flags); - timr = NULL; - } else + if (timr->it_process && + same_thread_group(timr->it_process, current)) { spin_unlock(&idr_lock); - } else - spin_unlock_irqrestore(&idr_lock, *flags); + return timr; + } + spin_unlock(&timr->it_lock); + } + spin_unlock_irqrestore(&idr_lock, *flags); - return timr; + return NULL; } /* @@ -862,8 +833,7 @@ retry_delete: * This keeps any tasks waiting on the spin lock from thinking * they got something (see the lock code above). */ - if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) - put_task_struct(timer->it_process); + put_task_struct(timer->it_process); timer->it_process = NULL; unlock_timer(timer, flags); @@ -890,8 +860,7 @@ retry_delete: * This keeps any tasks waiting on the spin lock from thinking * they got something (see the lock code above). */ - if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) - put_task_struct(timer->it_process); + put_task_struct(timer->it_process); timer->it_process = NULL; unlock_timer(timer, flags); diff --git a/kernel/power/disk.c b/kernel/power/disk.c index bbd85c60f741..331f9836383f 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -14,6 +14,7 @@ #include <linux/reboot.h> #include <linux/string.h> #include <linux/device.h> +#include <linux/kmod.h> #include <linux/delay.h> #include <linux/fs.h> #include <linux/mount.h> @@ -520,6 +521,10 @@ int hibernate(void) if (error) goto Exit; + error = usermodehelper_disable(); + if (error) + goto Exit; + /* Allocate memory management structures */ error = create_basic_memory_bitmaps(); if (error) @@ -558,6 +563,7 @@ int hibernate(void) thaw_processes(); Finish: free_basic_memory_bitmaps(); + usermodehelper_enable(); Exit: pm_notifier_call_chain(PM_POST_HIBERNATION); pm_restore_console(); @@ -634,6 +640,10 @@ static int software_resume(void) if (error) goto Finish; + error = usermodehelper_disable(); + if (error) + goto Finish; + error = create_basic_memory_bitmaps(); if (error) goto Finish; @@ -656,6 +666,7 @@ static int software_resume(void) thaw_processes(); Done: free_basic_memory_bitmaps(); + usermodehelper_enable(); Finish: pm_notifier_call_chain(PM_POST_RESTORE); pm_restore_console(); diff --git a/kernel/power/main.c b/kernel/power/main.c index 540b16b68565..19122cf6d827 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -14,6 +14,7 @@ #include <linux/string.h> #include <linux/delay.h> #include <linux/errno.h> +#include <linux/kmod.h> #include <linux/init.h> #include <linux/console.h> #include <linux/cpu.h> @@ -237,6 +238,10 @@ static int suspend_prepare(void) if (error) goto Finish; + error = usermodehelper_disable(); + if (error) + goto Finish; + if (suspend_freeze_processes()) { error = -EAGAIN; goto Thaw; @@ -256,6 +261,7 @@ static int suspend_prepare(void) Thaw: suspend_thaw_processes(); + usermodehelper_enable(); Finish: pm_notifier_call_chain(PM_POST_SUSPEND); pm_restore_console(); @@ -376,6 +382,7 @@ int suspend_devices_and_enter(suspend_state_t state) static void suspend_finish(void) { suspend_thaw_processes(); + usermodehelper_enable(); pm_notifier_call_chain(PM_POST_SUSPEND); pm_restore_console(); } diff --git a/kernel/power/user.c b/kernel/power/user.c index a6332a313262..005b93d839ba 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -212,13 +212,20 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, case SNAPSHOT_FREEZE: if (data->frozen) break; + printk("Syncing filesystems ... "); sys_sync(); printk("done.\n"); - error = freeze_processes(); + error = usermodehelper_disable(); if (error) + break; + + error = freeze_processes(); + if (error) { thaw_processes(); + usermodehelper_enable(); + } if (!error) data->frozen = 1; break; @@ -227,6 +234,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, if (!data->frozen || data->ready) break; thaw_processes(); + usermodehelper_enable(); data->frozen = 0; break; diff --git a/kernel/printk.c b/kernel/printk.c index b51b1567bb55..6341af77eb65 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -13,7 +13,7 @@ * Fixed SMP synchronization, 08/08/99, Manfred Spraul * manfred@colorfullife.com * Rewrote bits to get rid of console_lock - * 01Mar01 Andrew Morton <andrewm@uow.edu.au> + * 01Mar01 Andrew Morton */ #include <linux/kernel.h> @@ -577,9 +577,6 @@ static int have_callable_console(void) * @fmt: format string * * This is printk(). It can be called from any context. We want it to work. - * Be aware of the fact that if oops_in_progress is not set, we might try to - * wake klogd up which could deadlock on runqueue lock if printk() is called - * from scheduler code. * * We try to grab the console_sem. If we succeed, it's easy - we log the output and * call the console drivers. If we fail to get the semaphore we place the output @@ -593,6 +590,8 @@ static int have_callable_console(void) * * See also: * printf(3) + * + * See the vsnprintf() documentation for format string extensions over C99. */ asmlinkage int printk(const char *fmt, ...) @@ -982,10 +981,25 @@ int is_console_locked(void) return console_locked; } -void wake_up_klogd(void) +static DEFINE_PER_CPU(int, printk_pending); + +void printk_tick(void) { - if (!oops_in_progress && waitqueue_active(&log_wait)) + if (__get_cpu_var(printk_pending)) { + __get_cpu_var(printk_pending) = 0; wake_up_interruptible(&log_wait); + } +} + +int printk_needs_cpu(int cpu) +{ + return per_cpu(printk_pending, cpu); +} + +void wake_up_klogd(void) +{ + if (waitqueue_active(&log_wait)) + __raw_get_cpu_var(printk_pending) = 1; } /** @@ -1291,22 +1305,6 @@ static int __init disable_boot_consoles(void) } late_initcall(disable_boot_consoles); -/** - * tty_write_message - write a message to a certain tty, not just the console. - * @tty: the destination tty_struct - * @msg: the message to write - * - * This is used for messages that need to be redirected to a specific tty. - * We don't put it into the syslog queue right now maybe in the future if - * really needed. - */ -void tty_write_message(struct tty_struct *tty, char *msg) -{ - if (tty && tty->ops->write) - tty->ops->write(tty, msg, strlen(msg)); - return; -} - #if defined CONFIG_PRINTK /* diff --git a/kernel/profile.c b/kernel/profile.c index cd26bed4cc26..a9e422df6bf6 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -22,6 +22,8 @@ #include <linux/cpu.h> #include <linux/highmem.h> #include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> #include <asm/sections.h> #include <asm/irq_regs.h> #include <asm/ptrace.h> @@ -50,11 +52,11 @@ static DEFINE_PER_CPU(int, cpu_profile_flip); static DEFINE_MUTEX(profile_flip_mutex); #endif /* CONFIG_SMP */ -static int __init profile_setup(char *str) +int profile_setup(char *str) { - static char __initdata schedstr[] = "schedule"; - static char __initdata sleepstr[] = "sleep"; - static char __initdata kvmstr[] = "kvm"; + static char schedstr[] = "schedule"; + static char sleepstr[] = "sleep"; + static char kvmstr[] = "kvm"; int par; if (!strncmp(str, sleepstr, strlen(sleepstr))) { @@ -100,14 +102,33 @@ static int __init profile_setup(char *str) __setup("profile=", profile_setup); -void __init profile_init(void) +int profile_init(void) { + int buffer_bytes; if (!prof_on) - return; + return 0; /* only text is profiled */ prof_len = (_etext - _stext) >> prof_shift; - prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t)); + buffer_bytes = prof_len*sizeof(atomic_t); + if (!slab_is_available()) { + prof_buffer = alloc_bootmem(buffer_bytes); + return 0; + } + + prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); + if (prof_buffer) + return 0; + + prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO); + if (prof_buffer) + return 0; + + prof_buffer = vmalloc(buffer_bytes); + if (prof_buffer) + return 0; + + return -ENOMEM; } /* Profile event notifications */ @@ -527,7 +548,7 @@ static void __init profile_nop(void *unused) { } -static int __init create_hash_tables(void) +static int create_hash_tables(void) { int cpu; @@ -575,14 +596,14 @@ out_cleanup: #define create_hash_tables() ({ 0; }) #endif -static int __init create_proc_profile(void) +int create_proc_profile(void) { struct proc_dir_entry *entry; if (!prof_on) return 0; if (create_hash_tables()) - return -1; + return -ENOMEM; entry = proc_create("profile", S_IWUSR | S_IRUGO, NULL, &proc_profile_operations); if (!entry) diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index aad93cdc9f68..37f72e551542 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -47,6 +47,7 @@ #include <linux/notifier.h> #include <linux/cpu.h> #include <linux/mutex.h> +#include <linux/time.h> #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; @@ -60,12 +61,14 @@ EXPORT_SYMBOL_GPL(rcu_lock_map); static struct rcu_ctrlblk rcu_ctrlblk = { .cur = -300, .completed = -300, + .pending = -300, .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), .cpumask = CPU_MASK_NONE, }; static struct rcu_ctrlblk rcu_bh_ctrlblk = { .cur = -300, .completed = -300, + .pending = -300, .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), .cpumask = CPU_MASK_NONE, }; @@ -83,7 +86,10 @@ static void force_quiescent_state(struct rcu_data *rdp, { int cpu; cpumask_t cpumask; + unsigned long flags; + set_need_resched(); + spin_lock_irqsave(&rcp->lock, flags); if (unlikely(!rcp->signaled)) { rcp->signaled = 1; /* @@ -109,6 +115,7 @@ static void force_quiescent_state(struct rcu_data *rdp, for_each_cpu_mask_nr(cpu, cpumask) smp_send_reschedule(cpu); } + spin_unlock_irqrestore(&rcp->lock, flags); } #else static inline void force_quiescent_state(struct rcu_data *rdp, @@ -118,6 +125,126 @@ static inline void force_quiescent_state(struct rcu_data *rdp, } #endif +static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp, + struct rcu_data *rdp) +{ + long batch; + + head->next = NULL; + smp_mb(); /* Read of rcu->cur must happen after any change by caller. */ + + /* + * Determine the batch number of this callback. + * + * Using ACCESS_ONCE to avoid the following error when gcc eliminates + * local variable "batch" and emits codes like this: + * 1) rdp->batch = rcp->cur + 1 # gets old value + * ...... + * 2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value + * then [*nxttail[0], *nxttail[1]) may contain callbacks + * that batch# = rdp->batch, see the comment of struct rcu_data. + */ + batch = ACCESS_ONCE(rcp->cur) + 1; + + if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) { + /* process callbacks */ + rdp->nxttail[0] = rdp->nxttail[1]; + rdp->nxttail[1] = rdp->nxttail[2]; + if (rcu_batch_after(batch - 1, rdp->batch)) + rdp->nxttail[0] = rdp->nxttail[2]; + } + + rdp->batch = batch; + *rdp->nxttail[2] = head; + rdp->nxttail[2] = &head->next; + + if (unlikely(++rdp->qlen > qhimark)) { + rdp->blimit = INT_MAX; + force_quiescent_state(rdp, &rcu_ctrlblk); + } +} + +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) +{ + rcp->gp_start = jiffies; + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; +} + +static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) +{ + int cpu; + long delta; + unsigned long flags; + + /* Only let one CPU complain about others per time interval. */ + + spin_lock_irqsave(&rcp->lock, flags); + delta = jiffies - rcp->jiffies_stall; + if (delta < 2 || rcp->cur != rcp->completed) { + spin_unlock_irqrestore(&rcp->lock, flags); + return; + } + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + spin_unlock_irqrestore(&rcp->lock, flags); + + /* OK, time to rat on our buddy... */ + + printk(KERN_ERR "RCU detected CPU stalls:"); + for_each_possible_cpu(cpu) { + if (cpu_isset(cpu, rcp->cpumask)) + printk(" %d", cpu); + } + printk(" (detected by %d, t=%ld jiffies)\n", + smp_processor_id(), (long)(jiffies - rcp->gp_start)); +} + +static void print_cpu_stall(struct rcu_ctrlblk *rcp) +{ + unsigned long flags; + + printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", + smp_processor_id(), jiffies, + jiffies - rcp->gp_start); + dump_stack(); + spin_lock_irqsave(&rcp->lock, flags); + if ((long)(jiffies - rcp->jiffies_stall) >= 0) + rcp->jiffies_stall = + jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + spin_unlock_irqrestore(&rcp->lock, flags); + set_need_resched(); /* kick ourselves to get things going. */ +} + +static void check_cpu_stall(struct rcu_ctrlblk *rcp) +{ + long delta; + + delta = jiffies - rcp->jiffies_stall; + if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { + + /* We haven't checked in, so go dump stack. */ + print_cpu_stall(rcp); + + } else if (rcp->cur != rcp->completed && delta >= 2) { + + /* They had two seconds to dump stack, so complain. */ + print_other_cpu_stall(rcp); + } +} + +#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) +{ +} + +static inline void check_cpu_stall(struct rcu_ctrlblk *rcp) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + /** * call_rcu - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. @@ -133,18 +260,10 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { unsigned long flags; - struct rcu_data *rdp; head->func = func; - head->next = NULL; local_irq_save(flags); - rdp = &__get_cpu_var(rcu_data); - *rdp->nxttail = head; - rdp->nxttail = &head->next; - if (unlikely(++rdp->qlen > qhimark)) { - rdp->blimit = INT_MAX; - force_quiescent_state(rdp, &rcu_ctrlblk); - } + __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data)); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(call_rcu); @@ -169,20 +288,10 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { unsigned long flags; - struct rcu_data *rdp; head->func = func; - head->next = NULL; local_irq_save(flags); - rdp = &__get_cpu_var(rcu_bh_data); - *rdp->nxttail = head; - rdp->nxttail = &head->next; - - if (unlikely(++rdp->qlen > qhimark)) { - rdp->blimit = INT_MAX; - force_quiescent_state(rdp, &rcu_bh_ctrlblk); - } - + __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(call_rcu_bh); @@ -211,12 +320,6 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); static inline void raise_rcu_softirq(void) { raise_softirq(RCU_SOFTIRQ); - /* - * The smp_mb() here is required to ensure that this cpu's - * __rcu_process_callbacks() reads the most recently updated - * value of rcu->cur. - */ - smp_mb(); } /* @@ -225,6 +328,7 @@ static inline void raise_rcu_softirq(void) */ static void rcu_do_batch(struct rcu_data *rdp) { + unsigned long flags; struct rcu_head *next, *list; int count = 0; @@ -239,9 +343,9 @@ static void rcu_do_batch(struct rcu_data *rdp) } rdp->donelist = list; - local_irq_disable(); + local_irq_save(flags); rdp->qlen -= count; - local_irq_enable(); + local_irq_restore(flags); if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) rdp->blimit = blimit; @@ -269,6 +373,7 @@ static void rcu_do_batch(struct rcu_data *rdp) * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace * period (if necessary). */ + /* * Register a new batch of callbacks, and start it up if there is currently no * active batch and the batch to be registered has not already occurred. @@ -276,15 +381,10 @@ static void rcu_do_batch(struct rcu_data *rdp) */ static void rcu_start_batch(struct rcu_ctrlblk *rcp) { - if (rcp->next_pending && + if (rcp->cur != rcp->pending && rcp->completed == rcp->cur) { - rcp->next_pending = 0; - /* - * next_pending == 0 must be visible in - * __rcu_process_callbacks() before it can see new value of cur. - */ - smp_wmb(); rcp->cur++; + record_gp_stall_check_time(rcp); /* * Accessing nohz_cpu_mask before incrementing rcp->cur needs a @@ -322,6 +422,8 @@ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + unsigned long flags; + if (rdp->quiescbatch != rcp->cur) { /* start new grace period: */ rdp->qs_pending = 1; @@ -345,7 +447,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, return; rdp->qs_pending = 0; - spin_lock(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags); /* * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync * during cpu startup. Ignore the quiescent state. @@ -353,7 +455,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, if (likely(rdp->quiescbatch == rcp->cur)) cpu_quiet(rdp->cpu, rcp); - spin_unlock(&rcp->lock); + spin_unlock_irqrestore(&rcp->lock, flags); } @@ -364,33 +466,38 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, * which is dead and hence not processing interrupts. */ static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, - struct rcu_head **tail) + struct rcu_head **tail, long batch) { - local_irq_disable(); - *this_rdp->nxttail = list; - if (list) - this_rdp->nxttail = tail; - local_irq_enable(); + unsigned long flags; + + if (list) { + local_irq_save(flags); + this_rdp->batch = batch; + *this_rdp->nxttail[2] = list; + this_rdp->nxttail[2] = tail; + local_irq_restore(flags); + } } static void __rcu_offline_cpu(struct rcu_data *this_rdp, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - /* if the cpu going offline owns the grace period + unsigned long flags; + + /* + * if the cpu going offline owns the grace period * we can block indefinitely waiting for it, so flush * it here */ - spin_lock_bh(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags); if (rcp->cur != rcp->completed) cpu_quiet(rdp->cpu, rcp); - spin_unlock_bh(&rcp->lock); - rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); - rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); - rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); + rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1); + rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1); + spin_unlock(&rcp->lock); - local_irq_disable(); this_rdp->qlen += rdp->qlen; - local_irq_enable(); + local_irq_restore(flags); } static void rcu_offline_cpu(int cpu) @@ -420,38 +527,52 @@ static void rcu_offline_cpu(int cpu) static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { - *rdp->donetail = rdp->curlist; - rdp->donetail = rdp->curtail; - rdp->curlist = NULL; - rdp->curtail = &rdp->curlist; - } + unsigned long flags; + long completed_snap; - if (rdp->nxtlist && !rdp->curlist) { - local_irq_disable(); - rdp->curlist = rdp->nxtlist; - rdp->curtail = rdp->nxttail; - rdp->nxtlist = NULL; - rdp->nxttail = &rdp->nxtlist; - local_irq_enable(); + if (rdp->nxtlist) { + local_irq_save(flags); + completed_snap = ACCESS_ONCE(rcp->completed); /* - * start the next batch of callbacks + * move the other grace-period-completed entries to + * [rdp->nxtlist, *rdp->nxttail[0]) temporarily */ + if (!rcu_batch_before(completed_snap, rdp->batch)) + rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2]; + else if (!rcu_batch_before(completed_snap, rdp->batch - 1)) + rdp->nxttail[0] = rdp->nxttail[1]; - /* determine batch number */ - rdp->batch = rcp->cur + 1; - /* see the comment and corresponding wmb() in - * the rcu_start_batch() + /* + * the grace period for entries in + * [rdp->nxtlist, *rdp->nxttail[0]) has completed and + * move these entries to donelist */ - smp_rmb(); + if (rdp->nxttail[0] != &rdp->nxtlist) { + *rdp->donetail = rdp->nxtlist; + rdp->donetail = rdp->nxttail[0]; + rdp->nxtlist = *rdp->nxttail[0]; + *rdp->donetail = NULL; + + if (rdp->nxttail[1] == rdp->nxttail[0]) + rdp->nxttail[1] = &rdp->nxtlist; + if (rdp->nxttail[2] == rdp->nxttail[0]) + rdp->nxttail[2] = &rdp->nxtlist; + rdp->nxttail[0] = &rdp->nxtlist; + } + + local_irq_restore(flags); + + if (rcu_batch_after(rdp->batch, rcp->pending)) { + unsigned long flags2; - if (!rcp->next_pending) { /* and start it/schedule start if it's a new batch */ - spin_lock(&rcp->lock); - rcp->next_pending = 1; - rcu_start_batch(rcp); - spin_unlock(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags2); + if (rcu_batch_after(rdp->batch, rcp->pending)) { + rcp->pending = rdp->batch; + rcu_start_batch(rcp); + } + spin_unlock_irqrestore(&rcp->lock, flags2); } } @@ -462,21 +583,53 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, static void rcu_process_callbacks(struct softirq_action *unused) { + /* + * Memory references from any prior RCU read-side critical sections + * executed by the interrupted code must be see before any RCU + * grace-period manupulations below. + */ + + smp_mb(); /* See above block comment. */ + __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); + + /* + * Memory references from any later RCU read-side critical sections + * executed by the interrupted code must be see after any RCU + * grace-period manupulations above. + */ + + smp_mb(); /* See above block comment. */ } static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - /* This cpu has pending rcu entries and the grace period - * for them has completed. - */ - if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) - return 1; + /* Check for CPU stalls, if enabled. */ + check_cpu_stall(rcp); - /* This cpu has no pending entries, but there are new entries */ - if (!rdp->curlist && rdp->nxtlist) - return 1; + if (rdp->nxtlist) { + long completed_snap = ACCESS_ONCE(rcp->completed); + + /* + * This cpu has pending rcu entries and the grace period + * for them has completed. + */ + if (!rcu_batch_before(completed_snap, rdp->batch)) + return 1; + if (!rcu_batch_before(completed_snap, rdp->batch - 1) && + rdp->nxttail[0] != rdp->nxttail[1]) + return 1; + if (rdp->nxttail[0] != &rdp->nxtlist) + return 1; + + /* + * This cpu has pending rcu entries and the new batch + * for then hasn't been started nor scheduled start + */ + if (rcu_batch_after(rdp->batch, rcp->pending)) + return 1; + } /* This cpu has finished callbacks to invoke */ if (rdp->donelist) @@ -512,9 +665,15 @@ int rcu_needs_cpu(int cpu) struct rcu_data *rdp = &per_cpu(rcu_data, cpu); struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); - return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); + return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu); } +/* + * Top-level function driving RCU grace-period detection, normally + * invoked from the scheduler-clock interrupt. This function simply + * increments counters that are read only from softirq by this same + * CPU, so there are no memory barriers required. + */ void rcu_check_callbacks(int cpu, int user) { if (user || @@ -558,14 +717,17 @@ void rcu_check_callbacks(int cpu, int user) static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + unsigned long flags; + + spin_lock_irqsave(&rcp->lock, flags); memset(rdp, 0, sizeof(*rdp)); - rdp->curtail = &rdp->curlist; - rdp->nxttail = &rdp->nxtlist; + rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist; rdp->donetail = &rdp->donelist; rdp->quiescbatch = rcp->completed; rdp->qs_pending = 0; rdp->cpu = cpu; rdp->blimit = blimit; + spin_unlock_irqrestore(&rcp->lock, flags); } static void __cpuinit rcu_online_cpu(int cpu) @@ -610,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = { */ void __init __rcu_init(void) { +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)smp_processor_id()); /* Register notifier for non-boot CPUs */ diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 27827931ca0d..ca4bbbe04aa4 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -59,14 +59,6 @@ #include <linux/rcupreempt_trace.h> /* - * Macro that prevents the compiler from reordering accesses, but does - * absolutely -nothing- to prevent CPUs from reordering. This is used - * only to mediate communication between mainline code and hardware - * interrupt and NMI handlers. - */ -#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) - -/* * PREEMPT_RCU data structures. */ diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c index 5edf82c34bbc..35c2d3360ecf 100644 --- a/kernel/rcupreempt_trace.c +++ b/kernel/rcupreempt_trace.c @@ -308,11 +308,16 @@ out: static int __init rcupreempt_trace_init(void) { + int ret; + mutex_init(&rcupreempt_trace_mutex); rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL); if (!rcupreempt_trace_buf) return 1; - return rcupreempt_debugfs_init(); + ret = rcupreempt_debugfs_init(); + if (ret) + kfree(rcupreempt_trace_buf); + return ret; } static void __exit rcupreempt_trace_cleanup(void) diff --git a/kernel/resource.c b/kernel/resource.c index 03d796c1b2e9..4089d12af6e0 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -38,10 +38,6 @@ EXPORT_SYMBOL(iomem_resource); static DEFINE_RWLOCK(resource_lock); -#ifdef CONFIG_PROC_FS - -enum { MAX_IORES_LEVEL = 5 }; - static void *r_next(struct seq_file *m, void *v, loff_t *pos) { struct resource *p = v; @@ -53,6 +49,10 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos) return p->sibling; } +#ifdef CONFIG_PROC_FS + +enum { MAX_IORES_LEVEL = 5 }; + static void *r_start(struct seq_file *m, loff_t *pos) __acquires(resource_lock) { @@ -516,6 +516,70 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t return result; } +static void __init __reserve_region_with_split(struct resource *root, + resource_size_t start, resource_size_t end, + const char *name) +{ + struct resource *parent = root; + struct resource *conflict; + struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); + + if (!res) + return; + + res->name = name; + res->start = start; + res->end = end; + res->flags = IORESOURCE_BUSY; + + for (;;) { + conflict = __request_resource(parent, res); + if (!conflict) + break; + if (conflict != parent) { + parent = conflict; + if (!(conflict->flags & IORESOURCE_BUSY)) + continue; + } + + /* Uhhuh, that didn't work out.. */ + kfree(res); + res = NULL; + break; + } + + if (!res) { + /* failed, split and try again */ + + /* conflict covered whole area */ + if (conflict->start <= start && conflict->end >= end) + return; + + if (conflict->start > start) + __reserve_region_with_split(root, start, conflict->start-1, name); + if (!(conflict->flags & IORESOURCE_BUSY)) { + resource_size_t common_start, common_end; + + common_start = max(conflict->start, start); + common_end = min(conflict->end, end); + if (common_start < common_end) + __reserve_region_with_split(root, common_start, common_end, name); + } + if (conflict->end < end) + __reserve_region_with_split(root, conflict->end+1, end, name); + } + +} + +void reserve_region_with_split(struct resource *root, + resource_size_t start, resource_size_t end, + const char *name) +{ + write_lock(&resource_lock); + __reserve_region_with_split(root, start, end, name); + write_unlock(&resource_lock); +} + EXPORT_SYMBOL(adjust_resource); /** @@ -562,33 +626,34 @@ struct resource * __request_region(struct resource *parent, { struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); - if (res) { - res->name = name; - res->start = start; - res->end = start + n - 1; - res->flags = IORESOURCE_BUSY; + if (!res) + return NULL; - write_lock(&resource_lock); + res->name = name; + res->start = start; + res->end = start + n - 1; + res->flags = IORESOURCE_BUSY; - for (;;) { - struct resource *conflict; + write_lock(&resource_lock); - conflict = __request_resource(parent, res); - if (!conflict) - break; - if (conflict != parent) { - parent = conflict; - if (!(conflict->flags & IORESOURCE_BUSY)) - continue; - } + for (;;) { + struct resource *conflict; - /* Uhhuh, that didn't work out.. */ - kfree(res); - res = NULL; + conflict = __request_resource(parent, res); + if (!conflict) break; + if (conflict != parent) { + parent = conflict; + if (!(conflict->flags & IORESOURCE_BUSY)) + continue; } - write_unlock(&resource_lock); + + /* Uhhuh, that didn't work out.. */ + kfree(res); + res = NULL; + break; } + write_unlock(&resource_lock); return res; } EXPORT_SYMBOL(__request_region); @@ -763,3 +828,40 @@ static int __init reserve_setup(char *str) } __setup("reserve=", reserve_setup); + +/* + * Check if the requested addr and size spans more than any slot in the + * iomem resource tree. + */ +int iomem_map_sanity_check(resource_size_t addr, unsigned long size) +{ + struct resource *p = &iomem_resource; + int err = 0; + loff_t l; + + read_lock(&resource_lock); + for (p = p->child; p ; p = r_next(NULL, p, &l)) { + /* + * We can probably skip the resources without + * IORESOURCE_IO attribute? + */ + if (p->start >= addr + size) + continue; + if (p->end < addr) + continue; + if (p->start <= addr && (p->end >= addr + size - 1)) + continue; + printk(KERN_WARNING "resource map sanity check conflict: " + "0x%llx 0x%llx 0x%llx 0x%llx %s\n", + (unsigned long long)addr, + (unsigned long long)(addr + size - 1), + (unsigned long long)p->start, + (unsigned long long)p->end, + p->name); + err = -1; + break; + } + read_unlock(&resource_lock); + + return err; +} diff --git a/kernel/sched.c b/kernel/sched.c index 1a5f73c1fcdc..09a8c15748f1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -201,14 +201,19 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rt_b->rt_period_timer.function = sched_rt_period_timer; - rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; +} + +static inline int rt_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; } static void start_rt_bandwidth(struct rt_bandwidth *rt_b) { ktime_t now; - if (rt_b->rt_runtime == RUNTIME_INF) + if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) return; if (hrtimer_active(&rt_b->rt_period_timer)) @@ -298,9 +303,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; #endif /* CONFIG_RT_GROUP_SCHED */ -#else /* !CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_USER_SCHED */ /* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. @@ -604,9 +609,9 @@ struct rq { static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) +static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) { - rq->curr->sched_class->check_preempt_curr(rq, p); + rq->curr->sched_class->check_preempt_curr(rq, p, sync); } static inline int cpu_of(struct rq *rq) @@ -1087,7 +1092,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_DONE; } -static void init_hrtick(void) +static __init void init_hrtick(void) { hotcpu_notifier(hotplug_hrtick, 0); } @@ -1102,7 +1107,7 @@ static void hrtick_start(struct rq *rq, u64 delay) hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); } -static void init_hrtick(void) +static inline void init_hrtick(void) { } #endif /* CONFIG_SMP */ @@ -1119,9 +1124,9 @@ static void init_rq_hrtick(struct rq *rq) hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rq->hrtick_timer.function = hrtick; - rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; } -#else +#else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) { } @@ -1133,7 +1138,7 @@ static inline void init_rq_hrtick(struct rq *rq) static inline void init_hrtick(void) { } -#endif +#endif /* CONFIG_SCHED_HRTICK */ /* * resched_task - mark a task 'to be rescheduled now'. @@ -1380,38 +1385,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) update_load_sub(&rq->load, load); } -#ifdef CONFIG_SMP -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); -static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); - -static unsigned long cpu_avg_load_per_task(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - if (rq->nr_running) - rq->avg_load_per_task = rq->load.weight / rq->nr_running; - - return rq->avg_load_per_task; -} - -#ifdef CONFIG_FAIR_GROUP_SCHED - -typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); +#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) +typedef int (*tg_visitor)(struct task_group *, void *); /* * Iterate the full tree, calling @down when first entering a node and @up when * leaving it for the final time. */ -static void -walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd) +static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) { struct task_group *parent, *child; + int ret; rcu_read_lock(); parent = &root_task_group; down: - (*down)(parent, cpu, sd); + ret = (*down)(parent, data); + if (ret) + goto out_unlock; list_for_each_entry_rcu(child, &parent->children, siblings) { parent = child; goto down; @@ -1419,15 +1410,43 @@ down: up: continue; } - (*up)(parent, cpu, sd); + ret = (*up)(parent, data); + if (ret) + goto out_unlock; child = parent; parent = parent->parent; if (parent) goto up; +out_unlock: rcu_read_unlock(); + + return ret; } +static int tg_nop(struct task_group *tg, void *data) +{ + return 0; +} +#endif + +#ifdef CONFIG_SMP +static unsigned long source_load(int cpu, int type); +static unsigned long target_load(int cpu, int type); +static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); + +static unsigned long cpu_avg_load_per_task(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->nr_running) + rq->avg_load_per_task = rq->load.weight / rq->nr_running; + + return rq->avg_load_per_task; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED + static void __set_se_shares(struct sched_entity *se, unsigned long shares); /* @@ -1486,11 +1505,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu, * This needs to be done in a bottom-up fashion because the rq weight of a * parent group depends on the shares of its child groups. */ -static void -tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) +static int tg_shares_up(struct task_group *tg, void *data) { unsigned long rq_weight = 0; unsigned long shares = 0; + struct sched_domain *sd = data; int i; for_each_cpu_mask(i, sd->span) { @@ -1515,6 +1534,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) __update_group_shares_cpu(tg, i, shares, rq_weight); spin_unlock_irqrestore(&rq->lock, flags); } + + return 0; } /* @@ -1522,10 +1543,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) * This needs to be done in a top-down fashion because the load of a child * group is a fraction of its parents load. */ -static void -tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) +static int tg_load_down(struct task_group *tg, void *data) { unsigned long load; + long cpu = (long)data; if (!tg->parent) { load = cpu_rq(cpu)->load.weight; @@ -1536,11 +1557,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) } tg->cfs_rq[cpu]->h_load = load; -} -static void -tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) -{ + return 0; } static void update_shares(struct sched_domain *sd) @@ -1550,7 +1568,7 @@ static void update_shares(struct sched_domain *sd) if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { sd->last_update = now; - walk_tg_tree(tg_nop, tg_shares_up, 0, sd); + walk_tg_tree(tg_nop, tg_shares_up, sd); } } @@ -1561,9 +1579,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) spin_lock(&rq->lock); } -static void update_h_load(int cpu) +static void update_h_load(long cpu) { - walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); + walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } #else @@ -1921,11 +1939,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) running = task_running(rq, p); on_rq = p->se.on_rq; ncsw = 0; - if (!match_state || p->state == match_state) { - ncsw = p->nivcsw + p->nvcsw; - if (unlikely(!ncsw)) - ncsw = 1; - } + if (!match_state || p->state == match_state) + ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ task_rq_unlock(rq, &flags); /* @@ -2285,7 +2300,7 @@ out_running: trace_mark(kernel_sched_wakeup, "pid %d state %ld ## rq %p task %p rq->curr %p", p->pid, p->state, rq, p, rq->curr); - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, sync); p->state = TASK_RUNNING; #ifdef CONFIG_SMP @@ -2420,7 +2435,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) trace_mark(kernel_sched_wakeup_new, "pid %d state %ld ## rq %p task %p rq->curr %p", p->pid, p->state, rq, p, rq->curr); - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) p->sched_class->task_wake_up(rq, p); @@ -2880,7 +2895,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. */ - check_preempt_curr(this_rq, p); + check_preempt_curr(this_rq, p, 0); } /* @@ -4037,23 +4052,26 @@ DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); /* - * Return p->sum_exec_runtime plus any more ns on the sched_clock - * that have not yet been banked in case the task is currently running. + * Return any ns on the sched_clock that have not yet been banked in + * @p in case that task is currently running. */ -unsigned long long task_sched_runtime(struct task_struct *p) +unsigned long long task_delta_exec(struct task_struct *p) { unsigned long flags; - u64 ns, delta_exec; struct rq *rq; + u64 ns = 0; rq = task_rq_lock(p, &flags); - ns = p->se.sum_exec_runtime; + if (task_current(rq, p)) { + u64 delta_exec; + update_rq_clock(rq); delta_exec = rq->clock - p->se.exec_start; if ((s64)delta_exec > 0) - ns += delta_exec; + ns = delta_exec; } + task_rq_unlock(rq, &flags); return ns; @@ -4070,6 +4088,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime) cputime64_t tmp; p->utime = cputime_add(p->utime, cputime); + account_group_user_time(p, cputime); /* Add user time to cpustat. */ tmp = cputime_to_cputime64(cputime); @@ -4094,6 +4113,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime) tmp = cputime_to_cputime64(cputime); p->utime = cputime_add(p->utime, cputime); + account_group_user_time(p, cputime); p->gtime = cputime_add(p->gtime, cputime); cpustat->user = cputime64_add(cpustat->user, tmp); @@ -4129,6 +4149,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, } p->stime = cputime_add(p->stime, cputime); + account_group_system_time(p, cputime); /* Add system time to cpustat. */ tmp = cputime_to_cputime64(cputime); @@ -4170,6 +4191,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal) if (p == rq->idle) { p->stime = cputime_add(p->stime, steal); + account_group_system_time(p, steal); if (atomic_read(&rq->nr_iowait) > 0) cpustat->iowait = cputime64_add(cpustat->iowait, tmp); else @@ -4627,6 +4649,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ +/** + * complete: - signals a single thread waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up a single thread waiting on this completion. Threads will be + * awakened in the same order in which they were queued. + * + * See also complete_all(), wait_for_completion() and related routines. + */ void complete(struct completion *x) { unsigned long flags; @@ -4638,6 +4669,12 @@ void complete(struct completion *x) } EXPORT_SYMBOL(complete); +/** + * complete_all: - signals all threads waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up all threads waiting on this particular completion event. + */ void complete_all(struct completion *x) { unsigned long flags; @@ -4658,10 +4695,7 @@ do_wait_for_common(struct completion *x, long timeout, int state) wait.flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue_tail(&x->wait, &wait); do { - if ((state == TASK_INTERRUPTIBLE && - signal_pending(current)) || - (state == TASK_KILLABLE && - fatal_signal_pending(current))) { + if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; break; } @@ -4689,12 +4723,31 @@ wait_for_common(struct completion *x, long timeout, int state) return timeout; } +/** + * wait_for_completion: - waits for completion of a task + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. + * + * See also similar routines (i.e. wait_for_completion_timeout()) with timeout + * and interrupt capability. Also see complete(). + */ void __sched wait_for_completion(struct completion *x) { wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(wait_for_completion); +/** + * wait_for_completion_timeout: - waits for completion of a task (w/timeout) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. It is not + * interruptible. + */ unsigned long __sched wait_for_completion_timeout(struct completion *x, unsigned long timeout) { @@ -4702,6 +4755,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout) } EXPORT_SYMBOL(wait_for_completion_timeout); +/** + * wait_for_completion_interruptible: - waits for completion of a task (w/intr) + * @x: holds the state of this particular completion + * + * This waits for completion of a specific task to be signaled. It is + * interruptible. + */ int __sched wait_for_completion_interruptible(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); @@ -4711,6 +4771,14 @@ int __sched wait_for_completion_interruptible(struct completion *x) } EXPORT_SYMBOL(wait_for_completion_interruptible); +/** + * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. It is interruptible. The timeout is in jiffies. + */ unsigned long __sched wait_for_completion_interruptible_timeout(struct completion *x, unsigned long timeout) @@ -4719,6 +4787,13 @@ wait_for_completion_interruptible_timeout(struct completion *x, } EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); +/** + * wait_for_completion_killable: - waits for completion of a task (killable) + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It can be + * interrupted by a kill signal. + */ int __sched wait_for_completion_killable(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); @@ -5121,7 +5196,8 @@ recheck: * Do not allow realtime tasks into groups that have no runtime * assigned. */ - if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) + if (rt_bandwidth_enabled() && rt_policy(policy) && + task_group(p)->rt_bandwidth.rt_runtime == 0) return -EPERM; #endif @@ -5957,7 +6033,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) set_task_cpu(p, dest_cpu); if (on_rq) { activate_task(rq_dest, p, 0); - check_preempt_curr(rq_dest, p); + check_preempt_curr(rq_dest, p, 0); } done: ret = 1; @@ -6282,7 +6358,7 @@ set_table_entry(struct ctl_table *entry, static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(12); + struct ctl_table *table = sd_alloc_ctl_entry(13); if (table == NULL) return NULL; @@ -6310,7 +6386,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[10], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); - /* &table[11] is terminator */ + set_table_entry(&table[11], "name", sd->name, + CORENAME_MAX_SIZE, 0444, proc_dostring); + /* &table[12] is terminator */ return table; } @@ -7194,13 +7272,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) * Non-inlined to reduce accumulated stack pressure in build_sched_domains() */ +#ifdef CONFIG_SCHED_DEBUG +# define SD_INIT_NAME(sd, type) sd->name = #type +#else +# define SD_INIT_NAME(sd, type) do { } while (0) +#endif + #define SD_INIT(sd, type) sd_init_##type(sd) + #define SD_INIT_FUNC(type) \ static noinline void sd_init_##type(struct sched_domain *sd) \ { \ memset(sd, 0, sizeof(*sd)); \ *sd = SD_##type##_INIT; \ sd->level = SD_LV_##type; \ + SD_INIT_NAME(sd, type); \ } SD_INIT_FUNC(CPU) @@ -7696,24 +7782,27 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * and partition_sched_domains() will fallback to the single partition * 'fallback_doms', it also forces the domains to be rebuilt. * + * If doms_new==NULL it will be replaced with cpu_online_map. + * ndoms_new==0 is a special case for destroying existing domains. + * It will not create the default domain. + * * Call with hotplug lock held */ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, struct sched_domain_attr *dattr_new) { - int i, j; + int i, j, n; mutex_lock(&sched_domains_mutex); /* always unregister in case we don't destroy any domains */ unregister_sched_domain_sysctl(); - if (doms_new == NULL) - ndoms_new = 0; + n = doms_new ? ndoms_new : 0; /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { - for (j = 0; j < ndoms_new; j++) { + for (j = 0; j < n; j++) { if (cpus_equal(doms_cur[i], doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; @@ -7726,7 +7815,6 @@ match1: if (doms_new == NULL) { ndoms_cur = 0; - ndoms_new = 1; doms_new = &fallback_doms; cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); dattr_new = NULL; @@ -7763,8 +7851,13 @@ match2: int arch_reinit_sched_domains(void) { get_online_cpus(); + + /* Destroy domains first to force the rebuild */ + partition_sched_domains(0, NULL, NULL); + rebuild_sched_domains(); put_online_cpus(); + return 0; } @@ -7848,7 +7941,7 @@ static int update_sched_domains(struct notifier_block *nfb, case CPU_ONLINE_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - partition_sched_domains(0, NULL, NULL); + partition_sched_domains(1, NULL, NULL); return NOTIFY_OK; default: @@ -8235,20 +8328,25 @@ void __might_sleep(char *file, int line) #ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ - if ((in_atomic() || irqs_disabled()) && - system_state == SYSTEM_RUNNING && !oops_in_progress) { - if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) - return; - prev_jiffy = jiffies; - printk(KERN_ERR "BUG: sleeping function called from invalid" - " context at %s:%d\n", file, line); - printk("in_atomic():%d, irqs_disabled():%d\n", - in_atomic(), irqs_disabled()); - debug_show_held_locks(current); - if (irqs_disabled()) - print_irqtrace_events(current); - dump_stack(); - } + if ((!in_atomic() && !irqs_disabled()) || + system_state != SYSTEM_RUNNING || oops_in_progress) + return; + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + + printk(KERN_ERR + "BUG: sleeping function called from invalid context at %s:%d\n", + file, line); + printk(KERN_ERR + "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), + current->pid, current->comm); + + debug_show_held_locks(current); + if (irqs_disabled()) + print_irqtrace_events(current); + dump_stack(); #endif } EXPORT_SYMBOL(__might_sleep); @@ -8746,73 +8844,95 @@ static DEFINE_MUTEX(rt_constraints_mutex); static unsigned long to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) - return 1ULL << 16; + return 1ULL << 20; - return div64_u64(runtime << 16, period); + return div64_u64(runtime << 20, period); } -#ifdef CONFIG_CGROUP_SCHED -static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) +/* Must be called with tasklist_lock held */ +static inline int tg_has_rt_tasks(struct task_group *tg) { - struct task_group *tgi, *parent = tg->parent; - unsigned long total = 0; + struct task_struct *g, *p; - if (!parent) { - if (global_rt_period() < period) - return 0; + do_each_thread(g, p) { + if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) + return 1; + } while_each_thread(g, p); - return to_ratio(period, runtime) < - to_ratio(global_rt_period(), global_rt_runtime()); - } + return 0; +} - if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) - return 0; +struct rt_schedulable_data { + struct task_group *tg; + u64 rt_period; + u64 rt_runtime; +}; - rcu_read_lock(); - list_for_each_entry_rcu(tgi, &parent->children, siblings) { - if (tgi == tg) - continue; +static int tg_schedulable(struct task_group *tg, void *data) +{ + struct rt_schedulable_data *d = data; + struct task_group *child; + unsigned long total, sum = 0; + u64 period, runtime; + + period = ktime_to_ns(tg->rt_bandwidth.rt_period); + runtime = tg->rt_bandwidth.rt_runtime; - total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), - tgi->rt_bandwidth.rt_runtime); + if (tg == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; } - rcu_read_unlock(); - return total + to_ratio(period, runtime) <= - to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), - parent->rt_bandwidth.rt_runtime); -} -#elif defined CONFIG_USER_SCHED -static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) -{ - struct task_group *tgi; - unsigned long total = 0; - unsigned long global_ratio = - to_ratio(global_rt_period(), global_rt_runtime()); + /* + * Cannot have more runtime than the period. + */ + if (runtime > period && runtime != RUNTIME_INF) + return -EINVAL; - rcu_read_lock(); - list_for_each_entry_rcu(tgi, &task_groups, list) { - if (tgi == tg) - continue; + /* + * Ensure we don't starve existing RT tasks. + */ + if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) + return -EBUSY; + + total = to_ratio(period, runtime); + + /* + * Nobody can have more than the global setting allows. + */ + if (total > to_ratio(global_rt_period(), global_rt_runtime())) + return -EINVAL; - total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), - tgi->rt_bandwidth.rt_runtime); + /* + * The sum of our children's runtime should not exceed our own. + */ + list_for_each_entry_rcu(child, &tg->children, siblings) { + period = ktime_to_ns(child->rt_bandwidth.rt_period); + runtime = child->rt_bandwidth.rt_runtime; + + if (child == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; + } + + sum += to_ratio(period, runtime); } - rcu_read_unlock(); - return total + to_ratio(period, runtime) < global_ratio; + if (sum > total) + return -EINVAL; + + return 0; } -#endif -/* Must be called with tasklist_lock held */ -static inline int tg_has_rt_tasks(struct task_group *tg) +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { - struct task_struct *g, *p; - do_each_thread(g, p) { - if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) - return 1; - } while_each_thread(g, p); - return 0; + struct rt_schedulable_data data = { + .tg = tg, + .rt_period = period, + .rt_runtime = runtime, + }; + + return walk_tg_tree(tg_schedulable, tg_nop, &data); } static int tg_set_bandwidth(struct task_group *tg, @@ -8822,14 +8942,9 @@ static int tg_set_bandwidth(struct task_group *tg, mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); - if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { - err = -EBUSY; - goto unlock; - } - if (!__rt_schedulable(tg, rt_period, rt_runtime)) { - err = -EINVAL; + err = __rt_schedulable(tg, rt_period, rt_runtime); + if (err) goto unlock; - } spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); @@ -8898,16 +9013,25 @@ long sched_group_rt_period(struct task_group *tg) static int sched_rt_global_constraints(void) { - struct task_group *tg = &root_task_group; - u64 rt_runtime, rt_period; + u64 runtime, period; int ret = 0; - rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); - rt_runtime = tg->rt_bandwidth.rt_runtime; + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + + runtime = global_rt_runtime(); + period = global_rt_period(); + + /* + * Sanity check on the sysctl variables. + */ + if (runtime > period && runtime != RUNTIME_INF) + return -EINVAL; mutex_lock(&rt_constraints_mutex); - if (!__rt_schedulable(tg, rt_period, rt_runtime)) - ret = -EINVAL; + read_lock(&tasklist_lock); + ret = __rt_schedulable(NULL, 0, 0); + read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); return ret; @@ -8918,6 +9042,9 @@ static int sched_rt_global_constraints(void) unsigned long flags; int i; + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { struct rt_rq *rt_rq = &cpu_rq(i)->rt; @@ -8978,7 +9105,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) if (!cgrp->parent) { /* This is early initialization for the top cgroup */ - init_task_group.css.cgroup = cgrp; return &init_task_group.css; } @@ -8987,9 +9113,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); - /* Bind the cgroup to task_group object we just created */ - tg->css.cgroup = cgrp; - return &tg->css; } diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index e8ab096ddfe3..81787248b60f 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -118,13 +118,13 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) /* * scd->clock = clamp(scd->tick_gtod + delta, - * max(scd->tick_gtod, scd->clock), - * scd->tick_gtod + TICK_NSEC); + * max(scd->tick_gtod, scd->clock), + * max(scd->clock, scd->tick_gtod + TICK_NSEC)); */ clock = scd->tick_gtod + delta; min_clock = wrap_max(scd->tick_gtod, scd->clock); - max_clock = scd->tick_gtod + TICK_NSEC; + max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC); clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index bbe6b31c3c56..ad958c1ec708 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -333,12 +333,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) unsigned long flags; int num_threads = 1; - rcu_read_lock(); if (lock_task_sighand(p, &flags)) { num_threads = atomic_read(&p->signal->count); unlock_task_sighand(p, &flags); } - rcu_read_unlock(); SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); SEQ_printf(m, diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index fb8994c6d4bb..f604dae71316 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -409,64 +409,6 @@ static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) } /* - * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in - * that it favours >=0 over <0. - * - * -20 | - * | - * 0 --------+------- - * .' - * 19 .' - * - */ -static unsigned long -calc_delta_asym(unsigned long delta, struct sched_entity *se) -{ - struct load_weight lw = { - .weight = NICE_0_LOAD, - .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT) - }; - - for_each_sched_entity(se) { - struct load_weight *se_lw = &se->load; - unsigned long rw = cfs_rq_of(se)->load.weight; - -#ifdef CONFIG_FAIR_SCHED_GROUP - struct cfs_rq *cfs_rq = se->my_q; - struct task_group *tg = NULL - - if (cfs_rq) - tg = cfs_rq->tg; - - if (tg && tg->shares < NICE_0_LOAD) { - /* - * scale shares to what it would have been had - * tg->weight been NICE_0_LOAD: - * - * weight = 1024 * shares / tg->weight - */ - lw.weight *= se->load.weight; - lw.weight /= tg->shares; - - lw.inv_weight = 0; - - se_lw = &lw; - rw += lw.weight - se->load.weight; - } else -#endif - - if (se->load.weight < NICE_0_LOAD) { - se_lw = &lw; - rw += NICE_0_LOAD - se->load.weight; - } - - delta = calc_delta_mine(delta, rw, se_lw); - } - - return delta; -} - -/* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. */ @@ -507,6 +449,7 @@ static void update_curr(struct cfs_rq *cfs_rq) struct task_struct *curtask = task_of(curr); cpuacct_charge(curtask, delta_exec); + account_group_exec_runtime(curtask, delta_exec); } } @@ -586,11 +529,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_add(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) inc_cpu_load(rq_of(cfs_rq), se->load.weight); - if (entity_is_task(se)) + if (entity_is_task(se)) { add_cfs_task_weight(cfs_rq, se->load.weight); + list_add(&se->group_node, &cfs_rq->tasks); + } cfs_rq->nr_running++; se->on_rq = 1; - list_add(&se->group_node, &cfs_rq->tasks); } static void @@ -599,11 +543,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) dec_cpu_load(rq_of(cfs_rq), se->load.weight); - if (entity_is_task(se)) + if (entity_is_task(se)) { add_cfs_task_weight(cfs_rq, -se->load.weight); + list_del_init(&se->group_node); + } cfs_rq->nr_running--; se->on_rq = 0; - list_del_init(&se->group_node); } static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -1085,7 +1030,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { struct sched_entity *se = tg->se[cpu]; - long more_w; if (!tg->parent) return wl; @@ -1097,18 +1041,17 @@ static long effective_load(struct task_group *tg, int cpu, if (!wl && sched_feat(ASYM_EFF_LOAD)) return wl; - /* - * Instead of using this increment, also add the difference - * between when the shares were last updated and now. - */ - more_w = se->my_q->load.weight - se->my_q->rq_weight; - wl += more_w; - wg += more_w; - for_each_sched_entity(se) { -#define D(n) (likely(n) ? (n) : 1) - long S, rw, s, a, b; + long more_w; + + /* + * Instead of using this increment, also add the difference + * between when the shares were last updated and now. + */ + more_w = se->my_q->load.weight - se->my_q->rq_weight; + wl += more_w; + wg += more_w; S = se->my_q->tg->shares; s = se->my_q->shares; @@ -1117,7 +1060,11 @@ static long effective_load(struct task_group *tg, int cpu, a = S*(rw + wl); b = S*rw + s*wg; - wl = s*(a-b)/D(b); + wl = s*(a-b); + + if (likely(b)) + wl /= b; + /* * Assume the group is already running and will * thus already be accounted for in the weight. @@ -1126,7 +1073,6 @@ static long effective_load(struct task_group *tg, int cpu, * alter the group weight. */ wg = 0; -#undef D } return wl; @@ -1143,7 +1089,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, #endif static int -wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, +wake_affine(struct sched_domain *this_sd, struct rq *this_rq, struct task_struct *p, int prev_cpu, int this_cpu, int sync, int idx, unsigned long load, unsigned long this_load, unsigned int imbalance) @@ -1158,6 +1104,11 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) return 0; + if (!sync && sched_feat(SYNC_WAKEUPS) && + curr->se.avg_overlap < sysctl_sched_migration_cost && + p->se.avg_overlap < sysctl_sched_migration_cost) + sync = 1; + /* * If sync wakeup then subtract the (maximum possible) * effect of the currently running task from the load @@ -1182,17 +1133,14 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, * a reasonable amount of time then attract this newly * woken task: */ - if (sync && balanced) { - if (curr->se.avg_overlap < sysctl_sched_migration_cost && - p->se.avg_overlap < sysctl_sched_migration_cost) - return 1; - } + if (sync && balanced) + return 1; schedstat_inc(p, se.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); - if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || - balanced) { + if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <= + tl_per_task)) { /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and @@ -1211,16 +1159,17 @@ static int select_task_rq_fair(struct task_struct *p, int sync) struct sched_domain *sd, *this_sd = NULL; int prev_cpu, this_cpu, new_cpu; unsigned long load, this_load; - struct rq *rq, *this_rq; + struct rq *this_rq; unsigned int imbalance; int idx; prev_cpu = task_cpu(p); - rq = task_rq(p); this_cpu = smp_processor_id(); this_rq = cpu_rq(this_cpu); new_cpu = prev_cpu; + if (prev_cpu == this_cpu) + goto out; /* * 'this_sd' is the first domain that both * this_cpu and prev_cpu are present in: @@ -1248,13 +1197,10 @@ static int select_task_rq_fair(struct task_struct *p, int sync) load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); - if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, + if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, load, this_load, imbalance)) return this_cpu; - if (prev_cpu == this_cpu) - goto out; - /* * Start passive balancing when half the imbalance_pct * limit is reached. @@ -1281,62 +1227,20 @@ static unsigned long wakeup_gran(struct sched_entity *se) * + nice tasks. */ if (sched_feat(ASYM_GRAN)) - gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); - else - gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se); + gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load); return gran; } /* - * Should 'se' preempt 'curr'. - * - * |s1 - * |s2 - * |s3 - * g - * |<--->|c - * - * w(c, s1) = -1 - * w(c, s2) = 0 - * w(c, s3) = 1 - * - */ -static int -wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) -{ - s64 gran, vdiff = curr->vruntime - se->vruntime; - - if (vdiff < 0) - return -1; - - gran = wakeup_gran(curr); - if (vdiff > gran) - return 1; - - return 0; -} - -/* return depth at which a sched entity is present in the hierarchy */ -static inline int depth_se(struct sched_entity *se) -{ - int depth = 0; - - for_each_sched_entity(se) - depth++; - - return depth; -} - -/* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) +static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) { struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se, *pse = &p->se; - int se_depth, pse_depth; + s64 delta_exec; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@ -1351,6 +1255,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) cfs_rq_of(pse)->next = pse; /* + * We can come here with TIF_NEED_RESCHED already set from new task + * wake up path. + */ + if (test_tsk_need_resched(curr)) + return; + + /* * Batch tasks do not preempt (their preemption is driven by * the tick): */ @@ -1360,33 +1271,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) if (!sched_feat(WAKEUP_PREEMPT)) return; - /* - * preemption test can be made between sibling entities who are in the - * same cfs_rq i.e who have a common parent. Walk up the hierarchy of - * both tasks until we find their ancestors who are siblings of common - * parent. - */ - - /* First walk up until both entities are at same depth */ - se_depth = depth_se(se); - pse_depth = depth_se(pse); - - while (se_depth > pse_depth) { - se_depth--; - se = parent_entity(se); - } - - while (pse_depth > se_depth) { - pse_depth--; - pse = parent_entity(pse); - } - - while (!is_same_group(se, pse)) { - se = parent_entity(se); - pse = parent_entity(pse); + if (sched_feat(WAKEUP_OVERLAP) && (sync || + (se->avg_overlap < sysctl_sched_migration_cost && + pse->avg_overlap < sysctl_sched_migration_cost))) { + resched_task(curr); + return; } - if (wakeup_preempt_entity(se, pse) == 1) + delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime; + if (delta_exec > wakeup_gran(pse)) resched_task(curr); } @@ -1445,19 +1338,9 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) if (next == &cfs_rq->tasks) return NULL; - /* Skip over entities that are not tasks */ - do { - se = list_entry(next, struct sched_entity, group_node); - next = next->next; - } while (next != &cfs_rq->tasks && !entity_is_task(se)); - - if (next == &cfs_rq->tasks) - return NULL; - - cfs_rq->balance_iterator = next; - - if (entity_is_task(se)) - p = task_of(se); + se = list_entry(next, struct sched_entity, group_node); + p = task_of(se); + cfs_rq->balance_iterator = next->next; return p; } @@ -1507,7 +1390,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, rcu_read_lock(); update_h_load(busiest_cpu); - list_for_each_entry(tg, &task_groups, list) { + list_for_each_entry_rcu(tg, &task_groups, list) { struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; unsigned long busiest_h_load = busiest_cfs_rq->h_load; unsigned long busiest_weight = busiest_cfs_rq->load.weight; @@ -1620,10 +1503,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * 'current' within the tree based on its new key value. */ swap(curr->vruntime, se->vruntime); + resched_task(rq->curr); } enqueue_task_fair(rq, p, 0); - resched_task(rq->curr); } /* @@ -1642,7 +1525,7 @@ static void prio_changed_fair(struct rq *rq, struct task_struct *p, if (p->prio > oldprio) resched_task(rq->curr); } else - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); } /* @@ -1659,7 +1542,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p, if (running) resched_task(rq->curr); else - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); } /* Account for a task changing its policy or group. diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 9353ca78154e..7c9e8f4a049f 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -11,3 +11,4 @@ SCHED_FEAT(ASYM_GRAN, 1) SCHED_FEAT(LB_BIAS, 1) SCHED_FEAT(LB_WAKEUP_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) +SCHED_FEAT(WAKEUP_OVERLAP, 0) diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 3a4f92dbbe66..dec4ccabe2f5 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync) /* * Idle tasks are unconditionally rescheduled: */ -static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p) +static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync) { resched_task(rq->idle); } @@ -76,7 +76,7 @@ static void switched_to_idle(struct rq *rq, struct task_struct *p, if (running) resched_task(rq->curr); else - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); } static void prio_changed_idle(struct rq *rq, struct task_struct *p, @@ -93,7 +93,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p, if (p->prio > oldprio) resched_task(rq->curr); } else - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); } /* diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 552310798dad..b446dc87494f 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -102,12 +102,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { + struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; struct sched_rt_entity *rt_se = rt_rq->rt_se; - if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) { - struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; - - enqueue_rt_entity(rt_se); + if (rt_rq->rt_nr_running) { + if (rt_se && !on_rt_rq(rt_se)) + enqueue_rt_entity(rt_se); if (rt_rq->highest_prio < curr->prio) resched_task(curr); } @@ -231,6 +231,9 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_SMP +/* + * We ran out of runtime, see if we can borrow some from our neighbours. + */ static int do_balance_runtime(struct rt_rq *rt_rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); @@ -250,9 +253,18 @@ static int do_balance_runtime(struct rt_rq *rt_rq) continue; spin_lock(&iter->rt_runtime_lock); + /* + * Either all rqs have inf runtime and there's nothing to steal + * or __disable_runtime() below sets a specific rq to inf to + * indicate its been disabled and disalow stealing. + */ if (iter->rt_runtime == RUNTIME_INF) goto next; + /* + * From runqueues with spare time, take 1/n part of their + * spare time, but no more than our period. + */ diff = iter->rt_runtime - iter->rt_time; if (diff > 0) { diff = div_u64((u64)diff, weight); @@ -274,6 +286,9 @@ next: return more; } +/* + * Ensure this RQ takes back all the runtime it lend to its neighbours. + */ static void __disable_runtime(struct rq *rq) { struct root_domain *rd = rq->rd; @@ -289,17 +304,33 @@ static void __disable_runtime(struct rq *rq) spin_lock(&rt_b->rt_runtime_lock); spin_lock(&rt_rq->rt_runtime_lock); + /* + * Either we're all inf and nobody needs to borrow, or we're + * already disabled and thus have nothing to do, or we have + * exactly the right amount of runtime to take out. + */ if (rt_rq->rt_runtime == RUNTIME_INF || rt_rq->rt_runtime == rt_b->rt_runtime) goto balanced; spin_unlock(&rt_rq->rt_runtime_lock); + /* + * Calculate the difference between what we started out with + * and what we current have, that's the amount of runtime + * we lend and now have to reclaim. + */ want = rt_b->rt_runtime - rt_rq->rt_runtime; + /* + * Greedy reclaim, take back as much as we can. + */ for_each_cpu_mask(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); s64 diff; + /* + * Can't reclaim from ourselves or disabled runqueues. + */ if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) continue; @@ -319,8 +350,16 @@ static void __disable_runtime(struct rq *rq) } spin_lock(&rt_rq->rt_runtime_lock); + /* + * We cannot be left wanting - that would mean some runtime + * leaked out of the system. + */ BUG_ON(want); balanced: + /* + * Disable all the borrow logic by pretending we have inf + * runtime - in which case borrowing doesn't make sense. + */ rt_rq->rt_runtime = RUNTIME_INF; spin_unlock(&rt_rq->rt_runtime_lock); spin_unlock(&rt_b->rt_runtime_lock); @@ -343,6 +382,9 @@ static void __enable_runtime(struct rq *rq) if (unlikely(!scheduler_running)) return; + /* + * Reset each runqueue's bandwidth settings + */ for_each_leaf_rt_rq(rt_rq, rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); @@ -350,6 +392,7 @@ static void __enable_runtime(struct rq *rq) spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_runtime = rt_b->rt_runtime; rt_rq->rt_time = 0; + rt_rq->rt_throttled = 0; spin_unlock(&rt_rq->rt_runtime_lock); spin_unlock(&rt_b->rt_runtime_lock); } @@ -388,7 +431,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) int i, idle = 1; cpumask_t span; - if (rt_b->rt_runtime == RUNTIME_INF) + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return 1; span = sched_rt_period_mask(); @@ -483,9 +526,14 @@ static void update_curr_rt(struct rq *rq) schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; + account_group_exec_runtime(curr, delta_exec); + curr->se.exec_start = rq->clock; cpuacct_charge(curr, delta_exec); + if (!rt_bandwidth_enabled()) + return; + for_each_sched_rt_entity(rt_se) { rt_rq = rt_rq_of_se(rt_se); @@ -783,7 +831,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) +static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) { if (p->prio < rq->curr->prio) { resched_task(rq->curr); @@ -1412,7 +1460,7 @@ static void watchdog(struct rq *rq, struct task_struct *p) p->rt.timeout++; next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); if (p->rt.timeout > next) - p->it_sched_expires = p->se.sum_exec_runtime; + p->cputime_expires.sched_exp = p->se.sum_exec_runtime; } } diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 8385d43987e2..b8c156979cf2 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -270,3 +270,89 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) #define sched_info_switch(t, next) do { } while (0) #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ +/* + * The following are functions that support scheduler-internal time accounting. + * These functions are generally called at the timer tick. None of this depends + * on CONFIG_SCHEDSTATS. + */ + +/** + * account_group_user_time - Maintain utime for a thread group. + * + * @tsk: Pointer to task structure. + * @cputime: Time value by which to increment the utime field of the + * thread_group_cputime structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the utime field there. + */ +static inline void account_group_user_time(struct task_struct *tsk, + cputime_t cputime) +{ + struct signal_struct *sig; + + sig = tsk->signal; + if (unlikely(!sig)) + return; + if (sig->cputime.totals) { + struct task_cputime *times; + + times = per_cpu_ptr(sig->cputime.totals, get_cpu()); + times->utime = cputime_add(times->utime, cputime); + put_cpu_no_resched(); + } +} + +/** + * account_group_system_time - Maintain stime for a thread group. + * + * @tsk: Pointer to task structure. + * @cputime: Time value by which to increment the stime field of the + * thread_group_cputime structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the stime field there. + */ +static inline void account_group_system_time(struct task_struct *tsk, + cputime_t cputime) +{ + struct signal_struct *sig; + + sig = tsk->signal; + if (unlikely(!sig)) + return; + if (sig->cputime.totals) { + struct task_cputime *times; + + times = per_cpu_ptr(sig->cputime.totals, get_cpu()); + times->stime = cputime_add(times->stime, cputime); + put_cpu_no_resched(); + } +} + +/** + * account_group_exec_runtime - Maintain exec runtime for a thread group. + * + * @tsk: Pointer to task structure. + * @ns: Time value by which to increment the sum_exec_runtime field + * of the thread_group_cputime structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the sum_exec_runtime field there. + */ +static inline void account_group_exec_runtime(struct task_struct *tsk, + unsigned long long ns) +{ + struct signal_struct *sig; + + sig = tsk->signal; + if (unlikely(!sig)) + return; + if (sig->cputime.totals) { + struct task_cputime *times; + + times = per_cpu_ptr(sig->cputime.totals, get_cpu()); + times->sum_exec_runtime += ns; + put_cpu_no_resched(); + } +} diff --git a/kernel/signal.c b/kernel/signal.c index e661b01d340f..6eea5826d618 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1338,6 +1338,7 @@ int do_notify_parent(struct task_struct *tsk, int sig) struct siginfo info; unsigned long flags; struct sighand_struct *psig; + struct task_cputime cputime; int ret = sig; BUG_ON(sig == -1); @@ -1368,10 +1369,9 @@ int do_notify_parent(struct task_struct *tsk, int sig) info.si_uid = tsk->uid; - info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, - tsk->signal->utime)); - info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime, - tsk->signal->stime)); + thread_group_cputime(tsk, &cputime); + info.si_utime = cputime_to_jiffies(cputime.utime); + info.si_stime = cputime_to_jiffies(cputime.stime); info.si_status = tsk->exit_code & 0x7f; if (tsk->exit_code & 0x80) diff --git a/kernel/softirq.c b/kernel/softirq.c index c506f266a6b9..7110daeb9a90 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -6,6 +6,8 @@ * Distribute under GPLv2. * * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) + * + * Remote softirq infrastructure is by Jens Axboe. */ #include <linux/module.h> @@ -46,7 +48,7 @@ irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned; EXPORT_SYMBOL(irq_stat); #endif -static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp; +static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); @@ -205,7 +207,18 @@ restart: do { if (pending & 1) { + int prev_count = preempt_count(); + h->action(h); + + if (unlikely(prev_count != preempt_count())) { + printk(KERN_ERR "huh, entered softirq %td %p" + "with preempt_count %08x," + " exited with %08x?\n", h - softirq_vec, + h->action, prev_count, preempt_count()); + preempt_count() = prev_count; + } + rcu_bh_qsctr_inc(cpu); } h++; @@ -254,16 +267,12 @@ asmlinkage void do_softirq(void) */ void irq_enter(void) { -#ifdef CONFIG_NO_HZ int cpu = smp_processor_id(); + if (idle_cpu(cpu) && !in_interrupt()) - tick_nohz_stop_idle(cpu); -#endif + tick_check_idle(cpu); + __irq_enter(); -#ifdef CONFIG_NO_HZ - if (idle_cpu(cpu)) - tick_nohz_update_jiffies(); -#endif } #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED @@ -463,17 +472,144 @@ void tasklet_kill(struct tasklet_struct *t) EXPORT_SYMBOL(tasklet_kill); +DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); +EXPORT_PER_CPU_SYMBOL(softirq_work_list); + +static void __local_trigger(struct call_single_data *cp, int softirq) +{ + struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]); + + list_add_tail(&cp->list, head); + + /* Trigger the softirq only if the list was previously empty. */ + if (head->next == &cp->list) + raise_softirq_irqoff(softirq); +} + +#ifdef CONFIG_USE_GENERIC_SMP_HELPERS +static void remote_softirq_receive(void *data) +{ + struct call_single_data *cp = data; + unsigned long flags; + int softirq; + + softirq = cp->priv; + + local_irq_save(flags); + __local_trigger(cp, softirq); + local_irq_restore(flags); +} + +static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +{ + if (cpu_online(cpu)) { + cp->func = remote_softirq_receive; + cp->info = cp; + cp->flags = 0; + cp->priv = softirq; + + __smp_call_function_single(cpu, cp); + return 0; + } + return 1; +} +#else /* CONFIG_USE_GENERIC_SMP_HELPERS */ +static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +{ + return 1; +} +#endif + +/** + * __send_remote_softirq - try to schedule softirq work on a remote cpu + * @cp: private SMP call function data area + * @cpu: the remote cpu + * @this_cpu: the currently executing cpu + * @softirq: the softirq for the work + * + * Attempt to schedule softirq work on a remote cpu. If this cannot be + * done, the work is instead queued up on the local cpu. + * + * Interrupts must be disabled. + */ +void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq) +{ + if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq)) + __local_trigger(cp, softirq); +} +EXPORT_SYMBOL(__send_remote_softirq); + +/** + * send_remote_softirq - try to schedule softirq work on a remote cpu + * @cp: private SMP call function data area + * @cpu: the remote cpu + * @softirq: the softirq for the work + * + * Like __send_remote_softirq except that disabling interrupts and + * computing the current cpu is done for the caller. + */ +void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +{ + unsigned long flags; + int this_cpu; + + local_irq_save(flags); + this_cpu = smp_processor_id(); + __send_remote_softirq(cp, cpu, this_cpu, softirq); + local_irq_restore(flags); +} +EXPORT_SYMBOL(send_remote_softirq); + +static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + /* + * If a CPU goes away, splice its entries to the current CPU + * and trigger a run of the softirq + */ + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { + int cpu = (unsigned long) hcpu; + int i; + + local_irq_disable(); + for (i = 0; i < NR_SOFTIRQS; i++) { + struct list_head *head = &per_cpu(softirq_work_list[i], cpu); + struct list_head *local_head; + + if (list_empty(head)) + continue; + + local_head = &__get_cpu_var(softirq_work_list[i]); + list_splice_init(head, local_head); + raise_softirq_irqoff(i); + } + local_irq_enable(); + } + + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = { + .notifier_call = remote_softirq_cpu_notify, +}; + void __init softirq_init(void) { int cpu; for_each_possible_cpu(cpu) { + int i; + per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; + for (i = 0; i < NR_SOFTIRQS; i++) + INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu)); } + register_hotcpu_notifier(&remote_softirq_cpu_notifier); + open_softirq(TASKLET_SOFTIRQ, tasklet_action); open_softirq(HI_SOFTIRQ, tasklet_hi_action); } diff --git a/kernel/softlockup.c b/kernel/softlockup.c index cb838ee93a82..3953e4aed733 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -226,7 +226,7 @@ static void check_hung_uninterruptible_tasks(int this_cpu) * If the system crashed already then all bets are off, * do not report extra hung tasks: */ - if ((tainted & TAINT_DIE) || did_panic) + if (test_taint(TAINT_DIE) || did_panic) return; read_lock(&tasklist_lock); diff --git a/kernel/sys.c b/kernel/sys.c index 038a7bc0901d..53879cdae483 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -853,38 +853,28 @@ asmlinkage long sys_setfsgid(gid_t gid) return old_fsgid; } +void do_sys_times(struct tms *tms) +{ + struct task_cputime cputime; + cputime_t cutime, cstime; + + spin_lock_irq(¤t->sighand->siglock); + thread_group_cputime(current, &cputime); + cutime = current->signal->cutime; + cstime = current->signal->cstime; + spin_unlock_irq(¤t->sighand->siglock); + tms->tms_utime = cputime_to_clock_t(cputime.utime); + tms->tms_stime = cputime_to_clock_t(cputime.stime); + tms->tms_cutime = cputime_to_clock_t(cutime); + tms->tms_cstime = cputime_to_clock_t(cstime); +} + asmlinkage long sys_times(struct tms __user * tbuf) { - /* - * In the SMP world we might just be unlucky and have one of - * the times increment as we use it. Since the value is an - * atomically safe type this is just fine. Conceptually its - * as if the syscall took an instant longer to occur. - */ if (tbuf) { struct tms tmp; - struct task_struct *tsk = current; - struct task_struct *t; - cputime_t utime, stime, cutime, cstime; - - spin_lock_irq(&tsk->sighand->siglock); - utime = tsk->signal->utime; - stime = tsk->signal->stime; - t = tsk; - do { - utime = cputime_add(utime, t->utime); - stime = cputime_add(stime, t->stime); - t = next_thread(t); - } while (t != tsk); - - cutime = tsk->signal->cutime; - cstime = tsk->signal->cstime; - spin_unlock_irq(&tsk->sighand->siglock); - - tmp.tms_utime = cputime_to_clock_t(utime); - tmp.tms_stime = cputime_to_clock_t(stime); - tmp.tms_cutime = cputime_to_clock_t(cutime); - tmp.tms_cstime = cputime_to_clock_t(cstime); + + do_sys_times(&tmp); if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) return -EFAULT; } @@ -1060,9 +1050,7 @@ asmlinkage long sys_setsid(void) group_leader->signal->leader = 1; __set_special_pids(sid); - spin_lock(&group_leader->sighand->siglock); - group_leader->signal->tty = NULL; - spin_unlock(&group_leader->sighand->siglock); + proc_clear_tty(group_leader); err = session; out: @@ -1351,8 +1339,10 @@ asmlinkage long sys_sethostname(char __user *name, int len) down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - memcpy(utsname()->nodename, tmp, len); - utsname()->nodename[len] = 0; + struct new_utsname *u = utsname(); + + memcpy(u->nodename, tmp, len); + memset(u->nodename + len, 0, sizeof(u->nodename) - len); errno = 0; } up_write(&uts_sem); @@ -1364,15 +1354,17 @@ asmlinkage long sys_sethostname(char __user *name, int len) asmlinkage long sys_gethostname(char __user *name, int len) { int i, errno; + struct new_utsname *u; if (len < 0) return -EINVAL; down_read(&uts_sem); - i = 1 + strlen(utsname()->nodename); + u = utsname(); + i = 1 + strlen(u->nodename); if (i > len) i = len; errno = 0; - if (copy_to_user(name, utsname()->nodename, i)) + if (copy_to_user(name, u->nodename, i)) errno = -EFAULT; up_read(&uts_sem); return errno; @@ -1397,8 +1389,10 @@ asmlinkage long sys_setdomainname(char __user *name, int len) down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - memcpy(utsname()->domainname, tmp, len); - utsname()->domainname[len] = 0; + struct new_utsname *u = utsname(); + + memcpy(u->domainname, tmp, len); + memset(u->domainname + len, 0, sizeof(u->domainname) - len); errno = 0; } up_write(&uts_sem); @@ -1445,21 +1439,28 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) { struct rlimit new_rlim, *old_rlim; - unsigned long it_prof_secs; int retval; if (resource >= RLIM_NLIMITS) return -EINVAL; if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) return -EFAULT; - if (new_rlim.rlim_cur > new_rlim.rlim_max) - return -EINVAL; old_rlim = current->signal->rlim + resource; if ((new_rlim.rlim_max > old_rlim->rlim_max) && !capable(CAP_SYS_RESOURCE)) return -EPERM; - if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) - return -EPERM; + + if (resource == RLIMIT_NOFILE) { + if (new_rlim.rlim_max == RLIM_INFINITY) + new_rlim.rlim_max = sysctl_nr_open; + if (new_rlim.rlim_cur == RLIM_INFINITY) + new_rlim.rlim_cur = sysctl_nr_open; + if (new_rlim.rlim_max > sysctl_nr_open) + return -EPERM; + } + + if (new_rlim.rlim_cur > new_rlim.rlim_max) + return -EINVAL; retval = security_task_setrlimit(resource, &new_rlim); if (retval) @@ -1491,18 +1492,7 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) if (new_rlim.rlim_cur == RLIM_INFINITY) goto out; - it_prof_secs = cputime_to_secs(current->signal->it_prof_expires); - if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) { - unsigned long rlim_cur = new_rlim.rlim_cur; - cputime_t cputime; - - cputime = secs_to_cputime(rlim_cur); - read_lock(&tasklist_lock); - spin_lock_irq(¤t->sighand->siglock); - set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); - spin_unlock_irq(¤t->sighand->siglock); - read_unlock(&tasklist_lock); - } + update_rlimit_cpu(new_rlim.rlim_cur); out: return 0; } @@ -1540,11 +1530,8 @@ out: * */ -static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r, - cputime_t *utimep, cputime_t *stimep) +static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r) { - *utimep = cputime_add(*utimep, t->utime); - *stimep = cputime_add(*stimep, t->stime); r->ru_nvcsw += t->nvcsw; r->ru_nivcsw += t->nivcsw; r->ru_minflt += t->min_flt; @@ -1558,12 +1545,13 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) struct task_struct *t; unsigned long flags; cputime_t utime, stime; + struct task_cputime cputime; memset((char *) r, 0, sizeof *r); utime = stime = cputime_zero; if (who == RUSAGE_THREAD) { - accumulate_thread_rusage(p, r, &utime, &stime); + accumulate_thread_rusage(p, r); goto out; } @@ -1586,8 +1574,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) break; case RUSAGE_SELF: - utime = cputime_add(utime, p->signal->utime); - stime = cputime_add(stime, p->signal->stime); + thread_group_cputime(p, &cputime); + utime = cputime_add(utime, cputime.utime); + stime = cputime_add(stime, cputime.stime); r->ru_nvcsw += p->signal->nvcsw; r->ru_nivcsw += p->signal->nivcsw; r->ru_minflt += p->signal->min_flt; @@ -1596,7 +1585,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_oublock += p->signal->oublock; t = p; do { - accumulate_thread_rusage(t, r, &utime, &stime); + accumulate_thread_rusage(t, r); t = next_thread(t); } while (t != p); break; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 08d6e1bb99ac..a77b27b11b04 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -125,6 +125,12 @@ cond_syscall(sys_vm86old); cond_syscall(sys_vm86); cond_syscall(compat_sys_ipc); cond_syscall(compat_sys_sysctl); +cond_syscall(sys_flock); +cond_syscall(sys_io_setup); +cond_syscall(sys_io_destroy); +cond_syscall(sys_io_submit); +cond_syscall(sys_io_cancel); +cond_syscall(sys_io_getevents); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 50ec0886fa3d..617d41e4d6a0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -80,7 +80,6 @@ extern int pid_max_min, pid_max_max; extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; -extern int maps_protect; extern int latencytop_enabled; extern int sysctl_nr_open_min, sysctl_nr_open_max; #ifdef CONFIG_RCU_TORTURE_TEST @@ -97,7 +96,7 @@ static int sixty = 60; static int neg_one = -1; #endif -#ifdef CONFIG_MMU +#if defined(CONFIG_MMU) && defined(CONFIG_FILE_LOCKING) static int two = 2; #endif @@ -118,10 +117,8 @@ extern char modprobe_path[]; extern int sg_big_buff; #endif -#ifdef __sparc__ -extern char reboot_command []; -extern int stop_a_enabled; -extern int scons_pwroff; +#ifdef CONFIG_SPARC +#include <asm/system.h> #endif #ifdef __hppa__ @@ -152,7 +149,7 @@ extern int max_lock_depth; #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); -static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp, +static int proc_taint(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); #endif @@ -382,10 +379,9 @@ static struct ctl_table kern_table[] = { #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", - .data = &tainted, - .maxlen = sizeof(int), + .maxlen = sizeof(long), .mode = 0644, - .proc_handler = &proc_dointvec_taint, + .proc_handler = &proc_taint, }, #endif #ifdef CONFIG_LATENCYTOP @@ -415,7 +411,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, -#ifdef __sparc__ +#ifdef CONFIG_SPARC { .ctl_name = KERN_SPARC_REBOOT, .procname = "reboot-cmd", @@ -810,16 +806,6 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif -#ifdef CONFIG_PROC_FS - { - .ctl_name = CTL_UNNUMBERED, - .procname = "maps_protect", - .data = &maps_protect, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif { .ctl_name = CTL_UNNUMBERED, .procname = "poweroff_cmd", @@ -1261,6 +1247,7 @@ static struct ctl_table fs_table[] = { .extra1 = &minolduid, .extra2 = &maxolduid, }, +#ifdef CONFIG_FILE_LOCKING { .ctl_name = FS_LEASES, .procname = "leases-enable", @@ -1269,6 +1256,7 @@ static struct ctl_table fs_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#endif #ifdef CONFIG_DNOTIFY { .ctl_name = FS_DIR_NOTIFY, @@ -1280,6 +1268,7 @@ static struct ctl_table fs_table[] = { }, #endif #ifdef CONFIG_MMU +#ifdef CONFIG_FILE_LOCKING { .ctl_name = FS_LEASE_TIME, .procname = "lease-break-time", @@ -1291,6 +1280,8 @@ static struct ctl_table fs_table[] = { .extra1 = &zero, .extra2 = &two, }, +#endif +#ifdef CONFIG_AIO { .procname = "aio-nr", .data = &aio_nr, @@ -1305,6 +1296,7 @@ static struct ctl_table fs_table[] = { .mode = 0644, .proc_handler = &proc_doulongvec_minmax, }, +#endif /* CONFIG_AIO */ #ifdef CONFIG_INOTIFY_USER { .ctl_name = FS_INOTIFY, @@ -1510,7 +1502,6 @@ void register_sysctl_root(struct ctl_table_root *root) /* Perform the actual read/write of a sysctl table entry. */ static int do_sysctl_strategy(struct ctl_table_root *root, struct ctl_table *table, - int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -1524,8 +1515,7 @@ static int do_sysctl_strategy(struct ctl_table_root *root, return -EPERM; if (table->strategy) { - rc = table->strategy(table, name, nlen, oldval, oldlenp, - newval, newlen); + rc = table->strategy(table, oldval, oldlenp, newval, newlen); if (rc < 0) return rc; if (rc > 0) @@ -1535,8 +1525,7 @@ static int do_sysctl_strategy(struct ctl_table_root *root, /* If there is no strategy routine, or if the strategy returns * zero, proceed with automatic r/w */ if (table->data && table->maxlen) { - rc = sysctl_data(table, name, nlen, oldval, oldlenp, - newval, newlen); + rc = sysctl_data(table, oldval, oldlenp, newval, newlen); if (rc < 0) return rc; } @@ -1568,7 +1557,7 @@ repeat: table = table->child; goto repeat; } - error = do_sysctl_strategy(root, table, name, nlen, + error = do_sysctl_strategy(root, table, oldval, oldlenp, newval, newlen); return error; @@ -2237,49 +2226,39 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp, NULL,NULL); } -#define OP_SET 0 -#define OP_AND 1 -#define OP_OR 2 - -static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, - int *valp, - int write, void *data) -{ - int op = *(int *)data; - if (write) { - int val = *negp ? -*lvalp : *lvalp; - switch(op) { - case OP_SET: *valp = val; break; - case OP_AND: *valp &= val; break; - case OP_OR: *valp |= val; break; - } - } else { - int val = *valp; - if (val < 0) { - *negp = -1; - *lvalp = (unsigned long)-val; - } else { - *negp = 0; - *lvalp = (unsigned long)val; - } - } - return 0; -} - /* - * Taint values can only be increased + * Taint values can only be increased + * This means we can safely use a temporary. */ -static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp, +static int proc_taint(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - int op; + struct ctl_table t; + unsigned long tmptaint = get_taint(); + int err; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; - op = OP_OR; - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, - do_proc_dointvec_bset_conv,&op); + t = *table; + t.data = &tmptaint; + err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos); + if (err < 0) + return err; + + if (write) { + /* + * Poor man's atomic or. Not worth adding a primitive + * to everyone's atomic.h for this + */ + int i; + for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { + if ((tmptaint >> i) & 1) + add_taint(i); + } + } + + return err; } struct do_proc_dointvec_minmax_conv_param { @@ -2727,7 +2706,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, */ /* The generic sysctl data routine (used if no strategy routine supplied) */ -int sysctl_data(struct ctl_table *table, int __user *name, int nlen, +int sysctl_data(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -2761,7 +2740,7 @@ int sysctl_data(struct ctl_table *table, int __user *name, int nlen, } /* The generic string strategy routine: */ -int sysctl_string(struct ctl_table *table, int __user *name, int nlen, +int sysctl_string(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -2807,7 +2786,7 @@ int sysctl_string(struct ctl_table *table, int __user *name, int nlen, * are between the minimum and maximum values given in the arrays * table->extra1 and table->extra2, respectively. */ -int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen, +int sysctl_intvec(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -2843,7 +2822,7 @@ int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen, } /* Strategy function to convert jiffies to seconds */ -int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen, +int sysctl_jiffies(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -2877,7 +2856,7 @@ int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen, } /* Strategy function to convert jiffies to seconds */ -int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen, +int sysctl_ms_jiffies(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -2932,35 +2911,35 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args) return error; } -int sysctl_data(struct ctl_table *table, int __user *name, int nlen, +int sysctl_data(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { return -ENOSYS; } -int sysctl_string(struct ctl_table *table, int __user *name, int nlen, +int sysctl_string(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { return -ENOSYS; } -int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen, +int sysctl_intvec(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { return -ENOSYS; } -int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen, +int sysctl_jiffies(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { return -ENOSYS; } -int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen, +int sysctl_ms_jiffies(struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 8d53106a0a92..95ed42951e0a 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -3,7 +3,6 @@ # config TICK_ONESHOT bool - default n config NO_HZ bool "Tickless System (Dynamic Ticks)" diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 3d1e3e1a1971..f8d968063cea 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -72,6 +72,16 @@ void clockevents_set_mode(struct clock_event_device *dev, } /** + * clockevents_shutdown - shutdown the device and clear next_event + * @dev: device to shutdown + */ +void clockevents_shutdown(struct clock_event_device *dev) +{ + clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); + dev->next_event.tv64 = KTIME_MAX; +} + +/** * clockevents_program_event - Reprogram the clock event device. * @expires: absolute expiry time (monotonic clock) * @@ -177,7 +187,7 @@ void clockevents_register_device(struct clock_event_device *dev) /* * Noop handler when we shut down an event device */ -static void clockevents_handle_noop(struct clock_event_device *dev) +void clockevents_handle_noop(struct clock_event_device *dev) { } @@ -199,7 +209,6 @@ void clockevents_exchange_device(struct clock_event_device *old, * released list and do a notify add later. */ if (old) { - old->event_handler = clockevents_handle_noop; clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); list_del(&old->list); list_add(&old->list, &clockevents_released); @@ -207,7 +216,7 @@ void clockevents_exchange_device(struct clock_event_device *old, if (new) { BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); - clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(new); } local_irq_restore(flags); } diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 093d4acf993b..9ed2eec97526 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -325,6 +325,9 @@ int clocksource_register(struct clocksource *c) unsigned long flags; int ret; + /* save mult_orig on registration */ + c->mult_orig = c->mult; + spin_lock_irqsave(&clocksource_lock, flags); ret = clocksource_enqueue(c); if (!ret) diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 4c256fdb8875..1ca99557e929 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -61,6 +61,7 @@ struct clocksource clocksource_jiffies = { .read = jiffies_read, .mask = 0xffffffff, /*32bits*/ .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ + .mult_orig = NSEC_PER_JIFFY << JIFFIES_SHIFT, .shift = JIFFIES_SHIFT, }; diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index ddb0465a6baa..1a20715bfd6e 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -245,7 +245,7 @@ static void sync_cmos_clock(struct work_struct *work) if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) fail = update_persistent_clock(now); - next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec; + next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2); if (next.tv_nsec <= 0) next.tv_nsec += NSEC_PER_SEC; diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 31463d370b94..f98a1b7b16e9 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -175,6 +175,8 @@ static void tick_do_periodic_broadcast(void) */ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) { + ktime_t next; + tick_do_periodic_broadcast(); /* @@ -185,10 +187,13 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) /* * Setup the next period for devices, which do not have - * periodic mode: + * periodic mode. We read dev->next_event first and add to it + * when the event alrady expired. clockevents_program_event() + * sets dev->next_event only when the event is really + * programmed to the device. */ - for (;;) { - ktime_t next = ktime_add(dev->next_event, tick_period); + for (next = dev->next_event; ;) { + next = ktime_add(next, tick_period); if (!clockevents_program_event(dev, next, ktime_get())) return; @@ -205,7 +210,7 @@ static void tick_do_broadcast_on_off(void *why) struct clock_event_device *bc, *dev; struct tick_device *td; unsigned long flags, *reason = why; - int cpu; + int cpu, bc_stopped; spin_lock_irqsave(&tick_broadcast_lock, flags); @@ -223,14 +228,16 @@ static void tick_do_broadcast_on_off(void *why) if (!tick_device_is_functional(dev)) goto out; + bc_stopped = cpus_empty(tick_broadcast_mask); + switch (*reason) { case CLOCK_EVT_NOTIFY_BROADCAST_ON: case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: if (!cpu_isset(cpu, tick_broadcast_mask)) { cpu_set(cpu, tick_broadcast_mask); - if (td->mode == TICKDEV_MODE_PERIODIC) - clockevents_set_mode(dev, - CLOCK_EVT_MODE_SHUTDOWN); + if (tick_broadcast_device.mode == + TICKDEV_MODE_PERIODIC) + clockevents_shutdown(dev); } if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) tick_broadcast_force = 1; @@ -239,15 +246,17 @@ static void tick_do_broadcast_on_off(void *why) if (!tick_broadcast_force && cpu_isset(cpu, tick_broadcast_mask)) { cpu_clear(cpu, tick_broadcast_mask); - if (td->mode == TICKDEV_MODE_PERIODIC) + if (tick_broadcast_device.mode == + TICKDEV_MODE_PERIODIC) tick_setup_periodic(dev, 0); } break; } - if (cpus_empty(tick_broadcast_mask)) - clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); - else { + if (cpus_empty(tick_broadcast_mask)) { + if (!bc_stopped) + clockevents_shutdown(bc); + } else if (bc_stopped) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_broadcast_start_periodic(bc); else @@ -298,7 +307,7 @@ void tick_shutdown_broadcast(unsigned int *cpup) if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { if (bc && cpus_empty(tick_broadcast_mask)) - clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(bc); } spin_unlock_irqrestore(&tick_broadcast_lock, flags); @@ -313,7 +322,7 @@ void tick_suspend_broadcast(void) bc = tick_broadcast_device.evtdev; if (bc) - clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(bc); spin_unlock_irqrestore(&tick_broadcast_lock, flags); } @@ -364,16 +373,8 @@ cpumask_t *tick_get_broadcast_oneshot_mask(void) static int tick_broadcast_set_event(ktime_t expires, int force) { struct clock_event_device *bc = tick_broadcast_device.evtdev; - ktime_t now = ktime_get(); - int res; - - for(;;) { - res = clockevents_program_event(bc, expires, now); - if (!res || !force) - return res; - now = ktime_get(); - expires = ktime_add(now, ktime_set(0, bc->min_delta_ns)); - } + + return tick_dev_program_event(bc, expires, force); } int tick_resume_broadcast_oneshot(struct clock_event_device *bc) @@ -383,6 +384,19 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) } /* + * Called from irq_enter() when idle was interrupted to reenable the + * per cpu device. + */ +void tick_check_oneshot_broadcast(int cpu) +{ + if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { + struct tick_device *td = &per_cpu(tick_cpu_device, cpu); + + clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); + } +} + +/* * Handle oneshot mode broadcasting */ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) @@ -491,14 +505,52 @@ static void tick_broadcast_clear_oneshot(int cpu) cpu_clear(cpu, tick_broadcast_oneshot_mask); } +static void tick_broadcast_init_next_event(cpumask_t *mask, ktime_t expires) +{ + struct tick_device *td; + int cpu; + + for_each_cpu_mask_nr(cpu, *mask) { + td = &per_cpu(tick_cpu_device, cpu); + if (td->evtdev) + td->evtdev->next_event = expires; + } +} + /** * tick_broadcast_setup_oneshot - setup the broadcast device */ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { - bc->event_handler = tick_handle_oneshot_broadcast; - clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - bc->next_event.tv64 = KTIME_MAX; + /* Set it up only once ! */ + if (bc->event_handler != tick_handle_oneshot_broadcast) { + int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; + int cpu = smp_processor_id(); + cpumask_t mask; + + bc->event_handler = tick_handle_oneshot_broadcast; + clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); + + /* Take the do_timer update */ + tick_do_timer_cpu = cpu; + + /* + * We must be careful here. There might be other CPUs + * waiting for periodic broadcast. We need to set the + * oneshot_mask bits for those and program the + * broadcast device to fire. + */ + mask = tick_broadcast_mask; + cpu_clear(cpu, mask); + cpus_or(tick_broadcast_oneshot_mask, + tick_broadcast_oneshot_mask, mask); + + if (was_periodic && !cpus_empty(mask)) { + tick_broadcast_init_next_event(&mask, tick_next_period); + tick_broadcast_set_event(tick_next_period, 1); + } else + bc->next_event.tv64 = KTIME_MAX; + } } /* @@ -538,4 +590,12 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) spin_unlock_irqrestore(&tick_broadcast_lock, flags); } +/* + * Check, whether the broadcast device is in one shot mode + */ +int tick_broadcast_oneshot_active(void) +{ + return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; +} + #endif diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 80c4336f4188..df12434b43ca 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); */ ktime_t tick_next_period; ktime_t tick_period; -int tick_do_timer_cpu __read_mostly = -1; +int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; DEFINE_SPINLOCK(tick_device_lock); /* @@ -109,7 +109,8 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) if (!tick_device_is_functional(dev)) return; - if (dev->features & CLOCK_EVT_FEAT_PERIODIC) { + if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && + !tick_broadcast_oneshot_active()) { clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); } else { unsigned long seq; @@ -148,7 +149,7 @@ static void tick_setup_device(struct tick_device *td, * If no cpu took the do_timer update, assign it to * this cpu: */ - if (tick_do_timer_cpu == -1) { + if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { tick_do_timer_cpu = cpu; tick_next_period = ktime_get(); tick_period = ktime_set(0, NSEC_PER_SEC / HZ); @@ -161,6 +162,7 @@ static void tick_setup_device(struct tick_device *td, } else { handler = td->evtdev->event_handler; next_event = td->evtdev->next_event; + td->evtdev->event_handler = clockevents_handle_noop; } td->evtdev = newdev; @@ -248,7 +250,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) * not give it back to the clockevents layer ! */ if (tick_is_broadcast_device(curdev)) { - clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(curdev); curdev = NULL; } clockevents_exchange_device(curdev, newdev); @@ -299,7 +301,8 @@ static void tick_shutdown(unsigned int *cpup) if (*cpup == tick_do_timer_cpu) { int cpu = first_cpu(cpu_online_map); - tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : -1; + tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : + TICK_DO_TIMER_NONE; } spin_unlock_irqrestore(&tick_device_lock, flags); } @@ -310,7 +313,7 @@ static void tick_suspend(void) unsigned long flags; spin_lock_irqsave(&tick_device_lock, flags); - clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(td->evtdev); spin_unlock_irqrestore(&tick_device_lock, flags); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index f13f2b7f4fd4..b1c05bf75ee0 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -1,6 +1,10 @@ /* * tick internal variable and functions used by low/high res code */ + +#define TICK_DO_TIMER_NONE -1 +#define TICK_DO_TIMER_BOOT -2 + DECLARE_PER_CPU(struct tick_device, tick_cpu_device); extern spinlock_t tick_device_lock; extern ktime_t tick_next_period; @@ -10,6 +14,8 @@ extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); +extern void clockevents_shutdown(struct clock_event_device *dev); + /* * NO_HZ / high resolution timer shared code */ @@ -17,6 +23,8 @@ extern void tick_handle_periodic(struct clock_event_device *dev); extern void tick_setup_oneshot(struct clock_event_device *newdev, void (*handler)(struct clock_event_device *), ktime_t nextevt); +extern int tick_dev_program_event(struct clock_event_device *dev, + ktime_t expires, int force); extern int tick_program_event(ktime_t expires, int force); extern void tick_oneshot_notify(void); extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); @@ -27,6 +35,8 @@ extern void tick_broadcast_oneshot_control(unsigned long reason); extern void tick_broadcast_switch_to_oneshot(void); extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); +extern int tick_broadcast_oneshot_active(void); +extern void tick_check_oneshot_broadcast(int cpu); # else /* BROADCAST */ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { @@ -35,6 +45,8 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) static inline void tick_broadcast_oneshot_control(unsigned long reason) { } static inline void tick_broadcast_switch_to_oneshot(void) { } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } +static inline int tick_broadcast_oneshot_active(void) { return 0; } +static inline void tick_check_oneshot_broadcast(int cpu) { } # endif /* !BROADCAST */ #else /* !ONESHOT */ @@ -64,6 +76,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) { return 0; } +static inline int tick_broadcast_oneshot_active(void) { return 0; } #endif /* !TICK_ONESHOT */ /* diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 450c04935b66..2e8de678e767 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -23,24 +23,56 @@ #include "tick-internal.h" /** - * tick_program_event + * tick_program_event internal worker function */ -int tick_program_event(ktime_t expires, int force) +int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, + int force) { - struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; ktime_t now = ktime_get(); + int i; - while (1) { + for (i = 0;;) { int ret = clockevents_program_event(dev, expires, now); if (!ret || !force) return ret; + + /* + * We tried 2 times to program the device with the given + * min_delta_ns. If that's not working then we double it + * and emit a warning. + */ + if (++i > 2) { + /* Increase the min. delta and try again */ + if (!dev->min_delta_ns) + dev->min_delta_ns = 5000; + else + dev->min_delta_ns += dev->min_delta_ns >> 1; + + printk(KERN_WARNING + "CE: %s increasing min_delta_ns to %lu nsec\n", + dev->name ? dev->name : "?", + dev->min_delta_ns << 1); + + i = 0; + } + now = ktime_get(); - expires = ktime_add(now, ktime_set(0, dev->min_delta_ns)); + expires = ktime_add_ns(now, dev->min_delta_ns); } } /** + * tick_program_event + */ +int tick_program_event(ktime_t expires, int force) +{ + struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; + + return tick_dev_program_event(dev, expires, force); +} + +/** * tick_resume_onshot - resume oneshot mode */ void tick_resume_oneshot(void) @@ -61,7 +93,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev, { newdev->event_handler = handler; clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); - clockevents_program_event(newdev, next_event, ktime_get()); + tick_dev_program_event(newdev, next_event, 1); } /** diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a87b0468568b..0581c11fe6c6 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -20,6 +20,7 @@ #include <linux/profile.h> #include <linux/sched.h> #include <linux/tick.h> +#include <linux/module.h> #include <asm/irq_regs.h> @@ -75,6 +76,9 @@ static void tick_do_update_jiffies64(ktime_t now) incr * ticks); } do_timer(++ticks); + + /* Keep the tick_next_period variable up to date */ + tick_next_period = ktime_add(last_jiffies_update, tick_period); } write_sequnlock(&xtime_lock); } @@ -151,7 +155,7 @@ void tick_nohz_update_jiffies(void) touch_softlockup_watchdog(); } -void tick_nohz_stop_idle(int cpu) +static void tick_nohz_stop_idle(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); @@ -187,9 +191,17 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - *last_update_time = ktime_to_us(ts->idle_lastupdate); + if (!tick_nohz_enabled) + return -1; + + if (ts->idle_active) + *last_update_time = ktime_to_us(ts->idle_lastupdate); + else + *last_update_time = ktime_to_us(ktime_get()); + return ktime_to_us(ts->idle_sleeptime); } +EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); /** * tick_nohz_stop_sched_tick - stop the idle tick from the idle task @@ -221,7 +233,7 @@ void tick_nohz_stop_sched_tick(int inidle) */ if (unlikely(!cpu_online(cpu))) { if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = -1; + tick_do_timer_cpu = TICK_DO_TIMER_NONE; } if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) @@ -258,7 +270,7 @@ void tick_nohz_stop_sched_tick(int inidle) next_jiffies = get_next_timer_interrupt(last_jiffies); delta_jiffies = next_jiffies - last_jiffies; - if (rcu_needs_cpu(cpu)) + if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu)) delta_jiffies = 1; /* * Do not stop the tick, if we are only one off @@ -303,7 +315,7 @@ void tick_nohz_stop_sched_tick(int inidle) * invoked. */ if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = -1; + tick_do_timer_cpu = TICK_DO_TIMER_NONE; ts->idle_sleeps++; @@ -365,6 +377,32 @@ ktime_t tick_nohz_get_sleep_length(void) return ts->sleep_length; } +static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) +{ + hrtimer_cancel(&ts->sched_timer); + ts->sched_timer.expires = ts->idle_tick; + + while (1) { + /* Forward the time to expire in the future */ + hrtimer_forward(&ts->sched_timer, now, tick_period); + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { + hrtimer_start(&ts->sched_timer, + ts->sched_timer.expires, + HRTIMER_MODE_ABS); + /* Check, if the timer was already in the past */ + if (hrtimer_active(&ts->sched_timer)) + break; + } else { + if (!tick_program_event(ts->sched_timer.expires, 0)) + break; + } + /* Update jiffies and reread time */ + tick_do_update_jiffies64(now); + now = ktime_get(); + } +} + /** * tick_nohz_restart_sched_tick - restart the idle tick from the idle task * @@ -418,28 +456,7 @@ void tick_nohz_restart_sched_tick(void) */ ts->tick_stopped = 0; ts->idle_exittime = now; - hrtimer_cancel(&ts->sched_timer); - ts->sched_timer.expires = ts->idle_tick; - - while (1) { - /* Forward the time to expire in the future */ - hrtimer_forward(&ts->sched_timer, now, tick_period); - - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { - hrtimer_start(&ts->sched_timer, - ts->sched_timer.expires, - HRTIMER_MODE_ABS); - /* Check, if the timer was already in the past */ - if (hrtimer_active(&ts->sched_timer)) - break; - } else { - if (!tick_program_event(ts->sched_timer.expires, 0)) - break; - } - /* Update jiffies and reread time */ - tick_do_update_jiffies64(now); - now = ktime_get(); - } + tick_nohz_restart(ts, now); local_irq_enable(); } @@ -468,7 +485,7 @@ static void tick_nohz_handler(struct clock_event_device *dev) * this duty, then the jiffies update is still serialized by * xtime_lock. */ - if (unlikely(tick_do_timer_cpu == -1)) + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) tick_do_timer_cpu = cpu; /* Check, if the jiffies need an update */ @@ -491,10 +508,6 @@ static void tick_nohz_handler(struct clock_event_device *dev) update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); - /* Do not restart, when we are in the idle loop */ - if (ts->tick_stopped) - return; - while (tick_nohz_reprogram(ts, now)) { now = ktime_get(); tick_do_update_jiffies64(now); @@ -540,6 +553,27 @@ static void tick_nohz_switch_to_nohz(void) smp_processor_id()); } +/* + * When NOHZ is enabled and the tick is stopped, we need to kick the + * tick timer from irq_enter() so that the jiffies update is kept + * alive during long running softirqs. That's ugly as hell, but + * correctness is key even if we need to fix the offending softirq in + * the first place. + * + * Note, this is different to tick_nohz_restart. We just kick the + * timer and do not touch the other magic bits which need to be done + * when idle is left. + */ +static void tick_nohz_kick_tick(int cpu) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + + if (!ts->tick_stopped) + return; + + tick_nohz_restart(ts, ktime_get()); +} + #else static inline void tick_nohz_switch_to_nohz(void) { } @@ -547,6 +581,19 @@ static inline void tick_nohz_switch_to_nohz(void) { } #endif /* NO_HZ */ /* + * Called from irq_enter to notify about the possible interruption of idle() + */ +void tick_check_idle(int cpu) +{ + tick_check_oneshot_broadcast(cpu); +#ifdef CONFIG_NO_HZ + tick_nohz_stop_idle(cpu); + tick_nohz_update_jiffies(); + tick_nohz_kick_tick(cpu); +#endif +} + +/* * High resolution timer specific code */ #ifdef CONFIG_HIGH_RES_TIMERS @@ -570,7 +617,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) * this duty, then the jiffies update is still serialized by * xtime_lock. */ - if (unlikely(tick_do_timer_cpu == -1)) + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) tick_do_timer_cpu = cpu; #endif @@ -599,10 +646,6 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) profile_tick(CPU_PROFILING); } - /* Do not restart, when we are in the idle loop */ - if (ts->tick_stopped) - return HRTIMER_NORESTART; - hrtimer_forward(timer, now, tick_period); return HRTIMER_RESTART; @@ -622,7 +665,7 @@ void tick_setup_sched_timer(void) */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); ts->sched_timer.function = tick_sched_timer; - ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; /* Get the next period (per cpu) */ ts->sched_timer.expires = tick_init_jiffy_update(); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5ecbfc39a268..e7acfb482a68 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -58,27 +58,26 @@ struct clocksource *clock; #ifdef CONFIG_GENERIC_TIME /** - * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook + * clocksource_forward_now - update clock to the current time * - * private function, must hold xtime_lock lock when being - * called. Returns the number of nanoseconds since the - * last call to update_wall_time() (adjusted by NTP scaling) + * Forward the current clock to update its state since the last call to + * update_wall_time(). This is useful before significant clock changes, + * as it avoids having to deal with this time offset explicitly. */ -static inline s64 __get_nsec_offset(void) +static void clocksource_forward_now(void) { cycle_t cycle_now, cycle_delta; - s64 ns_offset; + s64 nsec; - /* read clocksource: */ cycle_now = clocksource_read(clock); - - /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + clock->cycle_last = cycle_now; - /* convert to nanoseconds: */ - ns_offset = cyc2ns(clock, cycle_delta); + nsec = cyc2ns(clock, cycle_delta); + timespec_add_ns(&xtime, nsec); - return ns_offset; + nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; + clock->raw_time.tv_nsec += nsec; } /** @@ -89,6 +88,7 @@ static inline s64 __get_nsec_offset(void) */ void getnstimeofday(struct timespec *ts) { + cycle_t cycle_now, cycle_delta; unsigned long seq; s64 nsecs; @@ -96,7 +96,15 @@ void getnstimeofday(struct timespec *ts) seq = read_seqbegin(&xtime_lock); *ts = xtime; - nsecs = __get_nsec_offset(); + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* convert to nanoseconds: */ + nsecs = cyc2ns(clock, cycle_delta); } while (read_seqretry(&xtime_lock, seq)); @@ -129,22 +137,22 @@ EXPORT_SYMBOL(do_gettimeofday); */ int do_settimeofday(struct timespec *tv) { + struct timespec ts_delta; unsigned long flags; - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) return -EINVAL; write_seqlock_irqsave(&xtime_lock, flags); - nsec -= __get_nsec_offset(); + clocksource_forward_now(); + + ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; + ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; + wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta); - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); + xtime = *tv; - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); update_xtime_cache(0); clock->error = 0; @@ -170,22 +178,19 @@ EXPORT_SYMBOL(do_settimeofday); static void change_clocksource(void) { struct clocksource *new; - cycle_t now; - u64 nsec; new = clocksource_get_next(); if (clock == new) return; - new->cycle_last = 0; - now = clocksource_read(new); - nsec = __get_nsec_offset(); - timespec_add_ns(&xtime, nsec); + clocksource_forward_now(); - clock = new; - clock->cycle_last = now; + new->raw_time = clock->raw_time; + clock = new; + clock->cycle_last = 0; + clock->cycle_last = clocksource_read(new); clock->error = 0; clock->xtime_nsec = 0; clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); @@ -200,11 +205,44 @@ static void change_clocksource(void) */ } #else +static inline void clocksource_forward_now(void) { } static inline void change_clocksource(void) { } -static inline s64 __get_nsec_offset(void) { return 0; } #endif /** + * getrawmonotonic - Returns the raw monotonic time in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the raw monotonic time (completely un-modified by ntp) + */ +void getrawmonotonic(struct timespec *ts) +{ + unsigned long seq; + s64 nsecs; + cycle_t cycle_now, cycle_delta; + + do { + seq = read_seqbegin(&xtime_lock); + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* convert to nanoseconds: */ + nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; + + *ts = clock->raw_time; + + } while (read_seqretry(&xtime_lock, seq)); + + timespec_add_ns(ts, nsecs); +} +EXPORT_SYMBOL(getrawmonotonic); + + +/** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres */ int timekeeping_valid_for_hres(void) @@ -265,8 +303,6 @@ void __init timekeeping_init(void) static int timekeeping_suspended; /* time in seconds when suspend began */ static unsigned long timekeeping_suspend_time; -/* xtime offset when we went into suspend */ -static s64 timekeeping_suspend_nsecs; /** * timekeeping_resume - Resumes the generic timekeeping subsystem. @@ -292,8 +328,6 @@ static int timekeeping_resume(struct sys_device *dev) wall_to_monotonic.tv_sec -= sleep_length; total_sleep_time += sleep_length; } - /* Make sure that we have the correct xtime reference */ - timespec_add_ns(&xtime, timekeeping_suspend_nsecs); update_xtime_cache(0); /* re-base the last cycle value */ clock->cycle_last = 0; @@ -319,8 +353,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) timekeeping_suspend_time = read_persistent_clock(); write_seqlock_irqsave(&xtime_lock, flags); - /* Get the current xtime offset */ - timekeeping_suspend_nsecs = __get_nsec_offset(); + clocksource_forward_now(); timekeeping_suspended = 1; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -461,16 +494,22 @@ void update_wall_time(void) */ while (offset >= clock->cycle_interval) { /* accumulate one interval */ - clock->xtime_nsec += clock->xtime_interval; - clock->cycle_last += clock->cycle_interval; offset -= clock->cycle_interval; + clock->cycle_last += clock->cycle_interval; + clock->xtime_nsec += clock->xtime_interval; if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; xtime.tv_sec++; second_overflow(); } + clock->raw_time.tv_nsec += clock->raw_interval; + if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) { + clock->raw_time.tv_nsec -= NSEC_PER_SEC; + clock->raw_time.tv_sec++; + } + /* accumulate error between NTP and clock interval */ clock->error += tick_length; clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index a40e20fd0001..f6426911e35a 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -47,13 +47,14 @@ static void print_name_offset(struct seq_file *m, void *sym) } static void -print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now) +print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, + int idx, u64 now) { #ifdef CONFIG_TIMER_STATS char tmp[TASK_COMM_LEN + 1]; #endif SEQ_printf(m, " #%d: ", idx); - print_name_offset(m, timer); + print_name_offset(m, taddr); SEQ_printf(m, ", "); print_name_offset(m, timer->function); SEQ_printf(m, ", S:%02lx", timer->state); @@ -99,7 +100,7 @@ next_one: tmp = *timer; spin_unlock_irqrestore(&base->cpu_base->lock, flags); - print_timer(m, &tmp, i, now); + print_timer(m, timer, &tmp, i, now); next++; goto next_one; } @@ -109,6 +110,7 @@ next_one: static void print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { + SEQ_printf(m, " .base: %p\n", base); SEQ_printf(m, " .index: %d\n", base->index); SEQ_printf(m, " .resolution: %Lu nsecs\n", @@ -183,12 +185,16 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) #ifdef CONFIG_GENERIC_CLOCKEVENTS static void -print_tickdevice(struct seq_file *m, struct tick_device *td) +print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) { struct clock_event_device *dev = td->evtdev; SEQ_printf(m, "\n"); SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); + if (cpu < 0) + SEQ_printf(m, "Broadcast device\n"); + else + SEQ_printf(m, "Per CPU device: %d\n", cpu); SEQ_printf(m, "Clock Event Device: "); if (!dev) { @@ -222,7 +228,7 @@ static void timer_list_show_tickdevices(struct seq_file *m) int cpu; #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST - print_tickdevice(m, tick_get_broadcast_device()); + print_tickdevice(m, tick_get_broadcast_device(), -1); SEQ_printf(m, "tick_broadcast_mask: %08lx\n", tick_get_broadcast_mask()->bits[0]); #ifdef CONFIG_TICK_ONESHOT @@ -232,7 +238,7 @@ static void timer_list_show_tickdevices(struct seq_file *m) SEQ_printf(m, "\n"); #endif for_each_online_cpu(cpu) - print_tickdevice(m, tick_get_device(cpu)); + print_tickdevice(m, tick_get_device(cpu), cpu); SEQ_printf(m, "\n"); } #else @@ -244,7 +250,7 @@ static int timer_list_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Timer List Version: v0.3\n"); + SEQ_printf(m, "Timer List Version: v0.4\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); diff --git a/kernel/timer.c b/kernel/timer.c index 03bc7f1f1593..56becf373c58 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -978,6 +978,7 @@ void update_process_times(int user_tick) run_local_timers(); if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_tick); + printk_tick(); scheduler_tick(); run_posix_cpu_timers(p); } @@ -1435,9 +1436,11 @@ static void __cpuinit migrate_timers(int cpu) BUG_ON(cpu_online(cpu)); old_base = per_cpu(tvec_bases, cpu); new_base = get_cpu_var(tvec_bases); - - local_irq_disable(); - spin_lock(&new_base->lock); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. + */ + spin_lock_irq(&new_base->lock); spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); BUG_ON(old_base->running_timer); @@ -1452,8 +1455,7 @@ static void __cpuinit migrate_timers(int cpu) } spin_unlock(&old_base->lock); - spin_unlock(&new_base->lock); - local_irq_enable(); + spin_unlock_irq(&new_base->lock); put_cpu_var(tvec_bases); } #endif /* CONFIG_HOTPLUG_CPU */ diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index bb948e52ce20..db58fb66a135 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -202,7 +202,7 @@ static void start_stack_timer(int cpu) hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer->function = stack_trace_timer_fn; - hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); } diff --git a/kernel/user.c b/kernel/user.c index 865ecf57a096..39d6159fae43 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -169,7 +169,7 @@ static ssize_t cpu_rt_runtime_show(struct kobject *kobj, { struct user_struct *up = container_of(kobj, struct user_struct, kobj); - return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg)); + return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg)); } static ssize_t cpu_rt_runtime_store(struct kobject *kobj, @@ -180,7 +180,7 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj, unsigned long rt_runtime; int rc; - sscanf(buf, "%lu", &rt_runtime); + sscanf(buf, "%ld", &rt_runtime); rc = sched_group_set_rt_runtime(up->tg, rt_runtime); diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 4ab9659d269e..3b34b3545936 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -60,7 +60,7 @@ static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, #ifdef CONFIG_SYSCTL_SYSCALL /* The generic string strategy routine: */ -static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, +static int sysctl_uts_string(ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { @@ -69,8 +69,7 @@ static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, write = newval && newlen; memcpy(&uts_table, table, sizeof(uts_table)); uts_table.data = get_uts(table, write); - r = sysctl_string(&uts_table, name, nlen, - oldval, oldlenp, newval, newlen); + r = sysctl_string(&uts_table, oldval, oldlenp, newval, newlen); put_uts(table, write, uts_table.data); return r; } diff --git a/kernel/wait.c b/kernel/wait.c index c275c56cf2d3..cd87131f2fc2 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -72,12 +72,7 @@ prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) spin_lock_irqsave(&q->lock, flags); if (list_empty(&wait->task_list)) __add_wait_queue(q, wait); - /* - * don't alter the task state if this is just going to - * queue an async wait queue callback - */ - if (is_sync_wait(wait)) - set_current_state(state); + set_current_state(state); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(prepare_to_wait); @@ -91,12 +86,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) spin_lock_irqsave(&q->lock, flags); if (list_empty(&wait->task_list)) __add_wait_queue_tail(q, wait); - /* - * don't alter the task state if this is just going to - * queue an async wait queue callback - */ - if (is_sync_wait(wait)) - set_current_state(state); + set_current_state(state); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(prepare_to_wait_exclusive); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4048e92aa04f..714afad46539 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -9,7 +9,7 @@ * Derived from the taskqueue/keventd code by: * * David Woodhouse <dwmw2@infradead.org> - * Andrew Morton <andrewm@uow.edu.au> + * Andrew Morton * Kai Petzke <wpp@marie.physik.tu-berlin.de> * Theodore Ts'o <tytso@mit.edu> * |