diff options
author | Ingo Molnar <mingo@elte.hu> | 2008-10-13 11:05:51 +0200 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2008-10-13 11:05:51 +0200 |
commit | accba5f3965d6a9d1bf7c1e1a7995d17e9d521b6 (patch) | |
tree | 8fb40782e79472ed882ff2098d4dd295557278ee /kernel | |
parent | 6852fd9b86d05063c6ef49d2e12e061cc7f6a105 (diff) | |
parent | 4480f15b3306f43bbb0310d461142b4e897ca45b (diff) |
Merge branch 'linus' into oprofile-v2
Conflicts:
arch/x86/kernel/apic_32.c
arch/x86/oprofile/nmi_int.c
include/linux/pci_ids.h
Diffstat (limited to 'kernel')
75 files changed, 3201 insertions, 1756 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 382dd5a8b2d7..94fabd534b03 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -55,4 +55,4 @@ config HZ default 1000 if HZ_1000 config SCHED_HRTICK - def_bool HIGH_RES_TIMERS && USE_GENERIC_SMP_HELPERS + def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS) diff --git a/kernel/Makefile b/kernel/Makefile index 54f69837d35a..4e1d7df7c3e2 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -84,6 +84,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_LATENCYTOP) += latencytop.o +obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o obj-$(CONFIG_FTRACE) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o diff --git a/kernel/audit.c b/kernel/audit.c index e092f1c0ce30..4414e93d8750 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -707,12 +707,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (status_get->mask & AUDIT_STATUS_ENABLED) { err = audit_set_enabled(status_get->enabled, loginuid, sessionid, sid); - if (err < 0) return err; + if (err < 0) + return err; } if (status_get->mask & AUDIT_STATUS_FAILURE) { err = audit_set_failure(status_get->failure, loginuid, sessionid, sid); - if (err < 0) return err; + if (err < 0) + return err; } if (status_get->mask & AUDIT_STATUS_PID) { int new_pid = status_get->pid; @@ -725,9 +727,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_pid = new_pid; audit_nlk_pid = NETLINK_CB(skb).pid; } - if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) + if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { err = audit_set_rate_limit(status_get->rate_limit, loginuid, sessionid, sid); + if (err < 0) + return err; + } if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) err = audit_set_backlog_limit(status_get->backlog_limit, loginuid, sessionid, sid); @@ -1366,7 +1371,7 @@ int audit_string_contains_control(const char *string, size_t len) { const unsigned char *p; for (p = string; p < (const unsigned char *)string + len && *p; p++) { - if (*p == '"' || *p < 0x21 || *p > 0x7f) + if (*p == '"' || *p < 0x21 || *p > 0x7e) return 1; } return 0; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 98c50cc671bb..b7d354e2b0ef 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1022,8 +1022,11 @@ static void audit_update_watch(struct audit_parent *parent, struct audit_buffer *ab; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + audit_log_format(ab, "auid=%u ses=%u", + audit_get_loginuid(current), + audit_get_sessionid(current)); audit_log_format(ab, - "op=updated rules specifying path="); + " op=updated rules specifying path="); audit_log_untrustedstring(ab, owatch->path); audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino); @@ -1058,7 +1061,10 @@ static void audit_remove_parent_watches(struct audit_parent *parent) struct audit_buffer *ab; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "op=remove rule path="); + audit_log_format(ab, "auid=%u ses=%u", + audit_get_loginuid(current), + audit_get_sessionid(current)); + audit_log_format(ab, " op=remove rule path="); audit_log_untrustedstring(ab, w->path); if (r->filterkey) { audit_log_format(ab, " key="); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4699950e65bd..59cedfb040e7 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -243,7 +243,11 @@ static inline int open_arg(int flags, int mask) static int audit_match_perm(struct audit_context *ctx, int mask) { - unsigned n = ctx->major; + unsigned n; + if (unlikely(!ctx)) + return 0; + + n = ctx->major; switch (audit_classify_syscall(ctx->arch, n)) { case 0: /* native */ if ((mask & AUDIT_PERM_WRITE) && @@ -284,6 +288,10 @@ static int audit_match_filetype(struct audit_context *ctx, int which) { unsigned index = which & ~S_IFMT; mode_t mode = which & S_IFMT; + + if (unlikely(!ctx)) + return 0; + if (index >= ctx->name_count) return 0; if (ctx->names[index].ino == -1) @@ -610,7 +618,7 @@ static int audit_filter_rules(struct task_struct *tsk, if (!result) return 0; } - if (rule->filterkey) + if (rule->filterkey && ctx) ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); switch (rule->action) { case AUDIT_NEVER: *state = AUDIT_DISABLED; break; @@ -2375,7 +2383,7 @@ int __audit_signal_info(int sig, struct task_struct *t) struct audit_context *ctx = tsk->audit_context; if (audit_pid && t->tgid == audit_pid) { - if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) { + if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { audit_sig_pid = tsk->pid; if (tsk->loginuid != -1) audit_sig_uid = tsk->loginuid; diff --git a/kernel/capability.c b/kernel/capability.c index 0101e847603e..33e51e78c2d8 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -486,17 +486,22 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) return ret; } -int __capable(struct task_struct *t, int cap) +/** + * capable - Determine if the current task has a superior capability in effect + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. + * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. + */ +int capable(int cap) { - if (security_capable(t, cap) == 0) { - t->flags |= PF_SUPERPRIV; + if (has_capability(current, cap)) { + current->flags |= PF_SUPERPRIV; return 1; } return 0; } - -int capable(int cap) -{ - return __capable(current, cap); -} EXPORT_SYMBOL(capable); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 66ec9fd21e0c..a0123d75ec9a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -45,6 +45,7 @@ #include <linux/delayacct.h> #include <linux/cgroupstats.h> #include <linux/hash.h> +#include <linux/namei.h> #include <asm/atomic.h> @@ -354,6 +355,17 @@ static struct css_set *find_existing_css_set( return NULL; } +static void free_cg_links(struct list_head *tmp) +{ + struct cg_cgroup_link *link; + struct cg_cgroup_link *saved_link; + + list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { + list_del(&link->cgrp_link_list); + kfree(link); + } +} + /* * allocate_cg_links() allocates "count" cg_cgroup_link structures * and chains them on tmp through their cgrp_link_list fields. Returns 0 on @@ -362,17 +374,12 @@ static struct css_set *find_existing_css_set( static int allocate_cg_links(int count, struct list_head *tmp) { struct cg_cgroup_link *link; - struct cg_cgroup_link *saved_link; int i; INIT_LIST_HEAD(tmp); for (i = 0; i < count; i++) { link = kmalloc(sizeof(*link), GFP_KERNEL); if (!link) { - list_for_each_entry_safe(link, saved_link, tmp, - cgrp_link_list) { - list_del(&link->cgrp_link_list); - kfree(link); - } + free_cg_links(tmp); return -ENOMEM; } list_add(&link->cgrp_link_list, tmp); @@ -380,17 +387,6 @@ static int allocate_cg_links(int count, struct list_head *tmp) return 0; } -static void free_cg_links(struct list_head *tmp) -{ - struct cg_cgroup_link *link; - struct cg_cgroup_link *saved_link; - - list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { - list_del(&link->cgrp_link_list); - kfree(link); - } -} - /* * find_css_set() takes an existing cgroup group and a * cgroup object, and returns a css_set object that's @@ -955,7 +951,6 @@ static int cgroup_get_sb(struct file_system_type *fs_type, struct super_block *sb; struct cgroupfs_root *root; struct list_head tmp_cg_links; - INIT_LIST_HEAD(&tmp_cg_links); /* First find the desired set of subsystems */ ret = parse_cgroupfs_options(data, &opts); @@ -1423,14 +1418,17 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, if (buffer == NULL) return -ENOMEM; } - if (nbytes && copy_from_user(buffer, userbuf, nbytes)) - return -EFAULT; + if (nbytes && copy_from_user(buffer, userbuf, nbytes)) { + retval = -EFAULT; + goto out; + } buffer[nbytes] = 0; /* nul-terminate */ strstrip(buffer); retval = cft->write_string(cgrp, cft, buffer); if (!retval) retval = nbytes; +out: if (buffer != local_buffer) kfree(buffer); return retval; @@ -1529,7 +1527,7 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg) return cft->read_seq_string(state->cgroup, cft, m); } -int cgroup_seqfile_release(struct inode *inode, struct file *file) +static int cgroup_seqfile_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; kfree(seq->private); @@ -2370,7 +2368,7 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode) return cgroup_create(c_parent, dentry, mode | S_IFDIR); } -static inline int cgroup_has_css_refs(struct cgroup *cgrp) +static int cgroup_has_css_refs(struct cgroup *cgrp) { /* Check the reference count on each subsystem. Since we * already established that there are no tasks in the @@ -2740,14 +2738,15 @@ void cgroup_fork_callbacks(struct task_struct *child) */ void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) { - struct cgroup *oldcgrp, *newcgrp; + struct cgroup *oldcgrp, *newcgrp = NULL; if (need_mm_owner_callback) { int i; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; oldcgrp = task_cgroup(old, ss->subsys_id); - newcgrp = task_cgroup(new, ss->subsys_id); + if (new) + newcgrp = task_cgroup(new, ss->subsys_id); if (oldcgrp == newcgrp) continue; if (ss->mm_owner_changed) diff --git a/kernel/cpu.c b/kernel/cpu.c index 10ba5f1004a5..86d49045daed 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -199,13 +199,14 @@ static int __ref take_cpu_down(void *_param) struct take_cpu_down_param *param = _param; int err; - raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, - param->hcpu); /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); if (err < 0) return err; + raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, + param->hcpu); + /* Force idle task to run as soon as we yield: it should immediately notice cpu is offline and die quickly. */ sched_idle_next(); @@ -216,7 +217,6 @@ static int __ref take_cpu_down(void *_param) static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) { int err, nr_calls = 0; - struct task_struct *p; cpumask_t old_allowed, tmp; void *hcpu = (void *)(long)cpu; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; @@ -249,21 +249,18 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) cpus_setall(tmp); cpu_clear(cpu, tmp); set_cpus_allowed_ptr(current, &tmp); + tmp = cpumask_of_cpu(cpu); - p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); - - if (IS_ERR(p) || cpu_online(cpu)) { + err = __stop_machine(take_cpu_down, &tcd_param, &tmp); + if (err) { /* CPU didn't die: tell everyone. Can't complain. */ if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, hcpu) == NOTIFY_BAD) BUG(); - if (IS_ERR(p)) { - err = PTR_ERR(p); - goto out_allowed; - } - goto out_thread; + goto out_allowed; } + BUG_ON(cpu_online(cpu)); /* Wait for it to sleep (leaving idle task). */ while (!idle_cpu(cpu)) @@ -279,8 +276,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) check_for_tasks(cpu); -out_thread: - err = kthread_stop(p); out_allowed: set_cpus_allowed_ptr(current, &old_allowed); out_release: @@ -355,6 +350,8 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) goto out_notify; BUG_ON(!cpu_online(cpu)); + cpu_set(cpu, cpu_active_map); + /* Now call notifier in preparation. */ raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu); @@ -373,7 +370,7 @@ int __cpuinit cpu_up(unsigned int cpu) if (!cpu_isset(cpu, cpu_possible_map)) { printk(KERN_ERR "can't online cpu %d because it is not " "configured as may-hotadd at boot time\n", cpu); -#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) || defined(CONFIG_S390) +#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) printk(KERN_ERR "please check additional_cpus= boot " "parameter\n"); #endif @@ -389,9 +386,6 @@ int __cpuinit cpu_up(unsigned int cpu) err = _cpu_up(cpu, 0); - if (cpu_online(cpu)) - cpu_set(cpu, cpu_active_map); - out: cpu_maps_update_done(); return err; @@ -460,4 +454,48 @@ out: } #endif /* CONFIG_PM_SLEEP_SMP */ +/** + * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers + * @cpu: cpu that just started + * + * This function calls the cpu_chain notifiers with CPU_STARTING. + * It must be called by the arch code on the new cpu, before the new cpu + * enables interrupts and before the "boot" cpu returns from __cpu_up(). + */ +void notify_cpu_starting(unsigned int cpu) +{ + unsigned long val = CPU_STARTING; + +#ifdef CONFIG_PM_SLEEP_SMP + if (cpu_isset(cpu, frozen_cpus)) + val = CPU_STARTING_FROZEN; +#endif /* CONFIG_PM_SLEEP_SMP */ + raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu); +} + #endif /* CONFIG_SMP */ + +/* + * cpu_bit_bitmap[] is a special, "compressed" data structure that + * represents all NR_CPUS bits binary values of 1<<nr. + * + * It is used by cpumask_of_cpu() to get a constant address to a CPU + * mask value that has a single bit set only. + */ + +/* cpu_bit_bitmap[0] is empty - so we can back into it */ +#define MASK_DECLARE_1(x) [x+1][0] = 1UL << (x) +#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1) +#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2) +#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4) + +const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = { + + MASK_DECLARE_8(0), MASK_DECLARE_8(8), + MASK_DECLARE_8(16), MASK_DECLARE_8(24), +#if BITS_PER_LONG > 32 + MASK_DECLARE_8(32), MASK_DECLARE_8(40), + MASK_DECLARE_8(48), MASK_DECLARE_8(56), +#endif +}; +EXPORT_SYMBOL_GPL(cpu_bit_bitmap); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 91cf85b36dd5..eab7bd6628e0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -14,6 +14,8 @@ * 2003-10-22 Updates by Stephen Hemminger. * 2004 May-July Rework by Paul Jackson. * 2006 Rework by Paul Menage to use generic cgroups + * 2008 Rework of the scheduler domains and CPU hotplug handling + * by Max Krasnyansky * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of the Linux @@ -54,7 +56,6 @@ #include <asm/uaccess.h> #include <asm/atomic.h> #include <linux/mutex.h> -#include <linux/kfifo.h> #include <linux/workqueue.h> #include <linux/cgroup.h> @@ -237,9 +238,11 @@ static struct cpuset top_cpuset = { static DEFINE_MUTEX(callback_mutex); -/* This is ugly, but preserves the userspace API for existing cpuset +/* + * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we - * silently switch it to mount "cgroup" instead */ + * silently switch it to mount "cgroup" instead + */ static int cpuset_get_sb(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data, struct vfsmount *mnt) @@ -474,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) } /* - * Helper routine for rebuild_sched_domains(). + * Helper routine for generate_sched_domains(). * Do cpusets a, b have overlapping cpus_allowed masks? */ - static int cpusets_overlap(struct cpuset *a, struct cpuset *b) { return cpus_intersects(a->cpus_allowed, b->cpus_allowed); @@ -486,34 +488,48 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) static void update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) { - if (!dattr) - return; if (dattr->relax_domain_level < c->relax_domain_level) dattr->relax_domain_level = c->relax_domain_level; return; } +static void +update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) +{ + LIST_HEAD(q); + + list_add(&c->stack_list, &q); + while (!list_empty(&q)) { + struct cpuset *cp; + struct cgroup *cont; + struct cpuset *child; + + cp = list_first_entry(&q, struct cpuset, stack_list); + list_del(q.next); + + if (cpus_empty(cp->cpus_allowed)) + continue; + + if (is_sched_load_balance(cp)) + update_domain_attr(dattr, cp); + + list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { + child = cgroup_cs(cont); + list_add_tail(&child->stack_list, &q); + } + } +} + /* - * rebuild_sched_domains() - * - * This routine will be called to rebuild the scheduler's dynamic - * sched domains: - * - if the flag 'sched_load_balance' of any cpuset with non-empty - * 'cpus' changes, - * - or if the 'cpus' allowed changes in any cpuset which has that - * flag enabled, - * - or if the 'sched_relax_domain_level' of any cpuset which has - * that flag enabled and with non-empty 'cpus' changes, - * - or if any cpuset with non-empty 'cpus' is removed, - * - or if a cpu gets offlined. - * - * This routine builds a partial partition of the systems CPUs - * (the set of non-overlappping cpumask_t's in the array 'part' - * below), and passes that partial partition to the kernel/sched.c - * partition_sched_domains() routine, which will rebuild the - * schedulers load balancing domains (sched domains) as specified - * by that partial partition. A 'partial partition' is a set of - * non-overlapping subsets whose union is a subset of that set. + * generate_sched_domains() + * + * This function builds a partial partition of the systems CPUs + * A 'partial partition' is a set of non-overlapping subsets whose + * union is a subset of that set. + * The output of this function needs to be passed to kernel/sched.c + * partition_sched_domains() routine, which will rebuild the scheduler's + * load balancing domains (sched domains) as specified by that partial + * partition. * * See "What is sched_load_balance" in Documentation/cpusets.txt * for a background explanation of this. @@ -523,16 +539,10 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) * domains when operating in the severe memory shortage situations * that could cause allocation failures below. * - * Call with cgroup_mutex held. May take callback_mutex during - * call due to the kfifo_alloc() and kmalloc() calls. May nest - * a call to the get_online_cpus()/put_online_cpus() pair. - * Must not be called holding callback_mutex, because we must not - * call get_online_cpus() while holding callback_mutex. Elsewhere - * the kernel nests callback_mutex inside get_online_cpus() calls. - * So the reverse nesting would risk an ABBA deadlock. + * Must be called with cgroup_lock held. * * The three key local variables below are: - * q - a kfifo queue of cpuset pointers, used to implement a + * q - a linked-list queue of cpuset pointers, used to implement a * top-down scan of all cpusets. This scan loads a pointer * to each cpuset marked is_sched_load_balance into the * array 'csa'. For our purposes, rebuilding the schedulers @@ -564,10 +574,10 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) * element of the partition (one sched domain) to be passed to * partition_sched_domains(). */ - -void rebuild_sched_domains(void) +static int generate_sched_domains(cpumask_t **domains, + struct sched_domain_attr **attributes) { - struct kfifo *q; /* queue of cpusets to be scanned */ + LIST_HEAD(q); /* queue of cpusets to be scanned */ struct cpuset *cp; /* scans q */ struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ @@ -577,49 +587,58 @@ void rebuild_sched_domains(void) int ndoms; /* number of sched domains in result */ int nslot; /* next empty doms[] cpumask_t slot */ - q = NULL; - csa = NULL; + ndoms = 0; doms = NULL; dattr = NULL; + csa = NULL; /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { - ndoms = 1; doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); if (!doms) - goto rebuild; + goto done; + dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); if (dattr) { *dattr = SD_ATTR_INIT; - update_domain_attr(dattr, &top_cpuset); + update_domain_attr_tree(dattr, &top_cpuset); } *doms = top_cpuset.cpus_allowed; - goto rebuild; - } - q = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL); - if (IS_ERR(q)) + ndoms = 1; goto done; + } + csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); if (!csa) goto done; csn = 0; - cp = &top_cpuset; - __kfifo_put(q, (void *)&cp, sizeof(cp)); - while (__kfifo_get(q, (void *)&cp, sizeof(cp))) { + list_add(&top_cpuset.stack_list, &q); + while (!list_empty(&q)) { struct cgroup *cont; struct cpuset *child; /* scans child cpusets of cp */ + cp = list_first_entry(&q, struct cpuset, stack_list); + list_del(q.next); + if (cpus_empty(cp->cpus_allowed)) continue; - if (is_sched_load_balance(cp)) + /* + * All child cpusets contain a subset of the parent's cpus, so + * just skip them, and then we call update_domain_attr_tree() + * to calc relax_domain_level of the corresponding sched + * domain. + */ + if (is_sched_load_balance(cp)) { csa[csn++] = cp; + continue; + } list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { child = cgroup_cs(cont); - __kfifo_put(q, (void *)&child, sizeof(cp)); + list_add_tail(&child->stack_list, &q); } } @@ -650,63 +669,141 @@ restart: } } - /* Convert <csn, csa> to <ndoms, doms> */ + /* + * Now we know how many domains to create. + * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. + */ doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); - if (!doms) - goto rebuild; + if (!doms) { + ndoms = 0; + goto done; + } + + /* + * The rest of the code, including the scheduler, can deal with + * dattr==NULL case. No need to abort if alloc fails. + */ dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); for (nslot = 0, i = 0; i < csn; i++) { struct cpuset *a = csa[i]; + cpumask_t *dp; int apn = a->pn; - if (apn >= 0) { - cpumask_t *dp = doms + nslot; - - if (nslot == ndoms) { - static int warnings = 10; - if (warnings) { - printk(KERN_WARNING - "rebuild_sched_domains confused:" - " nslot %d, ndoms %d, csn %d, i %d," - " apn %d\n", - nslot, ndoms, csn, i, apn); - warnings--; - } - continue; + if (apn < 0) { + /* Skip completed partitions */ + continue; + } + + dp = doms + nslot; + + if (nslot == ndoms) { + static int warnings = 10; + if (warnings) { + printk(KERN_WARNING + "rebuild_sched_domains confused:" + " nslot %d, ndoms %d, csn %d, i %d," + " apn %d\n", + nslot, ndoms, csn, i, apn); + warnings--; } + continue; + } - cpus_clear(*dp); - if (dattr) - *(dattr + nslot) = SD_ATTR_INIT; - for (j = i; j < csn; j++) { - struct cpuset *b = csa[j]; - - if (apn == b->pn) { - cpus_or(*dp, *dp, b->cpus_allowed); - b->pn = -1; - if (dattr) - update_domain_attr(dattr - + nslot, b); - } + cpus_clear(*dp); + if (dattr) + *(dattr + nslot) = SD_ATTR_INIT; + for (j = i; j < csn; j++) { + struct cpuset *b = csa[j]; + + if (apn == b->pn) { + cpus_or(*dp, *dp, b->cpus_allowed); + if (dattr) + update_domain_attr_tree(dattr + nslot, b); + + /* Done with this partition */ + b->pn = -1; } - nslot++; } + nslot++; } BUG_ON(nslot != ndoms); -rebuild: - /* Have scheduler rebuild sched domains */ +done: + kfree(csa); + + *domains = doms; + *attributes = dattr; + return ndoms; +} + +/* + * Rebuild scheduler domains. + * + * Call with neither cgroup_mutex held nor within get_online_cpus(). + * Takes both cgroup_mutex and get_online_cpus(). + * + * Cannot be directly called from cpuset code handling changes + * to the cpuset pseudo-filesystem, because it cannot be called + * from code that already holds cgroup_mutex. + */ +static void do_rebuild_sched_domains(struct work_struct *unused) +{ + struct sched_domain_attr *attr; + cpumask_t *doms; + int ndoms; + get_online_cpus(); - partition_sched_domains(ndoms, doms, dattr); + + /* Generate domain masks and attrs */ + cgroup_lock(); + ndoms = generate_sched_domains(&doms, &attr); + cgroup_unlock(); + + /* Have scheduler rebuild the domains */ + partition_sched_domains(ndoms, doms, attr); + put_online_cpus(); +} -done: - if (q && !IS_ERR(q)) - kfifo_free(q); - kfree(csa); - /* Don't kfree(doms) -- partition_sched_domains() does that. */ - /* Don't kfree(dattr) -- partition_sched_domains() does that. */ +static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); + +/* + * Rebuild scheduler domains, asynchronously via workqueue. + * + * If the flag 'sched_load_balance' of any cpuset with non-empty + * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset + * which has that flag enabled, or if any cpuset with a non-empty + * 'cpus' is removed, then call this routine to rebuild the + * scheduler's dynamic sched domains. + * + * The rebuild_sched_domains() and partition_sched_domains() + * routines must nest cgroup_lock() inside get_online_cpus(), + * but such cpuset changes as these must nest that locking the + * other way, holding cgroup_lock() for much of the code. + * + * So in order to avoid an ABBA deadlock, the cpuset code handling + * these user changes delegates the actual sched domain rebuilding + * to a separate workqueue thread, which ends up processing the + * above do_rebuild_sched_domains() function. + */ +static void async_rebuild_sched_domains(void) +{ + schedule_work(&rebuild_sched_domains_work); +} + +/* + * Accomplishes the same scheduler domain rebuild as the above + * async_rebuild_sched_domains(), however it directly calls the + * rebuild routine synchronously rather than calling it via an + * asynchronous work thread. + * + * This can only be called from code that is not holding + * cgroup_mutex (not nested in a cgroup_lock() call.) + */ +void rebuild_sched_domains(void) +{ + do_rebuild_sched_domains(NULL); } /** @@ -746,37 +843,25 @@ static void cpuset_change_cpumask(struct task_struct *tsk, /** * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed + * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() * * Called with cgroup_mutex held * * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, * calling callback functions for each. * - * Return 0 if successful, -errno if not. + * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 + * if @heap != NULL. */ -static int update_tasks_cpumask(struct cpuset *cs) +static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) { struct cgroup_scanner scan; - struct ptr_heap heap; - int retval; - - /* - * cgroup_scan_tasks() will initialize heap->gt for us. - * heap_init() is still needed here for we should not change - * cs->cpus_allowed when heap_init() fails. - */ - retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); - if (retval) - return retval; scan.cg = cs->css.cgroup; scan.test_task = cpuset_test_cpumask; scan.process_task = cpuset_change_cpumask; - scan.heap = &heap; - retval = cgroup_scan_tasks(&scan); - - heap_free(&heap); - return retval; + scan.heap = heap; + cgroup_scan_tasks(&scan); } /** @@ -786,6 +871,7 @@ static int update_tasks_cpumask(struct cpuset *cs) */ static int update_cpumask(struct cpuset *cs, const char *buf) { + struct ptr_heap heap; struct cpuset trialcs; int retval; int is_load_balanced; @@ -820,6 +906,10 @@ static int update_cpumask(struct cpuset *cs, const char *buf) if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) return 0; + retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); + if (retval) + return retval; + is_load_balanced = is_sched_load_balance(&trialcs); mutex_lock(&callback_mutex); @@ -830,12 +920,12 @@ static int update_cpumask(struct cpuset *cs, const char *buf) * Scan tasks in the cpuset, and update the cpumasks of any * that need an update. */ - retval = update_tasks_cpumask(cs); - if (retval < 0) - return retval; + update_tasks_cpumask(cs, &heap); + + heap_free(&heap); if (is_load_balanced) - rebuild_sched_domains(); + async_rebuild_sched_domains(); return 0; } @@ -1062,7 +1152,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) if (val != cs->relax_domain_level) { cs->relax_domain_level = val; if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) - rebuild_sched_domains(); + async_rebuild_sched_domains(); } return 0; @@ -1103,7 +1193,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, mutex_unlock(&callback_mutex); if (cpus_nonempty && balance_flag_changed) - rebuild_sched_domains(); + async_rebuild_sched_domains(); return 0; } @@ -1464,6 +1554,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) default: BUG(); } + + /* Unreachable but makes gcc happy */ + return 0; } static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) @@ -1476,6 +1569,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) default: BUG(); } + + /* Unrechable but makes gcc happy */ + return 0; } @@ -1664,15 +1760,9 @@ static struct cgroup_subsys_state *cpuset_create( } /* - * Locking note on the strange update_flag() call below: - * * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which - * will call rebuild_sched_domains(). The get_online_cpus() - * call in rebuild_sched_domains() must not be made while holding - * callback_mutex. Elsewhere the kernel nests callback_mutex inside - * get_online_cpus() calls. So the reverse nesting would risk an - * ABBA deadlock. + * will call async_rebuild_sched_domains(). */ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) @@ -1691,7 +1781,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) struct cgroup_subsys cpuset_subsys = { .name = "cpuset", .create = cpuset_create, - .destroy = cpuset_destroy, + .destroy = cpuset_destroy, .can_attach = cpuset_can_attach, .attach = cpuset_attach, .populate = cpuset_populate, @@ -1783,7 +1873,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) } /* - * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs + * If CPU and/or memory hotplug handlers, below, unplug any CPUs * or memory nodes, we need to walk over the cpuset hierarchy, * removing that CPU or node from all cpusets. If this removes the * last CPU or node from a cpuset, then move the tasks in the empty @@ -1831,26 +1921,23 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) * that has tasks along with an empty 'mems'. But if we did see such * a cpuset, we'd handle it just like we do if its 'cpus' was empty. */ -static void scan_for_empty_cpusets(const struct cpuset *root) +static void scan_for_empty_cpusets(struct cpuset *root) { + LIST_HEAD(queue); struct cpuset *cp; /* scans cpusets being updated */ struct cpuset *child; /* scans child cpusets of cp */ - struct list_head queue; struct cgroup *cont; nodemask_t oldmems; - INIT_LIST_HEAD(&queue); - list_add_tail((struct list_head *)&root->stack_list, &queue); while (!list_empty(&queue)) { - cp = container_of(queue.next, struct cpuset, stack_list); + cp = list_first_entry(&queue, struct cpuset, stack_list); list_del(queue.next); list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { child = cgroup_cs(cont); list_add_tail(&child->stack_list, &queue); } - cont = cp->css.cgroup; /* Continue past cpusets with all cpus, mems online */ if (cpus_subset(cp->cpus_allowed, cpu_online_map) && @@ -1871,42 +1958,13 @@ static void scan_for_empty_cpusets(const struct cpuset *root) nodes_empty(cp->mems_allowed)) remove_tasks_in_empty_cpuset(cp); else { - update_tasks_cpumask(cp); + update_tasks_cpumask(cp, NULL); update_tasks_nodemask(cp, &oldmems); } } } /* - * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track - * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to - * track what's online after any CPU or memory node hotplug or unplug event. - * - * Since there are two callers of this routine, one for CPU hotplug - * events and one for memory node hotplug events, we could have coded - * two separate routines here. We code it as a single common routine - * in order to minimize text size. - */ - -static void common_cpu_mem_hotplug_unplug(int rebuild_sd) -{ - cgroup_lock(); - - top_cpuset.cpus_allowed = cpu_online_map; - top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; - scan_for_empty_cpusets(&top_cpuset); - - /* - * Scheduler destroys domains on hotplug events. - * Rebuild them based on the current settings. - */ - if (rebuild_sd) - rebuild_sched_domains(); - - cgroup_unlock(); -} - -/* * The top_cpuset tracks what CPUs and Memory Nodes are online, * period. This is necessary in order to make cpusets transparent * (of no affect) on systems that are actively using CPU hotplug @@ -1914,40 +1972,52 @@ static void common_cpu_mem_hotplug_unplug(int rebuild_sd) * * This routine ensures that top_cpuset.cpus_allowed tracks * cpu_online_map on each CPU hotplug (cpuhp) event. + * + * Called within get_online_cpus(). Needs to call cgroup_lock() + * before calling generate_sched_domains(). */ - -static int cpuset_handle_cpuhp(struct notifier_block *unused_nb, +static int cpuset_track_online_cpus(struct notifier_block *unused_nb, unsigned long phase, void *unused_cpu) { + struct sched_domain_attr *attr; + cpumask_t *doms; + int ndoms; + switch (phase) { - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: case CPU_ONLINE: case CPU_ONLINE_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - common_cpu_mem_hotplug_unplug(1); break; + default: return NOTIFY_DONE; } + cgroup_lock(); + top_cpuset.cpus_allowed = cpu_online_map; + scan_for_empty_cpusets(&top_cpuset); + ndoms = generate_sched_domains(&doms, &attr); + cgroup_unlock(); + + /* Have scheduler rebuild the domains */ + partition_sched_domains(ndoms, doms, attr); + return NOTIFY_OK; } #ifdef CONFIG_MEMORY_HOTPLUG /* * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. - * Call this routine anytime after you change - * node_states[N_HIGH_MEMORY]. - * See also the previous routine cpuset_handle_cpuhp(). + * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. + * See also the previous routine cpuset_track_online_cpus(). */ - void cpuset_track_online_nodes(void) { - common_cpu_mem_hotplug_unplug(0); + cgroup_lock(); + top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; + scan_for_empty_cpusets(&top_cpuset); + cgroup_unlock(); } #endif @@ -1962,7 +2032,7 @@ void __init cpuset_init_smp(void) top_cpuset.cpus_allowed = cpu_online_map; top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; - hotcpu_notifier(cpuset_handle_cpuhp, 0); + hotcpu_notifier(cpuset_track_online_cpus, 0); } /** diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c new file mode 100644 index 000000000000..f013a0c2e111 --- /dev/null +++ b/kernel/dma-coherent.c @@ -0,0 +1,155 @@ +/* + * Coherent per-device memory handling. + * Borrowed from i386 + */ +#include <linux/kernel.h> +#include <linux/dma-mapping.h> + +struct dma_coherent_mem { + void *virt_base; + u32 device_base; + int size; + int flags; + unsigned long *bitmap; +}; + +int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, + dma_addr_t device_addr, size_t size, int flags) +{ + void __iomem *mem_base = NULL; + int pages = size >> PAGE_SHIFT; + int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); + + if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) + goto out; + if (!size) + goto out; + if (dev->dma_mem) + goto out; + + /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ + + mem_base = ioremap(bus_addr, size); + if (!mem_base) + goto out; + + dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); + if (!dev->dma_mem) + goto out; + dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); + if (!dev->dma_mem->bitmap) + goto free1_out; + + dev->dma_mem->virt_base = mem_base; + dev->dma_mem->device_base = device_addr; + dev->dma_mem->size = pages; + dev->dma_mem->flags = flags; + + if (flags & DMA_MEMORY_MAP) + return DMA_MEMORY_MAP; + + return DMA_MEMORY_IO; + + free1_out: + kfree(dev->dma_mem); + out: + if (mem_base) + iounmap(mem_base); + return 0; +} +EXPORT_SYMBOL(dma_declare_coherent_memory); + +void dma_release_declared_memory(struct device *dev) +{ + struct dma_coherent_mem *mem = dev->dma_mem; + + if (!mem) + return; + dev->dma_mem = NULL; + iounmap(mem->virt_base); + kfree(mem->bitmap); + kfree(mem); +} +EXPORT_SYMBOL(dma_release_declared_memory); + +void *dma_mark_declared_memory_occupied(struct device *dev, + dma_addr_t device_addr, size_t size) +{ + struct dma_coherent_mem *mem = dev->dma_mem; + int pos, err; + + size += device_addr & ~PAGE_MASK; + + if (!mem) + return ERR_PTR(-EINVAL); + + pos = (device_addr - mem->device_base) >> PAGE_SHIFT; + err = bitmap_allocate_region(mem->bitmap, pos, get_order(size)); + if (err != 0) + return ERR_PTR(err); + return mem->virt_base + (pos << PAGE_SHIFT); +} +EXPORT_SYMBOL(dma_mark_declared_memory_occupied); + +/** + * dma_alloc_from_coherent() - try to allocate memory from the per-device coherent area + * + * @dev: device from which we allocate memory + * @size: size of requested memory area + * @dma_handle: This will be filled with the correct dma handle + * @ret: This pointer will be filled with the virtual address + * to allocated area. + * + * This function should be only called from per-arch dma_alloc_coherent() + * to support allocation from per-device coherent memory pools. + * + * Returns 0 if dma_alloc_coherent should continue with allocating from + * generic memory areas, or !0 if dma_alloc_coherent should return @ret. + */ +int dma_alloc_from_coherent(struct device *dev, ssize_t size, + dma_addr_t *dma_handle, void **ret) +{ + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; + int order = get_order(size); + + if (mem) { + int page = bitmap_find_free_region(mem->bitmap, mem->size, + order); + if (page >= 0) { + *dma_handle = mem->device_base + (page << PAGE_SHIFT); + *ret = mem->virt_base + (page << PAGE_SHIFT); + memset(*ret, 0, size); + } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) + *ret = NULL; + } + return (mem != NULL); +} +EXPORT_SYMBOL(dma_alloc_from_coherent); + +/** + * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool + * @dev: device from which the memory was allocated + * @order: the order of pages allocated + * @vaddr: virtual address of allocated pages + * + * This checks whether the memory was allocated from the per-device + * coherent memory pool and if so, releases that memory. + * + * Returns 1 if we correctly released the memory, or 0 if + * dma_release_coherent() should proceed with releasing memory from + * generic pools. + */ +int dma_release_from_coherent(struct device *dev, int order, void *vaddr) +{ + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; + + if (mem && vaddr >= mem->virt_base && vaddr < + (mem->virt_base + (mem->size << PAGE_SHIFT))) { + int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; + + bitmap_release_region(mem->bitmap, page, order); + return 1; + } + return 0; +} +EXPORT_SYMBOL(dma_release_from_coherent); diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index c1ef192aa655..0d407e886735 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -168,7 +168,6 @@ __set_personality(u_long personality) current->personality = personality; oep = current_thread_info()->exec_domain; current_thread_info()->exec_domain = ep; - set_fs_altroot(); module_put(oep->module); return 0; diff --git a/kernel/exit.c b/kernel/exit.c index ad933bb29ec7..85a83c831856 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -46,6 +46,7 @@ #include <linux/resource.h> #include <linux/blkdev.h> #include <linux/task_io_accounting_ops.h> +#include <linux/tracehook.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -111,27 +112,16 @@ static void __exit_signal(struct task_struct *tsk) * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. */ - sig->utime = cputime_add(sig->utime, tsk->utime); - sig->stime = cputime_add(sig->stime, tsk->stime); - sig->gtime = cputime_add(sig->gtime, tsk->gtime); + sig->utime = cputime_add(sig->utime, task_utime(tsk)); + sig->stime = cputime_add(sig->stime, task_stime(tsk)); + sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; sig->nivcsw += tsk->nivcsw; sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); -#ifdef CONFIG_TASK_XACCT - sig->rchar += tsk->rchar; - sig->wchar += tsk->wchar; - sig->syscr += tsk->syscr; - sig->syscw += tsk->syscw; -#endif /* CONFIG_TASK_XACCT */ -#ifdef CONFIG_TASK_IO_ACCOUNTING - sig->ioac.read_bytes += tsk->ioac.read_bytes; - sig->ioac.write_bytes += tsk->ioac.write_bytes; - sig->ioac.cancelled_write_bytes += - tsk->ioac.cancelled_write_bytes; -#endif /* CONFIG_TASK_IO_ACCOUNTING */ + task_io_accounting_add(&sig->ioac, &tsk->ioac); sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. */ } @@ -162,27 +152,17 @@ static void delayed_put_task_struct(struct rcu_head *rhp) put_task_struct(container_of(rhp, struct task_struct, rcu)); } -/* - * Do final ptrace-related cleanup of a zombie being reaped. - * - * Called with write_lock(&tasklist_lock) held. - */ -static void ptrace_release_task(struct task_struct *p) -{ - BUG_ON(!list_empty(&p->ptraced)); - ptrace_unlink(p); - BUG_ON(!list_empty(&p->ptrace_entry)); -} void release_task(struct task_struct * p) { struct task_struct *leader; int zap_leader; repeat: + tracehook_prepare_release_task(p); atomic_dec(&p->user->processes); proc_flush_task(p); write_lock_irq(&tasklist_lock); - ptrace_release_task(p); + tracehook_finish_release_task(p); __exit_signal(p); /* @@ -204,6 +184,13 @@ repeat: * that case. */ zap_leader = task_detached(leader); + + /* + * This maintains the invariant that release_task() + * only runs on a task in EXIT_DEAD, just for sanity. + */ + if (zap_leader) + leader->exit_state = EXIT_DEAD; } write_unlock_irq(&tasklist_lock); @@ -567,8 +554,6 @@ void put_fs_struct(struct fs_struct *fs) if (atomic_dec_and_test(&fs->count)) { path_put(&fs->root); path_put(&fs->pwd); - if (fs->altroot.dentry) - path_put(&fs->altroot); kmem_cache_free(fs_cachep, fs); } } @@ -598,8 +583,6 @@ mm_need_new_owner(struct mm_struct *mm, struct task_struct *p) * If there are other users of the mm and the owner (us) is exiting * we need to find a new owner to take on the responsibility. */ - if (!mm) - return 0; if (atomic_read(&mm->mm_users) <= 1) return 0; if (mm->owner != p) @@ -642,6 +625,16 @@ retry: } while_each_thread(g, c); read_unlock(&tasklist_lock); + /* + * We found no owner yet mm_users > 1: this implies that we are + * most likely racing with swapoff (try_to_unuse()) or /proc or + * ptrace or page migration (get_task_mm()). Mark owner as NULL, + * so that subsystems can understand the callback and take action. + */ + down_write(&mm->mmap_sem); + cgroup_mm_owner_callbacks(mm->owner, NULL); + mm->owner = NULL; + up_write(&mm->mmap_sem); return; assign_new_owner: @@ -846,26 +839,50 @@ static void reparent_thread(struct task_struct *p, struct task_struct *father) * the child reaper process (ie "init") in our pid * space. */ +static struct task_struct *find_new_reaper(struct task_struct *father) +{ + struct pid_namespace *pid_ns = task_active_pid_ns(father); + struct task_struct *thread; + + thread = father; + while_each_thread(father, thread) { + if (thread->flags & PF_EXITING) + continue; + if (unlikely(pid_ns->child_reaper == father)) + pid_ns->child_reaper = thread; + return thread; + } + + if (unlikely(pid_ns->child_reaper == father)) { + write_unlock_irq(&tasklist_lock); + if (unlikely(pid_ns == &init_pid_ns)) + panic("Attempted to kill init!"); + + zap_pid_ns_processes(pid_ns); + write_lock_irq(&tasklist_lock); + /* + * We can not clear ->child_reaper or leave it alone. + * There may by stealth EXIT_DEAD tasks on ->children, + * forget_original_parent() must move them somewhere. + */ + pid_ns->child_reaper = init_pid_ns.child_reaper; + } + + return pid_ns->child_reaper; +} + static void forget_original_parent(struct task_struct *father) { - struct task_struct *p, *n, *reaper = father; + struct task_struct *p, *n, *reaper; LIST_HEAD(ptrace_dead); write_lock_irq(&tasklist_lock); - + reaper = find_new_reaper(father); /* * First clean up ptrace if we were using it. */ ptrace_exit(father, &ptrace_dead); - do { - reaper = next_thread(reaper); - if (reaper == father) { - reaper = task_child_reaper(father); - break; - } - } while (reaper->flags & PF_EXITING); - list_for_each_entry_safe(p, n, &father->children, sibling) { p->real_parent = reaper; if (p->parent == father) { @@ -887,7 +904,8 @@ static void forget_original_parent(struct task_struct *father) */ static void exit_notify(struct task_struct *tsk, int group_dead) { - int state; + int signal; + void *cookie; /* * This does two things: @@ -924,33 +942,24 @@ static void exit_notify(struct task_struct *tsk, int group_dead) !capable(CAP_KILL)) tsk->exit_signal = SIGCHLD; - /* If something other than our normal parent is ptracing us, then - * send it a SIGCHLD instead of honoring exit_signal. exit_signal - * only has special meaning to our real parent. - */ - if (!task_detached(tsk) && thread_group_empty(tsk)) { - int signal = ptrace_reparented(tsk) ? - SIGCHLD : tsk->exit_signal; - do_notify_parent(tsk, signal); - } else if (tsk->ptrace) { - do_notify_parent(tsk, SIGCHLD); - } + signal = tracehook_notify_death(tsk, &cookie, group_dead); + if (signal >= 0) + signal = do_notify_parent(tsk, signal); - state = EXIT_ZOMBIE; - if (task_detached(tsk) && likely(!tsk->ptrace)) - state = EXIT_DEAD; - tsk->exit_state = state; + tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; /* mt-exec, de_thread() is waiting for us */ if (thread_group_leader(tsk) && - tsk->signal->notify_count < 0 && - tsk->signal->group_exit_task) + tsk->signal->group_exit_task && + tsk->signal->notify_count < 0) wake_up_process(tsk->signal->group_exit_task); write_unlock_irq(&tasklist_lock); + tracehook_report_death(tsk, signal, cookie, group_dead); + /* If the process is dead, release it - nobody will wait for it */ - if (state == EXIT_DEAD) + if (signal == DEATH_REAP) release_task(tsk); } @@ -982,39 +991,6 @@ static void check_stack_usage(void) static inline void check_stack_usage(void) {} #endif -static inline void exit_child_reaper(struct task_struct *tsk) -{ - if (likely(tsk->group_leader != task_child_reaper(tsk))) - return; - - if (tsk->nsproxy->pid_ns == &init_pid_ns) - panic("Attempted to kill init!"); - - /* - * @tsk is the last thread in the 'cgroup-init' and is exiting. - * Terminate all remaining processes in the namespace and reap them - * before exiting @tsk. - * - * Note that @tsk (last thread of cgroup-init) may not necessarily - * be the child-reaper (i.e main thread of cgroup-init) of the - * namespace i.e the child_reaper may have already exited. - * - * Even after a child_reaper exits, we let it inherit orphaned children, - * because, pid_ns->child_reaper remains valid as long as there is - * at least one living sub-thread in the cgroup init. - - * This living sub-thread of the cgroup-init will be notified when - * a child inherited by the 'child-reaper' exits (do_notify_parent() - * uses __group_send_sig_info()). Further, when reaping child processes, - * do_wait() iterates over children of all living sub threads. - - * i.e even though 'child_reaper' thread is listed as the parent of the - * orphaned children, any living sub-thread in the cgroup-init can - * perform the role of the child_reaper. - */ - zap_pid_ns_processes(tsk->nsproxy->pid_ns); -} - NORET_TYPE void do_exit(long code) { struct task_struct *tsk = current; @@ -1029,10 +1005,7 @@ NORET_TYPE void do_exit(long code) if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); - if (unlikely(current->ptrace & PT_TRACE_EXIT)) { - current->ptrace_message = code; - ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP); - } + tracehook_report_exit(&code); /* * We're taking recursive faults here in do_exit. Safest is to just @@ -1077,7 +1050,6 @@ NORET_TYPE void do_exit(long code) } group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { - exit_child_reaper(tsk); hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); } @@ -1378,21 +1350,8 @@ static int wait_task_zombie(struct task_struct *p, int options, psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; -#ifdef CONFIG_TASK_XACCT - psig->rchar += p->rchar + sig->rchar; - psig->wchar += p->wchar + sig->wchar; - psig->syscr += p->syscr + sig->syscr; - psig->syscw += p->syscw + sig->syscw; -#endif /* CONFIG_TASK_XACCT */ -#ifdef CONFIG_TASK_IO_ACCOUNTING - psig->ioac.read_bytes += - p->ioac.read_bytes + sig->ioac.read_bytes; - psig->ioac.write_bytes += - p->ioac.write_bytes + sig->ioac.write_bytes; - psig->ioac.cancelled_write_bytes += - p->ioac.cancelled_write_bytes + - sig->ioac.cancelled_write_bytes; -#endif /* CONFIG_TASK_IO_ACCOUNTING */ + task_io_accounting_add(&psig->ioac, &p->ioac); + task_io_accounting_add(&psig->ioac, &sig->ioac); spin_unlock_irq(&p->parent->sighand->siglock); } diff --git a/kernel/fork.c b/kernel/fork.c index b99d73e971a4..7ce2ebe84796 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -27,6 +27,7 @@ #include <linux/key.h> #include <linux/binfmts.h> #include <linux/mman.h> +#include <linux/mmu_notifier.h> #include <linux/fs.h> #include <linux/nsproxy.h> #include <linux/capability.h> @@ -37,6 +38,7 @@ #include <linux/swap.h> #include <linux/syscalls.h> #include <linux/jiffies.h> +#include <linux/tracehook.h> #include <linux/futex.h> #include <linux/task_io_accounting_ops.h> #include <linux/rcupdate.h> @@ -413,6 +415,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; + mmu_notifier_mm_init(mm); return mm; } @@ -445,6 +448,7 @@ void __mmdrop(struct mm_struct *mm) BUG_ON(mm == &init_mm); mm_free_pgd(mm); destroy_context(mm); + mmu_notifier_mm_destroy(mm); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -656,13 +660,6 @@ static struct fs_struct *__copy_fs_struct(struct fs_struct *old) path_get(&old->root); fs->pwd = old->pwd; path_get(&old->pwd); - if (old->altroot.dentry) { - fs->altroot = old->altroot; - path_get(&old->altroot); - } else { - fs->altroot.mnt = NULL; - fs->altroot.dentry = NULL; - } read_unlock(&old->lock); } return fs; @@ -812,12 +809,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; -#ifdef CONFIG_TASK_XACCT - sig->rchar = sig->wchar = sig->syscr = sig->syscw = 0; -#endif -#ifdef CONFIG_TASK_IO_ACCOUNTING - memset(&sig->ioac, 0, sizeof(sig->ioac)); -#endif + task_io_accounting_init(&sig->ioac); sig->sum_sched_runtime = 0; INIT_LIST_HEAD(&sig->cpu_timers[0]); INIT_LIST_HEAD(&sig->cpu_timers[1]); @@ -865,8 +857,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) new_flags &= ~PF_SUPERPRIV; new_flags |= PF_FORKNOEXEC; - if (!(clone_flags & CLONE_PTRACE)) - p->ptrace = 0; + new_flags |= PF_STARTING; p->flags = new_flags; clear_freeze_flag(p); } @@ -907,7 +898,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, struct pt_regs *regs, unsigned long stack_size, int __user *child_tidptr, - struct pid *pid) + struct pid *pid, + int trace) { int retval; struct task_struct *p; @@ -1000,13 +992,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->last_switch_timestamp = 0; #endif -#ifdef CONFIG_TASK_XACCT - p->rchar = 0; /* I/O counter: bytes read */ - p->wchar = 0; /* I/O counter: bytes written */ - p->syscr = 0; /* I/O counter: read syscalls */ - p->syscw = 0; /* I/O counter: write syscalls */ -#endif - task_io_accounting_init(p); + task_io_accounting_init(&p->ioac); acct_clear_integrals(p); p->it_virt_expires = cputime_zero; @@ -1163,8 +1149,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, */ p->group_leader = p; INIT_LIST_HEAD(&p->thread_group); - INIT_LIST_HEAD(&p->ptrace_entry); - INIT_LIST_HEAD(&p->ptraced); /* Now that the task is set up, run cgroup callbacks if * necessary. We need to run them before the task is visible @@ -1195,7 +1179,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->real_parent = current->real_parent; else p->real_parent = current; - p->parent = p->real_parent; spin_lock(¤t->sighand->siglock); @@ -1237,8 +1220,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (likely(p->pid)) { list_add_tail(&p->sibling, &p->real_parent->children); - if (unlikely(p->ptrace & PT_PTRACED)) - __ptrace_link(p, current->parent); + tracehook_finish_clone(p, clone_flags, trace); if (thread_group_leader(p)) { if (clone_flags & CLONE_NEWPID) @@ -1323,29 +1305,13 @@ struct task_struct * __cpuinit fork_idle(int cpu) struct pt_regs regs; task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, - &init_struct_pid); + &init_struct_pid, 0); if (!IS_ERR(task)) init_idle(task, cpu); return task; } -static int fork_traceflag(unsigned clone_flags) -{ - if (clone_flags & CLONE_UNTRACED) - return 0; - else if (clone_flags & CLONE_VFORK) { - if (current->ptrace & PT_TRACE_VFORK) - return PTRACE_EVENT_VFORK; - } else if ((clone_flags & CSIGNAL) != SIGCHLD) { - if (current->ptrace & PT_TRACE_CLONE) - return PTRACE_EVENT_CLONE; - } else if (current->ptrace & PT_TRACE_FORK) - return PTRACE_EVENT_FORK; - - return 0; -} - /* * Ok, this is the main fork-routine. * @@ -1380,14 +1346,14 @@ long do_fork(unsigned long clone_flags, } } - if (unlikely(current->ptrace)) { - trace = fork_traceflag (clone_flags); - if (trace) - clone_flags |= CLONE_PTRACE; - } + /* + * When called from kernel_thread, don't do user tracing stuff. + */ + if (likely(user_mode(regs))) + trace = tracehook_prepare_clone(clone_flags); p = copy_process(clone_flags, stack_start, regs, stack_size, - child_tidptr, NULL); + child_tidptr, NULL, trace); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. @@ -1405,32 +1371,35 @@ long do_fork(unsigned long clone_flags, init_completion(&vfork); } - if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) { + tracehook_report_clone(trace, regs, clone_flags, nr, p); + + /* + * We set PF_STARTING at creation in case tracing wants to + * use this to distinguish a fully live task from one that + * hasn't gotten to tracehook_report_clone() yet. Now we + * clear it and set the child going. + */ + p->flags &= ~PF_STARTING; + + if (unlikely(clone_flags & CLONE_STOPPED)) { /* * We'll start up with an immediate SIGSTOP. */ sigaddset(&p->pending.signal, SIGSTOP); set_tsk_thread_flag(p, TIF_SIGPENDING); - } - - if (!(clone_flags & CLONE_STOPPED)) - wake_up_new_task(p, clone_flags); - else __set_task_state(p, TASK_STOPPED); - - if (unlikely (trace)) { - current->ptrace_message = nr; - ptrace_notify ((trace << 8) | SIGTRAP); + } else { + wake_up_new_task(p, clone_flags); } + tracehook_report_clone_complete(trace, regs, + clone_flags, nr, p); + if (clone_flags & CLONE_VFORK) { freezer_do_not_count(); wait_for_completion(&vfork); freezer_count(); - if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) { - current->ptrace_message = nr; - ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); - } + tracehook_report_vfork_done(p, nr); } } else { nr = PTR_ERR(p); @@ -1442,7 +1411,7 @@ long do_fork(unsigned long clone_flags, #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif -static void sighand_ctor(struct kmem_cache *cachep, void *data) +static void sighand_ctor(void *data) { struct sighand_struct *sighand = data; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index b8e4dce80a74..cdec83e722fa 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -672,13 +672,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, */ BUG_ON(timer->function(timer) != HRTIMER_NORESTART); return 1; - case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ: + case HRTIMER_CB_IRQSAFE_PERCPU: + case HRTIMER_CB_IRQSAFE_UNLOCKED: /* * This is solely for the sched tick emulation with * dynamic tick support to ensure that we do not * restart the tick right on the edge and end up with * the tick timer in the softirq ! The calling site - * takes care of this. + * takes care of this. Also used for hrtimer sleeper ! */ debug_hrtimer_deactivate(timer); return 1; @@ -1245,7 +1246,8 @@ static void __run_hrtimer(struct hrtimer *timer) timer_stats_account_hrtimer(timer); fn = timer->function; - if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) { + if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU || + timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) { /* * Used for scheduler timers, avoid lock inversion with * rq->lock and tasklist_lock. @@ -1452,7 +1454,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) sl->timer.function = hrtimer_wakeup; sl->task = task; #ifdef CONFIG_HIGH_RES_TIMERS - sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; #endif } @@ -1591,29 +1593,95 @@ static void __cpuinit init_hrtimers_cpu(int cpu) #ifdef CONFIG_HOTPLUG_CPU -static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, - struct hrtimer_clock_base *new_base) +static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, + struct hrtimer_clock_base *new_base, int dcpu) { struct hrtimer *timer; struct rb_node *node; + int raise = 0; while ((node = rb_first(&old_base->active))) { timer = rb_entry(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); debug_hrtimer_deactivate(timer); - __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0); + + /* + * Should not happen. Per CPU timers should be + * canceled _before_ the migration code is called + */ + if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) { + __remove_hrtimer(timer, old_base, + HRTIMER_STATE_INACTIVE, 0); + WARN(1, "hrtimer (%p %p)active but cpu %d dead\n", + timer, timer->function, dcpu); + continue; + } + + /* + * Mark it as STATE_MIGRATE not INACTIVE otherwise the + * timer could be seen as !active and just vanish away + * under us on another CPU + */ + __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); timer->base = new_base; /* * Enqueue the timer. Allow reprogramming of the event device */ enqueue_hrtimer(timer, new_base, 1); + +#ifdef CONFIG_HIGH_RES_TIMERS + /* + * Happens with high res enabled when the timer was + * already expired and the callback mode is + * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The + * enqueue code does not move them to the soft irq + * pending list for performance/latency reasons, but + * in the migration state, we need to do that + * otherwise we end up with a stale timer. + */ + if (timer->state == HRTIMER_STATE_MIGRATE) { + timer->state = HRTIMER_STATE_PENDING; + list_add_tail(&timer->cb_entry, + &new_base->cpu_base->cb_pending); + raise = 1; + } +#endif + /* Clear the migration state bit */ + timer->state &= ~HRTIMER_STATE_MIGRATE; + } + return raise; +} + +#ifdef CONFIG_HIGH_RES_TIMERS +static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base, + struct hrtimer_cpu_base *new_base) +{ + struct hrtimer *timer; + int raise = 0; + + while (!list_empty(&old_base->cb_pending)) { + timer = list_entry(old_base->cb_pending.next, + struct hrtimer, cb_entry); + + __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0); + timer->base = &new_base->clock_base[timer->base->index]; + list_add_tail(&timer->cb_entry, &new_base->cb_pending); + raise = 1; } + return raise; +} +#else +static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base, + struct hrtimer_cpu_base *new_base) +{ + return 0; } +#endif static void migrate_hrtimers(int cpu) { struct hrtimer_cpu_base *old_base, *new_base; - int i; + int i, raise = 0; BUG_ON(cpu_online(cpu)); old_base = &per_cpu(hrtimer_bases, cpu); @@ -1626,14 +1694,21 @@ static void migrate_hrtimers(int cpu) spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - migrate_hrtimer_list(&old_base->clock_base[i], - &new_base->clock_base[i]); + if (migrate_hrtimer_list(&old_base->clock_base[i], + &new_base->clock_base[i], cpu)) + raise = 1; } + if (migrate_hrtimer_pending(old_base, new_base)) + raise = 1; + spin_unlock(&old_base->lock); spin_unlock(&new_base->lock); local_irq_enable(); put_cpu_var(hrtimer_bases); + + if (raise) + hrtimer_raise_softirq(); } #endif /* CONFIG_HOTPLUG_CPU */ diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 964964baefa2..3cd441ebf5d2 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -28,8 +28,7 @@ void dynamic_irq_init(unsigned int irq) unsigned long flags; if (irq >= NR_IRQS) { - printk(KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); - WARN_ON(1); + WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); return; } @@ -62,8 +61,7 @@ void dynamic_irq_cleanup(unsigned int irq) unsigned long flags; if (irq >= NR_IRQS) { - printk(KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); - WARN_ON(1); + WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); return; } @@ -71,9 +69,8 @@ void dynamic_irq_cleanup(unsigned int irq) spin_lock_irqsave(&desc->lock, flags); if (desc->action) { spin_unlock_irqrestore(&desc->lock, flags); - printk(KERN_ERR "Destroying IRQ%d without calling free_irq\n", + WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n", irq); - WARN_ON(1); return; } desc->msi_desc = NULL; @@ -96,8 +93,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) unsigned long flags; if (irq >= NR_IRQS) { - printk(KERN_ERR "Trying to install chip for IRQ%d\n", irq); - WARN_ON(1); + WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq); return -EINVAL; } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f8914b92b664..60c49e324390 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -89,7 +89,14 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) set_balance_irq_affinity(irq, cpumask); #ifdef CONFIG_GENERIC_PENDING_IRQ - set_pending_irq(irq, cpumask); + if (desc->status & IRQ_MOVE_PCNTXT) { + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + desc->chip->set_affinity(irq, cpumask); + spin_unlock_irqrestore(&desc->lock, flags); + } else + set_pending_irq(irq, cpumask); #else desc->affinity = cpumask; desc->chip->set_affinity(irq, cpumask); @@ -177,8 +184,7 @@ static void __enable_irq(struct irq_desc *desc, unsigned int irq) { switch (desc->depth) { case 0: - printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); - WARN_ON(1); + WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); break; case 1: { unsigned int status = desc->status & ~IRQ_DISABLED; @@ -324,7 +330,8 @@ static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq, ret = chip->set_type(irq, flags & IRQF_TRIGGER_MASK); if (ret) - pr_err("setting flow type for irq %u failed (%pF)\n", + pr_err("setting trigger mode %d for irq %u failed (%pF)\n", + (int)(flags & IRQF_TRIGGER_MASK), irq, chip->set_type); return ret; diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 6c6d35d68ee9..a09dd29c2fd7 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -8,6 +8,7 @@ #include <linux/irq.h> #include <linux/proc_fs.h> +#include <linux/seq_file.h> #include <linux/interrupt.h> #include "internals.h" @@ -16,23 +17,18 @@ static struct proc_dir_entry *root_irq_dir; #ifdef CONFIG_SMP -static int irq_affinity_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int irq_affinity_proc_show(struct seq_file *m, void *v) { - struct irq_desc *desc = irq_desc + (long)data; + struct irq_desc *desc = irq_desc + (long)m->private; cpumask_t *mask = &desc->affinity; - int len; #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PENDING) mask = &desc->pending_mask; #endif - len = cpumask_scnprintf(page, count, *mask); - - if (count - len < 2) - return -EINVAL; - len += sprintf(page + len, "\n"); - return len; + seq_cpumask(m, mask); + seq_putc(m, '\n'); + return 0; } #ifndef is_affinity_mask_valid @@ -40,11 +36,12 @@ static int irq_affinity_read_proc(char *page, char **start, off_t off, #endif int no_irq_affinity; -static int irq_affinity_write_proc(struct file *file, const char __user *buffer, - unsigned long count, void *data) +static ssize_t irq_affinity_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *pos) { - unsigned int irq = (int)(long)data, full_count = count, err; + unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; cpumask_t new_value; + int err; if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || irq_balancing_disabled(irq)) @@ -65,28 +62,38 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, if (!cpus_intersects(new_value, cpu_online_map)) /* Special case for empty set - allow the architecture code to set default SMP affinity. */ - return irq_select_affinity(irq) ? -EINVAL : full_count; + return irq_select_affinity(irq) ? -EINVAL : count; irq_set_affinity(irq, new_value); - return full_count; + return count; } -static int default_affinity_read(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int irq_affinity_proc_open(struct inode *inode, struct file *file) { - int len = cpumask_scnprintf(page, count, irq_default_affinity); - if (count - len < 2) - return -EINVAL; - len += sprintf(page + len, "\n"); - return len; + return single_open(file, irq_affinity_proc_show, PDE(inode)->data); } -static int default_affinity_write(struct file *file, const char __user *buffer, - unsigned long count, void *data) +static const struct file_operations irq_affinity_proc_fops = { + .open = irq_affinity_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = irq_affinity_proc_write, +}; + +static int default_affinity_show(struct seq_file *m, void *v) +{ + seq_cpumask(m, &irq_default_affinity); + seq_putc(m, '\n'); + return 0; +} + +static ssize_t default_affinity_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) { - unsigned int full_count = count, err; cpumask_t new_value; + int err; err = cpumask_parse_user(buffer, count, new_value); if (err) @@ -105,8 +112,21 @@ static int default_affinity_write(struct file *file, const char __user *buffer, irq_default_affinity = new_value; - return full_count; + return count; } + +static int default_affinity_open(struct inode *inode, struct file *file) +{ + return single_open(file, default_affinity_show, NULL); +} + +static const struct file_operations default_affinity_proc_fops = { + .open = default_affinity_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = default_affinity_write, +}; #endif static int irq_spurious_read(char *page, char **start, off_t off, @@ -178,16 +198,9 @@ void register_irq_proc(unsigned int irq) irq_desc[irq].dir = proc_mkdir(name, root_irq_dir); #ifdef CONFIG_SMP - { - /* create /proc/irq/<irq>/smp_affinity */ - entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir); - - if (entry) { - entry->data = (void *)(long)irq; - entry->read_proc = irq_affinity_read_proc; - entry->write_proc = irq_affinity_write_proc; - } - } + /* create /proc/irq/<irq>/smp_affinity */ + proc_create_data("smp_affinity", 0600, irq_desc[irq].dir, + &irq_affinity_proc_fops, (void *)(long)irq); #endif entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir); @@ -208,15 +221,8 @@ void unregister_handler_proc(unsigned int irq, struct irqaction *action) void register_default_affinity_proc(void) { #ifdef CONFIG_SMP - struct proc_dir_entry *entry; - - /* create /proc/irq/default_smp_affinity */ - entry = create_proc_entry("default_smp_affinity", 0600, root_irq_dir); - if (entry) { - entry->data = NULL; - entry->read_proc = default_affinity_read; - entry->write_proc = default_affinity_write; - } + proc_create("irq/default_smp_affinity", 0600, NULL, + &default_affinity_proc_fops); #endif } diff --git a/kernel/kexec.c b/kernel/kexec.c index 1c5fcacbcf33..aef265325cd3 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -12,7 +12,7 @@ #include <linux/slab.h> #include <linux/fs.h> #include <linux/kexec.h> -#include <linux/spinlock.h> +#include <linux/mutex.h> #include <linux/list.h> #include <linux/highmem.h> #include <linux/syscalls.h> @@ -24,6 +24,12 @@ #include <linux/utsrelease.h> #include <linux/utsname.h> #include <linux/numa.h> +#include <linux/suspend.h> +#include <linux/device.h> +#include <linux/freezer.h> +#include <linux/pm.h> +#include <linux/cpu.h> +#include <linux/console.h> #include <asm/page.h> #include <asm/uaccess.h> @@ -71,7 +77,7 @@ int kexec_should_crash(struct task_struct *p) * * The code for the transition from the current kernel to the * the new kernel is placed in the control_code_buffer, whose size - * is given by KEXEC_CONTROL_CODE_SIZE. In the best case only a single + * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single * page of memory is necessary, but some architectures require more. * Because this memory must be identity mapped in the transition from * virtual to physical addresses it must live in the range @@ -236,12 +242,18 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, */ result = -ENOMEM; image->control_code_page = kimage_alloc_control_pages(image, - get_order(KEXEC_CONTROL_CODE_SIZE)); + get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { printk(KERN_ERR "Could not allocate control_code_buffer\n"); goto out; } + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + printk(KERN_ERR "Could not allocate swap buffer\n"); + goto out; + } + result = 0; out: if (result == 0) @@ -305,7 +317,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, */ result = -ENOMEM; image->control_code_page = kimage_alloc_control_pages(image, - get_order(KEXEC_CONTROL_CODE_SIZE)); + get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { printk(KERN_ERR "Could not allocate control_code_buffer\n"); goto out; @@ -589,14 +601,12 @@ static void kimage_free_extra_pages(struct kimage *image) kimage_free_page_list(&image->unuseable_pages); } -static int kimage_terminate(struct kimage *image) +static void kimage_terminate(struct kimage *image) { if (*image->entry != 0) image->entry++; *image->entry = IND_DONE; - - return 0; } #define for_each_kimage_entry(image, ptr, entry) \ @@ -743,8 +753,14 @@ static struct page *kimage_alloc_page(struct kimage *image, *old = addr | (*old & ~PAGE_MASK); /* The old page I have found cannot be a - * destination page, so return it. + * destination page, so return it if it's + * gfp_flags honor the ones passed in. */ + if (!(gfp_mask & __GFP_HIGHMEM) && + PageHighMem(old_page)) { + kimage_free_pages(old_page); + continue; + } addr = old_addr; page = old_page; break; @@ -914,19 +930,14 @@ static int kimage_load_segment(struct kimage *image, */ struct kimage *kexec_image; struct kimage *kexec_crash_image; -/* - * A home grown binary mutex. - * Nothing can wait so this mutex is safe to use - * in interrupt context :) - */ -static int kexec_lock; + +static DEFINE_MUTEX(kexec_mutex); asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, struct kexec_segment __user *segments, unsigned long flags) { struct kimage **dest_image, *image; - int locked; int result; /* We only trust the superuser with rebooting the system. */ @@ -962,8 +973,7 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, * * KISS: always take the mutex. */ - locked = xchg(&kexec_lock, 1); - if (locked) + if (!mutex_trylock(&kexec_mutex)) return -EBUSY; dest_image = &kexec_image; @@ -988,6 +998,8 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, if (result) goto out; + if (flags & KEXEC_PRESERVE_CONTEXT) + image->preserve_context = 1; result = machine_kexec_prepare(image); if (result) goto out; @@ -997,16 +1009,13 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, if (result) goto out; } - result = kimage_terminate(image); - if (result) - goto out; + kimage_terminate(image); } /* Install the new kernel, and Uninstall the old */ image = xchg(dest_image, image); out: - locked = xchg(&kexec_lock, 0); /* Release the mutex */ - BUG_ON(!locked); + mutex_unlock(&kexec_mutex); kimage_free(image); return result; @@ -1053,10 +1062,7 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry, void crash_kexec(struct pt_regs *regs) { - int locked; - - - /* Take the kexec_lock here to prevent sys_kexec_load + /* Take the kexec_mutex here to prevent sys_kexec_load * running on one cpu from replacing the crash kernel * we are using after a panic on a different cpu. * @@ -1064,8 +1070,7 @@ void crash_kexec(struct pt_regs *regs) * of memory the xchg(&kexec_crash_image) would be * sufficient. But since I reuse the memory... */ - locked = xchg(&kexec_lock, 1); - if (!locked) { + if (mutex_trylock(&kexec_mutex)) { if (kexec_crash_image) { struct pt_regs fixed_regs; crash_setup_regs(&fixed_regs, regs); @@ -1073,8 +1078,7 @@ void crash_kexec(struct pt_regs *regs) machine_crash_shutdown(&fixed_regs); machine_kexec(kexec_crash_image); } - locked = xchg(&kexec_lock, 0); - BUG_ON(!locked); + mutex_unlock(&kexec_mutex); } } @@ -1415,3 +1419,79 @@ static int __init crash_save_vmcoreinfo_init(void) } module_init(crash_save_vmcoreinfo_init) + +/* + * Move into place and start executing a preloaded standalone + * executable. If nothing was preloaded return an error. + */ +int kernel_kexec(void) +{ + int error = 0; + + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; + if (!kexec_image) { + error = -EINVAL; + goto Unlock; + } + +#ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) { + mutex_lock(&pm_mutex); + pm_prepare_console(); + error = freeze_processes(); + if (error) { + error = -EBUSY; + goto Restore_console; + } + suspend_console(); + error = device_suspend(PMSG_FREEZE); + if (error) + goto Resume_console; + error = disable_nonboot_cpus(); + if (error) + goto Resume_devices; + device_pm_lock(); + local_irq_disable(); + /* At this point, device_suspend() has been called, + * but *not* device_power_down(). We *must* + * device_power_down() now. Otherwise, drivers for + * some devices (e.g. interrupt controllers) become + * desynchronized with the actual state of the + * hardware at resume time, and evil weirdness ensues. + */ + error = device_power_down(PMSG_FREEZE); + if (error) + goto Enable_irqs; + } else +#endif + { + kernel_restart_prepare(NULL); + printk(KERN_EMERG "Starting new kernel\n"); + machine_shutdown(); + } + + machine_kexec(kexec_image); + +#ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) { + device_power_up(PMSG_RESTORE); + Enable_irqs: + local_irq_enable(); + device_pm_unlock(); + enable_nonboot_cpus(); + Resume_devices: + device_resume(PMSG_RESTORE); + Resume_console: + resume_console(); + thaw_processes(); + Restore_console: + pm_restore_console(); + mutex_unlock(&pm_mutex); + } +#endif + + Unlock: + mutex_unlock(&kexec_mutex); + return error; +} diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 3ec23c3ec97f..e4dcfb2272a4 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -56,12 +56,14 @@ static int kgdb_break_asap; +#define KGDB_MAX_THREAD_QUERY 17 struct kgdb_state { int ex_vector; int signo; int err_code; int cpu; int pass_exception; + unsigned long thr_query; unsigned long threadid; long kgdb_usethreadid; struct pt_regs *linux_regs; @@ -166,13 +168,6 @@ early_param("nokgdbroundup", opt_nokgdbroundup); * Weak aliases for breakpoint management, * can be overriden by architectures when needed: */ -int __weak kgdb_validate_break_address(unsigned long addr) -{ - char tmp_variable[BREAK_INSTR_SIZE]; - - return probe_kernel_read(tmp_variable, (char *)addr, BREAK_INSTR_SIZE); -} - int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) { int err; @@ -191,6 +186,25 @@ int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle) (char *)bundle, BREAK_INSTR_SIZE); } +int __weak kgdb_validate_break_address(unsigned long addr) +{ + char tmp_variable[BREAK_INSTR_SIZE]; + int err; + /* Validate setting the breakpoint and then removing it. In the + * remove fails, the kernel needs to emit a bad message because we + * are deep trouble not being able to put things back the way we + * found them. + */ + err = kgdb_arch_set_breakpoint(addr, tmp_variable); + if (err) + return err; + err = kgdb_arch_remove_breakpoint(addr, tmp_variable); + if (err) + printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " + "memory destroyed at: %lx", addr); + return err; +} + unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs) { return instruction_pointer(regs); @@ -433,9 +447,14 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val) { int hex_val; int num = 0; + int negate = 0; *long_val = 0; + if (**ptr == '-') { + negate = 1; + (*ptr)++; + } while (**ptr) { hex_val = hex(**ptr); if (hex_val < 0) @@ -446,6 +465,9 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val) (*ptr)++; } + if (negate) + *long_val = -*long_val; + return num; } @@ -466,7 +488,7 @@ static int write_mem_msg(int binary) if (err) return err; if (CACHE_FLUSH_IS_SAFE) - flush_icache_range(addr, addr + length + 1); + flush_icache_range(addr, addr + length); return 0; } @@ -515,10 +537,16 @@ static void int_to_threadref(unsigned char *id, int value) static struct task_struct *getthread(struct pt_regs *regs, int tid) { /* - * Non-positive TIDs are remapped idle tasks: + * Non-positive TIDs are remapped to the cpu shadow information */ - if (tid <= 0) - return idle_task(-tid); + if (tid == 0 || tid == -1) + tid = -atomic_read(&kgdb_active) - 2; + if (tid < 0) { + if (kgdb_info[-tid - 2].task) + return kgdb_info[-tid - 2].task; + else + return idle_task(-tid - 2); + } /* * find_task_by_pid_ns() does not take the tasklist lock anymore @@ -562,6 +590,7 @@ static void kgdb_wait(struct pt_regs *regs) /* Signal the primary CPU that we are done: */ atomic_set(&cpu_in_kgdb[cpu], 0); + touch_softlockup_watchdog(); clocksource_touch_watchdog(); local_irq_restore(flags); } @@ -725,14 +754,15 @@ setundefined: } /* - * Remap normal tasks to their real PID, idle tasks to -1 ... -NR_CPUs: + * Remap normal tasks to their real PID, + * CPU shadow threads are mapped to -CPU - 2 */ static inline int shadow_pid(int realpid) { if (realpid) return realpid; - return -1-raw_smp_processor_id(); + return -raw_smp_processor_id() - 2; } static char gdbmsgbuf[BUFMAX + 1]; @@ -826,7 +856,7 @@ static void gdb_cmd_getregs(struct kgdb_state *ks) local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo; } else { local_debuggerinfo = NULL; - for (i = 0; i < NR_CPUS; i++) { + for_each_online_cpu(i) { /* * Try to find the task on some other * or possibly this node if we do not @@ -960,10 +990,13 @@ static int gdb_cmd_reboot(struct kgdb_state *ks) /* Handle the 'q' query packets */ static void gdb_cmd_query(struct kgdb_state *ks) { - struct task_struct *thread; + struct task_struct *g; + struct task_struct *p; unsigned char thref[8]; char *ptr; int i; + int cpu; + int finished = 0; switch (remcom_in_buffer[1]) { case 's': @@ -973,22 +1006,34 @@ static void gdb_cmd_query(struct kgdb_state *ks) break; } - if (remcom_in_buffer[1] == 'f') - ks->threadid = 1; - + i = 0; remcom_out_buffer[0] = 'm'; ptr = remcom_out_buffer + 1; - - for (i = 0; i < 17; ks->threadid++) { - thread = getthread(ks->linux_regs, ks->threadid); - if (thread) { - int_to_threadref(thref, ks->threadid); + if (remcom_in_buffer[1] == 'f') { + /* Each cpu is a shadow thread */ + for_each_online_cpu(cpu) { + ks->thr_query = 0; + int_to_threadref(thref, -cpu - 2); pack_threadid(ptr, thref); ptr += BUF_THREAD_ID_SIZE; *(ptr++) = ','; i++; } } + + do_each_thread(g, p) { + if (i >= ks->thr_query && !finished) { + int_to_threadref(thref, p->pid); + pack_threadid(ptr, thref); + ptr += BUF_THREAD_ID_SIZE; + *(ptr++) = ','; + ks->thr_query++; + if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0) + finished = 1; + } + i++; + } while_each_thread(g, p); + *(--ptr) = '\0'; break; @@ -1011,15 +1056,15 @@ static void gdb_cmd_query(struct kgdb_state *ks) error_packet(remcom_out_buffer, -EINVAL); break; } - if (ks->threadid > 0) { + if ((int)ks->threadid > 0) { kgdb_mem2hex(getthread(ks->linux_regs, ks->threadid)->comm, remcom_out_buffer, 16); } else { static char tmpstr[23 + BUF_THREAD_ID_SIZE]; - sprintf(tmpstr, "Shadow task %d for pid 0", - (int)(-ks->threadid-1)); + sprintf(tmpstr, "shadowCPU%d", + (int)(-ks->threadid - 2)); kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr)); } break; @@ -1388,6 +1433,7 @@ acquirelock: atomic_read(&kgdb_cpu_doing_single_step) != cpu) { atomic_set(&kgdb_active, -1); + touch_softlockup_watchdog(); clocksource_touch_watchdog(); local_irq_restore(flags); @@ -1418,7 +1464,7 @@ acquirelock: * Get the passive CPU lock which will hold all the non-primary * CPU in a spin state while the debugger is active */ - if (!kgdb_single_step || !kgdb_contthread) { + if (!kgdb_single_step) { for (i = 0; i < NR_CPUS; i++) atomic_set(&passive_cpu_wait[i], 1); } @@ -1431,7 +1477,7 @@ acquirelock: #ifdef CONFIG_SMP /* Signal the other CPUs to enter kgdb_wait() */ - if ((!kgdb_single_step || !kgdb_contthread) && kgdb_do_roundup) + if ((!kgdb_single_step) && kgdb_do_roundup) kgdb_roundup_cpus(flags); #endif @@ -1450,7 +1496,7 @@ acquirelock: kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code); kgdb_deactivate_sw_breakpoints(); kgdb_single_step = 0; - kgdb_contthread = NULL; + kgdb_contthread = current; exception_level = 0; /* Talk to debugger with gdbserial protocol */ @@ -1464,7 +1510,7 @@ acquirelock: kgdb_info[ks->cpu].task = NULL; atomic_set(&cpu_in_kgdb[ks->cpu], 0); - if (!kgdb_single_step || !kgdb_contthread) { + if (!kgdb_single_step) { for (i = NR_CPUS-1; i >= 0; i--) atomic_set(&passive_cpu_wait[i], 0); /* @@ -1480,6 +1526,7 @@ acquirelock: kgdb_restore: /* Free kgdb_active */ atomic_set(&kgdb_active, -1); + touch_softlockup_watchdog(); clocksource_touch_watchdog(); local_irq_restore(flags); diff --git a/kernel/kthread.c b/kernel/kthread.c index 6111c27491b1..96cff2f8710b 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -176,7 +176,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu) return; } /* Must have done schedule() in kthread() before we set_task_cpu */ - wait_task_inactive(k); + wait_task_inactive(k, 0); set_task_cpu(k, cpu); k->cpus_allowed = cpumask_of_cpu(cpu); k->rt.nr_cpus_allowed = 1; diff --git a/kernel/lockdep.c b/kernel/lockdep.c index d38a64362973..dbda475b13bd 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -124,6 +124,15 @@ static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; unsigned long nr_lock_classes; static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; +static inline struct lock_class *hlock_class(struct held_lock *hlock) +{ + if (!hlock->class_idx) { + DEBUG_LOCKS_WARN_ON(1); + return NULL; + } + return lock_classes + hlock->class_idx - 1; +} + #ifdef CONFIG_LOCK_STAT static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); @@ -222,7 +231,7 @@ static void lock_release_holdtime(struct held_lock *hlock) holdtime = sched_clock() - hlock->holdtime_stamp; - stats = get_lock_stats(hlock->class); + stats = get_lock_stats(hlock_class(hlock)); if (hlock->read) lock_time_inc(&stats->read_holdtime, holdtime); else @@ -372,6 +381,19 @@ unsigned int nr_process_chains; unsigned int max_lockdep_depth; unsigned int max_recursion_depth; +static unsigned int lockdep_dependency_gen_id; + +static bool lockdep_dependency_visit(struct lock_class *source, + unsigned int depth) +{ + if (!depth) + lockdep_dependency_gen_id++; + if (source->dep_gen_id == lockdep_dependency_gen_id) + return true; + source->dep_gen_id = lockdep_dependency_gen_id; + return false; +} + #ifdef CONFIG_DEBUG_LOCKDEP /* * We cannot printk in early bootup code. Not even early_printk() @@ -505,7 +527,7 @@ static void print_lockdep_cache(struct lockdep_map *lock) static void print_lock(struct held_lock *hlock) { - print_lock_name(hlock->class); + print_lock_name(hlock_class(hlock)); printk(", at: "); print_ip_sym(hlock->acquire_ip); } @@ -558,6 +580,9 @@ static void print_lock_dependencies(struct lock_class *class, int depth) { struct lock_list *entry; + if (lockdep_dependency_visit(class, depth)) + return; + if (DEBUG_LOCKS_WARN_ON(depth >= 20)) return; @@ -850,11 +875,11 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, if (!entry) return 0; - entry->class = this; - entry->distance = distance; if (!save_trace(&entry->trace)) return 0; + entry->class = this; + entry->distance = distance; /* * Since we never remove from the dependency list, the list can * be walked lockless by other CPUs, it's only allocation @@ -932,7 +957,7 @@ static noinline int print_circular_bug_tail(void) if (debug_locks_silent) return 0; - this.class = check_source->class; + this.class = hlock_class(check_source); if (!save_trace(&this.trace)) return 0; @@ -959,6 +984,67 @@ static int noinline print_infinite_recursion_bug(void) return 0; } +unsigned long __lockdep_count_forward_deps(struct lock_class *class, + unsigned int depth) +{ + struct lock_list *entry; + unsigned long ret = 1; + + if (lockdep_dependency_visit(class, depth)) + return 0; + + /* + * Recurse this class's dependency list: + */ + list_for_each_entry(entry, &class->locks_after, entry) + ret += __lockdep_count_forward_deps(entry->class, depth + 1); + + return ret; +} + +unsigned long lockdep_count_forward_deps(struct lock_class *class) +{ + unsigned long ret, flags; + + local_irq_save(flags); + __raw_spin_lock(&lockdep_lock); + ret = __lockdep_count_forward_deps(class, 0); + __raw_spin_unlock(&lockdep_lock); + local_irq_restore(flags); + + return ret; +} + +unsigned long __lockdep_count_backward_deps(struct lock_class *class, + unsigned int depth) +{ + struct lock_list *entry; + unsigned long ret = 1; + + if (lockdep_dependency_visit(class, depth)) + return 0; + /* + * Recurse this class's dependency list: + */ + list_for_each_entry(entry, &class->locks_before, entry) + ret += __lockdep_count_backward_deps(entry->class, depth + 1); + + return ret; +} + +unsigned long lockdep_count_backward_deps(struct lock_class *class) +{ + unsigned long ret, flags; + + local_irq_save(flags); + __raw_spin_lock(&lockdep_lock); + ret = __lockdep_count_backward_deps(class, 0); + __raw_spin_unlock(&lockdep_lock); + local_irq_restore(flags); + + return ret; +} + /* * Prove that the dependency graph starting at <entry> can not * lead to <target>. Print an error and return 0 if it does. @@ -968,6 +1054,9 @@ check_noncircular(struct lock_class *source, unsigned int depth) { struct lock_list *entry; + if (lockdep_dependency_visit(source, depth)) + return 1; + debug_atomic_inc(&nr_cyclic_check_recursions); if (depth > max_recursion_depth) max_recursion_depth = depth; @@ -977,7 +1066,7 @@ check_noncircular(struct lock_class *source, unsigned int depth) * Check this lock's dependency list: */ list_for_each_entry(entry, &source->locks_after, entry) { - if (entry->class == check_target->class) + if (entry->class == hlock_class(check_target)) return print_circular_bug_header(entry, depth+1); debug_atomic_inc(&nr_cyclic_checks); if (!check_noncircular(entry->class, depth+1)) @@ -1011,6 +1100,9 @@ find_usage_forwards(struct lock_class *source, unsigned int depth) struct lock_list *entry; int ret; + if (lockdep_dependency_visit(source, depth)) + return 1; + if (depth > max_recursion_depth) max_recursion_depth = depth; if (depth >= RECURSION_LIMIT) @@ -1050,6 +1142,9 @@ find_usage_backwards(struct lock_class *source, unsigned int depth) struct lock_list *entry; int ret; + if (lockdep_dependency_visit(source, depth)) + return 1; + if (!__raw_spin_is_locked(&lockdep_lock)) return DEBUG_LOCKS_WARN_ON(1); @@ -1064,6 +1159,11 @@ find_usage_backwards(struct lock_class *source, unsigned int depth) return 2; } + if (!source && debug_locks_off_graph_unlock()) { + WARN_ON(1); + return 0; + } + /* * Check this lock's dependency list: */ @@ -1103,9 +1203,9 @@ print_bad_irq_dependency(struct task_struct *curr, printk("\nand this task is already holding:\n"); print_lock(prev); printk("which would create a new lock dependency:\n"); - print_lock_name(prev->class); + print_lock_name(hlock_class(prev)); printk(" ->"); - print_lock_name(next->class); + print_lock_name(hlock_class(next)); printk("\n"); printk("\nbut this new dependency connects a %s-irq-safe lock:\n", @@ -1146,12 +1246,12 @@ check_usage(struct task_struct *curr, struct held_lock *prev, find_usage_bit = bit_backwards; /* fills in <backwards_match> */ - ret = find_usage_backwards(prev->class, 0); + ret = find_usage_backwards(hlock_class(prev), 0); if (!ret || ret == 1) return ret; find_usage_bit = bit_forwards; - ret = find_usage_forwards(next->class, 0); + ret = find_usage_forwards(hlock_class(next), 0); if (!ret || ret == 1) return ret; /* ret == 2 */ @@ -1272,18 +1372,32 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, struct lockdep_map *next_instance, int read) { struct held_lock *prev; + struct held_lock *nest = NULL; int i; for (i = 0; i < curr->lockdep_depth; i++) { prev = curr->held_locks + i; - if (prev->class != next->class) + + if (prev->instance == next->nest_lock) + nest = prev; + + if (hlock_class(prev) != hlock_class(next)) continue; + /* * Allow read-after-read recursion of the same * lock class (i.e. read_lock(lock)+read_lock(lock)): */ if ((read == 2) && prev->read) return 2; + + /* + * We're holding the nest_lock, which serializes this lock's + * nesting behaviour. + */ + if (nest) + return 2; + return print_deadlock_bug(curr, prev, next); } return 1; @@ -1329,7 +1443,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, */ check_source = next; check_target = prev; - if (!(check_noncircular(next->class, 0))) + if (!(check_noncircular(hlock_class(next), 0))) return print_circular_bug_tail(); if (!check_prev_add_irq(curr, prev, next)) @@ -1353,8 +1467,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * chains - the second one will be new, but L1 already has * L2 added to its dependency list, due to the first chain.) */ - list_for_each_entry(entry, &prev->class->locks_after, entry) { - if (entry->class == next->class) { + list_for_each_entry(entry, &hlock_class(prev)->locks_after, entry) { + if (entry->class == hlock_class(next)) { if (distance == 1) entry->distance = 1; return 2; @@ -1365,26 +1479,28 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * Ok, all validations passed, add the new lock * to the previous lock's dependency list: */ - ret = add_lock_to_list(prev->class, next->class, - &prev->class->locks_after, next->acquire_ip, distance); + ret = add_lock_to_list(hlock_class(prev), hlock_class(next), + &hlock_class(prev)->locks_after, + next->acquire_ip, distance); if (!ret) return 0; - ret = add_lock_to_list(next->class, prev->class, - &next->class->locks_before, next->acquire_ip, distance); + ret = add_lock_to_list(hlock_class(next), hlock_class(prev), + &hlock_class(next)->locks_before, + next->acquire_ip, distance); if (!ret) return 0; /* * Debugging printouts: */ - if (verbose(prev->class) || verbose(next->class)) { + if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) { graph_unlock(); printk("\n new dependency: "); - print_lock_name(prev->class); + print_lock_name(hlock_class(prev)); printk(" => "); - print_lock_name(next->class); + print_lock_name(hlock_class(next)); printk("\n"); dump_stack(); return graph_lock(); @@ -1481,7 +1597,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, struct held_lock *hlock, u64 chain_key) { - struct lock_class *class = hlock->class; + struct lock_class *class = hlock_class(hlock); struct list_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; struct held_lock *hlock_curr, *hlock_next; @@ -1554,7 +1670,7 @@ cache_hit: if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { chain->base = cn; for (j = 0; j < chain->depth - 1; j++, i++) { - int lock_id = curr->held_locks[i].class - lock_classes; + int lock_id = curr->held_locks[i].class_idx - 1; chain_hlocks[chain->base + j] = lock_id; } chain_hlocks[chain->base + j] = class - lock_classes; @@ -1643,14 +1759,13 @@ static void check_chain_key(struct task_struct *curr) hlock = curr->held_locks + i; if (chain_key != hlock->prev_chain_key) { debug_locks_off(); - printk("hm#1, depth: %u [%u], %016Lx != %016Lx\n", + WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n", curr->lockdep_depth, i, (unsigned long long)chain_key, (unsigned long long)hlock->prev_chain_key); - WARN_ON(1); return; } - id = hlock->class - lock_classes; + id = hlock->class_idx - 1; if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) return; @@ -1662,11 +1777,10 @@ static void check_chain_key(struct task_struct *curr) } if (chain_key != curr->curr_chain_key) { debug_locks_off(); - printk("hm#2, depth: %u [%u], %016Lx != %016Lx\n", + WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n", curr->lockdep_depth, i, (unsigned long long)chain_key, (unsigned long long)curr->curr_chain_key); - WARN_ON(1); } #endif } @@ -1695,7 +1809,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, print_lock(this); printk("{%s} state was registered at:\n", usage_str[prev_bit]); - print_stack_trace(this->class->usage_traces + prev_bit, 1); + print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1); print_irqtrace_events(curr); printk("\nother info that might help us debug this:\n"); @@ -1714,7 +1828,7 @@ static inline int valid_state(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) { - if (unlikely(this->class->usage_mask & (1 << bad_bit))) + if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) return print_usage_bug(curr, this, bad_bit, new_bit); return 1; } @@ -1753,7 +1867,7 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, lockdep_print_held_locks(curr); printk("\nthe first lock's dependencies:\n"); - print_lock_dependencies(this->class, 0); + print_lock_dependencies(hlock_class(this), 0); printk("\nthe second lock's dependencies:\n"); print_lock_dependencies(other, 0); @@ -1776,7 +1890,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, find_usage_bit = bit; /* fills in <forwards_match> */ - ret = find_usage_forwards(this->class, 0); + ret = find_usage_forwards(hlock_class(this), 0); if (!ret || ret == 1) return ret; @@ -1795,7 +1909,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, find_usage_bit = bit; /* fills in <backwards_match> */ - ret = find_usage_backwards(this->class, 0); + ret = find_usage_backwards(hlock_class(this), 0); if (!ret || ret == 1) return ret; @@ -1861,7 +1975,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, LOCK_ENABLED_HARDIRQS_READ, "hard-read")) return 0; #endif - if (hardirq_verbose(this->class)) + if (hardirq_verbose(hlock_class(this))) ret = 2; break; case LOCK_USED_IN_SOFTIRQ: @@ -1886,7 +2000,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, LOCK_ENABLED_SOFTIRQS_READ, "soft-read")) return 0; #endif - if (softirq_verbose(this->class)) + if (softirq_verbose(hlock_class(this))) ret = 2; break; case LOCK_USED_IN_HARDIRQ_READ: @@ -1899,7 +2013,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, if (!check_usage_forwards(curr, this, LOCK_ENABLED_HARDIRQS, "hard")) return 0; - if (hardirq_verbose(this->class)) + if (hardirq_verbose(hlock_class(this))) ret = 2; break; case LOCK_USED_IN_SOFTIRQ_READ: @@ -1912,7 +2026,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, if (!check_usage_forwards(curr, this, LOCK_ENABLED_SOFTIRQS, "soft")) return 0; - if (softirq_verbose(this->class)) + if (softirq_verbose(hlock_class(this))) ret = 2; break; case LOCK_ENABLED_HARDIRQS: @@ -1938,7 +2052,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, LOCK_USED_IN_HARDIRQ_READ, "hard-read")) return 0; #endif - if (hardirq_verbose(this->class)) + if (hardirq_verbose(hlock_class(this))) ret = 2; break; case LOCK_ENABLED_SOFTIRQS: @@ -1964,7 +2078,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, LOCK_USED_IN_SOFTIRQ_READ, "soft-read")) return 0; #endif - if (softirq_verbose(this->class)) + if (softirq_verbose(hlock_class(this))) ret = 2; break; case LOCK_ENABLED_HARDIRQS_READ: @@ -1979,7 +2093,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, LOCK_USED_IN_HARDIRQ, "hard")) return 0; #endif - if (hardirq_verbose(this->class)) + if (hardirq_verbose(hlock_class(this))) ret = 2; break; case LOCK_ENABLED_SOFTIRQS_READ: @@ -1994,7 +2108,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this, LOCK_USED_IN_SOFTIRQ, "soft")) return 0; #endif - if (softirq_verbose(this->class)) + if (softirq_verbose(hlock_class(this))) ret = 2; break; default: @@ -2310,7 +2424,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, * If already set then do not dirty the cacheline, * nor do any checks: */ - if (likely(this->class->usage_mask & new_mask)) + if (likely(hlock_class(this)->usage_mask & new_mask)) return 1; if (!graph_lock()) @@ -2318,14 +2432,14 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, /* * Make sure we didnt race: */ - if (unlikely(this->class->usage_mask & new_mask)) { + if (unlikely(hlock_class(this)->usage_mask & new_mask)) { graph_unlock(); return 1; } - this->class->usage_mask |= new_mask; + hlock_class(this)->usage_mask |= new_mask; - if (!save_trace(this->class->usage_traces + new_bit)) + if (!save_trace(hlock_class(this)->usage_traces + new_bit)) return 0; switch (new_bit) { @@ -2405,7 +2519,7 @@ EXPORT_SYMBOL_GPL(lockdep_init_map); */ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, int hardirqs_off, - unsigned long ip) + struct lockdep_map *nest_lock, unsigned long ip) { struct task_struct *curr = current; struct lock_class *class = NULL; @@ -2459,14 +2573,16 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, return 0; hlock = curr->held_locks + depth; - - hlock->class = class; + if (DEBUG_LOCKS_WARN_ON(!class)) + return 0; + hlock->class_idx = class - lock_classes + 1; hlock->acquire_ip = ip; hlock->instance = lock; + hlock->nest_lock = nest_lock; hlock->trylock = trylock; hlock->read = read; hlock->check = check; - hlock->hardirqs_off = hardirqs_off; + hlock->hardirqs_off = !!hardirqs_off; #ifdef CONFIG_LOCK_STAT hlock->waittime_stamp = 0; hlock->holdtime_stamp = sched_clock(); @@ -2574,6 +2690,55 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, return 1; } +static int +__lock_set_subclass(struct lockdep_map *lock, + unsigned int subclass, unsigned long ip) +{ + struct task_struct *curr = current; + struct held_lock *hlock, *prev_hlock; + struct lock_class *class; + unsigned int depth; + int i; + + depth = curr->lockdep_depth; + if (DEBUG_LOCKS_WARN_ON(!depth)) + return 0; + + prev_hlock = NULL; + for (i = depth-1; i >= 0; i--) { + hlock = curr->held_locks + i; + /* + * We must not cross into another context: + */ + if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) + break; + if (hlock->instance == lock) + goto found_it; + prev_hlock = hlock; + } + return print_unlock_inbalance_bug(curr, lock, ip); + +found_it: + class = register_lock_class(lock, subclass, 0); + hlock->class_idx = class - lock_classes + 1; + + curr->lockdep_depth = i; + curr->curr_chain_key = hlock->prev_chain_key; + + for (; i < depth; i++) { + hlock = curr->held_locks + i; + if (!__lock_acquire(hlock->instance, + hlock_class(hlock)->subclass, hlock->trylock, + hlock->read, hlock->check, hlock->hardirqs_off, + hlock->nest_lock, hlock->acquire_ip)) + return 0; + } + + if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) + return 0; + return 1; +} + /* * Remove the lock to the list of currently held locks in a * potentially non-nested (out of order) manner. This is a @@ -2624,9 +2789,9 @@ found_it: for (i++; i < depth; i++) { hlock = curr->held_locks + i; if (!__lock_acquire(hlock->instance, - hlock->class->subclass, hlock->trylock, + hlock_class(hlock)->subclass, hlock->trylock, hlock->read, hlock->check, hlock->hardirqs_off, - hlock->acquire_ip)) + hlock->nest_lock, hlock->acquire_ip)) return 0; } @@ -2669,7 +2834,7 @@ static int lock_release_nested(struct task_struct *curr, #ifdef CONFIG_DEBUG_LOCKDEP hlock->prev_chain_key = 0; - hlock->class = NULL; + hlock->class_idx = 0; hlock->acquire_ip = 0; hlock->irq_context = 0; #endif @@ -2738,18 +2903,36 @@ static void check_flags(unsigned long flags) #endif } +void +lock_set_subclass(struct lockdep_map *lock, + unsigned int subclass, unsigned long ip) +{ + unsigned long flags; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + current->lockdep_recursion = 1; + check_flags(flags); + if (__lock_set_subclass(lock, subclass, ip)) + check_chain_key(current); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} + +EXPORT_SYMBOL_GPL(lock_set_subclass); + /* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: */ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, - int trylock, int read, int check, unsigned long ip) + int trylock, int read, int check, + struct lockdep_map *nest_lock, unsigned long ip) { unsigned long flags; - if (unlikely(!lock_stat && !prove_locking)) - return; - if (unlikely(current->lockdep_recursion)) return; @@ -2758,7 +2941,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, current->lockdep_recursion = 1; __lock_acquire(lock, subclass, trylock, read, check, - irqs_disabled_flags(flags), ip); + irqs_disabled_flags(flags), nest_lock, ip); current->lockdep_recursion = 0; raw_local_irq_restore(flags); } @@ -2770,9 +2953,6 @@ void lock_release(struct lockdep_map *lock, int nested, { unsigned long flags; - if (unlikely(!lock_stat && !prove_locking)) - return; - if (unlikely(current->lockdep_recursion)) return; @@ -2845,11 +3025,11 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) found_it: hlock->waittime_stamp = sched_clock(); - point = lock_contention_point(hlock->class, ip); + point = lock_contention_point(hlock_class(hlock), ip); - stats = get_lock_stats(hlock->class); + stats = get_lock_stats(hlock_class(hlock)); if (point < ARRAY_SIZE(stats->contention_point)) - stats->contention_point[i]++; + stats->contention_point[point]++; if (lock->cpu != smp_processor_id()) stats->bounces[bounce_contended + !!hlock->read]++; put_lock_stats(stats); @@ -2893,7 +3073,7 @@ found_it: hlock->holdtime_stamp = now; } - stats = get_lock_stats(hlock->class); + stats = get_lock_stats(hlock_class(hlock)); if (waittime) { if (hlock->read) lock_time_inc(&stats->read_waittime, waittime); @@ -2988,6 +3168,7 @@ static void zap_class(struct lock_class *class) list_del_rcu(&class->hash_entry); list_del_rcu(&class->lock_entry); + class->key = NULL; } static inline int within(const void *addr, void *start, unsigned long size) diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index c3600a091a28..56b196932c08 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h @@ -17,9 +17,6 @@ */ #define MAX_LOCKDEP_ENTRIES 8192UL -#define MAX_LOCKDEP_KEYS_BITS 11 -#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) - #define MAX_LOCKDEP_CHAINS_BITS 14 #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) @@ -53,6 +50,22 @@ extern unsigned int nr_process_chains; extern unsigned int max_lockdep_depth; extern unsigned int max_recursion_depth; +#ifdef CONFIG_PROVE_LOCKING +extern unsigned long lockdep_count_forward_deps(struct lock_class *); +extern unsigned long lockdep_count_backward_deps(struct lock_class *); +#else +static inline unsigned long +lockdep_count_forward_deps(struct lock_class *class) +{ + return 0; +} +static inline unsigned long +lockdep_count_backward_deps(struct lock_class *class) +{ + return 0; +} +#endif + #ifdef CONFIG_DEBUG_LOCKDEP /* * Various lockdep statistics: diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 9b0e940e2545..20dbcbf9c7dd 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -63,34 +63,6 @@ static void l_stop(struct seq_file *m, void *v) { } -static unsigned long count_forward_deps(struct lock_class *class) -{ - struct lock_list *entry; - unsigned long ret = 1; - - /* - * Recurse this class's dependency list: - */ - list_for_each_entry(entry, &class->locks_after, entry) - ret += count_forward_deps(entry->class); - - return ret; -} - -static unsigned long count_backward_deps(struct lock_class *class) -{ - struct lock_list *entry; - unsigned long ret = 1; - - /* - * Recurse this class's dependency list: - */ - list_for_each_entry(entry, &class->locks_before, entry) - ret += count_backward_deps(entry->class); - - return ret; -} - static void print_name(struct seq_file *m, struct lock_class *class) { char str[128]; @@ -110,7 +82,6 @@ static void print_name(struct seq_file *m, struct lock_class *class) static int l_show(struct seq_file *m, void *v) { - unsigned long nr_forward_deps, nr_backward_deps; struct lock_class *class = v; struct lock_list *entry; char c1, c2, c3, c4; @@ -124,11 +95,10 @@ static int l_show(struct seq_file *m, void *v) #ifdef CONFIG_DEBUG_LOCKDEP seq_printf(m, " OPS:%8ld", class->ops); #endif - nr_forward_deps = count_forward_deps(class); - seq_printf(m, " FD:%5ld", nr_forward_deps); - - nr_backward_deps = count_backward_deps(class); - seq_printf(m, " BD:%5ld", nr_backward_deps); +#ifdef CONFIG_PROVE_LOCKING + seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class)); + seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class)); +#endif get_usage_chars(class, &c1, &c2, &c3, &c4); seq_printf(m, " %c%c%c%c", c1, c2, c3, c4); @@ -229,6 +199,9 @@ static int lc_show(struct seq_file *m, void *v) for (i = 0; i < chain->depth; i++) { class = lock_chain_get_class(chain, i); + if (!class->key) + continue; + seq_printf(m, "[%p] ", class->key); print_name(m, class); seq_puts(m, "\n"); @@ -350,7 +323,9 @@ static int lockdep_stats_show(struct seq_file *m, void *v) if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) nr_hardirq_read_unsafe++; - sum_forward_deps += count_forward_deps(class); +#ifdef CONFIG_PROVE_LOCKING + sum_forward_deps += lockdep_count_forward_deps(class); +#endif } #ifdef CONFIG_DEBUG_LOCKDEP DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); @@ -497,8 +472,9 @@ static void snprint_time(char *buf, size_t bufsiz, s64 nr) { unsigned long rem; + nr += 5; /* for display rounding */ rem = do_div(nr, 1000); /* XXX: do_div_signed */ - snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, ((int)rem+5)/10); + snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10); } static void seq_time(struct seq_file *m, s64 time) diff --git a/kernel/marker.c b/kernel/marker.c index 971da5317903..7d1faecd7a51 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -126,6 +126,11 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...) struct marker_probe_closure *multi; int i; /* + * Read mdata->ptype before mdata->multi. + */ + smp_rmb(); + multi = mdata->multi; + /* * multi points to an array, therefore accessing the array * depends on reading multi. However, even in this case, * we must insure that the pointer is read _before_ the array @@ -133,7 +138,6 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...) * in the fast path, so put the explicit barrier here. */ smp_read_barrier_depends(); - multi = mdata->multi; for (i = 0; multi[i].func; i++) { va_start(args, call_private); multi[i].func(multi[i].probe_private, call_private, @@ -175,6 +179,11 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) struct marker_probe_closure *multi; int i; /* + * Read mdata->ptype before mdata->multi. + */ + smp_rmb(); + multi = mdata->multi; + /* * multi points to an array, therefore accessing the array * depends on reading multi. However, even in this case, * we must insure that the pointer is read _before_ the array @@ -182,7 +191,6 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) * in the fast path, so put the explicit barrier here. */ smp_read_barrier_depends(); - multi = mdata->multi; for (i = 0; multi[i].func; i++) multi[i].func(multi[i].probe_private, call_private, mdata->format, &args); diff --git a/kernel/module.c b/kernel/module.c index d8b5605132a0..9db11911e04b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -325,18 +325,6 @@ static unsigned long find_symbol(const char *name, return -ENOENT; } -/* lookup symbol in given range of kernel_symbols */ -static const struct kernel_symbol *lookup_symbol(const char *name, - const struct kernel_symbol *start, - const struct kernel_symbol *stop) -{ - const struct kernel_symbol *ks = start; - for (; ks < stop; ks++) - if (strcmp(ks->name, name) == 0) - return ks; - return NULL; -} - /* Search for module by name: must hold module_mutex. */ static struct module *find_module(const char *name) { @@ -690,7 +678,7 @@ static int try_stop_module(struct module *mod, int flags, int *forced) if (flags & O_NONBLOCK) { struct stopref sref = { mod, flags, forced }; - return stop_machine_run(__try_stop_module, &sref, NR_CPUS); + return stop_machine(__try_stop_module, &sref, NULL); } else { /* We don't need to stop the machine for this. */ mod->state = MODULE_STATE_GOING; @@ -1428,7 +1416,7 @@ static int __unlink_module(void *_mod) static void free_module(struct module *mod) { /* Delete from various lists */ - stop_machine_run(__unlink_module, mod, NR_CPUS); + stop_machine(__unlink_module, mod, NULL); remove_notes_attrs(mod); remove_sect_attrs(mod); mod_kobject_remove(mod); @@ -1703,6 +1691,19 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, } #ifdef CONFIG_KALLSYMS + +/* lookup symbol in given range of kernel_symbols */ +static const struct kernel_symbol *lookup_symbol(const char *name, + const struct kernel_symbol *start, + const struct kernel_symbol *stop) +{ + const struct kernel_symbol *ks = start; + for (; ks < stop; ks++) + if (strcmp(ks->name, name) == 0) + return ks; + return NULL; +} + static int is_exported(const char *name, const struct module *mod) { if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) @@ -1798,7 +1799,7 @@ static void *module_alloc_update_bounds(unsigned long size) /* Allocate and load the module: note that size of section 0 is always zero, and we rely on this for optional sections. */ -static struct module *load_module(void __user *umod, +static noinline struct module *load_module(void __user *umod, unsigned long len, const char __user *uargs) { @@ -2196,7 +2197,7 @@ static struct module *load_module(void __user *umod, /* Now sew it into the lists so we can get lockdep and oops * info during argument parsing. Noone should access us, since * strong_try_module_get() will fail. */ - stop_machine_run(__link_module, mod, NR_CPUS); + stop_machine(__link_module, mod, NULL); /* Size of section 0 is 0, so this works well if no params */ err = parse_args(mod->name, mod->args, @@ -2230,7 +2231,7 @@ static struct module *load_module(void __user *umod, return mod; unlink: - stop_machine_run(__unlink_module, mod, NR_CPUS); + stop_machine(__unlink_module, mod, NULL); module_arch_cleanup(mod); cleanup: kobject_del(&mod->mkobj.kobj); @@ -2287,7 +2288,7 @@ sys_init_module(void __user *umod, /* Start the module */ if (mod->init != NULL) - ret = mod->init(); + ret = do_one_initcall(mod->init); if (ret < 0) { /* Init routine failed: abort. Try to protect us from buggy refcounters. */ diff --git a/kernel/mutex.c b/kernel/mutex.c index bcdc9ac8ef60..12c779dc65d4 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -34,6 +34,7 @@ /*** * mutex_init - initialize the mutex * @lock: the mutex to be initialized + * @key: the lock_class_key for the class; used by mutex lock debugging * * Initialize the mutex to unlocked state. * diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 21575fc46d05..1d3ef29a2583 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -14,7 +14,6 @@ */ #include <linux/module.h> -#include <linux/version.h> #include <linux/nsproxy.h> #include <linux/init_task.h> #include <linux/mnt_namespace.h> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index ea567b78d1aa..fab8ea86fac3 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -179,9 +179,6 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) rc = sys_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); - - /* Child reaper for the pid namespace is going away */ - pid_ns->child_reaper = NULL; acct_exit_ns(pid_ns); return; } diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 8cb757026386..dfdec524d1b7 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -24,7 +24,7 @@ * requirement that the application has is cleaned up when closes the file * pointer or exits the pm_qos_object will get an opportunity to clean up. * - * mark gross mgross@linux.intel.com + * Mark Gross <mgross@linux.intel.com> */ #include <linux/pm_qos_params.h> @@ -43,7 +43,7 @@ #include <linux/uaccess.h> /* - * locking rule: all changes to target_value or requirements or notifiers lists + * locking rule: all changes to requirements or notifiers lists * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock * held, taken with _irqsave. One lock to rule them all */ @@ -66,7 +66,7 @@ struct pm_qos_object { struct miscdevice pm_qos_power_miscdev; char *name; s32 default_value; - s32 target_value; + atomic_t target_value; s32 (*comparitor)(s32, s32); }; @@ -77,7 +77,7 @@ static struct pm_qos_object cpu_dma_pm_qos = { .notifiers = &cpu_dma_lat_notifier, .name = "cpu_dma_latency", .default_value = 2000 * USEC_PER_SEC, - .target_value = 2000 * USEC_PER_SEC, + .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), .comparitor = min_compare }; @@ -87,7 +87,7 @@ static struct pm_qos_object network_lat_pm_qos = { .notifiers = &network_lat_notifier, .name = "network_latency", .default_value = 2000 * USEC_PER_SEC, - .target_value = 2000 * USEC_PER_SEC, + .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), .comparitor = min_compare }; @@ -99,7 +99,7 @@ static struct pm_qos_object network_throughput_pm_qos = { .notifiers = &network_throughput_notifier, .name = "network_throughput", .default_value = 0, - .target_value = 0, + .target_value = ATOMIC_INIT(0), .comparitor = max_compare }; @@ -150,11 +150,11 @@ static void update_target(int target) extreme_value = pm_qos_array[target]->comparitor( extreme_value, node->value); } - if (pm_qos_array[target]->target_value != extreme_value) { + if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) { call_notifier = 1; - pm_qos_array[target]->target_value = extreme_value; + atomic_set(&pm_qos_array[target]->target_value, extreme_value); pr_debug(KERN_ERR "new target for qos %d is %d\n", target, - pm_qos_array[target]->target_value); + atomic_read(&pm_qos_array[target]->target_value)); } spin_unlock_irqrestore(&pm_qos_lock, flags); @@ -193,14 +193,7 @@ static int find_pm_qos_object_by_minor(int minor) */ int pm_qos_requirement(int pm_qos_class) { - int ret_val; - unsigned long flags; - - spin_lock_irqsave(&pm_qos_lock, flags); - ret_val = pm_qos_array[pm_qos_class]->target_value; - spin_unlock_irqrestore(&pm_qos_lock, flags); - - return ret_val; + return atomic_read(&pm_qos_array[pm_qos_class]->target_value); } EXPORT_SYMBOL_GPL(pm_qos_requirement); @@ -211,8 +204,8 @@ EXPORT_SYMBOL_GPL(pm_qos_requirement); * @value: defines the qos request * * This function inserts a new entry in the pm_qos_class list of requested qos - * performance charactoistics. It recomputes the agregate QoS expectations for - * the pm_qos_class of parrameters. + * performance characteristics. It recomputes the aggregate QoS expectations + * for the pm_qos_class of parameters. */ int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value) { @@ -250,10 +243,10 @@ EXPORT_SYMBOL_GPL(pm_qos_add_requirement); * @name: identifies the request * @value: defines the qos request * - * Updates an existing qos requierement for the pm_qos_class of parameters along + * Updates an existing qos requirement for the pm_qos_class of parameters along * with updating the target pm_qos_class value. * - * If the named request isn't in the lest then no change is made. + * If the named request isn't in the list then no change is made. */ int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value) { @@ -287,7 +280,7 @@ EXPORT_SYMBOL_GPL(pm_qos_update_requirement); * @pm_qos_class: identifies which list of qos request to us * @name: identifies the request * - * Will remove named qos request from pm_qos_class list of parrameters and + * Will remove named qos request from pm_qos_class list of parameters and * recompute the current target value for the pm_qos_class. */ void pm_qos_remove_requirement(int pm_qos_class, char *name) @@ -319,7 +312,7 @@ EXPORT_SYMBOL_GPL(pm_qos_remove_requirement); * @notifier: notifier block managed by caller. * * will register the notifier into a notification chain that gets called - * uppon changes to the pm_qos_class target value. + * upon changes to the pm_qos_class target value. */ int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) { @@ -338,7 +331,7 @@ EXPORT_SYMBOL_GPL(pm_qos_add_notifier); * @notifier: notifier block to be removed. * * will remove the notifier from the notification chain that gets called - * uppon changes to the pm_qos_class target value. + * upon changes to the pm_qos_class target value. */ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) { diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 9a21681aa80f..5131e5471169 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -289,21 +289,29 @@ void do_schedule_next_timer(struct siginfo *info) else schedule_next_timer(timr); - info->si_overrun = timr->it_overrun_last; + info->si_overrun += timr->it_overrun_last; } if (timr) unlock_timer(timr, flags); } -int posix_timer_event(struct k_itimer *timr,int si_private) +int posix_timer_event(struct k_itimer *timr, int si_private) { - memset(&timr->sigq->info, 0, sizeof(siginfo_t)); + /* + * FIXME: if ->sigq is queued we can race with + * dequeue_signal()->do_schedule_next_timer(). + * + * If dequeue_signal() sees the "right" value of + * si_sys_private it calls do_schedule_next_timer(). + * We re-queue ->sigq and drop ->it_lock(). + * do_schedule_next_timer() locks the timer + * and re-schedules it while ->sigq is pending. + * Not really bad, but not that we want. + */ timr->sigq->info.si_sys_private = si_private; - /* Send signal to the process that owns this timer.*/ timr->sigq->info.si_signo = timr->it_sigev_signo; - timr->sigq->info.si_errno = 0; timr->sigq->info.si_code = SI_TIMER; timr->sigq->info.si_tid = timr->it_id; timr->sigq->info.si_value = timr->it_sigev_value; @@ -433,8 +441,9 @@ static struct k_itimer * alloc_posix_timer(void) return tmr; if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { kmem_cache_free(posix_timers_cache, tmr); - tmr = NULL; + return NULL; } + memset(&tmr->sigq->info, 0, sizeof(siginfo_t)); return tmr; } diff --git a/kernel/power/disk.c b/kernel/power/disk.c index f011e0870b52..bbd85c60f741 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -21,6 +21,7 @@ #include <linux/console.h> #include <linux/cpu.h> #include <linux/freezer.h> +#include <linux/ftrace.h> #include "power.h" @@ -255,7 +256,7 @@ static int create_image(int platform_mode) int hibernation_snapshot(int platform_mode) { - int error; + int error, ftrace_save; /* Free memory before shutting down devices. */ error = swsusp_shrink_memory(); @@ -267,6 +268,7 @@ int hibernation_snapshot(int platform_mode) goto Close; suspend_console(); + ftrace_save = __ftrace_enabled_save(); error = device_suspend(PMSG_FREEZE); if (error) goto Recover_platform; @@ -296,6 +298,7 @@ int hibernation_snapshot(int platform_mode) Resume_devices: device_resume(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); + __ftrace_enabled_restore(ftrace_save); resume_console(); Close: platform_end(platform_mode); @@ -366,10 +369,11 @@ static int resume_target_kernel(void) int hibernation_restore(int platform_mode) { - int error; + int error, ftrace_save; pm_prepare_console(); suspend_console(); + ftrace_save = __ftrace_enabled_save(); error = device_suspend(PMSG_QUIESCE); if (error) goto Finish; @@ -384,6 +388,7 @@ int hibernation_restore(int platform_mode) platform_restore_cleanup(platform_mode); device_resume(PMSG_RECOVER); Finish: + __ftrace_enabled_restore(ftrace_save); resume_console(); pm_restore_console(); return error; @@ -396,7 +401,7 @@ int hibernation_restore(int platform_mode) int hibernation_platform_enter(void) { - int error; + int error, ftrace_save; if (!hibernation_ops) return -ENOSYS; @@ -411,6 +416,7 @@ int hibernation_platform_enter(void) goto Close; suspend_console(); + ftrace_save = __ftrace_enabled_save(); error = device_suspend(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -445,6 +451,7 @@ int hibernation_platform_enter(void) hibernation_ops->finish(); Resume_devices: device_resume(PMSG_RESTORE); + __ftrace_enabled_restore(ftrace_save); resume_console(); Close: hibernation_ops->end(); diff --git a/kernel/power/main.c b/kernel/power/main.c index 95bff23ecdaa..540b16b68565 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -21,6 +21,7 @@ #include <linux/freezer.h> #include <linux/vmstat.h> #include <linux/syscalls.h> +#include <linux/ftrace.h> #include "power.h" @@ -310,7 +311,7 @@ static int suspend_enter(suspend_state_t state) */ int suspend_devices_and_enter(suspend_state_t state) { - int error; + int error, ftrace_save; if (!suspend_ops) return -ENOSYS; @@ -321,6 +322,7 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); + ftrace_save = __ftrace_enabled_save(); suspend_test_start(); error = device_suspend(PMSG_SUSPEND); if (error) { @@ -352,6 +354,7 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); device_resume(PMSG_RESUME); suspend_test_finish("resume devices"); + __ftrace_enabled_restore(ftrace_save); resume_console(); Close: if (suspend_ops->end) @@ -635,6 +638,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state) } if (status < 0) printk(err_suspend, status); + + /* Some platforms can't detect that the alarm triggered the + * wakeup, or (accordingly) disable it after it afterwards. + * It's supposed to give oneshot behavior; cope. + */ + alm.enabled = false; + rtc_set_alarm(rtc, &alm); } static int __init has_wakealarm(struct device *dev, void *name_ptr) diff --git a/kernel/power/power.h b/kernel/power/power.h index 700f44ec8406..acc0c101dbd5 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -53,8 +53,6 @@ extern int hibernation_platform_enter(void); extern int pfn_is_nosave(unsigned long); -extern struct mutex pm_mutex; - #define power_attr(_name) \ static struct kobj_attribute _name##_attr = { \ .attr = { \ diff --git a/kernel/power/swap.c b/kernel/power/swap.c index a0abf9a463f9..80ccac849e46 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -14,7 +14,6 @@ #include <linux/module.h> #include <linux/file.h> #include <linux/utsname.h> -#include <linux/version.h> #include <linux/delay.h> #include <linux/bitops.h> #include <linux/genhd.h> diff --git a/kernel/printk.c b/kernel/printk.c index a7f7559c5f6c..b51b1567bb55 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1309,14 +1309,14 @@ void tty_write_message(struct tty_struct *tty, char *msg) #if defined CONFIG_PRINTK -DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); /* * printk rate limiting, lifted from the networking subsystem. * - * This enforces a rate limit: not more than one kernel message - * every printk_ratelimit_jiffies to make a denial-of-service - * attack impossible. + * This enforces a rate limit: not more than 10 kernel messages + * every 5s to make a denial-of-service attack impossible. */ +DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); + int printk_ratelimit(void) { return __ratelimit(&printk_ratelimit_state); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 8392a9da6450..356699a96d56 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -107,7 +107,7 @@ int ptrace_check_attach(struct task_struct *child, int kill) read_unlock(&tasklist_lock); if (!ret && !kill) - wait_task_inactive(child); + ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; /* All systems go.. */ return ret; @@ -140,7 +140,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) if (!dumpable && !capable(CAP_SYS_PTRACE)) return -EPERM; - return security_ptrace(current, task, mode); + return security_ptrace_may_access(task, mode); } bool ptrace_may_access(struct task_struct *task, unsigned int mode) @@ -499,8 +499,7 @@ repeat: goto repeat; } - ret = security_ptrace(current->parent, current, - PTRACE_MODE_ATTACH); + ret = security_ptrace_traceme(current->parent); /* * Set the ptrace bit in the process ptrace flags. diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 6f8696c502f4..37f72e551542 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -47,6 +47,7 @@ #include <linux/notifier.h> #include <linux/cpu.h> #include <linux/mutex.h> +#include <linux/time.h> #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; @@ -60,12 +61,14 @@ EXPORT_SYMBOL_GPL(rcu_lock_map); static struct rcu_ctrlblk rcu_ctrlblk = { .cur = -300, .completed = -300, + .pending = -300, .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), .cpumask = CPU_MASK_NONE, }; static struct rcu_ctrlblk rcu_bh_ctrlblk = { .cur = -300, .completed = -300, + .pending = -300, .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), .cpumask = CPU_MASK_NONE, }; @@ -83,7 +86,10 @@ static void force_quiescent_state(struct rcu_data *rdp, { int cpu; cpumask_t cpumask; + unsigned long flags; + set_need_resched(); + spin_lock_irqsave(&rcp->lock, flags); if (unlikely(!rcp->signaled)) { rcp->signaled = 1; /* @@ -91,8 +97,8 @@ static void force_quiescent_state(struct rcu_data *rdp, * rdp->cpu is the current cpu. * * cpu_online_map is updated by the _cpu_down() - * using stop_machine_run(). Since we're in irqs disabled - * section, stop_machine_run() is not exectuting, hence + * using __stop_machine(). Since we're in irqs disabled + * section, __stop_machine() is not exectuting, hence * the cpu_online_map is stable. * * However, a cpu might have been offlined _just_ before @@ -109,6 +115,7 @@ static void force_quiescent_state(struct rcu_data *rdp, for_each_cpu_mask_nr(cpu, cpumask) smp_send_reschedule(cpu); } + spin_unlock_irqrestore(&rcp->lock, flags); } #else static inline void force_quiescent_state(struct rcu_data *rdp, @@ -118,6 +125,126 @@ static inline void force_quiescent_state(struct rcu_data *rdp, } #endif +static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp, + struct rcu_data *rdp) +{ + long batch; + + head->next = NULL; + smp_mb(); /* Read of rcu->cur must happen after any change by caller. */ + + /* + * Determine the batch number of this callback. + * + * Using ACCESS_ONCE to avoid the following error when gcc eliminates + * local variable "batch" and emits codes like this: + * 1) rdp->batch = rcp->cur + 1 # gets old value + * ...... + * 2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value + * then [*nxttail[0], *nxttail[1]) may contain callbacks + * that batch# = rdp->batch, see the comment of struct rcu_data. + */ + batch = ACCESS_ONCE(rcp->cur) + 1; + + if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) { + /* process callbacks */ + rdp->nxttail[0] = rdp->nxttail[1]; + rdp->nxttail[1] = rdp->nxttail[2]; + if (rcu_batch_after(batch - 1, rdp->batch)) + rdp->nxttail[0] = rdp->nxttail[2]; + } + + rdp->batch = batch; + *rdp->nxttail[2] = head; + rdp->nxttail[2] = &head->next; + + if (unlikely(++rdp->qlen > qhimark)) { + rdp->blimit = INT_MAX; + force_quiescent_state(rdp, &rcu_ctrlblk); + } +} + +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) +{ + rcp->gp_start = jiffies; + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; +} + +static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) +{ + int cpu; + long delta; + unsigned long flags; + + /* Only let one CPU complain about others per time interval. */ + + spin_lock_irqsave(&rcp->lock, flags); + delta = jiffies - rcp->jiffies_stall; + if (delta < 2 || rcp->cur != rcp->completed) { + spin_unlock_irqrestore(&rcp->lock, flags); + return; + } + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + spin_unlock_irqrestore(&rcp->lock, flags); + + /* OK, time to rat on our buddy... */ + + printk(KERN_ERR "RCU detected CPU stalls:"); + for_each_possible_cpu(cpu) { + if (cpu_isset(cpu, rcp->cpumask)) + printk(" %d", cpu); + } + printk(" (detected by %d, t=%ld jiffies)\n", + smp_processor_id(), (long)(jiffies - rcp->gp_start)); +} + +static void print_cpu_stall(struct rcu_ctrlblk *rcp) +{ + unsigned long flags; + + printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", + smp_processor_id(), jiffies, + jiffies - rcp->gp_start); + dump_stack(); + spin_lock_irqsave(&rcp->lock, flags); + if ((long)(jiffies - rcp->jiffies_stall) >= 0) + rcp->jiffies_stall = + jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + spin_unlock_irqrestore(&rcp->lock, flags); + set_need_resched(); /* kick ourselves to get things going. */ +} + +static void check_cpu_stall(struct rcu_ctrlblk *rcp) +{ + long delta; + + delta = jiffies - rcp->jiffies_stall; + if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { + + /* We haven't checked in, so go dump stack. */ + print_cpu_stall(rcp); + + } else if (rcp->cur != rcp->completed && delta >= 2) { + + /* They had two seconds to dump stack, so complain. */ + print_other_cpu_stall(rcp); + } +} + +#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) +{ +} + +static inline void check_cpu_stall(struct rcu_ctrlblk *rcp) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + /** * call_rcu - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. @@ -133,18 +260,10 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { unsigned long flags; - struct rcu_data *rdp; head->func = func; - head->next = NULL; local_irq_save(flags); - rdp = &__get_cpu_var(rcu_data); - *rdp->nxttail = head; - rdp->nxttail = &head->next; - if (unlikely(++rdp->qlen > qhimark)) { - rdp->blimit = INT_MAX; - force_quiescent_state(rdp, &rcu_ctrlblk); - } + __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data)); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(call_rcu); @@ -169,20 +288,10 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { unsigned long flags; - struct rcu_data *rdp; head->func = func; - head->next = NULL; local_irq_save(flags); - rdp = &__get_cpu_var(rcu_bh_data); - *rdp->nxttail = head; - rdp->nxttail = &head->next; - - if (unlikely(++rdp->qlen > qhimark)) { - rdp->blimit = INT_MAX; - force_quiescent_state(rdp, &rcu_bh_ctrlblk); - } - + __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(call_rcu_bh); @@ -211,12 +320,6 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); static inline void raise_rcu_softirq(void) { raise_softirq(RCU_SOFTIRQ); - /* - * The smp_mb() here is required to ensure that this cpu's - * __rcu_process_callbacks() reads the most recently updated - * value of rcu->cur. - */ - smp_mb(); } /* @@ -225,6 +328,7 @@ static inline void raise_rcu_softirq(void) */ static void rcu_do_batch(struct rcu_data *rdp) { + unsigned long flags; struct rcu_head *next, *list; int count = 0; @@ -239,9 +343,9 @@ static void rcu_do_batch(struct rcu_data *rdp) } rdp->donelist = list; - local_irq_disable(); + local_irq_save(flags); rdp->qlen -= count; - local_irq_enable(); + local_irq_restore(flags); if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) rdp->blimit = blimit; @@ -269,6 +373,7 @@ static void rcu_do_batch(struct rcu_data *rdp) * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace * period (if necessary). */ + /* * Register a new batch of callbacks, and start it up if there is currently no * active batch and the batch to be registered has not already occurred. @@ -276,15 +381,10 @@ static void rcu_do_batch(struct rcu_data *rdp) */ static void rcu_start_batch(struct rcu_ctrlblk *rcp) { - if (rcp->next_pending && + if (rcp->cur != rcp->pending && rcp->completed == rcp->cur) { - rcp->next_pending = 0; - /* - * next_pending == 0 must be visible in - * __rcu_process_callbacks() before it can see new value of cur. - */ - smp_wmb(); rcp->cur++; + record_gp_stall_check_time(rcp); /* * Accessing nohz_cpu_mask before incrementing rcp->cur needs a @@ -322,6 +422,8 @@ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + unsigned long flags; + if (rdp->quiescbatch != rcp->cur) { /* start new grace period: */ rdp->qs_pending = 1; @@ -345,7 +447,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, return; rdp->qs_pending = 0; - spin_lock(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags); /* * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync * during cpu startup. Ignore the quiescent state. @@ -353,7 +455,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, if (likely(rdp->quiescbatch == rcp->cur)) cpu_quiet(rdp->cpu, rcp); - spin_unlock(&rcp->lock); + spin_unlock_irqrestore(&rcp->lock, flags); } @@ -364,33 +466,38 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, * which is dead and hence not processing interrupts. */ static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, - struct rcu_head **tail) + struct rcu_head **tail, long batch) { - local_irq_disable(); - *this_rdp->nxttail = list; - if (list) - this_rdp->nxttail = tail; - local_irq_enable(); + unsigned long flags; + + if (list) { + local_irq_save(flags); + this_rdp->batch = batch; + *this_rdp->nxttail[2] = list; + this_rdp->nxttail[2] = tail; + local_irq_restore(flags); + } } static void __rcu_offline_cpu(struct rcu_data *this_rdp, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - /* if the cpu going offline owns the grace period + unsigned long flags; + + /* + * if the cpu going offline owns the grace period * we can block indefinitely waiting for it, so flush * it here */ - spin_lock_bh(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags); if (rcp->cur != rcp->completed) cpu_quiet(rdp->cpu, rcp); - spin_unlock_bh(&rcp->lock); - rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); - rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); - rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); + rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1); + rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1); + spin_unlock(&rcp->lock); - local_irq_disable(); this_rdp->qlen += rdp->qlen; - local_irq_enable(); + local_irq_restore(flags); } static void rcu_offline_cpu(int cpu) @@ -420,38 +527,52 @@ static void rcu_offline_cpu(int cpu) static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { - *rdp->donetail = rdp->curlist; - rdp->donetail = rdp->curtail; - rdp->curlist = NULL; - rdp->curtail = &rdp->curlist; - } + unsigned long flags; + long completed_snap; - if (rdp->nxtlist && !rdp->curlist) { - local_irq_disable(); - rdp->curlist = rdp->nxtlist; - rdp->curtail = rdp->nxttail; - rdp->nxtlist = NULL; - rdp->nxttail = &rdp->nxtlist; - local_irq_enable(); + if (rdp->nxtlist) { + local_irq_save(flags); + completed_snap = ACCESS_ONCE(rcp->completed); /* - * start the next batch of callbacks + * move the other grace-period-completed entries to + * [rdp->nxtlist, *rdp->nxttail[0]) temporarily */ + if (!rcu_batch_before(completed_snap, rdp->batch)) + rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2]; + else if (!rcu_batch_before(completed_snap, rdp->batch - 1)) + rdp->nxttail[0] = rdp->nxttail[1]; - /* determine batch number */ - rdp->batch = rcp->cur + 1; - /* see the comment and corresponding wmb() in - * the rcu_start_batch() + /* + * the grace period for entries in + * [rdp->nxtlist, *rdp->nxttail[0]) has completed and + * move these entries to donelist */ - smp_rmb(); + if (rdp->nxttail[0] != &rdp->nxtlist) { + *rdp->donetail = rdp->nxtlist; + rdp->donetail = rdp->nxttail[0]; + rdp->nxtlist = *rdp->nxttail[0]; + *rdp->donetail = NULL; + + if (rdp->nxttail[1] == rdp->nxttail[0]) + rdp->nxttail[1] = &rdp->nxtlist; + if (rdp->nxttail[2] == rdp->nxttail[0]) + rdp->nxttail[2] = &rdp->nxtlist; + rdp->nxttail[0] = &rdp->nxtlist; + } + + local_irq_restore(flags); + + if (rcu_batch_after(rdp->batch, rcp->pending)) { + unsigned long flags2; - if (!rcp->next_pending) { /* and start it/schedule start if it's a new batch */ - spin_lock(&rcp->lock); - rcp->next_pending = 1; - rcu_start_batch(rcp); - spin_unlock(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags2); + if (rcu_batch_after(rdp->batch, rcp->pending)) { + rcp->pending = rdp->batch; + rcu_start_batch(rcp); + } + spin_unlock_irqrestore(&rcp->lock, flags2); } } @@ -462,21 +583,53 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, static void rcu_process_callbacks(struct softirq_action *unused) { + /* + * Memory references from any prior RCU read-side critical sections + * executed by the interrupted code must be see before any RCU + * grace-period manupulations below. + */ + + smp_mb(); /* See above block comment. */ + __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); + + /* + * Memory references from any later RCU read-side critical sections + * executed by the interrupted code must be see after any RCU + * grace-period manupulations above. + */ + + smp_mb(); /* See above block comment. */ } static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - /* This cpu has pending rcu entries and the grace period - * for them has completed. - */ - if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) - return 1; + /* Check for CPU stalls, if enabled. */ + check_cpu_stall(rcp); - /* This cpu has no pending entries, but there are new entries */ - if (!rdp->curlist && rdp->nxtlist) - return 1; + if (rdp->nxtlist) { + long completed_snap = ACCESS_ONCE(rcp->completed); + + /* + * This cpu has pending rcu entries and the grace period + * for them has completed. + */ + if (!rcu_batch_before(completed_snap, rdp->batch)) + return 1; + if (!rcu_batch_before(completed_snap, rdp->batch - 1) && + rdp->nxttail[0] != rdp->nxttail[1]) + return 1; + if (rdp->nxttail[0] != &rdp->nxtlist) + return 1; + + /* + * This cpu has pending rcu entries and the new batch + * for then hasn't been started nor scheduled start + */ + if (rcu_batch_after(rdp->batch, rcp->pending)) + return 1; + } /* This cpu has finished callbacks to invoke */ if (rdp->donelist) @@ -512,9 +665,15 @@ int rcu_needs_cpu(int cpu) struct rcu_data *rdp = &per_cpu(rcu_data, cpu); struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); - return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); + return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu); } +/* + * Top-level function driving RCU grace-period detection, normally + * invoked from the scheduler-clock interrupt. This function simply + * increments counters that are read only from softirq by this same + * CPU, so there are no memory barriers required. + */ void rcu_check_callbacks(int cpu, int user) { if (user || @@ -558,14 +717,17 @@ void rcu_check_callbacks(int cpu, int user) static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + unsigned long flags; + + spin_lock_irqsave(&rcp->lock, flags); memset(rdp, 0, sizeof(*rdp)); - rdp->curtail = &rdp->curlist; - rdp->nxttail = &rdp->nxtlist; + rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist; rdp->donetail = &rdp->donelist; rdp->quiescbatch = rcp->completed; rdp->qs_pending = 0; rdp->cpu = cpu; rdp->blimit = blimit; + spin_unlock_irqrestore(&rcp->lock, flags); } static void __cpuinit rcu_online_cpu(int cpu) @@ -610,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = { */ void __init __rcu_init(void) { +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)smp_processor_id()); /* Register notifier for non-boot CPUs */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index f14f372cf6f5..467d5940f624 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -77,6 +77,7 @@ void wakeme_after_rcu(struct rcu_head *head) * sections are delimited by rcu_read_lock() and rcu_read_unlock(), * and may be nested. */ +void synchronize_rcu(void); /* Makes kernel-doc tools happy */ synchronize_rcu_xxx(synchronize_rcu, call_rcu) EXPORT_SYMBOL_GPL(synchronize_rcu); diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 27827931ca0d..ca4bbbe04aa4 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -59,14 +59,6 @@ #include <linux/rcupreempt_trace.h> /* - * Macro that prevents the compiler from reordering accesses, but does - * absolutely -nothing- to prevent CPUs from reordering. This is used - * only to mediate communication between mainline code and hardware - * interrupt and NMI handlers. - */ -#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) - -/* * PREEMPT_RCU data structures. */ diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c index 5edf82c34bbc..35c2d3360ecf 100644 --- a/kernel/rcupreempt_trace.c +++ b/kernel/rcupreempt_trace.c @@ -308,11 +308,16 @@ out: static int __init rcupreempt_trace_init(void) { + int ret; + mutex_init(&rcupreempt_trace_mutex); rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL); if (!rcupreempt_trace_buf) return 1; - return rcupreempt_debugfs_init(); + ret = rcupreempt_debugfs_init(); + if (ret) + kfree(rcupreempt_trace_buf); + return ret; } static void __exit rcupreempt_trace_cleanup(void) diff --git a/kernel/relay.c b/kernel/relay.c index 7de644cdec43..8d13a7855c08 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -407,6 +407,35 @@ void relay_reset(struct rchan *chan) } EXPORT_SYMBOL_GPL(relay_reset); +static inline void relay_set_buf_dentry(struct rchan_buf *buf, + struct dentry *dentry) +{ + buf->dentry = dentry; + buf->dentry->d_inode->i_size = buf->early_bytes; +} + +static struct dentry *relay_create_buf_file(struct rchan *chan, + struct rchan_buf *buf, + unsigned int cpu) +{ + struct dentry *dentry; + char *tmpname; + + tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL); + if (!tmpname) + return NULL; + snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu); + + /* Create file in fs */ + dentry = chan->cb->create_buf_file(tmpname, chan->parent, + S_IRUSR, buf, + &chan->is_global); + + kfree(tmpname); + + return dentry; +} + /* * relay_open_buf - create a new relay channel buffer * @@ -416,45 +445,34 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) { struct rchan_buf *buf = NULL; struct dentry *dentry; - char *tmpname; if (chan->is_global) return chan->buf[0]; - tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL); - if (!tmpname) - goto end; - snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu); - buf = relay_create_buf(chan); if (!buf) - goto free_name; + return NULL; + + if (chan->has_base_filename) { + dentry = relay_create_buf_file(chan, buf, cpu); + if (!dentry) + goto free_buf; + relay_set_buf_dentry(buf, dentry); + } buf->cpu = cpu; __relay_reset(buf, 1); - /* Create file in fs */ - dentry = chan->cb->create_buf_file(tmpname, chan->parent, S_IRUSR, - buf, &chan->is_global); - if (!dentry) - goto free_buf; - - buf->dentry = dentry; - if(chan->is_global) { chan->buf[0] = buf; buf->cpu = 0; } - goto free_name; + return buf; free_buf: relay_destroy_buf(buf); - buf = NULL; -free_name: - kfree(tmpname); -end: - return buf; + return NULL; } /** @@ -537,8 +555,8 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, /** * relay_open - create a new relay channel - * @base_filename: base name of files to create - * @parent: dentry of parent directory, %NULL for root directory + * @base_filename: base name of files to create, %NULL for buffering only + * @parent: dentry of parent directory, %NULL for root directory or buffer * @subbuf_size: size of sub-buffers * @n_subbufs: number of sub-buffers * @cb: client callback functions @@ -560,8 +578,6 @@ struct rchan *relay_open(const char *base_filename, { unsigned int i; struct rchan *chan; - if (!base_filename) - return NULL; if (!(subbuf_size && n_subbufs)) return NULL; @@ -576,7 +592,10 @@ struct rchan *relay_open(const char *base_filename, chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs); chan->parent = parent; chan->private_data = private_data; - strlcpy(chan->base_filename, base_filename, NAME_MAX); + if (base_filename) { + chan->has_base_filename = 1; + strlcpy(chan->base_filename, base_filename, NAME_MAX); + } setup_callbacks(chan, cb); kref_init(&chan->kref); @@ -604,6 +623,94 @@ free_bufs: } EXPORT_SYMBOL_GPL(relay_open); +struct rchan_percpu_buf_dispatcher { + struct rchan_buf *buf; + struct dentry *dentry; +}; + +/* Called in atomic context. */ +static void __relay_set_buf_dentry(void *info) +{ + struct rchan_percpu_buf_dispatcher *p = info; + + relay_set_buf_dentry(p->buf, p->dentry); +} + +/** + * relay_late_setup_files - triggers file creation + * @chan: channel to operate on + * @base_filename: base name of files to create + * @parent: dentry of parent directory, %NULL for root directory + * + * Returns 0 if successful, non-zero otherwise. + * + * Use to setup files for a previously buffer-only channel. + * Useful to do early tracing in kernel, before VFS is up, for example. + */ +int relay_late_setup_files(struct rchan *chan, + const char *base_filename, + struct dentry *parent) +{ + int err = 0; + unsigned int i, curr_cpu; + unsigned long flags; + struct dentry *dentry; + struct rchan_percpu_buf_dispatcher disp; + + if (!chan || !base_filename) + return -EINVAL; + + strlcpy(chan->base_filename, base_filename, NAME_MAX); + + mutex_lock(&relay_channels_mutex); + /* Is chan already set up? */ + if (unlikely(chan->has_base_filename)) + return -EEXIST; + chan->has_base_filename = 1; + chan->parent = parent; + curr_cpu = get_cpu(); + /* + * The CPU hotplug notifier ran before us and created buffers with + * no files associated. So it's safe to call relay_setup_buf_file() + * on all currently online CPUs. + */ + for_each_online_cpu(i) { + if (unlikely(!chan->buf[i])) { + printk(KERN_ERR "relay_late_setup_files: CPU %u " + "has no buffer, it must have!\n", i); + BUG(); + err = -EINVAL; + break; + } + + dentry = relay_create_buf_file(chan, chan->buf[i], i); + if (unlikely(!dentry)) { + err = -EINVAL; + break; + } + + if (curr_cpu == i) { + local_irq_save(flags); + relay_set_buf_dentry(chan->buf[i], dentry); + local_irq_restore(flags); + } else { + disp.buf = chan->buf[i]; + disp.dentry = dentry; + smp_mb(); + /* relay_channels_mutex must be held, so wait. */ + err = smp_call_function_single(i, + __relay_set_buf_dentry, + &disp, 1); + } + if (unlikely(err)) + break; + } + put_cpu(); + mutex_unlock(&relay_channels_mutex); + + return err; +} + /** * relay_switch_subbuf - switch to a new sub-buffer * @buf: channel buffer @@ -627,8 +734,13 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; buf->padding[old_subbuf] = buf->prev_padding; buf->subbufs_produced++; - buf->dentry->d_inode->i_size += buf->chan->subbuf_size - - buf->padding[old_subbuf]; + if (buf->dentry) + buf->dentry->d_inode->i_size += + buf->chan->subbuf_size - + buf->padding[old_subbuf]; + else + buf->early_bytes += buf->chan->subbuf_size - + buf->padding[old_subbuf]; smp_mb(); if (waitqueue_active(&buf->read_wait)) /* @@ -832,6 +944,10 @@ static void relay_file_read_consume(struct rchan_buf *buf, size_t n_subbufs = buf->chan->n_subbufs; size_t read_subbuf; + if (buf->subbufs_produced == buf->subbufs_consumed && + buf->offset == buf->bytes_consumed) + return; + if (buf->bytes_consumed + bytes_consumed > subbuf_size) { relay_subbufs_consumed(buf->chan, buf->cpu, 1); buf->bytes_consumed = 0; @@ -863,6 +979,8 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) relay_file_read_consume(buf, read_pos, 0); + consumed = buf->subbufs_consumed; + if (unlikely(buf->offset > subbuf_size)) { if (produced == consumed) return 0; @@ -881,8 +999,12 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) if (consumed > produced) produced += n_subbufs * subbuf_size; - if (consumed == produced) + if (consumed == produced) { + if (buf->offset == subbuf_size && + buf->subbufs_produced > buf->subbufs_consumed) + return 1; return 0; + } return 1; } @@ -1237,4 +1359,4 @@ static __init int relay_init(void) return 0; } -module_init(relay_init); +early_initcall(relay_init); diff --git a/kernel/resource.c b/kernel/resource.c index 74af2d7cb5a1..414d6fc9131e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -362,35 +362,21 @@ int allocate_resource(struct resource *root, struct resource *new, EXPORT_SYMBOL(allocate_resource); -/** - * insert_resource - Inserts a resource in the resource tree - * @parent: parent of the new resource - * @new: new resource to insert - * - * Returns 0 on success, -EBUSY if the resource can't be inserted. - * - * This function is equivalent to request_resource when no conflict - * happens. If a conflict happens, and the conflicting resources - * entirely fit within the range of the new resource, then the new - * resource is inserted and the conflicting resources become children of - * the new resource. +/* + * Insert a resource into the resource tree. If successful, return NULL, + * otherwise return the conflicting resource (compare to __request_resource()) */ -int insert_resource(struct resource *parent, struct resource *new) +static struct resource * __insert_resource(struct resource *parent, struct resource *new) { - int result; struct resource *first, *next; - write_lock(&resource_lock); - for (;; parent = first) { - result = 0; first = __request_resource(parent, new); if (!first) - goto out; + return first; - result = -EBUSY; if (first == parent) - goto out; + return first; if ((first->start > new->start) || (first->end < new->end)) break; @@ -401,15 +387,13 @@ int insert_resource(struct resource *parent, struct resource *new) for (next = first; ; next = next->sibling) { /* Partial overlap? Bad, and unfixable */ if (next->start < new->start || next->end > new->end) - goto out; + return next; if (!next->sibling) break; if (next->sibling->start > new->end) break; } - result = 0; - new->parent = parent; new->sibling = next->sibling; new->child = first; @@ -426,10 +410,64 @@ int insert_resource(struct resource *parent, struct resource *new) next = next->sibling; next->sibling = new; } + return NULL; +} - out: +/** + * insert_resource - Inserts a resource in the resource tree + * @parent: parent of the new resource + * @new: new resource to insert + * + * Returns 0 on success, -EBUSY if the resource can't be inserted. + * + * This function is equivalent to request_resource when no conflict + * happens. If a conflict happens, and the conflicting resources + * entirely fit within the range of the new resource, then the new + * resource is inserted and the conflicting resources become children of + * the new resource. + */ +int insert_resource(struct resource *parent, struct resource *new) +{ + struct resource *conflict; + + write_lock(&resource_lock); + conflict = __insert_resource(parent, new); + write_unlock(&resource_lock); + return conflict ? -EBUSY : 0; +} + +/** + * insert_resource_expand_to_fit - Insert a resource into the resource tree + * @root: root resource descriptor + * @new: new resource to insert + * + * Insert a resource into the resource tree, possibly expanding it in order + * to make it encompass any conflicting resources. + */ +void insert_resource_expand_to_fit(struct resource *root, struct resource *new) +{ + if (new->parent) + return; + + write_lock(&resource_lock); + for (;;) { + struct resource *conflict; + + conflict = __insert_resource(root, new); + if (!conflict) + break; + if (conflict == root) + break; + + /* Ok, expand resource to cover the conflict, then try again .. */ + if (conflict->start < new->start) + new->start = conflict->start; + if (conflict->end > new->end) + new->end = conflict->end; + + printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name); + } write_unlock(&resource_lock); - return result; } /** @@ -478,6 +516,74 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t return result; } +static void __init __reserve_region_with_split(struct resource *root, + resource_size_t start, resource_size_t end, + const char *name) +{ + struct resource *parent = root; + struct resource *conflict; + struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); + + if (!res) + return; + + res->name = name; + res->start = start; + res->end = end; + res->flags = IORESOURCE_BUSY; + + for (;;) { + conflict = __request_resource(parent, res); + if (!conflict) + break; + if (conflict != parent) { + parent = conflict; + if (!(conflict->flags & IORESOURCE_BUSY)) + continue; + } + + /* Uhhuh, that didn't work out.. */ + kfree(res); + res = NULL; + break; + } + + if (!res) { + printk(KERN_DEBUG " __reserve_region_with_split: (%s) [%llx, %llx], res: (%s) [%llx, %llx]\n", + conflict->name, conflict->start, conflict->end, + name, start, end); + + /* failed, split and try again */ + + /* conflict coverred whole area */ + if (conflict->start <= start && conflict->end >= end) + return; + + if (conflict->start > start) + __reserve_region_with_split(root, start, conflict->start-1, name); + if (!(conflict->flags & IORESOURCE_BUSY)) { + resource_size_t common_start, common_end; + + common_start = max(conflict->start, start); + common_end = min(conflict->end, end); + if (common_start < common_end) + __reserve_region_with_split(root, common_start, common_end, name); + } + if (conflict->end < end) + __reserve_region_with_split(root, conflict->end+1, end, name); + } + +} + +void reserve_region_with_split(struct resource *root, + resource_size_t start, resource_size_t end, + const char *name) +{ + write_lock(&resource_lock); + __reserve_region_with_split(root, start, end, name); + write_unlock(&resource_lock); +} + EXPORT_SYMBOL(adjust_resource); /** @@ -490,7 +596,7 @@ resource_size_t resource_alignment(struct resource *res) { switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) { case IORESOURCE_SIZEALIGN: - return res->end - res->start + 1; + return resource_size(res); case IORESOURCE_STARTALIGN: return res->start; default: diff --git a/kernel/sched.c b/kernel/sched.c index 0047bd9b96aa..6f230596bd0c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -201,14 +201,19 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rt_b->rt_period_timer.function = sched_rt_period_timer; - rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; +} + +static inline int rt_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; } static void start_rt_bandwidth(struct rt_bandwidth *rt_b) { ktime_t now; - if (rt_b->rt_runtime == RUNTIME_INF) + if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) return; if (hrtimer_active(&rt_b->rt_period_timer)) @@ -298,9 +303,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; #endif /* CONFIG_RT_GROUP_SCHED */ -#else /* !CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_USER_SCHED */ /* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. @@ -600,14 +605,13 @@ struct rq { /* BKL stats */ unsigned int bkl_count; #endif - struct lock_class_key rq_lock_key; }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) +static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) { - rq->curr->sched_class->check_preempt_curr(rq, p); + rq->curr->sched_class->check_preempt_curr(rq, p, sync); } static inline int cpu_of(struct rq *rq) @@ -809,9 +813,9 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; /* * ratelimit for updating the group shares. - * default: 0.5ms + * default: 0.25ms */ -const_debug unsigned int sysctl_sched_shares_ratelimit = 500000; +unsigned int sysctl_sched_shares_ratelimit = 250000; /* * period over which we measure -rt task cpu usage in us. @@ -834,7 +838,7 @@ static inline u64 global_rt_period(void) static inline u64 global_rt_runtime(void) { - if (sysctl_sched_rt_period < 0) + if (sysctl_sched_rt_runtime < 0) return RUNTIME_INF; return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; @@ -1088,7 +1092,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_DONE; } -static void init_hrtick(void) +static __init void init_hrtick(void) { hotcpu_notifier(hotplug_hrtick, 0); } @@ -1103,7 +1107,7 @@ static void hrtick_start(struct rq *rq, u64 delay) hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); } -static void init_hrtick(void) +static inline void init_hrtick(void) { } #endif /* CONFIG_SMP */ @@ -1120,9 +1124,9 @@ static void init_rq_hrtick(struct rq *rq) hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rq->hrtick_timer.function = hrtick; - rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; } -#else +#else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) { } @@ -1134,7 +1138,7 @@ static inline void init_rq_hrtick(struct rq *rq) static inline void init_hrtick(void) { } -#endif +#endif /* CONFIG_SCHED_HRTICK */ /* * resched_task - mark a task 'to be rescheduled now'. @@ -1381,38 +1385,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) update_load_sub(&rq->load, load); } -#ifdef CONFIG_SMP -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); -static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); - -static unsigned long cpu_avg_load_per_task(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - if (rq->nr_running) - rq->avg_load_per_task = rq->load.weight / rq->nr_running; - - return rq->avg_load_per_task; -} - -#ifdef CONFIG_FAIR_GROUP_SCHED - -typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); +#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) +typedef int (*tg_visitor)(struct task_group *, void *); /* * Iterate the full tree, calling @down when first entering a node and @up when * leaving it for the final time. */ -static void -walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd) +static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) { struct task_group *parent, *child; + int ret; rcu_read_lock(); parent = &root_task_group; down: - (*down)(parent, cpu, sd); + ret = (*down)(parent, data); + if (ret) + goto out_unlock; list_for_each_entry_rcu(child, &parent->children, siblings) { parent = child; goto down; @@ -1420,15 +1410,43 @@ down: up: continue; } - (*up)(parent, cpu, sd); + ret = (*up)(parent, data); + if (ret) + goto out_unlock; child = parent; parent = parent->parent; if (parent) goto up; +out_unlock: rcu_read_unlock(); + + return ret; } +static int tg_nop(struct task_group *tg, void *data) +{ + return 0; +} +#endif + +#ifdef CONFIG_SMP +static unsigned long source_load(int cpu, int type); +static unsigned long target_load(int cpu, int type); +static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); + +static unsigned long cpu_avg_load_per_task(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->nr_running) + rq->avg_load_per_task = rq->load.weight / rq->nr_running; + + return rq->avg_load_per_task; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED + static void __set_se_shares(struct sched_entity *se, unsigned long shares); /* @@ -1487,11 +1505,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu, * This needs to be done in a bottom-up fashion because the rq weight of a * parent group depends on the shares of its child groups. */ -static void -tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) +static int tg_shares_up(struct task_group *tg, void *data) { unsigned long rq_weight = 0; unsigned long shares = 0; + struct sched_domain *sd = data; int i; for_each_cpu_mask(i, sd->span) { @@ -1516,6 +1534,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) __update_group_shares_cpu(tg, i, shares, rq_weight); spin_unlock_irqrestore(&rq->lock, flags); } + + return 0; } /* @@ -1523,10 +1543,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) * This needs to be done in a top-down fashion because the load of a child * group is a fraction of its parents load. */ -static void -tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) +static int tg_load_down(struct task_group *tg, void *data) { unsigned long load; + long cpu = (long)data; if (!tg->parent) { load = cpu_rq(cpu)->load.weight; @@ -1537,11 +1557,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) } tg->cfs_rq[cpu]->h_load = load; -} -static void -tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) -{ + return 0; } static void update_shares(struct sched_domain *sd) @@ -1551,7 +1568,7 @@ static void update_shares(struct sched_domain *sd) if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { sd->last_update = now; - walk_tg_tree(tg_nop, tg_shares_up, 0, sd); + walk_tg_tree(tg_nop, tg_shares_up, sd); } } @@ -1562,9 +1579,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) spin_lock(&rq->lock); } -static void update_h_load(int cpu) +static void update_h_load(long cpu) { - walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); + walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } #else @@ -1867,16 +1884,24 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) /* * wait_task_inactive - wait for a thread to unschedule. * + * If @match_state is nonzero, it's the @p->state value just checked and + * not expected to change. If it changes, i.e. @p might have woken up, + * then return zero. When we succeed in waiting for @p to be off its CPU, + * we return a positive number (its total switch count). If a second call + * a short while later returns the same number, the caller can be sure that + * @p has remained unscheduled the whole time. + * * The caller must ensure that the task *will* unschedule sometime soon, * else this function might spin for a *long* time. This function can't * be called with interrupts off, or it may introduce deadlock with * smp_call_function() if an IPI is sent by the same process we are * waiting to become inactive. */ -void wait_task_inactive(struct task_struct *p) +unsigned long wait_task_inactive(struct task_struct *p, long match_state) { unsigned long flags; int running, on_rq; + unsigned long ncsw; struct rq *rq; for (;;) { @@ -1899,8 +1924,11 @@ void wait_task_inactive(struct task_struct *p) * return false if the runqueue has changed and p * is actually now running somewhere else! */ - while (task_running(rq, p)) + while (task_running(rq, p)) { + if (match_state && unlikely(p->state != match_state)) + return 0; cpu_relax(); + } /* * Ok, time to look more closely! We need the rq @@ -1910,9 +1938,18 @@ void wait_task_inactive(struct task_struct *p) rq = task_rq_lock(p, &flags); running = task_running(rq, p); on_rq = p->se.on_rq; + ncsw = 0; + if (!match_state || p->state == match_state) + ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ task_rq_unlock(rq, &flags); /* + * If it changed from the expected state, bail out now. + */ + if (unlikely(!ncsw)) + break; + + /* * Was it really running after all now that we * checked with the proper locks actually held? * @@ -1944,6 +1981,8 @@ void wait_task_inactive(struct task_struct *p) */ break; } + + return ncsw; } /*** @@ -2261,7 +2300,7 @@ out_running: trace_mark(kernel_sched_wakeup, "pid %d state %ld ## rq %p task %p rq->curr %p", p->pid, p->state, rq, p, rq->curr); - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, sync); p->state = TASK_RUNNING; #ifdef CONFIG_SMP @@ -2396,7 +2435,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) trace_mark(kernel_sched_wakeup_new, "pid %d state %ld ## rq %p task %p rq->curr %p", p->pid, p->state, rq, p, rq->curr); - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) p->sched_class->task_wake_up(rq, p); @@ -2734,10 +2773,10 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2) } else { if (rq1 < rq2) { spin_lock(&rq1->lock); - spin_lock(&rq2->lock); + spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); } else { spin_lock(&rq2->lock); - spin_lock(&rq1->lock); + spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); } } update_rq_clock(rq1); @@ -2780,14 +2819,21 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest) if (busiest < this_rq) { spin_unlock(&this_rq->lock); spin_lock(&busiest->lock); - spin_lock(&this_rq->lock); + spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); ret = 1; } else - spin_lock(&busiest->lock); + spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); } return ret; } +static void double_unlock_balance(struct rq *this_rq, struct rq *busiest) + __releases(busiest->lock) +{ + spin_unlock(&busiest->lock); + lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); +} + /* * If dest_cpu is allowed for this process, migrate the task to it. * This is accomplished by forcing the cpu_allowed mask to only @@ -2849,7 +2895,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. */ - check_preempt_curr(this_rq, p); + check_preempt_curr(this_rq, p, 0); } /* @@ -3612,7 +3658,7 @@ redo: ld_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, CPU_NEWLY_IDLE, &all_pinned); - spin_unlock(&busiest->lock); + double_unlock_balance(this_rq, busiest); if (unlikely(all_pinned)) { cpu_clear(cpu_of(busiest), *cpus); @@ -3727,7 +3773,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) else schedstat_inc(sd, alb_failed); } - spin_unlock(&target_rq->lock); + double_unlock_balance(busiest_rq, target_rq); } #ifdef CONFIG_NO_HZ @@ -4148,6 +4194,65 @@ void account_steal_time(struct task_struct *p, cputime_t steal) } /* + * Use precise platform statistics if available: + */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +cputime_t task_utime(struct task_struct *p) +{ + return p->utime; +} + +cputime_t task_stime(struct task_struct *p) +{ + return p->stime; +} +#else +cputime_t task_utime(struct task_struct *p) +{ + clock_t utime = cputime_to_clock_t(p->utime), + total = utime + cputime_to_clock_t(p->stime); + u64 temp; + + /* + * Use CFS's precise accounting: + */ + temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); + + if (total) { + temp *= utime; + do_div(temp, total); + } + utime = (clock_t)temp; + + p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); + return p->prev_utime; +} + +cputime_t task_stime(struct task_struct *p) +{ + clock_t stime; + + /* + * Use CFS's precise accounting. (we subtract utime from + * the total, to make sure the total observed by userspace + * grows monotonically - apps rely on that): + */ + stime = nsec_to_clock_t(p->se.sum_exec_runtime) - + cputime_to_clock_t(task_utime(p)); + + if (stime >= 0) + p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); + + return p->prev_stime; +} +#endif + +inline cputime_t task_gtime(struct task_struct *p) +{ + return p->gtime; +} + +/* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. * @@ -4537,6 +4642,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ +/** + * complete: - signals a single thread waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up a single thread waiting on this completion. Threads will be + * awakened in the same order in which they were queued. + * + * See also complete_all(), wait_for_completion() and related routines. + */ void complete(struct completion *x) { unsigned long flags; @@ -4548,6 +4662,12 @@ void complete(struct completion *x) } EXPORT_SYMBOL(complete); +/** + * complete_all: - signals all threads waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up all threads waiting on this particular completion event. + */ void complete_all(struct completion *x) { unsigned long flags; @@ -4568,10 +4688,7 @@ do_wait_for_common(struct completion *x, long timeout, int state) wait.flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue_tail(&x->wait, &wait); do { - if ((state == TASK_INTERRUPTIBLE && - signal_pending(current)) || - (state == TASK_KILLABLE && - fatal_signal_pending(current))) { + if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; break; } @@ -4599,12 +4716,31 @@ wait_for_common(struct completion *x, long timeout, int state) return timeout; } +/** + * wait_for_completion: - waits for completion of a task + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. + * + * See also similar routines (i.e. wait_for_completion_timeout()) with timeout + * and interrupt capability. Also see complete(). + */ void __sched wait_for_completion(struct completion *x) { wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(wait_for_completion); +/** + * wait_for_completion_timeout: - waits for completion of a task (w/timeout) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. It is not + * interruptible. + */ unsigned long __sched wait_for_completion_timeout(struct completion *x, unsigned long timeout) { @@ -4612,6 +4748,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout) } EXPORT_SYMBOL(wait_for_completion_timeout); +/** + * wait_for_completion_interruptible: - waits for completion of a task (w/intr) + * @x: holds the state of this particular completion + * + * This waits for completion of a specific task to be signaled. It is + * interruptible. + */ int __sched wait_for_completion_interruptible(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); @@ -4621,6 +4764,14 @@ int __sched wait_for_completion_interruptible(struct completion *x) } EXPORT_SYMBOL(wait_for_completion_interruptible); +/** + * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. It is interruptible. The timeout is in jiffies. + */ unsigned long __sched wait_for_completion_interruptible_timeout(struct completion *x, unsigned long timeout) @@ -4629,6 +4780,13 @@ wait_for_completion_interruptible_timeout(struct completion *x, } EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); +/** + * wait_for_completion_killable: - waits for completion of a task (killable) + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It can be + * interrupted by a kill signal. + */ int __sched wait_for_completion_killable(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); @@ -4638,6 +4796,52 @@ int __sched wait_for_completion_killable(struct completion *x) } EXPORT_SYMBOL(wait_for_completion_killable); +/** + * try_wait_for_completion - try to decrement a completion without blocking + * @x: completion structure + * + * Returns: 0 if a decrement cannot be done without blocking + * 1 if a decrement succeeded. + * + * If a completion is being used as a counting completion, + * attempt to decrement the counter without blocking. This + * enables us to avoid waiting if the resource the completion + * is protecting is not available. + */ +bool try_wait_for_completion(struct completion *x) +{ + int ret = 1; + + spin_lock_irq(&x->wait.lock); + if (!x->done) + ret = 0; + else + x->done--; + spin_unlock_irq(&x->wait.lock); + return ret; +} +EXPORT_SYMBOL(try_wait_for_completion); + +/** + * completion_done - Test to see if a completion has any waiters + * @x: completion structure + * + * Returns: 0 if there are waiters (wait_for_completion() in progress) + * 1 if there are no waiters. + * + */ +bool completion_done(struct completion *x) +{ + int ret = 1; + + spin_lock_irq(&x->wait.lock); + if (!x->done) + ret = 0; + spin_unlock_irq(&x->wait.lock); + return ret; +} +EXPORT_SYMBOL(completion_done); + static long __sched sleep_on_common(wait_queue_head_t *q, int state, long timeout) { @@ -4979,19 +5183,22 @@ recheck: return -EPERM; } + if (user) { #ifdef CONFIG_RT_GROUP_SCHED - /* - * Do not allow realtime tasks into groups that have no runtime - * assigned. - */ - if (user - && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) - return -EPERM; + /* + * Do not allow realtime tasks into groups that have no runtime + * assigned. + */ + if (rt_bandwidth_enabled() && rt_policy(policy) && + task_group(p)->rt_bandwidth.rt_runtime == 0) + return -EPERM; #endif - retval = security_task_setscheduler(p, policy, param); - if (retval) - return retval; + retval = security_task_setscheduler(p, policy, param); + if (retval) + return retval; + } + /* * make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: @@ -5707,6 +5914,8 @@ static inline void sched_init_granularity(void) sysctl_sched_latency = limit; sysctl_sched_wakeup_granularity *= factor; + + sysctl_sched_shares_ratelimit *= factor; } #ifdef CONFIG_SMP @@ -5817,7 +6026,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) set_task_cpu(p, dest_cpu); if (on_rq) { activate_task(rq_dest, p, 0); - check_preempt_curr(rq_dest, p); + check_preempt_curr(rq_dest, p, 0); } done: ret = 1; @@ -6142,7 +6351,7 @@ set_table_entry(struct ctl_table *entry, static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(12); + struct ctl_table *table = sd_alloc_ctl_entry(13); if (table == NULL) return NULL; @@ -6170,7 +6379,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[10], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); - /* &table[11] is terminator */ + set_table_entry(&table[11], "name", sd->name, + CORENAME_MAX_SIZE, 0444, proc_dostring); + /* &table[12] is terminator */ return table; } @@ -6389,7 +6600,7 @@ static struct notifier_block __cpuinitdata migration_notifier = { .priority = 10 }; -void __init migration_init(void) +static int __init migration_init(void) { void *cpu = (void *)(long)smp_processor_id(); int err; @@ -6399,7 +6610,10 @@ void __init migration_init(void) BUG_ON(err == NOTIFY_BAD); migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); + + return err; } +early_initcall(migration_init); #endif #ifdef CONFIG_SMP @@ -7051,13 +7265,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) * Non-inlined to reduce accumulated stack pressure in build_sched_domains() */ +#ifdef CONFIG_SCHED_DEBUG +# define SD_INIT_NAME(sd, type) sd->name = #type +#else +# define SD_INIT_NAME(sd, type) do { } while (0) +#endif + #define SD_INIT(sd, type) sd_init_##type(sd) + #define SD_INIT_FUNC(type) \ static noinline void sd_init_##type(struct sched_domain *sd) \ { \ memset(sd, 0, sizeof(*sd)); \ *sd = SD_##type##_INIT; \ sd->level = SD_LV_##type; \ + SD_INIT_NAME(sd, type); \ } SD_INIT_FUNC(CPU) @@ -7553,24 +7775,27 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * and partition_sched_domains() will fallback to the single partition * 'fallback_doms', it also forces the domains to be rebuilt. * + * If doms_new==NULL it will be replaced with cpu_online_map. + * ndoms_new==0 is a special case for destroying existing domains. + * It will not create the default domain. + * * Call with hotplug lock held */ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, struct sched_domain_attr *dattr_new) { - int i, j; + int i, j, n; mutex_lock(&sched_domains_mutex); /* always unregister in case we don't destroy any domains */ unregister_sched_domain_sysctl(); - if (doms_new == NULL) - ndoms_new = 0; + n = doms_new ? ndoms_new : 0; /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { - for (j = 0; j < ndoms_new; j++) { + for (j = 0; j < n; j++) { if (cpus_equal(doms_cur[i], doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; @@ -7583,7 +7808,6 @@ match1: if (doms_new == NULL) { ndoms_cur = 0; - ndoms_new = 1; doms_new = &fallback_doms; cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); dattr_new = NULL; @@ -7620,8 +7844,13 @@ match2: int arch_reinit_sched_domains(void) { get_online_cpus(); + + /* Destroy domains first to force the rebuild */ + partition_sched_domains(0, NULL, NULL); + rebuild_sched_domains(); put_online_cpus(); + return 0; } @@ -7643,34 +7872,34 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) } #ifdef CONFIG_SCHED_MC -static ssize_t sched_mc_power_savings_show(struct sys_device *dev, - struct sysdev_attribute *attr, char *page) +static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, + char *page) { return sprintf(page, "%u\n", sched_mc_power_savings); } -static ssize_t sched_mc_power_savings_store(struct sys_device *dev, - struct sysdev_attribute *attr, +static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 0); } -static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, - sched_mc_power_savings_store); +static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, + sched_mc_power_savings_show, + sched_mc_power_savings_store); #endif #ifdef CONFIG_SCHED_SMT -static ssize_t sched_smt_power_savings_show(struct sys_device *dev, - struct sysdev_attribute *attr, char *page) +static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, + char *page) { return sprintf(page, "%u\n", sched_smt_power_savings); } -static ssize_t sched_smt_power_savings_store(struct sys_device *dev, - struct sysdev_attribute *attr, +static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 1); } -static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, +static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, + sched_smt_power_savings_show, sched_smt_power_savings_store); #endif @@ -7705,7 +7934,7 @@ static int update_sched_domains(struct notifier_block *nfb, case CPU_ONLINE_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - partition_sched_domains(0, NULL, NULL); + partition_sched_domains(1, NULL, NULL); return NOTIFY_OK; default: @@ -7970,7 +8199,6 @@ void __init sched_init(void) rq = cpu_rq(i); spin_lock_init(&rq->lock); - lockdep_set_class(&rq->lock, &rq->rq_lock_key); rq->nr_running = 0; init_cfs_rq(&rq->cfs, rq); init_rt_rq(&rq->rt, rq); @@ -8093,20 +8321,25 @@ void __might_sleep(char *file, int line) #ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ - if ((in_atomic() || irqs_disabled()) && - system_state == SYSTEM_RUNNING && !oops_in_progress) { - if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) - return; - prev_jiffy = jiffies; - printk(KERN_ERR "BUG: sleeping function called from invalid" - " context at %s:%d\n", file, line); - printk("in_atomic():%d, irqs_disabled():%d\n", - in_atomic(), irqs_disabled()); - debug_show_held_locks(current); - if (irqs_disabled()) - print_irqtrace_events(current); - dump_stack(); - } + if ((!in_atomic() && !irqs_disabled()) || + system_state != SYSTEM_RUNNING || oops_in_progress) + return; + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + + printk(KERN_ERR + "BUG: sleeping function called from invalid context at %s:%d\n", + file, line); + printk(KERN_ERR + "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), + current->pid, current->comm); + + debug_show_held_locks(current); + if (irqs_disabled()) + print_irqtrace_events(current); + dump_stack(); #endif } EXPORT_SYMBOL(__might_sleep); @@ -8427,8 +8660,8 @@ struct task_group *sched_create_group(struct task_group *parent) WARN_ON(!parent); /* root should already exist */ tg->parent = parent; - list_add_rcu(&tg->siblings, &parent->children); INIT_LIST_HEAD(&tg->children); + list_add_rcu(&tg->siblings, &parent->children); spin_unlock_irqrestore(&task_group_lock, flags); return tg; @@ -8604,73 +8837,95 @@ static DEFINE_MUTEX(rt_constraints_mutex); static unsigned long to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) - return 1ULL << 16; + return 1ULL << 20; - return div64_u64(runtime << 16, period); + return div64_u64(runtime << 20, period); } -#ifdef CONFIG_CGROUP_SCHED -static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) +/* Must be called with tasklist_lock held */ +static inline int tg_has_rt_tasks(struct task_group *tg) { - struct task_group *tgi, *parent = tg->parent; - unsigned long total = 0; + struct task_struct *g, *p; - if (!parent) { - if (global_rt_period() < period) - return 0; + do_each_thread(g, p) { + if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) + return 1; + } while_each_thread(g, p); - return to_ratio(period, runtime) < - to_ratio(global_rt_period(), global_rt_runtime()); - } + return 0; +} - if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) - return 0; +struct rt_schedulable_data { + struct task_group *tg; + u64 rt_period; + u64 rt_runtime; +}; - rcu_read_lock(); - list_for_each_entry_rcu(tgi, &parent->children, siblings) { - if (tgi == tg) - continue; +static int tg_schedulable(struct task_group *tg, void *data) +{ + struct rt_schedulable_data *d = data; + struct task_group *child; + unsigned long total, sum = 0; + u64 period, runtime; + + period = ktime_to_ns(tg->rt_bandwidth.rt_period); + runtime = tg->rt_bandwidth.rt_runtime; - total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), - tgi->rt_bandwidth.rt_runtime); + if (tg == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; } - rcu_read_unlock(); - return total + to_ratio(period, runtime) <= - to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), - parent->rt_bandwidth.rt_runtime); -} -#elif defined CONFIG_USER_SCHED -static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) -{ - struct task_group *tgi; - unsigned long total = 0; - unsigned long global_ratio = - to_ratio(global_rt_period(), global_rt_runtime()); + /* + * Cannot have more runtime than the period. + */ + if (runtime > period && runtime != RUNTIME_INF) + return -EINVAL; - rcu_read_lock(); - list_for_each_entry_rcu(tgi, &task_groups, list) { - if (tgi == tg) - continue; + /* + * Ensure we don't starve existing RT tasks. + */ + if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) + return -EBUSY; + + total = to_ratio(period, runtime); + + /* + * Nobody can have more than the global setting allows. + */ + if (total > to_ratio(global_rt_period(), global_rt_runtime())) + return -EINVAL; + + /* + * The sum of our children's runtime should not exceed our own. + */ + list_for_each_entry_rcu(child, &tg->children, siblings) { + period = ktime_to_ns(child->rt_bandwidth.rt_period); + runtime = child->rt_bandwidth.rt_runtime; - total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), - tgi->rt_bandwidth.rt_runtime); + if (child == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; + } + + sum += to_ratio(period, runtime); } - rcu_read_unlock(); - return total + to_ratio(period, runtime) < global_ratio; + if (sum > total) + return -EINVAL; + + return 0; } -#endif -/* Must be called with tasklist_lock held */ -static inline int tg_has_rt_tasks(struct task_group *tg) +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { - struct task_struct *g, *p; - do_each_thread(g, p) { - if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) - return 1; - } while_each_thread(g, p); - return 0; + struct rt_schedulable_data data = { + .tg = tg, + .rt_period = period, + .rt_runtime = runtime, + }; + + return walk_tg_tree(tg_schedulable, tg_nop, &data); } static int tg_set_bandwidth(struct task_group *tg, @@ -8680,14 +8935,9 @@ static int tg_set_bandwidth(struct task_group *tg, mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); - if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { - err = -EBUSY; - goto unlock; - } - if (!__rt_schedulable(tg, rt_period, rt_runtime)) { - err = -EINVAL; + err = __rt_schedulable(tg, rt_period, rt_runtime); + if (err) goto unlock; - } spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); @@ -8756,16 +9006,25 @@ long sched_group_rt_period(struct task_group *tg) static int sched_rt_global_constraints(void) { - struct task_group *tg = &root_task_group; - u64 rt_runtime, rt_period; + u64 runtime, period; int ret = 0; - rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); - rt_runtime = tg->rt_bandwidth.rt_runtime; + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + + runtime = global_rt_runtime(); + period = global_rt_period(); + + /* + * Sanity check on the sysctl variables. + */ + if (runtime > period && runtime != RUNTIME_INF) + return -EINVAL; mutex_lock(&rt_constraints_mutex); - if (!__rt_schedulable(tg, rt_period, rt_runtime)) - ret = -EINVAL; + read_lock(&tasklist_lock); + ret = __rt_schedulable(NULL, 0, 0); + read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); return ret; @@ -8776,6 +9035,9 @@ static int sched_rt_global_constraints(void) unsigned long flags; int i; + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { struct rt_rq *rt_rq = &cpu_rq(i)->rt; @@ -8836,7 +9098,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) if (!cgrp->parent) { /* This is early initialization for the top cgroup */ - init_task_group.css.cgroup = cgrp; return &init_task_group.css; } @@ -8845,9 +9106,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); - /* Bind the cgroup to task_group object we just created */ - tg->css.cgroup = cgrp; - return &tg->css; } diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 22ed55d1167f..e8ab096ddfe3 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -12,19 +12,17 @@ * * Create a semi stable clock from a mixture of other events, including: * - gtod - * - jiffies * - sched_clock() * - explicit idle events * * We use gtod as base and the unstable clock deltas. The deltas are filtered, - * making it monotonic and keeping it within an expected window. This window - * is set up using jiffies. + * making it monotonic and keeping it within an expected window. * * Furthermore, explicit sleep and wakeup hooks allow us to account for time * that is otherwise invisible (TSC gets stopped). * * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat - * consistent between cpus (never more than 1 jiffies difference). + * consistent between cpus (never more than 2 jiffies difference). */ #include <linux/sched.h> #include <linux/percpu.h> @@ -32,13 +30,19 @@ #include <linux/ktime.h> #include <linux/module.h> +/* + * Scheduler clock - returns current time in nanosec units. + * This is default implementation. + * Architectures and sub-architectures can override this. + */ +unsigned long long __attribute__((weak)) sched_clock(void) +{ + return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); +} -#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static __read_mostly int sched_clock_running; -#define MULTI_SHIFT 15 -/* Max is double, Min is 1/2 */ -#define MAX_MULTI (2LL << MULTI_SHIFT) -#define MIN_MULTI (1LL << (MULTI_SHIFT-1)) +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK struct sched_clock_data { /* @@ -48,15 +52,9 @@ struct sched_clock_data { */ raw_spinlock_t lock; - unsigned long tick_jiffies; - u64 prev_raw; u64 tick_raw; u64 tick_gtod; u64 clock; - s64 multi; -#ifdef CONFIG_NO_HZ - int check_max; -#endif }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); @@ -71,121 +69,69 @@ static inline struct sched_clock_data *cpu_sdc(int cpu) return &per_cpu(sched_clock_data, cpu); } -static __read_mostly int sched_clock_running; - void sched_clock_init(void) { u64 ktime_now = ktime_to_ns(ktime_get()); - unsigned long now_jiffies = jiffies; int cpu; for_each_possible_cpu(cpu) { struct sched_clock_data *scd = cpu_sdc(cpu); scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - scd->tick_jiffies = now_jiffies; - scd->prev_raw = 0; scd->tick_raw = 0; scd->tick_gtod = ktime_now; scd->clock = ktime_now; - scd->multi = 1 << MULTI_SHIFT; -#ifdef CONFIG_NO_HZ - scd->check_max = 1; -#endif } sched_clock_running = 1; } -#ifdef CONFIG_NO_HZ /* - * The dynamic ticks makes the delta jiffies inaccurate. This - * prevents us from checking the maximum time update. - * Disable the maximum check during stopped ticks. + * min,max except they take wrapping into account */ -void sched_clock_tick_stop(int cpu) -{ - struct sched_clock_data *scd = cpu_sdc(cpu); - - scd->check_max = 0; -} -void sched_clock_tick_start(int cpu) +static inline u64 wrap_min(u64 x, u64 y) { - struct sched_clock_data *scd = cpu_sdc(cpu); - - scd->check_max = 1; + return (s64)(x - y) < 0 ? x : y; } -static int check_max(struct sched_clock_data *scd) +static inline u64 wrap_max(u64 x, u64 y) { - return scd->check_max; + return (s64)(x - y) > 0 ? x : y; } -#else -static int check_max(struct sched_clock_data *scd) -{ - return 1; -} -#endif /* CONFIG_NO_HZ */ /* * update the percpu scd from the raw @now value * * - filter out backward motion - * - use jiffies to generate a min,max window to clip the raw values + * - use the GTOD tick value to create a window to filter crazy TSC values */ -static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *time) +static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) { - unsigned long now_jiffies = jiffies; - long delta_jiffies = now_jiffies - scd->tick_jiffies; - u64 clock = scd->clock; - u64 min_clock, max_clock; - s64 delta = now - scd->prev_raw; + s64 delta = now - scd->tick_raw; + u64 clock, min_clock, max_clock; WARN_ON_ONCE(!irqs_disabled()); - /* - * At schedule tick the clock can be just under the gtod. We don't - * want to push it too prematurely. - */ - min_clock = scd->tick_gtod + (delta_jiffies * TICK_NSEC); - if (min_clock > TICK_NSEC) - min_clock -= TICK_NSEC / 2; - - if (unlikely(delta < 0)) { - clock++; - goto out; - } + if (unlikely(delta < 0)) + delta = 0; /* - * The clock must stay within a jiffie of the gtod. - * But since we may be at the start of a jiffy or the end of one - * we add another jiffy buffer. + * scd->clock = clamp(scd->tick_gtod + delta, + * max(scd->tick_gtod, scd->clock), + * scd->tick_gtod + TICK_NSEC); */ - max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC; - delta *= scd->multi; - delta >>= MULTI_SHIFT; + clock = scd->tick_gtod + delta; + min_clock = wrap_max(scd->tick_gtod, scd->clock); + max_clock = scd->tick_gtod + TICK_NSEC; - if (unlikely(clock + delta > max_clock) && check_max(scd)) { - if (clock < max_clock) - clock = max_clock; - else - clock++; - } else { - clock += delta; - } + clock = wrap_max(clock, min_clock); + clock = wrap_min(clock, max_clock); - out: - if (unlikely(clock < min_clock)) - clock = min_clock; + scd->clock = clock; - if (time) - *time = clock; - else { - scd->prev_raw = now; - scd->clock = clock; - } + return scd->clock; } static void lock_double_clock(struct sched_clock_data *data1, @@ -203,7 +149,7 @@ static void lock_double_clock(struct sched_clock_data *data1, u64 sched_clock_cpu(int cpu) { struct sched_clock_data *scd = cpu_sdc(cpu); - u64 now, clock; + u64 now, clock, this_clock, remote_clock; if (unlikely(!sched_clock_running)) return 0ull; @@ -212,43 +158,44 @@ u64 sched_clock_cpu(int cpu) now = sched_clock(); if (cpu != raw_smp_processor_id()) { - /* - * in order to update a remote cpu's clock based on our - * unstable raw time rebase it against: - * tick_raw (offset between raw counters) - * tick_gotd (tick offset between cpus) - */ struct sched_clock_data *my_scd = this_scd(); lock_double_clock(scd, my_scd); - now -= my_scd->tick_raw; - now += scd->tick_raw; + this_clock = __update_sched_clock(my_scd, now); + remote_clock = scd->clock; - now += my_scd->tick_gtod; - now -= scd->tick_gtod; + /* + * Use the opportunity that we have both locks + * taken to couple the two clocks: we take the + * larger time as the latest time for both + * runqueues. (this creates monotonic movement) + */ + if (likely((s64)(remote_clock - this_clock) < 0)) { + clock = this_clock; + scd->clock = clock; + } else { + /* + * Should be rare, but possible: + */ + clock = remote_clock; + my_scd->clock = remote_clock; + } __raw_spin_unlock(&my_scd->lock); - - __update_sched_clock(scd, now, &clock); - - __raw_spin_unlock(&scd->lock); - } else { __raw_spin_lock(&scd->lock); - __update_sched_clock(scd, now, NULL); - clock = scd->clock; - __raw_spin_unlock(&scd->lock); + clock = __update_sched_clock(scd, now); } + __raw_spin_unlock(&scd->lock); + return clock; } void sched_clock_tick(void) { struct sched_clock_data *scd = this_scd(); - unsigned long now_jiffies = jiffies; - s64 mult, delta_gtod, delta_raw; u64 now, now_gtod; if (unlikely(!sched_clock_running)) @@ -260,29 +207,9 @@ void sched_clock_tick(void) now = sched_clock(); __raw_spin_lock(&scd->lock); - __update_sched_clock(scd, now, NULL); - /* - * update tick_gtod after __update_sched_clock() because that will - * already observe 1 new jiffy; adding a new tick_gtod to that would - * increase the clock 2 jiffies. - */ - delta_gtod = now_gtod - scd->tick_gtod; - delta_raw = now - scd->tick_raw; - - if ((long)delta_raw > 0) { - mult = delta_gtod << MULTI_SHIFT; - do_div(mult, delta_raw); - scd->multi = mult; - if (scd->multi > MAX_MULTI) - scd->multi = MAX_MULTI; - else if (scd->multi < MIN_MULTI) - scd->multi = MIN_MULTI; - } else - scd->multi = 1 << MULTI_SHIFT; - scd->tick_raw = now; scd->tick_gtod = now_gtod; - scd->tick_jiffies = now_jiffies; + __update_sched_clock(scd, now); __raw_spin_unlock(&scd->lock); } @@ -300,37 +227,28 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); */ void sched_clock_idle_wakeup_event(u64 delta_ns) { - struct sched_clock_data *scd = this_scd(); - u64 now = sched_clock(); - - /* - * Override the previous timestamp and ignore all - * sched_clock() deltas that occured while we idled, - * and use the PM-provided delta_ns to advance the - * rq clock: - */ - __raw_spin_lock(&scd->lock); - scd->prev_raw = now; - scd->clock += delta_ns; - scd->multi = 1 << MULTI_SHIFT; - __raw_spin_unlock(&scd->lock); - + sched_clock_tick(); touch_softlockup_watchdog(); } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -#endif +#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ -/* - * Scheduler clock - returns current time in nanosec units. - * This is default implementation. - * Architectures and sub-architectures can override this. - */ -unsigned long long __attribute__((weak)) sched_clock(void) +void sched_clock_init(void) { - return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); + sched_clock_running = 1; } +u64 sched_clock_cpu(int cpu) +{ + if (unlikely(!sched_clock_running)) + return 0; + + return sched_clock(); +} + +#endif + unsigned long long cpu_clock(int cpu) { unsigned long long clock; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index cf2cd6ce4cb2..18fd17172eb6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -409,64 +409,6 @@ static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) } /* - * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in - * that it favours >=0 over <0. - * - * -20 | - * | - * 0 --------+------- - * .' - * 19 .' - * - */ -static unsigned long -calc_delta_asym(unsigned long delta, struct sched_entity *se) -{ - struct load_weight lw = { - .weight = NICE_0_LOAD, - .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT) - }; - - for_each_sched_entity(se) { - struct load_weight *se_lw = &se->load; - unsigned long rw = cfs_rq_of(se)->load.weight; - -#ifdef CONFIG_FAIR_SCHED_GROUP - struct cfs_rq *cfs_rq = se->my_q; - struct task_group *tg = NULL - - if (cfs_rq) - tg = cfs_rq->tg; - - if (tg && tg->shares < NICE_0_LOAD) { - /* - * scale shares to what it would have been had - * tg->weight been NICE_0_LOAD: - * - * weight = 1024 * shares / tg->weight - */ - lw.weight *= se->load.weight; - lw.weight /= tg->shares; - - lw.inv_weight = 0; - - se_lw = &lw; - rw += lw.weight - se->load.weight; - } else -#endif - - if (se->load.weight < NICE_0_LOAD) { - se_lw = &lw; - rw += NICE_0_LOAD - se->load.weight; - } - - delta = calc_delta_mine(delta, rw, se_lw); - } - - return delta; -} - -/* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. */ @@ -586,11 +528,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_add(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) inc_cpu_load(rq_of(cfs_rq), se->load.weight); - if (entity_is_task(se)) + if (entity_is_task(se)) { add_cfs_task_weight(cfs_rq, se->load.weight); + list_add(&se->group_node, &cfs_rq->tasks); + } cfs_rq->nr_running++; se->on_rq = 1; - list_add(&se->group_node, &cfs_rq->tasks); } static void @@ -599,11 +542,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) dec_cpu_load(rq_of(cfs_rq), se->load.weight); - if (entity_is_task(se)) + if (entity_is_task(se)) { add_cfs_task_weight(cfs_rq, -se->load.weight); + list_del_init(&se->group_node); + } cfs_rq->nr_running--; se->on_rq = 0; - list_del_init(&se->group_node); } static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -899,7 +843,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) * doesn't make sense. Rely on vruntime for fairness. */ if (rq->curr != p) - delta = max(10000LL, delta); + delta = max_t(s64, 10000LL, delta); hrtick_start(rq, delta); } @@ -1085,7 +1029,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { struct sched_entity *se = tg->se[cpu]; - long more_w; if (!tg->parent) return wl; @@ -1097,18 +1040,17 @@ static long effective_load(struct task_group *tg, int cpu, if (!wl && sched_feat(ASYM_EFF_LOAD)) return wl; - /* - * Instead of using this increment, also add the difference - * between when the shares were last updated and now. - */ - more_w = se->my_q->load.weight - se->my_q->rq_weight; - wl += more_w; - wg += more_w; - for_each_sched_entity(se) { -#define D(n) (likely(n) ? (n) : 1) - long S, rw, s, a, b; + long more_w; + + /* + * Instead of using this increment, also add the difference + * between when the shares were last updated and now. + */ + more_w = se->my_q->load.weight - se->my_q->rq_weight; + wl += more_w; + wg += more_w; S = se->my_q->tg->shares; s = se->my_q->shares; @@ -1117,7 +1059,11 @@ static long effective_load(struct task_group *tg, int cpu, a = S*(rw + wl); b = S*rw + s*wg; - wl = s*(a-b)/D(b); + wl = s*(a-b); + + if (likely(b)) + wl /= b; + /* * Assume the group is already running and will * thus already be accounted for in the weight. @@ -1126,7 +1072,6 @@ static long effective_load(struct task_group *tg, int cpu, * alter the group weight. */ wg = 0; -#undef D } return wl; @@ -1143,7 +1088,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, #endif static int -wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, +wake_affine(struct sched_domain *this_sd, struct rq *this_rq, struct task_struct *p, int prev_cpu, int this_cpu, int sync, int idx, unsigned long load, unsigned long this_load, unsigned int imbalance) @@ -1158,6 +1103,11 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) return 0; + if (!sync && sched_feat(SYNC_WAKEUPS) && + curr->se.avg_overlap < sysctl_sched_migration_cost && + p->se.avg_overlap < sysctl_sched_migration_cost) + sync = 1; + /* * If sync wakeup then subtract the (maximum possible) * effect of the currently running task from the load @@ -1182,17 +1132,14 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, * a reasonable amount of time then attract this newly * woken task: */ - if (sync && balanced) { - if (curr->se.avg_overlap < sysctl_sched_migration_cost && - p->se.avg_overlap < sysctl_sched_migration_cost) - return 1; - } + if (sync && balanced) + return 1; schedstat_inc(p, se.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); - if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || - balanced) { + if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <= + tl_per_task)) { /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and @@ -1211,16 +1158,17 @@ static int select_task_rq_fair(struct task_struct *p, int sync) struct sched_domain *sd, *this_sd = NULL; int prev_cpu, this_cpu, new_cpu; unsigned long load, this_load; - struct rq *rq, *this_rq; + struct rq *this_rq; unsigned int imbalance; int idx; prev_cpu = task_cpu(p); - rq = task_rq(p); this_cpu = smp_processor_id(); this_rq = cpu_rq(this_cpu); new_cpu = prev_cpu; + if (prev_cpu == this_cpu) + goto out; /* * 'this_sd' is the first domain that both * this_cpu and prev_cpu are present in: @@ -1248,13 +1196,10 @@ static int select_task_rq_fair(struct task_struct *p, int sync) load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); - if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, + if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, load, this_load, imbalance)) return this_cpu; - if (prev_cpu == this_cpu) - goto out; - /* * Start passive balancing when half the imbalance_pct * limit is reached. @@ -1281,62 +1226,20 @@ static unsigned long wakeup_gran(struct sched_entity *se) * + nice tasks. */ if (sched_feat(ASYM_GRAN)) - gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); - else - gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se); + gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load); return gran; } /* - * Should 'se' preempt 'curr'. - * - * |s1 - * |s2 - * |s3 - * g - * |<--->|c - * - * w(c, s1) = -1 - * w(c, s2) = 0 - * w(c, s3) = 1 - * - */ -static int -wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) -{ - s64 gran, vdiff = curr->vruntime - se->vruntime; - - if (vdiff < 0) - return -1; - - gran = wakeup_gran(curr); - if (vdiff > gran) - return 1; - - return 0; -} - -/* return depth at which a sched entity is present in the hierarchy */ -static inline int depth_se(struct sched_entity *se) -{ - int depth = 0; - - for_each_sched_entity(se) - depth++; - - return depth; -} - -/* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) +static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) { struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se, *pse = &p->se; - int se_depth, pse_depth; + s64 delta_exec; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@ -1351,6 +1254,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) cfs_rq_of(pse)->next = pse; /* + * We can come here with TIF_NEED_RESCHED already set from new task + * wake up path. + */ + if (test_tsk_need_resched(curr)) + return; + + /* * Batch tasks do not preempt (their preemption is driven by * the tick): */ @@ -1360,33 +1270,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) if (!sched_feat(WAKEUP_PREEMPT)) return; - /* - * preemption test can be made between sibling entities who are in the - * same cfs_rq i.e who have a common parent. Walk up the hierarchy of - * both tasks until we find their ancestors who are siblings of common - * parent. - */ - - /* First walk up until both entities are at same depth */ - se_depth = depth_se(se); - pse_depth = depth_se(pse); - - while (se_depth > pse_depth) { - se_depth--; - se = parent_entity(se); - } - - while (pse_depth > se_depth) { - pse_depth--; - pse = parent_entity(pse); - } - - while (!is_same_group(se, pse)) { - se = parent_entity(se); - pse = parent_entity(pse); + if (sched_feat(WAKEUP_OVERLAP) && (sync || + (se->avg_overlap < sysctl_sched_migration_cost && + pse->avg_overlap < sysctl_sched_migration_cost))) { + resched_task(curr); + return; } - if (wakeup_preempt_entity(se, pse) == 1) + delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime; + if (delta_exec > wakeup_gran(pse)) resched_task(curr); } @@ -1442,18 +1334,13 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) struct task_struct *p = NULL; struct sched_entity *se; - while (next != &cfs_rq->tasks) { - se = list_entry(next, struct sched_entity, group_node); - next = next->next; + if (next == &cfs_rq->tasks) + return NULL; - /* Skip over entities that are not tasks */ - if (entity_is_task(se)) { - p = task_of(se); - break; - } - } + se = list_entry(next, struct sched_entity, group_node); + p = task_of(se); + cfs_rq->balance_iterator = next->next; - cfs_rq->balance_iterator = next; return p; } @@ -1502,7 +1389,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, rcu_read_lock(); update_h_load(busiest_cpu); - list_for_each_entry(tg, &task_groups, list) { + list_for_each_entry_rcu(tg, &task_groups, list) { struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; unsigned long busiest_h_load = busiest_cfs_rq->h_load; unsigned long busiest_weight = busiest_cfs_rq->load.weight; @@ -1615,10 +1502,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * 'current' within the tree based on its new key value. */ swap(curr->vruntime, se->vruntime); + resched_task(rq->curr); } enqueue_task_fair(rq, p, 0); - resched_task(rq->curr); } /* @@ -1637,7 +1524,7 @@ static void prio_changed_fair(struct rq *rq, struct task_struct *p, if (p->prio > oldprio) resched_task(rq->curr); } else - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); } /* @@ -1654,7 +1541,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p, if (running) resched_task(rq->curr); else - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); } /* Account for a task changing its policy or group. diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 862b06bd560a..7c9e8f4a049f 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -8,6 +8,7 @@ SCHED_FEAT(SYNC_WAKEUPS, 1) SCHED_FEAT(HRTICK, 1) SCHED_FEAT(DOUBLE_TICK, 0) SCHED_FEAT(ASYM_GRAN, 1) -SCHED_FEAT(LB_BIAS, 0) +SCHED_FEAT(LB_BIAS, 1) SCHED_FEAT(LB_WAKEUP_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) +SCHED_FEAT(WAKEUP_OVERLAP, 0) diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 3a4f92dbbe66..dec4ccabe2f5 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync) /* * Idle tasks are unconditionally rescheduled: */ -static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p) +static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync) { resched_task(rq->idle); } @@ -76,7 +76,7 @@ static void switched_to_idle(struct rq *rq, struct task_struct *p, if (running) resched_task(rq->curr); else - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); } static void prio_changed_idle(struct rq *rq, struct task_struct *p, @@ -93,7 +93,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p, if (p->prio > oldprio) resched_task(rq->curr); } else - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); } /* diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 908c04f9dad0..cdf5740ab03e 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -102,12 +102,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { + struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; struct sched_rt_entity *rt_se = rt_rq->rt_se; - if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) { - struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; - - enqueue_rt_entity(rt_se); + if (rt_rq->rt_nr_running) { + if (rt_se && !on_rt_rq(rt_se)) + enqueue_rt_entity(rt_se); if (rt_rq->highest_prio < curr->prio) resched_task(curr); } @@ -199,6 +199,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { + if (rt_rq->rt_nr_running) + resched_task(rq_of_rt_rq(rt_rq)->curr); } static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) @@ -229,6 +231,9 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_SMP +/* + * We ran out of runtime, see if we can borrow some from our neighbours. + */ static int do_balance_runtime(struct rt_rq *rt_rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); @@ -248,9 +253,18 @@ static int do_balance_runtime(struct rt_rq *rt_rq) continue; spin_lock(&iter->rt_runtime_lock); + /* + * Either all rqs have inf runtime and there's nothing to steal + * or __disable_runtime() below sets a specific rq to inf to + * indicate its been disabled and disalow stealing. + */ if (iter->rt_runtime == RUNTIME_INF) goto next; + /* + * From runqueues with spare time, take 1/n part of their + * spare time, but no more than our period. + */ diff = iter->rt_runtime - iter->rt_time; if (diff > 0) { diff = div_u64((u64)diff, weight); @@ -272,6 +286,9 @@ next: return more; } +/* + * Ensure this RQ takes back all the runtime it lend to its neighbours. + */ static void __disable_runtime(struct rq *rq) { struct root_domain *rd = rq->rd; @@ -287,18 +304,34 @@ static void __disable_runtime(struct rq *rq) spin_lock(&rt_b->rt_runtime_lock); spin_lock(&rt_rq->rt_runtime_lock); + /* + * Either we're all inf and nobody needs to borrow, or we're + * already disabled and thus have nothing to do, or we have + * exactly the right amount of runtime to take out. + */ if (rt_rq->rt_runtime == RUNTIME_INF || rt_rq->rt_runtime == rt_b->rt_runtime) goto balanced; spin_unlock(&rt_rq->rt_runtime_lock); + /* + * Calculate the difference between what we started out with + * and what we current have, that's the amount of runtime + * we lend and now have to reclaim. + */ want = rt_b->rt_runtime - rt_rq->rt_runtime; + /* + * Greedy reclaim, take back as much as we can. + */ for_each_cpu_mask(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); s64 diff; - if (iter == rt_rq) + /* + * Can't reclaim from ourselves or disabled runqueues. + */ + if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) continue; spin_lock(&iter->rt_runtime_lock); @@ -317,8 +350,16 @@ static void __disable_runtime(struct rq *rq) } spin_lock(&rt_rq->rt_runtime_lock); + /* + * We cannot be left wanting - that would mean some runtime + * leaked out of the system. + */ BUG_ON(want); balanced: + /* + * Disable all the borrow logic by pretending we have inf + * runtime - in which case borrowing doesn't make sense. + */ rt_rq->rt_runtime = RUNTIME_INF; spin_unlock(&rt_rq->rt_runtime_lock); spin_unlock(&rt_b->rt_runtime_lock); @@ -341,6 +382,9 @@ static void __enable_runtime(struct rq *rq) if (unlikely(!scheduler_running)) return; + /* + * Reset each runqueue's bandwidth settings + */ for_each_leaf_rt_rq(rt_rq, rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); @@ -348,6 +392,7 @@ static void __enable_runtime(struct rq *rq) spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_runtime = rt_b->rt_runtime; rt_rq->rt_time = 0; + rt_rq->rt_throttled = 0; spin_unlock(&rt_rq->rt_runtime_lock); spin_unlock(&rt_b->rt_runtime_lock); } @@ -386,7 +431,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) int i, idle = 1; cpumask_t span; - if (rt_b->rt_runtime == RUNTIME_INF) + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return 1; span = sched_rt_period_mask(); @@ -438,9 +483,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) { u64 runtime = sched_rt_runtime(rt_rq); - if (runtime == RUNTIME_INF) - return 0; - if (rt_rq->rt_throttled) return rt_rq_throttled(rt_rq); @@ -487,13 +529,18 @@ static void update_curr_rt(struct rq *rq) curr->se.exec_start = rq->clock; cpuacct_charge(curr, delta_exec); + if (!rt_bandwidth_enabled()) + return; + for_each_sched_rt_entity(rt_se) { rt_rq = rt_rq_of_se(rt_se); spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_time += delta_exec; - if (sched_rt_runtime_exceeded(rt_rq)) - resched_task(curr); + if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { + rt_rq->rt_time += delta_exec; + if (sched_rt_runtime_exceeded(rt_rq)) + resched_task(curr); + } spin_unlock(&rt_rq->rt_runtime_lock); } } @@ -782,7 +829,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) +static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) { if (p->prio < rq->curr->prio) { resched_task(rq->curr); @@ -861,6 +908,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) #define RT_MAX_TRIES 3 static int double_lock_balance(struct rq *this_rq, struct rq *busiest); +static void double_unlock_balance(struct rq *this_rq, struct rq *busiest); + static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) @@ -1022,7 +1071,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) break; /* try again */ - spin_unlock(&lowest_rq->lock); + double_unlock_balance(rq, lowest_rq); lowest_rq = NULL; } @@ -1091,7 +1140,7 @@ static int push_rt_task(struct rq *rq) resched_task(lowest_rq->curr); - spin_unlock(&lowest_rq->lock); + double_unlock_balance(rq, lowest_rq); ret = 1; out: @@ -1197,7 +1246,7 @@ static int pull_rt_task(struct rq *this_rq) } skip: - spin_unlock(&src_rq->lock); + double_unlock_balance(this_rq, src_rq); } return ret; diff --git a/kernel/semaphore.c b/kernel/semaphore.c index aaaeae8244e7..94a62c0d4ade 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -212,9 +212,7 @@ static inline int __sched __down_common(struct semaphore *sem, long state, waiter.up = 0; for (;;) { - if (state == TASK_INTERRUPTIBLE && signal_pending(task)) - goto interrupted; - if (state == TASK_KILLABLE && fatal_signal_pending(task)) + if (signal_pending_state(state, task)) goto interrupted; if (timeout <= 0) goto timed_out; diff --git a/kernel/signal.c b/kernel/signal.c index 82c3545596c5..e661b01d340f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -22,6 +22,7 @@ #include <linux/ptrace.h> #include <linux/signal.h> #include <linux/signalfd.h> +#include <linux/tracehook.h> #include <linux/capability.h> #include <linux/freezer.h> #include <linux/pid_namespace.h> @@ -39,24 +40,21 @@ static struct kmem_cache *sigqueue_cachep; -static int __sig_ignored(struct task_struct *t, int sig) +static void __user *sig_handler(struct task_struct *t, int sig) { - void __user *handler; + return t->sighand->action[sig - 1].sa.sa_handler; +} +static int sig_handler_ignored(void __user *handler, int sig) +{ /* Is it explicitly or implicitly ignored? */ - - handler = t->sighand->action[sig - 1].sa.sa_handler; return handler == SIG_IGN || (handler == SIG_DFL && sig_kernel_ignore(sig)); } static int sig_ignored(struct task_struct *t, int sig) { - /* - * Tracers always want to know about signals.. - */ - if (t->ptrace & PT_PTRACED) - return 0; + void __user *handler; /* * Blocked signals are never ignored, since the @@ -66,7 +64,14 @@ static int sig_ignored(struct task_struct *t, int sig) if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) return 0; - return __sig_ignored(t, sig); + handler = sig_handler(t, sig); + if (!sig_handler_ignored(handler, sig)) + return 0; + + /* + * Tracers may want to know about even ignored signals. + */ + return !tracehook_consider_ignored_signal(t, sig, handler); } /* @@ -129,7 +134,9 @@ void recalc_sigpending_and_wake(struct task_struct *t) void recalc_sigpending(void) { - if (!recalc_sigpending_tsk(current) && !freezing(current)) + if (unlikely(tracehook_force_sigpending())) + set_thread_flag(TIF_SIGPENDING); + else if (!recalc_sigpending_tsk(current) && !freezing(current)) clear_thread_flag(TIF_SIGPENDING); } @@ -295,12 +302,12 @@ flush_signal_handlers(struct task_struct *t, int force_default) int unhandled_signal(struct task_struct *tsk, int sig) { + void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler; if (is_global_init(tsk)) return 1; - if (tsk->ptrace & PT_PTRACED) + if (handler != SIG_IGN && handler != SIG_DFL) return 0; - return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) || - (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL); + return !tracehook_consider_fatal_signal(tsk, sig, handler); } @@ -591,9 +598,6 @@ static int check_kill_permission(int sig, struct siginfo *info, return security_task_kill(t, info, sig, 0); } -/* forward decl */ -static void do_notify_parent_cldstop(struct task_struct *tsk, int why); - /* * Handle magic process-wide effects of stop/continue signals. Unlike * the signal actions, these happen immediately at signal-generation @@ -756,7 +760,8 @@ static void complete_signal(int sig, struct task_struct *p, int group) if (sig_fatal(p, sig) && !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && !sigismember(&t->real_blocked, sig) && - (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) { + (sig == SIGKILL || + !tracehook_consider_fatal_signal(t, sig, SIG_DFL))) { /* * This signal will be fatal to the whole group. */ @@ -1299,6 +1304,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) q->info.si_overrun++; goto out; } + q->info.si_overrun = 0; signalfd_notify(t, sig); pending = group ? &t->signal->shared_pending : &t->pending; @@ -1323,13 +1329,16 @@ static inline void __wake_up_parent(struct task_struct *p, /* * Let a parent know about the death of a child. * For a stopped/continued status change, use do_notify_parent_cldstop instead. + * + * Returns -1 if our parent ignored us and so we've switched to + * self-reaping, or else @sig. */ - -void do_notify_parent(struct task_struct *tsk, int sig) +int do_notify_parent(struct task_struct *tsk, int sig) { struct siginfo info; unsigned long flags; struct sighand_struct *psig; + int ret = sig; BUG_ON(sig == -1); @@ -1394,14 +1403,16 @@ void do_notify_parent(struct task_struct *tsk, int sig) * is implementation-defined: we do (if you don't want * it, just use SIG_IGN instead). */ - tsk->exit_signal = -1; + ret = tsk->exit_signal = -1; if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) - sig = 0; + sig = -1; } if (valid_signal(sig) && sig > 0) __group_send_sig_info(sig, &info, tsk->parent); __wake_up_parent(tsk, tsk->parent); spin_unlock_irqrestore(&psig->siglock, flags); + + return ret; } static void do_notify_parent_cldstop(struct task_struct *tsk, int why) @@ -1599,7 +1610,7 @@ finish_stop(int stop_count) * a group stop in progress and we are the last to stop, * report to the parent. When ptraced, every thread reports itself. */ - if (stop_count == 0 || (current->ptrace & PT_PTRACED)) { + if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) { read_lock(&tasklist_lock); do_notify_parent_cldstop(current, CLD_STOPPED); read_unlock(&tasklist_lock); @@ -1735,6 +1746,9 @@ relock: signal->flags &= ~SIGNAL_CLD_MASK; spin_unlock_irq(&sighand->siglock); + if (unlikely(!tracehook_notify_jctl(1, why))) + goto relock; + read_lock(&tasklist_lock); do_notify_parent_cldstop(current->group_leader, why); read_unlock(&tasklist_lock); @@ -1748,17 +1762,33 @@ relock: do_signal_stop(0)) goto relock; - signr = dequeue_signal(current, ¤t->blocked, info); - if (!signr) - break; /* will return 0 */ + /* + * Tracing can induce an artifical signal and choose sigaction. + * The return value in @signr determines the default action, + * but @info->si_signo is the signal number we will report. + */ + signr = tracehook_get_signal(current, regs, info, return_ka); + if (unlikely(signr < 0)) + goto relock; + if (unlikely(signr != 0)) + ka = return_ka; + else { + signr = dequeue_signal(current, ¤t->blocked, + info); - if (signr != SIGKILL) { - signr = ptrace_signal(signr, info, regs, cookie); if (!signr) - continue; + break; /* will return 0 */ + + if (signr != SIGKILL) { + signr = ptrace_signal(signr, info, + regs, cookie); + if (!signr) + continue; + } + + ka = &sighand->action[signr-1]; } - ka = &sighand->action[signr-1]; if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ continue; if (ka->sa.sa_handler != SIG_DFL) { @@ -1806,7 +1836,7 @@ relock: spin_lock_irq(&sighand->siglock); } - if (likely(do_signal_stop(signr))) { + if (likely(do_signal_stop(info->si_signo))) { /* It released the siglock. */ goto relock; } @@ -1827,7 +1857,7 @@ relock: if (sig_kernel_coredump(signr)) { if (print_fatal_signals) - print_fatal_signal(regs, signr); + print_fatal_signal(regs, info->si_signo); /* * If it was able to dump core, this kills all * other threads in the group and synchronizes with @@ -1836,13 +1866,13 @@ relock: * first and our do_group_exit call below will use * that value and ignore the one we pass it. */ - do_coredump((long)signr, signr, regs); + do_coredump(info->si_signo, info->si_signo, regs); } /* * Death signals, no core dump. */ - do_group_exit(signr); + do_group_exit(info->si_signo); /* NOTREACHED */ } spin_unlock_irq(&sighand->siglock); @@ -1884,7 +1914,7 @@ void exit_signals(struct task_struct *tsk) out: spin_unlock_irq(&tsk->sighand->siglock); - if (unlikely(group_stop)) { + if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) { read_lock(&tasklist_lock); do_notify_parent_cldstop(tsk, CLD_STOPPED); read_unlock(&tasklist_lock); @@ -1895,7 +1925,6 @@ EXPORT_SYMBOL(recalc_sigpending); EXPORT_SYMBOL_GPL(dequeue_signal); EXPORT_SYMBOL(flush_signals); EXPORT_SYMBOL(force_sig); -EXPORT_SYMBOL(ptrace_notify); EXPORT_SYMBOL(send_sig); EXPORT_SYMBOL(send_sig_info); EXPORT_SYMBOL(sigprocmask); @@ -2299,7 +2328,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) * (for example, SIGCHLD), shall cause the pending signal to * be discarded, whether or not it is blocked" */ - if (__sig_ignored(t, sig)) { + if (sig_handler_ignored(sig_handler(t, sig), sig)) { sigemptyset(&mask); sigaddset(&mask, sig); rm_from_queue_full(&mask, &t->signal->shared_pending); diff --git a/kernel/smp.c b/kernel/smp.c index 462c785ca1ee..f362a8553777 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -33,7 +33,7 @@ struct call_single_queue { spinlock_t lock; }; -void __cpuinit init_call_single_data(void) +static int __cpuinit init_call_single_data(void) { int i; @@ -43,7 +43,9 @@ void __cpuinit init_call_single_data(void) spin_lock_init(&q->lock); INIT_LIST_HEAD(&q->list); } + return 0; } +early_initcall(init_call_single_data); static void csd_flag_wait(struct call_single_data *data) { @@ -133,7 +135,8 @@ void generic_smp_call_function_interrupt(void) */ smp_wmb(); data->csd.flags &= ~CSD_FLAG_WAIT; - } else + } + if (data->csd.flags & CSD_FLAG_ALLOC) call_rcu(&data->rcu_head, rcu_free_call_data); } rcu_read_unlock(); @@ -207,8 +210,10 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, { struct call_single_data d; unsigned long flags; - /* prevent preemption and reschedule on another processor */ + /* prevent preemption and reschedule on another processor, + as well as CPU removal */ int me = get_cpu(); + int err = 0; /* Can deadlock when called with interrupts disabled */ WARN_ON(irqs_disabled()); @@ -217,7 +222,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, local_irq_save(flags); func(info); local_irq_restore(flags); - } else { + } else if ((unsigned)cpu < NR_CPUS && cpu_online(cpu)) { struct call_single_data *data = NULL; if (!wait) { @@ -233,10 +238,12 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, data->func = func; data->info = info; generic_exec_single(cpu, data); + } else { + err = -ENXIO; /* CPU not online */ } put_cpu(); - return 0; + return err; } EXPORT_SYMBOL(smp_call_function_single); @@ -258,6 +265,42 @@ void __smp_call_function_single(int cpu, struct call_single_data *data) generic_exec_single(cpu, data); } +/* Dummy function */ +static void quiesce_dummy(void *unused) +{ +} + +/* + * Ensure stack based data used in call function mask is safe to free. + * + * This is needed by smp_call_function_mask when using on-stack data, because + * a single call function queue is shared by all CPUs, and any CPU may pick up + * the data item on the queue at any time before it is deleted. So we need to + * ensure that all CPUs have transitioned through a quiescent state after + * this call. + * + * This is a very slow function, implemented by sending synchronous IPIs to + * all possible CPUs. For this reason, we have to alloc data rather than use + * stack based data even in the case of synchronous calls. The stack based + * data is then just used for deadlock/oom fallback which will be very rare. + * + * If a faster scheme can be made, we could go back to preferring stack based + * data -- the data allocation/free is non-zero cost. + */ +static void smp_call_function_mask_quiesce_stack(cpumask_t mask) +{ + struct call_single_data data; + int cpu; + + data.func = quiesce_dummy; + data.info = NULL; + + for_each_cpu_mask(cpu, mask) { + data.flags = CSD_FLAG_WAIT; + generic_exec_single(cpu, &data); + } +} + /** * smp_call_function_mask(): Run a function on a set of other CPUs. * @mask: The set of cpus to run on. @@ -283,6 +326,7 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, cpumask_t allbutself; unsigned long flags; int cpu, num_cpus; + int slowpath = 0; /* Can deadlock when called with interrupts disabled */ WARN_ON(irqs_disabled()); @@ -304,15 +348,16 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, return smp_call_function_single(cpu, func, info, wait); } - if (!wait) { - data = kmalloc(sizeof(*data), GFP_ATOMIC); - if (data) - data->csd.flags = CSD_FLAG_ALLOC; - } - if (!data) { + data = kmalloc(sizeof(*data), GFP_ATOMIC); + if (data) { + data->csd.flags = CSD_FLAG_ALLOC; + if (wait) + data->csd.flags |= CSD_FLAG_WAIT; + } else { data = &d; data->csd.flags = CSD_FLAG_WAIT; wait = 1; + slowpath = 1; } spin_lock_init(&data->lock); @@ -329,8 +374,11 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, arch_send_call_function_ipi(mask); /* optionally wait for the CPUs to complete */ - if (wait) + if (wait) { csd_flag_wait(&data->csd); + if (unlikely(slowpath)) + smp_call_function_mask_quiesce_stack(mask); + } return 0; } diff --git a/kernel/softirq.c b/kernel/softirq.c index f6b03d56c2bf..c506f266a6b9 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -630,7 +630,7 @@ static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; -__init int spawn_ksoftirqd(void) +static __init int spawn_ksoftirqd(void) { void *cpu = (void *)(long)smp_processor_id(); int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); @@ -640,6 +640,7 @@ __init int spawn_ksoftirqd(void) register_cpu_notifier(&cpu_nfb); return 0; } +early_initcall(spawn_ksoftirqd); #ifdef CONFIG_SMP /* diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 7bd8d1aadd5d..cb838ee93a82 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -233,7 +233,8 @@ static void check_hung_uninterruptible_tasks(int this_cpu) do_each_thread(g, t) { if (!--max_count) goto unlock; - if (t->state & TASK_UNINTERRUPTIBLE) + /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ + if (t->state == TASK_UNINTERRUPTIBLE) check_hung_task(t, now); } while_each_thread(g, t); unlock: @@ -338,14 +339,33 @@ static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; -__init void spawn_softlockup_task(void) +static int __initdata nosoftlockup; + +static int __init nosoftlockup_setup(char *str) +{ + nosoftlockup = 1; + return 1; +} +__setup("nosoftlockup", nosoftlockup_setup); + +static int __init spawn_softlockup_task(void) { void *cpu = (void *)(long)smp_processor_id(); - int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + int err; - BUG_ON(err == NOTIFY_BAD); + if (nosoftlockup) + return 0; + + err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + if (err == NOTIFY_BAD) { + BUG(); + return 1; + } cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); atomic_notifier_chain_register(&panic_notifier_list, &panic_block); + + return 0; } +early_initcall(spawn_softlockup_task); diff --git a/kernel/spinlock.c b/kernel/spinlock.c index a1fb54c93cdd..29ab20749dd3 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -290,8 +290,8 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass) spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); } - EXPORT_SYMBOL(_spin_lock_nested); + unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) { unsigned long flags; @@ -311,9 +311,17 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas #endif return flags; } - EXPORT_SYMBOL(_spin_lock_irqsave_nested); +void __lockfunc _spin_lock_nest_lock(spinlock_t *lock, + struct lockdep_map *nest_lock) +{ + preempt_disable(); + spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); + LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); +} +EXPORT_SYMBOL(_spin_lock_nest_lock); + #endif void __lockfunc _spin_unlock(spinlock_t *lock) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 738b411ff2d3..af3c7cea258b 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -1,4 +1,4 @@ -/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. +/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. * GPL v2 and any later version. */ #include <linux/cpu.h> @@ -13,204 +13,177 @@ #include <asm/atomic.h> #include <asm/uaccess.h> -/* Since we effect priority and affinity (both of which are visible - * to, and settable by outside processes) we do indirection via a - * kthread. */ - -/* Thread to stop each CPU in user context. */ +/* This controls the threads on each CPU. */ enum stopmachine_state { - STOPMACHINE_WAIT, + /* Dummy starting state for thread. */ + STOPMACHINE_NONE, + /* Awaiting everyone to be scheduled. */ STOPMACHINE_PREPARE, + /* Disable interrupts. */ STOPMACHINE_DISABLE_IRQ, + /* Run the function */ + STOPMACHINE_RUN, + /* Exit */ STOPMACHINE_EXIT, }; +static enum stopmachine_state state; -static enum stopmachine_state stopmachine_state; -static unsigned int stopmachine_num_threads; -static atomic_t stopmachine_thread_ack; - -static int stopmachine(void *cpu) -{ - int irqs_disabled = 0; - int prepared = 0; - cpumask_of_cpu_ptr(cpumask, (int)(long)cpu); - - set_cpus_allowed_ptr(current, cpumask); - - /* Ack: we are alive */ - smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ - atomic_inc(&stopmachine_thread_ack); - - /* Simple state machine */ - while (stopmachine_state != STOPMACHINE_EXIT) { - if (stopmachine_state == STOPMACHINE_DISABLE_IRQ - && !irqs_disabled) { - local_irq_disable(); - hard_irq_disable(); - irqs_disabled = 1; - /* Ack: irqs disabled. */ - smp_mb(); /* Must read state first. */ - atomic_inc(&stopmachine_thread_ack); - } else if (stopmachine_state == STOPMACHINE_PREPARE - && !prepared) { - /* Everyone is in place, hold CPU. */ - preempt_disable(); - prepared = 1; - smp_mb(); /* Must read state first. */ - atomic_inc(&stopmachine_thread_ack); - } - /* Yield in first stage: migration threads need to - * help our sisters onto their CPUs. */ - if (!prepared && !irqs_disabled) - yield(); - cpu_relax(); - } - - /* Ack: we are exiting. */ - smp_mb(); /* Must read state first. */ - atomic_inc(&stopmachine_thread_ack); - - if (irqs_disabled) - local_irq_enable(); - if (prepared) - preempt_enable(); +struct stop_machine_data { + int (*fn)(void *); + void *data; + int fnret; +}; - return 0; -} +/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ +static unsigned int num_threads; +static atomic_t thread_ack; +static struct completion finished; +static DEFINE_MUTEX(lock); -/* Change the thread state */ -static void stopmachine_set_state(enum stopmachine_state state) +static void set_state(enum stopmachine_state newstate) { - atomic_set(&stopmachine_thread_ack, 0); + /* Reset ack counter. */ + atomic_set(&thread_ack, num_threads); smp_wmb(); - stopmachine_state = state; - while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) - cpu_relax(); + state = newstate; } -static int stop_machine(void) +/* Last one to ack a state moves to the next state. */ +static void ack_state(void) { - int i, ret = 0; - - atomic_set(&stopmachine_thread_ack, 0); - stopmachine_num_threads = 0; - stopmachine_state = STOPMACHINE_WAIT; - - for_each_online_cpu(i) { - if (i == raw_smp_processor_id()) - continue; - ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); - if (ret < 0) - break; - stopmachine_num_threads++; - } - - /* Wait for them all to come to life. */ - while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) { - yield(); - cpu_relax(); + if (atomic_dec_and_test(&thread_ack)) { + /* If we're the last one to ack the EXIT, we're finished. */ + if (state == STOPMACHINE_EXIT) + complete(&finished); + else + set_state(state + 1); } +} - /* If some failed, kill them all. */ - if (ret < 0) { - stopmachine_set_state(STOPMACHINE_EXIT); - return ret; - } +/* This is the actual thread which stops the CPU. It exits by itself rather + * than waiting for kthread_stop(), because it's easier for hotplug CPU. */ +static int stop_cpu(struct stop_machine_data *smdata) +{ + enum stopmachine_state curstate = STOPMACHINE_NONE; - /* Now they are all started, make them hold the CPUs, ready. */ - preempt_disable(); - stopmachine_set_state(STOPMACHINE_PREPARE); + /* Simple state machine */ + do { + /* Chill out and ensure we re-read stopmachine_state. */ + cpu_relax(); + if (state != curstate) { + curstate = state; + switch (curstate) { + case STOPMACHINE_DISABLE_IRQ: + local_irq_disable(); + hard_irq_disable(); + break; + case STOPMACHINE_RUN: + /* |= allows error detection if functions on + * multiple CPUs. */ + smdata->fnret |= smdata->fn(smdata->data); + break; + default: + break; + } + ack_state(); + } + } while (curstate != STOPMACHINE_EXIT); - /* Make them disable irqs. */ - local_irq_disable(); - hard_irq_disable(); - stopmachine_set_state(STOPMACHINE_DISABLE_IRQ); + local_irq_enable(); + do_exit(0); +} +/* Callback for CPUs which aren't supposed to do anything. */ +static int chill(void *unused) +{ return 0; } -static void restart_machine(void) +int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) { - stopmachine_set_state(STOPMACHINE_EXIT); - local_irq_enable(); - preempt_enable_no_resched(); -} + int i, err; + struct stop_machine_data active, idle; + struct task_struct **threads; + + active.fn = fn; + active.data = data; + active.fnret = 0; + idle.fn = chill; + idle.data = NULL; + + /* This could be too big for stack on large machines. */ + threads = kcalloc(NR_CPUS, sizeof(threads[0]), GFP_KERNEL); + if (!threads) + return -ENOMEM; + + /* Set up initial state. */ + mutex_lock(&lock); + init_completion(&finished); + num_threads = num_online_cpus(); + set_state(STOPMACHINE_PREPARE); -struct stop_machine_data { - int (*fn)(void *); - void *data; - struct completion done; -}; + for_each_online_cpu(i) { + struct stop_machine_data *smdata = &idle; + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; -static int do_stop(void *_smdata) -{ - struct stop_machine_data *smdata = _smdata; - int ret; + if (!cpus) { + if (i == first_cpu(cpu_online_map)) + smdata = &active; + } else { + if (cpu_isset(i, *cpus)) + smdata = &active; + } - ret = stop_machine(); - if (ret == 0) { - ret = smdata->fn(smdata->data); - restart_machine(); - } + threads[i] = kthread_create((void *)stop_cpu, smdata, "kstop%u", + i); + if (IS_ERR(threads[i])) { + err = PTR_ERR(threads[i]); + threads[i] = NULL; + goto kill_threads; + } - /* We're done: you can kthread_stop us now */ - complete(&smdata->done); + /* Place it onto correct cpu. */ + kthread_bind(threads[i], i); - /* Wait for kthread_stop */ - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { - schedule(); - set_current_state(TASK_INTERRUPTIBLE); + /* Make it highest prio. */ + if (sched_setscheduler_nocheck(threads[i], SCHED_FIFO, ¶m)) + BUG(); } - __set_current_state(TASK_RUNNING); - return ret; -} -struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, - unsigned int cpu) -{ - static DEFINE_MUTEX(stopmachine_mutex); - struct stop_machine_data smdata; - struct task_struct *p; + /* We've created all the threads. Wake them all: hold this CPU so one + * doesn't hit this CPU until we're ready. */ + get_cpu(); + for_each_online_cpu(i) + wake_up_process(threads[i]); - smdata.fn = fn; - smdata.data = data; - init_completion(&smdata.done); + /* This will release the thread on our CPU. */ + put_cpu(); + wait_for_completion(&finished); + mutex_unlock(&lock); - mutex_lock(&stopmachine_mutex); + kfree(threads); - /* If they don't care which CPU fn runs on, bind to any online one. */ - if (cpu == NR_CPUS) - cpu = raw_smp_processor_id(); + return active.fnret; - p = kthread_create(do_stop, &smdata, "kstopmachine"); - if (!IS_ERR(p)) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; +kill_threads: + for_each_online_cpu(i) + if (threads[i]) + kthread_stop(threads[i]); + mutex_unlock(&lock); - /* One high-prio thread per cpu. We'll do this one. */ - sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); - kthread_bind(p, cpu); - wake_up_process(p); - wait_for_completion(&smdata.done); - } - mutex_unlock(&stopmachine_mutex); - return p; + kfree(threads); + return err; } -int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) +int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) { - struct task_struct *p; int ret; /* No CPUs can come up or down during this. */ get_online_cpus(); - p = __stop_machine_run(fn, data, cpu); - if (!IS_ERR(p)) - ret = kthread_stop(p); - else - ret = PTR_ERR(p); + ret = __stop_machine(fn, data, cpus); put_online_cpus(); return ret; } -EXPORT_SYMBOL_GPL(stop_machine_run); +EXPORT_SYMBOL_GPL(stop_machine); diff --git a/kernel/sys.c b/kernel/sys.c index 0c9d3fa1f5ff..038a7bc0901d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -169,9 +169,9 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) pgrp = find_vpid(who); else pgrp = task_pgrp(current); - do_each_pid_task(pgrp, PIDTYPE_PGID, p) { + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { error = set_one_prio(p, niceval, error); - } while_each_pid_task(pgrp, PIDTYPE_PGID, p); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: user = current->user; @@ -229,11 +229,11 @@ asmlinkage long sys_getpriority(int which, int who) pgrp = find_vpid(who); else pgrp = task_pgrp(current); - do_each_pid_task(pgrp, PIDTYPE_PGID, p) { + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; - } while_each_pid_task(pgrp, PIDTYPE_PGID, p); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: user = current->user; @@ -274,7 +274,7 @@ void emergency_restart(void) } EXPORT_SYMBOL_GPL(emergency_restart); -static void kernel_restart_prepare(char *cmd) +void kernel_restart_prepare(char *cmd) { blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); system_state = SYSTEM_RESTART; @@ -301,26 +301,6 @@ void kernel_restart(char *cmd) } EXPORT_SYMBOL_GPL(kernel_restart); -/** - * kernel_kexec - reboot the system - * - * Move into place and start executing a preloaded standalone - * executable. If nothing was preloaded return an error. - */ -static void kernel_kexec(void) -{ -#ifdef CONFIG_KEXEC - struct kimage *image; - image = xchg(&kexec_image, NULL); - if (!image) - return; - kernel_restart_prepare(NULL); - printk(KERN_EMERG "Starting new kernel\n"); - machine_shutdown(); - machine_kexec(image); -#endif -} - static void kernel_shutdown_prepare(enum system_states state) { blocking_notifier_call_chain(&reboot_notifier_list, @@ -425,10 +405,15 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user kernel_restart(buffer); break; +#ifdef CONFIG_KEXEC case LINUX_REBOOT_CMD_KEXEC: - kernel_kexec(); - unlock_kernel(); - return -EINVAL; + { + int ret; + ret = kernel_kexec(); + unlock_kernel(); + return ret; + } +#endif #ifdef CONFIG_HIBERNATION case LINUX_REBOOT_CMD_SW_SUSPEND: diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 35a50db9b6ce..1bf369bd4423 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -118,10 +118,8 @@ extern char modprobe_path[]; extern int sg_big_buff; #endif -#ifdef __sparc__ -extern char reboot_command []; -extern int stop_a_enabled; -extern int scons_pwroff; +#ifdef CONFIG_SPARC +#include <asm/system.h> #endif #ifdef __hppa__ @@ -159,13 +157,15 @@ static int proc_dointvec_taint(struct ctl_table *table, int write, struct file * static struct ctl_table root_table[]; static struct ctl_table_root sysctl_table_root; static struct ctl_table_header root_table_header = { + .count = 1, .ctl_table = root_table, - .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.header_list), + .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list), .root = &sysctl_table_root, + .set = &sysctl_table_root.default_set, }; static struct ctl_table_root sysctl_table_root = { .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), - .header_list = LIST_HEAD_INIT(root_table_header.ctl_entry), + .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry), }; static struct ctl_table kern_table[]; @@ -413,7 +413,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, -#ifdef __sparc__ +#ifdef CONFIG_SPARC { .ctl_name = KERN_SPARC_REBOOT, .procname = "reboot-cmd", @@ -1386,6 +1386,9 @@ static void start_unregistering(struct ctl_table_header *p) spin_unlock(&sysctl_lock); wait_for_completion(&wait); spin_lock(&sysctl_lock); + } else { + /* anything non-NULL; we'll never dereference it */ + p->unregistering = ERR_PTR(-EINVAL); } /* * do not remove from the list until nobody holds it; walking the @@ -1394,6 +1397,32 @@ static void start_unregistering(struct ctl_table_header *p) list_del_init(&p->ctl_entry); } +void sysctl_head_get(struct ctl_table_header *head) +{ + spin_lock(&sysctl_lock); + head->count++; + spin_unlock(&sysctl_lock); +} + +void sysctl_head_put(struct ctl_table_header *head) +{ + spin_lock(&sysctl_lock); + if (!--head->count) + kfree(head); + spin_unlock(&sysctl_lock); +} + +struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) +{ + if (!head) + BUG(); + spin_lock(&sysctl_lock); + if (!use_table(head)) + head = ERR_PTR(-ENOENT); + spin_unlock(&sysctl_lock); + return head; +} + void sysctl_head_finish(struct ctl_table_header *head) { if (!head) @@ -1403,14 +1432,20 @@ void sysctl_head_finish(struct ctl_table_header *head) spin_unlock(&sysctl_lock); } +static struct ctl_table_set * +lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces) +{ + struct ctl_table_set *set = &root->default_set; + if (root->lookup) + set = root->lookup(root, namespaces); + return set; +} + static struct list_head * lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces) { - struct list_head *header_list; - header_list = &root->header_list; - if (root->lookup) - header_list = root->lookup(root, namespaces); - return header_list; + struct ctl_table_set *set = lookup_header_set(root, namespaces); + return &set->list; } struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, @@ -1480,9 +1515,9 @@ static int do_sysctl_strategy(struct ctl_table_root *root, int op = 0, rc; if (oldval) - op |= 004; + op |= MAY_READ; if (newval) - op |= 002; + op |= MAY_WRITE; if (sysctl_perm(root, table, op)) return -EPERM; @@ -1524,7 +1559,7 @@ repeat: if (n == table->ctl_name) { int error; if (table->child) { - if (sysctl_perm(root, table, 001)) + if (sysctl_perm(root, table, MAY_EXEC)) return -EPERM; name++; nlen--; @@ -1599,7 +1634,7 @@ static int test_perm(int mode, int op) mode >>= 6; else if (in_egroup_p(0)) mode >>= 3; - if ((mode & op & 0007) == op) + if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) return 0; return -EACCES; } @@ -1609,7 +1644,7 @@ int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) int error; int mode; - error = security_sysctl(table, op); + error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC)); if (error) return error; @@ -1644,6 +1679,54 @@ static __init int sysctl_init(void) core_initcall(sysctl_init); +static struct ctl_table *is_branch_in(struct ctl_table *branch, + struct ctl_table *table) +{ + struct ctl_table *p; + const char *s = branch->procname; + + /* branch should have named subdirectory as its first element */ + if (!s || !branch->child) + return NULL; + + /* ... and nothing else */ + if (branch[1].procname || branch[1].ctl_name) + return NULL; + + /* table should contain subdirectory with the same name */ + for (p = table; p->procname || p->ctl_name; p++) { + if (!p->child) + continue; + if (p->procname && strcmp(p->procname, s) == 0) + return p; + } + return NULL; +} + +/* see if attaching q to p would be an improvement */ +static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q) +{ + struct ctl_table *to = p->ctl_table, *by = q->ctl_table; + struct ctl_table *next; + int is_better = 0; + int not_in_parent = !p->attached_by; + + while ((next = is_branch_in(by, to)) != NULL) { + if (by == q->attached_by) + is_better = 1; + if (to == p->attached_by) + not_in_parent = 1; + by = by->child; + to = next->child; + } + + if (is_better && not_in_parent) { + q->attached_by = by; + q->attached_to = to; + q->parent = p; + } +} + /** * __register_sysctl_paths - register a sysctl hierarchy * @root: List of sysctl headers to register on @@ -1720,10 +1803,10 @@ struct ctl_table_header *__register_sysctl_paths( struct nsproxy *namespaces, const struct ctl_path *path, struct ctl_table *table) { - struct list_head *header_list; struct ctl_table_header *header; struct ctl_table *new, **prevp; unsigned int n, npath; + struct ctl_table_set *set; /* Count the path components */ for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath) @@ -1765,6 +1848,7 @@ struct ctl_table_header *__register_sysctl_paths( header->unregistering = NULL; header->root = root; sysctl_set_parent(NULL, header->ctl_table); + header->count = 1; #ifdef CONFIG_SYSCTL_SYSCALL_CHECK if (sysctl_check_table(namespaces, header->ctl_table)) { kfree(header); @@ -1772,8 +1856,20 @@ struct ctl_table_header *__register_sysctl_paths( } #endif spin_lock(&sysctl_lock); - header_list = lookup_header_list(root, namespaces); - list_add_tail(&header->ctl_entry, header_list); + header->set = lookup_header_set(root, namespaces); + header->attached_by = header->ctl_table; + header->attached_to = root_table; + header->parent = &root_table_header; + for (set = header->set; set; set = set->parent) { + struct ctl_table_header *p; + list_for_each_entry(p, &set->list, ctl_entry) { + if (p->unregistering) + continue; + try_attach(p, header); + } + } + header->parent->count++; + list_add_tail(&header->ctl_entry, &header->set->list); spin_unlock(&sysctl_lock); return header; @@ -1828,8 +1924,37 @@ void unregister_sysctl_table(struct ctl_table_header * header) spin_lock(&sysctl_lock); start_unregistering(header); + if (!--header->parent->count) { + WARN_ON(1); + kfree(header->parent); + } + if (!--header->count) + kfree(header); spin_unlock(&sysctl_lock); - kfree(header); +} + +int sysctl_is_seen(struct ctl_table_header *p) +{ + struct ctl_table_set *set = p->set; + int res; + spin_lock(&sysctl_lock); + if (p->unregistering) + res = 0; + else if (!set->is_seen) + res = 1; + else + res = set->is_seen(set); + spin_unlock(&sysctl_lock); + return res; +} + +void setup_sysctl_set(struct ctl_table_set *p, + struct ctl_table_set *parent, + int (*is_seen)(struct ctl_table_set *)) +{ + INIT_LIST_HEAD(&p->list); + p->parent = parent ? parent : &sysctl_table_root.default_set; + p->is_seen = is_seen; } #else /* !CONFIG_SYSCTL */ @@ -1848,6 +1973,16 @@ void unregister_sysctl_table(struct ctl_table_header * table) { } +void setup_sysctl_set(struct ctl_table_set *p, + struct ctl_table_set *parent, + int (*is_seen)(struct ctl_table_set *)) +{ +} + +void sysctl_head_put(struct ctl_table_header *head) +{ +} + #endif /* CONFIG_SYSCTL */ /* diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 3d1e3e1a1971..f8d968063cea 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -72,6 +72,16 @@ void clockevents_set_mode(struct clock_event_device *dev, } /** + * clockevents_shutdown - shutdown the device and clear next_event + * @dev: device to shutdown + */ +void clockevents_shutdown(struct clock_event_device *dev) +{ + clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); + dev->next_event.tv64 = KTIME_MAX; +} + +/** * clockevents_program_event - Reprogram the clock event device. * @expires: absolute expiry time (monotonic clock) * @@ -177,7 +187,7 @@ void clockevents_register_device(struct clock_event_device *dev) /* * Noop handler when we shut down an event device */ -static void clockevents_handle_noop(struct clock_event_device *dev) +void clockevents_handle_noop(struct clock_event_device *dev) { } @@ -199,7 +209,6 @@ void clockevents_exchange_device(struct clock_event_device *old, * released list and do a notify add later. */ if (old) { - old->event_handler = clockevents_handle_noop; clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); list_del(&old->list); list_add(&old->list, &clockevents_released); @@ -207,7 +216,7 @@ void clockevents_exchange_device(struct clock_event_device *old, if (new) { BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); - clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(new); } local_irq_restore(flags); } diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5125ddd8196b..1ad46f3df6e7 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -245,7 +245,7 @@ static void sync_cmos_clock(unsigned long dummy) if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) fail = update_persistent_clock(now); - next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec; + next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2); if (next.tv_nsec <= 0) next.tv_nsec += NSEC_PER_SEC; diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 31463d370b94..cb01cd8f919b 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -175,6 +175,8 @@ static void tick_do_periodic_broadcast(void) */ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) { + ktime_t next; + tick_do_periodic_broadcast(); /* @@ -185,10 +187,13 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) /* * Setup the next period for devices, which do not have - * periodic mode: + * periodic mode. We read dev->next_event first and add to it + * when the event alrady expired. clockevents_program_event() + * sets dev->next_event only when the event is really + * programmed to the device. */ - for (;;) { - ktime_t next = ktime_add(dev->next_event, tick_period); + for (next = dev->next_event; ;) { + next = ktime_add(next, tick_period); if (!clockevents_program_event(dev, next, ktime_get())) return; @@ -205,7 +210,7 @@ static void tick_do_broadcast_on_off(void *why) struct clock_event_device *bc, *dev; struct tick_device *td; unsigned long flags, *reason = why; - int cpu; + int cpu, bc_stopped; spin_lock_irqsave(&tick_broadcast_lock, flags); @@ -223,14 +228,16 @@ static void tick_do_broadcast_on_off(void *why) if (!tick_device_is_functional(dev)) goto out; + bc_stopped = cpus_empty(tick_broadcast_mask); + switch (*reason) { case CLOCK_EVT_NOTIFY_BROADCAST_ON: case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: if (!cpu_isset(cpu, tick_broadcast_mask)) { cpu_set(cpu, tick_broadcast_mask); - if (td->mode == TICKDEV_MODE_PERIODIC) - clockevents_set_mode(dev, - CLOCK_EVT_MODE_SHUTDOWN); + if (tick_broadcast_device.mode == + TICKDEV_MODE_PERIODIC) + clockevents_shutdown(dev); } if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) tick_broadcast_force = 1; @@ -239,15 +246,17 @@ static void tick_do_broadcast_on_off(void *why) if (!tick_broadcast_force && cpu_isset(cpu, tick_broadcast_mask)) { cpu_clear(cpu, tick_broadcast_mask); - if (td->mode == TICKDEV_MODE_PERIODIC) + if (tick_broadcast_device.mode == + TICKDEV_MODE_PERIODIC) tick_setup_periodic(dev, 0); } break; } - if (cpus_empty(tick_broadcast_mask)) - clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); - else { + if (cpus_empty(tick_broadcast_mask)) { + if (!bc_stopped) + clockevents_shutdown(bc); + } else if (bc_stopped) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_broadcast_start_periodic(bc); else @@ -298,7 +307,7 @@ void tick_shutdown_broadcast(unsigned int *cpup) if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { if (bc && cpus_empty(tick_broadcast_mask)) - clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(bc); } spin_unlock_irqrestore(&tick_broadcast_lock, flags); @@ -313,7 +322,7 @@ void tick_suspend_broadcast(void) bc = tick_broadcast_device.evtdev; if (bc) - clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(bc); spin_unlock_irqrestore(&tick_broadcast_lock, flags); } @@ -364,16 +373,8 @@ cpumask_t *tick_get_broadcast_oneshot_mask(void) static int tick_broadcast_set_event(ktime_t expires, int force) { struct clock_event_device *bc = tick_broadcast_device.evtdev; - ktime_t now = ktime_get(); - int res; - - for(;;) { - res = clockevents_program_event(bc, expires, now); - if (!res || !force) - return res; - now = ktime_get(); - expires = ktime_add(now, ktime_set(0, bc->min_delta_ns)); - } + + return tick_dev_program_event(bc, expires, force); } int tick_resume_broadcast_oneshot(struct clock_event_device *bc) @@ -491,14 +492,52 @@ static void tick_broadcast_clear_oneshot(int cpu) cpu_clear(cpu, tick_broadcast_oneshot_mask); } +static void tick_broadcast_init_next_event(cpumask_t *mask, ktime_t expires) +{ + struct tick_device *td; + int cpu; + + for_each_cpu_mask_nr(cpu, *mask) { + td = &per_cpu(tick_cpu_device, cpu); + if (td->evtdev) + td->evtdev->next_event = expires; + } +} + /** * tick_broadcast_setup_oneshot - setup the broadcast device */ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { - bc->event_handler = tick_handle_oneshot_broadcast; - clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - bc->next_event.tv64 = KTIME_MAX; + /* Set it up only once ! */ + if (bc->event_handler != tick_handle_oneshot_broadcast) { + int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; + int cpu = smp_processor_id(); + cpumask_t mask; + + bc->event_handler = tick_handle_oneshot_broadcast; + clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); + + /* Take the do_timer update */ + tick_do_timer_cpu = cpu; + + /* + * We must be careful here. There might be other CPUs + * waiting for periodic broadcast. We need to set the + * oneshot_mask bits for those and program the + * broadcast device to fire. + */ + mask = tick_broadcast_mask; + cpu_clear(cpu, mask); + cpus_or(tick_broadcast_oneshot_mask, + tick_broadcast_oneshot_mask, mask); + + if (was_periodic && !cpus_empty(mask)) { + tick_broadcast_init_next_event(&mask, tick_next_period); + tick_broadcast_set_event(tick_next_period, 1); + } else + bc->next_event.tv64 = KTIME_MAX; + } } /* @@ -538,4 +577,12 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) spin_unlock_irqrestore(&tick_broadcast_lock, flags); } +/* + * Check, whether the broadcast device is in one shot mode + */ +int tick_broadcast_oneshot_active(void) +{ + return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; +} + #endif diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index bf43284d6855..df12434b43ca 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); */ ktime_t tick_next_period; ktime_t tick_period; -int tick_do_timer_cpu __read_mostly = -1; +int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; DEFINE_SPINLOCK(tick_device_lock); /* @@ -109,7 +109,8 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) if (!tick_device_is_functional(dev)) return; - if (dev->features & CLOCK_EVT_FEAT_PERIODIC) { + if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && + !tick_broadcast_oneshot_active()) { clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); } else { unsigned long seq; @@ -148,7 +149,7 @@ static void tick_setup_device(struct tick_device *td, * If no cpu took the do_timer update, assign it to * this cpu: */ - if (tick_do_timer_cpu == -1) { + if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { tick_do_timer_cpu = cpu; tick_next_period = ktime_get(); tick_period = ktime_set(0, NSEC_PER_SEC / HZ); @@ -161,6 +162,7 @@ static void tick_setup_device(struct tick_device *td, } else { handler = td->evtdev->event_handler; next_event = td->evtdev->next_event; + td->evtdev->event_handler = clockevents_handle_noop; } td->evtdev = newdev; @@ -196,12 +198,10 @@ static int tick_check_new_device(struct clock_event_device *newdev) struct tick_device *td; int cpu, ret = NOTIFY_OK; unsigned long flags; - cpumask_of_cpu_ptr_declare(cpumask); spin_lock_irqsave(&tick_device_lock, flags); cpu = smp_processor_id(); - cpumask_of_cpu_ptr_next(cpumask, cpu); if (!cpu_isset(cpu, newdev->cpumask)) goto out_bc; @@ -209,7 +209,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) curdev = td->evtdev; /* cpu local device ? */ - if (!cpus_equal(newdev->cpumask, *cpumask)) { + if (!cpus_equal(newdev->cpumask, cpumask_of_cpu(cpu))) { /* * If the cpu affinity of the device interrupt can not @@ -222,7 +222,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) * If we have a cpu local device already, do not replace it * by a non cpu local device */ - if (curdev && cpus_equal(curdev->cpumask, *cpumask)) + if (curdev && cpus_equal(curdev->cpumask, cpumask_of_cpu(cpu))) goto out_bc; } @@ -250,11 +250,11 @@ static int tick_check_new_device(struct clock_event_device *newdev) * not give it back to the clockevents layer ! */ if (tick_is_broadcast_device(curdev)) { - clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(curdev); curdev = NULL; } clockevents_exchange_device(curdev, newdev); - tick_setup_device(td, newdev, cpu, cpumask); + tick_setup_device(td, newdev, cpu, &cpumask_of_cpu(cpu)); if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) tick_oneshot_notify(); @@ -301,7 +301,8 @@ static void tick_shutdown(unsigned int *cpup) if (*cpup == tick_do_timer_cpu) { int cpu = first_cpu(cpu_online_map); - tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : -1; + tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : + TICK_DO_TIMER_NONE; } spin_unlock_irqrestore(&tick_device_lock, flags); } @@ -312,7 +313,7 @@ static void tick_suspend(void) unsigned long flags; spin_lock_irqsave(&tick_device_lock, flags); - clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(td->evtdev); spin_unlock_irqrestore(&tick_device_lock, flags); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index f13f2b7f4fd4..469248782c23 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -1,6 +1,10 @@ /* * tick internal variable and functions used by low/high res code */ + +#define TICK_DO_TIMER_NONE -1 +#define TICK_DO_TIMER_BOOT -2 + DECLARE_PER_CPU(struct tick_device, tick_cpu_device); extern spinlock_t tick_device_lock; extern ktime_t tick_next_period; @@ -10,6 +14,8 @@ extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); +extern void clockevents_shutdown(struct clock_event_device *dev); + /* * NO_HZ / high resolution timer shared code */ @@ -17,6 +23,8 @@ extern void tick_handle_periodic(struct clock_event_device *dev); extern void tick_setup_oneshot(struct clock_event_device *newdev, void (*handler)(struct clock_event_device *), ktime_t nextevt); +extern int tick_dev_program_event(struct clock_event_device *dev, + ktime_t expires, int force); extern int tick_program_event(ktime_t expires, int force); extern void tick_oneshot_notify(void); extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); @@ -27,6 +35,7 @@ extern void tick_broadcast_oneshot_control(unsigned long reason); extern void tick_broadcast_switch_to_oneshot(void); extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); +extern int tick_broadcast_oneshot_active(void); # else /* BROADCAST */ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { @@ -35,6 +44,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) static inline void tick_broadcast_oneshot_control(unsigned long reason) { } static inline void tick_broadcast_switch_to_oneshot(void) { } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } +static inline int tick_broadcast_oneshot_active(void) { return 0; } # endif /* !BROADCAST */ #else /* !ONESHOT */ @@ -64,6 +74,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) { return 0; } +static inline int tick_broadcast_oneshot_active(void) { return 0; } #endif /* !TICK_ONESHOT */ /* diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 450c04935b66..2e8de678e767 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -23,24 +23,56 @@ #include "tick-internal.h" /** - * tick_program_event + * tick_program_event internal worker function */ -int tick_program_event(ktime_t expires, int force) +int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, + int force) { - struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; ktime_t now = ktime_get(); + int i; - while (1) { + for (i = 0;;) { int ret = clockevents_program_event(dev, expires, now); if (!ret || !force) return ret; + + /* + * We tried 2 times to program the device with the given + * min_delta_ns. If that's not working then we double it + * and emit a warning. + */ + if (++i > 2) { + /* Increase the min. delta and try again */ + if (!dev->min_delta_ns) + dev->min_delta_ns = 5000; + else + dev->min_delta_ns += dev->min_delta_ns >> 1; + + printk(KERN_WARNING + "CE: %s increasing min_delta_ns to %lu nsec\n", + dev->name ? dev->name : "?", + dev->min_delta_ns << 1); + + i = 0; + } + now = ktime_get(); - expires = ktime_add(now, ktime_set(0, dev->min_delta_ns)); + expires = ktime_add_ns(now, dev->min_delta_ns); } } /** + * tick_program_event + */ +int tick_program_event(ktime_t expires, int force) +{ + struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; + + return tick_dev_program_event(dev, expires, force); +} + +/** * tick_resume_onshot - resume oneshot mode */ void tick_resume_oneshot(void) @@ -61,7 +93,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev, { newdev->event_handler = handler; clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); - clockevents_program_event(newdev, next_event, ktime_get()); + tick_dev_program_event(newdev, next_event, 1); } /** diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 825b4c00fe44..a4d219398167 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -20,6 +20,7 @@ #include <linux/profile.h> #include <linux/sched.h> #include <linux/tick.h> +#include <linux/module.h> #include <asm/irq_regs.h> @@ -75,6 +76,9 @@ static void tick_do_update_jiffies64(ktime_t now) incr * ticks); } do_timer(++ticks); + + /* Keep the tick_next_period variable up to date */ + tick_next_period = ktime_add(last_jiffies_update, tick_period); } write_sequnlock(&xtime_lock); } @@ -162,6 +166,8 @@ void tick_nohz_stop_idle(int cpu) ts->idle_lastupdate = now; ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); ts->idle_active = 0; + + sched_clock_idle_wakeup_event(0); } } @@ -177,6 +183,7 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts) } ts->idle_entrytime = now; ts->idle_active = 1; + sched_clock_idle_sleep_event(); return now; } @@ -184,9 +191,17 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - *last_update_time = ktime_to_us(ts->idle_lastupdate); + if (!tick_nohz_enabled) + return -1; + + if (ts->idle_active) + *last_update_time = ktime_to_us(ts->idle_lastupdate); + else + *last_update_time = ktime_to_us(ktime_get()); + return ktime_to_us(ts->idle_sleeptime); } +EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); /** * tick_nohz_stop_sched_tick - stop the idle tick from the idle task @@ -218,7 +233,7 @@ void tick_nohz_stop_sched_tick(int inidle) */ if (unlikely(!cpu_online(cpu))) { if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = -1; + tick_do_timer_cpu = TICK_DO_TIMER_NONE; } if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) @@ -289,7 +304,6 @@ void tick_nohz_stop_sched_tick(int inidle) ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; rcu_enter_nohz(); - sched_clock_tick_stop(cpu); } /* @@ -301,7 +315,7 @@ void tick_nohz_stop_sched_tick(int inidle) * invoked. */ if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = -1; + tick_do_timer_cpu = TICK_DO_TIMER_NONE; ts->idle_sleeps++; @@ -392,7 +406,6 @@ void tick_nohz_restart_sched_tick(void) select_nohz_load_balancer(0); now = ktime_get(); tick_do_update_jiffies64(now); - sched_clock_tick_start(cpu); cpu_clear(cpu, nohz_cpu_mask); /* @@ -467,7 +480,7 @@ static void tick_nohz_handler(struct clock_event_device *dev) * this duty, then the jiffies update is still serialized by * xtime_lock. */ - if (unlikely(tick_do_timer_cpu == -1)) + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) tick_do_timer_cpu = cpu; /* Check, if the jiffies need an update */ @@ -569,7 +582,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) * this duty, then the jiffies update is still serialized by * xtime_lock. */ - if (unlikely(tick_do_timer_cpu == -1)) + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) tick_do_timer_cpu = cpu; #endif @@ -621,7 +634,7 @@ void tick_setup_sched_timer(void) */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); ts->sched_timer.function = tick_sched_timer; - ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; /* Get the next period (per cpu) */ ts->sched_timer.expires = tick_init_jiffy_update(); @@ -645,17 +658,21 @@ void tick_setup_sched_timer(void) ts->nohz_mode = NOHZ_MODE_HIGHRES; #endif } +#endif /* HIGH_RES_TIMERS */ +#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS void tick_cancel_sched_timer(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); +# ifdef CONFIG_HIGH_RES_TIMERS if (ts->sched_timer.base) hrtimer_cancel(&ts->sched_timer); +# endif ts->nohz_mode = NOHZ_MODE_INACTIVE; } -#endif /* HIGH_RES_TIMERS */ +#endif /** * Async notification about clocksource changes diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4231a3dc224a..f6e3af31b403 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -587,7 +587,7 @@ static int __ftrace_modify_code(void *data) static void ftrace_run_update_code(int command) { - stop_machine_run(__ftrace_modify_code, &command, NR_CPUS); + stop_machine(__ftrace_modify_code, &command, NULL); } void ftrace_disable_daemon(void) @@ -787,7 +787,7 @@ static int ftrace_update_code(void) !ftrace_enabled || !ftraced_trigger) return 0; - stop_machine_run(__ftrace_update_code, NULL, NR_CPUS); + stop_machine(__ftrace_update_code, NULL, NULL); return 1; } @@ -1564,7 +1564,7 @@ static int __init ftrace_dynamic_init(void) addr = (unsigned long)ftrace_record_ip; - stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS); + stop_machine(ftrace_dyn_arch_init, &addr, NULL); /* ftrace_dyn_arch_init places the return code in addr */ if (addr) { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 868e121c8e38..8f3fb3db61c3 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1183,7 +1183,6 @@ static void *find_next_entry_inc(struct trace_iterator *iter) static void *s_next(struct seq_file *m, void *v, loff_t *pos) { struct trace_iterator *iter = m->private; - void *last_ent = iter->ent; int i = (int)*pos; void *ent; @@ -1203,9 +1202,6 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) iter->pos = *pos; - if (last_ent && !ent) - seq_puts(m, "\n\nvim:ft=help\n"); - return ent; } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 421d6fe3650e..ece6cfb649fa 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -253,12 +253,14 @@ void start_critical_timings(void) if (preempt_trace() || irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } +EXPORT_SYMBOL_GPL(start_critical_timings); void stop_critical_timings(void) { if (preempt_trace() || irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } +EXPORT_SYMBOL_GPL(stop_critical_timings); #ifdef CONFIG_IRQSOFF_TRACER #ifdef CONFIG_PROVE_LOCKING @@ -337,12 +339,14 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller); #ifdef CONFIG_PREEMPT_TRACER void trace_preempt_on(unsigned long a0, unsigned long a1) { - stop_critical_timing(a0, a1); + if (preempt_trace()) + stop_critical_timing(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { - start_critical_timing(a0, a1); + if (preempt_trace()) + start_critical_timing(a0, a1); } #endif /* CONFIG_PREEMPT_TRACER */ diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 3c8d61df4474..e303ccb62cdf 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -26,7 +26,8 @@ static struct task_struct *wakeup_task; static int wakeup_cpu; static unsigned wakeup_prio = -1; -static DEFINE_SPINLOCK(wakeup_lock); +static raw_spinlock_t wakeup_lock = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; static void __wakeup_reset(struct trace_array *tr); @@ -56,7 +57,8 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) if (unlikely(disabled != 1)) goto out; - spin_lock_irqsave(&wakeup_lock, flags); + local_irq_save(flags); + __raw_spin_lock(&wakeup_lock); if (unlikely(!wakeup_task)) goto unlock; @@ -71,7 +73,8 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) trace_function(tr, data, ip, parent_ip, flags); unlock: - spin_unlock_irqrestore(&wakeup_lock, flags); + __raw_spin_unlock(&wakeup_lock); + local_irq_restore(flags); out: atomic_dec(&data->disabled); @@ -145,7 +148,8 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, if (likely(disabled != 1)) goto out; - spin_lock_irqsave(&wakeup_lock, flags); + local_irq_save(flags); + __raw_spin_lock(&wakeup_lock); /* We could race with grabbing wakeup_lock */ if (unlikely(!tracer_enabled || next != wakeup_task)) @@ -174,7 +178,8 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, out_unlock: __wakeup_reset(tr); - spin_unlock_irqrestore(&wakeup_lock, flags); + __raw_spin_unlock(&wakeup_lock); + local_irq_restore(flags); out: atomic_dec(&tr->data[cpu]->disabled); } @@ -209,8 +214,6 @@ static void __wakeup_reset(struct trace_array *tr) struct trace_array_cpu *data; int cpu; - assert_spin_locked(&wakeup_lock); - for_each_possible_cpu(cpu) { data = tr->data[cpu]; tracing_reset(data); @@ -229,9 +232,11 @@ static void wakeup_reset(struct trace_array *tr) { unsigned long flags; - spin_lock_irqsave(&wakeup_lock, flags); + local_irq_save(flags); + __raw_spin_lock(&wakeup_lock); __wakeup_reset(tr); - spin_unlock_irqrestore(&wakeup_lock, flags); + __raw_spin_unlock(&wakeup_lock); + local_irq_restore(flags); } static void @@ -252,7 +257,7 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p, goto out; /* interrupts should be off from try_to_wake_up */ - spin_lock(&wakeup_lock); + __raw_spin_lock(&wakeup_lock); /* check for races. */ if (!tracer_enabled || p->prio >= wakeup_prio) @@ -274,7 +279,7 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p, CALLER_ADDR1, CALLER_ADDR2, flags); out_locked: - spin_unlock(&wakeup_lock); + __raw_spin_unlock(&wakeup_lock); out: atomic_dec(&tr->data[cpu]->disabled); } diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index ce2d723c10e1..db58fb66a135 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -202,7 +202,7 @@ static void start_stack_timer(int cpu) hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer->function = stack_trace_timer_fn; - hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); } @@ -213,9 +213,7 @@ static void start_stack_timers(void) int cpu; for_each_online_cpu(cpu) { - cpumask_of_cpu_ptr(new_mask, cpu); - - set_cpus_allowed_ptr(current, new_mask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); start_stack_timer(cpu); } set_cpus_allowed_ptr(current, &saved_mask); diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 3da47ccdc5e5..8ebcd8532dfb 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -94,10 +94,10 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; mmput(mm); } - stats->read_char = p->rchar; - stats->write_char = p->wchar; - stats->read_syscalls = p->syscr; - stats->write_syscalls = p->syscw; + stats->read_char = p->ioac.rchar; + stats->write_char = p->ioac.wchar; + stats->read_syscalls = p->ioac.syscr; + stats->write_syscalls = p->ioac.syscw; #ifdef CONFIG_TASK_IO_ACCOUNTING stats->read_bytes = p->ioac.read_bytes; stats->write_bytes = p->ioac.write_bytes; diff --git a/kernel/user.c b/kernel/user.c index 865ecf57a096..39d6159fae43 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -169,7 +169,7 @@ static ssize_t cpu_rt_runtime_show(struct kobject *kobj, { struct user_struct *up = container_of(kobj, struct user_struct, kobj); - return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg)); + return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg)); } static ssize_t cpu_rt_runtime_store(struct kobject *kobj, @@ -180,7 +180,7 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj, unsigned long rt_runtime; int rc; - sscanf(buf, "%lu", &rt_runtime); + sscanf(buf, "%ld", &rt_runtime); rc = sched_group_set_rt_runtime(up->tg, rt_runtime); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index a9ab0596de44..532858fa5b88 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -6,7 +6,6 @@ */ #include <linux/module.h> -#include <linux/version.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/user_namespace.h> diff --git a/kernel/utsname.c b/kernel/utsname.c index 64d398f12444..815237a55af8 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -12,7 +12,6 @@ #include <linux/module.h> #include <linux/uts.h> #include <linux/utsname.h> -#include <linux/version.h> #include <linux/err.h> #include <linux/slab.h> diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index fe3a56c2256d..4ab9659d269e 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -12,7 +12,6 @@ #include <linux/module.h> #include <linux/uts.h> #include <linux/utsname.h> -#include <linux/version.h> #include <linux/sysctl.h> static void *get_uts(ctl_table *table, int write) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ec7e4f62aaff..4048e92aa04f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -290,11 +290,11 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) BUG_ON(get_wq_data(work) != cwq); work_clear_pending(work); - lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); - lock_acquire(&lockdep_map, 0, 0, 0, 2, _THIS_IP_); + lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_acquire(&lockdep_map); f(work); - lock_release(&lockdep_map, 1, _THIS_IP_); - lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_); + lock_map_release(&lockdep_map); + lock_map_release(&cwq->wq->lockdep_map); if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " @@ -413,8 +413,8 @@ void flush_workqueue(struct workqueue_struct *wq) int cpu; might_sleep(); - lock_acquire(&wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); - lock_release(&wq->lockdep_map, 1, _THIS_IP_); + lock_map_acquire(&wq->lockdep_map); + lock_map_release(&wq->lockdep_map); for_each_cpu_mask_nr(cpu, *cpu_map) flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); } @@ -441,8 +441,8 @@ int flush_work(struct work_struct *work) if (!cwq) return 0; - lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); - lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_); + lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_release(&cwq->wq->lockdep_map); prev = NULL; spin_lock_irq(&cwq->lock); @@ -536,8 +536,8 @@ static void wait_on_work(struct work_struct *work) might_sleep(); - lock_acquire(&work->lockdep_map, 0, 0, 0, 2, _THIS_IP_); - lock_release(&work->lockdep_map, 1, _THIS_IP_); + lock_map_acquire(&work->lockdep_map); + lock_map_release(&work->lockdep_map); cwq = get_wq_data(work); if (!cwq) @@ -830,10 +830,21 @@ struct workqueue_struct *__create_workqueue_key(const char *name, start_workqueue_thread(cwq, -1); } else { cpu_maps_update_begin(); + /* + * We must place this wq on list even if the code below fails. + * cpu_down(cpu) can remove cpu from cpu_populated_map before + * destroy_workqueue() takes the lock, in that case we leak + * cwq[cpu]->thread. + */ spin_lock(&workqueue_lock); list_add(&wq->list, &workqueues); spin_unlock(&workqueue_lock); - + /* + * We must initialize cwqs for each possible cpu even if we + * are going to call destroy_workqueue() finally. Otherwise + * cpu_up() can hit the uninitialized cwq once we drop the + * lock. + */ for_each_possible_cpu(cpu) { cwq = init_cpu_workqueue(wq, cpu); if (err || !cpu_online(cpu)) @@ -861,8 +872,8 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) if (cwq->thread == NULL) return; - lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); - lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_); + lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_release(&cwq->wq->lockdep_map); flush_cpu_workqueue(cwq); /* |