From ae9d67aff60af59548b6c7d1a74febea09660122 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt
Date: Tue, 18 Jan 2011 06:48:12 +0100
Subject: audit: export symbol for use with xt_AUDIT

When xt_AUDIT is built as a module, modpost reports a problem.

  MODPOST 322 modules
ERROR: "audit_enabled" [net/netfilter/x_tables.ko] undefined!
WARNING: modpost: Found 1 section mismatch(es).

Cc: Thomas Graf
Signed-off-by: Jan Engelhardt
---
 kernel/audit.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 77770a034d59..5842f65bedcb 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -74,6 +74,8 @@ static int audit_initialized;
 int audit_enabled;
 int audit_ever_enabled;
 
+EXPORT_SYMBOL_GPL(audit_enabled);
+
 /* Default state when kernel boots without any parameters. */
 static int audit_default;
 
--
cgit v1.2.3


From 42b16b3fbb5ee4555f5dee6220f3ccaa6e1ebe47 Mon Sep 17 00:00:00 2001
From: Jesper Juhl
Date: Mon, 17 Jan 2011 00:09:38 +0100
Subject: Kill off warning: ‘inline’ is not at beginning of declaration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix a bunch of
  warning: ‘inline’ is not at beginning of declaration
messages when building a 'make allyesconfig' kernel with -Wextra.

These warnings are trivial to kill, yet rather annoying when building
with -Wextra.

The more we can cut down on pointless crap like this the better (IMHO).

A previous patch to do this for a 'allnoconfig' build has already been
merged. This just takes the cleanup a little further.

Signed-off-by: Jesper Juhl
Signed-off-by: Jiri Kosina
---
 kernel/trace/ring_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bd1c35a4fbcc..6ee56b4ad136 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -669,7 +669,7 @@ static struct list_head *rb_list_head(struct list_head *list)
 * the reader page). But if the next page is a header page,
 * its flags will be non zero.
 */
-static int inline
+static inline int
 rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
		struct buffer_page *page, struct list_head *list)
 {
--
cgit v1.2.3


From cd7eab44e9946c28d595abe3e9a43e945bc49141 Mon Sep 17 00:00:00 2001
From: Ben Hutchings
Date: Wed, 19 Jan 2011 21:01:44 +0000
Subject: genirq: Add IRQ affinity notifiers

When initiating I/O on a multiqueue and multi-IRQ device, we may want
to select a queue for which the response will be handled on the same
or a nearby CPU. This requires a reverse-map of IRQ affinity. Add a
notification mechanism to support this.

This is based closely on work by Thomas Gleixner.
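For illustration only (not part of this patch): a driver that wants to maintain such a reverse map would hook the new mechanism roughly as sketched below. The myq_* names are hypothetical, and it is assumed here that struct irq_affinity_notify and irq_set_affinity_notifier() are declared in <linux/interrupt.h>.

#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/interrupt.h>

/* Hypothetical per-queue state of a multiqueue driver. */
struct myq {
	unsigned int irq;
	struct irq_affinity_notify affinity_notify;
};

/* Runs from the notifier's work item with the current affinity mask. */
static void myq_affinity_changed(struct irq_affinity_notify *notify,
				 const cpumask_t *mask)
{
	struct myq *q = container_of(notify, struct myq, affinity_notify);

	/* Rebuild the driver's CPU -> queue reverse map from @mask here. */
	pr_info("myq: affinity of IRQ %u changed\n", q->irq);
}

/* Called once the last reference on the notifier context is dropped. */
static void myq_affinity_release(struct kref *ref)
{
	struct irq_affinity_notify *notify =
		container_of(ref, struct irq_affinity_notify, kref);

	pr_debug("myq: notifier for IRQ %u released\n", notify->irq);
}

static int myq_enable_affinity_tracking(struct myq *q)
{
	/* Only the function pointers need initialising; the core fills in
	 * irq, kref and work inside irq_set_affinity_notifier(). */
	q->affinity_notify.notify = myq_affinity_changed;
	q->affinity_notify.release = myq_affinity_release;
	return irq_set_affinity_notifier(q->irq, &q->affinity_notify);
}

static void myq_disable_affinity_tracking(struct myq *q)
{
	/* Must happen before free_irq(), as required above. */
	irq_set_affinity_notifier(q->irq, NULL);
}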
Signed-off-by: Ben Hutchings Cc: linux-net-drivers@solarflare.com Cc: Tom Herbert Cc: David Miller LKML-Reference: <1295470904.11126.84.camel@bwh-desktop> Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0caa59f747dd..0587c5ceaed8 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -134,6 +134,10 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) irq_set_thread_affinity(desc); } #endif + if (desc->affinity_notify) { + kref_get(&desc->affinity_notify->kref); + schedule_work(&desc->affinity_notify->work); + } desc->status |= IRQ_AFFINITY_SET; raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; @@ -155,6 +159,79 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) } EXPORT_SYMBOL_GPL(irq_set_affinity_hint); +static void irq_affinity_notify(struct work_struct *work) +{ + struct irq_affinity_notify *notify = + container_of(work, struct irq_affinity_notify, work); + struct irq_desc *desc = irq_to_desc(notify->irq); + cpumask_var_t cpumask; + unsigned long flags; + + if (!desc) + goto out; + + if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) + goto out; + + raw_spin_lock_irqsave(&desc->lock, flags); +#ifdef CONFIG_GENERIC_PENDING_IRQ + if (desc->status & IRQ_MOVE_PENDING) + cpumask_copy(cpumask, desc->pending_mask); + else +#endif + cpumask_copy(cpumask, desc->affinity); + raw_spin_unlock_irqrestore(&desc->lock, flags); + + notify->notify(notify, cpumask); + + free_cpumask_var(cpumask); +out: + kref_put(¬ify->kref, notify->release); +} + +/** + * irq_set_affinity_notifier - control notification of IRQ affinity changes + * @irq: Interrupt for which to enable/disable notification + * @notify: Context for notification, or %NULL to disable + * notification. Function pointers must be initialised; + * the other fields will be initialised by this function. + * + * Must be called in process context. Notification may only be enabled + * after the IRQ is allocated and must be disabled before the IRQ is + * freed using free_irq(). + */ +int +irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irq_affinity_notify *old_notify; + unsigned long flags; + + /* The release function is promised process context */ + might_sleep(); + + if (!desc) + return -EINVAL; + + /* Complete initialisation of *notify */ + if (notify) { + notify->irq = irq; + kref_init(¬ify->kref); + INIT_WORK(¬ify->work, irq_affinity_notify); + } + + raw_spin_lock_irqsave(&desc->lock, flags); + old_notify = desc->affinity_notify; + desc->affinity_notify = notify; + raw_spin_unlock_irqrestore(&desc->lock, flags); + + if (old_notify) + kref_put(&old_notify->kref, old_notify->release); + + return 0; +} +EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); + #ifndef CONFIG_AUTO_IRQ_AFFINITY /* * Generic version of the affinity autoselector. 
@@ -1004,6 +1081,11 @@ void free_irq(unsigned int irq, void *dev_id) if (!desc) return; +#ifdef CONFIG_SMP + if (WARN_ON(desc->affinity_notify)) + desc->affinity_notify = NULL; +#endif + chip_bus_lock(desc); kfree(__free_irq(irq, dev_id)); chip_bus_sync_unlock(desc); -- cgit v1.2.3 From 6d5ab2932a21ea54406ab95c43ecff90a3eddfda Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Fri, 21 Jan 2011 20:45:01 -0800 Subject: sched: Simplify update_cfs_shares parameters Re-visiting this: Since update_cfs_shares will now only ever re-weight an entity that is a relative parent of the current entity in enqueue_entity; we can safely issue the account_entity_enqueue relative to that cfs_rq and avoid the requirement for special handling of the enqueue case in update_cfs_shares. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20110122044851.915214637@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_fair.c | 30 ++++++++++++++---------------- 2 files changed, 15 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 18d38e4ec7ba..e0fa3ff7f194 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8510,7 +8510,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) /* Propagate contribution to hierarchy */ raw_spin_lock_irqsave(&rq->lock, flags); for_each_sched_entity(se) - update_cfs_shares(group_cfs_rq(se), 0); + update_cfs_shares(group_cfs_rq(se)); raw_spin_unlock_irqrestore(&rq->lock, flags); } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0c26e2df450e..0c550c841eee 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -540,7 +540,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); -static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta); +static void update_cfs_shares(struct cfs_rq *cfs_rq); /* * Update the current task's runtime statistics. 
Skip current tasks that @@ -763,16 +763,15 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) list_del_leaf_cfs_rq(cfs_rq); } -static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, - long weight_delta) +static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { long load_weight, load, shares; - load = cfs_rq->load.weight + weight_delta; + load = cfs_rq->load.weight; load_weight = atomic_read(&tg->load_weight); - load_weight -= cfs_rq->load_contribution; load_weight += load; + load_weight -= cfs_rq->load_contribution; shares = (tg->shares * load); if (load_weight) @@ -790,7 +789,7 @@ static void update_entity_shares_tick(struct cfs_rq *cfs_rq) { if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); } } # else /* CONFIG_SMP */ @@ -798,8 +797,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) { } -static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, - long weight_delta) +static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { return tg->shares; } @@ -824,7 +822,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, account_entity_enqueue(cfs_rq, se); } -static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) +static void update_cfs_shares(struct cfs_rq *cfs_rq) { struct task_group *tg; struct sched_entity *se; @@ -838,7 +836,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) if (likely(se->load.weight == tg->shares)) return; #endif - shares = calc_cfs_shares(cfs_rq, tg, weight_delta); + shares = calc_cfs_shares(cfs_rq, tg); reweight_entity(cfs_rq_of(se), se, shares); } @@ -847,7 +845,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) { } -static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) +static inline void update_cfs_shares(struct cfs_rq *cfs_rq) { } @@ -978,8 +976,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ update_curr(cfs_rq); update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, se->load.weight); account_entity_enqueue(cfs_rq, se); + update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); @@ -1041,7 +1039,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_cfs_load(cfs_rq, 0); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); /* * Normalize the entity after updating the min_vruntime because the @@ -1282,7 +1280,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq = cfs_rq_of(se); update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); } hrtick_update(rq); @@ -1312,7 +1310,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq = cfs_rq_of(se); update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, 0); + update_cfs_shares(cfs_rq); } hrtick_update(rq); @@ -2123,7 +2121,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu) * We need to update shares after updating tg->load_weight in * order to adjust the weight of groups with long running tasks. 
 	 */
-	update_cfs_shares(cfs_rq, 0);
+	update_cfs_shares(cfs_rq);
 
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
--
cgit v1.2.3


From f07333bf6ee66d9b49286cec4371cf375e745b7a Mon Sep 17 00:00:00 2001
From: Paul Turner
Date: Fri, 21 Jan 2011 20:45:03 -0800
Subject: sched: Avoid expensive initial update_cfs_load()

Since cfs->{load_stamp,load_last} are zero-initialized, the initial load
update will consider the delta to be 'since the beginning of time'.

This results in a lot of pointless divisions to bring this large period
within the sysctl_sched_shares_window.

Fix this by initializing load_stamp to 1 at cfs_rq initialization; this
allows for an initial load_stamp > load_last, which then lets standard
idle truncation proceed.

We avoid spinning (and slightly improve consistency) by fixing delta to
be [period - 1] in this path, resulting in a slightly more predictable
shares ramp. (Previously the amount of idle time preserved by the
overflow would range between [period/2, period-1].)

Signed-off-by: Paul Turner
Signed-off-by: Peter Zijlstra
LKML-Reference: <20110122044852.102126037@google.com>
Signed-off-by: Ingo Molnar
---
 kernel/sched.c      | 2 ++
 kernel/sched_fair.c | 1 +
 2 files changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index e0fa3ff7f194..6820b5b3a969 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7796,6 +7796,8 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
 	INIT_LIST_HEAD(&cfs_rq->tasks);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	cfs_rq->rq = rq;
+	/* allow initial update_cfs_load() to truncate */
+	cfs_rq->load_stamp = 1;
 #endif
 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 }
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0c550c841eee..4cbc9121094c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -733,6 +733,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 	    now - cfs_rq->load_last > 4 * period) {
 		cfs_rq->load_period = 0;
 		cfs_rq->load_avg = 0;
+		delta = period - 1;
 	}
 
 	cfs_rq->load_stamp = now;
--
cgit v1.2.3


From 4dd53d891ca46dcc1fde0376a33540d3fd83cb9a Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi
Date: Tue, 21 Dec 2010 17:09:00 -0800
Subject: softirqs: Free up pf flag PF_KSOFTIRQD

Cleanup patch, freeing up PF_KSOFTIRQD and using a per_cpu ksoftirqd
pointer instead, as suggested by Eric Dumazet.
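The header-side counterpart of this change falls outside the kernel/ view shown here; presumably it exposes the per-CPU pointer along the lines of the sketch below (a reconstruction, not quoted from the patch), which is what lets sched.c below test curr != this_cpu_ksoftirqd() instead of a task flag.

/* Likely shape of the declaration in include/linux/interrupt.h (sketch): */
DECLARE_PER_CPU(struct task_struct *, ksoftirqd);

static inline struct task_struct *this_cpu_ksoftirqd(void)
{
	/* The per-CPU ksoftirqd task pointer replaces the PF_KSOFTIRQD flag. */
	return this_cpu_read(ksoftirqd);
}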
Tested-by: Shaun Ruffell Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1292980144-28796-2-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/softirq.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6820b5b3a969..8b89b3bba565 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1880,7 +1880,7 @@ void account_system_vtime(struct task_struct *curr) */ if (hardirq_count()) __this_cpu_add(cpu_hardirq_time, delta); - else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) + else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) __this_cpu_add(cpu_softirq_time, delta); irq_time_write_end(); diff --git a/kernel/softirq.c b/kernel/softirq.c index 68eb5efec388..0cee50487629 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -54,7 +54,7 @@ EXPORT_SYMBOL(irq_stat); static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; -static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); +DEFINE_PER_CPU(struct task_struct *, ksoftirqd); char *softirq_to_name[NR_SOFTIRQS] = { "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", @@ -721,7 +721,6 @@ static int run_ksoftirqd(void * __bind_cpu) { set_current_state(TASK_INTERRUPTIBLE); - current->flags |= PF_KSOFTIRQD; while (!kthread_should_stop()) { preempt_disable(); if (!local_softirq_pending()) { -- cgit v1.2.3 From a1dabb6bfffccb897eff3e1d102dacf2a4bedf3b Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 21 Dec 2010 17:09:01 -0800 Subject: time: Add nsecs_to_cputime64 interface for asm-generic Add nsecs_to_cputime64 interface. This is used in following patches that updates cpu irq stat based on ns granularity info in IRQ_TIME_ACCOUNTING. Tested-by: Shaun Ruffell Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1292980144-28796-3-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/time.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index 32174359576f..55337a816b20 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -645,7 +645,7 @@ u64 nsec_to_clock_t(u64 x) } /** - * nsecs_to_jiffies - Convert nsecs in u64 to jiffies + * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 * * @n: nsecs in u64 * @@ -657,7 +657,7 @@ u64 nsec_to_clock_t(u64 x) * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years */ -unsigned long nsecs_to_jiffies(u64 n) +u64 nsecs_to_jiffies64(u64 n) { #if (NSEC_PER_SEC % HZ) == 0 /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ @@ -674,6 +674,25 @@ unsigned long nsecs_to_jiffies(u64 n) #endif } + +/** + * nsecs_to_jiffies - Convert nsecs in u64 to jiffies + * + * @n: nsecs in u64 + * + * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. + * And this doesn't return MAX_JIFFY_OFFSET since this function is designed + * for scheduler, not for use in device drivers to calculate timeout value. 
+ * + * note: + * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) + * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years + */ +unsigned long nsecs_to_jiffies(u64 n) +{ + return (unsigned long)nsecs_to_jiffies64(n); +} + #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) { -- cgit v1.2.3 From 70a89a6620f658d47a1488515bada4b8ee6291d8 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 21 Dec 2010 17:09:02 -0800 Subject: sched: Refactor account_system_time separating id-update Refactor account_system_time, to separate out the logic of identifying the update needed and code that does actual update. This is used by following patch for IRQ_TIME_ACCOUNTING, which has different identification logic and same update logic. Tested-by: Shaun Ruffell Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1292980144-28796-4-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 46 +++++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8b89b3bba565..e3fa92106ed7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3567,6 +3567,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, } } +/* + * Account system cpu time to a process and desired cpustat field + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + * @target_cputime64: pointer to cpustat field that has to be updated + */ +static inline +void __account_system_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled, cputime64_t *target_cputime64) +{ + cputime64_t tmp = cputime_to_cputime64(cputime); + + /* Add system time to process. */ + p->stime = cputime_add(p->stime, cputime); + p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); + account_group_system_time(p, cputime); + + /* Add system time to cpustat. */ + *target_cputime64 = cputime64_add(*target_cputime64, tmp); + cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); + + /* Account for system time used */ + acct_update_integrals(p); +} + /* * Account system cpu time to a process. * @p: the process that the cpu time gets accounted to @@ -3578,31 +3604,21 @@ void account_system_time(struct task_struct *p, int hardirq_offset, cputime_t cputime, cputime_t cputime_scaled) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t tmp; + cputime64_t *target_cputime64; if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { account_guest_time(p, cputime, cputime_scaled); return; } - /* Add system time to process. */ - p->stime = cputime_add(p->stime, cputime); - p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); - account_group_system_time(p, cputime); - - /* Add system time to cpustat. 
*/ - tmp = cputime_to_cputime64(cputime); if (hardirq_count() - hardirq_offset) - cpustat->irq = cputime64_add(cpustat->irq, tmp); + target_cputime64 = &cpustat->irq; else if (in_serving_softirq()) - cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + target_cputime64 = &cpustat->softirq; else - cpustat->system = cputime64_add(cpustat->system, tmp); + target_cputime64 = &cpustat->system; - cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); - - /* Account for system time used */ - acct_update_integrals(p); + __account_system_time(p, cputime, cputime_scaled, target_cputime64); } /* -- cgit v1.2.3 From abb74cefa9c682fb38ba86c17ca3c86fed6cc464 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 21 Dec 2010 17:09:03 -0800 Subject: sched: Export ns irqtimes through /proc/stat CONFIG_IRQ_TIME_ACCOUNTING adds ns granularity irq time on each CPU. This info is already used in scheduler to do proper task chargeback (earlier patches). This patch retro-fits this ns granularity hardirq and softirq information to /proc/stat irq and softirq fields. The update is still done on timer tick, where we look at accumulated ns hardirq/softirq time and account the tick to user/system/irq/hardirq/guest accordingly. No new interface added. Earlier versions looked at adding this as new fields in some /proc files. This one seems to be the best in terms of impact to existing apps, even though it has somewhat more kernel code than earlier versions. Tested-by: Shaun Ruffell Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1292980144-28796-5-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e3fa92106ed7..2a3c9799d76b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1920,8 +1920,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) sched_rt_avg_update(rq, irq_delta); } +static int irqtime_account_hi_update(void) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + unsigned long flags; + u64 latest_ns; + int ret = 0; + + local_irq_save(flags); + latest_ns = this_cpu_read(cpu_hardirq_time); + if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) + ret = 1; + local_irq_restore(flags); + return ret; +} + +static int irqtime_account_si_update(void) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + unsigned long flags; + u64 latest_ns; + int ret = 0; + + local_irq_save(flags); + latest_ns = this_cpu_read(cpu_softirq_time); + if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) + ret = 1; + local_irq_restore(flags); + return ret; +} + #else /* CONFIG_IRQ_TIME_ACCOUNTING */ +#define sched_clock_irqtime (0) + static void update_rq_clock_task(struct rq *rq, s64 delta) { rq->clock_task += delta; @@ -3621,6 +3653,65 @@ void account_system_time(struct task_struct *p, int hardirq_offset, __account_system_time(p, cputime, cputime_scaled, target_cputime64); } +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +/* + * Account a tick to a process and cpustat + * @p: the process that the cpu time gets accounted to + * @user_tick: is the tick from userspace + * @rq: the pointer to rq + * + * Tick demultiplexing follows the order + * - pending hardirq update + * - pending softirq update + * - user_time + * - idle_time + * - system time + * - check for guest_time + * - else account as system_time + * + * Check for hardirq is done both for system 
and user time as there is + * no timer going off while we are on hardirq and hence we may never get an + * opportunity to update it solely in system time. + * p->stime and friends are only updated on system time and not on irq + * softirq as those do not count in task exec_runtime any more. + */ +static void irqtime_account_process_tick(struct task_struct *p, int user_tick, + struct rq *rq) +{ + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + + if (irqtime_account_hi_update()) { + cpustat->irq = cputime64_add(cpustat->irq, tmp); + } else if (irqtime_account_si_update()) { + cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + } else if (user_tick) { + account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + } else if (p == rq->idle) { + account_idle_time(cputime_one_jiffy); + } else if (p->flags & PF_VCPU) { /* System time or guest time */ + account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); + } else { + __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, + &cpustat->system); + } +} + +static void irqtime_account_idle_ticks(int ticks) +{ + int i; + struct rq *rq = this_rq(); + + for (i = 0; i < ticks; i++) + irqtime_account_process_tick(current, 0, rq); +} +#else +static void irqtime_account_idle_ticks(int ticks) {} +static void irqtime_account_process_tick(struct task_struct *p, int user_tick, + struct rq *rq) {} +#endif + /* * Account for involuntary wait time. * @steal: the cpu time spent in involuntary wait @@ -3661,6 +3752,11 @@ void account_process_tick(struct task_struct *p, int user_tick) cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); struct rq *rq = this_rq(); + if (sched_clock_irqtime) { + irqtime_account_process_tick(p, user_tick, rq); + return; + } + if (user_tick) account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) @@ -3686,6 +3782,12 @@ void account_steal_ticks(unsigned long ticks) */ void account_idle_ticks(unsigned long ticks) { + + if (sched_clock_irqtime) { + irqtime_account_idle_ticks(ticks); + return; + } + account_idle_time(jiffies_to_cputime(ticks)); } -- cgit v1.2.3 From 414bee9ba613adb3804965e2d84db32d0599f9c6 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 21 Dec 2010 17:09:04 -0800 Subject: softirqs: Account ksoftirqd time as cpustat softirq softirq time in ksoftirqd context is not accounted in ns granularity per cpu softirq stats, as we want that to be a part of ksoftirqd exec_runtime. Accounting them as softirq on /proc/stat separately. Tested-by: Shaun Ruffell Signed-off-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra LKML-Reference: <1292980144-28796-6-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2a3c9799d76b..8b718b59b09f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3686,6 +3686,14 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, cpustat->irq = cputime64_add(cpustat->irq, tmp); } else if (irqtime_account_si_update()) { cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + } else if (this_cpu_ksoftirqd() == p) { + /* + * ksoftirqd time do not get accounted in cpu_softirq_time. + * So, we have to handle it separately here. + * Also, p->stime needs to be updated for ksoftirqd. 
+	 */
+	__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+				&cpustat->softirq);
 	} else if (user_tick) {
 		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
 	} else if (p == rq->idle) {
--
cgit v1.2.3


From a8941d7ec81678fb69aea7183338175f112f3e0d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 25 Jan 2011 16:30:03 +0100
Subject: sched: Simplify the idle scheduling class

Since commit 48c5ccae88dcd (sched: Simplify cpu-hot-unplug task
migration) this should no longer happen, so remove the code.

Signed-off-by: Peter Zijlstra
LKML-Reference:
Signed-off-by: Ingo Molnar
---
 kernel/sched_idletask.c | 23 ++++-------------------
 1 file changed, 4 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 9fa0f402c87c..41eb62a0808b 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -52,31 +52,16 @@ static void set_curr_task_idle(struct rq *rq)
 {
 }
 
-static void switched_to_idle(struct rq *rq, struct task_struct *p,
-			     int running)
+static void
+switched_to_idle(struct rq *rq, struct task_struct *p, int running)
 {
-	/* Can this actually happen?? */
-	if (running)
-		resched_task(rq->curr);
-	else
-		check_preempt_curr(rq, p, 0);
+	BUG();
 }
 
 static void prio_changed_idle(struct rq *rq, struct task_struct *p,
 			      int oldprio, int running)
 {
-	/* This can happen for hot plug CPUS */
-
-	/*
-	 * Reschedule if we are currently running on this runqueue and
-	 * our priority decreased, or if we are not currently running on
-	 * this runqueue and our priority is higher than the current's
-	 */
-	if (running) {
-		if (p->prio > oldprio)
-			resched_task(rq->curr);
-	} else
-		check_preempt_curr(rq, p, 0);
+	BUG();
 }
 
 static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
--
cgit v1.2.3


From da7a735e51f9622eb3e1672594d4a41da01d7e4f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 17 Jan 2011 17:03:27 +0100
Subject: sched: Fix switch_from_fair()

When a task is taken out of the fair class we must ensure the vruntime
is properly normalized because when we put it back in it will assume
to be normalized.

The case that goes wrong is when changing away from the fair class
while sleeping. Sleeping tasks have non-normalized vruntime in order
to make sleeper-fairness work. So treat the switch away from fair as a
wakeup and preserve the relative vruntime.

Also update sysrq-n to call the ->switch_{to,from} methods.
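The relative-vruntime trick can be illustrated with a small standalone C program (toy numbers, nothing here is kernel code): the lag against min_vruntime is what is preserved across the time spent outside the fair class, no matter how far min_vruntime advances in the meantime.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t min_vruntime = 1000000;	/* stand-in for cfs_rq->min_vruntime */
	uint64_t vruntime = 1000500;		/* sleeping task's absolute vruntime */

	/* switched_from_fair(): keep only the lag relative to the queue */
	uint64_t lag = vruntime - min_vruntime;

	/* while the task runs in another class, min_vruntime keeps advancing */
	min_vruntime += 750000;

	/* switched back to fair + enqueue: re-base against the new min_vruntime */
	vruntime = lag + min_vruntime;

	printf("re-enqueued vruntime=%llu, preserved lag=%llu\n",
	       (unsigned long long)vruntime, (unsigned long long)lag);
	return 0;
}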
Reported-by: Onkalo Samu Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 25 ++++++++++++++----------- kernel/sched_fair.c | 42 ++++++++++++++++++++++++++++++++++++------ kernel/sched_idletask.c | 7 +++---- kernel/sched_rt.c | 19 ++++++++++--------- kernel/sched_stoptask.c | 7 +++---- 5 files changed, 66 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8b718b59b09f..78fa75394011 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2057,14 +2057,14 @@ inline int task_curr(const struct task_struct *p) static inline void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, - int oldprio, int running) + int oldprio) { if (prev_class != p->sched_class) { if (prev_class->switched_from) - prev_class->switched_from(rq, p, running); - p->sched_class->switched_to(rq, p, running); - } else - p->sched_class->prio_changed(rq, p, oldprio, running); + prev_class->switched_from(rq, p); + p->sched_class->switched_to(rq, p); + } else if (oldprio != p->prio) + p->sched_class->prio_changed(rq, p, oldprio); } static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) @@ -2598,6 +2598,7 @@ static void __sched_fork(struct task_struct *p) p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; + p->se.vruntime = 0; #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); @@ -4696,11 +4697,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->set_curr_task(rq); - if (on_rq) { + if (on_rq) enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); - check_class_changed(rq, p, prev_class, oldprio, running); - } + check_class_changed(rq, p, prev_class, oldprio); task_rq_unlock(rq, &flags); } @@ -5028,11 +5028,10 @@ recheck: if (running) p->sched_class->set_curr_task(rq); - if (on_rq) { + if (on_rq) activate_task(rq, p, 0); - check_class_changed(rq, p, prev_class, oldprio, running); - } + check_class_changed(rq, p, prev_class, oldprio); __task_rq_unlock(rq); raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -8237,6 +8236,8 @@ EXPORT_SYMBOL(__might_sleep); #ifdef CONFIG_MAGIC_SYSRQ static void normalize_task(struct rq *rq, struct task_struct *p) { + const struct sched_class *prev_class = p->sched_class; + int old_prio = p->prio; int on_rq; on_rq = p->se.on_rq; @@ -8247,6 +8248,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p) activate_task(rq, p, 0); resched_task(rq->curr); } + + check_class_changed(rq, p, prev_class, old_prio); } void normalize_rt_tasks(void) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4cbc9121094c..55040f3938d8 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -4078,33 +4078,62 @@ static void task_fork_fair(struct task_struct *p) * Priority of the task has changed. Check to see if we preempt * the current task. 
*/ -static void prio_changed_fair(struct rq *rq, struct task_struct *p, - int oldprio, int running) +static void +prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) { + if (!p->se.on_rq) + return; + /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (running) { + if (rq->curr == p) { if (p->prio > oldprio) resched_task(rq->curr); } else check_preempt_curr(rq, p, 0); } +static void switched_from_fair(struct rq *rq, struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* + * Ensure the task's vruntime is normalized, so that when its + * switched back to the fair class the enqueue_entity(.flags=0) will + * do the right thing. + * + * If it was on_rq, then the dequeue_entity(.flags=0) will already + * have normalized the vruntime, if it was !on_rq, then only when + * the task is sleeping will it still have non-normalized vruntime. + */ + if (!se->on_rq && p->state != TASK_RUNNING) { + /* + * Fix up our vruntime so that the current sleep doesn't + * cause 'unlimited' sleep bonus. + */ + place_entity(cfs_rq, se, 0); + se->vruntime -= cfs_rq->min_vruntime; + } +} + /* * We switched to the sched_fair class. */ -static void switched_to_fair(struct rq *rq, struct task_struct *p, - int running) +static void switched_to_fair(struct rq *rq, struct task_struct *p) { + if (!p->se.on_rq) + return; + /* * We were most likely switched from sched_rt, so * kick off the schedule if running, otherwise just see * if we can still preempt the current task. */ - if (running) + if (rq->curr == p) resched_task(rq->curr); else check_preempt_curr(rq, p, 0); @@ -4190,6 +4219,7 @@ static const struct sched_class fair_sched_class = { .task_fork = task_fork_fair, .prio_changed = prio_changed_fair, + .switched_from = switched_from_fair, .switched_to = switched_to_fair, .get_rr_interval = get_rr_interval_fair, diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 41eb62a0808b..c82f26c1b7c3 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -52,14 +52,13 @@ static void set_curr_task_idle(struct rq *rq) { } -static void -switched_to_idle(struct rq *rq, struct task_struct *p, int running) +static void switched_to_idle(struct rq *rq, struct task_struct *p) { BUG(); } -static void prio_changed_idle(struct rq *rq, struct task_struct *p, - int oldprio, int running) +static void +prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) { BUG(); } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index c914ec747ca6..c381fdc18c64 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1595,8 +1595,7 @@ static void rq_offline_rt(struct rq *rq) * When switch from the rt queue, we bring ourselves to a position * that we might want to pull RT tasks from other runqueues. */ -static void switched_from_rt(struct rq *rq, struct task_struct *p, - int running) +static void switched_from_rt(struct rq *rq, struct task_struct *p) { /* * If there are other RT tasks then we will reschedule @@ -1605,7 +1604,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p, * we may need to handle the pulling of RT tasks * now. */ - if (!rq->rt.rt_nr_running) + if (p->se.on_rq && !rq->rt.rt_nr_running) pull_rt_task(rq); } @@ -1624,8 +1623,7 @@ static inline void init_sched_rt_class(void) * with RT tasks. In this case we try to push them off to * other runqueues. 
*/ -static void switched_to_rt(struct rq *rq, struct task_struct *p, - int running) +static void switched_to_rt(struct rq *rq, struct task_struct *p) { int check_resched = 1; @@ -1636,7 +1634,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p, * If that current running task is also an RT task * then see if we can move to another run queue. */ - if (!running) { + if (p->se.on_rq && rq->curr != p) { #ifdef CONFIG_SMP if (rq->rt.overloaded && push_rt_task(rq) && /* Don't resched if we changed runqueues */ @@ -1652,10 +1650,13 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p, * Priority of the task has changed. This may cause * us to initiate a push or pull. */ -static void prio_changed_rt(struct rq *rq, struct task_struct *p, - int oldprio, int running) +static void +prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) { - if (running) { + if (!p->se.on_rq) + return; + + if (rq->curr == p) { #ifdef CONFIG_SMP /* * If our priority decreases while running, we diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 2bf6b47058c1..84ec9bcf82d9 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c @@ -59,14 +59,13 @@ static void set_curr_task_stop(struct rq *rq) { } -static void switched_to_stop(struct rq *rq, struct task_struct *p, - int running) +static void switched_to_stop(struct rq *rq, struct task_struct *p) { BUG(); /* its impossible to change to this class */ } -static void prio_changed_stop(struct rq *rq, struct task_struct *p, - int oldprio, int running) +static void +prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio) { BUG(); /* how!?, what priority? */ } -- cgit v1.2.3 From ccaa8d657117bb1876d471bd91579d774106778d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 25 Jan 2011 23:17:32 +0100 Subject: rtmutex-tester: Remove BKL tests The BKL is going away, no need to test it any more. I left the definitions of the test case numbers in, so that the other tests do not get renumbered. 
Signed-off-by: Arnd Bergmann Cc: Arjan van de Ven Cc: Ingo Molnar Cc: Andrew Morton LKML-Reference: <1295993854-4971-19-git-send-email-arnd@arndb.de> Signed-off-by: Thomas Gleixner --- kernel/rtmutex-tester.c | 39 ++++----------------------------------- 1 file changed, 4 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 66cb89bc5ef1..d5b543506cbc 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -27,7 +26,6 @@ struct test_thread_data { int opcode; int opdata; int mutexes[MAX_RT_TEST_MUTEXES]; - int bkl; int event; struct sys_device sysdev; }; @@ -46,8 +44,8 @@ enum test_opcodes { RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ - RTTEST_LOCKBKL, /* 9 Lock BKL */ - RTTEST_UNLOCKBKL, /* 10 Unlock BKL */ + RTTEST_LOCKBKL, /* 9 Was: Lock BKL */ + RTTEST_UNLOCKBKL, /* 10 Was: Unlock BKL */ RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */ RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ RTTEST_RESET = 99, /* 99 Reset all pending operations */ @@ -74,13 +72,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup) td->mutexes[i] = 0; } } - - if (!lockwakeup && td->bkl == 4) { -#ifdef CONFIG_LOCK_KERNEL - unlock_kernel(); -#endif - td->bkl = 0; - } return 0; case RTTEST_RESETEVENT: @@ -131,25 +122,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup) td->mutexes[id] = 0; return 0; - case RTTEST_LOCKBKL: - if (td->bkl) - return 0; - td->bkl = 1; -#ifdef CONFIG_LOCK_KERNEL - lock_kernel(); -#endif - td->bkl = 4; - return 0; - - case RTTEST_UNLOCKBKL: - if (td->bkl != 4) - break; -#ifdef CONFIG_LOCK_KERNEL - unlock_kernel(); -#endif - td->bkl = 0; - return 0; - default: break; } @@ -196,7 +168,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex) td->event = atomic_add_return(1, &rttest_event); break; - case RTTEST_LOCKBKL: default: break; } @@ -229,8 +200,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex) td->event = atomic_add_return(1, &rttest_event); return; - case RTTEST_LOCKBKL: - return; default: return; } @@ -380,11 +349,11 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute spin_lock(&rttest_lock); curr += sprintf(curr, - "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:", + "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:", td->opcode, td->event, tsk->state, (MAX_RT_PRIO - 1) - tsk->prio, (MAX_RT_PRIO - 1) - tsk->normal_prio, - tsk->pi_blocked_on, td->bkl); + tsk->pi_blocked_on); for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) curr += sprintf(curr, "%d", td->mutexes[i]); -- cgit v1.2.3 From 10389a15e25fd4784d42de7e0e3fc8c242f2011d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 23 Jan 2011 15:25:56 +0100 Subject: cred: Replace deprecated spinlock initialization SPIN_LOCK_UNLOCK is deprecated. Use the lockdep capable variant instead. 
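The actual fix is the one-liner in kernel/cred.c below. As a general pattern (a sketch, not part of the patch), statically allocated locks are initialized like this so lockdep gets a named lock class:

#include <linux/spinlock.h>

/* Standalone static lock: DEFINE_SPINLOCK() declares and names it. */
static DEFINE_SPINLOCK(example_lock);

/* Lock embedded in a statically initialized object: __SPIN_LOCK_UNLOCKED()
 * takes the lock's name, which is what the cred.c change below does. */
struct example {
	spinlock_t lock;
	int value;
};

static struct example example_obj = {
	.lock	= __SPIN_LOCK_UNLOCKED(example_obj.lock),
	.value	= 0,
};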
Signed-off-by: Thomas Gleixner
---
 kernel/cred.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cred.c b/kernel/cred.c
index 6a1aa004e376..b5496e81b0f7 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -35,7 +35,7 @@ static struct kmem_cache *cred_jar;
 static struct thread_group_cred init_tgcred = {
 	.usage	= ATOMIC_INIT(2),
 	.tgid	= 0,
-	.lock	= SPIN_LOCK_UNLOCKED,
+	.lock	= __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock),
 };
 #endif
--
cgit v1.2.3


From 6ea72f12069306b235151c5b05ac0cca7e1dedfa Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 26 Jan 2011 13:36:03 +0100
Subject: sched: Avoid expensive initial update_cfs_load(), on UP too

Fix the build on UP.

Signed-off-by: Peter Zijlstra
Cc: Paul Turner
LKML-Reference: <20110122044852.102126037@google.com>
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 78fa75394011..477e1bcc63f9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7922,7 +7922,9 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	cfs_rq->rq = rq;
 	/* allow initial update_cfs_load() to truncate */
+#ifdef CONFIG_SMP
 	cfs_rq->load_stamp = 1;
+#endif
 #endif
 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 }
--
cgit v1.2.3


From 8161239a8bcce9ad6b537c04a1fa3b5c68bae693 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan
Date: Fri, 14 Jan 2011 17:09:41 +0800
Subject: rtmutex: Simplify PI algorithm and make highest prio task get lock

In the current rtmutex, the pending owner may be boosted by the tasks
in the rtmutex's waitlist when the pending owner is deboosted or a task
in the waitlist is boosted. This boosting is unrelated, because the
pending owner does not really take the rtmutex. It is not reasonable.

Example:

time1: A (high prio) owns the rtmutex; B (mid prio) and C (low prio)
       are in the waitlist.

time2: A releases the lock and B becomes the pending owner. A (or some
       other high prio task) continues to run. B's prio is lower than
       A's, so B is just queued on the runqueue.

time3: A or the other high prio task sleeps, but some time has passed.
       B's and C's prios have changed in the period (time2 ~ time3) due
       to boosting or deboosting, and now C has a higher priority than B.

Is it reasonable that C has to boost B and help B to get the rtmutex?
No! It is unrelated/unneeded boosting before B really owns the rtmutex.
We should give C a chance to beat B and win the rtmutex.

This is the motivation of this patch. This patch *ensures* that only the
top waiter or a higher priority task can take the lock.

How?
1) We don't dequeue the top waiter on unlock; if the top waiter has
   changed, the old top waiter will fail and go to sleep again.
2) When acquiring the lock, a task gets the lock when the lock is not
   taken and: there is no waiter OR it has higher priority than the
   waiters OR it is the top waiter.
3) Any time the top waiter changes, the new top waiter is woken up.

The algorithm is much simpler than before: no pending owner, no
boosting for the pending owner.

Other advantages of this patch:
1) The states of a rtmutex are reduced by half, making the code easier
   to read.
2) The code becomes shorter.
3) The top waiter is not dequeued until it really takes the lock:
   waiters retain FIFO order when the lock is stolen.

Neither advantage nor disadvantage:
1) Even though we may wake up multiple waiters (any time the top waiter
   changes), we hardly cause a "thundering herd"; the number of woken-up
   tasks is likely one or very few.
2) Two APIs are changed.
rt_mutex_owner() will not return pending owner, it will return NULL when the top waiter is going to take the lock. rt_mutex_next_owner() always return the top waiter. will not return NULL if we have waiters because the top waiter is not dequeued. I have fixed the code that use these APIs. need updated after this patch is accepted 1) Document/* 2) the testcase scripts/rt-tester/t4-l2-pi-deboost.tst Signed-off-by: Lai Jiangshan LKML-Reference: <4D3012D5.4060709@cn.fujitsu.com> Reviewed-by: Steven Rostedt Signed-off-by: Steven Rostedt --- kernel/futex.c | 22 ++-- kernel/rtmutex-debug.c | 1 - kernel/rtmutex.c | 318 +++++++++++++++++------------------------------- kernel/rtmutex_common.h | 16 +-- 4 files changed, 127 insertions(+), 230 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index b766d28accd6..64c38115c7b6 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1556,10 +1556,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, /* * We are here either because we stole the rtmutex from the - * pending owner or we are the pending owner which failed to - * get the rtmutex. We have to replace the pending owner TID - * in the user space variable. This must be atomic as we have - * to preserve the owner died bit here. + * previous highest priority waiter or we are the highest priority + * waiter but failed to get the rtmutex the first time. + * We have to replace the newowner TID in the user space variable. + * This must be atomic as we have to preserve the owner died bit here. * * Note: We write the user space value _before_ changing the pi_state * because we can fault here. Imagine swapped out pages or a fork @@ -1608,8 +1608,8 @@ retry: /* * To handle the page fault we need to drop the hash bucket - * lock here. That gives the other task (either the pending - * owner itself or the task which stole the rtmutex) the + * lock here. That gives the other task (either the highest priority + * waiter itself or the task which stole the rtmutex) the * chance to try the fixup of the pi_state. So once we are * back from handling the fault we need to check the pi_state * after reacquiring the hash bucket lock and before trying to @@ -1685,18 +1685,20 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) /* * pi_state is incorrect, some other task did a lock steal and * we returned due to timeout or signal without taking the - * rt_mutex. Too late. We can access the rt_mutex_owner without - * locking, as the other task is now blocked on the hash bucket - * lock. Fix the state up. + * rt_mutex. Too late. */ + raw_spin_lock(&q->pi_state->pi_mutex.wait_lock); owner = rt_mutex_owner(&q->pi_state->pi_mutex); + if (!owner) + owner = rt_mutex_next_owner(&q->pi_state->pi_mutex); + raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock); ret = fixup_pi_state_owner(uaddr, q, owner); goto out; } /* * Paranoia check. If we did not take the lock, then we should not be - * the owner, nor the pending owner, of the rt_mutex. + * the owner of the rt_mutex. 
*/ if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index ddabb54bb5c8..3c7cbc2c33be 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -215,7 +215,6 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) put_pid(waiter->deadlock_task_pid); TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); - TRACE_WARN_ON(waiter->task); memset(waiter, 0x22, sizeof(*waiter)); } diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index a9604815786a..ab449117aaf2 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -20,41 +20,34 @@ /* * lock->owner state tracking: * - * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1 - * are used to keep track of the "owner is pending" and "lock has - * waiters" state. + * lock->owner holds the task_struct pointer of the owner. Bit 0 + * is used to keep track of the "lock has waiters" state. * - * owner bit1 bit0 - * NULL 0 0 lock is free (fast acquire possible) - * NULL 0 1 invalid state - * NULL 1 0 Transitional State* - * NULL 1 1 invalid state - * taskpointer 0 0 lock is held (fast release possible) - * taskpointer 0 1 task is pending owner - * taskpointer 1 0 lock is held and has waiters - * taskpointer 1 1 task is pending owner and lock has more waiters - * - * Pending ownership is assigned to the top (highest priority) - * waiter of the lock, when the lock is released. The thread is woken - * up and can now take the lock. Until the lock is taken (bit 0 - * cleared) a competing higher priority thread can steal the lock - * which puts the woken up thread back on the waiters list. + * owner bit0 + * NULL 0 lock is free (fast acquire possible) + * NULL 1 lock is free and has waiters and the top waiter + * is going to take the lock* + * taskpointer 0 lock is held (fast release possible) + * taskpointer 1 lock is held and has waiters** * * The fast atomic compare exchange based acquire and release is only - * possible when bit 0 and 1 of lock->owner are 0. + * possible when bit 0 of lock->owner is 0. + * + * (*) It also can be a transitional state when grabbing the lock + * with ->wait_lock is held. To prevent any fast path cmpxchg to the lock, + * we need to set the bit0 before looking at the lock, and the owner may be + * NULL in this small time, hence this can be a transitional state. * - * (*) There's a small time where the owner can be NULL and the - * "lock has waiters" bit is set. This can happen when grabbing the lock. - * To prevent a cmpxchg of the owner releasing the lock, we need to set this - * bit before looking at the lock, hence the reason this is a transitional - * state. + * (**) There is a small time when bit 0 is set but there are no + * waiters. This can happen when grabbing the lock in the slow path. + * To prevent a cmpxchg of the owner releasing the lock, we need to + * set this bit before looking at the lock. */ static void -rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, - unsigned long mask) +rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner) { - unsigned long val = (unsigned long)owner | mask; + unsigned long val = (unsigned long)owner; if (rt_mutex_has_waiters(lock)) val |= RT_MUTEX_HAS_WAITERS; @@ -203,15 +196,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * reached or the state of the chain has changed while we * dropped the locks. 
*/ - if (!waiter || !waiter->task) + if (!waiter) goto out_unlock_pi; /* * Check the orig_waiter state. After we dropped the locks, - * the previous owner of the lock might have released the lock - * and made us the pending owner: + * the previous owner of the lock might have released the lock. */ - if (orig_waiter && !orig_waiter->task) + if (orig_waiter && !rt_mutex_owner(orig_lock)) goto out_unlock_pi; /* @@ -254,6 +246,17 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* Release the task */ raw_spin_unlock_irqrestore(&task->pi_lock, flags); + if (!rt_mutex_owner(lock)) { + /* + * If the requeue above changed the top waiter, then we need + * to wake the new top waiter up to try to get the lock. + */ + + if (top_waiter != rt_mutex_top_waiter(lock)) + wake_up_process(rt_mutex_top_waiter(lock)->task); + raw_spin_unlock(&lock->wait_lock); + goto out_put_task; + } put_task_struct(task); /* Grab the next task */ @@ -295,79 +298,17 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, return ret; } -/* - * Optimization: check if we can steal the lock from the - * assigned pending owner [which might not have taken the - * lock yet]: - */ -static inline int try_to_steal_lock(struct rt_mutex *lock, - struct task_struct *task) -{ - struct task_struct *pendowner = rt_mutex_owner(lock); - struct rt_mutex_waiter *next; - unsigned long flags; - - if (!rt_mutex_owner_pending(lock)) - return 0; - - if (pendowner == task) - return 1; - - raw_spin_lock_irqsave(&pendowner->pi_lock, flags); - if (task->prio >= pendowner->prio) { - raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); - return 0; - } - - /* - * Check if a waiter is enqueued on the pending owners - * pi_waiters list. Remove it and readjust pending owners - * priority. - */ - if (likely(!rt_mutex_has_waiters(lock))) { - raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); - return 1; - } - - /* No chain handling, pending owner is not blocked on anything: */ - next = rt_mutex_top_waiter(lock); - plist_del(&next->pi_list_entry, &pendowner->pi_waiters); - __rt_mutex_adjust_prio(pendowner); - raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); - - /* - * We are going to steal the lock and a waiter was - * enqueued on the pending owners pi_waiters queue. So - * we have to enqueue this waiter into - * task->pi_waiters list. This covers the case, - * where task is boosted because it holds another - * lock and gets unboosted because the booster is - * interrupted, so we would delay a waiter with higher - * priority as task->normal_prio. - * - * Note: in the rare case of a SCHED_OTHER task changing - * its priority and thus stealing the lock, next->task - * might be task: - */ - if (likely(next->task != task)) { - raw_spin_lock_irqsave(&task->pi_lock, flags); - plist_add(&next->pi_list_entry, &task->pi_waiters); - __rt_mutex_adjust_prio(task); - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - } - return 1; -} - /* * Try to take an rt-mutex * - * This fails - * - when the lock has a real owner - * - when a different pending owner exists and has higher priority than current - * * Must be called with lock->wait_lock held. + * + * @lock: the lock to be acquired. + * @task: the task which wants to acquire the lock + * @waiter: the waiter that is queued to the lock's wait list. 
(could be NULL) */ -static int try_to_take_rt_mutex(struct rt_mutex *lock) +static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, + struct rt_mutex_waiter *waiter) { /* * We have to be careful here if the atomic speedups are @@ -390,15 +331,52 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) */ mark_rt_mutex_waiters(lock); - if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current)) + if (rt_mutex_owner(lock)) return 0; + /* + * It will get the lock because of one of these conditions: + * 1) there is no waiter + * 2) higher priority than waiters + * 3) it is top waiter + */ + if (rt_mutex_has_waiters(lock)) { + if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) { + if (!waiter || waiter != rt_mutex_top_waiter(lock)) + return 0; + } + } + + if (waiter || rt_mutex_has_waiters(lock)) { + unsigned long flags; + struct rt_mutex_waiter *top; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + + /* remove the queued waiter. */ + if (waiter) { + plist_del(&waiter->list_entry, &lock->wait_list); + task->pi_blocked_on = NULL; + } + + /* + * We have to enqueue the top waiter(if it exists) into + * task->pi_waiters list. + */ + if (rt_mutex_has_waiters(lock)) { + top = rt_mutex_top_waiter(lock); + top->pi_list_entry.prio = top->list_entry.prio; + plist_add(&top->pi_list_entry, &task->pi_waiters); + } + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + } + /* We got the lock. */ debug_rt_mutex_lock(lock); - rt_mutex_set_owner(lock, current, 0); + rt_mutex_set_owner(lock, task); - rt_mutex_deadlock_account_lock(lock, current); + rt_mutex_deadlock_account_lock(lock, task); return 1; } @@ -436,6 +414,9 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, raw_spin_unlock_irqrestore(&task->pi_lock, flags); + if (!owner) + return 0; + if (waiter == rt_mutex_top_waiter(lock)) { raw_spin_lock_irqsave(&owner->pi_lock, flags); plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); @@ -472,21 +453,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, /* * Wake up the next waiter on the lock. * - * Remove the top waiter from the current tasks waiter list and from - * the lock waiter list. Set it as pending owner. Then wake it up. + * Remove the top waiter from the current tasks waiter list and wake it up. * * Called with lock->wait_lock held. */ static void wakeup_next_waiter(struct rt_mutex *lock) { struct rt_mutex_waiter *waiter; - struct task_struct *pendowner; unsigned long flags; raw_spin_lock_irqsave(¤t->pi_lock, flags); waiter = rt_mutex_top_waiter(lock); - plist_del(&waiter->list_entry, &lock->wait_list); /* * Remove it from current->pi_waiters. We do not adjust a @@ -495,43 +473,19 @@ static void wakeup_next_waiter(struct rt_mutex *lock) * lock->wait_lock. */ plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); - pendowner = waiter->task; - waiter->task = NULL; - rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); + rt_mutex_set_owner(lock, NULL); raw_spin_unlock_irqrestore(¤t->pi_lock, flags); - /* - * Clear the pi_blocked_on variable and enqueue a possible - * waiter into the pi_waiters list of the pending owner. This - * prevents that in case the pending owner gets unboosted a - * waiter with higher priority than pending-owner->normal_prio - * is blocked on the unboosted (pending) owner. 
- */ - raw_spin_lock_irqsave(&pendowner->pi_lock, flags); - - WARN_ON(!pendowner->pi_blocked_on); - WARN_ON(pendowner->pi_blocked_on != waiter); - WARN_ON(pendowner->pi_blocked_on->lock != lock); - - pendowner->pi_blocked_on = NULL; - - if (rt_mutex_has_waiters(lock)) { - struct rt_mutex_waiter *next; - - next = rt_mutex_top_waiter(lock); - plist_add(&next->pi_list_entry, &pendowner->pi_waiters); - } - raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags); - - wake_up_process(pendowner); + wake_up_process(waiter->task); } /* - * Remove a waiter from a lock + * Remove a waiter from a lock and give up * - * Must be called with lock->wait_lock held + * Must be called with lock->wait_lock held and + * have just failed to try_to_take_rt_mutex(). */ static void remove_waiter(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) @@ -543,11 +497,13 @@ static void remove_waiter(struct rt_mutex *lock, raw_spin_lock_irqsave(¤t->pi_lock, flags); plist_del(&waiter->list_entry, &lock->wait_list); - waiter->task = NULL; current->pi_blocked_on = NULL; raw_spin_unlock_irqrestore(¤t->pi_lock, flags); - if (first && owner != current) { + if (!owner) + return; + + if (first) { raw_spin_lock_irqsave(&owner->pi_lock, flags); @@ -614,21 +570,19 @@ void rt_mutex_adjust_pi(struct task_struct *task) * or TASK_UNINTERRUPTIBLE) * @timeout: the pre-initialized and started timer, or NULL for none * @waiter: the pre-initialized rt_mutex_waiter - * @detect_deadlock: passed to task_blocks_on_rt_mutex * * lock->wait_lock must be held by the caller. */ static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - struct rt_mutex_waiter *waiter, - int detect_deadlock) + struct rt_mutex_waiter *waiter) { int ret = 0; for (;;) { /* Try to acquire the lock: */ - if (try_to_take_rt_mutex(lock)) + if (try_to_take_rt_mutex(lock, current, waiter)) break; /* @@ -645,39 +599,11 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, break; } - /* - * waiter->task is NULL the first time we come here and - * when we have been woken up by the previous owner - * but the lock got stolen by a higher prio task. - */ - if (!waiter->task) { - ret = task_blocks_on_rt_mutex(lock, waiter, current, - detect_deadlock); - /* - * If we got woken up by the owner then start loop - * all over without going into schedule to try - * to get the lock now: - */ - if (unlikely(!waiter->task)) { - /* - * Reset the return value. We might - * have returned with -EDEADLK and the - * owner released the lock while we - * were walking the pi chain. 
- */ - ret = 0; - continue; - } - if (unlikely(ret)) - break; - } - raw_spin_unlock(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); - if (waiter->task) - schedule_rt_mutex(lock); + schedule_rt_mutex(lock); raw_spin_lock(&lock->wait_lock); set_current_state(state); @@ -698,12 +624,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, int ret = 0; debug_rt_mutex_init_waiter(&waiter); - waiter.task = NULL; raw_spin_lock(&lock->wait_lock); /* Try to acquire the lock again: */ - if (try_to_take_rt_mutex(lock)) { + if (try_to_take_rt_mutex(lock, current, NULL)) { raw_spin_unlock(&lock->wait_lock); return 0; } @@ -717,12 +642,14 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, timeout->task = NULL; } - ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, - detect_deadlock); + ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock); + + if (likely(!ret)) + ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); set_current_state(TASK_RUNNING); - if (unlikely(waiter.task)) + if (unlikely(ret)) remove_waiter(lock, &waiter); /* @@ -737,14 +664,6 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, if (unlikely(timeout)) hrtimer_cancel(&timeout->timer); - /* - * Readjust priority, when we did not get the lock. We might - * have been the pending owner and boosted. Since we did not - * take the lock, the PI boost has to go. - */ - if (unlikely(ret)) - rt_mutex_adjust_prio(current); - debug_rt_mutex_free_waiter(&waiter); return ret; @@ -762,7 +681,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock) if (likely(rt_mutex_owner(lock) != current)) { - ret = try_to_take_rt_mutex(lock); + ret = try_to_take_rt_mutex(lock, current, NULL); /* * try_to_take_rt_mutex() sets the lock waiters * bit unconditionally. Clean this up. @@ -992,7 +911,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, { __rt_mutex_init(lock, NULL); debug_rt_mutex_proxy_lock(lock, proxy_owner); - rt_mutex_set_owner(lock, proxy_owner, 0); + rt_mutex_set_owner(lock, proxy_owner); rt_mutex_deadlock_account_lock(lock, proxy_owner); } @@ -1008,7 +927,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner) { debug_rt_mutex_proxy_unlock(lock); - rt_mutex_set_owner(lock, NULL, 0); + rt_mutex_set_owner(lock, NULL); rt_mutex_deadlock_account_unlock(proxy_owner); } @@ -1034,20 +953,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, raw_spin_lock(&lock->wait_lock); - mark_rt_mutex_waiters(lock); - - if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { - /* We got the lock for task. */ - debug_rt_mutex_lock(lock); - rt_mutex_set_owner(lock, task, 0); + if (try_to_take_rt_mutex(lock, task, NULL)) { raw_spin_unlock(&lock->wait_lock); - rt_mutex_deadlock_account_lock(lock, task); return 1; } ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); - if (ret && !waiter->task) { + if (ret && !rt_mutex_owner(lock)) { /* * Reset the return value. 
We might have * returned with -EDEADLK and the owner @@ -1056,6 +969,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, */ ret = 0; } + + if (unlikely(ret)) + remove_waiter(lock, waiter); + raw_spin_unlock(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); @@ -1110,12 +1027,11 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, set_current_state(TASK_INTERRUPTIBLE); - ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, - detect_deadlock); + ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); set_current_state(TASK_RUNNING); - if (unlikely(waiter->task)) + if (unlikely(ret)) remove_waiter(lock, waiter); /* @@ -1126,13 +1042,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, raw_spin_unlock(&lock->wait_lock); - /* - * Readjust priority, when we did not get the lock. We might have been - * the pending owner and boosted. Since we did not take the lock, the - * PI boost has to go. - */ - if (unlikely(ret)) - rt_mutex_adjust_prio(current); - return ret; } diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index 97a2f81866af..53a66c85261b 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -91,9 +91,8 @@ task_top_pi_waiter(struct task_struct *p) /* * lock->owner state tracking: */ -#define RT_MUTEX_OWNER_PENDING 1UL -#define RT_MUTEX_HAS_WAITERS 2UL -#define RT_MUTEX_OWNER_MASKALL 3UL +#define RT_MUTEX_HAS_WAITERS 1UL +#define RT_MUTEX_OWNER_MASKALL 1UL static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) { @@ -101,17 +100,6 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); } -static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock) -{ - return (struct task_struct *) - ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); -} - -static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock) -{ - return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING; -} - /* * PI-futex support (proxy locking functions, etc.): */ -- cgit v1.2.3 From 1fb0ef31f428f345a7c3666f8e7444a563edd537 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 31 Jan 2011 08:57:41 +0100 Subject: genirq: Fix affinity notifier fallout The new code of commit cd7eab44e(genirq: Add IRQ affinity notifiers) references irq_desc.affinity which fails to compile with CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED=y. Use irq_desc.irq_data.affinity instead. Signed-off-by: Thomas Gleixner Cc: Ben Hutchings --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0587c5ceaed8..538fce2db51c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -179,7 +179,7 @@ static void irq_affinity_notify(struct work_struct *work) cpumask_copy(cpumask, desc->pending_mask); else #endif - cpumask_copy(cpumask, desc->affinity); + cpumask_copy(cpumask, desc->irq_data.affinity); raw_spin_unlock_irqrestore(&desc->lock, flags); notify->notify(notify, cpumask); -- cgit v1.2.3 From 871cf1e5f2a17702f58539a3af8b18fc8666ad4c Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 27 Jan 2011 15:58:55 +0100 Subject: time: Move do_timer() to kernel/time/timekeeping.c do_timer() is primary timekeeping related. calc_global_load() is called from do_timer() as well, but that's more for historical reasons. 
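For orientation, an illustrative sketch (not taken from the patch) of the call site this series is reorganising: architecture tick handlers advance core timekeeping by calling do_timer() with xtime_lock write-held, which is exactly the pattern the xtime_update() helper added later in this series wraps. The handler name is hypothetical; the declarations match the state of the tree at this point.

#include <linux/seqlock.h>

extern seqlock_t xtime_lock;			/* still a global at this point */
extern void do_timer(unsigned long ticks);

/* hypothetical architecture tick handler */
static void example_arch_timer_tick(void)
{
	write_seqlock(&xtime_lock);
	do_timer(1);				/* one tick has elapsed */
	write_sequnlock(&xtime_lock);
}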
[ tglx: Fixed up the calc_global_load() reject andmassaged changelog ] Signed-off-by: Torben Hohn Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: yong.zhang0@gmail.com Cc: hch@infradead.org LKML-Reference: <20110127145855.23248.56933.stgit@localhost> Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 14 +++++++++++++- kernel/timer.c | 13 ------------- 2 files changed, 13 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d27c7562902c..c1a178ca0f50 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -779,7 +779,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) * * Called from the timer interrupt, must hold a write on xtime_lock. */ -void update_wall_time(void) +static void update_wall_time(void) { struct clocksource *clock; cycle_t offset; @@ -946,3 +946,15 @@ struct timespec get_monotonic_coarse(void) now.tv_nsec + mono.tv_nsec); return now; } + +/* + * The 64-bit jiffies value is not atomic - you MUST NOT read it + * without sampling the sequence number in xtime_lock. + * jiffies is defined in the linker script... + */ +void do_timer(unsigned long ticks) +{ + jiffies_64 += ticks; + update_wall_time(); + calc_global_load(ticks); +} diff --git a/kernel/timer.c b/kernel/timer.c index 43ca9936f2d0..87f656cc2a55 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1293,19 +1293,6 @@ void run_local_timers(void) raise_softirq(TIMER_SOFTIRQ); } -/* - * The 64-bit jiffies value is not atomic - you MUST NOT read it - * without sampling the sequence number in xtime_lock. - * jiffies is defined in the linker script... - */ - -void do_timer(unsigned long ticks) -{ - jiffies_64 += ticks; - update_wall_time(); - calc_global_load(ticks); -} - #ifdef __ARCH_WANT_SYS_ALARM /* -- cgit v1.2.3 From fbad1ea94159a71bc0f68b00e57ae803606af9fb Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 27 Jan 2011 15:59:00 +0100 Subject: time: Move get_jiffies_64 to kernel/time/jiffies.c Move the jiffies access functions to the jiffies clocksource code. [ tglx: Add missing include ] Signed-off-by: Torben Hohn Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: yong.zhang0@gmail.com Cc: hch@infradead.org LKML-Reference: <20110127145900.23248.73352.stgit@localhost> Signed-off-by: Thomas Gleixner --- kernel/time.c | 17 ----------------- kernel/time/jiffies.c | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index 32174359576f..a31b51220ac6 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -674,23 +674,6 @@ unsigned long nsecs_to_jiffies(u64 n) #endif } -#if (BITS_PER_LONG < 64) -u64 get_jiffies_64(void) -{ - unsigned long seq; - u64 ret; - - do { - seq = read_seqbegin(&xtime_lock); - ret = jiffies_64; - } while (read_seqretry(&xtime_lock, seq)); - return ret; -} -EXPORT_SYMBOL(get_jiffies_64); -#endif - -EXPORT_SYMBOL(jiffies); - /* * Add two timespec values and do a safety check for overflow. 
* It's assumed that both values are valid (>= 0) diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 5404a8456909..2fbc20744797 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -22,6 +22,7 @@ ************************************************************************/ #include #include +#include #include /* The Jiffies based clocksource is the lowest common @@ -64,6 +65,23 @@ struct clocksource clocksource_jiffies = { .shift = JIFFIES_SHIFT, }; +#if (BITS_PER_LONG < 64) +u64 get_jiffies_64(void) +{ + unsigned long seq; + u64 ret; + + do { + seq = read_seqbegin(&xtime_lock); + ret = jiffies_64; + } while (read_seqretry(&xtime_lock, seq)); + return ret; +} +EXPORT_SYMBOL(get_jiffies_64); +#endif + +EXPORT_SYMBOL(jiffies); + static int __init init_jiffies_clocksource(void) { return clocksource_register(&clocksource_jiffies); -- cgit v1.2.3 From 48cf76f7104f655bbd48a75c7759dce82c3e1ab6 Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 27 Jan 2011 15:59:05 +0100 Subject: time: Provide get_xtime_and_monotonic_offset() The hrtimer code accesses timekeeping variables under xtime_lock. Provide a sensible accessor function and use it. [ tglx: Removed the conditionals, unused variable, fixed codingstyle and massaged changelog ] Signed-off-by: Torben Hohn Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: yong.zhang0@gmail.com Cc: hch@infradead.org LKML-Reference: <20110127145905.23248.30458.stgit@localhost> Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 13 ++----------- kernel/time/timekeeping.c | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 0c8d7c048615..57c4d33c9a9d 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -85,13 +85,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) { ktime_t xtim, tomono; struct timespec xts, tom; - unsigned long seq; - do { - seq = read_seqbegin(&xtime_lock); - xts = __current_kernel_time(); - tom = __get_wall_to_monotonic(); - } while (read_seqretry(&xtime_lock, seq)); + get_xtime_and_monotonic_offset(&xts, &tom); xtim = timespec_to_ktime(xts); tomono = timespec_to_ktime(tom); @@ -612,15 +607,11 @@ static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base; struct timespec realtime_offset, wtm; - unsigned long seq; if (!hrtimer_hres_active()) return; - do { - seq = read_seqbegin(&xtime_lock); - wtm = __get_wall_to_monotonic(); - } while (read_seqretry(&xtime_lock, seq)); + get_xtime_and_monotonic_offset(&realtime_offset, &wtm); set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); base = &__get_cpu_var(hrtimer_bases); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c1a178ca0f50..c50aaf6cd01d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -958,3 +958,19 @@ void do_timer(unsigned long ticks) update_wall_time(); calc_global_load(ticks); } + +/** + * get_xtime_and_monotonic_offset() - get xtime and wall_to_monotonic + * @xtim: pointer to timespec to be set with xtime + * @wtom: pointer to timespec to be set with wall_to_monotonic + */ +void get_xtime_and_monotonic_offset(struct timespec *xtim, struct timespec *wtom) +{ + unsigned long seq; + + do { + seq = read_seqbegin(&xtime_lock); + *xtim = xtime; + *wtom = wall_to_monotonic; + } while (read_seqretry(&xtime_lock, seq)); +} -- cgit v1.2.3 From 79ecaf0d15344d78904becf0f25de3fc9b49d430 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 31 Jan 2011 11:07:54 +0100 Subject: 
time: Remove unused __get_wall_to_monotonic() No users left. Remove it. Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c50aaf6cd01d..8da35d1b9e16 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -910,11 +910,6 @@ struct timespec __current_kernel_time(void) return xtime; } -struct timespec __get_wall_to_monotonic(void) -{ - return wall_to_monotonic; -} - struct timespec current_kernel_time(void) { struct timespec now; -- cgit v1.2.3 From f0af911a9dec9de702645182c8d269449e24d24b Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 27 Jan 2011 15:59:10 +0100 Subject: time: Provide xtime_update() xtime_update() takes xtime_lock write locked and calls do_timer(). Provided to replace the do_timer() calls in the architecture code. Signed-off-by: Torben Hohn Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: yong.zhang0@gmail.com Cc: hch@infradead.org LKML-Reference: <20110127145910.23248.21379.stgit@localhost> Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 8da35d1b9e16..02c13a313d15 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -969,3 +969,16 @@ void get_xtime_and_monotonic_offset(struct timespec *xtim, struct timespec *wtom *wtom = wall_to_monotonic; } while (read_seqretry(&xtime_lock, seq)); } + +/** + * xtime_update() - advances the timekeeping infrastructure + * @ticks: number of ticks, that have elapsed since the last call. + * + * Must be called with interrupts disabled. + */ +void xtime_update(unsigned long ticks) +{ + write_seqlock(&xtime_lock); + do_timer(ticks); + write_sequnlock(&xtime_lock); +} -- cgit v1.2.3 From e2830b5c1b2b2217894370a3b95af87d4a958401 Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 27 Jan 2011 16:00:32 +0100 Subject: time: Make do_timer() and xtime_lock local to kernel/time/ All callers of do_timer() are converted to xtime_update(). The only users of xtime_lock are in kernel/time/. Make both local to kernel/time/ and remove them from the global header files. [ tglx: Reuse tick-internal.h instead of creating another local header file. Massaged changelog ] Signed-off-by: Torben Hohn Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: yong.zhang0@gmail.com Cc: hch@infradead.org Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 1 - kernel/time/jiffies.c | 2 ++ kernel/time/ntp.c | 2 ++ kernel/time/tick-broadcast.c | 1 - kernel/time/tick-common.c | 1 - kernel/time/tick-internal.h | 5 +++++ kernel/time/tick-oneshot.c | 1 - kernel/time/tick-sched.c | 1 - 8 files changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index d7395fdfb9f3..0d74b9ba90c8 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -18,7 +18,6 @@ #include #include #include -#include #include "tick-internal.h" diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 2fbc20744797..b2fa506667c0 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -25,6 +25,8 @@ #include #include +#include "tick-internal.h" + /* The Jiffies based clocksource is the lowest common * denominator clock source which should function on * all systems. 
It has the same coarse resolution as diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5c00242fa921..ed8cfdf16983 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -16,6 +16,8 @@ #include #include +#include "tick-internal.h" + /* * NTP timekeeping variables: */ diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 48b2761b5668..92ef9a54f0a4 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -18,7 +18,6 @@ #include #include #include -#include #include "tick-internal.h" diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 051bc80a0c43..0e98fac3d479 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -18,7 +18,6 @@ #include #include #include -#include #include diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 290eefbc1f60..28c578568c9d 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -1,6 +1,8 @@ /* * tick internal variable and functions used by low/high res code */ +#include +#include #define TICK_DO_TIMER_NONE -1 #define TICK_DO_TIMER_BOOT -2 @@ -132,3 +134,6 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) { return !(dev->features & CLOCK_EVT_FEAT_DUMMY); } + +extern void do_timer(unsigned long ticks); +extern seqlock_t xtime_lock; diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 5cbc101f908b..2d04411a5f05 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -18,7 +18,6 @@ #include #include #include -#include #include "tick-internal.h" diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index c55ea2433471..d5097c44b407 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3 From 7cf37e87dd2cfa17a64f28ea7f31eed4525f79e4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 09:34:58 +0100 Subject: time: Fix legacy arch fallout The xtime/dotimer cleanup broke architectures which do not implement clockevents. Time to send out another __do_IRQ threat. Signed-off-by: Thomas Gleixner Reported-by: Ingo Molnar Cc: Torben Hohn Cc: Peter Zijlstra Cc: johnstul@us.ibm.com Cc: yong.zhang0@gmail.com Cc: hch@infradead.org LKML-Reference: <20110127145905.23248.30458.stgit@localhost> Signed-off-by: Ingo Molnar --- kernel/time/tick-internal.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 28c578568c9d..f77b93df0006 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -4,6 +4,8 @@ #include #include +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD + #define TICK_DO_TIMER_NONE -1 #define TICK_DO_TIMER_BOOT -2 @@ -135,5 +137,7 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) return !(dev->features & CLOCK_EVT_FEAT_DUMMY); } +#endif + extern void do_timer(unsigned long ticks); extern seqlock_t xtime_lock; -- cgit v1.2.3 From 4916ca401e3051dad326ddd69765bd0e3f32fb9b Mon Sep 17 00:00:00 2001 From: Lucian Adrian Grijincu Date: Tue, 1 Feb 2011 18:44:56 +0200 Subject: security: remove unused security_sysctl hook The only user for this hook was selinux. sysctl routes every call through /proc/sys/. Selinux and other security modules use the file system checks for sysctl too, so no need for this hook any more. 
Signed-off-by: Lucian Adrian Grijincu Signed-off-by: Eric Paris --- kernel/sysctl.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ae5cbb1e3ced..e24254c27eaf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1691,13 +1691,8 @@ static int test_perm(int mode, int op) int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) { - int error; int mode; - error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC)); - if (error) - return error; - if (root->permissions) mode = root->permissions(root, current->nsproxy, table); else -- cgit v1.2.3 From 1e6d767924c74929c0cfe839ae8f37bcee9e544e Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 1 Feb 2011 13:50:58 +0000 Subject: time: Correct the *settime* parameters Both settimeofday() and clock_settime() promise with a 'const' attribute not to alter the arguments passed in. This patch adds the missing 'const' attribute into the various kernel functions implementing these calls. Signed-off-by: Richard Cochran Acked-by: John Stultz LKML-Reference: <20110201134417.545698637@linutronix.de> Signed-off-by: Thomas Gleixner --- kernel/posix-timers.c | 4 ++-- kernel/time.c | 2 +- kernel/time/timekeeping.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 93bd2eb2bc53..21b7ca205f38 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -192,7 +192,7 @@ static int common_clock_get(clockid_t which_clock, struct timespec *tp) } static inline int common_clock_set(const clockid_t which_clock, - struct timespec *tp) + const struct timespec *tp) { return do_sys_settimeofday(tp, NULL); } @@ -928,7 +928,7 @@ void exit_itimers(struct signal_struct *sig) } /* Not available / possible... functions */ -int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp) +int do_posix_clock_nosettime(const clockid_t clockid, const struct timespec *tp) { return -EINVAL; } diff --git a/kernel/time.c b/kernel/time.c index a31b51220ac6..5cb80533d8b5 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -150,7 +150,7 @@ static inline void warp_clock(void) * various programs will get confused when the clock gets warped. */ -int do_sys_settimeofday(struct timespec *tv, struct timezone *tz) +int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) { static int firsttime = 1; int error = 0; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 02c13a313d15..4f9f65b91323 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -353,7 +353,7 @@ EXPORT_SYMBOL(do_gettimeofday); * * Sets the time of day to the new time and update NTP and notify hrtimers */ -int do_settimeofday(struct timespec *tv) +int do_settimeofday(const struct timespec *tv) { struct timespec ts_delta; unsigned long flags; -- cgit v1.2.3 From 65da528d7cc94966cf24d2a1e0837b689159b543 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:01 +0000 Subject: posix-timers: Define nanosleep not supported error separate Define the conditional nanosleep not supported error value outside of do_posix_clock_nonanosleep(). Preparatory patch for further cleanups. 
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134417.643486574@linutronix.de> --- kernel/posix-timers.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 21b7ca205f38..89bff3766d7d 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -81,6 +81,14 @@ static DEFINE_SPINLOCK(idr_lock); #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" #endif +/* + * parisc wants ENOTSUP instead of EOPNOTSUPP + */ +#ifndef ENOTSUP +# define ENANOSLEEP_NOTSUP EOPNOTSUPP +#else +# define ENANOSLEEP_NOTSUP ENOTSUP +#endif /* * The timer ID is turned into a timer address by idr_find(). @@ -937,11 +945,7 @@ EXPORT_SYMBOL_GPL(do_posix_clock_nosettime); int do_posix_clock_nonanosleep(const clockid_t clock, int flags, struct timespec *t, struct timespec __user *r) { -#ifndef ENOTSUP - return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */ -#else /* parisc does define it separately. */ - return -ENOTSUP; -#endif + return -ENANOSLEEP_NOTSUP; } EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep); -- cgit v1.2.3 From 2fd1f04089cb657c5d6c484b280ec4d3398aa157 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:03 +0000 Subject: posix-timers: Cleanup struct initializers Cosmetic. No functional change Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134417.745627057@linutronix.de> --- kernel/posix-cpu-timers.c | 24 ++++++++++++------------ kernel/posix-timers.c | 38 +++++++++++++++++++------------------- 2 files changed, 31 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 05bb7173850e..11b91dc0992b 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1607,20 +1607,20 @@ static long thread_cpu_nsleep_restart(struct restart_block *restart_block) static __init int init_posix_cpu_timers(void) { struct k_clock process = { - .clock_getres = process_cpu_clock_getres, - .clock_get = process_cpu_clock_get, - .clock_set = do_posix_clock_nosettime, - .timer_create = process_cpu_timer_create, - .nsleep = process_cpu_nsleep, - .nsleep_restart = process_cpu_nsleep_restart, + .clock_getres = process_cpu_clock_getres, + .clock_get = process_cpu_clock_get, + .clock_set = do_posix_clock_nosettime, + .timer_create = process_cpu_timer_create, + .nsleep = process_cpu_nsleep, + .nsleep_restart = process_cpu_nsleep_restart, }; struct k_clock thread = { - .clock_getres = thread_cpu_clock_getres, - .clock_get = thread_cpu_clock_get, - .clock_set = do_posix_clock_nosettime, - .timer_create = thread_cpu_timer_create, - .nsleep = thread_cpu_nsleep, - .nsleep_restart = thread_cpu_nsleep_restart, + .clock_getres = thread_cpu_clock_getres, + .clock_get = thread_cpu_clock_get, + .clock_set = do_posix_clock_nosettime, + .timer_create = thread_cpu_timer_create, + .nsleep = thread_cpu_nsleep, + .nsleep_restart = thread_cpu_nsleep_restart, }; struct timespec ts; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 89bff3766d7d..e7d26afd8ee5 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -281,33 +281,33 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp static __init int init_posix_timers(void) { struct k_clock clock_realtime = { - .clock_getres = hrtimer_get_res, + .clock_getres = hrtimer_get_res, }; struct k_clock clock_monotonic = 
{ - .clock_getres = hrtimer_get_res, - .clock_get = posix_ktime_get_ts, - .clock_set = do_posix_clock_nosettime, + .clock_getres = hrtimer_get_res, + .clock_get = posix_ktime_get_ts, + .clock_set = do_posix_clock_nosettime, }; struct k_clock clock_monotonic_raw = { - .clock_getres = hrtimer_get_res, - .clock_get = posix_get_monotonic_raw, - .clock_set = do_posix_clock_nosettime, - .timer_create = no_timer_create, - .nsleep = no_nsleep, + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_monotonic_raw, + .clock_set = do_posix_clock_nosettime, + .timer_create = no_timer_create, + .nsleep = no_nsleep, }; struct k_clock clock_realtime_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_realtime_coarse, - .clock_set = do_posix_clock_nosettime, - .timer_create = no_timer_create, - .nsleep = no_nsleep, + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_realtime_coarse, + .clock_set = do_posix_clock_nosettime, + .timer_create = no_timer_create, + .nsleep = no_nsleep, }; struct k_clock clock_monotonic_coarse = { - .clock_getres = posix_get_coarse_res, - .clock_get = posix_get_monotonic_coarse, - .clock_set = do_posix_clock_nosettime, - .timer_create = no_timer_create, - .nsleep = no_nsleep, + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_monotonic_coarse, + .clock_set = do_posix_clock_nosettime, + .timer_create = no_timer_create, + .nsleep = no_nsleep, }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); -- cgit v1.2.3 From 1976945eeaab5fa461735a6225a82c3cf1e65d62 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:06 +0000 Subject: posix-timers: Introduce clock_posix_cpu The CLOCK_DISPATCH() macro is a horrible magic. We call common functions if a function pointer is not set. That's just backwards. To support dynamic file decriptor based clocks we need to cleanup that dispatch logic. Create a k_clock struct clock_posix_cpu which has all the posix-cpu-timer functions filled in. After the cleanup the functions can be made static. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134417.841974553@linutronix.de> --- kernel/posix-cpu-timers.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 11b91dc0992b..816cd49a5ad9 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1604,6 +1604,18 @@ static long thread_cpu_nsleep_restart(struct restart_block *restart_block) return -EINVAL; } +struct k_clock clock_posix_cpu = { + .clock_getres = posix_cpu_clock_getres, + .clock_set = posix_cpu_clock_set, + .clock_get = posix_cpu_clock_get, + .timer_create = posix_cpu_timer_create, + .nsleep = posix_cpu_nsleep, + .nsleep_restart = posix_cpu_nsleep_restart, + .timer_set = posix_cpu_timer_set, + .timer_del = posix_cpu_timer_del, + .timer_get = posix_cpu_timer_get, +}; + static __init int init_posix_cpu_timers(void) { struct k_clock process = { -- cgit v1.2.3 From cc785ac22b17ed53e8ff5c1501e422be6d10be3c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:09 +0000 Subject: posix-timers: Introduce clockid_to_kclock() New function to find the kclock for a given clockid. Returns a pointer to clock_posix_cpu if clockid < 0. If clockid >= MAXCLOCK or if the clock_getres pointer is not set it returns NULL. For valid clocks it returns a pointer to the matching posix_clock. 
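An illustrative usage sketch, mirroring the conversions that follow in this series (not code from this patch): callers look the k_clock up once, treat a NULL result as an invalid clock id, and check the optional operation before dispatching. The function name is hypothetical.

static int example_dispatch_clock_get(clockid_t which_clock, struct timespec *tp)
{
	struct k_clock *kc = clockid_to_kclock(which_clock);

	if (!kc)			/* bogus or unregistered clock id */
		return -EINVAL;
	if (!kc->clock_get)		/* operation not provided by this clock */
		return -EOPNOTSUPP;

	return kc->clock_get(which_clock, tp);
}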
Signed-off-by: Thomas Gleixner Cc: John Stultz Acked-by: Richard Cochran LKML-Reference: <20110201134417.938447839@linutronix.de> --- kernel/posix-timers.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index e7d26afd8ee5..14b0a70ffb1e 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -531,6 +531,16 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) kmem_cache_free(posix_timers_cache, tmr); } +static struct k_clock *clockid_to_kclock(const clockid_t id) +{ + if (id < 0) + return &clock_posix_cpu; + + if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres) + return NULL; + return &posix_clocks[id]; +} + /* Create a POSIX.1b interval timer. */ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, -- cgit v1.2.3 From a5cd2880106cb2c79b3fe24f1c53dadba6a542a0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:11 +0000 Subject: posix-timers: Convert clock_nanosleep to clockid_to_kclock() Use the new kclock decoding function in clock_nanosleep and cleanup all kclocks which use the default functions. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134418.034175556@linutronix.de> --- kernel/posix-timers.c | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 14b0a70ffb1e..ee69b216d5c3 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -216,12 +216,6 @@ static int no_timer_create(struct k_itimer *new_timer) return -EOPNOTSUPP; } -static int no_nsleep(const clockid_t which_clock, int flags, - struct timespec *tsave, struct timespec __user *rmtp) -{ - return -EOPNOTSUPP; -} - /* * Return nonzero if we know a priori this clockid_t value is bogus. 
*/ @@ -282,32 +276,31 @@ static __init int init_posix_timers(void) { struct k_clock clock_realtime = { .clock_getres = hrtimer_get_res, + .nsleep = common_nsleep, }; struct k_clock clock_monotonic = { .clock_getres = hrtimer_get_res, .clock_get = posix_ktime_get_ts, .clock_set = do_posix_clock_nosettime, + .nsleep = common_nsleep, }; struct k_clock clock_monotonic_raw = { .clock_getres = hrtimer_get_res, .clock_get = posix_get_monotonic_raw, .clock_set = do_posix_clock_nosettime, .timer_create = no_timer_create, - .nsleep = no_nsleep, }; struct k_clock clock_realtime_coarse = { .clock_getres = posix_get_coarse_res, .clock_get = posix_get_realtime_coarse, .clock_set = do_posix_clock_nosettime, .timer_create = no_timer_create, - .nsleep = no_nsleep, }; struct k_clock clock_monotonic_coarse = { .clock_getres = posix_get_coarse_res, .clock_get = posix_get_monotonic_coarse, .clock_set = do_posix_clock_nosettime, .timer_create = no_timer_create, - .nsleep = no_nsleep, }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); @@ -952,13 +945,6 @@ int do_posix_clock_nosettime(const clockid_t clockid, const struct timespec *tp) } EXPORT_SYMBOL_GPL(do_posix_clock_nosettime); -int do_posix_clock_nonanosleep(const clockid_t clock, int flags, - struct timespec *t, struct timespec __user *r) -{ - return -ENANOSLEEP_NOTSUP; -} -EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep); - SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, const struct timespec __user *, tp) { @@ -1023,10 +1009,13 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, const struct timespec __user *, rqtp, struct timespec __user *, rmtp) { + struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec t; - if (invalid_clockid(which_clock)) + if (!kc) return -EINVAL; + if (!kc->nsleep) + return -ENANOSLEEP_NOTSUP; if (copy_from_user(&t, rqtp, sizeof (struct timespec))) return -EFAULT; @@ -1034,8 +1023,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, if (!timespec_valid(&t)) return -EINVAL; - return CLOCK_DISPATCH(which_clock, nsleep, - (which_clock, flags, &t, rmtp)); + return kc->nsleep(which_clock, flags, &t, rmtp); } /* -- cgit v1.2.3 From 59bd5bc24aa69f6c62da1e242c16f09f667def96 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:17 +0000 Subject: posix-timers: Convert clock_nanosleep_restart to clockid_to_kclock() Use the new kclock decoding function in clock_nanosleep_restart. 
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134418.131263211@linutronix.de> --- kernel/posix-timers.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index ee69b216d5c3..4dd86d15bbd0 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -277,12 +277,14 @@ static __init int init_posix_timers(void) struct k_clock clock_realtime = { .clock_getres = hrtimer_get_res, .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, }; struct k_clock clock_monotonic = { .clock_getres = hrtimer_get_res, .clock_get = posix_ktime_get_ts, .clock_set = do_posix_clock_nosettime, .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, }; struct k_clock clock_monotonic_raw = { .clock_getres = hrtimer_get_res, @@ -1026,23 +1028,17 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, return kc->nsleep(which_clock, flags, &t, rmtp); } -/* - * nanosleep_restart for monotonic and realtime clocks - */ -static int common_nsleep_restart(struct restart_block *restart_block) -{ - return hrtimer_nanosleep_restart(restart_block); -} - /* * This will restart clock_nanosleep. This is required only by * compat_clock_nanosleep_restart for now. */ -long -clock_nanosleep_restart(struct restart_block *restart_block) +long clock_nanosleep_restart(struct restart_block *restart_block) { clockid_t which_clock = restart_block->arg0; + struct k_clock *kc = clockid_to_kclock(which_clock); + + if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) + return -EINVAL; - return CLOCK_DISPATCH(which_clock, nsleep_restart, - (restart_block)); + return kc->nsleep_restart(restart_block); } -- cgit v1.2.3 From 3751f9f29bcbc19bd10e92254a273486f150c245 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:20 +0000 Subject: posix-timers: Cleanup restart_block usage posix timers still use the legacy arg0-arg3 members of restart_block. Use restart_block.nanosleep instead Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134418.232288779@linutronix.de> --- kernel/posix-cpu-timers.c | 38 +++++++++++++++----------------------- kernel/posix-timers.c | 2 +- 2 files changed, 16 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 816cd49a5ad9..9e617b00afa9 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1485,7 +1485,7 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags, struct timespec *rqtp, struct timespec __user *rmtp) { struct restart_block *restart_block = - ¤t_thread_info()->restart_block; + ¤t_thread_info()->restart_block; struct itimerspec it; int error; @@ -1501,50 +1501,42 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags, if (error == -ERESTART_RESTARTBLOCK) { - if (flags & TIMER_ABSTIME) + if (flags & TIMER_ABSTIME) return -ERESTARTNOHAND; /* - * Report back to the user the time still remaining. - */ - if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + * Report back to the user the time still remaining. 
+ */ + if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) return -EFAULT; restart_block->fn = posix_cpu_nsleep_restart; - restart_block->arg0 = which_clock; - restart_block->arg1 = (unsigned long) rmtp; - restart_block->arg2 = rqtp->tv_sec; - restart_block->arg3 = rqtp->tv_nsec; + restart_block->nanosleep.index = which_clock; + restart_block->nanosleep.rmtp = rmtp; + restart_block->nanosleep.expires = timespec_to_ns(rqtp); } return error; } long posix_cpu_nsleep_restart(struct restart_block *restart_block) { - clockid_t which_clock = restart_block->arg0; - struct timespec __user *rmtp; + clockid_t which_clock = restart_block->nanosleep.index; struct timespec t; struct itimerspec it; int error; - rmtp = (struct timespec __user *) restart_block->arg1; - t.tv_sec = restart_block->arg2; - t.tv_nsec = restart_block->arg3; + t = ns_to_timespec(restart_block->nanosleep.expires); - restart_block->fn = do_no_restart_syscall; error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); if (error == -ERESTART_RESTARTBLOCK) { + struct timespec __user *rmtp = restart_block->nanosleep.rmtp; /* - * Report back to the user the time still remaining. - */ - if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + * Report back to the user the time still remaining. + */ + if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) return -EFAULT; - restart_block->fn = posix_cpu_nsleep_restart; - restart_block->arg0 = which_clock; - restart_block->arg1 = (unsigned long) rmtp; - restart_block->arg2 = t.tv_sec; - restart_block->arg3 = t.tv_nsec; + restart_block->nanosleep.expires = timespec_to_ns(&t); } return error; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 4dd86d15bbd0..4762986814c8 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -1034,7 +1034,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, */ long clock_nanosleep_restart(struct restart_block *restart_block) { - clockid_t which_clock = restart_block->arg0; + clockid_t which_clock = restart_block->nanosleep.index; struct k_clock *kc = clockid_to_kclock(which_clock); if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) -- cgit v1.2.3 From 79c9da0d0539fb341a1b48a2a5a23d974726de90 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:45 +0000 Subject: posix-cpu-timers: Remove the stub nanosleep functions CLOCK_THREAD_CPUTIME_ID implements stub functions for nanosleep and nanosleep_restart, which return -EINVAL. That return value is wrong. The correct return value is -ENOTSUP. Remove the stubs and let the new dispatch code return the correct error code. 
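The "new dispatch code" referred to here is the clock_nanosleep() conversion earlier in this series; a condensed, illustrative sketch of how the right error now falls out without per-clock stubs (the function name is hypothetical):

static int example_nanosleep(clockid_t which_clock, int flags,
			     struct timespec *rqtp, struct timespec __user *rmtp)
{
	struct k_clock *kc = clockid_to_kclock(which_clock);

	if (!kc)
		return -EINVAL;
	if (!kc->nsleep)	/* e.g. CLOCK_THREAD_CPUTIME_ID after this patch */
		return -ENANOSLEEP_NOTSUP;	/* EOPNOTSUPP, or ENOTSUP on parisc */

	return kc->nsleep(which_clock, flags, rqtp, rmtp);
}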
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134418.422446502@linutronix.de> --- kernel/posix-cpu-timers.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 9e617b00afa9..8dc4cd7faf89 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1586,15 +1586,6 @@ static int thread_cpu_timer_create(struct k_itimer *timer) timer->it_clock = THREAD_CLOCK; return posix_cpu_timer_create(timer); } -static int thread_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct timespec __user *rmtp) -{ - return -EINVAL; -} -static long thread_cpu_nsleep_restart(struct restart_block *restart_block) -{ - return -EINVAL; -} struct k_clock clock_posix_cpu = { .clock_getres = posix_cpu_clock_getres, @@ -1623,8 +1614,6 @@ static __init int init_posix_cpu_timers(void) .clock_get = thread_cpu_clock_get, .clock_set = do_posix_clock_nosettime, .timer_create = thread_cpu_timer_create, - .nsleep = thread_cpu_nsleep, - .nsleep_restart = thread_cpu_nsleep_restart, }; struct timespec ts; -- cgit v1.2.3 From 26f9a4796af330173d790c8d2b5e2efcc489e755 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:48 +0000 Subject: posix-timers: Convert clock_settime to clockid_to_kclock() Use the new kclock decoding function in clock_settime and cleanup all kclocks which use the default functions. Rename the misnomed common_clock_set() to posix_clock_realtime_set(). Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134418.518851246@linutronix.de> --- kernel/posix-cpu-timers.c | 2 -- kernel/posix-timers.c | 31 ++++++++++++------------------- 2 files changed, 12 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 8dc4cd7faf89..504fbab10b82 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1604,7 +1604,6 @@ static __init int init_posix_cpu_timers(void) struct k_clock process = { .clock_getres = process_cpu_clock_getres, .clock_get = process_cpu_clock_get, - .clock_set = do_posix_clock_nosettime, .timer_create = process_cpu_timer_create, .nsleep = process_cpu_nsleep, .nsleep_restart = process_cpu_nsleep_restart, @@ -1612,7 +1611,6 @@ static __init int init_posix_cpu_timers(void) struct k_clock thread = { .clock_getres = thread_cpu_clock_getres, .clock_get = thread_cpu_clock_get, - .clock_set = do_posix_clock_nosettime, .timer_create = thread_cpu_timer_create, }; struct timespec ts; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 4762986814c8..49f358c37708 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -199,12 +199,6 @@ static int common_clock_get(clockid_t which_clock, struct timespec *tp) return 0; } -static inline int common_clock_set(const clockid_t which_clock, - const struct timespec *tp) -{ - return do_sys_settimeofday(tp, NULL); -} - static int common_timer_create(struct k_itimer *new_timer) { hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); @@ -232,6 +226,13 @@ static inline int invalid_clockid(const clockid_t which_clock) return 1; } +/* Set clock_realtime */ +static int posix_clock_realtime_set(const clockid_t which_clock, + const struct timespec *tp) +{ + return do_sys_settimeofday(tp, NULL); +} + /* * Get monotonic time for posix timers */ @@ -276,32 +277,29 @@ static __init int init_posix_timers(void) { 
struct k_clock clock_realtime = { .clock_getres = hrtimer_get_res, + .clock_set = posix_clock_realtime_set, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, }; struct k_clock clock_monotonic = { .clock_getres = hrtimer_get_res, .clock_get = posix_ktime_get_ts, - .clock_set = do_posix_clock_nosettime, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, }; struct k_clock clock_monotonic_raw = { .clock_getres = hrtimer_get_res, .clock_get = posix_get_monotonic_raw, - .clock_set = do_posix_clock_nosettime, .timer_create = no_timer_create, }; struct k_clock clock_realtime_coarse = { .clock_getres = posix_get_coarse_res, .clock_get = posix_get_realtime_coarse, - .clock_set = do_posix_clock_nosettime, .timer_create = no_timer_create, }; struct k_clock clock_monotonic_coarse = { .clock_getres = posix_get_coarse_res, .clock_get = posix_get_monotonic_coarse, - .clock_set = do_posix_clock_nosettime, .timer_create = no_timer_create, }; @@ -940,24 +938,19 @@ void exit_itimers(struct signal_struct *sig) } } -/* Not available / possible... functions */ -int do_posix_clock_nosettime(const clockid_t clockid, const struct timespec *tp) -{ - return -EINVAL; -} -EXPORT_SYMBOL_GPL(do_posix_clock_nosettime); - SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, const struct timespec __user *, tp) { + struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec new_tp; - if (invalid_clockid(which_clock)) + if (!kc || !kc->clock_set) return -EINVAL; + if (copy_from_user(&new_tp, tp, sizeof (*tp))) return -EFAULT; - return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp)); + return kc->clock_set(which_clock, &new_tp); } SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, -- cgit v1.2.3 From 42285777631aa0654fbb6442057b3e176445c6c5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:50 +0000 Subject: posix-timers: Convert clock_gettime() to clockid_to_kclock() Use the new kclock decoding mechanism and rename the misnomed common_clock_get() to posix_clock_realtime_get(). 
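For reference, these k_clock operations ultimately serve the userspace clock syscalls; a small self-contained illustration (older glibc may need -lrt when linking):

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec now, res;

	clock_gettime(CLOCK_MONOTONIC, &now);	/* backed by the monotonic k_clock */
	clock_getres(CLOCK_MONOTONIC, &res);

	printf("monotonic %ld.%09ld s, resolution %ld ns\n",
	       (long)now.tv_sec, now.tv_nsec, res.tv_nsec);
	return 0;
}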
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134418.611097203@linutronix.de> --- kernel/posix-timers.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 49f358c37708..d9e5edfe8a1b 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -190,15 +190,6 @@ static inline int common_clock_getres(const clockid_t which_clock, return 0; } -/* - * Get real time for posix timers - */ -static int common_clock_get(clockid_t which_clock, struct timespec *tp) -{ - ktime_get_real_ts(tp); - return 0; -} - static int common_timer_create(struct k_itimer *new_timer) { hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); @@ -226,6 +217,13 @@ static inline int invalid_clockid(const clockid_t which_clock) return 1; } +/* Get clock_realtime */ +static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp) +{ + ktime_get_real_ts(tp); + return 0; +} + /* Set clock_realtime */ static int posix_clock_realtime_set(const clockid_t which_clock, const struct timespec *tp) @@ -277,6 +275,7 @@ static __init int init_posix_timers(void) { struct k_clock clock_realtime = { .clock_getres = hrtimer_get_res, + .clock_get = posix_clock_realtime_get, .clock_set = posix_clock_realtime_set, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, @@ -956,18 +955,21 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, struct timespec __user *,tp) { + struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec kernel_tp; int error; - if (invalid_clockid(which_clock)) + if (!kc) return -EINVAL; - error = CLOCK_DISPATCH(which_clock, clock_get, - (which_clock, &kernel_tp)); + if (!kc->clock_get) + return -EOPNOTSUPP; + + error = kc->clock_get(which_clock, &kernel_tp); + if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) error = -EFAULT; return error; - } SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, -- cgit v1.2.3 From 4359ac0ace1a2a267927390ad27f781a2f8e0ab8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Feb 2011 11:45:23 +0100 Subject: posix-timers: Make clock_getres and clock_get mandatory Richard said: "I would think that we can require k_clocks to provide the read function. This could be checked and enforced in register_posix_clock()." Add checks for clock_getres and clock_get in the register function. 
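Put differently, after this change a clock can only be registered if it supplies at least the two read-side operations; an illustrative sketch of a minimal k_clock that would now pass the checks (the clock id and all example_* names are hypothetical):

static int example_posix_get(clockid_t which_clock, struct timespec *tp)
{
	ktime_get_ts(tp);			/* illustrative time source */
	return 0;
}

static int example_posix_getres(clockid_t which_clock, struct timespec *tp)
{
	tp->tv_sec = 0;
	tp->tv_nsec = NSEC_PER_SEC / HZ;	/* illustrative tick resolution */
	return 0;
}

static struct k_clock example_posix_clock = {
	.clock_getres	= example_posix_getres,	/* mandatory */
	.clock_get	= example_posix_get,	/* mandatory */
	/* .clock_set, .timer_create, .nsleep, ... remain optional */
};

static __init int example_posix_clock_init(void)
{
	register_posix_clock(EXAMPLE_CLOCK_ID, &example_posix_clock);
	return 0;
}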
Suggested-by: Richard Cochran Cc: John Stultz Signed-off-by: Thomas Gleixner --- kernel/posix-timers.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index d9e5edfe8a1b..7f66143d1ce5 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -485,7 +485,18 @@ static struct pid *good_sigevent(sigevent_t * event) void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) { if ((unsigned) clock_id >= MAX_CLOCKS) { - printk("POSIX clock register failed for clock_id %d\n", + printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n", + clock_id); + return; + } + + if (!new_clock->clock_get) { + printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n", + clock_id); + return; + } + if (!new_clock->clock_getres) { + printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n", clock_id); return; } @@ -961,8 +972,6 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, if (!kc) return -EINVAL; - if (!kc->clock_get) - return -EOPNOTSUPP; error = kc->clock_get(which_clock, &kernel_tp); -- cgit v1.2.3 From e5e542eea9075dd008993c2ee80b2cc9f31fc494 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:53 +0000 Subject: posix-timers: Convert clock_getres() to clockid_to_kclock() Use the new kclock decoding. Fixup the fallout in mmtimer.c Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134418.709802797@linutronix.de> --- kernel/posix-timers.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 7f66143d1ce5..748497fffd0f 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -182,14 +182,6 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) * the function pointer CALL in struct k_clock. */ -static inline int common_clock_getres(const clockid_t which_clock, - struct timespec *tp) -{ - tp->tv_sec = 0; - tp->tv_nsec = posix_clocks[which_clock].res; - return 0; -} - static int common_timer_create(struct k_itimer *new_timer) { hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); @@ -984,18 +976,17 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) { + struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec rtn_tp; int error; - if (invalid_clockid(which_clock)) + if (!kc) return -EINVAL; - error = CLOCK_DISPATCH(which_clock, clock_getres, - (which_clock, &rtn_tp)); + error = kc->clock_getres(which_clock, &rtn_tp); - if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) { + if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) error = -EFAULT; - } return error; } -- cgit v1.2.3 From ebaac757acae0431e2c79c00e09f1debdabbddd7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:56 +0000 Subject: posix-timers: Remove useless res field from k_clock The res member of kclock is only used by mmtimer.c, but even there it contains redundant information. Remove the field and fixup mmtimer. 
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134418.808714587@linutronix.de> --- kernel/posix-timers.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 748497fffd0f..f9142a99b5cb 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -204,8 +204,6 @@ static inline int invalid_clockid(const clockid_t which_clock) return 1; if (posix_clocks[which_clock].clock_getres != NULL) return 0; - if (posix_clocks[which_clock].res != 0) - return 0; return 1; } -- cgit v1.2.3 From 838394fbf989973ec7f5a0ad82cb6ff09e5c39aa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:51:58 +0000 Subject: posix-timers: Convert timer_create() to clockid_to_kclock() Setup timer_create for CLOCK_MONOTONIC and CLOCK_REALTIME kclocks and remove the no_timer_create() implementation. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134418.903604289@linutronix.de> --- kernel/posix-timers.c | 40 +++++++++++++++------------------------- 1 file changed, 15 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index f9142a99b5cb..4f71382a4ca8 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -146,6 +146,7 @@ static struct k_clock posix_clocks[MAX_CLOCKS]; */ static int common_nsleep(const clockid_t, int flags, struct timespec *t, struct timespec __user *rmtp); +static int common_timer_create(struct k_itimer *new_timer); static void common_timer_get(struct k_itimer *, struct itimerspec *); static int common_timer_set(struct k_itimer *, int, struct itimerspec *, struct itimerspec *); @@ -174,25 +175,6 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) (posix_clocks[clock].call != NULL \ ? (*posix_clocks[clock].call) arglist : common_##call arglist)) -/* - * Default clock hook functions when the struct k_clock passed - * to register_posix_clock leaves a function pointer null. - * - * The function common_CALL is the default implementation for - * the function pointer CALL in struct k_clock. - */ - -static int common_timer_create(struct k_itimer *new_timer) -{ - hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); - return 0; -} - -static int no_timer_create(struct k_itimer *new_timer) -{ - return -EOPNOTSUPP; -} - /* * Return nonzero if we know a priori this clockid_t value is bogus. 
*/ @@ -269,27 +251,26 @@ static __init int init_posix_timers(void) .clock_set = posix_clock_realtime_set, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, }; struct k_clock clock_monotonic = { .clock_getres = hrtimer_get_res, .clock_get = posix_ktime_get_ts, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, }; struct k_clock clock_monotonic_raw = { .clock_getres = hrtimer_get_res, .clock_get = posix_get_monotonic_raw, - .timer_create = no_timer_create, }; struct k_clock clock_realtime_coarse = { .clock_getres = posix_get_coarse_res, .clock_get = posix_get_realtime_coarse, - .timer_create = no_timer_create, }; struct k_clock clock_monotonic_coarse = { .clock_getres = posix_get_coarse_res, .clock_get = posix_get_monotonic_coarse, - .timer_create = no_timer_create, }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); @@ -534,19 +515,28 @@ static struct k_clock *clockid_to_kclock(const clockid_t id) return &posix_clocks[id]; } +static int common_timer_create(struct k_itimer *new_timer) +{ + hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); + return 0; +} + /* Create a POSIX.1b interval timer. */ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, struct sigevent __user *, timer_event_spec, timer_t __user *, created_timer_id) { + struct k_clock *kc = clockid_to_kclock(which_clock); struct k_itimer *new_timer; int error, new_timer_id; sigevent_t event; int it_id_set = IT_ID_NOT_SET; - if (invalid_clockid(which_clock)) + if (!kc) return -EINVAL; + if (!kc->timer_create) + return -EOPNOTSUPP; new_timer = alloc_posix_timer(); if (unlikely(!new_timer)) @@ -608,7 +598,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, goto out; } - error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); + error = kc->timer_create(new_timer); if (error) goto out; @@ -618,7 +608,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, spin_unlock_irq(¤t->sighand->siglock); return 0; - /* + /* * In the case of the timer belonging to another task, after * the task is unlocked, the timer is owned by the other task * and may cease to exist at any time. Don't use or modify -- cgit v1.2.3 From 27722df16ef143017db55ac7baac1703a68017ff Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:52:01 +0000 Subject: posix-timers: Convert timer_settime() to clockid_to_kclock() Set the common function for CLOCK_MONOTONIC and CLOCK_REALTIME kclocks and use the new decoding function. 
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134419.001863714@linutronix.de> --- kernel/posix-timers.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 4f71382a4ca8..a4dbfe71c5a5 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -252,6 +252,7 @@ static __init int init_posix_timers(void) .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, .timer_create = common_timer_create, + .timer_set = common_timer_set, }; struct k_clock clock_monotonic = { .clock_getres = hrtimer_get_res, @@ -259,6 +260,7 @@ static __init int init_posix_timers(void) .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, .timer_create = common_timer_create, + .timer_set = common_timer_set, }; struct k_clock clock_monotonic_raw = { .clock_getres = hrtimer_get_res, @@ -814,6 +816,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, int error = 0; unsigned long flag; struct itimerspec *rtn = old_setting ? &old_spec : NULL; + struct k_clock *kc; if (!new_setting) return -EINVAL; @@ -829,8 +832,11 @@ retry: if (!timr) return -EINVAL; - error = CLOCK_DISPATCH(timr->it_clock, timer_set, - (timr, flags, &new_spec, rtn)); + kc = clockid_to_kclock(timr->it_clock); + if (WARN_ON_ONCE(!kc || !kc->timer_set)) + error = -EINVAL; + else + error = kc->timer_set(timr, flags, &new_spec, rtn); unlock_timer(timr, flag); if (error == TIMER_RETRY) { -- cgit v1.2.3 From a7319fa253a549c4c6528fb550ae6e72a9c83811 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:52:04 +0000 Subject: posix-timers: Convert timer_gettime() to clockid_to_kclock() Set the common function for CLOCK_MONOTONIC and CLOCK_REALTIME kclocks and use the new decoding function. 
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134419.101243181@linutronix.de> --- kernel/posix-timers.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index a4dbfe71c5a5..c1e2636f9e45 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -253,6 +253,7 @@ static __init int init_posix_timers(void) .nsleep_restart = hrtimer_nanosleep_restart, .timer_create = common_timer_create, .timer_set = common_timer_set, + .timer_get = common_timer_get, }; struct k_clock clock_monotonic = { .clock_getres = hrtimer_get_res, @@ -261,6 +262,7 @@ static __init int init_posix_timers(void) .nsleep_restart = hrtimer_nanosleep_restart, .timer_create = common_timer_create, .timer_set = common_timer_set, + .timer_get = common_timer_get, }; struct k_clock clock_monotonic_raw = { .clock_getres = hrtimer_get_res, @@ -712,22 +714,28 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, struct itimerspec __user *, setting) { - struct k_itimer *timr; struct itimerspec cur_setting; + struct k_itimer *timr; + struct k_clock *kc; unsigned long flags; + int ret = 0; timr = lock_timer(timer_id, &flags); if (!timr) return -EINVAL; - CLOCK_DISPATCH(timr->it_clock, timer_get, (timr, &cur_setting)); + kc = clockid_to_kclock(timr->it_clock); + if (WARN_ON_ONCE(!kc || !kc->timer_get)) + ret = -EINVAL; + else + kc->timer_get(timr, &cur_setting); unlock_timer(timr, flags); - if (copy_to_user(setting, &cur_setting, sizeof (cur_setting))) + if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting))) return -EFAULT; - return 0; + return ret; } /* -- cgit v1.2.3 From 6761c6702e2c647582e1829abe8cf90794f61d9d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:52:07 +0000 Subject: posix-timers: Convert timer_delete() to clockid_to_kclock() Set the common function for CLOCK_MONOTONIC and CLOCK_REALTIME kclocks and use the new decoding function. 
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134419.198999420@linutronix.de> --- kernel/posix-timers.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index c1e2636f9e45..ade7dec49f96 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -254,6 +254,7 @@ static __init int init_posix_timers(void) .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, + .timer_del = common_timer_del, }; struct k_clock clock_monotonic = { .clock_getres = hrtimer_get_res, @@ -263,6 +264,7 @@ static __init int init_posix_timers(void) .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, + .timer_del = common_timer_del, }; struct k_clock clock_monotonic_raw = { .clock_getres = hrtimer_get_res, @@ -859,7 +861,7 @@ retry: return error; } -static inline int common_timer_del(struct k_itimer *timer) +static int common_timer_del(struct k_itimer *timer) { timer->it.real.interval.tv64 = 0; @@ -870,7 +872,11 @@ static inline int common_timer_del(struct k_itimer *timer) static inline int timer_delete_hook(struct k_itimer *timer) { - return CLOCK_DISPATCH(timer->it_clock, timer_del, (timer)); + struct k_clock *kc = clockid_to_kclock(timer->it_clock); + + if (WARN_ON_ONCE(!kc || !kc->timer_del)) + return -EINVAL; + return kc->timer_del(timer); } /* Delete a POSIX.1b interval timer. */ -- cgit v1.2.3 From 0aa3975f02ce78f27be3076fbfa3d94ae5a659d5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:52:09 +0000 Subject: posix-timers: Remove CLOCK_DISPATCH leftovers All users gone. Remove the cruft. Huge thanks to Richard Cochran who tackled that maze first. Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134419.294620613@linutronix.de> --- kernel/posix-timers.c | 22 ---------------------- 1 file changed, 22 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index ade7dec49f96..ad154dfd7c51 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -167,28 +167,6 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) spin_unlock_irqrestore(&timr->it_lock, flags); } -/* - * Call the k_clock hook function if non-null, or the default function. - */ -#define CLOCK_DISPATCH(clock, call, arglist) \ - ((clock) < 0 ? posix_cpu_##call arglist : \ - (posix_clocks[clock].call != NULL \ - ? (*posix_clocks[clock].call) arglist : common_##call arglist)) - -/* - * Return nonzero if we know a priori this clockid_t value is bogus. - */ -static inline int invalid_clockid(const clockid_t which_clock) -{ - if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */ - return 0; - if ((unsigned) which_clock >= MAX_CLOCKS) - return 1; - if (posix_clocks[which_clock].clock_getres != NULL) - return 0; - return 1; -} - /* Get clock_realtime */ static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp) { -- cgit v1.2.3 From bc2c8ea483d73e95fc88f1fc9e7755180f89b892 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 1 Feb 2011 13:52:12 +0000 Subject: posix-timers: Make posix-cpu-timers functions static All functions are accessed via clock_posix_cpu now. So make them static. 
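For reference, these functions are now reachable only through the clock_posix_cpu kclock that an earlier patch in this series added; reconstructed roughly (not quoted from any hunk shown here), it wires them up like this:

struct k_clock clock_posix_cpu = {
	.clock_getres	= posix_cpu_clock_getres,
	.clock_set	= posix_cpu_clock_set,
	.clock_get	= posix_cpu_clock_get,
	.timer_create	= posix_cpu_timer_create,
	.nsleep		= posix_cpu_nsleep,
	.nsleep_restart	= posix_cpu_nsleep_restart,
	.timer_set	= posix_cpu_timer_set,
	.timer_del	= posix_cpu_timer_del,
	.timer_get	= posix_cpu_timer_get,
};

Negative clockids are routed to this kclock by clockid_to_kclock(), so no other file needs the symbols and they can become static.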
Signed-off-by: Thomas Gleixner Acked-by: John Stultz Tested-by: Richard Cochran LKML-Reference: <20110201134419.389755466@linutronix.de> --- kernel/posix-cpu-timers.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 504fbab10b82..609e187f43e7 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -176,7 +176,8 @@ static inline cputime_t virt_ticks(struct task_struct *p) return p->utime; } -int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) +static int +posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) { int error = check_clock(which_clock); if (!error) { @@ -194,7 +195,8 @@ int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) return error; } -int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) +static int +posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) { /* * You can never reset a CPU clock, but we check for other errors @@ -317,7 +319,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock, } -int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) +static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) { const pid_t pid = CPUCLOCK_PID(which_clock); int error = -EINVAL; @@ -379,7 +381,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) * This is called from sys_timer_create() and do_cpu_nanosleep() with the * new timer already all-zeros initialized. */ -int posix_cpu_timer_create(struct k_itimer *new_timer) +static int posix_cpu_timer_create(struct k_itimer *new_timer) { int ret = 0; const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); @@ -425,7 +427,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) * If we return TIMER_RETRY, it's necessary to release the timer's lock * and try again. (This happens when the timer is in the middle of firing.) */ -int posix_cpu_timer_del(struct k_itimer *timer) +static int posix_cpu_timer_del(struct k_itimer *timer) { struct task_struct *p = timer->it.cpu.task; int ret = 0; @@ -665,8 +667,8 @@ static int cpu_timer_sample_group(const clockid_t which_clock, * If we return TIMER_RETRY, it's necessary to release the timer's lock * and try again. (This happens when the timer is in the middle of firing.) 
*/ -int posix_cpu_timer_set(struct k_itimer *timer, int flags, - struct itimerspec *new, struct itimerspec *old) +static int posix_cpu_timer_set(struct k_itimer *timer, int flags, + struct itimerspec *new, struct itimerspec *old) { struct task_struct *p = timer->it.cpu.task; union cpu_time_count old_expires, new_expires, old_incr, val; @@ -820,7 +822,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, return ret; } -void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) +static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) { union cpu_time_count now; struct task_struct *p = timer->it.cpu.task; @@ -1481,8 +1483,10 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, return error; } -int posix_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct timespec __user *rmtp) +static long posix_cpu_nsleep_restart(struct restart_block *restart_block); + +static int posix_cpu_nsleep(const clockid_t which_clock, int flags, + struct timespec *rqtp, struct timespec __user *rmtp) { struct restart_block *restart_block = ¤t_thread_info()->restart_block; @@ -1517,7 +1521,7 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags, return error; } -long posix_cpu_nsleep_restart(struct restart_block *restart_block) +static long posix_cpu_nsleep_restart(struct restart_block *restart_block) { clockid_t which_clock = restart_block->nanosleep.index; struct timespec t; @@ -1542,7 +1546,6 @@ long posix_cpu_nsleep_restart(struct restart_block *restart_block) } - #define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) #define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) -- cgit v1.2.3 From 0061748dd2400d0bcd4d49d258db5d7b5d106ca0 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 1 Feb 2011 13:52:15 +0000 Subject: posix-timer: Update comment Pick the cleanup to the comment in posix-timers.c from Richards all in one conversion patch. Originally-from: Richard Cochran Signed-off-by: Thomas Gleixner Acked-by: John Stultz LKML-Reference: <20110201134419.487708516@linutronix.de> --- kernel/posix-timers.c | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index ad154dfd7c51..a3fdfd4be0ec 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -102,11 +102,7 @@ static DEFINE_SPINLOCK(idr_lock); /* * CLOCKs: The POSIX standard calls for a couple of clocks and allows us * to implement others. This structure defines the various - * clocks and allows the possibility of adding others. We - * provide an interface to add clocks to the table and expect - * the "arch" code to add at least one clock that is high - * resolution. Here we define the standard CLOCK_REALTIME as a - * 1/HZ resolution clock. + * clocks. * * RESOLUTION: Clock resolution is used to round up timer and interval * times, NOT to report clock times, which are reported with as @@ -116,20 +112,13 @@ static DEFINE_SPINLOCK(idr_lock); * necessary code is written. The standard says we should say * something about this issue in the documentation... * - * FUNCTIONS: The CLOCKs structure defines possible functions to handle - * various clock functions. For clocks that use the standard - * system timer code these entries should be NULL. This will - * allow dispatch without the overhead of indirect function - * calls. CLOCKS that depend on other sources (e.g. 
WWV or GPS) - * must supply functions here, even if the function just returns - * ENOSYS. The standard POSIX timer management code assumes the - * following: 1.) The k_itimer struct (sched.h) is used for the - * timer. 2.) The list, it_lock, it_clock, it_id and it_pid - * fields are not modified by timer code. + * FUNCTIONS: The CLOCKs structure defines possible functions to + * handle various clock functions. * - * At this time all functions EXCEPT clock_nanosleep can be - * redirected by the CLOCKS structure. Clock_nanosleep is in - * there, but the code ignores it. + * The standard POSIX timer management code assumes the + * following: 1.) The k_itimer struct (sched.h) is used for + * the timer. 2.) The list, it_lock, it_clock, it_id and + * it_pid fields are not modified by timer code. * * Permissions: It is assumed that the clock_settime() function defined * for each clock will take care of permission checks. Some -- cgit v1.2.3 From c528f7c6c208f1fae6b4025957173dec045e5f21 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 1 Feb 2011 13:52:17 +0000 Subject: time: Introduce timekeeping_inject_offset This adds a kernel-internal timekeeping interface to add or subtract a fixed amount from CLOCK_REALTIME. This makes it so kernel users or interfaces trying to do so do not have to read the time, then add an offset and then call settimeofday(), which adds some extra error in comparision to just simply adding the offset in the kernel timekeeping core. Signed-off-by: John Stultz Signed-off-by: Richard Cochran LKML-Reference: <20110201134419.584311693@linutronix.de> Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 4f9f65b91323..6262c1d18397 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -387,6 +387,42 @@ int do_settimeofday(const struct timespec *tv) EXPORT_SYMBOL(do_settimeofday); + +/** + * timekeeping_inject_offset - Adds or subtracts from the current time. + * @tv: pointer to the timespec variable containing the offset + * + * Adds or subtracts an offset value from the current time. + */ +int timekeeping_inject_offset(struct timespec *ts) +{ + unsigned long flags; + + if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) + return -EINVAL; + + write_seqlock_irqsave(&xtime_lock, flags); + + timekeeping_forward_now(); + + xtime = timespec_add(xtime, *ts); + wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts); + + timekeeper.ntp_error = 0; + ntp_clear(); + + update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + timekeeper.mult); + + write_sequnlock_irqrestore(&xtime_lock, flags); + + /* signal hrtimers about time change */ + clock_was_set(); + + return 0; +} +EXPORT_SYMBOL(timekeeping_inject_offset); + /** * change_clocksource - Swaps clocksources if a new one is available * -- cgit v1.2.3 From 094aa1881fdc1b8889b442eb3511b31f3ec2b762 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 1 Feb 2011 13:52:20 +0000 Subject: ntp: Add ADJ_SETOFFSET mode bit This patch adds a new mode bit into the timex structure. When set, the bit instructs the kernel to add the given time value to the current time. 
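A minimal user-space sketch of the new mode bit, not part of the patch: it assumes a libc whose <sys/timex.h> already exposes ADJ_SETOFFSET and ADJ_NANO, and it needs CAP_SYS_TIME to succeed.

#include <stdio.h>
#include <string.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx;

	memset(&tx, 0, sizeof(tx));
	tx.modes = ADJ_SETOFFSET | ADJ_NANO;	/* time.tv_usec carries nanoseconds */
	tx.time.tv_sec = 1;			/* step the clock forward by 1.5 s */
	tx.time.tv_usec = 500000000;		/* must stay below NSEC_PER_SEC, else -EINVAL */

	if (adjtimex(&tx) < 0) {
		perror("adjtimex");
		return 1;
	}
	return 0;
}

Without ADJ_NANO the same field is interpreted as microseconds, as the conversion in do_adjtimex() above shows.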
Signed-off-by: Richard Cochran Acked-by: John Stultz LKML-Reference: <20110201134320.688829863@linutronix.de> Signed-off-by: Thomas Gleixner --- kernel/time/ntp.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index ed8cfdf16983..5ac593267a26 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -648,6 +648,17 @@ int do_adjtimex(struct timex *txc) hrtimer_cancel(&leap_timer); } + if (txc->modes & ADJ_SETOFFSET) { + struct timespec delta; + if ((unsigned long)txc->time.tv_usec >= NSEC_PER_SEC) + return -EINVAL; + delta.tv_sec = txc->time.tv_sec; + delta.tv_nsec = txc->time.tv_usec; + if (!(txc->modes & ADJ_NANO)) + delta.tv_nsec *= 1000; + timekeeping_inject_offset(&delta); + } + getnstimeofday(&ts); write_seqlock_irq(&xtime_lock); -- cgit v1.2.3 From 65f5d80bdf83ec0d7f3887db10153bf3f36ed73c Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 1 Feb 2011 13:52:23 +0000 Subject: time: Splitout compat timex accessors Split out the compat timex accessors into separate functions. Preparatory patch for a new syscall. [ tglx: Split that patch from Richards "posix-timers: Introduce a syscall for clock tuning.". Keeps the changes strictly separate ] Originally-from: Richard Cochran Acked-by: John Stultz Signed-off-by: Thomas Gleixner LKML-Reference: <20110201134419.772343089@linutronix.de> --- kernel/compat.c | 113 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 65 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index c9e2ec0b34a8..449e853cf41d 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -52,6 +52,64 @@ static int compat_put_timeval(struct compat_timeval __user *o, put_user(i->tv_usec, &o->tv_usec)) ? 
-EFAULT : 0; } +static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp) +{ + memset(txc, 0, sizeof(struct timex)); + + if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || + __get_user(txc->modes, &utp->modes) || + __get_user(txc->offset, &utp->offset) || + __get_user(txc->freq, &utp->freq) || + __get_user(txc->maxerror, &utp->maxerror) || + __get_user(txc->esterror, &utp->esterror) || + __get_user(txc->status, &utp->status) || + __get_user(txc->constant, &utp->constant) || + __get_user(txc->precision, &utp->precision) || + __get_user(txc->tolerance, &utp->tolerance) || + __get_user(txc->time.tv_sec, &utp->time.tv_sec) || + __get_user(txc->time.tv_usec, &utp->time.tv_usec) || + __get_user(txc->tick, &utp->tick) || + __get_user(txc->ppsfreq, &utp->ppsfreq) || + __get_user(txc->jitter, &utp->jitter) || + __get_user(txc->shift, &utp->shift) || + __get_user(txc->stabil, &utp->stabil) || + __get_user(txc->jitcnt, &utp->jitcnt) || + __get_user(txc->calcnt, &utp->calcnt) || + __get_user(txc->errcnt, &utp->errcnt) || + __get_user(txc->stbcnt, &utp->stbcnt)) + return -EFAULT; + + return 0; +} + +static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc) +{ + if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || + __put_user(txc->modes, &utp->modes) || + __put_user(txc->offset, &utp->offset) || + __put_user(txc->freq, &utp->freq) || + __put_user(txc->maxerror, &utp->maxerror) || + __put_user(txc->esterror, &utp->esterror) || + __put_user(txc->status, &utp->status) || + __put_user(txc->constant, &utp->constant) || + __put_user(txc->precision, &utp->precision) || + __put_user(txc->tolerance, &utp->tolerance) || + __put_user(txc->time.tv_sec, &utp->time.tv_sec) || + __put_user(txc->time.tv_usec, &utp->time.tv_usec) || + __put_user(txc->tick, &utp->tick) || + __put_user(txc->ppsfreq, &utp->ppsfreq) || + __put_user(txc->jitter, &utp->jitter) || + __put_user(txc->shift, &utp->shift) || + __put_user(txc->stabil, &utp->stabil) || + __put_user(txc->jitcnt, &utp->jitcnt) || + __put_user(txc->calcnt, &utp->calcnt) || + __put_user(txc->errcnt, &utp->errcnt) || + __put_user(txc->stbcnt, &utp->stbcnt) || + __put_user(txc->tai, &utp->tai)) + return -EFAULT; + return 0; +} + asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) { @@ -951,58 +1009,17 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) { struct timex txc; - int ret; + int err, ret; - memset(&txc, 0, sizeof(struct timex)); - - if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || - __get_user(txc.modes, &utp->modes) || - __get_user(txc.offset, &utp->offset) || - __get_user(txc.freq, &utp->freq) || - __get_user(txc.maxerror, &utp->maxerror) || - __get_user(txc.esterror, &utp->esterror) || - __get_user(txc.status, &utp->status) || - __get_user(txc.constant, &utp->constant) || - __get_user(txc.precision, &utp->precision) || - __get_user(txc.tolerance, &utp->tolerance) || - __get_user(txc.time.tv_sec, &utp->time.tv_sec) || - __get_user(txc.time.tv_usec, &utp->time.tv_usec) || - __get_user(txc.tick, &utp->tick) || - __get_user(txc.ppsfreq, &utp->ppsfreq) || - __get_user(txc.jitter, &utp->jitter) || - __get_user(txc.shift, &utp->shift) || - __get_user(txc.stabil, &utp->stabil) || - __get_user(txc.jitcnt, &utp->jitcnt) || - __get_user(txc.calcnt, &utp->calcnt) || - __get_user(txc.errcnt, &utp->errcnt) || - __get_user(txc.stbcnt, 
&utp->stbcnt)) - return -EFAULT; + err = compat_get_timex(&txc, utp); + if (err) + return err; ret = do_adjtimex(&txc); - if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || - __put_user(txc.modes, &utp->modes) || - __put_user(txc.offset, &utp->offset) || - __put_user(txc.freq, &utp->freq) || - __put_user(txc.maxerror, &utp->maxerror) || - __put_user(txc.esterror, &utp->esterror) || - __put_user(txc.status, &utp->status) || - __put_user(txc.constant, &utp->constant) || - __put_user(txc.precision, &utp->precision) || - __put_user(txc.tolerance, &utp->tolerance) || - __put_user(txc.time.tv_sec, &utp->time.tv_sec) || - __put_user(txc.time.tv_usec, &utp->time.tv_usec) || - __put_user(txc.tick, &utp->tick) || - __put_user(txc.ppsfreq, &utp->ppsfreq) || - __put_user(txc.jitter, &utp->jitter) || - __put_user(txc.shift, &utp->shift) || - __put_user(txc.stabil, &utp->stabil) || - __put_user(txc.jitcnt, &utp->jitcnt) || - __put_user(txc.calcnt, &utp->calcnt) || - __put_user(txc.errcnt, &utp->errcnt) || - __put_user(txc.stbcnt, &utp->stbcnt) || - __put_user(txc.tai, &utp->tai)) - ret = -EFAULT; + err = compat_put_timex(utp, &txc); + if (err) + return err; return ret; } -- cgit v1.2.3 From f1f1d5ebd10ffa4242bce7a90a56a222d6b7bc77 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 1 Feb 2011 13:52:26 +0000 Subject: posix-timers: Introduce a syscall for clock tuning. A new syscall is introduced that allows tuning of a POSIX clock. The new call, clock_adjtime, takes two parameters, the clock ID and a pointer to a struct timex. Any ADJTIMEX(2) operation may be requested via this system call, but various POSIX clocks may or may not support tuning. [ tglx: Adapted to the posix-timer cleanup series. Avoid copy_to_user in the error case ] Signed-off-by: Richard Cochran Acked-by: John Stultz LKML-Reference: <20110201134419.869804645@linutronix.de> Signed-off-by: Thomas Gleixner --- kernel/compat.c | 23 +++++++++++++++++++++++ kernel/posix-timers.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 449e853cf41d..38b1d2c1cbe8 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -675,6 +675,29 @@ long compat_sys_clock_gettime(clockid_t which_clock, return err; } +long compat_sys_clock_adjtime(clockid_t which_clock, + struct compat_timex __user *utp) +{ + struct timex txc; + mm_segment_t oldfs; + int err, ret; + + err = compat_get_timex(&txc, utp); + if (err) + return err; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc); + set_fs(oldfs); + + err = compat_put_timex(utp, &txc); + if (err) + return err; + + return ret; +} + long compat_sys_clock_getres(clockid_t which_clock, struct compat_timespec __user *tp) { diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index a3fdfd4be0ec..5a5a4f1c0971 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -170,6 +170,12 @@ static int posix_clock_realtime_set(const clockid_t which_clock, return do_sys_settimeofday(tp, NULL); } +static int posix_clock_realtime_adj(const clockid_t which_clock, + struct timex *t) +{ + return do_adjtimex(t); +} + /* * Get monotonic time for posix timers */ @@ -216,6 +222,7 @@ static __init int init_posix_timers(void) .clock_getres = hrtimer_get_res, .clock_get = posix_clock_realtime_get, .clock_set = posix_clock_realtime_set, + .clock_adj = posix_clock_realtime_adj, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, .timer_create = 
common_timer_create, @@ -948,6 +955,29 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, return error; } +SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, + struct timex __user *, utx) +{ + struct k_clock *kc = clockid_to_kclock(which_clock); + struct timex ktx; + int err; + + if (!kc) + return -EINVAL; + if (!kc->clock_adj) + return -EOPNOTSUPP; + + if (copy_from_user(&ktx, utx, sizeof(ktx))) + return -EFAULT; + + err = kc->clock_adj(which_clock, &ktx); + + if (!err && copy_to_user(utx, &ktx, sizeof(ktx))) + return -EFAULT; + + return err; +} + SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) { -- cgit v1.2.3 From 81e294cba2596f5f10848bbe19d98b344c2a2d5c Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 1 Feb 2011 13:52:32 +0000 Subject: posix-timers: Add support for fd based clocks Extend the negative clockids which are currently used by posix cpu timers to encode the PID with a file descriptor based type which encodes the fd in the upper bits. Originally-from: Richard Cochran Signed-off-by: Thomas Gleixner Acked-by: John Stultz LKML-Reference: <20110201134420.062860200@linutronix.de> --- kernel/posix-timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 5a5a4f1c0971..df629d853a81 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -488,7 +488,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) static struct k_clock *clockid_to_kclock(const clockid_t id) { if (id < 0) - return &clock_posix_cpu; + return (id & CLOCKFD_MASK) == CLOCKFD ? NULL : &clock_posix_cpu; if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres) return NULL; -- cgit v1.2.3 From 527087374faa488776a789375a7d6ea74fda6f71 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Feb 2011 12:10:09 +0100 Subject: posix-timers: Cleanup namespace Rename register_posix_clock() to posix_timers_register_clock(). That's what the function really does. As a side effect this cleans up the posix_clock namespace for the upcoming dynamic posix_clock infrastructure. 
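For completeness, the clock_adjtime() syscall introduced a few patches above can be driven from user space roughly as follows. No libc wrapper existed at the time, so this sketch calls syscall() directly and assumes only that the architecture's headers define __NR_clock_adjtime; with modes left at zero it performs no adjustment and merely reports the current timex state of CLOCK_REALTIME.

#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/timex.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	struct timex tx;
	long state;

	memset(&tx, 0, sizeof(tx));
	state = syscall(__NR_clock_adjtime, CLOCK_REALTIME, &tx);
	if (state < 0) {
		perror("clock_adjtime");
		return 1;
	}
	printf("clock state %ld, freq %ld, offset %ld\n", state, tx.freq, tx.offset);
	return 0;
}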
Signed-off-by: Thomas Gleixner Tested-by: Richard Cochran Cc: John Stultz LKML-Reference: --- kernel/posix-cpu-timers.c | 4 ++-- kernel/posix-timers.c | 15 ++++++++------- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 609e187f43e7..67fea9d25d55 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1618,8 +1618,8 @@ static __init int init_posix_cpu_timers(void) }; struct timespec ts; - register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); - register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); + posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process); + posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread); cputime_to_timespec(cputime_one_jiffy, &ts); onecputick = ts.tv_nsec; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index df629d853a81..af936fd37140 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -253,11 +253,11 @@ static __init int init_posix_timers(void) .clock_get = posix_get_monotonic_coarse, }; - register_posix_clock(CLOCK_REALTIME, &clock_realtime); - register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); - register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); - register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); - register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); + posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime); + posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic); + posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); + posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); + posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, @@ -433,7 +433,8 @@ static struct pid *good_sigevent(sigevent_t * event) return task_pid(rtn); } -void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) +void posix_timers_register_clock(const clockid_t clock_id, + struct k_clock *new_clock) { if ((unsigned) clock_id >= MAX_CLOCKS) { printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n", @@ -454,7 +455,7 @@ void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) posix_clocks[clock_id] = *new_clock; } -EXPORT_SYMBOL_GPL(register_posix_clock); +EXPORT_SYMBOL_GPL(posix_timers_register_clock); static struct k_itimer * alloc_posix_timer(void) { -- cgit v1.2.3 From 0606f422b453f76c31ab2b1bd52943ff06a2dcf2 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 1 Feb 2011 13:52:35 +0000 Subject: posix clocks: Introduce dynamic clocks This patch adds support for adding and removing posix clocks. The clock lifetime cycle is patterned after usb devices. Each clock is represented by a standard character device. In addition, the driver may optionally implement custom character device operations. The posix clock and timer system calls listed below now work with dynamic posix clocks, as well as the traditional static clocks. The following system calls are affected: - clock_adjtime (brand new syscall) - clock_gettime - clock_getres - clock_settime - timer_create - timer_delete - timer_gettime - timer_settime [ tglx: Adapted to the posix-timer cleanup. 
Moved clock_posix_dynamic to posix-clock.c and made all referenced functions static ] Signed-off-by: Richard Cochran Acked-by: John Stultz LKML-Reference: <20110201134420.164172635@linutronix.de> Signed-off-by: Thomas Gleixner --- kernel/posix-timers.c | 4 +- kernel/time/Makefile | 3 +- kernel/time/posix-clock.c | 441 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 446 insertions(+), 2 deletions(-) create mode 100644 kernel/time/posix-clock.c (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index af936fd37140..44fcff131b38 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -489,7 +490,8 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) static struct k_clock *clockid_to_kclock(const clockid_t id) { if (id < 0) - return (id & CLOCKFD_MASK) == CLOCKFD ? NULL : &clock_posix_cpu; + return (id & CLOCKFD_MASK) == CLOCKFD ? + &clock_posix_dynamic : &clock_posix_cpu; if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres) return NULL; diff --git a/kernel/time/Makefile b/kernel/time/Makefile index ee266620b06c..b0425991e9ac 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,4 +1,5 @@ -obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o +obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o +obj-y += timeconv.o posix-clock.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c new file mode 100644 index 000000000000..04498cbf6002 --- /dev/null +++ b/kernel/time/posix-clock.c @@ -0,0 +1,441 @@ +/* + * posix-clock.c - support for dynamic clock devices + * + * Copyright (C) 2010 OMICRON electronics GmbH + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#include +#include +#include +#include +#include +#include +#include + +static void delete_clock(struct kref *kref); + +/* + * Returns NULL if the posix_clock instance attached to 'fp' is old and stale. 
+ */ +static struct posix_clock *get_posix_clock(struct file *fp) +{ + struct posix_clock *clk = fp->private_data; + + mutex_lock(&clk->mutex); + + if (!clk->zombie) + return clk; + + mutex_unlock(&clk->mutex); + + return NULL; +} + +static void put_posix_clock(struct posix_clock *clk) +{ + mutex_unlock(&clk->mutex); +} + +static ssize_t posix_clock_read(struct file *fp, char __user *buf, + size_t count, loff_t *ppos) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -EINVAL; + + if (!clk) + return -ENODEV; + + if (clk->ops.read) + err = clk->ops.read(clk, fp->f_flags, buf, count); + + put_posix_clock(clk); + + return err; +} + +static unsigned int posix_clock_poll(struct file *fp, poll_table *wait) +{ + struct posix_clock *clk = get_posix_clock(fp); + int result = 0; + + if (!clk) + return -ENODEV; + + if (clk->ops.poll) + result = clk->ops.poll(clk, fp, wait); + + put_posix_clock(clk); + + return result; +} + +static int posix_clock_fasync(int fd, struct file *fp, int on) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = 0; + + if (!clk) + return -ENODEV; + + if (clk->ops.fasync) + err = clk->ops.fasync(clk, fd, fp, on); + + put_posix_clock(clk); + + return err; +} + +static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENODEV; + + if (!clk) + return -ENODEV; + + if (clk->ops.mmap) + err = clk->ops.mmap(clk, vma); + + put_posix_clock(clk); + + return err; +} + +static long posix_clock_ioctl(struct file *fp, + unsigned int cmd, unsigned long arg) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENOTTY; + + if (!clk) + return -ENODEV; + + if (clk->ops.ioctl) + err = clk->ops.ioctl(clk, cmd, arg); + + put_posix_clock(clk); + + return err; +} + +#ifdef CONFIG_COMPAT +static long posix_clock_compat_ioctl(struct file *fp, + unsigned int cmd, unsigned long arg) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENOTTY; + + if (!clk) + return -ENODEV; + + if (clk->ops.ioctl) + err = clk->ops.ioctl(clk, cmd, arg); + + put_posix_clock(clk); + + return err; +} +#endif + +static int posix_clock_open(struct inode *inode, struct file *fp) +{ + int err; + struct posix_clock *clk = + container_of(inode->i_cdev, struct posix_clock, cdev); + + mutex_lock(&clk->mutex); + + if (clk->zombie) { + err = -ENODEV; + goto out; + } + if (clk->ops.open) + err = clk->ops.open(clk, fp->f_mode); + else + err = 0; + + if (!err) { + kref_get(&clk->kref); + fp->private_data = clk; + } +out: + mutex_unlock(&clk->mutex); + return err; +} + +static int posix_clock_release(struct inode *inode, struct file *fp) +{ + struct posix_clock *clk = fp->private_data; + int err = 0; + + if (clk->ops.release) + err = clk->ops.release(clk); + + kref_put(&clk->kref, delete_clock); + + fp->private_data = NULL; + + return err; +} + +static const struct file_operations posix_clock_file_operations = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .read = posix_clock_read, + .poll = posix_clock_poll, + .unlocked_ioctl = posix_clock_ioctl, + .open = posix_clock_open, + .release = posix_clock_release, + .fasync = posix_clock_fasync, + .mmap = posix_clock_mmap, +#ifdef CONFIG_COMPAT + .compat_ioctl = posix_clock_compat_ioctl, +#endif +}; + +int posix_clock_register(struct posix_clock *clk, dev_t devid) +{ + int err; + + kref_init(&clk->kref); + mutex_init(&clk->mutex); + + cdev_init(&clk->cdev, &posix_clock_file_operations); + clk->cdev.owner = clk->ops.owner; + err = cdev_add(&clk->cdev, devid, 1); + 
if (err) + goto no_cdev; + + return err; +no_cdev: + mutex_destroy(&clk->mutex); + return err; +} +EXPORT_SYMBOL_GPL(posix_clock_register); + +static void delete_clock(struct kref *kref) +{ + struct posix_clock *clk = container_of(kref, struct posix_clock, kref); + mutex_destroy(&clk->mutex); + if (clk->release) + clk->release(clk); +} + +void posix_clock_unregister(struct posix_clock *clk) +{ + cdev_del(&clk->cdev); + + mutex_lock(&clk->mutex); + clk->zombie = true; + mutex_unlock(&clk->mutex); + + kref_put(&clk->kref, delete_clock); +} +EXPORT_SYMBOL_GPL(posix_clock_unregister); + +struct posix_clock_desc { + struct file *fp; + struct posix_clock *clk; +}; + +static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd) +{ + struct file *fp = fget(CLOCKID_TO_FD(id)); + int err = -EINVAL; + + if (!fp) + return err; + + if (fp->f_op->open != posix_clock_open || !fp->private_data) + goto out; + + cd->fp = fp; + cd->clk = get_posix_clock(fp); + + err = cd->clk ? 0 : -ENODEV; +out: + if (err) + fput(fp); + return err; +} + +static void put_clock_desc(struct posix_clock_desc *cd) +{ + put_posix_clock(cd->clk); + fput(cd->fp); +} + +static int pc_clock_adjtime(clockid_t id, struct timex *tx) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.clock_adjtime) + err = cd.clk->ops.clock_adjtime(cd.clk, tx); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_clock_gettime(clockid_t id, struct timespec *ts) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.clock_gettime) + err = cd.clk->ops.clock_gettime(cd.clk, ts); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_clock_getres(clockid_t id, struct timespec *ts) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.clock_getres) + err = cd.clk->ops.clock_getres(cd.clk, ts); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_clock_settime(clockid_t id, const struct timespec *ts) +{ + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.clock_settime) + err = cd.clk->ops.clock_settime(cd.clk, ts); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_timer_create(struct k_itimer *kit) +{ + clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.timer_create) + err = cd.clk->ops.timer_create(cd.clk, kit); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static int pc_timer_delete(struct k_itimer *kit) +{ + clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.timer_delete) + err = cd.clk->ops.timer_delete(cd.clk, kit); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts) +{ + clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + + if (get_clock_desc(id, &cd)) + return; + + if (cd.clk->ops.timer_gettime) + cd.clk->ops.timer_gettime(cd.clk, kit, ts); + + put_clock_desc(&cd); +} + +static int pc_timer_settime(struct k_itimer *kit, int flags, + struct itimerspec *ts, struct itimerspec *old) +{ + 
clockid_t id = kit->it_clock; + struct posix_clock_desc cd; + int err; + + err = get_clock_desc(id, &cd); + if (err) + return err; + + if (cd.clk->ops.timer_settime) + err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old); + else + err = -EOPNOTSUPP; + + put_clock_desc(&cd); + + return err; +} + +struct k_clock clock_posix_dynamic = { + .clock_getres = pc_clock_getres, + .clock_set = pc_clock_settime, + .clock_get = pc_clock_gettime, + .clock_adj = pc_clock_adjtime, + .timer_create = pc_timer_create, + .timer_set = pc_timer_settime, + .timer_del = pc_timer_delete, + .timer_get = pc_timer_gettime, +}; -- cgit v1.2.3 From fe4b04fa31a6dcf4358aa84cf81e5a7fd079469b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Feb 2011 13:19:09 +0100 Subject: perf: Cure task_oncpu_function_call() races Oleg reported that on architectures with __ARCH_WANT_INTERRUPTS_ON_CTXSW the IPI from task_oncpu_function_call() can land before perf_event_task_sched_in() and cause interesting situations for eg. perf_install_in_context(). This patch reworks the task_oncpu_function_call() interface to give a more usable primitive as well as rework all its users to hopefully be more obvious as well as remove the races. While looking at the code I also found a number of races against perf_event_task_sched_out() which can flip contexts between tasks so plug those too. Reported-and-reviewed-by: Oleg Nesterov Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 260 +++++++++++++++++++++++++++++++++------------------- kernel/sched.c | 29 +----- 2 files changed, 172 insertions(+), 117 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 126a302c481c..7d3faa25e136 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -38,6 +38,79 @@ #include +struct remote_function_call { + struct task_struct *p; + int (*func)(void *info); + void *info; + int ret; +}; + +static void remote_function(void *data) +{ + struct remote_function_call *tfc = data; + struct task_struct *p = tfc->p; + + if (p) { + tfc->ret = -EAGAIN; + if (task_cpu(p) != smp_processor_id() || !task_curr(p)) + return; + } + + tfc->ret = tfc->func(tfc->info); +} + +/** + * task_function_call - call a function on the cpu on which a task runs + * @p: the task to evaluate + * @func: the function to be called + * @info: the function call argument + * + * Calls the function @func when the task is currently running. This might + * be on the current CPU, which just calls the function directly + * + * returns: @func return value, or + * -ESRCH - when the process isn't running + * -EAGAIN - when the process moved away + */ +static int +task_function_call(struct task_struct *p, int (*func) (void *info), void *info) +{ + struct remote_function_call data = { + .p = p, + .func = func, + .info = info, + .ret = -ESRCH, /* No such (running) process */ + }; + + if (task_curr(p)) + smp_call_function_single(task_cpu(p), remote_function, &data, 1); + + return data.ret; +} + +/** + * cpu_function_call - call a function on the cpu + * @func: the function to be called + * @info: the function call argument + * + * Calls the function @func on the remote cpu. 
+ * + * returns: @func return value or -ENXIO when the cpu is offline + */ +static int cpu_function_call(int cpu, int (*func) (void *info), void *info) +{ + struct remote_function_call data = { + .p = NULL, + .func = func, + .info = info, + .ret = -ENXIO, /* No such CPU */ + }; + + smp_call_function_single(cpu, remote_function, &data, 1); + + return data.ret; +} + enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, @@ -254,7 +327,6 @@ static void perf_unpin_context(struct perf_event_context *ctx) raw_spin_lock_irqsave(&ctx->lock, flags); --ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); - put_ctx(ctx); } /* @@ -618,35 +690,24 @@ __get_cpu_context(struct perf_event_context *ctx) * We disable the event on the hardware level first. After that we * remove it from the context list. */ -static void __perf_event_remove_from_context(void *info) +static int __perf_remove_from_context(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; - raw_spin_lock(&ctx->lock); - event_sched_out(event, cpuctx, ctx); - list_del_event(event, ctx); - raw_spin_unlock(&ctx->lock); + + return 0; } /* * Remove the event from a task's (or a CPU's) list of events. * - * Must be called with ctx->mutex held. - * * CPU events are removed with a smp call. For task events we only * call when the task is on a CPU. * @@ -657,49 +718,48 @@ static void __perf_event_remove_from_context(void *info) * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. */ -static void perf_event_remove_from_context(struct perf_event *event) +static void perf_remove_from_context(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; + lockdep_assert_held(&ctx->mutex); + if (!task) { /* * Per cpu events are removed via an smp call and * the removal is always successful. */ - smp_call_function_single(event->cpu, - __perf_event_remove_from_context, - event, 1); + cpu_function_call(event->cpu, __perf_remove_from_context, event); return; } retry: - task_oncpu_function_call(task, __perf_event_remove_from_context, - event); + if (!task_function_call(task, __perf_remove_from_context, event)) + return; raw_spin_lock_irq(&ctx->lock); /* - * If the context is active we need to retry the smp call. + * If we failed to find a running task, but find the context active now + * that we've acquired the ctx->lock, retry. */ - if (ctx->nr_active && !list_empty(&event->group_entry)) { + if (ctx->is_active) { raw_spin_unlock_irq(&ctx->lock); goto retry; } /* - * The lock prevents that this context is scheduled in so we - * can remove the event safely, if the call above did not - * succeed. + * Since the task isn't running, its safe to remove the event, us + * holding the ctx->lock ensures the task won't get scheduled in. 
*/ - if (!list_empty(&event->group_entry)) - list_del_event(event, ctx); + list_del_event(event, ctx); raw_spin_unlock_irq(&ctx->lock); } /* * Cross CPU call to disable a performance event */ -static void __perf_event_disable(void *info) +static int __perf_event_disable(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; @@ -708,9 +768,12 @@ static void __perf_event_disable(void *info) /* * If this is a per-task event, need to check whether this * event's task is the current task on this cpu. + * + * Can trigger due to concurrent perf_event_context_sched_out() + * flipping contexts around. */ if (ctx->task && cpuctx->task_ctx != ctx) - return; + return -EINVAL; raw_spin_lock(&ctx->lock); @@ -729,6 +792,8 @@ static void __perf_event_disable(void *info) } raw_spin_unlock(&ctx->lock); + + return 0; } /* @@ -753,13 +818,13 @@ void perf_event_disable(struct perf_event *event) /* * Disable the event on the cpu that it's on */ - smp_call_function_single(event->cpu, __perf_event_disable, - event, 1); + cpu_function_call(event->cpu, __perf_event_disable, event); return; } retry: - task_oncpu_function_call(task, __perf_event_disable, event); + if (!task_function_call(task, __perf_event_disable, event)) + return; raw_spin_lock_irq(&ctx->lock); /* @@ -767,6 +832,11 @@ retry: */ if (event->state == PERF_EVENT_STATE_ACTIVE) { raw_spin_unlock_irq(&ctx->lock); + /* + * Reload the task pointer, it might have been changed by + * a concurrent perf_event_context_sched_out(). + */ + task = ctx->task; goto retry; } @@ -778,7 +848,6 @@ retry: update_group_times(event); event->state = PERF_EVENT_STATE_OFF; } - raw_spin_unlock_irq(&ctx->lock); } @@ -928,12 +997,14 @@ static void add_event_to_ctx(struct perf_event *event, event->tstamp_stopped = tstamp; } +static void perf_event_context_sched_in(struct perf_event_context *ctx); + /* * Cross CPU call to install and enable a performance event * * Must be called with ctx->mutex held */ -static void __perf_install_in_context(void *info) +static int __perf_install_in_context(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; @@ -942,17 +1013,12 @@ static void __perf_install_in_context(void *info) int err; /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. - * Or possibly this is the right context but it isn't - * on this cpu because it had no events. + * In case we're installing a new context to an already running task, + * could also happen before perf_event_task_sched_in() on architectures + * which do context switches with IRQs enabled. */ - if (ctx->task && cpuctx->task_ctx != ctx) { - if (cpuctx->task_ctx || ctx->task != current) - return; - cpuctx->task_ctx = ctx; - } + if (ctx->task && !cpuctx->task_ctx) + perf_event_context_sched_in(ctx); raw_spin_lock(&ctx->lock); ctx->is_active = 1; @@ -997,6 +1063,8 @@ static void __perf_install_in_context(void *info) unlock: raw_spin_unlock(&ctx->lock); + + return 0; } /* @@ -1008,8 +1076,6 @@ unlock: * If the event is attached to a task which is on a CPU we use a smp * call to enable it in the task context. The task might have been * scheduled away, but we check this in the smp call again. - * - * Must be called with ctx->mutex held. 
*/ static void perf_install_in_context(struct perf_event_context *ctx, @@ -1018,6 +1084,8 @@ perf_install_in_context(struct perf_event_context *ctx, { struct task_struct *task = ctx->task; + lockdep_assert_held(&ctx->mutex); + event->ctx = ctx; if (!task) { @@ -1025,31 +1093,29 @@ perf_install_in_context(struct perf_event_context *ctx, * Per cpu events are installed via an smp call and * the install is always successful. */ - smp_call_function_single(cpu, __perf_install_in_context, - event, 1); + cpu_function_call(cpu, __perf_install_in_context, event); return; } retry: - task_oncpu_function_call(task, __perf_install_in_context, - event); + if (!task_function_call(task, __perf_install_in_context, event)) + return; raw_spin_lock_irq(&ctx->lock); /* - * we need to retry the smp call. + * If we failed to find a running task, but find the context active now + * that we've acquired the ctx->lock, retry. */ - if (ctx->is_active && list_empty(&event->group_entry)) { + if (ctx->is_active) { raw_spin_unlock_irq(&ctx->lock); goto retry; } /* - * The lock prevents that this context is scheduled in so we - * can add the event safely, if it the call above did not - * succeed. + * Since the task isn't running, its safe to add the event, us holding + * the ctx->lock ensures the task won't get scheduled in. */ - if (list_empty(&event->group_entry)) - add_event_to_ctx(event, ctx); + add_event_to_ctx(event, ctx); raw_spin_unlock_irq(&ctx->lock); } @@ -1078,7 +1144,7 @@ static void __perf_event_mark_enabled(struct perf_event *event, /* * Cross CPU call to enable a performance event */ -static void __perf_event_enable(void *info) +static int __perf_event_enable(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; @@ -1086,18 +1152,10 @@ static void __perf_event_enable(void *info) struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); int err; - /* - * If this is a per-task event, need to check whether this - * event's task is the current task on this cpu. - */ - if (ctx->task && cpuctx->task_ctx != ctx) { - if (cpuctx->task_ctx || ctx->task != current) - return; - cpuctx->task_ctx = ctx; - } + if (WARN_ON_ONCE(!ctx->is_active)) + return -EINVAL; raw_spin_lock(&ctx->lock); - ctx->is_active = 1; update_context_time(ctx); if (event->state >= PERF_EVENT_STATE_INACTIVE) @@ -1138,6 +1196,8 @@ static void __perf_event_enable(void *info) unlock: raw_spin_unlock(&ctx->lock); + + return 0; } /* @@ -1158,8 +1218,7 @@ void perf_event_enable(struct perf_event *event) /* * Enable the event on the cpu that it's on */ - smp_call_function_single(event->cpu, __perf_event_enable, - event, 1); + cpu_function_call(event->cpu, __perf_event_enable, event); return; } @@ -1178,8 +1237,15 @@ void perf_event_enable(struct perf_event *event) event->state = PERF_EVENT_STATE_OFF; retry: + if (!ctx->is_active) { + __perf_event_mark_enabled(event, ctx); + goto out; + } + raw_spin_unlock_irq(&ctx->lock); - task_oncpu_function_call(task, __perf_event_enable, event); + + if (!task_function_call(task, __perf_event_enable, event)) + return; raw_spin_lock_irq(&ctx->lock); @@ -1187,15 +1253,14 @@ retry: * If the context is active and the event is still off, * we need to retry the cross-call. 
*/ - if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) + if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) { + /* + * task could have been flipped by a concurrent + * perf_event_context_sched_out() + */ + task = ctx->task; goto retry; - - /* - * Since we have the lock this context can't be scheduled - * in, so we can change the state safely. - */ - if (event->state == PERF_EVENT_STATE_OFF) - __perf_event_mark_enabled(event, ctx); + } out: raw_spin_unlock_irq(&ctx->lock); @@ -1339,8 +1404,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, } } -void perf_event_context_sched_out(struct task_struct *task, int ctxn, - struct task_struct *next) +static void perf_event_context_sched_out(struct task_struct *task, int ctxn, + struct task_struct *next) { struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; struct perf_event_context *next_ctx; @@ -1533,7 +1598,7 @@ static void task_ctx_sched_in(struct perf_event_context *ctx, { struct perf_cpu_context *cpuctx; - cpuctx = __get_cpu_context(ctx); + cpuctx = __get_cpu_context(ctx); if (cpuctx->task_ctx == ctx) return; @@ -1541,7 +1606,7 @@ static void task_ctx_sched_in(struct perf_event_context *ctx, cpuctx->task_ctx = ctx; } -void perf_event_context_sched_in(struct perf_event_context *ctx) +static void perf_event_context_sched_in(struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; @@ -1627,7 +1692,7 @@ static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) * Reduce accuracy by one bit such that @a and @b converge * to a similar magnitude. */ -#define REDUCE_FLS(a, b) \ +#define REDUCE_FLS(a, b) \ do { \ if (a##_fls > b##_fls) { \ a >>= 1; \ @@ -2213,6 +2278,9 @@ errout: } +/* + * Returns a matching context with refcount and pincount. 
+ */ static struct perf_event_context * find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) { @@ -2237,6 +2305,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); ctx = &cpuctx->ctx; get_ctx(ctx); + ++ctx->pin_count; return ctx; } @@ -2250,6 +2319,7 @@ retry: ctx = perf_lock_task_context(task, ctxn, &flags); if (ctx) { unclone_ctx(ctx); + ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); } @@ -2271,8 +2341,10 @@ retry: err = -ESRCH; else if (task->perf_event_ctxp[ctxn]) err = -EAGAIN; - else + else { + ++ctx->pin_count; rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); + } mutex_unlock(&task->perf_event_mutex); if (unlikely(err)) { @@ -5950,10 +6022,10 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event_context *gctx = group_leader->ctx; mutex_lock(&gctx->mutex); - perf_event_remove_from_context(group_leader); + perf_remove_from_context(group_leader); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { - perf_event_remove_from_context(sibling); + perf_remove_from_context(sibling); put_ctx(gctx); } mutex_unlock(&gctx->mutex); @@ -5976,6 +6048,7 @@ SYSCALL_DEFINE5(perf_event_open, perf_install_in_context(ctx, event, cpu); ++ctx->generation; + perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); event->owner = current; @@ -6001,6 +6074,7 @@ SYSCALL_DEFINE5(perf_event_open, return event_fd; err_context: + perf_unpin_context(ctx); put_ctx(ctx); err_alloc: free_event(event); @@ -6051,6 +6125,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, mutex_lock(&ctx->mutex); perf_install_in_context(ctx, event, cpu); ++ctx->generation; + perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); return event; @@ -6104,7 +6179,7 @@ __perf_event_exit_task(struct perf_event *child_event, { struct perf_event *parent_event; - perf_event_remove_from_context(child_event); + perf_remove_from_context(child_event); parent_event = child_event->parent; /* @@ -6411,7 +6486,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, return 0; } - child_ctx = child->perf_event_ctxp[ctxn]; + child_ctx = child->perf_event_ctxp[ctxn]; if (!child_ctx) { /* * This is executed from the parent task context, so @@ -6526,6 +6601,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn) mutex_unlock(&parent_ctx->mutex); perf_unpin_context(parent_ctx); + put_ctx(parent_ctx); return ret; } @@ -6595,9 +6671,9 @@ static void __perf_event_exit_context(void *__info) perf_pmu_rotate_stop(ctx->pmu); list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) - __perf_event_remove_from_context(event); + __perf_remove_from_context(event); list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) - __perf_event_remove_from_context(event); + __perf_remove_from_context(event); } static void perf_event_exit_cpu_context(int cpu) diff --git a/kernel/sched.c b/kernel/sched.c index 18d38e4ec7ba..31cb5d5e1aac 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2265,27 +2265,6 @@ void kick_process(struct task_struct *p) EXPORT_SYMBOL_GPL(kick_process); #endif /* CONFIG_SMP */ -/** - * task_oncpu_function_call - call a function on the cpu on which a task runs - * @p: the task to evaluate - * @func: the function to be called - * @info: the function call argument - * - * Calls the function @func when the task is currently running. 
This might - * be on the current CPU, which just calls the function directly - */ -void task_oncpu_function_call(struct task_struct *p, - void (*func) (void *info), void *info) -{ - int cpu; - - preempt_disable(); - cpu = task_cpu(p); - if (task_curr(p)) - smp_call_function_single(cpu, func, info, 1); - preempt_enable(); -} - #ifdef CONFIG_SMP /* * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. @@ -2776,9 +2755,12 @@ static inline void prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { + sched_info_switch(prev, next); + perf_event_task_sched_out(prev, next); fire_sched_out_preempt_notifiers(prev, next); prepare_lock_switch(rq, next); prepare_arch_switch(next); + trace_sched_switch(prev, next); } /** @@ -2911,7 +2893,7 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); - trace_sched_switch(prev, next); + mm = next->mm; oldmm = prev->active_mm; /* @@ -3989,9 +3971,6 @@ need_resched_nonpreemptible: rq->skip_clock_update = 0; if (likely(prev != next)) { - sched_info_switch(prev, next); - perf_event_task_sched_out(prev, next); - rq->nr_switches++; rq->curr = next; ++*switch_count; -- cgit v1.2.3 From 725e7580aaf98e9f7b22b8ccfc640ad0c09e2b08 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 1 Feb 2011 09:47:15 -0500 Subject: sched: Check the right ->nr_running in yield_task_fair() With CONFIG_FAIR_GROUP_SCHED, each task_group has its own cfs_rq. Yielding to a task from another cfs_rq may be worthwhile, since a process calling yield typically cannot use the CPU right now. Therefor, we want to check the per-cpu nr_running, not the cgroup local one. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra LKML-Reference: <20110201094715.798c4f86@annuminas.surriel.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 55040f3938d8..4de9905228c4 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1331,7 +1331,7 @@ static void yield_task_fair(struct rq *rq) /* * Are we the only task in the tree? */ - if (unlikely(cfs_rq->nr_running == 1)) + if (unlikely(rq->nr_running == 1)) return; clear_buddies(cfs_rq, se); -- cgit v1.2.3 From 2c13c919d9e9a3db9896143a501f83dcbbe1ced4 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 1 Feb 2011 09:48:37 -0500 Subject: sched: Limit the scope of clear_buddies The clear_buddies function does not seem to play well with the concept of hierarchical runqueues. In the following tree, task groups are represented by 'G', tasks by 'T', next by 'n' and last by 'l'. (nl) / \ G(nl) G / \ \ T(l) T(n) T This situation can arise when a task is woken up T(n), and the previously running task T(l) is marked last. When clear_buddies is called from either T(l) or T(n), the next and last buddies of the group G(nl) will be cleared. This is not the desired result, since we would like to be able to find the other type of buddy in many cases. This especially a worry when implementing yield_task_fair through the buddy system. The fix is simple: only clear the buddy type that the task itself is indicated to be. As an added bonus, we stop walking up the tree when the buddy has already been cleared or pointed elsewhere. 
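As a rough user-space model of the walk described above (every toy_* name below is invented for this sketch; it is not the scheduler's code), clearing only the buddy type the entity actually holds, and stopping as soon as a level no longer points at it, leaves the other hint intact:

    #include <stdio.h>

    struct toy_rq;
    struct toy_entity {
        struct toy_rq *my_rq;        /* queue this entity is enqueued on */
        struct toy_entity *parent;   /* owning group entity one level up, or NULL */
    };
    struct toy_rq {
        struct toy_entity *next;     /* "next" buddy hint */
        struct toy_entity *last;     /* "last" buddy hint */
    };

    /* Clear only the "last" hint, walking up while this entity is that hint. */
    static void toy_clear_last(struct toy_entity *se)
    {
        for (; se; se = se->parent) {
            if (se->my_rq->last != se)
                break;               /* hint points elsewhere: stop early */
            se->my_rq->last = NULL;
        }
    }

    int main(void)
    {
        struct toy_rq top = { 0 }, grp = { 0 };
        struct toy_entity g  = { .my_rq = &top, .parent = NULL };
        struct toy_entity tl = { .my_rq = &grp, .parent = &g };
        struct toy_entity tn = { .my_rq = &grp, .parent = &g };

        grp.last = &tl; top.last = &g;   /* T(l) is "last" at both levels */
        grp.next = &tn; top.next = &g;   /* T(n) is "next" at both levels */

        toy_clear_last(&tl);             /* clear only T(l)'s own buddy type */
        printf("next hint survives: %s\n",
               (grp.next == &tn && top.next == &g) ? "yes" : "no");
        return 0;
    }

Running the sketch prints "next hint survives: yes", which is exactly the property the patch below preserves.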
Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra LKML-Reference: <20110201094837.6b0962a9@annuminas.surriel.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4de9905228c4..a785e08202cf 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -995,19 +995,35 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) list_add_leaf_cfs_rq(cfs_rq); } -static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void __clear_buddies_last(struct sched_entity *se) { - if (!se || cfs_rq->last == se) - cfs_rq->last = NULL; + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + if (cfs_rq->last == se) + cfs_rq->last = NULL; + else + break; + } +} - if (!se || cfs_rq->next == se) - cfs_rq->next = NULL; +static void __clear_buddies_next(struct sched_entity *se) +{ + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + if (cfs_rq->next == se) + cfs_rq->next = NULL; + else + break; + } } static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) { - for_each_sched_entity(se) - __clear_buddies(cfs_rq_of(se), se); + if (cfs_rq->last == se) + __clear_buddies_last(se); + + if (cfs_rq->next == se) + __clear_buddies_next(se); } static void -- cgit v1.2.3 From ac53db596cc08ecb8040cfb6f71ae40c6f2041c4 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 1 Feb 2011 09:51:03 -0500 Subject: sched: Use a buddy to implement yield_task_fair() Use the buddy mechanism to implement yield_task_fair. This allows us to skip onto the next highest priority se at every level in the CFS tree, unless doing so would introduce gross unfairness in CPU time distribution. We order the buddy selection in pick_next_entity to check yield first, then last, then next. We need next to be able to override yield, because it is possible for the "next" and "yield" task to be different processen in the same sub-tree of the CFS tree. When they are, we need to go into that sub-tree regardless of the "yield" hint, and pick the correct entity once we get to the right level. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra LKML-Reference: <20110201095103.3a79e92a@annuminas.surriel.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_debug.c | 2 +- kernel/sched_fair.c | 148 ++++++++++++++++++++++++++++++--------------------- kernel/sysctl.c | 7 --- 4 files changed, 90 insertions(+), 69 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 477e1bcc63f9..ae5e1a19b9d6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -324,7 +324,7 @@ struct cfs_rq { * 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). 
*/ - struct sched_entity *curr, *next, *last; + struct sched_entity *curr, *next, *last, *skip; unsigned int nr_spread_over; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index eb6cb8edd075..7bacd83a4158 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -179,7 +179,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) raw_spin_lock_irqsave(&rq->lock, flags); if (cfs_rq->rb_leftmost) - MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; + MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; last = __pick_last_entity(cfs_rq); if (last) max_vruntime = last->vruntime; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a785e08202cf..c0fbeb992833 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -68,14 +68,6 @@ static unsigned int sched_nr_latency = 8; */ unsigned int sysctl_sched_child_runs_first __read_mostly; -/* - * sys_sched_yield() compat mode - * - * This option switches the agressive yield implementation of the - * old scheduler back on. - */ -unsigned int __read_mostly sysctl_sched_compat_yield; - /* * SCHED_OTHER wake-up granularity. * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) @@ -419,7 +411,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) rb_erase(&se->run_node, &cfs_rq->tasks_timeline); } -static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) +static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) { struct rb_node *left = cfs_rq->rb_leftmost; @@ -429,6 +421,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) return rb_entry(left, struct sched_entity, run_node); } +static struct sched_entity *__pick_next_entity(struct sched_entity *se) +{ + struct rb_node *next = rb_next(&se->run_node); + + if (!next) + return NULL; + + return rb_entry(next, struct sched_entity, run_node); +} + +#ifdef CONFIG_SCHED_DEBUG static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); @@ -443,7 +446,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) * Scheduling class statistics methods: */ -#ifdef CONFIG_SCHED_DEBUG int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -1017,6 +1019,17 @@ static void __clear_buddies_next(struct sched_entity *se) } } +static void __clear_buddies_skip(struct sched_entity *se) +{ + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + if (cfs_rq->skip == se) + cfs_rq->skip = NULL; + else + break; + } +} + static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (cfs_rq->last == se) @@ -1024,6 +1037,9 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) if (cfs_rq->next == se) __clear_buddies_next(se); + + if (cfs_rq->skip == se) + __clear_buddies_skip(se); } static void @@ -1099,7 +1115,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) return; if (cfs_rq->nr_running > 1) { - struct sched_entity *se = __pick_next_entity(cfs_rq); + struct sched_entity *se = __pick_first_entity(cfs_rq); s64 delta = curr->vruntime - se->vruntime; if (delta < 0) @@ -1143,13 +1159,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) static int wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); +/* + * Pick the next process, keeping these things in mind, in this order: + * 1) keep things fair between processes/task groups + * 2) pick the 
"next" process, since someone really wants that to run + * 3) pick the "last" process, for cache locality + * 4) do not run the "skip" process, if something else is available + */ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { - struct sched_entity *se = __pick_next_entity(cfs_rq); + struct sched_entity *se = __pick_first_entity(cfs_rq); struct sched_entity *left = se; - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) - se = cfs_rq->next; + /* + * Avoid running the skip buddy, if running something else can + * be done without getting too unfair. + */ + if (cfs_rq->skip == se) { + struct sched_entity *second = __pick_next_entity(se); + if (second && wakeup_preempt_entity(second, left) < 1) + se = second; + } /* * Prefer last buddy, try to return the CPU to a preempted task. @@ -1157,6 +1187,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) se = cfs_rq->last; + /* + * Someone really wants this to run. If it's not unfair, run it. + */ + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) + se = cfs_rq->next; + clear_buddies(cfs_rq, se); return se; @@ -1333,52 +1369,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) hrtick_update(rq); } -/* - * sched_yield() support is very simple - we dequeue and enqueue. - * - * If compat_yield is turned on then we requeue to the end of the tree. - */ -static void yield_task_fair(struct rq *rq) -{ - struct task_struct *curr = rq->curr; - struct cfs_rq *cfs_rq = task_cfs_rq(curr); - struct sched_entity *rightmost, *se = &curr->se; - - /* - * Are we the only task in the tree? - */ - if (unlikely(rq->nr_running == 1)) - return; - - clear_buddies(cfs_rq, se); - - if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { - update_rq_clock(rq); - /* - * Update run-time statistics of the 'current'. - */ - update_curr(cfs_rq); - - return; - } - /* - * Find the rightmost entry in the rbtree: - */ - rightmost = __pick_last_entity(cfs_rq); - /* - * Already in the rightmost position? - */ - if (unlikely(!rightmost || entity_before(rightmost, se))) - return; - - /* - * Minimally necessary key value to be last in the tree: - * Upon rescheduling, sched_class::put_prev_task() will place - * 'current' within the tree based on its new key value. - */ - se->vruntime = rightmost->vruntime + 1; -} - #ifdef CONFIG_SMP static void task_waking_fair(struct rq *rq, struct task_struct *p) @@ -1849,6 +1839,14 @@ static void set_next_buddy(struct sched_entity *se) } } +static void set_skip_buddy(struct sched_entity *se) +{ + if (likely(task_of(se)->policy != SCHED_IDLE)) { + for_each_sched_entity(se) + cfs_rq_of(se)->skip = se; + } +} + /* * Preempt the current task with a newly woken task if needed: */ @@ -1947,6 +1945,36 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) } } +/* + * sched_yield() is very simple + * + * The magic of dealing with the ->skip buddy is in pick_next_entity. + */ +static void yield_task_fair(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); + struct sched_entity *se = &curr->se; + + /* + * Are we the only task in the tree? + */ + if (unlikely(rq->nr_running == 1)) + return; + + clear_buddies(cfs_rq, se); + + if (curr->policy != SCHED_BATCH) { + update_rq_clock(rq); + /* + * Update run-time statistics of the 'current'. 
+ */ + update_curr(cfs_rq); + } + + set_skip_buddy(se); +} + #ifdef CONFIG_SMP /************************************************** * Fair scheduling class load-balancing methods: diff --git a/kernel/sysctl.c b/kernel/sysctl.c index bc86bb32e126..cbfda7e2416e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -360,13 +360,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sched_rt_handler, }, - { - .procname = "sched_compat_yield", - .data = &sysctl_sched_compat_yield, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #ifdef CONFIG_SCHED_AUTOGROUP { .procname = "sched_autogroup_enabled", -- cgit v1.2.3 From d95f412200652694e63e64bfd49f0ae274a54479 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 1 Feb 2011 09:50:51 -0500 Subject: sched: Add yield_to(task, preempt) functionality Currently only implemented for fair class tasks. Add a yield_to_task method() to the fair scheduling class. allowing the caller of yield_to() to accelerate another thread in it's thread group, task group. Implemented via a scheduler hint, using cfs_rq->next to encourage the target being selected. We can rely on pick_next_entity to keep things fair, so noone can accelerate a thread that has already used its fair share of CPU time. This also means callers should only call yield_to when they really mean it. Calling it too often can result in the scheduler just ignoring the hint. Signed-off-by: Rik van Riel Signed-off-by: Marcelo Tosatti Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <20110201095051.4ddb7738@annuminas.surriel.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched_fair.c | 20 +++++++++++++ 2 files changed, 105 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ae5e1a19b9d6..2effcb71a478 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1686,6 +1686,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) __release(rq2->lock); } +#else /* CONFIG_SMP */ + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + BUG_ON(rq1 != rq2); + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + BUG_ON(rq1 != rq2); + raw_spin_unlock(&rq1->lock); + __release(rq2->lock); +} + #endif static void calc_load_account_idle(struct rq *this_rq); @@ -5448,6 +5481,58 @@ void __sched yield(void) } EXPORT_SYMBOL(yield); +/** + * yield_to - yield the current processor to another thread in + * your thread group, or accelerate that thread toward the + * processor it's on. + * + * It's the caller's job to ensure that the target task struct + * can't go away on us before we can do any checks. + * + * Returns true if we indeed boosted the target task. 
+ */ +bool __sched yield_to(struct task_struct *p, bool preempt) +{ + struct task_struct *curr = current; + struct rq *rq, *p_rq; + unsigned long flags; + bool yielded = 0; + + local_irq_save(flags); + rq = this_rq(); + +again: + p_rq = task_rq(p); + double_rq_lock(rq, p_rq); + while (task_rq(p) != p_rq) { + double_rq_unlock(rq, p_rq); + goto again; + } + + if (!curr->sched_class->yield_to_task) + goto out; + + if (curr->sched_class != p->sched_class) + goto out; + + if (task_running(p_rq, p) || p->state) + goto out; + + yielded = curr->sched_class->yield_to_task(rq, p, preempt); + if (yielded) + schedstat_inc(rq, yld_count); + +out: + double_rq_unlock(rq, p_rq); + local_irq_restore(flags); + + if (yielded) + schedule(); + + return yielded; +} +EXPORT_SYMBOL_GPL(yield_to); + /* * This task is about to go to sleep on IO. Increment rq->nr_iowait so * that process accounting knows that this is a task in IO wait state. diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c0fbeb992833..027024694043 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1975,6 +1975,25 @@ static void yield_task_fair(struct rq *rq) set_skip_buddy(se); } +static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt) +{ + struct sched_entity *se = &p->se; + + if (!se->on_rq) + return false; + + /* Tell the scheduler that we'd really like pse to run next. */ + set_next_buddy(se); + + /* Make p's CPU reschedule; pick_next_entity takes care of fairness. */ + if (preempt) + resched_task(rq->curr); + + yield_task_fair(rq); + + return true; +} + #ifdef CONFIG_SMP /************************************************** * Fair scheduling class load-balancing methods: @@ -4243,6 +4262,7 @@ static const struct sched_class fair_sched_class = { .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, + .yield_to_task = yield_to_task_fair, .check_preempt_curr = check_preempt_wakeup, -- cgit v1.2.3 From 76022db323dd6d7c6958df3d595f7dedf7a14778 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 4 Feb 2011 21:51:53 +0900 Subject: tracing/kprobes: Cleanup strict_strtol() using code Since strict_strtol() accepts minus digits started with '-', it doesn't need to invert after converting. Cc: 2nddept-manager@sdl.hitachi.co.jp Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Srikar Dronamraju Cc: Steven Rostedt LKML-Reference: <20110204125153.9507.49335.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Masami Hiramatsu Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/trace_kprobe.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 2dec9bcde8b4..2088893c049e 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -767,16 +767,15 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, } break; case '+': /* deref memory */ + arg++; /* Skip '+', because strict_strtol() rejects it. */ case '-': tmp = strchr(arg, '('); if (!tmp) break; *tmp = '\0'; - ret = strict_strtol(arg + 1, 0, &offset); + ret = strict_strtol(arg, 0, &offset); if (ret) break; - if (arg[0] == '-') - offset = -offset; arg = tmp + 1; tmp = strrchr(arg, ')'); if (tmp) { -- cgit v1.2.3 From e3745369986ddcdaa19f70e2d24e658876b97e84 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 4 Feb 2011 21:51:59 +0900 Subject: tracing/kprobes: Support longer (>128 bytes) command Expand command line buffer of kprobe-tracer to 4096 bytes. 
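To see why 128 bytes was tight: a single probe definition with a handful of named fetch arguments easily exceeds it. Below is a minimal user-space sketch of feeding such a command to kprobe_events; the probe group, event name, symbol and arguments are only an example, and the path assumes debugfs is mounted at /sys/kernel/debug.

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        char cmd[512];
        int fd, i, len;

        /* A probe on do_sys_open with many named arguments; the command
         * grows well past the old 128-byte limit. */
        len = snprintf(cmd, sizeof(cmd), "p:myprobes/open_long do_sys_open");
        for (i = 0; i < 16 && len < (int)sizeof(cmd); i++)
            len += snprintf(cmd + len, sizeof(cmd) - len, " arg%d=$stack%d", i, i);
        if (len < (int)sizeof(cmd))
            len += snprintf(cmd + len, sizeof(cmd) - len, "\n");

        fd = open("/sys/kernel/debug/tracing/kprobe_events", O_WRONLY | O_APPEND);
        if (fd < 0) {
            perror("open kprobe_events");
            return 1;
        }
        /* With WRITE_BUFSIZE at 128 a line this long could not be parsed in
         * one chunk; with 4096 it is accepted in a single write(). */
        if (write(fd, cmd, strlen(cmd)) < 0)
            perror("write probe definition");
        close(fd);
        return 0;
    }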
Reported-by: Arnaldo Carvalho de Melo Cc: 2nddept-manager@sdl.hitachi.co.jp Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Srikar Dronamraju Cc: Steven Rostedt LKML-Reference: <20110204125159.9507.20895.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Masami Hiramatsu Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 2088893c049e..c6ed88660856 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1129,7 +1129,7 @@ static int command_trace_probe(const char *buf) return ret; } -#define WRITE_BUFSIZE 128 +#define WRITE_BUFSIZE 4096 static ssize_t probes_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) -- cgit v1.2.3 From 1ff511e35ed87cc2ebade9e678e4a2fe39b6f9c5 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 4 Feb 2011 21:52:05 +0900 Subject: tracing/kprobes: Add bitfield type Add bitfield type for tracing arguments on kprobe-tracer. The syntax of a bitfield type is: b<bit-width>@<bit-offset>/<container-size> e.g. Accessing a 2-bit-wide field at a 4-bit offset within 32-bit-wide data, 4 bytes away from the address pointed to by the AX register: +4(%ax):b2@4/32 Since the width of the container data depends on the arch, the container-size is given explicitly at the end. Cc: 2nddept-manager@sdl.hitachi.co.jp Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Srikar Dronamraju Cc: Steven Rostedt LKML-Reference: <20110204125205.9507.11363.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Masami Hiramatsu Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/trace_kprobe.c | 104 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 103 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c6ed88660856..ccdc542022c3 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -353,6 +353,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) kfree(data); } +/* Bitfield fetch function */ +struct bitfield_fetch_param { + struct fetch_param orig; + unsigned char hi_shift; + unsigned char low_shift; +}; + +#define DEFINE_FETCH_bitfield(type) \ +static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\ + void *data, void *dest) \ +{ \ + struct bitfield_fetch_param *bprm = data; \ + type buf = 0; \ + call_fetch(&bprm->orig, regs, &buf); \ + if (buf) { \ + buf <<= bprm->hi_shift; \ + buf >>= bprm->low_shift; \ + } \ + *(type *)dest = buf; \ +} +DEFINE_BASIC_FETCH_FUNCS(bitfield) +#define fetch_bitfield_string NULL +#define fetch_bitfield_string_size NULL + +static __kprobes void +free_bitfield_fetch_param(struct bitfield_fetch_param *data) +{ + /* + * Don't check the bitfield itself, because this must be the + * last fetch function. 
+ */ + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) + free_deref_fetch_param(data->orig.data); + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) + free_symbol_cache(data->orig.data); + kfree(data); +} /* Default (unsigned long) fetch type */ #define __DEFAULT_FETCH_TYPE(t) u##t #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) @@ -367,6 +404,7 @@ enum { FETCH_MTD_memory, FETCH_MTD_symbol, FETCH_MTD_deref, + FETCH_MTD_bitfield, FETCH_MTD_END, }; @@ -387,6 +425,7 @@ ASSIGN_FETCH_FUNC(retval, ftype), \ ASSIGN_FETCH_FUNC(memory, ftype), \ ASSIGN_FETCH_FUNC(symbol, ftype), \ ASSIGN_FETCH_FUNC(deref, ftype), \ +ASSIGN_FETCH_FUNC(bitfield, ftype), \ } \ } @@ -430,9 +469,33 @@ static const struct fetch_type *find_fetch_type(const char *type) if (!type) type = DEFAULT_FETCH_TYPE_STR; + /* Special case: bitfield */ + if (*type == 'b') { + unsigned long bs; + type = strchr(type, '/'); + if (!type) + goto fail; + type++; + if (strict_strtoul(type, 0, &bs)) + goto fail; + switch (bs) { + case 8: + return find_fetch_type("u8"); + case 16: + return find_fetch_type("u16"); + case 32: + return find_fetch_type("u32"); + case 64: + return find_fetch_type("u64"); + default: + goto fail; + } + } + for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) if (strcmp(type, fetch_type_table[i].name) == 0) return &fetch_type_table[i]; +fail: return NULL; } @@ -586,7 +649,9 @@ error: static void free_probe_arg(struct probe_arg *arg) { - if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) + if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) + free_bitfield_fetch_param(arg->fetch.data); + else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) free_deref_fetch_param(arg->fetch.data); else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) free_symbol_cache(arg->fetch.data); @@ -806,6 +871,41 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, return ret; } +#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long)) + +/* Bitfield type needs to be parsed into a fetch function */ +static int __parse_bitfield_probe_arg(const char *bf, + const struct fetch_type *t, + struct fetch_param *f) +{ + struct bitfield_fetch_param *bprm; + unsigned long bw, bo; + char *tail; + + if (*bf != 'b') + return 0; + + bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); + if (!bprm) + return -ENOMEM; + bprm->orig = *f; + f->fn = t->fetch[FETCH_MTD_bitfield]; + f->data = (void *)bprm; + + bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */ + if (bw == 0 || *tail != '@') + return -EINVAL; + + bf = tail + 1; + bo = simple_strtoul(bf, &tail, 0); + if (tail == bf || *tail != '/') + return -EINVAL; + + bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo); + bprm->low_shift = bprm->hi_shift + bo; + return (BYTES_TO_BITS(t->size) < (bw + bo)) ? 
-EINVAL : 0; +} + /* String length checking wrapper */ static int parse_probe_arg(char *arg, struct trace_probe *tp, struct probe_arg *parg, int is_return) @@ -835,6 +935,8 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp, parg->offset = tp->size; tp->size += parg->type->size; ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); + if (ret >= 0) + ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); if (ret >= 0) { parg->fetch_size.fn = get_fetch_size_function(parg->type, parg->fetch.fn); -- cgit v1.2.3 From 6d54057d76e25c91165cda0e6e007f1811faa2be Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:33:26 -0500 Subject: tracing/filter: Have no filter return a match The n_preds field of a file can change at any time, and can even become zero, just as the filter is about to be processed by an event. In the case that it is zero on entering the filter, return 1, telling the caller that the event matches and should be traced. Also use a variable and assign it with ACCESS_ONCE() such that the count stays consistent within the function. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 36d40104b17f..7275f0310ed8 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -383,9 +383,14 @@ int filter_match_preds(struct event_filter *filter, void *rec) int match, top = 0, val1 = 0, val2 = 0; int stack[MAX_FILTER_PRED]; struct filter_pred *pred; + int n_preds = ACCESS_ONCE(filter->n_preds); int i; - for (i = 0; i < filter->n_preds; i++) { + /* no filter is considered a match */ + if (!n_preds) + return 1; + + for (i = 0; i < n_preds; i++) { pred = filter->preds[i]; if (!pred->pop_n) { match = pred->fn(pred, rec, val1, val2); -- cgit v1.2.3 From 58d9a597c4275d830a819625e7d437cd6fb23fa5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:37:09 -0500 Subject: tracing/filter: Move OR and AND logic out of fn() method The ops OR and AND act differently from the other ops, as they are the only ones to take other ops as their arguments. These ops also change the logic of filter_match_preds. By removing the OR and AND fn's we can also remove the val1 and val2 that are passed to all the other fn's and are unused. 
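The split this leaves behind -- leaf predicates look only at the record, while the evaluator combines their results -- can be modelled in isolation. The sketch below is an invented toy, not the kernel's filter code; it only mirrors the postfix-plus-stack style of evaluation that filter_match_preds() keeps after this change:

    #include <stdio.h>

    /* Toy record and toy predicates; every name here is made up. */
    struct rec { int pid; int prio; };

    static int pred_pid_gt_100(struct rec *r) { return r->pid > 100; }
    static int pred_prio_lt_5(struct rec *r)  { return r->prio < 5; }

    enum op { OP_LEAF, OP_AND, OP_OR };

    struct node {
        enum op op;
        int (*fn)(struct rec *r);    /* only used for OP_LEAF */
    };

    /* Evaluate a postfix (RPN) program: leaves push their result,
     * AND/OR pop two results and push the combination -- the evaluator,
     * not the predicate callback, owns the boolean logic. */
    static int eval_postfix(struct node *prog, int n, struct rec *r)
    {
        int stack[32], top = 0, i, a, b;  /* bounds checks omitted for brevity */

        for (i = 0; i < n; i++) {
            if (prog[i].op == OP_LEAF) {
                stack[top++] = prog[i].fn(r);
                continue;
            }
            b = stack[--top];
            a = stack[--top];
            stack[top++] = (prog[i].op == OP_AND) ? (a && b) : (a || b);
        }
        return top ? stack[top - 1] : 1;  /* empty program counts as a match */
    }

    int main(void)
    {
        /* pid > 100 && prio < 5, written in postfix order */
        struct node prog[] = {
            { OP_LEAF, pred_pid_gt_100 },
            { OP_LEAF, pred_prio_lt_5 },
            { OP_AND, NULL },
        };
        struct rec r = { .pid = 200, .prio = 2 };

        printf("match: %d\n", eval_postfix(prog, 3, &r));
        return 0;
    }

Because AND/OR live in the evaluator, the predicate callbacks need no val1/val2 arguments at all, which is precisely what the patch removes.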
Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 3 +-- kernel/trace/trace_events_filter.c | 51 ++++++++++++++------------------------ 2 files changed, 20 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9021f8c0c0c3..1597bc0749c1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -677,8 +677,7 @@ struct event_subsystem { struct filter_pred; struct regex; -typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, - int val1, int val2); +typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); typedef int (*regex_match_func)(char *str, struct regex *r, int len); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 7275f0310ed8..5d719b340a2b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -124,8 +124,7 @@ struct filter_parse_state { }; #define DEFINE_COMPARISON_PRED(type) \ -static int filter_pred_##type(struct filter_pred *pred, void *event, \ - int val1, int val2) \ +static int filter_pred_##type(struct filter_pred *pred, void *event) \ { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ @@ -152,8 +151,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event, \ } #define DEFINE_EQUALITY_PRED(size) \ -static int filter_pred_##size(struct filter_pred *pred, void *event, \ - int val1, int val2) \ +static int filter_pred_##size(struct filter_pred *pred, void *event) \ { \ u##size *addr = (u##size *)(event + pred->offset); \ u##size val = (u##size)pred->val; \ @@ -178,23 +176,8 @@ DEFINE_EQUALITY_PRED(32); DEFINE_EQUALITY_PRED(16); DEFINE_EQUALITY_PRED(8); -static int filter_pred_and(struct filter_pred *pred __attribute((unused)), - void *event __attribute((unused)), - int val1, int val2) -{ - return val1 && val2; -} - -static int filter_pred_or(struct filter_pred *pred __attribute((unused)), - void *event __attribute((unused)), - int val1, int val2) -{ - return val1 || val2; -} - /* Filter predicate for fixed sized arrays of characters */ -static int filter_pred_string(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_string(struct filter_pred *pred, void *event) { char *addr = (char *)(event + pred->offset); int cmp, match; @@ -207,8 +190,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event, } /* Filter predicate for char * pointers */ -static int filter_pred_pchar(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_pchar(struct filter_pred *pred, void *event) { char **addr = (char **)(event + pred->offset); int cmp, match; @@ -231,8 +213,7 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event, * and add it to the address of the entry, and at last we have * the address of the string. 
*/ -static int filter_pred_strloc(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_strloc(struct filter_pred *pred, void *event) { u32 str_item = *(u32 *)(event + pred->offset); int str_loc = str_item & 0xffff; @@ -247,8 +228,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event, return match; } -static int filter_pred_none(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_none(struct filter_pred *pred, void *event) { return 0; } @@ -380,7 +360,7 @@ static void filter_build_regex(struct filter_pred *pred) /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { - int match, top = 0, val1 = 0, val2 = 0; + int match = -1, top = 0, val1 = 0, val2 = 0; int stack[MAX_FILTER_PRED]; struct filter_pred *pred; int n_preds = ACCESS_ONCE(filter->n_preds); @@ -393,7 +373,7 @@ int filter_match_preds(struct event_filter *filter, void *rec) for (i = 0; i < n_preds; i++) { pred = filter->preds[i]; if (!pred->pop_n) { - match = pred->fn(pred, rec, val1, val2); + match = pred->fn(pred, rec); stack[top++] = match; continue; } @@ -403,7 +383,16 @@ int filter_match_preds(struct event_filter *filter, void *rec) } val1 = stack[--top]; val2 = stack[--top]; - match = pred->fn(pred, rec, val1, val2); + switch (pred->op) { + case OP_AND: + match = val1 && val2; + break; + case OP_OR: + match = val1 || val2; + break; + default: + WARN_ONCE(1, "filter op is not AND or OR"); + } stack[top++] = match; } @@ -775,15 +764,13 @@ static int filter_add_pred(struct filter_parse_state *ps, unsigned long long val; int ret; - pred->fn = filter_pred_none; + fn = pred->fn = filter_pred_none; if (pred->op == OP_AND) { pred->pop_n = 2; - fn = filter_pred_and; goto add_pred_fn; } else if (pred->op == OP_OR) { pred->pop_n = 2; - fn = filter_pred_or; goto add_pred_fn; } -- cgit v1.2.3 From c9c53ca03d6f97fdd9832d5ed3f15b30ee5cdb86 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:42:43 -0500 Subject: tracing/filter: Dynamically allocate preds For every filter that is made, we create predicates to hold every operation within the filter. We have a max of 32 predicates that we can hold. Currently, we allocate all 32 even if we only need to use one. Part of the reason we do this is that the filter can be used at any moment by any event. Fortunately, the filter is only used with preemption disabled. By reseting the count of preds used "n_preds" to zero, then performing a synchronize_sched(), we can safely free and reallocate a new array of preds. 
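Condensed to its essentials, the lifecycle the changelog relies on looks like the kernel-style sketch below. This is a simplified illustration, not the patch itself: the toy_* names are invented and the structure is trimmed to the two fields that matter here.

    #include <linux/slab.h>
    #include <linux/rcupdate.h>
    #include <linux/errno.h>

    struct toy_pred {
        int (*fn)(struct toy_pred *pred, void *rec);
        /* ... per-predicate match data ... */
    };

    struct toy_filter {
        int n_preds;             /* readers walk preds[0..n_preds) */
        struct toy_pred *preds;  /* only dereferenced with preemption off */
    };

    static int toy_resize_preds(struct toy_filter *filter, int n_new)
    {
        struct toy_pred *new_preds;

        /* Hide the old array from readers ... */
        filter->n_preds = 0;
        /* ... wait for every reader that may still be walking it
         * (readers run with preemption disabled) ... */
        synchronize_sched();
        /* ... and only then free and replace it. */
        kfree(filter->preds);

        new_preds = kzalloc(sizeof(*new_preds) * n_new, GFP_KERNEL);
        if (!new_preds)
            return -ENOMEM;
        filter->preds = new_preds;

        /* The caller fills the new array and only then publishes a
         * non-zero n_preds, so readers never see half-built preds. */
        return 0;
    }

A reader that observes n_preds as zero returns early (the "no filter is a match" rule added earlier), so it can never chase a preds pointer that is about to be freed.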
Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 3 +- kernel/trace/trace_events_filter.c | 143 ++++++++++++++++++++++++++++--------- 2 files changed, 110 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1597bc0749c1..441fc1bc85d6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -661,7 +661,8 @@ struct ftrace_event_field { }; struct event_filter { - int n_preds; + int n_preds; /* Number assigned */ + int a_preds; /* allocated */ struct filter_pred **preds; char *filter_string; }; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 5d719b340a2b..aac6a6183e6a 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -362,6 +362,7 @@ int filter_match_preds(struct event_filter *filter, void *rec) { int match = -1, top = 0, val1 = 0, val2 = 0; int stack[MAX_FILTER_PRED]; + struct filter_pred **preds; struct filter_pred *pred; int n_preds = ACCESS_ONCE(filter->n_preds); int i; @@ -370,8 +371,13 @@ int filter_match_preds(struct event_filter *filter, void *rec) if (!n_preds) return 1; + /* + * n_preds and filter->preds is protect with preemption disabled. + */ + preds = rcu_dereference_sched(filter->preds); + for (i = 0; i < n_preds; i++) { - pred = filter->preds[i]; + pred = preds[i]; if (!pred->pop_n) { match = pred->fn(pred, rec); stack[top++] = match; @@ -548,46 +554,55 @@ static int filter_set_pred(struct filter_pred *dest, return 0; } +static void __free_preds(struct event_filter *filter) +{ + int i; + + if (filter->preds) { + for (i = 0; i < filter->a_preds; i++) { + if (filter->preds[i]) + filter_free_pred(filter->preds[i]); + } + kfree(filter->preds); + filter->preds = NULL; + } + filter->a_preds = 0; + filter->n_preds = 0; +} + static void filter_disable_preds(struct ftrace_event_call *call) { struct event_filter *filter = call->filter; int i; call->flags &= ~TRACE_EVENT_FL_FILTERED; + if (filter->preds) { + for (i = 0; i < filter->n_preds; i++) + filter->preds[i]->fn = filter_pred_none; + } filter->n_preds = 0; - - for (i = 0; i < MAX_FILTER_PRED; i++) - filter->preds[i]->fn = filter_pred_none; } -static void __free_preds(struct event_filter *filter) +static void __free_filter(struct event_filter *filter) { - int i; - if (!filter) return; - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (filter->preds[i]) - filter_free_pred(filter->preds[i]); - } - kfree(filter->preds); + __free_preds(filter); kfree(filter->filter_string); kfree(filter); } void destroy_preds(struct ftrace_event_call *call) { - __free_preds(call->filter); + __free_filter(call->filter); call->filter = NULL; call->flags &= ~TRACE_EVENT_FL_FILTERED; } -static struct event_filter *__alloc_preds(void) +static struct event_filter *__alloc_filter(void) { struct event_filter *filter; - struct filter_pred *pred; - int i; filter = kzalloc(sizeof(*filter), GFP_KERNEL); if (!filter) @@ -595,32 +610,63 @@ static struct event_filter *__alloc_preds(void) filter->n_preds = 0; - filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); + return filter; +} + +static int __alloc_preds(struct event_filter *filter, int n_preds) +{ + struct filter_pred *pred; + int i; + + if (filter->preds) { + if (filter->a_preds < n_preds) { + /* We need to reallocate */ + filter->n_preds = 0; + /* + * It is possible that the filter is currently + * being used. We need to zero out the number + * of preds, wait on preemption and then free + * the preds. 
+ */ + synchronize_sched(); + __free_preds(filter); + } + } + + if (!filter->preds) { + filter->preds = + kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); + filter->a_preds = n_preds; + } if (!filter->preds) - goto oom; + return -ENOMEM; + + if (WARN_ON(filter->a_preds < n_preds)) + return -EINVAL; - for (i = 0; i < MAX_FILTER_PRED; i++) { - pred = kzalloc(sizeof(*pred), GFP_KERNEL); + for (i = 0; i < n_preds; i++) { + pred = filter->preds[i]; + if (!pred) + pred = kzalloc(sizeof(*pred), GFP_KERNEL); if (!pred) goto oom; pred->fn = filter_pred_none; filter->preds[i] = pred; } - return filter; - -oom: + return 0; + oom: __free_preds(filter); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } -static int init_preds(struct ftrace_event_call *call) +static int init_filter(struct ftrace_event_call *call) { if (call->filter) return 0; call->flags &= ~TRACE_EVENT_FL_FILTERED; - call->filter = __alloc_preds(); + call->filter = __alloc_filter(); if (IS_ERR(call->filter)) return PTR_ERR(call->filter); @@ -636,7 +682,7 @@ static int init_subsystem_preds(struct event_subsystem *system) if (strcmp(call->class->system, system->name) != 0) continue; - err = init_preds(call); + err = init_filter(call); if (err) return err; } @@ -665,7 +711,7 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, { int idx, err; - if (filter->n_preds == MAX_FILTER_PRED) { + if (WARN_ON(filter->n_preds == filter->a_preds)) { parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); return -ENOSPC; } @@ -1179,6 +1225,20 @@ static int check_preds(struct filter_parse_state *ps) return 0; } +static int count_preds(struct filter_parse_state *ps) +{ + struct postfix_elt *elt; + int n_preds = 0; + + list_for_each_entry(elt, &ps->postfix, list) { + if (elt->op == OP_NONE) + continue; + n_preds++; + } + + return n_preds; +} + static int replace_preds(struct ftrace_event_call *call, struct event_filter *filter, struct filter_parse_state *ps, @@ -1191,10 +1251,23 @@ static int replace_preds(struct ftrace_event_call *call, int err; int n_preds = 0; + n_preds = count_preds(ps); + if (n_preds >= MAX_FILTER_PRED) { + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); + return -ENOSPC; + } + err = check_preds(ps); if (err) return err; + if (!dry_run) { + err = __alloc_preds(filter, n_preds); + if (err) + return err; + } + + n_preds = 0; list_for_each_entry(elt, &ps->postfix, list) { if (elt->op == OP_NONE) { if (!operand1) @@ -1208,7 +1281,7 @@ static int replace_preds(struct ftrace_event_call *call, continue; } - if (n_preds++ == MAX_FILTER_PRED) { + if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); return -ENOSPC; } @@ -1283,7 +1356,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) mutex_lock(&event_mutex); - err = init_preds(call); + err = init_filter(call); if (err) goto out_unlock; @@ -1376,7 +1449,7 @@ void ftrace_profile_free_filter(struct perf_event *event) struct event_filter *filter = event->filter; event->filter = NULL; - __free_preds(filter); + __free_filter(filter); } int ftrace_profile_set_filter(struct perf_event *event, int event_id, @@ -1402,7 +1475,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, if (event->filter) goto out_unlock; - filter = __alloc_preds(); + filter = __alloc_filter(); if (IS_ERR(filter)) { err = PTR_ERR(filter); goto out_unlock; @@ -1411,7 +1484,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, err = -ENOMEM; ps = kzalloc(sizeof(*ps), GFP_KERNEL); if (!ps) - goto free_preds; + goto 
free_filter; parse_init(ps, filter_ops, filter_str); err = filter_parse(ps); @@ -1427,9 +1500,9 @@ free_ps: postfix_clear(ps); kfree(ps); -free_preds: +free_filter: if (err) - __free_preds(filter); + __free_filter(filter); out_unlock: mutex_unlock(&event_mutex); -- cgit v1.2.3 From 0fc3ca9a10a61a77f18710fb708b41fd99c79a56 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:46:46 -0500 Subject: tracing/filter: Call synchronize_sched() just once for system filters By separating out the reseting of the filter->n_preds to zero from the reallocation of preds for the filter, we can reset groups of filters first, call synchronize_sched() just once, and then reallocate each of the filters in the system group. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 80 ++++++++++++++++++++++++++++++-------- 1 file changed, 63 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index aac6a6183e6a..8f00a11ce778 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -570,17 +570,28 @@ static void __free_preds(struct event_filter *filter) filter->n_preds = 0; } +static void reset_preds(struct event_filter *filter) +{ + struct filter_pred *pred; + int n_preds = filter->n_preds; + int i; + + filter->n_preds = 0; + if (!filter->preds) + return; + + for (i = 0; i < n_preds; i++) { + pred = filter->preds[i]; + pred->fn = filter_pred_none; + } +} + static void filter_disable_preds(struct ftrace_event_call *call) { struct event_filter *filter = call->filter; - int i; call->flags &= ~TRACE_EVENT_FL_FILTERED; - if (filter->preds) { - for (i = 0; i < filter->n_preds; i++) - filter->preds[i]->fn = filter_pred_none; - } - filter->n_preds = 0; + reset_preds(filter); } static void __free_filter(struct event_filter *filter) @@ -620,15 +631,17 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) if (filter->preds) { if (filter->a_preds < n_preds) { - /* We need to reallocate */ - filter->n_preds = 0; /* - * It is possible that the filter is currently - * being used. We need to zero out the number - * of preds, wait on preemption and then free - * the preds. + * We need to reallocate. + * We should have already have zeroed out + * the pred count and called synchronized_sched() + * to make sure no one is using the preds. */ - synchronize_sched(); + if (WARN_ON_ONCE(filter->n_preds)) { + /* We need to reset it now */ + filter->n_preds = 0; + synchronize_sched(); + } __free_preds(filter); } } @@ -1328,6 +1341,30 @@ static int replace_system_preds(struct event_subsystem *system, /* try to see if the filter can be applied */ err = replace_preds(call, filter, ps, filter_string, true); if (err) + goto fail; + } + + /* set all filter pred counts to zero */ + list_for_each_entry(call, &ftrace_events, list) { + struct event_filter *filter = call->filter; + + if (strcmp(call->class->system, system->name) != 0) + continue; + + reset_preds(filter); + } + + /* + * Since some of the preds may be used under preemption + * we need to wait for them to finish before we may + * reallocate them. 
+ */ + synchronize_sched(); + + list_for_each_entry(call, &ftrace_events, list) { + struct event_filter *filter = call->filter; + + if (strcmp(call->class->system, system->name) != 0) continue; /* really apply the filter */ @@ -1342,11 +1379,13 @@ static int replace_system_preds(struct event_subsystem *system, fail = false; } - if (fail) { - parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - return -EINVAL; - } + if (fail) + goto fail; + return 0; + fail: + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + return -EINVAL; } int apply_event_filter(struct ftrace_event_call *call, char *filter_string) @@ -1381,6 +1420,13 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) goto out; } + /* + * Make sure all the pred counts are zero so that + * no task is using it when we reallocate the preds array. + */ + reset_preds(call->filter); + synchronize_sched(); + err = replace_preds(call, call->filter, ps, filter_string, false); if (err) append_filter_err(ps, call->filter); -- cgit v1.2.3 From 74e9e58c350a24139e268dd6857bbaa55c5aafcf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:49:48 -0500 Subject: tracing/filter: Allocate the preds in an array Currently we allocate an array of pointers to filter_preds, and then allocate a separate filter_pred for each item in the array. This adds slight overhead in the filters as it needs to derefernce twice to get to the op condition. Allocating the preds themselves in a single array removes a dereference as well as helps on the cache footprint. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 2 +- kernel/trace/trace_events_filter.c | 31 +++++++++---------------------- 2 files changed, 10 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 441fc1bc85d6..254d04a84ec3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -663,7 +663,7 @@ struct ftrace_event_field { struct event_filter { int n_preds; /* Number assigned */ int a_preds; /* allocated */ - struct filter_pred **preds; + struct filter_pred *preds; char *filter_string; }; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 8f00a11ce778..b6c910642a1e 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -362,7 +362,7 @@ int filter_match_preds(struct event_filter *filter, void *rec) { int match = -1, top = 0, val1 = 0, val2 = 0; int stack[MAX_FILTER_PRED]; - struct filter_pred **preds; + struct filter_pred *preds; struct filter_pred *pred; int n_preds = ACCESS_ONCE(filter->n_preds); int i; @@ -377,7 +377,7 @@ int filter_match_preds(struct event_filter *filter, void *rec) preds = rcu_dereference_sched(filter->preds); for (i = 0; i < n_preds; i++) { - pred = preds[i]; + pred = &preds[i]; if (!pred->pop_n) { match = pred->fn(pred, rec); stack[top++] = match; @@ -559,10 +559,8 @@ static void __free_preds(struct event_filter *filter) int i; if (filter->preds) { - for (i = 0; i < filter->a_preds; i++) { - if (filter->preds[i]) - filter_free_pred(filter->preds[i]); - } + for (i = 0; i < filter->a_preds; i++) + kfree(filter->preds[i].field_name); kfree(filter->preds); filter->preds = NULL; } @@ -572,7 +570,6 @@ static void __free_preds(struct event_filter *filter) static void reset_preds(struct event_filter *filter) { - struct filter_pred *pred; int n_preds = filter->n_preds; int i; @@ -580,10 +577,8 @@ static void reset_preds(struct event_filter *filter) if (!filter->preds) return; - for (i = 
0; i < n_preds; i++) { - pred = filter->preds[i]; - pred->fn = filter_pred_none; - } + for (i = 0; i < n_preds; i++) + filter->preds[i].fn = filter_pred_none; } static void filter_disable_preds(struct ftrace_event_call *call) @@ -658,19 +653,11 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) return -EINVAL; for (i = 0; i < n_preds; i++) { - pred = filter->preds[i]; - if (!pred) - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) - goto oom; + pred = &filter->preds[i]; pred->fn = filter_pred_none; - filter->preds[i] = pred; } return 0; - oom: - __free_preds(filter); - return -ENOMEM; } static int init_filter(struct ftrace_event_call *call) @@ -730,8 +717,8 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, } idx = filter->n_preds; - filter_clear_pred(filter->preds[idx]); - err = filter_set_pred(filter->preds[idx], pred, fn); + filter_clear_pred(&filter->preds[idx]); + err = filter_set_pred(&filter->preds[idx], pred, fn); if (err) return err; -- cgit v1.2.3 From f76690afd05e3e163149310bdcd30234f93b3a7a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:53:06 -0500 Subject: tracing/filter: Free pred array on disabling of filter When a filter is disabled, free the preds. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index b6c910642a1e..2f5458e244a3 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1388,6 +1388,10 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) if (!strcmp(strstrip(filter_string), "0")) { filter_disable_preds(call); + reset_preds(call->filter); + /* Make sure the filter is not being used */ + synchronize_sched(); + __free_preds(call->filter); remove_filter_string(call->filter); goto out_unlock; } -- cgit v1.2.3 From 61e9dea20e1ada886cc49a9ec6fe3c6ac0de7324 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:54:33 -0500 Subject: tracing/filter: Use a tree instead of stack for filter_match_preds() Currently the filter_match_preds() requires a stack to push and pop the preds to determine if the filter matches the record or not. This has two drawbacks: 1) It requires a stack to store state information. As this is done in fast paths we can't allocate the storage for this stack, and we can't use a global as it must be re-entrant. The stack is stored on the kernel stack and this greatly limits how many preds we may allow. 2) All conditions are calculated even when a short circuit exists. a || b will always calculate a and b even though a was determined to be true. Using a tree we can walk a constant structure that will save the state as we go. The algorithm is simply: pred = root; do { switch (move) { case MOVE_DOWN: if (OR or AND) { pred = left; continue; } if (pred == root) break; match = pred->fn(); pred = pred->parent; move = left child ? MOVE_UP_FROM_LEFT : MOVE_UP_FROM_RIGHT; continue; case MOVE_UP_FROM_LEFT: /* Only OR or AND can be a parent */ if (match && OR || !match && AND) { /* short circuit */ if (pred == root) break; pred = pred->parent; move = left child ? MOVE_UP_FROM_LEFT : MOVE_UP_FROM_RIGHT; continue; } pred = pred->right; move = MOVE_DOWN; continue; case MOVE_UP_FROM_RIGHT: if (pred == root) break; pred = pred->parent; move = left child ? 
MOVE_UP_FROM_LEFT : MOVE_UP_FROM_RIGHT; continue; } done = 1; } while (!done); This way there's no strict limit to how many preds we allow and it also will short circuit the logical operations when possible. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 9 +- kernel/trace/trace_events_filter.c | 231 +++++++++++++++++++++++++++++-------- 2 files changed, 194 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 254d04a84ec3..bba34a72c780 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -664,6 +664,7 @@ struct event_filter { int n_preds; /* Number assigned */ int a_preds; /* allocated */ struct filter_pred *preds; + struct filter_pred *root; char *filter_string; }; @@ -675,6 +676,9 @@ struct event_subsystem { int nr_events; }; +#define FILTER_PRED_INVALID ((unsigned short)-1) +#define FILTER_PRED_IS_RIGHT (1 << 15) + struct filter_pred; struct regex; @@ -704,7 +708,10 @@ struct filter_pred { int offset; int not; int op; - int pop_n; + unsigned short index; + unsigned short parent; + unsigned short left; + unsigned short right; }; extern struct list_head ftrace_common_fields; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 2f5458e244a3..10390491b6d0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -123,6 +123,11 @@ struct filter_parse_state { } operand; }; +struct pred_stack { + struct filter_pred **preds; + int index; +}; + #define DEFINE_COMPARISON_PRED(type) \ static int filter_pred_##type(struct filter_pred *pred, void *event) \ { \ @@ -357,52 +362,95 @@ static void filter_build_regex(struct filter_pred *pred) pred->not ^= not; } +enum move_type { + MOVE_DOWN, + MOVE_UP_FROM_LEFT, + MOVE_UP_FROM_RIGHT +}; + +static struct filter_pred * +get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, + int index, enum move_type *move) +{ + if (pred->parent & FILTER_PRED_IS_RIGHT) + *move = MOVE_UP_FROM_RIGHT; + else + *move = MOVE_UP_FROM_LEFT; + pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT]; + + return pred; +} + /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { - int match = -1, top = 0, val1 = 0, val2 = 0; - int stack[MAX_FILTER_PRED]; + int match = -1; + enum move_type move = MOVE_DOWN; struct filter_pred *preds; struct filter_pred *pred; + struct filter_pred *root; int n_preds = ACCESS_ONCE(filter->n_preds); - int i; + int done = 0; /* no filter is considered a match */ if (!n_preds) return 1; /* - * n_preds and filter->preds is protect with preemption disabled. + * n_preds, root and filter->preds are protect with preemption disabled. 
*/ preds = rcu_dereference_sched(filter->preds); + root = rcu_dereference_sched(filter->root); + if (!root) + return 1; - for (i = 0; i < n_preds; i++) { - pred = &preds[i]; - if (!pred->pop_n) { + pred = root; + + /* match is currently meaningless */ + match = -1; + + do { + switch (move) { + case MOVE_DOWN: + /* only AND and OR have children */ + if (pred->left != FILTER_PRED_INVALID) { + /* keep going to leaf node */ + pred = &preds[pred->left]; + continue; + } match = pred->fn(pred, rec); - stack[top++] = match; + /* If this pred is the only pred */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + /* Check for short circuits */ + if ((match && pred->op == OP_OR) || + (!match && pred->op == OP_AND)) { + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + /* now go down the right side of the tree. */ + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + /* We finished this equation. */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); continue; } - if (pred->pop_n > top) { - WARN_ON_ONCE(1); - return 0; - } - val1 = stack[--top]; - val2 = stack[--top]; - switch (pred->op) { - case OP_AND: - match = val1 && val2; - break; - case OP_OR: - match = val1 || val2; - break; - default: - WARN_ONCE(1, "filter op is not AND or OR"); - } - stack[top++] = match; - } + done = 1; + } while (!done); - return stack[--top]; + return match; } EXPORT_SYMBOL_GPL(filter_match_preds); @@ -539,10 +587,58 @@ static void filter_clear_pred(struct filter_pred *pred) pred->regex.len = 0; } -static int filter_set_pred(struct filter_pred *dest, +static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) +{ + stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); + if (!stack->preds) + return -ENOMEM; + stack->index = n_preds; + return 0; +} + +static void __free_pred_stack(struct pred_stack *stack) +{ + kfree(stack->preds); + stack->index = 0; +} + +static int __push_pred_stack(struct pred_stack *stack, + struct filter_pred *pred) +{ + int index = stack->index; + + if (WARN_ON(index == 0)) + return -ENOSPC; + + stack->preds[--index] = pred; + stack->index = index; + return 0; +} + +static struct filter_pred * +__pop_pred_stack(struct pred_stack *stack) +{ + struct filter_pred *pred; + int index = stack->index; + + pred = stack->preds[index++]; + if (!pred) + return NULL; + + stack->index = index; + return pred; +} + +static int filter_set_pred(struct event_filter *filter, + int idx, + struct pred_stack *stack, struct filter_pred *src, filter_pred_fn_t fn) { + struct filter_pred *dest = &filter->preds[idx]; + struct filter_pred *left; + struct filter_pred *right; + *dest = *src; if (src->field_name) { dest->field_name = kstrdup(src->field_name, GFP_KERNEL); @@ -550,8 +646,25 @@ static int filter_set_pred(struct filter_pred *dest, return -ENOMEM; } dest->fn = fn; + dest->index = idx; - return 0; + if (dest->op == OP_OR || dest->op == OP_AND) { + right = __pop_pred_stack(stack); + left = __pop_pred_stack(stack); + if (!left || !right) + return -EINVAL; + dest->left = left->index; + dest->right = right->index; + left->parent = dest->index; + right->parent = dest->index | FILTER_PRED_IS_RIGHT; + } else + /* + * Make dest->left invalid to be used as a quick + * way to know this is a leaf node. 
+ */ + dest->left = FILTER_PRED_INVALID; + + return __push_pred_stack(stack, dest); } static void __free_preds(struct event_filter *filter) @@ -574,6 +687,7 @@ static void reset_preds(struct event_filter *filter) int i; filter->n_preds = 0; + filter->root = NULL; if (!filter->preds) return; @@ -707,6 +821,7 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, struct ftrace_event_call *call, struct event_filter *filter, struct filter_pred *pred, + struct pred_stack *stack, filter_pred_fn_t fn) { int idx, err; @@ -718,7 +833,7 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, idx = filter->n_preds; filter_clear_pred(&filter->preds[idx]); - err = filter_set_pred(&filter->preds[idx], pred, fn); + err = filter_set_pred(filter, idx, stack, pred, fn); if (err) return err; @@ -803,6 +918,7 @@ static int filter_add_pred(struct filter_parse_state *ps, struct ftrace_event_call *call, struct event_filter *filter, struct filter_pred *pred, + struct pred_stack *stack, bool dry_run) { struct ftrace_event_field *field; @@ -812,13 +928,10 @@ static int filter_add_pred(struct filter_parse_state *ps, fn = pred->fn = filter_pred_none; - if (pred->op == OP_AND) { - pred->pop_n = 2; + if (pred->op == OP_AND) goto add_pred_fn; - } else if (pred->op == OP_OR) { - pred->pop_n = 2; + else if (pred->op == OP_OR) goto add_pred_fn; - } field = find_event_field(call, pred->field_name); if (!field) { @@ -867,7 +980,7 @@ static int filter_add_pred(struct filter_parse_state *ps, add_pred_fn: if (!dry_run) - return filter_add_pred_fn(ps, call, filter, pred, fn); + return filter_add_pred_fn(ps, call, filter, pred, stack, fn); return 0; } @@ -1248,6 +1361,7 @@ static int replace_preds(struct ftrace_event_call *call, char *operand1 = NULL, *operand2 = NULL; struct filter_pred *pred; struct postfix_elt *elt; + struct pred_stack stack = { }; /* init to NULL */ int err; int n_preds = 0; @@ -1262,9 +1376,12 @@ static int replace_preds(struct ftrace_event_call *call, return err; if (!dry_run) { - err = __alloc_preds(filter, n_preds); + err = __alloc_pred_stack(&stack, n_preds); if (err) return err; + err = __alloc_preds(filter, n_preds); + if (err) + goto fail; } n_preds = 0; @@ -1276,14 +1393,16 @@ static int replace_preds(struct ftrace_event_call *call, operand2 = elt->operand; else { parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); - return -EINVAL; + err = -EINVAL; + goto fail; } continue; } if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); - return -ENOSPC; + err = -ENOSPC; + goto fail; } if (elt->op == OP_AND || elt->op == OP_OR) { @@ -1293,22 +1412,44 @@ static int replace_preds(struct ftrace_event_call *call, if (!operand1 || !operand2) { parse_error(ps, FILT_ERR_MISSING_FIELD, 0); - return -EINVAL; + err = -EINVAL; + goto fail; } pred = create_pred(elt->op, operand1, operand2); add_pred: - if (!pred) - return -ENOMEM; - err = filter_add_pred(ps, call, filter, pred, dry_run); + if (!pred) { + err = -ENOMEM; + goto fail; + } + err = filter_add_pred(ps, call, filter, pred, &stack, dry_run); filter_free_pred(pred); if (err) - return err; + goto fail; operand1 = operand2 = NULL; } - return 0; + if (!dry_run) { + /* We should have one item left on the stack */ + pred = __pop_pred_stack(&stack); + if (!pred) + return -EINVAL; + /* This item is where we start from in matching */ + filter->root = pred; + /* Make sure the stack is empty */ + pred = __pop_pred_stack(&stack); + if (WARN_ON(pred)) { + err = -EINVAL; + filter->root = NULL; + goto fail; + } + } + + err = 0; 
+fail: + __free_pred_stack(&stack); + return err; } static int replace_system_preds(struct event_subsystem *system, -- cgit v1.2.3 From 55719274188f13cff9e3bd11fdd4c0e7617cd03d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 23:12:05 -0500 Subject: tracing/filter: Optimize short ciruit check The test if we should break out early for OR and AND operations can be optimized by comparing the current result with (pred->op == OP_OR) That is if the result is true and the op is an OP_OR, or if the result is false and the op is not an OP_OR (thus an OP_AND) we can break out early in either case. Otherwise we continue processing. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 10390491b6d0..0a3e0502b507 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -426,9 +426,15 @@ int filter_match_preds(struct event_filter *filter, void *rec) pred->parent, &move); continue; case MOVE_UP_FROM_LEFT: - /* Check for short circuits */ - if ((match && pred->op == OP_OR) || - (!match && pred->op == OP_AND)) { + /* + * Check for short circuits. + * + * Optimization: !!match == (pred->op == OP_OR) + * is the same as: + * if ((match && pred->op == OP_OR) || + * (!match && pred->op == OP_AND)) + */ + if (!!match == (pred->op == OP_OR)) { if (pred == root) break; pred = get_pred_parent(pred, preds, -- cgit v1.2.3 From ec126cac23945de12eb2d103374e1f7ee97c5595 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 23:14:25 -0500 Subject: tracing/filter: Check the created pred tree Since the filter walks a tree to determine if a match is made or not, if the tree was incorrectly created, it could cause an infinite loop. Add a check to walk the entire tree before assigning it as a filter to make sure the tree is correct. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 72 +++++++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 0a3e0502b507..91c9cdcb040b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1358,6 +1358,68 @@ static int count_preds(struct filter_parse_state *ps) return n_preds; } +/* + * The tree is walked at filtering of an event. If the tree is not correctly + * built, it may cause an infinite loop. Check here that the tree does + * indeed terminate. + */ +static int check_pred_tree(struct event_filter *filter, + struct filter_pred *root) +{ + struct filter_pred *preds; + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int count = 0; + int done = 0; + int max; + + /* + * The max that we can hit a node is three times. + * Once going down, once coming up from left, and + * once coming up from right. This is more than enough + * since leafs are only hit a single time. 
+ */ + max = 3 * filter->n_preds; + + preds = filter->preds; + if (!preds) + return -EINVAL; + pred = root; + + do { + if (WARN_ON(count++ > max)) + return -EINVAL; + + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + /* A leaf at the root is just a leaf in the tree */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + /* We are fine. */ + return 0; +} + static int replace_preds(struct ftrace_event_call *call, struct event_filter *filter, struct filter_parse_state *ps, @@ -1366,6 +1428,7 @@ static int replace_preds(struct ftrace_event_call *call, { char *operand1 = NULL, *operand2 = NULL; struct filter_pred *pred; + struct filter_pred *root; struct postfix_elt *elt; struct pred_stack stack = { }; /* init to NULL */ int err; @@ -1442,7 +1505,7 @@ add_pred: if (!pred) return -EINVAL; /* This item is where we start from in matching */ - filter->root = pred; + root = pred; /* Make sure the stack is empty */ pred = __pop_pred_stack(&stack); if (WARN_ON(pred)) { @@ -1450,6 +1513,13 @@ add_pred: filter->root = NULL; goto fail; } + err = check_pred_tree(filter, root); + if (err) + goto fail; + + /* We don't set root until we know it works */ + barrier(); + filter->root = root; } err = 0; -- cgit v1.2.3 From 43cd414552d8137157e926e46361678ea867e476 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 23:16:51 -0500 Subject: tracing/filter: Optimize filter by folding the tree There are many cases that a filter will contain multiple ORs or ANDs together near the leafs. Walking up and down the tree to get to the next compare can be a waste. If there are several ORs or ANDs together, fold them into a single pred and allocate an array of the conditions that they check. This will speed up the filter by linearly walking an array and can still break out if a short circuit condition is met. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 12 +- kernel/trace/trace_events_filter.c | 233 +++++++++++++++++++++++++++++++++++-- 2 files changed, 235 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index bba34a72c780..d754330934bb 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -678,6 +678,7 @@ struct event_subsystem { #define FILTER_PRED_INVALID ((unsigned short)-1) #define FILTER_PRED_IS_RIGHT (1 << 15) +#define FILTER_PRED_FOLD (1 << 15) struct filter_pred; struct regex; @@ -704,7 +705,16 @@ struct filter_pred { filter_pred_fn_t fn; u64 val; struct regex regex; - char *field_name; + /* + * Leaf nodes use field_name, ops is used by AND and OR + * nodes. The field_name is always freed when freeing a pred. + * We can overload field_name for ops and have it freed + * as well. + */ + union { + char *field_name; + unsigned short *ops; + }; int offset; int not; int op; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 91c9cdcb040b..2403ce5b6507 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -381,6 +381,42 @@ get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, return pred; } +/* + * A series of AND or ORs where found together. 
Instead of + * climbing up and down the tree branches, an array of the + * ops were made in order of checks. We can just move across + * the array and short circuit if needed. + */ +static int process_ops(struct filter_pred *preds, + struct filter_pred *op, void *rec) +{ + struct filter_pred *pred; + int type; + int match; + int i; + + /* + * Micro-optimization: We set type to true if op + * is an OR and false otherwise (AND). Then we + * just need to test if the match is equal to + * the type, and if it is, we can short circuit the + * rest of the checks: + * + * if ((match && op->op == OP_OR) || + * (!match && op->op == OP_AND)) + * return match; + */ + type = op->op == OP_OR; + + for (i = 0; i < op->val; i++) { + pred = &preds[op->ops[i]]; + match = pred->fn(pred, rec); + if (!!match == type) + return match; + } + return match; +} + /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { @@ -414,11 +450,16 @@ int filter_match_preds(struct event_filter *filter, void *rec) case MOVE_DOWN: /* only AND and OR have children */ if (pred->left != FILTER_PRED_INVALID) { - /* keep going to leaf node */ - pred = &preds[pred->left]; - continue; - } - match = pred->fn(pred, rec); + /* If ops is set, then it was folded. */ + if (!pred->ops) { + /* keep going to down the left side */ + pred = &preds[pred->left]; + continue; + } + /* We can treat folded ops as a leaf node */ + match = process_ops(preds, pred, rec); + } else + match = pred->fn(pred, rec); /* If this pred is the only pred */ if (pred == root) break; @@ -659,17 +700,34 @@ static int filter_set_pred(struct event_filter *filter, left = __pop_pred_stack(stack); if (!left || !right) return -EINVAL; - dest->left = left->index; - dest->right = right->index; - left->parent = dest->index; + /* + * If both children can be folded + * and they are the same op as this op or a leaf, + * then this op can be folded. + */ + if (left->index & FILTER_PRED_FOLD && + (left->op == dest->op || + left->left == FILTER_PRED_INVALID) && + right->index & FILTER_PRED_FOLD && + (right->op == dest->op || + right->left == FILTER_PRED_INVALID)) + dest->index |= FILTER_PRED_FOLD; + + dest->left = left->index & ~FILTER_PRED_FOLD; + dest->right = right->index & ~FILTER_PRED_FOLD; + left->parent = dest->index & ~FILTER_PRED_FOLD; right->parent = dest->index | FILTER_PRED_IS_RIGHT; - } else + } else { /* * Make dest->left invalid to be used as a quick * way to know this is a leaf node. */ dest->left = FILTER_PRED_INVALID; + /* All leafs allow folding the parent ops. 
*/ + dest->index |= FILTER_PRED_FOLD; + } + return __push_pred_stack(stack, dest); } @@ -1420,6 +1478,158 @@ static int check_pred_tree(struct event_filter *filter, return 0; } +static int count_leafs(struct filter_pred *preds, struct filter_pred *root) +{ + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int count = 0; + int done = 0; + + pred = root; + + do { + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + /* A leaf at the root is just a leaf in the tree */ + if (pred == root) + return 1; + count++; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + return count; +} + +static int fold_pred(struct filter_pred *preds, struct filter_pred *root) +{ + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int count = 0; + int children; + int done = 0; + + /* No need to keep the fold flag */ + root->index &= ~FILTER_PRED_FOLD; + + /* If the root is a leaf then do nothing */ + if (root->left == FILTER_PRED_INVALID) + return 0; + + /* count the children */ + children = count_leafs(preds, &preds[root->left]); + children += count_leafs(preds, &preds[root->right]); + + root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL); + if (!root->ops) + return -ENOMEM; + + root->val = children; + + pred = root; + do { + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + if (WARN_ON(count == children)) + return -EINVAL; + pred->index &= ~FILTER_PRED_FOLD; + root->ops[count++] = pred->index; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + return 0; +} + +/* + * To optimize the processing of the ops, if we have several "ors" or + * "ands" together, we can put them in an array and process them all + * together speeding up the filter logic. 
+ */ +static int fold_pred_tree(struct event_filter *filter, + struct filter_pred *root) +{ + struct filter_pred *preds; + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int done = 0; + int err; + + preds = filter->preds; + if (!preds) + return -EINVAL; + pred = root; + + do { + switch (move) { + case MOVE_DOWN: + if (pred->index & FILTER_PRED_FOLD) { + err = fold_pred(preds, pred); + if (err) + return err; + /* Folded nodes are like leafs */ + } else if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + + /* A leaf at the root is just a leaf in the tree */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + return 0; +} + static int replace_preds(struct ftrace_event_call *call, struct event_filter *filter, struct filter_parse_state *ps, @@ -1517,6 +1727,11 @@ add_pred: if (err) goto fail; + /* Optimize the tree */ + err = fold_pred_tree(filter, root); + if (err) + goto fail; + /* We don't set root until we know it works */ barrier(); filter->root = root; -- cgit v1.2.3 From 4a3d27e98a7f2682e96d6f863752e0424b00d691 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 23:19:49 -0500 Subject: tracing/filter: Move MAX_FILTER_PRED to local tracing directory The MAX_FILTER_PRED is only needed by the kernel/trace/*.c files. Move it to kernel/trace/trace.h. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d754330934bb..fbff872f8db1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -680,6 +680,8 @@ struct event_subsystem { #define FILTER_PRED_IS_RIGHT (1 << 15) #define FILTER_PRED_FOLD (1 << 15) +#define MAX_FILTER_PRED 32 + struct filter_pred; struct regex; -- cgit v1.2.3 From bf93f9ed3a2cb89eb7e58851139d3be375b98027 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 23:21:34 -0500 Subject: tracing/filter: Increase the max preds to 2^14 Now that the filter logic does not require to save the pred results on the stack, we can increase the max number of preds we allow. As the preds are index by a short value, and we use the MSBs as flags we can increase the max preds to 2^14 (16384) which should be way more than enough. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fbff872f8db1..856e73cc1d3f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -680,7 +680,14 @@ struct event_subsystem { #define FILTER_PRED_IS_RIGHT (1 << 15) #define FILTER_PRED_FOLD (1 << 15) -#define MAX_FILTER_PRED 32 +/* + * The max preds is the size of unsigned short with + * two flags at the MSBs. One bit is used for both the IS_RIGHT + * and FOLD flags. The other is reserved. + * + * 2^14 preds is way more than enough. 
+ */ +#define MAX_FILTER_PRED 16384 struct filter_pred; struct regex; -- cgit v1.2.3 From 75b8e98263fdb0bfbdeba60d4db463259f1fe8a2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Feb 2011 23:25:46 -0500 Subject: tracing/filter: Swap entire filter of events When creating a new filter, instead of allocating the filter to the event call first and then processing the filter, it is easier to process a temporary filter and then just swap it with the call filter. By doing this, it simplifies the code. A filter is allocated and processed, when it is done, it is swapped with the call filter, synchronize_sched() is called to make sure all callers are done with the old filter (filters are called with premption disabled), and then the old filter is freed. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 251 +++++++++++++++++++++---------------- 1 file changed, 146 insertions(+), 105 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 2403ce5b6507..f5d335d28d0b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -425,10 +425,15 @@ int filter_match_preds(struct event_filter *filter, void *rec) struct filter_pred *preds; struct filter_pred *pred; struct filter_pred *root; - int n_preds = ACCESS_ONCE(filter->n_preds); + int n_preds; int done = 0; /* no filter is considered a match */ + if (!filter) + return 1; + + n_preds = filter->n_preds; + if (!n_preds) return 1; @@ -509,6 +514,9 @@ static void parse_error(struct filter_parse_state *ps, int err, int pos) static void remove_filter_string(struct event_filter *filter) { + if (!filter) + return; + kfree(filter->filter_string); filter->filter_string = NULL; } @@ -568,9 +576,10 @@ static void append_filter_err(struct filter_parse_state *ps, void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) { - struct event_filter *filter = call->filter; + struct event_filter *filter; mutex_lock(&event_mutex); + filter = call->filter; if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else @@ -581,9 +590,10 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s) { - struct event_filter *filter = system->filter; + struct event_filter *filter; mutex_lock(&event_mutex); + filter = system->filter; if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else @@ -745,26 +755,9 @@ static void __free_preds(struct event_filter *filter) filter->n_preds = 0; } -static void reset_preds(struct event_filter *filter) -{ - int n_preds = filter->n_preds; - int i; - - filter->n_preds = 0; - filter->root = NULL; - if (!filter->preds) - return; - - for (i = 0; i < n_preds; i++) - filter->preds[i].fn = filter_pred_none; -} - -static void filter_disable_preds(struct ftrace_event_call *call) +static void filter_disable(struct ftrace_event_call *call) { - struct event_filter *filter = call->filter; - call->flags &= ~TRACE_EVENT_FL_FILTERED; - reset_preds(filter); } static void __free_filter(struct event_filter *filter) @@ -777,11 +770,16 @@ static void __free_filter(struct event_filter *filter) kfree(filter); } +/* + * Called when destroying the ftrace_event_call. + * The call is being freed, so we do not need to worry about + * the call being currently used. This is for module code removing + * the tracepoints from within it. 
+ */ void destroy_preds(struct ftrace_event_call *call) { __free_filter(call->filter); call->filter = NULL; - call->flags &= ~TRACE_EVENT_FL_FILTERED; } static struct event_filter *__alloc_filter(void) @@ -789,11 +787,6 @@ static struct event_filter *__alloc_filter(void) struct event_filter *filter; filter = kzalloc(sizeof(*filter), GFP_KERNEL); - if (!filter) - return ERR_PTR(-ENOMEM); - - filter->n_preds = 0; - return filter; } @@ -838,46 +831,28 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) return 0; } -static int init_filter(struct ftrace_event_call *call) -{ - if (call->filter) - return 0; - - call->flags &= ~TRACE_EVENT_FL_FILTERED; - call->filter = __alloc_filter(); - if (IS_ERR(call->filter)) - return PTR_ERR(call->filter); - - return 0; -} - -static int init_subsystem_preds(struct event_subsystem *system) +static void filter_free_subsystem_preds(struct event_subsystem *system) { struct ftrace_event_call *call; - int err; list_for_each_entry(call, &ftrace_events, list) { if (strcmp(call->class->system, system->name) != 0) continue; - err = init_filter(call); - if (err) - return err; + filter_disable(call); + remove_filter_string(call->filter); } - - return 0; } -static void filter_free_subsystem_preds(struct event_subsystem *system) +static void filter_free_subsystem_filters(struct event_subsystem *system) { struct ftrace_event_call *call; list_for_each_entry(call, &ftrace_events, list) { if (strcmp(call->class->system, system->name) != 0) continue; - - filter_disable_preds(call); - remove_filter_string(call->filter); + __free_filter(call->filter); + call->filter = NULL; } } @@ -1743,88 +1718,129 @@ fail: return err; } +struct filter_list { + struct list_head list; + struct event_filter *filter; +}; + static int replace_system_preds(struct event_subsystem *system, struct filter_parse_state *ps, char *filter_string) { struct ftrace_event_call *call; + struct filter_list *filter_item; + struct filter_list *tmp; + LIST_HEAD(filter_list); bool fail = true; int err; list_for_each_entry(call, &ftrace_events, list) { - struct event_filter *filter = call->filter; if (strcmp(call->class->system, system->name) != 0) continue; - /* try to see if the filter can be applied */ - err = replace_preds(call, filter, ps, filter_string, true); + /* + * Try to see if the filter can be applied + * (filter arg is ignored on dry_run) + */ + err = replace_preds(call, NULL, ps, filter_string, true); if (err) goto fail; } - /* set all filter pred counts to zero */ list_for_each_entry(call, &ftrace_events, list) { - struct event_filter *filter = call->filter; + struct event_filter *filter; if (strcmp(call->class->system, system->name) != 0) continue; - reset_preds(filter); - } + filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); + if (!filter_item) + goto fail_mem; - /* - * Since some of the preds may be used under preemption - * we need to wait for them to finish before we may - * reallocate them. 
- */ - synchronize_sched(); + list_add_tail(&filter_item->list, &filter_list); - list_for_each_entry(call, &ftrace_events, list) { - struct event_filter *filter = call->filter; + filter_item->filter = __alloc_filter(); + if (!filter_item->filter) + goto fail_mem; + filter = filter_item->filter; - if (strcmp(call->class->system, system->name) != 0) - continue; + /* Can only fail on no memory */ + err = replace_filter_string(filter, filter_string); + if (err) + goto fail_mem; - /* really apply the filter */ - filter_disable_preds(call); err = replace_preds(call, filter, ps, filter_string, false); - if (err) - filter_disable_preds(call); - else { + if (err) { + filter_disable(call); + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + append_filter_err(ps, filter); + } else call->flags |= TRACE_EVENT_FL_FILTERED; - replace_filter_string(filter, filter_string); - } + /* + * Regardless of if this returned an error, we still + * replace the filter for the call. + */ + filter = call->filter; + call->filter = filter_item->filter; + filter_item->filter = filter; + fail = false; } if (fail) goto fail; + /* + * The calls can still be using the old filters. + * Do a synchronize_sched() to ensure all calls are + * done with them before we free them. + */ + synchronize_sched(); + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + __free_filter(filter_item->filter); + list_del(&filter_item->list); + kfree(filter_item); + } return 0; fail: + /* No call succeeded */ + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + list_del(&filter_item->list); + kfree(filter_item); + } parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); return -EINVAL; + fail_mem: + /* If any call succeeded, we still need to sync */ + if (!fail) + synchronize_sched(); + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + __free_filter(filter_item->filter); + list_del(&filter_item->list); + kfree(filter_item); + } + return -ENOMEM; } int apply_event_filter(struct ftrace_event_call *call, char *filter_string) { - int err; struct filter_parse_state *ps; + struct event_filter *filter; + struct event_filter *tmp; + int err = 0; mutex_lock(&event_mutex); - err = init_filter(call); - if (err) - goto out_unlock; - if (!strcmp(strstrip(filter_string), "0")) { - filter_disable_preds(call); - reset_preds(call->filter); + filter_disable(call); + filter = call->filter; + if (!filter) + goto out_unlock; + call->filter = NULL; /* Make sure the filter is not being used */ synchronize_sched(); - __free_preds(call->filter); - remove_filter_string(call->filter); + __free_filter(filter); goto out_unlock; } @@ -1833,29 +1849,41 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) if (!ps) goto out_unlock; - filter_disable_preds(call); - replace_filter_string(call->filter, filter_string); + filter = __alloc_filter(); + if (!filter) { + kfree(ps); + goto out_unlock; + } + + replace_filter_string(filter, filter_string); parse_init(ps, filter_ops, filter_string); err = filter_parse(ps); if (err) { - append_filter_err(ps, call->filter); + append_filter_err(ps, filter); goto out; } - /* - * Make sure all the pred counts are zero so that - * no task is using it when we reallocate the preds array. 
- */ - reset_preds(call->filter); - synchronize_sched(); - - err = replace_preds(call, call->filter, ps, filter_string, false); - if (err) - append_filter_err(ps, call->filter); - else + err = replace_preds(call, filter, ps, filter_string, false); + if (err) { + filter_disable(call); + append_filter_err(ps, filter); + } else call->flags |= TRACE_EVENT_FL_FILTERED; out: + /* + * Always swap the call filter with the new filter + * even if there was an error. If there was an error + * in the filter, we disable the filter and show the error + * string + */ + tmp = call->filter; + call->filter = filter; + if (tmp) { + /* Make sure the call is done with the filter */ + synchronize_sched(); + __free_filter(tmp); + } filter_opstack_clear(ps); postfix_clear(ps); kfree(ps); @@ -1868,18 +1896,21 @@ out_unlock: int apply_subsystem_event_filter(struct event_subsystem *system, char *filter_string) { - int err; struct filter_parse_state *ps; + struct event_filter *filter; + int err = 0; mutex_lock(&event_mutex); - err = init_subsystem_preds(system); - if (err) - goto out_unlock; - if (!strcmp(strstrip(filter_string), "0")) { filter_free_subsystem_preds(system); remove_filter_string(system->filter); + filter = system->filter; + system->filter = NULL; + /* Ensure all filters are no longer used */ + synchronize_sched(); + filter_free_subsystem_filters(system); + __free_filter(filter); goto out_unlock; } @@ -1888,7 +1919,17 @@ int apply_subsystem_event_filter(struct event_subsystem *system, if (!ps) goto out_unlock; - replace_filter_string(system->filter, filter_string); + filter = __alloc_filter(); + if (!filter) + goto out; + + replace_filter_string(filter, filter_string); + /* + * No event actually uses the system filter + * we can free it without synchronize_sched(). + */ + __free_filter(system->filter); + system->filter = filter; parse_init(ps, filter_ops, filter_string); err = filter_parse(ps); @@ -1945,7 +1986,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, goto out_unlock; filter = __alloc_filter(); - if (IS_ERR(filter)) { + if (!filter) { err = PTR_ERR(filter); goto out_unlock; } -- cgit v1.2.3 From 4defe682d81a4960b6840ee4ed1a36f9db77c7bd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Feb 2011 23:29:06 -0500 Subject: tracing/filter: Remove synchronize_sched() from __alloc_preds() Because the filters are processed first and then activated (added to the call), we no longer need to worry about the preds of the filter in __alloc_preds() being used. As the filter that is allocating preds is not activated yet. Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f5d335d28d0b..3249b4f77ef0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -795,33 +795,17 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) struct filter_pred *pred; int i; - if (filter->preds) { - if (filter->a_preds < n_preds) { - /* - * We need to reallocate. - * We should have already have zeroed out - * the pred count and called synchronized_sched() - * to make sure no one is using the preds. 
- */ - if (WARN_ON_ONCE(filter->n_preds)) { - /* We need to reset it now */ - filter->n_preds = 0; - synchronize_sched(); - } - __free_preds(filter); - } - } + if (filter->preds) + __free_preds(filter); + + filter->preds = + kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); - if (!filter->preds) { - filter->preds = - kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); - filter->a_preds = n_preds; - } if (!filter->preds) return -ENOMEM; - if (WARN_ON(filter->a_preds < n_preds)) - return -EINVAL; + filter->a_preds = n_preds; + filter->n_preds = 0; for (i = 0; i < n_preds; i++) { pred = &filter->preds[i]; -- cgit v1.2.3 From ba976970c79fd2fbfe1a4b3b6766a318f4eb9d4c Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Thu, 3 Feb 2011 14:27:20 +1100 Subject: tracing/syscalls: Don't add events for unmapped syscalls FTRACE_SYSCALLS would create events for each and every system call, even if it had failed to map the system call's name with it's number. This resulted in a number of events being created that would not behave as expected. This could happen, for example, on architectures who's symbol names are unusual and will not match the system call name. It could also happen with system calls which were mapped to sys_ni_syscall. This patch changes the default system call number in the metadata to -1. If the system call name from the metadata is not successfully mapped to a system call number during boot, than the event initialisation routine will now return an error, preventing the event from being created. Signed-off-by: Ian Munsie LKML-Reference: <1296703645-18718-2-git-send-email-imunsie@au1.ibm.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 5c9fe08d2093..a9ceabd52247 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -424,6 +424,14 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call) int init_syscall_trace(struct ftrace_event_call *call) { int id; + int num; + + num = ((struct syscall_metadata *)call->data)->syscall_nr; + if (num < 0 || num >= NR_syscalls) { + pr_debug("syscall %s metadata not mapped, disabling ftrace event\n", + ((struct syscall_metadata *)call->data)->name); + return -ENOSYS; + } if (set_syscall_print_fmt(call) < 0) return -ENOMEM; -- cgit v1.2.3 From 3773b389b6927595512558594d040c1edba46f36 Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Thu, 3 Feb 2011 14:27:21 +1100 Subject: tracing/syscalls: Convert redundant syscall_nr checks into WARN_ON With the ftrace events now checking if the syscall_nr is valid upon initialisation it should no longer be possible to register or unregister a syscall event without a valid syscall_nr since they should not be created. This adds a WARN_ON_ONCE in the register and unregister functions to locate potential regressions in the future. 
Signed-off-by: Ian Munsie LKML-Reference: <1296703645-18718-3-git-send-email-imunsie@au1.ibm.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index a9ceabd52247..423094288fb5 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -359,7 +359,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call) int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; - if (num < 0 || num >= NR_syscalls) + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_enter) @@ -377,7 +377,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call) int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; - if (num < 0 || num >= NR_syscalls) + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return; mutex_lock(&syscall_trace_lock); sys_refcount_enter--; @@ -393,7 +393,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call) int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; - if (num < 0 || num >= NR_syscalls) + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_exit) @@ -411,7 +411,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call) int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; - if (num < 0 || num >= NR_syscalls) + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return; mutex_lock(&syscall_trace_lock); sys_refcount_exit--; -- cgit v1.2.3 From c763ba06bd9b5db2c46c36276c89103d92d2c604 Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Thu, 3 Feb 2011 14:27:22 +1100 Subject: tracing/syscalls: Make arch_syscall_addr weak Some architectures use non-trivial system call tables and will not work with the generic arch_syscall_addr code. For example, PowerPC64 uses a table of twin long longs. This patch makes the generic arch_syscall_addr weak to allow architectures with non-trivial system call tables to override it. Signed-off-by: Ian Munsie LKML-Reference: <1296703645-18718-4-git-send-email-imunsie@au1.ibm.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 423094288fb5..af831545f656 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -446,7 +446,7 @@ int init_syscall_trace(struct ftrace_event_call *call) return id; } -unsigned long __init arch_syscall_addr(int nr) +unsigned long __init __weak arch_syscall_addr(int nr) { return (unsigned long)sys_call_table[nr]; } -- cgit v1.2.3 From b2d55496818d64310b9f5486d4eea76ea614d7f8 Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Thu, 3 Feb 2011 14:27:23 +1100 Subject: tracing/syscalls: Allow arch specific syscall symbol matching Some architectures have unusual symbol names and the generic code to match the symbol name with the function name for the syscall metadata will fail. For example, symbols on PPC64 start with a period and the generic code will fail to match them. This patch moves the match logic out into a separate function which an arch can override by defining ARCH_HAS_SYSCALL_MATCH_SYM_NAME in asm/ftrace.h and implementing arch_syscall_match_sym_name. 
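For illustration only, a minimal sketch of what such an override might look like for an architecture whose syscall symbols carry a leading dot, as the commit message describes for PPC64. Only the names ARCH_HAS_SYSCALL_MATCH_SYM_NAME and arch_syscall_match_sym_name come from the patch; the body below is a hypothetical example of an arch's asm/ftrace.h, not the actual implementation from this series:

	/* Hypothetical arch override in asm/ftrace.h (assumes linux/string.h
	 * and linux/types.h are already available to its users). */
	#define ARCH_HAS_SYSCALL_MATCH_SYM_NAME
	static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
	{
		/* Skip the ABI's leading '.', then compare past the "sys"/"SyS" prefix
		 * exactly as the generic helper does. */
		if (*sym == '.')
			sym++;
		return !strcmp(sym + 3, name + 3);
	}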
Signed-off-by: Ian Munsie LKML-Reference: <1296703645-18718-5-git-send-email-imunsie@au1.ibm.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index af831545f656..86a23e7de031 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -60,6 +60,19 @@ extern struct syscall_metadata *__stop_syscalls_metadata[]; static struct syscall_metadata **syscalls_metadata; +#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME +static inline bool arch_syscall_match_sym_name(const char *sym, const char *name) +{ + /* + * Only compare after the "sys" prefix. Archs that use + * syscall wrappers may have syscalls symbols aliases prefixed + * with "SyS" instead of "sys", leading to an unwanted + * mismatch. + */ + return !strcmp(sym + 3, name + 3); +} +#endif + static __init struct syscall_metadata * find_syscall_meta(unsigned long syscall) { @@ -73,13 +86,7 @@ find_syscall_meta(unsigned long syscall) kallsyms_lookup(syscall, NULL, NULL, NULL, str); for ( ; start < stop; start++) { - /* - * Only compare after the "sys" prefix. Archs that use - * syscall wrappers may have syscalls symbols aliases prefixed - * with "SyS" instead of "sys", leading to an unwanted - * mismatch. - */ - if ((*start)->name && !strcmp((*start)->name + 3, str + 3)) + if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name)) return *start; } return NULL; -- cgit v1.2.3 From ae07f551c42d6e4162436ca452a199deac9dab4d Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Thu, 3 Feb 2011 14:27:25 +1100 Subject: tracing/syscalls: Early terminate search for sys_ni_syscall Many system calls are unimplemented and mapped to sys_ni_syscall, but at boot ftrace would still search through every syscall metadata entry for a match which wouldn't be there. This patch adds causes the search to terminate early if the system call is not mapped. Signed-off-by: Ian Munsie LKML-Reference: <1296703645-18718-7-git-send-email-imunsie@au1.ibm.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 86a23e7de031..ee7b5a0bb9f8 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -85,6 +85,9 @@ find_syscall_meta(unsigned long syscall) stop = __stop_syscalls_metadata; kallsyms_lookup(syscall, NULL, NULL, NULL, str); + if (arch_syscall_match_sym_name(str, "sys_ni_syscall")) + return NULL; + for ( ; start < stop; start++) { if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name)) return *start; -- cgit v1.2.3 From dc5f219e88294b93009eef946251251ffffb6d60 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 4 Feb 2011 13:19:20 +0100 Subject: genirq: Add IRQF_FORCE_RESUME Xen needs to reenable interrupts which are marked IRQF_NO_SUSPEND in the resume path. Add a flag to force the reenabling in the resume code. 
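For illustration, a minimal driver-side sketch of how the new flag would be combined with IRQF_NO_SUSPEND; the handler, device cookie, irq number and name below are placeholders and not code from this patch:

	/* Keep the IRQ enabled over suspend and have the resume path
	 * forcibly re-enable it. */
	err = request_irq(irq, evtchn_interrupt,
			  IRQF_NO_SUSPEND | IRQF_FORCE_RESUME,
			  "xen-evtchn", dev_id);
	if (err)
		return err;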
Tested-and-acked-by: Ian Campbell Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 11 ++++++++++- kernel/irq/pm.c | 3 --- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0caa59f747dd..b4198ee8cfdf 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -282,8 +282,17 @@ EXPORT_SYMBOL(disable_irq); void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) { - if (resume) + if (resume) { + if (!(desc->status & IRQ_SUSPENDED)) { + if (!desc->action) + return; + if (!(desc->action->flags & IRQF_FORCE_RESUME)) + return; + /* Pretend that it got disabled ! */ + desc->depth++; + } desc->status &= ~IRQ_SUSPENDED; + } switch (desc->depth) { case 0: diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 0d4005d85b03..d6bfb89cce91 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -53,9 +53,6 @@ void resume_device_irqs(void) for_each_irq_desc(irq, desc) { unsigned long flags; - if (!(desc->status & IRQ_SUSPENDED)) - continue; - raw_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, true); raw_spin_unlock_irqrestore(&desc->lock, flags); -- cgit v1.2.3 From 5e38ca8f3ea423442eaafe1b7e206084aa38120a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 2 Feb 2011 13:28:18 +0100 Subject: tracing: Add unstable sched clock note to the warning The warning "Delta way too big" warning might appear on a system with unstable shed clock right after the system is resumed and tracing was enabled during the suspend. Since it's not realy bug, and the unstable sched clock is working fast and reliable otherwise, Steven suggested to keep using the sched clock in any case and just to make note in the warning itself. Signed-off-by: Jiri Olsa LKML-Reference: <1296649698-6003-1-git-send-email-jolsa@redhat.com> Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bd1c35a4fbcc..7739893a1d0a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2163,10 +2163,14 @@ rb_reserve_next_event(struct ring_buffer *buffer, delta = diff; if (unlikely(test_time_stamp(delta))) { WARN_ONCE(delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", (unsigned long long)delta, (unsigned long long)ts, - (unsigned long long)cpu_buffer->write_stamp); + (unsigned long long)cpu_buffer->write_stamp, + sched_clock_stable ? "" : + "If you just came from a suspend/resume,\n" + "please switch to the trace global clock:\n" + " echo global > /sys/kernel/debug/tracing/trace_clock\n"); add_timestamp = 1; } } -- cgit v1.2.3 From c305d524e5dd3c3c7a6035083e30950bea1b52dc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Feb 2011 17:10:48 +0100 Subject: softirq: Avoid stack switch from ksoftirqd ksoftirqd() calls do_softirq() which switches stacks on several architectures. That makes no sense at all. ksoftirqd's stack is sufficient. Call __do_softirq() directly. 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: Heiko Carstens Acked-by: David Miller Cc: Paul Mundt Reviewed-by: Frank Rowand LKML-Reference: --- kernel/softirq.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 68eb5efec388..c0490464e92f 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -738,7 +738,10 @@ static int run_ksoftirqd(void * __bind_cpu) don't process */ if (cpu_is_offline((long)__bind_cpu)) goto wait_to_die; - do_softirq(); + local_irq_disable(); + if (local_softirq_pending()) + __do_softirq(); + local_irq_enable(); preempt_enable_no_resched(); cond_resched(); preempt_disable(); -- cgit v1.2.3 From 44951a60ff888add9e84f509ffce20052e45af94 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 4 Feb 2011 17:33:49 +0100 Subject: genirq: Remove dead code CONFIG_KSTAT_IRQS_ONDEMAND does not exist. It's not worth to implement it. Use sparse irqs if you care about memory consumption of the interrupt layer. Found by undertaker: http://vamos.informatik.uni-erlangen.de/trac/undertaker Signed-off-by: Thomas Gleixner --- kernel/irq/irqdesc.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 282f20230e67..a7ac6e1e7074 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -251,7 +251,6 @@ int __init early_irq_init(void) for (i = 0; i < count; i++) { desc[i].irq_data.irq = i; desc[i].irq_data.chip = &no_irq_chip; - /* TODO : do this allocation on-demand ... */ desc[i].kstat_irqs = alloc_percpu(unsigned int); alloc_masks(desc + i, GFP_KERNEL, node); desc_smp_init(desc + i, node); @@ -277,22 +276,6 @@ static void free_desc(unsigned int irq) static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) { -#if defined(CONFIG_KSTAT_IRQS_ONDEMAND) - struct irq_desc *desc; - unsigned int i; - - for (i = 0; i < cnt; i++) { - desc = irq_to_desc(start + i); - if (desc && !desc->kstat_irqs) { - unsigned int __percpu *stats = alloc_percpu(unsigned int); - - if (!stats) - return -1; - if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL) - free_percpu(stats); - } - } -#endif return start; } #endif /* !CONFIG_SPARSE_IRQ */ -- cgit v1.2.3 From 87d80de2800d087ea833cb79bc13f85ff34ed49f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 8 Feb 2011 13:19:49 -0500 Subject: tracing: Remove obsolete sched_switch tracer The trace events sched_switch and sched_wakeup do the same thing as the stand alone sched_switch tracer does. It is no longer needed. 
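For anyone still relying on the removed tracer, the same information comes from the sched trace events; assuming debugfs is mounted at its usual location, something along these lines replaces "echo sched_switch > current_tracer":

	echo 1 > /sys/kernel/debug/tracing/events/sched/sched_switch/enable
	echo 1 > /sys/kernel/debug/tracing/events/sched/sched_wakeup/enable
	cat /sys/kernel/debug/tracing/trace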
Signed-off-by: Steven Rostedt --- kernel/trace/trace_sched_switch.c | 48 --------------------------------------- 1 file changed, 48 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 8f758d070c43..7e62c0a18456 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -247,51 +247,3 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr) ctx_trace = tr; } -static void stop_sched_trace(struct trace_array *tr) -{ - tracing_stop_sched_switch_record(); -} - -static int sched_switch_trace_init(struct trace_array *tr) -{ - ctx_trace = tr; - tracing_reset_online_cpus(tr); - tracing_start_sched_switch_record(); - return 0; -} - -static void sched_switch_trace_reset(struct trace_array *tr) -{ - if (sched_ref) - stop_sched_trace(tr); -} - -static void sched_switch_trace_start(struct trace_array *tr) -{ - sched_stopped = 0; -} - -static void sched_switch_trace_stop(struct trace_array *tr) -{ - sched_stopped = 1; -} - -static struct tracer sched_switch_trace __read_mostly = -{ - .name = "sched_switch", - .init = sched_switch_trace_init, - .reset = sched_switch_trace_reset, - .start = sched_switch_trace_start, - .stop = sched_switch_trace_stop, - .wait_pipe = poll_wait_pipe, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_sched_switch, -#endif -}; - -__init static int init_sched_switch_trace(void) -{ - return register_tracer(&sched_switch_trace); -} -device_initcall(init_sched_switch_trace); - -- cgit v1.2.3 From 6752ab4a9c30d5411b2dfdb251a3f1cb18aae487 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 8 Feb 2011 13:54:06 -0500 Subject: tracing: Deprecate tracing_enabled for tracing_on tracing_enabled should not be used, it is heavy weight and does not do much in helping lower the overhead. tracing_on should be used instead. Warn users to use tracing_on when tracing_enabled is used as it will soon be removed from the tracing directory. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dc53ecb80589..8dc8da6733f9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2710,6 +2710,10 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, mutex_lock(&trace_types_lock); if (tracer_enabled ^ val) { + + /* Only need to warn if this is used to change the state */ + WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on"); + if (val) { tracer_enabled = 1; if (current_trace->start) -- cgit v1.2.3 From 4149efb22da66e326fc48baf80d628834509f7f0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 8 Feb 2011 10:39:03 +0100 Subject: workqueue: add system_freezeable_wq Add system wide freezeable workqueue. Signed-off-by: Tejun Heo Acked-by: Dmitry Torokhov Cc: "Rafael J. 
Wysocki" --- kernel/workqueue.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 11869faa6819..28f8bd08f0e7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -249,10 +249,12 @@ struct workqueue_struct *system_wq __read_mostly; struct workqueue_struct *system_long_wq __read_mostly; struct workqueue_struct *system_nrt_wq __read_mostly; struct workqueue_struct *system_unbound_wq __read_mostly; +struct workqueue_struct *system_freezeable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_wq); EXPORT_SYMBOL_GPL(system_long_wq); EXPORT_SYMBOL_GPL(system_nrt_wq); EXPORT_SYMBOL_GPL(system_unbound_wq); +EXPORT_SYMBOL_GPL(system_freezeable_wq); #define CREATE_TRACE_POINTS #include @@ -3764,8 +3766,10 @@ static int __init init_workqueues(void) system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_UNBOUND_MAX_ACTIVE); + system_freezeable_wq = alloc_workqueue("events_freezeable", + WQ_FREEZEABLE, 0); BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || - !system_unbound_wq); + !system_unbound_wq || !system_freezeable_wq); return 0; } early_initcall(init_workqueues); -- cgit v1.2.3 From 986c011ddbb3ed44b35e1bfd67f6aa60b293b495 Mon Sep 17 00:00:00 2001 From: David Daney Date: Wed, 9 Feb 2011 16:04:25 -0800 Subject: genirq: Call bus_lock/unlock functions in setup_irq() irq_chips that supply .irq_bus_lock/.irq_bus_sync_unlock functions, expect that the other chip methods will be called inside of calls to the pair. If this expectation is not met, things tend to not work. Make setup_irq() call chip_bus_lock()/chip_bus_sync_unlock() too. For the vast majority of irq_chips, this will be a NOP as most don't have these bus lock functions. [ tglx: No we don't want to call that in __setup_irq(). Way too many error exit pathes. ] Signed-off-by: David Daney LKML-Reference: <1297296265-18680-1-git-send-email-ddaney@caviumnetworks.com> Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0caa59f747dd..a00bf2cd67ed 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -871,9 +871,14 @@ out_thread: */ int setup_irq(unsigned int irq, struct irqaction *act) { + int retval; struct irq_desc *desc = irq_to_desc(irq); - return __setup_irq(irq, desc, act); + chip_bus_lock(desc); + retval = __setup_irq(irq, desc, act); + chip_bus_sync_unlock(desc); + + return retval; } EXPORT_SYMBOL_GPL(setup_irq); -- cgit v1.2.3 From 868baf07b1a259f5f3803c1dc2777b6c358f83cf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Feb 2011 21:26:13 -0500 Subject: ftrace: Fix memory leak with function graph and cpu hotplug When the fuction graph tracer starts, it needs to make a special stack for each task to save the real return values of the tasks. All running tasks have this stack created, as well as any new tasks. On CPU hot plug, the new idle task will allocate a stack as well when init_idle() is called. The problem is that cpu hotplug does not create a new idle_task. Instead it uses the idle task that existed when the cpu went down. ftrace_graph_init_task() will add a new ret_stack to the task that is given to it. Because a clone will make the task have a stack of its parent it does not check if the task's ret_stack is already NULL or not. 
When the CPU hotplug code starts a CPU up again, it will allocate a new stack even though one already existed for it. The solution is to treat the idle_task specially. In fact, the function_graph code already does, just not at init_idle(). Instead of using the ftrace_graph_init_task() for the idle task, which that function expects the task to be a clone, have a separate ftrace_graph_init_idle_task(). Also, we will create a per_cpu ret_stack that is used by the idle task. When we call ftrace_graph_init_idle_task() it will check if the idle task's ret_stack is NULL, if it is, then it will assign it the per_cpu ret_stack. Reported-by: Benjamin Herrenschmidt Suggested-by: Peter Zijlstra Cc: Stable Tree Signed-off-by: Steven Rostedt --- kernel/sched.c | 2 +- kernel/trace/ftrace.c | 52 ++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 46 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 18d38e4ec7ba..fbe86cb04b61 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5571,7 +5571,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) * The idle tasks have their own, simple scheduling class: */ idle->sched_class = &idle_sched_class; - ftrace_graph_init_task(idle); + ftrace_graph_init_idle_task(idle, cpu); } /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f3dadae83883..888b611897d3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3328,7 +3328,7 @@ static int start_graph_tracing(void) /* The cpu_boot init_task->ret_stack will never be freed */ for_each_online_cpu(cpu) { if (!idle_task(cpu)->ret_stack) - ftrace_graph_init_task(idle_task(cpu)); + ftrace_graph_init_idle_task(idle_task(cpu), cpu); } do { @@ -3418,6 +3418,49 @@ void unregister_ftrace_graph(void) mutex_unlock(&ftrace_lock); } +static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack); + +static void +graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) +{ + atomic_set(&t->tracing_graph_pause, 0); + atomic_set(&t->trace_overrun, 0); + t->ftrace_timestamp = 0; + /* make curr_ret_stack visable before we add the ret_stack */ + smp_wmb(); + t->ret_stack = ret_stack; +} + +/* + * Allocate a return stack for the idle task. May be the first + * time through, or it may be done by CPU hotplug online. + */ +void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) +{ + t->curr_ret_stack = -1; + /* + * The idle task has no parent, it either has its own + * stack or no stack at all. 
+ */ + if (t->ret_stack) + WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu)); + + if (ftrace_graph_active) { + struct ftrace_ret_stack *ret_stack; + + ret_stack = per_cpu(idle_ret_stack, cpu); + if (!ret_stack) { + ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH + * sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!ret_stack) + return; + per_cpu(idle_ret_stack, cpu) = ret_stack; + } + graph_init_task(t, ret_stack); + } +} + /* Allocate a return stack for newly created task */ void ftrace_graph_init_task(struct task_struct *t) { @@ -3433,12 +3476,7 @@ void ftrace_graph_init_task(struct task_struct *t) GFP_KERNEL); if (!ret_stack) return; - atomic_set(&t->tracing_graph_pause, 0); - atomic_set(&t->trace_overrun, 0); - t->ftrace_timestamp = 0; - /* make curr_ret_stack visable before we add the ret_stack */ - smp_wmb(); - t->ret_stack = ret_stack; + graph_init_task(t, ret_stack); } } -- cgit v1.2.3 From 0de4b34d466bae571b50f41c7296b85248205e35 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 14 Feb 2011 14:48:07 +0900 Subject: tracing/kprobe: Fix NULL pointer deref check Add NULL check for avoiding NULL pointer deref. This bug has been introduced by: 1ff511e35ed8: tracing/kprobes: Add bitfield type which causes a null pointer dereference bug when kprobe-tracer parses an argument without type. Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Masami Hiramatsu Cc: 2nddept-manager@sdl.hitachi.co.jp Cc: Peter Zijlstra LKML-Reference: <20110214054807.8919.69740.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Ingo Molnar Reported-by: Arnaldo Carvalho de Melo --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ccdc542022c3..8435b43b1782 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -935,7 +935,7 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp, parg->offset = tp->size; tp->size += parg->type->size; ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); - if (ret >= 0) + if (ret >= 0 && t != NULL) ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); if (ret >= 0) { parg->fetch_size.fn = get_fetch_size_function(parg->type, -- cgit v1.2.3 From d4f7e513234019a005c4d33477189f2a4e53bb9c Mon Sep 17 00:00:00 2001 From: Chris Smith Date: Fri, 12 Nov 2010 16:26:54 +0100 Subject: sh: Enable CONFIG_GCOV_PROFILE_ALL for sh This patch enables gcov kernel profiling over the whole kernel for sh. Profiling of specific files individually already worked. A handful of files have to be explicitly excluded from the profiling to avoid breaking things, notably pmb.c. Signed-off-by: Chris Smith Signed-off-by: Stuart Menefy Signed-off-by: Paul Mundt --- kernel/gcov/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 70a298d6da71..b8cadf70b1fb 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -34,7 +34,7 @@ config GCOV_KERNEL config GCOV_PROFILE_ALL bool "Profile entire Kernel" depends on GCOV_KERNEL - depends on S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE + depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE default n ---help--- This options activates profiling for the entire kernel. 
-- cgit v1.2.3 From 4fe757dd48a9e95e1a071291f15dda5421dacb66 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Feb 2011 22:26:07 +0100 Subject: perf: Fix throttle logic It was possible to call pmu::start() on an already running event. In particular this lead so some wreckage as the hrtimer events would re-initialize active timers. This was due to throttled events being activated again by scheduling. Scheduling in a context would add and force start events, resulting in running events with a possible throttle status. The next tick to hit that task will then try to unthrottle the event and call ->start() on an already running event. Reported-by: Jeff Moyer Cc: Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 999835b6112b..656222fcf767 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -782,6 +782,10 @@ retry: raw_spin_unlock_irq(&ctx->lock); } +#define MAX_INTERRUPTS (~0ULL) + +static void perf_log_throttle(struct perf_event *event, int enable); + static int event_sched_in(struct perf_event *event, struct perf_cpu_context *cpuctx, @@ -794,6 +798,17 @@ event_sched_in(struct perf_event *event, event->state = PERF_EVENT_STATE_ACTIVE; event->oncpu = smp_processor_id(); + + /* + * Unthrottle events, since we scheduled we might have missed several + * ticks already, also for a heavily scheduling task there is little + * guarantee it'll get a tick in a timely manner. + */ + if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) { + perf_log_throttle(event, 1); + event->hw.interrupts = 0; + } + /* * The new state must be visible before we turn it on in the hardware: */ @@ -1596,10 +1611,6 @@ void __perf_event_task_sched_in(struct task_struct *task) } } -#define MAX_INTERRUPTS (~0ULL) - -static void perf_log_throttle(struct perf_event *event, int enable); - static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) { u64 frequency = event->attr.sample_freq; -- cgit v1.2.3 From d41d5a01631af821d3a3447e6613a316f5ee6c25 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Feb 2011 17:02:20 +0100 Subject: cgroup: Fix cgroup_subsys::exit callback Make the ::exit method act like ::attach, it is after all very nearly the same thing. The bug had no effect on correctness - fixing it is an optimization for the scheduler. Also, later perf-cgroups patches rely on it. Signed-off-by: Peter Zijlstra Acked-by: Paul Menage LKML-Reference: <1297160655.13327.92.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/cgroup.c | 31 ++++++++++++++++++------------- kernel/sched.c | 6 ++---- 2 files changed, 20 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b24d7027b83c..f6495f33a355 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4230,20 +4230,8 @@ void cgroup_post_fork(struct task_struct *child) */ void cgroup_exit(struct task_struct *tsk, int run_callbacks) { - int i; struct css_set *cg; - - if (run_callbacks && need_forkexit_callback) { - /* - * modular subsystems can't use callbacks, so no need to lock - * the subsys array - */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss->exit) - ss->exit(ss, tsk); - } - } + int i; /* * Unlink from the css_set task list if necessary. 
@@ -4261,7 +4249,24 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) task_lock(tsk); cg = tsk->cgroups; tsk->cgroups = &init_css_set; + + if (run_callbacks && need_forkexit_callback) { + /* + * modular subsystems can't use callbacks, so no need to lock + * the subsys array + */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss->exit) { + struct cgroup *old_cgrp = + rcu_dereference_raw(cg->subsys[i])->cgroup; + struct cgroup *cgrp = task_cgroup(tsk, i); + ss->exit(ss, cgrp, old_cgrp, tsk); + } + } + } task_unlock(tsk); + if (cg) put_css_set_taskexit(cg); } diff --git a/kernel/sched.c b/kernel/sched.c index e142e92f38da..79e611cd83dd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -606,9 +606,6 @@ static inline struct task_group *task_group(struct task_struct *p) struct task_group *tg; struct cgroup_subsys_state *css; - if (p->flags & PF_EXITING) - return &root_task_group; - css = task_subsys_state_check(p, cpu_cgroup_subsys_id, lockdep_is_held(&task_rq(p)->lock)); tg = container_of(css, struct task_group, css); @@ -8863,7 +8860,8 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, } static void -cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task) +cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup *old_cgrp, struct task_struct *task) { /* * cgroup_exit() is called in the copy_process() failure path. -- cgit v1.2.3 From e5d1367f17ba6a6fed5fd8b74e4d5720923e0c25 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 14 Feb 2011 11:20:01 +0200 Subject: perf: Add cgroup support This kernel patch adds the ability to filter monitoring based on container groups (cgroups). This is for use in per-cpu mode only. The cgroup to monitor is passed as a file descriptor in the pid argument to the syscall. The file descriptor must be opened to the cgroup name in the cgroup filesystem. For instance, if the cgroup name is foo and cgroupfs is mounted in /cgroup, then the file descriptor is opened to /cgroup/foo. Cgroup mode is activated by passing PERF_FLAG_PID_CGROUP in the flags argument to the syscall. For instance to measure in cgroup foo on CPU1 assuming cgroupfs is mounted under /cgroup: struct perf_event_attr attr; int cgroup_fd, fd; cgroup_fd = open("/cgroup/foo", O_RDONLY); fd = perf_event_open(&attr, cgroup_fd, 1, -1, PERF_FLAG_PID_CGROUP); close(cgroup_fd); Signed-off-by: Stephane Eranian [ added perf_cgroup_{exit,attach} ] Signed-off-by: Peter Zijlstra LKML-Reference: <4d590250.114ddf0a.689e.4482@mx.google.com> Signed-off-by: Ingo Molnar --- kernel/cgroup.c | 23 ++ kernel/perf_event.c | 638 +++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 626 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f6495f33a355..95362d15128c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4818,6 +4818,29 @@ css_get_next(struct cgroup_subsys *ss, int id, return ret; } +/* + * get corresponding css from file open on cgroupfs directory + */ +struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) +{ + struct cgroup *cgrp; + struct inode *inode; + struct cgroup_subsys_state *css; + + inode = f->f_dentry->d_inode; + /* check in cgroup filesystem dir */ + if (inode->i_op != &cgroup_dir_inode_operations) + return ERR_PTR(-EBADF); + + if (id < 0 || id >= CGROUP_SUBSYS_COUNT) + return ERR_PTR(-EINVAL); + + /* get cgroup */ + cgrp = __d_cgrp(f->f_dentry); + css = cgrp->subsys[id]; + return css ? 
css : ERR_PTR(-ENOENT); +} + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, struct cgroup *cont) diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 3d3f282fa50e..65dcdc76d709 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -111,13 +111,23 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info) return data.ret; } +#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ + PERF_FLAG_FD_OUTPUT |\ + PERF_FLAG_PID_CGROUP) + enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, }; -atomic_t perf_task_events __read_mostly; +/* + * perf_sched_events : >0 events exist + * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu + */ +atomic_t perf_sched_events __read_mostly; +static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); + static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; static atomic_t nr_task_events __read_mostly; @@ -148,7 +158,11 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, enum event_type_t event_type); static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type); + enum event_type_t event_type, + struct task_struct *task); + +static void update_context_time(struct perf_event_context *ctx); +static u64 perf_event_time(struct perf_event *event); void __weak perf_event_print_debug(void) { } @@ -162,6 +176,338 @@ static inline u64 perf_clock(void) return local_clock(); } +static inline struct perf_cpu_context * +__get_cpu_context(struct perf_event_context *ctx) +{ + return this_cpu_ptr(ctx->pmu->pmu_cpu_context); +} + +#ifdef CONFIG_CGROUP_PERF + +static inline struct perf_cgroup * +perf_cgroup_from_task(struct task_struct *task) +{ + return container_of(task_subsys_state(task, perf_subsys_id), + struct perf_cgroup, css); +} + +static inline bool +perf_cgroup_match(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + + return !event->cgrp || event->cgrp == cpuctx->cgrp; +} + +static inline void perf_get_cgroup(struct perf_event *event) +{ + css_get(&event->cgrp->css); +} + +static inline void perf_put_cgroup(struct perf_event *event) +{ + css_put(&event->cgrp->css); +} + +static inline void perf_detach_cgroup(struct perf_event *event) +{ + perf_put_cgroup(event); + event->cgrp = NULL; +} + +static inline int is_cgroup_event(struct perf_event *event) +{ + return event->cgrp != NULL; +} + +static inline u64 perf_cgroup_event_time(struct perf_event *event) +{ + struct perf_cgroup_info *t; + + t = per_cpu_ptr(event->cgrp->info, event->cpu); + return t->time; +} + +static inline void __update_cgrp_time(struct perf_cgroup *cgrp) +{ + struct perf_cgroup_info *info; + u64 now; + + now = perf_clock(); + + info = this_cpu_ptr(cgrp->info); + + info->time += now - info->timestamp; + info->timestamp = now; +} + +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +{ + struct perf_cgroup *cgrp_out = cpuctx->cgrp; + if (cgrp_out) + __update_cgrp_time(cgrp_out); +} + +static inline void update_cgrp_time_from_event(struct perf_event *event) +{ + struct perf_cgroup *cgrp = perf_cgroup_from_task(current); + /* + * do not update time when cgroup is not active + */ + if (!event->cgrp || cgrp != event->cgrp) + return; + + __update_cgrp_time(event->cgrp); +} + +static inline void +perf_cgroup_set_timestamp(struct task_struct *task, u64 now) +{ + struct 
perf_cgroup *cgrp; + struct perf_cgroup_info *info; + + if (!task) + return; + + cgrp = perf_cgroup_from_task(task); + info = this_cpu_ptr(cgrp->info); + info->timestamp = now; +} + +#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ +#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ + +/* + * reschedule events based on the cgroup constraint of task. + * + * mode SWOUT : schedule out everything + * mode SWIN : schedule in based on cgroup for next + */ +void perf_cgroup_switch(struct task_struct *task, int mode) +{ + struct perf_cpu_context *cpuctx; + struct pmu *pmu; + unsigned long flags; + + /* + * disable interrupts to avoid geting nr_cgroup + * changes via __perf_event_disable(). Also + * avoids preemption. + */ + local_irq_save(flags); + + /* + * we reschedule only in the presence of cgroup + * constrained events. + */ + rcu_read_lock(); + + list_for_each_entry_rcu(pmu, &pmus, entry) { + + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + perf_pmu_disable(cpuctx->ctx.pmu); + + /* + * perf_cgroup_events says at least one + * context on this CPU has cgroup events. + * + * ctx->nr_cgroups reports the number of cgroup + * events for a context. + */ + if (cpuctx->ctx.nr_cgroups > 0) { + + if (mode & PERF_CGROUP_SWOUT) { + cpu_ctx_sched_out(cpuctx, EVENT_ALL); + /* + * must not be done before ctxswout due + * to event_filter_match() in event_sched_out() + */ + cpuctx->cgrp = NULL; + } + + if (mode & PERF_CGROUP_SWIN) { + /* set cgrp before ctxsw in to + * allow event_filter_match() to not + * have to pass task around + */ + cpuctx->cgrp = perf_cgroup_from_task(task); + cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); + } + } + + perf_pmu_enable(cpuctx->ctx.pmu); + } + + rcu_read_unlock(); + + local_irq_restore(flags); +} + +static inline void perf_cgroup_sched_out(struct task_struct *task) +{ + perf_cgroup_switch(task, PERF_CGROUP_SWOUT); +} + +static inline void perf_cgroup_sched_in(struct task_struct *task) +{ + perf_cgroup_switch(task, PERF_CGROUP_SWIN); +} + +static inline int perf_cgroup_connect(int fd, struct perf_event *event, + struct perf_event_attr *attr, + struct perf_event *group_leader) +{ + struct perf_cgroup *cgrp; + struct cgroup_subsys_state *css; + struct file *file; + int ret = 0, fput_needed; + + file = fget_light(fd, &fput_needed); + if (!file) + return -EBADF; + + css = cgroup_css_from_dir(file, perf_subsys_id); + if (IS_ERR(css)) + return PTR_ERR(css); + + cgrp = container_of(css, struct perf_cgroup, css); + event->cgrp = cgrp; + + /* + * all events in a group must monitor + * the same cgroup because a task belongs + * to only one perf cgroup at a time + */ + if (group_leader && group_leader->cgrp != cgrp) { + perf_detach_cgroup(event); + ret = -EINVAL; + } else { + /* must be done before we fput() the file */ + perf_get_cgroup(event); + } + fput_light(file, fput_needed); + return ret; +} + +static inline void +perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) +{ + struct perf_cgroup_info *t; + t = per_cpu_ptr(event->cgrp->info, event->cpu); + event->shadow_ctx_time = now - t->timestamp; +} + +static inline void +perf_cgroup_defer_enabled(struct perf_event *event) +{ + /* + * when the current task's perf cgroup does not match + * the event's, we need to remember to call the + * perf_mark_enable() function the first time a task with + * a matching perf cgroup is scheduled in. 
+ */ + if (is_cgroup_event(event) && !perf_cgroup_match(event)) + event->cgrp_defer_enabled = 1; +} + +static inline void +perf_cgroup_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *sub; + u64 tstamp = perf_event_time(event); + + if (!event->cgrp_defer_enabled) + return; + + event->cgrp_defer_enabled = 0; + + event->tstamp_enabled = tstamp - event->total_time_enabled; + list_for_each_entry(sub, &event->sibling_list, group_entry) { + if (sub->state >= PERF_EVENT_STATE_INACTIVE) { + sub->tstamp_enabled = tstamp - sub->total_time_enabled; + sub->cgrp_defer_enabled = 0; + } + } +} +#else /* !CONFIG_CGROUP_PERF */ + +static inline bool +perf_cgroup_match(struct perf_event *event) +{ + return true; +} + +static inline void perf_detach_cgroup(struct perf_event *event) +{} + +static inline int is_cgroup_event(struct perf_event *event) +{ + return 0; +} + +static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event) +{ + return 0; +} + +static inline void update_cgrp_time_from_event(struct perf_event *event) +{ +} + +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +{ +} + +static inline void perf_cgroup_sched_out(struct task_struct *task) +{ +} + +static inline void perf_cgroup_sched_in(struct task_struct *task) +{ +} + +static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, + struct perf_event_attr *attr, + struct perf_event *group_leader) +{ + return -EINVAL; +} + +static inline void +perf_cgroup_set_timestamp(struct task_struct *task, u64 now) +{ +} + +void +perf_cgroup_switch(struct task_struct *task, struct task_struct *next) +{ +} + +static inline void +perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) +{ +} + +static inline u64 perf_cgroup_event_time(struct perf_event *event) +{ + return 0; +} + +static inline void +perf_cgroup_defer_enabled(struct perf_event *event) +{ +} + +static inline void +perf_cgroup_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) +{ +} +#endif + void perf_pmu_disable(struct pmu *pmu) { int *count = this_cpu_ptr(pmu->pmu_disable_count); @@ -343,6 +689,10 @@ static void update_context_time(struct perf_event_context *ctx) static u64 perf_event_time(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; + + if (is_cgroup_event(event)) + return perf_cgroup_event_time(event); + return ctx ? ctx->time : 0; } @@ -357,9 +707,20 @@ static void update_event_times(struct perf_event *event) if (event->state < PERF_EVENT_STATE_INACTIVE || event->group_leader->state < PERF_EVENT_STATE_INACTIVE) return; - - if (ctx->is_active) + /* + * in cgroup mode, time_enabled represents + * the time the event was enabled AND active + * tasks were in the monitored cgroup. This is + * independent of the activity of the context as + * there may be a mix of cgroup and non-cgroup events. + * + * That is why we treat cgroup events differently + * here. 
+ */ + if (is_cgroup_event(event)) run_end = perf_event_time(event); + else if (ctx->is_active) + run_end = ctx->time; else run_end = event->tstamp_stopped; @@ -371,6 +732,7 @@ static void update_event_times(struct perf_event *event) run_end = perf_event_time(event); event->total_time_running = run_end - event->tstamp_running; + } /* @@ -419,6 +781,17 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) list_add_tail(&event->group_entry, list); } + if (is_cgroup_event(event)) { + ctx->nr_cgroups++; + /* + * one more event: + * - that has cgroup constraint on event->cpu + * - that may need work on context switch + */ + atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); + jump_label_inc(&perf_sched_events); + } + list_add_rcu(&event->event_entry, &ctx->event_list); if (!ctx->nr_events) perf_pmu_rotate_start(ctx->pmu); @@ -545,6 +918,12 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; + if (is_cgroup_event(event)) { + ctx->nr_cgroups--; + atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); + jump_label_dec(&perf_sched_events); + } + ctx->nr_events--; if (event->attr.inherit_stat) ctx->nr_stat--; @@ -616,7 +995,8 @@ out: static inline int event_filter_match(struct perf_event *event) { - return event->cpu == -1 || event->cpu == smp_processor_id(); + return (event->cpu == -1 || event->cpu == smp_processor_id()) + && perf_cgroup_match(event); } static void @@ -634,7 +1014,7 @@ event_sched_out(struct perf_event *event, */ if (event->state == PERF_EVENT_STATE_INACTIVE && !event_filter_match(event)) { - delta = ctx->time - event->tstamp_stopped; + delta = tstamp - event->tstamp_stopped; event->tstamp_running += delta; event->tstamp_stopped = tstamp; } @@ -678,12 +1058,6 @@ group_sched_out(struct perf_event *group_event, cpuctx->exclusive = 0; } -static inline struct perf_cpu_context * -__get_cpu_context(struct perf_event_context *ctx) -{ - return this_cpu_ptr(ctx->pmu->pmu_cpu_context); -} - /* * Cross CPU call to remove a performance event * @@ -783,6 +1157,7 @@ static int __perf_event_disable(void *info) */ if (event->state >= PERF_EVENT_STATE_INACTIVE) { update_context_time(ctx); + update_cgrp_time_from_event(event); update_group_times(event); if (event == event->group_leader) group_sched_out(event, cpuctx, ctx); @@ -851,6 +1226,41 @@ retry: raw_spin_unlock_irq(&ctx->lock); } +static void perf_set_shadow_time(struct perf_event *event, + struct perf_event_context *ctx, + u64 tstamp) +{ + /* + * use the correct time source for the time snapshot + * + * We could get by without this by leveraging the + * fact that to get to this function, the caller + * has most likely already called update_context_time() + * and update_cgrp_time_xx() and thus both timestamp + * are identical (or very close). Given that tstamp is, + * already adjusted for cgroup, we could say that: + * tstamp - ctx->timestamp + * is equivalent to + * tstamp - cgrp->timestamp. + * + * Then, in perf_output_read(), the calculation would + * work with no changes because: + * - event is guaranteed scheduled in + * - no scheduled out in between + * - thus the timestamp would be the same + * + * But this is a bit hairy. + * + * So instead, we have an explicit cgroup call to remain + * within the time time source all along. We believe it + * is cleaner and simpler to understand. 
+ */ + if (is_cgroup_event(event)) + perf_cgroup_set_shadow_time(event, tstamp); + else + event->shadow_ctx_time = tstamp - ctx->timestamp; +} + #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_event *event, int enable); @@ -891,7 +1301,7 @@ event_sched_in(struct perf_event *event, event->tstamp_running += tstamp - event->tstamp_stopped; - event->shadow_ctx_time = tstamp - ctx->timestamp; + perf_set_shadow_time(event, ctx, tstamp); if (!is_software_event(event)) cpuctx->active_oncpu++; @@ -1012,7 +1422,8 @@ static void add_event_to_ctx(struct perf_event *event, event->tstamp_stopped = tstamp; } -static void perf_event_context_sched_in(struct perf_event_context *ctx); +static void perf_event_context_sched_in(struct perf_event_context *ctx, + struct task_struct *tsk); /* * Cross CPU call to install and enable a performance event @@ -1033,11 +1444,17 @@ static int __perf_install_in_context(void *info) * which do context switches with IRQs enabled. */ if (ctx->task && !cpuctx->task_ctx) - perf_event_context_sched_in(ctx); + perf_event_context_sched_in(ctx, ctx->task); raw_spin_lock(&ctx->lock); ctx->is_active = 1; update_context_time(ctx); + /* + * update cgrp time only if current cgrp + * matches event->cgrp. Must be done before + * calling add_event_to_ctx() + */ + update_cgrp_time_from_event(event); add_event_to_ctx(event, ctx); @@ -1175,10 +1592,19 @@ static int __perf_event_enable(void *info) if (event->state >= PERF_EVENT_STATE_INACTIVE) goto unlock; + + /* + * set current task's cgroup time reference point + */ + perf_cgroup_set_timestamp(current, perf_clock()); + __perf_event_mark_enabled(event, ctx); - if (!event_filter_match(event)) + if (!event_filter_match(event)) { + if (is_cgroup_event(event)) + perf_cgroup_defer_enabled(event); goto unlock; + } /* * If the event is in a group and isn't the group leader, @@ -1307,6 +1733,7 @@ static void ctx_sched_out(struct perf_event_context *ctx, if (likely(!ctx->nr_events)) goto out; update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx); if (!ctx->nr_active) goto out; @@ -1496,6 +1923,14 @@ void __perf_event_task_sched_out(struct task_struct *task, for_each_task_context_nr(ctxn) perf_event_context_sched_out(task, ctxn, next); + + /* + * if cgroup events exist on this CPU, then we need + * to check if we have to switch out PMU state. 
+ * cgroup event are system-wide mode only + */ + if (atomic_read(&__get_cpu_var(perf_cgroup_events))) + perf_cgroup_sched_out(task); } static void task_ctx_sched_out(struct perf_event_context *ctx, @@ -1534,6 +1969,10 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, if (!event_filter_match(event)) continue; + /* may need to reset tstamp_enabled */ + if (is_cgroup_event(event)) + perf_cgroup_mark_enabled(event, ctx); + if (group_can_go_on(event, cpuctx, 1)) group_sched_in(event, cpuctx, ctx); @@ -1566,6 +2005,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, if (!event_filter_match(event)) continue; + /* may need to reset tstamp_enabled */ + if (is_cgroup_event(event)) + perf_cgroup_mark_enabled(event, ctx); + if (group_can_go_on(event, cpuctx, can_add_hw)) { if (group_sched_in(event, cpuctx, ctx)) can_add_hw = 0; @@ -1576,15 +2019,19 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, - enum event_type_t event_type) + enum event_type_t event_type, + struct task_struct *task) { + u64 now; + raw_spin_lock(&ctx->lock); ctx->is_active = 1; if (likely(!ctx->nr_events)) goto out; - ctx->timestamp = perf_clock(); - + now = perf_clock(); + ctx->timestamp = now; + perf_cgroup_set_timestamp(task, now); /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. @@ -1601,11 +2048,12 @@ out: } static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type) + enum event_type_t event_type, + struct task_struct *task) { struct perf_event_context *ctx = &cpuctx->ctx; - ctx_sched_in(ctx, cpuctx, event_type); + ctx_sched_in(ctx, cpuctx, event_type, task); } static void task_ctx_sched_in(struct perf_event_context *ctx, @@ -1617,11 +2065,12 @@ static void task_ctx_sched_in(struct perf_event_context *ctx, if (cpuctx->task_ctx == ctx) return; - ctx_sched_in(ctx, cpuctx, event_type); + ctx_sched_in(ctx, cpuctx, event_type, NULL); cpuctx->task_ctx = ctx; } -static void perf_event_context_sched_in(struct perf_event_context *ctx) +static void perf_event_context_sched_in(struct perf_event_context *ctx, + struct task_struct *task) { struct perf_cpu_context *cpuctx; @@ -1637,9 +2086,9 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx) */ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - ctx_sched_in(ctx, cpuctx, EVENT_PINNED); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); - ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); + ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); + ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); cpuctx->task_ctx = ctx; @@ -1672,8 +2121,15 @@ void __perf_event_task_sched_in(struct task_struct *task) if (likely(!ctx)) continue; - perf_event_context_sched_in(ctx); + perf_event_context_sched_in(ctx, task); } + /* + * if cgroup events exist on this CPU, then we need + * to check if we have to switch in PMU state. 
+ * cgroup event are system-wide mode only + */ + if (atomic_read(&__get_cpu_var(perf_cgroup_events))) + perf_cgroup_sched_in(task); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) @@ -1873,7 +2329,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) if (ctx) rotate_ctx(ctx); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current); if (ctx) task_ctx_sched_in(ctx, EVENT_FLEXIBLE); @@ -1952,7 +2408,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) raw_spin_unlock(&ctx->lock); - perf_event_context_sched_in(ctx); + perf_event_context_sched_in(ctx, ctx->task); out: local_irq_restore(flags); } @@ -1977,8 +2433,10 @@ static void __perf_event_read(void *info) return; raw_spin_lock(&ctx->lock); - if (ctx->is_active) + if (ctx->is_active) { update_context_time(ctx); + update_cgrp_time_from_event(event); + } update_event_times(event); if (event->state == PERF_EVENT_STATE_ACTIVE) event->pmu->read(event); @@ -2009,8 +2467,10 @@ static u64 perf_event_read(struct perf_event *event) * (e.g., thread is blocked), in that case * we cannot update context time */ - if (ctx->is_active) + if (ctx->is_active) { update_context_time(ctx); + update_cgrp_time_from_event(event); + } update_event_times(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } @@ -2395,7 +2855,7 @@ static void free_event(struct perf_event *event) if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_dec(&perf_task_events); + jump_label_dec(&perf_sched_events); if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) @@ -2411,6 +2871,9 @@ static void free_event(struct perf_event *event) event->buffer = NULL; } + if (is_cgroup_event(event)) + perf_detach_cgroup(event); + if (event->destroy) event->destroy(event); @@ -5300,6 +5763,7 @@ static void task_clock_event_read(struct perf_event *event) if (!in_nmi()) { update_context_time(event->ctx); + update_cgrp_time_from_event(event); time = event->ctx->time; } else { u64 now = perf_clock(); @@ -5725,7 +6189,7 @@ done: if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_inc(&perf_task_events); + jump_label_inc(&perf_sched_events); if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) @@ -5900,7 +6364,7 @@ SYSCALL_DEFINE5(perf_event_open, int err; /* for future expandability... */ - if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) + if (flags & ~PERF_FLAG_ALL) return -EINVAL; err = perf_copy_attr(attr_uptr, &attr); @@ -5917,6 +6381,15 @@ SYSCALL_DEFINE5(perf_event_open, return -EINVAL; } + /* + * In cgroup mode, the pid argument is used to pass the fd + * opened to the cgroup directory in cgroupfs. The cpu argument + * designates the cpu on which to monitor threads from that + * cgroup. 
+ */ + if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) + return -EINVAL; + event_fd = get_unused_fd_flags(O_RDWR); if (event_fd < 0) return event_fd; @@ -5934,7 +6407,7 @@ SYSCALL_DEFINE5(perf_event_open, group_leader = NULL; } - if (pid != -1) { + if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { task = find_lively_task_by_vpid(pid); if (IS_ERR(task)) { err = PTR_ERR(task); @@ -5948,6 +6421,12 @@ SYSCALL_DEFINE5(perf_event_open, goto err_task; } + if (flags & PERF_FLAG_PID_CGROUP) { + err = perf_cgroup_connect(pid, event, &attr, group_leader); + if (err) + goto err_alloc; + } + /* * Special case software events and allow them to be part of * any hardware group. @@ -6808,3 +7287,92 @@ unlock: return ret; } device_initcall(perf_event_sysfs_init); + +#ifdef CONFIG_CGROUP_PERF +static struct cgroup_subsys_state *perf_cgroup_create( + struct cgroup_subsys *ss, struct cgroup *cont) +{ + struct perf_cgroup *jc; + struct perf_cgroup_info *t; + int c; + + jc = kmalloc(sizeof(*jc), GFP_KERNEL); + if (!jc) + return ERR_PTR(-ENOMEM); + + memset(jc, 0, sizeof(*jc)); + + jc->info = alloc_percpu(struct perf_cgroup_info); + if (!jc->info) { + kfree(jc); + return ERR_PTR(-ENOMEM); + } + + for_each_possible_cpu(c) { + t = per_cpu_ptr(jc->info, c); + t->time = 0; + t->timestamp = 0; + } + return &jc->css; +} + +static void perf_cgroup_destroy(struct cgroup_subsys *ss, + struct cgroup *cont) +{ + struct perf_cgroup *jc; + jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), + struct perf_cgroup, css); + free_percpu(jc->info); + kfree(jc); +} + +static int __perf_cgroup_move(void *info) +{ + struct task_struct *task = info; + perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); + return 0; +} + +static void perf_cgroup_move(struct task_struct *task) +{ + task_function_call(task, __perf_cgroup_move, task); +} + +static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup *old_cgrp, struct task_struct *task, + bool threadgroup) +{ + perf_cgroup_move(task); + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &task->thread_group, thread_group) { + perf_cgroup_move(c); + } + rcu_read_unlock(); + } +} + +static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup *old_cgrp, struct task_struct *task) +{ + /* + * cgroup_exit() is called in the copy_process() failure path. + * Ignore this case since the task hasn't ran yet, this avoids + * trying to poke a half freed task state from generic code. + */ + if (!(task->flags & PF_EXITING)) + return; + + perf_cgroup_move(task); +} + +struct cgroup_subsys perf_subsys = { + .name = "perf_event", + .subsys_id = perf_subsys_id, + .create = perf_cgroup_create, + .destroy = perf_cgroup_destroy, + .exit = perf_cgroup_exit, + .attach = perf_cgroup_attach, +}; +#endif /* CONFIG_CGROUP_PERF */ -- cgit v1.2.3 From 163ec4354a5135c6c38c3f4a9b46a31900ebdf48 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Feb 2011 11:22:34 +0100 Subject: perf: Optimize throttling code By pre-computing the maximum number of samples per tick we can avoid a multiplication and a conditional since MAX_INTERRUPTS > max_samples_per_tick. 
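As a rough stand-alone illustration of the idea (ordinary user-space C, not the kernel code; HZ, SAMPLE_RATE and the overflow_check() helper are made-up stand-ins), the divide moves out of the overflow path and the per-interrupt test collapses to one compare; because MAX_INTERRUPTS is far larger than any sane per-tick budget, an already-throttled counter keeps taking the same branch:

/*
 * Stand-alone illustration only (user-space C, not the kernel code):
 * precompute the per-tick sample budget once, then the hot-path check
 * is a single compare. HZ, SAMPLE_RATE and overflow_check() are
 * made-up stand-ins for the kernel names.
 */
#include <stdint.h>
#include <stdio.h>

#define HZ		1000
#define SAMPLE_RATE	100000		/* plays the role of sysctl_perf_event_sample_rate */
#define MAX_INTERRUPTS	(~0ULL)		/* sentinel: counter is throttled */

/* recomputed only when the "sysctl" changes, never in the overflow path */
static uint64_t max_samples_per_tick = (SAMPLE_RATE + HZ - 1) / HZ;

static int overflow_check(uint64_t *interrupts)
{
	/* old scheme: if (HZ * *interrupts > SAMPLE_RATE) ... on every overflow */
	if (*interrupts >= max_samples_per_tick) {
		*interrupts = MAX_INTERRUPTS;	/* mark throttled; stays above the budget */
		return 1;			/* caller stops the counter until the next tick */
	}
	(*interrupts)++;
	return 0;
}

int main(void)
{
	uint64_t interrupts = 0;
	unsigned long samples = 0;

	while (!overflow_check(&interrupts))
		samples++;

	printf("throttled after %lu samples in one tick\n", samples);
	return 0;
}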
Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 43 ++++++++++++++++++++++++------------------- kernel/sysctl.c | 2 +- 2 files changed, 25 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 65dcdc76d709..e03be08d0ddf 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -150,7 +150,24 @@ int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ /* * max perf event sample rate */ -int sysctl_perf_event_sample_rate __read_mostly = 100000; +#define DEFAULT_MAX_SAMPLE_RATE 100000 +int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; +static int max_samples_per_tick __read_mostly = + DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); + +int perf_proc_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (ret || !write) + return ret; + + max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); + + return 0; +} static atomic64_t perf_event_id; @@ -4941,26 +4958,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, if (unlikely(!is_sampling_event(event))) return 0; - if (!throttle) { - hwc->interrupts++; - } else { - if (hwc->interrupts != MAX_INTERRUPTS) { - hwc->interrupts++; - if (HZ * hwc->interrupts > - (u64)sysctl_perf_event_sample_rate) { - hwc->interrupts = MAX_INTERRUPTS; - perf_log_throttle(event, 0); - ret = 1; - } - } else { - /* - * Keep re-disabling events even though on the previous - * pass we disabled it - just in case we raced with a - * sched-in and the event got enabled again: - */ + if (unlikely(hwc->interrupts >= max_samples_per_tick)) { + if (throttle) { + hwc->interrupts = MAX_INTERRUPTS; + perf_log_throttle(event, 0); ret = 1; } - } + } else + hwc->interrupts++; if (event->attr.freq) { u64 now = perf_clock(); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0f1bd83db985..daef911cbadb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -948,7 +948,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_perf_event_sample_rate, .maxlen = sizeof(sysctl_perf_event_sample_rate), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = perf_proc_update_handler, }, #endif #ifdef CONFIG_KMEMCHECK -- cgit v1.2.3 From ba3dd36c6775264ee6e7354ba1aabcd6e86d7298 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Feb 2011 12:41:46 +0100 Subject: perf: Optimize hrtimer events There is no need to re-initialize the hrtimer every time we start it, so don't do that (shaves a few cycles). Also, since we know hrtimers run at a fixed rate (nanoseconds) we can pre-compute the desired frequency at which they tick. This avoids us having to go through the whole adaptive frequency feedback logic (shaves another few cycles). 
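A rough stand-alone illustration of the static freq-to-period mapping (plain user-space C with an example frequency, not the kernel code):

/*
 * Stand-alone illustration only: a hrtimer ticks in nanoseconds, so a
 * requested sampling frequency maps to a fixed period once, up front,
 * with no adaptive feedback needed afterwards. 4000 Hz is just an
 * example value.
 */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC	1000000000ULL

int main(void)
{
	uint64_t sample_freq = 4000;		/* e.g. a perf -F 4000 request */
	uint64_t sample_period = NSEC_PER_SEC / sample_freq;

	/* 4000 Hz -> 250000 ns; the event can keep this period for its lifetime */
	printf("freq %llu Hz -> period %llu ns\n",
	       (unsigned long long)sample_freq,
	       (unsigned long long)sample_period);
	return 0;
}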
Signed-off-by: Peter Zijlstra LKML-Reference: <1297448589.5226.47.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index e03be08d0ddf..a0a6987fabc4 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5602,6 +5602,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) u64 period; event = container_of(hrtimer, struct perf_event, hw.hrtimer); + + if (event->state != PERF_EVENT_STATE_ACTIVE) + return HRTIMER_NORESTART; + event->pmu->read(event); perf_sample_data_init(&data, 0); @@ -5628,9 +5632,6 @@ static void perf_swevent_start_hrtimer(struct perf_event *event) if (!is_sampling_event(event)) return; - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swevent_hrtimer; - period = local64_read(&hwc->period_left); if (period) { if (period < 0) @@ -5657,6 +5658,30 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event) } } +static void perf_swevent_init_hrtimer(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!is_sampling_event(event)) + return; + + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swevent_hrtimer; + + /* + * Since hrtimers have a fixed rate, we can do a static freq->period + * mapping and avoid the whole period adjust feedback stuff. + */ + if (event->attr.freq) { + long freq = event->attr.sample_freq; + + event->attr.sample_period = NSEC_PER_SEC / freq; + hwc->sample_period = event->attr.sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + event->attr.freq = 0; + } +} + /* * Software event: cpu wall time clock */ @@ -5709,6 +5734,8 @@ static int cpu_clock_event_init(struct perf_event *event) if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) return -ENOENT; + perf_swevent_init_hrtimer(event); + return 0; } @@ -5787,6 +5814,8 @@ static int task_clock_event_init(struct perf_event *event) if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) return -ENOENT; + perf_swevent_init_hrtimer(event); + return 0; } -- cgit v1.2.3 From 46e49b3836c7cd2ae5b5fe76fa981d0d292a52fe Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Mon, 14 Feb 2011 14:38:50 -0800 Subject: sched: Wholesale removal of sd_idle logic sd_idle logic was introduced way back in 2005 (commit 5969fe06), as an HT optimization. As per the discussion in the thread here: lkml - sched: Resolve sd_idle and first_idle_cpu Catch-22 - v1 https://patchwork.kernel.org/patch/532501/ The capacity based logic in the load balancer right now handles this in a much cleaner way, handling more than 2 SMT siblings etc, and sd_idle does not seem to bring any additional benefits. sd_idle logic also has some bugs that has performance impact. Here is the patch that removes the sd_idle logic altogether. Also, there was a dependency of sched_mc_power_savings == 2, with sd_idle logic. 
Signed-off-by: Venkatesh Pallipadi Acked-by: Vaidyanathan Srinivasan Signed-off-by: Peter Zijlstra LKML-Reference: <1297723130-693-1-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 53 +++++++++++------------------------------------------ 1 file changed, 11 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 027024694043..d384e739ea95 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2672,7 +2672,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu * @load_idx: Load index of sched_domain of this_cpu for load calc. - * @sd_idle: Idle status of the sched_domain containing group. * @local_group: Does group contain this_cpu. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. @@ -2680,7 +2679,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) */ static inline void update_sg_lb_stats(struct sched_domain *sd, struct sched_group *group, int this_cpu, - enum cpu_idle_type idle, int load_idx, int *sd_idle, + enum cpu_idle_type idle, int load_idx, int local_group, const struct cpumask *cpus, int *balance, struct sg_lb_stats *sgs) { @@ -2700,9 +2699,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, for_each_cpu_and(i, sched_group_cpus(group), cpus) { struct rq *rq = cpu_rq(i); - if (*sd_idle && rq->nr_running) - *sd_idle = 0; - /* Bias balancing toward cpus of our domain */ if (local_group) { if (idle_cpu(i) && !first_idle_cpu) { @@ -2817,15 +2813,13 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, * @sd: sched_domain whose statistics are to be updated. * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu - * @sd_idle: Idle status of the sched_domain containing sg. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. */ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, - enum cpu_idle_type idle, int *sd_idle, - const struct cpumask *cpus, int *balance, - struct sd_lb_stats *sds) + enum cpu_idle_type idle, const struct cpumask *cpus, + int *balance, struct sd_lb_stats *sds) { struct sched_domain *child = sd->child; struct sched_group *sg = sd->groups; @@ -2843,7 +2837,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, + update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, local_group, cpus, balance, &sgs); if (local_group && !(*balance)) @@ -3095,7 +3089,6 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, * @imbalance: Variable which stores amount of weighted load which should * be moved to restore balance/put a group to idle. * @idle: The idle status of this_cpu. - * @sd_idle: The idleness of sd * @cpus: The set of CPUs under consideration for load-balancing. * @balance: Pointer to a variable indicating if this_cpu * is the appropriate cpu to perform load balancing at this_level. 
@@ -3108,7 +3101,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum cpu_idle_type idle, - int *sd_idle, const struct cpumask *cpus, int *balance) + const struct cpumask *cpus, int *balance) { struct sd_lb_stats sds; @@ -3118,8 +3111,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * Compute the various statistics relavent for load balancing at * this level. */ - update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, - balance, &sds); + update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); /* Cases where imbalance does not exist from POV of this_cpu */ /* 1) this_cpu is not the appropriate cpu to perform load balancing @@ -3255,7 +3247,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, /* Working cpumask for load_balance and load_balance_newidle. */ static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); -static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, +static int need_active_balance(struct sched_domain *sd, int idle, int busiest_cpu, int this_cpu) { if (idle == CPU_NEWLY_IDLE) { @@ -3287,10 +3279,6 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, * move_tasks() will succeed. ld_moved will be true and this * active balance code will not be triggered. */ - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return 0; - if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) return 0; } @@ -3308,7 +3296,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { - int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; + int ld_moved, all_pinned = 0, active_balance = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; @@ -3317,20 +3305,10 @@ static int load_balance(int this_cpu, struct rq *this_rq, cpumask_copy(cpus, cpu_active_mask); - /* - * When power savings policy is enabled for the parent domain, idle - * sibling can pick up load irrespective of busy siblings. In this case, - * let the state of idle sibling percolate up as CPU_IDLE, instead of - * portraying it as CPU_NOT_IDLE. 
- */ - if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - sd_idle = 1; - schedstat_inc(sd, lb_count[idle]); redo: - group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, + group = find_busiest_group(sd, this_cpu, &imbalance, idle, cpus, balance); if (*balance == 0) @@ -3392,8 +3370,7 @@ redo: if (idle != CPU_NEWLY_IDLE) sd->nr_balance_failed++; - if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), - this_cpu)) { + if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { raw_spin_lock_irqsave(&busiest->lock, flags); /* don't kick the active_load_balance_cpu_stop, @@ -3448,10 +3425,6 @@ redo: sd->balance_interval *= 2; } - if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - ld_moved = -1; - goto out; out_balanced: @@ -3465,11 +3438,7 @@ out_one_pinned: (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; - if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && - !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - ld_moved = -1; - else - ld_moved = 0; + ld_moved = 0; out: return ld_moved; } -- cgit v1.2.3 From 48228f7b470a74b6469a250d2977a13128d8fe96 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 8 Feb 2011 12:39:54 -0500 Subject: lockdep/timers: Explain in detail the locking problems del_timer_sync() may cause Twice I had to explain the output about why lockdep gives an error with locks in IRQ context and with del_timer_sync(). Might as well write it up and place it in the comments above the code in del_timer_sync(). Perhaps the next time this lockdep dump triggers people will understand the issues. It is a ticky issue and very subtle, explaining it in detail in the code may help others understand the issue when they stumble upon the bug again. Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra LKML-Reference: <1297186794.23343.19.camel@gandalf.stny.rr.com> Signed-off-by: Ingo Molnar --- kernel/timer.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index d6459923d245..5f40c2e0a94e 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -964,6 +964,25 @@ EXPORT_SYMBOL(try_to_del_timer_sync); * add_timer_on(). Upon exit the timer is not queued and the handler is * not running on any CPU. * + * Note: You must not hold locks that are held in interrupt context + * while calling this function. Even if the lock has nothing to do + * with the timer in question. Here's why: + * + * CPU0 CPU1 + * ---- ---- + * + * call_timer_fn(); + * base->running_timer = mytimer; + * spin_lock_irq(somelock); + * + * spin_lock(somelock); + * del_timer_sync(mytimer); + * while (base->running_timer == mytimer); + * + * Now del_timer_sync() will never return and never release somelock. + * The interrupt on the other CPU is waiting to grab somelock but + * it has interrupted the softirq that CPU0 is waiting to finish. + * * The function returns whether it has deactivated a pending timer or not. */ int del_timer_sync(struct timer_list *timer) @@ -971,6 +990,10 @@ int del_timer_sync(struct timer_list *timer) #ifdef CONFIG_LOCKDEP unsigned long flags; + /* + * If lockdep gives a backtrace here, please reference + * the synchronization rules above. 
+ */ local_irq_save(flags); lock_map_acquire(&timer->lockdep_map); lock_map_release(&timer->lockdep_map); -- cgit v1.2.3 From e9345aab675382176740bc8a2c6d3caf1510e46d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 18 Feb 2011 08:09:49 +0100 Subject: Revert "tracing: Add unstable sched clock note to the warning" This reverts commit 5e38ca8f3ea423442eaafe1b7e206084aa38120a. Breaks the build of several !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK architectures. Cc: Jiri Olsa Cc: Steven Rostedt Message-ID: <20110217171823.GB17058@elte.hu> Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7739893a1d0a..bd1c35a4fbcc 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2163,14 +2163,10 @@ rb_reserve_next_event(struct ring_buffer *buffer, delta = diff; if (unlikely(test_time_stamp(delta))) { WARN_ONCE(delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", (unsigned long long)delta, (unsigned long long)ts, - (unsigned long long)cpu_buffer->write_stamp, - sched_clock_stable ? "" : - "If you just came from a suspend/resume,\n" - "please switch to the trace global clock:\n" - " echo global > /sys/kernel/debug/tracing/trace_clock\n"); + (unsigned long long)cpu_buffer->write_stamp); add_timestamp = 1; } } -- cgit v1.2.3 From db1c1cce4a653dcbe6949c72ae7b9f42cab1b929 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Fri, 18 Feb 2011 10:07:25 +0100 Subject: ntp: Remove redundant and incorrect parameter check The ADJ_SETOFFSET code redundantly checks the range of the nanoseconds field of the time value. This field is checked again in the subsequent call to timekeeping_inject_offset(). Also, as is, the check will not detect whether the number of microseconds is out of range. Let timekeeping_inject_offset() do the error checking. Signed-off-by: Richard Cochran Cc: johnstul@us.ibm.com LKML-Reference: <20110218090724.GA2924@riccoc20.at.omicron.at> Signed-off-by: Thomas Gleixner --- kernel/time/ntp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5ac593267a26..5f1bb8e2008f 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -650,13 +650,13 @@ int do_adjtimex(struct timex *txc) if (txc->modes & ADJ_SETOFFSET) { struct timespec delta; - if ((unsigned long)txc->time.tv_usec >= NSEC_PER_SEC) - return -EINVAL; delta.tv_sec = txc->time.tv_sec; delta.tv_nsec = txc->time.tv_usec; if (!(txc->modes & ADJ_NANO)) delta.tv_nsec *= 1000; - timekeeping_inject_offset(&delta); + result = timekeeping_inject_offset(&delta); + if (result) + return result; } getnstimeofday(&ts); -- cgit v1.2.3 From c1ee6264280e740a9d3ff3feef38642cf0a57013 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 17 Feb 2011 17:45:15 +0100 Subject: genirq: Prevent access beyond allocated_irqs bitmap Lars-Peter Clausen pointed out: I stumbled upon this while looking through the existing archs using SPARSE_IRQ. Even with SPARSE_IRQ the NR_IRQS is still the upper limit for the number of IRQs. Both PXA and MMP set NR_IRQS to IRQ_BOARD_START, with IRQ_BOARD_START being the number of IRQs used by the core. In various machine files the nr_irqs field of the ARM machine defintion struct is then set to "IRQ_BOARD_START + NR_BOARD_IRQS". 
As a result "nr_irqs" will greater then NR_IRQS which then again causes the "allocated_irqs" bitmap in the core irq code to be accessed beyond its size overwriting unrelated data. The core code really misses a sanity check there. This went unnoticed so far as by chance the compiler/linker places data behind that bitmap which gets initialized later on those affected platforms. So the obvious fix would be to add a sanity check in early_irq_init() and break all affected platforms. Though that check wants to be backported to stable as well, which will require to fix all known problematic platforms and probably some more yet not known ones as well. Lots of churn. A way simpler solution is to allocate a slightly larger bitmap and avoid the whole churn w/o breaking anything. Add a few warnings when an arch returns utter crap. Reported-by: Lars-Peter Clausen Signed-off-by: Thomas Gleixner Cc: stable@kernel.org # .37 Cc: Haojian Zhuang Cc: Eric Miao Cc: Peter Zijlstra --- kernel/irq/internals.h | 6 ++++++ kernel/irq/irqdesc.c | 11 ++++++++++- kernel/irq/resend.c | 2 +- 3 files changed, 17 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 4571ae7e085a..99c3bc8a6fb4 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -3,6 +3,12 @@ */ #include +#ifdef CONFIG_SPARSE_IRQ +# define IRQ_BITMAP_BITS (NR_IRQS + 8196) +#else +# define IRQ_BITMAP_BITS NR_IRQS +#endif + extern int noirqdebug; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 282f20230e67..2039bea31bdf 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -94,7 +94,7 @@ int nr_irqs = NR_IRQS; EXPORT_SYMBOL_GPL(nr_irqs); static DEFINE_MUTEX(sparse_irq_lock); -static DECLARE_BITMAP(allocated_irqs, NR_IRQS); +static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS); #ifdef CONFIG_SPARSE_IRQ @@ -217,6 +217,15 @@ int __init early_irq_init(void) initcnt = arch_probe_nr_irqs(); printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt); + if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS)) + nr_irqs = IRQ_BITMAP_BITS; + + if (WARN_ON(initcnt > IRQ_BITMAP_BITS)) + initcnt = IRQ_BITMAP_BITS; + + if (initcnt > nr_irqs) + nr_irqs = initcnt; + for (i = 0; i < initcnt; i++) { desc = alloc_desc(i, node); set_bit(i, allocated_irqs); diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 891115a929aa..dc49358b73fa 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -23,7 +23,7 @@ #ifdef CONFIG_HARDIRQS_SW_RESEND /* Bitmap to handle software resend of interrupts: */ -static DECLARE_BITMAP(irqs_resend, NR_IRQS); +static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS); /* * Run software resends of IRQ's -- cgit v1.2.3 From 6d83f94db95cfe65d2a6359cccdf61cf087c2598 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Feb 2011 23:27:23 +0100 Subject: genirq: Disable the SHIRQ_DEBUG call in request_threaded_irq for now With CONFIG_SHIRQ_DEBUG=y we call a newly installed interrupt handler in request_threaded_irq(). The original implementation (commit a304e1b8) called the handler _BEFORE_ it was installed, but that caused problems with handlers calling disable_irq_nosync(). See commit 377bf1e4. It's braindead in the first place to call disable_irq_nosync in shared handlers, but .... Moving this call after we installed the handler looks innocent, but it is very subtle broken on SMP. Interrupt handlers rely on the fact, that the irq core prevents reentrancy. 
Now this debug call violates that promise because we run the handler w/o the IRQ_INPROGRESS protection - which we cannot apply here because that would result in a possibly forever masked interrupt line. A concurrent real hardware interrupt on a different CPU results in handler reentrancy and can lead to complete wreckage, which was unfortunately observed in reality and took a fricking long time to debug. Leave the code here for now. We want this debug feature, but that's not easy to fix. We really should get rid of those disable_irq_nosync() abusers and remove that function completely. Signed-off-by: Thomas Gleixner Cc: Anton Vorontsov Cc: David Woodhouse Cc: Arjan van de Ven Cc: stable@kernel.org # .28 -> .37 --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0caa59f747dd..9033c1c70828 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1100,7 +1100,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, if (retval) kfree(action); -#ifdef CONFIG_DEBUG_SHIRQ +#ifdef CONFIG_DEBUG_SHIRQ_FIXME if (!retval && (irqflags & IRQF_SHARED)) { /* * It's a shared IRQ -- the driver ought to be prepared for it -- cgit v1.2.3 From e7bcecb7b1d29b9ad5af939149a945658620ca8f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Feb 2011 17:12:57 +0100 Subject: genirq: Make nr_irqs runtime expandable We face more and more the requirement to expand nr_irqs at runtime. The reason are irq expanders which can not be detected in the early boot stage. So we speculate nr_irqs to have enough room. Further Xen needs extra irq numbers and we really want to avoid adding more "detection" code into the early boot. There is no real good reason why we need to limit nr_irqs at early boot. Allow the allocation code to expand nr_irqs. We have already 8k extra number space in the allocation bitmap, so lets use it. 
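Illustrative sketch (not part of the patch): with nr_irqs expandable, a late-probing IRQ expander can simply ask the core for a block of descriptors instead of relying on a pre-sized NR_IRQS. The driver function name and the count of 16 are hypothetical; irq_alloc_descs() is the interface touched below.

static int example_expander_alloc_irqs(int node)
{
	/* irq = -1: no fixed number requested, search the bitmap from 0 */
	int base = irq_alloc_descs(-1, 0, 16, node);

	if (base < 0)
		return base;	/* -ENOMEM once the bitmap headroom is exhausted */

	/* base .. base + 15 are now valid interrupt numbers for the expander */
	return base;
}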
Signed-off-by: Thomas Gleixner --- kernel/irq/irqdesc.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index a250d3a0af12..6f6644f819dd 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -206,6 +206,14 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) return NULL; } +static int irq_expand_nr_irqs(unsigned int cnt) +{ + if (nr_irqs + cnt > IRQ_BITMAP_BITS) + return -ENOMEM; + nr_irqs += cnt; + return 0; +} + int __init early_irq_init(void) { int i, initcnt, node = first_online_node; @@ -287,6 +295,12 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) { return start; } + +static int irq_expand_nr_irqs(unsigned int cnt) +{ + return -ENOMEM; +} + #endif /* !CONFIG_SPARSE_IRQ */ /* Dynamic interrupt handling */ @@ -335,9 +349,11 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) if (irq >=0 && start != irq) goto err; - ret = -ENOMEM; - if (start >= nr_irqs) - goto err; + if (start >= nr_irqs) { + ret = irq_expand_nr_irqs(cnt); + if (ret) + goto err; + } bitmap_set(allocated_irqs, start, cnt); mutex_unlock(&sparse_irq_lock); -- cgit v1.2.3 From 43abe43ce0619d744c7a5bb15cce075e532b53b7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 12 Feb 2011 12:10:49 +0100 Subject: genirq: Add missing buslock to set_irq_type(), set_irq_wake() chips behind a slow bus cannot update the chip under desc->lock, but we miss the chip_buslock/chip_bus_sync_unlock() calls around the set type and set wake functions. Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 2 ++ kernel/irq/manage.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index baa5c4acad83..9639ab8bece0 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -65,9 +65,11 @@ int set_irq_type(unsigned int irq, unsigned int type) if (type == IRQ_TYPE_NONE) return 0; + chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); ret = __irq_set_trigger(desc, irq, type); raw_spin_unlock_irqrestore(&desc->lock, flags); + chip_bus_sync_unlock(desc); return ret; } EXPORT_SYMBOL(set_irq_type); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ba84307fbf24..a400db220cf3 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -454,6 +454,7 @@ int set_irq_wake(unsigned int irq, unsigned int on) /* wakeup-capable irqs can be shared between drivers that * don't need to have the same sleep mode behaviors. */ + chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); if (on) { if (desc->wake_depth++ == 0) { @@ -476,6 +477,7 @@ int set_irq_wake(unsigned int irq, unsigned int on) } raw_spin_unlock_irqrestore(&desc->lock, flags); + chip_bus_sync_unlock(desc); return ret; } EXPORT_SYMBOL(set_irq_wake); -- cgit v1.2.3 From a0cd9ca2b907d7ee26575e7b63ac92dad768a75e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Feb 2011 11:36:33 +0100 Subject: genirq: Namespace cleanup The irq namespace has become quite convoluted. My bad. Clean it up and deprecate the old functions. All new functions follow the scheme: irq number based: irq_set/get/xxx/_xxx(unsigned int irq, ...) irq_data based: irq_data_set/get/xxx/_xxx(struct irq_data *d, ....) 
irq_desc based: irq_desc_get_xxx(struct irq_desc *desc) Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 28 ++++++++++++++-------------- kernel/irq/manage.c | 6 +++--- 2 files changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 9639ab8bece0..622b55ac0e09 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -19,11 +19,11 @@ #include "internals.h" /** - * set_irq_chip - set the irq chip for an irq + * irq_set_chip - set the irq chip for an irq * @irq: irq number * @chip: pointer to irq chip description structure */ -int set_irq_chip(unsigned int irq, struct irq_chip *chip) +int irq_set_chip(unsigned int irq, struct irq_chip *chip) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -43,14 +43,14 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) return 0; } -EXPORT_SYMBOL(set_irq_chip); +EXPORT_SYMBOL(irq_set_chip); /** - * set_irq_type - set the irq trigger type for an irq + * irq_set_type - set the irq trigger type for an irq * @irq: irq number * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h */ -int set_irq_type(unsigned int irq, unsigned int type) +int irq_set_irq_type(unsigned int irq, unsigned int type) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -72,16 +72,16 @@ int set_irq_type(unsigned int irq, unsigned int type) chip_bus_sync_unlock(desc); return ret; } -EXPORT_SYMBOL(set_irq_type); +EXPORT_SYMBOL(irq_set_irq_type); /** - * set_irq_data - set irq type data for an irq + * irq_set_handler_data - set irq handler data for an irq * @irq: Interrupt number * @data: Pointer to interrupt specific data * * Set the hardware irq controller data for an irq */ -int set_irq_data(unsigned int irq, void *data) +int irq_set_handler_data(unsigned int irq, void *data) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -97,16 +97,16 @@ int set_irq_data(unsigned int irq, void *data) raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } -EXPORT_SYMBOL(set_irq_data); +EXPORT_SYMBOL(irq_set_handler_data); /** - * set_irq_msi - set MSI descriptor data for an irq + * irq_set_msi_desc - set MSI descriptor data for an irq * @irq: Interrupt number * @entry: Pointer to MSI descriptor data * * Set the MSI descriptor entry for an irq */ -int set_irq_msi(unsigned int irq, struct msi_desc *entry) +int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -126,13 +126,13 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry) } /** - * set_irq_chip_data - set irq chip data for an irq + * irq_set_chip_data - set irq chip data for an irq * @irq: Interrupt number * @data: Pointer to chip specific data * * Set the hardware irq chip data for an irq */ -int set_irq_chip_data(unsigned int irq, void *data) +int irq_set_chip_data(unsigned int irq, void *data) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -154,7 +154,7 @@ int set_irq_chip_data(unsigned int irq, void *data) return 0; } -EXPORT_SYMBOL(set_irq_chip_data); +EXPORT_SYMBOL(irq_set_chip_data); struct irq_data *irq_get_irq_data(unsigned int irq) { diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index a400db220cf3..b1b4da9446e6 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -434,7 +434,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) } /** - * set_irq_wake - control irq power management wakeup + * irq_set_irq_wake - control irq power management wakeup * @irq: 
interrupt to control * @on: enable/disable power management wakeup * @@ -445,7 +445,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) * Wakeup mode lets this IRQ wake the system from sleep * states like "suspend to RAM". */ -int set_irq_wake(unsigned int irq, unsigned int on) +int irq_set_irq_wake(unsigned int irq, unsigned int on) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -480,7 +480,7 @@ int set_irq_wake(unsigned int irq, unsigned int on) chip_bus_sync_unlock(desc); return ret; } -EXPORT_SYMBOL(set_irq_wake); +EXPORT_SYMBOL(irq_set_irq_wake); /* * Internal function that tells the architecture code whether a -- cgit v1.2.3 From 1fa46f1f070961783661ae640cd2f6b2557f3885 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 16:46:58 +0100 Subject: genirq: Simplify affinity related code There is lot of #ifdef CONFIG_GENERIC_PENDING_IRQ along with duplicated code in the irq core. Move the #ifdeffery into one place and cleanup the code so it's readable. No functional change. Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 64 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index b1b4da9446e6..99f3e9a3780c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -100,47 +100,70 @@ void irq_set_thread_affinity(struct irq_desc *desc) } } +#ifdef CONFIG_GENERIC_PENDING_IRQ +static inline bool irq_can_move_pcntxt(struct irq_desc *desc) +{ + return desc->status & IRQ_MOVE_PCNTXT; +} +static inline bool irq_move_pending(struct irq_desc *desc) +{ + return desc->status & IRQ_MOVE_PENDING; +} +static inline void +irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) +{ + cpumask_copy(desc->pending_mask, mask); +} +static inline void +irq_get_pending(struct cpumask *mask, struct irq_desc *desc) +{ + cpumask_copy(mask, desc->pending_mask); +} +#else +static inline bool irq_can_move_pcntxt(struct irq_desc *desc) { return true; } +static inline bool irq_move_pending(struct irq_desc *desc) { return false; } +static inline void +irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { } +static inline void +irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } +#endif + /** * irq_set_affinity - Set the irq affinity of a given irq * @irq: Interrupt to set affinity * @cpumask: cpumask * */ -int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) +int irq_set_affinity(unsigned int irq, const struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); struct irq_chip *chip = desc->irq_data.chip; unsigned long flags; + int ret = 0; if (!chip->irq_set_affinity) return -EINVAL; raw_spin_lock_irqsave(&desc->lock, flags); -#ifdef CONFIG_GENERIC_PENDING_IRQ - if (desc->status & IRQ_MOVE_PCNTXT) { - if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) { - cpumask_copy(desc->irq_data.affinity, cpumask); + if (irq_can_move_pcntxt(desc)) { + ret = chip->irq_set_affinity(&desc->irq_data, mask, false); + if (!ret) { + cpumask_copy(desc->irq_data.affinity, mask); irq_set_thread_affinity(desc); } - } - else { + } else { desc->status |= IRQ_MOVE_PENDING; - cpumask_copy(desc->pending_mask, cpumask); + irq_copy_pending(desc, mask); } -#else - if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) { - cpumask_copy(desc->irq_data.affinity, cpumask); - irq_set_thread_affinity(desc); - } -#endif + if (desc->affinity_notify) { 
kref_get(&desc->affinity_notify->kref); schedule_work(&desc->affinity_notify->work); } desc->status |= IRQ_AFFINITY_SET; raw_spin_unlock_irqrestore(&desc->lock, flags); - return 0; + return ret; } int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) @@ -167,18 +190,13 @@ static void irq_affinity_notify(struct work_struct *work) cpumask_var_t cpumask; unsigned long flags; - if (!desc) - goto out; - - if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) + if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL)) goto out; raw_spin_lock_irqsave(&desc->lock, flags); -#ifdef CONFIG_GENERIC_PENDING_IRQ - if (desc->status & IRQ_MOVE_PENDING) - cpumask_copy(cpumask, desc->pending_mask); + if (irq_move_pending(desc)) + irq_get_pending(cpumask, desc); else -#endif cpumask_copy(cpumask, desc->irq_data.affinity); raw_spin_unlock_irqrestore(&desc->lock, flags); -- cgit v1.2.3 From b008207cbd0d5ce606a1a2ac52826e0ab37d0b99 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 17:30:50 +0100 Subject: genirq: Rremove redundant check IRQ_NO_BALANCING is already checked in irq_can_set_affinity() above, no need to check it again. Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 99f3e9a3780c..591c927b135c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -256,6 +256,7 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); */ static int setup_affinity(unsigned int irq, struct irq_desc *desc) { + /* Excludes PER_CPU and NO_BALANCE interrupts */ if (!irq_can_set_affinity(irq)) return 0; @@ -263,7 +264,7 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc) * Preserve an userspace affinity setup, but make sure that * one of the targets is online. */ - if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { + if (desc->status & (IRQ_AFFINITY_SET)) { if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask) < nr_cpu_ids) goto set_affinity; -- cgit v1.2.3 From 569bda8df11effa03e618729293c7961696abb10 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 17:05:08 +0100 Subject: genirq: Always apply cpu online mask If the affinity had been set by the user, then a later request_irq() will honour that setting. But online cpus can have changed. So apply the online mask and for this case as well. Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 591c927b135c..ade65bfb466d 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -256,6 +256,8 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); */ static int setup_affinity(unsigned int irq, struct irq_desc *desc) { + struct cpumask *set = irq_default_affinity; + /* Excludes PER_CPU and NO_BALANCE interrupts */ if (!irq_can_set_affinity(irq)) return 0; @@ -265,15 +267,13 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc) * one of the targets is online. 
*/ if (desc->status & (IRQ_AFFINITY_SET)) { - if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask) - < nr_cpu_ids) - goto set_affinity; + if (cpumask_intersects(desc->irq_data.affinity, + cpu_online_mask)) + set = desc->irq_data.affinity; else desc->status &= ~IRQ_AFFINITY_SET; } - - cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity); -set_affinity: + cpumask_and(desc->irq_data.affinity, cpu_online_mask, set); desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false); return 0; -- cgit v1.2.3 From 3b8249e759c701c4a82f99d957be651a7657bf6f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 16:02:20 +0100 Subject: genirq: Do not copy affinity before set While rumaging through arch code I found that there are a few workarounds which deal with the fact that the initial affinity setting from request_irq() copies the mask into irq_data->affinity before the chip code is called. In the normal path we unconditionally copy the mask when the chip code returns 0. Copy after the code is called and add a return code IRQ_SET_MASK_OK_NOCOPY for the chip functions, which prevents the copy. That way we see the real mask when the chip function decided to truncate it further as some arches do. IRQ_SET_MASK_OK is 0, which is the current behaviour. Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 2 +- kernel/irq/manage.c | 47 ++++++++++++++++++++++++++++++++++------------- kernel/irq/proc.c | 2 +- 3 files changed, 36 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 99c3bc8a6fb4..b5bfa24aa6a6 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -43,7 +43,7 @@ static inline void unregister_handler_proc(unsigned int irq, struct irqaction *action) { } #endif -extern int irq_select_affinity_usr(unsigned int irq); +extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); extern void irq_set_thread_affinity(struct irq_desc *desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ade65bfb466d..dc95d53df510 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -148,9 +148,12 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask) if (irq_can_move_pcntxt(desc)) { ret = chip->irq_set_affinity(&desc->irq_data, mask, false); - if (!ret) { + switch (ret) { + case IRQ_SET_MASK_OK: cpumask_copy(desc->irq_data.affinity, mask); + case IRQ_SET_MASK_OK_NOCOPY: irq_set_thread_affinity(desc); + ret = 0; } } else { desc->status |= IRQ_MOVE_PENDING; @@ -254,9 +257,12 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); /* * Generic version of the affinity autoselector. 
*/ -static int setup_affinity(unsigned int irq, struct irq_desc *desc) +static int +setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) { + struct irq_chip *chip = get_irq_desc_chip(desc); struct cpumask *set = irq_default_affinity; + int ret; /* Excludes PER_CPU and NO_BALANCE interrupts */ if (!irq_can_set_affinity(irq)) @@ -273,13 +279,20 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc) else desc->status &= ~IRQ_AFFINITY_SET; } - cpumask_and(desc->irq_data.affinity, cpu_online_mask, set); - desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false); + cpumask_and(mask, cpu_online_mask, set); + ret = chip->irq_set_affinity(&desc->irq_data, mask, false); + switch (ret) { + case IRQ_SET_MASK_OK: + cpumask_copy(desc->irq_data.affinity, mask); + case IRQ_SET_MASK_OK_NOCOPY: + irq_set_thread_affinity(desc); + } return 0; } #else -static inline int setup_affinity(unsigned int irq, struct irq_desc *d) +static inline int +setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask) { return irq_select_affinity(irq); } @@ -288,23 +301,23 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *d) /* * Called when affinity is set via /proc/irq */ -int irq_select_affinity_usr(unsigned int irq) +int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; int ret; raw_spin_lock_irqsave(&desc->lock, flags); - ret = setup_affinity(irq, desc); + ret = setup_affinity(irq, desc, mask); if (!ret) irq_set_thread_affinity(desc); raw_spin_unlock_irqrestore(&desc->lock, flags); - return ret; } #else -static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) +static inline int +setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) { return 0; } @@ -765,8 +778,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) struct irqaction *old, **old_ptr; const char *old_name = NULL; unsigned long flags; - int nested, shared = 0; - int ret; + int ret, nested, shared = 0; + cpumask_var_t mask; if (!desc) return -EINVAL; @@ -831,6 +844,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) new->thread = t; } + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { + ret = -ENOMEM; + goto out_thread; + } + /* * The following block of code has to be executed atomically */ @@ -876,7 +894,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) new->flags & IRQF_TRIGGER_MASK); if (ret) - goto out_thread; + goto out_mask; } else compat_irq_chip_set_default_handler(desc); #if defined(CONFIG_IRQ_PER_CPU) @@ -903,7 +921,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) desc->status |= IRQ_NO_BALANCING; /* Set default affinity mask once everything is setup */ - setup_affinity(irq, desc); + setup_affinity(irq, desc, mask); } else if ((new->flags & IRQF_TRIGGER_MASK) && (new->flags & IRQF_TRIGGER_MASK) @@ -956,6 +974,9 @@ mismatch: #endif ret = -EBUSY; +out_mask: + free_cpumask_var(mask); + out_thread: raw_spin_unlock_irqrestore(&desc->lock, flags); if (new->thread) { diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 6c8a2a9f8a7b..a46bd762db47 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -89,7 +89,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, if (!cpumask_intersects(new_value, cpu_online_mask)) { /* Special case for empty set - allow the architecture code to set default SMP affinity. 
*/ - err = irq_select_affinity_usr(irq) ? -EINVAL : count; + err = irq_select_affinity_usr(irq, new_value) ? -EINVAL : count; } else { irq_set_affinity(irq, new_value); err = count; -- cgit v1.2.3 From 2b879eaf095878430c38cbd95e5c0fc4ce65ad8e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Feb 2011 11:25:02 +0100 Subject: genirq: Remove redundant thread affinity setting Thread affinity is already set by setup_affinity(). Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index dc95d53df510..33a6ee0ac68f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -309,8 +309,6 @@ int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask) raw_spin_lock_irqsave(&desc->lock, flags); ret = setup_affinity(irq, desc, mask); - if (!ret) - irq_set_thread_affinity(desc); raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } -- cgit v1.2.3 From 1082687e8d6292a61759eb83358e7db39fed1bf4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 09:05:05 +0100 Subject: genirq: Plug race in report_bad_irq() We cannot walk the action chain unlocked. Even if IRQ_INPROGRESS is set an action can be removed and we follow a null pointer. It's safe to take the lock there, because the code which removes the action will call synchronize_irq() which waits unlocked for IRQ_INPROGRESS going away. Signed-off-by: Thomas Gleixner --- kernel/irq/spurious.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 3089d3b9d5f3..2fbfda2716e1 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -139,15 +139,13 @@ static void poll_spurious_irqs(unsigned long dummy) * * (The other 100-of-100,000 interrupts may have been a correctly * functioning device sharing an IRQ with the failing one) - * - * Called under desc->lock */ - static void __report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) { struct irqaction *action; + unsigned long flags; if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { printk(KERN_ERR "irq event %d: bogus return value %x\n", @@ -159,6 +157,13 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, dump_stack(); printk(KERN_ERR "handlers:\n"); + /* + * We need to take desc->lock here. note_interrupt() is called + * w/o desc->lock held, but IRQ_PROGRESS set. We might race + * with something else removing an action. It's ok to take + * desc->lock here. See synchronize_irq(). + */ + raw_spin_lock_irqsave(&desc->lock, flags); action = desc->action; while (action) { printk(KERN_ERR "[<%p>]", action->handler); @@ -167,6 +172,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, printk("\n"); action = action->next; } + raw_spin_unlock_irqrestore(&desc->lock, flags); } static void -- cgit v1.2.3 From b738a50a202639614c98b5763b01bf9201779e50 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Feb 2011 23:58:19 +0100 Subject: genirq: Warn when handler enables interrupts We run all handlers with interrupts disabled and expect them not to enable them. Warn when we catch one who does. 
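For illustration only (hypothetical driver code, not from this patch): a handler like the one below re-enables interrupts from hard irq context and would now trip the WARN_ON_ONCE added in handle_IRQ_event(), after which the core turns interrupts back off.

static irqreturn_t broken_example_handler(int irq, void *dev_id)
{
	local_irq_enable();	/* wrong: handlers are entered with interrupts disabled */
	/* ... hypothetical device handling ... */
	return IRQ_HANDLED;
}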
Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 3540a7190122..cdd6fbbe771c 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -68,6 +68,9 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) ret = action->handler(irq, action->dev_id); trace_irq_handler_exit(irq, action, ret); + if (WARN_ON_ONCE(!irqs_disabled())) + local_irq_disable(); + switch (ret) { case IRQ_WAKE_THREAD: /* @@ -114,7 +117,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) if (status & IRQF_SAMPLE_RANDOM) add_interrupt_randomness(irq); - local_irq_disable(); return retval; } -- cgit v1.2.3 From fa27271bc8d230355c1f24ddea103824fdc12de6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 09:10:39 +0100 Subject: genirq: Fixup poll handling try_one_irq() contains redundant code and lots of useless checks for shared interrupts. Check for shared before setting IRQ_INPROGRESS and then call handle_IRQ_event() while pending. Shorter version with the same functionality. Signed-off-by: Thomas Gleixner --- kernel/irq/spurious.c | 50 +++++++++++++++++++------------------------------- 1 file changed, 19 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 2fbfda2716e1..0af9e59c82eb 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -42,48 +42,36 @@ static int try_one_irq(int irq, struct irq_desc *desc) raw_spin_unlock(&desc->lock); return ok; } - /* Honour the normal IRQ locking */ - desc->status |= IRQ_INPROGRESS; - action = desc->action; - raw_spin_unlock(&desc->lock); - - while (action) { - /* Only shared IRQ handlers are safe to call */ - if (action->flags & IRQF_SHARED) { - if (action->handler(irq, action->dev_id) == - IRQ_HANDLED) - ok = 1; - } - action = action->next; - } - local_irq_disable(); - /* Now clean up the flags */ - raw_spin_lock(&desc->lock); - action = desc->action; - /* - * While we were looking for a fixup someone queued a real - * IRQ clashing with our walk: + * All handlers must agree on IRQF_SHARED, so we test just the + * first. Check for action->next as well. */ - while ((desc->status & IRQ_PENDING) && action) { - /* - * Perform real IRQ processing for the IRQ we deferred - */ - work = 1; + action = desc->action; + if (!action || !(action->flags & IRQF_SHARED) || !action->next) + goto out; + + /* Honour the normal IRQ locking */ + desc->status |= IRQ_INPROGRESS; + do { + work++; + desc->status &= ~IRQ_PENDING; raw_spin_unlock(&desc->lock); - handle_IRQ_event(irq, action); + if (handle_IRQ_event(irq, action) != IRQ_NONE) + ok = 1; raw_spin_lock(&desc->lock); - desc->status &= ~IRQ_PENDING; - } + action = desc->action; + } while ((desc->status & IRQ_PENDING) && action); + desc->status &= ~IRQ_INPROGRESS; /* * If we did actual work for the real IRQ line we must let the * IRQ controller clean up too */ - if (work) + if (work > 1) irq_end(irq, desc); - raw_spin_unlock(&desc->lock); +out: + raw_spin_unlock(&desc->lock); return ok; } -- cgit v1.2.3 From c7259cd7af757ddcd65701c37099dcddae2054f0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 09:52:27 +0100 Subject: genirq: Do not poll disabled, percpu and timer interrupts There is no point in polling disabled lines. percpu does not make sense at all because we only poll on the cpu we're currently running on. 
Also polling per_cpu interrupts is racy as hell. The handler runs without locking so we might get a huge surprise. If the timer interrupt needs polling, then we wont get there anyway. Signed-off-by: Thomas Gleixner --- kernel/irq/spurious.c | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 0af9e59c82eb..bd0e42d3e0ba 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -25,30 +25,42 @@ static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); /* * Recovery handler for misrouted interrupts. */ -static int try_one_irq(int irq, struct irq_desc *desc) +static int try_one_irq(int irq, struct irq_desc *desc, bool force) { struct irqaction *action; int ok = 0, work = 0; raw_spin_lock(&desc->lock); + + /* PER_CPU and nested thread interrupts are never polled */ + if (desc->status & (IRQ_PER_CPU | IRQ_NESTED_THREAD)) + goto out; + + /* + * Do not poll disabled interrupts unless the spurious + * disabled poller asks explicitely. + */ + if ((desc->status & IRQ_DISABLED) && !force) + goto out; + + /* + * All handlers must agree on IRQF_SHARED, so we test just the + * first. Check for action->next as well. + */ + action = desc->action; + if (!action || !(action->flags & IRQF_SHARED) || + (action->flags & __IRQF_TIMER) || !action->next) + goto out; + /* Already running on another processor */ if (desc->status & IRQ_INPROGRESS) { /* * Already running: If it is shared get the other * CPU to go looking for our mystery interrupt too */ - if (desc->action && (desc->action->flags & IRQF_SHARED)) - desc->status |= IRQ_PENDING; - raw_spin_unlock(&desc->lock); - return ok; - } - /* - * All handlers must agree on IRQF_SHARED, so we test just the - * first. Check for action->next as well. - */ - action = desc->action; - if (!action || !(action->flags & IRQF_SHARED) || !action->next) + desc->status |= IRQ_PENDING; goto out; + } /* Honour the normal IRQ locking */ desc->status |= IRQ_INPROGRESS; @@ -87,7 +99,7 @@ static int misrouted_irq(int irq) if (i == irq) /* Already tried */ continue; - if (try_one_irq(i, desc)) + if (try_one_irq(i, desc, false)) ok = 1; } /* So the caller can adjust the irq error counts */ @@ -112,7 +124,7 @@ static void poll_spurious_irqs(unsigned long dummy) continue; local_irq_disable(); - try_one_irq(i, desc); + try_one_irq(i, desc, true); local_irq_enable(); } -- cgit v1.2.3 From d05c65fff0ef672be75429266751f0e015b54d94 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 14:31:37 +0100 Subject: genirq: spurious: Run only one poller at a time No point in running concurrent pollers which confuse each other by setting PENDING. Signed-off-by: Thomas Gleixner --- kernel/irq/spurious.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index bd0e42d3e0ba..56ff8fffb8b0 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -21,6 +21,8 @@ static int irqfixup __read_mostly; #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) static void poll_spurious_irqs(unsigned long dummy); static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); +static int irq_poll_cpu; +static atomic_t irq_poll_active; /* * Recovery handler for misrouted interrupts. 
@@ -92,6 +94,11 @@ static int misrouted_irq(int irq) struct irq_desc *desc; int i, ok = 0; + if (atomic_inc_return(&irq_poll_active) == 1) + goto out; + + irq_poll_cpu = smp_processor_id(); + for_each_irq_desc(i, desc) { if (!i) continue; @@ -102,6 +109,8 @@ static int misrouted_irq(int irq) if (try_one_irq(i, desc, false)) ok = 1; } +out: + atomic_dec(&irq_poll_active); /* So the caller can adjust the irq error counts */ return ok; } @@ -111,6 +120,10 @@ static void poll_spurious_irqs(unsigned long dummy) struct irq_desc *desc; int i; + if (atomic_inc_return(&irq_poll_active) != 1) + goto out; + irq_poll_cpu = smp_processor_id(); + for_each_irq_desc(i, desc) { unsigned int status; @@ -127,7 +140,8 @@ static void poll_spurious_irqs(unsigned long dummy) try_one_irq(i, desc, true); local_irq_enable(); } - +out: + atomic_dec(&irq_poll_active); mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); } -- cgit v1.2.3 From fe200ae48ef5c79bf7941fe8046ff9505c570ff6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 10:34:30 +0100 Subject: genirq: Mark polled irqs and defer the real handler With the chip.end() function gone we might run into a situation where a poll call runs and the real interrupt comes in, sees IRQ_INPROGRESS and disables the line. That might be a perfect working one, which will then be masked forever. So mark them polled while the poll runs. When the real handler sees IRQ_INPROGRESS it checks the poll flag and waits for the polling to complete. Add the necessary amount of sanity checks to it to avoid deadlocks. Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 26 +++++++++++++++++++------ kernel/irq/internals.h | 11 +---------- kernel/irq/spurious.c | 51 ++++++++++++++++++++++++++++++++++++++------------ 3 files changed, 60 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 622b55ac0e09..31258782742c 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -448,6 +448,13 @@ out_unlock: } EXPORT_SYMBOL_GPL(handle_nested_irq); +static bool irq_check_poll(struct irq_desc *desc) +{ + if (!(desc->status & IRQ_POLL_INPROGRESS)) + return false; + return irq_wait_for_poll(desc); +} + /** * handle_simple_irq - Simple and software-decoded IRQs. 
* @irq: the interrupt number @@ -469,7 +476,9 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); if (unlikely(desc->status & IRQ_INPROGRESS)) - goto out_unlock; + if (!irq_check_poll(desc)) + goto out_unlock; + desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); kstat_incr_irqs_this_cpu(irq, desc); @@ -510,7 +519,9 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) mask_ack_irq(desc); if (unlikely(desc->status & IRQ_INPROGRESS)) - goto out_unlock; + if (!irq_check_poll(desc)) + goto out_unlock; + desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); kstat_incr_irqs_this_cpu(irq, desc); @@ -558,7 +569,8 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); if (unlikely(desc->status & IRQ_INPROGRESS)) - goto out; + if (!irq_check_poll(desc)) + goto out; desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); kstat_incr_irqs_this_cpu(irq, desc); @@ -620,9 +632,11 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) */ if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || !desc->action)) { - desc->status |= (IRQ_PENDING | IRQ_MASKED); - mask_ack_irq(desc); - goto out_unlock; + if (!irq_check_poll(desc)) { + desc->status |= (IRQ_PENDING | IRQ_MASKED); + mask_ack_irq(desc); + goto out_unlock; + } } kstat_incr_irqs_this_cpu(irq, desc); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index b5bfa24aa6a6..0eff7e92b1a9 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -28,6 +28,7 @@ extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); /* Resending of interrupts :*/ void check_irq_resend(struct irq_desc *desc, unsigned int irq); +bool irq_wait_for_poll(struct irq_desc *desc); #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); @@ -47,16 +48,6 @@ extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); extern void irq_set_thread_affinity(struct irq_desc *desc); -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED -static inline void irq_end(unsigned int irq, struct irq_desc *desc) -{ - if (desc->irq_data.chip && desc->irq_data.chip->end) - desc->irq_data.chip->end(irq); -} -#else -static inline void irq_end(unsigned int irq, struct irq_desc *desc) { } -#endif - /* Inline functions for support of irq chips on slow busses */ static inline void chip_bus_lock(struct irq_desc *desc) { diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 56ff8fffb8b0..f749d29bfd81 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -24,13 +24,45 @@ static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); static int irq_poll_cpu; static atomic_t irq_poll_active; +/* + * We wait here for a poller to finish. + * + * If the poll runs on this CPU, then we yell loudly and return + * false. That will leave the interrupt line disabled in the worst + * case, but it should never happen. + * + * We wait until the poller is done and then recheck disabled and + * action (about to be disabled). Only if it's still active, we return + * true and let the handler run. 
+ */ +bool irq_wait_for_poll(struct irq_desc *desc) +{ + if (WARN_ONCE(irq_poll_cpu == smp_processor_id(), + "irq poll in progress on cpu %d for irq %d\n", + smp_processor_id(), desc->irq_data.irq)) + return false; + +#ifdef CONFIG_SMP + do { + raw_spin_unlock(&desc->lock); + while (desc->status & IRQ_INPROGRESS) + cpu_relax(); + raw_spin_lock(&desc->lock); + } while (desc->status & IRQ_INPROGRESS); + /* Might have been disabled in meantime */ + return !(desc->status & IRQ_DISABLED) && desc->action; +#else + return false; +#endif +} + /* * Recovery handler for misrouted interrupts. */ static int try_one_irq(int irq, struct irq_desc *desc, bool force) { struct irqaction *action; - int ok = 0, work = 0; + int ok = 0; raw_spin_lock(&desc->lock); @@ -64,10 +96,9 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) goto out; } - /* Honour the normal IRQ locking */ - desc->status |= IRQ_INPROGRESS; + /* Honour the normal IRQ locking and mark it poll in progress */ + desc->status |= IRQ_INPROGRESS | IRQ_POLL_INPROGRESS; do { - work++; desc->status &= ~IRQ_PENDING; raw_spin_unlock(&desc->lock); if (handle_IRQ_event(irq, action) != IRQ_NONE) @@ -76,14 +107,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) action = desc->action; } while ((desc->status & IRQ_PENDING) && action); - desc->status &= ~IRQ_INPROGRESS; - /* - * If we did actual work for the real IRQ line we must let the - * IRQ controller clean up too - */ - if (work > 1) - irq_end(irq, desc); - + desc->status &= ~(IRQ_INPROGRESS | IRQ_POLL_INPROGRESS); out: raw_spin_unlock(&desc->lock); return ok; @@ -238,6 +262,9 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc, void note_interrupt(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) { + if (desc->status & IRQ_POLL_INPROGRESS) + return; + if (unlikely(action_ret != IRQ_HANDLED)) { /* * If we are seeing only the odd spurious IRQ caused by -- cgit v1.2.3 From 1535dfacbf21c4da1b73fcf07c39913da5bd5581 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 01:55:43 +0100 Subject: genirq: Move irq thread flags to core Soleley used in core code. Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 0eff7e92b1a9..b17c98440400 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -11,6 +11,20 @@ extern int noirqdebug; +/* + * Bits used by threaded handlers: + * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run + * IRQTF_DIED - handler thread died + * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed + * IRQTF_AFFINITY - irq thread is requested to adjust affinity + */ +enum { + IRQTF_RUNTHREAD, + IRQTF_DIED, + IRQTF_WARNED, + IRQTF_AFFINITY, +}; + #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) /* Set default functions for irq_chip structures: */ -- cgit v1.2.3 From 3b56f0585fd4c02d047dc406668cb40159b2d340 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Feb 2011 21:41:12 +0000 Subject: genirq: Remove bogus conditional The if (chip->irq_shutdown) check will always evaluate to true, as we fill in chip->irq_shutdown with default_shutdown in irq_chip_set_defaults() if the chip does not provide its own function. 
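A simplified sketch of the backfill pattern the message refers to (the real irq_chip_set_defaults() picks between the chip's irq_disable and default_shutdown, but the effect is the same: the callback is never left NULL, so the removed check was always true):

static void example_chip_set_defaults(struct irq_chip *chip)
{
	if (!chip->irq_shutdown)
		chip->irq_shutdown = default_shutdown;	/* always non-NULL afterwards */
}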
Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20110202212551.667607458@linutronix.de> --- kernel/irq/manage.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 33a6ee0ac68f..30bc8de40905 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1057,10 +1057,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) /* If this was the last handler, shut down the IRQ line: */ if (!desc->action) { desc->status |= IRQ_DISABLED; - if (desc->irq_data.chip->irq_shutdown) - desc->irq_data.chip->irq_shutdown(&desc->irq_data); - else - desc->irq_data.chip->irq_disable(&desc->irq_data); + desc->irq_data.chip->irq_shutdown(&desc->irq_data); } #ifdef CONFIG_SMP -- cgit v1.2.3 From 4699923861513671d3f6ade8efb4e56a9a7ecadf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Feb 2011 21:41:14 +0000 Subject: genirq: Consolidate startup/shutdown of interrupts Aside of duplicated code some of the startup/shutdown sites do not handle the MASKED/DISABLED flags and the depth field at all. Move that to a helper function and take care of it there. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20110202212551.787481468@linutronix.de> --- kernel/irq/autoprobe.c | 10 +++++----- kernel/irq/chip.c | 37 ++++++++++++++++++++----------------- kernel/irq/internals.h | 3 +++ kernel/irq/manage.c | 14 +++++--------- 4 files changed, 33 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 505798f86c36..08947cb61725 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -60,7 +60,7 @@ unsigned long probe_irq_on(void) if (desc->irq_data.chip->irq_set_type) desc->irq_data.chip->irq_set_type(&desc->irq_data, IRQ_TYPE_PROBE); - desc->irq_data.chip->irq_startup(&desc->irq_data); + irq_startup(desc); } raw_spin_unlock_irq(&desc->lock); } @@ -77,7 +77,7 @@ unsigned long probe_irq_on(void) raw_spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { desc->status |= IRQ_AUTODETECT | IRQ_WAITING; - if (desc->irq_data.chip->irq_startup(&desc->irq_data)) + if (irq_startup(desc)) desc->status |= IRQ_PENDING; } raw_spin_unlock_irq(&desc->lock); @@ -99,7 +99,7 @@ unsigned long probe_irq_on(void) /* It triggered already - consider it spurious. 
*/ if (!(status & IRQ_WAITING)) { desc->status = status & ~IRQ_AUTODETECT; - desc->irq_data.chip->irq_shutdown(&desc->irq_data); + irq_shutdown(desc); } else if (i < 32) mask |= 1 << i; @@ -138,7 +138,7 @@ unsigned int probe_irq_mask(unsigned long val) mask |= 1 << i; desc->status = status & ~IRQ_AUTODETECT; - desc->irq_data.chip->irq_shutdown(&desc->irq_data); + irq_shutdown(desc); } raw_spin_unlock_irq(&desc->lock); } @@ -182,7 +182,7 @@ int probe_irq_off(unsigned long val) nr_of_irqs++; } desc->status = status & ~IRQ_AUTODETECT; - desc->irq_data.chip->irq_shutdown(&desc->irq_data); + irq_shutdown(desc); } raw_spin_unlock_irq(&desc->lock); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 31258782742c..988fe7a24282 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -192,6 +192,25 @@ void set_irq_nested_thread(unsigned int irq, int nest) } EXPORT_SYMBOL_GPL(set_irq_nested_thread); +int irq_startup(struct irq_desc *desc) +{ + desc->status &= ~(IRQ_MASKED | IRQ_DISABLED); + desc->depth = 0; + + if (desc->irq_data.chip->irq_startup) + return desc->irq_data.chip->irq_startup(&desc->irq_data); + + desc->irq_data.chip->irq_enable(&desc->irq_data); + return 0; +} + +void irq_shutdown(struct irq_desc *desc) +{ + desc->status |= IRQ_MASKED | IRQ_DISABLED; + desc->depth = 1; + desc->irq_data.chip->irq_shutdown(&desc->irq_data); +} + /* * default enable function */ @@ -210,17 +229,6 @@ static void default_disable(struct irq_data *data) { } -/* - * default startup function - */ -static unsigned int default_startup(struct irq_data *data) -{ - struct irq_desc *desc = irq_data_to_desc(data); - - desc->irq_data.chip->irq_enable(data); - return 0; -} - /* * default shutdown function */ @@ -229,7 +237,6 @@ static void default_shutdown(struct irq_data *data) struct irq_desc *desc = irq_data_to_desc(data); desc->irq_data.chip->irq_mask(&desc->irq_data); - desc->status |= IRQ_MASKED; } #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED @@ -337,8 +344,6 @@ void irq_chip_set_defaults(struct irq_chip *chip) chip->irq_enable = default_enable; if (!chip->irq_disable) chip->irq_disable = default_disable; - if (!chip->irq_startup) - chip->irq_startup = default_startup; /* * We use chip->irq_disable, when the user provided its own. 
When * we have default_disable set for chip->irq_disable, then we need @@ -747,10 +752,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, desc->name = name; if (handle != handle_bad_irq && is_chained) { - desc->status &= ~IRQ_DISABLED; desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; - desc->depth = 0; - desc->irq_data.chip->irq_startup(&desc->irq_data); + irq_startup(desc); } raw_spin_unlock_irqrestore(&desc->lock, flags); chip_bus_sync_unlock(desc); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index b17c98440400..5cbfc93ed7b1 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -38,6 +38,9 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); +extern int irq_startup(struct irq_desc *desc); +extern void irq_shutdown(struct irq_desc *desc); + extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); /* Resending of interrupts :*/ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 30bc8de40905..9c562477e28b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -906,11 +906,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (new->flags & IRQF_ONESHOT) desc->status |= IRQ_ONESHOT; - if (!(desc->status & IRQ_NOAUTOEN)) { - desc->depth = 0; - desc->status &= ~IRQ_DISABLED; - desc->irq_data.chip->irq_startup(&desc->irq_data); - } else + if (!(desc->status & IRQ_NOAUTOEN)) + irq_startup(desc); + else /* Undo nested disables: */ desc->depth = 1; @@ -1055,10 +1053,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) #endif /* If this was the last handler, shut down the IRQ line: */ - if (!desc->action) { - desc->status |= IRQ_DISABLED; - desc->irq_data.chip->irq_shutdown(&desc->irq_data); - } + if (!desc->action) + irq_shutdown(desc); #ifdef CONFIG_SMP /* make sure affinity_hint is cleaned up */ -- cgit v1.2.3 From 87923470c712dff00b101ffb6b6fbc27bd7a6df5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 3 Feb 2011 12:27:44 +0100 Subject: genirq: Consolidate disable/enable Create irq_disable/enable and use them to keep the flags consistent. 
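Roughly, as a sketch (the caller here is hypothetical), core call sites stop poking the chip callbacks directly and go through the new helpers, which gives one place to keep desc->status consistent:

static void example_core_path(struct irq_desc *desc)
{
	irq_disable(desc);	/* was: desc->irq_data.chip->irq_disable(&desc->irq_data) */
	/* ... */
	irq_enable(desc);	/* was: desc->irq_data.chip->irq_enable(&desc->irq_data) */
}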
Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 12 +++++++++++- kernel/irq/internals.h | 2 ++ kernel/irq/manage.c | 2 +- kernel/irq/resend.c | 10 +++++----- kernel/irq/spurious.c | 2 +- 5 files changed, 20 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 988fe7a24282..86c8e42f7fe4 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -200,7 +200,7 @@ int irq_startup(struct irq_desc *desc) if (desc->irq_data.chip->irq_startup) return desc->irq_data.chip->irq_startup(&desc->irq_data); - desc->irq_data.chip->irq_enable(&desc->irq_data); + irq_enable(desc); return 0; } @@ -211,6 +211,16 @@ void irq_shutdown(struct irq_desc *desc) desc->irq_data.chip->irq_shutdown(&desc->irq_data); } +void irq_enable(struct irq_desc *desc) +{ + desc->irq_data.chip->irq_enable(&desc->irq_data); +} + +void irq_disable(struct irq_desc *desc) +{ + desc->irq_data.chip->irq_disable(&desc->irq_data); +} + /* * default enable function */ diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 5cbfc93ed7b1..c71fc4de0371 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -40,6 +40,8 @@ extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); extern int irq_startup(struct irq_desc *desc); extern void irq_shutdown(struct irq_desc *desc); +extern void irq_enable(struct irq_desc *desc); +extern void irq_disable(struct irq_desc *desc); extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9c562477e28b..007802399697 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -331,7 +331,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) if (!desc->depth++) { desc->status |= IRQ_DISABLED; - desc->irq_data.chip->irq_disable(&desc->irq_data); + irq_disable(desc); } } diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index dc49358b73fa..4bfe268dffe5 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -55,20 +55,20 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); */ void check_irq_resend(struct irq_desc *desc, unsigned int irq) { - unsigned int status = desc->status; - /* * Make sure the interrupt is enabled, before resending it: */ - desc->irq_data.chip->irq_enable(&desc->irq_data); + irq_enable(desc); /* * We do not resend level type interrupts. Level type * interrupts are resent by hardware when they are still * active. 
*/ - if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { - desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; + if (desc->status & IRQ_LEVEL) + return; + if ((desc->status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { + desc->status = (desc->status & ~IRQ_PENDING) | IRQ_REPLAY; if (!desc->irq_data.chip->irq_retrigger || !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index f749d29bfd81..c300b8f6008d 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -303,7 +303,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, printk(KERN_EMERG "Disabling IRQ #%d\n", irq); desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; desc->depth++; - desc->irq_data.chip->irq_disable(&desc->irq_data); + irq_disable(desc); mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); -- cgit v1.2.3 From 50f7c0327513d5acefbe26fd33498af18d1ffac5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 3 Feb 2011 13:23:54 +0100 Subject: genirq: Remove default magic Now that everything uses the wrappers, we can remove the default functions. None of those functions is performance critical. That makes the IRQ_MASKED flag tracking fully consistent. Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 73 +++++++++++------------------------------------------ kernel/irq/manage.c | 4 +-- 2 files changed, 17 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 86c8e42f7fe4..1a239a83f925 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -208,45 +208,29 @@ void irq_shutdown(struct irq_desc *desc) { desc->status |= IRQ_MASKED | IRQ_DISABLED; desc->depth = 1; - desc->irq_data.chip->irq_shutdown(&desc->irq_data); + if (desc->irq_data.chip->irq_shutdown) + desc->irq_data.chip->irq_shutdown(&desc->irq_data); + if (desc->irq_data.chip->irq_disable) + desc->irq_data.chip->irq_disable(&desc->irq_data); + else + desc->irq_data.chip->irq_mask(&desc->irq_data); } void irq_enable(struct irq_desc *desc) { - desc->irq_data.chip->irq_enable(&desc->irq_data); -} - -void irq_disable(struct irq_desc *desc) -{ - desc->irq_data.chip->irq_disable(&desc->irq_data); -} - -/* - * default enable function - */ -static void default_enable(struct irq_data *data) -{ - struct irq_desc *desc = irq_data_to_desc(data); - - desc->irq_data.chip->irq_unmask(&desc->irq_data); + if (desc->irq_data.chip->irq_enable) + desc->irq_data.chip->irq_enable(&desc->irq_data); + else + desc->irq_data.chip->irq_unmask(&desc->irq_data); desc->status &= ~IRQ_MASKED; } -/* - * default disable function - */ -static void default_disable(struct irq_data *data) -{ -} - -/* - * default shutdown function - */ -static void default_shutdown(struct irq_data *data) +void irq_disable(struct irq_desc *desc) { - struct irq_desc *desc = irq_data_to_desc(data); - - desc->irq_data.chip->irq_mask(&desc->irq_data); + if (desc->irq_data.chip->irq_disable) { + desc->irq_data.chip->irq_disable(&desc->irq_data); + desc->status |= IRQ_MASKED; + } } #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED @@ -334,10 +318,6 @@ static void compat_bus_sync_unlock(struct irq_data *data) void irq_chip_set_defaults(struct irq_chip *chip) { #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED - /* - * Compat fixup functions need to be before we set the - * defaults for enable/disable/startup/shutdown - */ if (chip->enable) chip->irq_enable = compat_irq_enable; if (chip->disable) @@ -346,31 +326,8 @@ void 
irq_chip_set_defaults(struct irq_chip *chip) chip->irq_shutdown = compat_irq_shutdown; if (chip->startup) chip->irq_startup = compat_irq_startup; -#endif - /* - * The real defaults - */ - if (!chip->irq_enable) - chip->irq_enable = default_enable; - if (!chip->irq_disable) - chip->irq_disable = default_disable; - /* - * We use chip->irq_disable, when the user provided its own. When - * we have default_disable set for chip->irq_disable, then we need - * to use default_shutdown, otherwise the irq line is not - * disabled on free_irq(): - */ - if (!chip->irq_shutdown) - chip->irq_shutdown = chip->irq_disable != default_disable ? - chip->irq_disable : default_shutdown; - -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED if (!chip->end) chip->end = dummy_irq_chip.end; - - /* - * Now fix up the remaining compat handlers - */ if (chip->bus_lock) chip->irq_bus_lock = compat_bus_lock; if (chip->bus_sync_unlock) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 007802399697..2a6c6ee11a3f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -440,8 +440,8 @@ void enable_irq(unsigned int irq) if (!desc) return; - if (WARN(!desc->irq_data.chip || !desc->irq_data.chip->irq_enable, - KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) + if (WARN(!desc->irq_data.chip, + KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) return; chip_bus_lock(desc); -- cgit v1.2.3 From 3aae994fb0f43f6d94a31c33536a83869504abdf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 4 Feb 2011 10:17:52 +0100 Subject: genirq: Consolidate IRQ_DISABLED Handle IRQ_DISABLED consistent. Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 14 ++++++++++---- kernel/irq/manage.c | 9 +++------ kernel/irq/resend.c | 5 ----- kernel/irq/spurious.c | 2 +- 4 files changed, 14 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 1a239a83f925..43c62ca68c11 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -194,11 +194,14 @@ EXPORT_SYMBOL_GPL(set_irq_nested_thread); int irq_startup(struct irq_desc *desc) { - desc->status &= ~(IRQ_MASKED | IRQ_DISABLED); + desc->status &= ~IRQ_DISABLED; desc->depth = 0; - if (desc->irq_data.chip->irq_startup) - return desc->irq_data.chip->irq_startup(&desc->irq_data); + if (desc->irq_data.chip->irq_startup) { + int ret = desc->irq_data.chip->irq_startup(&desc->irq_data); + desc->status &= ~IRQ_MASKED; + return ret; + } irq_enable(desc); return 0; @@ -206,7 +209,7 @@ int irq_startup(struct irq_desc *desc) void irq_shutdown(struct irq_desc *desc) { - desc->status |= IRQ_MASKED | IRQ_DISABLED; + desc->status |= IRQ_DISABLED; desc->depth = 1; if (desc->irq_data.chip->irq_shutdown) desc->irq_data.chip->irq_shutdown(&desc->irq_data); @@ -214,10 +217,12 @@ void irq_shutdown(struct irq_desc *desc) desc->irq_data.chip->irq_disable(&desc->irq_data); else desc->irq_data.chip->irq_mask(&desc->irq_data); + desc->status |= IRQ_MASKED; } void irq_enable(struct irq_desc *desc) { + desc->status &= ~IRQ_DISABLED; if (desc->irq_data.chip->irq_enable) desc->irq_data.chip->irq_enable(&desc->irq_data); else @@ -227,6 +232,7 @@ void irq_enable(struct irq_desc *desc) void irq_disable(struct irq_desc *desc) { + desc->status |= IRQ_DISABLED; if (desc->irq_data.chip->irq_disable) { desc->irq_data.chip->irq_disable(&desc->irq_data); desc->status |= IRQ_MASKED; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 2a6c6ee11a3f..78a566a9b39f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -329,10 +329,8 @@ void 
__disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) desc->status |= IRQ_SUSPENDED; } - if (!desc->depth++) { - desc->status |= IRQ_DISABLED; + if (!desc->depth++) irq_disable(desc); - } } /** @@ -407,12 +405,11 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); break; case 1: { - unsigned int status = desc->status & ~IRQ_DISABLED; - if (desc->status & IRQ_SUSPENDED) goto err_out; /* Prevent probing on this irq: */ - desc->status = status | IRQ_NOPROBE; + desc->status |= IRQ_NOPROBE; + irq_enable(desc); check_irq_resend(desc, irq); /* fall-through */ } diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 4bfe268dffe5..60b20261041b 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -55,11 +55,6 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); */ void check_irq_resend(struct irq_desc *desc, unsigned int irq) { - /* - * Make sure the interrupt is enabled, before resending it: - */ - irq_enable(desc); - /* * We do not resend level type interrupts. Level type * interrupts are resent by hardware when they are still diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index c300b8f6008d..89e5e16aca39 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -301,7 +301,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, * Now kill the IRQ */ printk(KERN_EMERG "Disabling IRQ #%d\n", irq); - desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; + desc->status |= IRQ_SPURIOUS_DISABLED; desc->depth++; irq_disable(desc); -- cgit v1.2.3 From d78f8dd36b90626106ce19cb2e6828b0dc39447e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Feb 2011 21:41:17 +0000 Subject: genirq: Do not fiddle with IRQ_MASKED in handle_edge_irq() IRQ_MASKED is set in mask_ack_irq() anyway. Remove it from handle_edge_irq() to allow simpler ab^HHreuse of that function. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20110202212551.918484270@linutronix.de> --- kernel/irq/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 43c62ca68c11..2c30b7844595 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -611,7 +611,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || !desc->action)) { if (!irq_check_poll(desc)) { - desc->status |= (IRQ_PENDING | IRQ_MASKED); + desc->status |= IRQ_PENDING; mask_ack_irq(desc); goto out_unlock; } -- cgit v1.2.3 From 4912609f228da4a3d2bfbdf0f31de3d9eab2b7f8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 01:08:49 +0100 Subject: genirq: Implement handle_irq_event() Core code replacement for the ugly camel case. It contains all the code which is shared in all handlers. 
clear status flags set INPROGRESS flag unlock call action chain note_interrupt lock clr INPROGRESS flag Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 47 +++++++++++++++++++++++++++++++++++++++-------- kernel/irq/internals.h | 3 +++ 2 files changed, 42 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index cdd6fbbe771c..4ef059478ebf 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -51,14 +51,7 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) "but no thread function available.", irq, action->name); } -/** - * handle_IRQ_event - irq action chain handler - * @irq: the interrupt number - * @action: the interrupt action chain for this irq - * - * Handles the action chain of an irq event - */ -irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) +static irqreturn_t __handle_irq_event(unsigned int irq, struct irqaction *action) { irqreturn_t ret, retval = IRQ_NONE; unsigned int status = 0; @@ -120,3 +113,41 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) return retval; } + +irqreturn_t +handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) +{ + irqreturn_t ret = __handle_irq_event(desc->irq_data.irq, action); + + if (!noirqdebug) + note_interrupt(desc->irq_data.irq, desc, ret); + return ret; +} + +irqreturn_t handle_irq_event(struct irq_desc *desc) +{ + struct irqaction *action = desc->action; + irqreturn_t ret; + + desc->status &= ~IRQ_PENDING; + desc->status |= IRQ_INPROGRESS; + raw_spin_unlock(&desc->lock); + + ret = handle_irq_event_percpu(desc, action); + + raw_spin_lock(&desc->lock); + desc->status &= ~IRQ_INPROGRESS; + return ret; +} + +/** + * handle_IRQ_event - irq action chain handler + * @irq: the interrupt number + * @action: the interrupt action chain for this irq + * + * Handles the action chain of an irq event + */ +irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) +{ + return __handle_irq_event(irq, action); +} diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index c71fc4de0371..b61824cdadc6 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -45,6 +45,9 @@ extern void irq_disable(struct irq_desc *desc); extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); +irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action); +irqreturn_t handle_irq_event(struct irq_desc *desc); + /* Resending of interrupts :*/ void check_irq_resend(struct irq_desc *desc, unsigned int irq); bool irq_wait_for_poll(struct irq_desc *desc); -- cgit v1.2.3 From 107781e72192067b95a7d373bfa460434a13c6ae Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 01:21:02 +0100 Subject: genirq: Use handle_irq_event() in handle_simple_irq() Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2c30b7844595..809a03fe7e07 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -448,9 +448,6 @@ static bool irq_check_poll(struct irq_desc *desc) void handle_simple_irq(unsigned int irq, struct irq_desc *desc) { - struct irqaction *action; - irqreturn_t action_ret; - raw_spin_lock(&desc->lock); if (unlikely(desc->status & IRQ_INPROGRESS)) @@ -460,19 +457,11 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); kstat_incr_irqs_this_cpu(irq, desc); - 
action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) + if (unlikely(!desc->action || (desc->status & IRQ_DISABLED))) goto out_unlock; - desc->status |= IRQ_INPROGRESS; - raw_spin_unlock(&desc->lock); + handle_irq_event(desc); - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - - raw_spin_lock(&desc->lock); - desc->status &= ~IRQ_INPROGRESS; out_unlock: raw_spin_unlock(&desc->lock); } -- cgit v1.2.3 From 1529866c63d789925de9b4250646d82d033e4b95 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 01:22:17 +0100 Subject: genirq: Use handle_irq_event() in handle_level_irq() Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 809a03fe7e07..2d2ba4ace0ec 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -479,9 +479,6 @@ out_unlock: void handle_level_irq(unsigned int irq, struct irq_desc *desc) { - struct irqaction *action; - irqreturn_t action_ret; - raw_spin_lock(&desc->lock); mask_ack_irq(desc); @@ -496,19 +493,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) * If its disabled or no action available * keep it masked and get out of here */ - action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) + if (unlikely(!desc->action || (desc->status & IRQ_DISABLED))) goto out_unlock; - desc->status |= IRQ_INPROGRESS; - raw_spin_unlock(&desc->lock); - - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - - raw_spin_lock(&desc->lock); - desc->status &= ~IRQ_INPROGRESS; + handle_irq_event(desc); if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) unmask_irq(desc); -- cgit v1.2.3 From a7ae4de5c8ae8110556f0f9c7241093ef984605c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 01:23:07 +0100 Subject: genirq: Use handle_irq_event() in handle_fasteoi_irq() Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2d2ba4ace0ec..a499ca5b11aa 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -518,9 +518,6 @@ EXPORT_SYMBOL_GPL(handle_level_irq); void handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) { - struct irqaction *action; - irqreturn_t action_ret; - raw_spin_lock(&desc->lock); if (unlikely(desc->status & IRQ_INPROGRESS)) @@ -534,26 +531,14 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) * If its disabled or no action available * then mask it and get out of here: */ - action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) { + if (unlikely(!desc->action || (desc->status & IRQ_DISABLED))) { desc->status |= IRQ_PENDING; mask_irq(desc); goto out; } - - desc->status |= IRQ_INPROGRESS; - desc->status &= ~IRQ_PENDING; - raw_spin_unlock(&desc->lock); - - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - - raw_spin_lock(&desc->lock); - desc->status &= ~IRQ_INPROGRESS; + handle_irq_event(desc); out: desc->irq_data.chip->irq_eoi(&desc->irq_data); - raw_spin_unlock(&desc->lock); } -- cgit v1.2.3 From a60a5dc2db3b08b3c2900614c43b1262410c2d8c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 01:24:07 +0100 Subject: genirq: Use handle_irq_event() in handle_edge_irq() It's safe to 
drop the IRQ_INPROGRESS flag between action chain walks as we are protected by desc->lock. Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a499ca5b11aa..3ccff4d55b39 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -583,14 +583,8 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) /* Start handling the irq */ desc->irq_data.chip->irq_ack(&desc->irq_data); - /* Mark the IRQ currently in progress.*/ - desc->status |= IRQ_INPROGRESS; - do { - struct irqaction *action = desc->action; - irqreturn_t action_ret; - - if (unlikely(!action)) { + if (unlikely(!desc->action)) { mask_irq(desc); goto out_unlock; } @@ -606,16 +600,10 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) unmask_irq(desc); } - desc->status &= ~IRQ_PENDING; - raw_spin_unlock(&desc->lock); - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - raw_spin_lock(&desc->lock); + handle_irq_event(desc); } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); - desc->status &= ~IRQ_INPROGRESS; out_unlock: raw_spin_unlock(&desc->lock); } -- cgit v1.2.3 From 849f061c25f8951d11c7dd88f44950ccde296392 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 01:25:41 +0100 Subject: genirq: Use handle_perpcu_event() in handle_percpu_irq() Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 3ccff4d55b39..52b10ad7bd59 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -618,19 +618,17 @@ out_unlock: void handle_percpu_irq(unsigned int irq, struct irq_desc *desc) { - irqreturn_t action_ret; + struct irq_chip *chip = get_irq_desc_chip(desc); kstat_incr_irqs_this_cpu(irq, desc); - if (desc->irq_data.chip->irq_ack) - desc->irq_data.chip->irq_ack(&desc->irq_data); + if (chip->irq_ack) + chip->irq_ack(&desc->irq_data); - action_ret = handle_IRQ_event(irq, desc->action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); + handle_irq_event_percpu(desc, desc->action); - if (desc->irq_data.chip->irq_eoi) - desc->irq_data.chip->irq_eoi(&desc->irq_data); + if (chip->irq_eoi) + chip->irq_eoi(&desc->irq_data); } void -- cgit v1.2.3 From 0877d66257082ce86fca8f9826b91870575b272c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 01:29:15 +0100 Subject: genirq: Use handle_irq_event() in the spurious poll code Signed-off-by: Thomas Gleixner --- kernel/irq/spurious.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 89e5e16aca39..bc0620745d5f 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -56,13 +56,14 @@ bool irq_wait_for_poll(struct irq_desc *desc) #endif } + /* * Recovery handler for misrouted interrupts. 
*/ static int try_one_irq(int irq, struct irq_desc *desc, bool force) { + irqreturn_t ret = IRQ_NONE; struct irqaction *action; - int ok = 0; raw_spin_lock(&desc->lock); @@ -96,21 +97,17 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) goto out; } - /* Honour the normal IRQ locking and mark it poll in progress */ - desc->status |= IRQ_INPROGRESS | IRQ_POLL_INPROGRESS; + /* Mark it poll in progress */ + desc->status |= IRQ_POLL_INPROGRESS; do { - desc->status &= ~IRQ_PENDING; - raw_spin_unlock(&desc->lock); - if (handle_IRQ_event(irq, action) != IRQ_NONE) - ok = 1; - raw_spin_lock(&desc->lock); + if (handle_irq_event(desc) == IRQ_HANDLED) + ret = IRQ_HANDLED; action = desc->action; - } while ((desc->status & IRQ_PENDING) && action); - - desc->status &= ~(IRQ_INPROGRESS | IRQ_POLL_INPROGRESS); + } while ((desc->status & IRQ_PENDING) && action); + desc->status &= ~IRQ_POLL_INPROGRESS; out: raw_spin_unlock(&desc->lock); - return ok; + return ret == IRQ_HANDLED; } static int misrouted_irq(int irq) -- cgit v1.2.3 From 1277a5325adfc53caac7dd3dac5d3d2fd2a125b4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 01:40:27 +0100 Subject: genirq: Simplify handle_irq_event() Now that all core users are converted one layer can go. Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 4ef059478ebf..ff40e0f5e2e2 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -51,10 +51,11 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) "but no thread function available.", irq, action->name); } -static irqreturn_t __handle_irq_event(unsigned int irq, struct irqaction *action) +irqreturn_t +handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) { irqreturn_t ret, retval = IRQ_NONE; - unsigned int status = 0; + unsigned int status = 0, irq = desc->irq_data.irq; do { trace_irq_handler_entry(irq, action); @@ -111,17 +112,9 @@ static irqreturn_t __handle_irq_event(unsigned int irq, struct irqaction *action if (status & IRQF_SAMPLE_RANDOM) add_interrupt_randomness(irq); - return retval; -} - -irqreturn_t -handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) -{ - irqreturn_t ret = __handle_irq_event(desc->irq_data.irq, action); - if (!noirqdebug) - note_interrupt(desc->irq_data.irq, desc, ret); - return ret; + note_interrupt(irq, desc, ret); + return retval; } irqreturn_t handle_irq_event(struct irq_desc *desc) @@ -149,5 +142,5 @@ irqreturn_t handle_irq_event(struct irq_desc *desc) */ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) { - return __handle_irq_event(irq, action); + return handle_irq_event_percpu(irq_to_desc(irq), action); } -- cgit v1.2.3 From c78b9b65faa291def628dbd8539649f58299f0f3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 16 Dec 2010 17:21:47 +0100 Subject: genirq: Implement generic irq_show_interrupts() All archs implement show_interrupts() in more or less the same way. That's tons of duplicated code with different bugs with no value. Implement a generic version and deprecate show_interrupts() Unfortunately we need some ifdeffery for !GENERIC_HARDIRQ archs. 
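An architecture opts in by selecting GENERIC_IRQ_SHOW and, if it has arch-specific rows to print, by overriding the weak arch_show_interrupts() hook. A purely illustrative sketch of such an override (the per-CPU NMI counter below is invented, not taken from any real arch):

#include <linux/seq_file.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>

/* hypothetical per-arch counter, only here to make the sketch complete */
static DEFINE_PER_CPU(unsigned int, foo_nmi_count);

int arch_show_interrupts(struct seq_file *p, int prec)
{
        int cpu;

        /* one extra row below the per-IRQ lines, aligned to the same width */
        seq_printf(p, "%*s: ", prec, "NMI");
        for_each_online_cpu(cpu)
                seq_printf(p, "%10u ", per_cpu(foo_nmi_count, cpu));
        seq_printf(p, "  Non-maskable interrupts\n");
        return 0;
}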
Signed-off-by: Thomas Gleixner --- kernel/irq/Kconfig | 3 +++ kernel/irq/proc.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 8e42fec7686d..4cd5d7135e0f 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -20,6 +20,9 @@ config HAVE_SPARSE_IRQ config GENERIC_IRQ_PROBE def_bool n +config GENERIC_IRQ_SHOW + def_bool n + config GENERIC_PENDING_IRQ def_bool n diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index a46bd762db47..26449239bb46 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "internals.h" @@ -357,3 +358,65 @@ void init_irq_proc(void) } } +#ifdef CONFIG_GENERIC_IRQ_SHOW + +int __weak arch_show_interrupts(struct seq_file *p, int prec) +{ + return 0; +} + +int show_interrupts(struct seq_file *p, void *v) +{ + static int prec; + + unsigned long flags, any_count = 0; + int i = *(loff_t *) v, j; + struct irqaction *action; + struct irq_desc *desc; + + if (i > nr_irqs) + return 0; + + if (i == nr_irqs) + return arch_show_interrupts(p, prec); + + /* print header and calculate the width of the first column */ + if (i == 0) { + for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec) + j *= 10; + + seq_printf(p, "%*s", prec + 8, ""); + for_each_online_cpu(j) + seq_printf(p, "CPU%-8d", j); + seq_putc(p, '\n'); + } + + desc = irq_to_desc(i); + if (!desc) + return 0; + + raw_spin_lock_irqsave(&desc->lock, flags); + for_each_online_cpu(j) + any_count |= kstat_irqs_cpu(i, j); + action = desc->action; + if (!action && !any_count) + goto out; + + seq_printf(p, "%*d: ", prec, i); + for_each_online_cpu(j) + seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); + seq_printf(p, " %8s", desc->irq_data.chip->name); + seq_printf(p, "-%-8s", desc->name); + + if (action) { + seq_printf(p, " %s", action->name); + while ((action = action->next) != NULL) + seq_printf(p, ", %s", action->name); + } + + seq_putc(p, '\n'); +out: + raw_spin_unlock_irqrestore(&desc->lock, flags); + return 0; +} +#endif -- cgit v1.2.3 From 35e857cbeb24e75c6f9a9312ac30454eee8c5950 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Feb 2011 12:20:23 +0100 Subject: genirq: Fixup core code namespace fallout Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 6 +++--- kernel/irq/manage.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 52b10ad7bd59..143eb2a9fa4e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -618,7 +618,7 @@ out_unlock: void handle_percpu_irq(unsigned int irq, struct irq_desc *desc) { - struct irq_chip *chip = get_irq_desc_chip(desc); + struct irq_chip *chip = irq_desc_get_chip(desc); kstat_incr_irqs_this_cpu(irq, desc); @@ -685,7 +685,7 @@ void set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, irq_flow_handler_t handle) { - set_irq_chip(irq, chip); + irq_set_chip(irq, chip); __set_irq_handler(irq, handle, 0, NULL); } @@ -693,7 +693,7 @@ void set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, irq_flow_handler_t handle, const char *name) { - set_irq_chip(irq, chip); + irq_set_chip(irq, chip); __set_irq_handler(irq, handle, 0, name); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 78a566a9b39f..dd4e5c21b9e7 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -260,7 +260,7 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); static int setup_affinity(unsigned 
int irq, struct irq_desc *desc, struct cpumask *mask) { - struct irq_chip *chip = get_irq_desc_chip(desc); + struct irq_chip *chip = irq_desc_get_chip(desc); struct cpumask *set = irq_default_affinity; int ret; -- cgit v1.2.3 From dbec07bac614a61e3392c1e7c08cc6a49ad43f7a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 20:19:55 +0100 Subject: genirq: Add internal state field to irq_desc That field will contain internal state information which is not going to be exposed to anything outside the core code - except via accessor functions. I'm tired of everyone fiddling in irq_desc.status. core_internal_state__do_not_mess_with_it is clear enough, annoying to type and easy to grep for. Offenders will be tracked down and slapped with stinking trouts. Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index b61824cdadc6..ae96e688f4e1 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -1,5 +1,9 @@ /* * IRQ subsystem internal functions and variables: + * + * Do not ever include this file from anything else than + * kernel/irq/. Do not even think about using any information outside + * of this file for your non core code. */ #include @@ -9,6 +13,8 @@ # define IRQ_BITMAP_BITS NR_IRQS #endif +#define istate core_internal_state__do_not_mess_with_it + extern int noirqdebug; /* -- cgit v1.2.3 From e6bea9c404699223322d7411c6f2ceaec02fa83c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Feb 2011 13:16:52 +0100 Subject: genirq: Protect tglx from tripping over his own feet The irq_desc.status field will either go away or renamed to settings. Anyway we need to maintain compatibility to avoid breaking the world and some more. While moving bits into the core, I need to avoid that I use any of the still existing IRQ_ bits in the core code by typos. So that file will hold the inline wrappers and some nasty CPP tricks to break the build when typoed. Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 2 ++ kernel/irq/irqdesc.c | 4 ++-- kernel/irq/settings.h | 7 +++++++ 3 files changed, 11 insertions(+), 2 deletions(-) create mode 100644 kernel/irq/settings.h (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index ae96e688f4e1..8f200310a952 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -13,6 +13,8 @@ # define IRQ_BITMAP_BITS NR_IRQS #endif +#include "settings.h" + #define istate core_internal_state__do_not_mess_with_it extern int noirqdebug; diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 6f6644f819dd..8b87f2ce0203 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -79,7 +79,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_data.chip_data = NULL; desc->irq_data.handler_data = NULL; desc->irq_data.msi_desc = NULL; - desc->status = IRQ_DEFAULT_INIT_FLAGS; + desc->status = _IRQ_DEFAULT_INIT_FLAGS; desc->handle_irq = handle_bad_irq; desc->depth = 1; desc->irq_count = 0; @@ -246,7 +246,7 @@ int __init early_irq_init(void) struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... 
NR_IRQS-1] = { - .status = IRQ_DEFAULT_INIT_FLAGS, + .status = _IRQ_DEFAULT_INIT_FLAGS, .handle_irq = handle_bad_irq, .depth = 1, .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h new file mode 100644 index 000000000000..610f55597ce7 --- /dev/null +++ b/kernel/irq/settings.h @@ -0,0 +1,7 @@ +/* + * Internal header to deal with irq_desc->status which will be renamed + * to irq_desc->settings. + */ +enum { + _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS, +}; -- cgit v1.2.3 From bd062e7667ac173afef57fbfe9327f3b914a9d4c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 20:25:25 +0100 Subject: genirq: Move IRQ_AUTODETECT to internal state No users outside of core Signed-off-by: Thomas Gleixner --- kernel/irq/autoprobe.c | 29 ++++++++++++----------------- kernel/irq/internals.h | 15 +++++++++++++-- kernel/irq/manage.c | 3 ++- 3 files changed, 27 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 08947cb61725..916e56e10a2e 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -32,7 +32,6 @@ unsigned long probe_irq_on(void) { struct irq_desc *desc; unsigned long mask = 0; - unsigned int status; int i; /* @@ -76,7 +75,8 @@ unsigned long probe_irq_on(void) for_each_irq_desc_reverse(i, desc) { raw_spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { - desc->status |= IRQ_AUTODETECT | IRQ_WAITING; + desc->istate |= IRQS_AUTODETECT; + desc->status |= IRQ_WAITING; if (irq_startup(desc)) desc->status |= IRQ_PENDING; } @@ -93,12 +93,11 @@ unsigned long probe_irq_on(void) */ for_each_irq_desc(i, desc) { raw_spin_lock_irq(&desc->lock); - status = desc->status; - if (status & IRQ_AUTODETECT) { + if (desc->istate & IRQS_AUTODETECT) { /* It triggered already - consider it spurious. 
*/ - if (!(status & IRQ_WAITING)) { - desc->status = status & ~IRQ_AUTODETECT; + if (!(desc->status & IRQ_WAITING)) { + desc->istate &= ~IRQS_AUTODETECT; irq_shutdown(desc); } else if (i < 32) @@ -125,19 +124,17 @@ EXPORT_SYMBOL(probe_irq_on); */ unsigned int probe_irq_mask(unsigned long val) { - unsigned int status, mask = 0; + unsigned int mask = 0; struct irq_desc *desc; int i; for_each_irq_desc(i, desc) { raw_spin_lock_irq(&desc->lock); - status = desc->status; - - if (status & IRQ_AUTODETECT) { - if (i < 16 && !(status & IRQ_WAITING)) + if (desc->istate & IRQS_AUTODETECT) { + if (i < 16 && !(desc->status & IRQ_WAITING)) mask |= 1 << i; - desc->status = status & ~IRQ_AUTODETECT; + desc->istate &= ~IRQS_AUTODETECT; irq_shutdown(desc); } raw_spin_unlock_irq(&desc->lock); @@ -169,19 +166,17 @@ int probe_irq_off(unsigned long val) { int i, irq_found = 0, nr_of_irqs = 0; struct irq_desc *desc; - unsigned int status; for_each_irq_desc(i, desc) { raw_spin_lock_irq(&desc->lock); - status = desc->status; - if (status & IRQ_AUTODETECT) { - if (!(status & IRQ_WAITING)) { + if (desc->istate & IRQS_AUTODETECT) { + if (!(desc->status & IRQ_WAITING)) { if (!nr_of_irqs) irq_found = i; nr_of_irqs++; } - desc->status = status & ~IRQ_AUTODETECT; + desc->istate &= ~IRQS_AUTODETECT; irq_shutdown(desc); } raw_spin_unlock_irq(&desc->lock); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 8f200310a952..7ffd4f439b92 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -33,6 +33,15 @@ enum { IRQTF_AFFINITY, }; +/* + * Bit masks for desc->state + * + * IRQS_AUTODETECT - autodetection in progress + */ +enum { + IRQS_AUTODETECT = 0x00000001, +}; + #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) /* Set default functions for irq_chip structures: */ @@ -98,6 +107,7 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc) #include #define P(f) if (desc->status & f) printk("%14s set\n", #f) +#define PS(f) if (desc->istate & f) printk("%14s set\n", #f) static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) { @@ -117,7 +127,6 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) P(IRQ_DISABLED); P(IRQ_PENDING); P(IRQ_REPLAY); - P(IRQ_AUTODETECT); P(IRQ_WAITING); P(IRQ_LEVEL); P(IRQ_MASKED); @@ -127,7 +136,9 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) P(IRQ_NOPROBE); P(IRQ_NOREQUEST); P(IRQ_NOAUTOEN); + + PS(IRQS_AUTODETECT); } #undef P - +#undef PS diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index dd4e5c21b9e7..abe852c9449d 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -897,8 +897,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) desc->status |= IRQ_PER_CPU; #endif - desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT | + desc->status &= ~(IRQ_WAITING | IRQ_ONESHOT | IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); + desc->istate &= ~IRQS_AUTODETECT; if (new->flags & IRQF_ONESHOT) desc->status |= IRQ_ONESHOT; -- cgit v1.2.3 From 7acdd53e5b2c55b6f7e3427e85e2f91fa814a4f9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 20:40:54 +0100 Subject: genirq: Move IRQ_SPURIOUS_DISABLED to core state No users outside. 
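For context, note_interrupt() sets this bit once a line keeps firing without any handler claiming it, and __setup_irq() clears it again when a new shared handler is registered. Whether that machinery ever trips depends on handlers reporting honestly whether their device raised the interrupt; an illustrative handler for a made-up shared device (register offsets invented) looks like:

#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/bitops.h>

/* invented device registers, for illustration only */
#define FOO_IRQ_STATUS  0x00
#define FOO_IRQ_ACK     0x04
#define FOO_IRQ_PENDING BIT(0)

struct foo_priv {
        void __iomem *regs;
};

static irqreturn_t foo_interrupt(int irq, void *dev_id)
{
        struct foo_priv *priv = dev_id;

        if (!(readl(priv->regs + FOO_IRQ_STATUS) & FOO_IRQ_PENDING))
                return IRQ_NONE;        /* not ours; feeds the unhandled counter */

        writel(FOO_IRQ_PENDING, priv->regs + FOO_IRQ_ACK);
        return IRQ_HANDLED;             /* keeps note_interrupt() from tripping */
}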
Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 3 +++ kernel/irq/manage.c | 9 ++++----- kernel/irq/spurious.c | 8 ++++---- 3 files changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 7ffd4f439b92..dc5e21b84f9e 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -37,9 +37,12 @@ enum { * Bit masks for desc->state * * IRQS_AUTODETECT - autodetection in progress + * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt + * detection */ enum { IRQS_AUTODETECT = 0x00000001, + IRQS_SPURIOUS_DISABLED = 0x00000002, }; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index abe852c9449d..5b918ffa46af 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -897,9 +897,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) desc->status |= IRQ_PER_CPU; #endif - desc->status &= ~(IRQ_WAITING | IRQ_ONESHOT | - IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); - desc->istate &= ~IRQS_AUTODETECT; + desc->status &= ~(IRQ_WAITING | IRQ_ONESHOT | IRQ_INPROGRESS); + desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED); if (new->flags & IRQF_ONESHOT) desc->status |= IRQ_ONESHOT; @@ -937,8 +936,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * Check whether we disabled the irq via the spurious handler * before. Reenable it and give it another chance. */ - if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) { - desc->status &= ~IRQ_SPURIOUS_DISABLED; + if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { + desc->istate &= ~IRQS_SPURIOUS_DISABLED; __enable_irq(desc, irq, false); } diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index bc0620745d5f..2941d8a22df7 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -146,15 +146,15 @@ static void poll_spurious_irqs(unsigned long dummy) irq_poll_cpu = smp_processor_id(); for_each_irq_desc(i, desc) { - unsigned int status; + unsigned int state; if (!i) continue; /* Racy but it doesn't matter */ - status = desc->status; + state = desc->istate; barrier(); - if (!(status & IRQ_SPURIOUS_DISABLED)) + if (!(state & IRQS_SPURIOUS_DISABLED)) continue; local_irq_disable(); @@ -298,7 +298,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, * Now kill the IRQ */ printk(KERN_EMERG "Disabling IRQ #%d\n", irq); - desc->status |= IRQ_SPURIOUS_DISABLED; + desc->istate |= IRQS_SPURIOUS_DISABLED; desc->depth++; irq_disable(desc); -- cgit v1.2.3 From 6f91a52d9bb28396177662f1da0f2e2cef9cf5d0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Feb 2011 13:33:16 +0100 Subject: genirq: Use modify_status for set_irq_nested_thread No need for a separate function in the core code. 
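Callers keep the same entry point through a header wrapper built on irq_modify_status(); the wrapper itself is not part of the diff below, but is assumed to look roughly like this sketch:

#include <linux/irq.h>

/*
 * Assumed shape of the header-side replacement; the hunk below only shows
 * the removal of the exported function from chip.c.
 */
static inline void set_irq_nested_thread(unsigned int irq, int nest)
{
        if (nest)
                irq_modify_status(irq, 0, IRQ_NESTED_THREAD);
        else
                irq_modify_status(irq, IRQ_NESTED_THREAD, 0);
}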
Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 28 ---------------------------- 1 file changed, 28 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 143eb2a9fa4e..bff21f233a02 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -164,34 +164,6 @@ struct irq_data *irq_get_irq_data(unsigned int irq) } EXPORT_SYMBOL_GPL(irq_get_irq_data); -/** - * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq - * - * @irq: Interrupt number - * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag - * - * The IRQ_NESTED_THREAD flag indicates that on - * request_threaded_irq() no separate interrupt thread should be - * created for the irq as the handler are called nested in the - * context of a demultiplexing interrupt handler thread. - */ -void set_irq_nested_thread(unsigned int irq, int nest) -{ - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - - if (!desc) - return; - - raw_spin_lock_irqsave(&desc->lock, flags); - if (nest) - desc->status |= IRQ_NESTED_THREAD; - else - desc->status &= ~IRQ_NESTED_THREAD; - raw_spin_unlock_irqrestore(&desc->lock, flags); -} -EXPORT_SYMBOL_GPL(set_irq_nested_thread); - int irq_startup(struct irq_desc *desc) { desc->status &= ~IRQ_DISABLED; -- cgit v1.2.3 From 6954b75b488dd740950573f244ddd66fd28620aa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 20:55:35 +0100 Subject: genirq: Move IRQ_POLL_INPROGRESS to core No users outside of core. Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 2 +- kernel/irq/internals.h | 2 ++ kernel/irq/spurious.c | 6 +++--- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index bff21f233a02..34245e7d1213 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -400,7 +400,7 @@ EXPORT_SYMBOL_GPL(handle_nested_irq); static bool irq_check_poll(struct irq_desc *desc) { - if (!(desc->status & IRQ_POLL_INPROGRESS)) + if (!(desc->istate & IRQS_POLL_INPROGRESS)) return false; return irq_wait_for_poll(desc); } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index dc5e21b84f9e..f5d28e1e1eda 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -39,10 +39,12 @@ enum { * IRQS_AUTODETECT - autodetection in progress * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt * detection + * IRQS_POLL_INPROGRESS - polling in progress */ enum { IRQS_AUTODETECT = 0x00000001, IRQS_SPURIOUS_DISABLED = 0x00000002, + IRQS_POLL_INPROGRESS = 0x00000008, }; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 2941d8a22df7..21c46178b1a6 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -98,13 +98,13 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) } /* Mark it poll in progress */ - desc->status |= IRQ_POLL_INPROGRESS; + desc->istate |= IRQS_POLL_INPROGRESS; do { if (handle_irq_event(desc) == IRQ_HANDLED) ret = IRQ_HANDLED; action = desc->action; } while ((desc->status & IRQ_PENDING) && action); - desc->status &= ~IRQ_POLL_INPROGRESS; + desc->istate &= ~IRQS_POLL_INPROGRESS; out: raw_spin_unlock(&desc->lock); return ret == IRQ_HANDLED; @@ -259,7 +259,7 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc, void note_interrupt(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) { - if (desc->status & IRQ_POLL_INPROGRESS) + if (desc->istate & IRQS_POLL_INPROGRESS) return; if (unlikely(action_ret != 
IRQ_HANDLED)) { -- cgit v1.2.3 From 009b4c3b8ad584b3462734127a5bec680d5d6af4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 21:48:49 +0100 Subject: genirq: Add IRQ_INPROGRESS to core We need to maintain the flag for now in both fields status and istate. Add a CONFIG_GENERIC_HARDIRQS_NO_COMPAT switch to allow testing w/o the status one. Wrap the access to status IRQ_INPROGRESS in a inline which can be turned of with CONFIG_GENERIC_HARDIRQS_NO_COMPAT along with the define. There is no reason that anything outside of core looks at this. That needs some modifications, but we'll get there. Signed-off-by: Thomas Gleixner --- kernel/irq/Kconfig | 3 +++ kernel/irq/chip.c | 16 +++++++++------- kernel/irq/compat.h | 17 +++++++++++++++++ kernel/irq/handle.c | 6 ++++-- kernel/irq/internals.h | 5 ++++- kernel/irq/manage.c | 17 +++++++++-------- kernel/irq/settings.h | 3 +++ kernel/irq/spurious.c | 6 +++--- 8 files changed, 52 insertions(+), 21 deletions(-) create mode 100644 kernel/irq/compat.h (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 4cd5d7135e0f..9e2256de1d1a 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -13,6 +13,9 @@ config GENERIC_HARDIRQS config GENERIC_HARDIRQS_NO_DEPRECATED def_bool n +config GENERIC_HARDIRQS_NO_COMPAT + def_bool n + # Options selectable by the architecture code config HAVE_SPARSE_IRQ def_bool n diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 34245e7d1213..075385549dcd 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -383,7 +383,8 @@ void handle_nested_irq(unsigned int irq) if (unlikely(!action || (desc->status & IRQ_DISABLED))) goto out_unlock; - desc->status |= IRQ_INPROGRESS; + irq_compat_set_progress(desc); + desc->istate |= IRQS_INPROGRESS; raw_spin_unlock_irq(&desc->lock); action_ret = action->thread_fn(action->irq, action->dev_id); @@ -391,7 +392,8 @@ void handle_nested_irq(unsigned int irq) note_interrupt(irq, desc, action_ret); raw_spin_lock_irq(&desc->lock); - desc->status &= ~IRQ_INPROGRESS; + desc->istate &= ~IRQS_INPROGRESS; + irq_compat_clr_progress(desc); out_unlock: raw_spin_unlock_irq(&desc->lock); @@ -422,7 +424,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) { raw_spin_lock(&desc->lock); - if (unlikely(desc->status & IRQ_INPROGRESS)) + if (unlikely(desc->istate & IRQS_INPROGRESS)) if (!irq_check_poll(desc)) goto out_unlock; @@ -454,7 +456,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); mask_ack_irq(desc); - if (unlikely(desc->status & IRQ_INPROGRESS)) + if (unlikely(desc->istate & IRQS_INPROGRESS)) if (!irq_check_poll(desc)) goto out_unlock; @@ -492,7 +494,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) { raw_spin_lock(&desc->lock); - if (unlikely(desc->status & IRQ_INPROGRESS)) + if (unlikely(desc->istate & IRQS_INPROGRESS)) if (!irq_check_poll(desc)) goto out; @@ -542,8 +544,8 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) * we shouldn't process the IRQ. 
Mark it pending, handle * the necessary masking and go out */ - if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || - !desc->action)) { + if (unlikely((desc->istate & (IRQS_INPROGRESS) || + (desc->status & IRQ_DISABLED) || !desc->action))) { if (!irq_check_poll(desc)) { desc->status |= IRQ_PENDING; mask_ack_irq(desc); diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h new file mode 100644 index 000000000000..aac6e400e608 --- /dev/null +++ b/kernel/irq/compat.h @@ -0,0 +1,17 @@ +/* + * Compat layer for transition period + */ +#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT +static inline void irq_compat_set_progress(struct irq_desc *desc) +{ + desc->status |= IRQ_INPROGRESS; +} + +static inline void irq_compat_clr_progress(struct irq_desc *desc) +{ + desc->status &= ~IRQ_INPROGRESS; +} +#else +static inline void irq_compat_set_progress(struct irq_desc *desc) { } +static inline void irq_compat_clr_progress(struct irq_desc *desc) { } +#endif diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index ff40e0f5e2e2..d4ae0b1ccc00 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -123,13 +123,15 @@ irqreturn_t handle_irq_event(struct irq_desc *desc) irqreturn_t ret; desc->status &= ~IRQ_PENDING; - desc->status |= IRQ_INPROGRESS; + irq_compat_set_progress(desc); + desc->istate |= IRQS_INPROGRESS; raw_spin_unlock(&desc->lock); ret = handle_irq_event_percpu(desc, action); raw_spin_lock(&desc->lock); - desc->status &= ~IRQ_INPROGRESS; + desc->istate &= ~IRQS_INPROGRESS; + irq_compat_clr_progress(desc); return ret; } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index f5d28e1e1eda..d1cb1f8df6fe 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -13,6 +13,7 @@ # define IRQ_BITMAP_BITS NR_IRQS #endif +#include "compat.h" #include "settings.h" #define istate core_internal_state__do_not_mess_with_it @@ -40,11 +41,13 @@ enum { * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt * detection * IRQS_POLL_INPROGRESS - polling in progress + * IRQS_INPROGRESS - Interrupt in progress */ enum { IRQS_AUTODETECT = 0x00000001, IRQS_SPURIOUS_DISABLED = 0x00000002, IRQS_POLL_INPROGRESS = 0x00000008, + IRQS_INPROGRESS = 0x00000010, }; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) @@ -128,7 +131,6 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) print_symbol("%s\n", (unsigned long)desc->action->handler); } - P(IRQ_INPROGRESS); P(IRQ_DISABLED); P(IRQ_PENDING); P(IRQ_REPLAY); @@ -143,6 +145,7 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) P(IRQ_NOAUTOEN); PS(IRQS_AUTODETECT); + PS(IRQS_INPROGRESS); } #undef P diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 5b918ffa46af..7e5a50825088 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -30,7 +30,7 @@ void synchronize_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - unsigned int status; + unsigned int state; if (!desc) return; @@ -42,16 +42,16 @@ void synchronize_irq(unsigned int irq) * Wait until we're out of the critical section. This might * give the wrong answer due to the lack of memory barriers. */ - while (desc->status & IRQ_INPROGRESS) + while (desc->istate & IRQS_INPROGRESS) cpu_relax(); /* Ok, that indicated we're done: double-check carefully. */ raw_spin_lock_irqsave(&desc->lock, flags); - status = desc->status; + state = desc->istate; raw_spin_unlock_irqrestore(&desc->lock, flags); /* Oops, that failed? 
*/ - } while (status & IRQ_INPROGRESS); + } while (state & IRQS_INPROGRESS); /* * We made sure that no hardirq handler is running. Now verify @@ -637,9 +637,9 @@ again: * The thread is faster done than the hard interrupt handler * on the other CPU. If we unmask the irq line then the * interrupt can come in again and masks the line, leaves due - * to IRQ_INPROGRESS and the irq line is masked forever. + * to IRQS_INPROGRESS and the irq line is masked forever. */ - if (unlikely(desc->status & IRQ_INPROGRESS)) { + if (unlikely(desc->istate & IRQS_INPROGRESS)) { raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); cpu_relax(); @@ -897,8 +897,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) desc->status |= IRQ_PER_CPU; #endif - desc->status &= ~(IRQ_WAITING | IRQ_ONESHOT | IRQ_INPROGRESS); - desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED); + desc->status &= ~(IRQ_WAITING | IRQ_ONESHOT); + desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ + IRQS_INPROGRESS); if (new->flags & IRQF_ONESHOT) desc->status |= IRQ_ONESHOT; diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 610f55597ce7..a96140eea409 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -5,3 +5,6 @@ enum { _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS, }; + +#undef IRQ_INPROGRESS +#define IRQ_INPROGRESS GOT_YOU_MORON diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 21c46178b1a6..51504837d8cc 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -45,10 +45,10 @@ bool irq_wait_for_poll(struct irq_desc *desc) #ifdef CONFIG_SMP do { raw_spin_unlock(&desc->lock); - while (desc->status & IRQ_INPROGRESS) + while (desc->istate & IRQS_INPROGRESS) cpu_relax(); raw_spin_lock(&desc->lock); - } while (desc->status & IRQ_INPROGRESS); + } while (desc->istate & IRQS_INPROGRESS); /* Might have been disabled in meantime */ return !(desc->status & IRQ_DISABLED) && desc->action; #else @@ -88,7 +88,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) goto out; /* Already running on another processor */ - if (desc->status & IRQ_INPROGRESS) { + if (desc->istate & IRQS_INPROGRESS) { /* * Already running: If it is shared get the other * CPU to go looking for our mystery interrupt too -- cgit v1.2.3 From 3d67baec7f1b01fc289ac1a2f1a7e6d5e43391c6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 21:02:10 +0100 Subject: genirq: Move IRQ_ONESHOT to core No users outside of core. 
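Driver-visible behaviour is unchanged: passing IRQF_ONESHOT to request_threaded_irq() is still what ends up setting the now-internal IRQS_ONESHOT bit. A minimal illustrative requester with a purely threaded handler (the "foo" names are invented):

#include <linux/interrupt.h>

static irqreturn_t foo_thread_fn(int irq, void *dev_id)
{
        /* talk to the device over a slow bus here */
        return IRQ_HANDLED;
}

static int foo_request_irq(unsigned int irq, void *dev)
{
        /*
         * No primary handler: the line stays masked (IRQS_ONESHOT in the
         * new scheme) until foo_thread_fn() returns, then is unmasked.
         */
        return request_threaded_irq(irq, NULL, foo_thread_fn,
                                    IRQF_ONESHOT, "foo", dev);
}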
Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 2 +- kernel/irq/internals.h | 2 ++ kernel/irq/manage.c | 8 ++++---- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 075385549dcd..420fa6bdb117 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -472,7 +472,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) handle_irq_event(desc); - if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) + if (!(desc->status & IRQ_DISABLED) && !(desc->istate & IRQS_ONESHOT)) unmask_irq(desc); out_unlock: raw_spin_unlock(&desc->lock); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index d1cb1f8df6fe..36563f731ff8 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -42,12 +42,14 @@ enum { * detection * IRQS_POLL_INPROGRESS - polling in progress * IRQS_INPROGRESS - Interrupt in progress + * IRQS_ONESHOT - irq is not unmasked in primary handler */ enum { IRQS_AUTODETECT = 0x00000001, IRQS_SPURIOUS_DISABLED = 0x00000002, IRQS_POLL_INPROGRESS = 0x00000008, IRQS_INPROGRESS = 0x00000010, + IRQS_ONESHOT = 0x00000020, }; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 7e5a50825088..aca4208a03ac 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -697,7 +697,7 @@ static int irq_thread(void *data) }; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); - int wake, oneshot = desc->status & IRQ_ONESHOT; + int wake, oneshot = desc->istate & IRQS_ONESHOT; sched_setscheduler(current, SCHED_FIFO, ¶m); current->irqaction = action; @@ -897,12 +897,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) desc->status |= IRQ_PER_CPU; #endif - desc->status &= ~(IRQ_WAITING | IRQ_ONESHOT); + desc->status &= ~IRQ_WAITING; desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ - IRQS_INPROGRESS); + IRQS_INPROGRESS | IRQS_ONESHOT); if (new->flags & IRQF_ONESHOT) - desc->status |= IRQ_ONESHOT; + desc->istate |= IRQS_ONESHOT; if (!(desc->status & IRQ_NOAUTOEN)) irq_startup(desc); -- cgit v1.2.3 From 163ef3091195f514a06f064b12914597d2644c55 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 11:39:15 +0100 Subject: genirq: Move IRQ_REPLAY and IRQ_WAITING to core No users outside of core. Signed-off-by: Thomas Gleixner --- kernel/irq/autoprobe.c | 11 +++++------ kernel/irq/chip.c | 9 ++++----- kernel/irq/internals.h | 8 ++++++-- kernel/irq/manage.c | 4 ++-- kernel/irq/resend.c | 7 +++++-- kernel/irq/settings.h | 4 ++++ 6 files changed, 26 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 916e56e10a2e..9ea8bb99f7c1 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -17,7 +17,7 @@ /* * Autodetection depends on the fact that any interrupt that * comes in on to an unassigned handler will get stuck with - * "IRQ_WAITING" cleared and the interrupt disabled. + * "IRQS_WAITING" cleared and the interrupt disabled. 
*/ static DEFINE_MUTEX(probing_active); @@ -75,8 +75,7 @@ unsigned long probe_irq_on(void) for_each_irq_desc_reverse(i, desc) { raw_spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { - desc->istate |= IRQS_AUTODETECT; - desc->status |= IRQ_WAITING; + desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; if (irq_startup(desc)) desc->status |= IRQ_PENDING; } @@ -96,7 +95,7 @@ unsigned long probe_irq_on(void) if (desc->istate & IRQS_AUTODETECT) { /* It triggered already - consider it spurious. */ - if (!(desc->status & IRQ_WAITING)) { + if (!(desc->istate & IRQS_WAITING)) { desc->istate &= ~IRQS_AUTODETECT; irq_shutdown(desc); } else @@ -131,7 +130,7 @@ unsigned int probe_irq_mask(unsigned long val) for_each_irq_desc(i, desc) { raw_spin_lock_irq(&desc->lock); if (desc->istate & IRQS_AUTODETECT) { - if (i < 16 && !(desc->status & IRQ_WAITING)) + if (i < 16 && !(desc->istate & IRQS_WAITING)) mask |= 1 << i; desc->istate &= ~IRQS_AUTODETECT; @@ -171,7 +170,7 @@ int probe_irq_off(unsigned long val) raw_spin_lock_irq(&desc->lock); if (desc->istate & IRQS_AUTODETECT) { - if (!(desc->status & IRQ_WAITING)) { + if (!(desc->istate & IRQS_WAITING)) { if (!nr_of_irqs) irq_found = i; nr_of_irqs++; diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 420fa6bdb117..59ae14527ecd 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -428,7 +428,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) if (!irq_check_poll(desc)) goto out_unlock; - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); if (unlikely(!desc->action || (desc->status & IRQ_DISABLED))) @@ -460,7 +460,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) if (!irq_check_poll(desc)) goto out_unlock; - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); /* @@ -498,7 +498,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) if (!irq_check_poll(desc)) goto out; - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); /* @@ -537,8 +537,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) { raw_spin_lock(&desc->lock); - desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); - + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); /* * If we're currently running this IRQ, or its disabled, * we shouldn't process the IRQ. 
Mark it pending, handle diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 36563f731ff8..54037533af7a 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -43,6 +43,8 @@ enum { * IRQS_POLL_INPROGRESS - polling in progress * IRQS_INPROGRESS - Interrupt in progress * IRQS_ONESHOT - irq is not unmasked in primary handler + * IRQS_REPLAY - irq is replayed + * IRQS_WAITING - irq is waiting */ enum { IRQS_AUTODETECT = 0x00000001, @@ -50,6 +52,8 @@ enum { IRQS_POLL_INPROGRESS = 0x00000008, IRQS_INPROGRESS = 0x00000010, IRQS_ONESHOT = 0x00000020, + IRQS_REPLAY = 0x00000040, + IRQS_WAITING = 0x00000080, }; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) @@ -135,8 +139,6 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) P(IRQ_DISABLED); P(IRQ_PENDING); - P(IRQ_REPLAY); - P(IRQ_WAITING); P(IRQ_LEVEL); P(IRQ_MASKED); #ifdef CONFIG_IRQ_PER_CPU @@ -148,6 +150,8 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) PS(IRQS_AUTODETECT); PS(IRQS_INPROGRESS); + PS(IRQS_REPLAY); + PS(IRQS_WAITING); } #undef P diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index aca4208a03ac..7971df53d6a9 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -897,9 +897,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) desc->status |= IRQ_PER_CPU; #endif - desc->status &= ~IRQ_WAITING; desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ - IRQS_INPROGRESS | IRQS_ONESHOT); + IRQS_INPROGRESS | IRQS_ONESHOT | \ + IRQS_WAITING); if (new->flags & IRQF_ONESHOT) desc->istate |= IRQS_ONESHOT; diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 60b20261041b..f83387cd11f3 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -62,8 +62,11 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) */ if (desc->status & IRQ_LEVEL) return; - if ((desc->status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { - desc->status = (desc->status & ~IRQ_PENDING) | IRQ_REPLAY; + if (desc->istate & IRQS_REPLAY) + return; + if (desc->status & IRQ_PENDING) { + desc->status &= ~IRQ_PENDING; + desc->istate |= IRQS_REPLAY; if (!desc->irq_data.chip->irq_retrigger || !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index a96140eea409..2e7d08fff0ce 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -8,3 +8,7 @@ enum { #undef IRQ_INPROGRESS #define IRQ_INPROGRESS GOT_YOU_MORON +#undef IRQ_REPLAY +#define IRQ_REPLAY GOT_YOU_MORON +#undef IRQ_WAITING +#define IRQ_WAITING GOT_YOU_MORON -- cgit v1.2.3 From c1594b77e46124bb462f961e536120e471c67446 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Feb 2011 22:11:30 +0100 Subject: genirq: Move IRQ_DISABLED to core Keep status in sync until all abusers are fixed. 
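Code outside kernel/irq/ should not be peeking at the disabled flag in the first place; the supported interface is the depth-counted disable_irq()/enable_irq() pair, as in this contrived sketch:

#include <linux/interrupt.h>

struct foo_priv {
        unsigned int irq;
};

/* illustrative only: quiesce the line around a device reconfiguration */
static void foo_reconfigure(struct foo_priv *priv)
{
        disable_irq(priv->irq);         /* depth 0->1, ends up in irq_disable() */
        /* ... reprogram the device ... */
        enable_irq(priv->irq);          /* depth 1->0, ends up in irq_enable() */
}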
Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 48 +++++++++++++++++++++++++++++++----------------- kernel/irq/compat.h | 12 ++++++++++++ kernel/irq/internals.h | 4 +++- kernel/irq/irqdesc.c | 2 ++ kernel/irq/manage.c | 4 ++-- kernel/irq/migration.c | 2 +- kernel/irq/settings.h | 2 ++ kernel/irq/spurious.c | 4 ++-- 8 files changed, 55 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 59ae14527ecd..527df7ab1b05 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -164,9 +164,21 @@ struct irq_data *irq_get_irq_data(unsigned int irq) } EXPORT_SYMBOL_GPL(irq_get_irq_data); +static void irq_state_clr_disabled(struct irq_desc *desc) +{ + desc->istate &= ~IRQS_DISABLED; + irq_compat_clr_disabled(desc); +} + +static void irq_state_set_disabled(struct irq_desc *desc) +{ + desc->istate |= IRQS_DISABLED; + irq_compat_set_disabled(desc); +} + int irq_startup(struct irq_desc *desc) { - desc->status &= ~IRQ_DISABLED; + irq_state_clr_disabled(desc); desc->depth = 0; if (desc->irq_data.chip->irq_startup) { @@ -181,7 +193,7 @@ int irq_startup(struct irq_desc *desc) void irq_shutdown(struct irq_desc *desc) { - desc->status |= IRQ_DISABLED; + irq_state_set_disabled(desc); desc->depth = 1; if (desc->irq_data.chip->irq_shutdown) desc->irq_data.chip->irq_shutdown(&desc->irq_data); @@ -194,7 +206,7 @@ void irq_shutdown(struct irq_desc *desc) void irq_enable(struct irq_desc *desc) { - desc->status &= ~IRQ_DISABLED; + irq_state_clr_disabled(desc); if (desc->irq_data.chip->irq_enable) desc->irq_data.chip->irq_enable(&desc->irq_data); else @@ -204,7 +216,7 @@ void irq_enable(struct irq_desc *desc) void irq_disable(struct irq_desc *desc) { - desc->status |= IRQ_DISABLED; + irq_state_set_disabled(desc); if (desc->irq_data.chip->irq_disable) { desc->irq_data.chip->irq_disable(&desc->irq_data); desc->status |= IRQ_MASKED; @@ -380,7 +392,7 @@ void handle_nested_irq(unsigned int irq) kstat_incr_irqs_this_cpu(irq, desc); action = desc->action; - if (unlikely(!action || (desc->status & IRQ_DISABLED))) + if (unlikely(!action || (desc->istate & IRQS_DISABLED))) goto out_unlock; irq_compat_set_progress(desc); @@ -431,7 +443,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); - if (unlikely(!desc->action || (desc->status & IRQ_DISABLED))) + if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) goto out_unlock; handle_irq_event(desc); @@ -467,12 +479,12 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) * If its disabled or no action available * keep it masked and get out of here */ - if (unlikely(!desc->action || (desc->status & IRQ_DISABLED))) + if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) goto out_unlock; handle_irq_event(desc); - if (!(desc->status & IRQ_DISABLED) && !(desc->istate & IRQS_ONESHOT)) + if (!(desc->istate & (IRQS_DISABLED | IRQS_ONESHOT))) unmask_irq(desc); out_unlock: raw_spin_unlock(&desc->lock); @@ -505,7 +517,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) * If its disabled or no action available * then mask it and get out of here: */ - if (unlikely(!desc->action || (desc->status & IRQ_DISABLED))) { + if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) { desc->status |= IRQ_PENDING; mask_irq(desc); goto out; @@ -543,8 +555,8 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) * we shouldn't process the IRQ. 
Mark it pending, handle * the necessary masking and go out */ - if (unlikely((desc->istate & (IRQS_INPROGRESS) || - (desc->status & IRQ_DISABLED) || !desc->action))) { + if (unlikely((desc->istate & (IRQS_DISABLED | IRQS_INPROGRESS) || + !desc->action))) { if (!irq_check_poll(desc)) { desc->status |= IRQ_PENDING; mask_ack_irq(desc); @@ -567,15 +579,16 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) * one, we could have masked the irq. * Renable it, if it was not disabled in meantime. */ - if (unlikely((desc->status & - (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == - (IRQ_PENDING | IRQ_MASKED))) { - unmask_irq(desc); + if (unlikely(desc->status & IRQ_PENDING)) { + if (!(desc->istate & IRQS_DISABLED) && + (desc->status & IRQ_MASKED)) + unmask_irq(desc); } handle_irq_event(desc); - } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); + } while ((desc->status & IRQ_PENDING) && + !(desc->istate & IRQS_DISABLED)); out_unlock: raw_spin_unlock(&desc->lock); @@ -639,7 +652,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, if (handle == handle_bad_irq) { if (desc->irq_data.chip != &no_irq_chip) mask_ack_irq(desc); - desc->status |= IRQ_DISABLED; + irq_compat_set_disabled(desc); + desc->istate |= IRQS_DISABLED; desc->depth = 1; } desc->handle_irq = handle; diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h index aac6e400e608..bc0c2a501e82 100644 --- a/kernel/irq/compat.h +++ b/kernel/irq/compat.h @@ -11,7 +11,19 @@ static inline void irq_compat_clr_progress(struct irq_desc *desc) { desc->status &= ~IRQ_INPROGRESS; } +static inline void irq_compat_set_disabled(struct irq_desc *desc) +{ + desc->status |= IRQ_DISABLED; +} + +static inline void irq_compat_clr_disabled(struct irq_desc *desc) +{ + desc->status &= ~IRQ_DISABLED; +} #else static inline void irq_compat_set_progress(struct irq_desc *desc) { } static inline void irq_compat_clr_progress(struct irq_desc *desc) { } +static inline void irq_compat_set_disabled(struct irq_desc *desc) { } +static inline void irq_compat_clr_disabled(struct irq_desc *desc) { } #endif + diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 54037533af7a..919d2dd0bb33 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -45,6 +45,7 @@ enum { * IRQS_ONESHOT - irq is not unmasked in primary handler * IRQS_REPLAY - irq is replayed * IRQS_WAITING - irq is waiting + * IRQS_DISABLED - irq is disabled */ enum { IRQS_AUTODETECT = 0x00000001, @@ -54,6 +55,7 @@ enum { IRQS_ONESHOT = 0x00000020, IRQS_REPLAY = 0x00000040, IRQS_WAITING = 0x00000080, + IRQS_DISABLED = 0x00000100, }; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) @@ -137,7 +139,6 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) print_symbol("%s\n", (unsigned long)desc->action->handler); } - P(IRQ_DISABLED); P(IRQ_PENDING); P(IRQ_LEVEL); P(IRQ_MASKED); @@ -152,6 +153,7 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) PS(IRQS_INPROGRESS); PS(IRQS_REPLAY); PS(IRQS_WAITING); + PS(IRQS_DISABLED); } #undef P diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 8b87f2ce0203..78866d050bc9 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -80,6 +80,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_data.handler_data = NULL; desc->irq_data.msi_desc = NULL; desc->status = _IRQ_DEFAULT_INIT_FLAGS; + desc->istate = IRQS_DISABLED; desc->handle_irq = handle_bad_irq; desc->depth = 1; 
desc->irq_count = 0; @@ -247,6 +248,7 @@ int __init early_irq_init(void) struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { .status = _IRQ_DEFAULT_INIT_FLAGS, + .istate = IRQS_DISABLED, .handle_irq = handle_bad_irq, .depth = 1, .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 7971df53d6a9..77ff275b54cf 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -646,7 +646,7 @@ again: goto again; } - if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { + if (!(desc->istate & IRQS_DISABLED) && (desc->status & IRQ_MASKED)) { desc->status &= ~IRQ_MASKED; desc->irq_data.chip->irq_unmask(&desc->irq_data); } @@ -709,7 +709,7 @@ static int irq_thread(void *data) atomic_inc(&desc->threads_active); raw_spin_lock_irq(&desc->lock); - if (unlikely(desc->status & IRQ_DISABLED)) { + if (unlikely(desc->istate & IRQS_DISABLED)) { /* * CHECKME: We might need a dedicated * IRQ_THREAD_PENDING flag here, which diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 441fd629ff04..8c68cb8555a7 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -61,7 +61,7 @@ void move_native_irq(int irq) if (likely(!(desc->status & IRQ_MOVE_PENDING))) return; - if (unlikely(desc->status & IRQ_DISABLED)) + if (unlikely(desc->istate & IRQS_DISABLED)) return; /* diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 2e7d08fff0ce..5e3411c7c62b 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -12,3 +12,5 @@ enum { #define IRQ_REPLAY GOT_YOU_MORON #undef IRQ_WAITING #define IRQ_WAITING GOT_YOU_MORON +#undef IRQ_DISABLED +#define IRQ_DISABLED GOT_YOU_MORON diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 51504837d8cc..367614f858ff 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -50,7 +50,7 @@ bool irq_wait_for_poll(struct irq_desc *desc) raw_spin_lock(&desc->lock); } while (desc->istate & IRQS_INPROGRESS); /* Might have been disabled in meantime */ - return !(desc->status & IRQ_DISABLED) && desc->action; + return !(desc->istate & IRQS_DISABLED) && desc->action; #else return false; #endif @@ -75,7 +75,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) * Do not poll disabled interrupts unless the spurious * disabled poller asks explicitely. */ - if ((desc->status & IRQ_DISABLED) && !force) + if ((desc->istate & IRQS_DISABLED) && !force) goto out; /* -- cgit v1.2.3 From 2a0d6fb335d4428285dab2d254911748e6040807 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 12:17:57 +0100 Subject: genirq: Move IRQ_PENDING flag to core Keep status in sync until all users are fixed. 
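For illustration, a condensed sketch of the migration pattern this series applies flag by flag (the helper names here are invented; the patch below open-codes the two steps): the canonical bit lives in desc->istate, while a compat helper mirrors it into the legacy desc->status until the last external user is converted.

        static inline void irq_mark_pending(struct irq_desc *desc)
        {
                irq_compat_set_pending(desc);   /* legacy view: desc->status |= IRQ_PENDING */
                desc->istate |= IRQS_PENDING;   /* canonical core state */
        }

        static inline void irq_clear_pending(struct irq_desc *desc)
        {
                irq_compat_clr_pending(desc);   /* legacy view: desc->status &= ~IRQ_PENDING */
                desc->istate &= ~IRQS_PENDING;
        }

With the compat layer disabled, the irq_compat_* helpers are empty inlines (see the #else branch in compat.h), so only the istate update remains.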
Signed-off-by: Thomas Gleixner --- kernel/irq/autoprobe.c | 6 ++++-- kernel/irq/chip.c | 10 ++++++---- kernel/irq/compat.h | 12 +++++++++++- kernel/irq/handle.c | 3 ++- kernel/irq/internals.h | 4 +++- kernel/irq/manage.c | 5 +++-- kernel/irq/pm.c | 3 ++- kernel/irq/resend.c | 5 +++-- kernel/irq/settings.h | 2 ++ kernel/irq/spurious.c | 5 +++-- 10 files changed, 39 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 9ea8bb99f7c1..aab64c262726 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -76,8 +76,10 @@ unsigned long probe_irq_on(void) raw_spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; - if (irq_startup(desc)) - desc->status |= IRQ_PENDING; + if (irq_startup(desc)) { + irq_compat_set_pending(desc); + desc->istate |= IRQS_PENDING; + } } raw_spin_unlock_irq(&desc->lock); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 527df7ab1b05..17c87865bfb1 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -518,7 +518,8 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) * then mask it and get out of here: */ if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) { - desc->status |= IRQ_PENDING; + irq_compat_set_pending(desc); + desc->istate |= IRQS_PENDING; mask_irq(desc); goto out; } @@ -558,7 +559,8 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) if (unlikely((desc->istate & (IRQS_DISABLED | IRQS_INPROGRESS) || !desc->action))) { if (!irq_check_poll(desc)) { - desc->status |= IRQ_PENDING; + irq_compat_set_pending(desc); + desc->istate |= IRQS_PENDING; mask_ack_irq(desc); goto out_unlock; } @@ -579,7 +581,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) * one, we could have masked the irq. * Renable it, if it was not disabled in meantime. 
*/ - if (unlikely(desc->status & IRQ_PENDING)) { + if (unlikely(desc->istate & IRQS_PENDING)) { if (!(desc->istate & IRQS_DISABLED) && (desc->status & IRQ_MASKED)) unmask_irq(desc); @@ -587,7 +589,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) handle_irq_event(desc); - } while ((desc->status & IRQ_PENDING) && + } while ((desc->istate & IRQS_PENDING) && !(desc->istate & IRQS_DISABLED)); out_unlock: diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h index bc0c2a501e82..0067a69781f4 100644 --- a/kernel/irq/compat.h +++ b/kernel/irq/compat.h @@ -15,15 +15,25 @@ static inline void irq_compat_set_disabled(struct irq_desc *desc) { desc->status |= IRQ_DISABLED; } - static inline void irq_compat_clr_disabled(struct irq_desc *desc) { desc->status &= ~IRQ_DISABLED; } +static inline void irq_compat_set_pending(struct irq_desc *desc) +{ + desc->status |= IRQ_PENDING; +} + +static inline void irq_compat_clr_pending(struct irq_desc *desc) +{ + desc->status &= ~IRQ_PENDING; +} #else static inline void irq_compat_set_progress(struct irq_desc *desc) { } static inline void irq_compat_clr_progress(struct irq_desc *desc) { } static inline void irq_compat_set_disabled(struct irq_desc *desc) { } static inline void irq_compat_clr_disabled(struct irq_desc *desc) { } +static inline void irq_compat_set_pending(struct irq_desc *desc) { } +static inline void irq_compat_clr_pending(struct irq_desc *desc) { } #endif diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index d4ae0b1ccc00..6e34bdbeb26a 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -122,7 +122,8 @@ irqreturn_t handle_irq_event(struct irq_desc *desc) struct irqaction *action = desc->action; irqreturn_t ret; - desc->status &= ~IRQ_PENDING; + irq_compat_clr_pending(desc); + desc->istate &= ~IRQS_PENDING; irq_compat_set_progress(desc); desc->istate |= IRQS_INPROGRESS; raw_spin_unlock(&desc->lock); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 919d2dd0bb33..fdf2524437eb 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -46,6 +46,7 @@ enum { * IRQS_REPLAY - irq is replayed * IRQS_WAITING - irq is waiting * IRQS_DISABLED - irq is disabled + * IRQS_PENDING - irq is pending and replayed later */ enum { IRQS_AUTODETECT = 0x00000001, @@ -56,6 +57,7 @@ enum { IRQS_REPLAY = 0x00000040, IRQS_WAITING = 0x00000080, IRQS_DISABLED = 0x00000100, + IRQS_PENDING = 0x00000200, }; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) @@ -139,7 +141,6 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) print_symbol("%s\n", (unsigned long)desc->action->handler); } - P(IRQ_PENDING); P(IRQ_LEVEL); P(IRQ_MASKED); #ifdef CONFIG_IRQ_PER_CPU @@ -154,6 +155,7 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) PS(IRQS_REPLAY); PS(IRQS_WAITING); PS(IRQS_DISABLED); + PS(IRQS_PENDING); } #undef P diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 77ff275b54cf..ac060814a787 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -714,10 +714,11 @@ static int irq_thread(void *data) * CHECKME: We might need a dedicated * IRQ_THREAD_PENDING flag here, which * retriggers the thread in check_irq_resend() - * but AFAICT IRQ_PENDING should be fine as it + * but AFAICT IRQS_PENDING should be fine as it * retriggers the interrupt itself --- tglx */ - desc->status |= IRQ_PENDING; + irq_compat_set_pending(desc); + desc->istate |= IRQS_PENDING; raw_spin_unlock_irq(&desc->lock); } else { raw_spin_unlock_irq(&desc->lock); diff --git 
a/kernel/irq/pm.c b/kernel/irq/pm.c index d6bfb89cce91..d7389418e91a 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -69,7 +69,8 @@ int check_wakeup_irqs(void) int irq; for_each_irq_desc(irq, desc) - if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING)) + if ((desc->status & IRQ_WAKEUP) && + (desc->istate & IRQS_PENDING)) return -EBUSY; return 0; diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index f83387cd11f3..ff1fea060014 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -64,8 +64,9 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) return; if (desc->istate & IRQS_REPLAY) return; - if (desc->status & IRQ_PENDING) { - desc->status &= ~IRQ_PENDING; + if (desc->istate & IRQS_PENDING) { + irq_compat_clr_pending(desc); + desc->istate &= ~IRQS_PENDING; desc->istate |= IRQS_REPLAY; if (!desc->irq_data.chip->irq_retrigger || diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 5e3411c7c62b..623fcf83e7de 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -14,3 +14,5 @@ enum { #define IRQ_WAITING GOT_YOU_MORON #undef IRQ_DISABLED #define IRQ_DISABLED GOT_YOU_MORON +#undef IRQ_PENDING +#define IRQ_PENDING GOT_YOU_MORON diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 367614f858ff..692ce2bae302 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -93,7 +93,8 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) * Already running: If it is shared get the other * CPU to go looking for our mystery interrupt too */ - desc->status |= IRQ_PENDING; + irq_compat_set_pending(desc); + desc->istate |= IRQS_PENDING; goto out; } @@ -103,7 +104,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) if (handle_irq_event(desc) == IRQ_HANDLED) ret = IRQ_HANDLED; action = desc->action; - } while ((desc->status & IRQ_PENDING) && action); + } while ((desc->istate & IRQS_PENDING) && action); desc->istate &= ~IRQS_POLL_INPROGRESS; out: raw_spin_unlock(&desc->lock); -- cgit v1.2.3 From 6e40262ea43c4b0e3f435b3a083e4461ef921c17 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 12:36:06 +0100 Subject: genirq: Move IRQ_MASKED to core Keep status in sync until all users are fixed. 
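As a distilled illustration (the helper is hypothetical; the real logic sits inline in handle_edge_irq() in the hunk below): once both bits live in istate, a flow handler can make the unmask decision from software state alone.

        static void example_cond_unmask(struct irq_desc *desc)
        {
                /* unmask only if we masked it and it was not disabled meanwhile */
                if ((desc->istate & IRQS_MASKED) && !(desc->istate & IRQS_DISABLED))
                        unmask_irq(desc);
        }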
Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 28 ++++++++++++++++++++-------- kernel/irq/compat.h | 11 +++++++++++ kernel/irq/internals.h | 4 +++- kernel/irq/manage.c | 5 +++-- kernel/irq/migration.c | 2 +- kernel/irq/settings.h | 2 ++ 6 files changed, 40 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 17c87865bfb1..73b2e7e00934 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -176,6 +176,18 @@ static void irq_state_set_disabled(struct irq_desc *desc) irq_compat_set_disabled(desc); } +static void irq_state_clr_masked(struct irq_desc *desc) +{ + desc->istate &= ~IRQS_MASKED; + irq_compat_clr_masked(desc); +} + +static void irq_state_set_masked(struct irq_desc *desc) +{ + desc->istate |= IRQS_MASKED; + irq_compat_set_masked(desc); +} + int irq_startup(struct irq_desc *desc) { irq_state_clr_disabled(desc); @@ -183,7 +195,7 @@ int irq_startup(struct irq_desc *desc) if (desc->irq_data.chip->irq_startup) { int ret = desc->irq_data.chip->irq_startup(&desc->irq_data); - desc->status &= ~IRQ_MASKED; + irq_state_clr_masked(desc); return ret; } @@ -201,7 +213,7 @@ void irq_shutdown(struct irq_desc *desc) desc->irq_data.chip->irq_disable(&desc->irq_data); else desc->irq_data.chip->irq_mask(&desc->irq_data); - desc->status |= IRQ_MASKED; + irq_state_set_masked(desc); } void irq_enable(struct irq_desc *desc) @@ -211,7 +223,7 @@ void irq_enable(struct irq_desc *desc) desc->irq_data.chip->irq_enable(&desc->irq_data); else desc->irq_data.chip->irq_unmask(&desc->irq_data); - desc->status &= ~IRQ_MASKED; + irq_state_clr_masked(desc); } void irq_disable(struct irq_desc *desc) @@ -219,8 +231,8 @@ void irq_disable(struct irq_desc *desc) irq_state_set_disabled(desc); if (desc->irq_data.chip->irq_disable) { desc->irq_data.chip->irq_disable(&desc->irq_data); - desc->status |= IRQ_MASKED; } + irq_state_set_masked(desc); } #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED @@ -352,14 +364,14 @@ static inline void mask_ack_irq(struct irq_desc *desc) if (desc->irq_data.chip->irq_ack) desc->irq_data.chip->irq_ack(&desc->irq_data); } - desc->status |= IRQ_MASKED; + irq_state_set_masked(desc); } static inline void mask_irq(struct irq_desc *desc) { if (desc->irq_data.chip->irq_mask) { desc->irq_data.chip->irq_mask(&desc->irq_data); - desc->status |= IRQ_MASKED; + irq_state_set_masked(desc); } } @@ -367,7 +379,7 @@ static inline void unmask_irq(struct irq_desc *desc) { if (desc->irq_data.chip->irq_unmask) { desc->irq_data.chip->irq_unmask(&desc->irq_data); - desc->status &= ~IRQ_MASKED; + irq_state_clr_masked(desc); } } @@ -583,7 +595,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) */ if (unlikely(desc->istate & IRQS_PENDING)) { if (!(desc->istate & IRQS_DISABLED) && - (desc->status & IRQ_MASKED)) + (desc->istate & IRQS_MASKED)) unmask_irq(desc); } diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h index 0067a69781f4..593abecbcc44 100644 --- a/kernel/irq/compat.h +++ b/kernel/irq/compat.h @@ -28,6 +28,15 @@ static inline void irq_compat_clr_pending(struct irq_desc *desc) { desc->status &= ~IRQ_PENDING; } +static inline void irq_compat_set_masked(struct irq_desc *desc) +{ + desc->status |= IRQ_MASKED; +} + +static inline void irq_compat_clr_masked(struct irq_desc *desc) +{ + desc->status &= ~IRQ_MASKED; +} #else static inline void irq_compat_set_progress(struct irq_desc *desc) { } static inline void irq_compat_clr_progress(struct irq_desc *desc) { } @@ -35,5 +44,7 @@ static inline void irq_compat_set_disabled(struct irq_desc *desc) 
{ } static inline void irq_compat_clr_disabled(struct irq_desc *desc) { } static inline void irq_compat_set_pending(struct irq_desc *desc) { } static inline void irq_compat_clr_pending(struct irq_desc *desc) { } +static inline void irq_compat_set_masked(struct irq_desc *desc) { } +static inline void irq_compat_clr_masked(struct irq_desc *desc) { } #endif diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index fdf2524437eb..3f2fcc194dcc 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -47,6 +47,7 @@ enum { * IRQS_WAITING - irq is waiting * IRQS_DISABLED - irq is disabled * IRQS_PENDING - irq is pending and replayed later + * IRQS_MASKED - irq is masked */ enum { IRQS_AUTODETECT = 0x00000001, @@ -58,6 +59,7 @@ enum { IRQS_WAITING = 0x00000080, IRQS_DISABLED = 0x00000100, IRQS_PENDING = 0x00000200, + IRQS_MASKED = 0x00000400, }; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) @@ -142,7 +144,6 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) } P(IRQ_LEVEL); - P(IRQ_MASKED); #ifdef CONFIG_IRQ_PER_CPU P(IRQ_PER_CPU); #endif @@ -156,6 +157,7 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) PS(IRQS_WAITING); PS(IRQS_DISABLED); PS(IRQS_PENDING); + PS(IRQS_MASKED); } #undef P diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ac060814a787..83fd20194e5b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -646,8 +646,9 @@ again: goto again; } - if (!(desc->istate & IRQS_DISABLED) && (desc->status & IRQ_MASKED)) { - desc->status &= ~IRQ_MASKED; + if (!(desc->istate & IRQS_DISABLED) && (desc->istate & IRQS_MASKED)) { + irq_compat_clr_masked(desc); + desc->istate &= ~IRQS_MASKED; desc->irq_data.chip->irq_unmask(&desc->irq_data); } raw_spin_unlock_irq(&desc->lock); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 8c68cb8555a7..6f2f98480354 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -69,7 +69,7 @@ void move_native_irq(int irq) * threaded interrupt with ONESHOT set, we can end up with an * interrupt storm. */ - masked = desc->status & IRQ_MASKED; + masked = desc->istate & IRQS_MASKED; if (!masked) desc->irq_data.chip->irq_mask(&desc->irq_data); move_masked_irq(irq); diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 623fcf83e7de..2cd45fd5ec8a 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -16,3 +16,5 @@ enum { #define IRQ_DISABLED GOT_YOU_MORON #undef IRQ_PENDING #define IRQ_PENDING GOT_YOU_MORON +#undef IRQ_MASKED +#define IRQ_MASKED GOT_YOU_MORON -- cgit v1.2.3 From c531e8361f1968d664e6e97fbd3bfa4cf0e62e42 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 12:44:58 +0100 Subject: genirq: Move IRQ_SUSPENDED to core No users outside of core. 
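For context, an illustrative driver-side sketch (device, handler and names are hypothetical): an interrupt requested with IRQF_NO_SUSPEND is skipped by __disable_irq(..., suspend) in the hunk below and therefore never gets IRQS_SUSPENDED set, so it stays live across suspend_device_irqs().

        static irqreturn_t example_wake_handler(int irq, void *dev_id)
        {
                return IRQ_HANDLED;
        }

        static int example_request(unsigned int irq, void *dev)
        {
                return request_irq(irq, example_wake_handler, IRQF_NO_SUSPEND,
                                   "example-wake", dev);
        }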
Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 2 ++ kernel/irq/manage.c | 8 ++++---- kernel/irq/pm.c | 6 +++--- 3 files changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 3f2fcc194dcc..46889119e6a6 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -48,6 +48,7 @@ enum { * IRQS_DISABLED - irq is disabled * IRQS_PENDING - irq is pending and replayed later * IRQS_MASKED - irq is masked + * IRQS_SUSPENDED - irq is suspended */ enum { IRQS_AUTODETECT = 0x00000001, @@ -60,6 +61,7 @@ enum { IRQS_DISABLED = 0x00000100, IRQS_PENDING = 0x00000200, IRQS_MASKED = 0x00000400, + IRQS_SUSPENDED = 0x00000800, }; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 83fd20194e5b..b912de4ff4de 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -326,7 +326,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) if (suspend) { if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) return; - desc->status |= IRQ_SUSPENDED; + desc->istate |= IRQS_SUSPENDED; } if (!desc->depth++) @@ -388,7 +388,7 @@ EXPORT_SYMBOL(disable_irq); void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) { if (resume) { - if (!(desc->status & IRQ_SUSPENDED)) { + if (!(desc->istate & IRQS_SUSPENDED)) { if (!desc->action) return; if (!(desc->action->flags & IRQF_FORCE_RESUME)) @@ -396,7 +396,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) /* Pretend that it got disabled ! */ desc->depth++; } - desc->status &= ~IRQ_SUSPENDED; + desc->istate &= ~IRQS_SUSPENDED; } switch (desc->depth) { @@ -405,7 +405,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); break; case 1: { - if (desc->status & IRQ_SUSPENDED) + if (desc->istate & IRQS_SUSPENDED) goto err_out; /* Prevent probing on this irq: */ desc->status |= IRQ_NOPROBE; diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index d7389418e91a..d81337fc1cff 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -18,7 +18,7 @@ * During system-wide suspend or hibernation device drivers need to be prevented * from receiving interrupts and this function is provided for this purpose. * It marks all interrupt lines in use, except for the timer ones, as disabled - * and sets the IRQ_SUSPENDED flag for each of them. + * and sets the IRQS_SUSPENDED flag for each of them. */ void suspend_device_irqs(void) { @@ -34,7 +34,7 @@ void suspend_device_irqs(void) } for_each_irq_desc(irq, desc) - if (desc->status & IRQ_SUSPENDED) + if (desc->istate & IRQS_SUSPENDED) synchronize_irq(irq); } EXPORT_SYMBOL_GPL(suspend_device_irqs); @@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(suspend_device_irqs); * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() * * Enable all interrupt lines previously disabled by suspend_device_irqs() that - * have the IRQ_SUSPENDED flag set. + * have the IRQS_SUSPENDED flag set. */ void resume_device_irqs(void) { -- cgit v1.2.3 From 6d2cd17fde1fc3e93302815f049f255bb2b3123e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 14:34:18 +0100 Subject: genirq: Move IRQ_WAKEUP to core No users outside of core. 
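For reference, the driver-facing calls that end up toggling this state (an illustrative sketch; struct example_priv and its irq field are placeholders): enable_irq_wake()/disable_irq_wake() funnel into irq_set_irq_wake() below, and check_wakeup_irqs() then refuses to enter suspend while such an interrupt is still pending.

        static int example_suspend(struct device *dev)
        {
                struct example_priv *priv = dev_get_drvdata(dev);

                enable_irq_wake(priv->irq);     /* marks the line as a wakeup source */
                return 0;
        }

        static int example_resume(struct device *dev)
        {
                struct example_priv *priv = dev_get_drvdata(dev);

                disable_irq_wake(priv->irq);
                return 0;
        }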
Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 2 ++ kernel/irq/manage.c | 4 ++-- kernel/irq/pm.c | 2 +- kernel/irq/settings.h | 2 ++ 4 files changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 46889119e6a6..cef0849dcfa5 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -49,6 +49,7 @@ enum { * IRQS_PENDING - irq is pending and replayed later * IRQS_MASKED - irq is masked * IRQS_SUSPENDED - irq is suspended + * IRQS_WAKEUP - irq triggers system wakeup from suspend */ enum { IRQS_AUTODETECT = 0x00000001, @@ -62,6 +63,7 @@ enum { IRQS_PENDING = 0x00000200, IRQS_MASKED = 0x00000400, IRQS_SUSPENDED = 0x00000800, + IRQS_WAKEUP = 0x00001000, }; #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index b912de4ff4de..ccc9389909ff 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -489,7 +489,7 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on) if (ret) desc->wake_depth = 0; else - desc->status |= IRQ_WAKEUP; + desc->istate |= IRQS_WAKEUP; } } else { if (desc->wake_depth == 0) { @@ -499,7 +499,7 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on) if (ret) desc->wake_depth = 1; else - desc->status &= ~IRQ_WAKEUP; + desc->istate &= ~IRQS_WAKEUP; } } diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index d81337fc1cff..f39383d8672d 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -69,7 +69,7 @@ int check_wakeup_irqs(void) int irq; for_each_irq_desc(irq, desc) - if ((desc->status & IRQ_WAKEUP) && + if ((desc->istate & IRQS_WAKEUP) && (desc->istate & IRQS_PENDING)) return -EBUSY; diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 2cd45fd5ec8a..ef09824e4b32 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -18,3 +18,5 @@ enum { #define IRQ_PENDING GOT_YOU_MORON #undef IRQ_MASKED #define IRQ_MASKED GOT_YOU_MORON +#undef IRQ_WAKEUP +#define IRQ_WAKEUP GOT_YOU_MORON -- cgit v1.2.3 From f230b6d5c48f8d12f4dfa1f8b5ab0b0320076d21 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 5 Feb 2011 15:20:04 +0100 Subject: genirq: Add IRQ_MOVE_PENDING to irq_data.state chip implementations need to know about it. Keep status in sync until all users are fixed. Accessor function: irqd_is_setaffinity_pending(irqdata) Coders who access them directly will be tracked down and slapped with stinking trouts. 
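A minimal sketch of the intended use from chip or arch code (the callback and its body are made up): the pending affinity move is queried through the accessor on irq_data, never by peeking at desc->status.

        static void example_chip_eoi(struct irq_data *data)
        {
                /*
                 * Chip code can now see a pending affinity change directly
                 * from irq_data; this callback is illustrative only.
                 */
                if (irqd_is_setaffinity_pending(data))
                        pr_debug("irq %u: affinity change pending\n", data->irq);

                /* ... hardware EOI write would go here ... */
        }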
Signed-off-by: Thomas Gleixner --- kernel/irq/compat.h | 11 +++++++++++ kernel/irq/internals.h | 15 +++++++++++++++ kernel/irq/manage.c | 4 ++-- kernel/irq/migration.c | 6 +++--- kernel/irq/proc.c | 2 +- kernel/irq/settings.h | 2 ++ 6 files changed, 34 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h index 593abecbcc44..5e33aadadacc 100644 --- a/kernel/irq/compat.h +++ b/kernel/irq/compat.h @@ -37,6 +37,15 @@ static inline void irq_compat_clr_masked(struct irq_desc *desc) { desc->status &= ~IRQ_MASKED; } +static inline void irq_compat_set_move_pending(struct irq_desc *desc) +{ + desc->status |= IRQ_MOVE_PENDING; +} + +static inline void irq_compat_clr_move_pending(struct irq_desc *desc) +{ + desc->status &= ~IRQ_MOVE_PENDING; +} #else static inline void irq_compat_set_progress(struct irq_desc *desc) { } static inline void irq_compat_clr_progress(struct irq_desc *desc) { } @@ -46,5 +55,7 @@ static inline void irq_compat_set_pending(struct irq_desc *desc) { } static inline void irq_compat_clr_pending(struct irq_desc *desc) { } static inline void irq_compat_set_masked(struct irq_desc *desc) { } static inline void irq_compat_clr_masked(struct irq_desc *desc) { } +static inline void irq_compat_set_move_pending(struct irq_desc *desc) { } +static inline void irq_compat_clr_move_pending(struct irq_desc *desc) { } #endif diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index cef0849dcfa5..e93e6090cd47 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -124,6 +124,21 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc) desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); } +/* + * Manipulation functions for irq_data.state + */ +static inline void irqd_set_move_pending(struct irq_data *d) +{ + d->state_use_accessors |= IRQD_SETAFFINITY_PENDING; + irq_compat_set_move_pending(irq_data_to_desc(d)); +} + +static inline void irqd_clr_move_pending(struct irq_data *d) +{ + d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING; + irq_compat_clr_move_pending(irq_data_to_desc(d)); +} + /* * Debugging printout: */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ccc9389909ff..9a99c471d470 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -107,7 +107,7 @@ static inline bool irq_can_move_pcntxt(struct irq_desc *desc) } static inline bool irq_move_pending(struct irq_desc *desc) { - return desc->status & IRQ_MOVE_PENDING; + return irqd_is_setaffinity_pending(&desc->irq_data); } static inline void irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) @@ -156,7 +156,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask) ret = 0; } } else { - desc->status |= IRQ_MOVE_PENDING; + irqd_set_move_pending(&desc->irq_data); irq_copy_pending(desc, mask); } diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 6f2f98480354..9485ae081dcd 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -9,7 +9,7 @@ void move_masked_irq(int irq) struct irq_desc *desc = irq_to_desc(irq); struct irq_chip *chip = desc->irq_data.chip; - if (likely(!(desc->status & IRQ_MOVE_PENDING))) + if (likely(!irqd_is_setaffinity_pending(&desc->irq_data))) return; /* @@ -20,7 +20,7 @@ void move_masked_irq(int irq) return; } - desc->status &= ~IRQ_MOVE_PENDING; + irqd_clr_move_pending(&desc->irq_data); if (unlikely(cpumask_empty(desc->pending_mask))) return; @@ -58,7 +58,7 @@ void move_native_irq(int irq) struct irq_desc *desc = irq_to_desc(irq); bool masked; - if 
(likely(!(desc->status & IRQ_MOVE_PENDING))) + if (likely(!irqd_is_setaffinity_pending(&desc->irq_data))) return; if (unlikely(desc->istate & IRQS_DISABLED)) diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 26449239bb46..afe4e6803148 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -25,7 +25,7 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v) const struct cpumask *mask = desc->irq_data.affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ - if (desc->status & IRQ_MOVE_PENDING) + if (irqd_is_setaffinity_pending(&desc->irq_data)) mask = desc->pending_mask; #endif seq_cpumask(m, mask); diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index ef09824e4b32..bb104a2dce73 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -20,3 +20,5 @@ enum { #define IRQ_MASKED GOT_YOU_MORON #undef IRQ_WAKEUP #define IRQ_WAKEUP GOT_YOU_MORON +#undef IRQ_MOVE_PENDING +#define IRQ_MOVE_PENDING GOT_YOU_MORON -- cgit v1.2.3 From 6a58fb3bad099076f36f0f30f44507bc3275cdb6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 15:40:05 +0100 Subject: genirq: Remove CONFIG_IRQ_PER_CPU The saving of this switch is minimal versus the ifdef mess it creates. Simple enable PER_CPU unconditionally and remove the config switch. Signed-off-by: Thomas Gleixner --- kernel/irq/Kconfig | 3 --- kernel/irq/internals.h | 2 -- kernel/irq/manage.c | 9 +++------ 3 files changed, 3 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 9e2256de1d1a..48ad25f5fa59 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -32,9 +32,6 @@ config GENERIC_PENDING_IRQ config AUTO_IRQ_AFFINITY def_bool n -config IRQ_PER_CPU - def_bool n - config HARDIRQS_SW_RESEND def_bool n diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index e93e6090cd47..9e32b3d35d35 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -163,9 +163,7 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) } P(IRQ_LEVEL); -#ifdef CONFIG_IRQ_PER_CPU P(IRQ_PER_CPU); -#endif P(IRQ_NOPROBE); P(IRQ_NOREQUEST); P(IRQ_NOAUTOEN); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9a99c471d470..056aa49698b4 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -865,12 +865,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) goto mismatch; } -#if defined(CONFIG_IRQ_PER_CPU) /* All handlers must agree on per-cpuness */ if ((old->flags & IRQF_PERCPU) != (new->flags & IRQF_PERCPU)) goto mismatch; -#endif /* add new interrupt at end of irq queue */ do { @@ -894,15 +892,14 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) goto out_mask; } else compat_irq_chip_set_default_handler(desc); -#if defined(CONFIG_IRQ_PER_CPU) - if (new->flags & IRQF_PERCPU) - desc->status |= IRQ_PER_CPU; -#endif desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ IRQS_INPROGRESS | IRQS_ONESHOT | \ IRQS_WAITING); + if (new->flags & IRQF_PERCPU) + desc->status |= IRQ_PER_CPU; + if (new->flags & IRQF_ONESHOT) desc->istate |= IRQS_ONESHOT; -- cgit v1.2.3 From fae581e588e64a0690f3fc995e404fcacaebe772 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 16:53:24 +0100 Subject: genirq: Remove CHECK_IRQ_PER_CPU from core code Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 4 ++-- kernel/irq/migration.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 
056aa49698b4..f1cfa271ba70 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -73,8 +73,8 @@ int irq_can_set_affinity(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip || - !desc->irq_data.chip->irq_set_affinity) + if ((desc->status & (IRQ_PER_CPU | IRQ_NO_BALANCING)) || + !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) return 0; return 1; diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 9485ae081dcd..24f53caddf47 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -15,7 +15,7 @@ void move_masked_irq(int irq) /* * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. */ - if (CHECK_IRQ_PER_CPU(desc->status)) { + if (desc->status & (IRQ_PER_CPU | IRQ_NO_BALANCING)) { WARN_ON(1); return; } -- cgit v1.2.3 From 1ce6068dac1924f7095be5850481e790cbf1b3c1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Feb 2011 20:44:21 +0100 Subject: genirq: Move debug code to separate header It'll break when I'm going to undefine the constants. Signed-off-by: Thomas Gleixner --- kernel/irq/debug.h | 40 ++++++++++++++++++++++++++++++++++++++++ kernel/irq/internals.h | 48 ++++-------------------------------------------- 2 files changed, 44 insertions(+), 44 deletions(-) create mode 100644 kernel/irq/debug.h (limited to 'kernel') diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h new file mode 100644 index 000000000000..d1a33b7fa61d --- /dev/null +++ b/kernel/irq/debug.h @@ -0,0 +1,40 @@ +/* + * Debugging printout: + */ + +#include + +#define P(f) if (desc->status & f) printk("%14s set\n", #f) +#define PS(f) if (desc->istate & f) printk("%14s set\n", #f) + +static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) +{ + printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", + irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); + printk("->handle_irq(): %p, ", desc->handle_irq); + print_symbol("%s\n", (unsigned long)desc->handle_irq); + printk("->irq_data.chip(): %p, ", desc->irq_data.chip); + print_symbol("%s\n", (unsigned long)desc->irq_data.chip); + printk("->action(): %p\n", desc->action); + if (desc->action) { + printk("->action->handler(): %p, ", desc->action->handler); + print_symbol("%s\n", (unsigned long)desc->action->handler); + } + + P(IRQ_LEVEL); + P(IRQ_PER_CPU); + P(IRQ_NOPROBE); + P(IRQ_NOREQUEST); + P(IRQ_NOAUTOEN); + + PS(IRQS_AUTODETECT); + PS(IRQS_INPROGRESS); + PS(IRQS_REPLAY); + PS(IRQS_WAITING); + PS(IRQS_DISABLED); + PS(IRQS_PENDING); + PS(IRQS_MASKED); +} + +#undef P +#undef PS diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 9e32b3d35d35..b2ba59e73f21 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -13,9 +13,6 @@ # define IRQ_BITMAP_BITS NR_IRQS #endif -#include "compat.h" -#include "settings.h" - #define istate core_internal_state__do_not_mess_with_it extern int noirqdebug; @@ -66,6 +63,10 @@ enum { IRQS_WAKEUP = 0x00001000, }; +#include "compat.h" +#include "debug.h" +#include "settings.h" + #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) /* Set default functions for irq_chip structures: */ @@ -138,44 +139,3 @@ static inline void irqd_clr_move_pending(struct irq_data *d) d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING; irq_compat_clr_move_pending(irq_data_to_desc(d)); } - -/* - * Debugging printout: - */ - -#include - -#define P(f) if (desc->status & f) printk("%14s set\n", #f) -#define PS(f) if (desc->istate & f) 
printk("%14s set\n", #f) - -static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) -{ - printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", - irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); - printk("->handle_irq(): %p, ", desc->handle_irq); - print_symbol("%s\n", (unsigned long)desc->handle_irq); - printk("->irq_data.chip(): %p, ", desc->irq_data.chip); - print_symbol("%s\n", (unsigned long)desc->irq_data.chip); - printk("->action(): %p\n", desc->action); - if (desc->action) { - printk("->action->handler(): %p, ", desc->action->handler); - print_symbol("%s\n", (unsigned long)desc->action->handler); - } - - P(IRQ_LEVEL); - P(IRQ_PER_CPU); - P(IRQ_NOPROBE); - P(IRQ_NOREQUEST); - P(IRQ_NOAUTOEN); - - PS(IRQS_AUTODETECT); - PS(IRQS_INPROGRESS); - PS(IRQS_REPLAY); - PS(IRQS_WAITING); - PS(IRQS_DISABLED); - PS(IRQS_PENDING); - PS(IRQS_MASKED); -} - -#undef P -#undef PS -- cgit v1.2.3 From a005677b3dd05decdd8880cf3044ae709856f58f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 17:11:03 +0100 Subject: genirq: Mirror IRQ_PER_CPU and IRQ_NO_BALANCING in irq_data.state That's the right data structure to look at for arch code. Accessor functions are provided. irqd_is_per_cpu(irqdata); irqd_can_balance(irqdata); Coders who access them directly will be tracked down and slapped with stinking trouts. Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 15 +++++++++------ kernel/irq/internals.h | 11 +++++++++++ kernel/irq/manage.c | 16 ++++++++++------ kernel/irq/migration.c | 2 +- kernel/irq/settings.h | 36 ++++++++++++++++++++++++++++++++++++ kernel/irq/spurious.c | 3 ++- 6 files changed, 69 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 73b2e7e00934..b8aa3dfe8301 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -706,12 +706,15 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) if (!desc) return; - /* Sanitize flags */ - set &= IRQF_MODIFY_MASK; - clr &= IRQF_MODIFY_MASK; - raw_spin_lock_irqsave(&desc->lock, flags); - desc->status &= ~clr; - desc->status |= set; + + irq_settings_clr_and_set(desc, clr, set); + + irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU); + if (irq_settings_has_no_balance_set(desc)) + irqd_set(&desc->irq_data, IRQD_NO_BALANCING); + if (irq_settings_is_per_cpu(desc)) + irqd_set(&desc->irq_data, IRQD_PER_CPU); + raw_spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index b2ba59e73f21..a80b44d2735e 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -139,3 +139,14 @@ static inline void irqd_clr_move_pending(struct irq_data *d) d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING; irq_compat_clr_move_pending(irq_data_to_desc(d)); } + +static inline void irqd_clear(struct irq_data *d, unsigned int mask) +{ + d->state_use_accessors &= ~mask; +} + +static inline void irqd_set(struct irq_data *d, unsigned int mask) +{ + d->state_use_accessors |= mask; +} + diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f1cfa271ba70..84a0a9c22226 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -73,8 +73,8 @@ int irq_can_set_affinity(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - if ((desc->status & (IRQ_PER_CPU | IRQ_NO_BALANCING)) || - !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) + if (!irqd_can_balance(&desc->irq_data) || !desc->irq_data.chip || + !desc->irq_data.chip->irq_set_affinity) 
return 0; return 1; @@ -897,8 +897,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) IRQS_INPROGRESS | IRQS_ONESHOT | \ IRQS_WAITING); - if (new->flags & IRQF_PERCPU) - desc->status |= IRQ_PER_CPU; + if (new->flags & IRQF_PERCPU) { + irqd_set(&desc->irq_data, IRQD_PER_CPU); + irq_settings_set_per_cpu(desc); + } if (new->flags & IRQF_ONESHOT) desc->istate |= IRQS_ONESHOT; @@ -910,8 +912,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) desc->depth = 1; /* Exclude IRQ from balancing if requested */ - if (new->flags & IRQF_NOBALANCING) - desc->status |= IRQ_NO_BALANCING; + if (new->flags & IRQF_NOBALANCING) { + irq_settings_set_no_balancing(desc); + irqd_set(&desc->irq_data, IRQD_NO_BALANCING); + } /* Set default affinity mask once everything is setup */ setup_affinity(irq, desc, mask); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 24f53caddf47..7a93c6b88b25 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -15,7 +15,7 @@ void move_masked_irq(int irq) /* * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. */ - if (desc->status & (IRQ_PER_CPU | IRQ_NO_BALANCING)) { + if (!irqd_can_balance(&desc->irq_data)) { WARN_ON(1); return; } diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index bb104a2dce73..ba0fffe410ad 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -4,6 +4,9 @@ */ enum { _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS, + _IRQ_PER_CPU = IRQ_PER_CPU, + _IRQ_NO_BALANCING = IRQ_NO_BALANCING, + _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; #undef IRQ_INPROGRESS @@ -22,3 +25,36 @@ enum { #define IRQ_WAKEUP GOT_YOU_MORON #undef IRQ_MOVE_PENDING #define IRQ_MOVE_PENDING GOT_YOU_MORON +#undef IRQ_PER_CPU +#define IRQ_PER_CPU GOT_YOU_MORON +#undef IRQ_NO_BALANCING +#define IRQ_NO_BALANCING GOT_YOU_MORON +#undef IRQF_MODIFY_MASK +#define IRQF_MODIFY_MASK GOT_YOU_MORON + +static inline void +irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set) +{ + desc->status &= ~(clr & _IRQF_MODIFY_MASK); + desc->status |= (set & _IRQF_MODIFY_MASK); +} + +static inline bool irq_settings_is_per_cpu(struct irq_desc *desc) +{ + return desc->status & _IRQ_PER_CPU; +} + +static inline void irq_settings_set_per_cpu(struct irq_desc *desc) +{ + desc->status |= _IRQ_PER_CPU; +} + +static inline void irq_settings_set_no_balancing(struct irq_desc *desc) +{ + desc->status |= _IRQ_NO_BALANCING; +} + +static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc) +{ + return desc->status & _IRQ_NO_BALANCING; +} diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 692ce2bae302..226ed7d26a84 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -68,7 +68,8 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) raw_spin_lock(&desc->lock); /* PER_CPU and nested thread interrupts are never polled */ - if (desc->status & (IRQ_PER_CPU | IRQ_NESTED_THREAD)) + if (irq_settings_is_per_cpu(desc) || + (desc->status & IRQ_NESTED_THREAD)) goto out; /* -- cgit v1.2.3 From bce43032ad79fae0ce5b6174ce1321e643ceb54b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Feb 2011 22:37:41 +0100 Subject: genirq: Reuse existing can set affinty check Add a !desc check while at it. 
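An illustrative caller (the function is hypothetical, not part of the patch): with the !desc check folded in, irq_can_set_affinity() becomes the single validity test needed before trying to retarget an interrupt, which is how the proc handler below now uses it.

        static int example_pin_irq(unsigned int irq, const struct cpumask *mask)
        {
                if (!irq_can_set_affinity(irq))
                        return -EIO;

                return irq_set_affinity(irq, mask);
        }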
Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 4 ++-- kernel/irq/proc.c | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 84a0a9c22226..550ae97a0040 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -73,8 +73,8 @@ int irq_can_set_affinity(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - if (!irqd_can_balance(&desc->irq_data) || !desc->irq_data.chip || - !desc->irq_data.chip->irq_set_affinity) + if (!desc || !irqd_can_balance(&desc->irq_data) || + !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) return 0; return 1; diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index afe4e6803148..4cc2e5ed0bec 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -66,8 +66,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, cpumask_var_t new_value; int err; - if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity || - irq_balancing_disabled(irq)) + if (!irq_can_set_affinity(irq) || no_irq_affinity) return -EIO; if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) -- cgit v1.2.3 From 2bdd10558c8d93009cb6c32ce9e30800fbb08add Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 17:22:00 +0100 Subject: genirq: Move IRQ_AFFINITY_SET to core Keep status in sync until last abuser is gone. Signed-off-by: Thomas Gleixner --- kernel/irq/compat.h | 11 +++++++++++ kernel/irq/internals.h | 4 ++++ kernel/irq/manage.c | 11 +++++++---- kernel/irq/settings.h | 2 ++ 4 files changed, 24 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h index 5e33aadadacc..6bbaf66aca85 100644 --- a/kernel/irq/compat.h +++ b/kernel/irq/compat.h @@ -46,6 +46,15 @@ static inline void irq_compat_clr_move_pending(struct irq_desc *desc) { desc->status &= ~IRQ_MOVE_PENDING; } +static inline void irq_compat_set_affinity(struct irq_desc *desc) +{ + desc->status |= IRQ_AFFINITY_SET; +} + +static inline void irq_compat_clr_affinity(struct irq_desc *desc) +{ + desc->status &= ~IRQ_AFFINITY_SET; +} #else static inline void irq_compat_set_progress(struct irq_desc *desc) { } static inline void irq_compat_clr_progress(struct irq_desc *desc) { } @@ -57,5 +66,7 @@ static inline void irq_compat_set_masked(struct irq_desc *desc) { } static inline void irq_compat_clr_masked(struct irq_desc *desc) { } static inline void irq_compat_set_move_pending(struct irq_desc *desc) { } static inline void irq_compat_clr_move_pending(struct irq_desc *desc) { } +static inline void irq_compat_set_affinity(struct irq_desc *desc) { } +static inline void irq_compat_clr_affinity(struct irq_desc *desc) { } #endif diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index a80b44d2735e..6776453c454c 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -150,3 +150,7 @@ static inline void irqd_set(struct irq_data *d, unsigned int mask) d->state_use_accessors |= mask; } +static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) +{ + return d->state_use_accessors & mask; +} diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 550ae97a0040..8246afc81956 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -164,7 +164,8 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask) kref_get(&desc->affinity_notify->kref); schedule_work(&desc->affinity_notify->work); } - desc->status |= IRQ_AFFINITY_SET; + irq_compat_set_affinity(desc); + irqd_set(&desc->irq_data, IRQD_AFFINITY_SET); 
raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } @@ -272,12 +273,14 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) * Preserve an userspace affinity setup, but make sure that * one of the targets is online. */ - if (desc->status & (IRQ_AFFINITY_SET)) { + if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { if (cpumask_intersects(desc->irq_data.affinity, cpu_online_mask)) set = desc->irq_data.affinity; - else - desc->status &= ~IRQ_AFFINITY_SET; + else { + irq_compat_clr_affinity(desc); + irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); + } } cpumask_and(mask, cpu_online_mask, set); diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index ba0fffe410ad..da5acb446b1c 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -29,6 +29,8 @@ enum { #define IRQ_PER_CPU GOT_YOU_MORON #undef IRQ_NO_BALANCING #define IRQ_NO_BALANCING GOT_YOU_MORON +#undef IRQ_AFFINITY_SET +#define IRQ_AFFINITY_SET GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON -- cgit v1.2.3 From 876dbd4cc1b35c1a4cb96a2be1d43ea0eabce3b4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 17:28:12 +0100 Subject: genirq: Mirror irq trigger type bits in irq_data.state That's the data structure chip functions get provided. Also allow them to signal the core code that they updated the flags in irq_data.state by returning IRQ_SET_MASK_OK_NOCOPY. The default is unchanged. The type bits should be accessed via: val = irqd_get_trigger_type(irqdata); and irqd_set_trigger_type(irqdata, val); Coders who access them directly will be tracked down and slapped with stinking trouts. Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 5 ++++- kernel/irq/manage.c | 44 +++++++++++++++++++++++++++----------------- kernel/irq/resend.c | 2 +- kernel/irq/settings.h | 30 ++++++++++++++++++++++++++++++ 4 files changed, 62 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b8aa3dfe8301..9c9b573a718e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -710,11 +710,14 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irq_settings_clr_and_set(desc, clr, set); - irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU); + irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | + IRQD_TRIGGER_MASK | IRQD_LEVEL); if (irq_settings_has_no_balance_set(desc)) irqd_set(&desc->irq_data, IRQD_NO_BALANCING); if (irq_settings_is_per_cpu(desc)) irqd_set(&desc->irq_data, IRQD_PER_CPU); + irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); + raw_spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 8246afc81956..9ae758ed8e66 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -567,23 +567,32 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, return 0; } + flags &= IRQ_TYPE_SENSE_MASK; /* caller masked out all except trigger mode flags */ ret = chip->irq_set_type(&desc->irq_data, flags); - if (ret) - pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", - flags, irq, chip->irq_set_type); - else { - if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) - flags |= IRQ_LEVEL; - /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ - desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); - desc->status |= flags; + switch (ret) { + case IRQ_SET_MASK_OK: + irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); + irqd_set(&desc->irq_data, flags); + + case 
IRQ_SET_MASK_OK_NOCOPY: + flags = irqd_get_trigger_type(&desc->irq_data); + irq_settings_set_trigger_mask(desc, flags); + irqd_clear(&desc->irq_data, IRQD_LEVEL); + irq_settings_clr_level(desc); + if (flags & IRQ_TYPE_LEVEL_MASK) { + irq_settings_set_level(desc); + irqd_set(&desc->irq_data, IRQD_LEVEL); + } if (chip != desc->irq_data.chip) irq_chip_set_defaults(desc->irq_data.chip); + return 0; + default: + pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", + flags, irq, chip->irq_set_type); } - return ret; } @@ -923,13 +932,14 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) /* Set default affinity mask once everything is setup */ setup_affinity(irq, desc, mask); - } else if ((new->flags & IRQF_TRIGGER_MASK) - && (new->flags & IRQF_TRIGGER_MASK) - != (desc->status & IRQ_TYPE_SENSE_MASK)) { - /* hope the handler works with the actual trigger mode... */ - pr_warning("IRQ %d uses trigger mode %d; requested %d\n", - irq, (int)(desc->status & IRQ_TYPE_SENSE_MASK), - (int)(new->flags & IRQF_TRIGGER_MASK)); + } else if (new->flags & IRQF_TRIGGER_MASK) { + unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; + unsigned int omsk = irq_settings_get_trigger_mask(desc); + + if (nmsk != omsk) + /* hope the handler works with current trigger mode */ + pr_warning("IRQ %d uses trigger mode %u; requested %u\n", + irq, nmsk, omsk); } new->irq = irq; diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index ff1fea060014..ad683a99b1ec 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -60,7 +60,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) * interrupts are resent by hardware when they are still * active. */ - if (desc->status & IRQ_LEVEL) + if (irq_settings_is_level(desc)) return; if (desc->istate & IRQS_REPLAY) return; diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index da5acb446b1c..2201f2aaa9a0 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -5,6 +5,7 @@ enum { _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS, _IRQ_PER_CPU = IRQ_PER_CPU, + _IRQ_LEVEL = IRQ_LEVEL, _IRQ_NO_BALANCING = IRQ_NO_BALANCING, _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; @@ -31,6 +32,8 @@ enum { #define IRQ_NO_BALANCING GOT_YOU_MORON #undef IRQ_AFFINITY_SET #define IRQ_AFFINITY_SET GOT_YOU_MORON +#undef IRQ_LEVEL +#define IRQ_LEVEL GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON @@ -60,3 +63,30 @@ static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc) { return desc->status & _IRQ_NO_BALANCING; } + +static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc) +{ + return desc->status & IRQ_TYPE_SENSE_MASK; +} + +static inline void +irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask) +{ + desc->status &= ~IRQ_TYPE_SENSE_MASK; + desc->status |= mask & IRQ_TYPE_SENSE_MASK; +} + +static inline bool irq_settings_is_level(struct irq_desc *desc) +{ + return desc->status & _IRQ_LEVEL; +} + +static inline void irq_settings_clr_level(struct irq_desc *desc) +{ + desc->status &= ~_IRQ_LEVEL; +} + +static inline void irq_settings_set_level(struct irq_desc *desc) +{ + desc->status |= _IRQ_LEVEL; +} -- cgit v1.2.3 From 1ccb4e612f68ceefb888c2c6c1def6294ea8666d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Feb 2011 14:44:17 +0100 Subject: genirq: Wrap the remaning IRQ_* flags Use wrappers to keep them away from the core code. 
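The wrapping pattern, condensed (IRQ_EXAMPLE is a made-up placeholder, not one of the real flags): the value is captured into a private _IRQ_* enum, an inline wrapper becomes the only sanctioned accessor, and the public name is then poisoned so any leftover direct use in core code fails to compile.

        enum {
                _IRQ_EXAMPLE = IRQ_EXAMPLE,     /* capture the value first */
        };
        #undef IRQ_EXAMPLE
        #define IRQ_EXAMPLE GOT_YOU_MORON       /* direct use no longer builds */

        static inline bool irq_settings_is_example(struct irq_desc *desc)
        {
                return desc->status & _IRQ_EXAMPLE;
        }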
Signed-off-by: Thomas Gleixner --- kernel/irq/autoprobe.c | 4 ++-- kernel/irq/chip.c | 3 ++- kernel/irq/manage.c | 14 ++++++------ kernel/irq/settings.h | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/irq/spurious.c | 3 +-- 5 files changed, 70 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index aab64c262726..c8bbc4fabaab 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -45,7 +45,7 @@ unsigned long probe_irq_on(void) */ for_each_irq_desc_reverse(i, desc) { raw_spin_lock_irq(&desc->lock); - if (!desc->action && !(desc->status & IRQ_NOPROBE)) { + if (!desc->action && irq_settings_can_probe(desc)) { /* * An old-style architecture might still have * the handle_bad_irq handler there: @@ -74,7 +74,7 @@ unsigned long probe_irq_on(void) */ for_each_irq_desc_reverse(i, desc) { raw_spin_lock_irq(&desc->lock); - if (!desc->action && !(desc->status & IRQ_NOPROBE)) { + if (!desc->action && irq_settings_can_probe(desc)) { desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; if (irq_startup(desc)) { irq_compat_set_pending(desc); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 9c9b573a718e..9e9220da4deb 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -674,7 +674,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, desc->name = name; if (handle != handle_bad_irq && is_chained) { - desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; + irq_settings_set_noprobe(desc); + irq_settings_set_norequest(desc); irq_startup(desc); } raw_spin_unlock_irqrestore(&desc->lock, flags); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9ae758ed8e66..b5de828e58d9 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -103,7 +103,7 @@ void irq_set_thread_affinity(struct irq_desc *desc) #ifdef CONFIG_GENERIC_PENDING_IRQ static inline bool irq_can_move_pcntxt(struct irq_desc *desc) { - return desc->status & IRQ_MOVE_PCNTXT; + return irq_settings_can_move_pcntxt(desc); } static inline bool irq_move_pending(struct irq_desc *desc) { @@ -411,7 +411,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) if (desc->istate & IRQS_SUSPENDED) goto err_out; /* Prevent probing on this irq: */ - desc->status |= IRQ_NOPROBE; + irq_settings_set_noprobe(desc); irq_enable(desc); check_irq_resend(desc, irq); /* fall-through */ @@ -526,7 +526,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) if (!desc) return 0; - if (desc->status & IRQ_NOREQUEST) + if (!irq_settings_can_request(desc)) return 0; raw_spin_lock_irqsave(&desc->lock, flags); @@ -820,7 +820,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * Check whether the interrupt nests into another interrupt * thread. 
*/ - nested = desc->status & IRQ_NESTED_THREAD; + nested = irq_settings_is_nested_thread(desc); if (nested) { if (!new->thread_fn) return -EINVAL; @@ -917,7 +917,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (new->flags & IRQF_ONESHOT) desc->istate |= IRQS_ONESHOT; - if (!(desc->status & IRQ_NOAUTOEN)) + if (irq_settings_can_autoenable(desc)) irq_startup(desc); else /* Undo nested disables: */ @@ -1217,7 +1217,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, if (!desc) return -EINVAL; - if (desc->status & IRQ_NOREQUEST) + if (!irq_settings_can_request(desc)) return -EINVAL; if (!handler) { @@ -1292,7 +1292,7 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler, if (!desc) return -EINVAL; - if (desc->status & IRQ_NESTED_THREAD) { + if (irq_settings_is_nested_thread(desc)) { ret = request_threaded_irq(irq, NULL, handler, flags, name, dev_id); return !ret ? IRQC_IS_NESTED : ret; diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 2201f2aaa9a0..216b6f200e7c 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -6,7 +6,12 @@ enum { _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS, _IRQ_PER_CPU = IRQ_PER_CPU, _IRQ_LEVEL = IRQ_LEVEL, + _IRQ_NOPROBE = IRQ_NOPROBE, + _IRQ_NOREQUEST = IRQ_NOREQUEST, + _IRQ_NOAUTOEN = IRQ_NOAUTOEN, + _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, _IRQ_NO_BALANCING = IRQ_NO_BALANCING, + _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; @@ -34,6 +39,14 @@ enum { #define IRQ_AFFINITY_SET GOT_YOU_MORON #undef IRQ_LEVEL #define IRQ_LEVEL GOT_YOU_MORON +#undef IRQ_NOPROBE +#define IRQ_NOPROBE GOT_YOU_MORON +#undef IRQ_NOREQUEST +#define IRQ_NOREQUEST GOT_YOU_MORON +#undef IRQ_NOAUTOEN +#define IRQ_NOAUTOEN GOT_YOU_MORON +#undef IRQ_NESTED_THREAD +#define IRQ_NESTED_THREAD GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON @@ -90,3 +103,48 @@ static inline void irq_settings_set_level(struct irq_desc *desc) { desc->status |= _IRQ_LEVEL; } + +static inline bool irq_settings_can_request(struct irq_desc *desc) +{ + return !(desc->status & _IRQ_NOREQUEST); +} + +static inline void irq_settings_clr_norequest(struct irq_desc *desc) +{ + desc->status &= ~_IRQ_NOREQUEST; +} + +static inline void irq_settings_set_norequest(struct irq_desc *desc) +{ + desc->status |= _IRQ_NOREQUEST; +} + +static inline bool irq_settings_can_probe(struct irq_desc *desc) +{ + return !(desc->status & _IRQ_NOPROBE); +} + +static inline void irq_settings_clr_noprobe(struct irq_desc *desc) +{ + desc->status &= ~_IRQ_NOPROBE; +} + +static inline void irq_settings_set_noprobe(struct irq_desc *desc) +{ + desc->status |= _IRQ_NOPROBE; +} + +static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc) +{ + return desc->status & _IRQ_MOVE_PCNTXT; +} + +static inline bool irq_settings_can_autoenable(struct irq_desc *desc) +{ + return !(desc->status & _IRQ_NOAUTOEN); +} + +static inline bool irq_settings_is_nested_thread(struct irq_desc *desc) +{ + return desc->status & _IRQ_NESTED_THREAD; +} diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 226ed7d26a84..dd586ebf9c8c 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -68,8 +68,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) raw_spin_lock(&desc->lock); /* PER_CPU and nested thread interrupts are never polled */ - if (irq_settings_is_per_cpu(desc) || - (desc->status & IRQ_NESTED_THREAD)) + if (irq_settings_is_per_cpu(desc) || 
irq_settings_is_nested_thread(desc)) goto out; /* -- cgit v1.2.3 From f9e4989eb8183a1f33581fa1b99274287b0639d2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Feb 2011 14:54:49 +0100 Subject: genirq: Force wrapped access to desc->status in core code Force the usage of wrappers by another nasty CPP substitution. Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 6 +++--- kernel/irq/irqdesc.c | 4 ++-- kernel/irq/settings.h | 3 +++ 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 6e34bdbeb26a..cb62e2d0df4e 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -55,7 +55,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) { irqreturn_t ret, retval = IRQ_NONE; - unsigned int status = 0, irq = desc->irq_data.irq; + unsigned int random = 0, irq = desc->irq_data.irq; do { trace_irq_handler_entry(irq, action); @@ -98,7 +98,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) /* Fall through to add to randomness */ case IRQ_HANDLED: - status |= action->flags; + random |= action->flags; break; default: @@ -109,7 +109,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) action = action->next; } while (action); - if (status & IRQF_SAMPLE_RANDOM) + if (random & IRQF_SAMPLE_RANDOM) add_interrupt_randomness(irq); if (!noirqdebug) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 78866d050bc9..3387fbd7f2f4 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -79,7 +79,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_data.chip_data = NULL; desc->irq_data.handler_data = NULL; desc->irq_data.msi_desc = NULL; - desc->status = _IRQ_DEFAULT_INIT_FLAGS; + irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); desc->istate = IRQS_DISABLED; desc->handle_irq = handle_bad_irq; desc->depth = 1; @@ -247,7 +247,6 @@ int __init early_irq_init(void) struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { - .status = _IRQ_DEFAULT_INIT_FLAGS, .istate = IRQS_DISABLED, .handle_irq = handle_bad_irq, .depth = 1, @@ -271,6 +270,7 @@ int __init early_irq_init(void) desc[i].irq_data.irq = i; desc[i].irq_data.chip = &no_irq_chip; desc[i].kstat_irqs = alloc_percpu(unsigned int); + irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); alloc_masks(desc + i, GFP_KERNEL, node); desc_smp_init(desc + i, node); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 216b6f200e7c..47bcd3b9f399 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -148,3 +148,6 @@ static inline bool irq_settings_is_nested_thread(struct irq_desc *desc) { return desc->status & _IRQ_NESTED_THREAD; } + +/* Nothing should touch desc->status from now on */ +#define status USE_THE_PROPER_WRAPPERS_YOU_MORON -- cgit v1.2.3 From 5d4d8fc9ac3e9a90bbdf90bae6864cb2c01f2208 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Feb 2011 17:27:18 +0100 Subject: genirq: Cleanup irq.h Put the constants into an enum and document them. 
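Why the #undef lines can simply be dropped (a sketch with a placeholder name): once the IRQ_* constants are enum members in <linux/irq.h> rather than preprocessor defines, there is no macro left to undefine, and a bare #define is enough to poison the name for the rest of this file.

        /* <linux/irq.h>: the constant is now an enum member, not a macro */
        enum {
                IRQ_EXAMPLE = 1 << 4,           /* placeholder flag */
        };

        /* kernel/irq/settings.h: capture the value, then poison the name */
        enum {
                _IRQ_EXAMPLE = IRQ_EXAMPLE,
        };
        #define IRQ_EXAMPLE GOT_YOU_MORON       /* textual use still fails to build */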
Signed-off-by: Thomas Gleixner --- kernel/irq/settings.h | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 47bcd3b9f399..55ebe1e09da4 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -15,37 +15,21 @@ enum { _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; -#undef IRQ_INPROGRESS #define IRQ_INPROGRESS GOT_YOU_MORON -#undef IRQ_REPLAY #define IRQ_REPLAY GOT_YOU_MORON -#undef IRQ_WAITING #define IRQ_WAITING GOT_YOU_MORON -#undef IRQ_DISABLED #define IRQ_DISABLED GOT_YOU_MORON -#undef IRQ_PENDING #define IRQ_PENDING GOT_YOU_MORON -#undef IRQ_MASKED #define IRQ_MASKED GOT_YOU_MORON -#undef IRQ_WAKEUP #define IRQ_WAKEUP GOT_YOU_MORON -#undef IRQ_MOVE_PENDING #define IRQ_MOVE_PENDING GOT_YOU_MORON -#undef IRQ_PER_CPU #define IRQ_PER_CPU GOT_YOU_MORON -#undef IRQ_NO_BALANCING #define IRQ_NO_BALANCING GOT_YOU_MORON -#undef IRQ_AFFINITY_SET #define IRQ_AFFINITY_SET GOT_YOU_MORON -#undef IRQ_LEVEL #define IRQ_LEVEL GOT_YOU_MORON -#undef IRQ_NOPROBE #define IRQ_NOPROBE GOT_YOU_MORON -#undef IRQ_NOREQUEST #define IRQ_NOREQUEST GOT_YOU_MORON -#undef IRQ_NOAUTOEN #define IRQ_NOAUTOEN GOT_YOU_MORON -#undef IRQ_NESTED_THREAD #define IRQ_NESTED_THREAD GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON -- cgit v1.2.3 From d4d5e08960844a062da8387ee5f16ca7a33200d0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Feb 2011 13:16:14 +0100 Subject: genirq: Add IRQCHIP_SET_TYPE_MASKED flag irq_chips, which require to mask the chip before changing the trigger type should set this flag. So the core takes care of it and the requirement for looking into desc->status in the chip goes away. Signed-off-by: Thomas Gleixner Cc: Linus Walleij Cc: Lars-Peter Clausen --- kernel/irq/chip.c | 4 ++-- kernel/irq/internals.h | 2 ++ kernel/irq/manage.c | 16 +++++++++++++--- 3 files changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 9e9220da4deb..4687457fe7f0 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -367,7 +367,7 @@ static inline void mask_ack_irq(struct irq_desc *desc) irq_state_set_masked(desc); } -static inline void mask_irq(struct irq_desc *desc) +void mask_irq(struct irq_desc *desc) { if (desc->irq_data.chip->irq_mask) { desc->irq_data.chip->irq_mask(&desc->irq_data); @@ -375,7 +375,7 @@ static inline void mask_irq(struct irq_desc *desc) } } -static inline void unmask_irq(struct irq_desc *desc) +void unmask_irq(struct irq_desc *desc) { if (desc->irq_data.chip->irq_unmask) { desc->irq_data.chip->irq_unmask(&desc->irq_data); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 6776453c454c..1d500fbde0d4 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -84,6 +84,8 @@ extern int irq_startup(struct irq_desc *desc); extern void irq_shutdown(struct irq_desc *desc); extern void irq_enable(struct irq_desc *desc); extern void irq_disable(struct irq_desc *desc); +extern void mask_irq(struct irq_desc *desc); +extern void unmask_irq(struct irq_desc *desc); extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index b5de828e58d9..50809c79c7ad 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -554,8 +554,8 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc) int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags) { - int ret; struct irq_chip *chip = 
desc->irq_data.chip; + int ret, unmask = 0; if (!chip || !chip->irq_set_type) { /* @@ -568,6 +568,14 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, } flags &= IRQ_TYPE_SENSE_MASK; + + if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { + if (!(desc->istate & IRQS_MASKED)) + mask_irq(desc); + if (!(desc->istate & IRQS_DISABLED)) + unmask = 1; + } + /* caller masked out all except trigger mode flags */ ret = chip->irq_set_type(&desc->irq_data, flags); @@ -588,11 +596,13 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, if (chip != desc->irq_data.chip) irq_chip_set_defaults(desc->irq_data.chip); - return 0; + ret = 0; default: pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", flags, irq, chip->irq_set_type); } + if (unmask) + unmask_irq(desc); return ret; } @@ -669,7 +679,7 @@ again: #ifdef CONFIG_SMP /* - * Check whether we need to change the affinity of the interrupt thread. + * Check whether we need to chasnge the affinity of the interrupt thread. */ static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) -- cgit v1.2.3 From 7f94226f03299f1ca32f118f02f2a0295e0e5e93 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Feb 2011 19:46:26 +0100 Subject: genirq: Move wakeup state to irq_data Some irq_chips need to know the state of wakeup mode for setting the trigger type etc. Reflect it in irq_data state. Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 2 -- kernel/irq/manage.c | 4 ++-- kernel/irq/pm.c | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 1d500fbde0d4..5e2366da9f38 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -46,7 +46,6 @@ enum { * IRQS_PENDING - irq is pending and replayed later * IRQS_MASKED - irq is masked * IRQS_SUSPENDED - irq is suspended - * IRQS_WAKEUP - irq triggers system wakeup from suspend */ enum { IRQS_AUTODETECT = 0x00000001, @@ -60,7 +59,6 @@ enum { IRQS_PENDING = 0x00000200, IRQS_MASKED = 0x00000400, IRQS_SUSPENDED = 0x00000800, - IRQS_WAKEUP = 0x00001000, }; #include "compat.h" diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 50809c79c7ad..ea6add6036b1 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -492,7 +492,7 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on) if (ret) desc->wake_depth = 0; else - desc->istate |= IRQS_WAKEUP; + irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE); } } else { if (desc->wake_depth == 0) { @@ -502,7 +502,7 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on) if (ret) desc->wake_depth = 1; else - desc->istate &= ~IRQS_WAKEUP; + irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE); } } diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index f39383d8672d..1329f0eff49e 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -69,7 +69,7 @@ int check_wakeup_irqs(void) int irq; for_each_irq_desc(irq, desc) - if ((desc->istate & IRQS_WAKEUP) && + if (irqd_is_wakeup_set(&desc->irq_data) && (desc->istate & IRQS_PENDING)) return -EBUSY; -- cgit v1.2.3 From e1ef824146131709d7466e37f889f2dab24ca98e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Feb 2011 22:25:31 +0100 Subject: genirq: Reflect IRQ_MOVE_PCNTXT in irq_data state Required by x86. 
Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 4687457fe7f0..2b0f9192a830 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -712,11 +712,13 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irq_settings_clr_and_set(desc, clr, set); irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | - IRQD_TRIGGER_MASK | IRQD_LEVEL); + IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT); if (irq_settings_has_no_balance_set(desc)) irqd_set(&desc->irq_data, IRQD_NO_BALANCING); if (irq_settings_is_per_cpu(desc)) irqd_set(&desc->irq_data, IRQD_PER_CPU); + if (irq_settings_can_move_pcntxt(desc)) + irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT); irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); -- cgit v1.2.3 From a6967caf00ebbb2d4acdebcb72a25f2e9ba43fd2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Feb 2011 22:01:25 +0100 Subject: genirq: Remove desc->status when GENERIC_HARDIRQS_NO_COMPAT=y If everything uses the right accessors, then enabling GENERIC_HARDIRQS_NO_COMPAT should just work. If not it will tell you. Don't be lazy and use the trick which I use in the core code! git grep status_use_accessors will unearth it in a split second. Offenders are tracked down and not slapped with stinking trouts. This time we use frozen shark for a better educational value. Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 4 ++++ kernel/irq/settings.h | 1 + 2 files changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 5e2366da9f38..fd5777ab2d34 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -15,6 +15,10 @@ #define istate core_internal_state__do_not_mess_with_it +#ifdef CONFIG_GENERIC_HARDIRQS_NO_COMPAT +# define status status_use_accessors +#endif + extern int noirqdebug; /* diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 55ebe1e09da4..0227ad358272 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -134,4 +134,5 @@ static inline bool irq_settings_is_nested_thread(struct irq_desc *desc) } /* Nothing should touch desc->status from now on */ +#undef status #define status USE_THE_PROPER_WRAPPERS_YOU_MORON -- cgit v1.2.3 From 091738a266fc74329ae186f22ff2b3f01319112d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Feb 2011 20:16:43 +0100 Subject: genirq: Remove real old transition functions These transition helpers are stale for years now. Remove them. 
Signed-off-by: Thomas Gleixner --- kernel/irq/autoprobe.c | 6 ------ kernel/irq/chip.c | 16 ++++------------ kernel/irq/internals.h | 3 --- kernel/irq/manage.c | 14 +------------- 4 files changed, 5 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index c8bbc4fabaab..394784c57060 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -46,12 +46,6 @@ unsigned long probe_irq_on(void) for_each_irq_desc_reverse(i, desc) { raw_spin_lock_irq(&desc->lock); if (!desc->action && irq_settings_can_probe(desc)) { - /* - * An old-style architecture might still have - * the handle_bad_irq handler there: - */ - compat_irq_chip_set_default_handler(desc); - /* * Some chips need to know about probing in * progress: diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2b0f9192a830..c19c0b562c80 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -644,19 +644,11 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, return; } - if (!handle) + if (!handle) { handle = handle_bad_irq; - else if (desc->irq_data.chip == &no_irq_chip) { - printk(KERN_WARNING "Trying to install %sinterrupt handler " - "for IRQ%d\n", is_chained ? "chained " : "", irq); - /* - * Some ARM implementations install a handler for really dumb - * interrupt hardware without setting an irq_chip. This worked - * with the ARM no_irq_chip but the check in setup_irq would - * prevent us to setup the interrupt at all. Switch it to - * dummy_irq_chip for easy transition. - */ - desc->irq_data.chip = &dummy_irq_chip; + } else { + if (WARN_ON(desc->irq_data.chip == &no_irq_chip)) + return; } chip_bus_lock(desc); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index fd5777ab2d34..f80a77471617 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -74,9 +74,6 @@ enum { /* Set default functions for irq_chip structures: */ extern void irq_chip_set_defaults(struct irq_chip *chip); -/* Set default handler: */ -extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); - extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags); extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ea6add6036b1..99395a24f432 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -540,17 +540,6 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) return !action; } -void compat_irq_chip_set_default_handler(struct irq_desc *desc) -{ - /* - * If the architecture still has not overriden - * the flow handler then zap the default. This - * should catch incorrect flow-type setting. - */ - if (desc->handle_irq == &handle_bad_irq) - desc->handle_irq = NULL; -} - int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags) { @@ -912,8 +901,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (ret) goto out_mask; - } else - compat_irq_chip_set_default_handler(desc); + } desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ IRQS_INPROGRESS | IRQS_ONESHOT | \ -- cgit v1.2.3 From d5eb4ad2dfb2dfae43fd51bc8630b4fc3ef00e92 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 12 Feb 2011 12:16:16 +0100 Subject: genirq: Implement irq_get/put_desc_[bus]locked/unlock() Most of the managing functions get the irq descriptor and lock it - either with or without buslock. 
Instead of open coding this over and over provide a common function to do that. Signed-off-by: Thomas Gleixner --- kernel/irq/internals.h | 28 ++++++++++++++++++++++++++++ kernel/irq/irqdesc.c | 20 ++++++++++++++++++++ 2 files changed, 48 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index f80a77471617..935bec4bfa87 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -126,6 +126,34 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc) desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); } +struct irq_desc * +__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus); +void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus); + +static inline struct irq_desc * +irq_get_desc_buslock(unsigned int irq, unsigned long *flags) +{ + return __irq_get_desc_lock(irq, flags, true); +} + +static inline void +irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags) +{ + __irq_put_desc_unlock(desc, flags, true); +} + +static inline struct irq_desc * +irq_get_desc_lock(unsigned int irq, unsigned long *flags) +{ + return __irq_get_desc_lock(irq, flags, false); +} + +static inline void +irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) +{ + __irq_put_desc_unlock(desc, flags, false); +} + /* * Manipulation functions for irq_data.state */ diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 3387fbd7f2f4..394ab6a6c62c 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -402,6 +402,26 @@ unsigned int irq_get_next_irq(unsigned int offset) return find_next_bit(allocated_irqs, nr_irqs, offset); } +struct irq_desc * +__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc) { + if (bus) + chip_bus_lock(desc); + raw_spin_lock_irqsave(&desc->lock, *flags); + } + return desc; +} + +void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus) +{ + raw_spin_unlock_irqrestore(&desc->lock, flags); + if (bus) + chip_bus_sync_unlock(desc); +} + /** * dynamic_irq_cleanup - cleanup a dynamically allocated irq * @irq: irq number to initialize -- cgit v1.2.3 From 02725e7471b8dd58fa96f6604bdb5dde45405a2e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 12 Feb 2011 10:37:36 +0100 Subject: genirq: Use irq_get/put functions Convert the management functions to use the common irq_get/put function. 
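The converted setters all end up with the same shape. As a rough illustration (not part of the patch; the function name and the field it touches are placeholders), a converted helper looks like this:

        int irq_set_foo(unsigned int irq, void *data)
        {
                unsigned long flags;
                struct irq_desc *desc = irq_get_desc_lock(irq, &flags);

                if (!desc)
                        return -EINVAL;
                desc->irq_data.handler_data = data;     /* whatever this particular setter updates */
                irq_put_desc_unlock(desc, flags);
                return 0;
        }

Functions which also need the chip bus lock use the irq_get_desc_buslock()/irq_put_desc_busunlock() pair instead.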
Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 93 +++++++++++++++-------------------------------------- kernel/irq/manage.c | 81 ++++++++++++++++++---------------------------- 2 files changed, 57 insertions(+), 117 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c19c0b562c80..3ea6aecd99c0 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -25,22 +25,18 @@ */ int irq_set_chip(unsigned int irq, struct irq_chip *chip) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); - if (!desc) { - WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq); + if (!desc) return -EINVAL; - } if (!chip) chip = &no_irq_chip; - raw_spin_lock_irqsave(&desc->lock, flags); irq_chip_set_defaults(chip); desc->irq_data.chip = chip; - raw_spin_unlock_irqrestore(&desc->lock, flags); - + irq_put_desc_unlock(desc, flags); return 0; } EXPORT_SYMBOL(irq_set_chip); @@ -52,24 +48,17 @@ EXPORT_SYMBOL(irq_set_chip); */ int irq_set_irq_type(unsigned int irq, unsigned int type) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - int ret = -ENXIO; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); + int ret = 0; - if (!desc) { - printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); - return -ENODEV; - } + if (!desc) + return -EINVAL; type &= IRQ_TYPE_SENSE_MASK; - if (type == IRQ_TYPE_NONE) - return 0; - - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, flags); - ret = __irq_set_trigger(desc, irq, type); - raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); + if (type != IRQ_TYPE_NONE) + ret = __irq_set_trigger(desc, irq, type); + irq_put_desc_busunlock(desc, flags); return ret; } EXPORT_SYMBOL(irq_set_irq_type); @@ -83,18 +72,13 @@ EXPORT_SYMBOL(irq_set_irq_type); */ int irq_set_handler_data(unsigned int irq, void *data) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); - if (!desc) { - printk(KERN_ERR - "Trying to install controller data for IRQ%d\n", irq); + if (!desc) return -EINVAL; - } - - raw_spin_lock_irqsave(&desc->lock, flags); desc->irq_data.handler_data = data; - raw_spin_unlock_irqrestore(&desc->lock, flags); + irq_put_desc_unlock(desc, flags); return 0; } EXPORT_SYMBOL(irq_set_handler_data); @@ -108,20 +92,15 @@ EXPORT_SYMBOL(irq_set_handler_data); */ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); - if (!desc) { - printk(KERN_ERR - "Trying to install msi data for IRQ%d\n", irq); + if (!desc) return -EINVAL; - } - - raw_spin_lock_irqsave(&desc->lock, flags); desc->irq_data.msi_desc = entry; if (entry) entry->irq = irq; - raw_spin_unlock_irqrestore(&desc->lock, flags); + irq_put_desc_unlock(desc, flags); return 0; } @@ -134,24 +113,13 @@ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) */ int irq_set_chip_data(unsigned int irq, void *data) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); - if (!desc) { - printk(KERN_ERR - "Trying to install chip data for IRQ%d\n", irq); - return -EINVAL; - } - - if (!desc->irq_data.chip) { - printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); + if (!desc) return -EINVAL; - } - - raw_spin_lock_irqsave(&desc->lock, flags); desc->irq_data.chip_data = data; - 
raw_spin_unlock_irqrestore(&desc->lock, flags); - + irq_put_desc_unlock(desc, flags); return 0; } EXPORT_SYMBOL(irq_set_chip_data); @@ -635,25 +603,19 @@ void __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, const char *name) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); - if (!desc) { - printk(KERN_ERR - "Trying to install type control for IRQ%d\n", irq); + if (!desc) return; - } if (!handle) { handle = handle_bad_irq; } else { if (WARN_ON(desc->irq_data.chip == &no_irq_chip)) - return; + goto out; } - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, flags); - /* Uninstall? */ if (handle == handle_bad_irq) { if (desc->irq_data.chip != &no_irq_chip) @@ -670,8 +632,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, irq_settings_set_norequest(desc); irq_startup(desc); } - raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); +out: + irq_put_desc_busunlock(desc, flags); } EXPORT_SYMBOL_GPL(__set_irq_handler); @@ -693,14 +655,11 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); if (!desc) return; - - raw_spin_lock_irqsave(&desc->lock, flags); - irq_settings_clr_and_set(desc, clr, set); irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | @@ -714,5 +673,5 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); - raw_spin_unlock_irqrestore(&desc->lock, flags); + irq_put_desc_unlock(desc, flags); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 99395a24f432..6cca1956c503 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -172,16 +172,13 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask) int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); if (!desc) return -EINVAL; - - raw_spin_lock_irqsave(&desc->lock, flags); desc->affinity_hint = m; - raw_spin_unlock_irqrestore(&desc->lock, flags); - + irq_put_desc_unlock(desc, flags); return 0; } EXPORT_SYMBOL_GPL(irq_set_affinity_hint); @@ -336,6 +333,18 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) irq_disable(desc); } +static int __disable_irq_nosync(unsigned int irq) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); + + if (!desc) + return -EINVAL; + __disable_irq(desc, irq, false); + irq_put_desc_busunlock(desc, flags); + return 0; +} + /** * disable_irq_nosync - disable an irq without waiting * @irq: Interrupt to disable @@ -349,17 +358,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) */ void disable_irq_nosync(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - - if (!desc) - return; - - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, flags); - __disable_irq(desc, irq, false); - raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); + __disable_irq_nosync(irq); } EXPORT_SYMBOL(disable_irq_nosync); @@ -377,13 +376,7 @@ EXPORT_SYMBOL(disable_irq_nosync); */ void disable_irq(unsigned int irq) { - struct irq_desc 
*desc = irq_to_desc(irq); - - if (!desc) - return; - - disable_irq_nosync(irq); - if (desc->action) + if (!__disable_irq_nosync(irq)) synchronize_irq(irq); } EXPORT_SYMBOL(disable_irq); @@ -434,21 +427,18 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) */ void enable_irq(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); if (!desc) return; - if (WARN(!desc->irq_data.chip, KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) - return; + goto out; - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, false); - raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); +out: + irq_put_desc_busunlock(desc, flags); } EXPORT_SYMBOL(enable_irq); @@ -477,15 +467,13 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) */ int irq_set_irq_wake(unsigned int irq, unsigned int on) { - struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); int ret = 0; /* wakeup-capable irqs can be shared between drivers that * don't need to have the same sleep mode behaviors. */ - chip_bus_lock(desc); - raw_spin_lock_irqsave(&desc->lock, flags); if (on) { if (desc->wake_depth++ == 0) { ret = set_irq_wake_real(irq, on); @@ -505,9 +493,7 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on) irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE); } } - - raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(desc); + irq_put_desc_busunlock(desc, flags); return ret; } EXPORT_SYMBOL(irq_set_irq_wake); @@ -519,25 +505,20 @@ EXPORT_SYMBOL(irq_set_irq_wake); */ int can_request_irq(unsigned int irq, unsigned long irqflags) { - struct irq_desc *desc = irq_to_desc(irq); - struct irqaction *action; unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); + int canrequest = 0; if (!desc) return 0; - if (!irq_settings_can_request(desc)) - return 0; - - raw_spin_lock_irqsave(&desc->lock, flags); - action = desc->action; - if (action) - if (irqflags & action->flags & IRQF_SHARED) - action = NULL; - - raw_spin_unlock_irqrestore(&desc->lock, flags); - - return !action; + if (irq_settings_can_request(desc)) { + if (desc->action) + if (irqflags & desc->action->flags & IRQF_SHARED) + canrequest =1; + } + irq_put_desc_unlock(desc, flags); + return canrequest; } int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, -- cgit v1.2.3 From 3836ca08aad4575c120ccf328652f3873eea9063 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Feb 2011 20:09:19 +0100 Subject: genirq: Consolidate set_chip_handler functions No need to have separate functions if we have one plus inline wrappers. 
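For reference, an editorial sketch of the header-side wrappers (they live in include/linux/irq.h and are therefore not part of this kernel/ diff): each one reduces to a call into the two remaining functions.

        static inline void
        irq_set_chip_and_handler(unsigned int irq, struct irq_chip *chip,
                                 irq_flow_handler_t handle)
        {
                irq_set_chip_and_handler_name(irq, chip, handle, NULL);
        }

        static inline void
        irq_set_handler(unsigned int irq, irq_flow_handler_t handle)
        {
                __irq_set_handler(irq, handle, 0, NULL);
        }

        static inline void
        irq_set_chained_handler(unsigned int irq, irq_flow_handler_t handle)
        {
                /* chained handlers are installed with is_chained = 1 */
                __irq_set_handler(irq, handle, 1, NULL);
        }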
Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 3ea6aecd99c0..2a36038b8f59 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -600,7 +600,7 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) } void -__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, +__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, const char *name) { unsigned long flags; @@ -635,22 +635,14 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, out: irq_put_desc_busunlock(desc, flags); } -EXPORT_SYMBOL_GPL(__set_irq_handler); +EXPORT_SYMBOL_GPL(__irq_set_handler); void -set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, - irq_flow_handler_t handle) -{ - irq_set_chip(irq, chip); - __set_irq_handler(irq, handle, 0, NULL); -} - -void -set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, +irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, irq_flow_handler_t handle, const char *name) { irq_set_chip(irq, chip); - __set_irq_handler(irq, handle, 0, name); + __irq_set_handler(irq, handle, 0, name); } void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) -- cgit v1.2.3 From 781295762defc709a609efc01d8bb065276cd9a2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Feb 2011 15:14:20 +0100 Subject: genirq: Add preflow handler support sparc64 needs to call a preflow handler on certain interrupts befor calling the action chain. Integrate it into handle_fasteoi_irq. Must be enabled via CONFIG_IRQ_FASTEOI_PREFLOW. No impact when disabled. Signed-off-by: Thomas Gleixner Cc: David S. Miller --- kernel/irq/Kconfig | 3 +++ kernel/irq/chip.c | 11 +++++++++++ 2 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 48ad25f5fa59..9149be729e45 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -35,6 +35,9 @@ config AUTO_IRQ_AFFINITY config HARDIRQS_SW_RESEND def_bool n +config IRQ_PREFLOW_FASTEOI + def_bool n + config SPARSE_IRQ bool "Support sparse irq numbering" depends on HAVE_SPARSE_IRQ diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 2a36038b8f59..08be5d182be3 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -471,6 +471,16 @@ out_unlock: } EXPORT_SYMBOL_GPL(handle_level_irq); +#ifdef CONFIG_IRQ_PREFLOW_FASTEOI +static inline void preflow_handler(struct irq_desc *desc) +{ + if (desc->preflow_handler) + desc->preflow_handler(&desc->irq_data); +} +#else +static inline void preflow_handler(struct irq_desc *desc) { } +#endif + /** * handle_fasteoi_irq - irq handler for transparent controllers * @irq: the interrupt number @@ -503,6 +513,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) mask_irq(desc); goto out; } + preflow_handler(desc); handle_irq_event(desc); out: desc->irq_data.chip->irq_eoi(&desc->irq_data); -- cgit v1.2.3 From 77694b408abb8f92195ad5ed6ce5492f1d794c77 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 15 Feb 2011 10:33:57 +0100 Subject: genirq; Add fasteoi irq_chip quirk Some chips want irq_eoi() only called when an interrupt is actually handled. So they have checks for INPROGRESS and DISABLED in their irq_eoi callbacks. Add a chip flag, which allows to handle that in the generic code. No impact on the fastpath. 
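A chip that wants this behaviour simply sets the new flag and drops its private checks; a minimal sketch (the foo_* names are invented for illustration only) might be:

        static struct irq_chip foo_irq_chip = {
                .name           = "FOO",
                .irq_mask       = foo_mask,
                .irq_unmask     = foo_unmask,
                .irq_eoi        = foo_eoi,      /* no INPROGRESS/DISABLED checks needed inside */
                .flags          = IRQCHIP_EOI_IF_HANDLED,
        };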
Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 08be5d182be3..1d3e25e68b0c 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -515,9 +515,16 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) } preflow_handler(desc); handle_irq_event(desc); -out: + +out_eoi: desc->irq_data.chip->irq_eoi(&desc->irq_data); +out_unlock: raw_spin_unlock(&desc->lock); + return; +out: + if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED)) + goto out_eoi; + goto out_unlock; } /** -- cgit v1.2.3 From a439520f8b18917b322f576be04c54aba84bb044 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 4 Feb 2011 18:46:16 +0100 Subject: genirq: Implement irq_data based move_*_irq() versions No need to lookup the irq descriptor when calling from a chip callback function which has irq_data already handy. Signed-off-by: Thomas Gleixner --- kernel/irq/migration.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 7a93c6b88b25..ec4806d4778b 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -4,10 +4,10 @@ #include "internals.h" -void move_masked_irq(int irq) +void irq_move_masked_irq(struct irq_data *idata) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_chip *chip = desc->irq_data.chip; + struct irq_desc *desc = irq_data_to_desc(idata); + struct irq_chip *chip = idata->chip; if (likely(!irqd_is_setaffinity_pending(&desc->irq_data))) return; @@ -53,12 +53,17 @@ void move_masked_irq(int irq) cpumask_clear(desc->pending_mask); } -void move_native_irq(int irq) +void move_masked_irq(int irq) +{ + irq_move_masked_irq(irq_get_irq_data(irq)); +} + +void irq_move_irq(struct irq_data *idata) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc = irq_data_to_desc(idata); bool masked; - if (likely(!irqd_is_setaffinity_pending(&desc->irq_data))) + if (likely(!irqd_is_setaffinity_pending(idata))) return; if (unlikely(desc->istate & IRQS_DISABLED)) @@ -71,8 +76,13 @@ void move_native_irq(int irq) */ masked = desc->istate & IRQS_MASKED; if (!masked) - desc->irq_data.chip->irq_mask(&desc->irq_data); - move_masked_irq(irq); + idata->chip->irq_mask(idata); + irq_move_masked_irq(idata); if (!masked) - desc->irq_data.chip->irq_unmask(&desc->irq_data); + idata->chip->irq_unmask(idata); +} + +void move_native_irq(int irq) +{ + irq_move_irq(irq_get_irq_data(irq)); } -- cgit v1.2.3 From 24d51add7438f9696a7205927bf9de3c5c787a58 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 21 Feb 2011 09:52:50 +0100 Subject: workqueue: fix build failure introduced by s/freezeable/freezable/ wq:fixes-2.6.38 does s/WQ_FREEZEABLE/WQ_FREEZABLE and wq:for-2.6.39 adds new usage of the flag. The combination of the two creates a build failure after merge. Fix it by renaming all freezeables to freezables. 
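Callers only see the new spelling; typical usage (my_work and the queue name are placeholders) stays the same:

        /* queue onto the shared freezable workqueue */
        queue_work(system_freezable_wq, &my_work);

        /* or create a dedicated freezable workqueue */
        struct workqueue_struct *wq = alloc_workqueue("foo_wq", WQ_FREEZABLE, 0);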
Signed-off-by: Tejun Heo Reported-by: Stephen Rothwell --- kernel/workqueue.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 572f559f6cb9..1b64d225f067 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -251,12 +251,12 @@ struct workqueue_struct *system_wq __read_mostly; struct workqueue_struct *system_long_wq __read_mostly; struct workqueue_struct *system_nrt_wq __read_mostly; struct workqueue_struct *system_unbound_wq __read_mostly; -struct workqueue_struct *system_freezeable_wq __read_mostly; +struct workqueue_struct *system_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_wq); EXPORT_SYMBOL_GPL(system_long_wq); EXPORT_SYMBOL_GPL(system_nrt_wq); EXPORT_SYMBOL_GPL(system_unbound_wq); -EXPORT_SYMBOL_GPL(system_freezeable_wq); +EXPORT_SYMBOL_GPL(system_freezable_wq); #define CREATE_TRACE_POINTS #include @@ -3777,10 +3777,10 @@ static int __init init_workqueues(void) system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_UNBOUND_MAX_ACTIVE); - system_freezeable_wq = alloc_workqueue("events_freezeable", - WQ_FREEZEABLE, 0); + system_freezable_wq = alloc_workqueue("events_freezable", + WQ_FREEZABLE, 0); BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || - !system_unbound_wq || !system_freezeable_wq); + !system_unbound_wq || !system_freezable_wq); return 0; } early_initcall(init_workqueues); -- cgit v1.2.3 From a61d825808a0ce9935afebc225dcd602d5339e14 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 21 Feb 2011 12:54:34 +0100 Subject: genirq: Fix misplaced status update in irq_disable() We lazy disable interrupt lines, so only mark the line masked, when the chip provides an irq_disable callback. Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 1d3e25e68b0c..b5145654855f 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -199,8 +199,8 @@ void irq_disable(struct irq_desc *desc) irq_state_set_disabled(desc); if (desc->irq_data.chip->irq_disable) { desc->irq_data.chip->irq_disable(&desc->irq_data); + irq_state_set_masked(desc); } - irq_state_set_masked(desc); } #ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED -- cgit v1.2.3 From ed4dea6e0e33a3e58d8b77b775a8f0e433e7a005 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 19 Feb 2011 11:07:37 -0800 Subject: genirq: Use IRQ_BITMAP_BITS as search size in irq_alloc_descs() The runtime expansion of nr_irqs does not take into account that bitmap_find_next_zero_area() returns "start" + size in case the search for an matching zero area fails. That results in a start value which can be completely off and is not covered by the following expand_nr_irqs() and possibly outside of the absolute limit. But we use it without further checking. Use IRQ_BITMAP_BITS as the limit for the bitmap search and expand nr_irqs when the start bit is beyond nr_irqs. So start is always pointing to the correct area in the bitmap. nr_irqs is just the limit for irq enumerations, not the real limit for the irq space. [ tglx: Let irq_expand_nr_irqs() take the new upper end so we do not expand nr_irqs more than necessary. 
Made changelog readable ] Signed-off-by: Yinghai Lu LKML-Reference: <4D6014F9.8040605@kernel.org> Signed-off-by: Thomas Gleixner --- kernel/irq/irqdesc.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 394ab6a6c62c..dbccc799407f 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -207,11 +207,11 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) return NULL; } -static int irq_expand_nr_irqs(unsigned int cnt) +static int irq_expand_nr_irqs(unsigned int nr) { - if (nr_irqs + cnt > IRQ_BITMAP_BITS) + if (nr > IRQ_BITMAP_BITS) return -ENOMEM; - nr_irqs += cnt; + nr_irqs = nr; return 0; } @@ -298,7 +298,7 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) return start; } -static int irq_expand_nr_irqs(unsigned int cnt) +static int irq_expand_nr_irqs(unsigned int nr) { return -ENOMEM; } @@ -346,13 +346,14 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) mutex_lock(&sparse_irq_lock); - start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); + start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS, + from, cnt, 0); ret = -EEXIST; if (irq >=0 && start != irq) goto err; - if (start >= nr_irqs) { - ret = irq_expand_nr_irqs(cnt); + if (start + cnt > nr_irqs) { + ret = irq_expand_nr_irqs(start + cnt); if (ret) goto err; } -- cgit v1.2.3 From 8fff39e06987492da3d4a0b9ec7cdbd245b6762b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 21 Feb 2011 14:19:42 +0100 Subject: genirq: Add missing break in __irq_set_trigger() The switch case in __irq_set_trigger() lacks a break, which emits a pr_err unconditionally on success. Reported-by: Lars-Peter Clausen Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 6cca1956c503..01f8a9519e63 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -567,6 +567,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, if (chip != desc->irq_data.chip) irq_chip_set_defaults(desc->irq_data.chip); ret = 0; + break; default: pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", flags, irq, chip->irq_set_type); -- cgit v1.2.3 From e06383db9ec591696a06654257474b85bac1f8cb Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 14 Dec 2010 19:37:07 -0800 Subject: hrtimers: extend hrtimer base code to handle more then 2 clockids MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hrtimer code is written mainly with CLOCK_REALTIME and CLOCK_MONOTONIC in mind. These are clockids 0 and 1 resepctively. However, if we are to introduce any new hrtimer bases, using new clockids, we have to skip the cputimers (clockids 2,3) as well as other clockids that may not impelement timers. This patch adds a little bit of indirection between the clockid and the base, so that we can extend the base by one when we add a new clockid at number 7 or so. 
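Concretely, code that used to index clock_base[] directly with a clockid now goes through the lookup table added below; condensed, the pattern is (an illustration distilled from the patch, not a new interface):

        int base = hrtimer_clockid_to_base(CLOCK_MONOTONIC);   /* HRTIMER_BASE_MONOTONIC */
        timer->base = &cpu_base->clock_base[base];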
CC: Jamie Lokier CC: Thomas Gleixner CC: Alexander Shishkin CC: Arve Hjønnevåg Signed-off-by: John Stultz --- kernel/hrtimer.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 57c4d33c9a9d..ca99e2454600 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -53,11 +53,10 @@ /* * The timer bases: * - * Note: If we want to add new timer bases, we have to skip the two - * clock ids captured by the cpu-timers. We do this by holding empty - * entries rather than doing math adjustment of the clock ids. - * This ensures that we capture erroneous accesses to these clock ids - * rather than moving them into the range of valid clock id's. + * There are more clockids then hrtimer bases. Thus, we index + * into the timer bases by the hrtimer_base_type enum. When trying + * to reach a base using a clockid, hrtimer_clockid_to_base() + * is used to convert from clockid to the proper hrtimer_base_type. */ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { @@ -77,6 +76,14 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = } }; +static int hrtimer_clock_to_base_table[MAX_CLOCKS]; + +static inline int hrtimer_clockid_to_base(clockid_t clock_id) +{ + return hrtimer_clock_to_base_table[clock_id]; +} + + /* * Get the coarse grained time at the softirq based on xtime and * wall_to_monotonic. @@ -90,8 +97,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) xtim = timespec_to_ktime(xts); tomono = timespec_to_ktime(tom); - base->clock_base[CLOCK_REALTIME].softirq_time = xtim; - base->clock_base[CLOCK_MONOTONIC].softirq_time = + base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; + base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = ktime_add(xtim, tomono); } @@ -179,10 +186,11 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, struct hrtimer_cpu_base *new_cpu_base; int this_cpu = smp_processor_id(); int cpu = hrtimer_get_target(this_cpu, pinned); + int basenum = hrtimer_clockid_to_base(base->index); again: new_cpu_base = &per_cpu(hrtimer_bases, cpu); - new_base = &new_cpu_base->clock_base[base->index]; + new_base = &new_cpu_base->clock_base[basenum]; if (base != new_base) { /* @@ -618,7 +626,7 @@ static void retrigger_next_event(void *arg) /* Adjust CLOCK_REALTIME offset */ raw_spin_lock(&base->lock); - base->clock_base[CLOCK_REALTIME].offset = + base->clock_base[HRTIMER_BASE_REALTIME].offset = timespec_to_ktime(realtime_offset); hrtimer_force_reprogram(base, 0); @@ -716,8 +724,8 @@ static int hrtimer_switch_to_hres(void) return 0; } base->hres_active = 1; - base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES; - base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES; + base->clock_base[HRTIMER_BASE_REALTIME].resolution = KTIME_HIGH_RES; + base->clock_base[HRTIMER_BASE_MONOTONIC].resolution = KTIME_HIGH_RES; tick_setup_sched_timer(); @@ -1112,6 +1120,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { struct hrtimer_cpu_base *cpu_base; + int base; memset(timer, 0, sizeof(struct hrtimer)); @@ -1120,7 +1129,8 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) clock_id = CLOCK_MONOTONIC; - timer->base = &cpu_base->clock_base[clock_id]; + base = hrtimer_clockid_to_base(clock_id); + timer->base = &cpu_base->clock_base[base]; hrtimer_init_timer_hres(timer); 
timerqueue_init(&timer->node); @@ -1156,9 +1166,10 @@ EXPORT_SYMBOL_GPL(hrtimer_init); int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) { struct hrtimer_cpu_base *cpu_base; + int base = hrtimer_clockid_to_base(which_clock); cpu_base = &__raw_get_cpu_var(hrtimer_bases); - *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution); + *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); return 0; } @@ -1705,6 +1716,9 @@ static struct notifier_block __cpuinitdata hrtimers_nb = { void __init hrtimers_init(void) { + hrtimer_clock_to_base_table[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME; + hrtimer_clock_to_base_table[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC; + hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); -- cgit v1.2.3 From abb3a4ea2e0ea7114a4475745da2f32bd9ad5b73 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 14 Feb 2011 17:52:09 -0800 Subject: time: Introduce get_monotonic_boottime and ktime_get_boottime MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds new functions that return the monotonic time since boot (in other words, CLOCK_MONOTONIC + suspend time). CC: Jamie Lokier CC: Thomas Gleixner CC: Alexander Shishkin CC: Arve Hjønnevåg Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6262c1d18397..5fbd9aa7df95 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -907,7 +907,7 @@ static void update_wall_time(void) * getboottime - Return the real time of system boot. * @ts: pointer to the timespec to be set * - * Returns the time of day in a timespec. + * Returns the wall-time of boot in a timespec. * * This is based on the wall_to_monotonic offset and the total suspend * time. Calls to settimeofday will affect the value returned (which @@ -925,6 +925,55 @@ void getboottime(struct timespec *ts) } EXPORT_SYMBOL_GPL(getboottime); + +/** + * get_monotonic_boottime - Returns monotonic time since boot + * @ts: pointer to the timespec to be set + * + * Returns the monotonic time since boot in a timespec. + * + * This is similar to CLOCK_MONTONIC/ktime_get_ts, but also + * includes the time spent in suspend. + */ +void get_monotonic_boottime(struct timespec *ts) +{ + struct timespec tomono, sleep; + unsigned int seq; + s64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqbegin(&xtime_lock); + *ts = xtime; + tomono = wall_to_monotonic; + sleep = total_sleep_time; + nsecs = timekeeping_get_ns(); + + } while (read_seqretry(&xtime_lock, seq)); + + set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, + ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs); +} +EXPORT_SYMBOL_GPL(get_monotonic_boottime); + +/** + * ktime_get_boottime - Returns monotonic time since boot in a ktime + * + * Returns the monotonic time since boot in a ktime + * + * This is similar to CLOCK_MONTONIC/ktime_get, but also + * includes the time spent in suspend. + */ +ktime_t ktime_get_boottime(void) +{ + struct timespec ts; + + get_monotonic_boottime(&ts); + return timespec_to_ktime(ts); +} +EXPORT_SYMBOL_GPL(ktime_get_boottime); + /** * monotonic_to_bootbased - Convert the monotonic time to boot based. 
* @ts: pointer to the timespec to be converted -- cgit v1.2.3 From 314ac37150011ebb398f522db528d2dbcc611189 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 14 Feb 2011 18:43:08 -0800 Subject: time: Extend get_xtime_and_monotonic_offset() to also return sleep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend get_xtime_and_monotonic_offset to get_xtime_and_monotonic_and_sleep_offset(). CC: Jamie Lokier CC: Thomas Gleixner CC: Alexander Shishkin CC: Arve Hjønnevåg Signed-off-by: John Stultz --- kernel/hrtimer.c | 9 +++++---- kernel/time/timekeeping.c | 8 ++++++-- 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index ca99e2454600..e8bf3ad99063 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -91,9 +91,9 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) { ktime_t xtim, tomono; - struct timespec xts, tom; + struct timespec xts, tom, slp; - get_xtime_and_monotonic_offset(&xts, &tom); + get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); xtim = timespec_to_ktime(xts); tomono = timespec_to_ktime(tom); @@ -614,12 +614,13 @@ static int hrtimer_reprogram(struct hrtimer *timer, static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base; - struct timespec realtime_offset, wtm; + struct timespec realtime_offset, wtm, sleep; if (!hrtimer_hres_active()) return; - get_xtime_and_monotonic_offset(&realtime_offset, &wtm); + get_xtime_and_monotonic_and_sleep_offset(&realtime_offset, &wtm, + &sleep); set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); base = &__get_cpu_var(hrtimer_bases); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5fbd9aa7df95..3bd7e3d5c632 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1040,11 +1040,14 @@ void do_timer(unsigned long ticks) } /** - * get_xtime_and_monotonic_offset() - get xtime and wall_to_monotonic + * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic, + * and sleep offsets. * @xtim: pointer to timespec to be set with xtime * @wtom: pointer to timespec to be set with wall_to_monotonic + * @sleep: pointer to timespec to be set with time in suspend */ -void get_xtime_and_monotonic_offset(struct timespec *xtim, struct timespec *wtom) +void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, + struct timespec *wtom, struct timespec *sleep) { unsigned long seq; @@ -1052,6 +1055,7 @@ void get_xtime_and_monotonic_offset(struct timespec *xtim, struct timespec *wtom seq = read_seqbegin(&xtime_lock); *xtim = xtime; *wtom = wall_to_monotonic; + *sleep = total_sleep_time; } while (read_seqretry(&xtime_lock, seq)); } -- cgit v1.2.3 From 70a08cca1227dc31c784ec930099a4417a06e7d0 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 15 Feb 2011 10:45:16 -0800 Subject: timers: Add CLOCK_BOOTTIME hrtimer base MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CLOCK_MONOTONIC stops while the system is in suspend. This is because to applications system suspend is invisible. However, there is a growing set of applications that are wanting to be suspend-aware, but do not want to deal with the complications of CLOCK_REALTIME (which might jump around if settimeofday is called). For these applications, I propose a new clockid: CLOCK_BOOTTIME. 
CLOCK_BOOTTIME is idential to CLOCK_MONOTONIC, except it also includes any time spent in suspend. This patch add hrtimer base for CLOCK_BOOTTIME, using get_monotonic_boottime/ktime_get_boottime, to allow in kernel users to set timers against. CC: Jamie Lokier CC: Thomas Gleixner CC: Alexander Shishkin CC: Arve Hjønnevåg Signed-off-by: John Stultz --- kernel/hrtimer.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index e8bf3ad99063..4c53af18734b 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -73,6 +73,11 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .get_time = &ktime_get, .resolution = KTIME_LOW_RES, }, + { + .index = CLOCK_BOOTTIME, + .get_time = &ktime_get_boottime, + .resolution = KTIME_LOW_RES, + }, } }; @@ -90,16 +95,17 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) */ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) { - ktime_t xtim, tomono; + ktime_t xtim, mono, boot; struct timespec xts, tom, slp; get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); xtim = timespec_to_ktime(xts); - tomono = timespec_to_ktime(tom); + mono = ktime_add(xtim, timespec_to_ktime(tom)); + boot = ktime_add(mono, timespec_to_ktime(slp)); base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; - base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = - ktime_add(xtim, tomono); + base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; + base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; } /* @@ -727,6 +733,7 @@ static int hrtimer_switch_to_hres(void) base->hres_active = 1; base->clock_base[HRTIMER_BASE_REALTIME].resolution = KTIME_HIGH_RES; base->clock_base[HRTIMER_BASE_MONOTONIC].resolution = KTIME_HIGH_RES; + base->clock_base[HRTIMER_BASE_BOOTTIME].resolution = KTIME_HIGH_RES; tick_setup_sched_timer(); @@ -1719,6 +1726,7 @@ void __init hrtimers_init(void) { hrtimer_clock_to_base_table[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME; hrtimer_clock_to_base_table[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC; + hrtimer_clock_to_base_table[CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME; hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); -- cgit v1.2.3 From 7fdd7f89006dd5a4c702fa0ce0c272345fa44ae0 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 15 Feb 2011 10:52:57 -0800 Subject: timers: Export CLOCK_BOOTTIME via the posix timers interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch exports CLOCK_BOOTTIME through the posix timers interface CC: Jamie Lokier CC: Thomas Gleixner CC: Alexander Shishkin CC: Arve Hjønnevåg Signed-off-by: John Stultz --- kernel/posix-timers.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 44fcff131b38..4c0124919f9a 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -187,7 +187,7 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) } /* - * Get monotonic time for posix timers + * Get monotonic-raw time for posix timers */ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) { @@ -214,6 +214,14 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp *tp = ktime_to_timespec(KTIME_LOW_RES); return 0; } + +static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp) +{ + 
get_monotonic_boottime(tp); + return 0; +} + + /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ @@ -253,12 +261,23 @@ static __init int init_posix_timers(void) .clock_getres = posix_get_coarse_res, .clock_get = posix_get_monotonic_coarse, }; + struct k_clock clock_boottime = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_boottime, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + }; posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime); posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic); posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); + posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, -- cgit v1.2.3 From 70433c01613c2a44756c7b25f7bdd6c1c77b119f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 22 Feb 2011 12:50:12 +0100 Subject: genirq: Use the correct variable for note_interrupt note_interrupt wants to be called with the combined result of all handlers called, not with the last one. If it's a shared interrupt then the last handler might return IRQ_NONE often enough to trigger the spurious dectector which turns off a perfectly fine working interrupt line. Bug was introduced in commit 1277a532(genirq: Simplify handle_irq_event()). Yes, I really messed up there. First the variable ret should not have been named differently to avoid similarity with retval. Second it should have been declared in the do {} loop. Rename it to res and move it into the do {} loop and vanish under a huge brown paperbag. Reported-bisected-tested-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index cb62e2d0df4e..e099e9e9de0b 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -54,24 +54,26 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) { - irqreturn_t ret, retval = IRQ_NONE; + irqreturn_t retval = IRQ_NONE; unsigned int random = 0, irq = desc->irq_data.irq; do { + irqreturn_t res; + trace_irq_handler_entry(irq, action); - ret = action->handler(irq, action->dev_id); - trace_irq_handler_exit(irq, action, ret); + res = action->handler(irq, action->dev_id); + trace_irq_handler_exit(irq, action, res); if (WARN_ON_ONCE(!irqs_disabled())) local_irq_disable(); - switch (ret) { + switch (res) { case IRQ_WAKE_THREAD: /* * Set result to handled so the spurious check * does not trigger. 
*/ - ret = IRQ_HANDLED; + res = IRQ_HANDLED; /* * Catch drivers which return WAKE_THREAD but @@ -105,7 +107,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) break; } - retval |= ret; + retval |= res; action = action->next; } while (action); @@ -113,7 +115,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) add_interrupt_randomness(irq); if (!noirqdebug) - note_interrupt(irq, desc, ret); + note_interrupt(irq, desc, retval); return retval; } -- cgit v1.2.3 From dbebbfbb1605f0179e7c0d900d941cc9c45de569 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 22 Feb 2011 21:46:25 +0100 Subject: rtmutex: tester: Remove the remaining BKL leftovers We just leave the numbers assinged as commemoration and in case that someone was crazy enough to reimplement the test stuff out of tree. Signed-off-by: Thomas Gleixner --- kernel/rtmutex-tester.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index d5b543506cbc..5c9ccd380966 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -44,9 +44,8 @@ enum test_opcodes { RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ - RTTEST_LOCKBKL, /* 9 Was: Lock BKL */ - RTTEST_UNLOCKBKL, /* 10 Was: Unlock BKL */ - RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */ + /* 9, 10 - reserved for BKL commemoration */ + RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */ RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ RTTEST_RESET = 99, /* 99 Reset all pending operations */ }; -- cgit v1.2.3 From fd4afaf33313d94f548cb09129ecba3dbab62931 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 17 Feb 2011 13:39:05 +0000 Subject: genirq: Streamline kernel/irq/Kconfig "def_bool n" without prompt is pointless, these should be just "bool". [ tglx: Adapted to latest changes ] Signed-off-by: Jan Beulich LKML-Reference: <4D5D3309020000780003264A@vpn.id2.novell.com> Signed-off-by: Thomas Gleixner --- kernel/irq/Kconfig | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 9149be729e45..355b8c7957f5 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -1,5 +1,5 @@ config HAVE_GENERIC_HARDIRQS - def_bool n + bool if HAVE_GENERIC_HARDIRQS menu "IRQ subsystem" @@ -11,32 +11,32 @@ config GENERIC_HARDIRQS # Select this to disable the deprecated stuff config GENERIC_HARDIRQS_NO_DEPRECATED - def_bool n + bool config GENERIC_HARDIRQS_NO_COMPAT - def_bool n + bool # Options selectable by the architecture code config HAVE_SPARSE_IRQ - def_bool n + bool config GENERIC_IRQ_PROBE - def_bool n + bool config GENERIC_IRQ_SHOW - def_bool n + bool config GENERIC_PENDING_IRQ - def_bool n + bool config AUTO_IRQ_AFFINITY - def_bool n + bool config HARDIRQS_SW_RESEND - def_bool n + bool config IRQ_PREFLOW_FASTEOI - def_bool n + bool config SPARSE_IRQ bool "Support sparse irq numbering" -- cgit v1.2.3 From c186fafe9aba87c1a93df8c7120a6ae01fe435ad Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Feb 2011 18:52:53 +0100 Subject: sched: Clean up remnants of sd_idle With the wholesale removal of the sd_idle SMT logic we can clean up some more. 
Signed-off-by: Peter Zijlstra Cc: Nikhil Rao Cc: Venkatesh Pallipadi Cc: Suresh Siddha Cc: Mike Galbraith LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index d384e739ea95..cd18600a8a63 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3150,25 +3150,23 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (sds.this_load >= sds.avg_load) goto out_balanced; - /* - * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative. - * And to check for busy balance use !idle_cpu instead of - * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE - * even when they are idle. - */ - if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) { - if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) - goto out_balanced; - } else { + if (idle == CPU_IDLE) { /* * This cpu is idle. If the busiest group load doesn't * have more tasks than the number of available cpu's and * there is no imbalance between this and busiest group * wrt to idle cpu's, it is balanced. */ - if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && + if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && sds.busiest_nr_running <= sds.busiest_group_weight) goto out_balanced; + } else { + /* + * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use + * imbalance_pct to be conservative. + */ + if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) + goto out_balanced; } force_balance: @@ -3862,8 +3860,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) if (load_balance(cpu, rq, sd, idle, &balance)) { /* * We've pulled tasks over so either we're no - * longer idle, or one of our SMT siblings is - * not idle. + * longer idle. */ idle = CPU_NOT_IDLE; } -- cgit v1.2.3 From cc57aa8f4b3bece8c26c7929728edcc5fa6b5aed Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Feb 2011 18:55:32 +0100 Subject: sched: Clean up some f_b_g() comments The existing comment tends to grow state (as it already has), split it up and place it near the actual tests. Signed-off-by: Peter Zijlstra Cc: Nikhil Rao Cc: Venkatesh Pallipadi Cc: Suresh Siddha Cc: Mike Galbraith LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index cd18600a8a63..03496ebc4553 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3113,19 +3113,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, */ update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); - /* Cases where imbalance does not exist from POV of this_cpu */ - /* 1) this_cpu is not the appropriate cpu to perform load balancing - * at this level. - * 2) There is no busy sibling group to pull from. - * 3) This group is the busiest group. - * 4) This group is more busy than the avg busieness at this - * sched_domain. - * 5) The imbalance is within the specified limit. - * - * Note: when doing newidle balance, if the local group has excess - * capacity (i.e. nr_running < group_capacity) and the busiest group - * does not have any capacity, we force a load balance to pull tasks - * to the local group. In this case, we skip past checks 3, 4 and 5. + /* + * this_cpu is not the appropriate cpu to perform load balancing at + * this level. 
*/ if (!(*balance)) goto ret; @@ -3134,19 +3124,27 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, check_asym_packing(sd, &sds, this_cpu, imbalance)) return sds.busiest; + /* There is no busy sibling group to pull tasks from */ if (!sds.busiest || sds.busiest_nr_running == 0) goto out_balanced; - /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ + /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && !sds.busiest_has_capacity) goto force_balance; + /* + * If the local group is more busy than the selected busiest group + * don't try and pull any tasks. + */ if (sds.this_load >= sds.max_load) goto out_balanced; + /* + * Don't pull any tasks if this group is already above the domain + * average load. + */ sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; - if (sds.this_load >= sds.avg_load) goto out_balanced; -- cgit v1.2.3 From 866ab43efd325fae8889ea77a744d03f2b957e38 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Feb 2011 18:56:47 +0100 Subject: sched: Fix the group_imb logic On a 2*6*2 machine something like: taskset -c 3-11 bash -c 'for ((i=0;i<9;i++)) do while :; do :; done & done' _should_ result in 9 busy CPUs, each running 1 task. However it didn't quite work reliably, most of the time one cpu of the second socket (6-11) would be idle and one cpu of the first socket (0-5) would have two tasks on it. The group_imb logic is supposed to deal with this and detect when a particular group is imbalanced (like in our case, 0-2 are idle but 3-5 will have 4 tasks on it). The detection phase needed a bit of a tweak as it was too weak and required more than 2 avg weight tasks difference between idle and busy cpus in the group which won't trigger for our test-case. So cure that to be one or more avg task weight difference between cpus. Once the detection phase worked, it was then defeated by the f_b_g() tests trying to avoid ping-pongs. In particular, this_load >= max_load triggered because the pulling cpu (the (first) idle cpu in on the second socket, say 6) would find this_load to be 5 and max_load to be 4 (there'd be 5 tasks running on our socket and only 4 on the other socket). Signed-off-by: Peter Zijlstra Cc: Nikhil Rao Cc: Venkatesh Pallipadi Cc: Suresh Siddha Cc: Mike Galbraith LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 03496ebc4553..3a88dee165c0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2743,7 +2743,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, /* * Consider the group unbalanced when the imbalance is larger - * than the average weight of two tasks. + * than the average weight of a task. 
* * APZ: with cgroup the avg task weight can vary wildly and * might not be a suitable number - should we keep a @@ -2753,7 +2753,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, if (sgs->sum_nr_running) avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; - if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1) + if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) sgs->group_imb = 1; sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); @@ -3128,6 +3128,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (!sds.busiest || sds.busiest_nr_running == 0) goto out_balanced; + /* + * If the busiest group is imbalanced the below checks don't + * work because they assumes all things are equal, which typically + * isn't true due to cpus_allowed constraints and the like. + */ + if (sds.group_imb) + goto force_balance; + /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && !sds.busiest_has_capacity) -- cgit v1.2.3 From 1747b21fecbfb63fbf6b9624e8b92707960d5a97 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Sun, 20 Feb 2011 15:08:12 +0800 Subject: sched, autogroup, sysctl: Use proc_dointvec_minmax() instead sched_autogroup_enabled has min/max value, proc_dointvec_minmax() is be used for this case. Signed-off-by: Yong Zhang Cc: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1298185696-4403-2-git-send-email-yong.zhang0@gmail.com> Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cb7c830f7faa..7b5eeadfb254 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -367,7 +367,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_autogroup_enabled, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one, }, -- cgit v1.2.3 From 800d4d30c8f20bd728e5741a3b77c4859a613f7c Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Sun, 20 Feb 2011 15:08:14 +0800 Subject: sched, autogroup: Stop going ahead if autogroup is disabled when autogroup is disable from the beginning, sched_autogroup_create_attach() autogroup_move_group() <== 1 sched_move_task() <== 2 task_move_group_fair() set_task_rq() task_group() autogroup_task_group() We go the whole path without doing anything useful. Then stop going further if autogroup is disabled. But there will be a race window between 1 and 2, in which sysctl_sched_autogroup_enabled is enabled. This issue will be toke by following patch. 
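Condensed from the hunk that follows (variable setup and locking elided), the shape of the change and the race the message refers to look like this:

	p->signal->autogroup = autogroup_kref_get(ag);

	/* nothing else to do when autogroup is globally disabled */
	if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
		goto out;

	/*
	 * NOTE: the sysctl can still change between the check above and the
	 * per-thread moves below; that window is what the follow-up patch
	 * addresses.
	 */
	t = p;
	do {
		sched_move_task(t);
	} while_each_thread(p, t);
out:
	unlock_task_sighand(p, &flags);
	autogroup_kref_put(prev);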
Signed-off-by: Yong Zhang Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <1298185696-4403-4-git-send-email-yong.zhang0@gmail.com> Signed-off-by: Ingo Molnar --- kernel/sched_autogroup.c | 4 ++++ kernel/sched_autogroup.h | 5 +++++ 2 files changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c index 9fb656283157..137a096ae9d8 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched_autogroup.c @@ -161,11 +161,15 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) p->signal->autogroup = autogroup_kref_get(ag); + if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) + goto out; + t = p; do { sched_move_task(t); } while_each_thread(p, t); +out: unlock_task_sighand(p, &flags); autogroup_kref_put(prev); } diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h index 7b859ffe5dad..05577055cfca 100644 --- a/kernel/sched_autogroup.h +++ b/kernel/sched_autogroup.h @@ -1,6 +1,11 @@ #ifdef CONFIG_SCHED_AUTOGROUP struct autogroup { + /* + * reference doesn't mean how many thread attach to this + * autogroup now. It just stands for the number of task + * could use this autogroup. + */ struct kref kref; struct task_group *tg; struct rw_semaphore lock; -- cgit v1.2.3 From 511f67a5997c4967c69a3961e2fc9f04d8d244ac Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 22 Feb 2011 15:02:00 +0100 Subject: sched, autogroup: Stop claiming ownership of the root task group Disown it, and only display autogroup association if one exists. Signed-off-by: Mike Galbraith Reviewed-by: Yong Zhang Signed-off-by: Peter Zijlstra LKML-Reference: <1298383320.8036.5.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched_autogroup.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c index 137a096ae9d8..5946ac515602 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched_autogroup.c @@ -12,7 +12,6 @@ static atomic_t autogroup_seq_nr; static void __init autogroup_init(struct task_struct *init_task) { autogroup_default.tg = &root_task_group; - root_task_group.autogroup = &autogroup_default; kref_init(&autogroup_default.kref); init_rwsem(&autogroup_default.lock); init_task->signal->autogroup = &autogroup_default; @@ -130,7 +129,7 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg) static inline bool task_group_is_autogroup(struct task_group *tg) { - return tg != &root_task_group && tg->autogroup; + return !!tg->autogroup; } static inline struct task_group * @@ -251,10 +250,14 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) { struct autogroup *ag = autogroup_task_get(p); + if (!task_group_is_autogroup(ag->tg)) + goto out; + down_read(&ag->lock); seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); up_read(&ag->lock); +out: autogroup_kref_put(ag); } #endif /* CONFIG_PROC_FS */ @@ -262,9 +265,7 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) #ifdef CONFIG_SCHED_DEBUG static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) { - int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); - - if (!enabled || !tg->autogroup) + if (!task_group_is_autogroup(tg)) return 0; return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); -- cgit v1.2.3 From 3f7cce3c18188a067d463749168bdda5abc5b0f7 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Fri, 18 Feb 2011 14:40:01 +0200 Subject: perf_events: Fix rcu 
and locking issues with cgroup support This patches ensures that we do not end up calling perf_cgroup_from_task() when there is no cgroup event. This avoids potential RCU and locking issues. The change in perf_cgroup_set_timestamp() ensures we check against ctx->nr_cgroups. It also avoids calling perf_clock() tiwce in a row. It also ensures we do need to grab ctx->lock before calling the function. We drop update_cgrp_time() from task_clock_event_read() because it is not needed. This also avoids having to deal with perf_cgroup_from_task(). Thanks to Peter Zijlstra for his help on this. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4d5e76b8.815bdf0a.7ac3.774f@mx.google.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 40 +++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index a0a6987fabc4..dadeaea4b3fc 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -201,6 +201,11 @@ __get_cpu_context(struct perf_event_context *ctx) #ifdef CONFIG_CGROUP_PERF +/* + * Must ensure cgroup is pinned (css_get) before calling + * this function. In other words, we cannot call this function + * if there is no cgroup event for the current CPU context. + */ static inline struct perf_cgroup * perf_cgroup_from_task(struct task_struct *task) { @@ -268,28 +273,41 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) static inline void update_cgrp_time_from_event(struct perf_event *event) { - struct perf_cgroup *cgrp = perf_cgroup_from_task(current); + struct perf_cgroup *cgrp; + /* - * do not update time when cgroup is not active + * ensure we access cgroup data only when needed and + * when we know the cgroup is pinned (css_get) */ - if (!event->cgrp || cgrp != event->cgrp) + if (!is_cgroup_event(event)) return; - __update_cgrp_time(event->cgrp); + cgrp = perf_cgroup_from_task(current); + /* + * Do not update time when cgroup is not active + */ + if (cgrp == event->cgrp) + __update_cgrp_time(event->cgrp); } static inline void -perf_cgroup_set_timestamp(struct task_struct *task, u64 now) +perf_cgroup_set_timestamp(struct task_struct *task, + struct perf_event_context *ctx) { struct perf_cgroup *cgrp; struct perf_cgroup_info *info; - if (!task) + /* + * ctx->lock held by caller + * ensure we do not access cgroup data + * unless we have the cgroup pinned (css_get) + */ + if (!task || !ctx->nr_cgroups) return; cgrp = perf_cgroup_from_task(task); info = this_cpu_ptr(cgrp->info); - info->timestamp = now; + info->timestamp = ctx->timestamp; } #define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ @@ -494,7 +512,8 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, } static inline void -perf_cgroup_set_timestamp(struct task_struct *task, u64 now) +perf_cgroup_set_timestamp(struct task_struct *task, + struct perf_event_context *ctx) { } @@ -1613,7 +1632,7 @@ static int __perf_event_enable(void *info) /* * set current task's cgroup time reference point */ - perf_cgroup_set_timestamp(current, perf_clock()); + perf_cgroup_set_timestamp(current, ctx); __perf_event_mark_enabled(event, ctx); @@ -2048,7 +2067,7 @@ ctx_sched_in(struct perf_event_context *ctx, now = perf_clock(); ctx->timestamp = now; - perf_cgroup_set_timestamp(task, now); + perf_cgroup_set_timestamp(task, ctx); /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. 
@@ -5795,7 +5814,6 @@ static void task_clock_event_read(struct perf_event *event) if (!in_nmi()) { update_context_time(event->ctx); - update_cgrp_time_from_event(event); time = event->ctx->time; } else { u64 now = perf_clock(); -- cgit v1.2.3 From 768a06e2ca49cdf72389208cfc056a36cf8bc5e3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 Feb 2011 16:52:24 +0100 Subject: perf: Simplify task_clock_event_read() There is no point in us having different code paths for nmi and !nmi here, so remove the !nmi one. Signed-off-by: Peter Zijlstra Cc: Stephane Eranian LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index dadeaea4b3fc..64a018e94fca 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5810,16 +5810,9 @@ static void task_clock_event_del(struct perf_event *event, int flags) static void task_clock_event_read(struct perf_event *event) { - u64 time; - - if (!in_nmi()) { - update_context_time(event->ctx); - time = event->ctx->time; - } else { - u64 now = perf_clock(); - u64 delta = now - event->ctx->timestamp; - time = event->ctx->time + delta; - } + u64 now = perf_clock(); + u64 delta = now - event->ctx->timestamp; + u64 time = event->ctx->time + delta; task_clock_event_update(event, time); } -- cgit v1.2.3 From 7e9498705e810404ecf29bb2d6fa632b9484c609 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 25 Feb 2011 14:32:28 +0100 Subject: sched: Add #ifdef around irq time accounting functions Get rid of this: kernel/sched.c:3731:13: warning: 'irqtime_account_idle_ticks' defined but not used kernel/sched.c:3732:13: warning: 'irqtime_account_process_tick' defined but not used Signed-off-by: Heiko Carstens Cc: Venkatesh Pallipadi Cc: Peter Zijlstra LKML-Reference: <20110225133228.GD7469@osiris.boeblingen.de.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2effcb71a478..79bca166bed7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3687,6 +3687,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, __account_system_time(p, cputime, cputime_scaled, target_cputime64); } +#ifndef CONFIG_VIRT_CPU_ACCOUNTING #ifdef CONFIG_IRQ_TIME_ACCOUNTING /* * Account a tick to a process and cpustat @@ -3753,6 +3754,7 @@ static void irqtime_account_idle_ticks(int ticks) {} static void irqtime_account_process_tick(struct task_struct *p, int user_tick, struct rq *rq) {} #endif +#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ /* * Account for involuntary wait time. -- cgit v1.2.3 From 1204e95689f9fbd245a4ce5c1b0cd0a9b77f8d25 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Feb 2011 17:17:18 +0100 Subject: genirq: Make warning in handle_percpu_event useful The WARN_ON_ONCE in handle_percpu_event() which emits a warning when an action handler returns with interrupts enabled is not really useful. It does not reveal the interrupt number and handler function which caused it. Make it WARN_ONCE() and add the information. 
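For reference, the change boils down to the following (condition and format string match the hunk further down; surrounding code trimmed):

	/* before: fires only once, but the log does not say which irq or handler */
	if (WARN_ON_ONCE(!irqs_disabled()))
		local_irq_disable();

	/*
	 * after: same once-only semantics plus a formatted message; %pF
	 * resolves the handler pointer to a symbol name, so the offending
	 * driver can be identified from the log alone.
	 */
	if (WARN_ONCE(!irqs_disabled(), "irq %u handler %pF enabled interrupts\n",
		      irq, action->handler))
		local_irq_disable();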
Reported-by: Tony Luck Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index e099e9e9de0b..b110c835e070 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -64,7 +64,8 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) res = action->handler(irq, action->dev_id); trace_irq_handler_exit(irq, action, res); - if (WARN_ON_ONCE(!irqs_disabled())) + if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n", + irq, action->handler)) local_irq_disable(); switch (res) { -- cgit v1.2.3 From b5faba21a6805c33b40e258d36f57997ee1de131 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Feb 2011 23:52:13 +0000 Subject: genirq: Prepare the handling of shared oneshot interrupts For level type interrupts we need to track how many threads are on flight to avoid useless interrupt storms when not all thread handlers have finished yet. Keep track of the woken threads and only unmask when there are no more threads in flight. Yes, I'm lazy and using a bitfield. But not only because I'm lazy, the main reason is that it's way simpler than using a refcount. A refcount based solution would need to keep track of various things like crashing the irq thread, spurious interrupts coming in, disables/enables, free_irq() and some more. The bitfield keeps the tracking simple and makes things just work. It's also nicely confined to the thread code pathes and does not require additional checks all over the place. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20110223234956.388095876@linutronix.de> --- kernel/irq/handle.c | 76 ++++++++++++++++++++++++++++++++++++++++++++--------- kernel/irq/manage.c | 54 ++++++++++++++++++++++++++++++++----- 2 files changed, 111 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index b110c835e070..517561fc7317 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -51,6 +51,68 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) "but no thread function available.", irq, action->name); } +static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) +{ + /* + * Wake up the handler thread for this action. In case the + * thread crashed and was killed we just pretend that we + * handled the interrupt. The hardirq handler has disabled the + * device interrupt, so no irq storm is lurking. If the + * RUNTHREAD bit is already set, nothing to do. + */ + if (test_bit(IRQTF_DIED, &action->thread_flags) || + test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + return; + + /* + * It's safe to OR the mask lockless here. We have only two + * places which write to threads_oneshot: This code and the + * irq thread. + * + * This code is the hard irq context and can never run on two + * cpus in parallel. If it ever does we have more serious + * problems than this bitmask. + * + * The irq threads of this irq which clear their "running" bit + * in threads_oneshot are serialized via desc->lock against + * each other and they are serialized against this code by + * IRQS_INPROGRESS. 
+ * + * Hard irq handler: + * + * spin_lock(desc->lock); + * desc->state |= IRQS_INPROGRESS; + * spin_unlock(desc->lock); + * set_bit(IRQTF_RUNTHREAD, &action->thread_flags); + * desc->threads_oneshot |= mask; + * spin_lock(desc->lock); + * desc->state &= ~IRQS_INPROGRESS; + * spin_unlock(desc->lock); + * + * irq thread: + * + * again: + * spin_lock(desc->lock); + * if (desc->state & IRQS_INPROGRESS) { + * spin_unlock(desc->lock); + * while(desc->state & IRQS_INPROGRESS) + * cpu_relax(); + * goto again; + * } + * if (!test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + * desc->threads_oneshot &= ~mask; + * spin_unlock(desc->lock); + * + * So either the thread waits for us to clear IRQS_INPROGRESS + * or we are waiting in the flow handler for desc->lock to be + * released before we reach this point. The thread also checks + * IRQTF_RUNTHREAD under desc->lock. If set it leaves + * threads_oneshot untouched and runs the thread another time. + */ + desc->threads_oneshot |= action->thread_mask; + wake_up_process(action->thread); +} + irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) { @@ -85,19 +147,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) break; } - /* - * Wake up the handler thread for this - * action. In case the thread crashed and was - * killed we just pretend that we handled the - * interrupt. The hardirq handler above has - * disabled the device interrupt, so no irq - * storm is lurking. - */ - if (likely(!test_bit(IRQTF_DIED, - &action->thread_flags))) { - set_bit(IRQTF_RUNTHREAD, &action->thread_flags); - wake_up_process(action->thread); - } + irq_wake_thread(desc, action); /* Fall through to add to randomness */ case IRQ_HANDLED: diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 01f8a9519e63..2301de19ac7d 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -617,8 +617,11 @@ static int irq_wait_for_interrupt(struct irqaction *action) * handler finished. unmask if the interrupt has not been disabled and * is marked MASKED. */ -static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) +static void irq_finalize_oneshot(struct irq_desc *desc, + struct irqaction *action, bool force) { + if (!(desc->istate & IRQS_ONESHOT)) + return; again: chip_bus_lock(desc); raw_spin_lock_irq(&desc->lock); @@ -631,6 +634,11 @@ again: * on the other CPU. If we unmask the irq line then the * interrupt can come in again and masks the line, leaves due * to IRQS_INPROGRESS and the irq line is masked forever. + * + * This also serializes the state of shared oneshot handlers + * versus "desc->threads_onehsot |= action->thread_mask;" in + * irq_wake_thread(). See the comment there which explains the + * serialization. */ if (unlikely(desc->istate & IRQS_INPROGRESS)) { raw_spin_unlock_irq(&desc->lock); @@ -639,11 +647,23 @@ again: goto again; } - if (!(desc->istate & IRQS_DISABLED) && (desc->istate & IRQS_MASKED)) { + /* + * Now check again, whether the thread should run. Otherwise + * we would clear the threads_oneshot bit of this thread which + * was just set. 
+ */ + if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + goto out_unlock; + + desc->threads_oneshot &= ~action->thread_mask; + + if (!desc->threads_oneshot && !(desc->istate & IRQS_DISABLED) && + (desc->istate & IRQS_MASKED)) { irq_compat_clr_masked(desc); desc->istate &= ~IRQS_MASKED; desc->irq_data.chip->irq_unmask(&desc->irq_data); } +out_unlock: raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); } @@ -691,7 +711,7 @@ static int irq_thread(void *data) }; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); - int wake, oneshot = desc->istate & IRQS_ONESHOT; + int wake; sched_setscheduler(current, SCHED_FIFO, ¶m); current->irqaction = action; @@ -719,8 +739,7 @@ static int irq_thread(void *data) action->thread_fn(action->irq, action->dev_id); - if (oneshot) - irq_finalize_oneshot(action->irq, desc); + irq_finalize_oneshot(desc, action, false); } wake = atomic_dec_and_test(&desc->threads_active); @@ -729,6 +748,9 @@ static int irq_thread(void *data) wake_up(&desc->wait_for_threads); } + /* Prevent a stale desc->threads_oneshot */ + irq_finalize_oneshot(desc, action, true); + /* * Clear irqaction. Otherwise exit_irq_thread() would make * fuzz about an active irq thread going into nirvana. @@ -743,6 +765,7 @@ static int irq_thread(void *data) void exit_irq_thread(void) { struct task_struct *tsk = current; + struct irq_desc *desc; if (!tsk->irqaction) return; @@ -751,6 +774,14 @@ void exit_irq_thread(void) "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); + desc = irq_to_desc(tsk->irqaction->irq); + + /* + * Prevent a stale desc->threads_oneshot. Must be called + * before setting the IRQTF_DIED flag. + */ + irq_finalize_oneshot(desc, tsk->irqaction, true); + /* * Set the THREAD DIED flag to prevent further wakeups of the * soon to be gone threaded handler. @@ -767,7 +798,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) { struct irqaction *old, **old_ptr; const char *old_name = NULL; - unsigned long flags; + unsigned long flags, thread_mask = 0; int ret, nested, shared = 0; cpumask_var_t mask; @@ -865,12 +896,23 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) /* add new interrupt at end of irq queue */ do { + thread_mask |= old->thread_mask; old_ptr = &old->next; old = *old_ptr; } while (old); shared = 1; } + /* + * Setup the thread mask for this irqaction. Unlikely to have + * 32 resp 64 irqs sharing one line, but who knows. + */ + if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) { + ret = -EBUSY; + goto out_mask; + } + new->thread_mask = 1 << ffz(thread_mask); + if (!shared) { irq_chip_set_defaults(desc->irq_data.chip); -- cgit v1.2.3 From 9d591edd02a245305b1b9379e4c5571bad4d2774 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Feb 2011 23:52:16 +0000 Subject: genirq: Allow shared oneshot interrupts Support ONESHOT on shared interrupts, if all drivers agree on it. 
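From a driver author's point of view the newly permitted combination looks roughly like the sketch below; the handler names, device name and dev cookie are placeholders, not taken from the patch:

	int ret;

	/*
	 * Every user of the line must set IRQF_SHARED, use the same trigger
	 * type and, after this change, also agree on IRQF_ONESHOT.  The line
	 * stays masked until all woken irq threads have finished, tracked via
	 * desc->threads_oneshot from the previous patch.
	 */
	ret = request_threaded_irq(irq, my_quick_check_handler, my_thread_handler,
				   IRQF_SHARED | IRQF_ONESHOT | IRQF_TRIGGER_LOW,
				   "example-dev", dev);
	if (ret)
		pr_err("example-dev: request_threaded_irq(%u) failed: %d\n", irq, ret);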
Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20110223234956.483640430@linutronix.de> --- kernel/irq/manage.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 2301de19ac7d..58c861367300 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -824,10 +824,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) rand_initialize_irq(irq); } - /* Oneshot interrupts are not allowed with shared */ - if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED)) - return -EINVAL; - /* * Check whether the interrupt nests into another interrupt * thread. @@ -881,10 +877,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * Can't share interrupts unless both agree to and are * the same type (level, edge, polarity). So both flag * fields must have IRQF_SHARED set and the bits which - * set the trigger type must match. + * set the trigger type must match. Also all must + * agree on ONESHOT. */ if (!((old->flags & new->flags) & IRQF_SHARED) || - ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) { + ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || + ((old->flags ^ new->flags) & IRQF_ONESHOT)) { old_name = old->name; goto mismatch; } -- cgit v1.2.3 From 8eb90c30e0e815a1308828352eabd03ca04229dd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Feb 2011 23:52:21 +0000 Subject: sched: Switch wait_task_inactive to schedule_hrtimeout() When we force thread hard and soft interrupts the startup of ksoftirqd would hang in kthread_bind() when wait_task_inactive() calls schedule_timeout_uninterruptible() because there is no softirq yet which will wake us up. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20110223234956.677109139@linutronix.de> --- kernel/sched.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 18d38e4ec7ba..66ca5d9ba83c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2224,7 +2224,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * yield - it could be a while. */ if (unlikely(on_rq)) { - schedule_timeout_uninterruptible(1); + ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&to, HRTIMER_MODE_REL); continue; } -- cgit v1.2.3 From 544b4a1f309d18f40969dbab7e08bafd136b2f55 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 25 Feb 2011 15:13:16 -0800 Subject: sched: Clean up the IRQ_TIME_ACCOUNTING code Fix this warning: lkml.org/lkml/2011/1/30/124 kernel/sched.c:3719: warning: 'irqtime_account_idle_ticks' defined but not used kernel/sched.c:3720: warning: 'irqtime_account_process_tick' defined but not used In a cleaner way than: 7e9498705e81: sched: Add #ifdef around irq time accounting functions This patch will not have any functional impact. 
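The clean-up amounts to the usual stub-function idiom, sketched here with abridged bodies: one nested ifdef region, and empty static stubs on the else branch so callers need no conditionals of their own.

#ifndef CONFIG_VIRT_CPU_ACCOUNTING

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
static void irqtime_account_idle_ticks(int ticks)
{
	/* real per-tick irq time accounting lives here */
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
static void irqtime_account_idle_ticks(int ticks) { }
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */

/* callers below can now invoke irqtime_account_idle_ticks() unconditionally */

#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */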
Signed-off-by: Venkatesh Pallipadi Cc: heiko.carstens@de.ibm.com Cc: a.p.zijlstra@chello.nl LKML-Reference: <1298675596-10992-1-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 64 ++++++++++++++++++++++++++++------------------------------ 1 file changed, 31 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 79bca166bed7..0c8712630f05 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3687,7 +3687,36 @@ void account_system_time(struct task_struct *p, int hardirq_offset, __account_system_time(p, cputime, cputime_scaled, target_cputime64); } +/* + * Account for involuntary wait time. + * @cputime: the cpu time spent in involuntary wait + */ +void account_steal_time(cputime_t cputime) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t cputime64 = cputime_to_cputime64(cputime); + + cpustat->steal = cputime64_add(cpustat->steal, cputime64); +} + +/* + * Account for idle time. + * @cputime: the cpu time spent in idle wait + */ +void account_idle_time(cputime_t cputime) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t cputime64 = cputime_to_cputime64(cputime); + struct rq *rq = this_rq(); + + if (atomic_read(&rq->nr_iowait) > 0) + cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); + else + cpustat->idle = cputime64_add(cpustat->idle, cputime64); +} + #ifndef CONFIG_VIRT_CPU_ACCOUNTING + #ifdef CONFIG_IRQ_TIME_ACCOUNTING /* * Account a tick to a process and cpustat @@ -3749,42 +3778,11 @@ static void irqtime_account_idle_ticks(int ticks) for (i = 0; i < ticks; i++) irqtime_account_process_tick(current, 0, rq); } -#else +#else /* CONFIG_IRQ_TIME_ACCOUNTING */ static void irqtime_account_idle_ticks(int ticks) {} static void irqtime_account_process_tick(struct task_struct *p, int user_tick, struct rq *rq) {} -#endif -#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ - -/* - * Account for involuntary wait time. - * @steal: the cpu time spent in involuntary wait - */ -void account_steal_time(cputime_t cputime) -{ - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t cputime64 = cputime_to_cputime64(cputime); - - cpustat->steal = cputime64_add(cpustat->steal, cputime64); -} - -/* - * Account for idle time. - * @cputime: the cpu time spent in idle wait - */ -void account_idle_time(cputime_t cputime) -{ - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t cputime64 = cputime_to_cputime64(cputime); - struct rq *rq = this_rq(); - - if (atomic_read(&rq->nr_iowait) > 0) - cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); - else - cpustat->idle = cputime64_add(cpustat->idle, cputime64); -} - -#ifndef CONFIG_VIRT_CPU_ACCOUNTING +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ /* * Account a single tick of cpu time. -- cgit v1.2.3 From 3a142a0672b48a853f00af61f184c7341ac9c99d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Feb 2011 22:34:23 +0100 Subject: clockevents: Prevent oneshot mode when broadcast device is periodic When the per cpu timer is marked CLOCK_EVT_FEAT_C3STOP, then we only can switch into oneshot mode, when the backup broadcast device supports oneshot mode as well. Otherwise we would try to switch the broadcast device into an unsupported mode unconditionally. This went unnoticed so far as the current available broadcast devices support oneshot mode. Seth unearthed this problem while debugging and working around an hpet related BIOS wreckage. Add the necessary check to tick_is_oneshot_available(). 
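The decision the patch arrives at boils down to three checks; this annotated restatement of tick_is_oneshot_available(), matching the hunk further down, spells out what the C3STOP feature flag implies:

	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);

	if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT))
		return 0;	/* no per-cpu device, or it cannot do oneshot */
	if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
		return 1;	/* keeps ticking in deep C-states, no backup needed */
	/*
	 * The per-cpu device stops in deep C-states, so oneshot mode is only
	 * safe if the broadcast device can take over in oneshot mode as well.
	 */
	return tick_broadcast_oneshot_available();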
Reported-and-tested-by: Seth Forshee Signed-off-by: Thomas Gleixner LKML-Reference: Cc: stable@kernel.org # .21 -> --- kernel/time/tick-broadcast.c | 10 ++++++++++ kernel/time/tick-common.c | 6 +++++- kernel/time/tick-internal.h | 3 +++ 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 48b2761b5668..a3b5aff62606 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -600,4 +600,14 @@ int tick_broadcast_oneshot_active(void) return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; } +/* + * Check whether the broadcast device supports oneshot. + */ +bool tick_broadcast_oneshot_available(void) +{ + struct clock_event_device *bc = tick_broadcast_device.evtdev; + + return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false; +} + #endif diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 051bc80a0c43..ed228ef6f6b8 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -51,7 +51,11 @@ int tick_is_oneshot_available(void) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); - return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT); + if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT)) + return 0; + if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) + return 1; + return tick_broadcast_oneshot_available(); } /* diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 290eefbc1f60..f65d3a723a64 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -36,6 +36,7 @@ extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); extern int tick_broadcast_oneshot_active(void); extern void tick_check_oneshot_broadcast(int cpu); +bool tick_broadcast_oneshot_available(void); # else /* BROADCAST */ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { @@ -46,6 +47,7 @@ static inline void tick_broadcast_switch_to_oneshot(void) { } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } static inline int tick_broadcast_oneshot_active(void) { return 0; } static inline void tick_check_oneshot_broadcast(int cpu) { } +static inline bool tick_broadcast_oneshot_available(void) { return true; } # endif /* !BROADCAST */ #else /* !ONESHOT */ @@ -76,6 +78,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) return 0; } static inline int tick_broadcast_oneshot_active(void) { return 0; } +static inline bool tick_broadcast_oneshot_available(void) { return false; } #endif /* !TICK_ONESHOT */ /* -- cgit v1.2.3 From 8d32a307e4faa8b123dc8a9cd56d1a7525f69ad3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Feb 2011 23:52:23 +0000 Subject: genirq: Provide forced interrupt threading Add a commandline parameter "threadirqs" which forces all interrupts except those marked IRQF_NO_THREAD to run threaded. That's mostly a debug option to allow retrieving better debug data from crashing interrupt handlers. If "threadirqs" is not enabled on the kernel command line, then there is no impact in the interrupt hotpath. Architecture code needs to select CONFIG_IRQ_FORCED_THREADING after marking the interrupts which cant be threaded IRQF_NO_THREAD. All interrupts which have IRQF_TIMER set are implict marked IRQF_NO_THREAD. Also all PER_CPU interrupts are excluded. Forced threading hard interrupts also forces all soft interrupt handling into thread context. 
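As a usage sketch (handler and device names are placeholders, not part of the patch), an interrupt that must never be threaded opts out explicitly; everything else is threaded automatically once the kernel is booted with threadirqs:

	/*
	 * IRQF_NO_THREAD keeps this handler in hard irq context even when
	 * "threadirqs" is on the command line; IRQF_TIMER lines and per-cpu
	 * interrupts are excluded implicitly.
	 */
	ret = request_irq(irq, my_timekeeping_handler, IRQF_NO_THREAD,
			  "my-timer", dev);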
When enabled it might slow down things a bit, but for debugging problems in interrupt code it's a reasonable penalty as it does not immediately crash and burn the machine when an interrupt handler is buggy. Some test results on a Core2Duo machine: Cache cold run of: # time git grep irq_desc non-threaded threaded real 1m18.741s 1m19.061s user 0m1.874s 0m1.757s sys 0m5.843s 0m5.427s # iperf -c server non-threaded [ 3] 0.0-10.0 sec 1.09 GBytes 933 Mbits/sec [ 3] 0.0-10.0 sec 1.09 GBytes 934 Mbits/sec [ 3] 0.0-10.0 sec 1.09 GBytes 933 Mbits/sec threaded [ 3] 0.0-10.0 sec 1.09 GBytes 939 Mbits/sec [ 3] 0.0-10.0 sec 1.09 GBytes 934 Mbits/sec [ 3] 0.0-10.0 sec 1.09 GBytes 937 Mbits/sec Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20110223234956.772668648@linutronix.de> --- kernel/irq/Kconfig | 3 +++ kernel/irq/internals.h | 2 ++ kernel/irq/manage.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++--- kernel/softirq.c | 16 ++++++++++-- 4 files changed, 82 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 355b8c7957f5..144db9dcfcde 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -38,6 +38,9 @@ config HARDIRQS_SW_RESEND config IRQ_PREFLOW_FASTEOI bool +config IRQ_FORCED_THREADING + bool + config SPARSE_IRQ bool "Support sparse irq numbering" depends on HAVE_SPARSE_IRQ diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 935bec4bfa87..6c6ec9a49027 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -27,12 +27,14 @@ extern int noirqdebug; * IRQTF_DIED - handler thread died * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed * IRQTF_AFFINITY - irq thread is requested to adjust affinity + * IRQTF_FORCED_THREAD - irq action is force threaded */ enum { IRQTF_RUNTHREAD, IRQTF_DIED, IRQTF_WARNED, IRQTF_AFFINITY, + IRQTF_FORCED_THREAD, }; /* diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 58c861367300..acd599a43bfb 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -17,6 +17,17 @@ #include "internals.h" +#ifdef CONFIG_IRQ_FORCED_THREADING +__read_mostly bool force_irqthreads; + +static int __init setup_forced_irqthreads(char *arg) +{ + force_irqthreads = true; + return 0; +} +early_param("threadirqs", setup_forced_irqthreads); +#endif + /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) * @irq: interrupt number to wait for @@ -701,6 +712,32 @@ static inline void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } #endif +/* + * Interrupts which are not explicitely requested as threaded + * interrupts rely on the implicit bh/preempt disable of the hard irq + * context. So we need to disable bh here to avoid deadlocks and other + * side effects. + */ +static void +irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) +{ + local_bh_disable(); + action->thread_fn(action->irq, action->dev_id); + irq_finalize_oneshot(desc, action, false); + local_bh_enable(); +} + +/* + * Interrupts explicitely requested as threaded interupts want to be + * preemtible - many of them need to sleep and wait for slow busses to + * complete. 
+ */ +static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action) +{ + action->thread_fn(action->irq, action->dev_id); + irq_finalize_oneshot(desc, action, false); +} + /* * Interrupt handler thread */ @@ -711,8 +748,15 @@ static int irq_thread(void *data) }; struct irqaction *action = data; struct irq_desc *desc = irq_to_desc(action->irq); + void (*handler_fn)(struct irq_desc *desc, struct irqaction *action); int wake; + if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD, + &action->thread_flags)) + handler_fn = irq_forced_thread_fn; + else + handler_fn = irq_thread_fn; + sched_setscheduler(current, SCHED_FIFO, ¶m); current->irqaction = action; @@ -736,10 +780,7 @@ static int irq_thread(void *data) raw_spin_unlock_irq(&desc->lock); } else { raw_spin_unlock_irq(&desc->lock); - - action->thread_fn(action->irq, action->dev_id); - - irq_finalize_oneshot(desc, action, false); + handler_fn(desc, action); } wake = atomic_dec_and_test(&desc->threads_active); @@ -789,6 +830,22 @@ void exit_irq_thread(void) set_bit(IRQTF_DIED, &tsk->irqaction->flags); } +static void irq_setup_forced_threading(struct irqaction *new) +{ + if (!force_irqthreads) + return; + if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)) + return; + + new->flags |= IRQF_ONESHOT; + + if (!new->thread_fn) { + set_bit(IRQTF_FORCED_THREAD, &new->thread_flags); + new->thread_fn = new->handler; + new->handler = irq_default_primary_handler; + } +} + /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. @@ -838,6 +895,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * dummy function which warns when called. */ new->handler = irq_nested_primary_handler; + } else { + irq_setup_forced_threading(new); } /* diff --git a/kernel/softirq.c b/kernel/softirq.c index c0490464e92f..a33fb2911248 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -311,9 +311,21 @@ void irq_enter(void) } #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED -# define invoke_softirq() __do_softirq() +static inline void invoke_softirq(void) +{ + if (!force_irqthreads) + __do_softirq(); + else + wakeup_softirqd(); +} #else -# define invoke_softirq() do_softirq() +static inline void invoke_softirq(void) +{ + if (!force_irqthreads) + do_softirq(); + else + wakeup_softirqd(); +} #endif /* -- cgit v1.2.3 From c69e3758ff56d03e161187355791ec992c574276 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Mar 2011 11:49:21 +0100 Subject: genirq: Fixup fasteoi handler for oneshot mode The fasteoi handler must mask the interrupt line in oneshot mode otherwise we end up with an irq storm. Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b5145654855f..c9c0601f0615 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -513,6 +513,10 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) mask_irq(desc); goto out; } + + if (desc->istate & IRQS_ONESHOT) + mask_irq(desc); + preflow_handler(desc); handle_irq_event(desc); -- cgit v1.2.3 From 5cd10e7946d28cfc42442fee2e6c757e244d756e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Mar 2011 16:58:30 +0100 Subject: hrtimer: Update base[CLOCK_BOOTTIME].offset correctly We calculate the current time of each clock base by adding an offset to clock_monotonic. 
The offset for the clock bases is set in retrigger_next_event() which is called when we switch a cpu to highres mode or when the clock was set. Add the missing update for clock boottime. Signed-off-by: Thomas Gleixner Cc: John Stultz --- kernel/hrtimer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 4c53af18734b..f679a2d08723 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -635,6 +635,8 @@ static void retrigger_next_event(void *arg) raw_spin_lock(&base->lock); base->clock_base[HRTIMER_BASE_REALTIME].offset = timespec_to_ktime(realtime_offset); + base->clock_base[HRTIMER_BASE_BOOTTIME].offset = + timespec_to_ktime(sleep); hrtimer_force_reprogram(base, 0); raw_spin_unlock(&base->lock); -- cgit v1.2.3 From 2d3a8497f8cc5aca14b722cd37d51f6c15ff9f74 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 3 Mar 2011 10:53:20 -0500 Subject: blktrace: Remove blk_fill_rwbs_rq. If we enable trace events to trace block actions, We use blk_fill_rwbs_rq to analyze the corresponding actions in request's cmd_flags, but we only choose the minor 2 bits from it, so most of other flags(e.g, REQ_SYNC) are missing. For example, with a sync write we get: write_test-2409 [001] 160.013869: block_rq_insert: 3,64 W 0 () 258135 + = 8 [write_test] Since now we have integrated the flags of both bio and request, it is safe to pass rq->cmd_flags directly to blk_fill_rwbs and blk_fill_rwbs_rq isn't needed any more. With this patch, after a sync write we get: write_test-2417 [000] 226.603878: block_rq_insert: 3,64 WS 0 () 258135 += 8 [write_test] Signed-off-by: Tao Ma Acked-by: Jeff Moyer Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index d95721f33702..cbafed7d4f38 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1827,21 +1827,5 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) rwbs[i] = '\0'; } -void blk_fill_rwbs_rq(char *rwbs, struct request *rq) -{ - int rw = rq->cmd_flags & 0x03; - int bytes; - - if (rq->cmd_flags & REQ_DISCARD) - rw |= REQ_DISCARD; - - if (rq->cmd_flags & REQ_SECURE) - rw |= REQ_SECURE; - - bytes = blk_rq_bytes(rq); - - blk_fill_rwbs(rwbs, rw, bytes); -} - #endif /* CONFIG_EVENT_TRACING */ -- cgit v1.2.3 From c53fa1ed92cd671a1dfb1e7569e9ab672612ddc6 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 3 Mar 2011 10:55:40 -0800 Subject: netlink: kill loginuid/sessionid/sid members from struct netlink_skb_parms Netlink message processing in the kernel is synchronous these days, the session information can be collected when needed. Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- kernel/audit.c | 6 +++--- kernel/auditfilter.c | 10 +++++++--- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 162e88e33bc9..939500317066 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -673,9 +673,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) pid = NETLINK_CREDS(skb)->pid; uid = NETLINK_CREDS(skb)->uid; - loginuid = NETLINK_CB(skb).loginuid; - sessionid = NETLINK_CB(skb).sessionid; - sid = NETLINK_CB(skb).sid; + loginuid = audit_get_loginuid(current); + sessionid = audit_get_sessionid(current); + security_task_getsecid(current, &sid); seq = nlh->nlmsg_seq; data = NLMSG_DATA(nlh); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index add2819af71b..f8277c80d678 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1238,6 +1238,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &rule->fields[i]; int result = 0; + u32 sid; switch (f->type) { case AUDIT_PID: @@ -1250,19 +1251,22 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, result = audit_comparator(cb->creds.gid, f->op, f->val); break; case AUDIT_LOGINUID: - result = audit_comparator(cb->loginuid, f->op, f->val); + result = audit_comparator(audit_get_loginuid(current), + f->op, f->val); break; case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: - if (f->lsm_rule) - result = security_audit_rule_match(cb->sid, + if (f->lsm_rule) { + security_task_getsecid(current, &sid); + result = security_audit_rule_match(sid, f->type, f->op, f->lsm_rule, NULL); + } break; } -- cgit v1.2.3 From 0c3b9168017cbad2c4af3dd65ec93fe646eeaa62 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Thu, 3 Mar 2011 17:04:35 +0530 Subject: sched: Fix sched rt group scheduling when hierachy is enabled The current sched rt code is broken when it comes to hierarchical scheduling, this patch fixes two problems 1. It adds redundant enqueuing (harmless) when it finds a queue has tasks enqueued, but it has no run time and it is not throttled. 2. The most important change is in sched_rt_rq_enqueue/dequeue. The code just picks the rt_rq belonging to the current cpu on which the period timer runs, the patch fixes it, so that the correct rt_se is enqueued/dequeued. Tested with a simple hierarchy /c/d, c and d assigned similar runtimes of 50,000 and a while 1 loop runs within "d". Both c and d get throttled, without the patch, the task just stops running and never runs (depends on where the sched_rt b/w timer runs). With the patch, the task is throttled and runs as expected. 
[ bharata, suggestions on how to pick the rt_se belong to the rt_rq and correct cpu ] Signed-off-by: Balbir Singh Acked-by: Bharata B Rao Signed-off-by: Peter Zijlstra Cc: stable@kernel.org LKML-Reference: <20110303113435.GA2868@balbir.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index ad6267714c84..01f75a5f17af 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -210,11 +210,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { - int this_cpu = smp_processor_id(); struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; struct sched_rt_entity *rt_se; - rt_se = rt_rq->tg->rt_se[this_cpu]; + int cpu = cpu_of(rq_of_rt_rq(rt_rq)); + + rt_se = rt_rq->tg->rt_se[cpu]; if (rt_rq->rt_nr_running) { if (rt_se && !on_rt_rq(rt_se)) @@ -226,10 +227,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) { - int this_cpu = smp_processor_id(); struct sched_rt_entity *rt_se; + int cpu = cpu_of(rq_of_rt_rq(rt_rq)); - rt_se = rt_rq->tg->rt_se[this_cpu]; + rt_se = rt_rq->tg->rt_se[cpu]; if (rt_se && on_rt_rq(rt_se)) dequeue_rt_entity(rt_se); @@ -565,8 +566,11 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0; raw_spin_unlock(&rt_rq->rt_runtime_lock); - } else if (rt_rq->rt_nr_running) + } else if (rt_rq->rt_nr_running) { idle = 0; + if (!rt_rq_throttled(rt_rq)) + enqueue = 1; + } if (enqueue) sched_rt_rq_enqueue(rt_rq); -- cgit v1.2.3 From a2f5c9ab79f78e8b91ac993e0543d65b661dd19b Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Tue, 22 Feb 2011 13:04:33 -0800 Subject: sched: Allow SCHED_BATCH to preempt SCHED_IDLE tasks Perform the test for SCHED_IDLE before testing for SCHED_BATCH (and ensure idle tasks don't preempt idle tasks) so the non-interactive, but still important, SCHED_BATCH tasks will run in favor of the very low priority SCHED_IDLE tasks. Signed-off-by: Darren Hart Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Cc: Richard Purdie LKML-Reference: <1298408674-3130-2-git-send-email-dvhart@linux.intel.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3a88dee165c0..1438e13cf8be 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1870,16 +1870,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (test_tsk_need_resched(curr)) return; + /* Idle tasks are by definition preempted by non-idle tasks. */ + if (unlikely(curr->policy == SCHED_IDLE) && + likely(p->policy != SCHED_IDLE)) + goto preempt; + /* - * Batch and idle tasks do not preempt (their preemption is driven by - * the tick): + * Batch and idle tasks do not preempt non-idle tasks (their preemption + * is driven by the tick): */ if (unlikely(p->policy != SCHED_NORMAL)) return; - /* Idle tasks are by definition preempted by everybody. 
*/ - if (unlikely(curr->policy == SCHED_IDLE)) - goto preempt; if (!sched_feat(WAKEUP_PREEMPT)) return; -- cgit v1.2.3 From c02aa73b1d18e43cfd79c2f193b225e84ca497c8 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Thu, 17 Feb 2011 15:37:07 -0800 Subject: sched: Allow users with sufficient RLIMIT_NICE to change from SCHED_IDLE policy The current scheduler implementation returns -EPERM when trying to change from SCHED_IDLE to SCHED_OTHER or SCHED_BATCH. Since SCHED_IDLE is considered to be a nice 20 on steroids, changing to another policy should be allowed provided the RLIMIT_NICE is accounted for. This patch allows the following test-case to pass with RLIMIT_NICE=40, but still fail with RLIMIT_NICE=10 when the calling process is run from a typical shell (nice 0, or 20 in rlimit terms). int main() { int ret; struct sched_param sp; sp.sched_priority = 0; /* switch to SCHED_IDLE */ ret = sched_setscheduler(0, SCHED_IDLE, &sp); printf("setscheduler IDLE: %d\n", ret); if (ret) return ret; /* switch back to SCHED_OTHER */ ret = sched_setscheduler(0, SCHED_OTHER, &sp); printf("setscheduler OTHER: %d\n", ret); return ret; } $ ulimit -e 40 $ ./test setscheduler IDLE: 0 setscheduler OTHER: 0 $ ulimit -e 10 $ ulimit -e 10 $ ./test setscheduler IDLE: 0 setscheduler OTHER: -1 Signed-off-by: Darren Hart Signed-off-by: Peter Zijlstra Cc: Richard Purdie LKML-Reference: <4D657BEE.4040608@linux.intel.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0c8712630f05..f3030709d826 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4981,12 +4981,15 @@ recheck: param->sched_priority > rlim_rtprio) return -EPERM; } + /* - * Like positive nice levels, dont allow tasks to - * move out of SCHED_IDLE either: + * Treat SCHED_IDLE as nice 20. Only allow a switch to + * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. */ - if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) - return -EPERM; + if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { + if (!can_nice(p, TASK_NICE(p))) + return -EPERM; + } /* can't change other user's priorities */ if (!check_same_owner(p)) -- cgit v1.2.3 From 6d1cafd8b56ea726c10a5a104de57cc3ed8fa953 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Tue, 1 Mar 2011 16:28:21 -0800 Subject: sched: Resched proper CPU on yield_to() yield_to_task_fair() has code to resched the CPU of yielding task when the intention is to resched the CPU of the task that is being yielded to. Change here fixes the problem and also makes the resched conditional on rq != p_rq. Signed-off-by: Venkatesh Pallipadi Reviewed-by: Rik van Riel Signed-off-by: Peter Zijlstra LKML-Reference: <1299025701-22168-1-git-send-email-venki@google.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 9 ++++++++- kernel/sched_fair.c | 4 ---- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f3030709d826..61452e86c73b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5522,8 +5522,15 @@ again: goto out; yielded = curr->sched_class->yield_to_task(rq, p, preempt); - if (yielded) + if (yielded) { schedstat_inc(rq, yld_count); + /* + * Make p's CPU reschedule; pick_next_entity takes care of + * fairness. 
+ */ + if (preempt && rq != p_rq) + resched_task(p_rq->curr); + } out: double_rq_unlock(rq, p_rq); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 1438e13cf8be..3f7ec9e27ee1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1987,10 +1987,6 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp /* Tell the scheduler that we'd really like pse to run next. */ set_next_buddy(se); - /* Make p's CPU reschedule; pick_next_entity takes care of fairness. */ - if (preempt) - resched_task(rq->curr); - yield_task_fair(rq); return true; -- cgit v1.2.3 From 940c5b2971de443df22eed0441bc74fb0116e9f5 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Sun, 27 Feb 2011 21:13:31 +0800 Subject: perf: Fix the missing event initialization when pmu is found in idr Currently, the event is not initialized if pmu is found in idr. This never causes bug just because now no pmu is associated with the idr id. Signed-off-by: Lin Ming Signed-off-by: Peter Zijlstra LKML-Reference: <1298812411.2699.9.camel@localhost> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 64a018e94fca..821ce8221974 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -6098,17 +6098,22 @@ struct pmu *perf_init_event(struct perf_event *event) { struct pmu *pmu = NULL; int idx; + int ret; idx = srcu_read_lock(&pmus_srcu); rcu_read_lock(); pmu = idr_find(&pmu_idr, event->attr.type); rcu_read_unlock(); - if (pmu) + if (pmu) { + ret = pmu->event_init(event); + if (ret) + pmu = ERR_PTR(ret); goto unlock; + } list_for_each_entry_rcu(pmu, &pmus, entry) { - int ret = pmu->event_init(event); + ret = pmu->event_init(event); if (!ret) goto unlock; -- cgit v1.2.3 From 3db272c0494900fcb905a201180a78cae3addd6e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 3 Mar 2011 14:25:37 +0800 Subject: perf cgroup: Fix leak of file reference count In perf_cgroup_connect(), fput_light() is missing in a failure path. Signed-off-by: Li Zefan Acked-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4D6F3461.6060406@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 821ce8221974..7c999e809ba3 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -404,8 +404,10 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, return -EBADF; css = cgroup_css_from_dir(file, perf_subsys_id); - if (IS_ERR(css)) - return PTR_ERR(css); + if (IS_ERR(css)) { + ret = PTR_ERR(css); + goto out; + } cgrp = container_of(css, struct perf_cgroup, css); event->cgrp = cgrp; @@ -422,6 +424,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, /* must be done before we fput() the file */ perf_get_cgroup(event); } +out: fput_light(file, fput_needed); return ret; } -- cgit v1.2.3 From f75e18cb9627b1d3d752b83a0b5563da0042c50a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 3 Mar 2011 14:25:50 +0800 Subject: perf cgroup: Fix unmatched call to perf_detach_cgroup() In the failure path, we call perf_detach_cgroup(), but we didn't call perf_get_cgroup() prio to it. 
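[ For illustration: the bug class here is an unbalanced reference count -- a release helper runs on a failure path even though the matching get only happened on the success path. A minimal, self-contained C sketch of the balanced pattern the fix establishes; the struct and helpers are made-up stand-ins, not the perf code: ]

#include <stdio.h>

struct obj {
        int refcount;
};

static void obj_get(struct obj *o) { o->refcount++; }
static void obj_put(struct obj *o) { o->refcount--; }

/* Take the reference before any check that may bail out, so every
 * later failure path can drop it without the count going negative. */
static int obj_attach(struct obj *o, int leader_mismatch)
{
        obj_get(o);

        if (leader_mismatch) {
                obj_put(o);     /* balanced: the get above already ran */
                return -1;      /* stands in for -EINVAL */
        }

        return 0;
}

int main(void)
{
        struct obj o = { .refcount = 0 };

        obj_attach(&o, 1);
        printf("refcount after failed attach: %d\n", o.refcount);  /* prints 0 */
        return 0;
}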
Signed-off-by: Li Zefan Acked-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4D6F346E.9070606@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 7c999e809ba3..b00209544d57 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -412,6 +412,9 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, cgrp = container_of(css, struct perf_cgroup, css); event->cgrp = cgrp; + /* must be done before we fput() the file */ + perf_get_cgroup(event); + /* * all events in a group must monitor * the same cgroup because a task belongs @@ -420,9 +423,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, if (group_leader && group_leader->cgrp != cgrp) { perf_detach_cgroup(event); ret = -EINVAL; - } else { - /* must be done before we fput() the file */ - perf_get_cgroup(event); } out: fput_light(file, fput_needed); -- cgit v1.2.3 From 1b15d0558e82df9b3659804ceb44187b98eda354 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 3 Mar 2011 14:26:06 +0800 Subject: perf cgroup: Clean up perf_cgroup_create() - Use kzalloc() to replace kmalloc() + memset(). - Remove redundant initialization, since alloc_percpu() returns zero-filled percpu memory. Signed-off-by: Li Zefan Acked-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4D6F347E.2010806@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index b00209544d57..193b1900e64f 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -7346,26 +7346,17 @@ static struct cgroup_subsys_state *perf_cgroup_create( struct cgroup_subsys *ss, struct cgroup *cont) { struct perf_cgroup *jc; - struct perf_cgroup_info *t; - int c; - jc = kmalloc(sizeof(*jc), GFP_KERNEL); + jc = kzalloc(sizeof(*jc), GFP_KERNEL); if (!jc) return ERR_PTR(-ENOMEM); - memset(jc, 0, sizeof(*jc)); - jc->info = alloc_percpu(struct perf_cgroup_info); if (!jc->info) { kfree(jc); return ERR_PTR(-ENOMEM); } - for_each_possible_cpu(c) { - t = per_cpu_ptr(jc->info, c); - t->time = 0; - t->timestamp = 0; - } return &jc->css; } -- cgit v1.2.3 From 08309379b7083a9ceec0f9bb96a629058fb623c4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Mar 2011 11:31:20 +0100 Subject: perf: Fix cgroup vs jump_label problem Li Zefan reported that the jump label code sleeps and we're calling it under a spinlock, *fail* ;-) Reported-by: Li Zefan Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 193b1900e64f..ed253aa24ba4 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -820,16 +820,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) list_add_tail(&event->group_entry, list); } - if (is_cgroup_event(event)) { + if (is_cgroup_event(event)) ctx->nr_cgroups++; - /* - * one more event: - * - that has cgroup constraint on event->cpu - * - that may need work on context switch - */ - atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_inc(&perf_sched_events); - } list_add_rcu(&event->event_entry, &ctx->event_list); if (!ctx->nr_events) @@ -957,11 +949,8 @@ 
list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; - if (is_cgroup_event(event)) { + if (is_cgroup_event(event)) ctx->nr_cgroups--; - atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_dec(&perf_sched_events); - } ctx->nr_events--; if (event->attr.inherit_stat) @@ -2903,6 +2892,10 @@ static void free_event(struct perf_event *event) atomic_dec(&nr_task_events); if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) put_callchain_buffers(); + if (is_cgroup_event(event)) { + atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); + jump_label_dec(&perf_sched_events); + } } if (event->buffer) { @@ -6478,6 +6471,13 @@ SYSCALL_DEFINE5(perf_event_open, err = perf_cgroup_connect(pid, event, &attr, group_leader); if (err) goto err_alloc; + /* + * one more event: + * - that has cgroup constraint on event->cpu + * - that may need work on context switch + */ + atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); + jump_label_inc(&perf_sched_events); } /* -- cgit v1.2.3 From ba74f4d7e5125d04d453b4af69c53c533e6feb80 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sun, 9 Jan 2011 18:09:51 -0800 Subject: rcu: call __rcu_read_unlock() in exit_rcu for tiny RCU Using __rcu_read_lock() in place of rcu_read_lock() leaves any debug state as it really should be, namely with the lock still held. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcutiny_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 015abaea962a..3cb8e362e883 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -852,7 +852,7 @@ void exit_rcu(void) if (t->rcu_read_lock_nesting == 0) return; t->rcu_read_lock_nesting = 1; - rcu_read_unlock(); + __rcu_read_unlock(); } #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ -- cgit v1.2.3 From 37743de384951ef97c040e69039820c987849501 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Mon, 10 Jan 2011 21:43:13 +0100 Subject: rcutorture: Get rid of duplicate sched.h include linux/sched.h is included twice in kernel/rcutorture.c - once is enough. Signed-off-by: Jesper Juhl Reviewed-by: Josh Triplett Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 89613f97ff26..c224da41890c 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -47,7 +47,6 @@ #include #include #include -#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney and " -- cgit v1.2.3 From fe8e64071a4ff5925ced4f33cb32ba090a04649c Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Sun, 30 Jan 2011 18:23:08 +0800 Subject: rcupdate: remove dead code DEBUG_OBJECTS_RCU_HEAD depends on PREEMPT, so #ifndef CONFIG_PREEMPT is totally useless in kernel/rcupdate.c. Signed-off-by: WANG Cong Cc: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcupdate.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a23a57a976d1..afd21d17c081 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -215,10 +215,6 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) * If we detect that we are nested in a RCU read-side critical * section, we should simply fail, otherwise we would deadlock. 
*/ -#ifndef CONFIG_PREEMPT - WARN_ON(1); - return 0; -#else if (rcu_preempt_depth() != 0 || preempt_count() != 0 || irqs_disabled()) { WARN_ON(1); @@ -229,7 +225,6 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) rcu_barrier_bh(); debug_object_free(head, &rcuhead_debug_descr); return 1; -#endif default: return 0; } -- cgit v1.2.3 From e611eecd6f9f91d74beda8cfbb3b5e02abdeb5a1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 30 Jan 2011 23:56:46 -0800 Subject: rcu: add comment saying why DEBUG_OBJECTS_RCU_HEAD depends on PREEMPT. The build will break if you change the Kconfig to allow DEBUG_OBJECTS_RCU_HEAD and !PREEMPT, so document the reasoning near where the breakage would occur. Signed-off-by: Paul E. McKenney --- kernel/rcupdate.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index afd21d17c081..f3240e987928 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -214,6 +214,11 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) * Ensure that queued callbacks are all executed. * If we detect that we are nested in a RCU read-side critical * section, we should simply fail, otherwise we would deadlock. + * Note that the machinery to reliably determine whether + * or not we are in an RCU read-side critical section + * exists only in the preemptible RCU implementations + * (TINY_PREEMPT_RCU and TREE_PREEMPT_RCU), which is why + * DEBUG_OBJECTS_RCU_HEAD is disallowed if !PREEMPT. */ if (rcu_preempt_depth() != 0 || preempt_count() != 0 || irqs_disabled()) { -- cgit v1.2.3 From e3e89cc535223433a619d0969db3fa05cdd946b8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 4 Mar 2011 09:23:30 -0800 Subject: Mark ptrace_{traceme,attach,detach} static They are only used inside kernel/ptrace.c, and have been for a long time. We don't want to go back to the bad-old-days when architectures did things on their own, so make them static and private. Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1708b1e2972d..e2302e40b360 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -163,7 +163,7 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) return !err; } -int ptrace_attach(struct task_struct *task) +static int ptrace_attach(struct task_struct *task) { int retval; @@ -219,7 +219,7 @@ out: * Performs checks and sets PT_PTRACED. * Should be used by all ptrace implementations for PTRACE_TRACEME. */ -int ptrace_traceme(void) +static int ptrace_traceme(void) { int ret = -EPERM; @@ -293,7 +293,7 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) return false; } -int ptrace_detach(struct task_struct *child, unsigned int data) +static int ptrace_detach(struct task_struct *child, unsigned int data) { bool dead = false; -- cgit v1.2.3 From b75f38d659e6fc747eda64cb72f3920e29dd44a4 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 4 Mar 2011 17:36:21 -0800 Subject: cpuset: add a missing unlock in cpuset_write_resmask() Don't forget to release cgroup_mutex if alloc_trial_cpuset() fails. 
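[ For illustration, a minimal self-contained sketch of the single-exit idiom the fix applies, with a pthread mutex standing in for cgroup_mutex; the names are made up, not the cpuset code: ]

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;

static int write_resmask(void)
{
        char *trial;
        int retval = 0;

        pthread_mutex_lock(&big_lock);

        trial = malloc(64);     /* stands in for alloc_trial_cpuset() */
        if (!trial) {
                retval = -1;    /* -ENOMEM in the patch */
                goto out;       /* do NOT return here: the lock is still held */
        }

        /* ... update the mask under the lock ... */

        free(trial);
out:
        pthread_mutex_unlock(&big_lock);        /* single exit, lock always dropped */
        return retval;
}

int main(void)
{
        printf("write_resmask() -> %d\n", write_resmask());
        return 0;
}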
[akpm@linux-foundation.org: avoid multiple return points] Signed-off-by: Li Zefan Cc: Paul Menage Acked-by: David Rientjes Cc: Miao Xie Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4349935c2ad8..e92e98189032 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1575,8 +1575,10 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, return -ENODEV; trialcs = alloc_trial_cpuset(cs); - if (!trialcs) - return -ENOMEM; + if (!trialcs) { + retval = -ENOMEM; + goto out; + } switch (cft->private) { case FILE_CPULIST: @@ -1591,6 +1593,7 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, } free_trial_cpuset(trialcs); +out: cgroup_unlock(); return retval; } -- cgit v1.2.3 From 4ba8216cd90560bc402f52076f64d8546e8aefcb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 25 Jan 2011 22:52:22 +0100 Subject: BKL: That's all, folks This removes the implementation of the big kernel lock, at last. A lot of people have worked on this in the past, I so the credit for this patch should be with everyone who participated in the hunt. The names on the Cc list are the people that were the most active in this, according to the recorded git history, in alphabetical order. Signed-off-by: Arnd Bergmann Acked-by: Alan Cox Cc: Alessio Igor Bogani Cc: Al Viro Cc: Andrew Hendry Cc: Andrew Morton Cc: Christoph Hellwig Cc: Eric W. Biederman Cc: Frederic Weisbecker Cc: Hans Verkuil Acked-by: Ingo Molnar Cc: Jan Blunck Cc: John Kacur Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Oliver Neukum Cc: Paul Menage Acked-by: Thomas Gleixner Cc: Trond Myklebust --- kernel/sched.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 18d38e4ec7ba..827c170c6017 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include @@ -3945,9 +3944,6 @@ need_resched: rcu_note_context_switch(cpu); prev = rq->curr; - release_kernel_lock(prev); -need_resched_nonpreemptible: - schedule_debug(prev); if (sched_feat(HRTICK)) @@ -4010,9 +4006,6 @@ need_resched_nonpreemptible: post_schedule(rq); - if (unlikely(reacquire_kernel_lock(prev))) - goto need_resched_nonpreemptible; - preempt_enable_no_resched(); if (need_resched()) goto need_resched; @@ -8074,7 +8067,7 @@ static inline int preempt_count_equals(int preempt_offset) { int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); - return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); + return (nested == preempt_offset); } void __might_sleep(const char *file, int line, int preempt_offset) -- cgit v1.2.3 From dfef6dcd35cb4a251f6322ca9b2c06f0bb1aa1f4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 8 Mar 2011 01:25:28 -0500 Subject: unfuck proc_sysctl ->d_compare() a) struct inode is not going to be freed under ->d_compare(); however, the thing PROC_I(inode)->sysctl points to just might. Fortunately, it's enough to make freeing that sucker delayed, provided that we don't step on its ->unregistering, clear the pointer to it in PROC_I(inode) before dropping the reference and check if it's NULL in ->d_compare(). 
b) I'm not sure that we *can* walk into NULL inode here (we recheck dentry->seq between verifying that it's still hashed / fetching dentry->d_inode and passing it to ->d_compare() and there's no negative hashed dentries in /proc/sys/*), but if we can walk into that, we really should not have ->d_compare() return 0 on it! Said that, I really suspect that this check can be simply killed. Nick? Signed-off-by: Al Viro --- kernel/sysctl.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0f1bd83db985..4eed0af5d144 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -194,9 +194,9 @@ static int sysrq_sysctl_handler(ctl_table *table, int write, static struct ctl_table root_table[]; static struct ctl_table_root sysctl_table_root; static struct ctl_table_header root_table_header = { - .count = 1, + {{.count = 1, .ctl_table = root_table, - .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list), + .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}}, .root = &sysctl_table_root, .set = &sysctl_table_root.default_set, }; @@ -1567,11 +1567,16 @@ void sysctl_head_get(struct ctl_table_header *head) spin_unlock(&sysctl_lock); } +static void free_head(struct rcu_head *rcu) +{ + kfree(container_of(rcu, struct ctl_table_header, rcu)); +} + void sysctl_head_put(struct ctl_table_header *head) { spin_lock(&sysctl_lock); if (!--head->count) - kfree(head); + call_rcu(&head->rcu, free_head); spin_unlock(&sysctl_lock); } @@ -1948,10 +1953,10 @@ void unregister_sysctl_table(struct ctl_table_header * header) start_unregistering(header); if (!--header->parent->count) { WARN_ON(1); - kfree(header->parent); + call_rcu(&header->parent->rcu, free_head); } if (!--header->count) - kfree(header); + call_rcu(&header->rcu, free_head); spin_unlock(&sysctl_lock); } -- cgit v1.2.3 From 997772884036e6e121de39322179989154437d9f Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Mon, 7 Mar 2011 09:58:33 +0100 Subject: debugobjects: Add hint for better object identification In complex subsystems like mac80211 structures can contain several timers and work structs, so identifying a specific instance from the call trace and object type output of debugobjects can be hard. Allow the subsystems which support debugobjects to provide a hint function. This function returns a pointer to a kernel address (preferrably the objects callback function) which is printed along with the debugobjects type. Add hint methods for timer_list, work_struct and hrtimer. 
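[ For illustration, a self-contained sketch of the hint idea: the descriptor maps an object address to that object's own callback, which identifies the instance far better than the bare type name. The structures below are simplified stand-ins, not the real debugobjects API: ]

#include <stdio.h>

struct my_timer {
        void (*function)(void *data);
        void *data;
};

struct debug_descr {
        const char *name;
        void *(*debug_hint)(void *addr);        /* address printed with the report */
};

static void *my_timer_hint(void *addr)
{
        return (void *)((struct my_timer *)addr)->function;
}

static const struct debug_descr my_timer_descr = {
        .name           = "my_timer",
        .debug_hint     = my_timer_hint,
};

static void debug_report(const struct debug_descr *d, void *addr)
{
        void *hint = d->debug_hint ? d->debug_hint(addr) : NULL;

        printf("object type: %s, hint: %p\n", d->name, hint);
}

static void tick(void *data) { (void)data; }

int main(void)
{
        struct my_timer t = { .function = tick, .data = NULL };

        debug_report(&my_timer_descr, &t);
        return 0;
}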
[ tglx: Massaged changelog, made it compile ] Signed-off-by: Stanislaw Gruszka LKML-Reference: <20110307085809.GA9334@redhat.com> Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 6 ++++++ kernel/timer.c | 6 ++++++ kernel/workqueue.c | 6 ++++++ 3 files changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 0c8d7c048615..e38f5a073d01 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -334,6 +334,11 @@ EXPORT_SYMBOL_GPL(ktime_add_safe); static struct debug_obj_descr hrtimer_debug_descr; +static void *hrtimer_debug_hint(void *addr) +{ + return ((struct hrtimer *) addr)->function; +} + /* * fixup_init is called when: * - an active object is initialized @@ -393,6 +398,7 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state) static struct debug_obj_descr hrtimer_debug_descr = { .name = "hrtimer", + .debug_hint = hrtimer_debug_hint, .fixup_init = hrtimer_fixup_init, .fixup_activate = hrtimer_fixup_activate, .fixup_free = hrtimer_fixup_free, diff --git a/kernel/timer.c b/kernel/timer.c index d6459923d245..33a67925d900 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -404,6 +404,11 @@ static void timer_stats_account_timer(struct timer_list *timer) {} static struct debug_obj_descr timer_debug_descr; +static void *timer_debug_hint(void *addr) +{ + return ((struct timer_list *) addr)->function; +} + /* * fixup_init is called when: * - an active object is initialized @@ -477,6 +482,7 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state) static struct debug_obj_descr timer_debug_descr = { .name = "timer_list", + .debug_hint = timer_debug_hint, .fixup_init = timer_fixup_init, .fixup_activate = timer_fixup_activate, .fixup_free = timer_fixup_free, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ee6578b578ad..b5fe4c00eb3c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -316,6 +316,11 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, static struct debug_obj_descr work_debug_descr; +static void *work_debug_hint(void *addr) +{ + return ((struct work_struct *) addr)->func; +} + /* * fixup_init is called when: * - an active object is initialized @@ -387,6 +392,7 @@ static int work_fixup_free(void *addr, enum debug_obj_state state) static struct debug_obj_descr work_debug_descr = { .name = "work_struct", + .debug_hint = work_debug_hint, .fixup_init = work_fixup_init, .fixup_activate = work_fixup_activate, .fixup_free = work_fixup_free, -- cgit v1.2.3 From c68fd4f3ca90de7d18c567e70b2c164078aefadf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 8 Mar 2011 19:52:55 +0100 Subject: genirq: Add comments to Kconfig switches Signed-off-by: Thomas Gleixner Cc: Sam Ravnborg --- kernel/irq/Kconfig | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 144db9dcfcde..09bef82d74cb 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -1,3 +1,4 @@ +# Select this to activate the generic irq options below config HAVE_GENERIC_HARDIRQS bool @@ -17,27 +18,36 @@ config GENERIC_HARDIRQS_NO_COMPAT bool # Options selectable by the architecture code + +# Make sparse irq Kconfig switch below available config HAVE_SPARSE_IRQ bool +# Enable the generic irq autoprobe mechanism config GENERIC_IRQ_PROBE bool +# Use the generic /proc/interrupts implementation config GENERIC_IRQ_SHOW bool +# Support for delayed migration from interrupt context config GENERIC_PENDING_IRQ bool +# Alpha specific irq affinity mechanism 
config AUTO_IRQ_AFFINITY bool +# Tasklet based software resend for pending interrupts on enable_irq() config HARDIRQS_SW_RESEND bool +# Preflow handler support for fasteoi (sparc64) config IRQ_PREFLOW_FASTEOI bool +# Support forced irq threading config IRQ_FORCED_THREADING bool -- cgit v1.2.3 From 750912fa366312e9c5bc83eab352898a26750401 Mon Sep 17 00:00:00 2001 From: David Sharp Date: Wed, 8 Dec 2010 13:46:47 -0800 Subject: tracing: Add an 'overwrite' trace_option. Add an "overwrite" trace_option for ftrace to control whether the buffer should be overwritten on overflow or not. The default remains to overwrite old events when the buffer is full. This patch adds the option to instead discard newest events when the buffer is full. This is useful to get a snapshot of traces just after enabling traces. Dropping the current event is also a simpler code path. Signed-off-by: David Sharp LKML-Reference: <1291844807-15481-1-git-send-email-dhsharp@google.com> Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 11 +++++++++++ kernel/trace/trace.c | 17 +++++++++++------ kernel/trace/trace.h | 1 + 3 files changed, 23 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bd1c35a4fbcc..269db80a961e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1429,6 +1429,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) } EXPORT_SYMBOL_GPL(ring_buffer_resize); +void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val) +{ + mutex_lock(&buffer->mutex); + if (val) + buffer->flags |= RB_FL_OVERWRITE; + else + buffer->flags &= ~RB_FL_OVERWRITE; + mutex_unlock(&buffer->mutex); +} +EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); + static inline void * __rb_data_page_index(struct buffer_data_page *bpage, unsigned index) { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8dc8da6733f9..85e3ee1e474e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -41,8 +41,6 @@ #include "trace.h" #include "trace_output.h" -#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) - /* * On boot up, the ring buffer is set to the minimum size, so that * we do not waste memory on systems that are not using tracing. @@ -340,7 +338,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | - TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; + TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; static int trace_stop_count; static DEFINE_SPINLOCK(tracing_start_lock); @@ -425,6 +423,7 @@ static const char *trace_options[] = { "sleep-time", "graph-time", "record-cmd", + "overwrite", NULL }; @@ -2529,6 +2528,9 @@ static void set_tracer_flags(unsigned int mask, int enabled) if (mask == TRACE_ITER_RECORD_CMD) trace_event_enable_cmd_record(enabled); + + if (mask == TRACE_ITER_OVERWRITE) + ring_buffer_change_overwrite(global_trace.buffer, enabled); } static ssize_t @@ -4555,9 +4557,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) __init static int tracer_alloc_buffers(void) { int ring_buf_size; + enum ring_buffer_flags rb_flags; int i; int ret = -ENOMEM; + if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) goto out; @@ -4570,12 +4574,13 @@ __init static int tracer_alloc_buffers(void) else ring_buf_size = 1; + rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? 
RB_FL_OVERWRITE : 0; + cpumask_copy(tracing_buffer_mask, cpu_possible_mask); cpumask_copy(tracing_cpumask, cpu_all_mask); /* TODO: make the number of buffers hot pluggable with CPUS */ - global_trace.buffer = ring_buffer_alloc(ring_buf_size, - TRACE_BUFFER_FLAGS); + global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags); if (!global_trace.buffer) { printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); WARN_ON(1); @@ -4585,7 +4590,7 @@ __init static int tracer_alloc_buffers(void) #ifdef CONFIG_TRACER_MAX_TRACE - max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS); + max_tr.buffer = ring_buffer_alloc(1, rb_flags); if (!max_tr.buffer) { printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); WARN_ON(1); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 856e73cc1d3f..951d0b7e7062 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -606,6 +606,7 @@ enum trace_iterator_flags { TRACE_ITER_SLEEP_TIME = 0x40000, TRACE_ITER_GRAPH_TIME = 0x80000, TRACE_ITER_RECORD_CMD = 0x100000, + TRACE_ITER_OVERWRITE = 0x200000, }; /* -- cgit v1.2.3 From de29be5e712dc8b7eef2bef9417af3bb6a88e47a Mon Sep 17 00:00:00 2001 From: David Sharp Date: Fri, 3 Dec 2010 16:13:16 -0800 Subject: ring-buffer: Remove unused #include Signed-off-by: David Sharp LKML-Reference: <1291421609-14665-3-git-send-email-dhsharp@google.com> Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 269db80a961e..3237d961d905 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5,7 +5,6 @@ */ #include #include -#include #include #include #include -- cgit v1.2.3 From 73c101011926c5832e6e141682180c4debe2cf45 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 8 Mar 2011 13:19:51 +0100 Subject: block: initial patch for on-stack per-task plugging This patch adds support for creating a queuing context outside of the queue itself. This enables us to batch up pieces of IO before grabbing the block device queue lock and submitting them to the IO scheduler. The context is created on the stack of the process and assigned in the task structure, so that we can auto-unplug it if we hit a schedule event. The current queue plugging happens implicitly if IO is submitted to an empty device, yet callers have to remember to unplug that IO when they are going to wait for it. This is an ugly API and has caused bugs in the past. Additionally, it requires hacks in the vm (->sync_page() callback) to handle that logic. By switching to an explicit plugging scheme we make the API a lot nicer and can get rid of the ->sync_page() hack in the vm. Signed-off-by: Jens Axboe --- kernel/exit.c | 1 + kernel/fork.c | 3 +++ kernel/sched.c | 12 ++++++++++++ 3 files changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f9a45ebcc7b1..6a488ad2dce5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -908,6 +908,7 @@ NORET_TYPE void do_exit(long code) profile_task_exit(tsk); WARN_ON(atomic_read(&tsk->fs_excl)); + WARN_ON(blk_needs_flush_plug(tsk)); if (unlikely(in_interrupt())) panic("Aiee, killing interrupt handler!"); diff --git a/kernel/fork.c b/kernel/fork.c index 25e429152ddc..027c80e5162f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1204,6 +1204,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, * Clear TID on mm_release()? */ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? 
child_tidptr: NULL; +#ifdef CONFIG_BLOCK + p->plug = NULL; +#endif #ifdef CONFIG_FUTEX p->robust_list = NULL; #ifdef CONFIG_COMPAT diff --git a/kernel/sched.c b/kernel/sched.c index 18d38e4ec7ba..ca098bf4cc65 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3978,6 +3978,16 @@ need_resched_nonpreemptible: switch_count = &prev->nvcsw; } + /* + * If we are going to sleep and we have plugged IO queued, make + * sure to submit it to avoid deadlocks. + */ + if (prev->state != TASK_RUNNING && blk_needs_flush_plug(prev)) { + raw_spin_unlock(&rq->lock); + blk_flush_plug(prev); + raw_spin_lock(&rq->lock); + } + pre_schedule(rq, prev); if (unlikely(!rq->nr_running)) @@ -5333,6 +5343,7 @@ void __sched io_schedule(void) delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); + blk_flush_plug(current); current->in_iowait = 1; schedule(); current->in_iowait = 0; @@ -5348,6 +5359,7 @@ long __sched io_schedule_timeout(long timeout) delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); + blk_flush_plug(current); current->in_iowait = 1; ret = schedule_timeout(timeout); current->in_iowait = 0; -- cgit v1.2.3 From 721a9602e6607417c6bc15b18e97a2f35266c690 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 9 Mar 2011 11:56:30 +0100 Subject: block: kill off REQ_UNPLUG With the plugging now being explicitly controlled by the submitter, callers need not pass down unplugging hints to the block layer. If they want to unplug, it's because they manually plugged on their own - in which case, they should just unplug at will. Signed-off-by: Jens Axboe --- kernel/power/block_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c index 83bbc7c02df9..d09dd10c5a5e 100644 --- a/kernel/power/block_io.c +++ b/kernel/power/block_io.c @@ -28,7 +28,7 @@ static int submit(int rw, struct block_device *bdev, sector_t sector, struct page *page, struct bio **bio_chain) { - const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG; + const int bio_rw = rw | REQ_SYNC; struct bio *bio; bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); -- cgit v1.2.3 From e6e1e2593592a8f6f6380496655d8c6f67431266 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Mar 2011 10:41:56 -0500 Subject: tracing: Remove lock_depth from event entry The lock_depth field in the event headers was added as a temporary data point for help in removing the BKL. Now that the BKL is pretty much been removed, we can remove this field. This in turn changes the header from 12 bytes to 8 bytes, removing the 4 byte buffer that gcc would insert if the first field in the data load was 8 bytes in size. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 8 +++----- kernel/trace/trace_events.c | 1 - kernel/trace/trace_output.c | 10 ++-------- 3 files changed, 5 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 85e3ee1e474e..fd6e1b906b3c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1101,7 +1101,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; - entry->lock_depth = (tsk) ? tsk->lock_depth : 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? 
TRACE_FLAG_IRQS_OFF : 0) | @@ -1748,10 +1747,9 @@ static void print_lat_help_header(struct seq_file *m) seq_puts(m, "# | / _----=> need-resched \n"); seq_puts(m, "# || / _---=> hardirq/softirq \n"); seq_puts(m, "# ||| / _--=> preempt-depth \n"); - seq_puts(m, "# |||| /_--=> lock-depth \n"); - seq_puts(m, "# |||||/ delay \n"); - seq_puts(m, "# cmd pid |||||| time | caller \n"); - seq_puts(m, "# \\ / |||||| \\ | / \n"); + seq_puts(m, "# |||| / delay \n"); + seq_puts(m, "# cmd pid ||||| time | caller \n"); + seq_puts(m, "# \\ / ||||| \\ | / \n"); } static void print_func_help_header(struct seq_file *m) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5f499e0438a4..e1d579b19834 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -116,7 +116,6 @@ static int trace_define_common_fields(void) __common_field(unsigned char, flags); __common_field(unsigned char, preempt_count); __common_field(int, pid); - __common_field(int, lock_depth); return ret; } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 02272baa2206..151f32e5f2c6 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -529,7 +529,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) * @entry: The trace entry field from the ring buffer * * Prints the generic fields of irqs off, in hard or softirq, preempt - * count and lock depth. + * count. */ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { @@ -554,13 +554,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) else ret = trace_seq_putc(s, '.'); - if (!ret) - return 0; - - if (entry->lock_depth < 0) - return trace_seq_putc(s, '.'); - - return trace_seq_printf(s, "%d", entry->lock_depth); + return ret; } static int -- cgit v1.2.3 From 140e4f2d1cd816aed196705c036763313c0e4bd3 Mon Sep 17 00:00:00 2001 From: David Sharp Date: Fri, 3 Dec 2010 16:13:19 -0800 Subject: tracing: Fix event alignment: ftrace:context_switch and ftrace:wakeup Signed-off-by: David Sharp LKML-Reference: <1291421609-14665-6-git-send-email-dhsharp@google.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_entries.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 6cf223764be8..1516cb3ec549 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -109,12 +109,12 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, */ #define FTRACE_CTX_FIELDS \ __field( unsigned int, prev_pid ) \ + __field( unsigned int, next_pid ) \ + __field( unsigned int, next_cpu ) \ __field( unsigned char, prev_prio ) \ __field( unsigned char, prev_state ) \ - __field( unsigned int, next_pid ) \ __field( unsigned char, next_prio ) \ - __field( unsigned char, next_state ) \ - __field( unsigned int, next_cpu ) + __field( unsigned char, next_state ) FTRACE_ENTRY(context_switch, ctx_switch_entry, -- cgit v1.2.3 From 10da37a645b5e915d8572cc2b1f5eb11ada3ea4f Mon Sep 17 00:00:00 2001 From: David Sharp Date: Fri, 3 Dec 2010 16:13:26 -0800 Subject: tracing: Adjust conditional expression latency formatting. Formatting change only to improve code readability. No code changes except to introduce intermediate variables. 
Signed-off-by: David Sharp LKML-Reference: <1291421609-14665-13-git-send-email-dhsharp@google.com> [ Keep variable declarations and assignment separate ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 151f32e5f2c6..456be9063c2d 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -533,20 +533,30 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) */ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { - int hardirq, softirq; + char hardsoft_irq; + char need_resched; + char irqs_off; + int hardirq; + int softirq; int ret; hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + irqs_off = + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : + '.'; + need_resched = + (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'; + hardsoft_irq = + (hardirq && softirq) ? 'H' : + hardirq ? 'h' : + softirq ? 's' : + '.'; + if (!trace_seq_printf(s, "%c%c%c", - (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : - (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? - 'X' : '.', - (entry->flags & TRACE_FLAG_NEED_RESCHED) ? - 'N' : '.', - (hardirq && softirq) ? 'H' : - hardirq ? 'h' : softirq ? 's' : '.')) + irqs_off, need_resched, hardsoft_irq)) return 0; if (entry->preempt_count) -- cgit v1.2.3 From 31274d72f01604f4b02d933b4f3cac84d2c201fd Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 18 Feb 2011 15:52:19 +0100 Subject: tracing: Explain about unstable clock on resume with ring buffer warning The "Delta way too big" warning might appear on a system with a unstable shed clock right after the system is resumed and tracing was enabled at time of suspend. Since it's not realy a bug, and the unstable sched clock is working fast and reliable otherwise, Steven suggested to keep using the sched clock in any case and just to make note in the warning itself. v2 changes: - added #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK Signed-off-by: Jiri Olsa LKML-Reference: <20110218145219.GD2604@jolsa.brq.redhat.com> Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3237d961d905..db7b439d23ee 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2172,11 +2172,19 @@ rb_reserve_next_event(struct ring_buffer *buffer, if (likely(ts >= cpu_buffer->write_stamp)) { delta = diff; if (unlikely(test_time_stamp(delta))) { + int local_clock_stable = 1; +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK + local_clock_stable = sched_clock_stable; +#endif WARN_ONCE(delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", (unsigned long long)delta, (unsigned long long)ts, - (unsigned long long)cpu_buffer->write_stamp); + (unsigned long long)cpu_buffer->write_stamp, + local_clock_stable ? 
"" : + "If you just came from a suspend/resume,\n" + "please switch to the trace global clock:\n" + " echo global > /sys/kernel/debug/tracing/trace_clock\n"); add_timestamp = 1; } } -- cgit v1.2.3 From 56355b83e2a24ce7e1870c8479205e2cdd332225 Mon Sep 17 00:00:00 2001 From: Yuanhan Liu Date: Mon, 8 Nov 2010 14:05:12 +0800 Subject: tracing: Export trace_set_clr_event() Trace events belonging to a module only exists when the module is loaded. Well, we can use trace_set_clr_event funtion to enable some trace event at the module init routine, so that we will not miss something while loading then module. So, Export the trace_set_clr_event function so that module can use it. Signed-off-by: Yuanhan Liu LKML-Reference: <1289196312-25323-1-git-send-email-yuanhan.liu@linux.intel.com> Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e1d579b19834..e88f74fe1d4c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -325,6 +325,7 @@ int trace_set_clr_event(const char *system, const char *event, int set) { return __ftrace_set_clr_event(NULL, system, event, set); } +EXPORT_SYMBOL_GPL(trace_set_clr_event); /* 128 should be much more than enough */ #define EVENT_BUF_SIZE 127 -- cgit v1.2.3 From 9a24470b2826e4665b1484836c7ae6aba1ddea32 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Mar 2011 14:53:38 -0500 Subject: tracing: Align 4 byte ints together in struct tracer Move elements in struct tracer for better alignment. Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 951d0b7e7062..5e9dfc6286dd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -272,8 +272,8 @@ struct tracer { /* If you handled the flag setting, return 0 */ int (*set_flag)(u32 old_flags, u32 bit, int set); struct tracer *next; - int print_max; struct tracer_flags *flags; + int print_max; int use_max_tr; }; -- cgit v1.2.3 From 4a0b1665db09cf2da9ad7d0f12da386373c10bfa Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Mar 2011 20:09:26 -0500 Subject: tracing: Fix irqoff selftest expanding max buffer If the kernel command line declares a tracer "ftrace=sometracer" and that tracer is either not defined or is enabled after irqsoff, then the irqs off selftest will fail with the following error: Testing tracer irqsoff: ------------[ cut here ]------------ WARNING: at /home/rostedt/work/autotest/nobackup/linux-test.git/kernel/trace/tra ce.c:713 update_max_tr_single+0xfa/0x11b() Hardware name: Modules linked in: Pid: 1, comm: swapper Not tainted 2.6.38-rc8-test #1 Call Trace: [] ? warn_slowpath_common+0x65/0x7a [] ? update_max_tr_single+0xfa/0x11b [] ? warn_slowpath_null+0xf/0x13 [] ? update_max_tr_single+0xfa/0x11b [] ? stop_critical_timing+0x154/0x204 [] ? trace_selftest_startup_irqsoff+0x5b/0xc1 [] ? trace_selftest_startup_irqsoff+0x5b/0xc1 [] ? trace_selftest_startup_irqsoff+0x5b/0xc1 [] ? time_hardirqs_on+0x25/0x28 [] ? trace_hardirqs_on_caller+0x18/0x12f [] ? trace_hardirqs_on+0xb/0xd [] ? trace_selftest_startup_irqsoff+0x5b/0xc1 [] ? register_tracer+0xf8/0x1a3 [] ? init_irqsoff_tracer+0xd/0x11 [] ? do_one_initcall+0x71/0x121 [] ? init_irqsoff_tracer+0x0/0x11 [] ? kernel_init+0x13a/0x1b6 [] ? kernel_init+0x0/0x1b6 [] ? 
kernel_thread_helper+0x6/0x10 ---[ end trace e93713a9d40cd06c ]--- .. no entries found ..FAILED! What happens is the "ftrace=..." will expand the ring buffer to its default size (from its minimum size) but it will not expand the max ring buffer (the ring buffer to store maximum latencies). When the irqsoff test runs, it will call the ring buffer swap routine that checks if the max ring buffer is the same size as the normal ring buffer, and will fail if it is not. This causes the test to fail. The solution is to expand the max ring buffer before running the self test if the max ring buffer is used by that tracer and the normal ring buffer is expanded. The max ring buffer should be shrunk again after the test is done to save space. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fd6e1b906b3c..9541c27c1cf2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -779,6 +779,11 @@ __acquires(kernel_lock) tracing_reset_online_cpus(tr); current_trace = type; + + /* If we expanded the buffers, make sure the max is expanded too */ + if (ring_buffer_expanded && type->use_max_tr) + ring_buffer_resize(max_tr.buffer, trace_buf_size); + /* the test is responsible for initializing and enabling */ pr_info("Testing tracer %s: ", type->name); ret = type->selftest(type, tr); @@ -791,6 +796,10 @@ __acquires(kernel_lock) /* Only reset on passing, to avoid touching corrupted buffers */ tracing_reset_online_cpus(tr); + /* Shrink the max buffer again */ + if (ring_buffer_expanded && type->use_max_tr) + ring_buffer_resize(max_tr.buffer, 1); + printk(KERN_CONT "PASSED\n"); } #endif -- cgit v1.2.3 From a9e7acfff0a279792918b7b0de74106e576e9988 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Mar 2011 19:12:24 +0100 Subject: hrtimer: Remove empty hrtimer_init_hres_timer() Leftover from earlier implementation. All empty, remove it. Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index f679a2d08723..63fb74972b75 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -680,14 +680,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) base->hres_active = 0; } -/* - * Initialize the high resolution related parts of a hrtimer - */ -static inline void hrtimer_init_timer_hres(struct hrtimer *timer) -{ -} - - /* * When High resolution timers are active, try to reprogram. Note, that in case * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry @@ -759,7 +751,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, return 0; } static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } -static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } #endif /* CONFIG_HIGH_RES_TIMERS */ @@ -1141,7 +1132,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, base = hrtimer_clockid_to_base(clock_id); timer->base = &cpu_base->clock_base[base]; - hrtimer_init_timer_hres(timer); timerqueue_init(&timer->node); #ifdef CONFIG_TIMER_STATS -- cgit v1.2.3 From 8fe8f545c6d753ead15e1f4919d39e8f9bb49629 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Sun, 6 Mar 2011 18:07:50 -0800 Subject: futex: Update futex_wait_setup comments about locking Reviving a cleanup I had done about a year ago as part of a larger futex_set_wait proposal. 
Over the years, the locking of the hashed futex queue got improved, so that some of the "rare but normal" race conditions described in comments can't actually happen anymore. Signed-off-by: Michel Lespinasse Cc: Linus Torvalds Cc: Darren Hart Cc: Peter Zijlstra LKML-Reference: <20110307020750.GA31188@google.com> Signed-off-by: Thomas Gleixner --- kernel/futex.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index b766d28accd6..3184d3b9cadf 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1781,13 +1781,14 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, * * The basic logical guarantee of a futex is that it blocks ONLY * if cond(var) is known to be true at the time of blocking, for - * any cond. If we queued after testing *uaddr, that would open - * a race condition where we could block indefinitely with + * any cond. If we locked the hash-bucket after testing *uaddr, that + * would open a race condition where we could block indefinitely with * cond(var) false, which would violate the guarantee. * - * A consequence is that futex_wait() can return zero and absorb - * a wakeup when *uaddr != val on entry to the syscall. This is - * rare, but normal. + * On the other hand, we insert q and release the hash-bucket only + * after testing *uaddr. This guarantees that futex_wait() will NOT + * absorb a wakeup if *uaddr does not match the desired values + * while the syscall executes. */ retry: ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); -- cgit v1.2.3 From bf294b41cefcb22fc3139e0f42c5b3f06728bd5e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 21 Feb 2011 11:05:41 -0800 Subject: SUNRPC: Close a race in __rpc_wait_for_completion_task() Although they run as rpciod background tasks, under normal operation (i.e. no SIGKILL), functions like nfs_sillyrename(), nfs4_proc_unlck() and nfs4_do_close() want to be fully synchronous. This means that when we exit, we want all references to the rpc_task to be gone, and we want any dentry references etc. held by that task to be released. For this reason these functions call __rpc_wait_for_completion_task(), followed by rpc_put_task() in the expectation that the latter will be releasing the last reference to the rpc_task, and thus ensuring that the callback_ops->rpc_release() has been called synchronously. This patch fixes a race which exists due to the fact that rpciod calls rpc_complete_task() (in order to wake up the callers of __rpc_wait_for_completion_task()) and then subsequently calls rpc_put_task() without ensuring that these two steps are done atomically. In order to avoid adding new spin locks, the patch uses the existing waitqueue spin lock to order the rpc_task reference count releases between the waiting process and rpciod. The common case where nobody is waiting for completion is optimised for by checking if the RPC_TASK_ASYNC flag is cleared and/or if the rpc_task reference count is 1: in those cases we drop trying to grab the spin lock, and immediately free up the rpc_task. Those few processes that need to put the rpc_task from inside an asynchronous context and that do not care about ordering are given a new helper: rpc_put_task_async(). 
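[ For illustration, a minimal pthread sketch of the ordering idea: the completing side publishes completion and drops its reference under the same lock the waiter takes, so once the waiter observes completion its own put is guaranteed to be the final one and the release runs synchronously from its point of view. This is a made-up model, not the rpc_task code: ]

#include <pthread.h>
#include <stdio.h>

struct task {
        pthread_mutex_t lock;
        pthread_cond_t cond;
        int done;
        int refcount;
};

static struct task the_task = {
        .lock           = PTHREAD_MUTEX_INITIALIZER,
        .cond           = PTHREAD_COND_INITIALIZER,
        .done           = 0,
        .refcount       = 2,    /* one reference for the waiter, one for the completer */
};

/* Must be called with t->lock held. */
static void task_put_locked(struct task *t)
{
        if (--t->refcount == 0)
                printf("last reference dropped, release callback runs here\n");
}

static void *completer(void *arg)
{
        struct task *t = arg;

        pthread_mutex_lock(&t->lock);
        t->done = 1;
        task_put_locked(t);             /* ordered against the waiter's put below */
        pthread_cond_signal(&t->cond);
        pthread_mutex_unlock(&t->lock);
        return NULL;
}

int main(void)
{
        pthread_t thr;

        pthread_create(&thr, NULL, completer, &the_task);

        pthread_mutex_lock(&the_task.lock);
        while (!the_task.done)
                pthread_cond_wait(&the_task.cond, &the_task.lock);
        task_put_locked(&the_task);     /* guaranteed to be the final put */
        pthread_mutex_unlock(&the_task.lock);

        pthread_join(thr, NULL);
        return 0;
}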
Signed-off-by: Trond Myklebust --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 18d38e4ec7ba..42eab5a8437d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4213,6 +4213,7 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) { __wake_up_common(q, mode, 1, 0, key); } +EXPORT_SYMBOL_GPL(__wake_up_locked_key); /** * __wake_up_sync_key - wake up threads blocked on a waitqueue. -- cgit v1.2.3 From c0c9ed15042ceac7c485813012a0a97316101b57 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 11 Mar 2011 11:51:22 +0100 Subject: futex: Avoid redudant evaluation of task_pid_vnr() The result is not going to change under us, so no need to reevaluate this over and over. Seems to be a leftover from the mechanical mass conversion of task->pid to task_pid_vnr(tsk). Signed-off-by: Thomas Gleixner --- kernel/futex.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 3184d3b9cadf..773815465bac 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -674,7 +674,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, struct task_struct *task, int set_waiters) { int lock_taken, ret, ownerdied = 0; - u32 uval, newval, curval; + u32 uval, newval, curval, vpid = task_pid_vnr(task); retry: ret = lock_taken = 0; @@ -684,7 +684,7 @@ retry: * (by doing a 0 -> TID atomic cmpxchg), while holding all * the locks. It will most likely not succeed. */ - newval = task_pid_vnr(task); + newval = vpid; if (set_waiters) newval |= FUTEX_WAITERS; @@ -696,7 +696,7 @@ retry: /* * Detect deadlocks. */ - if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task)))) + if ((unlikely((curval & FUTEX_TID_MASK) == vpid))) return -EDEADLK; /* @@ -723,7 +723,7 @@ retry: */ if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { /* Keep the OWNER_DIED bit */ - newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task); + newval = (curval & ~FUTEX_TID_MASK) | vpid; ownerdied = 0; lock_taken = 1; } @@ -2047,9 +2047,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) { struct futex_hash_bucket *hb; struct futex_q *this, *next; - u32 uval; struct plist_head *head; union futex_key key = FUTEX_KEY_INIT; + u32 uval, vpid = task_pid_vnr(current); int ret; retry: @@ -2058,7 +2058,7 @@ retry: /* * We release only a lock we actually own: */ - if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) + if ((uval & FUTEX_TID_MASK) != vpid) return -EPERM; ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); @@ -2074,7 +2074,7 @@ retry: * anyone else up: */ if (!(uval & FUTEX_OWNER_DIED)) - uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0); + uval = cmpxchg_futex_value_locked(uaddr, vpid, 0); if (unlikely(uval == -EFAULT)) @@ -2083,7 +2083,7 @@ retry: * Rare case: we managed to release the lock atomically, * no need to wake anyone else up: */ - if (unlikely(uval == task_pid_vnr(current))) + if (unlikely(uval == vpid)) goto out_unlock; /* -- cgit v1.2.3 From 37a9d912b24f96a0591773e6e6c3642991ae5a70 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 10 Mar 2011 18:48:51 -0800 Subject: futex: Sanitize cmpxchg_futex_value_locked API The cmpxchg_futex_value_locked API was funny in that it returned either the original, user-exposed futex value OR an error code such as -EFAULT. 
This was confusing at best, and could be a source of livelocks in places that retry the cmpxchg_futex_value_locked after trying to fix the issue by running fault_in_user_writeable(). This change makes the cmpxchg_futex_value_locked API more similar to the get_futex_value_locked one, returning an error code and updating the original value through a reference argument. Signed-off-by: Michel Lespinasse Acked-by: Chris Metcalf [tile] Acked-by: Tony Luck [ia64] Acked-by: Thomas Gleixner Tested-by: Michal Simek [microblaze] Acked-by: David Howells [frv] Cc: Darren Hart Cc: Peter Zijlstra Cc: Matt Turner Cc: Russell King Cc: Ralf Baechle Cc: "James E.J. Bottomley" Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Paul Mundt Cc: "David S. Miller" Cc: Linus Torvalds LKML-Reference: <20110311024851.GC26122@google.com> Signed-off-by: Thomas Gleixner --- kernel/futex.c | 45 +++++++++++++++------------------------------ 1 file changed, 15 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 773815465bac..237f14bfc022 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -381,15 +381,16 @@ static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, return NULL; } -static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) +static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr, + u32 uval, u32 newval) { - u32 curval; + int ret; pagefault_disable(); - curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); pagefault_enable(); - return curval; + return ret; } static int get_futex_value_locked(u32 *dest, u32 __user *from) @@ -688,9 +689,7 @@ retry: if (set_waiters) newval |= FUTEX_WAITERS; - curval = cmpxchg_futex_value_locked(uaddr, 0, newval); - - if (unlikely(curval == -EFAULT)) + if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval))) return -EFAULT; /* @@ -728,9 +727,7 @@ retry: lock_taken = 1; } - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - - if (unlikely(curval == -EFAULT)) + if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) return -EFAULT; if (unlikely(curval != uval)) goto retry; @@ -843,9 +840,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) newval = FUTEX_WAITERS | task_pid_vnr(new_owner); - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - - if (curval == -EFAULT) + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) ret = -EFAULT; else if (curval != uval) ret = -EINVAL; @@ -880,10 +875,8 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval) * There is no waiter, so we unlock the futex. The owner died * bit has not to be preserved here. We are the owner: */ - oldval = cmpxchg_futex_value_locked(uaddr, uval, 0); - - if (oldval == -EFAULT) - return oldval; + if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0)) + return -EFAULT; if (oldval != uval) return -EAGAIN; @@ -1578,9 +1571,7 @@ retry: while (1) { newval = (uval & FUTEX_OWNER_DIED) | newtid; - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - - if (curval == -EFAULT) + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) goto handle_fault; if (curval == uval) break; @@ -2073,11 +2064,8 @@ retry: * again. 
If it succeeds then we can return without waking * anyone else up: */ - if (!(uval & FUTEX_OWNER_DIED)) - uval = cmpxchg_futex_value_locked(uaddr, vpid, 0); - - - if (unlikely(uval == -EFAULT)) + if (!(uval & FUTEX_OWNER_DIED) && + cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) goto pi_faulted; /* * Rare case: we managed to release the lock atomically, @@ -2464,9 +2452,7 @@ retry: * userspace. */ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; - nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); - - if (nval == -EFAULT) + if (futex_atomic_cmpxchg_inatomic(&nval, uaddr, uval, mval)) return -1; if (nval != uval) @@ -2679,8 +2665,7 @@ static int __init futex_init(void) * implementation, the non-functional ones will return * -ENOSYS. */ - curval = cmpxchg_futex_value_locked(NULL, 0, 0); - if (curval == -EFAULT) + if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) futex_cmpxchg_enabled = 1; for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { -- cgit v1.2.3 From 805f6b5e1cbfedfb9b3d354013e7f4b13a79270f Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Fri, 11 Mar 2011 20:11:59 +0100 Subject: blktrace: Use rq->cmd_flags directly in blk_add_trace_rq. In blk_add_trace_rq, we only chose the minor 2 bits from request's cmd_flags and did some check for discard. so most of other flags(e.g, REQ_SYNC) are missing. For example, with a sync write after blkparse we get: 8,16 1 1 0.001776503 7509 A WS 1349632 + 1024 <- (8,17) 1347584 8,16 1 2 0.001776813 7509 Q WS 1349632 + 1024 [dd] 8,16 1 3 0.001780395 7509 G WS 1349632 + 1024 [dd] 8,16 1 5 0.001783186 7509 I W 1349632 + 1024 [dd] 8,16 1 11 0.001816987 7509 D W 1349632 + 1024 [dd] 8,16 0 2 0.006218192 0 C W 1349632 + 1024 [0] Since now we have integrated the flags of both bio and request, it is safe to pass rq->cmd_flags directly to __blk_add_trace. 
With this patch, after a sync write we get: 8,16 1 1 0.001776900 5425 A WS 1189888 + 1024 <- (8,17) 1187840 8,16 1 2 0.001777179 5425 Q WS 1189888 + 1024 [dd] 8,16 1 3 0.001780797 5425 G WS 1189888 + 1024 [dd] 8,16 1 5 0.001783402 5425 I WS 1189888 + 1024 [dd] 8,16 1 11 0.001817468 5425 D WS 1189888 + 1024 [dd] 8,16 0 2 0.005640709 0 C WS 1189888 + 1024 [0] Signed-off-by: Tao Ma Acked-by: Jeff Moyer Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index cbafed7d4f38..7aa40f8e182d 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -703,28 +703,21 @@ void blk_trace_shutdown(struct request_queue *q) * **/ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, - u32 what) + u32 what) { struct blk_trace *bt = q->blk_trace; - int rw = rq->cmd_flags & 0x03; if (likely(!bt)) return; - if (rq->cmd_flags & REQ_DISCARD) - rw |= REQ_DISCARD; - - if (rq->cmd_flags & REQ_SECURE) - rw |= REQ_SECURE; - if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { what |= BLK_TC_ACT(BLK_TC_PC); - __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, + __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags, what, rq->errors, rq->cmd_len, rq->cmd); } else { what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw, - what, rq->errors, 0, NULL); + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), + rq->cmd_flags, what, rq->errors, 0, NULL); } } -- cgit v1.2.3 From 2e12978a9f7a7abd54e8eb9ce70a7718767b8b2c Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 22 Dec 2010 14:18:50 +0800 Subject: futex,plist: Pass the real head of the priority list to plist_del() Some plist_del()s in kernel/futex.c are passed a faked head of the priority list. It does not fail because the current code does not require the real head in plist_del(). The current code of plist_del() just uses the head for checking, so it will not cause a bad result even when we use a faked head. But it is undocumented usage: /** * plist_del - Remove a @node from plist. * * @node: &struct plist_node pointer - entry to be removed * @head: &struct plist_head pointer - list head */ The document says that the @head is the "list head" head of the priority list. In futex code, several places use "plist_del(&q->list, &q->list.plist);", they pass a fake head. We need to fix them all. Thanks to Darren Hart for many suggestions. Acked-by: Darren Hart Signed-off-by: Lai Jiangshan LKML-Reference: <4D11984A.5030203@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/futex.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index b766d28accd6..6feeea4f8f15 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -775,6 +775,24 @@ retry: return ret; } +/** + * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket + * @q: The futex_q to unqueue + * + * The q->lock_ptr must not be NULL and must be held by the caller. + */ +static void __unqueue_futex(struct futex_q *q) +{ + struct futex_hash_bucket *hb; + + if (WARN_ON(!q->lock_ptr || !spin_is_locked(q->lock_ptr) + || plist_node_empty(&q->list))) + return; + + hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); + plist_del(&q->list, &hb->chain); +} + /* * The hash bucket lock must be held when this is called. * Afterwards, the futex_q must not be accessed. 
@@ -792,7 +810,7 @@ static void wake_futex(struct futex_q *q) */ get_task_struct(p); - plist_del(&q->list, &q->list.plist); + __unqueue_futex(q); /* * The waiting task can free the futex_q as soon as * q->lock_ptr = NULL is written, without taking any locks. A @@ -1100,8 +1118,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, get_futex_key_refs(key); q->key = *key; - WARN_ON(plist_node_empty(&q->list)); - plist_del(&q->list, &q->list.plist); + __unqueue_futex(q); WARN_ON(!q->rt_waiter); q->rt_waiter = NULL; @@ -1504,8 +1521,7 @@ retry: spin_unlock(lock_ptr); goto retry; } - WARN_ON(plist_node_empty(&q->list)); - plist_del(&q->list, &q->list.plist); + __unqueue_futex(q); BUG_ON(q->pi_state); @@ -1525,8 +1541,7 @@ retry: static void unqueue_me_pi(struct futex_q *q) __releases(q->lock_ptr) { - WARN_ON(plist_node_empty(&q->list)); - plist_del(&q->list, &q->list.plist); + __unqueue_futex(q); BUG_ON(!q->pi_state); free_pi_state(q->pi_state); @@ -2167,7 +2182,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * We were woken prior to requeue by a timeout or a signal. * Unqueue the futex_q and determine which it was. */ - plist_del(&q->list, &q->list.plist); + plist_del(&q->list, &hb->chain); /* Handle spurious wakeups gracefully */ ret = -EWOULDBLOCK; -- cgit v1.2.3 From 017f2b239dabb2740b91df162e004371b861f371 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 21 Dec 2010 17:55:10 +0800 Subject: futex,plist: Remove debug lock assignment from plist_node The original code uses &plist_node->plist as the fake head of the priority list for plist_del(), these debug locks in the fake head are needed for CONFIG_DEBUG_PI_LIST. But now we always pass the real head to plist_del(), the debug locks in plist_node will not be used, so we remove these assignments. Acked-by: Darren Hart Signed-off-by: Lai Jiangshan LKML-Reference: <4D10797E.7040803@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/futex.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 6feeea4f8f15..9fe913141ec9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1089,9 +1089,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, plist_del(&q->list, &hb1->chain); plist_add(&q->list, &hb2->chain); q->lock_ptr = &hb2->lock; -#ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.spinlock = &hb2->lock; -#endif } get_futex_key_refs(key2); q->key = *key2; @@ -1124,9 +1121,6 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, q->rt_waiter = NULL; q->lock_ptr = &hb->lock; -#ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.spinlock = &hb->lock; -#endif wake_up_state(q->task, TASK_NORMAL); } @@ -1474,9 +1468,6 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) prio = min(current->normal_prio, MAX_RT_PRIO); plist_node_init(&q->list, prio); -#ifdef CONFIG_DEBUG_PI_LIST - q->list.plist.spinlock = &hb->lock; -#endif plist_add(&q->list, &hb->chain); q->task = current; spin_unlock(&hb->lock); -- cgit v1.2.3 From d209a699a0b975ad47f399d70ddc3791f1b84496 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 11 Mar 2011 21:22:14 +0100 Subject: genirq: Add chip flag to force mask on suspend On suspend we disable all interrupts in the core code, but this does not mask the interrupt line in the default implementation as we use a lazy disable approach. That means we mark the interrupt disabled, but leave the hardware unmasked. 
That's an optimization because we avoid the hardware access for the common case where no interrupt happens after we marked it disabled. If an interrupt happens, then the interrupt flow handler masks the line at the hardware level and marks it pending. Suspend makes use of this delayed disable as it "disables" all interrupts when preparing the suspend transition. Right before the system goes into hardware suspend state it checks whether one of the interrupts which is marked as a wakeup interrupt came in after disabling it. Most interrupt chips have a separate register which selects the interrupts which can wake up the system from suspend, so we don't have to mask any on the non wakeup interrupts. But now we have to deal with brilliant designed hardware which lacks such a wakeup configuration facility. For such hardware it's necessary to mask all non wakeup interrupts before going into suspend in order to avoid the wakeup from random interrupts. Rather than working around this in the affected interrupt chip implementations we can solve this elegant in the core code itself. Add a flag IRQCHIP_MASK_ON_SUSPEND which can be set by the irq chip implementation to indicate, that the interrupts which are not selected as wakeup sources must be masked in the suspend path. Mask them in the loop which checks the wakeup interrupts pending flag. Signed-off-by: Thomas Gleixner Reviewed-by: Abhijeet Dharmapurikar LKML-Reference: --- kernel/irq/pm.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 1329f0eff49e..f76fc00c9877 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -68,10 +68,24 @@ int check_wakeup_irqs(void) struct irq_desc *desc; int irq; - for_each_irq_desc(irq, desc) - if (irqd_is_wakeup_set(&desc->irq_data) && - (desc->istate & IRQS_PENDING)) - return -EBUSY; + for_each_irq_desc(irq, desc) { + if (irqd_is_wakeup_set(&desc->irq_data)) { + if (desc->istate & IRQS_PENDING) + return -EBUSY; + continue; + } + /* + * Check the non wakeup interrupts whether they need + * to be masked before finally going into suspend + * state. That's for hardware which has no wakeup + * source configuration facility. The chip + * implementation indicates that with + * IRQCHIP_MASK_ON_SUSPEND. + */ + if (desc->istate & IRQS_SUSPENDED && + irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND) + mask_irq(desc); + } return 0; } -- cgit v1.2.3 From 6e6823d17b157f185be09f4c70181299f9273f0b Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Thu, 3 Mar 2011 18:26:14 +0100 Subject: posix-clocks: Check write permissions in posix syscalls pc_clock_settime() and pc_clock_adjtime() do not check whether the fd was opened in write mode, so a clock can be set with a read only fd. 
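(A minimal userspace probe of the new behaviour might look like the sketch below; the device path and the fd-to-clockid encoding are illustrative assumptions, not part of the patch.)

	#include <fcntl.h>
	#include <stdio.h>
	#include <time.h>
	#include <unistd.h>

	/* Assumed to mirror the kernel's dynamic clock id encoding (CLOCKFD == 3). */
	#define CLOCKFD			3
	#define FD_TO_CLOCKID(fd)	((~(clockid_t)(fd) << 3) | CLOCKFD)

	int main(void)
	{
		/* A dynamic posix clock chardev, opened read-only on purpose. */
		int fd = open("/dev/ptp0", O_RDONLY);
		/* Never applied: the call is expected to fail before touching the clock. */
		struct timespec ts = { .tv_sec = 0, .tv_nsec = 0 };

		if (fd < 0)
			return 1;
		if (clock_settime(FD_TO_CLOCKID(fd), &ts))
			perror("clock_settime");	/* expected: EACCES with this patch */
		close(fd);
		return 0;
	}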
[ tglx: We deliberately do not return -EPERM as we want this to be distingushable from the capability based permission check ] Signed-off-by: Torben Hohn LKML-Reference: <1299173174-348-4-git-send-email-torbenh@gmx.de> Cc: Richard Cochran Cc: John Stultz Cc: Thomas Gleixner --- kernel/time/posix-clock.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 04498cbf6002..25028dd4fa18 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -287,11 +287,16 @@ static int pc_clock_adjtime(clockid_t id, struct timex *tx) if (err) return err; + if ((cd.fp->f_mode & FMODE_WRITE) == 0) { + err = -EACCES; + goto out; + } + if (cd.clk->ops.clock_adjtime) err = cd.clk->ops.clock_adjtime(cd.clk, tx); else err = -EOPNOTSUPP; - +out: put_clock_desc(&cd); return err; @@ -344,11 +349,16 @@ static int pc_clock_settime(clockid_t id, const struct timespec *ts) if (err) return err; + if ((cd.fp->f_mode & FMODE_WRITE) == 0) { + err = -EACCES; + goto out; + } + if (cd.clk->ops.clock_settime) err = cd.clk->ops.clock_settime(cd.clk, ts); else err = -EOPNOTSUPP; - +out: put_clock_desc(&cd); return err; -- cgit v1.2.3 From 15a9155fe3e8215c02b80df51ec2cac7c0d726ad Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 16 Feb 2011 15:08:54 -0500 Subject: fix race in audit_get_nd() don't rely on pathname resolution ending up twice at the same point... Signed-off-by: Al Viro --- kernel/audit_watch.c | 85 ++++++++++++++++++++-------------------------------- 1 file changed, 32 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index d2e3c7866460..20b9fe6907d0 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -144,9 +144,9 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) } /* Initialize a parent watch entry. */ -static struct audit_parent *audit_init_parent(struct nameidata *ndp) +static struct audit_parent *audit_init_parent(struct path *path) { - struct inode *inode = ndp->path.dentry->d_inode; + struct inode *inode = path->dentry->d_inode; struct audit_parent *parent; int ret; @@ -353,53 +353,40 @@ static void audit_remove_parent_watches(struct audit_parent *parent) } /* Get path information necessary for adding watches. 
*/ -static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw) +static int audit_get_nd(struct audit_watch *watch, struct path *parent) { - struct nameidata *ndparent, *ndwatch; + struct nameidata nd; + struct dentry *d; int err; - ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); - if (unlikely(!ndparent)) - return -ENOMEM; + err = path_lookup(watch->path, LOOKUP_PARENT, &nd); + if (err) + return err; - ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); - if (unlikely(!ndwatch)) { - kfree(ndparent); - return -ENOMEM; + if (nd.last_type != LAST_NORM) { + path_put(&nd.path); + return -EINVAL; } - err = path_lookup(path, LOOKUP_PARENT, ndparent); - if (err) { - kfree(ndparent); - kfree(ndwatch); - return err; + mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len); + if (IS_ERR(d)) { + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + path_put(&nd.path); + return PTR_ERR(d); } - - err = path_lookup(path, 0, ndwatch); - if (err) { - kfree(ndwatch); - ndwatch = NULL; + if (d->d_inode) { + /* update watch filter fields */ + watch->dev = d->d_inode->i_sb->s_dev; + watch->ino = d->d_inode->i_ino; } + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - *ndp = ndparent; - *ndw = ndwatch; - + *parent = nd.path; + dput(d); return 0; } -/* Release resources used for watch path information. */ -static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) -{ - if (ndp) { - path_put(&ndp->path); - kfree(ndp); - } - if (ndw) { - path_put(&ndw->path); - kfree(ndw); - } -} - /* Associate the given rule with an existing parent. * Caller must hold audit_filter_mutex. */ static void audit_add_to_parent(struct audit_krule *krule, @@ -440,31 +427,24 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) { struct audit_watch *watch = krule->watch; struct audit_parent *parent; - struct nameidata *ndp = NULL, *ndw = NULL; + struct path parent_path; int h, ret = 0; mutex_unlock(&audit_filter_mutex); /* Avoid calling path_lookup under audit_filter_mutex. 
*/ - ret = audit_get_nd(watch->path, &ndp, &ndw); - if (ret) { - /* caller expects mutex locked */ - mutex_lock(&audit_filter_mutex); - goto error; - } + ret = audit_get_nd(watch, &parent_path); + /* caller expects mutex locked */ mutex_lock(&audit_filter_mutex); - /* update watch filter fields */ - if (ndw) { - watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; - watch->ino = ndw->path.dentry->d_inode->i_ino; - } + if (ret) + return ret; /* either find an old parent or attach a new one */ - parent = audit_find_parent(ndp->path.dentry->d_inode); + parent = audit_find_parent(parent_path.dentry->d_inode); if (!parent) { - parent = audit_init_parent(ndp); + parent = audit_init_parent(&parent_path); if (IS_ERR(parent)) { ret = PTR_ERR(parent); goto error; @@ -479,9 +459,8 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) h = audit_hash_ino((u32)watch->ino); *list = &audit_inode_hash[h]; error: - audit_put_nd(ndp, ndw); /* NULL args OK */ + path_put(&parent_path); return ret; - } void audit_remove_watch_rule(struct audit_krule *krule) -- cgit v1.2.3 From c9c6cac0c2bdbda42e7b804838648d0bc60ddb13 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 16 Feb 2011 15:15:47 -0500 Subject: kill path_lookup() all remaining callers pass LOOKUP_PARENT to it, so flags argument can die; renamed to kern_path_parent() Signed-off-by: Al Viro --- kernel/audit_watch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 20b9fe6907d0..e683869365d9 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -359,7 +359,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) struct dentry *d; int err; - err = path_lookup(watch->path, LOOKUP_PARENT, &nd); + err = kern_path_parent(watch->path, &nd); if (err) return err; -- cgit v1.2.3 From 73d049a40fc6269189c4e2ba6792cb5dd054883c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 11 Mar 2011 12:08:24 -0500 Subject: open-style analog of vfs_path_lookup() new function: file_open_root(dentry, mnt, name, flags) opens the file vfs_path_lookup would arrive to. Note that name can be empty; in that case the usual requirement that dentry should be a directory is lifted. open-coded equivalents switched to it, may_open() got down exactly one caller and became static. Signed-off-by: Al Viro --- kernel/sysctl_binary.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index b875bedf7c9a..3b8e028b9601 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1321,13 +1321,11 @@ static ssize_t binary_sysctl(const int *name, int nlen, void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) { const struct bin_table *table = NULL; - struct nameidata nd; struct vfsmount *mnt; struct file *file; ssize_t result; char *pathname; int flags; - int acc_mode; pathname = sysctl_getname(name, nlen, &table); result = PTR_ERR(pathname); @@ -1337,28 +1335,17 @@ static ssize_t binary_sysctl(const int *name, int nlen, /* How should the sysctl be accessed? 
*/ if (oldval && oldlen && newval && newlen) { flags = O_RDWR; - acc_mode = MAY_READ | MAY_WRITE; } else if (newval && newlen) { flags = O_WRONLY; - acc_mode = MAY_WRITE; } else if (oldval && oldlen) { flags = O_RDONLY; - acc_mode = MAY_READ; } else { result = 0; goto out_putname; } mnt = current->nsproxy->pid_ns->proc_mnt; - result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd); - if (result) - goto out_putname; - - result = may_open(&nd.path, acc_mode, flags); - if (result) - goto out_putpath; - - file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred()); + file = file_open_root(mnt->mnt_root, mnt, pathname, flags); result = PTR_ERR(file); if (IS_ERR(file)) goto out_putname; @@ -1370,10 +1357,6 @@ out_putname: putname(pathname); out: return result; - -out_putpath: - path_put(&nd.path); - goto out_putname; } -- cgit v1.2.3 From 9d90c8d9cde929cbc575098e825d7c29d9f45054 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Sun, 13 Mar 2011 03:19:51 +0100 Subject: printk: do not mangle valid userspace syslog prefixes printk: do not mangle valid userspace syslog prefixes with /dev/kmsg Log messages passed to the kernel log by using /dev/kmsg or /dev/ttyprintk might contain a syslog prefix including the syslog facility value. This makes printk to recognize these headers properly, extract the real log level from it to use, and add the prefix as a proper prefix to the log buffer, instead of wrongly printing it as the log message text. Before: $ echo '<14>text' > /dev/kmsg $ dmesg -r <4>[135159.594810] <14>text After: $ echo '<14>text' > /dev/kmsg $ dmesg -r <14>[ 50.750654] text Cc: Lennart Poettering Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 138 ++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 104 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 2ddbdc73aade..5e3d042e7001 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -498,6 +498,71 @@ static void _call_console_drivers(unsigned start, } } +/* + * Parse the syslog header <[0-9]*>. The decimal value represents 32bit, the + * lower 3 bit are the log level, the rest are the log facility. In case + * userspace passes usual userspace syslog messages to /dev/kmsg or + * /dev/ttyprintk, the log prefix might contain the facility. Printk needs + * to extract the correct log level for in-kernel processing, and not mangle + * the original value. + * + * If a prefix is found, the length of the prefix is returned. If 'level' is + * passed, it will be filled in with the log level without a possible facility + * value. If 'special' is passed, the special printk prefix chars are accepted + * and returned. If no valid header is found, 0 is returned and the passed + * variables are not touched. + */ +static size_t log_prefix(const char *p, unsigned int *level, char *special) +{ + unsigned int lev = 0; + char sp = '\0'; + size_t len; + + if (p[0] != '<' || !p[1]) + return 0; + if (p[2] == '>') { + /* usual single digit level number or special char */ + switch (p[1]) { + case '0' ... 
'7': + lev = p[1] - '0'; + break; + case 'c': /* KERN_CONT */ + case 'd': /* KERN_DEFAULT */ + sp = p[1]; + break; + default: + return 0; + } + len = 3; + } else { + /* multi digit including the level and facility number */ + char *endp = NULL; + + if (p[1] < '0' && p[1] > '9') + return 0; + + lev = (simple_strtoul(&p[1], &endp, 10) & 7); + if (endp == NULL || endp[0] != '>') + return 0; + len = (endp + 1) - p; + } + + /* do not accept special char if not asked for */ + if (sp && !special) + return 0; + + if (special) { + *special = sp; + /* return special char, do not touch level */ + if (sp) + return len; + } + + if (level) + *level = lev; + return len; +} + /* * Call the console drivers, asking them to write out * log_buf[start] to log_buf[end - 1]. @@ -513,13 +578,9 @@ static void call_console_drivers(unsigned start, unsigned end) cur_index = start; start_print = start; while (cur_index != end) { - if (msg_level < 0 && ((end - cur_index) > 2) && - LOG_BUF(cur_index + 0) == '<' && - LOG_BUF(cur_index + 1) >= '0' && - LOG_BUF(cur_index + 1) <= '7' && - LOG_BUF(cur_index + 2) == '>') { - msg_level = LOG_BUF(cur_index + 1) - '0'; - cur_index += 3; + if (msg_level < 0 && ((end - cur_index) > 2)) { + /* strip log prefix */ + cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL); start_print = cur_index; } while (cur_index != end) { @@ -717,6 +778,8 @@ asmlinkage int vprintk(const char *fmt, va_list args) unsigned long flags; int this_cpu; char *p; + size_t plen; + char special; boot_delay_msec(); printk_delay(); @@ -757,45 +820,52 @@ asmlinkage int vprintk(const char *fmt, va_list args) printed_len += vscnprintf(printk_buf + printed_len, sizeof(printk_buf) - printed_len, fmt, args); - p = printk_buf; - /* Do we have a loglevel in the string? */ - if (p[0] == '<') { - unsigned char c = p[1]; - if (c && p[2] == '>') { - switch (c) { - case '0' ... '7': /* loglevel */ - current_log_level = c - '0'; - /* Fallthrough - make sure we're on a new line */ - case 'd': /* KERN_DEFAULT */ - if (!new_text_line) { - emit_log_char('\n'); - new_text_line = 1; - } - /* Fallthrough - skip the loglevel */ - case 'c': /* KERN_CONT */ - p += 3; - break; + /* Read log level and handle special printk prefix */ + plen = log_prefix(p, ¤t_log_level, &special); + if (plen) { + p += plen; + + switch (special) { + case 'c': /* Strip KERN_CONT, continue line */ + plen = 0; + break; + case 'd': /* Strip KERN_DEFAULT, start new line */ + plen = 0; + default: + if (!new_text_line) { + emit_log_char('\n'); + new_text_line = 1; } } } /* - * Copy the output into log_buf. If the caller didn't provide - * appropriate log level tags, we insert them here + * Copy the output into log_buf. 
If the caller didn't provide + * the appropriate log prefix, we insert them here */ - for ( ; *p; p++) { + for (; *p; p++) { if (new_text_line) { - /* Always output the token */ - emit_log_char('<'); - emit_log_char(current_log_level + '0'); - emit_log_char('>'); - printed_len += 3; new_text_line = 0; + if (plen) { + /* Copy original log prefix */ + int i; + + for (i = 0; i < plen; i++) + emit_log_char(printk_buf[i]); + printed_len += plen; + } else { + /* Add log prefix */ + emit_log_char('<'); + emit_log_char(current_log_level + '0'); + emit_log_char('>'); + printed_len += 3; + } + if (printk_time) { - /* Follow the token with the time */ + /* Add the current time stamp */ char tbuf[50], *tp; unsigned tlen; unsigned long long t; -- cgit v1.2.3 From 6e0aa9f8a8190e0879a29bd67aa606b51734a122 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Mar 2011 10:34:35 +0100 Subject: futex: Deobfuscate handle_futex_death() handle_futex_death() uses futex_atomic_cmpxchg_inatomic() without disabling page faults. That's ok, but totally non obvious. We don't hold locks so we actually can and want to fault here, because the get_user() before futex_atomic_cmpxchg_inatomic() does not guarantee a R/W mapping. We could just add a big fat comment to explain this, but actually changing the code so that the functionality is entirely clear is better. Use the helper function which disables page faults around the futex_atomic_cmpxchg_inatomic() and handle a fault with a call to fault_in_user_writeable() as all other places in the futex code do as well. Pointed-out-by: Linus Torvalds Signed-off-by: Thomas Gleixner Acked-by: Darren Hart Cc: Michel Lespinasse Cc: Peter Zijlstra Cc: Matt Turner Cc: Russell King Cc: David Howells Cc: Tony Luck Cc: Michal Simek Cc: Ralf Baechle Cc: "James E.J. Bottomley" Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Paul Mundt Cc: "David S. Miller" Cc: Chris Metcalf LKML-Reference: Signed-off-by: Thomas Gleixner --- kernel/futex.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index c6bef6e404fe..e9251d934f7d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2458,9 +2458,20 @@ retry: * userspace. */ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; - if (futex_atomic_cmpxchg_inatomic(&nval, uaddr, uval, mval)) - return -1; - + /* + * We are not holding a lock here, but we want to have + * the pagefault_disable/enable() protection because + * we want to handle the fault gracefully. If the + * access fails we try to fault in the futex with R/W + * verification via get_user_pages. get_user() above + * does not guarantee R/W access. If that fails we + * give up and leave the futex locked. + */ + if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) { + if (fault_in_user_writeable(uaddr)) + return -1; + goto retry; + } if (nval != uval) goto retry; -- cgit v1.2.3 From cd51e61cf4e8b220da37dc35e9c2dc2dc258b4de Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 11 Feb 2011 00:04:52 +0100 Subject: PM / ACPI: Remove references to pm_flags from bus.c If direct references to pm_flags are removed from drivers/acpi/bus.c, CONFIG_ACPI will not need to depend on CONFIG_PM any more. Make that happen. Signed-off-by: Rafael J. 
Wysocki Acked-by: Len Brown --- kernel/power/main.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 701853042c28..b5405af48ddb 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -17,10 +17,20 @@ DEFINE_MUTEX(pm_mutex); +#ifdef CONFIG_PM_SLEEP + unsigned int pm_flags; EXPORT_SYMBOL(pm_flags); -#ifdef CONFIG_PM_SLEEP +bool pm_apm_enabled(void) +{ + return !!(pm_flags & PM_APM); +} + +void pm_set_acpi_flag(void) +{ + pm_flags |= PM_ACPI; +} /* Routines for PM-transition notifications */ -- cgit v1.2.3 From 1eb208aea3179dd2fc0cdeea45ef869d75b4fe70 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 11 Feb 2011 00:06:30 +0100 Subject: PM: Make CONFIG_PM depend on (CONFIG_PM_SLEEP || CONFIG_PM_RUNTIME) From the users' point of view CONFIG_PM is really only used for making it possible to set CONFIG_SUSPEND, CONFIG_HIBERNATION, CONFIG_PM_RUNTIME and (surprisingly enough) CONFIG_XEN_SAVE_RESTORE (CONFIG_PM_OPP also depends on CONFIG_PM, but quite artificially). However, both CONFIG_SUSPEND and CONFIG_HIBERNATION require platform support (independent of CONFIG_PM) and it is not quite obvious that CONFIG_PM has to be set for CONFIG_XEN_SAVE_RESTORE to be available. Thus, from the users' point of view, it would be more logical to automatically select CONFIG_PM if any of the above options depending on it are set. Make CONFIG_PM depend on (CONFIG_PM_SLEEP || CONFIG_PM_RUNTIME), which will cause it to be selected when any of CONFIG_SUSPEND, CONFIG_HIBERNATION, CONFIG_PM_RUNTIME, CONFIG_XEN_SAVE_RESTORE is set and will clarify its meaning. Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 29 ++++++----------------------- 1 file changed, 6 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 265729966ece..d739cf612e50 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -1,23 +1,7 @@ config PM - bool "Power Management support" - depends on !IA64_HP_SIM - ---help--- - "Power Management" means that parts of your computer are shut - off or put into a power conserving "sleep" mode if they are not - being used. There are two competing standards for doing this: APM - and ACPI. If you want to use either one, say Y here and then also - to the requisite support below. - - Power Management is most important for battery powered laptop - computers; if you have a laptop, check out the Linux Laptop home - page on the WWW at or - Tuxmobil - Linux on Mobile Computers at - and the Battery Powered Linux mini-HOWTO, available from - . - - Note that, even if you say N here, Linux on the x86 architecture - will issue the hlt instruction if nothing is to be done, thereby - sending the processor to sleep and saving power. 
+ bool + depends on PM_SLEEP || PM_RUNTIME + default y config PM_DEBUG bool "Power Management Debug Support" @@ -102,7 +86,7 @@ config PM_SLEEP_ADVANCED_DEBUG config SUSPEND bool "Suspend to RAM and standby" - depends on PM && ARCH_SUSPEND_POSSIBLE + depends on ARCH_SUSPEND_POSSIBLE default y ---help--- Allow the system to enter sleep states in which main memory is @@ -133,7 +117,7 @@ config SUSPEND_FREEZER config HIBERNATION bool "Hibernation (aka 'suspend to disk')" - depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE + depends on SWAP && ARCH_HIBERNATION_POSSIBLE select LZO_COMPRESS select LZO_DECOMPRESS ---help--- @@ -224,7 +208,7 @@ config APM_EMULATION config PM_RUNTIME bool "Run-time PM core functionality" - depends on PM + depends on !IA64_HP_SIM ---help--- Enable functionality allowing I/O devices to be put into energy-saving (low power) states at run time (or autosuspended) after a specified @@ -246,7 +230,6 @@ config ARCH_HAS_OPP config PM_OPP bool "Operating Performance Point (OPP) Layer library" - depends on PM depends on ARCH_HAS_OPP ---help--- SOCs have a standard set of tuples consisting of frequency and -- cgit v1.2.3 From 196ec243224bb38fc5c41d9fa4050f70708b7fb4 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 11 Feb 2011 00:06:42 +0100 Subject: PM: Reorder power management Kconfig options Reorder configuration options in kernel/power/Kconfig so that the options depended on are at the top of the list. This patch doesn't introduce any functional changes. Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 222 +++++++++++++++++++++++++-------------------------- 1 file changed, 111 insertions(+), 111 deletions(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index d739cf612e50..1e4d9238c5da 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -1,89 +1,3 @@ -config PM - bool - depends on PM_SLEEP || PM_RUNTIME - default y - -config PM_DEBUG - bool "Power Management Debug Support" - depends on PM - ---help--- - This option enables various debugging support in the Power Management - code. This is helpful when debugging and reporting PM bugs, like - suspend support. - -config PM_ADVANCED_DEBUG - bool "Extra PM attributes in sysfs for low-level debugging/testing" - depends on PM_DEBUG - default n - ---help--- - Add extra sysfs attributes allowing one to access some Power Management - fields of device objects from user space. If you are not a kernel - developer interested in debugging/testing Power Management, say "no". - -config PM_VERBOSE - bool "Verbose Power Management debugging" - depends on PM_DEBUG - default n - ---help--- - This option enables verbose messages from the Power Management code. - -config CAN_PM_TRACE - def_bool y - depends on PM_DEBUG && PM_SLEEP && EXPERIMENTAL - -config PM_TRACE - bool - help - This enables code to save the last PM event point across - reboot. The architecture needs to support this, x86 for - example does by saving things in the RTC, see below. - - The architecture specific code must provide the extern - functions from as well as the - header with a TRACE_RESUME() macro. - - The way the information is presented is architecture- - dependent, x86 will print the information during a - late_initcall. 
- -config PM_TRACE_RTC - bool "Suspend/resume event tracing" - depends on CAN_PM_TRACE - depends on X86 - select PM_TRACE - default n - ---help--- - This enables some cheesy code to save the last PM event point in the - RTC across reboots, so that you can debug a machine that just hangs - during suspend (or more commonly, during resume). - - To use this debugging feature you should attempt to suspend the - machine, reboot it and then run - - dmesg -s 1000000 | grep 'hash matches' - - CAUTION: this option will cause your machine's real-time clock to be - set to an invalid time after a resume. - -config PM_SLEEP_SMP - bool - depends on SMP - depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE - depends on PM_SLEEP - select HOTPLUG - select HOTPLUG_CPU - default y - -config PM_SLEEP - bool - depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE - default y - -config PM_SLEEP_ADVANCED_DEBUG - bool - depends on PM_ADVANCED_DEBUG - default n - config SUSPEND bool "Suspend to RAM and standby" depends on ARCH_SUSPEND_POSSIBLE @@ -93,17 +7,6 @@ config SUSPEND powered and thus its contents are preserved, such as the suspend-to-RAM state (e.g. the ACPI S3 state). -config PM_TEST_SUSPEND - bool "Test suspend/resume and wakealarm during bootup" - depends on SUSPEND && PM_DEBUG && RTC_CLASS=y - ---help--- - This option will let you suspend your machine during bootup, and - make it wake up a few seconds later using an RTC wakeup alarm. - Enable this with a kernel parameter like "test_suspend=mem". - - You probably want to have your system's RTC driver statically - linked, ensuring that it's available when this test runs. - config SUSPEND_FREEZER bool "Enable freezer for suspend to RAM/standby" \ if ARCH_WANTS_FREEZER_CONTROL || BROKEN @@ -180,6 +83,117 @@ config PM_STD_PARTITION suspended image to. It will simply pick the first available swap device. +config PM_SLEEP + bool + depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE + default y + +config PM_SLEEP_SMP + bool + depends on SMP + depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE + depends on PM_SLEEP + select HOTPLUG + select HOTPLUG_CPU + default y + +config PM_RUNTIME + bool "Run-time PM core functionality" + depends on !IA64_HP_SIM + ---help--- + Enable functionality allowing I/O devices to be put into energy-saving + (low power) states at run time (or autosuspended) after a specified + period of inactivity and woken up in response to a hardware-generated + wake-up event or a driver's request. + + Hardware support is generally required for this functionality to work + and the bus type drivers of the buses the devices are on are + responsible for the actual handling of the autosuspend requests and + wake-up events. + +config PM + bool + depends on PM_SLEEP || PM_RUNTIME + default y + +config PM_DEBUG + bool "Power Management Debug Support" + depends on PM + ---help--- + This option enables various debugging support in the Power Management + code. This is helpful when debugging and reporting PM bugs, like + suspend support. + +config PM_VERBOSE + bool "Verbose Power Management debugging" + depends on PM_DEBUG + default n + ---help--- + This option enables verbose messages from the Power Management code. + +config PM_ADVANCED_DEBUG + bool "Extra PM attributes in sysfs for low-level debugging/testing" + depends on PM_DEBUG + default n + ---help--- + Add extra sysfs attributes allowing one to access some Power Management + fields of device objects from user space. 
If you are not a kernel + developer interested in debugging/testing Power Management, say "no". + +config PM_SLEEP_ADVANCED_DEBUG + bool + depends on PM_ADVANCED_DEBUG + default n + +config PM_TEST_SUSPEND + bool "Test suspend/resume and wakealarm during bootup" + depends on SUSPEND && PM_DEBUG && RTC_CLASS=y + ---help--- + This option will let you suspend your machine during bootup, and + make it wake up a few seconds later using an RTC wakeup alarm. + Enable this with a kernel parameter like "test_suspend=mem". + + You probably want to have your system's RTC driver statically + linked, ensuring that it's available when this test runs. + +config CAN_PM_TRACE + def_bool y + depends on PM_DEBUG && PM_SLEEP && EXPERIMENTAL + +config PM_TRACE + bool + help + This enables code to save the last PM event point across + reboot. The architecture needs to support this, x86 for + example does by saving things in the RTC, see below. + + The architecture specific code must provide the extern + functions from as well as the + header with a TRACE_RESUME() macro. + + The way the information is presented is architecture- + dependent, x86 will print the information during a + late_initcall. + +config PM_TRACE_RTC + bool "Suspend/resume event tracing" + depends on CAN_PM_TRACE + depends on X86 + select PM_TRACE + default n + ---help--- + This enables some cheesy code to save the last PM event point in the + RTC across reboots, so that you can debug a machine that just hangs + during suspend (or more commonly, during resume). + + To use this debugging feature you should attempt to suspend the + machine, reboot it and then run + + dmesg -s 1000000 | grep 'hash matches' + + CAUTION: this option will cause your machine's real-time clock to be + set to an invalid time after a resume. + config APM_EMULATION tristate "Advanced Power Management Emulation" depends on PM && SYS_SUPPORTS_APM_EMULATION @@ -206,20 +220,6 @@ config APM_EMULATION anything, try disabling/enabling this option (or disabling/enabling APM in your BIOS). -config PM_RUNTIME - bool "Run-time PM core functionality" - depends on !IA64_HP_SIM - ---help--- - Enable functionality allowing I/O devices to be put into energy-saving - (low power) states at run time (or autosuspended) after a specified - period of inactivity and woken up in response to a hardware-generated - wake-up event or a driver's request. - - Hardware support is generally required for this functionality to work - and the bus type drivers of the buses the devices are on are - responsible for the actual handling of the autosuspend requests and - wake-up events. - config PM_OPS bool depends on PM_SLEEP || PM_RUNTIME -- cgit v1.2.3 From aa33860158114d0df3c7997bc1dd41c0168e1c2a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 11 Feb 2011 00:06:54 +0100 Subject: PM: Remove CONFIG_PM_OPS After redefining CONFIG_PM to depend on (CONFIG_PM_SLEEP || CONFIG_PM_RUNTIME) the CONFIG_PM_OPS option is redundant and can be replaced with CONFIG_PM. Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 1e4d9238c5da..0710beead05b 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -220,11 +220,6 @@ config APM_EMULATION anything, try disabling/enabling this option (or disabling/enabling APM in your BIOS). 
-config PM_OPS - bool - depends on PM_SLEEP || PM_RUNTIME - default y - config ARCH_HAS_OPP bool -- cgit v1.2.3 From 88a6f33e4d7143f94f8e787fa4065ac873507984 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 11 Feb 2011 20:31:11 +0100 Subject: PM: Clean up PM_TRACE dependencies and drop unnecessary Kconfig option CONFIG_PM_SLEEP_ADVANCED_DEBUG is not used any more, so drop it and CONFIG_CAN_PM_TRACE need not depend on EXPERIMENTAL, so remove that dependency. Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 0710beead05b..298bed04555e 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -140,11 +140,6 @@ config PM_ADVANCED_DEBUG fields of device objects from user space. If you are not a kernel developer interested in debugging/testing Power Management, say "no". -config PM_SLEEP_ADVANCED_DEBUG - bool - depends on PM_ADVANCED_DEBUG - default n - config PM_TEST_SUSPEND bool "Test suspend/resume and wakealarm during bootup" depends on SUSPEND && PM_DEBUG && RTC_CLASS=y @@ -158,7 +153,7 @@ config PM_TEST_SUSPEND config CAN_PM_TRACE def_bool y - depends on PM_DEBUG && PM_SLEEP && EXPERIMENTAL + depends on PM_DEBUG && PM_SLEEP config PM_TRACE bool -- cgit v1.2.3 From 6831c6edc7b272a08dd2a6c71bb183a48fe98ae6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 15 Feb 2011 21:22:24 +0100 Subject: PM: Drop pm_flags that is not necessary The variable pm_flags is used to prevent APM from being enabled along with ACPI, which would lead to problems. However, acpi_init() is always called before apm_init() and after acpi_init() has returned, it is known whether or not ACPI will be used. Namely, if acpi_disabled is not set after acpi_init() has returned, this means that ACPI is enabled. Thus, it is sufficient to check acpi_disabled in apm_init() to prevent APM from being enabled in parallel with ACPI. Signed-off-by: Rafael J. Wysocki Acked-by: Len Brown --- kernel/power/main.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index b5405af48ddb..8eaba5f27b10 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -19,19 +19,6 @@ DEFINE_MUTEX(pm_mutex); #ifdef CONFIG_PM_SLEEP -unsigned int pm_flags; -EXPORT_SYMBOL(pm_flags); - -bool pm_apm_enabled(void) -{ - return !!(pm_flags & PM_APM); -} - -void pm_set_acpi_flag(void) -{ - pm_flags |= PM_ACPI; -} - /* Routines for PM-transition notifications */ static BLOCKING_NOTIFIER_HEAD(pm_chain_head); -- cgit v1.2.3 From cf4fb80ca3d591cae366ae8364e3c3f7a68bd249 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 18 Feb 2011 01:05:36 +0100 Subject: PM: Simplify kernel/power/Kconfig 'n' defaults are pretty pointless and actually bogus when used with prompt-less config options. The "bool"/"default y" pair with no prompt can be expressed more compactly using def_bool. [rjw: Rebased on top of earlier patches modifying this file.] Signed-off-by: Jan Beulich Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 298bed04555e..4603f08dc47b 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -84,18 +84,16 @@ config PM_STD_PARTITION device. 
config PM_SLEEP - bool + def_bool y depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE - default y config PM_SLEEP_SMP - bool + def_bool y depends on SMP depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE depends on PM_SLEEP select HOTPLUG select HOTPLUG_CPU - default y config PM_RUNTIME bool "Run-time PM core functionality" @@ -112,9 +110,8 @@ config PM_RUNTIME wake-up events. config PM - bool + def_bool y depends on PM_SLEEP || PM_RUNTIME - default y config PM_DEBUG bool "Power Management Debug Support" @@ -127,14 +124,12 @@ config PM_DEBUG config PM_VERBOSE bool "Verbose Power Management debugging" depends on PM_DEBUG - default n ---help--- This option enables verbose messages from the Power Management code. config PM_ADVANCED_DEBUG bool "Extra PM attributes in sysfs for low-level debugging/testing" depends on PM_DEBUG - default n ---help--- Add extra sysfs attributes allowing one to access some Power Management fields of device objects from user space. If you are not a kernel @@ -175,7 +170,6 @@ config PM_TRACE_RTC depends on CAN_PM_TRACE depends on X86 select PM_TRACE - default n ---help--- This enables some cheesy code to save the last PM event point in the RTC across reboots, so that you can debug a machine that just hangs -- cgit v1.2.3 From f9b9e806ae0ede772cbb9916d9ac7354a123d044 Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Mon, 28 Feb 2011 22:06:34 +0100 Subject: PM QoS: Make pm_qos settings readable I have a machine where entering deep C-states broke. pm_qos was a hot candidate, but I couldn't find any way to double check without the need of recompiling. While in this case it was a driver bug (ath9k): https://bugzilla.kernel.org/show_bug.cgi?id=27532 powertop or others may want to read out cpu_dma_latency restrictions which could be the cause of preventing a machine entering deeper C-states. Output with this patch: # default value of 2000 * USEC_PER_SEC (0x77359400) cat /dev/network_latency |hexdump 0000000 9400 7735 0000004 # value of 55 us which is the reason for not entering C2 cat /dev/cpu_dma_latency |hexdump 0000000 0037 0000 0000004 There is no reason to hide this info -> make pm_qos files readable. Signed-off-by: Thomas Renninger Signed-off-by: Rafael J. 
Wysocki --- kernel/pm_qos_params.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index aeaa7f846821..0da058bff8eb 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -103,11 +103,14 @@ static struct pm_qos_object *pm_qos_array[] = { static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos); +static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, + size_t count, loff_t *f_pos); static int pm_qos_power_open(struct inode *inode, struct file *filp); static int pm_qos_power_release(struct inode *inode, struct file *filp); static const struct file_operations pm_qos_power_fops = { .write = pm_qos_power_write, + .read = pm_qos_power_read, .open = pm_qos_power_open, .release = pm_qos_power_release, .llseek = noop_llseek, @@ -376,6 +379,27 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp) } +static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, + size_t count, loff_t *f_pos) +{ + s32 value; + unsigned long flags; + struct pm_qos_object *o; + struct pm_qos_request_list *pm_qos_req = filp->private_data;; + + if (!pm_qos_req) + return -EINVAL; + if (!pm_qos_request_active(pm_qos_req)) + return -EINVAL; + + o = pm_qos_array[pm_qos_req->pm_qos_class]; + spin_lock_irqsave(&pm_qos_lock, flags); + value = pm_qos_get_value(o); + spin_unlock_irqrestore(&pm_qos_lock, flags); + + return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); +} + static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos) { -- cgit v1.2.3 From 40dc166cb5dddbd36aa4ad11c03915ea538f5a61 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 15 Mar 2011 00:43:46 +0100 Subject: PM / Core: Introduce struct syscore_ops for core subsystems PM Some subsystems need to carry out suspend/resume and shutdown operations with one CPU on-line and interrupts disabled. The only way to register such operations is to define a sysdev class and a sysdev specifically for this purpose which is cumbersome and inefficient. Moreover, the arguments taken by sysdev suspend, resume and shutdown callbacks are practically never necessary. For this reason, introduce a simpler interface allowing subsystems to register operations to be executed very late during system suspend and shutdown and very early during resume in the form of strcut syscore_ops objects. Signed-off-by: Rafael J. Wysocki Acked-by: Greg Kroah-Hartman --- kernel/power/hibernate.c | 9 +++++++++ kernel/power/suspend.c | 4 ++++ kernel/sys.c | 4 ++++ 3 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 1832bd264219..aeabd26e3342 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -272,6 +273,8 @@ static int create_image(int platform_mode) local_irq_disable(); error = sysdev_suspend(PMSG_FREEZE); + if (!error) + error = syscore_suspend(); if (error) { printk(KERN_ERR "PM: Some system devices failed to power down, " "aborting hibernation\n"); @@ -295,6 +298,7 @@ static int create_image(int platform_mode) } Power_up: + syscore_resume(); sysdev_resume(); /* NOTE: dpm_resume_noirq() is just a resume() for devices * that suspended with irqs off ... no overall powerup. 
@@ -403,6 +407,8 @@ static int resume_target_kernel(bool platform_mode) local_irq_disable(); error = sysdev_suspend(PMSG_QUIESCE); + if (!error) + error = syscore_suspend(); if (error) goto Enable_irqs; @@ -429,6 +435,7 @@ static int resume_target_kernel(bool platform_mode) restore_processor_state(); touch_softlockup_watchdog(); + syscore_resume(); sysdev_resume(); Enable_irqs: @@ -516,6 +523,7 @@ int hibernation_platform_enter(void) local_irq_disable(); sysdev_suspend(PMSG_HIBERNATE); + syscore_suspend(); if (pm_wakeup_pending()) { error = -EAGAIN; goto Power_up; @@ -526,6 +534,7 @@ int hibernation_platform_enter(void) while (1); Power_up: + syscore_resume(); sysdev_resume(); local_irq_enable(); enable_nonboot_cpus(); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index de6f86bfa303..2814c32aed51 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include "power.h" @@ -163,11 +164,14 @@ static int suspend_enter(suspend_state_t state) BUG_ON(!irqs_disabled()); error = sysdev_suspend(PMSG_SUSPEND); + if (!error) + error = syscore_suspend(); if (!error) { if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { error = suspend_ops->enter(state); events_check_enabled = false; } + syscore_resume(); sysdev_resume(); } diff --git a/kernel/sys.c b/kernel/sys.c index 18da702ec813..1ad48b3b9068 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -298,6 +299,7 @@ void kernel_restart_prepare(char *cmd) system_state = SYSTEM_RESTART; device_shutdown(); sysdev_shutdown(); + syscore_shutdown(); } /** @@ -336,6 +338,7 @@ void kernel_halt(void) { kernel_shutdown_prepare(SYSTEM_HALT); sysdev_shutdown(); + syscore_shutdown(); printk(KERN_EMERG "System halted.\n"); kmsg_dump(KMSG_DUMP_HALT); machine_halt(); @@ -355,6 +358,7 @@ void kernel_power_off(void) pm_power_off_prepare(); disable_nonboot_cpus(); sysdev_shutdown(); + syscore_shutdown(); printk(KERN_EMERG "Power down.\n"); kmsg_dump(KMSG_DUMP_POWEROFF); machine_power_off(); -- cgit v1.2.3 From bea3864fb627d110933cfb8babe048b63c4fc76e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 15 Mar 2011 00:45:46 +0100 Subject: PM / Hibernate: Reduce autotuned default image size The hibernate image size autotuning mechanism sets the default image size to 5/2 of the total system RAM, but it is reported that on some systems device drivers allocate substantial amounts of memory during suspend and the creation of the image fails as a result (too little memory is preallocated). Modify the autotuning mechanism to use 1/3 instead of 2/5 of RAM as the default image size, which is reported to be sufficient for the affected systems. References: https://bugzilla.kernel.org/show_bug.cgi?id=30482 Reported-and-tested-by: Martin Steigerwald Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 64db648ff911..ca0aacc24874 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -42,15 +42,15 @@ static void swsusp_unset_page_forbidden(struct page *); /* * Preferred image size in bytes (tunable via /sys/power/image_size). - * When it is set to N, swsusp will do its best to ensure the image - * size will not exceed N bytes, but if that is impossible, it will - * try to create the smallest image possible. 
+ * When it is set to N, the image creating code will do its best to + * ensure the image size will not exceed N bytes, but if that is + * impossible, it will try to create the smallest image possible. */ unsigned long image_size; void __init hibernate_image_size_init(void) { - image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; + image_size = (totalram_pages / 3) * PAGE_SIZE; } /* List of PBEs needed for restoring the pages that were allocated before -- cgit v1.2.3 From 990d6c2d7aee921e3bce22b2d6a750fd552262be Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 29 Jan 2011 18:43:26 +0530 Subject: vfs: Add name to file handle conversion support The syscall also return mount id which can be used to lookup file system specific information such as uuid in /proc//mountinfo Signed-off-by: Aneesh Kumar K.V Signed-off-by: Al Viro --- kernel/sys_ni.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index c782fe9924c7..4e013439ac28 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -186,3 +186,6 @@ cond_syscall(sys_perf_event_open); /* fanotify! */ cond_syscall(sys_fanotify_init); cond_syscall(sys_fanotify_mark); + +/* open by handle */ +cond_syscall(sys_name_to_handle_at); -- cgit v1.2.3 From becfd1f37544798cbdfd788f32c827160fab98c1 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 29 Jan 2011 18:43:26 +0530 Subject: vfs: Add open by file handle support [AV: duplicate of open() guts removed; file_open_root() used instead] Signed-off-by: Aneesh Kumar K.V Signed-off-by: Al Viro --- kernel/sys_ni.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 4e013439ac28..25cc41cd8f33 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -189,3 +189,5 @@ cond_syscall(sys_fanotify_mark); /* open by handle */ cond_syscall(sys_name_to_handle_at); +cond_syscall(sys_open_by_handle_at); +cond_syscall(compat_sys_open_by_handle_at); -- cgit v1.2.3 From 58cbe2476abce8b5e3508d23bd05c2e2e8c394da Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 15 Mar 2011 16:12:30 -0700 Subject: sched, kernel-doc: Fix runqueue_is_locked() description Fix kernel-doc warning for runqueue_is_locked(): Warning(kernel/sched.c:664): missing initial short description Signed-off-by: Randy Dunlap Cc: Linus Torvalds LKML-Reference: <20110315161230.c4e1e8e3.rdunlap@xenotime.net> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c8e40b7005c0..58d66ea7d200 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -661,10 +661,9 @@ static void update_rq_clock(struct rq *rq) #endif /** - * runqueue_is_locked + * runqueue_is_locked - Returns true if the current cpu runqueue is locked * @cpu: the processor in question. * - * Returns true if the current cpu runqueue is locked. * This interface allows printk to be called with the runqueue lock * held and know whether or not it is OK to wake up the klogd. */ -- cgit v1.2.3 From 91b2f482e62ad0d444222253026a5cbca28c4ab9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 7 Mar 2011 21:27:08 +0100 Subject: perf: Fix the software events state check Fix the mistakenly inverted check of events state. 
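In perf_exclude_event(), a non-zero return value means "drop this sample", so the stopped case has to return 1; condensed from the hunk below, with an explanatory comment added:

	/* A non-zero return from perf_exclude_event() drops the sample. */
	if (event->hw.state & PERF_HES_STOPPED)
		return 1;	/* previously 0, so stopped software events kept counting samples */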
Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Stephane Eranian Cc: LKML-Reference: <1299529629-18280-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index ed253aa24ba4..974e2e64a43a 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5122,7 +5122,7 @@ static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { if (event->hw.state & PERF_HES_STOPPED) - return 0; + return 1; if (regs) { if (event->attr.exclude_user && user_mode(regs)) -- cgit v1.2.3 From a0f7d0f7fc02465bb9758501f611f63381792996 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 7 Mar 2011 21:27:09 +0100 Subject: perf: Handle stopped state with tracepoints We toggle the state from start and stop callbacks but actually don't check it when the event triggers. Do it so that these callbacks actually work. Signed-off-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Stephane Eranian Cc: Signed-off-by: Peter Zijlstra LKML-Reference: <1299529629-18280-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 974e2e64a43a..533f71570736 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5478,6 +5478,8 @@ static int perf_tp_event_match(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { + if (event->hw.state & PERF_HES_STOPPED) + return 0; /* * All tracepoints are from kernel-space. */ -- cgit v1.2.3 From 38b435b16c36b0d863efcf3f07b34a6fac9873fd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Mar 2011 14:37:10 +0100 Subject: perf: Fix tear-down of inherited group events When destroying inherited events, we need to destroy groups too, otherwise the event iteration in perf_event_exit_task_context() will miss group siblings and we leak events with all the consequences. Reported-and-tested-by: Vince Weaver Signed-off-by: Peter Zijlstra Cc: # .35+ LKML-Reference: <1300196470.2203.61.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 533f71570736..3472bb1a070c 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -6722,17 +6722,20 @@ __perf_event_exit_task(struct perf_event *child_event, struct perf_event_context *child_ctx, struct task_struct *child) { - struct perf_event *parent_event; + if (child_event->parent) { + raw_spin_lock_irq(&child_ctx->lock); + perf_group_detach(child_event); + raw_spin_unlock_irq(&child_ctx->lock); + } perf_remove_from_context(child_event); - parent_event = child_event->parent; /* - * It can happen that parent exits first, and has events + * It can happen that the parent exits first, and has events * that are still around due to the child reference. These - * events need to be zapped - but otherwise linger. + * events need to be zapped. 
*/ - if (parent_event) { + if (child_event->parent) { sync_child_event(child_event, child); free_event(child_event); } -- cgit v1.2.3 From 1fd06bb1571e2618ae392e2484925bf0dadd7857 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 15 Mar 2011 16:12:30 -0700 Subject: sched.c: fix kernel-doc for runqueue_is_locked() Fix kernel-doc warning for runqueue_is_locked(): Warning(kernel/sched.c:664): missing initial short description on line: Signed-off-by: Randy Dunlap Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/sched.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c8e40b7005c0..58d66ea7d200 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -661,10 +661,9 @@ static void update_rq_clock(struct rq *rq) #endif /** - * runqueue_is_locked + * runqueue_is_locked - Returns true if the current cpu runqueue is locked * @cpu: the processor in question. * - * Returns true if the current cpu runqueue is locked. * This interface allows printk to be called with the runqueue lock * held and know whether or not it is OK to wake up the klogd. */ -- cgit v1.2.3 From 13e5befaddcf8d542ae45610b552105490a0010b Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 16 Mar 2011 17:17:08 -0700 Subject: trace, documentation: Fix branch profiling location in debugfs The debugfs interface for branch profiling is through /sys/kernel/debug/tracing/trace_stat/branch_annotated /sys/kernel/debug/tracing/trace_stat/branch_all so update the Kconfig accordingly. Signed-off-by: David Rientjes Cc: Steven Rostedt LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 14674dce77a6..61d7d59f4a1a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -275,7 +275,7 @@ config PROFILE_ANNOTATED_BRANCHES This tracer profiles all the the likely and unlikely macros in the kernel. It will display the results in: - /sys/kernel/debug/tracing/profile_annotated_branch + /sys/kernel/debug/tracing/trace_stat/branch_annotated Note: this will add a significant overhead; only turn this on if you need to profile the system's use of these macros. @@ -288,7 +288,7 @@ config PROFILE_ALL_BRANCHES taken in the kernel is recorded whether it hit or miss. The results will be displayed in: - /sys/kernel/debug/tracing/profile_branch + /sys/kernel/debug/tracing/trace_stat/branch_all This option also enables the likely/unlikely profiler. -- cgit v1.2.3 From ed3cd4a86562eee79de25b567a00e648cc3dc2bf Mon Sep 17 00:00:00 2001 From: matt mooney Date: Fri, 14 Jan 2011 06:12:24 -0800 Subject: kernel: change to new flag variable Replace EXTRA_CFLAGS with ccflags-y. 
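For completeness, the corrected trace_stat paths above are ordinary debugfs files and can be read with nothing more than cat. The toy reader below is illustrative only; it assumes debugfs is mounted at /sys/kernel/debug and that CONFIG_PROFILE_ANNOTATED_BRANCHES=y, and it simply prints the header plus the first few annotated branch sites.

    #include <stdio.h>

    int main(void)
    {
        const char *path =
            "/sys/kernel/debug/tracing/trace_stat/branch_annotated";
        char line[512];
        FILE *f = fopen(path, "r");
        int i;

        if (!f) {
            perror(path);   /* debugfs not mounted or profiler disabled */
            return 1;
        }
        /* Header plus the first few entries is enough to see the format. */
        for (i = 0; i < 10 && fgets(line, sizeof(line), f); i++)
            fputs(line, stdout);
        fclose(f);
        return 0;
    }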
Signed-off-by: matt mooney Acked-by: WANG Cong Signed-off-by: Michal Marek --- kernel/gcov/Makefile | 2 +- kernel/power/Makefile | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index 3f761001d517..e97ca59e2520 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile @@ -1,3 +1,3 @@ -EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' +ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o diff --git a/kernel/power/Makefile b/kernel/power/Makefile index c350e18b53e3..c5ebc6a90643 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,4 +1,5 @@ -ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG + +ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG obj-$(CONFIG_PM) += main.o obj-$(CONFIG_PM_SLEEP) += console.o -- cgit v1.2.3 From ee0401ec11bb7b83b7fa9476d0aaeabd59216062 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 17 Mar 2011 13:36:57 +0100 Subject: genirq: show_interrupts: Check desc->name before printing it blindly desc->name is not required and not used by all architectures. Signed-off-by: Thomas Gleixner --- kernel/irq/proc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 4cc2e5ed0bec..760248de109d 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -405,7 +405,8 @@ int show_interrupts(struct seq_file *p, void *v) for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); seq_printf(p, " %8s", desc->irq_data.chip->name); - seq_printf(p, "-%-8s", desc->name); + if (desc->name) + seq_printf(p, "-%-8s", desc->name); if (action) { seq_printf(p, " %s", action->name); -- cgit v1.2.3 From 1c389795c15d349c2c7b23baf927e16e86ca3ae8 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 17 Mar 2011 14:43:07 +0300 Subject: genirq: Fix incorrect unlock in __setup_irq() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit goto out_thread is called before we take the lock. It causes a gcc warning: "kernel/irq/manage.c:858: warning: ‘flags’ may be used uninitialized in this function" [ tglx: Moved unlock before free_cpumask_var() ] Signed-off-by: Dan Carpenter LKML-Reference: <20110317114307.GJ2008@bicker> Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index acd599a43bfb..0a2aa73e536c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1064,10 +1064,10 @@ mismatch: ret = -EBUSY; out_mask: + raw_spin_unlock_irqrestore(&desc->lock, flags); free_cpumask_var(mask); out_thread: - raw_spin_unlock_irqrestore(&desc->lock, flags); if (new->thread) { struct task_struct *t = new->thread; -- cgit v1.2.3 From 77c100c83e84316ced2507c5799f79c2c80bc6b9 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 1 Feb 2011 09:51:46 -0500 Subject: export pid symbols needed for kvm_vcpu_on_spin Export the symbols required for a race-free kvm_vcpu_on_spin. 
Signed-off-by: Rik van Riel Signed-off-by: Avi Kivity --- kernel/fork.c | 1 + kernel/pid.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 25e429152ddc..05b92c457010 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -193,6 +193,7 @@ void __put_task_struct(struct task_struct *tsk) if (!profile_handoff_task(tsk)) free_task(tsk); } +EXPORT_SYMBOL_GPL(__put_task_struct); /* * macro override instead of weak attribute alias, to workaround diff --git a/kernel/pid.c b/kernel/pid.c index 39b65b69584f..02f221274265 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -435,6 +435,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type) rcu_read_unlock(); return pid; } +EXPORT_SYMBOL_GPL(get_task_pid); struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) { @@ -446,6 +447,7 @@ struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) rcu_read_unlock(); return result; } +EXPORT_SYMBOL_GPL(get_pid_task); struct pid *find_get_pid(pid_t nr) { -- cgit v1.2.3 From e6cd1e07a185d5f9b0aa75e020df02d3c1c44940 Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Tue, 15 Mar 2011 13:27:16 -0600 Subject: call_function_many: fix list delete vs add race Peter pointed out there was nothing preventing the list_del_rcu in smp_call_function_interrupt from running before the list_add_rcu in smp_call_function_many. Fix this by not setting refs until we have gotten the lock for the list. Take advantage of the wmb in list_add_rcu to save an explicit additional one. I tried to force this race with a udelay before the lock & list_add and by mixing all 64 online cpus with just 3 random cpus in the mask, but was unsuccessful. Still, inspection shows a valid race, and the fix is a extension of the existing protection window in the current code. Cc: stable@kernel.org (v2.6.32 and later) Reported-by: Peter Zijlstra Signed-off-by: Milton Miller Signed-off-by: Linus Torvalds --- kernel/smp.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 9910744f0856..aaeee20c5634 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -491,14 +491,15 @@ void smp_call_function_many(const struct cpumask *mask, cpumask_clear_cpu(this_cpu, data->cpumask); /* - * To ensure the interrupt handler gets an complete view - * we order the cpumask and refs writes and order the read - * of them in the interrupt handler. In addition we may - * only clear our own cpu bit from the mask. + * We reuse the call function data without waiting for any grace + * period after some other cpu removes it from the global queue. + * This means a cpu might find our data block as it is writen. + * The interrupt handler waits until it sees refs filled out + * while its cpu mask bit is set; here we may only clear our + * own cpu mask bit, and must wait to set refs until we are sure + * previous writes are complete and we have obtained the lock to + * add the element to the queue. */ - smp_wmb(); - - atomic_set(&data->refs, cpumask_weight(data->cpumask)); raw_spin_lock_irqsave(&call_function.lock, flags); /* @@ -507,6 +508,11 @@ void smp_call_function_many(const struct cpumask *mask, * will not miss any other list entries: */ list_add_rcu(&data->csd.list, &call_function.queue); + /* + * We rely on the wmb() in list_add_rcu to order the writes + * to func, data, and cpumask before this write to refs. 
+ */ + atomic_set(&data->refs, cpumask_weight(data->cpumask)); raw_spin_unlock_irqrestore(&call_function.lock, flags); /* -- cgit v1.2.3 From 45a5791920ae643eafc02e2eedef1a58e341b736 Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Tue, 15 Mar 2011 13:27:16 -0600 Subject: call_function_many: add missing ordering Paul McKenney's review pointed out two problems with the barriers in the 2.6.38 update to the smp call function many code. First, a barrier that would force the func and info members of data to be visible before their consumption in the interrupt handler was missing. This can be solved by adding a smp_wmb between setting the func and info members and setting setting the cpumask; this will pair with the existing and required smp_rmb ordering the cpumask read before the read of refs. This placement avoids the need a second smp_rmb in the interrupt handler which would be executed on each of the N cpus executing the call request. (I was thinking this barrier was present but was not). Second, the previous write to refs (establishing the zero that we the interrupt handler was testing from all cpus) was performed by a third party cpu. This would invoke transitivity which, as a recient or concurrent addition to memory-barriers.txt now explicitly states, would require a full smp_mb(). However, we know the cpumask will only be set by one cpu (the data owner) and any preivous iteration of the mask would have cleared by the reading cpu. By redundantly writing refs to 0 on the owning cpu before the smp_wmb, the write to refs will follow the same path as the writes that set the cpumask, which in turn allows us to keep the barrier in the interrupt handler a smp_rmb instead of promoting it to a smp_mb (which will be be executed by N cpus for each of the possible M elements on the list). I moved and expanded the comment about our (ab)use of the rcu list primitives for the concurrent walk earlier into this function. I considered moving the first two paragraphs to the queue list head and lock, but felt it would have been too disconected from the code. Cc: Paul McKinney Cc: stable@kernel.org (2.6.32 and later) Signed-off-by: Milton Miller Signed-off-by: Linus Torvalds --- kernel/smp.c | 46 +++++++++++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index aaeee20c5634..7c6ded5effd9 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -483,23 +483,42 @@ void smp_call_function_many(const struct cpumask *mask, data = &__get_cpu_var(cfd_data); csd_lock(&data->csd); - BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); - data->csd.func = func; - data->csd.info = info; - cpumask_and(data->cpumask, mask, cpu_online_mask); - cpumask_clear_cpu(this_cpu, data->cpumask); + /* This BUG_ON verifies our reuse assertions and can be removed */ + BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); /* + * The global call function queue list add and delete are protected + * by a lock, but the list is traversed without any lock, relying + * on the rcu list add and delete to allow safe concurrent traversal. * We reuse the call function data without waiting for any grace * period after some other cpu removes it from the global queue. - * This means a cpu might find our data block as it is writen. 
- * The interrupt handler waits until it sees refs filled out - * while its cpu mask bit is set; here we may only clear our - * own cpu mask bit, and must wait to set refs until we are sure - * previous writes are complete and we have obtained the lock to - * add the element to the queue. + * This means a cpu might find our data block as it is being + * filled out. + * + * We hold off the interrupt handler on the other cpu by + * ordering our writes to the cpu mask vs our setting of the + * refs counter. We assert only the cpu owning the data block + * will set a bit in cpumask, and each bit will only be cleared + * by the subject cpu. Each cpu must first find its bit is + * set and then check that refs is set indicating the element is + * ready to be processed, otherwise it must skip the entry. + * + * On the previous iteration refs was set to 0 by another cpu. + * To avoid the use of transitivity, set the counter to 0 here + * so the wmb will pair with the rmb in the interrupt handler. */ + atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */ + + data->csd.func = func; + data->csd.info = info; + + /* Ensure 0 refs is visible before mask. Also orders func and info */ + smp_wmb(); + + /* We rely on the "and" being processed before the store */ + cpumask_and(data->cpumask, mask, cpu_online_mask); + cpumask_clear_cpu(this_cpu, data->cpumask); raw_spin_lock_irqsave(&call_function.lock, flags); /* @@ -509,8 +528,9 @@ void smp_call_function_many(const struct cpumask *mask, */ list_add_rcu(&data->csd.list, &call_function.queue); /* - * We rely on the wmb() in list_add_rcu to order the writes - * to func, data, and cpumask before this write to refs. + * We rely on the wmb() in list_add_rcu to complete our writes + * to the cpumask before this write to refs, which indicates + * data is on the list and is ready to be processed. */ atomic_set(&data->refs, cpumask_weight(data->cpumask)); raw_spin_unlock_irqrestore(&call_function.lock, flags); -- cgit v1.2.3 From 723aae25d5cdb09962901d36d526b44d4be1051c Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Tue, 15 Mar 2011 13:27:17 -0600 Subject: smp_call_function_many: handle concurrent clearing of mask Mike Galbraith reported finding a lockup ("perma-spin bug") where the cpumask passed to smp_call_function_many was cleared by other cpu(s) while a cpu was preparing its call_data block, resulting in no cpu to clear the last ref and unlock the block. Having cpus clear their bit asynchronously could be useful on a mask of cpus that might have a translation context, or cpus that need a push to complete an rcu window. Instead of adding a BUG_ON and requiring yet another cpumask copy, just detect the race and handle it. Note: arch_send_call_function_ipi_mask must still handle an empty cpumask because the data block is globally visible before the that arch callback is made. And (obviously) there are no guarantees to which cpus are notified if the mask is changed during the call; only cpus that were online and had their mask bit set during the whole call are guaranteed to be called. 
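The ordering argument running through the smp_call_function_many() fixes above boils down to a publish pattern: fill in the payload (func, info, cpumask), order those writes, then publish the refs count that readers test before touching anything. As a rough userspace analogue only (C11 release/acquire standing in for smp_wmb()/smp_rmb(); none of the names below come from the kernel code), the shape is:

    /* build with: cc -std=c11 -pthread publish.c */
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    /* Payload fields play the role of func/info/cpumask. */
    static int payload_a, payload_b;
    /* 'refs' plays the role of data->refs: non-zero means "ready". */
    static atomic_int refs;

    static void *producer(void *arg)
    {
        (void)arg;
        payload_a = 42;
        payload_b = 7;
        /* Release ordering: payload writes become visible before refs. */
        atomic_store_explicit(&refs, 2, memory_order_release);
        return NULL;
    }

    static void *consumer(void *arg)
    {
        (void)arg;
        /* Acquire ordering pairs with the release store above. */
        while (atomic_load_explicit(&refs, memory_order_acquire) == 0)
            ;   /* spin; the real consumer is the IPI handler */
        /* Safe: the payload is guaranteed to be initialised here. */
        printf("saw payload %d %d\n", payload_a, payload_b);
        return NULL;
    }

    int main(void)
    {
        pthread_t p, c;

        pthread_create(&c, NULL, consumer, NULL);
        pthread_create(&p, NULL, producer, NULL);
        pthread_join(p, NULL);
        pthread_join(c, NULL);
        return 0;
    }

The release store stands in for setting refs after the write barrier, and the acquire load stands in for the read barrier the interrupt handler relies on after finding its cpumask bit set.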
Reported-by: Mike Galbraith Reported-by: Jan Beulich Acked-by: Jan Beulich Cc: stable@kernel.org Signed-off-by: Milton Miller Signed-off-by: Linus Torvalds --- kernel/smp.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 7c6ded5effd9..954548906afb 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -450,7 +450,7 @@ void smp_call_function_many(const struct cpumask *mask, { struct call_function_data *data; unsigned long flags; - int cpu, next_cpu, this_cpu = smp_processor_id(); + int refs, cpu, next_cpu, this_cpu = smp_processor_id(); /* * Can deadlock when called with interrupts disabled. @@ -461,7 +461,7 @@ void smp_call_function_many(const struct cpumask *mask, WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() && !oops_in_progress && !early_boot_irqs_disabled); - /* So, what's a CPU they want? Ignoring this one. */ + /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */ cpu = cpumask_first_and(mask, cpu_online_mask); if (cpu == this_cpu) cpu = cpumask_next_and(cpu, mask, cpu_online_mask); @@ -519,6 +519,13 @@ void smp_call_function_many(const struct cpumask *mask, /* We rely on the "and" being processed before the store */ cpumask_and(data->cpumask, mask, cpu_online_mask); cpumask_clear_cpu(this_cpu, data->cpumask); + refs = cpumask_weight(data->cpumask); + + /* Some callers race with other cpus changing the passed mask */ + if (unlikely(!refs)) { + csd_unlock(&data->csd); + return; + } raw_spin_lock_irqsave(&call_function.lock, flags); /* @@ -532,7 +539,7 @@ void smp_call_function_many(const struct cpumask *mask, * to the cpumask before this write to refs, which indicates * data is on the list and is ready to be processed. */ - atomic_set(&data->refs, cpumask_weight(data->cpumask)); + atomic_set(&data->refs, refs); raw_spin_unlock_irqrestore(&call_function.lock, flags); /* -- cgit v1.2.3 From c8def554d031664e984323f6a5d667f070717776 Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Tue, 15 Mar 2011 13:27:17 -0600 Subject: smp_call_function_interrupt: use typedef and %pf Use the newly added smp_call_func_t in smp_call_function_interrupt for the func variable, and make the comment above the WARN more assertive and explicit. Also, func is a function pointer and does not need an offset, so use %pf not %pS. Signed-off-by: Milton Miller Signed-off-by: Linus Torvalds --- kernel/smp.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 954548906afb..7cbd0f293df4 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -194,7 +194,7 @@ void generic_smp_call_function_interrupt(void) */ list_for_each_entry_rcu(data, &call_function.queue, csd.list) { int refs; - void (*func) (void *info); + smp_call_func_t func; /* * Since we walk the list without any locks, we might @@ -214,17 +214,17 @@ void generic_smp_call_function_interrupt(void) if (atomic_read(&data->refs) == 0) continue; - func = data->csd.func; /* for later warn */ - data->csd.func(data->csd.info); + func = data->csd.func; /* save for later warn */ + func(data->csd.info); /* - * If the cpu mask is not still set then it enabled interrupts, - * we took another smp interrupt, and executed the function - * twice on this cpu. In theory that copy decremented refs. + * If the cpu mask is not still set then func enabled + * interrupts (BUG), and this cpu took another smp call + * function interrupt and executed func(info) twice + * on this cpu. 
That nested execution decremented refs. */ if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) { - WARN(1, "%pS enabled interrupts and double executed\n", - func); + WARN(1, "%pf enabled interrupts and double executed\n", func); continue; } -- cgit v1.2.3 From 1ef1d1c2353967e2d61ecaddf76edfd058a778b4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 18 Mar 2011 14:41:27 +0100 Subject: trace, filters: Initialize the match variable in process_ops() properly Make sure the 'match' variable always has a value. Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 3249b4f77ef0..8008ddcfbf20 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -391,8 +391,8 @@ static int process_ops(struct filter_pred *preds, struct filter_pred *op, void *rec) { struct filter_pred *pred; + int match = 0; int type; - int match; int i; /* -- cgit v1.2.3 From d57f078b193981d1b7d24193f3118c6b806db0ff Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 18 Mar 2011 16:54:31 +0000 Subject: KGDB: Notify GDB of machine halt, reboot or power off Notify GDB of the machine halting, rebooting or powering off by sending it an exited command (remote protocol command 'W'). This is done by calling: void gdbstub_exit(int status) from the arch's machine_{halt,restart,power_off}() functions with an appropriate exit status to be reported to GDB. Signed-off-by: David Howells --- kernel/debug/gdbstub.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 481a7bd2dfe7..a11db956dd62 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -1093,3 +1093,33 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd) put_packet(remcom_out_buffer); return 0; } + +/** + * gdbstub_exit - Send an exit message to GDB + * @status: The exit code to report. 
+ */ +void gdbstub_exit(int status) +{ + unsigned char checksum, ch, buffer[3]; + int loop; + + buffer[0] = 'W'; + buffer[1] = hex_asc_hi(status); + buffer[2] = hex_asc_lo(status); + + dbg_io_ops->write_char('$'); + checksum = 0; + + for (loop = 0; loop < 3; loop++) { + ch = buffer[loop]; + checksum += ch; + dbg_io_ops->write_char(ch); + } + + dbg_io_ops->write_char('#'); + dbg_io_ops->write_char(hex_asc_hi(checksum)); + dbg_io_ops->write_char(hex_asc_lo(checksum)); + + /* make sure the output is flushed, lest the bootloader clobber it */ + dbg_io_ops->flush(); +} -- cgit v1.2.3 From 16addf954d3954a72fd56abc02ffcba3c18529a1 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 18 Mar 2011 09:34:53 -0700 Subject: sched: Fix yield_to kernel-doc Add missing function parameters for yield_to(): Warning(kernel/sched.c:5470): No description found for parameter 'p' Warning(kernel/sched.c:5470): No description found for parameter 'preempt' Signed-off-by: Randy Dunlap Cc: Peter Zijlstra LKML-Reference: <20110318093453.8f7489a4.randy.dunlap@oracle.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 58d66ea7d200..052120d67706 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5467,6 +5467,8 @@ EXPORT_SYMBOL(yield); * yield_to - yield the current processor to another thread in * your thread group, or accelerate that thread toward the * processor it's on. + * @p: target task + * @preempt: whether task preemption is allowed or not * * It's the caller's job to ensure that the target task struct * can't go away on us before we can do any checks. -- cgit v1.2.3 From da48524eb20662618854bb3df2db01fc65f3070c Mon Sep 17 00:00:00 2001 From: Julien Tinnes Date: Fri, 18 Mar 2011 15:05:21 -0700 Subject: Prevent rt_sigqueueinfo and rt_tgsigqueueinfo from spoofing the signal code Userland should be able to trust the pid and uid of the sender of a signal if the si_code is SI_TKILL. Unfortunately, the kernel has historically allowed sigqueueinfo() to send any si_code at all (as long as it was negative - to distinguish it from kernel-generated signals like SIGILL etc), so it could spoof a SI_TKILL with incorrect siginfo values. Happily, it looks like glibc has always set si_code to the appropriate SI_QUEUE, so there are probably no actual user code that ever uses anything but the appropriate SI_QUEUE flag. So just tighten the check for si_code (we used to allow any negative value), and add a (one-time) warning in case there are binaries out there that might depend on using other si_code values. Signed-off-by: Julien Tinnes Acked-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/signal.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 4e3cff10fdce..31751868de88 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2421,9 +2421,13 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, return -EFAULT; /* Not even root can pretend to send signals from the kernel. - Nor can they impersonate a kill(), which adds source info. */ - if (info.si_code >= 0) + * Nor can they impersonate a kill()/tgkill(), which adds source info. + */ + if (info.si_code != SI_QUEUE) { + /* We used to allow any < 0 si_code */ + WARN_ON_ONCE(info.si_code < 0); return -EPERM; + } info.si_signo = sig; /* POSIX.1b doesn't mention process groups. 
*/ @@ -2437,9 +2441,13 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) return -EINVAL; /* Not even root can pretend to send signals from the kernel. - Nor can they impersonate a kill(), which adds source info. */ - if (info->si_code >= 0) + * Nor can they impersonate a kill()/tgkill(), which adds source info. + */ + if (info->si_code != SI_QUEUE) { + /* We used to allow any < 0 si_code */ + WARN_ON_ONCE(info->si_code < 0); return -EPERM; + } info->si_signo = sig; return do_send_specific(tgid, pid, sig, info); -- cgit v1.2.3 From 1106b6997df7d0c0487e21fd9c9dd2ce3d4a52db Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 16 Feb 2011 17:35:34 +0100 Subject: tracing: Fix set_ftrace_filter probe function display If one or more function probes (like traceon) are enabled, and there's no other function filter, the first probe func is skipped (which one depends on the position in the hash). $ echo sys_open:traceon sys_close:traceon > ./set_ftrace_filter $ cat set_ftrace_filter #### all functions enabled #### sys_close:traceon:unlimited $ The reason was, that in the case of no other function filter, the func_pos was not properly updated before calling t_hash_start. Signed-off-by: Jiri Olsa LKML-Reference: <1297874134-7008-1-git-send-email-jolsa@redhat.com> Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 888b611897d3..c075f4ea6b94 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1467,7 +1467,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) return t_hash_next(m, pos); (*pos)++; - iter->pos = *pos; + iter->pos = iter->func_pos = *pos; if (iter->flags & FTRACE_ITER_PRINTALL) return t_hash_start(m, pos); @@ -1502,7 +1502,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos) if (!rec) return t_hash_start(m, pos); - iter->func_pos = *pos; iter->func = rec; return iter; -- cgit v1.2.3 From 8d2587970b8bdf7c8d9208e3f4bb93182aef1a0f Mon Sep 17 00:00:00 2001 From: Phil Carmody Date: Tue, 22 Mar 2011 16:30:13 -0700 Subject: cgroups: if you list_empty() a head then don't list_del() it list_del() leaves poison in the prev and next pointers. The next list_empty() will compare those poisons, and say the list isn't empty. Any list operations that assume the node is on a list because of such a check will be fooled into dereferencing poison. One needs to INIT the node after the del, and fortunately there's already a wrapper for that - list_del_init(). Some of the dels are followed by deallocations, so can be ignored, and one can be merged with an add to make a move. Apart from that, I erred on the side of caution in making nodes list_empty()-queriable. Signed-off-by: Phil Carmody Reviewed-by: Paul Menage Cc: Li Zefan Acked-by: Kirill A. 
Shutemov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 95362d15128c..e31b220a743d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1813,10 +1813,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) /* Update the css_set linked lists if we're using them */ write_lock(&css_set_lock); - if (!list_empty(&tsk->cg_list)) { - list_del(&tsk->cg_list); - list_add(&tsk->cg_list, &newcg->tasks); - } + if (!list_empty(&tsk->cg_list)) + list_move(&tsk->cg_list, &newcg->tasks); write_unlock(&css_set_lock); for_each_subsys(root, ss) { @@ -3655,12 +3653,12 @@ again: spin_lock(&release_list_lock); set_bit(CGRP_REMOVED, &cgrp->flags); if (!list_empty(&cgrp->release_list)) - list_del(&cgrp->release_list); + list_del_init(&cgrp->release_list); spin_unlock(&release_list_lock); cgroup_lock_hierarchy(cgrp->root); /* delete this cgroup from parent->children */ - list_del(&cgrp->sibling); + list_del_init(&cgrp->sibling); cgroup_unlock_hierarchy(cgrp->root); d = dget(cgrp->dentry); @@ -3879,7 +3877,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) subsys[ss->subsys_id] = NULL; /* remove subsystem from rootnode's list of subsystems */ - list_del(&ss->sibling); + list_del_init(&ss->sibling); /* * disentangle the css from all css_sets attached to the dummytop. as @@ -4241,7 +4239,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) if (!list_empty(&tsk->cg_list)) { write_lock(&css_set_lock); if (!list_empty(&tsk->cg_list)) - list_del(&tsk->cg_list); + list_del_init(&tsk->cg_list); write_unlock(&css_set_lock); } -- cgit v1.2.3 From 504f52b5439aaf26d3e2c1d45ec10fce38c8dd27 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Mar 2011 16:30:41 -0700 Subject: mm: NUMA aware alloc_task_struct_node() All kthreads being created from a single helper task, they all use memory from a single node for their kernel stack and task struct. This patch suite creates kthread_create_on_cpu(), adding a 'cpu' parameter to parameters already used by kthread_create(). This parameter serves in allocating memory for the new kthread on its memory node if available. Users of this new function are : ksoftirqd, kworker, migration, pktgend... This patch: Add a node parameter to alloc_task_struct(), and change its name to alloc_task_struct_node() This change is needed to allow NUMA aware kthread_create_on_cpu() Signed-off-by: Eric Dumazet Acked-by: David S. 
Miller Reviewed-by: Andi Kleen Acked-by: Rusty Russell Cc: Tejun Heo Cc: Tony Luck Cc: Fenghua Yu Cc: David Howells Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 05b92c457010..cffbe8a4e1fc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -109,8 +109,10 @@ int nr_processes(void) } #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR -# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) -# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) +# define alloc_task_struct_node(node) \ + kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node) +# define free_task_struct(tsk) \ + kmem_cache_free(task_struct_cachep, (tsk)) static struct kmem_cache *task_struct_cachep; #endif @@ -249,12 +251,12 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) struct task_struct *tsk; struct thread_info *ti; unsigned long *stackend; - + int node = numa_node_id(); int err; prepare_to_copy(orig); - tsk = alloc_task_struct(); + tsk = alloc_task_struct_node(node); if (!tsk) return NULL; -- cgit v1.2.3 From b6a84016bd2598e35ead635147fa53619982648d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Mar 2011 16:30:42 -0700 Subject: mm: NUMA aware alloc_thread_info_node() Add a node parameter to alloc_thread_info(), and change its name to alloc_thread_info_node() This change is needed to allow NUMA aware kthread_create_on_cpu() Signed-off-by: Eric Dumazet Acked-by: David S. Miller Reviewed-by: Andi Kleen Acked-by: Rusty Russell Cc: Tejun Heo Cc: Tony Luck Cc: Fenghua Yu Cc: David Howells Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index cffbe8a4e1fc..cbc6adc6e891 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -117,14 +117,17 @@ static struct kmem_cache *task_struct_cachep; #endif #ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR -static inline struct thread_info *alloc_thread_info(struct task_struct *tsk) +static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, + int node) { #ifdef CONFIG_DEBUG_STACK_USAGE gfp_t mask = GFP_KERNEL | __GFP_ZERO; #else gfp_t mask = GFP_KERNEL; #endif - return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER); + struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER); + + return page ? page_address(page) : NULL; } static inline void free_thread_info(struct thread_info *ti) @@ -260,7 +263,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) if (!tsk) return NULL; - ti = alloc_thread_info(tsk); + ti = alloc_thread_info_node(tsk, node); if (!ti) { free_task_struct(tsk); return NULL; -- cgit v1.2.3 From 207205a2ba2655652fe46a60b49838af6c16a919 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Mar 2011 16:30:44 -0700 Subject: kthread: NUMA aware kthread_create_on_node() All kthreads being created from a single helper task, they all use memory from a single node for their kernel stack and task struct. This patch suite creates kthread_create_on_node(), adding a 'cpu' parameter to parameters already used by kthread_create(). This parameter serves in allocating memory for the new kthread on its memory node if possible. Signed-off-by: Eric Dumazet Acked-by: David S. 
Miller Reviewed-by: Andi Kleen Acked-by: Rusty Russell Cc: Tejun Heo Cc: Tony Luck Cc: Fenghua Yu Cc: David Howells Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 ++- kernel/kthread.c | 31 +++++++++++++++++++++++++------ 2 files changed, 27 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index cbc6adc6e891..a8f64f8ec7e1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -254,7 +255,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) struct task_struct *tsk; struct thread_info *ti; unsigned long *stackend; - int node = numa_node_id(); + int node = tsk_fork_get_node(orig); int err; prepare_to_copy(orig); diff --git a/kernel/kthread.c b/kernel/kthread.c index c55afba990a3..684ab3f7dd72 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -27,6 +27,7 @@ struct kthread_create_info /* Information passed to kthread() from kthreadd. */ int (*threadfn)(void *data); void *data; + int node; /* Result passed back to kthread_create() from kthreadd. */ struct task_struct *result; @@ -98,10 +99,23 @@ static int kthread(void *_create) do_exit(ret); } +/* called from do_fork() to get node information for about to be created task */ +int tsk_fork_get_node(struct task_struct *tsk) +{ +#ifdef CONFIG_NUMA + if (tsk == kthreadd_task) + return tsk->pref_node_fork; +#endif + return numa_node_id(); +} + static void create_kthread(struct kthread_create_info *create) { int pid; +#ifdef CONFIG_NUMA + current->pref_node_fork = create->node; +#endif /* We want our own signal handler (we take no signals by default). */ pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) { @@ -111,15 +125,18 @@ static void create_kthread(struct kthread_create_info *create) } /** - * kthread_create - create a kthread. + * kthread_create_on_node - create a kthread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. + * @node: memory node number. * @namefmt: printf-style name for the thread. * * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start * it. See also kthread_run(). * + * If thread is going to be bound on a particular cpu, give its node + * in @node, to get NUMA affinity for kthread stack, or else give -1. * When woken, the thread will run @threadfn() with @data as its * argument. @threadfn() can either call do_exit() directly if it is a * standalone thread for which noone will call kthread_stop(), or @@ -129,15 +146,17 @@ static void create_kthread(struct kthread_create_info *create) * * Returns a task_struct or ERR_PTR(-ENOMEM). */ -struct task_struct *kthread_create(int (*threadfn)(void *data), - void *data, - const char namefmt[], - ...) +struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), + void *data, + int node, + const char namefmt[], + ...) { struct kthread_create_info create; create.threadfn = threadfn; create.data = data; + create.node = node; init_completion(&create.done); spin_lock(&kthread_create_lock); @@ -164,7 +183,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), } return create.result; } -EXPORT_SYMBOL(kthread_create); +EXPORT_SYMBOL(kthread_create_on_node); /** * kthread_bind - bind a just-created kthread to a cpu. 
-- cgit v1.2.3 From 94dcf29a11b3d20a28790598d701f98484a969da Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Mar 2011 16:30:45 -0700 Subject: kthread: use kthread_create_on_node() ksoftirqd, kworker, migration, and pktgend kthreads can be created with kthread_create_on_node(), to get proper NUMA affinities for their stack and task_struct. Signed-off-by: Eric Dumazet Acked-by: David S. Miller Reviewed-by: Andi Kleen Acked-by: Rusty Russell Acked-by: Tejun Heo Cc: Tony Luck Cc: Fenghua Yu Cc: David Howells Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/softirq.c | 5 ++++- kernel/stop_machine.c | 6 ++++-- kernel/workqueue.c | 6 ++++-- 3 files changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 56e5dec837f0..735d87095172 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -845,7 +845,10 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); + p = kthread_create_on_node(run_ksoftirqd, + hcpu, + cpu_to_node(hotcpu), + "ksoftirqd/%d", hotcpu); if (IS_ERR(p)) { printk("ksoftirqd for %i failed\n", hotcpu); return notifier_from_errno(PTR_ERR(p)); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 2df820b03beb..e3516b29076c 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -301,8 +301,10 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, case CPU_UP_PREPARE: BUG_ON(stopper->thread || stopper->enabled || !list_empty(&stopper->works)); - p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", - cpu); + p = kthread_create_on_node(cpu_stopper_thread, + stopper, + cpu_to_node(cpu), + "migration/%d", cpu); if (IS_ERR(p)) return notifier_from_errno(PTR_ERR(p)); get_task_struct(p); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5ca7ce9ce754..04ef830690ec 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1366,8 +1366,10 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind) worker->id = id; if (!on_unbound_cpu) - worker->task = kthread_create(worker_thread, worker, - "kworker/%u:%d", gcwq->cpu, id); + worker->task = kthread_create_on_node(worker_thread, + worker, + cpu_to_node(gcwq->cpu), + "kworker/%u:%d", gcwq->cpu, id); else worker->task = kthread_create(worker_thread, worker, "kworker/u:%d", id); -- cgit v1.2.3 From d404ab0a1133e95557bb7deab2a49b348dfeba85 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Tue, 22 Mar 2011 16:34:04 -0700 Subject: move x86 specific oops=panic to generic code The oops=panic cmdline option is not x86 specific, move it to generic code. Update documentation. Signed-off-by: Olaf Hering Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 991bb87a1704..69231670eb95 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -433,3 +433,13 @@ EXPORT_SYMBOL(__stack_chk_fail); core_param(panic, panic_timeout, int, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); + +static int __init oops_setup(char *s) +{ + if (!s) + return -EINVAL; + if (!strcmp(s, "panic")) + panic_on_oops = 1; + return 0; +} +early_param("oops", oops_setup); -- cgit v1.2.3 From 34db18a054c600b6f81787165669dc572fe4de25 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Tue, 22 Mar 2011 16:34:06 -0700 Subject: smp: move smp setup functions to kernel/smp.c Move setup_nr_cpu_ids(), smp_init() and some other SMP boot parameter setup functions from init/main.c to kenrel/smp.c, saves some #ifdef CONFIG_SMP. Signed-off-by: WANG Cong Cc: Rakib Mullick Cc: David Howells Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Tejun Heo Cc: Arnd Bergmann Cc: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 7cbd0f293df4..73a195193558 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -604,6 +604,87 @@ void ipi_call_unlock_irq(void) } #endif /* USE_GENERIC_SMP_HELPERS */ +/* Setup configured maximum number of CPUs to activate */ +unsigned int setup_max_cpus = NR_CPUS; +EXPORT_SYMBOL(setup_max_cpus); + + +/* + * Setup routine for controlling SMP activation + * + * Command-line option of "nosmp" or "maxcpus=0" will disable SMP + * activation entirely (the MPS table probe still happens, though). + * + * Command-line option of "maxcpus=", where is an integer + * greater than 0, limits the maximum number of CPUs activated in + * SMP mode to . + */ + +void __weak arch_disable_smp_support(void) { } + +static int __init nosmp(char *str) +{ + setup_max_cpus = 0; + arch_disable_smp_support(); + + return 0; +} + +early_param("nosmp", nosmp); + +/* this is hard limit */ +static int __init nrcpus(char *str) +{ + int nr_cpus; + + get_option(&str, &nr_cpus); + if (nr_cpus > 0 && nr_cpus < nr_cpu_ids) + nr_cpu_ids = nr_cpus; + + return 0; +} + +early_param("nr_cpus", nrcpus); + +static int __init maxcpus(char *str) +{ + get_option(&str, &setup_max_cpus); + if (setup_max_cpus == 0) + arch_disable_smp_support(); + + return 0; +} + +early_param("maxcpus", maxcpus); + +/* Setup number of possible processor ids */ +int nr_cpu_ids __read_mostly = NR_CPUS; +EXPORT_SYMBOL(nr_cpu_ids); + +/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */ +void __init setup_nr_cpu_ids(void) +{ + nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; +} + +/* Called by boot processor to activate the rest. */ +void __init smp_init(void) +{ + unsigned int cpu; + + /* FIXME: This should be done in userspace --RR */ + for_each_present_cpu(cpu) { + if (num_online_cpus() >= setup_max_cpus) + break; + if (!cpu_online(cpu)) + cpu_up(cpu); + } + + /* Any cleanup work */ + printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); + smp_cpus_done(setup_max_cpus); +} + /* * Call a function on all processors. May be used during early boot while * early_boot_irqs_disabled is set. 
Use local_irq_save/restore() instead -- cgit v1.2.3 From 4d51985e484dd11d9047dfcd1278ec9ccfb435d5 Mon Sep 17 00:00:00 2001 From: Michael Rodriguez Date: Tue, 22 Mar 2011 16:34:07 -0700 Subject: kernel/cpu.c: fix many errors related to style. Change the printk() calls to have the KERN_INFO/KERN_ERROR stuff, and fixes other coding style errors. Not _all_ of them are gone, though. [akpm@linux-foundation.org: revert the bits I disagree with] Signed-off-by: Michael Rodriguez Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 156cc5556140..c95fc4df0faa 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -160,7 +160,6 @@ static void cpu_notify_nofail(unsigned long val, void *v) { BUG_ON(cpu_notify(val, v)); } - EXPORT_SYMBOL(register_cpu_notifier); void __ref unregister_cpu_notifier(struct notifier_block *nb) @@ -205,7 +204,6 @@ static int __ref take_cpu_down(void *_param) return err; cpu_notify(CPU_DYING | param->mod, param->hcpu); - return 0; } @@ -227,6 +225,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) return -EINVAL; cpu_hotplug_begin(); + err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err) { nr_calls--; @@ -304,7 +303,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); if (ret) { nr_calls--; - printk("%s: attempt to bring up CPU %u failed\n", + printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", __func__, cpu); goto out_notify; } @@ -450,14 +449,14 @@ void __ref enable_nonboot_cpus(void) if (cpumask_empty(frozen_cpus)) goto out; - printk("Enabling non-boot CPUs ...\n"); + printk(KERN_INFO "Enabling non-boot CPUs ...\n"); arch_enable_nonboot_cpus_begin(); for_each_cpu(cpu, frozen_cpus) { error = _cpu_up(cpu, 1); if (!error) { - printk("CPU%d is up\n", cpu); + printk(KERN_INFO "CPU%d is up\n", cpu); continue; } printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); @@ -509,7 +508,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu) */ /* cpu_bit_bitmap[0] is empty - so we can back into it */ -#define MASK_DECLARE_1(x) [x+1][0] = 1UL << (x) +#define MASK_DECLARE_1(x) [x+1][0] = (1UL << (x)) #define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1) #define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2) #define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4) -- cgit v1.2.3 From 9bfb23fc4a481650e60d22dbe84c0fd5a9d49bba Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 22 Mar 2011 16:34:09 -0700 Subject: sys_unshare: remove the dead CLONE_THREAD/SIGHAND/VM code Cleanup: kill the dead code which does nothing but complicates the code and confuses the reader. sys_unshare(CLONE_THREAD/SIGHAND/VM) is not really implemented, and I doubt very much it will ever work. At least, nobody even tried since the original 99d1419d96d7df9cfa56 ("unshare system call -v5: system call handler function") was applied more than 4 years ago. And the code is not consistent. unshare_thread() always fails unconditionally, while unshare_sighand() and unshare_vm() pretend to work if there is nothing to unshare. Remove unshare_thread(), unshare_sighand(), unshare_vm() helpers and related variables and add a simple CLONE_THREAD | CLONE_SIGHAND| CLONE_VM check into check_unshare_flags(). Also, move the "CLONE_NEWNS needs CLONE_FS" check from check_unshare_flags() to sys_unshare(). 
This looks more consistent and matches the similar do_sysvsem check in sys_unshare(). Note: with or without this patch "atomic_read(mm->mm_users) > 1" can give a false positive due to get_task_mm(). Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Cc: Janak Desai Cc: Daniel Lezcano Cc: "Eric W. Biederman" Cc: KOSAKI Motohiro Cc: Alexey Dobriyan Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 123 ++++++++++++---------------------------------------------- 1 file changed, 25 insertions(+), 98 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a8f64f8ec7e1..f2b494d7c557 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1519,38 +1519,24 @@ void __init proc_caches_init(void) } /* - * Check constraints on flags passed to the unshare system call and - * force unsharing of additional process context as appropriate. + * Check constraints on flags passed to the unshare system call. */ -static void check_unshare_flags(unsigned long *flags_ptr) +static int check_unshare_flags(unsigned long unshare_flags) { + if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| + CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| + CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) + return -EINVAL; /* - * If unsharing a thread from a thread group, must also - * unshare vm. - */ - if (*flags_ptr & CLONE_THREAD) - *flags_ptr |= CLONE_VM; - - /* - * If unsharing vm, must also unshare signal handlers. - */ - if (*flags_ptr & CLONE_VM) - *flags_ptr |= CLONE_SIGHAND; - - /* - * If unsharing namespace, must also unshare filesystem information. + * Not implemented, but pretend it works if there is nothing to + * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND + * needs to unshare vm. */ - if (*flags_ptr & CLONE_NEWNS) - *flags_ptr |= CLONE_FS; -} - -/* - * Unsharing of tasks created with CLONE_THREAD is not supported yet - */ -static int unshare_thread(unsigned long unshare_flags) -{ - if (unshare_flags & CLONE_THREAD) - return -EINVAL; + if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) { + /* FIXME: get_task_mm() increments ->mm_users */ + if (atomic_read(¤t->mm->mm_users) > 1) + return -EINVAL; + } return 0; } @@ -1576,34 +1562,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) return 0; } -/* - * Unsharing of sighand is not supported yet - */ -static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) -{ - struct sighand_struct *sigh = current->sighand; - - if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1) - return -EINVAL; - else - return 0; -} - -/* - * Unshare vm if it is being shared - */ -static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp) -{ - struct mm_struct *mm = current->mm; - - if ((unshare_flags & CLONE_VM) && - (mm && atomic_read(&mm->mm_users) > 1)) { - return -EINVAL; - } - - return 0; -} - /* * Unshare file descriptor table if it is being shared */ @@ -1632,23 +1590,21 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp */ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) { - int err = 0; struct fs_struct *fs, *new_fs = NULL; - struct sighand_struct *new_sigh = NULL; - struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; struct files_struct *fd, *new_fd = NULL; struct nsproxy *new_nsproxy = NULL; int do_sysvsem = 0; + int err; - check_unshare_flags(&unshare_flags); - - /* Return -EINVAL for all unsupported flags */ - err = -EINVAL; - if 
(unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| - CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| - CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) + err = check_unshare_flags(unshare_flags); + if (err) goto bad_unshare_out; + /* + * If unsharing namespace, must also unshare filesystem information. + */ + if (unshare_flags & CLONE_NEWNS) + unshare_flags |= CLONE_FS; /* * CLONE_NEWIPC must also detach from the undolist: after switching * to a new ipc namespace, the semaphore arrays from the old @@ -1656,21 +1612,15 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) */ if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) do_sysvsem = 1; - if ((err = unshare_thread(unshare_flags))) - goto bad_unshare_out; if ((err = unshare_fs(unshare_flags, &new_fs))) - goto bad_unshare_cleanup_thread; - if ((err = unshare_sighand(unshare_flags, &new_sigh))) - goto bad_unshare_cleanup_fs; - if ((err = unshare_vm(unshare_flags, &new_mm))) - goto bad_unshare_cleanup_sigh; + goto bad_unshare_out; if ((err = unshare_fd(unshare_flags, &new_fd))) - goto bad_unshare_cleanup_vm; + goto bad_unshare_cleanup_fs; if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs))) goto bad_unshare_cleanup_fd; - if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) { + if (new_fs || new_fd || do_sysvsem || new_nsproxy) { if (do_sysvsem) { /* * CLONE_SYSVSEM is equivalent to sys_exit(). @@ -1696,19 +1646,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) spin_unlock(&fs->lock); } - if (new_mm) { - mm = current->mm; - active_mm = current->active_mm; - current->mm = new_mm; - current->active_mm = new_mm; - if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { - atomic_dec(&mm->oom_disable_count); - atomic_inc(&new_mm->oom_disable_count); - } - activate_mm(active_mm, new_mm); - new_mm = mm; - } - if (new_fd) { fd = current->files; current->files = new_fd; @@ -1725,20 +1662,10 @@ bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); -bad_unshare_cleanup_vm: - if (new_mm) - mmput(new_mm); - -bad_unshare_cleanup_sigh: - if (new_sigh) - if (atomic_dec_and_test(&new_sigh->count)) - kmem_cache_free(sighand_cachep, new_sigh); - bad_unshare_cleanup_fs: if (new_fs) free_fs_struct(new_fs); -bad_unshare_cleanup_thread: bad_unshare_out: return err; } -- cgit v1.2.3 From fef2c9bc1b54c0261324a96e948c0b849796e896 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 22 Mar 2011 16:34:16 -0700 Subject: kernel/watchdog.c: allow hardlockup to panic by default When a cpu is considered stuck, instead of limping along and just printing a warning, it is sometimes preferred to just panic, let kdump capture the vmcore and reboot. This gets the machine back into a stable state quickly while saving the info that got it into a stuck state to begin with. Add a Kconfig option to allow users to set the hardlockup to panic by default. Also add in a 'nmi_watchdog=nopanic' to override this. 
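To make the accepted forms of the boot parameter explicit, here is a userspace rendering of the option handling this patch wires up through early_param(); it is illustrative only, and the initial values below merely stand in for the new Kconfig default.

    #include <stdio.h>
    #include <string.h>

    static int hardlockup_panic = 1;    /* stand-in for CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE */
    static int watchdog_enabled = 1;

    /* Mirrors the nmi_watchdog= parsing: panic, nopanic, or 0 (disable). */
    static void parse_nmi_watchdog(const char *str)
    {
        if (!strncmp(str, "panic", 5))
            hardlockup_panic = 1;
        else if (!strncmp(str, "nopanic", 7))
            hardlockup_panic = 0;
        else if (!strncmp(str, "0", 1))
            watchdog_enabled = 0;
    }

    int main(void)
    {
        const char *opts[] = { "panic", "nopanic", "0" };
        unsigned int i;

        for (i = 0; i < sizeof(opts) / sizeof(opts[0]); i++) {
            hardlockup_panic = 1;
            watchdog_enabled = 1;
            parse_nmi_watchdog(opts[i]);
            printf("nmi_watchdog=%-7s -> hardlockup_panic=%d watchdog_enabled=%d\n",
                   opts[i], hardlockup_panic, watchdog_enabled);
        }
        return 0;
    }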
[akpm@linux-foundation.org: fix strncmp length] Signed-off-by: Don Zickus Acked-by: Peter Zijlstra Reviewed-by: WANG Cong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 18bb15776c57..054a67cca9da 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -48,12 +48,15 @@ static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); * Should we panic when a soft-lockup or hard-lockup occurs: */ #ifdef CONFIG_HARDLOCKUP_DETECTOR -static int hardlockup_panic; +static int hardlockup_panic = + CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; static int __init hardlockup_panic_setup(char *str) { if (!strncmp(str, "panic", 5)) hardlockup_panic = 1; + else if (!strncmp(str, "nopanic", 7)) + hardlockup_panic = 0; else if (!strncmp(str, "0", 1)) watchdog_enabled = 0; return 1; -- cgit v1.2.3 From f99a99330f85a84c346ddeb4adc72dbfad9b9e3e Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 22 Mar 2011 16:34:17 -0700 Subject: kernel/watchdog.c: always return NOTIFY_OK during cpu up/down events This patch addresses a couple of problems. One was the case when the hardlockup failed to start, it also failed to start the softlockup. There were valid cases when the hardlockup shouldn't start and that shouldn't block the softlockup (no lapic, bios controls perf counters). The second problem was when the hardlockup failed to start on boxes (from a no lapic or bios controlled perf counter case), it reported failure to the cpu notifier chain. This blocked the notifier from continuing to start other more critical pieces of cpu bring-up (in our case based on a 2.6.32 fork, it was the mce). As a result, during soft cpu online/offline testing, the system would panic when a cpu was offlined because the cpu notifier would succeed in processing a watchdog disable cpu event and would panic in the mce case as a result of un-initialized variables from a never executed cpu up event. I realized the hardlockup/softlockup cases are really just debugging aids and should never impede the progress of a cpu up/down event. Therefore I modified the code to always return NOTIFY_OK and instead rely on printks to inform the user of problems. 
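The "soft cpu online/offline testing" referred to above is driven from sysfs. A minimal sketch of one such round trip is below; it is illustrative only and assumes root, the conventional sysfs layout, and that cpu1 exists and is hotpluggable (cpu0 often is not).

    #include <stdio.h>
    #include <unistd.h>

    /* Write "0" or "1" to /sys/devices/system/cpu/cpu<N>/online. */
    static int set_cpu_online(int cpu, int online)
    {
        char path[64];
        FILE *f;

        snprintf(path, sizeof(path),
                 "/sys/devices/system/cpu/cpu%d/online", cpu);
        f = fopen(path, "w");
        if (!f) {
            perror(path);
            return -1;
        }
        fprintf(f, "%d\n", online);
        /* The buffered write hits sysfs on close; check it. */
        if (fclose(f) != 0) {
            perror(path);
            return -1;
        }
        return 0;
    }

    int main(void)
    {
        int cpu = 1;

        if (set_cpu_online(cpu, 0) == 0) {
            printf("cpu%d offlined\n", cpu);
            sleep(1);
            if (set_cpu_online(cpu, 1) == 0)
                printf("cpu%d back online\n", cpu);
        }
        return 0;
    }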
Signed-off-by: Don Zickus Acked-by: Peter Zijlstra Reviewed-by: WANG Cong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 054a67cca9da..140dce750450 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -418,19 +418,22 @@ static int watchdog_prepare_cpu(int cpu) static int watchdog_enable(int cpu) { struct task_struct *p = per_cpu(softlockup_watchdog, cpu); - int err; + int err = 0; /* enable the perf event */ err = watchdog_nmi_enable(cpu); - if (err) - return err; + + /* Regardless of err above, fall through and start softlockup */ /* create the watchdog thread */ if (!p) { p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); if (IS_ERR(p)) { printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); - return PTR_ERR(p); + if (!err) + /* if hardlockup hasn't already set this */ + err = PTR_ERR(p); + goto out; } kthread_bind(p, cpu); per_cpu(watchdog_touch_ts, cpu) = 0; @@ -438,7 +441,8 @@ static int watchdog_enable(int cpu) wake_up_process(p); } - return 0; +out: + return err; } static void watchdog_disable(int cpu) @@ -550,7 +554,13 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #endif /* CONFIG_HOTPLUG_CPU */ } - return notifier_from_errno(err); + + /* + * hardlockup and softlockup are not important enough + * to block cpu bring up. Just always succeed and + * rely on printk output to flag problems. + */ + return NOTIFY_OK; } static struct notifier_block __cpuinitdata cpu_nfb = { -- cgit v1.2.3 From 7bf693951a8e5f7e600a45b74d91d962a453146e Mon Sep 17 00:00:00 2001 From: "Fabio M. Di Nitto" Date: Tue, 22 Mar 2011 16:34:20 -0700 Subject: console: allow to retain boot console via boot option keep_bootcon On some architectures, the boot process involves de-registering the boot console (early boot), initialize drivers and then re-register the console. This mechanism introduces a window in which no printk can happen on the console and messages are buffered and then printed once the new console is available. If a kernel crashes during this window, all it's left on the boot console is "console [foo] enabled, bootconsole disabled" making debug of the crash rather 'interesting'. By adding "keep_bootcon" option, do not unregister the boot console, that will allow to printk everything that is happening up to the crash. The option is clearly meant only for debugging purposes as it introduces lots of duplicated info printed on console, but will make bug report from users easier as it doesn't require a kernel build just to figure out where we crash. Signed-off-by: Fabio M. Di Nitto Acked-by: David S. 
Miller Cc: Alan Cox Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 33284adb2189..2b591f252e55 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1316,6 +1316,18 @@ void console_start(struct console *console) } EXPORT_SYMBOL(console_start); +static int __read_mostly keep_bootcon; + +static int __init keep_bootcon_setup(char *str) +{ + keep_bootcon = 1; + printk(KERN_INFO "debug: skip boot console de-registration.\n"); + + return 0; +} + +early_param("keep_bootcon", keep_bootcon_setup); + /* * The console driver calls this routine during kernel initialization * to register the console printing procedure with printk() and to @@ -1463,7 +1475,9 @@ void register_console(struct console *newcon) * users know there might be something in the kernel's log buffer that * went to the bootconsole (that they do not see on the real console) */ - if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { + if (bcon && + ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && + !keep_bootcon) { /* we need to iterate through twice, to make sure we print * everything out, before we unregister the console(s) */ -- cgit v1.2.3 From fe3d8ad31cf51b062bbb8a9609eeb1d0c41a7f30 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 22 Mar 2011 16:34:21 -0700 Subject: console: prevent registered consoles from dumping old kernel message over again For a platform with many consoles like: "console=tty1 console=ttyMFD2 console=ttyS0 earlyprintk=mrst" Each time when the non "selected_console" (tty1 and ttyMFD2 here) get registered, the existing kernel message will be printed out on registered consoles again, the "mrst" early console will get some same message for 3 times, and "tty1" will get some for twice. As suggested by Andrew Morton, every time a new console is registered, it will be set as the "exclusive" console which will dump the already existing kernel messages. Signed-off-by: Feng Tang Cc: Greg KH Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 2b591f252e55..a53607eea6d0 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -112,6 +112,11 @@ static unsigned log_start; /* Index into log_buf: next char to be read by syslog static unsigned con_start; /* Index into log_buf: next char to be sent to consoles */ static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */ +/* + * If exclusive_console is non-NULL then only this console is to be printed to. 
+ */ +static struct console *exclusive_console; + /* * Array of consoles built from command line options (console=) */ @@ -476,6 +481,8 @@ static void __call_console_drivers(unsigned start, unsigned end) struct console *con; for_each_console(con) { + if (exclusive_console && con != exclusive_console) + continue; if ((con->flags & CON_ENABLED) && con->write && (cpu_online(smp_processor_id()) || (con->flags & CON_ANYTIME))) @@ -1230,6 +1237,11 @@ void console_unlock(void) local_irq_restore(flags); } console_locked = 0; + + /* Release the exclusive_console once it is used */ + if (unlikely(exclusive_console)) + exclusive_console = NULL; + up(&console_sem); spin_unlock_irqrestore(&logbuf_lock, flags); if (wake_klogd) @@ -1464,6 +1476,12 @@ void register_console(struct console *newcon) spin_lock_irqsave(&logbuf_lock, flags); con_start = log_start; spin_unlock_irqrestore(&logbuf_lock, flags); + /* + * We're about to replay the log buffer. Only do this to the + * just-registered console to avoid excessive message spam to + * the already-registered consoles. + */ + exclusive_console = newcon; } console_unlock(); console_sysfs_notify(); -- cgit v1.2.3 From 9f36e2c448007b54851e7e4fa48da97d1477a175 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 22 Mar 2011 16:34:22 -0700 Subject: printk: use %pK for /proc/kallsyms and /proc/modules In an effort to reduce kernel address leaks that might be used to help target kernel privilege escalation exploits, this patch uses %pK when displaying addresses in /proc/kallsyms, /proc/modules, and /sys/module/*/sections/*. Note that this changes %x to %p, so some legitimately 0 values in /proc/kallsyms would have changed from 00000000 to "(null)". To avoid this, "(null)" is not used when using the "K" format. Anything that was already successfully parsing "(null)" in addition to full hex digits should have no problem with this change. (Thanks to Joe Perches for the suggestion.) Due to the %x to %p, "void *" casts are needed since these addresses are already "unsigned long" everywhere internally, due to their starting life as ELF section offsets. Signed-off-by: Kees Cook Cc: Eugene Teo Cc: Dan Rosenberg Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 10 ++++------ kernel/module.c | 4 ++-- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 6f6d091b5757..75dcca37d61a 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -477,13 +477,11 @@ static int s_show(struct seq_file *m, void *p) */ type = iter->exported ? 
toupper(iter->type) : tolower(iter->type); - seq_printf(m, "%0*lx %c %s\t[%s]\n", - (int)(2 * sizeof(void *)), - iter->value, type, iter->name, iter->module_name); + seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value, + type, iter->name, iter->module_name); } else - seq_printf(m, "%0*lx %c %s\n", - (int)(2 * sizeof(void *)), - iter->value, iter->type, iter->name); + seq_printf(m, "%pK %c %s\n", (void *)iter->value, + iter->type, iter->name); return 0; } diff --git a/kernel/module.c b/kernel/module.c index efa290ea94bf..1f9f7bc56ca1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1168,7 +1168,7 @@ static ssize_t module_sect_show(struct module_attribute *mattr, { struct module_sect_attr *sattr = container_of(mattr, struct module_sect_attr, mattr); - return sprintf(buf, "0x%lx\n", sattr->address); + return sprintf(buf, "0x%pK\n", (void *)sattr->address); } static void free_sect_attrs(struct module_sect_attrs *sect_attrs) @@ -3224,7 +3224,7 @@ static int m_show(struct seq_file *m, void *p) mod->state == MODULE_STATE_COMING ? "Loading": "Live"); /* Used by oprofile and other similar tools. */ - seq_printf(m, " 0x%p", mod->module_core); + seq_printf(m, " 0x%pK", mod->module_core); /* Taints info */ if (mod->taints) -- cgit v1.2.3 From 5af5bcb8d37f99ba415a1adc6da71051b84f93a5 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Tue, 22 Mar 2011 16:34:23 -0700 Subject: printk: allow setting DEFAULT_MESSAGE_LEVEL via Kconfig We've been burned by regressions/bugs which we later realized could have been triaged quicker if only we'd paid closer attention to dmesg. To make it easier to audit dmesg, we'd like to make DEFAULT_MESSAGE_LEVEL Kconfig-settable. That way we can set it to KERN_NOTICE and audit any messages <= KERN_WARNING. Signed-off-by: Mandeep Singh Baines Cc: Ingo Molnar Cc: Joe Perches Cc: Olof Johansson Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index a53607eea6d0..da8ca817eae3 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -53,7 +53,7 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) /* printk's without a loglevel use this.. */ -#define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */ +#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL /* We show everything that is MORE important than this.. 
*/ #define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ -- cgit v1.2.3 From 20dd67407160eac577656cd2f8ee9a1fead960b8 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 23 Mar 2011 13:17:23 +0200 Subject: sched: Remove unused 'rq' variable and cpu_rq() call from alloc_fair_sched_group() Signed-off-by: Sergey Senozhatsky Cc: Steven Rostedt Cc: Peter Zijlstra LKML-Reference: <20110323111722.GA4244@swordfish.minsk.epam.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 052120d67706..a361e20ec2cd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8443,7 +8443,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct cfs_rq *cfs_rq; struct sched_entity *se; - struct rq *rq; int i; tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -8456,8 +8455,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) tg->shares = NICE_0_LOAD; for_each_possible_cpu(i) { - rq = cpu_rq(i); - cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) -- cgit v1.2.3 From dec2960827c85253d76938dbfa909df3be34958b Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 23 Mar 2011 14:38:28 +0200 Subject: lockdep: Remove unused 'factor' variable from lockdep_stats_show() Signed-off-by: Sergey Senozhatsky Cc: Peter Zijlstra LKML-Reference: <20110323123828.GB4244@swordfish.minsk.epam.com> Signed-off-by: Ingo Molnar --- kernel/lockdep_proc.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 1969d2fc4b36..71edd2f60c02 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -225,7 +225,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v) nr_irq_read_safe = 0, nr_irq_read_unsafe = 0, nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0, nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, - sum_forward_deps = 0, factor = 0; + sum_forward_deps = 0; list_for_each_entry(class, &all_lock_classes, lock_entry) { @@ -283,13 +283,6 @@ static int lockdep_stats_show(struct seq_file *m, void *v) nr_hardirq_unsafe * nr_hardirq_safe + nr_list_entries); - /* - * Estimated factor between direct and indirect - * dependencies: - */ - if (nr_list_entries) - factor = sum_forward_deps / nr_list_entries; - #ifdef CONFIG_PROVE_LOCKING seq_printf(m, " dependency chains: %11lu [max: %lu]\n", nr_lock_chains, MAX_LOCKDEP_CHAINS); -- cgit v1.2.3 From 1232d6132a986125f6a687ab9b61a4330e319270 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 22 Mar 2011 18:46:18 +0100 Subject: sched, doc: Update sched-design-CFS.txt Correct ->dequeue_tree() thinko into sched_class->dequeue_task and drop all references to ->task_new() since it is obviously gone. 
Signed-off-by: Borislav Petkov Cc: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <1300815978-16618-1-git-send-email-bp@amd64.org> Signed-off-by: Ingo Molnar --- kernel/sched_idletask.c | 2 -- kernel/sched_stoptask.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index c82f26c1b7c3..a776a6396427 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -94,6 +94,4 @@ static const struct sched_class idle_sched_class = { .prio_changed = prio_changed_idle, .switched_to = switched_to_idle, - - /* no .task_new for idle tasks */ }; diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 84ec9bcf82d9..1ba2bd40fdac 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c @@ -102,6 +102,4 @@ static const struct sched_class stop_sched_class = { .prio_changed = prio_changed_stop, .switched_to = switched_to_stop, - - /* no .task_new for stop tasks */ }; -- cgit v1.2.3 From 68cacd29167b1926d237bd1b153aa2a990201729 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 23 Mar 2011 16:03:06 +0100 Subject: perf_events: Fix stale ->cgrp pointer in update_cgrp_time_from_cpuctx() This patch solves a stale pointer problem in update_cgrp_time_from_cpuctx(). The cpuctx->cgrp was not cleared on all possible event exit paths, including: close() perf_release() perf_release_kernel() list_del_event() This patch fixes list_del_event() to clear cpuctx->cgrp when there are no cgroup events left in the context. [ This second version makes the code compile when CONFIG_CGROUP_PERF is not enabled. We unconditionally define perf_cpu_context->cgrp. ] Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Cc: perfmon2-devel@lists.sf.net Cc: paulus@samba.org Cc: davem@davemloft.net LKML-Reference: <20110323150306.GA1580@quad> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 3472bb1a070c..0c714226ae0c 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -941,6 +941,7 @@ static void perf_group_attach(struct perf_event *event) static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { + struct perf_cpu_context *cpuctx; /* * We can have double detach due to exit/hot-unplug + close. */ @@ -949,8 +950,17 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; - if (is_cgroup_event(event)) + if (is_cgroup_event(event)) { ctx->nr_cgroups--; + cpuctx = __get_cpu_context(ctx); + /* + * if there are no more cgroup events + * then cler cgrp to avoid stale pointer + * in update_cgrp_time_from_cpuctx() + */ + if (!ctx->nr_cgroups) + cpuctx->cgrp = NULL; + } ctx->nr_events--; if (event->attr.inherit_stat) -- cgit v1.2.3 From 3b9038912828384e38d82409c281124631c8533b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Mar 2011 00:24:11 +0100 Subject: genirq; Remove the last leftovers of the old sparse irq code All users converted. Get rid of it. 
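Any straggling caller would now open-code what the deleted wrapper did, roughly as follows (sketch derived from the removed function shown in the hunk below):

	/* was: desc = irq_to_desc_alloc_node(irq, node); */
	int res = irq_alloc_descs(irq, irq, 1, node);
	struct irq_desc *desc = (res == irq || res == -EEXIST)
				? irq_to_desc(irq) : NULL;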
Signed-off-by: Thomas Gleixner --- kernel/irq/irqdesc.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index dbccc799407f..6fb014f172f7 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -198,15 +198,6 @@ err: return -ENOMEM; } -struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) -{ - int res = irq_alloc_descs(irq, irq, 1, node); - - if (res == -EEXIST || res == irq) - return irq_to_desc(irq); - return NULL; -} - static int irq_expand_nr_irqs(unsigned int nr) { if (nr > IRQ_BITMAP_BITS) @@ -283,11 +274,6 @@ struct irq_desc *irq_to_desc(unsigned int irq) return (irq < NR_IRQS) ? irq_desc + irq : NULL; } -struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) -{ - return irq_to_desc(irq); -} - static void free_desc(unsigned int irq) { dynamic_irq_cleanup(irq); -- cgit v1.2.3 From 880f57318450dbead6a03f9e31a1468924d6dd88 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 23 Mar 2011 19:29:39 +0100 Subject: perf: Better fit max unprivileged mlock pages for tools needs The maximum kilobytes of locked memory that an unprivileged user can reserve is of 512 kB = 128 pages by default, scaled to the number of onlined CPUs, which fits well with the tools that use 128 data pages by default. However tools actually use 129 pages, because they need one more for the user control page. Thus the default mlock threshold is not sufficient for the default tools needs and we always end up to evaluate the constant mlock rlimit policy, which doesn't have this scaling with the number of online CPUs. Hence, on systems that have more than 16 CPUs, we overlap the rlimit threshold and fail to mmap: $ perf record ls Error: failed to mmap with 1 (Operation not permitted) Just increase the max unprivileged mlock threshold by one page so that it supports well perf tools even after 16 CPUs. Reported-by: Han Pingtian Reported-by: Peter Zijlstra Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Frederic Weisbecker Acked-by: Arnaldo Carvalho de Melo Cc: Stephane Eranian Cc: Stable LKML-Reference: <1300904979-5508-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 0c714226ae0c..c75925c4d1e2 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -145,7 +145,8 @@ static struct srcu_struct pmus_srcu; */ int sysctl_perf_event_paranoid __read_mostly = 1; -int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ +/* Minimum for 128 pages + 1 for the user control page */ +int sysctl_perf_event_mlock __read_mostly = 516; /* 'free' kb per user */ /* * max perf event sample rate -- cgit v1.2.3 From cae5d39032acf26c265f6b1dc73d7ce6ff4bc387 Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Sun, 13 Mar 2011 15:49:17 -0400 Subject: mm: arch: rename in_gate_area_no_task to in_gate_area_no_mm Now that gate vma's are referenced with respect to a particular mm and not a particular task it only makes sense to propagate the change to this predicate as well. Signed-off-by: Stephen Wilson Reviewed-by: Michel Lespinasse Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Signed-off-by: Al Viro --- kernel/kallsyms.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 6f6d091b5757..b9d0fd1d21c7 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -64,14 +64,14 @@ static inline int is_kernel_text(unsigned long addr) if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || arch_is_kernel_text(addr)) return 1; - return in_gate_area_no_task(addr); + return in_gate_area_no_mm(addr); } static inline int is_kernel(unsigned long addr) { if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) return 1; - return in_gate_area_no_task(addr); + return in_gate_area_no_mm(addr); } static int is_ksym_addr(unsigned long addr) -- cgit v1.2.3 From e1a85b2c519551d4792180cdab4074d7e99bf2c9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 23 Mar 2011 22:16:04 +0100 Subject: timekeeping: Use syscore_ops instead of sysdev class and sysdev The timekeeping subsystem uses a sysdev class and a sysdev for executing timekeeping_suspend() after interrupts have been turned off on the boot CPU (during system suspend) and for executing timekeeping_resume() before turning on interrupts on the boot CPU (during system resume). However, since both of these functions ignore their arguments, the entire mechanism may be replaced with a struct syscore_ops object which is simpler. Signed-off-by: Rafael J. Wysocki Reviewed-by: Thomas Gleixner --- kernel/time/timekeeping.c | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3bd7e3d5c632..8ad5d576755e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include @@ -597,13 +597,12 @@ static struct timespec timekeeping_suspend_time; /** * timekeeping_resume - Resumes the generic timekeeping subsystem. - * @dev: unused * * This is for the generic clocksource timekeeping. * xtime/wall_to_monotonic/jiffies/etc are * still managed by arch specific suspend/resume code. 
*/ -static int timekeeping_resume(struct sys_device *dev) +static void timekeeping_resume(void) { unsigned long flags; struct timespec ts; @@ -632,11 +631,9 @@ static int timekeeping_resume(struct sys_device *dev) /* Resume hrtimers */ hres_timers_resume(); - - return 0; } -static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) +static int timekeeping_suspend(void) { unsigned long flags; @@ -654,26 +651,18 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) } /* sysfs resume/suspend bits for timekeeping */ -static struct sysdev_class timekeeping_sysclass = { - .name = "timekeeping", +static struct syscore_ops timekeeping_syscore_ops = { .resume = timekeeping_resume, .suspend = timekeeping_suspend, }; -static struct sys_device device_timer = { - .id = 0, - .cls = &timekeeping_sysclass, -}; - -static int __init timekeeping_init_device(void) +static int __init timekeeping_init_ops(void) { - int error = sysdev_class_register(&timekeeping_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; + register_syscore_ops(&timekeeping_syscore_ops); + return 0; } -device_initcall(timekeeping_init_device); +device_initcall(timekeeping_init_ops); /* * If the error is already larger, we look ahead even further -- cgit v1.2.3 From 6c191cd01a935e5b53ef43c9403c771bb7a32b60 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 23 Mar 2011 16:42:18 -0700 Subject: memcg: res_counter_read_u64(): fix potential races on 32-bit machines res_counter_read_u64 reads u64 value without lock. It's dangerous in a 32bit environment. Add locking. Signed-off-by: KAMEZAWA Hiroyuki Cc: Greg Thelen Cc: Johannes Weiner Cc: David Rientjes Cc: KOSAKI Motohiro Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/res_counter.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/res_counter.c b/kernel/res_counter.c index c7eaa37a768b..34683efa2cce 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -126,10 +126,24 @@ ssize_t res_counter_read(struct res_counter *counter, int member, pos, buf, s - buf); } +#if BITS_PER_LONG == 32 +u64 res_counter_read_u64(struct res_counter *counter, int member) +{ + unsigned long flags; + u64 ret; + + spin_lock_irqsave(&counter->lock, flags); + ret = *res_counter_member(counter, member); + spin_unlock_irqrestore(&counter->lock, flags); + + return ret; +} +#else u64 res_counter_read_u64(struct res_counter *counter, int member) { return *res_counter_member(counter, member); } +#endif int res_counter_memparse_write_strategy(const char *buf, unsigned long long *res) -- cgit v1.2.3 From 6b3ae58efca06623c197fd6d91ded4aa3a8fe039 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Mar 2011 16:42:30 -0700 Subject: memcg: remove direct page_cgroup-to-page pointer In struct page_cgroup, we have a full word for flags but only a few are reserved. Use the remaining upper bits to encode, depending on configuration, the node or the section, to enable page_cgroup-to-page lookups without a direct pointer. This saves a full word for every page in a system with memory cgroups enabled. 
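A rough sketch of the encoding idea (helper names and exact layout here are assumptions; the hunk below only touches kernel/bounds.c, the real helpers live outside kernel/): keep the true flag bits in the low NR_PCG_FLAGS bits and stash the node or section id above them, so the owning page can be recomputed rather than stored.

	#define EXAMPLE_PCG_ID_SHIFT	NR_PCG_FLAGS	/* first bit above the flag bits */

	static inline void example_set_pcg_node(struct page_cgroup *pc, int nid)
	{
		pc->flags &= (1UL << EXAMPLE_PCG_ID_SHIFT) - 1;			/* keep flags */
		pc->flags |= (unsigned long)nid << EXAMPLE_PCG_ID_SHIFT;	/* encode node */
	}

	static inline int example_pcg_node(struct page_cgroup *pc)
	{
		return pc->flags >> EXAMPLE_PCG_ID_SHIFT;
	}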
Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Minchan Kim Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/bounds.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bounds.c b/kernel/bounds.c index 98a51f26c136..0c9b862292b2 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -9,11 +9,13 @@ #include #include #include +#include void foo(void) { /* The enum constants to put into include/generated/bounds.h */ DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); + DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); /* End of constants */ } -- cgit v1.2.3 From 9303e0c4814d2a6afca878cc35433291e862169c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 23 Mar 2011 16:42:45 -0700 Subject: cpuset: remove unneeded NODEMASK_ALLOC() in cpuset_sprintf_memlist() It's not necessary to copy cpuset->mems_allowed to a buffer allocated by NODEMASK_ALLOC(). Just pass it to nodelist_scnprintf(). As spotted by Paul, a side effect is we fix a bug that the function can return -ENOMEM but the caller doesn't expect negative return value. Therefore change the return value of cpuset_sprintf_cpulist() and cpuset_sprintf_memlist() from int to size_t. Signed-off-by: Li Zefan Acked-by: Paul Menage Acked-by: David Rientjes Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e92e98189032..4683fe728c9b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1610,34 +1610,26 @@ out: * across a page fault. */ -static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) +static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs) { - int ret; + size_t count; mutex_lock(&callback_mutex); - ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); + count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); mutex_unlock(&callback_mutex); - return ret; + return count; } -static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) +static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs) { - NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL); - int retval; - - if (mask == NULL) - return -ENOMEM; + size_t count; mutex_lock(&callback_mutex); - *mask = cs->mems_allowed; + count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed); mutex_unlock(&callback_mutex); - retval = nodelist_scnprintf(page, PAGE_SIZE, *mask); - - NODEMASK_FREE(mask); - - return retval; + return count; } static ssize_t cpuset_common_file_read(struct cgroup *cont, -- cgit v1.2.3 From c8163ca8afcac0fc54593fc60d1e1110edbd0eb2 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 23 Mar 2011 16:42:46 -0700 Subject: cpuset: remove unneeded NODEMASK_ALLOC() in cpuset_attach() oldcs->mems_allowed is not modified during cpuset_attach(), so we don't have to copy it to a buffer allocated by NODEMASK_ALLOC(). Just pass it to cpuset_migrate_mm(). 
Signed-off-by: Li Zefan Cc: Paul Menage Acked-by: David Rientjes Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4683fe728c9b..7f384f4013b2 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1438,10 +1438,9 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); - NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL); NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL); - if (from == NULL || to == NULL) + if (to == NULL) goto alloc_fail; if (cs == &top_cpuset) { @@ -1463,18 +1462,16 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, } /* change mm; only needs to be done once even if threadgroup */ - *from = oldcs->mems_allowed; *to = cs->mems_allowed; mm = get_task_mm(tsk); if (mm) { mpol_rebind_mm(mm, to); if (is_memory_migrate(cs)) - cpuset_migrate_mm(mm, from, to); + cpuset_migrate_mm(mm, &oldcs->mems_allowed, to); mmput(mm); } alloc_fail: - NODEMASK_FREE(from); NODEMASK_FREE(to); } -- cgit v1.2.3 From ee24d3797780eee6ffe581a7b78d27896f9b494a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 23 Mar 2011 16:42:47 -0700 Subject: cpuset: fix unchecked calls to NODEMASK_ALLOC() Those functions that use NODEMASK_ALLOC() can't propagate errno to users, but will fail silently. Fix it by using a static nodemask_t variable for each function, and those variables are protected by cgroup_mutex; [akpm@linux-foundation.org: fix comment spelling, strengthen cgroup_lock comment] Signed-off-by: Li Zefan Cc: Paul Menage Acked-by: David Rientjes Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 51 ++++++++++++++++----------------------------------- 1 file changed, 16 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7f384f4013b2..e472fe139192 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1015,17 +1015,12 @@ static void cpuset_change_nodemask(struct task_struct *p, struct cpuset *cs; int migrate; const nodemask_t *oldmem = scan->data; - NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL); - - if (!newmems) - return; + static nodemask_t newmems; /* protected by cgroup_mutex */ cs = cgroup_cs(scan->cg); - guarantee_online_mems(cs, newmems); + guarantee_online_mems(cs, &newmems); - cpuset_change_task_nodemask(p, newmems); - - NODEMASK_FREE(newmems); + cpuset_change_task_nodemask(p, &newmems); mm = get_task_mm(p); if (!mm) @@ -1438,41 +1433,35 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); - NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL); - - if (to == NULL) - goto alloc_fail; + static nodemask_t to; /* protected by cgroup_mutex */ if (cs == &top_cpuset) { cpumask_copy(cpus_attach, cpu_possible_mask); } else { guarantee_online_cpus(cs, cpus_attach); } - guarantee_online_mems(cs, to); + guarantee_online_mems(cs, &to); /* do per-task migration stuff possibly for each in the threadgroup */ - cpuset_attach_task(tsk, to, cs); + cpuset_attach_task(tsk, &to, cs); if (threadgroup) { struct task_struct *c; rcu_read_lock(); list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - cpuset_attach_task(c, to, cs); + cpuset_attach_task(c, &to, cs); } rcu_read_unlock(); } /* change 
mm; only needs to be done once even if threadgroup */ - *to = cs->mems_allowed; + to = cs->mems_allowed; mm = get_task_mm(tsk); if (mm) { - mpol_rebind_mm(mm, to); + mpol_rebind_mm(mm, &to); if (is_memory_migrate(cs)) - cpuset_migrate_mm(mm, &oldcs->mems_allowed, to); + cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to); mmput(mm); } - -alloc_fail: - NODEMASK_FREE(to); } /* The various types of files and directories in a cpuset file system */ @@ -2055,10 +2044,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) struct cpuset *cp; /* scans cpusets being updated */ struct cpuset *child; /* scans child cpusets of cp */ struct cgroup *cont; - NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); - - if (oldmems == NULL) - return; + static nodemask_t oldmems; /* protected by cgroup_mutex */ list_add_tail((struct list_head *)&root->stack_list, &queue); @@ -2075,7 +2061,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) continue; - *oldmems = cp->mems_allowed; + oldmems = cp->mems_allowed; /* Remove offline cpus and mems from this cpuset. */ mutex_lock(&callback_mutex); @@ -2091,10 +2077,9 @@ static void scan_for_empty_cpusets(struct cpuset *root) remove_tasks_in_empty_cpuset(cp); else { update_tasks_cpumask(cp, NULL); - update_tasks_nodemask(cp, oldmems, NULL); + update_tasks_nodemask(cp, &oldmems, NULL); } } - NODEMASK_FREE(oldmems); } /* @@ -2136,19 +2121,16 @@ void cpuset_update_active_cpus(void) static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { - NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); - - if (oldmems == NULL) - return NOTIFY_DONE; + static nodemask_t oldmems; /* protected by cgroup_mutex */ cgroup_lock(); switch (action) { case MEM_ONLINE: - *oldmems = top_cpuset.mems_allowed; + oldmems = top_cpuset.mems_allowed; mutex_lock(&callback_mutex); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; mutex_unlock(&callback_mutex); - update_tasks_nodemask(&top_cpuset, oldmems, NULL); + update_tasks_nodemask(&top_cpuset, &oldmems, NULL); break; case MEM_OFFLINE: /* @@ -2162,7 +2144,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self, } cgroup_unlock(); - NODEMASK_FREE(oldmems); return NOTIFY_OK; } #endif -- cgit v1.2.3 From 523fb486bfd94e3a3b16a42bcb21b1959cf14df8 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 23 Mar 2011 16:42:48 -0700 Subject: cpuset: hold callback_mutex in cpuset_post_clone() Chaning cpuset->mems/cpuset->cpus should be protected under callback_mutex. cpuset_clone() doesn't follow this rule. It's ok because it's called when creating and initializing a cgroup, but we'd better hold the lock to avoid subtil break in the future. 
Signed-off-by: Li Zefan Acked-by: Paul Menage Acked-by: David Rientjes Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e472fe139192..33eee16addb8 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1840,8 +1840,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss, cs = cgroup_cs(cgroup); parent_cs = cgroup_cs(parent); + mutex_lock(&callback_mutex); cs->mems_allowed = parent_cs->mems_allowed; cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed); + mutex_unlock(&callback_mutex); return; } -- cgit v1.2.3 From 814ecf6e5b7854504ae83255173e53836c5d8420 Mon Sep 17 00:00:00 2001 From: Denis Kirjanov Date: Wed, 23 Mar 2011 16:43:08 -0700 Subject: sysctl_check: drop table->procname checks Since the for loop checks for the table->procname drop useless table->procname checks inside the loop body Signed-off-by: Denis Kirjanov Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl_check.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index 10b90d8a03c4..3a01c3e46494 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -111,11 +111,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) const char *fail = NULL; if (table->parent) { - if (table->procname && !table->parent->procname) + if (!table->parent->procname) set_fail(&fail, table, "Parent without procname"); } - if (!table->procname) - set_fail(&fail, table, "No procname"); if (table->child) { if (table->data) set_fail(&fail, table, "Directory with data?"); @@ -144,7 +142,7 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) set_fail(&fail, table, "No maxlen"); } #ifdef CONFIG_PROC_SYSCTL - if (table->procname && !table->proc_handler) + if (!table->proc_handler) set_fail(&fail, table, "No proc_handler"); #endif #if 0 -- cgit v1.2.3 From 256c53a65128cbc8a766b1503f3f25a52a8d07cb Mon Sep 17 00:00:00 2001 From: Denis Kirjanov Date: Wed, 23 Mar 2011 16:43:08 -0700 Subject: sysctl_check: drop dead code Drop dead code. Signed-off-by: Denis Kirjanov Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl_check.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index 3a01c3e46494..4e4932a7b360 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -144,10 +144,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) #ifdef CONFIG_PROC_SYSCTL if (!table->proc_handler) set_fail(&fail, table, "No proc_handler"); -#endif -#if 0 - if (!table->procname && table->proc_handler) - set_fail(&fail, table, "proc_handler without procname"); #endif sysctl_check_leaf(namespaces, table, &fail); } -- cgit v1.2.3 From cb16e95fa2996743a6e80a665ed2ed0590bd38cf Mon Sep 17 00:00:00 2001 From: Petr Holasek Date: Wed, 23 Mar 2011 16:43:09 -0700 Subject: sysctl: add some missing input constraint checks Add boundaries of allowed input ranges for: dirty_expire_centisecs, drop_caches, overcommit_memory, page-cluster and panic_on_oom. 
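The mechanism is the stock proc_dointvec_minmax() handler, which rejects writes outside the [extra1, extra2] range with -EINVAL. A generic sketch with a made-up knob (not one of the entries changed by this patch):

	static int example_zero, example_hundred = 100;
	static int example_percent;			/* hypothetical tunable */

	static struct ctl_table example_table[] = {
		{
			.procname	= "example_percent",
			.data		= &example_percent,
			.maxlen		= sizeof(int),
			.mode		= 0644,
			.proc_handler	= proc_dointvec_minmax,
			.extra1		= &example_zero,	/* floor */
			.extra2		= &example_hundred,	/* ceiling */
		},
		{ }
	};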
Signed-off-by: Petr Holasek Acked-by: Dave Young Cc: David Rientjes Cc: Wu Fengguang Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 40245d697602..97ab1690f5ed 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -117,6 +117,7 @@ static int neg_one = -1; static int zero; static int __maybe_unused one = 1; static int __maybe_unused two = 2; +static int __maybe_unused three = 3; static unsigned long one_ul = 1; static int one_hundred = 100; #ifdef CONFIG_PRINTK @@ -971,14 +972,18 @@ static struct ctl_table vm_table[] = { .data = &sysctl_overcommit_memory, .maxlen = sizeof(sysctl_overcommit_memory), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, }, { .procname = "panic_on_oom", .data = &sysctl_panic_on_oom, .maxlen = sizeof(sysctl_panic_on_oom), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, }, { .procname = "oom_kill_allocating_task", @@ -1006,7 +1011,8 @@ static struct ctl_table vm_table[] = { .data = &page_cluster, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, }, { .procname = "dirty_background_ratio", @@ -1054,7 +1060,8 @@ static struct ctl_table vm_table[] = { .data = &dirty_expire_interval, .maxlen = sizeof(dirty_expire_interval), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, }, { .procname = "nr_pdflush_threads", @@ -1130,6 +1137,8 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = drop_caches_sysctl_handler, + .extra1 = &one, + .extra2 = &three, }, #ifdef CONFIG_COMPACTION { -- cgit v1.2.3 From bfdc0b497faa82a0ba2f9dddcf109231dd519fcc Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Wed, 23 Mar 2011 16:43:11 -0700 Subject: sysctl: restrict write access to dmesg_restrict When dmesg_restrict is set to 1 CAP_SYS_ADMIN is needed to read the kernel ring buffer. But a root user without CAP_SYS_ADMIN is able to reset dmesg_restrict to 0. This is an issue when e.g. LXC (Linux Containers) are used and complete user space is running without CAP_SYS_ADMIN. A unprivileged and jailed root user can bypass the dmesg_restrict protection. With this patch writing to dmesg_restrict is only allowed when root has CAP_SYS_ADMIN. Signed-off-by: Richard Weinberger Acked-by: Dan Rosenberg Acked-by: Serge E. 
Hallyn Cc: Eric Paris Cc: Kees Cook Cc: James Morris Cc: Eugene Teo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 97ab1690f5ed..c0bb32414b17 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -170,6 +170,11 @@ static int proc_taint(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif +#ifdef CONFIG_PRINTK +static int proc_dmesg_restrict(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +#endif + #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses it's own private copy */ static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; @@ -707,7 +712,7 @@ static struct ctl_table kern_table[] = { .data = &kptr_restrict, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dmesg_restrict, .extra1 = &zero, .extra2 = &two, }, @@ -2394,6 +2399,17 @@ static int proc_taint(struct ctl_table *table, int write, return err; } +#ifdef CONFIG_PRINTK +static int proc_dmesg_restrict(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} +#endif + struct do_proc_dointvec_minmax_conv_param { int *min; int *max; -- cgit v1.2.3 From 45a68628d37222e655219febce9e91b6484789b2 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 23 Mar 2011 16:43:12 -0700 Subject: pid: remove the child_reaper special case in init/main.c This patchset is a cleanup and a preparation to unshare the pid namespace. These prerequisites prepare for Eric's patchset to give a file descriptor to a namespace and join an existing namespace. This patch: It turns out that the existing assignment in copy_process of the child_reaper can handle the initial assignment of child_reaper we just need to generalize the test in kernel/fork.c Signed-off-by: Eric W. Biederman Signed-off-by: Daniel Lezcano Cc: Oleg Nesterov Cc: Alexey Dobriyan Acked-by: Serge E. Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index f2b494d7c557..17aed4378eda 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1296,7 +1296,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, tracehook_finish_clone(p, clone_flags, trace); if (thread_group_leader(p)) { - if (clone_flags & CLONE_NEWPID) + if (is_child_reaper(pid)) p->nsproxy->pid_ns->child_reaper = p; p->signal->leader_pid = pid; -- cgit v1.2.3 From 4308eebbeb2026827d4492ce8c23d99f7f144a82 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 23 Mar 2011 16:43:13 -0700 Subject: pidns: call pid_ns_prepare_proc() from create_pid_namespace() Reorganize proc_get_sb() so it can be called before the struct pid of the first process is allocated. Signed-off-by: Eric W. Biederman Signed-off-by: Daniel Lezcano Cc: Oleg Nesterov Cc: Alexey Dobriyan Acked-by: Serge E. 
Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 6 ------ kernel/pid_namespace.c | 11 +++++++++-- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 17aed4378eda..457fff2e17e0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1187,12 +1187,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, pid = alloc_pid(p->nsproxy->pid_ns); if (!pid) goto bad_fork_cleanup_io; - - if (clone_flags & CLONE_NEWPID) { - retval = pid_ns_prepare_proc(p->nsproxy->pid_ns); - if (retval < 0) - goto bad_fork_free_pid; - } } p->pid = pid_nr(pid); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a5aff94e1f0b..e9c9adc84ca6 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -14,6 +14,7 @@ #include #include #include +#include #define BITS_PER_PAGE (PAGE_SIZE*8) @@ -72,7 +73,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p { struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; - int i; + int i, err = -ENOMEM; ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); if (ns == NULL) @@ -96,14 +97,20 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p for (i = 1; i < PIDMAP_ENTRIES; i++) atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); + err = pid_ns_prepare_proc(ns); + if (err) + goto out_put_parent_pid_ns; + return ns; +out_put_parent_pid_ns: + put_pid_ns(parent_pid_ns); out_free_map: kfree(ns->pidmap[0].page); out_free: kmem_cache_free(pid_ns_cachep, ns); out: - return ERR_PTR(-ENOMEM); + return ERR_PTR(err); } static void destroy_pid_namespace(struct pid_namespace *ns) -- cgit v1.2.3 From 59607db367c57f515183cb203642291bb14d9c40 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 23 Mar 2011 16:43:16 -0700 Subject: userns: add a user_namespace as creator/owner of uts_namespace The expected course of development for user namespaces targeted capabilities is laid out at https://wiki.ubuntu.com/UserNamespace. Goals: - Make it safe for an unprivileged user to unshare namespaces. They will be privileged with respect to the new namespace, but this should only include resources which the unprivileged user already owns. - Provide separate limits and accounting for userids in different namespaces. Status: Currently (as of 2.6.38) you can clone with the CLONE_NEWUSER flag to get a new user namespace if you have the CAP_SYS_ADMIN, CAP_SETUID, and CAP_SETGID capabilities. What this gets you is a whole new set of userids, meaning that user 500 will have a different 'struct user' in your namespace than in other namespaces. So any accounting information stored in struct user will be unique to your namespace. However, throughout the kernel there are checks which - simply check for a capability. Since root in a child namespace has all capabilities, this means that a child namespace is not constrained. - simply compare uid1 == uid2. Since these are the integer uids, uid 500 in namespace 1 will be said to be equal to uid 500 in namespace 2. As a result, the lxc implementation at lxc.sf.net does not use user namespaces. This is actually helpful because it leaves us free to develop user namespaces in such a way that, for some time, user namespaces may be unuseful. Bugs aside, this patchset is supposed to not at all affect systems which are not actively using user namespaces, and only restrict what tasks in child user namespace can do. 
They begin to limit privilege to a user namespace, so that root in a container cannot kill or ptrace tasks in the parent user namespace, and can only get world access rights to files. Since all files currently belong to the initila user namespace, that means that child user namespaces can only get world access rights to *all* files. While this temporarily makes user namespaces bad for system containers, it starts to get useful for some sandboxing. I've run the 'runltplite.sh' with and without this patchset and found no difference. This patch: copy_process() handles CLONE_NEWUSER before the rest of the namespaces. So in the case of clone(CLONE_NEWUSER|CLONE_NEWUTS) the new uts namespace will have the new user namespace as its owner. That is what we want, since we want root in that new userns to be able to have privilege over it. Changelog: Feb 15: don't set uts_ns->user_ns if we didn't create a new uts_ns. Feb 23: Move extern init_user_ns declaration from init/version.c to utsname.h. Signed-off-by: Serge E. Hallyn Acked-by: "Eric W. Biederman" Acked-by: Daniel Lezcano Acked-by: David Howells Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/nsproxy.c | 5 +++++ kernel/user.c | 8 ++++++-- kernel/utsname.c | 4 ++++ 3 files changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index f74e6c00e26d..034dc2ed13ac 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -74,6 +74,11 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, err = PTR_ERR(new_nsp->uts_ns); goto out_uts; } + if (new_nsp->uts_ns != tsk->nsproxy->uts_ns) { + put_user_ns(new_nsp->uts_ns->user_ns); + new_nsp->uts_ns->user_ns = task_cred_xxx(tsk, user)->user_ns; + get_user_ns(new_nsp->uts_ns->user_ns); + } new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); if (IS_ERR(new_nsp->ipc_ns)) { diff --git a/kernel/user.c b/kernel/user.c index 5c598ca781df..9e03e9c1df8d 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -17,9 +17,13 @@ #include #include +/* + * userns count is 1 for root user, 1 for init_uts_ns, + * and 1 for... ? + */ struct user_namespace init_user_ns = { .kref = { - .refcount = ATOMIC_INIT(2), + .refcount = ATOMIC_INIT(3), }, .creator = &root_user, }; @@ -47,7 +51,7 @@ static struct kmem_cache *uid_cachep; */ static DEFINE_SPINLOCK(uidhash_lock); -/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */ +/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */ struct user_struct root_user = { .__count = ATOMIC_INIT(2), .processes = ATOMIC_INIT(1), diff --git a/kernel/utsname.c b/kernel/utsname.c index 8a82b4b8ea52..a7b3a8d1ad24 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -14,6 +14,7 @@ #include #include #include +#include static struct uts_namespace *create_uts_ns(void) { @@ -40,6 +41,8 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); + ns->user_ns = old_ns->user_ns; + get_user_ns(ns->user_ns); up_read(&uts_sem); return ns; } @@ -71,5 +74,6 @@ void free_uts_ns(struct kref *kref) struct uts_namespace *ns; ns = container_of(kref, struct uts_namespace, kref); + put_user_ns(ns->user_ns); kfree(ns); } -- cgit v1.2.3 From 3486740a4f32a6a466f5ac931654d154790ba648 Mon Sep 17 00:00:00 2001 From: "Serge E. 
Hallyn" Date: Wed, 23 Mar 2011 16:43:17 -0700 Subject: userns: security: make capabilities relative to the user namespace - Introduce ns_capable to test for a capability in a non-default user namespace. - Teach cap_capable to handle capabilities in a non-default user namespace. The motivation is to get to the unprivileged creation of new namespaces. It looks like this gets us 90% of the way there, with only potential uid confusion issues left. I still need to handle getting all caps after creation but otherwise I think I have a good starter patch that achieves all of your goals. Changelog: 11/05/2010: [serge] add apparmor 12/14/2010: [serge] fix capabilities to created user namespaces Without this, if user serge creates a user_ns, he won't have capabilities to the user_ns he created. THis is because we were first checking whether his effective caps had the caps he needed and returning -EPERM if not, and THEN checking whether he was the creator. Reverse those checks. 12/16/2010: [serge] security_real_capable needs ns argument in !security case 01/11/2011: [serge] add task_ns_capable helper 01/11/2011: [serge] add nsown_capable() helper per Bastian Blank suggestion 02/16/2011: [serge] fix a logic bug: the root user is always creator of init_user_ns, but should not always have capabilities to it! Fix the check in cap_capable(). 02/21/2011: Add the required user_ns parameter to security_capable, fixing a compile failure. 02/23/2011: Convert some macros to functions as per akpm comments. Some couldn't be converted because we can't easily forward-declare them (they are inline if !SECURITY, extern if SECURITY). Add a current_user_ns function so we can use it in capability.h without #including cred.h. Move all forward declarations together to the top of the #ifdef __KERNEL__ section, and use kernel-doc format. 02/23/2011: Per dhowells, clean up comment in cap_capable(). 02/23/2011: Per akpm, remove unreachable 'return -EPERM' in cap_capable. (Original written and signed off by Eric; latest, modified version acked by him) [akpm@linux-foundation.org: fix build] [akpm@linux-foundation.org: export current_user_ns() for ecryptfs] [serge.hallyn@canonical.com: remove unneeded extra argument in selinux's task_has_capability] Signed-off-by: Eric W. Biederman Signed-off-by: Serge E. Hallyn Acked-by: "Eric W. Biederman" Acked-by: Daniel Lezcano Acked-by: David Howells Cc: James Morris Signed-off-by: Serge E. Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/capability.c | 42 +++++++++++++++++++++++++++++++++++++----- kernel/cred.c | 6 ++++++ 2 files changed, 43 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 9e9385f132c8..0a3d2c863a1c 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -14,6 +14,7 @@ #include #include #include +#include #include /* @@ -299,17 +300,48 @@ error: * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ -int capable(int cap) +bool capable(int cap) +{ + return ns_capable(&init_user_ns, cap); +} +EXPORT_SYMBOL(capable); + +/** + * ns_capable - Determine if the current task has a superior capability in effect + * @ns: The usernamespace we want the capability in + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. 
+ * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. + */ +bool ns_capable(struct user_namespace *ns, int cap) { if (unlikely(!cap_valid(cap))) { printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap); BUG(); } - if (security_capable(current_cred(), cap) == 0) { + if (security_capable(ns, current_cred(), cap) == 0) { current->flags |= PF_SUPERPRIV; - return 1; + return true; } - return 0; + return false; } -EXPORT_SYMBOL(capable); +EXPORT_SYMBOL(ns_capable); + +/** + * task_ns_capable - Determine whether current task has a superior + * capability targeted at a specific task's user namespace. + * @t: The task whose user namespace is targeted. + * @cap: The capability in question. + * + * Return true if it does, false otherwise. + */ +bool task_ns_capable(struct task_struct *t, int cap) +{ + return ns_capable(task_cred_xxx(t, user)->user_ns, cap); +} +EXPORT_SYMBOL(task_ns_capable); diff --git a/kernel/cred.c b/kernel/cred.c index 2343c132c5a7..5557b55048df 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -741,6 +741,12 @@ int set_create_files_as(struct cred *new, struct inode *inode) } EXPORT_SYMBOL(set_create_files_as); +struct user_namespace *current_user_ns(void) +{ + return _current_user_ns(); +} +EXPORT_SYMBOL(current_user_ns); + #ifdef CONFIG_DEBUG_CREDENTIALS bool creds_are_invalid(const struct cred *cred) -- cgit v1.2.3 From bb96a6f50be27390dc959ff67d9ea0ea0cfbe177 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 23 Mar 2011 16:43:18 -0700 Subject: userns: allow sethostname in a container Changelog: Feb 23: let clone_uts_ns() handle setting uts->user_ns To do so we need to pass in the task_struct who'll get the utsname, so we can get its user_ns. Feb 23: As per Oleg's coment, just pass in tsk, instead of two of its members. Signed-off-by: Serge E. Hallyn Acked-by: "Eric W. 
Biederman" Acked-by: Daniel Lezcano Acked-by: David Howells Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/nsproxy.c | 7 +------ kernel/sys.c | 2 +- kernel/utsname.c | 12 +++++++----- 3 files changed, 9 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 034dc2ed13ac..b97fc9d04ddf 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -69,16 +69,11 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_ns; } - new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns); + new_nsp->uts_ns = copy_utsname(flags, tsk); if (IS_ERR(new_nsp->uts_ns)) { err = PTR_ERR(new_nsp->uts_ns); goto out_uts; } - if (new_nsp->uts_ns != tsk->nsproxy->uts_ns) { - put_user_ns(new_nsp->uts_ns->user_ns); - new_nsp->uts_ns->user_ns = task_cred_xxx(tsk, user)->user_ns; - get_user_ns(new_nsp->uts_ns->user_ns); - } new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); if (IS_ERR(new_nsp->ipc_ns)) { diff --git a/kernel/sys.c b/kernel/sys.c index 1ad48b3b9068..5761c53e19e3 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1181,7 +1181,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) int errno; char tmp[__NEW_UTS_LEN]; - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; diff --git a/kernel/utsname.c b/kernel/utsname.c index a7b3a8d1ad24..44646179eaba 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -31,7 +31,8 @@ static struct uts_namespace *create_uts_ns(void) * @old_ns: namespace to clone * Return NULL on error (failure to kmalloc), new ns otherwise */ -static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) +static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, + struct uts_namespace *old_ns) { struct uts_namespace *ns; @@ -41,8 +42,7 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); - ns->user_ns = old_ns->user_ns; - get_user_ns(ns->user_ns); + ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns); up_read(&uts_sem); return ns; } @@ -53,8 +53,10 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) * utsname of this process won't be seen by parent, and vice * versa. */ -struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *old_ns) +struct uts_namespace *copy_utsname(unsigned long flags, + struct task_struct *tsk) { + struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; struct uts_namespace *new_ns; BUG_ON(!old_ns); @@ -63,7 +65,7 @@ struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *ol if (!(flags & CLONE_NEWUTS)) return old_ns; - new_ns = clone_uts_ns(old_ns); + new_ns = clone_uts_ns(tsk, old_ns); put_uts_ns(old_ns); return new_ns; -- cgit v1.2.3 From 39fd33933b0209e4b6254743f2cede07c5ad4c52 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 23 Mar 2011 16:43:19 -0700 Subject: userns: allow killing tasks in your own or child userns Changelog: Dec 8: Fixed bug in my check_kill_permission pointed out by Eric Biederman. Dec 13: Apply Eric's suggestion to pass target task into kill_ok_by_cred() for clarity Dec 31: address comment by Eric Biederman: don't need cred/tcred in check_kill_permission. Jan 1: use const cred struct. Jan 11: Per Bastian Blank's advice, clean up kill_ok_by_cred(). 
Feb 16: kill_ok_by_cred: fix bad parentheses Feb 23: per akpm, let compiler inline kill_ok_by_cred Signed-off-by: Serge E. Hallyn Acked-by: "Eric W. Biederman" Acked-by: Daniel Lezcano Acked-by: David Howells Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 31751868de88..324eff5468ad 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -635,6 +635,27 @@ static inline bool si_fromuser(const struct siginfo *info) (!is_si_special(info) && SI_FROMUSER(info)); } +/* + * called with RCU read lock from check_kill_permission() + */ +static int kill_ok_by_cred(struct task_struct *t) +{ + const struct cred *cred = current_cred(); + const struct cred *tcred = __task_cred(t); + + if (cred->user->user_ns == tcred->user->user_ns && + (cred->euid == tcred->suid || + cred->euid == tcred->uid || + cred->uid == tcred->suid || + cred->uid == tcred->uid)) + return 1; + + if (ns_capable(tcred->user->user_ns, CAP_KILL)) + return 1; + + return 0; +} + /* * Bad permissions for sending the signal * - the caller must hold the RCU read lock @@ -642,7 +663,6 @@ static inline bool si_fromuser(const struct siginfo *info) static int check_kill_permission(int sig, struct siginfo *info, struct task_struct *t) { - const struct cred *cred, *tcred; struct pid *sid; int error; @@ -656,14 +676,8 @@ static int check_kill_permission(int sig, struct siginfo *info, if (error) return error; - cred = current_cred(); - tcred = __task_cred(t); if (!same_thread_group(current, t) && - (cred->euid ^ tcred->suid) && - (cred->euid ^ tcred->uid) && - (cred->uid ^ tcred->suid) && - (cred->uid ^ tcred->uid) && - !capable(CAP_KILL)) { + !kill_ok_by_cred(t)) { switch (sig) { case SIGCONT: sid = task_session(t); -- cgit v1.2.3 From 8409cca7056113bee3236cb6a8e4d8d4d1eef102 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 23 Mar 2011 16:43:20 -0700 Subject: userns: allow ptrace from non-init user namespaces ptrace is allowed to tasks in the same user namespace according to the usual rules (i.e. the same rules as for two tasks in the init user namespace). ptrace is also allowed to a user namespace to which the current task the has CAP_SYS_PTRACE capability. Changelog: Dec 31: Address feedback by Eric: . Correct ptrace uid check . Rename may_ptrace_ns to ptrace_capable . Also fix the cap_ptrace checks. Jan 1: Use const cred struct Jan 11: use task_ns_capable() in place of ptrace_capable(). Feb 23: same_or_ancestore_user_ns() was not an appropriate check to constrain cap_issubset. Rather, cap_issubset() only is meaningful when both capsets are in the same user_ns. Signed-off-by: Serge E. Hallyn Cc: "Eric W. 
Biederman" Acked-by: Daniel Lezcano Acked-by: David Howells Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index e2302e40b360..0fc1eed28d27 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -134,21 +134,24 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode) return 0; rcu_read_lock(); tcred = __task_cred(task); - if ((cred->uid != tcred->euid || - cred->uid != tcred->suid || - cred->uid != tcred->uid || - cred->gid != tcred->egid || - cred->gid != tcred->sgid || - cred->gid != tcred->gid) && - !capable(CAP_SYS_PTRACE)) { - rcu_read_unlock(); - return -EPERM; - } + if (cred->user->user_ns == tcred->user->user_ns && + (cred->uid == tcred->euid && + cred->uid == tcred->suid && + cred->uid == tcred->uid && + cred->gid == tcred->egid && + cred->gid == tcred->sgid && + cred->gid == tcred->gid)) + goto ok; + if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE)) + goto ok; + rcu_read_unlock(); + return -EPERM; +ok: rcu_read_unlock(); smp_rmb(); if (task->mm) dumpable = get_dumpable(task->mm); - if (!dumpable && !capable(CAP_SYS_PTRACE)) + if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE)) return -EPERM; return security_ptrace_access_check(task, mode); @@ -198,7 +201,7 @@ static int ptrace_attach(struct task_struct *task) goto unlock_tasklist; task->ptrace = PT_PTRACED; - if (capable(CAP_SYS_PTRACE)) + if (task_ns_capable(task, CAP_SYS_PTRACE)) task->ptrace |= PT_PTRACE_CAP; __ptrace_link(task, current); -- cgit v1.2.3 From 3263245de48344ad7bdd0e7256bf1606d2592f88 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 23 Mar 2011 16:43:21 -0700 Subject: userns: make has_capability* into real functions So we can let type safety keep things sane, and as a bonus we can remove the declaration of init_user_ns in capability.h. Signed-off-by: Serge E. Hallyn Cc: "Eric W. Biederman" Cc: Daniel Lezcano Cc: David Howells Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/capability.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 0a3d2c863a1c..bf0c734d0c12 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -290,6 +290,60 @@ error: return ret; } +/** + * has_capability - Does a task have a capability in init_user_ns + * @t: The task in question + * @cap: The capability to be tested for + * + * Return true if the specified task has the given superior capability + * currently in effect to the initial user namespace, false if not. + * + * Note that this does not set PF_SUPERPRIV on the task. + */ +bool has_capability(struct task_struct *t, int cap) +{ + int ret = security_real_capable(t, &init_user_ns, cap); + + return (ret == 0); +} + +/** + * has_capability - Does a task have a capability in a specific user ns + * @t: The task in question + * @ns: target user namespace + * @cap: The capability to be tested for + * + * Return true if the specified task has the given superior capability + * currently in effect to the specified user namespace, false if not. + * + * Note that this does not set PF_SUPERPRIV on the task. 
+ */ +bool has_ns_capability(struct task_struct *t, + struct user_namespace *ns, int cap) +{ + int ret = security_real_capable(t, ns, cap); + + return (ret == 0); +} + +/** + * has_capability_noaudit - Does a task have a capability (unaudited) + * @t: The task in question + * @cap: The capability to be tested for + * + * Return true if the specified task has the given superior capability + * currently in effect to init_user_ns, false if not. Don't write an + * audit message for the check. + * + * Note that this does not set PF_SUPERPRIV on the task. + */ +bool has_capability_noaudit(struct task_struct *t, int cap) +{ + int ret = security_real_capable_noaudit(t, &init_user_ns, cap); + + return (ret == 0); +} + /** * capable - Determine if the current task has a superior capability in effect * @cap: The capability to be tested for -- cgit v1.2.3 From fc832ad3645f0507f24d11752544525a50a83c71 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 23 Mar 2011 16:43:22 -0700 Subject: userns: user namespaces: convert all capable checks in kernel/sys.c This allows setuid/setgid in containers. It also fixes some corner cases where kernel logic foregoes capability checks when uids are equivalent. The latter will need to be done throughout the whole kernel. Changelog: Jan 11: Use nsown_capable() as suggested by Bastian Blank. Jan 11: Fix logic errors in uid checks pointed out by Bastian. Feb 15: allow prlimit to current (was regression in previous version) Feb 23: remove debugging printks, uninline set_one_prio_perm and make it bool, and document its return value. Signed-off-by: Serge E. Hallyn Acked-by: "Eric W. Biederman" Acked-by: Daniel Lezcano Acked-by: David Howells Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 75 +++++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 49 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 5761c53e19e3..af468edf096a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -119,17 +119,34 @@ EXPORT_SYMBOL(cad_pid); void (*pm_power_off_prepare)(void); +/* + * Returns true if current's euid is same as p's uid or euid, + * or has CAP_SYS_NICE to p's user_ns. 
+ * + * Called with rcu_read_lock, creds are safe + */ +static bool set_one_prio_perm(struct task_struct *p) +{ + const struct cred *cred = current_cred(), *pcred = __task_cred(p); + + if (pcred->user->user_ns == cred->user->user_ns && + (pcred->uid == cred->euid || + pcred->euid == cred->euid)) + return true; + if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE)) + return true; + return false; +} + /* * set the priority of a task * - the caller must hold the RCU read lock */ static int set_one_prio(struct task_struct *p, int niceval, int error) { - const struct cred *cred = current_cred(), *pcred = __task_cred(p); int no_nice; - if (pcred->uid != cred->euid && - pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) { + if (!set_one_prio_perm(p)) { error = -EPERM; goto out; } @@ -506,7 +523,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) if (rgid != (gid_t) -1) { if (old->gid == rgid || old->egid == rgid || - capable(CAP_SETGID)) + nsown_capable(CAP_SETGID)) new->gid = rgid; else goto error; @@ -515,7 +532,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) if (old->gid == egid || old->egid == egid || old->sgid == egid || - capable(CAP_SETGID)) + nsown_capable(CAP_SETGID)) new->egid = egid; else goto error; @@ -550,7 +567,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid) old = current_cred(); retval = -EPERM; - if (capable(CAP_SETGID)) + if (nsown_capable(CAP_SETGID)) new->gid = new->egid = new->sgid = new->fsgid = gid; else if (gid == old->gid || gid == old->sgid) new->egid = new->fsgid = gid; @@ -617,7 +634,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) new->uid = ruid; if (old->uid != ruid && old->euid != ruid && - !capable(CAP_SETUID)) + !nsown_capable(CAP_SETUID)) goto error; } @@ -626,7 +643,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) if (old->uid != euid && old->euid != euid && old->suid != euid && - !capable(CAP_SETUID)) + !nsown_capable(CAP_SETUID)) goto error; } @@ -674,7 +691,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) old = current_cred(); retval = -EPERM; - if (capable(CAP_SETUID)) { + if (nsown_capable(CAP_SETUID)) { new->suid = new->uid = uid; if (uid != old->uid) { retval = set_user(new); @@ -716,7 +733,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) old = current_cred(); retval = -EPERM; - if (!capable(CAP_SETUID)) { + if (!nsown_capable(CAP_SETUID)) { if (ruid != (uid_t) -1 && ruid != old->uid && ruid != old->euid && ruid != old->suid) goto error; @@ -780,7 +797,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) old = current_cred(); retval = -EPERM; - if (!capable(CAP_SETGID)) { + if (!nsown_capable(CAP_SETGID)) { if (rgid != (gid_t) -1 && rgid != old->gid && rgid != old->egid && rgid != old->sgid) goto error; @@ -840,7 +857,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid) if (uid == old->uid || uid == old->euid || uid == old->suid || uid == old->fsuid || - capable(CAP_SETUID)) { + nsown_capable(CAP_SETUID)) { if (uid != old_fsuid) { new->fsuid = uid; if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) @@ -873,7 +890,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid) if (gid == old->gid || gid == old->egid || gid == old->sgid || gid == old->fsgid || - capable(CAP_SETGID)) { + nsown_capable(CAP_SETGID)) { if (gid != old_fsgid) { new->fsgid = gid; goto change_okay; @@ -1183,6 +1200,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; + if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; down_write(&uts_sem); @@ -1230,7 
+1248,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) int errno; char tmp[__NEW_UTS_LEN]; - if (!capable(CAP_SYS_ADMIN)) + if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL; @@ -1345,6 +1363,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource, rlim = tsk->signal->rlim + resource; task_lock(tsk->group_leader); if (new_rlim) { + /* Keep the capable check against init_user_ns until + cgroups can contain all limits */ if (new_rlim->rlim_max > rlim->rlim_max && !capable(CAP_SYS_RESOURCE)) retval = -EPERM; @@ -1388,19 +1408,22 @@ static int check_prlimit_permission(struct task_struct *task) { const struct cred *cred = current_cred(), *tcred; - tcred = __task_cred(task); - if (current != task && - (cred->uid != tcred->euid || - cred->uid != tcred->suid || - cred->uid != tcred->uid || - cred->gid != tcred->egid || - cred->gid != tcred->sgid || - cred->gid != tcred->gid) && - !capable(CAP_SYS_RESOURCE)) { - return -EPERM; - } + if (current == task) + return 0; - return 0; + tcred = __task_cred(task); + if (cred->user->user_ns == tcred->user->user_ns && + (cred->uid == tcred->euid && + cred->uid == tcred->suid && + cred->uid == tcred->uid && + cred->gid == tcred->egid && + cred->gid == tcred->sgid && + cred->gid == tcred->gid)) + return 0; + if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE)) + return 0; + + return -EPERM; } SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, -- cgit v1.2.3 From b515498f5bb5f38fc0e390b4ff7d00b6077de127 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 23 Mar 2011 16:43:23 -0700 Subject: userns: add a user namespace owner of ipc ns Changelog: Feb 15: Don't set new ipc->user_ns if we didn't create a new ipc_ns. Feb 23: Move extern declaration to ipc_namespace.h, and group fwd declarations at top. Signed-off-by: Serge E. Hallyn Acked-by: "Eric W. Biederman" Acked-by: Daniel Lezcano Acked-by: David Howells Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/nsproxy.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index b97fc9d04ddf..ac8a56e90bf8 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -80,6 +80,11 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, err = PTR_ERR(new_nsp->ipc_ns); goto out_ipc; } + if (new_nsp->ipc_ns != tsk->nsproxy->ipc_ns) { + put_user_ns(new_nsp->ipc_ns->user_ns); + new_nsp->ipc_ns->user_ns = task_cred_xxx(tsk, user)->user_ns; + get_user_ns(new_nsp->ipc_ns->user_ns); + } new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk)); if (IS_ERR(new_nsp->pid_ns)) { -- cgit v1.2.3 From b0e77598f87107001a00b8a4ece9c95e4254ccc4 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Wed, 23 Mar 2011 16:43:24 -0700 Subject: userns: user namespaces: convert several capable() calls CAP_IPC_OWNER and CAP_IPC_LOCK can be checked against current_user_ns(), because the resource comes from current's own ipc namespace. setuid/setgid are to uids in own namespace, so again checks can be against current_user_ns(). Changelog: Jan 11: Use task_ns_capable() in place of sched_capable(). Jan 11: Use nsown_capable() as suggested by Bastian Blank. 
Jan 11: Clarify (hopefully) some logic in futex and sched.c Feb 15: use ns_capable for ipc, not nsown_capable Feb 23: let copy_ipcs handle setting ipc_ns->user_ns Feb 23: pass ns down rather than taking it from current [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Serge E. Hallyn Acked-by: "Eric W. Biederman" Acked-by: Daniel Lezcano Acked-by: David Howells Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 11 ++++++++++- kernel/futex_compat.c | 11 ++++++++++- kernel/groups.c | 2 +- kernel/nsproxy.c | 7 +------ kernel/sched.c | 9 ++++++--- kernel/uid16.c | 2 +- 6 files changed, 29 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index bda415715382..6570c459f31c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2418,10 +2418,19 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, goto err_unlock; ret = -EPERM; pcred = __task_cred(p); + /* If victim is in different user_ns, then uids are not + comparable, so we must have CAP_SYS_PTRACE */ + if (cred->user->user_ns != pcred->user->user_ns) { + if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) + goto err_unlock; + goto ok; + } + /* If victim is in same user_ns, then uids are comparable */ if (cred->euid != pcred->euid && cred->euid != pcred->uid && - !capable(CAP_SYS_PTRACE)) + !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) goto err_unlock; +ok: head = p->robust_list; rcu_read_unlock(); } diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index a7934ac75e5b..5f9e689dc8f0 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -153,10 +153,19 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, goto err_unlock; ret = -EPERM; pcred = __task_cred(p); + /* If victim is in different user_ns, then uids are not + comparable, so we must have CAP_SYS_PTRACE */ + if (cred->user->user_ns != pcred->user->user_ns) { + if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) + goto err_unlock; + goto ok; + } + /* If victim is in same user_ns, then uids are comparable */ if (cred->euid != pcred->euid && cred->euid != pcred->uid && - !capable(CAP_SYS_PTRACE)) + !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE)) goto err_unlock; +ok: head = p->compat_robust_list; rcu_read_unlock(); } diff --git a/kernel/groups.c b/kernel/groups.c index 253dc0f35cf4..1cc476d52dd3 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) struct group_info *group_info; int retval; - if (!capable(CAP_SETGID)) + if (!nsown_capable(CAP_SETGID)) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index ac8a56e90bf8..a05d191ffdd9 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -75,16 +75,11 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_uts; } - new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns); + new_nsp->ipc_ns = copy_ipcs(flags, tsk); if (IS_ERR(new_nsp->ipc_ns)) { err = PTR_ERR(new_nsp->ipc_ns); goto out_ipc; } - if (new_nsp->ipc_ns != tsk->nsproxy->ipc_ns) { - put_user_ns(new_nsp->ipc_ns->user_ns); - new_nsp->ipc_ns->user_ns = task_cred_xxx(tsk, user)->user_ns; - get_user_ns(new_nsp->ipc_ns->user_ns); - } new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk)); if (IS_ERR(new_nsp->pid_ns)) { diff --git a/kernel/sched.c b/kernel/sched.c index a172494a9a63..480adeb63f8f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4892,8 
+4892,11 @@ static bool check_same_owner(struct task_struct *p) rcu_read_lock(); pcred = __task_cred(p); - match = (cred->euid == pcred->euid || - cred->euid == pcred->uid); + if (cred->user->user_ns == pcred->user->user_ns) + match = (cred->euid == pcred->euid || + cred->euid == pcred->uid); + else + match = false; rcu_read_unlock(); return match; } @@ -5221,7 +5224,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) goto out_free_cpus_allowed; } retval = -EPERM; - if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) + if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE)) goto out_unlock; retval = security_task_setscheduler(p); diff --git a/kernel/uid16.c b/kernel/uid16.c index 419209893d87..51c6e89e8619 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -189,7 +189,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) struct group_info *group_info; int retval; - if (!capable(CAP_SETGID)) + if (!nsown_capable(CAP_SETGID)) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; -- cgit v1.2.3 From f9b182e24ecb2b3bb33340f053ba31c8c4e1d895 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Wed, 23 Mar 2011 16:43:27 -0700 Subject: taskstats: use appropriate printk priority level printk()s without a priority level default to KERN_WARNING. To reduce noise at KERN_WARNING, this patch set the priority level appriopriately for unleveled printks()s. This should be useful to folks that look at dmesg warnings closely. Signed-off-by: Mandeep Singh Baines Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 3971c6b9d58d..9ffea360a778 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -685,7 +685,7 @@ static int __init taskstats_init(void) goto err_cgroup_ops; family_registered = 1; - printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); + pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); return 0; err_cgroup_ops: genl_unregister_ops(&family, &taskstats_ops); -- cgit v1.2.3 From 93a72052be81823fa1584b9be037d51924f9efa4 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Wed, 23 Mar 2011 16:43:29 -0700 Subject: crash_dump: export is_kdump_kernel to modules, consolidate elfcorehdr_addr, setup_elfcorehdr and saved_max_pfn The Xen PV drivers in a crashed HVM guest can not connect to the dom0 backend drivers because both frontend and backend drivers are still in connected state. To run the connection reset function only in case of a crashdump, the is_kdump_kernel() function needs to be available for the PV driver modules. Consolidate elfcorehdr_addr, setup_elfcorehdr and saved_max_pfn into kernel/crash_dump.c Also export elfcorehdr_addr to make is_kdump_kernel() usable for modules. Leave 'elfcorehdr' as early_param(). This changes powerpc from __setup() to early_param(). It adds an address range check from x86 also on ia64 and powerpc. [akpm@linux-foundation.org: additional #includes] [akpm@linux-foundation.org: remove elfcorehdr_addr export] [akpm@linux-foundation.org: fix for Tejun's mm/nobootmem.c changes] Signed-off-by: Olaf Hering Cc: Russell King Cc: "Luck, Tony" Cc: Benjamin Herrenschmidt Cc: Paul Mundt Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 1 + kernel/crash_dump.c | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 kernel/crash_dump.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 353d3fe8ba33..85cbfb31e73e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -107,6 +107,7 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o obj-$(CONFIG_PADATA) += padata.o +obj-$(CONFIG_CRASH_DUMP) += crash_dump.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c new file mode 100644 index 000000000000..5f85690285d4 --- /dev/null +++ b/kernel/crash_dump.c @@ -0,0 +1,34 @@ +#include +#include +#include +#include +#include + +/* + * If we have booted due to a crash, max_pfn will be a very low value. We need + * to know the amount of memory that the previous kernel used. + */ +unsigned long saved_max_pfn; + +/* + * stores the physical address of elf header of crash image + * + * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by + * is_kdump_kernel() to determine if we are booting after a panic. Hence put + * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. + */ +unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; + +/* + * elfcorehdr= specifies the location of elf core header stored by the crashed + * kernel. This option will be passed by kexec loader to the capture kernel. + */ +static int __init setup_elfcorehdr(char *arg) +{ + char *end; + if (!arg) + return -EINVAL; + elfcorehdr_addr = memparse(arg, &end); + return end > arg ? 0 : -EINVAL; +} +early_param("elfcorehdr", setup_elfcorehdr); -- cgit v1.2.3 From 0f77a8d378254f27df4a114a5da67223af1fe93f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 24 Mar 2011 11:42:29 +0900 Subject: vsprintf: Introduce %pB format specifier The %pB format specifier is for stack backtrace. Its handler sprint_backtrace() does symbol lookup using (address-1) to ensure the address will not point outside of the function. If there is a tail-call to the function marked "noreturn", gcc optimized out the code after the call then causes saved return address points outside of the function (i.e. the start of the next function), so pollutes call trace somewhat. This patch adds the %pB printk mechanism that allows architecture call-trace printout functions to improve backtrace printouts. Signed-off-by: Namhyung Kim Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Andrew Morton Cc: linux-arch@vger.kernel.org LKML-Reference: <1300934550-21394-1-git-send-email-namhyung@gmail.com> Signed-off-by: Ingo Molnar --- kernel/kallsyms.c | 44 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 6f6d091b5757..59e879929b17 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -342,13 +342,15 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size, } /* Look up a kernel symbol and return it in a text buffer. 
*/ -int sprint_symbol(char *buffer, unsigned long address) +static int __sprint_symbol(char *buffer, unsigned long address, + int symbol_offset) { char *modname; const char *name; unsigned long offset, size; int len; + address += symbol_offset; name = kallsyms_lookup(address, &size, &offset, &modname, buffer); if (!name) return sprintf(buffer, "0x%lx", address); @@ -357,17 +359,53 @@ int sprint_symbol(char *buffer, unsigned long address) strcpy(buffer, name); len = strlen(buffer); buffer += len; + offset -= symbol_offset; if (modname) - len += sprintf(buffer, "+%#lx/%#lx [%s]", - offset, size, modname); + len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname); else len += sprintf(buffer, "+%#lx/%#lx", offset, size); return len; } + +/** + * sprint_symbol - Look up a kernel symbol and return it in a text buffer + * @buffer: buffer to be stored + * @address: address to lookup + * + * This function looks up a kernel symbol with @address and stores its name, + * offset, size and module name to @buffer if possible. If no symbol was found, + * just saves its @address as is. + * + * This function returns the number of bytes stored in @buffer. + */ +int sprint_symbol(char *buffer, unsigned long address) +{ + return __sprint_symbol(buffer, address, 0); +} + EXPORT_SYMBOL_GPL(sprint_symbol); +/** + * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer + * @buffer: buffer to be stored + * @address: address to lookup + * + * This function is for stack backtrace and does the same thing as + * sprint_symbol() but with modified/decreased @address. If there is a + * tail-call to the function marked "noreturn", gcc optimized out code after + * the call so that the stack-saved return address could point outside of the + * caller. This function ensures that kallsyms will find the original caller + * by decreasing @address. + * + * This function returns the number of bytes stored in @buffer. + */ +int sprint_backtrace(char *buffer, unsigned long address) +{ + return __sprint_symbol(buffer, address, -1); +} + /* Look up a kernel symbol and print it to the kernel messages. */ void __print_symbol(const char *fmt, unsigned long address) { -- cgit v1.2.3 From 29096202176ceaa5016a17ea2dd1aea19a4e90e2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 17 Mar 2011 15:21:07 -0400 Subject: futex: Fix WARN_ON() test for UP An update of the futex code had a WARN_ON(!spin_is_locked(q->lock_ptr)) But on UP, spin_is_locked() is always false, and will trigger this warning, and even worse, it will exit the function without doing the necessary work. Converting this to a WARN_ON_SMP() fixes the problem. 
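For context, a brief illustrative sketch (not part of the patch) of why the guard misbehaves on uniprocessor kernels: UP spinlocks carry no lock state, so spin_is_locked() evaluates to a constant false, a WARN_ON() built on it always fires and takes the early-return path, whereas WARN_ON_SMP() only evaluates its condition on SMP builds.

#include <linux/spinlock.h>
#include <linux/bug.h>

/* Illustrative helper, not from the patch: assert that @lock is held
 * without breaking uniprocessor builds. */
static void example_assert_held(spinlock_t *lock)
{
        /* WARN_ON(!spin_is_locked(lock)) would always trigger on UP,
         * because spin_is_locked() is compile-time false there. */
        WARN_ON_SMP(!spin_is_locked(lock));
}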
Reported-by: Richard Weinberger Tested-by: Richard Weinberger Signed-off-by: Steven Rostedt Acked-by: Thomas Gleixner Acked-by: Peter Zijlstra Acked-by: Darren Hart Cc: Lai Jiangshan LKML-Reference: <20110317192208.682654502@goodmis.org> Signed-off-by: Ingo Molnar --- kernel/futex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index bda415715382..823aae3e2a96 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -782,8 +782,8 @@ static void __unqueue_futex(struct futex_q *q) { struct futex_hash_bucket *hb; - if (WARN_ON(!q->lock_ptr || !spin_is_locked(q->lock_ptr) - || plist_node_empty(&q->list))) + if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr)) + || WARN_ON(plist_node_empty(&q->list))) return; hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); -- cgit v1.2.3 From ab7798ffcf98b11a9525cf65bacdae3fd58d357f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Mar 2011 16:48:50 +0100 Subject: genirq: Expand generic show_interrupts() Some archs want to print extra information for certain irq_chips which is per irq and not per chip. Allow them to provide a chip callback to print the chip name and the extra information. PowerPC wants to print the LEVEL/EDGE type information. Make it configurable. Signed-off-by: Thomas Gleixner --- kernel/irq/Kconfig | 4 ++++ kernel/irq/proc.c | 15 ++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 09bef82d74cb..00f2c037267a 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -31,6 +31,10 @@ config GENERIC_IRQ_PROBE config GENERIC_IRQ_SHOW bool +# Print level/edge extra information +config GENERIC_IRQ_SHOW_LEVEL + bool + # Support for delayed migration from interrupt context config GENERIC_PENDING_IRQ bool diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 760248de109d..626d092eed9a 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -404,7 +404,20 @@ int show_interrupts(struct seq_file *p, void *v) seq_printf(p, "%*d: ", prec, i); for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); - seq_printf(p, " %8s", desc->irq_data.chip->name); + + if (desc->irq_data.chip) { + if (desc->irq_data.chip->irq_print_chip) + desc->irq_data.chip->irq_print_chip(&desc->irq_data, p); + else if (desc->irq_data.chip->name) + seq_printf(p, " %8s", desc->irq_data.chip->name); + else + seq_printf(p, " %8s", "-"); + } else { + seq_printf(p, " %8s", "None"); + } +#ifdef CONFIG_GENIRC_IRQ_SHOW_LEVEL + seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? 
"Level" : "Edge"); +#endif if (desc->name) seq_printf(p, "-%-8s", desc->name); -- cgit v1.2.3 From 27029c339b1beebe79bb4e64422ad1bb8d0b6440 Mon Sep 17 00:00:00 2001 From: Jovi Zhang Date: Mon, 15 Mar 2010 07:28:00 -0500 Subject: kdb: code cleanup to use macro instead of value It's better to use macro KDB_BASE_CMD_MAX instead of 50 Signed-off-by: Jovi Zhang Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index bd3e8e29caa3..38a85428c70f 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -78,7 +78,7 @@ static unsigned int kdb_continue_catastrophic; static kdbtab_t *kdb_commands; #define KDB_BASE_CMD_MAX 50 static int kdb_max_commands = KDB_BASE_CMD_MAX; -static kdbtab_t kdb_base_commands[50]; +static kdbtab_t kdb_base_commands[KDB_BASE_CMD_MAX]; #define for_each_kdbcmd(cmd, num) \ for ((cmd) = kdb_base_commands, (num) = 0; \ num < kdb_max_commands; \ -- cgit v1.2.3 From 0d3db28daed2529ab90933a3aaaaf46446fdfda8 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 15 Mar 2010 07:28:00 -0500 Subject: kdb: add usage string of 'per_cpu' command Signed-off-by: Namhyung Kim Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 38a85428c70f..6bc6e3bc4f9c 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2892,7 +2892,7 @@ static void __init kdb_inittab(void) "Send a signal to a process", 0, KDB_REPEAT_NONE); kdb_register_repeat("summary", kdb_summary, "", "Summarize the system", 4, KDB_REPEAT_NONE); - kdb_register_repeat("per_cpu", kdb_per_cpu, "", + kdb_register_repeat("per_cpu", kdb_per_cpu, " [] []", "Display per_cpu variables", 3, KDB_REPEAT_NONE); kdb_register_repeat("grephelp", kdb_grep_help, "", "Display help on | grep", 0, KDB_REPEAT_NONE); -- cgit v1.2.3 From d72274e5895d11570a0a4a3214a1933c86d5ccb7 Mon Sep 17 00:00:00 2001 From: David Daney Date: Fri, 25 Mar 2011 12:38:48 -0700 Subject: genirq: Reserve the irq when calling irq_set_chip() The helper macros and functions like for_each_active_irq() don't work unless the irq is in the allocated_irqs set. In the case of !CONFIG_SPARSE_IRQ, instead of forcing all users of the irq infrastructure to explicitly call irq_reserve_irq(), do it for them. Signed-off-by: David Daney Cc: linux-mips@linux-mips.org Cc: ralf@linux-mips.org LKML-Reference: <1301081931-11240-2-git-send-email-ddaney@caviumnetworks.com> Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c9c0601f0615..c35d74c08b50 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -37,6 +37,12 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip) irq_chip_set_defaults(chip); desc->irq_data.chip = chip; irq_put_desc_unlock(desc, flags); + /* + * For !CONFIG_SPARSE_IRQ make the irq show up in + * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is + * already marked, and this call is harmless. 
+ */ + irq_reserve_irq(irq); return 0; } EXPORT_SYMBOL(irq_set_chip); -- cgit v1.2.3 From 801a0e9ae36e9b487092e31699d28c0b9a21ad52 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 27 Mar 2011 11:02:49 +0200 Subject: genirq: Add irq disabled flag to irq_data state Some irq_chip implementation require to know the disabled state of the interrupt in certain callbacks. Add a state flag and accessor to irq_data. Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 5 +++-- kernel/irq/irqdesc.c | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index c35d74c08b50..0a890bdd9c63 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -141,12 +141,14 @@ EXPORT_SYMBOL_GPL(irq_get_irq_data); static void irq_state_clr_disabled(struct irq_desc *desc) { desc->istate &= ~IRQS_DISABLED; + irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED); irq_compat_clr_disabled(desc); } static void irq_state_set_disabled(struct irq_desc *desc) { desc->istate |= IRQS_DISABLED; + irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); irq_compat_set_disabled(desc); } @@ -648,8 +650,7 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, if (handle == handle_bad_irq) { if (desc->irq_data.chip != &no_irq_chip) mask_ack_irq(desc); - irq_compat_set_disabled(desc); - desc->istate |= IRQS_DISABLED; + irq_state_set_disabled(desc); desc->depth = 1; } desc->handle_irq = handle; diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 6fb014f172f7..96c3268a509d 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -80,6 +80,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_data.handler_data = NULL; desc->irq_data.msi_desc = NULL; irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); + irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); desc->istate = IRQS_DISABLED; desc->handle_irq = handle_bad_irq; desc->depth = 1; -- cgit v1.2.3 From 0fdb4b259ed3e91b687ac26848202f5e7c217e62 Mon Sep 17 00:00:00 2001 From: David Daney Date: Fri, 25 Mar 2011 12:38:49 -0700 Subject: genirq: Add chip hooks for taking CPUs on/off line. [ tglx: Removed the enabled argument as this is now available in irq_data ] Signed-off-by: David Daney Cc: linux-mips@linux-mips.org Cc: ralf@linux-mips.org LKML-Reference: <1301081931-11240-3-git-send-email-ddaney@caviumnetworks.com> Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 0a890bdd9c63..44b16a1ecd9a 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -696,3 +696,61 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irq_put_desc_unlock(desc, flags); } + +/** + * irq_cpu_online - Invoke all irq_cpu_online functions. + * + * Iterate through all irqs and invoke the chip.irq_cpu_online() + * for each. + */ +void irq_cpu_online(void) +{ + struct irq_desc *desc; + struct irq_chip *chip; + unsigned long flags; + unsigned int irq; + + for_each_active_irq(irq) { + desc = irq_to_desc(irq); + if (!desc) + continue; + + raw_spin_lock_irqsave(&desc->lock, flags); + + chip = irq_data_get_irq_chip(&desc->irq_data); + + if (chip && chip->irq_cpu_online) + chip->irq_cpu_online(&desc->irq_data); + + raw_spin_unlock_irqrestore(&desc->lock, flags); + } +} + +/** + * irq_cpu_offline - Invoke all irq_cpu_offline functions. 
+ * + * Iterate through all irqs and invoke the chip.irq_cpu_offline() + * for each. + */ +void irq_cpu_offline(void) +{ + struct irq_desc *desc; + struct irq_chip *chip; + unsigned long flags; + unsigned int irq; + + for_each_active_irq(irq) { + desc = irq_to_desc(irq); + if (!desc) + continue; + + raw_spin_lock_irqsave(&desc->lock, flags); + + chip = irq_data_get_irq_chip(&desc->irq_data); + + if (chip && chip->irq_cpu_offline) + chip->irq_cpu_offline(&desc->irq_data); + + raw_spin_unlock_irqrestore(&desc->lock, flags); + } +} -- cgit v1.2.3 From b3d422329f2e061d66af4f933ef316e50e5edcac Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 27 Mar 2011 16:05:36 +0200 Subject: genirq: Add chip flag for restricting cpu_on/offline calls Add a flag which indicates that the on/offline callback should only be called on enabled interrupts. Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 44b16a1ecd9a..9283d3300ea9 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -718,8 +718,9 @@ void irq_cpu_online(void) raw_spin_lock_irqsave(&desc->lock, flags); chip = irq_data_get_irq_chip(&desc->irq_data); - - if (chip && chip->irq_cpu_online) + if (chip && chip->irq_cpu_online && + (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || + !(desc->istate & IRQS_DISABLED))) chip->irq_cpu_online(&desc->irq_data); raw_spin_unlock_irqrestore(&desc->lock, flags); @@ -747,8 +748,9 @@ void irq_cpu_offline(void) raw_spin_lock_irqsave(&desc->lock, flags); chip = irq_data_get_irq_chip(&desc->irq_data); - - if (chip && chip->irq_cpu_offline) + if (chip && chip->irq_cpu_offline && + (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || + !(desc->istate & IRQS_DISABLED))) chip->irq_cpu_offline(&desc->irq_data); raw_spin_unlock_irqrestore(&desc->lock, flags); -- cgit v1.2.3 From c2d0c555c22242c3a76e366074c4d83ef9fa3b8c Mon Sep 17 00:00:00 2001 From: David Daney Date: Fri, 25 Mar 2011 12:38:50 -0700 Subject: genirq: Split irq_set_affinity() so it can be called with lock held. The .irq_cpu_online() and .irq_cpu_offline() functions may need to adjust affinity, but they are called with the descriptor lock held. Create __irq_set_affinity_locked() which is called with the lock held. Make irq_set_affinity() just a wrapper that acquires the lock. 
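To make the intended call site concrete, here is a hedged sketch (hypothetical chip driver, not taken from this patch) of why the _locked variant is needed: an .irq_cpu_offline() callback already runs under the descriptor lock, so calling irq_set_affinity() from it would try to re-acquire that same lock.

#include <linux/irq.h>
#include <linux/cpumask.h>
#include <linux/kernel.h>

/* Hypothetical irq_chip callback invoked via irq_cpu_offline() with
 * desc->lock held; it re-targets the interrupt using the lock-held
 * variant introduced by this patch. */
static void example_chip_irq_cpu_offline(struct irq_data *data)
{
        /* cpu_online_mask is a deliberately coarse choice for the
         * sketch; a real driver would pick a specific surviving CPU. */
        if (__irq_set_affinity_locked(data, cpu_online_mask))
                pr_warn("example: failed to re-target irq %u\n", data->irq);
}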
[ tglx: Changed the argument to irq_data, added a !desc check and moved the !irq_set_affinity check where it belongs ] Signed-off-by: David Daney Cc: linux-mips@linux-mips.org Cc: ralf@linux-mips.org LKML-Reference: <1301081931-11240-4-git-send-email-ddaney@caviumnetworks.com> Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 48 ++++++++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0a2aa73e536c..3d151fd762ad 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -139,35 +139,26 @@ static inline void irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { } #endif -/** - * irq_set_affinity - Set the irq affinity of a given irq - * @irq: Interrupt to set affinity - * @cpumask: cpumask - * - */ -int irq_set_affinity(unsigned int irq, const struct cpumask *mask) +int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) { - struct irq_desc *desc = irq_to_desc(irq); - struct irq_chip *chip = desc->irq_data.chip; - unsigned long flags; + struct irq_chip *chip = irq_data_get_irq_chip(data); + struct irq_desc *desc = irq_data_to_desc(data); int ret = 0; - if (!chip->irq_set_affinity) + if (!chip || !chip->irq_set_affinity) return -EINVAL; - raw_spin_lock_irqsave(&desc->lock, flags); - - if (irq_can_move_pcntxt(desc)) { - ret = chip->irq_set_affinity(&desc->irq_data, mask, false); + if (irqd_can_move_in_process_context(data)) { + ret = chip->irq_set_affinity(data, mask, false); switch (ret) { case IRQ_SET_MASK_OK: - cpumask_copy(desc->irq_data.affinity, mask); + cpumask_copy(data->affinity, mask); case IRQ_SET_MASK_OK_NOCOPY: irq_set_thread_affinity(desc); ret = 0; } } else { - irqd_set_move_pending(&desc->irq_data); + irqd_set_move_pending(data); irq_copy_pending(desc, mask); } @@ -176,7 +167,28 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask) schedule_work(&desc->affinity_notify->work); } irq_compat_set_affinity(desc); - irqd_set(&desc->irq_data, IRQD_AFFINITY_SET); + irqd_set(data, IRQD_AFFINITY_SET); + + return ret; +} + +/** + * irq_set_affinity - Set the irq affinity of a given irq + * @irq: Interrupt to set affinity + * @cpumask: cpumask + * + */ +int irq_set_affinity(unsigned int irq, const struct cpumask *mask) +{ + struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; + int ret; + + if (!desc) + return -EINVAL; + + raw_spin_lock_irqsave(&desc->lock, flags); + ret = __irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask); raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } -- cgit v1.2.3 From 32f4125ebffee4f3c4dbc6a437fc656129eb9e60 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 28 Mar 2011 14:10:52 +0200 Subject: genirq: Move INPROGRESS, MASKED and DISABLED state flags to irq_data We really need these flags for some of the interrupt chips. Move it from internal state to irq_data and provide proper accessors. 
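For orientation, a minimal sketch of what the new accessors look like from an irq_chip implementation (the callback itself is hypothetical; the irqd_irq_*() helpers are the ones used throughout the diff below):

#include <linux/irq.h>

/* Hypothetical chip callback: chip code can now query core interrupt
 * state through irq_data instead of reaching into desc->istate. */
static void example_chip_irq_cpu_online(struct irq_data *data)
{
        /* Skip interrupts that are shut off or currently being handled. */
        if (irqd_irq_disabled(data) || irqd_irq_masked(data) ||
            irqd_irq_inprogress(data))
                return;

        /* ... reprogram the (hypothetical) per-CPU routing register ... */
}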
Signed-off-by: Thomas Gleixner Cc: David Daney --- kernel/irq/chip.c | 40 +++++++++++++++++++--------------------- kernel/irq/debug.h | 10 +++++++--- kernel/irq/handle.c | 4 ++-- kernel/irq/internals.h | 6 ------ kernel/irq/irqdesc.c | 2 -- kernel/irq/manage.c | 30 ++++++++++++++---------------- kernel/irq/migration.c | 4 ++-- kernel/irq/spurious.c | 10 +++++----- 8 files changed, 49 insertions(+), 57 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 9283d3300ea9..e00bdc56269f 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -140,27 +140,25 @@ EXPORT_SYMBOL_GPL(irq_get_irq_data); static void irq_state_clr_disabled(struct irq_desc *desc) { - desc->istate &= ~IRQS_DISABLED; irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED); irq_compat_clr_disabled(desc); } static void irq_state_set_disabled(struct irq_desc *desc) { - desc->istate |= IRQS_DISABLED; irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); irq_compat_set_disabled(desc); } static void irq_state_clr_masked(struct irq_desc *desc) { - desc->istate &= ~IRQS_MASKED; + irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); irq_compat_clr_masked(desc); } static void irq_state_set_masked(struct irq_desc *desc) { - desc->istate |= IRQS_MASKED; + irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); irq_compat_set_masked(desc); } @@ -380,11 +378,11 @@ void handle_nested_irq(unsigned int irq) kstat_incr_irqs_this_cpu(irq, desc); action = desc->action; - if (unlikely(!action || (desc->istate & IRQS_DISABLED))) + if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) goto out_unlock; irq_compat_set_progress(desc); - desc->istate |= IRQS_INPROGRESS; + irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock_irq(&desc->lock); action_ret = action->thread_fn(action->irq, action->dev_id); @@ -392,7 +390,7 @@ void handle_nested_irq(unsigned int irq) note_interrupt(irq, desc, action_ret); raw_spin_lock_irq(&desc->lock); - desc->istate &= ~IRQS_INPROGRESS; + irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); irq_compat_clr_progress(desc); out_unlock: @@ -424,14 +422,14 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) { raw_spin_lock(&desc->lock); - if (unlikely(desc->istate & IRQS_INPROGRESS)) + if (unlikely(irqd_irq_inprogress(&desc->irq_data))) if (!irq_check_poll(desc)) goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); kstat_incr_irqs_this_cpu(irq, desc); - if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) goto out_unlock; handle_irq_event(desc); @@ -456,7 +454,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); mask_ack_irq(desc); - if (unlikely(desc->istate & IRQS_INPROGRESS)) + if (unlikely(irqd_irq_inprogress(&desc->irq_data))) if (!irq_check_poll(desc)) goto out_unlock; @@ -467,12 +465,12 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) * If its disabled or no action available * keep it masked and get out of here */ - if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) goto out_unlock; handle_irq_event(desc); - if (!(desc->istate & (IRQS_DISABLED | IRQS_ONESHOT))) + if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT)) unmask_irq(desc); out_unlock: raw_spin_unlock(&desc->lock); @@ -504,7 +502,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) { raw_spin_lock(&desc->lock); - if (unlikely(desc->istate & IRQS_INPROGRESS)) + if 
(unlikely(irqd_irq_inprogress(&desc->irq_data))) if (!irq_check_poll(desc)) goto out; @@ -515,7 +513,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) * If its disabled or no action available * then mask it and get out of here: */ - if (unlikely(!desc->action || (desc->istate & IRQS_DISABLED))) { + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { irq_compat_set_pending(desc); desc->istate |= IRQS_PENDING; mask_irq(desc); @@ -566,8 +564,8 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) * we shouldn't process the IRQ. Mark it pending, handle * the necessary masking and go out */ - if (unlikely((desc->istate & (IRQS_DISABLED | IRQS_INPROGRESS) || - !desc->action))) { + if (unlikely(irqd_irq_disabled(&desc->irq_data) || + irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { if (!irq_check_poll(desc)) { irq_compat_set_pending(desc); desc->istate |= IRQS_PENDING; @@ -592,15 +590,15 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) * Renable it, if it was not disabled in meantime. */ if (unlikely(desc->istate & IRQS_PENDING)) { - if (!(desc->istate & IRQS_DISABLED) && - (desc->istate & IRQS_MASKED)) + if (!irqd_irq_disabled(&desc->irq_data) && + irqd_irq_masked(&desc->irq_data)) unmask_irq(desc); } handle_irq_event(desc); } while ((desc->istate & IRQS_PENDING) && - !(desc->istate & IRQS_DISABLED)); + !irqd_irq_disabled(&desc->irq_data)); out_unlock: raw_spin_unlock(&desc->lock); @@ -720,7 +718,7 @@ void irq_cpu_online(void) chip = irq_data_get_irq_chip(&desc->irq_data); if (chip && chip->irq_cpu_online && (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || - !(desc->istate & IRQS_DISABLED))) + !irqd_irq_disabled(&desc->irq_data))) chip->irq_cpu_online(&desc->irq_data); raw_spin_unlock_irqrestore(&desc->lock, flags); @@ -750,7 +748,7 @@ void irq_cpu_offline(void) chip = irq_data_get_irq_chip(&desc->irq_data); if (chip && chip->irq_cpu_offline && (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || - !(desc->istate & IRQS_DISABLED))) + !irqd_irq_disabled(&desc->irq_data))) chip->irq_cpu_offline(&desc->irq_data); raw_spin_unlock_irqrestore(&desc->lock, flags); diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h index d1a33b7fa61d..a0bd875ba3d5 100644 --- a/kernel/irq/debug.h +++ b/kernel/irq/debug.h @@ -6,6 +6,8 @@ #define P(f) if (desc->status & f) printk("%14s set\n", #f) #define PS(f) if (desc->istate & f) printk("%14s set\n", #f) +/* FIXME */ +#define PD(f) do { } while (0) static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) { @@ -28,13 +30,15 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) P(IRQ_NOAUTOEN); PS(IRQS_AUTODETECT); - PS(IRQS_INPROGRESS); PS(IRQS_REPLAY); PS(IRQS_WAITING); - PS(IRQS_DISABLED); PS(IRQS_PENDING); - PS(IRQS_MASKED); + + PD(IRQS_INPROGRESS); + PD(IRQS_DISABLED); + PD(IRQS_MASKED); } #undef P #undef PS +#undef PD diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 517561fc7317..60fd5cd75c77 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -178,13 +178,13 @@ irqreturn_t handle_irq_event(struct irq_desc *desc) irq_compat_clr_pending(desc); desc->istate &= ~IRQS_PENDING; irq_compat_set_progress(desc); - desc->istate |= IRQS_INPROGRESS; + irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock(&desc->lock); ret = handle_irq_event_percpu(desc, action); raw_spin_lock(&desc->lock); - desc->istate &= ~IRQS_INPROGRESS; + irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); irq_compat_clr_progress(desc); return ret; } diff --git 
a/kernel/irq/internals.h b/kernel/irq/internals.h index 6c6ec9a49027..6b8b9713e28d 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -44,26 +44,20 @@ enum { * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt * detection * IRQS_POLL_INPROGRESS - polling in progress - * IRQS_INPROGRESS - Interrupt in progress * IRQS_ONESHOT - irq is not unmasked in primary handler * IRQS_REPLAY - irq is replayed * IRQS_WAITING - irq is waiting - * IRQS_DISABLED - irq is disabled * IRQS_PENDING - irq is pending and replayed later - * IRQS_MASKED - irq is masked * IRQS_SUSPENDED - irq is suspended */ enum { IRQS_AUTODETECT = 0x00000001, IRQS_SPURIOUS_DISABLED = 0x00000002, IRQS_POLL_INPROGRESS = 0x00000008, - IRQS_INPROGRESS = 0x00000010, IRQS_ONESHOT = 0x00000020, IRQS_REPLAY = 0x00000040, IRQS_WAITING = 0x00000080, - IRQS_DISABLED = 0x00000100, IRQS_PENDING = 0x00000200, - IRQS_MASKED = 0x00000400, IRQS_SUSPENDED = 0x00000800, }; diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 96c3268a509d..2c039c9b9383 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -81,7 +81,6 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_data.msi_desc = NULL; irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); - desc->istate = IRQS_DISABLED; desc->handle_irq = handle_bad_irq; desc->depth = 1; desc->irq_count = 0; @@ -239,7 +238,6 @@ int __init early_irq_init(void) struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { - .istate = IRQS_DISABLED, .handle_irq = handle_bad_irq, .depth = 1, .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3d151fd762ad..6e8acb755993 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -41,7 +41,7 @@ early_param("threadirqs", setup_forced_irqthreads); void synchronize_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - unsigned int state; + bool inprogress; if (!desc) return; @@ -53,16 +53,16 @@ void synchronize_irq(unsigned int irq) * Wait until we're out of the critical section. This might * give the wrong answer due to the lack of memory barriers. */ - while (desc->istate & IRQS_INPROGRESS) + while (irqd_irq_inprogress(&desc->irq_data)) cpu_relax(); /* Ok, that indicated we're done: double-check carefully. */ raw_spin_lock_irqsave(&desc->lock, flags); - state = desc->istate; + inprogress = irqd_irq_inprogress(&desc->irq_data); raw_spin_unlock_irqrestore(&desc->lock, flags); /* Oops, that failed? */ - } while (state & IRQS_INPROGRESS); + } while (inprogress); /* * We made sure that no hardirq handler is running. Now verify @@ -563,9 +563,9 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, flags &= IRQ_TYPE_SENSE_MASK; if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { - if (!(desc->istate & IRQS_MASKED)) + if (!irqd_irq_masked(&desc->irq_data)) mask_irq(desc); - if (!(desc->istate & IRQS_DISABLED)) + if (!irqd_irq_disabled(&desc->irq_data)) unmask = 1; } @@ -663,7 +663,7 @@ again: * irq_wake_thread(). See the comment there which explains the * serialization. 
*/ - if (unlikely(desc->istate & IRQS_INPROGRESS)) { + if (unlikely(irqd_irq_inprogress(&desc->irq_data))) { raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); cpu_relax(); @@ -680,12 +680,10 @@ again: desc->threads_oneshot &= ~action->thread_mask; - if (!desc->threads_oneshot && !(desc->istate & IRQS_DISABLED) && - (desc->istate & IRQS_MASKED)) { - irq_compat_clr_masked(desc); - desc->istate &= ~IRQS_MASKED; - desc->irq_data.chip->irq_unmask(&desc->irq_data); - } + if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) && + irqd_irq_masked(&desc->irq_data)) + unmask_irq(desc); + out_unlock: raw_spin_unlock_irq(&desc->lock); chip_bus_sync_unlock(desc); @@ -779,7 +777,7 @@ static int irq_thread(void *data) atomic_inc(&desc->threads_active); raw_spin_lock_irq(&desc->lock); - if (unlikely(desc->istate & IRQS_DISABLED)) { + if (unlikely(irqd_irq_disabled(&desc->irq_data))) { /* * CHECKME: We might need a dedicated * IRQ_THREAD_PENDING flag here, which @@ -997,8 +995,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ - IRQS_INPROGRESS | IRQS_ONESHOT | \ - IRQS_WAITING); + IRQS_ONESHOT | IRQS_WAITING); + irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); if (new->flags & IRQF_PERCPU) { irqd_set(&desc->irq_data, IRQD_PER_CPU); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index ec4806d4778b..5e81d34b08d6 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -66,7 +66,7 @@ void irq_move_irq(struct irq_data *idata) if (likely(!irqd_is_setaffinity_pending(idata))) return; - if (unlikely(desc->istate & IRQS_DISABLED)) + if (unlikely(irqd_irq_disabled(idata))) return; /* @@ -74,7 +74,7 @@ void irq_move_irq(struct irq_data *idata) * threaded interrupt with ONESHOT set, we can end up with an * interrupt storm. */ - masked = desc->istate & IRQS_MASKED; + masked = irqd_irq_masked(idata); if (!masked) idata->chip->irq_mask(idata); irq_move_masked_irq(idata); diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index dd586ebf9c8c..cd424cdf17fc 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -45,12 +45,12 @@ bool irq_wait_for_poll(struct irq_desc *desc) #ifdef CONFIG_SMP do { raw_spin_unlock(&desc->lock); - while (desc->istate & IRQS_INPROGRESS) + while (irqd_irq_inprogress(&desc->irq_data)) cpu_relax(); raw_spin_lock(&desc->lock); - } while (desc->istate & IRQS_INPROGRESS); + } while irqd_irq_inprogress(&desc->irq_data); /* Might have been disabled in meantime */ - return !(desc->istate & IRQS_DISABLED) && desc->action; + return !irqd_irq_disabled(&desc->irq_data) && desc->action; #else return false; #endif @@ -75,7 +75,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) * Do not poll disabled interrupts unless the spurious * disabled poller asks explicitely. 
*/ - if ((desc->istate & IRQS_DISABLED) && !force) + if (irqd_irq_disabled(&desc->irq_data) && !force) goto out; /* @@ -88,7 +88,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) goto out; /* Already running on another processor */ - if (desc->istate & IRQS_INPROGRESS) { + if (irqd_irq_inprogress(&desc->irq_data)) { /* * Already running: If it is shared get the other * CPU to go looking for our mystery interrupt too -- cgit v1.2.3 From 0521c8fbb3da45c2a58cd551ca6e9644983f6028 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 28 Mar 2011 16:13:24 +0200 Subject: genirq: Provide edge_eoi flow handler This is a replacment for the cell flow handler which is in the way of cleanups. Must be selected to avoid general bloat. Signed-off-by: Thomas Gleixner --- kernel/irq/Kconfig | 4 ++++ kernel/irq/chip.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 00f2c037267a..72606ba10b14 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -51,6 +51,10 @@ config HARDIRQS_SW_RESEND config IRQ_PREFLOW_FASTEOI bool +# Edge style eoi based handler (cell) +config IRQ_EDGE_EOI_HANDLER + bool + # Support forced irq threading config IRQ_FORCED_THREADING bool diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index e00bdc56269f..451d1e81c15c 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -604,6 +604,51 @@ out_unlock: raw_spin_unlock(&desc->lock); } +#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER +/** + * handle_edge_eoi_irq - edge eoi type IRQ handler + * @irq: the interrupt number + * @desc: the interrupt description structure for this irq + * + * Similar as the above handle_edge_irq, but using eoi and w/o the + * mask/unmask logic. + */ +void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc) +{ + struct irq_chip *chip = irq_desc_get_chip(desc); + + raw_spin_lock(&desc->lock); + + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + /* + * If we're currently running this IRQ, or its disabled, + * we shouldn't process the IRQ. Mark it pending, handle + * the necessary masking and go out + */ + if (unlikely(irqd_irq_disabled(&desc->irq_data) || + irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { + if (!irq_check_poll(desc)) { + desc->istate |= IRQS_PENDING; + goto out_eoi; + } + } + kstat_incr_irqs_this_cpu(irq, desc); + + do { + if (unlikely(!desc->action)) + goto out_eoi; + + handle_irq_event(desc); + + } while ((desc->istate & IRQS_PENDING) && + !irqd_irq_disabled(&desc->irq_data)); + +out_unlock: + chip->irq_eoi(&desc->irq_data); + raw_spin_unlock(&desc->lock); +} +#endif + /** * handle_percpu_irq - Per CPU local irq handler * @irq: the interrupt number -- cgit v1.2.3 From 33b054b867b84015173a38d9cd9ff513b6498818 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 28 Mar 2011 16:27:31 +0200 Subject: genirq: Remove handle_IRQ_event Last user gone. 
Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 60fd5cd75c77..1a2fb77f2fd6 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -188,15 +188,3 @@ irqreturn_t handle_irq_event(struct irq_desc *desc) irq_compat_clr_progress(desc); return ret; } - -/** - * handle_IRQ_event - irq action chain handler - * @irq: the interrupt number - * @action: the interrupt action chain for this irq - * - * Handles the action chain of an irq event - */ -irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) -{ - return handle_irq_event_percpu(irq_to_desc(irq), action); -} -- cgit v1.2.3 From 30398bf6c684a77274dbdabf7efc1f24e4a99028 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 18 Mar 2011 09:33:56 -0700 Subject: genirq: Fix new kernel-doc warnings Fix new irq-related kernel-doc warnings in 2.6.38: Warning(kernel/irq/manage.c:149): No description found for parameter 'mask' Warning(kernel/irq/manage.c:149): Excess function parameter 'cpumask' description in 'irq_set_affinity' Warning(include/linux/irq.h:161): No description found for parameter 'state_use_accessors' Warning(include/linux/irq.h:161): Excess struct/union/enum/typedef member 'state_use_accessor' description in 'irq_data' Signed-off-by: Randy Dunlap LKML-Reference: <20110318093356.b939558d.randy.dunlap@oracle.com> Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 6e8acb755993..805c6a0ce780 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -175,7 +175,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) /** * irq_set_affinity - Set the irq affinity of a given irq * @irq: Interrupt to set affinity - * @cpumask: cpumask + * @mask: cpumask * */ int irq_set_affinity(unsigned int irq, const struct cpumask *mask) -- cgit v1.2.3 From a6aeddd1c4e464a2150f97ca2d1c3d68cfbd9296 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 28 Mar 2011 20:28:56 +0200 Subject: genirq: Fix typo and remove unused variable Sigh, I'm overworked. 
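(ed: The typo is effectively a build fix rather than a cosmetic one: C requires the controlling expression of a do/while to be parenthesized, so the earlier unparenthesized form does not compile. The corrected loop, as in the hunk below:)

	do {
		raw_spin_unlock(&desc->lock);
		while (irqd_irq_inprogress(&desc->irq_data))
			cpu_relax();
		raw_spin_lock(&desc->lock);
	} while (irqd_irq_inprogress(&desc->irq_data));	/* parentheses required */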
Signed-off-by: Thomas Gleixner --- kernel/irq/migration.c | 1 - kernel/irq/spurious.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 5e81d34b08d6..e33d9c8d5089 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -60,7 +60,6 @@ void move_masked_irq(int irq) void irq_move_irq(struct irq_data *idata) { - struct irq_desc *desc = irq_data_to_desc(idata); bool masked; if (likely(!irqd_is_setaffinity_pending(idata))) diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index cd424cdf17fc..83f4799f46be 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -48,7 +48,7 @@ bool irq_wait_for_poll(struct irq_desc *desc) while (irqd_irq_inprogress(&desc->irq_data)) cpu_relax(); raw_spin_lock(&desc->lock); - } while irqd_irq_inprogress(&desc->irq_data); + } while (irqd_irq_inprogress(&desc->irq_data)); /* Might have been disabled in meantime */ return !irqd_irq_disabled(&desc->irq_data) && desc->action; #else -- cgit v1.2.3 From 243b422af9ea9af4ead07a8ad54c90d4f9b6081a Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Mon, 28 Mar 2011 14:13:35 -0700 Subject: Relax si_code check in rt_sigqueueinfo and rt_tgsigqueueinfo Commit da48524eb206 ("Prevent rt_sigqueueinfo and rt_tgsigqueueinfo from spoofing the signal code") made the check on si_code too strict. There are several legitimate places where glibc wants to queue a negative si_code different from SI_QUEUE: - This was first noticed with glibc's aio implementation, which wants to queue a signal with si_code SI_ASYNCIO; the current kernel causes glibc's tst-aio4 test to fail because rt_sigqueueinfo() fails with EPERM. - Further examination of the glibc source shows that getaddrinfo_a() wants to use SI_ASYNCNL (which the kernel does not even define). The timer_create() fallback code wants to queue signals with SI_TIMER. As suggested by Oleg Nesterov , loosen the check to forbid only the problematic SI_TKILL case. Reported-by: Klaus Dittrich Acked-by: Julien Tinnes Cc: Signed-off-by: Roland Dreier Signed-off-by: Linus Torvalds --- kernel/signal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 324eff5468ad..1186cf7fac77 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2437,7 +2437,7 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, /* Not even root can pretend to send signals from the kernel. * Nor can they impersonate a kill()/tgkill(), which adds source info. */ - if (info.si_code != SI_QUEUE) { + if (info.si_code >= 0 || info.si_code == SI_TKILL) { /* We used to allow any < 0 si_code */ WARN_ON_ONCE(info.si_code < 0); return -EPERM; @@ -2457,7 +2457,7 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) /* Not even root can pretend to send signals from the kernel. * Nor can they impersonate a kill()/tgkill(), which adds source info. */ - if (info->si_code != SI_QUEUE) { + if (info->si_code >= 0 || info->si_code == SI_TKILL) { /* We used to allow any < 0 si_code */ WARN_ON_ONCE(info->si_code < 0); return -EPERM; -- cgit v1.2.3 From 0ef5ca1e1f0de71300142b8f730f26ded6a0c2f3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 28 Mar 2011 21:59:37 +0200 Subject: genirq; Fix cleanup fallout I missed the CONFIG_GENERIC_PENDING_IRQ dependency in the affinity related functions and the IRQ_LEVEL propagation into irq_data state. Did not pop up on my main test platforms. 
:( Signed-off-by: Thomas Gleixner Tested-by: David Daney --- kernel/irq/chip.c | 2 ++ kernel/irq/manage.c | 16 ++++++++-------- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 451d1e81c15c..03099d521f5e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -734,6 +734,8 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irqd_set(&desc->irq_data, IRQD_PER_CPU); if (irq_settings_can_move_pcntxt(desc)) irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT); + if (irq_settings_is_level(desc)) + irqd_set(&desc->irq_data, IRQD_LEVEL); irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 805c6a0ce780..acf540768b8f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -112,13 +112,13 @@ void irq_set_thread_affinity(struct irq_desc *desc) } #ifdef CONFIG_GENERIC_PENDING_IRQ -static inline bool irq_can_move_pcntxt(struct irq_desc *desc) +static inline bool irq_can_move_pcntxt(struct irq_data *data) { - return irq_settings_can_move_pcntxt(desc); + return irqd_can_move_in_process_context(data); } -static inline bool irq_move_pending(struct irq_desc *desc) +static inline bool irq_move_pending(struct irq_data *data) { - return irqd_is_setaffinity_pending(&desc->irq_data); + return irqd_is_setaffinity_pending(data); } static inline void irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) @@ -131,8 +131,8 @@ irq_get_pending(struct cpumask *mask, struct irq_desc *desc) cpumask_copy(mask, desc->pending_mask); } #else -static inline bool irq_can_move_pcntxt(struct irq_desc *desc) { return true; } -static inline bool irq_move_pending(struct irq_desc *desc) { return false; } +static inline bool irq_can_move_pcntxt(struct irq_data *data) { return true; } +static inline bool irq_move_pending(struct irq_desc *data) { return false; } static inline void irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { } static inline void @@ -148,7 +148,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) if (!chip || !chip->irq_set_affinity) return -EINVAL; - if (irqd_can_move_in_process_context(data)) { + if (irq_can_move_pcntxt(data)) { ret = chip->irq_set_affinity(data, mask, false); switch (ret) { case IRQ_SET_MASK_OK: @@ -218,7 +218,7 @@ static void irq_affinity_notify(struct work_struct *work) goto out; raw_spin_lock_irqsave(&desc->lock, flags); - if (irq_move_pending(desc)) + if (irq_move_pending(&desc->irq_data)) irq_get_pending(cpumask, desc); else cpumask_copy(cpumask, desc->irq_data.affinity); -- cgit v1.2.3 From cd22c0e44b105aecd78e5f9e77abab3a1b8dc00c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 29 Mar 2011 11:36:05 +0200 Subject: genirq: Fix harmless typo The late night fixup missed to convert the data type from irq_desc to irq_data, which results in a harmless but annoying warning. 
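(ed: For context, the leftover irq_desc type in the !CONFIG_GENERIC_PENDING_IRQ stub clashes with callers that now pass &desc->irq_data, which the compiler reports as an incompatible pointer type. Illustrative sketch; the real one-line hunk follows.)

	/* stub before this fix: takes the wrong argument type */
	static inline bool irq_move_pending(struct irq_desc *data) { return false; }

	/* caller already converted by the previous patch: */
	if (irq_move_pending(&desc->irq_data))		/* warns here */
		irq_get_pending(cpumask, desc);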
Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index acf540768b8f..b3bf54f7d977 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -132,7 +132,7 @@ irq_get_pending(struct cpumask *mask, struct irq_desc *desc) } #else static inline bool irq_can_move_pcntxt(struct irq_data *data) { return true; } -static inline bool irq_move_pending(struct irq_desc *data) { return false; } +static inline bool irq_move_pending(struct irq_data *data) { return false; } static inline void irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { } static inline void -- cgit v1.2.3 From a6e120ed42004d6051fff7c3233e2554f12ccecb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 25 Mar 2011 22:20:51 +0100 Subject: alpha: Use generic show_interrupts() The only subtle difference is that alpha uses ACTUAL_NR_IRQS and prints the IRQF_DISABLED flag. Change the generic implementation to deal with ACTUAL_NR_IRQS if defined. The IRQF_DISABLED printing is pointless, as we nowadays run all interrupts with irqs disabled. Signed-off-by: Thomas Gleixner --- kernel/irq/proc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 626d092eed9a..dd201bd35103 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -364,6 +364,10 @@ int __weak arch_show_interrupts(struct seq_file *p, int prec) return 0; } +#ifndef ACTUAL_NR_IRQS +# define ACTUAL_NR_IRQS nr_irqs +#endif + int show_interrupts(struct seq_file *p, void *v) { static int prec; @@ -373,10 +377,10 @@ int show_interrupts(struct seq_file *p, void *v) struct irqaction *action; struct irq_desc *desc; - if (i > nr_irqs) + if (i > ACTUAL_NR_IRQS) return 0; - if (i == nr_irqs) + if (i == ACTUAL_NR_IRQS) return arch_show_interrupts(p, prec); /* print header and calculate the width of the first column */ -- cgit v1.2.3 From 0c6f8a8b917ad361319c8ace3e9f28e69bfdb4c1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 28 Mar 2011 13:32:20 +0200 Subject: genirq: Remove compat code Signed-off-by: Thomas Gleixner --- kernel/irq/Kconfig | 4 -- kernel/irq/autoprobe.c | 4 +- kernel/irq/chip.c | 129 ------------------------------------------------- kernel/irq/compat.h | 72 --------------------------- kernel/irq/debug.h | 2 +- kernel/irq/dummychip.c | 9 ---- kernel/irq/handle.c | 3 -- kernel/irq/internals.h | 10 ---- kernel/irq/manage.c | 10 +--- kernel/irq/resend.c | 1 - kernel/irq/settings.h | 55 ++++++++------------- kernel/irq/spurious.c | 1 - 12 files changed, 24 insertions(+), 276 deletions(-) delete mode 100644 kernel/irq/compat.h (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 72606ba10b14..a69c333f78e4 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -10,10 +10,6 @@ menu "IRQ subsystem" config GENERIC_HARDIRQS def_bool y -# Select this to disable the deprecated stuff -config GENERIC_HARDIRQS_NO_DEPRECATED - bool - config GENERIC_HARDIRQS_NO_COMPAT bool diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 394784c57060..342d8f44e401 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -70,10 +70,8 @@ unsigned long probe_irq_on(void) raw_spin_lock_irq(&desc->lock); if (!desc->action && irq_settings_can_probe(desc)) { desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; - if (irq_startup(desc)) { - irq_compat_set_pending(desc); + if (irq_startup(desc)) desc->istate 
|= IRQS_PENDING; - } } raw_spin_unlock_irq(&desc->lock); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 03099d521f5e..616ec1c6b06f 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -34,7 +34,6 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip) if (!chip) chip = &no_irq_chip; - irq_chip_set_defaults(chip); desc->irq_data.chip = chip; irq_put_desc_unlock(desc, flags); /* @@ -141,25 +140,21 @@ EXPORT_SYMBOL_GPL(irq_get_irq_data); static void irq_state_clr_disabled(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED); - irq_compat_clr_disabled(desc); } static void irq_state_set_disabled(struct irq_desc *desc) { irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); - irq_compat_set_disabled(desc); } static void irq_state_clr_masked(struct irq_desc *desc) { irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); - irq_compat_clr_masked(desc); } static void irq_state_set_masked(struct irq_desc *desc) { irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); - irq_compat_set_masked(desc); } int irq_startup(struct irq_desc *desc) @@ -209,126 +204,6 @@ void irq_disable(struct irq_desc *desc) } } -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED -/* Temporary migration helpers */ -static void compat_irq_mask(struct irq_data *data) -{ - data->chip->mask(data->irq); -} - -static void compat_irq_unmask(struct irq_data *data) -{ - data->chip->unmask(data->irq); -} - -static void compat_irq_ack(struct irq_data *data) -{ - data->chip->ack(data->irq); -} - -static void compat_irq_mask_ack(struct irq_data *data) -{ - data->chip->mask_ack(data->irq); -} - -static void compat_irq_eoi(struct irq_data *data) -{ - data->chip->eoi(data->irq); -} - -static void compat_irq_enable(struct irq_data *data) -{ - data->chip->enable(data->irq); -} - -static void compat_irq_disable(struct irq_data *data) -{ - data->chip->disable(data->irq); -} - -static void compat_irq_shutdown(struct irq_data *data) -{ - data->chip->shutdown(data->irq); -} - -static unsigned int compat_irq_startup(struct irq_data *data) -{ - return data->chip->startup(data->irq); -} - -static int compat_irq_set_affinity(struct irq_data *data, - const struct cpumask *dest, bool force) -{ - return data->chip->set_affinity(data->irq, dest); -} - -static int compat_irq_set_type(struct irq_data *data, unsigned int type) -{ - return data->chip->set_type(data->irq, type); -} - -static int compat_irq_set_wake(struct irq_data *data, unsigned int on) -{ - return data->chip->set_wake(data->irq, on); -} - -static int compat_irq_retrigger(struct irq_data *data) -{ - return data->chip->retrigger(data->irq); -} - -static void compat_bus_lock(struct irq_data *data) -{ - data->chip->bus_lock(data->irq); -} - -static void compat_bus_sync_unlock(struct irq_data *data) -{ - data->chip->bus_sync_unlock(data->irq); -} -#endif - -/* - * Fixup enable/disable function pointers - */ -void irq_chip_set_defaults(struct irq_chip *chip) -{ -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED - if (chip->enable) - chip->irq_enable = compat_irq_enable; - if (chip->disable) - chip->irq_disable = compat_irq_disable; - if (chip->shutdown) - chip->irq_shutdown = compat_irq_shutdown; - if (chip->startup) - chip->irq_startup = compat_irq_startup; - if (!chip->end) - chip->end = dummy_irq_chip.end; - if (chip->bus_lock) - chip->irq_bus_lock = compat_bus_lock; - if (chip->bus_sync_unlock) - chip->irq_bus_sync_unlock = compat_bus_sync_unlock; - if (chip->mask) - chip->irq_mask = compat_irq_mask; - if (chip->unmask) - chip->irq_unmask = compat_irq_unmask; - if (chip->ack) - chip->irq_ack 
= compat_irq_ack; - if (chip->mask_ack) - chip->irq_mask_ack = compat_irq_mask_ack; - if (chip->eoi) - chip->irq_eoi = compat_irq_eoi; - if (chip->set_affinity) - chip->irq_set_affinity = compat_irq_set_affinity; - if (chip->set_type) - chip->irq_set_type = compat_irq_set_type; - if (chip->set_wake) - chip->irq_set_wake = compat_irq_set_wake; - if (chip->retrigger) - chip->irq_retrigger = compat_irq_retrigger; -#endif -} - static inline void mask_ack_irq(struct irq_desc *desc) { if (desc->irq_data.chip->irq_mask_ack) @@ -381,7 +256,6 @@ void handle_nested_irq(unsigned int irq) if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) goto out_unlock; - irq_compat_set_progress(desc); irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock_irq(&desc->lock); @@ -391,7 +265,6 @@ void handle_nested_irq(unsigned int irq) raw_spin_lock_irq(&desc->lock); irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); - irq_compat_clr_progress(desc); out_unlock: raw_spin_unlock_irq(&desc->lock); @@ -514,7 +387,6 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) * then mask it and get out of here: */ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { - irq_compat_set_pending(desc); desc->istate |= IRQS_PENDING; mask_irq(desc); goto out; @@ -567,7 +439,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) if (unlikely(irqd_irq_disabled(&desc->irq_data) || irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { if (!irq_check_poll(desc)) { - irq_compat_set_pending(desc); desc->istate |= IRQS_PENDING; mask_ack_irq(desc); goto out_unlock; diff --git a/kernel/irq/compat.h b/kernel/irq/compat.h deleted file mode 100644 index 6bbaf66aca85..000000000000 --- a/kernel/irq/compat.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Compat layer for transition period - */ -#ifndef CONFIG_GENERIC_HARDIRQS_NO_COMPAT -static inline void irq_compat_set_progress(struct irq_desc *desc) -{ - desc->status |= IRQ_INPROGRESS; -} - -static inline void irq_compat_clr_progress(struct irq_desc *desc) -{ - desc->status &= ~IRQ_INPROGRESS; -} -static inline void irq_compat_set_disabled(struct irq_desc *desc) -{ - desc->status |= IRQ_DISABLED; -} -static inline void irq_compat_clr_disabled(struct irq_desc *desc) -{ - desc->status &= ~IRQ_DISABLED; -} -static inline void irq_compat_set_pending(struct irq_desc *desc) -{ - desc->status |= IRQ_PENDING; -} - -static inline void irq_compat_clr_pending(struct irq_desc *desc) -{ - desc->status &= ~IRQ_PENDING; -} -static inline void irq_compat_set_masked(struct irq_desc *desc) -{ - desc->status |= IRQ_MASKED; -} - -static inline void irq_compat_clr_masked(struct irq_desc *desc) -{ - desc->status &= ~IRQ_MASKED; -} -static inline void irq_compat_set_move_pending(struct irq_desc *desc) -{ - desc->status |= IRQ_MOVE_PENDING; -} - -static inline void irq_compat_clr_move_pending(struct irq_desc *desc) -{ - desc->status &= ~IRQ_MOVE_PENDING; -} -static inline void irq_compat_set_affinity(struct irq_desc *desc) -{ - desc->status |= IRQ_AFFINITY_SET; -} - -static inline void irq_compat_clr_affinity(struct irq_desc *desc) -{ - desc->status &= ~IRQ_AFFINITY_SET; -} -#else -static inline void irq_compat_set_progress(struct irq_desc *desc) { } -static inline void irq_compat_clr_progress(struct irq_desc *desc) { } -static inline void irq_compat_set_disabled(struct irq_desc *desc) { } -static inline void irq_compat_clr_disabled(struct irq_desc *desc) { } -static inline void irq_compat_set_pending(struct irq_desc *desc) { } -static inline void irq_compat_clr_pending(struct 
irq_desc *desc) { } -static inline void irq_compat_set_masked(struct irq_desc *desc) { } -static inline void irq_compat_clr_masked(struct irq_desc *desc) { } -static inline void irq_compat_set_move_pending(struct irq_desc *desc) { } -static inline void irq_compat_clr_move_pending(struct irq_desc *desc) { } -static inline void irq_compat_set_affinity(struct irq_desc *desc) { } -static inline void irq_compat_clr_affinity(struct irq_desc *desc) { } -#endif - diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h index a0bd875ba3d5..306cba37e9a5 100644 --- a/kernel/irq/debug.h +++ b/kernel/irq/debug.h @@ -4,7 +4,7 @@ #include -#define P(f) if (desc->status & f) printk("%14s set\n", #f) +#define P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) #define PS(f) if (desc->istate & f) printk("%14s set\n", #f) /* FIXME */ #define PD(f) do { } while (0) diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c index 20dc5474947e..b5fcd96c7102 100644 --- a/kernel/irq/dummychip.c +++ b/kernel/irq/dummychip.c @@ -31,13 +31,6 @@ static unsigned int noop_ret(struct irq_data *data) return 0; } -#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED -static void compat_noop(unsigned int irq) { } -#define END_INIT .end = compat_noop -#else -#define END_INIT -#endif - /* * Generic no controller implementation */ @@ -48,7 +41,6 @@ struct irq_chip no_irq_chip = { .irq_enable = noop, .irq_disable = noop, .irq_ack = ack_bad, - END_INIT }; /* @@ -64,5 +56,4 @@ struct irq_chip dummy_irq_chip = { .irq_ack = noop, .irq_mask = noop, .irq_unmask = noop, - END_INIT }; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 1a2fb77f2fd6..90cb55f6d7eb 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -175,9 +175,7 @@ irqreturn_t handle_irq_event(struct irq_desc *desc) struct irqaction *action = desc->action; irqreturn_t ret; - irq_compat_clr_pending(desc); desc->istate &= ~IRQS_PENDING; - irq_compat_set_progress(desc); irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock(&desc->lock); @@ -185,6 +183,5 @@ irqreturn_t handle_irq_event(struct irq_desc *desc) raw_spin_lock(&desc->lock); irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); - irq_compat_clr_progress(desc); return ret; } diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 6b8b9713e28d..6546431447d7 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -15,10 +15,6 @@ #define istate core_internal_state__do_not_mess_with_it -#ifdef CONFIG_GENERIC_HARDIRQS_NO_COMPAT -# define status status_use_accessors -#endif - extern int noirqdebug; /* @@ -61,15 +57,11 @@ enum { IRQS_SUSPENDED = 0x00000800, }; -#include "compat.h" #include "debug.h" #include "settings.h" #define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) -/* Set default functions for irq_chip structures: */ -extern void irq_chip_set_defaults(struct irq_chip *chip); - extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags); extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); @@ -156,13 +148,11 @@ irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) static inline void irqd_set_move_pending(struct irq_data *d) { d->state_use_accessors |= IRQD_SETAFFINITY_PENDING; - irq_compat_set_move_pending(irq_data_to_desc(d)); } static inline void irqd_clr_move_pending(struct irq_data *d) { d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING; - irq_compat_clr_move_pending(irq_data_to_desc(d)); } static inline void irqd_clear(struct irq_data *d, unsigned int mask) diff 
--git a/kernel/irq/manage.c b/kernel/irq/manage.c index b3bf54f7d977..12a80fdae11c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -166,7 +166,6 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) kref_get(&desc->affinity_notify->kref); schedule_work(&desc->affinity_notify->work); } - irq_compat_set_affinity(desc); irqd_set(data, IRQD_AFFINITY_SET); return ret; @@ -297,10 +296,8 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) if (cpumask_intersects(desc->irq_data.affinity, cpu_online_mask)) set = desc->irq_data.affinity; - else { - irq_compat_clr_affinity(desc); + else irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); - } } cpumask_and(mask, cpu_online_mask, set); @@ -587,8 +584,6 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, irqd_set(&desc->irq_data, IRQD_LEVEL); } - if (chip != desc->irq_data.chip) - irq_chip_set_defaults(desc->irq_data.chip); ret = 0; break; default: @@ -785,7 +780,6 @@ static int irq_thread(void *data) * but AFAICT IRQS_PENDING should be fine as it * retriggers the interrupt itself --- tglx */ - irq_compat_set_pending(desc); desc->istate |= IRQS_PENDING; raw_spin_unlock_irq(&desc->lock); } else { @@ -981,8 +975,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) new->thread_mask = 1 << ffz(thread_mask); if (!shared) { - irq_chip_set_defaults(desc->irq_data.chip); - init_waitqueue_head(&desc->wait_for_threads); /* Setup the type (level, edge polarity) if configured: */ diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index ad683a99b1ec..14dd5761e8c9 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -65,7 +65,6 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) if (desc->istate & IRQS_REPLAY) return; if (desc->istate & IRQS_PENDING) { - irq_compat_clr_pending(desc); desc->istate &= ~IRQS_PENDING; desc->istate |= IRQS_REPLAY; diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 0227ad358272..0d91730b6330 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -15,17 +15,8 @@ enum { _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; -#define IRQ_INPROGRESS GOT_YOU_MORON -#define IRQ_REPLAY GOT_YOU_MORON -#define IRQ_WAITING GOT_YOU_MORON -#define IRQ_DISABLED GOT_YOU_MORON -#define IRQ_PENDING GOT_YOU_MORON -#define IRQ_MASKED GOT_YOU_MORON -#define IRQ_WAKEUP GOT_YOU_MORON -#define IRQ_MOVE_PENDING GOT_YOU_MORON #define IRQ_PER_CPU GOT_YOU_MORON #define IRQ_NO_BALANCING GOT_YOU_MORON -#define IRQ_AFFINITY_SET GOT_YOU_MORON #define IRQ_LEVEL GOT_YOU_MORON #define IRQ_NOPROBE GOT_YOU_MORON #define IRQ_NOREQUEST GOT_YOU_MORON @@ -37,102 +28,98 @@ enum { static inline void irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set) { - desc->status &= ~(clr & _IRQF_MODIFY_MASK); - desc->status |= (set & _IRQF_MODIFY_MASK); + desc->status_use_accessors &= ~(clr & _IRQF_MODIFY_MASK); + desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK); } static inline bool irq_settings_is_per_cpu(struct irq_desc *desc) { - return desc->status & _IRQ_PER_CPU; + return desc->status_use_accessors & _IRQ_PER_CPU; } static inline void irq_settings_set_per_cpu(struct irq_desc *desc) { - desc->status |= _IRQ_PER_CPU; + desc->status_use_accessors |= _IRQ_PER_CPU; } static inline void irq_settings_set_no_balancing(struct irq_desc *desc) { - desc->status |= _IRQ_NO_BALANCING; + desc->status_use_accessors |= _IRQ_NO_BALANCING; } static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc) { - return desc->status 
& _IRQ_NO_BALANCING; + return desc->status_use_accessors & _IRQ_NO_BALANCING; } static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc) { - return desc->status & IRQ_TYPE_SENSE_MASK; + return desc->status_use_accessors & IRQ_TYPE_SENSE_MASK; } static inline void irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask) { - desc->status &= ~IRQ_TYPE_SENSE_MASK; - desc->status |= mask & IRQ_TYPE_SENSE_MASK; + desc->status_use_accessors &= ~IRQ_TYPE_SENSE_MASK; + desc->status_use_accessors |= mask & IRQ_TYPE_SENSE_MASK; } static inline bool irq_settings_is_level(struct irq_desc *desc) { - return desc->status & _IRQ_LEVEL; + return desc->status_use_accessors & _IRQ_LEVEL; } static inline void irq_settings_clr_level(struct irq_desc *desc) { - desc->status &= ~_IRQ_LEVEL; + desc->status_use_accessors &= ~_IRQ_LEVEL; } static inline void irq_settings_set_level(struct irq_desc *desc) { - desc->status |= _IRQ_LEVEL; + desc->status_use_accessors |= _IRQ_LEVEL; } static inline bool irq_settings_can_request(struct irq_desc *desc) { - return !(desc->status & _IRQ_NOREQUEST); + return !(desc->status_use_accessors & _IRQ_NOREQUEST); } static inline void irq_settings_clr_norequest(struct irq_desc *desc) { - desc->status &= ~_IRQ_NOREQUEST; + desc->status_use_accessors &= ~_IRQ_NOREQUEST; } static inline void irq_settings_set_norequest(struct irq_desc *desc) { - desc->status |= _IRQ_NOREQUEST; + desc->status_use_accessors |= _IRQ_NOREQUEST; } static inline bool irq_settings_can_probe(struct irq_desc *desc) { - return !(desc->status & _IRQ_NOPROBE); + return !(desc->status_use_accessors & _IRQ_NOPROBE); } static inline void irq_settings_clr_noprobe(struct irq_desc *desc) { - desc->status &= ~_IRQ_NOPROBE; + desc->status_use_accessors &= ~_IRQ_NOPROBE; } static inline void irq_settings_set_noprobe(struct irq_desc *desc) { - desc->status |= _IRQ_NOPROBE; + desc->status_use_accessors |= _IRQ_NOPROBE; } static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc) { - return desc->status & _IRQ_MOVE_PCNTXT; + return desc->status_use_accessors & _IRQ_MOVE_PCNTXT; } static inline bool irq_settings_can_autoenable(struct irq_desc *desc) { - return !(desc->status & _IRQ_NOAUTOEN); + return !(desc->status_use_accessors & _IRQ_NOAUTOEN); } static inline bool irq_settings_is_nested_thread(struct irq_desc *desc) { - return desc->status & _IRQ_NESTED_THREAD; + return desc->status_use_accessors & _IRQ_NESTED_THREAD; } - -/* Nothing should touch desc->status from now on */ -#undef status -#define status USE_THE_PROPER_WRAPPERS_YOU_MORON diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 83f4799f46be..dfbd550401b2 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -93,7 +93,6 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force) * Already running: If it is shared get the other * CPU to go looking for our mystery interrupt too */ - irq_compat_set_pending(desc); desc->istate |= IRQS_PENDING; goto out; } -- cgit v1.2.3 From 851d7cf647e0d31668eb5dc496f7698a2f6136b4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 29 Mar 2011 02:51:13 +0200 Subject: genirq: Remove move_*irq leftovers All users converted to new interface. 
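(ed: Both helpers were one-line wrappers, so a hypothetical remaining caller converts as sketched below, mirroring the wrapper bodies deleted in the hunk that follows.)

	/* old */
	move_masked_irq(irq);
	move_native_irq(irq);

	/* new: pass the irq_data directly */
	irq_move_masked_irq(irq_get_irq_data(irq));
	irq_move_irq(irq_get_irq_data(irq));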
Signed-off-by: Thomas Gleixner --- kernel/irq/migration.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index e33d9c8d5089..bc6194698dfd 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -53,11 +53,6 @@ void irq_move_masked_irq(struct irq_data *idata) cpumask_clear(desc->pending_mask); } -void move_masked_irq(int irq) -{ - irq_move_masked_irq(irq_get_irq_data(irq)); -} - void irq_move_irq(struct irq_data *idata) { bool masked; @@ -80,8 +75,3 @@ void irq_move_irq(struct irq_data *idata) if (!masked) idata->chip->irq_unmask(idata); } - -void move_native_irq(int irq) -{ - irq_move_irq(irq_get_irq_data(irq)); -} -- cgit v1.2.3 From 353c8ed44f8f7414be614685ed29d1e23f5fa76b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 29 Mar 2011 22:18:28 +0200 Subject: genirq: Fix misnamed label in handle_edge_eoi_irq Reported-by: michael@ellerman.id.au Signed-off-by: Thomas Gleixner Cc: linuxppc-dev@lists.ozlabs.org --- kernel/irq/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 616ec1c6b06f..1dafc8652bd8 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -514,7 +514,7 @@ void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc) } while ((desc->istate & IRQS_PENDING) && !irqd_irq_disabled(&desc->irq_data)); -out_unlock: +out_eoi: chip->irq_eoi(&desc->irq_data); raw_spin_unlock(&desc->lock); } -- cgit v1.2.3 From 78c89825649a9a5ed526c507603196f467d781a5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Mar 2011 14:13:23 +0200 Subject: genirq: Remove the now obsolete config options and select statements Signed-off-by: Thomas Gleixner --- kernel/irq/Kconfig | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index a69c333f78e4..c574f9a12c48 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -10,9 +10,6 @@ menu "IRQ subsystem" config GENERIC_HARDIRQS def_bool y -config GENERIC_HARDIRQS_NO_COMPAT - bool - # Options selectable by the architecture code # Make sparse irq Kconfig switch below available -- cgit v1.2.3 From a51e91981870d013fcfcc08b0117997edbcbc7a7 Mon Sep 17 00:00:00 2001 From: Dario Faggioli Date: Thu, 24 Mar 2011 14:00:18 +0100 Subject: sched: Leave sched_setscheduler() earlier if possible, do not disturb SCHED_FIFO tasks sched_setscheduler() (in sched.c) is called in order of changing the scheduling policy and/or the real-time priority of a task. Thus, if we find out that neither of those are actually being modified, it is possible to return earlier and save the overhead of a full deactivate+activate cycle of the task in question. Beside that, if we have more than one SCHED_FIFO task with the same priority on the same rq (which means they share the same priority queue) having one of them changing its position in the priority queue because of a sched_setscheduler (as it happens by means of the deactivate+activate) that does not actually change the priority violates POSIX which states, for SCHED_FIFO: "If a thread whose policy or priority has been modified by pthread_setschedprio() is a running thread or is runnable, the effect on its position in the thread list depends on the direction of the modification, as follows: a. <...> b. If the priority is unchanged, the thread does not change position in the thread list. c. 
<...>" http://pubs.opengroup.org/onlinepubs/009695399/functions/xsh_chap02_08.html (ed: And the POSIX specification here does, briefly and somewhat unexpectedly, match what common sense tells us as well. ) Signed-off-by: Dario Faggioli Signed-off-by: Peter Zijlstra LKML-Reference: <1300971618.3960.82.camel@Palantir> Signed-off-by: Ingo Molnar --- kernel/sched.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f592ce6f8616..a8845516ace6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5011,6 +5011,17 @@ recheck: return -EINVAL; } + /* + * If not changing anything there's no need to proceed further: + */ + if (unlikely(policy == p->policy && (!rt_policy(policy) || + param->sched_priority == p->rt_priority))) { + + __task_rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + return 0; + } + #ifdef CONFIG_RT_GROUP_SCHED if (user) { /* -- cgit v1.2.3 From 3436ae1298cb22d722a6520fc97f112dd767a9e1 Mon Sep 17 00:00:00 2001 From: Sisir Koppaka Date: Sat, 26 Mar 2011 18:22:55 +0530 Subject: sched: Fix rebalance interval calculation The interval for checking scheduling domains if they are due to be balanced currently depends on boot state NR_CPUS, which may not accurately reflect the number of online CPUs at the time of check. Thus replace NR_CPUS with num_online_cpus(). (ed: Should only affect those who set NR_CPUS really high, such as 4096 or so :-) Signed-off-by: Sisir Koppaka Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3f7ec9e27ee1..c7ec5c8e7b44 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -22,6 +22,7 @@ #include #include +#include /* * Targeted preemption latency for CPU-bound tasks: @@ -3850,8 +3851,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) interval = msecs_to_jiffies(interval); if (unlikely(!interval)) interval = 1; - if (interval > HZ*NR_CPUS/10) - interval = HZ*NR_CPUS/10; + if (interval > HZ*num_online_cpus()/10) + interval = HZ*num_online_cpus()/10; need_serialize = sd->flags & SD_SERIALIZE; -- cgit v1.2.3 From 20443384fe090c5f8aeb016e7e85659c5bbdd69f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 31 Mar 2011 03:33:29 +0200 Subject: perf: Rebase max unprivileged mlock threshold on top of page size Ensure we allow 512 kiB + 1 page for user control without assuming a 4096 bytes page size. 
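(ed: Quick arithmetic check of the new default, in 'free' kB per user as computed by the hunk below:)

	/* sysctl_perf_event_mlock = 512 + (PAGE_SIZE / 1024) */
	/*  4 KiB pages: 512 +  4 = 516 kB, same as the old hard-coded value    */
	/* 64 KiB pages: 512 + 64 = 576 kB, so the user control page still fits */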
Reported-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Stephane Eranian Cc: LKML-Reference: <1301535209-9679-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index c75925c4d1e2..261690923ffb 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -145,8 +145,8 @@ static struct srcu_struct pmus_srcu; */ int sysctl_perf_event_paranoid __read_mostly = 1; -/* Minimum for 128 pages + 1 for the user control page */ -int sysctl_perf_event_mlock __read_mostly = 516; /* 'free' kb per user */ +/* Minimum for 512 kiB + 1 user control page */ +int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ /* * max perf event sample rate -- cgit v1.2.3 From fd1edb3aa2c1d92618d8f0c6d15d44ea41fcac6a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Mar 2011 13:13:56 +0200 Subject: perf: Fix task_struct reference leak sys_perf_event_open() had an imbalance in the number of task refs it took causing memory leakage Cc: Jiri Olsa Cc: Oleg Nesterov Cc: stable@kernel.org # .37+ Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 261690923ffb..27960f114efd 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -6531,6 +6531,11 @@ SYSCALL_DEFINE5(perf_event_open, goto err_alloc; } + if (task) { + put_task_struct(task); + task = NULL; + } + /* * Look up the group leader (we will attach this event to it): */ -- cgit v1.2.3 From 25985edcedea6396277003854657b5f3cb31a628 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Wed, 30 Mar 2011 22:57:33 -0300 Subject: Fix common misspellings Fixes generated by 'codespell' and manually reviewed. 
Signed-off-by: Lucas De Marchi --- kernel/audit_tree.c | 2 +- kernel/auditsc.c | 2 +- kernel/cgroup.c | 2 +- kernel/cpu.c | 2 +- kernel/debug/debug_core.c | 2 +- kernel/debug/kdb/kdb_main.c | 6 +++--- kernel/debug/kdb/kdb_support.c | 2 +- kernel/exit.c | 2 +- kernel/irq/chip.c | 2 +- kernel/irq/migration.c | 2 +- kernel/kexec.c | 6 +++--- kernel/kthread.c | 2 +- kernel/latencytop.c | 2 +- kernel/lockdep.c | 4 ++-- kernel/module.c | 6 +++--- kernel/mutex.c | 2 +- kernel/padata.c | 8 ++++---- kernel/params.c | 2 +- kernel/posix-cpu-timers.c | 2 +- kernel/posix-timers.c | 2 +- kernel/power/main.c | 2 +- kernel/sched.c | 6 +++--- kernel/sched_autogroup.c | 2 +- kernel/sched_fair.c | 2 +- kernel/sched_rt.c | 4 ++-- kernel/signal.c | 2 +- kernel/softirq.c | 2 +- kernel/time/jiffies.c | 2 +- kernel/time/timer_stats.c | 2 +- kernel/trace/ftrace.c | 4 ++-- kernel/trace/ring_buffer.c | 4 ++-- kernel/trace/trace.c | 2 +- kernel/trace/trace_clock.c | 2 +- kernel/trace/trace_entries.h | 2 +- kernel/trace/trace_functions_graph.c | 2 +- kernel/trace/trace_irqsoff.c | 2 +- kernel/trace/trace_kprobe.c | 2 +- kernel/user-return-notifier.c | 2 +- kernel/wait.c | 2 +- kernel/workqueue.c | 2 +- 40 files changed, 55 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 37b2bea170c8..e99dda04b126 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -607,7 +607,7 @@ void audit_trim_trees(void) spin_lock(&hash_lock); list_for_each_entry(node, &tree->chunks, list) { struct audit_chunk *chunk = find_chunk(node); - /* this could be NULL if the watch is dieing else where... */ + /* this could be NULL if the watch is dying else where... */ struct inode *inode = chunk->mark.i.inode; node->index |= 1U<<31; if (iterate_mounts(compare_root, inode, root_mnt)) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index f49a0318c2ed..b33513a08beb 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1011,7 +1011,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, /* * to_send and len_sent accounting are very loose estimates. We aren't * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being - * within about 500 bytes (next page boundry) + * within about 500 bytes (next page boundary) * * why snprintf? an int is up to 12 digits long. if we just assumed when * logging that a[%d]= was going to be 16 characters long we would be wasting diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e31b220a743d..25c7eb52de1a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -157,7 +157,7 @@ struct css_id { }; /* - * cgroup_event represents events which userspace want to recieve. + * cgroup_event represents events which userspace want to receive. */ struct cgroup_event { /* diff --git a/kernel/cpu.c b/kernel/cpu.c index c95fc4df0faa..12b7458f23b1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -126,7 +126,7 @@ static void cpu_hotplug_done(void) #else /* #if CONFIG_HOTPLUG_CPU */ static void cpu_hotplug_begin(void) {} static void cpu_hotplug_done(void) {} -#endif /* #esle #if CONFIG_HOTPLUG_CPU */ +#endif /* #else #if CONFIG_HOTPLUG_CPU */ /* Need to know about CPUs going up/down? 
*/ int __ref register_cpu_notifier(struct notifier_block *nb) diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index cefd4a11f6d9..bad6786dee88 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -538,7 +538,7 @@ return_normal: /* * For single stepping, try to only enter on the processor - * that was single stepping. To gaurd against a deadlock, the + * that was single stepping. To guard against a deadlock, the * kernel will only try for the value of sstep_tries before * giving up and continuing on. */ diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 6bc6e3bc4f9c..be14779bcef6 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -441,9 +441,9 @@ static int kdb_check_regs(void) * symbol name, and offset to the caller. * * The argument may consist of a numeric value (decimal or - * hexidecimal), a symbol name, a register name (preceeded by the + * hexidecimal), a symbol name, a register name (preceded by the * percent sign), an environment variable with a numeric value - * (preceeded by a dollar sign) or a simple arithmetic expression + * (preceded by a dollar sign) or a simple arithmetic expression * consisting of a symbol name, +/-, and a numeric constant value * (offset). * Parameters: @@ -1335,7 +1335,7 @@ void kdb_print_state(const char *text, int value) * error The hardware-defined error code * reason2 kdb's current reason code. * Initially error but can change - * acording to kdb state. + * according to kdb state. * db_result Result code from break or debug point. * regs The exception frame at time of fault/breakpoint. * should always be valid. diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 6b2485dcb050..5532dd37aa86 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -545,7 +545,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size) * Mask for process state. * Notes: * The mask folds data from several sources into a single long value, so - * be carefull not to overlap the bits. TASK_* bits are in the LSB, + * be careful not to overlap the bits. TASK_* bits are in the LSB, * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there * is no overlap between TASK_* and EXIT_* but that may not always be * true, so EXIT_* bits are shifted left 16 bits before being stored in diff --git a/kernel/exit.c b/kernel/exit.c index 6a488ad2dce5..f5d2f63bae0b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -841,7 +841,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) /* Let father know we died * * Thread signals are configurable, but you aren't going to use - * that to send signals to arbitary processes. + * that to send signals to arbitrary processes. * That stops right now. * * If the parent exec id doesn't match the exec id we saved diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 1dafc8652bd8..4af1e2b244cb 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -415,7 +415,7 @@ out: * @desc: the interrupt description structure for this irq * * Interrupt occures on the falling and/or rising edge of a hardware - * signal. The occurence is latched into the irq controller hardware + * signal. The occurrence is latched into the irq controller hardware * and must be acked in order to be reenabled. After the ack another * interrupt can happen on the same source even before the first one * is handled by the associated event handler. 
If this happens it diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index bc6194698dfd..47420908fba0 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -35,7 +35,7 @@ void irq_move_masked_irq(struct irq_data *idata) * do the disable, re-program, enable sequence. * This is *not* particularly important for level triggered * but in a edge trigger case, we might be setting rte - * when an active trigger is comming in. This could + * when an active trigger is coming in. This could * cause some ioapics to mal-function. * Being paranoid i guess! * diff --git a/kernel/kexec.c b/kernel/kexec.c index ec19b92c7ebd..e7e3d9788dc3 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -144,7 +144,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, /* Initialize the list of destination pages */ INIT_LIST_HEAD(&image->dest_pages); - /* Initialize the list of unuseable pages */ + /* Initialize the list of unusable pages */ INIT_LIST_HEAD(&image->unuseable_pages); /* Read in the segments */ @@ -454,7 +454,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image, /* Deal with the destination pages I have inadvertently allocated. * * Ideally I would convert multi-page allocations into single - * page allocations, and add everyting to image->dest_pages. + * page allocations, and add everything to image->dest_pages. * * For now it is simpler to just free the pages. */ @@ -602,7 +602,7 @@ static void kimage_free_extra_pages(struct kimage *image) /* Walk through and free any extra destination pages I may have */ kimage_free_page_list(&image->dest_pages); - /* Walk through and free any unuseable pages I have cached */ + /* Walk through and free any unusable pages I have cached */ kimage_free_page_list(&image->unuseable_pages); } diff --git a/kernel/kthread.c b/kernel/kthread.c index 684ab3f7dd72..3b34d2732bce 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -139,7 +139,7 @@ static void create_kthread(struct kthread_create_info *create) * in @node, to get NUMA affinity for kthread stack, or else give -1. * When woken, the thread will run @threadfn() with @data as its * argument. @threadfn() can either call do_exit() directly if it is a - * standalone thread for which noone will call kthread_stop(), or + * standalone thread for which no one will call kthread_stop(), or * return when 'kthread_should_stop()' is true (which means * kthread_stop() has been called). The return value should be zero * or a negative error number; it will be passed to kthread_stop(). diff --git a/kernel/latencytop.c b/kernel/latencytop.c index ee74b35e528d..376066e10413 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -153,7 +153,7 @@ static inline void store_stacktrace(struct task_struct *tsk, } /** - * __account_scheduler_latency - record an occured latency + * __account_scheduler_latency - record an occurred latency * @tsk - the task struct of the task hitting the latency * @usecs - the duration of the latency in microseconds * @inter - 1 if the sleep was interruptible, 0 if uninterruptible diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 0d2058da80f5..53a68956f131 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2309,7 +2309,7 @@ void trace_hardirqs_on_caller(unsigned long ip) if (unlikely(curr->hardirqs_enabled)) { /* * Neither irq nor preemption are disabled here - * so this is racy by nature but loosing one hit + * so this is racy by nature but losing one hit * in a stat is not a big deal. 
*/ __debug_atomic_inc(redundant_hardirqs_on); @@ -2620,7 +2620,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, if (!graph_lock()) return 0; /* - * Make sure we didnt race: + * Make sure we didn't race: */ if (unlikely(hlock_class(this)->usage_mask & new_mask)) { graph_unlock(); diff --git a/kernel/module.c b/kernel/module.c index 1f9f7bc56ca1..d5938a5c19c4 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -809,7 +809,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, wait_for_zero_refcount(mod); mutex_unlock(&module_mutex); - /* Final destruction now noone is using it. */ + /* Final destruction now no one is using it. */ if (mod->exit != NULL) mod->exit(); blocking_notifier_call_chain(&module_notify_list, @@ -2777,7 +2777,7 @@ static struct module *load_module(void __user *umod, mod->state = MODULE_STATE_COMING; /* Now sew it into the lists so we can get lockdep and oops - * info during argument parsing. Noone should access us, since + * info during argument parsing. No one should access us, since * strong_try_module_get() will fail. * lockdep/oops can run asynchronous, so use the RCU list insertion * function to insert in a way safe to concurrent readers. @@ -2971,7 +2971,7 @@ static const char *get_ksymbol(struct module *mod, else nextval = (unsigned long)mod->module_core+mod->core_text_size; - /* Scan for closest preceeding symbol, and next symbol. (ELF + /* Scan for closest preceding symbol, and next symbol. (ELF starts real symbols at 1). */ for (i = 1; i < mod->num_symtab; i++) { if (mod->symtab[i].st_shndx == SHN_UNDEF) diff --git a/kernel/mutex.c b/kernel/mutex.c index a5889fb28ecf..c4195fa98900 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -245,7 +245,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } __set_task_state(task, state); - /* didnt get the lock, go to sleep: */ + /* didn't get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); preempt_enable_no_resched(); schedule(); diff --git a/kernel/padata.c b/kernel/padata.c index 751019415d23..b91941df5e63 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -262,7 +262,7 @@ static void padata_reorder(struct parallel_data *pd) /* * This cpu has to do the parallel processing of the next * object. It's waiting in the cpu's parallelization queue, - * so exit imediately. + * so exit immediately. */ if (PTR_ERR(padata) == -ENODATA) { del_timer(&pd->timer); @@ -284,7 +284,7 @@ static void padata_reorder(struct parallel_data *pd) /* * The next object that needs serialization might have arrived to * the reorder queues in the meantime, we will be called again - * from the timer function if noone else cares for it. + * from the timer function if no one else cares for it. */ if (atomic_read(&pd->reorder_objects) && !(pinst->flags & PADATA_RESET)) @@ -515,7 +515,7 @@ static void __padata_stop(struct padata_instance *pinst) put_online_cpus(); } -/* Replace the internal control stucture with a new one. */ +/* Replace the internal control structure with a new one. */ static void padata_replace(struct padata_instance *pinst, struct parallel_data *pd_new) { @@ -768,7 +768,7 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) } /** - * padata_remove_cpu - remove a cpu from the one or both(serial and paralell) + * padata_remove_cpu - remove a cpu from the one or both(serial and parallel) * padata cpumasks. 
* * @pinst: padata instance diff --git a/kernel/params.c b/kernel/params.c index 0da1411222b9..7ab388a48a2e 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -95,7 +95,7 @@ static int parse_one(char *param, /* Find parameter */ for (i = 0; i < num_params; i++) { if (parameq(param, params[i].name)) { - /* Noone handled NULL, so do it here. */ + /* No one handled NULL, so do it here. */ if (!val && params[i].ops->set != param_set_bool) return -EINVAL; DEBUGP("They are equal! Calling %p\n", diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 67fea9d25d55..0791b13df7bf 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1347,7 +1347,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) /* * Now that all the timers on our list have the firing flag, - * noone will touch their list entries but us. We'll take + * no one will touch their list entries but us. We'll take * each timer's lock before clearing its firing flag, so no * timer call will interfere. */ diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 4c0124919f9a..e5498d7405c3 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -313,7 +313,7 @@ static void schedule_next_timer(struct k_itimer *timr) * restarted (i.e. we have flagged this in the sys_private entry of the * info block). * - * To protect aginst the timer going away while the interrupt is queued, + * To protect against the timer going away while the interrupt is queued, * we require that the it_requeue_pending flag be set. */ void do_schedule_next_timer(struct siginfo *info) diff --git a/kernel/power/main.c b/kernel/power/main.c index 8eaba5f27b10..de9aef8742f4 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -224,7 +224,7 @@ power_attr(state); * writing to 'state'. It first should read from 'wakeup_count' and store * the read value. Then, after carrying out its own preparations for the system * transition to a sleep state, it should write the stored value to - * 'wakeup_count'. If that fails, at least one wakeup event has occured since + * 'wakeup_count'. If that fails, at least one wakeup event has occurred since * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it * is allowed to write to 'state', but the transition will be aborted if there * are any wakeup events detected after 'wakeup_count' was written to. diff --git a/kernel/sched.c b/kernel/sched.c index f592ce6f8616..865b433fac5b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2309,7 +2309,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * Cause a process which is running on another CPU to enter * kernel-mode, without any delay. (to get signals handled.) * - * NOTE: this function doesnt have to take the runqueue lock, + * NOTE: this function doesn't have to take the runqueue lock, * because all it wants to ensure is that the remote task enters * the kernel. If the IPI races and the task has been migrated * to another CPU then no harm is done and the purpose has been @@ -4997,7 +4997,7 @@ recheck: */ raw_spin_lock_irqsave(&p->pi_lock, flags); /* - * To be able to change p->policy safely, the apropriate + * To be able to change p->policy safely, the appropriate * runqueue lock must be held. 
*/ rq = __task_rq_lock(p); @@ -5705,7 +5705,7 @@ void show_state_filter(unsigned long state_filter) do_each_thread(g, p) { /* * reset the NMI-timeout, listing all files on a slow - * console might take alot of time: + * console might take a lot of time: */ touch_nmi_watchdog(); if (!state_filter || (p->state & state_filter)) diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c index 5946ac515602..429242f3c484 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched_autogroup.c @@ -179,7 +179,7 @@ void sched_autogroup_create_attach(struct task_struct *p) struct autogroup *ag = autogroup_create(); autogroup_move_group(p, ag); - /* drop extra refrence added by autogroup_create() */ + /* drop extra reference added by autogroup_create() */ autogroup_kref_put(ag); } EXPORT_SYMBOL(sched_autogroup_create_attach); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3f7ec9e27ee1..3cb7f07887a1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3061,7 +3061,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, /* * if *imbalance is less than the average load per runnable task - * there is no gaurantee that any tasks will be moved so we'll have + * there is no guarantee that any tasks will be moved so we'll have * a think about bumping its value to force at least one task to be * moved */ diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index db308cb08b75..e7cebdc65f82 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1378,7 +1378,7 @@ retry: task = pick_next_pushable_task(rq); if (task_cpu(next_task) == rq->cpu && task == next_task) { /* - * If we get here, the task hasnt moved at all, but + * If we get here, the task hasn't moved at all, but * it has failed to push. We will not try again, * since the other cpus will pull from us when they * are ready. @@ -1488,7 +1488,7 @@ static int pull_rt_task(struct rq *this_rq) /* * We continue with the search, just in * case there's an even higher prio task - * in another runqueue. (low likelyhood + * in another runqueue. (low likelihood * but possible) */ } diff --git a/kernel/signal.c b/kernel/signal.c index 1186cf7fac77..f486d10f3b8e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1885,7 +1885,7 @@ relock: for (;;) { struct k_sigaction *ka; /* - * Tracing can induce an artifical signal and choose sigaction. + * Tracing can induce an artificial signal and choose sigaction. * The return value in @signr determines the default action, * but @info->si_signo is the signal number we will report. */ diff --git a/kernel/softirq.c b/kernel/softirq.c index 735d87095172..174f976c2874 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -567,7 +567,7 @@ static void __tasklet_hrtimer_trampoline(unsigned long data) /** * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks * @ttimer: tasklet_hrtimer which is initialized - * @function: hrtimer callback funtion which gets called from softirq context + * @function: hrtimer callback function which gets called from softirq context * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) */ diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index b2fa506667c0..a470154e0408 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -34,7 +34,7 @@ * inaccuracies caused by missed or lost timer * interrupts and the inability for the timer * interrupt hardware to accuratly tick at the - * requested HZ value. It is also not reccomended + * requested HZ value. 
It is also not recommended * for "tick-less" systems. */ #define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 2f3b585b8d7d..a5d0a3a85dd8 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -236,7 +236,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, unsigned int timer_flag) { /* - * It doesnt matter which lock we take: + * It doesn't matter which lock we take: */ raw_spinlock_t *lock; struct entry *entry, input; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c075f4ea6b94..ee24fa1935ac 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1268,7 +1268,7 @@ static int ftrace_update_code(struct module *mod) p->flags = 0L; /* - * Do the initial record convertion from mcount jump + * Do the initial record conversion from mcount jump * to the NOP instructions. */ if (!ftrace_code_disable(mod, p)) { @@ -3425,7 +3425,7 @@ graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); t->ftrace_timestamp = 0; - /* make curr_ret_stack visable before we add the ret_stack */ + /* make curr_ret_stack visible before we add the ret_stack */ smp_wmb(); t->ret_stack = ret_stack; } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d9c8bcafb120..0ef7b4b2a1f7 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1478,7 +1478,7 @@ static inline unsigned long rb_page_entries(struct buffer_page *bpage) return local_read(&bpage->entries) & RB_WRITE_MASK; } -/* Size is determined by what has been commited */ +/* Size is determined by what has been committed */ static inline unsigned rb_page_size(struct buffer_page *bpage) { return rb_page_commit(bpage); @@ -2932,7 +2932,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) /* * cpu_buffer->pages just needs to point to the buffer, it * has no specific buffer page to point to. Lets move it out - * of our way so we don't accidently swap it. + * of our way so we don't accidentally swap it. */ cpu_buffer->pages = reader->list.prev; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9541c27c1cf2..d38c16a06a6f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3239,7 +3239,7 @@ waitagain: trace_seq_init(&iter->seq); /* - * If there was nothing to send to user, inspite of consuming trace + * If there was nothing to send to user, in spite of consuming trace * entries, go back to wait for more entries. */ if (sret == -EBUSY) diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 685a67d55db0..6302747a1398 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -46,7 +46,7 @@ u64 notrace trace_clock_local(void) } /* - * trace_clock(): 'inbetween' trace clock. Not completely serialized, + * trace_clock(): 'between' trace clock. Not completely serialized, * but not completely incorrect when crossing CPUs either. * * This is based on cpu_clock(), which will allow at most ~1 jiffy of diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 1516cb3ec549..e32744c84d94 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -27,7 +27,7 @@ * in the structure. * * * for structures within structures, the format of the internal - * structure is layed out. This allows the internal structure + * structure is laid out. 
This allows the internal structure * to be deciphered for the format file. Although these macros * may become out of sync with the internal structure, they * will create a compile error if it happens. Since the diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 76b05980225c..962cdb24ed81 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -905,7 +905,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, * * returns 1 if * - we are inside irq code - * - we just extered irq code + * - we just entered irq code * * retunns 0 if * - funcgraph-interrupts option is set diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 92b6e1e12d98..a4969b47afc1 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -80,7 +80,7 @@ static struct tracer_flags tracer_flags = { * skip the latency if the sequence has changed - some other section * did a maximum and could disturb our measurement with serial console * printouts, etc. Truly coinciding maximum latencies should be rare - * and what happens together happens separately as well, so this doesnt + * and what happens together happens separately as well, so this doesn't * decrease the validity of the maximum found: */ static __cacheline_aligned_in_smp unsigned long max_sequence; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 8435b43b1782..35d55a386145 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1839,7 +1839,7 @@ static void unregister_probe_event(struct trace_probe *tp) kfree(tp->call.print_fmt); } -/* Make a debugfs interface for controling probe points */ +/* Make a debugfs interface for controlling probe points */ static __init int init_kprobe_trace(void) { struct dentry *d_tracer; diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c index eb27fd3430a2..92cb706c7fc8 100644 --- a/kernel/user-return-notifier.c +++ b/kernel/user-return-notifier.c @@ -20,7 +20,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register); /* * Removes a registered user return notifier. Must be called from atomic - * context, and from the same cpu registration occured in. + * context, and from the same cpu registration occurred in. */ void user_return_notifier_unregister(struct user_return_notifier *urn) { diff --git a/kernel/wait.c b/kernel/wait.c index b0310eb6cc1e..f45ea8d2a1ce 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -142,7 +142,7 @@ EXPORT_SYMBOL(finish_wait); * woken up through the queue. * * This prevents waiter starvation where an exclusive waiter - * aborts and is woken up concurrently and noone wakes up + * aborts and is woken up concurrently and no one wakes up * the next waiter. */ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 04ef830690ec..8859a41806dd 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1291,7 +1291,7 @@ __acquires(&gcwq->lock) return true; spin_unlock_irq(&gcwq->lock); - /* CPU has come up inbetween, retry migration */ + /* CPU has come up in between, retry migration */ cpu_relax(); } } -- cgit v1.2.3 From c0bb9e45f3a7f67fc358946727bc3d5f23d0f55d Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 25 Aug 2010 10:22:58 +1000 Subject: kdump: Allow shrinking of kdump region to be overridden On ppc64 the crashkernel region almost always overlaps an area of firmware. 
This works fine except when using the sysfs interface to reduce the kdump region. If we free the firmware area we are guaranteed to crash. Rename free_reserved_phys_range to crash_free_reserved_phys_range and make it a weak function so we can override it. Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- kernel/kexec.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index ec19b92c7ebd..4e240a378df6 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1099,7 +1099,8 @@ size_t crash_get_memory_size(void) return size; } -static void free_reserved_phys_range(unsigned long begin, unsigned long end) +void __weak crash_free_reserved_phys_range(unsigned long begin, + unsigned long end) { unsigned long addr; @@ -1135,7 +1136,7 @@ int crash_shrink_memory(unsigned long new_size) start = roundup(start, PAGE_SIZE); end = roundup(start + new_size, PAGE_SIZE); - free_reserved_phys_range(end, crashk_res.end); + crash_free_reserved_phys_range(end, crashk_res.end); if ((start == end) && (crashk_res.parent != NULL)) release_resource(&crashk_res); -- cgit v1.2.3 From 4f5058c3b71ed5930bb2b478c4d5dbc799dd9ad1 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Sat, 2 Apr 2011 19:39:35 +0800 Subject: genirq: Fix cpumask leak in __setup_irq() The allocated cpumask should be freed in __setup_irq(). Signed-off-by: Xiaotian Feng LKML-Reference: <1301744375-6812-1-git-send-email-dfeng@redhat.com> Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 12a80fdae11c..07c1611f3899 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1051,6 +1051,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) register_irq_proc(irq, desc); new->dir = NULL; register_handler_proc(irq, new); + free_cpumask_var(mask); return 0; -- cgit v1.2.3 From 4352d9d44b935e4d000be6ec89ddb55c2bf35f24 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Mon, 4 Apr 2011 08:31:23 -0700 Subject: ntp: fix non privileged system time shifting The ADJ_SETOFFSET bit added in commit 094aa188 ("ntp: Add ADJ_SETOFFSET mode bit") also introduced a way for any user to change the system time. Sneaky or buggy calls to adjtimex() could set ADJ_OFFSET_SS_READ | ADJ_SETOFFSET which would result in a successful call to timekeeping_inject_offset(). This patch fixes the issue by adding the capability check. Signed-off-by: Richard Cochran Signed-off-by: Linus Torvalds --- kernel/time/ntp.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5f1bb8e2008f..f6117a4c7cb8 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -652,6 +652,8 @@ int do_adjtimex(struct timex *txc) struct timespec delta; delta.tv_sec = txc->time.tv_sec; delta.tv_nsec = txc->time.tv_usec; + if (!capable(CAP_SYS_TIME)) + return -EPERM; if (!(txc->modes & ADJ_NANO)) delta.tv_nsec *= 1000; result = timekeeping_inject_offset(&delta); -- cgit v1.2.3 From 5aba085ededa6c5a1ff465e2aebc3e8eb00a7567 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 4 Apr 2011 14:59:31 -0700 Subject: kernel/signal.c: fix typos and coding style General coding style and comment fixes; no code changes: - Use multi-line-comment coding style. - Put some function signatures completely on one line. - Hyphenate some words. - Spell Posix as POSIX. - Correct typos & spellos in some comments. 
- Drop trailing whitespace. - End sentences with periods. Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- kernel/signal.c | 90 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 48 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 1186cf7fac77..3ab90e8b6ecf 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -226,7 +226,7 @@ static inline void print_dropped_signal(int sig) /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an - * appopriate lock must be held to stop the target task from exiting + * appropriate lock must be held to stop the target task from exiting */ static struct sigqueue * __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) @@ -375,15 +375,15 @@ int unhandled_signal(struct task_struct *tsk, int sig) return !tracehook_consider_fatal_signal(tsk, sig); } - -/* Notify the system that a driver wants to block all signals for this +/* + * Notify the system that a driver wants to block all signals for this * process, and wants to be notified if any signals at all were to be * sent/acted upon. If the notifier routine returns non-zero, then the * signal will be acted upon after all. If the notifier routine returns 0, * then then signal will be blocked. Only one block per process is * allowed. priv is a pointer to private data that the notifier routine - * can use to determine if the signal should be blocked or not. */ - + * can use to determine if the signal should be blocked or not. + */ void block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask) { @@ -434,9 +434,10 @@ still_pending: copy_siginfo(info, &first->info); __sigqueue_free(first); } else { - /* Ok, it wasn't in the queue. This must be - a fast-pathed signal or we must have been - out of queue space. So zero out the info. + /* + * Ok, it wasn't in the queue. This must be + * a fast-pathed signal or we must have been + * out of queue space. So zero out the info. */ info->si_signo = sig; info->si_errno = 0; @@ -468,7 +469,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, } /* - * Dequeue a signal and return the element to the caller, which is + * Dequeue a signal and return the element to the caller, which is * expected to free it. * * All callers have to hold the siglock. @@ -490,7 +491,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) * itimers are process shared and we restart periodic * itimers in the signal delivery path to prevent DoS * attacks in the high resolution timer case. This is - * compliant with the old way of self restarting + * compliant with the old way of self-restarting * itimers, as the SIGALRM is a legacy signal and only * queued once. Changing the restart behaviour to * restart the timer in the signal dequeue path is @@ -923,14 +924,15 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, if (info == SEND_SIG_FORCED) goto out_set; - /* Real-time signals must be queued if sent by sigqueue, or - some other real-time mechanism. It is implementation - defined whether kill() does so. We attempt to do so, on - the principle of least surprise, but since kill is not - allowed to fail with EAGAIN when low on memory we just - make sure at least one signal gets delivered and don't - pass on the info struct. */ - + /* + * Real-time signals must be queued if sent by sigqueue, or + * some other real-time mechanism. 
It is implementation + * defined whether kill() does so. We attempt to do so, on + * the principle of least surprise, but since kill is not + * allowed to fail with EAGAIN when low on memory we just + * make sure at least one signal gets delivered and don't + * pass on the info struct. + */ if (sig < SIGRTMIN) override_rlimit = (is_si_special(info) || info->si_code >= 0); else @@ -1201,8 +1203,7 @@ retry: return error; } -int -kill_proc_info(int sig, struct siginfo *info, pid_t pid) +int kill_proc_info(int sig, struct siginfo *info, pid_t pid) { int error; rcu_read_lock(); @@ -1299,8 +1300,7 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) * These are for backward compatibility with the rest of the kernel source. */ -int -send_sig_info(int sig, struct siginfo *info, struct task_struct *p) +int send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { /* * Make sure legacy kernel users don't send in bad values @@ -1368,7 +1368,7 @@ EXPORT_SYMBOL(kill_pid); * These functions support sending signals using preallocated sigqueue * structures. This is needed "because realtime applications cannot * afford to lose notifications of asynchronous events, like timer - * expirations or I/O completions". In the case of Posix Timers + * expirations or I/O completions". In the case of POSIX Timers * we allocate the sigqueue structure from the timer_create. If this * allocation fails we are able to report the failure to the application * with an EAGAIN error. @@ -1553,7 +1553,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why) info.si_signo = SIGCHLD; info.si_errno = 0; /* - * see comment in do_notify_parent() abot the following 3 lines + * see comment in do_notify_parent() about the following 4 lines */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); @@ -1611,7 +1611,7 @@ static inline int may_ptrace_stop(void) } /* - * Return nonzero if there is a SIGKILL that should be waking us up. + * Return non-zero if there is a SIGKILL that should be waking us up. * Called with the siglock held. */ static int sigkill_pending(struct task_struct *tsk) @@ -1735,7 +1735,7 @@ void ptrace_notify(int exit_code) /* * This performs the stopping for SIGSTOP and other stop signals. * We have to stop all threads in the thread group. - * Returns nonzero if we've actually stopped and released the siglock. + * Returns non-zero if we've actually stopped and released the siglock. * Returns zero if we didn't stop and still hold the siglock. */ static int do_signal_stop(int signr) @@ -1823,10 +1823,12 @@ static int ptrace_signal(int signr, siginfo_t *info, current->exit_code = 0; - /* Update the siginfo structure if the signal has - changed. If the debugger wanted something - specific in the siginfo structure then it should - have updated *info via PTRACE_SETSIGINFO. */ + /* + * Update the siginfo structure if the signal has + * changed. If the debugger wanted something + * specific in the siginfo structure then it should + * have updated *info via PTRACE_SETSIGINFO. + */ if (signr != info->si_signo) { info->si_signo = signr; info->si_errno = 0; @@ -2034,7 +2036,8 @@ void exit_signals(struct task_struct *tsk) if (!signal_pending(tsk)) goto out; - /* It could be that __group_complete_signal() choose us to + /* + * It could be that __group_complete_signal() choose us to * notify about group-wide signal. Another thread should be * woken now to take the signal since we will not. 
*/ @@ -2183,7 +2186,7 @@ long do_sigpending(void __user *set, unsigned long sigsetsize) out: return error; -} +} SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) { @@ -2233,9 +2236,9 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) err |= __put_user(from->si_trapno, &to->si_trapno); #endif #ifdef BUS_MCEERR_AO - /* + /* * Other callers might not initialize the si_lsb field, - * so check explicitely for the right codes here. + * so check explicitly for the right codes here. */ if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); @@ -2280,7 +2283,7 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, if (copy_from_user(&these, uthese, sizeof(these))) return -EFAULT; - + /* * Invert the set of allowed signals to get those we * want to block. @@ -2305,9 +2308,11 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, + (ts.tv_sec || ts.tv_nsec)); if (timeout) { - /* None ready -- temporarily unblock those we're + /* + * None ready -- temporarily unblock those we're * interested while we are sleeping in so that we'll - * be awakened when they arrive. */ + * be awakened when they arrive. + */ current->real_blocked = current->blocked; sigandsets(¤t->blocked, ¤t->blocked, &these); recalc_sigpending(); @@ -2553,12 +2558,11 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s error = -EINVAL; /* - * - * Note - this code used to test ss_flags incorrectly + * Note - this code used to test ss_flags incorrectly: * old code may have been written using ss_flags==0 * to mean ss_flags==SS_ONSTACK (as this was the only * way that worked) - this fix preserves that older - * mechanism + * mechanism. */ if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0) goto out; @@ -2600,8 +2604,10 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) #endif #ifdef __ARCH_WANT_SYS_SIGPROCMASK -/* Some platforms have their own version with special arguments others - support only sys_rt_sigprocmask. */ +/* + * Some platforms have their own version with special arguments; + * others support only sys_rt_sigprocmask. + */ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set, old_sigset_t __user *, oset) -- cgit v1.2.3 From 41c57892a2895865afc89ff1a21f91a0f1506f66 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 4 Apr 2011 15:00:26 -0700 Subject: kernel/signal.c: add kernel-doc notation to syscalls Add kernel-doc to syscalls in signal.c. Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- kernel/signal.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 3ab90e8b6ecf..dc17929ab78a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2075,6 +2075,9 @@ EXPORT_SYMBOL(unblock_all_signals); * System call entry points. 
*/ +/** + * sys_restart_syscall - restart a system call + */ SYSCALL_DEFINE0(restart_syscall) { struct restart_block *restart = ¤t_thread_info()->restart_block; @@ -2128,6 +2131,13 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) return error; } +/** + * sys_rt_sigprocmask - change the list of currently blocked signals + * @how: whether to add, remove, or set signals + * @set: stores pending signals + * @oset: previous value of signal mask if non-null + * @sigsetsize: size of sigset_t type + */ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set, sigset_t __user *, oset, size_t, sigsetsize) { @@ -2188,6 +2198,12 @@ out: return error; } +/** + * sys_rt_sigpending - examine a pending signal that has been raised + * while blocked + * @set: stores pending signals + * @sigsetsize: size of sigset_t type or larger + */ SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) { return do_sigpending(set, sigsetsize); @@ -2267,6 +2283,14 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) #endif +/** + * sys_rt_sigtimedwait - synchronously wait for queued signals specified + * in @uthese + * @uthese: queued signals to wait for + * @uinfo: if non-null, the signal's siginfo is returned here + * @uts: upper bound on process time suspension + * @sigsetsize: size of sigset_t type + */ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, siginfo_t __user *, uinfo, const struct timespec __user *, uts, size_t, sigsetsize) @@ -2344,6 +2368,11 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, return ret; } +/** + * sys_kill - send a signal to a process + * @pid: the PID of the process + * @sig: signal to be sent + */ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) { struct siginfo info; @@ -2419,7 +2448,11 @@ SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig) return do_tkill(tgid, pid, sig); } -/* +/** + * sys_tkill - send signal to one specific task + * @pid: the PID of the task + * @sig: signal to be sent + * * Send a signal to only one task, even if it's a CLONE_THREAD task. */ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) @@ -2431,6 +2464,12 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) return do_tkill(0, pid, sig); } +/** + * sys_rt_sigqueueinfo - send signal information to a signal + * @pid: the PID of the thread + * @sig: signal to be sent + * @uinfo: signal info to be sent + */ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, siginfo_t __user *, uinfo) { @@ -2596,6 +2635,10 @@ out: #ifdef __ARCH_WANT_SYS_SIGPENDING +/** + * sys_sigpending - examine pending signals + * @set: where mask of pending signal is returned + */ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) { return do_sigpending(set, sizeof(*set)); @@ -2604,7 +2647,12 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) #endif #ifdef __ARCH_WANT_SYS_SIGPROCMASK -/* +/** + * sys_sigprocmask - examine and change blocked signals + * @how: whether to add, remove, or set signals + * @set: signals to add or remove (if non-null) + * @oset: previous value of signal mask if non-null + * * Some platforms have their own version with special arguments; * others support only sys_rt_sigprocmask. 
*/ @@ -2660,6 +2708,13 @@ out: #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ #ifdef __ARCH_WANT_SYS_RT_SIGACTION +/** + * sys_rt_sigaction - alter an action taken by a process + * @sig: signal to be sent + * @act: the thread group ID of the thread + * @oact: the PID of the thread + * @sigsetsize: size of sigset_t type + */ SYSCALL_DEFINE4(rt_sigaction, int, sig, const struct sigaction __user *, act, struct sigaction __user *, oact, @@ -2746,6 +2801,12 @@ SYSCALL_DEFINE0(pause) #endif #ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND +/** + * sys_rt_sigsuspend - replace the signal mask for a value with the + * @unewset value until a signal is received + * @unewset: new signal mask value + * @sigsetsize: size of sigset_t type + */ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) { sigset_t newset; -- cgit v1.2.3 From 49c022e657fbe661460d191fbe776a387132e2b3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 10:14:25 +0200 Subject: sched: Clean up rebalance_domains() load-balance interval calculation Instead of the possible multiple-evaluation of num_online_cpus() in rebalance_domains() that Linus reported, avoid it altogether in the normal case since it's implemented with a Hamming weight function over a cpu bitmask which can be darn expensive for those with big iron. This also makes it cleaner, smaller and documents the code. Reported-by: Linus Torvalds Signed-off-by: Peter Zijlstra LKML-Reference: <1301991265.2225.12.camel@twins> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 +++ kernel/sched_fair.c | 16 ++++++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index a8845516ace6..17b4d226ee0d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6331,6 +6331,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #endif } + + update_max_interval(); + return NOTIFY_OK; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c7ec5c8e7b44..80ecd09452e0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3820,6 +3820,17 @@ void select_nohz_load_balancer(int stop_tick) static DEFINE_SPINLOCK(balancing); +static unsigned long __read_mostly max_load_balance_interval = HZ/10; + +/* + * Scale the max load_balance interval with the number of CPUs in the system. + * This trades load-balance latency on larger machines for less cross talk. + */ +static void update_max_interval(void) +{ + max_load_balance_interval = HZ*num_online_cpus()/10; +} + /* * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. @@ -3849,10 +3860,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) /* scale ms to jiffies */ interval = msecs_to_jiffies(interval); - if (unlikely(!interval)) - interval = 1; - if (interval > HZ*num_online_cpus()/10) - interval = HZ*num_online_cpus()/10; + interval = clamp(interval, 1UL, max_load_balance_interval); need_serialize = sd->flags & SD_SERIALIZE; -- cgit v1.2.3 From f9fa0bc1fabe1d861e46d80ecbe7e85da359195c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 8 Apr 2011 10:53:46 -0700 Subject: signal.c: fix erroneous syscall kernel-doc Fix erroneous syscall kernel-doc comments in kernel/signal.c. 
Reported-by: Matt Fleming Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- kernel/signal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 29e233fd7a0f..7165af5f1b11 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2711,8 +2711,8 @@ out: /** * sys_rt_sigaction - alter an action taken by a process * @sig: signal to be sent - * @act: the thread group ID of the thread - * @oact: the PID of the thread + * @act: new sigaction + * @oact: used to save the previous sigaction * @sigsetsize: size of sigset_t type */ SYSCALL_DEFINE4(rt_sigaction, int, sig, -- cgit v1.2.3 From e566b76ed30768140df8f0023904aed5a41244f7 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 6 Apr 2011 02:54:54 +0200 Subject: perf_event: Fix cgrp event scheduling bug in perf_enable_on_exec() There is a bug in perf_event_enable_on_exec() when cgroup events are active on a CPU: the cgroup events may be scheduled twice causing event state corruptions which eventually may lead to kernel panics. The reason is that the function needs to first schedule out the cgroup events, just like for the per-thread events. The cgroup event are scheduled back in automatically from the perf_event_context_sched_in() function. The patch also adds a WARN_ON_ONCE() is perf_cgroup_switch() to catch any bogus state. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110406005454.GA1062@quad Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 27960f114efd..8e81a9860a0d 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -364,6 +364,7 @@ void perf_cgroup_switch(struct task_struct *task, int mode) } if (mode & PERF_CGROUP_SWIN) { + WARN_ON_ONCE(cpuctx->cgrp); /* set cgrp before ctxsw in to * allow event_filter_match() to not * have to pass task around @@ -2423,6 +2424,14 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) if (!ctx || !ctx->nr_events) goto out; + /* + * We must ctxsw out cgroup events to avoid conflict + * when invoking perf_task_event_sched_in() later on + * in this function. Otherwise we end up trying to + * ctxswin cgroup events which are already scheduled + * in. + */ + perf_cgroup_sched_out(current); task_ctx_sched_out(ctx, EVENT_ALL); raw_spin_lock(&ctx->lock); @@ -2447,6 +2456,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) raw_spin_unlock(&ctx->lock); + /* + * Also calls ctxswin for cgroup events, if any: + */ perf_event_context_sched_in(ctx, ctx->task); out: local_irq_restore(flags); -- cgit v1.2.3 From b0432d8f162c7d5d9537b4cb749d44076b76a783 Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Thu, 7 Apr 2011 17:23:22 -0700 Subject: sched: Fix sched-domain avg_load calculation In function find_busiest_group(), the sched-domain avg_load isn't calculated at all if there is a group imbalance within the domain. This will cause erroneous imbalance calculation. The reason is that calculate_imbalance() sees sds->avg_load = 0 and it will dump entire sds->max_load into imbalance variable, which is used later on to migrate entire load from busiest CPU to the puller CPU. This has two really bad effect: 1. stampede of task migration, and they won't be able to break out of the bad state because of positive feedback loop: large load delta -> heavier load migration -> larger imbalance and the cycle goes on. 2. 
severe imbalance in CPU queue depth. This causes really long scheduling latency blip which affects badly on application that has tight latency requirement. The fix is to have kernel calculate domain avg_load in both cases. This will ensure that imbalance calculation is always sensible and the target is usually half way between busiest and puller CPU. Signed-off-by: Ken Chen Signed-off-by: Peter Zijlstra Cc: Link: http://lkml.kernel.org/r/20110408002322.3A0D812217F@elm.corp.google.com Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 7f00772e57c9..60f9d407c5ec 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3127,6 +3127,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (!sds.busiest || sds.busiest_nr_running == 0) goto out_balanced; + sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; + /* * If the busiest group is imbalanced the below checks don't * work because they assumes all things are equal, which typically @@ -3151,7 +3153,6 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * Don't pull any tasks if this group is already above the domain * average load. */ - sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; if (sds.this_load >= sds.avg_load) goto out_balanced; -- cgit v1.2.3 From b30aef17f71cf9e24b10c11cbb5e5f0ebe8a85ab Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Fri, 8 Apr 2011 12:20:16 -0700 Subject: sched: Fix erroneous all_pinned logic The scheduler load balancer has specific code to deal with cases of unbalanced system due to lots of unmovable tasks (for example because of hard CPU affinity). In those situation, it excludes the busiest CPU that has pinned tasks for load balance consideration such that it can perform second 2nd load balance pass on the rest of the system. This all works as designed if there is only one cgroup in the system. However, when we have multiple cgroups, this logic has false positives and triggers multiple load balance passes despite there are actually no pinned tasks at all. The reason it has false positives is that the all pinned logic is deep in the lowest function of can_migrate_task() and is too low level: load_balance_fair() iterates each task group and calls balance_tasks() to migrate target load. Along the way, balance_tasks() will also set a all_pinned variable. Given that task-groups are iterated, this all_pinned variable is essentially the status of last group in the scanning process. Task group can have number of reasons that no load being migrated, none due to cpu affinity. However, this status bit is being propagated back up to the higher level load_balance(), which incorrectly think that no tasks were moved. It kick off the all pinned logic and start multiple passes attempt to move load onto puller CPU. To fix this, move the all_pinned aggregation up at the iterator level. This ensures that the status is aggregated over all task-groups, not just last one in the list. 
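The aggregation pitfall described above is easy to see in a minimal, self-contained sketch; the helpers below (nr_groups, can_migrate_from) are made up for illustration and this is not the scheduler's actual code:

    /* Illustrative sketch only -- hypothetical helpers, not kernel/sched_fair.c. */
    extern int nr_groups;
    extern int can_migrate_from(int group);   /* hypothetical per-group check */

    /* Buggy shape: the flag is re-initialised per group, so only the last
     * group's status survives and may falsely report "everything is pinned". */
    int pull_load_buggy(void)
    {
        int all_pinned = 0;
        for (int g = 0; g < nr_groups; g++) {
            all_pinned = 1;                    /* reset inside the loop */
            if (can_migrate_from(g))
                all_pinned = 0;
        }
        return all_pinned;                     /* reflects the last group only */
    }

    /* Fixed shape: initialise once at the iterator level and let any group
     * that can migrate clear it, so the result is aggregated over all groups. */
    int pull_load_fixed(void)
    {
        int all_pinned = 1;
        for (int g = 0; g < nr_groups; g++)
            if (can_migrate_from(g))
                all_pinned = 0;
        return all_pinned;
    }

The patch below applies the same idea: set all_pinned once before the task-group iteration and let the per-task migration check clear it, instead of recomputing it inside balance_tasks().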
Signed-off-by: Ken Chen Cc: stable@kernel.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/BANLkTi=ernzNawaR5tJZEsV_QVnfxqXmsQ@mail.gmail.com Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 60f9d407c5ec..6fa833ab2cb8 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2104,21 +2104,20 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, struct cfs_rq *busiest_cfs_rq) { - int loops = 0, pulled = 0, pinned = 0; + int loops = 0, pulled = 0; long rem_load_move = max_load_move; struct task_struct *p, *n; if (max_load_move == 0) goto out; - pinned = 1; - list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { if (loops++ > sysctl_sched_nr_migrate) break; if ((p->se.load.weight >> 1) > rem_load_move || - !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) + !can_migrate_task(p, busiest, this_cpu, sd, idle, + all_pinned)) continue; pull_task(busiest, p, this_rq, this_cpu); @@ -2153,9 +2152,6 @@ out: */ schedstat_add(sd, lb_gained[idle], pulled); - if (all_pinned) - *all_pinned = pinned; - return max_load_move - rem_load_move; } @@ -3341,6 +3337,7 @@ redo: * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ + all_pinned = 1; local_irq_save(flags); double_rq_lock(this_rq, busiest); ld_moved = move_tasks(this_rq, this_cpu, busiest, -- cgit v1.2.3 From 1f112cee07b314e244ee9e71d9c1e6950dc13327 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 11 Apr 2011 22:54:42 +0200 Subject: PM / Hibernate: Introduce CONFIG_HIBERNATE_CALLBACKS Xen save/restore is going to use hibernate device callbacks for quiescing devices and putting them back to normal operations and it would need to select CONFIG_HIBERNATION for this purpose. However, that also would cause the hibernate interfaces for user space to be enabled, which might confuse user space, because the Xen kernels don't support hibernation. Moreover, it would be wasteful, as it would make the Xen kernels include a substantial amount of code that they would never use. To address this issue introduce new power management Kconfig option CONFIG_HIBERNATE_CALLBACKS, such that it will only select the code that is necessary for the hibernate device callbacks to work and make CONFIG_HIBERNATION select it. Then, Xen save/restore will be able to select CONFIG_HIBERNATE_CALLBACKS without dragging the entire hibernate code along with it. Signed-off-by: Rafael J. Wysocki Tested-by: Shriram Rajagopalan --- kernel/power/Kconfig | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 4603f08dc47b..049791468d37 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -18,9 +18,13 @@ config SUSPEND_FREEZER Turning OFF this setting is NOT recommended! If in doubt, say Y. 
+config HIBERNATE_CALLBACKS + bool + config HIBERNATION bool "Hibernation (aka 'suspend to disk')" depends on SWAP && ARCH_HIBERNATION_POSSIBLE + select HIBERNATE_CALLBACKS select LZO_COMPRESS select LZO_DECOMPRESS ---help--- @@ -85,7 +89,7 @@ config PM_STD_PARTITION config PM_SLEEP def_bool y - depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE + depends on SUSPEND || HIBERNATE_CALLBACKS || XEN_SAVE_RESTORE config PM_SLEEP_SMP def_bool y -- cgit v1.2.3 From d419e4c0f7584ffc5c72d9aeeaac485cc756ebcf Mon Sep 17 00:00:00 2001 From: Shriram Rajagopalan Date: Mon, 11 Apr 2011 22:54:48 +0200 Subject: fix XEN_SAVE_RESTORE Kconfig dependencies Make XEN_SAVE_RESTORE select HIBERNATE_CALLBACKS. Remove XEN_SAVE_RESTORE dependency from PM_SLEEP. Signed-off-by: Shriram Rajagopalan Acked-by: Ian Campbell Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 049791468d37..6de9a8fc3417 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -89,7 +89,7 @@ config PM_STD_PARTITION config PM_SLEEP def_bool y - depends on SUSPEND || HIBERNATE_CALLBACKS || XEN_SAVE_RESTORE + depends on SUSPEND || HIBERNATE_CALLBACKS config PM_SLEEP_SMP def_bool y -- cgit v1.2.3 From d9c97833179036408e53ef5f3f5c7eaf781769bc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Apr 2011 10:06:33 +0200 Subject: block: remove block_unplug_timer() trace point We no longer have an unplug timer running, so no point in keeping the trace point. Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7aa40f8e182d..824708cbfb7b 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -863,19 +863,6 @@ static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q) } } -static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q) -{ - struct blk_trace *bt = q->blk_trace; - - if (bt) { - unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; - __be64 rpdu = cpu_to_be64(pdu); - - __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0, - sizeof(rpdu), &rpdu); - } -} - static void blk_add_trace_split(void *ignore, struct request_queue *q, struct bio *bio, unsigned int pdu) @@ -1015,8 +1002,6 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_plug(blk_add_trace_plug, NULL); WARN_ON(ret); - ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); - WARN_ON(ret); ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); WARN_ON(ret); ret = register_trace_block_split(blk_add_trace_split, NULL); @@ -1033,7 +1018,6 @@ static void blk_unregister_tracepoints(void) unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); unregister_trace_block_split(blk_add_trace_split, NULL); unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); - unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); unregister_trace_block_plug(blk_add_trace_plug, NULL); unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); unregister_trace_block_getrq(blk_add_trace_getrq, NULL); @@ -1348,7 +1332,6 @@ static const struct { [__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error }, [__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug }, [__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug }, - [__BLK_TA_UNPLUG_TIMER] = {{ "UT", 
"unplug_timer" }, blk_log_unplug }, [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic }, [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split }, [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic }, -- cgit v1.2.3 From 94b5eb28b41cc79d9713696e0005ae167b5afd1b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Apr 2011 10:12:19 +0200 Subject: block: fixup block IO unplug trace call It was removed with the on-stack plugging, readd it and track the depth of requests added when flushing the plug. Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 824708cbfb7b..3e3970d53d14 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -850,13 +850,13 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); } -static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q) +static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q, + unsigned int depth) { struct blk_trace *bt = q->blk_trace; if (bt) { - unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; - __be64 rpdu = cpu_to_be64(pdu); + __be64 rpdu = cpu_to_be64(depth); __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0, sizeof(rpdu), &rpdu); -- cgit v1.2.3 From 6631e635c65dc33cb798cc2f51d0ddd69ada6319 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 13 Apr 2011 08:08:20 -0700 Subject: block: don't flush plugged IO on forced preemtion scheduling We really only want to unplug the pending IO when the process actually goes to sleep. So move the test for flushing the plug up to the place where we actually deactivate the task - where we have properly checked for preemption and for the process really sleeping. Acked-by: Jens Axboe Acked-by: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/sched.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 48013633d792..a187c3fe027b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4111,20 +4111,20 @@ need_resched: try_to_wake_up_local(to_wakeup); } deactivate_task(rq, prev, DEQUEUE_SLEEP); + + /* + * If we are going to sleep and we have plugged IO queued, make + * sure to submit it to avoid deadlocks. + */ + if (blk_needs_flush_plug(prev)) { + raw_spin_unlock(&rq->lock); + blk_flush_plug(prev); + raw_spin_lock(&rq->lock); + } } switch_count = &prev->nvcsw; } - /* - * If we are going to sleep and we have plugged IO queued, make - * sure to submit it to avoid deadlocks. - */ - if (prev->state != TASK_RUNNING && blk_needs_flush_plug(prev)) { - raw_spin_unlock(&rq->lock); - blk_flush_plug(prev); - raw_spin_lock(&rq->lock); - } - pre_schedule(rq, prev); if (unlikely(!rq->nr_running)) -- cgit v1.2.3 From 0cd9c6494ee5c19aef085152bc37f3a4e774a9e1 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Thu, 14 Apr 2011 15:41:57 -0700 Subject: futex: Set FLAGS_HAS_TIMEOUT during futex_wait restart setup The FLAGS_HAS_TIMEOUT flag was not getting set, causing the restart_block to restart futex_wait() without a timeout after a signal. Commit b41277dc7a18ee332d in 2.6.38 introduced the regression by accidentally removing the the FLAGS_HAS_TIMEOUT assignment from futex_wait() during the setup of the restart block. Restore the originaly behavior. 
Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=32922 Reported-by: Tim Smith Reported-by: Torsten Hilbrich Signed-off-by: Darren Hart Signed-off-by: Eric Dumazet Cc: Peter Zijlstra Cc: John Kacur Cc: stable@kernel.org Link: http://lkml.kernel.org/r/%3Cdaac0eb3af607f72b9a4d3126b2ba8fb5ed3b883.1302820917.git.dvhart%40linux.intel.com%3E Signed-off-by: Thomas Gleixner --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index dfb924ffe65b..fe28dc282eae 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1886,7 +1886,7 @@ retry: restart->futex.val = val; restart->futex.time = abs_time->tv64; restart->futex.bitset = bitset; - restart->futex.flags = flags; + restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; ret = -ERESTART_RESTARTBLOCK; -- cgit v1.2.3 From a237c1c5bc5dc5c76a21be922dca4826f3eca8ca Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 16 Apr 2011 13:27:55 +0200 Subject: block: let io_schedule() flush the plug inline Linus correctly observes that the most important dispatch cases are now done from kblockd, this isn't ideal for latency reasons. The original reason for switching dispatches out-of-line was to avoid too deep a stack, so by _only_ letting the "accidental" flush directly in schedule() be guarded by offload to kblockd, we should be able to get the best of both worlds. So add a blk_schedule_flush_plug() that offloads to kblockd, and only use that from the schedule() path. Signed-off-by: Jens Axboe --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index a187c3fe027b..312f8b95c2d4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4118,7 +4118,7 @@ need_resched: */ if (blk_needs_flush_plug(prev)) { raw_spin_unlock(&rq->lock); - blk_flush_plug(prev); + blk_schedule_flush_plug(prev); raw_spin_lock(&rq->lock); } } -- cgit v1.2.3 From 49cac01e1fa74174d72adb0e872504a7fefd7c01 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 16 Apr 2011 13:51:05 +0200 Subject: block: make unplug timer trace event correspond to the schedule() unplug It's a pretty close match to what we had before - the timer triggering would mean that nobody unplugged the plug in due time, in the new scheme this matches very closely what the schedule() unplug now is. It's essentially the difference between an explicit unplug (IO unplug) or an implicit unplug (timer unplug, we scheduled with pending IO queued). 
Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 3e3970d53d14..6957aa298dfa 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -850,16 +850,21 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); } -static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q, - unsigned int depth) +static void blk_add_trace_unplug(void *ignore, struct request_queue *q, + unsigned int depth, bool explicit) { struct blk_trace *bt = q->blk_trace; if (bt) { __be64 rpdu = cpu_to_be64(depth); + u32 what; - __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0, - sizeof(rpdu), &rpdu); + if (explicit) + what = BLK_TA_UNPLUG_IO; + else + what = BLK_TA_UNPLUG_TIMER; + + __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); } } @@ -1002,7 +1007,7 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_plug(blk_add_trace_plug, NULL); WARN_ON(ret); - ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); + ret = register_trace_block_unplug(blk_add_trace_unplug, NULL); WARN_ON(ret); ret = register_trace_block_split(blk_add_trace_split, NULL); WARN_ON(ret); @@ -1017,7 +1022,7 @@ static void blk_unregister_tracepoints(void) unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); unregister_trace_block_split(blk_add_trace_split, NULL); - unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); + unregister_trace_block_unplug(blk_add_trace_unplug, NULL); unregister_trace_block_plug(blk_add_trace_plug, NULL); unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); unregister_trace_block_getrq(blk_add_trace_getrq, NULL); @@ -1332,6 +1337,7 @@ static const struct { [__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error }, [__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug }, [__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug }, + [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug }, [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic }, [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split }, [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic }, -- cgit v1.2.3 From 1791f881435fab951939ad700e947b66c062e083 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Wed, 30 Mar 2011 15:24:21 +0200 Subject: posix clocks: Replace mutex with reader/writer semaphore A dynamic posix clock is protected from asynchronous removal by a mutex. However, using a mutex has the unwanted effect that a long running clock operation in one process will unnecessarily block other processes. For example, one process might call read() to get an external time stamp coming in at one pulse per second. A second process calling clock_gettime would have to wait for almost a whole second. This patch fixes the issue by using a reader/writer semaphore instead of a mutex. 
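The effect of the change can be illustrated with a small, simplified sketch (not the actual kernel/time/posix-clock.c code): with an rwsem, clock readers only exclude the teardown path, not each other, so one process blocked in a long read() no longer stalls another process's clock_gettime():

    /* Simplified illustration of the locking scheme, not the real file. */
    #include <linux/errno.h>
    #include <linux/rwsem.h>
    #include <linux/types.h>

    struct clk_obj {
        struct rw_semaphore rwsem;
        bool zombie;                      /* set when the clock is being removed */
    };

    static int clk_read_op(struct clk_obj *clk)
    {
        down_read(&clk->rwsem);           /* many readers may hold this at once */
        if (clk->zombie) {
            up_read(&clk->rwsem);
            return -ENODEV;
        }
        /* ... possibly long-running operation, e.g. waiting for a timestamp ... */
        up_read(&clk->rwsem);
        return 0;
    }

    static void clk_mark_dead(struct clk_obj *clk)
    {
        down_write(&clk->rwsem);          /* waits for readers, then excludes them */
        clk->zombie = true;
        up_write(&clk->rwsem);
    }

With the previous mutex both paths, and every pair of readers, serialised against one another; the rwsem keeps the zombie check safe while letting read-side clock operations run concurrently.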
Signed-off-by: Richard Cochran Cc: John Stultz Link: http://lkml.kernel.org/r/%3C20110330132421.GA31771%40riccoc20.at.omicron.at%3E Signed-off-by: Thomas Gleixner --- kernel/time/posix-clock.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 25028dd4fa18..c340ca658f37 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -19,7 +19,6 @@ */ #include #include -#include #include #include #include @@ -34,19 +33,19 @@ static struct posix_clock *get_posix_clock(struct file *fp) { struct posix_clock *clk = fp->private_data; - mutex_lock(&clk->mutex); + down_read(&clk->rwsem); if (!clk->zombie) return clk; - mutex_unlock(&clk->mutex); + up_read(&clk->rwsem); return NULL; } static void put_posix_clock(struct posix_clock *clk) { - mutex_unlock(&clk->mutex); + up_read(&clk->rwsem); } static ssize_t posix_clock_read(struct file *fp, char __user *buf, @@ -156,7 +155,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp) struct posix_clock *clk = container_of(inode->i_cdev, struct posix_clock, cdev); - mutex_lock(&clk->mutex); + down_read(&clk->rwsem); if (clk->zombie) { err = -ENODEV; @@ -172,7 +171,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp) fp->private_data = clk; } out: - mutex_unlock(&clk->mutex); + up_read(&clk->rwsem); return err; } @@ -211,25 +210,20 @@ int posix_clock_register(struct posix_clock *clk, dev_t devid) int err; kref_init(&clk->kref); - mutex_init(&clk->mutex); + init_rwsem(&clk->rwsem); cdev_init(&clk->cdev, &posix_clock_file_operations); clk->cdev.owner = clk->ops.owner; err = cdev_add(&clk->cdev, devid, 1); - if (err) - goto no_cdev; return err; -no_cdev: - mutex_destroy(&clk->mutex); - return err; } EXPORT_SYMBOL_GPL(posix_clock_register); static void delete_clock(struct kref *kref) { struct posix_clock *clk = container_of(kref, struct posix_clock, kref); - mutex_destroy(&clk->mutex); + if (clk->release) clk->release(clk); } @@ -238,9 +232,9 @@ void posix_clock_unregister(struct posix_clock *clk) { cdev_del(&clk->cdev); - mutex_lock(&clk->mutex); + down_write(&clk->rwsem); clk->zombie = true; - mutex_unlock(&clk->mutex); + up_write(&clk->rwsem); kref_put(&clk->kref, delete_clock); } -- cgit v1.2.3 From c78193e9c7bcbf25b8237ad0dec82f805c4ea69b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 18 Apr 2011 10:35:30 -0700 Subject: next_pidmap: fix overflow condition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit next_pidmap() just quietly accepted whatever 'last' pid that was passed in, which is not all that safe when one of the users is /proc. Admittedly the proc code should do some sanity checking on the range (and that will be the next commit), but that doesn't mean that the helper functions should just do that pidmap pointer arithmetic without checking the range of its arguments. So clamp 'last' to PID_MAX_LIMIT. The fact that we then do "last+1" doesn't really matter, the for-loop does check against the end of the pidmap array properly (it's only the actual pointer arithmetic overflow case we need to worry about, and going one bit beyond isn't going to overflow). [ Use PID_MAX_LIMIT rather than pid_max as per Eric Biederman ] Reported-by: Tavis Ormandy Analyzed-by: Robert Święcki Cc: Eric W. 
Biederman Cc: Pavel Emelyanov Signed-off-by: Linus Torvalds --- kernel/pid.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 02f221274265..57a8346a270e 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -217,11 +217,14 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) return -1; } -int next_pidmap(struct pid_namespace *pid_ns, int last) +int next_pidmap(struct pid_namespace *pid_ns, unsigned int last) { int offset; struct pidmap *map, *end; + if (last >= PID_MAX_LIMIT) + return -1; + offset = (last + 1) & BITS_PER_PAGE_MASK; map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; end = &pid_ns->pidmap[PIDMAP_ENTRIES]; -- cgit v1.2.3 From 2ca6f62f595c01f689b269db6736de5544da7667 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 18 Apr 2011 23:58:59 +0200 Subject: PM: Fix error code paths executed after failing syscore_suspend() If syscore_suspend() fails in suspend_enter(), create_image() or resume_target_kernel(), it is necessary to call sysdev_resume(), because sysdev_suspend() has been called already and succeeded and we are going to abort the transition. Signed-off-by: Rafael J. Wysocki Acked-by: Greg Kroah-Hartman --- kernel/power/hibernate.c | 10 ++++++++-- kernel/power/suspend.c | 5 ++++- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index aeabd26e3342..50aae660174d 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -273,8 +273,11 @@ static int create_image(int platform_mode) local_irq_disable(); error = sysdev_suspend(PMSG_FREEZE); - if (!error) + if (!error) { error = syscore_suspend(); + if (error) + sysdev_resume(); + } if (error) { printk(KERN_ERR "PM: Some system devices failed to power down, " "aborting hibernation\n"); @@ -407,8 +410,11 @@ static int resume_target_kernel(bool platform_mode) local_irq_disable(); error = sysdev_suspend(PMSG_QUIESCE); - if (!error) + if (!error) { error = syscore_suspend(); + if (error) + sysdev_resume(); + } if (error) goto Enable_irqs; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 2814c32aed51..8935369d503a 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -164,8 +164,11 @@ static int suspend_enter(suspend_state_t state) BUG_ON(!irqs_disabled()); error = sysdev_suspend(PMSG_SUSPEND); - if (!error) + if (!error) { error = syscore_suspend(); + if (error) + sysdev_resume(); + } if (!error) { if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { error = suspend_ops->enter(state); -- cgit v1.2.3 From 19234c0819da0e043a02710488dfd9b242b42eba Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 20 Apr 2011 00:36:11 +0200 Subject: PM: Add missing syscore_suspend() and syscore_resume() calls Device suspend/resume infrastructure is used not only by the suspend and hibernate code in kernel/power, but also by APM, Xen and the kexec jump feature. However, commit 40dc166cb5dddbd36aa4ad11c03915ea (PM / Core: Introduce struct syscore_ops for core subsystems PM) failed to add syscore_suspend() and syscore_resume() calls to that code, which generally leads to breakage when the features in question are used. To fix this problem, add the missing syscore_suspend() and syscore_resume() calls to arch/x86/kernel/apm_32.c, kernel/kexec.c and drivers/xen/manage.c. Signed-off-by: Rafael J. 
Wysocki Acked-by: Greg Kroah-Hartman Acked-by: Ian Campbell --- kernel/kexec.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 55936f9cb251..87b77de03dd3 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -1532,6 +1533,11 @@ int kernel_kexec(void) local_irq_disable(); /* Suspend system devices */ error = sysdev_suspend(PMSG_FREEZE); + if (!error) { + error = syscore_suspend(); + if (error) + sysdev_resume(); + } if (error) goto Enable_irqs; } else @@ -1546,6 +1552,7 @@ int kernel_kexec(void) #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { + syscore_resume(); sysdev_resume(); Enable_irqs: local_irq_enable(); -- cgit v1.2.3 From d20ac252821ab9780ddf00b95629547d3cebc857 Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Mon, 4 Apr 2011 11:20:12 +0200 Subject: ftrace: Build without frame pointers on Microblaze Microblaze doesn't need/support FRAME_POINTERS in order to have a working function tracer. The patch remove Kconfig warning. Warning log: warning: (LOCKDEP && FAULT_INJECTION_STACKTRACE_FILTER && LATENCYTOP && FUNCTION_TRACER && KMEMCHECK) selects FRAME_POINTER which has unmet direct dependencies (DEBUG_KERNEL && (CRIS || M68K || FRV || UML || AVR32 || SUPERH || BLACKFIN || MN10300) || ARCH_WANT_FRAME_POINTERS) Signed-off-by: Michal Simek Link: http://lkml.kernel.org/r/1301908812-8119-2-git-send-email-monstr@monstr.eu CC: Frederic Weisbecker CC: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 61d7d59f4a1a..2ad39e556cb4 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -141,7 +141,7 @@ if FTRACE config FUNCTION_TRACER bool "Kernel Function Tracer" depends on HAVE_FUNCTION_TRACER - select FRAME_POINTER if !ARM_UNWIND && !S390 + select FRAME_POINTER if !ARM_UNWIND && !S390 && !MICROBLAZE select KALLSYMS select GENERIC_TRACER select CONTEXT_SWITCH_TRACER -- cgit v1.2.3 From bf26c018490c2fce7fe9b629083b96ce0e6ad019 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 7 Apr 2011 16:53:20 +0200 Subject: ptrace: Prepare to fix racy accesses on task breakpoints When a task is traced and is in a stopped state, the tracer may execute a ptrace request to examine the tracee state and get its task struct. Right after, the tracee can be killed and thus its breakpoints released. This can happen concurrently when the tracer is in the middle of reading or modifying these breakpoints, leading to dereferencing a freed pointer. Hence, to prepare the fix, create a generic breakpoint reference holding API. When a reference on the breakpoints of a task is held, the breakpoints won't be released until the last reference is dropped. After that, no more ptrace request on the task's breakpoints can be serviced for the tracer. Reported-by: Oleg Nesterov Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Will Deacon Cc: Prasad Cc: Paul Mundt Cc: v2.6.33.. 
Link: http://lkml.kernel.org/r/1302284067-7860-2-git-send-email-fweisbec@gmail.com --- kernel/exit.c | 2 +- kernel/ptrace.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f5d2f63bae0b..8dd874181542 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1016,7 +1016,7 @@ NORET_TYPE void do_exit(long code) /* * FIXME: do that only when needed, using sched_exit tracepoint */ - flush_ptrace_hw_breakpoint(tsk); + ptrace_put_breakpoints(tsk); exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 0fc1eed28d27..dc7ab65f3b36 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -22,6 +22,7 @@ #include #include #include +#include /* @@ -879,3 +880,19 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, return ret; } #endif /* CONFIG_COMPAT */ + +#ifdef CONFIG_HAVE_HW_BREAKPOINT +int ptrace_get_breakpoints(struct task_struct *tsk) +{ + if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt)) + return 0; + + return -1; +} + +void ptrace_put_breakpoints(struct task_struct *tsk) +{ + if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt)) + flush_ptrace_hw_breakpoint(tsk); +} +#endif /* CONFIG_HAVE_HW_BREAKPOINT */ -- cgit v1.2.3 From 1409f141ac719b994d2832911b1e9ec928943fc2 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Wed, 27 Apr 2011 15:26:55 -0700 Subject: kernel/watchdog.c: disable nmi perf event in the error path of enabling watchdog In corner cases where the softlockup watchdog is not set up successfully, the relevant nmi perf event for the hardlockup watchdog should be disabled as well; otherwise the status of the underlying hardware remains unchanged. Also, if the kthread doesn't start then the hrtimer won't run and the hardlockup detector will falsely fire. Signed-off-by: Hillf Danton Signed-off-by: Don Zickus Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 140dce750450..14733d4d156b 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -430,9 +430,12 @@ static int watchdog_enable(int cpu) p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); if (IS_ERR(p)) { printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); - if (!err) + if (!err) { /* if hardlockup hasn't already set this */ err = PTR_ERR(p); + /* and disable the perf event */ + watchdog_nmi_disable(cpu); + } goto out; } kthread_bind(p, cpu); -- cgit v1.2.3 From ce31332d3c77532d6ea97ddcb475a2b02dd358b4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 29 Apr 2011 00:02:00 +0200 Subject: hrtimer: Initialize CLOCK_ID to HRTIMER_BASE table statically MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sedat and Bruno reported RCU stalls which turned out to be caused by the following: sched_init() calls init_rt_bandwidth() which calls hrtimer_init() _BEFORE_ hrtimers_init() is called. While not entirely correct, this worked because hrtimer_init() only accessed statically initialized data (hrtimer_bases.clock_base[CLOCK_MONOTONIC]). Commit e06383db9 (hrtimers: extend hrtimer base code to handle more then 2 clockids) added an indirection to the hrtimer_bases.clock_base lookup to avoid gap handling in the hot path. The table which is used for the translation from CLOCK_ID to HRTIMER_BASE index is initialized at runtime in hrtimers_init(). 
So the early call of the scheduler code translates CLOCK_MONOTONIC to HRTIMER_BASE_REALTIME. Thus the rt_bandwidth timer ends up on CLOCK_REALTIME. If the timer is armed and the wall clock time is set (e.g. ntpdate in the early boot process - which also gives the problem deterministic behaviour, i.e. magic recovery after N hours), then the timer ends up with an expiry time far into the future. That breaks the RT throttler mechanism as rt runtime is accumulated and never cleared, so the rt throttler detects a false cpu hog condition and blocks all RT tasks until the timer finally expires. That in turn stalls the RCU thread of TINYRCU, which leads to a huge amount of RCU callbacks piling up. Make the translation table statically initialized, so we are back to the status of <= 2.6.39. Reported-and-tested-by: Sedat Dilek Reported-by: Bruno Prémont Cc: John Stultz Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/%3Calpine.LFD.2.02.1104282353140.3005%40ionos%3E Reviewed-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 9017478c5d4c..87fdb3f8db14 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -81,7 +81,11 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = } }; -static int hrtimer_clock_to_base_table[MAX_CLOCKS]; +static int hrtimer_clock_to_base_table[MAX_CLOCKS] = { + [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, + [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, + [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, +}; static inline int hrtimer_clockid_to_base(clockid_t clock_id) { @@ -1722,10 +1726,6 @@ static struct notifier_block __cpuinitdata hrtimers_nb = { void __init hrtimers_init(void) { - hrtimer_clock_to_base_table[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME; - hrtimer_clock_to_base_table[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC; - hrtimer_clock_to_base_table[CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME; - hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); -- cgit v1.2.3 From 5035b20fa5cd146b66f5f89619c20a4177fb736d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 29 Apr 2011 18:08:37 +0200 Subject: workqueue: fix deadlock in worker_maybe_bind_and_lock() If a rescuer and stop_machine() bringing down a CPU race with each other, they may deadlock on a non-preemptive kernel. The CPU won't accept a new task, so the rescuer can't migrate to the target CPU, while stop_machine() can't proceed because the rescuer is occupying one of the CPUs while retrying the migration. GCWQ_DISASSOCIATED is never cleared and worker_maybe_bind_and_lock() retries indefinitely. This problem can be reproduced semi-reliably while the system is entering suspend. http://thread.gmane.org/gmane.linux.kernel/1122051 A lot of kudos to Thilo-Alexander for reporting this tricky issue and painstaking testing. stable: This affects all kernels with cmwq, so all kernels since and including v2.6.36 need this fix. 
Signed-off-by: Tejun Heo Reported-by: Thilo-Alexander Ginkel Tested-by: Thilo-Alexander Ginkel Cc: stable@kernel.org --- kernel/workqueue.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 04ef830690ec..e3378e8d3a5c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1291,8 +1291,14 @@ __acquires(&gcwq->lock) return true; spin_unlock_irq(&gcwq->lock); - /* CPU has come up inbetween, retry migration */ + /* + * We've raced with CPU hot[un]plug. Give it a breather + * and retry migration. cond_resched() is required here; + * otherwise, we might deadlock against cpu_stop trying to + * bring down the CPU on non-preemptive kernel. + */ cpu_relax(); + cond_resched(); } } -- cgit v1.2.3 From 94b2c363dcf732a4edab4ed66041cb36e7f28fbf Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sat, 30 Apr 2011 22:56:20 +0200 Subject: genirq: Fix typo CONFIG_GENIRC_IRQ_SHOW_LEVEL commit ab7798ffcf98b11a9525cf65bacdae3fd58d357f ("genirq: Expand generic show_interrupts()") added the Kconfig option GENERIC_IRQ_SHOW_LEVEL to accommodate PowerPC, but this doesn't actually enable the functionality due to a typo in the #ifdef check. Signed-off-by: Geert Uytterhoeven Cc: Linux/PPC Development Link: http://lkml.kernel.org/r/%3Calpine.DEB.2.00.1104302251370.19068%40ayla.of.borg%3E Signed-off-by: Thomas Gleixner --- kernel/irq/proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index dd201bd35103..834899f2500f 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -419,7 +419,7 @@ int show_interrupts(struct seq_file *p, void *v) } else { seq_printf(p, " %8s", "None"); } -#ifdef CONFIG_GENIRC_IRQ_SHOW_LEVEL +#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge"); #endif if (desc->name) -- cgit v1.2.3 From a3a4a5acd3bd2f6f1e102e1f1b9d2e2bb320a7fd Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 5 May 2011 23:55:18 -0400 Subject: Regression: partial revert "tracing: Remove lock_depth from event entry" This partially reverts commit e6e1e2593592a8f6f6380496655d8c6f67431266. That commit changed the structure layout of the trace structure, which in turn broke PowerTOP (1.9x generation) quite badly. I appreciate not wanting to expose the variable in question, and PowerTOP was not using it, so I've replaced the variable with just a padding field - that way, if a new field is needed in the future, it can just use this padding field. Signed-off-by: Arjan van de Ven Signed-off-by: Linus Torvalds --- kernel/trace/trace.c | 1 + kernel/trace/trace_events.c | 1 + 2 files changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d38c16a06a6f..1cb49be7c7fb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1110,6 +1110,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; + entry->padding = 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? 
TRACE_FLAG_IRQS_OFF : 0) | diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e88f74fe1d4c..2fe110341359 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -116,6 +116,7 @@ static int trace_define_common_fields(void) __common_field(unsigned char, flags); __common_field(unsigned char, preempt_count); __common_field(int, pid); + __common_field(int, padding); return ret; } -- cgit v1.2.3
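
For readers of the ptrace change above, here is a minimal, hypothetical sketch of how the new breakpoint reference API is meant to be used by an arch-level ptrace request handler. Only ptrace_get_breakpoints() and ptrace_put_breakpoints() come from the patch in this series; the handler name and the do_read_debugreg() helper are invented for illustration and are not part of any patch shown here.

#include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/sched.h>

/*
 * Illustrative sketch only: pin the tracee's breakpoints for the duration
 * of the access so a concurrent exit cannot flush and free them while the
 * tracer is still looking at them.
 */
static int example_peek_debugreg(struct task_struct *child, int n,
				 unsigned long *val)
{
	int ret;

	/* Non-zero return means the tracee already dropped its last reference. */
	if (ptrace_get_breakpoints(child))
		return -ESRCH;

	ret = do_read_debugreg(child, n, val);	/* hypothetical helper */

	/* The final put flushes the hw breakpoints of an exiting tracee. */
	ptrace_put_breakpoints(child);
	return ret;
}

The point of the pattern is that every handler which touches task breakpoints brackets the access with a get/put pair, and do_exit() only drops the initial reference, so the actual flush is deferred until the last in-flight ptrace request has finished.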