From 49f474331e563a6ecf3b1e87ec27ec5482b3e4f1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 27 Dec 2009 11:51:52 +0100 Subject: perf events: Remove arg from perf sched hooks Since we only ever schedule the local cpu, there is no need to pass the cpu number to the perf sched hooks. This micro-optimizes things a bit. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 27 ++++++++++++++------------- kernel/sched.c | 6 +++--- 2 files changed, 17 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 03cc061398d1..099bd662daa6 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1170,9 +1170,9 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, * not restart the event. */ void perf_event_task_sched_out(struct task_struct *task, - struct task_struct *next, int cpu) + struct task_struct *next) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event_context *ctx = task->perf_event_ctxp; struct perf_event_context *next_ctx; struct perf_event_context *parent; @@ -1252,8 +1252,9 @@ static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx) static void __perf_event_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, int cpu) + struct perf_cpu_context *cpuctx) { + int cpu = smp_processor_id(); struct perf_event *event; int can_add_hw = 1; @@ -1326,24 +1327,24 @@ __perf_event_sched_in(struct perf_event_context *ctx, * accessing the event control register. If a NMI hits, then it will * keep the event running. */ -void perf_event_task_sched_in(struct task_struct *task, int cpu) +void perf_event_task_sched_in(struct task_struct *task) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event_context *ctx = task->perf_event_ctxp; if (likely(!ctx)) return; if (cpuctx->task_ctx == ctx) return; - __perf_event_sched_in(ctx, cpuctx, cpu); + __perf_event_sched_in(ctx, cpuctx); cpuctx->task_ctx = ctx; } -static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) +static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx) { struct perf_event_context *ctx = &cpuctx->ctx; - __perf_event_sched_in(ctx, cpuctx, cpu); + __perf_event_sched_in(ctx, cpuctx); } #define MAX_INTERRUPTS (~0ULL) @@ -1461,7 +1462,7 @@ static void rotate_ctx(struct perf_event_context *ctx) raw_spin_unlock(&ctx->lock); } -void perf_event_task_tick(struct task_struct *curr, int cpu) +void perf_event_task_tick(struct task_struct *curr) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; @@ -1469,7 +1470,7 @@ void perf_event_task_tick(struct task_struct *curr, int cpu) if (!atomic_read(&nr_events)) return; - cpuctx = &per_cpu(perf_cpu_context, cpu); + cpuctx = &__get_cpu_var(perf_cpu_context); ctx = curr->perf_event_ctxp; perf_ctx_adjust_freq(&cpuctx->ctx); @@ -1484,9 +1485,9 @@ void perf_event_task_tick(struct task_struct *curr, int cpu) if (ctx) rotate_ctx(ctx); - perf_event_cpu_sched_in(cpuctx, cpu); + perf_event_cpu_sched_in(cpuctx); if (ctx) - perf_event_task_sched_in(curr, cpu); + perf_event_task_sched_in(curr); } /* @@ -1527,7 +1528,7 @@ static void perf_event_enable_on_exec(struct task_struct *task) raw_spin_unlock(&ctx->lock); - perf_event_task_sched_in(task, smp_processor_id()); + perf_event_task_sched_in(task); out: local_irq_restore(flags); } diff --git a/kernel/sched.c b/kernel/sched.c index 18cceeecce35..d6527ac0f6e7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2752,7 +2752,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) */ prev_state = prev->state; finish_arch_switch(prev); - perf_event_task_sched_in(current, cpu_of(rq)); + perf_event_task_sched_in(current); finish_lock_switch(rq, prev); fire_sched_in_preempt_notifiers(current); @@ -5266,7 +5266,7 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); raw_spin_unlock(&rq->lock); - perf_event_task_tick(curr, cpu); + perf_event_task_tick(curr); #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); @@ -5480,7 +5480,7 @@ need_resched_nonpreemptible: if (likely(prev != next)) { sched_info_switch(prev, next); - perf_event_task_sched_out(prev, next, cpu); + perf_event_task_sched_out(prev, next); rq->nr_switches++; rq->curr = next; -- cgit v1.2.3 From 07b139c8c81b97bbe55c68daf0cbeca8b1c609ca Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 21 Dec 2009 14:27:35 +0800 Subject: perf events: Remove CONFIG_EVENT_PROFILE Quoted from Ingo: | This reminds me - i think we should eliminate CONFIG_EVENT_PROFILE - | it's an unnecessary Kconfig complication. If both PERF_EVENTS and | EVENT_TRACING is enabled we should expose generic tracepoints. | | Nor is it limited to event 'profiling', so it has become a misnomer as | well. Signed-off-by: Li Zefan Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <4B2F1557.2050705@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 4 ++-- kernel/trace/Makefile | 4 +++- kernel/trace/trace_events_filter.c | 4 ++-- kernel/trace/trace_kprobe.c | 14 +++++++------- kernel/trace/trace_syscalls.c | 5 ++--- 5 files changed, 16 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 099bd662daa6..5b987b4a98a8 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4177,7 +4177,7 @@ static const struct pmu perf_ops_task_clock = { .read = task_clock_perf_event_read, }; -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_EVENT_TRACING void perf_tp_event(int event_id, u64 addr, u64 count, void *record, int entry_size) @@ -4282,7 +4282,7 @@ static void perf_event_free_filter(struct perf_event *event) { } -#endif /* CONFIG_EVENT_PROFILE */ +#endif /* CONFIG_EVENT_TRACING */ #ifdef CONFIG_HAVE_HW_BREAKPOINT static void bp_perf_event_destroy(struct perf_event *event) diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index cd9ecd89ec77..d00c6fe23f54 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -51,7 +51,9 @@ endif obj-$(CONFIG_EVENT_TRACING) += trace_events.o obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o -obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o +ifeq ($(CONFIG_PERF_EVENTS),y) +obj-$(CONFIG_EVENT_TRACING) += trace_event_profile.o +endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 50504cb228de..74563d7e102e 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1360,7 +1360,7 @@ out_unlock: return err; } -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS void ftrace_profile_free_filter(struct perf_event *event) { @@ -1428,5 +1428,5 @@ out_unlock: return err; } -#endif /* CONFIG_EVENT_PROFILE */ +#endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 375f81a568dc..75d75dec226a 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1249,7 +1249,7 @@ static int kretprobe_event_show_format(struct ftrace_event_call *call, ", REC->" FIELD_STRING_RETIP); } -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ static __kprobes int kprobe_profile_func(struct kprobe *kp, @@ -1407,7 +1407,7 @@ static void probe_profile_disable(struct ftrace_event_call *call) disable_kprobe(&tp->rp.kp); } } -#endif /* CONFIG_EVENT_PROFILE */ +#endif /* CONFIG_PERF_EVENTS */ static __kprobes @@ -1417,10 +1417,10 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) if (tp->flags & TP_FLAG_TRACE) kprobe_trace_func(kp, regs); -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS if (tp->flags & TP_FLAG_PROFILE) kprobe_profile_func(kp, regs); -#endif /* CONFIG_EVENT_PROFILE */ +#endif return 0; /* We don't tweek kernel, so just return 0 */ } @@ -1431,10 +1431,10 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) if (tp->flags & TP_FLAG_TRACE) kretprobe_trace_func(ri, regs); -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS if (tp->flags & TP_FLAG_PROFILE) kretprobe_profile_func(ri, regs); -#endif /* CONFIG_EVENT_PROFILE */ +#endif return 0; /* We don't tweek kernel, so just return 0 */ } @@ -1463,7 +1463,7 @@ static int register_probe_event(struct trace_probe *tp) call->regfunc = probe_event_enable; call->unregfunc = probe_event_disable; -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS call->profile_enable = probe_profile_enable; call->profile_disable = probe_profile_disable; #endif diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 75289f372dd2..f694f66d75b0 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -421,7 +421,7 @@ int __init init_ftrace_syscalls(void) } core_initcall(init_ftrace_syscalls); -#ifdef CONFIG_EVENT_PROFILE +#ifdef CONFIG_PERF_EVENTS static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls); @@ -626,6 +626,5 @@ void prof_sysexit_disable(struct ftrace_event_call *call) mutex_unlock(&syscall_trace_lock); } -#endif - +#endif /* CONFIG_PERF_EVENTS */ -- cgit v1.2.3 From 14640106f243a3b29944d7198569090fa6546f2d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 5 Jan 2010 17:46:48 -0500 Subject: tracing/kprobe: Drop function argument access syntax Drop function argument access syntax, because the function arguments depend on not only architecture but also compile-options and function API. And now, we have perf-probe for finding register/memory assigned to each argument. Signed-off-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: systemtap Cc: DLE Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: Roland McGrath Cc: Oleg Nesterov Cc: Mahesh Salgaonkar Cc: Benjamin Herrenschmidt Cc: Michael Neuling Cc: linuxppc-dev@ozlabs.org LKML-Reference: <20100105224648.19431.52309.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_kprobe.c | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 47f54ab57b68..7ac728ded964 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -91,11 +91,6 @@ static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr) return retval; } -static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num) -{ - return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num)); -} - static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, void *dummy) { @@ -231,9 +226,7 @@ static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff) { int ret = -EINVAL; - if (ff->func == fetch_argument) - ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data); - else if (ff->func == fetch_register) { + if (ff->func == fetch_register) { const char *name; name = regs_query_register_name((unsigned int)((long)ff->data)); ret = snprintf(buf, n, "%%%s", name); @@ -489,14 +482,6 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) } } else ret = -EINVAL; - } else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) { - ret = strict_strtoul(arg + 3, 10, ¶m); - if (ret || param > PARAM_MAX_ARGS) - ret = -EINVAL; - else { - ff->func = fetch_argument; - ff->data = (void *)param; - } } else ret = -EINVAL; return ret; @@ -611,7 +596,6 @@ static int create_trace_probe(int argc, char **argv) * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] * Fetch args: - * $argN : fetch Nth of function argument. (N:0-) * $retval : fetch return value * $stack : fetch stack address * $stackN : fetch Nth of stack (N:0-) -- cgit v1.2.3 From 8381f65d097dad90416808314737dd7d3ae38ea9 Mon Sep 17 00:00:00 2001 From: Jamie Iles Date: Fri, 8 Jan 2010 15:27:33 +0000 Subject: sched/perf: Make sure irqs are disabled for perf_event_task_sched_in() perf_event_task_sched_in() expects interrupts to be disabled, but on architectures with __ARCH_WANT_INTERRUPTS_ON_CTXSW defined, this isn't true. If this is defined, disable irqs around the call in finish_task_switch(). Signed-off-by: Jamie Iles Acked-by: Peter Zijlstra Cc: Russell King - ARM Linux LKML-Reference: <1262964453-27370-1-git-send-email-jamie.iles@picochip.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e507af086b42..c3ad3427a2a5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2783,7 +2783,13 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) */ prev_state = prev->state; finish_arch_switch(prev); +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_disable(); +#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ perf_event_task_sched_in(current); +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_enable(); +#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ finish_lock_switch(rq, prev); fire_sched_in_preempt_notifiers(current); -- cgit v1.2.3 From 889ff0150661512d79484219612b7e2e024b6c07 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 9 Jan 2010 20:04:47 +0100 Subject: perf/core: Split context's event group list into pinned and non-pinned lists Split-up struct perf_event_context::group_list into pinned_groups and flexible_groups (non-pinned). This first appears to be useless as it duplicates various loops around the group list handlings. But it scales better in the fast-path in perf_sched_in(). We don't anymore iterate twice through the entire list to separate pinned and non-pinned scheduling. Instead we interate through two distinct lists. The another desired effect is that it makes easier to define distinct scheduling rules on both. Changes in v2: - Respectively rename pinned_grp_list and volatile_grp_list into pinned_groups and flexible_groups as per Ingo suggestion. - Various cleanups Signed-off-by: Frederic Weisbecker Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo --- kernel/perf_event.c | 227 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 151 insertions(+), 76 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 27f69a04541d..c9f8a757649d 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -289,6 +289,15 @@ static void update_event_times(struct perf_event *event) event->total_time_running = run_end - event->tstamp_running; } +static struct list_head * +ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) +{ + if (event->attr.pinned) + return &ctx->pinned_groups; + else + return &ctx->flexible_groups; +} + /* * Add a event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. @@ -303,9 +312,12 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) * add it straight to the context's event list, or to the group * leader's sibling list: */ - if (group_leader == event) - list_add_tail(&event->group_entry, &ctx->group_list); - else { + if (group_leader == event) { + struct list_head *list; + + list = ctx_group_list(event, ctx); + list_add_tail(&event->group_entry, list); + } else { list_add_tail(&event->group_entry, &group_leader->sibling_list); group_leader->nr_siblings++; } @@ -355,8 +367,10 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) * to the context list directly: */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { + struct list_head *list; - list_move_tail(&sibling->group_entry, &ctx->group_list); + list = ctx_group_list(event, ctx); + list_move_tail(&sibling->group_entry, list); sibling->group_leader = sibling; } } @@ -1056,7 +1070,10 @@ void __perf_event_sched_out(struct perf_event_context *ctx, perf_disable(); if (ctx->nr_active) { - list_for_each_entry(event, &ctx->group_list, group_entry) + list_for_each_entry(event, &ctx->pinned_groups, group_entry) + group_sched_out(event, cpuctx, ctx); + + list_for_each_entry(event, &ctx->flexible_groups, group_entry) group_sched_out(event, cpuctx, ctx); } perf_enable(); @@ -1271,9 +1288,8 @@ __perf_event_sched_in(struct perf_event_context *ctx, * First go through the list and put on any pinned groups * in order to give them the best chance of going on. */ - list_for_each_entry(event, &ctx->group_list, group_entry) { - if (event->state <= PERF_EVENT_STATE_OFF || - !event->attr.pinned) + list_for_each_entry(event, &ctx->pinned_groups, group_entry) { + if (event->state <= PERF_EVENT_STATE_OFF) continue; if (event->cpu != -1 && event->cpu != cpu) continue; @@ -1291,15 +1307,10 @@ __perf_event_sched_in(struct perf_event_context *ctx, } } - list_for_each_entry(event, &ctx->group_list, group_entry) { - /* - * Ignore events in OFF or ERROR state, and - * ignore pinned events since we did them already. - */ - if (event->state <= PERF_EVENT_STATE_OFF || - event->attr.pinned) + list_for_each_entry(event, &ctx->flexible_groups, group_entry) { + /* Ignore events in OFF or ERROR state */ + if (event->state <= PERF_EVENT_STATE_OFF) continue; - /* * Listen to the 'cpu' scheduling filter constraint * of events: @@ -1453,8 +1464,13 @@ static void rotate_ctx(struct perf_event_context *ctx) * Rotate the first entry last (works just fine for group events too): */ perf_disable(); - list_for_each_entry(event, &ctx->group_list, group_entry) { - list_move_tail(&event->group_entry, &ctx->group_list); + list_for_each_entry(event, &ctx->pinned_groups, group_entry) { + list_move_tail(&event->group_entry, &ctx->pinned_groups); + break; + } + + list_for_each_entry(event, &ctx->flexible_groups, group_entry) { + list_move_tail(&event->group_entry, &ctx->flexible_groups); break; } perf_enable(); @@ -1490,6 +1506,21 @@ void perf_event_task_tick(struct task_struct *curr) perf_event_task_sched_in(curr); } +static int event_enable_on_exec(struct perf_event *event, + struct perf_event_context *ctx) +{ + if (!event->attr.enable_on_exec) + return 0; + + event->attr.enable_on_exec = 0; + if (event->state >= PERF_EVENT_STATE_INACTIVE) + return 0; + + __perf_event_mark_enabled(event, ctx); + + return 1; +} + /* * Enable all of a task's events that have been marked enable-on-exec. * This expects task == current. @@ -1500,6 +1531,7 @@ static void perf_event_enable_on_exec(struct task_struct *task) struct perf_event *event; unsigned long flags; int enabled = 0; + int ret; local_irq_save(flags); ctx = task->perf_event_ctxp; @@ -1510,14 +1542,16 @@ static void perf_event_enable_on_exec(struct task_struct *task) raw_spin_lock(&ctx->lock); - list_for_each_entry(event, &ctx->group_list, group_entry) { - if (!event->attr.enable_on_exec) - continue; - event->attr.enable_on_exec = 0; - if (event->state >= PERF_EVENT_STATE_INACTIVE) - continue; - __perf_event_mark_enabled(event, ctx); - enabled = 1; + list_for_each_entry(event, &ctx->pinned_groups, group_entry) { + ret = event_enable_on_exec(event, ctx); + if (ret) + enabled = 1; + } + + list_for_each_entry(event, &ctx->flexible_groups, group_entry) { + ret = event_enable_on_exec(event, ctx); + if (ret) + enabled = 1; } /* @@ -1591,7 +1625,8 @@ __perf_event_init_context(struct perf_event_context *ctx, { raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); - INIT_LIST_HEAD(&ctx->group_list); + INIT_LIST_HEAD(&ctx->pinned_groups); + INIT_LIST_HEAD(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); atomic_set(&ctx->refcount, 1); ctx->task = task; @@ -5032,7 +5067,11 @@ void perf_event_exit_task(struct task_struct *child) mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); again: - list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list, + list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, + group_entry) + __perf_event_exit_task(child_event, child_ctx, child); + + list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups, group_entry) __perf_event_exit_task(child_event, child_ctx, child); @@ -5041,7 +5080,8 @@ again: * its siblings to the list, but we obtained 'tmp' before that which * will still point to the list head terminating the iteration. */ - if (!list_empty(&child_ctx->group_list)) + if (!list_empty(&child_ctx->pinned_groups) || + !list_empty(&child_ctx->flexible_groups)) goto again; mutex_unlock(&child_ctx->mutex); @@ -5049,6 +5089,24 @@ again: put_ctx(child_ctx); } +static void perf_free_event(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *parent = event->parent; + + if (WARN_ON_ONCE(!parent)) + return; + + mutex_lock(&parent->child_mutex); + list_del_init(&event->child_list); + mutex_unlock(&parent->child_mutex); + + fput(parent->filp); + + list_del_event(event, ctx); + free_event(event); +} + /* * free an unexposed, unused context as created by inheritance by * init_task below, used by fork() in case of fail. @@ -5063,36 +5121,70 @@ void perf_event_free_task(struct task_struct *task) mutex_lock(&ctx->mutex); again: - list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) { - struct perf_event *parent = event->parent; + list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) + perf_free_event(event, ctx); - if (WARN_ON_ONCE(!parent)) - continue; + list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, + group_entry) + perf_free_event(event, ctx); - mutex_lock(&parent->child_mutex); - list_del_init(&event->child_list); - mutex_unlock(&parent->child_mutex); + if (!list_empty(&ctx->pinned_groups) || + !list_empty(&ctx->flexible_groups)) + goto again; - fput(parent->filp); + mutex_unlock(&ctx->mutex); - list_del_event(event, ctx); - free_event(event); + put_ctx(ctx); +} + +static int +inherit_task_group(struct perf_event *event, struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + int *inherited_all) +{ + int ret; + struct perf_event_context *child_ctx = child->perf_event_ctxp; + + if (!event->attr.inherit) { + *inherited_all = 0; + return 0; } - if (!list_empty(&ctx->group_list)) - goto again; + if (!child_ctx) { + /* + * This is executed from the parent task context, so + * inherit events that have been marked for cloning. + * First allocate and initialize a context for the + * child. + */ - mutex_unlock(&ctx->mutex); + child_ctx = kzalloc(sizeof(struct perf_event_context), + GFP_KERNEL); + if (!child_ctx) + return -ENOMEM; - put_ctx(ctx); + __perf_event_init_context(child_ctx, child); + child->perf_event_ctxp = child_ctx; + get_task_struct(child); + } + + ret = inherit_group(event, parent, parent_ctx, + child, child_ctx); + + if (ret) + *inherited_all = 0; + + return ret; } + /* * Initialize the perf_event context in task_struct */ int perf_event_init_task(struct task_struct *child) { - struct perf_event_context *child_ctx = NULL, *parent_ctx; + struct perf_event_context *child_ctx, *parent_ctx; struct perf_event_context *cloned_ctx; struct perf_event *event; struct task_struct *parent = current; @@ -5130,41 +5222,22 @@ int perf_event_init_task(struct task_struct *child) * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ - list_for_each_entry(event, &parent_ctx->group_list, group_entry) { - - if (!event->attr.inherit) { - inherited_all = 0; - continue; - } - - if (!child->perf_event_ctxp) { - /* - * This is executed from the parent task context, so - * inherit events that have been marked for cloning. - * First allocate and initialize a context for the - * child. - */ - - child_ctx = kzalloc(sizeof(struct perf_event_context), - GFP_KERNEL); - if (!child_ctx) { - ret = -ENOMEM; - break; - } - - __perf_event_init_context(child_ctx, child); - child->perf_event_ctxp = child_ctx; - get_task_struct(child); - } + list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { + ret = inherit_task_group(event, parent, parent_ctx, child, + &inherited_all); + if (ret) + break; + } - ret = inherit_group(event, parent, parent_ctx, - child, child_ctx); - if (ret) { - inherited_all = 0; + list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { + ret = inherit_task_group(event, parent, parent_ctx, child, + &inherited_all); + if (ret) break; - } } + child_ctx = child->perf_event_ctxp; + if (child_ctx && inherited_all) { /* * Mark the child context as a clone of the parent @@ -5213,7 +5286,9 @@ static void __perf_event_exit_cpu(void *info) struct perf_event_context *ctx = &cpuctx->ctx; struct perf_event *event, *tmp; - list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) + list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) + __perf_event_remove_from_context(event); + list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) __perf_event_remove_from_context(event); } static void perf_event_exit_cpu(int cpu) -- cgit v1.2.3 From e286417378b4f9ce6e473b556193465ab22e12ab Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 9 Jan 2010 21:05:28 +0100 Subject: perf: Round robin flexible groups of events using list_rotate_left() This is more proper that doing it through a list_for_each_entry() that breaks after the first entry. v2: Don't rotate pinned groups as its not needed to time share them. Signed-off-by: Frederic Weisbecker Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo --- kernel/perf_event.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index c9f8a757649d..bbebe2832639 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1454,25 +1454,16 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) */ static void rotate_ctx(struct perf_event_context *ctx) { - struct perf_event *event; - if (!ctx->nr_events) return; raw_spin_lock(&ctx->lock); - /* - * Rotate the first entry last (works just fine for group events too): - */ + + /* Rotate the first entry last of non-pinned groups */ perf_disable(); - list_for_each_entry(event, &ctx->pinned_groups, group_entry) { - list_move_tail(&event->group_entry, &ctx->pinned_groups); - break; - } - list_for_each_entry(event, &ctx->flexible_groups, group_entry) { - list_move_tail(&event->group_entry, &ctx->flexible_groups); - break; - } + list_rotate_left(&ctx->flexible_groups); + perf_enable(); raw_spin_unlock(&ctx->lock); -- cgit v1.2.3 From d6f962b57bfaab62891c7abbf1469212a56d6103 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 10 Jan 2010 01:25:51 +0100 Subject: perf: Export software-only event group characteristic as a flag Before scheduling an event group, we first check if a group can go on. We first check if the group is made of software only events first, in which case it is enough to know if the group can be scheduled in. For that purpose, we iterate through the whole group, which is wasteful as we could do this check when we add/delete an event to a group. So we create a group_flags field in perf event that can host characteristics from a group of events, starting with a first PERF_GROUP_SOFTWARE flag that reduces the check on the fast path. Signed-off-by: Frederic Weisbecker Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo --- kernel/perf_event.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index bbebe2832639..eae6ff693604 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -315,9 +315,16 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) if (group_leader == event) { struct list_head *list; + if (is_software_event(event)) + event->group_flags |= PERF_GROUP_SOFTWARE; + list = ctx_group_list(event, ctx); list_add_tail(&event->group_entry, list); } else { + if (group_leader->group_flags & PERF_GROUP_SOFTWARE && + !is_software_event(event)) + group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; + list_add_tail(&event->group_entry, &group_leader->sibling_list); group_leader->nr_siblings++; } @@ -372,6 +379,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) list = ctx_group_list(event, ctx); list_move_tail(&sibling->group_entry, list); sibling->group_leader = sibling; + + /* Inherit group flags from the previous leader */ + sibling->group_flags = event->group_flags; } } @@ -699,24 +709,6 @@ group_error: return -EAGAIN; } -/* - * Return 1 for a group consisting entirely of software events, - * 0 if the group contains any hardware events. - */ -static int is_software_only_group(struct perf_event *leader) -{ - struct perf_event *event; - - if (!is_software_event(leader)) - return 0; - - list_for_each_entry(event, &leader->sibling_list, group_entry) - if (!is_software_event(event)) - return 0; - - return 1; -} - /* * Work out whether we can put this event group on the CPU now. */ @@ -727,7 +719,7 @@ static int group_can_go_on(struct perf_event *event, /* * Groups consisting entirely of software events can always go on. */ - if (is_software_only_group(event)) + if (event->group_flags & PERF_GROUP_SOFTWARE) return 1; /* * If an exclusive group is already on, no other hardware -- cgit v1.2.3 From 231e36f4d2e63dd770db80b9f5113310c2bcfcfd Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 14 Jan 2010 00:12:12 -0500 Subject: tracing/kprobe: Update kprobe tracing self test for new syntax Update kprobe tracing self test for new syntax (it supports deleting individual probes, and drops $argN support) and behavior change (new probes are disabled in default). This selftest includes the following checks: - Adding function-entry probe and return probe with arguments. - Enabling these probes. - Deleting it individually. Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <20100114051211.7814.29436.stgit@localhost6.localdomain6> Signed-off-by: Ingo Molnar --- kernel/trace/trace_kprobe.c | 55 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 7ac728ded964..d6266cad6953 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1507,28 +1507,67 @@ static int kprobe_trace_selftest_target(int a1, int a2, int a3, static __init int kprobe_trace_self_tests_init(void) { - int ret; + int ret, warn = 0; int (*target)(int, int, int, int, int, int); + struct trace_probe *tp; target = kprobe_trace_selftest_target; pr_info("Testing kprobe tracing: "); ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " - "$arg1 $arg2 $arg3 $arg4 $stack $stack0"); - if (WARN_ON_ONCE(ret)) - pr_warning("error enabling function entry\n"); + "$stack $stack0 +0($stack)"); + if (WARN_ON_ONCE(ret)) { + pr_warning("error on probing function entry.\n"); + warn++; + } else { + /* Enable trace point */ + tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tp == NULL)) { + pr_warning("error on getting new probe.\n"); + warn++; + } else + probe_event_enable(&tp->call); + } ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " "$retval"); - if (WARN_ON_ONCE(ret)) - pr_warning("error enabling function return\n"); + if (WARN_ON_ONCE(ret)) { + pr_warning("error on probing function return.\n"); + warn++; + } else { + /* Enable trace point */ + tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tp == NULL)) { + pr_warning("error on getting new probe.\n"); + warn++; + } else + probe_event_enable(&tp->call); + } + + if (warn) + goto end; ret = target(1, 2, 3, 4, 5, 6); - cleanup_all_probes(); + ret = command_trace_probe("-:testprobe"); + if (WARN_ON_ONCE(ret)) { + pr_warning("error on deleting a probe.\n"); + warn++; + } + + ret = command_trace_probe("-:testprobe2"); + if (WARN_ON_ONCE(ret)) { + pr_warning("error on deleting a probe.\n"); + warn++; + } - pr_cont("OK\n"); +end: + cleanup_all_probes(); + if (warn) + pr_cont("NG: Some tests are failed. Please check them.\n"); + else + pr_cont("OK\n"); return 0; } -- cgit v1.2.3 From 42cce92f4ddfa41e2dfe26fdcad4887943c032f2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 17 Jan 2010 10:36:08 +0100 Subject: perf: Make __perf_event_sched_out static __perf_event_sched_out doesn't need to be globally available, make it static. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo --- kernel/perf_event.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index eae6ff693604..c4e90b8cd60d 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1049,8 +1049,8 @@ static int perf_event_refresh(struct perf_event *event, int refresh) return 0; } -void __perf_event_sched_out(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) +static void __perf_event_sched_out(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx) { struct perf_event *event; -- cgit v1.2.3 From 5b0311e1f2464547fc6f17a82d7ea2538c8c7a70 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 17 Jan 2010 11:59:13 +0100 Subject: perf: Allow pinned and flexible groups to be scheduled separately Tune the scheduling helpers so that we can choose to schedule either pinned and/or flexible groups from a context. And while at it, refactor a bit the naming of these helpers to make these more consistent and flexible. There is no (intended) change in scheduling behaviour in this patch. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo --- kernel/perf_event.c | 137 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 93 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index c4e90b8cd60d..bfc4ee015c87 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1049,8 +1049,15 @@ static int perf_event_refresh(struct perf_event *event, int refresh) return 0; } -static void __perf_event_sched_out(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) +enum event_type_t { + EVENT_FLEXIBLE = 0x1, + EVENT_PINNED = 0x2, + EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, +}; + +static void ctx_sched_out(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + enum event_type_t event_type) { struct perf_event *event; @@ -1061,13 +1068,18 @@ static void __perf_event_sched_out(struct perf_event_context *ctx, update_context_time(ctx); perf_disable(); - if (ctx->nr_active) { + if (!ctx->nr_active) + goto out_enable; + + if (event_type & EVENT_PINNED) list_for_each_entry(event, &ctx->pinned_groups, group_entry) group_sched_out(event, cpuctx, ctx); + if (event_type & EVENT_FLEXIBLE) list_for_each_entry(event, &ctx->flexible_groups, group_entry) group_sched_out(event, cpuctx, ctx); - } + + out_enable: perf_enable(); out: raw_spin_unlock(&ctx->lock); @@ -1229,15 +1241,13 @@ void perf_event_task_sched_out(struct task_struct *task, rcu_read_unlock(); if (do_switch) { - __perf_event_sched_out(ctx, cpuctx); + ctx_sched_out(ctx, cpuctx, EVENT_ALL); cpuctx->task_ctx = NULL; } } -/* - * Called with IRQs disabled - */ -static void __perf_event_task_sched_out(struct perf_event_context *ctx) +static void task_ctx_sched_out(struct perf_event_context *ctx, + enum event_type_t event_type) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); @@ -1247,39 +1257,34 @@ static void __perf_event_task_sched_out(struct perf_event_context *ctx) if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; - __perf_event_sched_out(ctx, cpuctx); + ctx_sched_out(ctx, cpuctx, event_type); cpuctx->task_ctx = NULL; } /* * Called with IRQs disabled */ -static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx) +static void __perf_event_task_sched_out(struct perf_event_context *ctx) { - __perf_event_sched_out(&cpuctx->ctx, cpuctx); + task_ctx_sched_out(ctx, EVENT_ALL); +} + +/* + * Called with IRQs disabled + */ +static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, + enum event_type_t event_type) +{ + ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); } static void -__perf_event_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) +ctx_pinned_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + int cpu) { - int cpu = smp_processor_id(); struct perf_event *event; - int can_add_hw = 1; - - raw_spin_lock(&ctx->lock); - ctx->is_active = 1; - if (likely(!ctx->nr_events)) - goto out; - - ctx->timestamp = perf_clock(); - - perf_disable(); - /* - * First go through the list and put on any pinned groups - * in order to give them the best chance of going on. - */ list_for_each_entry(event, &ctx->pinned_groups, group_entry) { if (event->state <= PERF_EVENT_STATE_OFF) continue; @@ -1298,6 +1303,15 @@ __perf_event_sched_in(struct perf_event_context *ctx, event->state = PERF_EVENT_STATE_ERROR; } } +} + +static void +ctx_flexible_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + int cpu) +{ + struct perf_event *event; + int can_add_hw = 1; list_for_each_entry(event, &ctx->flexible_groups, group_entry) { /* Ignore events in OFF or ERROR state */ @@ -1314,11 +1328,53 @@ __perf_event_sched_in(struct perf_event_context *ctx, if (group_sched_in(event, cpuctx, ctx, cpu)) can_add_hw = 0; } +} + +static void +ctx_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + enum event_type_t event_type) +{ + int cpu = smp_processor_id(); + + raw_spin_lock(&ctx->lock); + ctx->is_active = 1; + if (likely(!ctx->nr_events)) + goto out; + + ctx->timestamp = perf_clock(); + + perf_disable(); + + /* + * First go through the list and put on any pinned groups + * in order to give them the best chance of going on. + */ + if (event_type & EVENT_PINNED) + ctx_pinned_sched_in(ctx, cpuctx, cpu); + + /* Then walk through the lower prio flexible groups */ + if (event_type & EVENT_FLEXIBLE) + ctx_flexible_sched_in(ctx, cpuctx, cpu); + perf_enable(); out: raw_spin_unlock(&ctx->lock); } +static void task_ctx_sched_in(struct task_struct *task, + enum event_type_t event_type) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event_context *ctx = task->perf_event_ctxp; + + if (likely(!ctx)) + return; + if (cpuctx->task_ctx == ctx) + return; + ctx_sched_in(ctx, cpuctx, event_type); + cpuctx->task_ctx = ctx; +} /* * Called from scheduler to add the events of the current task * with interrupts disabled. @@ -1332,22 +1388,15 @@ __perf_event_sched_in(struct perf_event_context *ctx, */ void perf_event_task_sched_in(struct task_struct *task) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_event_context *ctx = task->perf_event_ctxp; - - if (likely(!ctx)) - return; - if (cpuctx->task_ctx == ctx) - return; - __perf_event_sched_in(ctx, cpuctx); - cpuctx->task_ctx = ctx; + task_ctx_sched_in(task, EVENT_ALL); } -static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx) +static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, + enum event_type_t event_type) { struct perf_event_context *ctx = &cpuctx->ctx; - __perf_event_sched_in(ctx, cpuctx); + ctx_sched_in(ctx, cpuctx, event_type); } #define MAX_INTERRUPTS (~0ULL) @@ -1476,17 +1525,17 @@ void perf_event_task_tick(struct task_struct *curr) if (ctx) perf_ctx_adjust_freq(ctx); - perf_event_cpu_sched_out(cpuctx); + cpu_ctx_sched_out(cpuctx, EVENT_ALL); if (ctx) - __perf_event_task_sched_out(ctx); + task_ctx_sched_out(ctx, EVENT_ALL); rotate_ctx(&cpuctx->ctx); if (ctx) rotate_ctx(ctx); - perf_event_cpu_sched_in(cpuctx); + cpu_ctx_sched_in(cpuctx, EVENT_ALL); if (ctx) - perf_event_task_sched_in(curr); + task_ctx_sched_in(curr, EVENT_ALL); } static int event_enable_on_exec(struct perf_event *event, -- cgit v1.2.3 From 7defb0f879bbcfe29e3c6f29d685d4f29b7a0700 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 17 Jan 2010 12:15:31 +0100 Subject: perf: Don't schedule out/in pinned events on task tick We don't need to schedule in/out pinned events on task tick, now that pinned and flexible groups can be scheduled separately. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo --- kernel/perf_event.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index bfc4ee015c87..a90ae694cbc1 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1525,17 +1525,17 @@ void perf_event_task_tick(struct task_struct *curr) if (ctx) perf_ctx_adjust_freq(ctx); - cpu_ctx_sched_out(cpuctx, EVENT_ALL); + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); if (ctx) - task_ctx_sched_out(ctx, EVENT_ALL); + task_ctx_sched_out(ctx, EVENT_FLEXIBLE); rotate_ctx(&cpuctx->ctx); if (ctx) rotate_ctx(ctx); - cpu_ctx_sched_in(cpuctx, EVENT_ALL); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); if (ctx) - task_ctx_sched_in(curr, EVENT_ALL); + task_ctx_sched_in(curr, EVENT_FLEXIBLE); } static int event_enable_on_exec(struct perf_event *event, -- cgit v1.2.3 From 329c0e012b99fa2325a0be205c052e4aba690f16 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 17 Jan 2010 12:56:05 +0100 Subject: perf: Better order flexible and pinned scheduling When a task gets scheduled in. We don't touch the cpu bound events so the priority order becomes: cpu pinned, cpu flexible, task pinned, task flexible. So schedule out cpu flexibles when a new task context gets in and correctly order the groups to schedule in: task pinned, cpu flexible, task flexible. Cpu pinned groups don't need to be touched at this time. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo --- kernel/perf_event.c | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index a90ae694cbc1..edc46b92b508 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1362,6 +1362,14 @@ ctx_sched_in(struct perf_event_context *ctx, raw_spin_unlock(&ctx->lock); } +static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, + enum event_type_t event_type) +{ + struct perf_event_context *ctx = &cpuctx->ctx; + + ctx_sched_in(ctx, cpuctx, event_type); +} + static void task_ctx_sched_in(struct task_struct *task, enum event_type_t event_type) { @@ -1388,15 +1396,27 @@ static void task_ctx_sched_in(struct task_struct *task, */ void perf_event_task_sched_in(struct task_struct *task) { - task_ctx_sched_in(task, EVENT_ALL); -} + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event_context *ctx = task->perf_event_ctxp; -static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type) -{ - struct perf_event_context *ctx = &cpuctx->ctx; + if (likely(!ctx)) + return; - ctx_sched_in(ctx, cpuctx, event_type); + if (cpuctx->task_ctx == ctx) + return; + + /* + * We want to keep the following priority order: + * cpu pinned (that don't need to move), task pinned, + * cpu flexible, task flexible. + */ + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + + ctx_sched_in(ctx, cpuctx, EVENT_PINNED); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); + ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); + + cpuctx->task_ctx = ctx; } #define MAX_INTERRUPTS (~0ULL) -- cgit v1.2.3 From abd50713944c8ea9e0af5b7bffa0aacae21cc91a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 26 Jan 2010 18:50:16 +0100 Subject: perf: Reimplement frequency driven sampling There was a bug in the old period code that caused intel_pmu_enable_all() or native_write_msr_safe() to show up quite high in the profiles. In staring at that code it made my head hurt, so I rewrote it in a hopefully simpler fashion. Its now fully symetric between tick and overflow driven adjustments and uses less data to boot. The only complication is that it basically wants to do a u128 division. The code approximates that in a rather simple truncate until it fits fashion, taking care to balance the terms while truncating. This version does not generate that sampling artefact. Signed-off-by: Peter Zijlstra LKML-Reference: Cc: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 132 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 92 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index edc46b92b508..251fb9552492 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1423,14 +1423,83 @@ void perf_event_task_sched_in(struct task_struct *task) static void perf_log_throttle(struct perf_event *event, int enable); -static void perf_adjust_period(struct perf_event *event, u64 events) +static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) +{ + u64 frequency = event->attr.sample_freq; + u64 sec = NSEC_PER_SEC; + u64 divisor, dividend; + + int count_fls, nsec_fls, frequency_fls, sec_fls; + + count_fls = fls64(count); + nsec_fls = fls64(nsec); + frequency_fls = fls64(frequency); + sec_fls = 30; + + /* + * We got @count in @nsec, with a target of sample_freq HZ + * the target period becomes: + * + * @count * 10^9 + * period = ------------------- + * @nsec * sample_freq + * + */ + + /* + * Reduce accuracy by one bit such that @a and @b converge + * to a similar magnitude. + */ +#define REDUCE_FLS(a, b) \ +do { \ + if (a##_fls > b##_fls) { \ + a >>= 1; \ + a##_fls--; \ + } else { \ + b >>= 1; \ + b##_fls--; \ + } \ +} while (0) + + /* + * Reduce accuracy until either term fits in a u64, then proceed with + * the other, so that finally we can do a u64/u64 division. + */ + while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) { + REDUCE_FLS(nsec, frequency); + REDUCE_FLS(sec, count); + } + + if (count_fls + sec_fls > 64) { + divisor = nsec * frequency; + + while (count_fls + sec_fls > 64) { + REDUCE_FLS(count, sec); + divisor >>= 1; + } + + dividend = count * sec; + } else { + dividend = count * sec; + + while (nsec_fls + frequency_fls > 64) { + REDUCE_FLS(nsec, frequency); + dividend >>= 1; + } + + divisor = nsec * frequency; + } + + return div64_u64(dividend, divisor); +} + +static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) { struct hw_perf_event *hwc = &event->hw; u64 period, sample_period; s64 delta; - events *= hwc->sample_period; - period = div64_u64(events, event->attr.sample_freq); + period = perf_calculate_period(event, nsec, count); delta = (s64)(period - hwc->sample_period); delta = (delta + 7) / 8; /* low pass filter */ @@ -1441,13 +1510,22 @@ static void perf_adjust_period(struct perf_event *event, u64 events) sample_period = 1; hwc->sample_period = sample_period; + + if (atomic64_read(&hwc->period_left) > 8*sample_period) { + perf_disable(); + event->pmu->disable(event); + atomic64_set(&hwc->period_left, 0); + event->pmu->enable(event); + perf_enable(); + } } static void perf_ctx_adjust_freq(struct perf_event_context *ctx) { struct perf_event *event; struct hw_perf_event *hwc; - u64 interrupts, freq; + u64 interrupts, now; + s64 delta; raw_spin_lock(&ctx->lock); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { @@ -1468,44 +1546,18 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) if (interrupts == MAX_INTERRUPTS) { perf_log_throttle(event, 1); event->pmu->unthrottle(event); - interrupts = 2*sysctl_perf_event_sample_rate/HZ; } if (!event->attr.freq || !event->attr.sample_freq) continue; - /* - * if the specified freq < HZ then we need to skip ticks - */ - if (event->attr.sample_freq < HZ) { - freq = event->attr.sample_freq; - - hwc->freq_count += freq; - hwc->freq_interrupts += interrupts; - - if (hwc->freq_count < HZ) - continue; - - interrupts = hwc->freq_interrupts; - hwc->freq_interrupts = 0; - hwc->freq_count -= HZ; - } else - freq = HZ; - - perf_adjust_period(event, freq * interrupts); + event->pmu->read(event); + now = atomic64_read(&event->count); + delta = now - hwc->freq_count_stamp; + hwc->freq_count_stamp = now; - /* - * In order to avoid being stalled by an (accidental) huge - * sample period, force reset the sample period if we didn't - * get any events in this freq period. - */ - if (!interrupts) { - perf_disable(); - event->pmu->disable(event); - atomic64_set(&hwc->period_left, 0); - event->pmu->enable(event); - perf_enable(); - } + if (delta > 0) + perf_adjust_period(event, TICK_NSEC, delta); } raw_spin_unlock(&ctx->lock); } @@ -3768,12 +3820,12 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, if (event->attr.freq) { u64 now = perf_clock(); - s64 delta = now - hwc->freq_stamp; + s64 delta = now - hwc->freq_time_stamp; - hwc->freq_stamp = now; + hwc->freq_time_stamp = now; - if (delta > 0 && delta < TICK_NSEC) - perf_adjust_period(event, NSEC_PER_SEC / (int)delta); + if (delta > 0 && delta < 2*TICK_NSEC) + perf_adjust_period(event, delta, hwc->last_period); } /* -- cgit v1.2.3 From 430ad5a600a83956749307b13257c464c3826b55 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 28 Jan 2010 09:32:29 +0800 Subject: perf: Factorize trace events raw sample buffer operations Introduce ftrace_perf_buf_prepare() and ftrace_perf_buf_submit() to gather the common code that operates on raw events sampling buffer. This cleans up redundant code between regular trace events, syscall events and kprobe events. Changelog v1->v2: - Rename function name as per Masami and Frederic's suggestion - Add __kprobes for ftrace_perf_buf_prepare() and make ftrace_perf_buf_submit() inline as per Masami's suggestion - Export ftrace_perf_buf_prepare since modules will use it Signed-off-by: Xiao Guangrong Acked-by: Masami Hiramatsu Cc: Ingo Molnar Cc: Steven Rostedt Cc: Paul Mackerras Cc: Jason Baron Cc: Peter Zijlstra LKML-Reference: <4B60E92D.9000808@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_event_profile.c | 52 ++++++++++++++++++++--- kernel/trace/trace_kprobe.c | 86 +++++--------------------------------- kernel/trace/trace_syscalls.c | 71 +++++-------------------------- 3 files changed, 67 insertions(+), 142 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 9e25573242cf..f0d693005075 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -6,14 +6,12 @@ */ #include +#include #include "trace.h" -char *perf_trace_buf; -EXPORT_SYMBOL_GPL(perf_trace_buf); - -char *perf_trace_buf_nmi; -EXPORT_SYMBOL_GPL(perf_trace_buf_nmi); +static char *perf_trace_buf; +static char *perf_trace_buf_nmi; typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ; @@ -120,3 +118,47 @@ void ftrace_profile_disable(int event_id) } mutex_unlock(&event_mutex); } + +__kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type, + int *rctxp, unsigned long *irq_flags) +{ + struct trace_entry *entry; + char *trace_buf, *raw_data; + int pc, cpu; + + pc = preempt_count(); + + /* Protect the per cpu buffer, begin the rcu read side */ + local_irq_save(*irq_flags); + + *rctxp = perf_swevent_get_recursion_context(); + if (*rctxp < 0) + goto err_recursion; + + cpu = smp_processor_id(); + + if (in_nmi()) + trace_buf = rcu_dereference(perf_trace_buf_nmi); + else + trace_buf = rcu_dereference(perf_trace_buf); + + if (!trace_buf) + goto err; + + raw_data = per_cpu_ptr(trace_buf, cpu); + + /* zero the dead bytes from align to not leak stack to user */ + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + + entry = (struct trace_entry *)raw_data; + tracing_generic_entry_update(entry, *irq_flags, pc); + entry->type = type; + + return raw_data; +err: + perf_swevent_put_recursion_context(*rctxp); +err_recursion: + local_irq_restore(*irq_flags); + return NULL; +} +EXPORT_SYMBOL_GPL(ftrace_perf_buf_prepare); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d6266cad6953..2e28ee36646f 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1243,14 +1243,10 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp, struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); struct ftrace_event_call *call = &tp->call; struct kprobe_trace_entry *entry; - struct trace_entry *ent; - int size, __size, i, pc, __cpu; + int size, __size, i; unsigned long irq_flags; - char *trace_buf; - char *raw_data; int rctx; - pc = preempt_count(); __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); @@ -1258,45 +1254,16 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp, "profile buffer not large enough")) return 0; - /* - * Protect the non nmi buffer - * This also protects the rcu read side - */ - local_irq_save(irq_flags); - - rctx = perf_swevent_get_recursion_context(); - if (rctx < 0) - goto end_recursion; - - __cpu = smp_processor_id(); - - if (in_nmi()) - trace_buf = rcu_dereference(perf_trace_buf_nmi); - else - trace_buf = rcu_dereference(perf_trace_buf); - - if (!trace_buf) - goto end; - - raw_data = per_cpu_ptr(trace_buf, __cpu); - - /* Zero dead bytes from alignment to avoid buffer leak to userspace */ - *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; - entry = (struct kprobe_trace_entry *)raw_data; - ent = &entry->ent; + entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags); + if (!entry) + return 0; - tracing_generic_entry_update(ent, irq_flags, pc); - ent->type = call->id; entry->nargs = tp->nr_args; entry->ip = (unsigned long)kp->addr; for (i = 0; i < tp->nr_args; i++) entry->args[i] = call_fetch(&tp->args[i].fetch, regs); - perf_tp_event(call->id, entry->ip, 1, entry, size); -end: - perf_swevent_put_recursion_context(rctx); -end_recursion: - local_irq_restore(irq_flags); + ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags); return 0; } @@ -1308,14 +1275,10 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); struct ftrace_event_call *call = &tp->call; struct kretprobe_trace_entry *entry; - struct trace_entry *ent; - int size, __size, i, pc, __cpu; + int size, __size, i; unsigned long irq_flags; - char *trace_buf; - char *raw_data; int rctx; - pc = preempt_count(); __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); @@ -1323,46 +1286,17 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, "profile buffer not large enough")) return 0; - /* - * Protect the non nmi buffer - * This also protects the rcu read side - */ - local_irq_save(irq_flags); - - rctx = perf_swevent_get_recursion_context(); - if (rctx < 0) - goto end_recursion; - - __cpu = smp_processor_id(); - - if (in_nmi()) - trace_buf = rcu_dereference(perf_trace_buf_nmi); - else - trace_buf = rcu_dereference(perf_trace_buf); - - if (!trace_buf) - goto end; - - raw_data = per_cpu_ptr(trace_buf, __cpu); - - /* Zero dead bytes from alignment to avoid buffer leak to userspace */ - *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; - entry = (struct kretprobe_trace_entry *)raw_data; - ent = &entry->ent; + entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags); + if (!entry) + return 0; - tracing_generic_entry_update(ent, irq_flags, pc); - ent->type = call->id; entry->nargs = tp->nr_args; entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; for (i = 0; i < tp->nr_args; i++) entry->args[i] = call_fetch(&tp->args[i].fetch, regs); - perf_tp_event(call->id, entry->ret_ip, 1, entry, size); -end: - perf_swevent_put_recursion_context(rctx); -end_recursion: - local_irq_restore(irq_flags); + ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags); return 0; } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index f694f66d75b0..4e332b9e449c 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -433,12 +433,9 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; unsigned long flags; - char *trace_buf; - char *raw_data; int syscall_nr; int rctx; int size; - int cpu; syscall_nr = syscall_get_nr(current, regs); if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) @@ -457,37 +454,15 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) "profile buffer not large enough")) return; - /* Protect the per cpu buffer, begin the rcu read side */ - local_irq_save(flags); - - rctx = perf_swevent_get_recursion_context(); - if (rctx < 0) - goto end_recursion; - - cpu = smp_processor_id(); - - trace_buf = rcu_dereference(perf_trace_buf); - - if (!trace_buf) - goto end; - - raw_data = per_cpu_ptr(trace_buf, cpu); - - /* zero the dead bytes from align to not leak stack to user */ - *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + rec = (struct syscall_trace_enter *)ftrace_perf_buf_prepare(size, + sys_data->enter_event->id, &rctx, &flags); + if (!rec) + return; - rec = (struct syscall_trace_enter *) raw_data; - tracing_generic_entry_update(&rec->ent, 0, 0); - rec->ent.type = sys_data->enter_event->id; rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size); - -end: - perf_swevent_put_recursion_context(rctx); -end_recursion: - local_irq_restore(flags); + ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags); } int prof_sysenter_enable(struct ftrace_event_call *call) @@ -531,11 +506,8 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) struct syscall_trace_exit *rec; unsigned long flags; int syscall_nr; - char *trace_buf; - char *raw_data; int rctx; int size; - int cpu; syscall_nr = syscall_get_nr(current, regs); if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) @@ -557,38 +529,15 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) "exit event has grown above profile buffer size")) return; - /* Protect the per cpu buffer, begin the rcu read side */ - local_irq_save(flags); - - rctx = perf_swevent_get_recursion_context(); - if (rctx < 0) - goto end_recursion; - - cpu = smp_processor_id(); - - trace_buf = rcu_dereference(perf_trace_buf); - - if (!trace_buf) - goto end; - - raw_data = per_cpu_ptr(trace_buf, cpu); - - /* zero the dead bytes from align to not leak stack to user */ - *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; - - rec = (struct syscall_trace_exit *)raw_data; + rec = (struct syscall_trace_exit *)ftrace_perf_buf_prepare(size, + sys_data->exit_event->id, &rctx, &flags); + if (!rec) + return; - tracing_generic_entry_update(&rec->ent, 0, 0); - rec->ent.type = sys_data->exit_event->id; rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size); - -end: - perf_swevent_put_recursion_context(rctx); -end_recursion: - local_irq_restore(flags); + ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags); } int prof_sysexit_enable(struct ftrace_event_call *call) -- cgit v1.2.3 From 1e12a4a7a3a78bc9c3aaf3486dde3b8ab1cdf465 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 28 Jan 2010 09:34:27 +0800 Subject: tracing/kprobe: Cleanup unused return value of tracing functions The return values of the kprobe's tracing functions are meaningless, lets remove these. Signed-off-by: Xiao Guangrong Acked-by: Masami Hiramatsu Cc: Steven Rostedt Cc: Ingo Molnar Cc: Paul Mackerras Cc: Jason Baron Cc: Peter Zijlstra LKML-Reference: <4B60E9A3.2040505@cn.fujitsu.com> [fweisbec@gmail: whitespace fixes, drop useless void returns in end of functions] Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_kprobe.c | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 2e28ee36646f..6178abf3637e 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -942,7 +942,7 @@ static const struct file_operations kprobe_profile_ops = { }; /* Kprobe handler */ -static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) +static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) { struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); struct kprobe_trace_entry *entry; @@ -962,7 +962,7 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) event = trace_current_buffer_lock_reserve(&buffer, call->id, size, irq_flags, pc); if (!event) - return 0; + return; entry = ring_buffer_event_data(event); entry->nargs = tp->nr_args; @@ -972,11 +972,10 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) if (!filter_current_check_discard(buffer, call, entry, event)) trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); - return 0; } /* Kretprobe handler */ -static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri, +static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, struct pt_regs *regs) { struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); @@ -995,7 +994,7 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri, event = trace_current_buffer_lock_reserve(&buffer, call->id, size, irq_flags, pc); if (!event) - return 0; + return; entry = ring_buffer_event_data(event); entry->nargs = tp->nr_args; @@ -1006,8 +1005,6 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri, if (!filter_current_check_discard(buffer, call, entry, event)) trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); - - return 0; } /* Event entry printers */ @@ -1237,7 +1234,7 @@ static int kretprobe_event_show_format(struct ftrace_event_call *call, #ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ -static __kprobes int kprobe_profile_func(struct kprobe *kp, +static __kprobes void kprobe_profile_func(struct kprobe *kp, struct pt_regs *regs) { struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); @@ -1252,11 +1249,11 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp, size -= sizeof(u32); if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, "profile buffer not large enough")) - return 0; + return; entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags); if (!entry) - return 0; + return; entry->nargs = tp->nr_args; entry->ip = (unsigned long)kp->addr; @@ -1264,12 +1261,10 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp, entry->args[i] = call_fetch(&tp->args[i].fetch, regs); ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags); - - return 0; } /* Kretprobe profile handler */ -static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, +static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri, struct pt_regs *regs) { struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); @@ -1284,11 +1279,11 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, size -= sizeof(u32); if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, "profile buffer not large enough")) - return 0; + return; entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags); if (!entry) - return 0; + return; entry->nargs = tp->nr_args; entry->func = (unsigned long)tp->rp.kp.addr; @@ -1297,8 +1292,6 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, entry->args[i] = call_fetch(&tp->args[i].fetch, regs); ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags); - - return 0; } static int probe_profile_enable(struct ftrace_event_call *call) -- cgit v1.2.3 From 75c9f3284a7ff957829f44baace82406a6354ceb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 Jan 2010 09:04:26 +0100 Subject: perf_events: Fix sample_period transfer on inherit One problem with frequency driven counters is that we cannot predict the rate at which they trigger, therefore we have to start them at period=1, this causes a ramp up effect. However, if we fail to propagate the stable state on fork each new child will have to ramp up again. This can lead to significant artifacts in sample data. Signed-off-by: Peter Zijlstra Cc: eranian@google.com Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <1264752266.4283.2121.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 251fb9552492..53dc2a362111 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5002,8 +5002,15 @@ inherit_event(struct perf_event *parent_event, else child_event->state = PERF_EVENT_STATE_OFF; - if (parent_event->attr.freq) - child_event->hw.sample_period = parent_event->hw.sample_period; + if (parent_event->attr.freq) { + u64 sample_period = parent_event->hw.sample_period; + struct hw_perf_event *hwc = &child_event->hw; + + hwc->sample_period = sample_period; + hwc->last_period = sample_period; + + atomic64_set(&hwc->period_left, sample_period); + } child_event->overflow_handler = parent_event->overflow_handler; -- cgit v1.2.3 From d6ad3e286d2c075a60b9f11075a2c55aeeeca2ad Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Wed, 27 Jan 2010 16:25:22 -0600 Subject: softlockup: Add sched_clock_tick() to avoid kernel warning on kgdb resume When CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is set, sched_clock() gets the time from hardware such as the TSC on x86. In this configuration kgdb will report a softlock warning message on resuming or detaching from a debug session. Sequence of events in the problem case: 1) "cpu sched clock" and "hardware time" are at 100 sec prior to a call to kgdb_handle_exception() 2) Debugger waits in kgdb_handle_exception() for 80 sec and on exit the following is called ... touch_softlockup_watchdog() --> __raw_get_cpu_var(touch_timestamp) = 0; 3) "cpu sched clock" = 100s (it was not updated, because the interrupt was disabled in kgdb) but the "hardware time" = 180 sec 4) The first timer interrupt after resuming from kgdb_handle_exception updates the watchdog from the "cpu sched clock" update_process_times() { ... run_local_timers() --> softlockup_tick() --> check (touch_timestamp == 0) (it is "YES" here, we have set "touch_timestamp = 0" at kgdb) --> __touch_softlockup_watchdog() ***(A)--> reset "touch_timestamp" to "get_timestamp()" (Here, the "touch_timestamp" will still be set to 100s.) ... scheduler_tick() ***(B)--> sched_clock_tick() (update "cpu sched clock" to "hardware time" = 180s) ... } 5) The Second timer interrupt handler appears to have a large jump and trips the softlockup warning. update_process_times() { ... run_local_timers() --> softlockup_tick() --> "cpu sched clock" - "touch_timestamp" = 180s-100s > 60s --> printk "soft lockup error messages" ... } note: ***(A) reset "touch_timestamp" to "get_timestamp(this_cpu)" Why is "touch_timestamp" 100 sec, instead of 180 sec? When CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is set, the call trace of get_timestamp() is: get_timestamp(this_cpu) -->cpu_clock(this_cpu) -->sched_clock_cpu(this_cpu) -->__update_sched_clock(sched_clock_data, now) The __update_sched_clock() function uses the GTOD tick value to create a window to normalize the "now" values. So if "now" value is too big for sched_clock_data, it will be ignored. The fix is to invoke sched_clock_tick() to update "cpu sched clock" in order to recover from this state. This is done by introducing the function touch_softlockup_watchdog_sync(). This allows kgdb to request that the sched clock is updated when the watchdog thread runs the first time after a resume from kgdb. [yong.zhang0@gmail.com: Use per cpu instead of an array] Signed-off-by: Jason Wessel Signed-off-by: Dongdong Deng Cc: kgdb-bugreport@lists.sourceforge.net Cc: peterz@infradead.org LKML-Reference: <1264631124-4837-2-git-send-email-jason.wessel@windriver.com> Signed-off-by: Ingo Molnar --- kernel/kgdb.c | 6 +++--- kernel/softlockup.c | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 2eb517e23514..87f2cc557553 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -596,7 +596,7 @@ static void kgdb_wait(struct pt_regs *regs) /* Signal the primary CPU that we are done: */ atomic_set(&cpu_in_kgdb[cpu], 0); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sync(); clocksource_touch_watchdog(); local_irq_restore(flags); } @@ -1450,7 +1450,7 @@ acquirelock: (kgdb_info[cpu].task && kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { atomic_set(&kgdb_active, -1); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sync(); clocksource_touch_watchdog(); local_irq_restore(flags); @@ -1550,7 +1550,7 @@ kgdb_restore: } /* Free kgdb_active */ atomic_set(&kgdb_active, -1); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sync(); clocksource_touch_watchdog(); local_irq_restore(flags); diff --git a/kernel/softlockup.c b/kernel/softlockup.c index d22579087e27..0d4c7898ab80 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -25,6 +25,7 @@ static DEFINE_SPINLOCK(print_lock); static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */ static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */ static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); +static DEFINE_PER_CPU(bool, softlock_touch_sync); static int __read_mostly did_panic; int __read_mostly softlockup_thresh = 60; @@ -79,6 +80,12 @@ void touch_softlockup_watchdog(void) } EXPORT_SYMBOL(touch_softlockup_watchdog); +void touch_softlockup_watchdog_sync(void) +{ + __raw_get_cpu_var(softlock_touch_sync) = true; + __raw_get_cpu_var(softlockup_touch_ts) = 0; +} + void touch_all_softlockup_watchdogs(void) { int cpu; @@ -118,6 +125,14 @@ void softlockup_tick(void) } if (touch_ts == 0) { + if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) { + /* + * If the time stamp was touched atomically + * make sure the scheduler tick is up to date. + */ + per_cpu(softlock_touch_sync, this_cpu) = false; + sched_clock_tick(); + } __touch_softlockup_watchdog(); return; } -- cgit v1.2.3 From 4f48f8b7fd18c44f8478174f9925cc3c059c6ce4 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 2 Feb 2010 15:32:09 +0800 Subject: tracing: Fix circular dead lock in stack trace When we cat /tracing/stack_trace, we may cause circular lock: sys_read() t_start() arch_spin_lock(&max_stack_lock); t_show() seq_printf(), vsnprintf() .... /* they are all trace-able, when they are traced, max_stack_lock may be required again. */ The following script can trigger this circular dead lock very easy: #!/bin/bash echo 1 > /proc/sys/kernel/stack_tracer_enabled mount -t debugfs xxx /mnt > /dev/null 2>&1 ( # make check_stack() zealous to require max_stack_lock for ((; ;)) { echo 1 > /mnt/tracing/stack_max_size } ) & for ((; ;)) { cat /mnt/tracing/stack_trace > /dev/null } To fix this bug, we increase the percpu trace_active before require the lock. Reported-by: Li Zefan Signed-off-by: Lai Jiangshan LKML-Reference: <4B67D4F9.9080905@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 678a5120ee30..f4bc9b27de5f 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -157,6 +157,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, unsigned long val, flags; char buf[64]; int ret; + int cpu; if (count >= sizeof(buf)) return -EINVAL; @@ -171,9 +172,20 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, return ret; local_irq_save(flags); + + /* + * In case we trace inside arch_spin_lock() or after (NMI), + * we will cause circular lock, so we also need to increase + * the percpu trace_active here. + */ + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)++; + arch_spin_lock(&max_stack_lock); *ptr = val; arch_spin_unlock(&max_stack_lock); + + per_cpu(trace_active, cpu)--; local_irq_restore(flags); return count; @@ -206,7 +218,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { + int cpu; + local_irq_disable(); + + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)++; + arch_spin_lock(&max_stack_lock); if (*pos == 0) @@ -217,7 +235,13 @@ static void *t_start(struct seq_file *m, loff_t *pos) static void t_stop(struct seq_file *m, void *p) { + int cpu; + arch_spin_unlock(&max_stack_lock); + + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)--; + local_irq_enable(); } -- cgit v1.2.3 From 5ecb01cfdf96c5f465192bdb2a4fd4a61a24c6cc Mon Sep 17 00:00:00 2001 From: Mikael Pettersson Date: Sat, 23 Jan 2010 22:36:29 +0100 Subject: futex_lock_pi() key refcnt fix This fixes a futex key reference count bug in futex_lock_pi(), where a key's reference count is incremented twice but decremented only once, causing the backing object to not be released. If the futex is created in a temporary file in an ext3 file system, this bug causes the file's inode to become an "undead" orphan, which causes an oops from a BUG_ON() in ext3_put_super() when the file system is unmounted. glibc's test suite is known to trigger this, see . The bug is a regression from 2.6.28-git3, namely Peter Zijlstra's 38d47c1b7075bd7ec3881141bb3629da58f88dab "[PATCH] futex: rely on get_user_pages() for shared futexes". That commit made get_futex_key() also increment the reference count of the futex key, and updated its callers to decrement the key's reference count before returning. Unfortunately the normal exit path in futex_lock_pi() wasn't corrected: the reference count is incremented by get_futex_key() and queue_lock(), but the normal exit path only decrements once, via unqueue_me_pi(). The fix is to put_futex_key() after unqueue_me_pi(), since 2.6.31 this is easily done by 'goto out_put_key' rather than 'goto out'. Signed-off-by: Mikael Pettersson Acked-by: Peter Zijlstra Acked-by: Darren Hart Signed-off-by: Thomas Gleixner Cc: --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index d9b3a2228f9d..17828033a639 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1971,7 +1971,7 @@ retry_private: /* Unqueue and drop the lock */ unqueue_me_pi(&q); - goto out; + goto out_put_key; out_unlock_put_key: queue_unlock(&q, hb); -- cgit v1.2.3 From 51246bfd189064079c54421507236fd2723b18f3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 2 Feb 2010 11:40:27 +0100 Subject: futex: Handle user space corruption gracefully If the owner of a PI futex dies we fix up the pi_state and set pi_state->owner to NULL. When a malicious or just sloppy programmed user space application sets the futex value to 0 e.g. by calling pthread_mutex_init(), then the futex can be acquired again. A new waiter manages to enqueue itself on the pi_state w/o damage, but on unlock the kernel dereferences pi_state->owner and oopses. Prevent this by checking pi_state->owner in the unlock path. If pi_state->owner is not current we know that user space manipulated the futex value. Ignore the mess and return -EINVAL. This catches the above case and also the case where a task hijacks the futex by setting the tid value and then tries to unlock it. Reported-by: Jermome Marchand Signed-off-by: Thomas Gleixner Acked-by: Darren Hart Acked-by: Peter Zijlstra Cc: --- kernel/futex.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 17828033a639..06e8240d2abe 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -758,6 +758,13 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) if (!pi_state) return -EINVAL; + /* + * If current does not own the pi_state then the futex is + * inconsistent and user space fiddled with the futex value. + */ + if (pi_state->owner != current) + return -EINVAL; + raw_spin_lock(&pi_state->pi_mutex.wait_lock); new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); -- cgit v1.2.3 From 59647b6ac3050dd964bc556fe6ef22f4db5b935c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 3 Feb 2010 09:33:05 +0100 Subject: futex: Handle futex value corruption gracefully The WARN_ON in lookup_pi_state which complains about a mismatch between pi_state->owner->pid and the pid which we retrieved from the user space futex is completely bogus. The code just emits the warning and then continues despite the fact that it detected an inconsistent state of the futex. A conveniant way for user space to spam the syslog. Replace the WARN_ON by a consistency check. If the values do not match return -EINVAL and let user space deal with the mess it created. This also fixes the missing task_pid_vnr() when we compare the pi_state->owner pid with the futex value. Reported-by: Jermome Marchand Signed-off-by: Thomas Gleixner Acked-by: Darren Hart Acked-by: Peter Zijlstra Cc: --- kernel/futex.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 06e8240d2abe..e7a35f1039e7 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -530,8 +530,25 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, return -EINVAL; WARN_ON(!atomic_read(&pi_state->refcount)); - WARN_ON(pid && pi_state->owner && - pi_state->owner->pid != pid); + + /* + * When pi_state->owner is NULL then the owner died + * and another waiter is on the fly. pi_state->owner + * is fixed up by the task which acquires + * pi_state->rt_mutex. + * + * We do not check for pid == 0 which can happen when + * the owner died and robust_list_exit() cleared the + * TID. + */ + if (pid && pi_state->owner) { + /* + * Bail out if user space manipulated the + * futex value. + */ + if (pid != task_pid_vnr(pi_state->owner)) + return -EINVAL; + } atomic_inc(&pi_state->refcount); *ps = pi_state; -- cgit v1.2.3 From b9c3032277f756e73f6c673419dc414155e04e46 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Feb 2010 18:08:52 +0100 Subject: hrtimer, softirq: Fix hrtimer->softirq trampoline hrtimers callbacks are always done from hardirq context, either the jiffy tick interrupt or the hrtimer device interrupt. [ there is currently one exception that can still call a hrtimer callback from softirq, but even in that case this will still work correctly. ] Reported-by: Wei Yongjun Signed-off-by: Peter Zijlstra Cc: Yury Polyanskiy Tested-by: Wei Yongjun Acked-by: David S. Miller LKML-Reference: <1265120401.24455.306.camel@laptop> Signed-off-by: Thomas Gleixner --- kernel/softirq.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index a09502e2ef75..7c1a67ef0274 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -500,22 +500,17 @@ EXPORT_SYMBOL(tasklet_kill); */ /* - * The trampoline is called when the hrtimer expires. If this is - * called from the hrtimer interrupt then we schedule the tasklet as - * the timer callback function expects to run in softirq context. If - * it's called in softirq context anyway (i.e. high resolution timers - * disabled) then the hrtimer callback is called right away. + * The trampoline is called when the hrtimer expires. It schedules a tasklet + * to run __tasklet_hrtimer_trampoline() which in turn will call the intended + * hrtimer callback, but from softirq context. */ static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) { struct tasklet_hrtimer *ttimer = container_of(timer, struct tasklet_hrtimer, timer); - if (hrtimer_is_hres_active(timer)) { - tasklet_hi_schedule(&ttimer->tasklet); - return HRTIMER_NORESTART; - } - return ttimer->function(timer); + tasklet_hi_schedule(&ttimer->tasklet); + return HRTIMER_NORESTART; } /* -- cgit v1.2.3 From 615d0ebbc782b67296e3226c293f520f93f93515 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 2 Feb 2010 16:49:04 -0500 Subject: kprobes: Disable booster when CONFIG_PREEMPT=y Disable kprobe booster when CONFIG_PREEMPT=y at this time, because it can't ensure that all kernel threads preempted on kprobe's boosted slot run out from the slot even using freeze_processes(). The booster on preemptive kernel will be resumed if synchronize_tasks() or something like that is introduced. Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Ananth N Mavinakayanahalli Cc: Frederic Weisbecker Cc: Jim Keniston Cc: Mathieu Desnoyers Cc: Steven Rostedt LKML-Reference: <20100202214904.4694.24330.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b7df302a0204..9907a03c29f6 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -124,30 +124,6 @@ static LIST_HEAD(kprobe_insn_pages); static int kprobe_garbage_slots; static int collect_garbage_slots(void); -static int __kprobes check_safety(void) -{ - int ret = 0; -#if defined(CONFIG_PREEMPT) && defined(CONFIG_FREEZER) - ret = freeze_processes(); - if (ret == 0) { - struct task_struct *p, *q; - do_each_thread(p, q) { - if (p != current && p->state == TASK_RUNNING && - p->pid != 0) { - printk("Check failed: %s is running\n",p->comm); - ret = -1; - goto loop_end; - } - } while_each_thread(p, q); - } -loop_end: - thaw_processes(); -#else - synchronize_sched(); -#endif - return ret; -} - /** * __get_insn_slot() - Find a slot on an executable page for an instruction. * We allocate an executable page if there's no room on existing ones. @@ -235,9 +211,8 @@ static int __kprobes collect_garbage_slots(void) { struct kprobe_insn_page *kip, *next; - /* Ensure no-one is preepmted on the garbages */ - if (check_safety()) - return -EAGAIN; + /* Ensure no-one is interrupted on the garbages */ + synchronize_sched(); list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) { int i; -- cgit v1.2.3 From 2cfa19780d61740f65790c5bae363b759d7c96fa Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 2 Feb 2010 16:49:11 -0500 Subject: ftrace/alternatives: Introducing *_text_reserved functions Introducing *_text_reserved functions for checking the text address range is partially reserved or not. This patch provides checking routines for x86 smp alternatives and dynamic ftrace. Since both functions modify fixed pieces of kernel text, they should reserve and protect those from other dynamic text modifier, like kprobes. This will also be extended when introducing other subsystems which modify fixed pieces of kernel text. Dynamic text modifiers should avoid those. Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Steven Rostedt Cc: przemyslaw@pawelczyk.it Cc: Frederic Weisbecker Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Mathieu Desnoyers Cc: Jason Baron LKML-Reference: <20100202214911.4694.16587.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1e6640f80454..3d90661a5f40 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1025,6 +1025,21 @@ static void ftrace_bug(int failed, unsigned long ip) } +/* Return 1 if the address range is reserved for ftrace */ +int ftrace_text_reserved(void *start, void *end) +{ + struct dyn_ftrace *rec; + struct ftrace_page *pg; + + do_for_each_ftrace_rec(pg, rec) { + if (rec->ip <= (unsigned long)end && + rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start) + return 1; + } while_for_each_ftrace_rec(); + return 0; +} + + static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable) { -- cgit v1.2.3 From 4554dbcb85a4ed2abaa2b6fa15649b796699ec89 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 2 Feb 2010 16:49:18 -0500 Subject: kprobes: Check probe address is reserved Check whether the address of new probe is already reserved by ftrace or alternatives (on x86) when registering new probe. If reserved, it returns an error and not register the probe. Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Steven Rostedt Cc: przemyslaw@pawelczyk.it Cc: Frederic Weisbecker Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Mathieu Desnoyers Cc: Jason Baron LKML-Reference: <20100202214918.4694.94179.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9907a03c29f6..c3340e836c37 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -703,7 +704,8 @@ int __kprobes register_kprobe(struct kprobe *p) preempt_disable(); if (!kernel_text_address((unsigned long) p->addr) || - in_kprobes_functions((unsigned long) p->addr)) { + in_kprobes_functions((unsigned long) p->addr) || + ftrace_text_reserved(p->addr, p->addr)) { preempt_enable(); return -EINVAL; } -- cgit v1.2.3 From f24bb999d2b9f2950e5cac5b69bffedf73c24ea4 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 2 Feb 2010 16:49:25 -0500 Subject: ftrace: Remove record freezing Remove record freezing. Because kprobes never puts probe on ftrace's mcount call anymore, it doesn't need ftrace to check whether kprobes on it. Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Steven Rostedt Cc: przemyslaw@pawelczyk.it Cc: Frederic Weisbecker LKML-Reference: <20100202214925.4694.73469.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 39 --------------------------------------- 1 file changed, 39 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3d90661a5f40..1904797f4a8a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -898,36 +897,6 @@ static struct dyn_ftrace *ftrace_free_records; } \ } -#ifdef CONFIG_KPROBES - -static int frozen_record_count; - -static inline void freeze_record(struct dyn_ftrace *rec) -{ - if (!(rec->flags & FTRACE_FL_FROZEN)) { - rec->flags |= FTRACE_FL_FROZEN; - frozen_record_count++; - } -} - -static inline void unfreeze_record(struct dyn_ftrace *rec) -{ - if (rec->flags & FTRACE_FL_FROZEN) { - rec->flags &= ~FTRACE_FL_FROZEN; - frozen_record_count--; - } -} - -static inline int record_frozen(struct dyn_ftrace *rec) -{ - return rec->flags & FTRACE_FL_FROZEN; -} -#else -# define freeze_record(rec) ({ 0; }) -# define unfreeze_record(rec) ({ 0; }) -# define record_frozen(rec) ({ 0; }) -#endif /* CONFIG_KPROBES */ - static void ftrace_free_rec(struct dyn_ftrace *rec) { rec->freelist = ftrace_free_records; @@ -1091,14 +1060,6 @@ static void ftrace_replace_code(int enable) !(rec->flags & FTRACE_FL_CONVERTED)) continue; - /* ignore updates to this record's mcount site */ - if (get_kprobe((void *)rec->ip)) { - freeze_record(rec); - continue; - } else { - unfreeze_record(rec); - } - failed = __ftrace_replace_code(rec, enable); if (failed) { rec->flags |= FTRACE_FL_FAILED; -- cgit v1.2.3 From 9717e6cd3db22eade7dbae0fc9235c66325a7132 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Jan 2010 13:57:44 +0100 Subject: perf_events: Optimize perf_event_task_tick() Pretty much all of the calls do perf_disable/perf_enable cycles, pull that out to cut back on hardware programming. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 40f8b07c5601..087025fe3ba1 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1573,12 +1573,8 @@ static void rotate_ctx(struct perf_event_context *ctx) raw_spin_lock(&ctx->lock); /* Rotate the first entry last of non-pinned groups */ - perf_disable(); - list_rotate_left(&ctx->flexible_groups); - perf_enable(); - raw_spin_unlock(&ctx->lock); } @@ -1593,6 +1589,8 @@ void perf_event_task_tick(struct task_struct *curr) cpuctx = &__get_cpu_var(perf_cpu_context); ctx = curr->perf_event_ctxp; + perf_disable(); + perf_ctx_adjust_freq(&cpuctx->ctx); if (ctx) perf_ctx_adjust_freq(ctx); @@ -1608,6 +1606,8 @@ void perf_event_task_tick(struct task_struct *curr) cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); if (ctx) task_ctx_sched_in(curr, EVENT_FLEXIBLE); + + perf_enable(); } static int event_enable_on_exec(struct perf_event *event, -- cgit v1.2.3 From 5ecaafdbf44b1ba400b746c60c401d54c7ee0863 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 5 Feb 2010 01:24:34 -0500 Subject: kprobes: Add mcount to the kprobes blacklist Since mcount function can be called from everywhere, it should be blacklisted. Moreover, the "mcount" symbol is a special symbol name. So, it is better to put it in the generic blacklist. Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Ananth N Mavinakayanahalli Cc: Steven Rostedt LKML-Reference: <20100205062433.3745.36726.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index c3340e836c37..ccec774c716d 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -94,6 +94,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = { {"native_get_debugreg",}, {"irq_entries_start",}, {"common_interrupt",}, + {"mcount",}, /* mcount can be called from everywhere */ {NULL} /* Terminator */ }; -- cgit v1.2.3 From c93d89f3dbf0202bf19c07960ca8602b48c2f9a0 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 27 Jan 2010 19:13:40 +0800 Subject: Export the symbol of getboottime and mmonotonic_to_bootbased Export getboottime and monotonic_to_bootbased in order to let them could be used by following patch. Cc: stable@kernel.org Signed-off-by: Jason Wang Signed-off-by: Marcelo Tosatti --- kernel/time/timekeeping.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 7faaa32fbf4f..e2ab064c6d41 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -880,6 +880,7 @@ void getboottime(struct timespec *ts) set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); } +EXPORT_SYMBOL_GPL(getboottime); /** * monotonic_to_bootbased - Convert the monotonic time to boot based. @@ -889,6 +890,7 @@ void monotonic_to_bootbased(struct timespec *ts) { *ts = timespec_add_safe(*ts, total_sleep_time); } +EXPORT_SYMBOL_GPL(monotonic_to_bootbased); unsigned long get_seconds(void) { -- cgit v1.2.3 From a9bb18f36c8056f0712fb28c52c0f85d98438dfb Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 10 Feb 2010 17:23:47 +0100 Subject: tracing/kprobes: Fix probe parsing Trying to add a probe like: echo p:myprobe 0x10000 > /sys/kernel/debug/tracing/kprobe_events will fail since the wrong pointer is passed to strict_strtoul when trying to convert the address to an unsigned long. Signed-off-by: Heiko Carstens Acked-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <20100210162346.GA6933@osiris.boeblingen.de.ibm.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 6ea90c0e2c96..50b1b8239806 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -689,7 +689,7 @@ static int create_trace_probe(int argc, char **argv) return -EINVAL; } /* an address specified */ - ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr); + ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr); if (ret) { pr_info("Failed to parse address.\n"); return ret; -- cgit v1.2.3 From 1a02d59aba9b61b820517fb135086471c065b573 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Wed, 27 Jan 2010 17:09:34 +0300 Subject: kfifo: Make kfifo_initialized work after kfifo_free After kfifo rework it's no longer possible to reliably know if kfifo is usable, since after kfifo_free(), kfifo_initialized() would still return true. The correct behaviour is needed for at least FHCI USB driver. This patch fixes the issue by resetting the kfifo to zero values (the same approach is used in kfifo_alloc() if allocation failed). Signed-off-by: Anton Vorontsov Acked-by: Stefani Seibold Signed-off-by: Greg Kroah-Hartman --- kernel/kfifo.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 498cabba225e..559fb5582b60 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -97,6 +97,7 @@ EXPORT_SYMBOL(kfifo_alloc); void kfifo_free(struct kfifo *fifo) { kfree(fifo->buffer); + _kfifo_init(fifo, NULL, 0); } EXPORT_SYMBOL(kfifo_free); -- cgit v1.2.3 From 5a5e0f4c7038168e38d1db6af09d1ac715ee9888 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Wed, 27 Jan 2010 17:09:38 +0300 Subject: kfifo: Don't use integer as NULL pointer This patch fixes following sparse warnings: include/linux/kfifo.h:127:25: warning: Using plain integer as NULL pointer kernel/kfifo.c:83:21: warning: Using plain integer as NULL pointer Signed-off-by: Anton Vorontsov Acked-by: Stefani Seibold Signed-off-by: Greg Kroah-Hartman --- kernel/kfifo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 559fb5582b60..35edbe22e9a9 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -80,7 +80,7 @@ int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask) buffer = kmalloc(size, gfp_mask); if (!buffer) { - _kfifo_init(fifo, 0, 0); + _kfifo_init(fifo, NULL, 0); return -ENOMEM; } -- cgit v1.2.3 From 701188374b6f1ef9cf7e4dce4a2e69ef4c0012ac Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 22 Feb 2010 12:44:16 -0800 Subject: kernel/sys.c: fix missing rcu protection for sys_getpriority() find_task_by_vpid() is not safe without rcu_read_lock(). 2.6.33-rc7 got RCU protection for sys_setpriority() but missed it for sys_getpriority(). Signed-off-by: Tetsuo Handa Cc: Oleg Nesterov Cc: "Paul E. McKenney" Acked-by: Serge Hallyn Acked-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 26a6b73a6b85..18bde979f346 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -222,6 +222,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) if (which > PRIO_USER || which < PRIO_PROCESS) return -EINVAL; + rcu_read_lock(); read_lock(&tasklist_lock); switch (which) { case PRIO_PROCESS: @@ -267,6 +268,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) } out_unlock: read_unlock(&tasklist_lock); + rcu_read_unlock(); return retval; } -- cgit v1.2.3 From 3a0304e90aa5a2c0c308a05d28f7d109a48d8539 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Feb 2010 10:33:41 +0100 Subject: perf_events: Report the MMAP pgoff value in bytes DaveM reported that currently perf interprets the pgoff value reported by the MMAP events as a byte range, but the kernel reports it as a page offset. Since its broken (and unusable) anyway, change the kernel behaviour (ABI) to report bytes indeed, avoiding the need for userspace to deal with PAGE_SIZE things. Reported-by: David Miller Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 087025fe3ba1..5a69abb05ac3 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -3749,7 +3749,7 @@ void __perf_event_mmap(struct vm_area_struct *vma) /* .tid */ .start = vma->vm_start, .len = vma->vm_end - vma->vm_start, - .pgoff = vma->vm_pgoff, + .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, }, }; -- cgit v1.2.3 From d76a0812ac4139ceb54daab3cc70e1bd8bd9d43a Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 8 Feb 2010 17:06:01 +0200 Subject: perf_events: Add new start/stop PMU callbacks In certain situations, the kernel may need to stop and start the same event rapidly. The current PMU callbacks do not distinguish between stop and release (i.e., stop + free the resource). Thus, a counter may be released, then it will be immediately re-acquired. Event scheduling will again take place with no guarantee to assign the same counter. On some processors, this may event yield to failure to assign the event back due to competion between cores. This patch is adding a new pair of callback to stop and restart a counter without actually release the underlying counter resource. On stop, the counter is stopped, its values saved and that's it. On start, the value is reloaded and counter is restarted (on x86, actual restart is delayed until perf_enable()). Signed-off-by: Stephane Eranian [ added fallback to ->enable/->disable for all other PMUs fixed x86_pmu_start() to call x86_pmu.enable() merged __x86_pmu_disable into x86_pmu_stop() ] Signed-off-by: Peter Zijlstra LKML-Reference: <4b703875.0a04d00a.7896.ffffb824@mx.google.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 5a69abb05ac3..74c60021cdbc 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1493,6 +1493,22 @@ do { \ return div64_u64(dividend, divisor); } +static void perf_event_stop(struct perf_event *event) +{ + if (!event->pmu->stop) + return event->pmu->disable(event); + + return event->pmu->stop(event); +} + +static int perf_event_start(struct perf_event *event) +{ + if (!event->pmu->start) + return event->pmu->enable(event); + + return event->pmu->start(event); +} + static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) { struct hw_perf_event *hwc = &event->hw; @@ -1513,9 +1529,9 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) if (atomic64_read(&hwc->period_left) > 8*sample_period) { perf_disable(); - event->pmu->disable(event); + perf_event_stop(event); atomic64_set(&hwc->period_left, 0); - event->pmu->enable(event); + perf_event_start(event); perf_enable(); } } -- cgit v1.2.3 From 38331f62c20456454eed9ebea2525f072c6f1d2e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 8 Feb 2010 17:17:01 +0200 Subject: perf_events, x86: AMD event scheduling This patch adds correct AMD NorthBridge event scheduling. NB events are events measuring L3 cache, Hypertransport traffic. They are identified by an event code >= 0xe0. They measure events on the Northbride which is shared by all cores on a package. NB events are counted on a shared set of counters. When a NB event is programmed in a counter, the data actually comes from a shared counter. Thus, access to those counters needs to be synchronized. We implement the synchronization such that no two cores can be measuring NB events using the same counters. Thus, we maintain a per-NB allocation table. The available slot is propagated using the event_constraint structure. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4b703957.0702d00a.6bf2.7b7d@mx.google.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 74c60021cdbc..fb4e56eb58f4 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -98,6 +98,7 @@ void __weak hw_perf_enable(void) { barrier(); } void __weak hw_perf_event_setup(int cpu) { barrier(); } void __weak hw_perf_event_setup_online(int cpu) { barrier(); } +void __weak hw_perf_event_setup_offline(int cpu) { barrier(); } int __weak hw_perf_group_sched_in(struct perf_event *group_leader, @@ -5462,6 +5463,10 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) perf_event_exit_cpu(cpu); break; + case CPU_DEAD: + hw_perf_event_setup_offline(cpu); + break; + default: break; } -- cgit v1.2.3 From 6e37738a2fac964583debe91099bc3248554f6e5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Feb 2010 13:21:58 +0100 Subject: perf_events: Simplify code by removing cpu argument to hw_perf_group_sched_in() Since the cpu argument to hw_perf_group_sched_in() is always smp_processor_id(), simplify the code a little by removing this argument and using the current cpu where needed. Signed-off-by: Peter Zijlstra Cc: David Miller Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Frederic Weisbecker LKML-Reference: <1265890918.5396.3.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 45 ++++++++++++++++++--------------------------- 1 file changed, 18 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index fb4e56eb58f4..05b6c6b825e3 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -103,7 +103,7 @@ void __weak hw_perf_event_setup_offline(int cpu) { barrier(); } int __weak hw_perf_group_sched_in(struct perf_event *group_leader, struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx, int cpu) + struct perf_event_context *ctx) { return 0; } @@ -633,14 +633,13 @@ void perf_event_disable(struct perf_event *event) static int event_sched_in(struct perf_event *event, struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx, - int cpu) + struct perf_event_context *ctx) { if (event->state <= PERF_EVENT_STATE_OFF) return 0; event->state = PERF_EVENT_STATE_ACTIVE; - event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ + event->oncpu = smp_processor_id(); /* * The new state must be visible before we turn it on in the hardware: */ @@ -667,8 +666,7 @@ event_sched_in(struct perf_event *event, static int group_sched_in(struct perf_event *group_event, struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx, - int cpu) + struct perf_event_context *ctx) { struct perf_event *event, *partial_group; int ret; @@ -676,18 +674,18 @@ group_sched_in(struct perf_event *group_event, if (group_event->state == PERF_EVENT_STATE_OFF) return 0; - ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu); + ret = hw_perf_group_sched_in(group_event, cpuctx, ctx); if (ret) return ret < 0 ? ret : 0; - if (event_sched_in(group_event, cpuctx, ctx, cpu)) + if (event_sched_in(group_event, cpuctx, ctx)) return -EAGAIN; /* * Schedule in siblings as one group (if any): */ list_for_each_entry(event, &group_event->sibling_list, group_entry) { - if (event_sched_in(event, cpuctx, ctx, cpu)) { + if (event_sched_in(event, cpuctx, ctx)) { partial_group = event; goto group_error; } @@ -761,7 +759,6 @@ static void __perf_install_in_context(void *info) struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; struct perf_event *leader = event->group_leader; - int cpu = smp_processor_id(); int err; /* @@ -808,7 +805,7 @@ static void __perf_install_in_context(void *info) if (!group_can_go_on(event, cpuctx, 1)) err = -EEXIST; else - err = event_sched_in(event, cpuctx, ctx, cpu); + err = event_sched_in(event, cpuctx, ctx); if (err) { /* @@ -950,11 +947,9 @@ static void __perf_event_enable(void *info) } else { perf_disable(); if (event == leader) - err = group_sched_in(event, cpuctx, ctx, - smp_processor_id()); + err = group_sched_in(event, cpuctx, ctx); else - err = event_sched_in(event, cpuctx, ctx, - smp_processor_id()); + err = event_sched_in(event, cpuctx, ctx); perf_enable(); } @@ -1281,19 +1276,18 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, static void ctx_pinned_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - int cpu) + struct perf_cpu_context *cpuctx) { struct perf_event *event; list_for_each_entry(event, &ctx->pinned_groups, group_entry) { if (event->state <= PERF_EVENT_STATE_OFF) continue; - if (event->cpu != -1 && event->cpu != cpu) + if (event->cpu != -1 && event->cpu != smp_processor_id()) continue; if (group_can_go_on(event, cpuctx, 1)) - group_sched_in(event, cpuctx, ctx, cpu); + group_sched_in(event, cpuctx, ctx); /* * If this pinned group hasn't been scheduled, @@ -1308,8 +1302,7 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, static void ctx_flexible_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - int cpu) + struct perf_cpu_context *cpuctx) { struct perf_event *event; int can_add_hw = 1; @@ -1322,11 +1315,11 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, * Listen to the 'cpu' scheduling filter constraint * of events: */ - if (event->cpu != -1 && event->cpu != cpu) + if (event->cpu != -1 && event->cpu != smp_processor_id()) continue; if (group_can_go_on(event, cpuctx, can_add_hw)) - if (group_sched_in(event, cpuctx, ctx, cpu)) + if (group_sched_in(event, cpuctx, ctx)) can_add_hw = 0; } } @@ -1336,8 +1329,6 @@ ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type) { - int cpu = smp_processor_id(); - raw_spin_lock(&ctx->lock); ctx->is_active = 1; if (likely(!ctx->nr_events)) @@ -1352,11 +1343,11 @@ ctx_sched_in(struct perf_event_context *ctx, * in order to give them the best chance of going on. */ if (event_type & EVENT_PINNED) - ctx_pinned_sched_in(ctx, cpuctx, cpu); + ctx_pinned_sched_in(ctx, cpuctx); /* Then walk through the lower prio flexible groups */ if (event_type & EVENT_FLEXIBLE) - ctx_flexible_sched_in(ctx, cpuctx, cpu); + ctx_flexible_sched_in(ctx, cpuctx); perf_enable(); out: -- cgit v1.2.3 From 24691ea964cc0123e386b661e03a86a481c6ee79 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Feb 2010 16:36:23 +0100 Subject: perf_event: Fix preempt warning in perf_clock() A recent commit introduced a preemption warning for perf_clock(), use raw_smp_processor_id() to avoid this, it really doesn't matter which cpu we use here. Signed-off-by: Peter Zijlstra LKML-Reference: <1267198583.22519.684.camel@laptop> Cc: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 05b6c6b825e3..aa6155b5e24c 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -249,7 +249,7 @@ static void perf_unpin_context(struct perf_event_context *ctx) static inline u64 perf_clock(void) { - return cpu_clock(smp_processor_id()); + return cpu_clock(raw_smp_processor_id()); } /* -- cgit v1.2.3 From 44ee63587dce85593c22497140db16f4e5027860 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 17 Feb 2010 10:50:50 +0900 Subject: percpu: Add __percpu sparse annotations to hw_breakpoint Add __percpu sparse annotations to hw_breakpoint. These annotations are to make sparse consider percpu variables to be in a different address space and warn if accessed without going through percpu accessors. This patch doesn't affect normal builds. In kernel/hw_breakpoint.c, per_cpu(nr_task_bp_pinned, cpu)'s will trigger spurious noderef related warnings from sparse. Changing it to &per_cpu(nr_task_bp_pinned[0], cpu) will work around the problem but deemed to ugly by the maintainer. Leave it alone until better solution can be found. Signed-off-by: Tejun Heo Cc: Stephen Rothwell Cc: K.Prasad LKML-Reference: <4B7B4B7A.9050902@kernel.org> Signed-off-by: Frederic Weisbecker --- kernel/hw_breakpoint.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 967e66143e11..6542eacb3fa5 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -413,17 +413,17 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); * * @return a set of per_cpu pointers to perf events */ -struct perf_event ** +struct perf_event * __percpu * register_wide_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered) { - struct perf_event **cpu_events, **pevent, *bp; + struct perf_event * __percpu *cpu_events, **pevent, *bp; long err; int cpu; cpu_events = alloc_percpu(typeof(*cpu_events)); if (!cpu_events) - return ERR_PTR(-ENOMEM); + return (void __percpu __force *)ERR_PTR(-ENOMEM); get_online_cpus(); for_each_online_cpu(cpu) { @@ -451,7 +451,7 @@ fail: put_online_cpus(); free_percpu(cpu_events); - return ERR_PTR(err); + return (void __percpu __force *)ERR_PTR(err); } EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); @@ -459,7 +459,7 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel * @cpu_events: the per cpu set of events to unregister */ -void unregister_wide_hw_breakpoint(struct perf_event **cpu_events) +void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) { int cpu; struct perf_event **pevent; -- cgit v1.2.3 From 320ebf09cbb6d01954c9a060266aa8e0d27f4638 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Mar 2010 12:35:37 +0100 Subject: perf, x86: Restrict the ANY flag The ANY flag can show SMT data of another task (like 'top'), so we want to disable it when system-wide profiling is disabled. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index a661e7991865..482d5e1d3764 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -56,21 +56,6 @@ static atomic_t nr_task_events __read_mostly; */ int sysctl_perf_event_paranoid __read_mostly = 1; -static inline bool perf_paranoid_tracepoint_raw(void) -{ - return sysctl_perf_event_paranoid > -1; -} - -static inline bool perf_paranoid_cpu(void) -{ - return sysctl_perf_event_paranoid > 0; -} - -static inline bool perf_paranoid_kernel(void) -{ - return sysctl_perf_event_paranoid > 1; -} - int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ /* -- cgit v1.2.3