diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Makefile | 2 | ||||
-rw-r--r-- | kernel/hw_breakpoint.c | 8 | ||||
-rw-r--r-- | kernel/irq_work.c | 164 | ||||
-rw-r--r-- | kernel/perf_event.c | 274 | ||||
-rw-r--r-- | kernel/timer.c | 7 | ||||
-rw-r--r-- | kernel/tracepoint.c | 6 |
6 files changed, 303 insertions, 158 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index d52b473c99a1..4d9bf5f8531f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -23,6 +23,7 @@ CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg CFLAGS_REMOVE_perf_event.o = -pg +CFLAGS_REMOVE_irq_work.o = -pg endif obj-$(CONFIG_FREEZER) += freezer.o @@ -100,6 +101,7 @@ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o +obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 3b714e839c10..2c9120f0afca 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -113,12 +113,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) */ static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) { - struct perf_event_context *ctx = bp->ctx; + struct task_struct *tsk = bp->hw.bp_target; struct perf_event *iter; int count = 0; list_for_each_entry(iter, &bp_task_head, hw.bp_list) { - if (iter->ctx == ctx && find_slot_idx(iter) == type) + if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type) count += hw_breakpoint_weight(iter); } @@ -134,7 +134,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, enum bp_type_idx type) { int cpu = bp->cpu; - struct task_struct *tsk = bp->ctx->task; + struct task_struct *tsk = bp->hw.bp_target; if (cpu >= 0) { slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); @@ -213,7 +213,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int weight) { int cpu = bp->cpu; - struct task_struct *tsk = bp->ctx->task; + struct task_struct *tsk = bp->hw.bp_target; /* Pinned counter cpu profiling */ if (!tsk) { diff --git a/kernel/irq_work.c b/kernel/irq_work.c new file mode 100644 index 000000000000..f16763ff8481 --- /dev/null +++ b/kernel/irq_work.c @@ -0,0 +1,164 @@ +/* + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * + * Provides a framework for enqueueing and running callbacks from hardirq + * context. The enqueueing is NMI-safe. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/irq_work.h> +#include <linux/hardirq.h> + +/* + * An entry can be in one of four states: + * + * free NULL, 0 -> {claimed} : free to be used + * claimed NULL, 3 -> {pending} : claimed to be enqueued + * pending next, 3 -> {busy} : queued, pending callback + * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed + * + * We use the lower two bits of the next pointer to keep PENDING and BUSY + * flags. + */ + +#define IRQ_WORK_PENDING 1UL +#define IRQ_WORK_BUSY 2UL +#define IRQ_WORK_FLAGS 3UL + +static inline bool irq_work_is_set(struct irq_work *entry, int flags) +{ + return (unsigned long)entry->next & flags; +} + +static inline struct irq_work *irq_work_next(struct irq_work *entry) +{ + unsigned long next = (unsigned long)entry->next; + next &= ~IRQ_WORK_FLAGS; + return (struct irq_work *)next; +} + +static inline struct irq_work *next_flags(struct irq_work *entry, int flags) +{ + unsigned long next = (unsigned long)entry; + next |= flags; + return (struct irq_work *)next; +} + +static DEFINE_PER_CPU(struct irq_work *, irq_work_list); + +/* + * Claim the entry so that no one else will poke at it. + */ +static bool irq_work_claim(struct irq_work *entry) +{ + struct irq_work *next, *nflags; + + do { + next = entry->next; + if ((unsigned long)next & IRQ_WORK_PENDING) + return false; + nflags = next_flags(next, IRQ_WORK_FLAGS); + } while (cmpxchg(&entry->next, next, nflags) != next); + + return true; +} + + +void __weak arch_irq_work_raise(void) +{ + /* + * Lame architectures will get the timer tick callback + */ +} + +/* + * Queue the entry and raise the IPI if needed. + */ +static void __irq_work_queue(struct irq_work *entry) +{ + struct irq_work **head, *next; + + head = &get_cpu_var(irq_work_list); + + do { + next = *head; + /* Can assign non-atomic because we keep the flags set. */ + entry->next = next_flags(next, IRQ_WORK_FLAGS); + } while (cmpxchg(head, next, entry) != next); + + /* The list was empty, raise self-interrupt to start processing. */ + if (!irq_work_next(entry)) + arch_irq_work_raise(); + + put_cpu_var(irq_work_list); +} + +/* + * Enqueue the irq_work @entry, returns true on success, failure when the + * @entry was already enqueued by someone else. + * + * Can be re-enqueued while the callback is still in progress. + */ +bool irq_work_queue(struct irq_work *entry) +{ + if (!irq_work_claim(entry)) { + /* + * Already enqueued, can't do! + */ + return false; + } + + __irq_work_queue(entry); + return true; +} +EXPORT_SYMBOL_GPL(irq_work_queue); + +/* + * Run the irq_work entries on this cpu. Requires to be ran from hardirq + * context with local IRQs disabled. + */ +void irq_work_run(void) +{ + struct irq_work *list, **head; + + head = &__get_cpu_var(irq_work_list); + if (*head == NULL) + return; + + BUG_ON(!in_irq()); + BUG_ON(!irqs_disabled()); + + list = xchg(head, NULL); + while (list != NULL) { + struct irq_work *entry = list; + + list = irq_work_next(list); + + /* + * Clear the PENDING bit, after this point the @entry + * can be re-used. + */ + entry->next = next_flags(NULL, IRQ_WORK_BUSY); + entry->func(entry); + /* + * Clear the BUSY bit and return to the free state if + * no-one else claimed it meanwhile. + */ + cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL); + } +} +EXPORT_SYMBOL_GPL(irq_work_run); + +/* + * Synchronize against the irq_work @entry, ensures the entry is not + * currently in use. + */ +void irq_work_sync(struct irq_work *entry) +{ + WARN_ON_ONCE(irqs_disabled()); + + while (irq_work_is_set(entry, IRQ_WORK_BUSY)) + cpu_relax(); +} +EXPORT_SYMBOL_GPL(irq_work_sync); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 1ec3916ffef0..05ecf6f7c672 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -34,7 +34,7 @@ #include <asm/irq_regs.h> -static atomic_t nr_events __read_mostly; +atomic_t perf_task_events __read_mostly; static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; static atomic_t nr_task_events __read_mostly; @@ -315,7 +315,12 @@ static void perf_group_attach(struct perf_event *event) { struct perf_event *group_leader = event->group_leader; - WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP); + /* + * We can have double attach due to group movement in perf_event_open. + */ + if (event->attach_state & PERF_ATTACH_GROUP) + return; + event->attach_state |= PERF_ATTACH_GROUP; if (group_leader == event) @@ -412,8 +417,8 @@ event_filter_match(struct perf_event *event) return event->cpu == -1 || event->cpu == smp_processor_id(); } -static void -event_sched_out(struct perf_event *event, +static int +__event_sched_out(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { @@ -432,14 +437,13 @@ event_sched_out(struct perf_event *event, } if (event->state != PERF_EVENT_STATE_ACTIVE) - return; + return 0; event->state = PERF_EVENT_STATE_INACTIVE; if (event->pending_disable) { event->pending_disable = 0; event->state = PERF_EVENT_STATE_OFF; } - event->tstamp_stopped = ctx->time; event->pmu->del(event, 0); event->oncpu = -1; @@ -448,6 +452,19 @@ event_sched_out(struct perf_event *event, ctx->nr_active--; if (event->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; + return 1; +} + +static void +event_sched_out(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + int ret; + + ret = __event_sched_out(event, cpuctx, ctx); + if (ret) + event->tstamp_stopped = ctx->time; } static void @@ -647,7 +664,7 @@ retry: } static int -event_sched_in(struct perf_event *event, +__event_sched_in(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { @@ -667,8 +684,6 @@ event_sched_in(struct perf_event *event, return -EAGAIN; } - event->tstamp_running += ctx->time - event->tstamp_stopped; - if (!is_software_event(event)) cpuctx->active_oncpu++; ctx->nr_active++; @@ -679,6 +694,35 @@ event_sched_in(struct perf_event *event, return 0; } +static inline int +event_sched_in(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + int ret = __event_sched_in(event, cpuctx, ctx); + if (ret) + return ret; + event->tstamp_running += ctx->time - event->tstamp_stopped; + return 0; +} + +static void +group_commit_event_sched_in(struct perf_event *group_event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + struct perf_event *event; + u64 now = ctx->time; + + group_event->tstamp_running += now - group_event->tstamp_stopped; + /* + * Schedule in siblings as one group (if any): + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) { + event->tstamp_running += now - event->tstamp_stopped; + } +} + static int group_sched_in(struct perf_event *group_event, struct perf_cpu_context *cpuctx, @@ -692,7 +736,13 @@ group_sched_in(struct perf_event *group_event, pmu->start_txn(pmu); - if (event_sched_in(group_event, cpuctx, ctx)) { + /* + * use __event_sched_in() to delay updating tstamp_running + * until the transaction is committed. In case of failure + * we will keep an unmodified tstamp_running which is a + * requirement to get correct timing information + */ + if (__event_sched_in(group_event, cpuctx, ctx)) { pmu->cancel_txn(pmu); return -EAGAIN; } @@ -701,26 +751,31 @@ group_sched_in(struct perf_event *group_event, * Schedule in siblings as one group (if any): */ list_for_each_entry(event, &group_event->sibling_list, group_entry) { - if (event_sched_in(event, cpuctx, ctx)) { + if (__event_sched_in(event, cpuctx, ctx)) { partial_group = event; goto group_error; } } - if (!pmu->commit_txn(pmu)) + if (!pmu->commit_txn(pmu)) { + /* commit tstamp_running */ + group_commit_event_sched_in(group_event, cpuctx, ctx); return 0; - + } group_error: /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: + * + * use __event_sched_out() to avoid updating tstamp_stopped + * because the event never actually ran */ list_for_each_entry(event, &group_event->sibling_list, group_entry) { if (event == partial_group) break; - event_sched_out(event, cpuctx, ctx); + __event_sched_out(event, cpuctx, ctx); } - event_sched_out(group_event, cpuctx, ctx); + __event_sched_out(group_event, cpuctx, ctx); pmu->cancel_txn(pmu); @@ -1256,8 +1311,8 @@ void perf_event_context_sched_out(struct task_struct *task, int ctxn, * accessing the event control register. If a NMI hits, then it will * not restart the event. */ -void perf_event_task_sched_out(struct task_struct *task, - struct task_struct *next) +void __perf_event_task_sched_out(struct task_struct *task, + struct task_struct *next) { int ctxn; @@ -1285,14 +1340,6 @@ static void task_ctx_sched_out(struct perf_event_context *ctx, /* * Called with IRQs disabled */ -static void __perf_event_task_sched_out(struct perf_event_context *ctx) -{ - task_ctx_sched_out(ctx, EVENT_ALL); -} - -/* - * Called with IRQs disabled - */ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, enum event_type_t event_type) { @@ -1439,7 +1486,7 @@ void perf_event_context_sched_in(struct perf_event_context *ctx) * accessing the event control register. If a NMI hits, then it will * keep the event running. */ -void perf_event_task_sched_in(struct task_struct *task) +void __perf_event_task_sched_in(struct task_struct *task) { struct perf_event_context *ctx; int ctxn; @@ -1780,7 +1827,13 @@ static u64 perf_event_read(struct perf_event *event) unsigned long flags; raw_spin_lock_irqsave(&ctx->lock, flags); - update_context_time(ctx); + /* + * may read while context is not active + * (e.g., thread is blocked), in that case + * we cannot update context time + */ + if (ctx->is_active) + update_context_time(ctx); update_event_times(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } @@ -2129,11 +2182,9 @@ retry: } } - put_task_struct(task); return ctx; errout: - put_task_struct(task); return ERR_PTR(err); } @@ -2150,15 +2201,15 @@ static void free_event_rcu(struct rcu_head *head) kfree(event); } -static void perf_pending_sync(struct perf_event *event); static void perf_buffer_put(struct perf_buffer *buffer); static void free_event(struct perf_event *event) { - perf_pending_sync(event); + irq_work_sync(&event->pending); if (!event->parent) { - atomic_dec(&nr_events); + if (event->attach_state & PERF_ATTACH_TASK) + jump_label_dec(&perf_task_events); if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) @@ -3106,16 +3157,7 @@ void perf_event_wakeup(struct perf_event *event) } } -/* - * Pending wakeups - * - * Handle the case where we need to wakeup up from NMI (or rq->lock) context. - * - * The NMI bit means we cannot possibly take locks. Therefore, maintain a - * single linked list and use cmpxchg() to add entries lockless. - */ - -static void perf_pending_event(struct perf_pending_entry *entry) +static void perf_pending_event(struct irq_work *entry) { struct perf_event *event = container_of(entry, struct perf_event, pending); @@ -3131,89 +3173,6 @@ static void perf_pending_event(struct perf_pending_entry *entry) } } -#define PENDING_TAIL ((struct perf_pending_entry *)-1UL) - -static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { - PENDING_TAIL, -}; - -static void perf_pending_queue(struct perf_pending_entry *entry, - void (*func)(struct perf_pending_entry *)) -{ - struct perf_pending_entry **head; - - if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) - return; - - entry->func = func; - - head = &get_cpu_var(perf_pending_head); - - do { - entry->next = *head; - } while (cmpxchg(head, entry->next, entry) != entry->next); - - set_perf_event_pending(); - - put_cpu_var(perf_pending_head); -} - -static int __perf_pending_run(void) -{ - struct perf_pending_entry *list; - int nr = 0; - - list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); - while (list != PENDING_TAIL) { - void (*func)(struct perf_pending_entry *); - struct perf_pending_entry *entry = list; - - list = list->next; - - func = entry->func; - entry->next = NULL; - /* - * Ensure we observe the unqueue before we issue the wakeup, - * so that we won't be waiting forever. - * -- see perf_not_pending(). - */ - smp_wmb(); - - func(entry); - nr++; - } - - return nr; -} - -static inline int perf_not_pending(struct perf_event *event) -{ - /* - * If we flush on whatever cpu we run, there is a chance we don't - * need to wait. - */ - get_cpu(); - __perf_pending_run(); - put_cpu(); - - /* - * Ensure we see the proper queue state before going to sleep - * so that we do not miss the wakeup. -- see perf_pending_handle() - */ - smp_rmb(); - return event->pending.next == NULL; -} - -static void perf_pending_sync(struct perf_event *event) -{ - wait_event(event->waitq, perf_not_pending(event)); -} - -void perf_event_do_pending(void) -{ - __perf_pending_run(); -} - /* * We assume there is only KVM supporting the callbacks. * Later on, we might change it to a list if there is @@ -3263,8 +3222,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle) if (handle->nmi) { handle->event->pending_wakeup = 1; - perf_pending_queue(&handle->event->pending, - perf_pending_event); + irq_work_queue(&handle->event->pending); } else perf_event_wakeup(handle->event); } @@ -4300,8 +4258,7 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, event->pending_kill = POLL_HUP; if (nmi) { event->pending_disable = 1; - perf_pending_queue(&event->pending, - perf_pending_event); + irq_work_queue(&event->pending); } else perf_event_disable(event); } @@ -4712,7 +4669,7 @@ static void sw_perf_event_destroy(struct perf_event *event) WARN_ON(event->parent); - atomic_dec(&perf_swevent_enabled[event_id]); + jump_label_dec(&perf_swevent_enabled[event_id]); swevent_hlist_put(event); } @@ -4742,7 +4699,7 @@ static int perf_swevent_init(struct perf_event *event) if (err) return err; - atomic_inc(&perf_swevent_enabled[event_id]); + jump_label_inc(&perf_swevent_enabled[event_id]); event->destroy = sw_perf_event_destroy; } @@ -5291,9 +5248,10 @@ unlock: */ static struct perf_event * perf_event_alloc(struct perf_event_attr *attr, int cpu, - struct perf_event *group_leader, - struct perf_event *parent_event, - perf_overflow_handler_t overflow_handler) + struct task_struct *task, + struct perf_event *group_leader, + struct perf_event *parent_event, + perf_overflow_handler_t overflow_handler) { struct pmu *pmu; struct perf_event *event; @@ -5318,6 +5276,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, INIT_LIST_HEAD(&event->event_entry); INIT_LIST_HEAD(&event->sibling_list); init_waitqueue_head(&event->waitq); + init_irq_work(&event->pending, perf_pending_event); mutex_init(&event->mmap_mutex); @@ -5334,6 +5293,17 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->state = PERF_EVENT_STATE_INACTIVE; + if (task) { + event->attach_state = PERF_ATTACH_TASK; +#ifdef CONFIG_HAVE_HW_BREAKPOINT + /* + * hw_breakpoint is a bit difficult here.. + */ + if (attr->type == PERF_TYPE_BREAKPOINT) + event->hw.bp_target = task; +#endif + } + if (!overflow_handler && parent_event) overflow_handler = parent_event->overflow_handler; @@ -5377,7 +5347,8 @@ done: event->pmu = pmu; if (!event->parent) { - atomic_inc(&nr_events); + if (event->attach_state & PERF_ATTACH_TASK) + jump_label_inc(&perf_task_events); if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) @@ -5586,10 +5557,18 @@ SYSCALL_DEFINE5(perf_event_open, group_leader = NULL; } - event = perf_event_alloc(&attr, cpu, group_leader, NULL, NULL); + if (pid != -1) { + task = find_lively_task_by_vpid(pid); + if (IS_ERR(task)) { + err = PTR_ERR(task); + goto err_group_fd; + } + } + + event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL); if (IS_ERR(event)) { err = PTR_ERR(event); - goto err_fd; + goto err_task; } /* @@ -5621,21 +5600,13 @@ SYSCALL_DEFINE5(perf_event_open, } } - if (pid != -1) { - task = find_lively_task_by_vpid(pid); - if (IS_ERR(task)) { - err = PTR_ERR(task); - goto err_group_fd; - } - } - /* * Get the target context (task or percpu): */ ctx = find_get_context(pmu, task, cpu); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); - goto err_group_fd; + goto err_alloc; } /* @@ -5731,9 +5702,13 @@ SYSCALL_DEFINE5(perf_event_open, err_context: put_ctx(ctx); +err_alloc: + free_event(event); +err_task: + if (task) + put_task_struct(task); err_group_fd: fput_light(group_file, fput_needed); - free_event(event); err_fd: put_unused_fd(event_fd); return err; @@ -5759,7 +5734,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, * Get the target context (task or percpu): */ - event = perf_event_alloc(attr, cpu, NULL, NULL, overflow_handler); + event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler); if (IS_ERR(event)) { err = PTR_ERR(event); goto err; @@ -5868,7 +5843,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * our context. */ child_ctx = child->perf_event_ctxp[ctxn]; - __perf_event_task_sched_out(child_ctx); + task_ctx_sched_out(child_ctx, EVENT_ALL); /* * Take the context lock here so that if find_get_context is @@ -6027,6 +6002,7 @@ inherit_event(struct perf_event *parent_event, child_event = perf_event_alloc(&parent_event->attr, parent_event->cpu, + child, group_leader, parent_event, NULL); if (IS_ERR(child_event)) diff --git a/kernel/timer.c b/kernel/timer.c index 97bf05baade7..68a9ae7679b7 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -37,7 +37,7 @@ #include <linux/delay.h> #include <linux/tick.h> #include <linux/kallsyms.h> -#include <linux/perf_event.h> +#include <linux/irq_work.h> #include <linux/sched.h> #include <linux/slab.h> @@ -1279,7 +1279,10 @@ void update_process_times(int user_tick) run_local_timers(); rcu_check_callbacks(cpu, user_tick); printk_tick(); - perf_event_do_pending(); +#ifdef CONFIG_IRQ_WORK + if (in_irq()) + irq_work_run(); +#endif scheduler_tick(); run_posix_cpu_timers(p); } diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index d6073a50a6ca..e95ee7f31d43 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -265,10 +265,10 @@ static void set_tracepoint(struct tracepoint_entry **entry, */ rcu_assign_pointer(elem->funcs, (*entry)->funcs); if (!elem->state && active) { - enable_jump_label(&elem->state); + jump_label_enable(&elem->state); elem->state = active; } else if (elem->state && !active) { - disable_jump_label(&elem->state); + jump_label_disable(&elem->state); elem->state = active; } } @@ -285,7 +285,7 @@ static void disable_tracepoint(struct tracepoint *elem) elem->unregfunc(); if (elem->state) { - disable_jump_label(&elem->state); + jump_label_disable(&elem->state); elem->state = 0; } rcu_assign_pointer(elem->funcs, NULL); |