From 8a6b173287bb94b3ef8360119020e856afb1c934 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 30 Mar 2014 18:56:22 +0200 Subject: uprobes: Kill UPROBE_SKIP_SSTEP and can_skip_sstep() UPROBE_COPY_INSN, UPROBE_SKIP_SSTEP, and uprobe->flags must die. This patch kills UPROBE_SKIP_SSTEP. I never understood why it was added; not only it doesn't help, it harms. It can only help to avoid arch_uprobe_skip_sstep() if it was already called before and failed. But this is ugly, if we want to know whether we can emulate this instruction or not we should do this analysis in arch_uprobe_analyze_insn(), not when we hit this probe for the first time. And in fact this logic is simply wrong. arch_uprobe_skip_sstep() can fail or not depending on the task/register state, if this insn can be emulated but, say, put_user() fails we need to xol it this time, but this doesn't mean we shouldn't try to emulate it when this or another thread hits this bp next time. And this is the actual reason for this change. We need to emulate the "call" insn, but push(return-address) can obviously fail. Per-arch notes: x86: __skip_sstep() can only emulate "rep;nop". With this change it will be called every time and most probably for no reason. This will be fixed by the next changes. We need to change this suboptimal code anyway. arm: Should not be affected. It has its own "bool simulate" flag checked in arch_uprobe_skip_sstep(). ppc: Looks like, it can emulate almost everything. Does it actually need to record the fact that emulate_step() failed? Hopefully not. But if yes, it can add the ppc- specific flag into arch_uprobe. TODO: rename arch_uprobe_skip_sstep() to arch_uprobe_emulate_insn(), Signed-off-by: Oleg Nesterov Reviewed-by: Masami Hiramatsu Reviewed-by: David A. Long Reviewed-by: Jim Keniston Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 04709b66369d..ea2a7c0728bb 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -60,8 +60,6 @@ static struct percpu_rw_semaphore dup_mmap_sem; /* Have a copy of original instruction */ #define UPROBE_COPY_INSN 0 -/* Can skip singlestep */ -#define UPROBE_SKIP_SSTEP 1 struct uprobe { struct rb_node rb_node; /* node in the rb tree */ @@ -491,12 +489,9 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) uprobe->offset = offset; init_rwsem(&uprobe->register_rwsem); init_rwsem(&uprobe->consumer_rwsem); - /* For now assume that the instruction need not be single-stepped */ - __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); /* add to uprobes_tree, sorted on inode:offset */ cur_uprobe = insert_uprobe(uprobe); - /* a uprobe exists for this inode:offset combination */ if (cur_uprobe) { kfree(uprobe); @@ -1628,20 +1623,6 @@ bool uprobe_deny_signal(void) return true; } -/* - * Avoid singlestepping the original instruction if the original instruction - * is a NOP or can be emulated. - */ -static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) -{ - if (test_bit(UPROBE_SKIP_SSTEP, &uprobe->flags)) { - if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) - return true; - clear_bit(UPROBE_SKIP_SSTEP, &uprobe->flags); - } - return false; -} - static void mmf_recalc_uprobes(struct mm_struct *mm) { struct vm_area_struct *vma; @@ -1868,13 +1849,13 @@ static void handle_swbp(struct pt_regs *regs) handler_chain(uprobe, regs); - if (can_skip_sstep(uprobe, regs)) + if (arch_uprobe_skip_sstep(&uprobe->arch, regs)) goto out; if (!pre_ssout(uprobe, regs, bp_vaddr)) return; - /* can_skip_sstep() succeeded, or restart if can't singlestep */ + /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */ out: put_uprobe(uprobe); } -- cgit v1.2.3 From 014940bad8e46ca7bd0483f760f9cba60088a3d4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 3 Apr 2014 20:20:10 +0200 Subject: uprobes/x86: Send SIGILL if arch_uprobe_post_xol() fails Currently the error from arch_uprobe_post_xol() is silently ignored. This doesn't look good and this can lead to the hard-to-debug problems. 1. Change handle_singlestep() to loudly complain and send SIGILL. Note: this only affects x86, ppc/arm can't fail. 2. Change arch_uprobe_post_xol() to call arch_uprobe_abort_xol() and avoid TF games if it is going to return an error. This can help to to analyze the problem, if nothing else we should not report ->ip = xol_slot in the core-file. Note: this means that handle_riprel_post_xol() can be called twice, but this is fine because it is idempotent. Signed-off-by: Oleg Nesterov Reviewed-by: Masami Hiramatsu Reviewed-by: Jim Keniston --- kernel/events/uprobes.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ea2a7c0728bb..d1edc5e6fd03 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1867,10 +1867,11 @@ out: static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) { struct uprobe *uprobe; + int err = 0; uprobe = utask->active_uprobe; if (utask->state == UTASK_SSTEP_ACK) - arch_uprobe_post_xol(&uprobe->arch, regs); + err = arch_uprobe_post_xol(&uprobe->arch, regs); else if (utask->state == UTASK_SSTEP_TRAPPED) arch_uprobe_abort_xol(&uprobe->arch, regs); else @@ -1884,6 +1885,11 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) spin_lock_irq(¤t->sighand->siglock); recalc_sigpending(); /* see uprobe_deny_signal() */ spin_unlock_irq(¤t->sighand->siglock); + + if (unlikely(err)) { + uprobe_warn(current, "execute the probed insn, sending SIGILL."); + force_sig_info(SIGILL, SEND_SIG_FORCED, current); + } } /* -- cgit v1.2.3 From c464c76eec4be587604ca082e8cded7e6b89f3bf Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 18 Mar 2014 16:56:41 +0800 Subject: perf: Allow building PMU drivers as modules This patch adds support for building PMU driver as module. It exports the functions perf_pmu_{register,unregister}() and adds reference tracking for the PMU driver module. When the PMU driver is built as a module, each active event of the PMU holds a reference to the driver module. Signed-off-by: Yan, Zheng Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1395133004-23205-1-git-send-email-zheng.z.yan@intel.com Cc: eranian@google.com Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f83a71a3e46d..5129b1201050 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -39,6 +39,7 @@ #include #include #include +#include #include "internal.h" @@ -3229,6 +3230,9 @@ static void __free_event(struct perf_event *event) if (event->ctx) put_ctx(event->ctx); + if (event->pmu) + module_put(event->pmu->module); + call_rcu(&event->rcu_head, free_event_rcu); } static void free_event(struct perf_event *event) @@ -6551,6 +6555,7 @@ free_pdc: free_percpu(pmu->pmu_disable_count); goto unlock; } +EXPORT_SYMBOL_GPL(perf_pmu_register); void perf_pmu_unregister(struct pmu *pmu) { @@ -6572,6 +6577,7 @@ void perf_pmu_unregister(struct pmu *pmu) put_device(pmu->dev); free_pmu_context(pmu); } +EXPORT_SYMBOL_GPL(perf_pmu_unregister); struct pmu *perf_init_event(struct perf_event *event) { @@ -6585,6 +6591,10 @@ struct pmu *perf_init_event(struct perf_event *event) pmu = idr_find(&pmu_idr, event->attr.type); rcu_read_unlock(); if (pmu) { + if (!try_module_get(pmu->module)) { + pmu = ERR_PTR(-ENODEV); + goto unlock; + } event->pmu = pmu; ret = pmu->event_init(event); if (ret) @@ -6593,6 +6603,10 @@ struct pmu *perf_init_event(struct perf_event *event) } list_for_each_entry_rcu(pmu, &pmus, entry) { + if (!try_module_get(pmu->module)) { + pmu = ERR_PTR(-ENODEV); + goto unlock; + } event->pmu = pmu; ret = pmu->event_init(event); if (!ret) @@ -6771,6 +6785,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, err_pmu: if (event->destroy) event->destroy(event); + module_put(pmu->module); err_ns: if (event->ns) put_pid_ns(event->ns); -- cgit v1.2.3 From 8588a2bbddc524325d84d1d6996758e9242e4ffc Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 18 Mar 2014 16:56:42 +0800 Subject: hrtimer: Export __hrtimer_start_range_ns() Export __hrtimer_start_range_ns() to allow building perf Intel uncore driver as a module. Signed-off-by: Yan, Zheng Acked-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1395133004-23205-2-git-send-email-zheng.z.yan@intel.com Cc: eranian@google.com Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/hrtimer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index d55092ceee29..53d26829cd4d 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1017,6 +1017,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, return ret; } +EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns); /** * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU -- cgit v1.2.3 From 1f4ee5038f0c1ef95f8e6d47ad6623e006b5bce1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 6 May 2014 09:59:34 +0200 Subject: perf: Ensure consistent inherit state in groups Make sure all events in a group have the same inherit state. It was possible for group leaders to have inherit set while sibling events would not have inherit set. In this case we'd still inherit the siblings, leading to some non-fatal weirdness. Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vince Weaver Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-r32tt8yldvic3jlcghd3g35u@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 09866a330af8..1de0d709f69f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7081,20 +7081,26 @@ SYSCALL_DEFINE5(perf_event_open, } } + if (task && group_leader && + group_leader->attr.inherit != attr.inherit) { + err = -EINVAL; + goto err_task; + } + get_online_cpus(); event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL, NULL); if (IS_ERR(event)) { err = PTR_ERR(event); - goto err_task; + goto err_cpus; } if (flags & PERF_FLAG_PID_CGROUP) { err = perf_cgroup_connect(pid, event, &attr, group_leader); if (err) { __free_event(event); - goto err_task; + goto err_cpus; } } @@ -7256,8 +7262,9 @@ err_context: put_ctx(ctx); err_alloc: free_event(event); -err_task: +err_cpus: put_online_cpus(); +err_task: if (task) put_task_struct(task); err_group_fd: -- cgit v1.2.3 From 15a2d4de0eab533a76bee9e68d7e1063dd25401c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 5 May 2014 11:41:02 +0200 Subject: perf: Always destroy groups on exit Commit 38b435b16c36 ("perf: Fix tear-down of inherited group events") states that we need to destroy groups for inherited events, but it doesn't make any sense to not also destroy groups for normal events. And while it usually makes no difference (the normal events won't leak, and its very likely all the group events will die in quick succession) it does make the code more consistent and closes a potential hole for trouble. Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vince Weaver Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-426egt8zmsm12d2q8k2xz4tt@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1de0d709f69f..819ffc006d67 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7400,7 +7400,7 @@ __perf_event_exit_task(struct perf_event *child_event, struct perf_event_context *child_ctx, struct task_struct *child) { - perf_remove_from_context(child_event, !!child_event->parent); + perf_remove_from_context(child_event, true); /* * It can happen that the parent exits first, and has events -- cgit v1.2.3 From 63342411efd2d9350ad405205da036cd45ed1640 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 5 May 2014 11:49:16 +0200 Subject: perf: Validate locking assumption Document and validate the locking assumption of event_sched_in(). Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vince Weaver Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-sybq1publ9xt5no77cwvi0eo@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 819ffc006d67..0de199729f04 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1678,6 +1678,8 @@ event_sched_in(struct perf_event *event, u64 tstamp = perf_event_time(event); int ret = 0; + lockdep_assert_held(&ctx->lock); + if (event->state <= PERF_EVENT_STATE_OFF) return 0; -- cgit v1.2.3 From 683ede43dd412c6cad0d23578a018409ac9c683e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 5 May 2014 12:11:24 +0200 Subject: perf: Rework free paths Primarily make perf_event_release_kernel() into put_event(), this will allow kernel space to create per-task inherited events, and is safer in general. Also, document the free_event() assumptions. Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vince Weaver Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-rk9pvr6e1d0559lxstltbztc@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 66 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 0de199729f04..83505a35afba 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3251,7 +3251,8 @@ static void __free_event(struct perf_event *event) call_rcu(&event->rcu_head, free_event_rcu); } -static void free_event(struct perf_event *event) + +static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending); @@ -3279,42 +3280,31 @@ static void free_event(struct perf_event *event) if (is_cgroup_event(event)) perf_detach_cgroup(event); - __free_event(event); } -int perf_event_release_kernel(struct perf_event *event) +/* + * Used to free events which have a known refcount of 1, such as in error paths + * where the event isn't exposed yet and inherited events. + */ +static void free_event(struct perf_event *event) { - struct perf_event_context *ctx = event->ctx; - - WARN_ON_ONCE(ctx->parent_ctx); - /* - * There are two ways this annotation is useful: - * - * 1) there is a lock recursion from perf_event_exit_task - * see the comment there. - * - * 2) there is a lock-inversion with mmap_sem through - * perf_event_read_group(), which takes faults while - * holding ctx->mutex, however this is called after - * the last filedesc died, so there is no possibility - * to trigger the AB-BA case. - */ - mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); - perf_remove_from_context(event, true); - mutex_unlock(&ctx->mutex); - - free_event(event); + if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1, + "unexpected event refcount: %ld; ptr=%p\n", + atomic_long_read(&event->refcount), event)) { + /* leak to avoid use-after-free */ + return; + } - return 0; + _free_event(event); } -EXPORT_SYMBOL_GPL(perf_event_release_kernel); /* * Called when the last reference to the file is gone. */ static void put_event(struct perf_event *event) { + struct perf_event_context *ctx = event->ctx; struct task_struct *owner; if (!atomic_long_dec_and_test(&event->refcount)) @@ -3353,9 +3343,33 @@ static void put_event(struct perf_event *event) put_task_struct(owner); } - perf_event_release_kernel(event); + WARN_ON_ONCE(ctx->parent_ctx); + /* + * There are two ways this annotation is useful: + * + * 1) there is a lock recursion from perf_event_exit_task + * see the comment there. + * + * 2) there is a lock-inversion with mmap_sem through + * perf_event_read_group(), which takes faults while + * holding ctx->mutex, however this is called after + * the last filedesc died, so there is no possibility + * to trigger the AB-BA case. + */ + mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); + perf_remove_from_context(event, true); + mutex_unlock(&ctx->mutex); + + _free_event(event); } +int perf_event_release_kernel(struct perf_event *event) +{ + put_event(event); + return 0; +} +EXPORT_SYMBOL_GPL(perf_event_release_kernel); + static int perf_release(struct inode *inode, struct file *file) { put_event(file->private_data); -- cgit v1.2.3 From 3a497f48637e2aac17eabb84a17f8ac5216028fc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 5 May 2014 12:42:29 +0200 Subject: perf: Simplify perf_event_exit_task_context() Instead of jumping through hoops to make sure to find (and exit) each event, do it the simple straight fwd way. Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vince Weaver Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-tij931199thfkys8vbnokdpf@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 83505a35afba..7ab734fbaeeb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7485,24 +7485,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) */ mutex_lock(&child_ctx->mutex); -again: - list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, - group_entry) - __perf_event_exit_task(child_event, child_ctx, child); - - list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups, - group_entry) + list_for_each_entry_rcu(child_event, &child_ctx->event_list, event_entry) __perf_event_exit_task(child_event, child_ctx, child); - /* - * If the last event was a group event, it will have appended all - * its siblings to the list, but we obtained 'tmp' before that which - * will still point to the list head terminating the iteration. - */ - if (!list_empty(&child_ctx->pinned_groups) || - !list_empty(&child_ctx->flexible_groups)) - goto again; - mutex_unlock(&child_ctx->mutex); put_ctx(child_ctx); -- cgit v1.2.3 From 12665b35b0b48c9583ee1b8f0a403dc708fb4a92 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 10 May 2014 13:10:59 +0200 Subject: perf/events/core: Drop unused variable after cleanup ... in 3a497f48637 ("perf: Simplify perf_event_exit_task_context()") Signed-off-by: Borislav Petkov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1399720259-28275-1-git-send-email-bp@alien8.de Signed-off-by: Thomas Gleixner --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 7ab734fbaeeb..ed50b0943213 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7431,7 +7431,7 @@ __perf_event_exit_task(struct perf_event *child_event, static void perf_event_exit_task_context(struct task_struct *child, int ctxn) { - struct perf_event *child_event, *tmp; + struct perf_event *child_event; struct perf_event_context *child_ctx; unsigned long flags; -- cgit v1.2.3