summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/async.c12
-rw-r--r--kernel/events/Makefile2
-rw-r--r--kernel/events/core.c931
-rw-r--r--kernel/events/hw_breakpoint.c10
-rw-r--r--kernel/events/internal.h96
-rw-r--r--kernel/events/ring_buffer.c380
-rw-r--r--kernel/power/user.c4
-rw-r--r--kernel/sched.c2
-rw-r--r--kernel/stacktrace.c12
-rw-r--r--kernel/taskstats.c15
-rw-r--r--kernel/time/alarmtimer.c158
-rw-r--r--kernel/trace/ftrace.c20
-rw-r--r--kernel/trace/ring_buffer.c66
-rw-r--r--kernel/trace/ring_buffer_benchmark.c2
-rw-r--r--kernel/trace/trace.c242
-rw-r--r--kernel/trace/trace.h10
-rw-r--r--kernel/trace/trace_events.c26
-rw-r--r--kernel/trace/trace_functions_graph.c223
-rw-r--r--kernel/trace/trace_irqsoff.c4
-rw-r--r--kernel/trace/trace_kprobe.c6
-rw-r--r--kernel/trace/trace_sched_wakeup.c4
-rw-r--r--kernel/trace/trace_stack.c13
-rw-r--r--kernel/watchdog.c10
23 files changed, 1187 insertions, 1061 deletions
diff --git a/kernel/async.c b/kernel/async.c
index cd9dbb913c77..d5fe7af0de2e 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -49,12 +49,13 @@ asynchronous and synchronous parts of the kernel.
*/
#include <linux/async.h>
+#include <linux/atomic.h>
+#include <linux/ktime.h>
#include <linux/module.h>
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
-#include <asm/atomic.h>
static async_cookie_t next_cookie = 1;
@@ -128,7 +129,8 @@ static void async_run_entry_fn(struct work_struct *work)
/* 2) run (and print duration) */
if (initcall_debug && system_state == SYSTEM_BOOTING) {
- printk("calling %lli_%pF @ %i\n", (long long)entry->cookie,
+ printk(KERN_DEBUG "calling %lli_%pF @ %i\n",
+ (long long)entry->cookie,
entry->func, task_pid_nr(current));
calltime = ktime_get();
}
@@ -136,7 +138,7 @@ static void async_run_entry_fn(struct work_struct *work)
if (initcall_debug && system_state == SYSTEM_BOOTING) {
rettime = ktime_get();
delta = ktime_sub(rettime, calltime);
- printk("initcall %lli_%pF returned 0 after %lld usecs\n",
+ printk(KERN_DEBUG "initcall %lli_%pF returned 0 after %lld usecs\n",
(long long)entry->cookie,
entry->func,
(long long)ktime_to_ns(delta) >> 10);
@@ -270,7 +272,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie,
ktime_t starttime, delta, endtime;
if (initcall_debug && system_state == SYSTEM_BOOTING) {
- printk("async_waiting @ %i\n", task_pid_nr(current));
+ printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
starttime = ktime_get();
}
@@ -280,7 +282,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie,
endtime = ktime_get();
delta = ktime_sub(endtime, starttime);
- printk("async_continuing @ %i after %lli usec\n",
+ printk(KERN_DEBUG "async_continuing @ %i after %lli usec\n",
task_pid_nr(current),
(long long)ktime_to_ns(delta) >> 10);
}
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 1ce23d3d8394..89e5e8aa4c36 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -2,5 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_core.o = -pg
endif
-obj-y := core.o
+obj-y := core.o ring_buffer.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9efe7108ccaf..0567e32d71aa 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -36,6 +36,8 @@
#include <linux/ftrace_event.h>
#include <linux/hw_breakpoint.h>
+#include "internal.h"
+
#include <asm/irq_regs.h>
struct remote_function_call {
@@ -200,6 +202,22 @@ __get_cpu_context(struct perf_event_context *ctx)
return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
}
+static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ raw_spin_lock(&cpuctx->ctx.lock);
+ if (ctx)
+ raw_spin_lock(&ctx->lock);
+}
+
+static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ if (ctx)
+ raw_spin_unlock(&ctx->lock);
+ raw_spin_unlock(&cpuctx->ctx.lock);
+}
+
#ifdef CONFIG_CGROUP_PERF
/*
@@ -340,11 +358,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
rcu_read_lock();
list_for_each_entry_rcu(pmu, &pmus, entry) {
-
cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
- perf_pmu_disable(cpuctx->ctx.pmu);
-
/*
* perf_cgroup_events says at least one
* context on this CPU has cgroup events.
@@ -353,6 +368,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
* events for a context.
*/
if (cpuctx->ctx.nr_cgroups > 0) {
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ perf_pmu_disable(cpuctx->ctx.pmu);
if (mode & PERF_CGROUP_SWOUT) {
cpu_ctx_sched_out(cpuctx, EVENT_ALL);
@@ -372,9 +389,9 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
cpuctx->cgrp = perf_cgroup_from_task(task);
cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
}
+ perf_pmu_enable(cpuctx->ctx.pmu);
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
-
- perf_pmu_enable(cpuctx->ctx.pmu);
}
rcu_read_unlock();
@@ -731,6 +748,7 @@ static u64 perf_event_time(struct perf_event *event)
/*
* Update the total_time_enabled and total_time_running fields for a event.
+ * The caller of this function needs to hold the ctx->lock.
*/
static void update_event_times(struct perf_event *event)
{
@@ -1105,6 +1123,10 @@ static int __perf_remove_from_context(void *info)
raw_spin_lock(&ctx->lock);
event_sched_out(event, cpuctx, ctx);
list_del_event(event, ctx);
+ if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
+ ctx->is_active = 0;
+ cpuctx->task_ctx = NULL;
+ }
raw_spin_unlock(&ctx->lock);
return 0;
@@ -1454,8 +1476,24 @@ static void add_event_to_ctx(struct perf_event *event,
event->tstamp_stopped = tstamp;
}
-static void perf_event_context_sched_in(struct perf_event_context *ctx,
- struct task_struct *tsk);
+static void task_ctx_sched_out(struct perf_event_context *ctx);
+static void
+ctx_sched_in(struct perf_event_context *ctx,
+ struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type,
+ struct task_struct *task);
+
+static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx,
+ struct task_struct *task)
+{
+ cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
+ if (ctx)
+ ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
+ cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
+ if (ctx)
+ ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
+}
/*
* Cross CPU call to install and enable a performance event
@@ -1466,20 +1504,37 @@ static int __perf_install_in_context(void *info)
{
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
- struct perf_event *leader = event->group_leader;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
- int err;
+ struct perf_event_context *task_ctx = cpuctx->task_ctx;
+ struct task_struct *task = current;
+
+ perf_ctx_lock(cpuctx, task_ctx);
+ perf_pmu_disable(cpuctx->ctx.pmu);
/*
- * In case we're installing a new context to an already running task,
- * could also happen before perf_event_task_sched_in() on architectures
- * which do context switches with IRQs enabled.
+ * If there was an active task_ctx schedule it out.
*/
- if (ctx->task && !cpuctx->task_ctx)
- perf_event_context_sched_in(ctx, ctx->task);
+ if (task_ctx)
+ task_ctx_sched_out(task_ctx);
+
+ /*
+ * If the context we're installing events in is not the
+ * active task_ctx, flip them.
+ */
+ if (ctx->task && task_ctx != ctx) {
+ if (task_ctx)
+ raw_spin_unlock(&task_ctx->lock);
+ raw_spin_lock(&ctx->lock);
+ task_ctx = ctx;
+ }
+
+ if (task_ctx) {
+ cpuctx->task_ctx = task_ctx;
+ task = task_ctx->task;
+ }
+
+ cpu_ctx_sched_out(cpuctx, EVENT_ALL);
- raw_spin_lock(&ctx->lock);
- ctx->is_active = 1;
update_context_time(ctx);
/*
* update cgrp time only if current cgrp
@@ -1490,43 +1545,13 @@ static int __perf_install_in_context(void *info)
add_event_to_ctx(event, ctx);
- if (!event_filter_match(event))
- goto unlock;
-
/*
- * Don't put the event on if it is disabled or if
- * it is in a group and the group isn't on.
+ * Schedule everything back in
*/
- if (event->state != PERF_EVENT_STATE_INACTIVE ||
- (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
- goto unlock;
+ perf_event_sched_in(cpuctx, task_ctx, task);
- /*
- * An exclusive event can't go on if there are already active
- * hardware events, and no hardware event can go on if there
- * is already an exclusive event on.
- */
- if (!group_can_go_on(event, cpuctx, 1))
- err = -EEXIST;
- else
- err = event_sched_in(event, cpuctx, ctx);
-
- if (err) {
- /*
- * This event couldn't go on. If it is in a group
- * then we have to pull the whole group off.
- * If the event group is pinned then put it in error state.
- */
- if (leader != event)
- group_sched_out(leader, cpuctx, ctx);
- if (leader->attr.pinned) {
- update_group_times(leader);
- leader->state = PERF_EVENT_STATE_ERROR;
- }
- }
-
-unlock:
- raw_spin_unlock(&ctx->lock);
+ perf_pmu_enable(cpuctx->ctx.pmu);
+ perf_ctx_unlock(cpuctx, task_ctx);
return 0;
}
@@ -1739,7 +1764,7 @@ out:
raw_spin_unlock_irq(&ctx->lock);
}
-static int perf_event_refresh(struct perf_event *event, int refresh)
+int perf_event_refresh(struct perf_event *event, int refresh)
{
/*
* not supported on inherited events
@@ -1752,36 +1777,35 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
return 0;
}
+EXPORT_SYMBOL_GPL(perf_event_refresh);
static void ctx_sched_out(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
enum event_type_t event_type)
{
struct perf_event *event;
+ int is_active = ctx->is_active;
- raw_spin_lock(&ctx->lock);
- perf_pmu_disable(ctx->pmu);
- ctx->is_active = 0;
+ ctx->is_active &= ~event_type;
if (likely(!ctx->nr_events))
- goto out;
+ return;
+
update_context_time(ctx);
update_cgrp_time_from_cpuctx(cpuctx);
-
if (!ctx->nr_active)
- goto out;
+ return;
- if (event_type & EVENT_PINNED) {
+ perf_pmu_disable(ctx->pmu);
+ if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
list_for_each_entry(event, &ctx->pinned_groups, group_entry)
group_sched_out(event, cpuctx, ctx);
}
- if (event_type & EVENT_FLEXIBLE) {
+ if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
list_for_each_entry(event, &ctx->flexible_groups, group_entry)
group_sched_out(event, cpuctx, ctx);
}
-out:
perf_pmu_enable(ctx->pmu);
- raw_spin_unlock(&ctx->lock);
}
/*
@@ -1929,8 +1953,10 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
rcu_read_unlock();
if (do_switch) {
+ raw_spin_lock(&ctx->lock);
ctx_sched_out(ctx, cpuctx, EVENT_ALL);
cpuctx->task_ctx = NULL;
+ raw_spin_unlock(&ctx->lock);
}
}
@@ -1965,8 +1991,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
perf_cgroup_sched_out(task);
}
-static void task_ctx_sched_out(struct perf_event_context *ctx,
- enum event_type_t event_type)
+static void task_ctx_sched_out(struct perf_event_context *ctx)
{
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
@@ -1976,7 +2001,7 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
return;
- ctx_sched_out(ctx, cpuctx, event_type);
+ ctx_sched_out(ctx, cpuctx, EVENT_ALL);
cpuctx->task_ctx = NULL;
}
@@ -2055,11 +2080,11 @@ ctx_sched_in(struct perf_event_context *ctx,
struct task_struct *task)
{
u64 now;
+ int is_active = ctx->is_active;
- raw_spin_lock(&ctx->lock);
- ctx->is_active = 1;
+ ctx->is_active |= event_type;
if (likely(!ctx->nr_events))
- goto out;
+ return;
now = perf_clock();
ctx->timestamp = now;
@@ -2068,15 +2093,12 @@ ctx_sched_in(struct perf_event_context *ctx,
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
*/
- if (event_type & EVENT_PINNED)
+ if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
ctx_pinned_sched_in(ctx, cpuctx);
/* Then walk through the lower prio flexible groups */
- if (event_type & EVENT_FLEXIBLE)
+ if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
ctx_flexible_sched_in(ctx, cpuctx);
-
-out:
- raw_spin_unlock(&ctx->lock);
}
static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
@@ -2088,19 +2110,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
ctx_sched_in(ctx, cpuctx, event_type, task);
}
-static void task_ctx_sched_in(struct perf_event_context *ctx,
- enum event_type_t event_type)
-{
- struct perf_cpu_context *cpuctx;
-
- cpuctx = __get_cpu_context(ctx);
- if (cpuctx->task_ctx == ctx)
- return;
-
- ctx_sched_in(ctx, cpuctx, event_type, NULL);
- cpuctx->task_ctx = ctx;
-}
-
static void perf_event_context_sched_in(struct perf_event_context *ctx,
struct task_struct *task)
{
@@ -2110,6 +2119,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
if (cpuctx->task_ctx == ctx)
return;
+ perf_ctx_lock(cpuctx, ctx);
perf_pmu_disable(ctx->pmu);
/*
* We want to keep the following priority order:
@@ -2118,18 +2128,18 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
*/
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
- ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
- cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
- ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
+ perf_event_sched_in(cpuctx, ctx, task);
cpuctx->task_ctx = ctx;
+ perf_pmu_enable(ctx->pmu);
+ perf_ctx_unlock(cpuctx, ctx);
+
/*
* Since these rotations are per-cpu, we need to ensure the
* cpu-context we got scheduled on is actually rotating.
*/
perf_pmu_rotate_start(ctx->pmu);
- perf_pmu_enable(ctx->pmu);
}
/*
@@ -2269,7 +2279,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
u64 interrupts, now;
s64 delta;
- raw_spin_lock(&ctx->lock);
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
if (event->state != PERF_EVENT_STATE_ACTIVE)
continue;
@@ -2301,7 +2310,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
if (delta > 0)
perf_adjust_period(event, period, delta);
}
- raw_spin_unlock(&ctx->lock);
}
/*
@@ -2309,16 +2317,12 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
*/
static void rotate_ctx(struct perf_event_context *ctx)
{
- raw_spin_lock(&ctx->lock);
-
/*
* Rotate the first entry last of non-pinned groups. Rotation might be
* disabled by the inheritance code.
*/
if (!ctx->rotate_disable)
list_rotate_left(&ctx->flexible_groups);
-
- raw_spin_unlock(&ctx->lock);
}
/*
@@ -2345,6 +2349,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
rotate = 1;
}
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_pmu_disable(cpuctx->ctx.pmu);
perf_ctx_adjust_freq(&cpuctx->ctx, interval);
if (ctx)
@@ -2355,21 +2360,20 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
if (ctx)
- task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
+ ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
rotate_ctx(&cpuctx->ctx);
if (ctx)
rotate_ctx(ctx);
- cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current);
- if (ctx)
- task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
+ perf_event_sched_in(cpuctx, ctx, current);
done:
if (remove)
list_del_init(&cpuctx->rotation_list);
perf_pmu_enable(cpuctx->ctx.pmu);
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
void perf_event_task_tick(void)
@@ -2424,9 +2428,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
* in.
*/
perf_cgroup_sched_out(current);
- task_ctx_sched_out(ctx, EVENT_ALL);
raw_spin_lock(&ctx->lock);
+ task_ctx_sched_out(ctx);
list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
ret = event_enable_on_exec(event, ctx);
@@ -2835,16 +2839,12 @@ retry:
unclone_ctx(ctx);
++ctx->pin_count;
raw_spin_unlock_irqrestore(&ctx->lock, flags);
- }
-
- if (!ctx) {
+ } else {
ctx = alloc_perf_context(pmu, task);
err = -ENOMEM;
if (!ctx)
goto errout;
- get_ctx(ctx);
-
err = 0;
mutex_lock(&task->perf_event_mutex);
/*
@@ -2856,14 +2856,14 @@ retry:
else if (task->perf_event_ctxp[ctxn])
err = -EAGAIN;
else {
+ get_ctx(ctx);
++ctx->pin_count;
rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
}
mutex_unlock(&task->perf_event_mutex);
if (unlikely(err)) {
- put_task_struct(task);
- kfree(ctx);
+ put_ctx(ctx);
if (err == -EAGAIN)
goto retry;
@@ -2890,7 +2890,7 @@ static void free_event_rcu(struct rcu_head *head)
kfree(event);
}
-static void perf_buffer_put(struct perf_buffer *buffer);
+static void ring_buffer_put(struct ring_buffer *rb);
static void free_event(struct perf_event *event)
{
@@ -2913,9 +2913,9 @@ static void free_event(struct perf_event *event)
}
}
- if (event->buffer) {
- perf_buffer_put(event->buffer);
- event->buffer = NULL;
+ if (event->rb) {
+ ring_buffer_put(event->rb);
+ event->rb = NULL;
}
if (is_cgroup_event(event))
@@ -2934,12 +2934,6 @@ int perf_event_release_kernel(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
- /*
- * Remove from the PMU, can't get re-enabled since we got
- * here because the last ref went.
- */
- perf_event_disable(event);
-
WARN_ON_ONCE(ctx->parent_ctx);
/*
* There are two ways this annotation is useful:
@@ -2956,8 +2950,8 @@ int perf_event_release_kernel(struct perf_event *event)
mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
raw_spin_lock_irq(&ctx->lock);
perf_group_detach(event);
- list_del_event(event, ctx);
raw_spin_unlock_irq(&ctx->lock);
+ perf_remove_from_context(event);
mutex_unlock(&ctx->mutex);
free_event(event);
@@ -3149,13 +3143,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
static unsigned int perf_poll(struct file *file, poll_table *wait)
{
struct perf_event *event = file->private_data;
- struct perf_buffer *buffer;
+ struct ring_buffer *rb;
unsigned int events = POLL_HUP;
rcu_read_lock();
- buffer = rcu_dereference(event->buffer);
- if (buffer)
- events = atomic_xchg(&buffer->poll, 0);
+ rb = rcu_dereference(event->rb);
+ if (rb)
+ events = atomic_xchg(&rb->poll, 0);
rcu_read_unlock();
poll_wait(file, &event->waitq, wait);
@@ -3358,6 +3352,18 @@ static int perf_event_index(struct perf_event *event)
return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
}
+static void calc_timer_values(struct perf_event *event,
+ u64 *running,
+ u64 *enabled)
+{
+ u64 now, ctx_time;
+
+ now = perf_clock();
+ ctx_time = event->shadow_ctx_time + now;
+ *enabled = ctx_time - event->tstamp_enabled;
+ *running = ctx_time - event->tstamp_running;
+}
+
/*
* Callers need to ensure there can be no nesting of this function, otherwise
* the seqlock logic goes bad. We can not serialize this because the arch
@@ -3366,14 +3372,25 @@ static int perf_event_index(struct perf_event *event)
void perf_event_update_userpage(struct perf_event *event)
{
struct perf_event_mmap_page *userpg;
- struct perf_buffer *buffer;
+ struct ring_buffer *rb;
+ u64 enabled, running;
rcu_read_lock();
- buffer = rcu_dereference(event->buffer);
- if (!buffer)
+ /*
+ * compute total_time_enabled, total_time_running
+ * based on snapshot values taken when the event
+ * was last scheduled in.
+ *
+ * we cannot simply called update_context_time()
+ * because of locking issue as we can be called in
+ * NMI context
+ */
+ calc_timer_values(event, &enabled, &running);
+ rb = rcu_dereference(event->rb);
+ if (!rb)
goto unlock;
- userpg = buffer->user_page;
+ userpg = rb->user_page;
/*
* Disable preemption so as to not let the corresponding user-space
@@ -3387,10 +3404,10 @@ void perf_event_update_userpage(struct perf_event *event)
if (event->state == PERF_EVENT_STATE_ACTIVE)
userpg->offset -= local64_read(&event->hw.prev_count);
- userpg->time_enabled = event->total_time_enabled +
+ userpg->time_enabled = enabled +
atomic64_read(&event->child_total_time_enabled);
- userpg->time_running = event->total_time_running +
+ userpg->time_running = running +
atomic64_read(&event->child_total_time_running);
barrier();
@@ -3400,220 +3417,10 @@ unlock:
rcu_read_unlock();
}
-static unsigned long perf_data_size(struct perf_buffer *buffer);
-
-static void
-perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags)
-{
- long max_size = perf_data_size(buffer);
-
- if (watermark)
- buffer->watermark = min(max_size, watermark);
-
- if (!buffer->watermark)
- buffer->watermark = max_size / 2;
-
- if (flags & PERF_BUFFER_WRITABLE)
- buffer->writable = 1;
-
- atomic_set(&buffer->refcount, 1);
-}
-
-#ifndef CONFIG_PERF_USE_VMALLOC
-
-/*
- * Back perf_mmap() with regular GFP_KERNEL-0 pages.
- */
-
-static struct page *
-perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
-{
- if (pgoff > buffer->nr_pages)
- return NULL;
-
- if (pgoff == 0)
- return virt_to_page(buffer->user_page);
-
- return virt_to_page(buffer->data_pages[pgoff - 1]);
-}
-
-static void *perf_mmap_alloc_page(int cpu)
-{
- struct page *page;
- int node;
-
- node = (cpu == -1) ? cpu : cpu_to_node(cpu);
- page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
- if (!page)
- return NULL;
-
- return page_address(page);
-}
-
-static struct perf_buffer *
-perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
-{
- struct perf_buffer *buffer;
- unsigned long size;
- int i;
-
- size = sizeof(struct perf_buffer);
- size += nr_pages * sizeof(void *);
-
- buffer = kzalloc(size, GFP_KERNEL);
- if (!buffer)
- goto fail;
-
- buffer->user_page = perf_mmap_alloc_page(cpu);
- if (!buffer->user_page)
- goto fail_user_page;
-
- for (i = 0; i < nr_pages; i++) {
- buffer->data_pages[i] = perf_mmap_alloc_page(cpu);
- if (!buffer->data_pages[i])
- goto fail_data_pages;
- }
-
- buffer->nr_pages = nr_pages;
-
- perf_buffer_init(buffer, watermark, flags);
-
- return buffer;
-
-fail_data_pages:
- for (i--; i >= 0; i--)
- free_page((unsigned long)buffer->data_pages[i]);
-
- free_page((unsigned long)buffer->user_page);
-
-fail_user_page:
- kfree(buffer);
-
-fail:
- return NULL;
-}
-
-static void perf_mmap_free_page(unsigned long addr)
-{
- struct page *page = virt_to_page((void *)addr);
-
- page->mapping = NULL;
- __free_page(page);
-}
-
-static void perf_buffer_free(struct perf_buffer *buffer)
-{
- int i;
-
- perf_mmap_free_page((unsigned long)buffer->user_page);
- for (i = 0; i < buffer->nr_pages; i++)
- perf_mmap_free_page((unsigned long)buffer->data_pages[i]);
- kfree(buffer);
-}
-
-static inline int page_order(struct perf_buffer *buffer)
-{
- return 0;
-}
-
-#else
-
-/*
- * Back perf_mmap() with vmalloc memory.
- *
- * Required for architectures that have d-cache aliasing issues.
- */
-
-static inline int page_order(struct perf_buffer *buffer)
-{
- return buffer->page_order;
-}
-
-static struct page *
-perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
-{
- if (pgoff > (1UL << page_order(buffer)))
- return NULL;
-
- return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE);
-}
-
-static void perf_mmap_unmark_page(void *addr)
-{
- struct page *page = vmalloc_to_page(addr);
-
- page->mapping = NULL;
-}
-
-static void perf_buffer_free_work(struct work_struct *work)
-{
- struct perf_buffer *buffer;
- void *base;
- int i, nr;
-
- buffer = container_of(work, struct perf_buffer, work);
- nr = 1 << page_order(buffer);
-
- base = buffer->user_page;
- for (i = 0; i < nr + 1; i++)
- perf_mmap_unmark_page(base + (i * PAGE_SIZE));
-
- vfree(base);
- kfree(buffer);
-}
-
-static void perf_buffer_free(struct perf_buffer *buffer)
-{
- schedule_work(&buffer->work);
-}
-
-static struct perf_buffer *
-perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
-{
- struct perf_buffer *buffer;
- unsigned long size;
- void *all_buf;
-
- size = sizeof(struct perf_buffer);
- size += sizeof(void *);
-
- buffer = kzalloc(size, GFP_KERNEL);
- if (!buffer)
- goto fail;
-
- INIT_WORK(&buffer->work, perf_buffer_free_work);
-
- all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
- if (!all_buf)
- goto fail_all_buf;
-
- buffer->user_page = all_buf;
- buffer->data_pages[0] = all_buf + PAGE_SIZE;
- buffer->page_order = ilog2(nr_pages);
- buffer->nr_pages = 1;
-
- perf_buffer_init(buffer, watermark, flags);
-
- return buffer;
-
-fail_all_buf:
- kfree(buffer);
-
-fail:
- return NULL;
-}
-
-#endif
-
-static unsigned long perf_data_size(struct perf_buffer *buffer)
-{
- return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer));
-}
-
static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct perf_event *event = vma->vm_file->private_data;
- struct perf_buffer *buffer;
+ struct ring_buffer *rb;
int ret = VM_FAULT_SIGBUS;
if (vmf->flags & FAULT_FLAG_MKWRITE) {
@@ -3623,14 +3430,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
rcu_read_lock();
- buffer = rcu_dereference(event->buffer);
- if (!buffer)
+ rb = rcu_dereference(event->rb);
+ if (!rb)
goto unlock;
if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
goto unlock;
- vmf->page = perf_mmap_to_page(buffer, vmf->pgoff);
+ vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
if (!vmf->page)
goto unlock;
@@ -3645,35 +3452,35 @@ unlock:
return ret;
}
-static void perf_buffer_free_rcu(struct rcu_head *rcu_head)
+static void rb_free_rcu(struct rcu_head *rcu_head)
{
- struct perf_buffer *buffer;
+ struct ring_buffer *rb;
- buffer = container_of(rcu_head, struct perf_buffer, rcu_head);
- perf_buffer_free(buffer);
+ rb = container_of(rcu_head, struct ring_buffer, rcu_head);
+ rb_free(rb);
}
-static struct perf_buffer *perf_buffer_get(struct perf_event *event)
+static struct ring_buffer *ring_buffer_get(struct perf_event *event)
{
- struct perf_buffer *buffer;
+ struct ring_buffer *rb;
rcu_read_lock();
- buffer = rcu_dereference(event->buffer);
- if (buffer) {
- if (!atomic_inc_not_zero(&buffer->refcount))
- buffer = NULL;
+ rb = rcu_dereference(event->rb);
+ if (rb) {
+ if (!atomic_inc_not_zero(&rb->refcount))
+ rb = NULL;
}
rcu_read_unlock();
- return buffer;
+ return rb;
}
-static void perf_buffer_put(struct perf_buffer *buffer)
+static void ring_buffer_put(struct ring_buffer *rb)
{
- if (!atomic_dec_and_test(&buffer->refcount))
+ if (!atomic_dec_and_test(&rb->refcount))
return;
- call_rcu(&buffer->rcu_head, perf_buffer_free_rcu);
+ call_rcu(&rb->rcu_head, rb_free_rcu);
}
static void perf_mmap_open(struct vm_area_struct *vma)
@@ -3688,16 +3495,16 @@ static void perf_mmap_close(struct vm_area_struct *vma)
struct perf_event *event = vma->vm_file->private_data;
if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
- unsigned long size = perf_data_size(event->buffer);
+ unsigned long size = perf_data_size(event->rb);
struct user_struct *user = event->mmap_user;
- struct perf_buffer *buffer = event->buffer;
+ struct ring_buffer *rb = event->rb;
atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
vma->vm_mm->locked_vm -= event->mmap_locked;
- rcu_assign_pointer(event->buffer, NULL);
+ rcu_assign_pointer(event->rb, NULL);
mutex_unlock(&event->mmap_mutex);
- perf_buffer_put(buffer);
+ ring_buffer_put(rb);
free_uid(user);
}
}
@@ -3715,7 +3522,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
unsigned long user_locked, user_lock_limit;
struct user_struct *user = current_user();
unsigned long locked, lock_limit;
- struct perf_buffer *buffer;
+ struct ring_buffer *rb;
unsigned long vma_size;
unsigned long nr_pages;
long user_extra, extra;
@@ -3724,7 +3531,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
/*
* Don't allow mmap() of inherited per-task counters. This would
* create a performance issue due to all children writing to the
- * same buffer.
+ * same rb.
*/
if (event->cpu == -1 && event->attr.inherit)
return -EINVAL;
@@ -3736,7 +3543,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
nr_pages = (vma_size / PAGE_SIZE) - 1;
/*
- * If we have buffer pages ensure they're a power-of-two number, so we
+ * If we have rb pages ensure they're a power-of-two number, so we
* can do bitmasks instead of modulo.
*/
if (nr_pages != 0 && !is_power_of_2(nr_pages))
@@ -3750,9 +3557,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
WARN_ON_ONCE(event->ctx->parent_ctx);
mutex_lock(&event->mmap_mutex);
- if (event->buffer) {
- if (event->buffer->nr_pages == nr_pages)
- atomic_inc(&event->buffer->refcount);
+ if (event->rb) {
+ if (event->rb->nr_pages == nr_pages)
+ atomic_inc(&event->rb->refcount);
else
ret = -EINVAL;
goto unlock;
@@ -3782,18 +3589,20 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
goto unlock;
}
- WARN_ON(event->buffer);
+ WARN_ON(event->rb);
if (vma->vm_flags & VM_WRITE)
- flags |= PERF_BUFFER_WRITABLE;
+ flags |= RING_BUFFER_WRITABLE;
- buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark,
- event->cpu, flags);
- if (!buffer) {
+ rb = rb_alloc(nr_pages,
+ event->attr.watermark ? event->attr.wakeup_watermark : 0,
+ event->cpu, flags);
+
+ if (!rb) {
ret = -ENOMEM;
goto unlock;
}
- rcu_assign_pointer(event->buffer, buffer);
+ rcu_assign_pointer(event->rb, rb);
atomic_long_add(user_extra, &user->locked_vm);
event->mmap_locked = extra;
@@ -3892,117 +3701,6 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
}
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
-/*
- * Output
- */
-static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail,
- unsigned long offset, unsigned long head)
-{
- unsigned long mask;
-
- if (!buffer->writable)
- return true;
-
- mask = perf_data_size(buffer) - 1;
-
- offset = (offset - tail) & mask;
- head = (head - tail) & mask;
-
- if ((int)(head - offset) < 0)
- return false;
-
- return true;
-}
-
-static void perf_output_wakeup(struct perf_output_handle *handle)
-{
- atomic_set(&handle->buffer->poll, POLL_IN);
-
- if (handle->nmi) {
- handle->event->pending_wakeup = 1;
- irq_work_queue(&handle->event->pending);
- } else
- perf_event_wakeup(handle->event);
-}
-
-/*
- * We need to ensure a later event_id doesn't publish a head when a former
- * event isn't done writing. However since we need to deal with NMIs we
- * cannot fully serialize things.
- *
- * We only publish the head (and generate a wakeup) when the outer-most
- * event completes.
- */
-static void perf_output_get_handle(struct perf_output_handle *handle)
-{
- struct perf_buffer *buffer = handle->buffer;
-
- preempt_disable();
- local_inc(&buffer->nest);
- handle->wakeup = local_read(&buffer->wakeup);
-}
-
-static void perf_output_put_handle(struct perf_output_handle *handle)
-{
- struct perf_buffer *buffer = handle->buffer;
- unsigned long head;
-
-again:
- head = local_read(&buffer->head);
-
- /*
- * IRQ/NMI can happen here, which means we can miss a head update.
- */
-
- if (!local_dec_and_test(&buffer->nest))
- goto out;
-
- /*
- * Publish the known good head. Rely on the full barrier implied
- * by atomic_dec_and_test() order the buffer->head read and this
- * write.
- */
- buffer->user_page->data_head = head;
-
- /*
- * Now check if we missed an update, rely on the (compiler)
- * barrier in atomic_dec_and_test() to re-read buffer->head.
- */
- if (unlikely(head != local_read(&buffer->head))) {
- local_inc(&buffer->nest);
- goto again;
- }
-
- if (handle->wakeup != local_read(&buffer->wakeup))
- perf_output_wakeup(handle);
-
-out:
- preempt_enable();
-}
-
-__always_inline void perf_output_copy(struct perf_output_handle *handle,
- const void *buf, unsigned int len)
-{
- do {
- unsigned long size = min_t(unsigned long, handle->size, len);
-
- memcpy(handle->addr, buf, size);
-
- len -= size;
- handle->addr += size;
- buf += size;
- handle->size -= size;
- if (!handle->size) {
- struct perf_buffer *buffer = handle->buffer;
-
- handle->page++;
- handle->page &= buffer->nr_pages - 1;
- handle->addr = buffer->data_pages[handle->page];
- handle->size = PAGE_SIZE << page_order(buffer);
- }
- } while (len);
-}
-
static void __perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event)
@@ -4033,9 +3731,9 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
}
}
-static void perf_event_header__init_id(struct perf_event_header *header,
- struct perf_sample_data *data,
- struct perf_event *event)
+void perf_event_header__init_id(struct perf_event_header *header,
+ struct perf_sample_data *data,
+ struct perf_event *event)
{
if (event->attr.sample_id_all)
__perf_event_header__init_id(header, data, event);
@@ -4062,121 +3760,14 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle,
perf_output_put(handle, data->cpu_entry);
}
-static void perf_event__output_id_sample(struct perf_event *event,
- struct perf_output_handle *handle,
- struct perf_sample_data *sample)
+void perf_event__output_id_sample(struct perf_event *event,
+ struct perf_output_handle *handle,
+ struct perf_sample_data *sample)
{
if (event->attr.sample_id_all)
__perf_event__output_id_sample(handle, sample);
}
-int perf_output_begin(struct perf_output_handle *handle,
- struct perf_event *event, unsigned int size,
- int nmi, int sample)
-{
- struct perf_buffer *buffer;
- unsigned long tail, offset, head;
- int have_lost;
- struct perf_sample_data sample_data;
- struct {
- struct perf_event_header header;
- u64 id;
- u64 lost;
- } lost_event;
-
- rcu_read_lock();
- /*
- * For inherited events we send all the output towards the parent.
- */
- if (event->parent)
- event = event->parent;
-
- buffer = rcu_dereference(event->buffer);
- if (!buffer)
- goto out;
-
- handle->buffer = buffer;
- handle->event = event;
- handle->nmi = nmi;
- handle->sample = sample;
-
- if (!buffer->nr_pages)
- goto out;
-
- have_lost = local_read(&buffer->lost);
- if (have_lost) {
- lost_event.header.size = sizeof(lost_event);
- perf_event_header__init_id(&lost_event.header, &sample_data,
- event);
- size += lost_event.header.size;
- }
-
- perf_output_get_handle(handle);
-
- do {
- /*
- * Userspace could choose to issue a mb() before updating the
- * tail pointer. So that all reads will be completed before the
- * write is issued.
- */
- tail = ACCESS_ONCE(buffer->user_page->data_tail);
- smp_rmb();
- offset = head = local_read(&buffer->head);
- head += size;
- if (unlikely(!perf_output_space(buffer, tail, offset, head)))
- goto fail;
- } while (local_cmpxchg(&buffer->head, offset, head) != offset);
-
- if (head - local_read(&buffer->wakeup) > buffer->watermark)
- local_add(buffer->watermark, &buffer->wakeup);
-
- handle->page = offset >> (PAGE_SHIFT + page_order(buffer));
- handle->page &= buffer->nr_pages - 1;
- handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1);
- handle->addr = buffer->data_pages[handle->page];
- handle->addr += handle->size;
- handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size;
-
- if (have_lost) {
- lost_event.header.type = PERF_RECORD_LOST;
- lost_event.header.misc = 0;
- lost_event.id = event->id;
- lost_event.lost = local_xchg(&buffer->lost, 0);
-
- perf_output_put(handle, lost_event);
- perf_event__output_id_sample(event, handle, &sample_data);
- }
-
- return 0;
-
-fail:
- local_inc(&buffer->lost);
- perf_output_put_handle(handle);
-out:
- rcu_read_unlock();
-
- return -ENOSPC;
-}
-
-void perf_output_end(struct perf_output_handle *handle)
-{
- struct perf_event *event = handle->event;
- struct perf_buffer *buffer = handle->buffer;
-
- int wakeup_events = event->attr.wakeup_events;
-
- if (handle->sample && wakeup_events) {
- int events = local_inc_return(&buffer->events);
- if (events >= wakeup_events) {
- local_sub(wakeup_events, &buffer->events);
- local_inc(&buffer->wakeup);
- }
- }
-
- perf_output_put_handle(handle);
- rcu_read_unlock();
-}
-
static void perf_output_read_one(struct perf_output_handle *handle,
struct perf_event *event,
u64 enabled, u64 running)
@@ -4197,7 +3788,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(event);
- perf_output_copy(handle, values, n * sizeof(u64));
+ __output_copy(handle, values, n * sizeof(u64));
}
/*
@@ -4227,7 +3818,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
- perf_output_copy(handle, values, n * sizeof(u64));
+ __output_copy(handle, values, n * sizeof(u64));
list_for_each_entry(sub, &leader->sibling_list, group_entry) {
n = 0;
@@ -4239,7 +3830,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
- perf_output_copy(handle, values, n * sizeof(u64));
+ __output_copy(handle, values, n * sizeof(u64));
}
}
@@ -4249,7 +3840,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
static void perf_output_read(struct perf_output_handle *handle,
struct perf_event *event)
{
- u64 enabled = 0, running = 0, now, ctx_time;
+ u64 enabled = 0, running = 0;
u64 read_format = event->attr.read_format;
/*
@@ -4261,12 +3852,8 @@ static void perf_output_read(struct perf_output_handle *handle,
* because of locking issue as we are called in
* NMI context
*/
- if (read_format & PERF_FORMAT_TOTAL_TIMES) {
- now = perf_clock();
- ctx_time = event->shadow_ctx_time + now;
- enabled = ctx_time - event->tstamp_enabled;
- running = ctx_time - event->tstamp_running;
- }
+ if (read_format & PERF_FORMAT_TOTAL_TIMES)
+ calc_timer_values(event, &enabled, &running);
if (event->attr.read_format & PERF_FORMAT_GROUP)
perf_output_read_group(handle, event, enabled, running);
@@ -4319,7 +3906,7 @@ void perf_output_sample(struct perf_output_handle *handle,
size *= sizeof(u64);
- perf_output_copy(handle, data->callchain, size);
+ __output_copy(handle, data->callchain, size);
} else {
u64 nr = 0;
perf_output_put(handle, nr);
@@ -4329,8 +3916,8 @@ void perf_output_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_RAW) {
if (data->raw) {
perf_output_put(handle, data->raw->size);
- perf_output_copy(handle, data->raw->data,
- data->raw->size);
+ __output_copy(handle, data->raw->data,
+ data->raw->size);
} else {
struct {
u32 size;
@@ -4342,6 +3929,20 @@ void perf_output_sample(struct perf_output_handle *handle,
perf_output_put(handle, raw);
}
}
+
+ if (!event->attr.watermark) {
+ int wakeup_events = event->attr.wakeup_events;
+
+ if (wakeup_events) {
+ struct ring_buffer *rb = handle->rb;
+ int events = local_inc_return(&rb->events);
+
+ if (events >= wakeup_events) {
+ local_sub(wakeup_events, &rb->events);
+ local_inc(&rb->wakeup);
+ }
+ }
+ }
}
void perf_prepare_sample(struct perf_event_header *header,
@@ -4386,7 +3987,7 @@ void perf_prepare_sample(struct perf_event_header *header,
}
}
-static void perf_event_output(struct perf_event *event, int nmi,
+static void perf_event_output(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs)
{
@@ -4398,7 +3999,7 @@ static void perf_event_output(struct perf_event *event, int nmi,
perf_prepare_sample(&header, data, event, regs);
- if (perf_output_begin(&handle, event, header.size, nmi, 1))
+ if (perf_output_begin(&handle, event, header.size))
goto exit;
perf_output_sample(&handle, &header, data, event);
@@ -4438,7 +4039,7 @@ perf_event_read_event(struct perf_event *event,
int ret;
perf_event_header__init_id(&read_event.header, &sample, event);
- ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
+ ret = perf_output_begin(&handle, event, read_event.header.size);
if (ret)
return;
@@ -4481,7 +4082,7 @@ static void perf_event_task_output(struct perf_event *event,
perf_event_header__init_id(&task_event->event_id.header, &sample, event);
ret = perf_output_begin(&handle, event,
- task_event->event_id.header.size, 0, 0);
+ task_event->event_id.header.size);
if (ret)
goto out;
@@ -4618,7 +4219,7 @@ static void perf_event_comm_output(struct perf_event *event,
perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
ret = perf_output_begin(&handle, event,
- comm_event->event_id.header.size, 0, 0);
+ comm_event->event_id.header.size);
if (ret)
goto out;
@@ -4627,7 +4228,7 @@ static void perf_event_comm_output(struct perf_event *event,
comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
perf_output_put(&handle, comm_event->event_id);
- perf_output_copy(&handle, comm_event->comm,
+ __output_copy(&handle, comm_event->comm,
comm_event->comm_size);
perf_event__output_id_sample(event, &handle, &sample);
@@ -4765,7 +4366,7 @@ static void perf_event_mmap_output(struct perf_event *event,
perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
ret = perf_output_begin(&handle, event,
- mmap_event->event_id.header.size, 0, 0);
+ mmap_event->event_id.header.size);
if (ret)
goto out;
@@ -4773,7 +4374,7 @@ static void perf_event_mmap_output(struct perf_event *event,
mmap_event->event_id.tid = perf_event_tid(event, current);
perf_output_put(&handle, mmap_event->event_id);
- perf_output_copy(&handle, mmap_event->file_name,
+ __output_copy(&handle, mmap_event->file_name,
mmap_event->file_size);
perf_event__output_id_sample(event, &handle, &sample);
@@ -4829,7 +4430,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
if (file) {
/*
- * d_path works from the end of the buffer backwards, so we
+ * d_path works from the end of the rb backwards, so we
* need to add enough zero bytes after the string to handle
* the 64bit alignment we do later.
*/
@@ -4960,7 +4561,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
perf_event_header__init_id(&throttle_event.header, &sample, event);
ret = perf_output_begin(&handle, event,
- throttle_event.header.size, 1, 0);
+ throttle_event.header.size);
if (ret)
return;
@@ -4973,7 +4574,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
* Generic event overflow handling, sampling.
*/
-static int __perf_event_overflow(struct perf_event *event, int nmi,
+static int __perf_event_overflow(struct perf_event *event,
int throttle, struct perf_sample_data *data,
struct pt_regs *regs)
{
@@ -5016,34 +4617,28 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
if (events && atomic_dec_and_test(&event->event_limit)) {
ret = 1;
event->pending_kill = POLL_HUP;
- if (nmi) {
- event->pending_disable = 1;
- irq_work_queue(&event->pending);
- } else
- perf_event_disable(event);
+ event->pending_disable = 1;
+ irq_work_queue(&event->pending);
}
if (event->overflow_handler)
- event->overflow_handler(event, nmi, data, regs);
+ event->overflow_handler(event, data, regs);
else
- perf_event_output(event, nmi, data, regs);
+ perf_event_output(event, data, regs);
if (event->fasync && event->pending_kill) {
- if (nmi) {
- event->pending_wakeup = 1;
- irq_work_queue(&event->pending);
- } else
- perf_event_wakeup(event);
+ event->pending_wakeup = 1;
+ irq_work_queue(&event->pending);
}
return ret;
}
-int perf_event_overflow(struct perf_event *event, int nmi,
+int perf_event_overflow(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs)
{
- return __perf_event_overflow(event, nmi, 1, data, regs);
+ return __perf_event_overflow(event, 1, data, regs);
}
/*
@@ -5092,7 +4687,7 @@ again:
}
static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
- int nmi, struct perf_sample_data *data,
+ struct perf_sample_data *data,
struct pt_regs *regs)
{
struct hw_perf_event *hwc = &event->hw;
@@ -5106,7 +4701,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
return;
for (; overflow; overflow--) {
- if (__perf_event_overflow(event, nmi, throttle,
+ if (__perf_event_overflow(event, throttle,
data, regs)) {
/*
* We inhibit the overflow from happening when
@@ -5119,7 +4714,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
}
static void perf_swevent_event(struct perf_event *event, u64 nr,
- int nmi, struct perf_sample_data *data,
+ struct perf_sample_data *data,
struct pt_regs *regs)
{
struct hw_perf_event *hwc = &event->hw;
@@ -5133,12 +4728,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
return;
if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
- return perf_swevent_overflow(event, 1, nmi, data, regs);
+ return perf_swevent_overflow(event, 1, data, regs);
if (local64_add_negative(nr, &hwc->period_left))
return;
- perf_swevent_overflow(event, 0, nmi, data, regs);
+ perf_swevent_overflow(event, 0, data, regs);
}
static int perf_exclude_event(struct perf_event *event,
@@ -5226,7 +4821,7 @@ find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
}
static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
- u64 nr, int nmi,
+ u64 nr,
struct perf_sample_data *data,
struct pt_regs *regs)
{
@@ -5242,7 +4837,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
if (perf_swevent_match(event, type, event_id, data, regs))
- perf_swevent_event(event, nr, nmi, data, regs);
+ perf_swevent_event(event, nr, data, regs);
}
end:
rcu_read_unlock();
@@ -5263,8 +4858,7 @@ inline void perf_swevent_put_recursion_context(int rctx)
put_recursion_context(swhash->recursion, rctx);
}
-void __perf_sw_event(u32 event_id, u64 nr, int nmi,
- struct pt_regs *regs, u64 addr)
+void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
struct perf_sample_data data;
int rctx;
@@ -5276,7 +4870,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
perf_sample_data_init(&data, addr);
- do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
+ do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
perf_swevent_put_recursion_context(rctx);
preempt_enable_notrace();
@@ -5524,7 +5118,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
if (perf_tp_event_match(event, &data, regs))
- perf_swevent_event(event, count, 1, &data, regs);
+ perf_swevent_event(event, count, &data, regs);
}
perf_swevent_put_recursion_context(rctx);
@@ -5617,7 +5211,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
perf_sample_data_init(&sample, bp->attr.bp_addr);
if (!bp->hw.state && !perf_exclude_event(bp, regs))
- perf_swevent_event(bp, 1, 1, &sample, regs);
+ perf_swevent_event(bp, 1, &sample, regs);
}
#endif
@@ -5646,7 +5240,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
if (regs && !perf_exclude_event(event, regs)) {
if (!(event->attr.exclude_idle && current->pid == 0))
- if (perf_event_overflow(event, 0, &data, regs))
+ if (perf_event_overflow(event, &data, regs))
ret = HRTIMER_NORESTART;
}
@@ -5986,6 +5580,7 @@ free_dev:
}
static struct lock_class_key cpuctx_mutex;
+static struct lock_class_key cpuctx_lock;
int perf_pmu_register(struct pmu *pmu, char *name, int type)
{
@@ -6036,6 +5631,7 @@ skip_type:
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
__perf_event_init_context(&cpuctx->ctx);
lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
+ lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
cpuctx->ctx.type = cpu_context;
cpuctx->ctx.pmu = pmu;
cpuctx->jiffies_interval = 1;
@@ -6150,7 +5746,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
struct task_struct *task,
struct perf_event *group_leader,
struct perf_event *parent_event,
- perf_overflow_handler_t overflow_handler)
+ perf_overflow_handler_t overflow_handler,
+ void *context)
{
struct pmu *pmu;
struct perf_event *event;
@@ -6208,10 +5805,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
#endif
}
- if (!overflow_handler && parent_event)
+ if (!overflow_handler && parent_event) {
overflow_handler = parent_event->overflow_handler;
+ context = parent_event->overflow_handler_context;
+ }
event->overflow_handler = overflow_handler;
+ event->overflow_handler_context = context;
if (attr->disabled)
event->state = PERF_EVENT_STATE_OFF;
@@ -6354,7 +5954,7 @@ err_size:
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
- struct perf_buffer *buffer = NULL, *old_buffer = NULL;
+ struct ring_buffer *rb = NULL, *old_rb = NULL;
int ret = -EINVAL;
if (!output_event)
@@ -6371,7 +5971,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
goto out;
/*
- * If its not a per-cpu buffer, it must be the same task.
+ * If its not a per-cpu rb, it must be the same task.
*/
if (output_event->cpu == -1 && output_event->ctx != event->ctx)
goto out;
@@ -6383,20 +5983,20 @@ set:
goto unlock;
if (output_event) {
- /* get the buffer we want to redirect to */
- buffer = perf_buffer_get(output_event);
- if (!buffer)
+ /* get the rb we want to redirect to */
+ rb = ring_buffer_get(output_event);
+ if (!rb)
goto unlock;
}
- old_buffer = event->buffer;
- rcu_assign_pointer(event->buffer, buffer);
+ old_rb = event->rb;
+ rcu_assign_pointer(event->rb, rb);
ret = 0;
unlock:
mutex_unlock(&event->mmap_mutex);
- if (old_buffer)
- perf_buffer_put(old_buffer);
+ if (old_rb)
+ ring_buffer_put(old_rb);
out:
return ret;
}
@@ -6478,7 +6078,8 @@ SYSCALL_DEFINE5(perf_event_open,
}
}
- event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL);
+ event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
+ NULL, NULL);
if (IS_ERR(event)) {
err = PTR_ERR(event);
goto err_task;
@@ -6663,7 +6264,8 @@ err_fd:
struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
struct task_struct *task,
- perf_overflow_handler_t overflow_handler)
+ perf_overflow_handler_t overflow_handler,
+ void *context)
{
struct perf_event_context *ctx;
struct perf_event *event;
@@ -6673,7 +6275,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
* Get the target context (task or percpu):
*/
- event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler);
+ event = perf_event_alloc(attr, cpu, task, NULL, NULL,
+ overflow_handler, context);
if (IS_ERR(event)) {
err = PTR_ERR(event);
goto err;
@@ -6780,7 +6383,6 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
* our context.
*/
child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
- task_ctx_sched_out(child_ctx, EVENT_ALL);
/*
* Take the context lock here so that if find_get_context is
@@ -6788,6 +6390,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
* incremented the context's refcount before we do put_ctx below.
*/
raw_spin_lock(&child_ctx->lock);
+ task_ctx_sched_out(child_ctx);
child->perf_event_ctxp[ctxn] = NULL;
/*
* If this context is a clone; unclone it so it can't get
@@ -6957,7 +6560,7 @@ inherit_event(struct perf_event *parent_event,
parent_event->cpu,
child,
group_leader, parent_event,
- NULL);
+ NULL, NULL);
if (IS_ERR(child_event))
return child_event;
get_ctx(child_ctx);
@@ -6984,6 +6587,8 @@ inherit_event(struct perf_event *parent_event,
child_event->ctx = child_ctx;
child_event->overflow_handler = parent_event->overflow_handler;
+ child_event->overflow_handler_context
+ = parent_event->overflow_handler_context;
/*
* Precalculate sample_data sizes
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 086adf25a55e..b7971d6f38bf 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -431,9 +431,11 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
struct perf_event *
register_user_hw_breakpoint(struct perf_event_attr *attr,
perf_overflow_handler_t triggered,
+ void *context,
struct task_struct *tsk)
{
- return perf_event_create_kernel_counter(attr, -1, tsk, triggered);
+ return perf_event_create_kernel_counter(attr, -1, tsk, triggered,
+ context);
}
EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
@@ -502,7 +504,8 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
*/
struct perf_event * __percpu *
register_wide_hw_breakpoint(struct perf_event_attr *attr,
- perf_overflow_handler_t triggered)
+ perf_overflow_handler_t triggered,
+ void *context)
{
struct perf_event * __percpu *cpu_events, **pevent, *bp;
long err;
@@ -515,7 +518,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
get_online_cpus();
for_each_online_cpu(cpu) {
pevent = per_cpu_ptr(cpu_events, cpu);
- bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered);
+ bp = perf_event_create_kernel_counter(attr, cpu, NULL,
+ triggered, context);
*pevent = bp;
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
new file mode 100644
index 000000000000..09097dd8116c
--- /dev/null
+++ b/kernel/events/internal.h
@@ -0,0 +1,96 @@
+#ifndef _KERNEL_EVENTS_INTERNAL_H
+#define _KERNEL_EVENTS_INTERNAL_H
+
+#define RING_BUFFER_WRITABLE 0x01
+
+struct ring_buffer {
+ atomic_t refcount;
+ struct rcu_head rcu_head;
+#ifdef CONFIG_PERF_USE_VMALLOC
+ struct work_struct work;
+ int page_order; /* allocation order */
+#endif
+ int nr_pages; /* nr of data pages */
+ int writable; /* are we writable */
+
+ atomic_t poll; /* POLL_ for wakeups */
+
+ local_t head; /* write position */
+ local_t nest; /* nested writers */
+ local_t events; /* event limit */
+ local_t wakeup; /* wakeup stamp */
+ local_t lost; /* nr records lost */
+
+ long watermark; /* wakeup watermark */
+
+ struct perf_event_mmap_page *user_page;
+ void *data_pages[0];
+};
+
+extern void rb_free(struct ring_buffer *rb);
+extern struct ring_buffer *
+rb_alloc(int nr_pages, long watermark, int cpu, int flags);
+extern void perf_event_wakeup(struct perf_event *event);
+
+extern void
+perf_event_header__init_id(struct perf_event_header *header,
+ struct perf_sample_data *data,
+ struct perf_event *event);
+extern void
+perf_event__output_id_sample(struct perf_event *event,
+ struct perf_output_handle *handle,
+ struct perf_sample_data *sample);
+
+extern struct page *
+perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff);
+
+#ifdef CONFIG_PERF_USE_VMALLOC
+/*
+ * Back perf_mmap() with vmalloc memory.
+ *
+ * Required for architectures that have d-cache aliasing issues.
+ */
+
+static inline int page_order(struct ring_buffer *rb)
+{
+ return rb->page_order;
+}
+
+#else
+
+static inline int page_order(struct ring_buffer *rb)
+{
+ return 0;
+}
+#endif
+
+static unsigned long perf_data_size(struct ring_buffer *rb)
+{
+ return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
+}
+
+static inline void
+__output_copy(struct perf_output_handle *handle,
+ const void *buf, unsigned int len)
+{
+ do {
+ unsigned long size = min_t(unsigned long, handle->size, len);
+
+ memcpy(handle->addr, buf, size);
+
+ len -= size;
+ handle->addr += size;
+ buf += size;
+ handle->size -= size;
+ if (!handle->size) {
+ struct ring_buffer *rb = handle->rb;
+
+ handle->page++;
+ handle->page &= rb->nr_pages - 1;
+ handle->addr = rb->data_pages[handle->page];
+ handle->size = PAGE_SIZE << page_order(rb);
+ }
+ } while (len);
+}
+
+#endif /* _KERNEL_EVENTS_INTERNAL_H */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
new file mode 100644
index 000000000000..a2a29205cc0f
--- /dev/null
+++ b/kernel/events/ring_buffer.c
@@ -0,0 +1,380 @@
+/*
+ * Performance events ring-buffer code:
+ *
+ * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
+ * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * For licensing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+#include "internal.h"
+
+static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
+ unsigned long offset, unsigned long head)
+{
+ unsigned long mask;
+
+ if (!rb->writable)
+ return true;
+
+ mask = perf_data_size(rb) - 1;
+
+ offset = (offset - tail) & mask;
+ head = (head - tail) & mask;
+
+ if ((int)(head - offset) < 0)
+ return false;
+
+ return true;
+}
+
+static void perf_output_wakeup(struct perf_output_handle *handle)
+{
+ atomic_set(&handle->rb->poll, POLL_IN);
+
+ handle->event->pending_wakeup = 1;
+ irq_work_queue(&handle->event->pending);
+}
+
+/*
+ * We need to ensure a later event_id doesn't publish a head when a former
+ * event isn't done writing. However since we need to deal with NMIs we
+ * cannot fully serialize things.
+ *
+ * We only publish the head (and generate a wakeup) when the outer-most
+ * event completes.
+ */
+static void perf_output_get_handle(struct perf_output_handle *handle)
+{
+ struct ring_buffer *rb = handle->rb;
+
+ preempt_disable();
+ local_inc(&rb->nest);
+ handle->wakeup = local_read(&rb->wakeup);
+}
+
+static void perf_output_put_handle(struct perf_output_handle *handle)
+{
+ struct ring_buffer *rb = handle->rb;
+ unsigned long head;
+
+again:
+ head = local_read(&rb->head);
+
+ /*
+ * IRQ/NMI can happen here, which means we can miss a head update.
+ */
+
+ if (!local_dec_and_test(&rb->nest))
+ goto out;
+
+ /*
+ * Publish the known good head. Rely on the full barrier implied
+ * by atomic_dec_and_test() order the rb->head read and this
+ * write.
+ */
+ rb->user_page->data_head = head;
+
+ /*
+ * Now check if we missed an update, rely on the (compiler)
+ * barrier in atomic_dec_and_test() to re-read rb->head.
+ */
+ if (unlikely(head != local_read(&rb->head))) {
+ local_inc(&rb->nest);
+ goto again;
+ }
+
+ if (handle->wakeup != local_read(&rb->wakeup))
+ perf_output_wakeup(handle);
+
+out:
+ preempt_enable();
+}
+
+int perf_output_begin(struct perf_output_handle *handle,
+ struct perf_event *event, unsigned int size)
+{
+ struct ring_buffer *rb;
+ unsigned long tail, offset, head;
+ int have_lost;
+ struct perf_sample_data sample_data;
+ struct {
+ struct perf_event_header header;
+ u64 id;
+ u64 lost;
+ } lost_event;
+
+ rcu_read_lock();
+ /*
+ * For inherited events we send all the output towards the parent.
+ */
+ if (event->parent)
+ event = event->parent;
+
+ rb = rcu_dereference(event->rb);
+ if (!rb)
+ goto out;
+
+ handle->rb = rb;
+ handle->event = event;
+
+ if (!rb->nr_pages)
+ goto out;
+
+ have_lost = local_read(&rb->lost);
+ if (have_lost) {
+ lost_event.header.size = sizeof(lost_event);
+ perf_event_header__init_id(&lost_event.header, &sample_data,
+ event);
+ size += lost_event.header.size;
+ }
+
+ perf_output_get_handle(handle);
+
+ do {
+ /*
+ * Userspace could choose to issue a mb() before updating the
+ * tail pointer. So that all reads will be completed before the
+ * write is issued.
+ */
+ tail = ACCESS_ONCE(rb->user_page->data_tail);
+ smp_rmb();
+ offset = head = local_read(&rb->head);
+ head += size;
+ if (unlikely(!perf_output_space(rb, tail, offset, head)))
+ goto fail;
+ } while (local_cmpxchg(&rb->head, offset, head) != offset);
+
+ if (head - local_read(&rb->wakeup) > rb->watermark)
+ local_add(rb->watermark, &rb->wakeup);
+
+ handle->page = offset >> (PAGE_SHIFT + page_order(rb));
+ handle->page &= rb->nr_pages - 1;
+ handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1);
+ handle->addr = rb->data_pages[handle->page];
+ handle->addr += handle->size;
+ handle->size = (PAGE_SIZE << page_order(rb)) - handle->size;
+
+ if (have_lost) {
+ lost_event.header.type = PERF_RECORD_LOST;
+ lost_event.header.misc = 0;
+ lost_event.id = event->id;
+ lost_event.lost = local_xchg(&rb->lost, 0);
+
+ perf_output_put(handle, lost_event);
+ perf_event__output_id_sample(event, handle, &sample_data);
+ }
+
+ return 0;
+
+fail:
+ local_inc(&rb->lost);
+ perf_output_put_handle(handle);
+out:
+ rcu_read_unlock();
+
+ return -ENOSPC;
+}
+
+void perf_output_copy(struct perf_output_handle *handle,
+ const void *buf, unsigned int len)
+{
+ __output_copy(handle, buf, len);
+}
+
+void perf_output_end(struct perf_output_handle *handle)
+{
+ perf_output_put_handle(handle);
+ rcu_read_unlock();
+}
+
+static void
+ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
+{
+ long max_size = perf_data_size(rb);
+
+ if (watermark)
+ rb->watermark = min(max_size, watermark);
+
+ if (!rb->watermark)
+ rb->watermark = max_size / 2;
+
+ if (flags & RING_BUFFER_WRITABLE)
+ rb->writable = 1;
+
+ atomic_set(&rb->refcount, 1);
+}
+
+#ifndef CONFIG_PERF_USE_VMALLOC
+
+/*
+ * Back perf_mmap() with regular GFP_KERNEL-0 pages.
+ */
+
+struct page *
+perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+{
+ if (pgoff > rb->nr_pages)
+ return NULL;
+
+ if (pgoff == 0)
+ return virt_to_page(rb->user_page);
+
+ return virt_to_page(rb->data_pages[pgoff - 1]);
+}
+
+static void *perf_mmap_alloc_page(int cpu)
+{
+ struct page *page;
+ int node;
+
+ node = (cpu == -1) ? cpu : cpu_to_node(cpu);
+ page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+ if (!page)
+ return NULL;
+
+ return page_address(page);
+}
+
+struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
+{
+ struct ring_buffer *rb;
+ unsigned long size;
+ int i;
+
+ size = sizeof(struct ring_buffer);
+ size += nr_pages * sizeof(void *);
+
+ rb = kzalloc(size, GFP_KERNEL);
+ if (!rb)
+ goto fail;
+
+ rb->user_page = perf_mmap_alloc_page(cpu);
+ if (!rb->user_page)
+ goto fail_user_page;
+
+ for (i = 0; i < nr_pages; i++) {
+ rb->data_pages[i] = perf_mmap_alloc_page(cpu);
+ if (!rb->data_pages[i])
+ goto fail_data_pages;
+ }
+
+ rb->nr_pages = nr_pages;
+
+ ring_buffer_init(rb, watermark, flags);
+
+ return rb;
+
+fail_data_pages:
+ for (i--; i >= 0; i--)
+ free_page((unsigned long)rb->data_pages[i]);
+
+ free_page((unsigned long)rb->user_page);
+
+fail_user_page:
+ kfree(rb);
+
+fail:
+ return NULL;
+}
+
+static void perf_mmap_free_page(unsigned long addr)
+{
+ struct page *page = virt_to_page((void *)addr);
+
+ page->mapping = NULL;
+ __free_page(page);
+}
+
+void rb_free(struct ring_buffer *rb)
+{
+ int i;
+
+ perf_mmap_free_page((unsigned long)rb->user_page);
+ for (i = 0; i < rb->nr_pages; i++)
+ perf_mmap_free_page((unsigned long)rb->data_pages[i]);
+ kfree(rb);
+}
+
+#else
+
+struct page *
+perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+{
+ if (pgoff > (1UL << page_order(rb)))
+ return NULL;
+
+ return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
+}
+
+static void perf_mmap_unmark_page(void *addr)
+{
+ struct page *page = vmalloc_to_page(addr);
+
+ page->mapping = NULL;
+}
+
+static void rb_free_work(struct work_struct *work)
+{
+ struct ring_buffer *rb;
+ void *base;
+ int i, nr;
+
+ rb = container_of(work, struct ring_buffer, work);
+ nr = 1 << page_order(rb);
+
+ base = rb->user_page;
+ for (i = 0; i < nr + 1; i++)
+ perf_mmap_unmark_page(base + (i * PAGE_SIZE));
+
+ vfree(base);
+ kfree(rb);
+}
+
+void rb_free(struct ring_buffer *rb)
+{
+ schedule_work(&rb->work);
+}
+
+struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
+{
+ struct ring_buffer *rb;
+ unsigned long size;
+ void *all_buf;
+
+ size = sizeof(struct ring_buffer);
+ size += sizeof(void *);
+
+ rb = kzalloc(size, GFP_KERNEL);
+ if (!rb)
+ goto fail;
+
+ INIT_WORK(&rb->work, rb_free_work);
+
+ all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
+ if (!all_buf)
+ goto fail_all_buf;
+
+ rb->user_page = all_buf;
+ rb->data_pages[0] = all_buf + PAGE_SIZE;
+ rb->page_order = ilog2(nr_pages);
+ rb->nr_pages = 1;
+
+ ring_buffer_init(rb, watermark, flags);
+
+ return rb;
+
+fail_all_buf:
+ kfree(rb);
+
+fail:
+ return NULL;
+}
+
+#endif
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 7d02d33be699..42ddbc6f0de6 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -113,8 +113,10 @@ static int snapshot_open(struct inode *inode, struct file *filp)
if (error)
pm_notifier_call_chain(PM_POST_RESTORE);
}
- if (error)
+ if (error) {
+ free_basic_memory_bitmaps();
atomic_inc(&snapshot_device_available);
+ }
data->frozen = 0;
data->ready = 0;
data->platform_support = 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index 3f2e502d609b..d08d110b8976 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2220,7 +2220,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
if (task_cpu(p) != new_cpu) {
p->se.nr_migrations++;
- perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
+ perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
}
__set_task_cpu(p, new_cpu);
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index eb212f8f8bc8..d20c6983aad9 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -26,12 +26,18 @@ void print_stack_trace(struct stack_trace *trace, int spaces)
EXPORT_SYMBOL_GPL(print_stack_trace);
/*
- * Architectures that do not implement save_stack_trace_tsk get this
- * weak alias and a once-per-bootup warning (whenever this facility
- * is utilized - for example by procfs):
+ * Architectures that do not implement save_stack_trace_tsk or
+ * save_stack_trace_regs get this weak alias and a once-per-bootup warning
+ * (whenever this facility is utilized - for example by procfs):
*/
__weak void
save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
{
WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n");
}
+
+__weak void
+save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
+{
+ WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n");
+}
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 9ffea360a778..fc0f22005417 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -285,16 +285,18 @@ ret:
static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
{
struct listener_list *listeners;
- struct listener *s, *tmp;
+ struct listener *s, *tmp, *s2;
unsigned int cpu;
if (!cpumask_subset(mask, cpu_possible_mask))
return -EINVAL;
+ s = NULL;
if (isadd == REGISTER) {
for_each_cpu(cpu, mask) {
- s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
- cpu_to_node(cpu));
+ if (!s)
+ s = kmalloc_node(sizeof(struct listener),
+ GFP_KERNEL, cpu_to_node(cpu));
if (!s)
goto cleanup;
s->pid = pid;
@@ -303,9 +305,16 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
listeners = &per_cpu(listener_array, cpu);
down_write(&listeners->sem);
+ list_for_each_entry_safe(s2, tmp, &listeners->list, list) {
+ if (s2->pid == pid)
+ goto next_cpu;
+ }
list_add(&s->list, &listeners->list);
+ s = NULL;
+next_cpu:
up_write(&listeners->sem);
}
+ kfree(s);
return 0;
}
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 2d966244ea60..59f369f98a04 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -42,15 +42,75 @@ static struct alarm_base {
clockid_t base_clockid;
} alarm_bases[ALARM_NUMTYPE];
+/* freezer delta & lock used to handle clock_nanosleep triggered wakeups */
+static ktime_t freezer_delta;
+static DEFINE_SPINLOCK(freezer_delta_lock);
+
#ifdef CONFIG_RTC_CLASS
/* rtc timer and device for setting alarm wakeups at suspend */
static struct rtc_timer rtctimer;
static struct rtc_device *rtcdev;
-#endif
+static DEFINE_SPINLOCK(rtcdev_lock);
-/* freezer delta & lock used to handle clock_nanosleep triggered wakeups */
-static ktime_t freezer_delta;
-static DEFINE_SPINLOCK(freezer_delta_lock);
+/**
+ * has_wakealarm - check rtc device has wakealarm ability
+ * @dev: current device
+ * @name_ptr: name to be returned
+ *
+ * This helper function checks to see if the rtc device can wake
+ * from suspend.
+ */
+static int has_wakealarm(struct device *dev, void *name_ptr)
+{
+ struct rtc_device *candidate = to_rtc_device(dev);
+
+ if (!candidate->ops->set_alarm)
+ return 0;
+ if (!device_may_wakeup(candidate->dev.parent))
+ return 0;
+
+ *(const char **)name_ptr = dev_name(dev);
+ return 1;
+}
+
+/**
+ * alarmtimer_get_rtcdev - Return selected rtcdevice
+ *
+ * This function returns the rtc device to use for wakealarms.
+ * If one has not already been chosen, it checks to see if a
+ * functional rtc device is available.
+ */
+static struct rtc_device *alarmtimer_get_rtcdev(void)
+{
+ struct device *dev;
+ char *str;
+ unsigned long flags;
+ struct rtc_device *ret;
+
+ spin_lock_irqsave(&rtcdev_lock, flags);
+ if (!rtcdev) {
+ /* Find an rtc device and init the rtc_timer */
+ dev = class_find_device(rtc_class, NULL, &str, has_wakealarm);
+ /* If we have a device then str is valid. See has_wakealarm() */
+ if (dev) {
+ rtcdev = rtc_class_open(str);
+ /*
+ * Drop the reference we got in class_find_device,
+ * rtc_open takes its own.
+ */
+ put_device(dev);
+ rtc_timer_init(&rtctimer, NULL, NULL);
+ }
+ }
+ ret = rtcdev;
+ spin_unlock_irqrestore(&rtcdev_lock, flags);
+
+ return ret;
+}
+#else
+#define alarmtimer_get_rtcdev() (0)
+#define rtcdev (0)
+#endif
/**
@@ -166,6 +226,7 @@ static int alarmtimer_suspend(struct device *dev)
struct rtc_time tm;
ktime_t min, now;
unsigned long flags;
+ struct rtc_device *rtc;
int i;
spin_lock_irqsave(&freezer_delta_lock, flags);
@@ -173,8 +234,9 @@ static int alarmtimer_suspend(struct device *dev)
freezer_delta = ktime_set(0, 0);
spin_unlock_irqrestore(&freezer_delta_lock, flags);
+ rtc = rtcdev;
/* If we have no rtcdev, just return */
- if (!rtcdev)
+ if (!rtc)
return 0;
/* Find the soonest timer to expire*/
@@ -199,12 +261,12 @@ static int alarmtimer_suspend(struct device *dev)
WARN_ON(min.tv64 < NSEC_PER_SEC);
/* Setup an rtc timer to fire that far in the future */
- rtc_timer_cancel(rtcdev, &rtctimer);
- rtc_read_time(rtcdev, &tm);
+ rtc_timer_cancel(rtc, &rtctimer);
+ rtc_read_time(rtc, &tm);
now = rtc_tm_to_ktime(tm);
now = ktime_add(now, min);
- rtc_timer_start(rtcdev, &rtctimer, now, ktime_set(0, 0));
+ rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0));
return 0;
}
@@ -322,6 +384,9 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
{
clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
+ if (!alarmtimer_get_rtcdev())
+ return -ENOTSUPP;
+
return hrtimer_get_res(baseid, tp);
}
@@ -336,6 +401,9 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
{
struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
+ if (!alarmtimer_get_rtcdev())
+ return -ENOTSUPP;
+
*tp = ktime_to_timespec(base->gettime());
return 0;
}
@@ -351,6 +419,9 @@ static int alarm_timer_create(struct k_itimer *new_timer)
enum alarmtimer_type type;
struct alarm_base *base;
+ if (!alarmtimer_get_rtcdev())
+ return -ENOTSUPP;
+
if (!capable(CAP_WAKE_ALARM))
return -EPERM;
@@ -385,6 +456,9 @@ static void alarm_timer_get(struct k_itimer *timr,
*/
static int alarm_timer_del(struct k_itimer *timr)
{
+ if (!rtcdev)
+ return -ENOTSUPP;
+
alarm_cancel(&timr->it.alarmtimer);
return 0;
}
@@ -402,6 +476,9 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
struct itimerspec *new_setting,
struct itimerspec *old_setting)
{
+ if (!rtcdev)
+ return -ENOTSUPP;
+
/* Save old values */
old_setting->it_interval =
ktime_to_timespec(timr->it.alarmtimer.period);
@@ -541,6 +618,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
int ret = 0;
struct restart_block *restart;
+ if (!alarmtimer_get_rtcdev())
+ return -ENOTSUPP;
+
if (!capable(CAP_WAKE_ALARM))
return -EPERM;
@@ -638,65 +718,3 @@ static int __init alarmtimer_init(void)
}
device_initcall(alarmtimer_init);
-#ifdef CONFIG_RTC_CLASS
-/**
- * has_wakealarm - check rtc device has wakealarm ability
- * @dev: current device
- * @name_ptr: name to be returned
- *
- * This helper function checks to see if the rtc device can wake
- * from suspend.
- */
-static int __init has_wakealarm(struct device *dev, void *name_ptr)
-{
- struct rtc_device *candidate = to_rtc_device(dev);
-
- if (!candidate->ops->set_alarm)
- return 0;
- if (!device_may_wakeup(candidate->dev.parent))
- return 0;
-
- *(const char **)name_ptr = dev_name(dev);
- return 1;
-}
-
-/**
- * alarmtimer_init_late - Late initializing of alarmtimer code
- *
- * This function locates a rtc device to use for wakealarms.
- * Run as late_initcall to make sure rtc devices have been
- * registered.
- */
-static int __init alarmtimer_init_late(void)
-{
- struct device *dev;
- char *str;
-
- /* Find an rtc device and init the rtc_timer */
- dev = class_find_device(rtc_class, NULL, &str, has_wakealarm);
- /* If we have a device then str is valid. See has_wakealarm() */
- if (dev) {
- rtcdev = rtc_class_open(str);
- /*
- * Drop the reference we got in class_find_device,
- * rtc_open takes its own.
- */
- put_device(dev);
- }
- if (!rtcdev) {
- printk(KERN_WARNING "No RTC device found, ALARM timers will"
- " not wake from suspend");
- }
- rtc_timer_init(&rtctimer, NULL, NULL);
-
- return 0;
-}
-#else
-static int __init alarmtimer_init_late(void)
-{
- printk(KERN_WARNING "Kernel not built with RTC support, ALARM timers"
- " will not wake from suspend");
- return 0;
-}
-#endif
-late_initcall(alarmtimer_init_late);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ef9271b69b4f..a0e246e2cee3 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -32,7 +32,6 @@
#include <trace/events/sched.h>
-#include <asm/ftrace.h>
#include <asm/setup.h>
#include "trace_output.h"
@@ -82,8 +81,7 @@ static int ftrace_disabled __read_mostly;
static DEFINE_MUTEX(ftrace_lock);
-static struct ftrace_ops ftrace_list_end __read_mostly =
-{
+static struct ftrace_ops ftrace_list_end __read_mostly = {
.func = ftrace_stub,
};
@@ -785,8 +783,7 @@ static void unregister_ftrace_profiler(void)
unregister_ftrace_graph();
}
#else
-static struct ftrace_ops ftrace_profile_ops __read_mostly =
-{
+static struct ftrace_ops ftrace_profile_ops __read_mostly = {
.func = function_profile_call,
};
@@ -806,19 +803,10 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
unsigned long val;
- char buf[64]; /* big enough to hold a number */
int ret;
- if (cnt >= sizeof(buf))
- return -EINVAL;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
-
- buf[cnt] = 0;
-
- ret = strict_strtoul(buf, 10, &val);
- if (ret < 0)
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
return ret;
val = !!val;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b0c7aa407943..731201bf4acc 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -997,15 +997,21 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
unsigned nr_pages)
{
struct buffer_page *bpage, *tmp;
- unsigned long addr;
LIST_HEAD(pages);
unsigned i;
WARN_ON(!nr_pages);
for (i = 0; i < nr_pages; i++) {
+ struct page *page;
+ /*
+ * __GFP_NORETRY flag makes sure that the allocation fails
+ * gracefully without invoking oom-killer and the system is
+ * not destabilized.
+ */
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
- GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
+ GFP_KERNEL | __GFP_NORETRY,
+ cpu_to_node(cpu_buffer->cpu));
if (!bpage)
goto free_pages;
@@ -1013,10 +1019,11 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
list_add(&bpage->list, &pages);
- addr = __get_free_page(GFP_KERNEL);
- if (!addr)
+ page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu),
+ GFP_KERNEL | __GFP_NORETRY, 0);
+ if (!page)
goto free_pages;
- bpage->page = (void *)addr;
+ bpage->page = page_address(page);
rb_init_page(bpage->page);
}
@@ -1045,7 +1052,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct buffer_page *bpage;
- unsigned long addr;
+ struct page *page;
int ret;
cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
@@ -1067,10 +1074,10 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
rb_check_bpage(cpu_buffer, bpage);
cpu_buffer->reader_page = bpage;
- addr = __get_free_page(GFP_KERNEL);
- if (!addr)
+ page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0);
+ if (!page)
goto fail_free_reader;
- bpage->page = (void *)addr;
+ bpage->page = page_address(page);
rb_init_page(bpage->page);
INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
@@ -1314,7 +1321,6 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
unsigned nr_pages, rm_pages, new_pages;
struct buffer_page *bpage, *tmp;
unsigned long buffer_size;
- unsigned long addr;
LIST_HEAD(pages);
int i, cpu;
@@ -1375,16 +1381,24 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
for_each_buffer_cpu(buffer, cpu) {
for (i = 0; i < new_pages; i++) {
+ struct page *page;
+ /*
+ * __GFP_NORETRY flag makes sure that the allocation
+ * fails gracefully without invoking oom-killer and
+ * the system is not destabilized.
+ */
bpage = kzalloc_node(ALIGN(sizeof(*bpage),
cache_line_size()),
- GFP_KERNEL, cpu_to_node(cpu));
+ GFP_KERNEL | __GFP_NORETRY,
+ cpu_to_node(cpu));
if (!bpage)
goto free_pages;
list_add(&bpage->list, &pages);
- addr = __get_free_page(GFP_KERNEL);
- if (!addr)
+ page = alloc_pages_node(cpu_to_node(cpu),
+ GFP_KERNEL | __GFP_NORETRY, 0);
+ if (!page)
goto free_pages;
- bpage->page = (void *)addr;
+ bpage->page = page_address(page);
rb_init_page(bpage->page);
}
}
@@ -3730,16 +3744,17 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
* Returns:
* The page allocated, or NULL on error.
*/
-void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
+void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
{
struct buffer_data_page *bpage;
- unsigned long addr;
+ struct page *page;
- addr = __get_free_page(GFP_KERNEL);
- if (!addr)
+ page = alloc_pages_node(cpu_to_node(cpu),
+ GFP_KERNEL | __GFP_NORETRY, 0);
+ if (!page)
return NULL;
- bpage = (void *)addr;
+ bpage = page_address(page);
rb_init_page(bpage);
@@ -3978,20 +3993,11 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
unsigned long *p = filp->private_data;
- char buf[64];
unsigned long val;
int ret;
- if (cnt >= sizeof(buf))
- return -EINVAL;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
-
- buf[cnt] = 0;
-
- ret = strict_strtoul(buf, 10, &val);
- if (ret < 0)
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
return ret;
if (val)
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 302f8a614635..a5457d577b98 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -106,7 +106,7 @@ static enum event_status read_page(int cpu)
int inc;
int i;
- bpage = ring_buffer_alloc_read_page(buffer);
+ bpage = ring_buffer_alloc_read_page(buffer, cpu);
if (!bpage)
return EVENT_DROPPED;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ee9c921d7f21..d9c16123f6e2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -343,26 +343,27 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
static int trace_stop_count;
static DEFINE_SPINLOCK(tracing_start_lock);
+static void wakeup_work_handler(struct work_struct *work)
+{
+ wake_up(&trace_wait);
+}
+
+static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler);
+
/**
* trace_wake_up - wake up tasks waiting for trace input
*
- * Simply wakes up any task that is blocked on the trace_wait
- * queue. These is used with trace_poll for tasks polling the trace.
+ * Schedules a delayed work to wake up any task that is blocked on the
+ * trace_wait queue. These is used with trace_poll for tasks polling the
+ * trace.
*/
void trace_wake_up(void)
{
- int cpu;
+ const unsigned long delay = msecs_to_jiffies(2);
if (trace_flags & TRACE_ITER_BLOCK)
return;
- /*
- * The runqueue_is_locked() can fail, but this is the best we
- * have for now:
- */
- cpu = get_cpu();
- if (!runqueue_is_locked(cpu))
- wake_up(&trace_wait);
- put_cpu();
+ schedule_delayed_work(&wakeup_work, delay);
}
static int __init set_buf_size(char *str)
@@ -424,6 +425,7 @@ static const char *trace_options[] = {
"graph-time",
"record-cmd",
"overwrite",
+ "disable_on_free",
NULL
};
@@ -1191,6 +1193,18 @@ void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer,
}
EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
+void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags, int pc,
+ struct pt_regs *regs)
+{
+ ring_buffer_unlock_commit(buffer, event);
+
+ ftrace_trace_stack_regs(buffer, flags, 0, pc, regs);
+ ftrace_trace_userstack(buffer, flags, pc);
+}
+EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit_regs);
+
void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
struct ring_buffer_event *event)
{
@@ -1236,7 +1250,7 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
#ifdef CONFIG_STACKTRACE
static void __ftrace_trace_stack(struct ring_buffer *buffer,
unsigned long flags,
- int skip, int pc)
+ int skip, int pc, struct pt_regs *regs)
{
struct ftrace_event_call *call = &event_kernel_stack;
struct ring_buffer_event *event;
@@ -1255,24 +1269,36 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
trace.skip = skip;
trace.entries = entry->caller;
- save_stack_trace(&trace);
+ if (regs)
+ save_stack_trace_regs(regs, &trace);
+ else
+ save_stack_trace(&trace);
if (!filter_check_discard(call, entry, buffer, event))
ring_buffer_unlock_commit(buffer, event);
}
+void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags,
+ int skip, int pc, struct pt_regs *regs)
+{
+ if (!(trace_flags & TRACE_ITER_STACKTRACE))
+ return;
+
+ __ftrace_trace_stack(buffer, flags, skip, pc, regs);
+}
+
void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
int skip, int pc)
{
if (!(trace_flags & TRACE_ITER_STACKTRACE))
return;
- __ftrace_trace_stack(buffer, flags, skip, pc);
+ __ftrace_trace_stack(buffer, flags, skip, pc, NULL);
}
void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
int pc)
{
- __ftrace_trace_stack(tr->buffer, flags, skip, pc);
+ __ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL);
}
/**
@@ -1288,7 +1314,7 @@ void trace_dump_stack(void)
local_save_flags(flags);
/* skipping 3 traces, seems to get us at the caller of this function */
- __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count());
+ __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL);
}
static DEFINE_PER_CPU(int, user_stack_count);
@@ -2051,6 +2077,9 @@ void trace_default_header(struct seq_file *m)
{
struct trace_iterator *iter = m->private;
+ if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
+ return;
+
if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
/* print nothing if the buffers are empty */
if (trace_empty(iter))
@@ -2701,20 +2730,11 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
struct trace_array *tr = filp->private_data;
- char buf[64];
unsigned long val;
int ret;
- if (cnt >= sizeof(buf))
- return -EINVAL;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
-
- buf[cnt] = 0;
-
- ret = strict_strtoul(buf, 10, &val);
- if (ret < 0)
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
return ret;
val = !!val;
@@ -2767,7 +2787,7 @@ int tracer_init(struct tracer *t, struct trace_array *tr)
return t->init(tr);
}
-static int tracing_resize_ring_buffer(unsigned long size)
+static int __tracing_resize_ring_buffer(unsigned long size)
{
int ret;
@@ -2819,6 +2839,41 @@ static int tracing_resize_ring_buffer(unsigned long size)
return ret;
}
+static ssize_t tracing_resize_ring_buffer(unsigned long size)
+{
+ int cpu, ret = size;
+
+ mutex_lock(&trace_types_lock);
+
+ tracing_stop();
+
+ /* disable all cpu buffers */
+ for_each_tracing_cpu(cpu) {
+ if (global_trace.data[cpu])
+ atomic_inc(&global_trace.data[cpu]->disabled);
+ if (max_tr.data[cpu])
+ atomic_inc(&max_tr.data[cpu]->disabled);
+ }
+
+ if (size != global_trace.entries)
+ ret = __tracing_resize_ring_buffer(size);
+
+ if (ret < 0)
+ ret = -ENOMEM;
+
+ for_each_tracing_cpu(cpu) {
+ if (global_trace.data[cpu])
+ atomic_dec(&global_trace.data[cpu]->disabled);
+ if (max_tr.data[cpu])
+ atomic_dec(&max_tr.data[cpu]->disabled);
+ }
+
+ tracing_start();
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
+}
+
/**
* tracing_update_buffers - used by tracing facility to expand ring buffers
@@ -2836,7 +2891,7 @@ int tracing_update_buffers(void)
mutex_lock(&trace_types_lock);
if (!ring_buffer_expanded)
- ret = tracing_resize_ring_buffer(trace_buf_size);
+ ret = __tracing_resize_ring_buffer(trace_buf_size);
mutex_unlock(&trace_types_lock);
return ret;
@@ -2860,7 +2915,7 @@ static int tracing_set_tracer(const char *buf)
mutex_lock(&trace_types_lock);
if (!ring_buffer_expanded) {
- ret = tracing_resize_ring_buffer(trace_buf_size);
+ ret = __tracing_resize_ring_buffer(trace_buf_size);
if (ret < 0)
goto out;
ret = 0;
@@ -2966,20 +3021,11 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
unsigned long *ptr = filp->private_data;
- char buf[64];
unsigned long val;
int ret;
- if (cnt >= sizeof(buf))
- return -EINVAL;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
-
- buf[cnt] = 0;
-
- ret = strict_strtoul(buf, 10, &val);
- if (ret < 0)
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
return ret;
*ptr = val * 1000;
@@ -3434,67 +3480,54 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
unsigned long val;
- char buf[64];
- int ret, cpu;
-
- if (cnt >= sizeof(buf))
- return -EINVAL;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
-
- buf[cnt] = 0;
+ int ret;
- ret = strict_strtoul(buf, 10, &val);
- if (ret < 0)
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
return ret;
/* must have at least 1 entry */
if (!val)
return -EINVAL;
- mutex_lock(&trace_types_lock);
-
- tracing_stop();
-
- /* disable all cpu buffers */
- for_each_tracing_cpu(cpu) {
- if (global_trace.data[cpu])
- atomic_inc(&global_trace.data[cpu]->disabled);
- if (max_tr.data[cpu])
- atomic_inc(&max_tr.data[cpu]->disabled);
- }
-
/* value is in KB */
val <<= 10;
- if (val != global_trace.entries) {
- ret = tracing_resize_ring_buffer(val);
- if (ret < 0) {
- cnt = ret;
- goto out;
- }
- }
+ ret = tracing_resize_ring_buffer(val);
+ if (ret < 0)
+ return ret;
*ppos += cnt;
- /* If check pages failed, return ENOMEM */
- if (tracing_disabled)
- cnt = -ENOMEM;
- out:
- for_each_tracing_cpu(cpu) {
- if (global_trace.data[cpu])
- atomic_dec(&global_trace.data[cpu]->disabled);
- if (max_tr.data[cpu])
- atomic_dec(&max_tr.data[cpu]->disabled);
- }
+ return cnt;
+}
- tracing_start();
- mutex_unlock(&trace_types_lock);
+static ssize_t
+tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ /*
+ * There is no need to read what the user has written, this function
+ * is just to make sure that there is no error when "echo" is used
+ */
+
+ *ppos += cnt;
return cnt;
}
+static int
+tracing_free_buffer_release(struct inode *inode, struct file *filp)
+{
+ /* disable tracing ? */
+ if (trace_flags & TRACE_ITER_STOP_ON_FREE)
+ tracing_off();
+ /* resize the ring buffer to 0 */
+ tracing_resize_ring_buffer(0);
+
+ return 0;
+}
+
static int mark_printk(const char *fmt, ...)
{
int ret;
@@ -3640,6 +3673,11 @@ static const struct file_operations tracing_entries_fops = {
.llseek = generic_file_llseek,
};
+static const struct file_operations tracing_free_buffer_fops = {
+ .write = tracing_free_buffer_write,
+ .release = tracing_free_buffer_release,
+};
+
static const struct file_operations tracing_mark_fops = {
.open = tracing_open_generic,
.write = tracing_mark_write,
@@ -3696,7 +3734,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
return 0;
if (!info->spare)
- info->spare = ring_buffer_alloc_read_page(info->tr->buffer);
+ info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu);
if (!info->spare)
return -ENOMEM;
@@ -3853,7 +3891,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
ref->ref = 1;
ref->buffer = info->tr->buffer;
- ref->page = ring_buffer_alloc_read_page(ref->buffer);
+ ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu);
if (!ref->page) {
kfree(ref);
break;
@@ -3862,8 +3900,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
r = ring_buffer_read_page(ref->buffer, &ref->page,
len, info->cpu, 1);
if (r < 0) {
- ring_buffer_free_read_page(ref->buffer,
- ref->page);
+ ring_buffer_free_read_page(ref->buffer, ref->page);
kfree(ref);
break;
}
@@ -4099,19 +4136,10 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
{
struct trace_option_dentry *topt = filp->private_data;
unsigned long val;
- char buf[64];
int ret;
- if (cnt >= sizeof(buf))
- return -EINVAL;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
-
- buf[cnt] = 0;
-
- ret = strict_strtoul(buf, 10, &val);
- if (ret < 0)
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
return ret;
if (val != 0 && val != 1)
@@ -4159,20 +4187,11 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
long index = (long)filp->private_data;
- char buf[64];
unsigned long val;
int ret;
- if (cnt >= sizeof(buf))
- return -EINVAL;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
-
- buf[cnt] = 0;
-
- ret = strict_strtoul(buf, 10, &val);
- if (ret < 0)
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
return ret;
if (val != 0 && val != 1)
@@ -4365,6 +4384,9 @@ static __init int tracer_init_debugfs(void)
trace_create_file("buffer_size_kb", 0644, d_tracer,
&global_trace, &tracing_entries_fops);
+ trace_create_file("free_buffer", 0644, d_tracer,
+ &global_trace, &tracing_free_buffer_fops);
+
trace_create_file("trace_marker", 0220, d_tracer,
NULL, &tracing_mark_fops);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f8074072d111..30a94c26dcb3 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -389,6 +389,9 @@ void update_max_tr_single(struct trace_array *tr,
void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
int skip, int pc);
+void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags,
+ int skip, int pc, struct pt_regs *regs);
+
void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
int pc);
@@ -400,6 +403,12 @@ static inline void ftrace_trace_stack(struct ring_buffer *buffer,
{
}
+static inline void ftrace_trace_stack_regs(struct ring_buffer *buffer,
+ unsigned long flags, int skip,
+ int pc, struct pt_regs *regs)
+{
+}
+
static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
unsigned long flags, int pc)
{
@@ -609,6 +618,7 @@ enum trace_iterator_flags {
TRACE_ITER_GRAPH_TIME = 0x80000,
TRACE_ITER_RECORD_CMD = 0x100000,
TRACE_ITER_OVERWRITE = 0x200000,
+ TRACE_ITER_STOP_ON_FREE = 0x400000,
};
/*
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3e2a7c91c548..581876f9f387 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -515,20 +515,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
struct ftrace_event_call *call = filp->private_data;
- char buf[64];
unsigned long val;
int ret;
- if (cnt >= sizeof(buf))
- return -EINVAL;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
-
- buf[cnt] = 0;
-
- ret = strict_strtoul(buf, 10, &val);
- if (ret < 0)
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
return ret;
ret = tracing_update_buffers();
@@ -601,19 +592,10 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
struct event_subsystem *system = filp->private_data;
const char *name = NULL;
unsigned long val;
- char buf[64];
ssize_t ret;
- if (cnt >= sizeof(buf))
- return -EINVAL;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
-
- buf[cnt] = 0;
-
- ret = strict_strtoul(buf, 10, &val);
- if (ret < 0)
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
return ret;
ret = tracing_update_buffers();
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 962cdb24ed81..e8d6bb55d719 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -74,6 +74,20 @@ static struct tracer_flags tracer_flags = {
static struct trace_array *graph_array;
+/*
+ * DURATION column is being also used to display IRQ signs,
+ * following values are used by print_graph_irq and others
+ * to fill in space into DURATION column.
+ */
+enum {
+ DURATION_FILL_FULL = -1,
+ DURATION_FILL_START = -2,
+ DURATION_FILL_END = -3,
+};
+
+static enum print_line_t
+print_graph_duration(unsigned long long duration, struct trace_seq *s,
+ u32 flags);
/* Add a function return address to the trace stack on thread info.*/
int
@@ -577,32 +591,6 @@ get_return_for_leaf(struct trace_iterator *iter,
return next;
}
-/* Signal a overhead of time execution to the output */
-static int
-print_graph_overhead(unsigned long long duration, struct trace_seq *s,
- u32 flags)
-{
- /* If duration disappear, we don't need anything */
- if (!(flags & TRACE_GRAPH_PRINT_DURATION))
- return 1;
-
- /* Non nested entry or return */
- if (duration == -1)
- return trace_seq_printf(s, " ");
-
- if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
- /* Duration exceeded 100 msecs */
- if (duration > 100000ULL)
- return trace_seq_printf(s, "! ");
-
- /* Duration exceeded 10 msecs */
- if (duration > 10000ULL)
- return trace_seq_printf(s, "+ ");
- }
-
- return trace_seq_printf(s, " ");
-}
-
static int print_graph_abs_time(u64 t, struct trace_seq *s)
{
unsigned long usecs_rem;
@@ -625,34 +613,36 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
addr >= (unsigned long)__irqentry_text_end)
return TRACE_TYPE_UNHANDLED;
- /* Absolute time */
- if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
- ret = print_graph_abs_time(iter->ts, s);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
+ /* Absolute time */
+ if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
+ ret = print_graph_abs_time(iter->ts, s);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
- /* Cpu */
- if (flags & TRACE_GRAPH_PRINT_CPU) {
- ret = print_graph_cpu(s, cpu);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ /* Cpu */
+ if (flags & TRACE_GRAPH_PRINT_CPU) {
+ ret = print_graph_cpu(s, cpu);
+ if (ret == TRACE_TYPE_PARTIAL_LINE)
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
- /* Proc */
- if (flags & TRACE_GRAPH_PRINT_PROC) {
- ret = print_graph_proc(s, pid);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
- ret = trace_seq_printf(s, " | ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ /* Proc */
+ if (flags & TRACE_GRAPH_PRINT_PROC) {
+ ret = print_graph_proc(s, pid);
+ if (ret == TRACE_TYPE_PARTIAL_LINE)
+ return TRACE_TYPE_PARTIAL_LINE;
+ ret = trace_seq_printf(s, " | ");
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+ }
}
/* No overhead */
- ret = print_graph_overhead(-1, s, flags);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
+ ret = print_graph_duration(DURATION_FILL_START, s, flags);
+ if (ret != TRACE_TYPE_HANDLED)
+ return ret;
if (type == TRACE_GRAPH_ENT)
ret = trace_seq_printf(s, "==========>");
@@ -662,9 +652,10 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
- /* Don't close the duration column if haven't one */
- if (flags & TRACE_GRAPH_PRINT_DURATION)
- trace_seq_printf(s, " |");
+ ret = print_graph_duration(DURATION_FILL_END, s, flags);
+ if (ret != TRACE_TYPE_HANDLED)
+ return ret;
+
ret = trace_seq_printf(s, "\n");
if (!ret)
@@ -716,9 +707,49 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
}
static enum print_line_t
-print_graph_duration(unsigned long long duration, struct trace_seq *s)
+print_graph_duration(unsigned long long duration, struct trace_seq *s,
+ u32 flags)
{
- int ret;
+ int ret = -1;
+
+ if (!(flags & TRACE_GRAPH_PRINT_DURATION) ||
+ !(trace_flags & TRACE_ITER_CONTEXT_INFO))
+ return TRACE_TYPE_HANDLED;
+
+ /* No real adata, just filling the column with spaces */
+ switch (duration) {
+ case DURATION_FILL_FULL:
+ ret = trace_seq_printf(s, " | ");
+ return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+ case DURATION_FILL_START:
+ ret = trace_seq_printf(s, " ");
+ return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+ case DURATION_FILL_END:
+ ret = trace_seq_printf(s, " |");
+ return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+ }
+
+ /* Signal a overhead of time execution to the output */
+ if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
+ /* Duration exceeded 100 msecs */
+ if (duration > 100000ULL)
+ ret = trace_seq_printf(s, "! ");
+ /* Duration exceeded 10 msecs */
+ else if (duration > 10000ULL)
+ ret = trace_seq_printf(s, "+ ");
+ }
+
+ /*
+ * The -1 means we either did not exceed the duration tresholds
+ * or we dont want to print out the overhead. Either way we need
+ * to fill out the space.
+ */
+ if (ret == -1)
+ ret = trace_seq_printf(s, " ");
+
+ /* Catching here any failure happenned above */
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
ret = trace_print_graph_duration(duration, s);
if (ret != TRACE_TYPE_HANDLED)
@@ -767,18 +798,11 @@ print_graph_entry_leaf(struct trace_iterator *iter,
cpu_data->enter_funcs[call->depth] = 0;
}
- /* Overhead */
- ret = print_graph_overhead(duration, s, flags);
- if (!ret)
+ /* Overhead and duration */
+ ret = print_graph_duration(duration, s, flags);
+ if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
- /* Duration */
- if (flags & TRACE_GRAPH_PRINT_DURATION) {
- ret = print_graph_duration(duration, s);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
- }
-
/* Function */
for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
ret = trace_seq_printf(s, " ");
@@ -815,17 +839,10 @@ print_graph_entry_nested(struct trace_iterator *iter,
cpu_data->enter_funcs[call->depth] = call->func;
}
- /* No overhead */
- ret = print_graph_overhead(-1, s, flags);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
/* No time */
- if (flags & TRACE_GRAPH_PRINT_DURATION) {
- ret = trace_seq_printf(s, " | ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ ret = print_graph_duration(DURATION_FILL_FULL, s, flags);
+ if (ret != TRACE_TYPE_HANDLED)
+ return ret;
/* Function */
for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
@@ -865,6 +882,9 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
return TRACE_TYPE_PARTIAL_LINE;
}
+ if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
+ return 0;
+
/* Absolute time */
if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
ret = print_graph_abs_time(iter->ts, s);
@@ -1078,18 +1098,11 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
if (print_graph_prologue(iter, s, 0, 0, flags))
return TRACE_TYPE_PARTIAL_LINE;
- /* Overhead */
- ret = print_graph_overhead(duration, s, flags);
- if (!ret)
+ /* Overhead and duration */
+ ret = print_graph_duration(duration, s, flags);
+ if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
- /* Duration */
- if (flags & TRACE_GRAPH_PRINT_DURATION) {
- ret = print_graph_duration(duration, s);
- if (ret == TRACE_TYPE_PARTIAL_LINE)
- return TRACE_TYPE_PARTIAL_LINE;
- }
-
/* Closing brace */
for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
ret = trace_seq_printf(s, " ");
@@ -1146,17 +1159,10 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
if (print_graph_prologue(iter, s, 0, 0, flags))
return TRACE_TYPE_PARTIAL_LINE;
- /* No overhead */
- ret = print_graph_overhead(-1, s, flags);
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
-
/* No time */
- if (flags & TRACE_GRAPH_PRINT_DURATION) {
- ret = trace_seq_printf(s, " | ");
- if (!ret)
- return TRACE_TYPE_PARTIAL_LINE;
- }
+ ret = print_graph_duration(DURATION_FILL_FULL, s, flags);
+ if (ret != TRACE_TYPE_HANDLED)
+ return ret;
/* Indentation */
if (depth > 0)
@@ -1207,7 +1213,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
enum print_line_t
-__print_graph_function_flags(struct trace_iterator *iter, u32 flags)
+print_graph_function_flags(struct trace_iterator *iter, u32 flags)
{
struct ftrace_graph_ent_entry *field;
struct fgraph_data *data = iter->private;
@@ -1270,18 +1276,7 @@ __print_graph_function_flags(struct trace_iterator *iter, u32 flags)
static enum print_line_t
print_graph_function(struct trace_iterator *iter)
{
- return __print_graph_function_flags(iter, tracer_flags.val);
-}
-
-enum print_line_t print_graph_function_flags(struct trace_iterator *iter,
- u32 flags)
-{
- if (trace_flags & TRACE_ITER_LATENCY_FMT)
- flags |= TRACE_GRAPH_PRINT_DURATION;
- else
- flags |= TRACE_GRAPH_PRINT_ABS_TIME;
-
- return __print_graph_function_flags(iter, flags);
+ return print_graph_function_flags(iter, tracer_flags.val);
}
static enum print_line_t
@@ -1309,8 +1304,7 @@ static void print_lat_header(struct seq_file *s, u32 flags)
seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces);
seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces);
seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces);
- seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces);
- seq_printf(s, "#%.*s|||| / \n", size, spaces);
+ seq_printf(s, "#%.*s||| / \n", size, spaces);
}
static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
@@ -1329,7 +1323,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
if (flags & TRACE_GRAPH_PRINT_PROC)
seq_printf(s, " TASK/PID ");
if (lat)
- seq_printf(s, "|||||");
+ seq_printf(s, "||||");
if (flags & TRACE_GRAPH_PRINT_DURATION)
seq_printf(s, " DURATION ");
seq_printf(s, " FUNCTION CALLS\n");
@@ -1343,7 +1337,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
if (flags & TRACE_GRAPH_PRINT_PROC)
seq_printf(s, " | | ");
if (lat)
- seq_printf(s, "|||||");
+ seq_printf(s, "||||");
if (flags & TRACE_GRAPH_PRINT_DURATION)
seq_printf(s, " | | ");
seq_printf(s, " | | | |\n");
@@ -1358,15 +1352,16 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags)
{
struct trace_iterator *iter = s->private;
+ if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
+ return;
+
if (trace_flags & TRACE_ITER_LATENCY_FMT) {
/* print nothing if the buffers are empty */
if (trace_empty(iter))
return;
print_trace_header(s, iter);
- flags |= TRACE_GRAPH_PRINT_DURATION;
- } else
- flags |= TRACE_GRAPH_PRINT_ABS_TIME;
+ }
__print_graph_headers_flags(s, flags);
}
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index c77424be284d..667aa8cc0cfc 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -226,7 +226,9 @@ static void irqsoff_trace_close(struct trace_iterator *iter)
}
#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \
- TRACE_GRAPH_PRINT_PROC)
+ TRACE_GRAPH_PRINT_PROC | \
+ TRACE_GRAPH_PRINT_ABS_TIME | \
+ TRACE_GRAPH_PRINT_DURATION)
static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
{
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 27d13b36b8be..7db7b68c6c37 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1397,7 +1397,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
if (!filter_current_check_discard(buffer, call, entry, event))
- trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
+ trace_nowake_buffer_unlock_commit_regs(buffer, event,
+ irq_flags, pc, regs);
}
/* Kretprobe handler */
@@ -1429,7 +1430,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
if (!filter_current_check_discard(buffer, call, entry, event))
- trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
+ trace_nowake_buffer_unlock_commit_regs(buffer, event,
+ irq_flags, pc, regs);
}
/* Event entry printers */
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index f029dd4fd2ca..e4a70c0c71b6 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -227,7 +227,9 @@ static void wakeup_trace_close(struct trace_iterator *iter)
graph_trace_close(iter);
}
-#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC)
+#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC | \
+ TRACE_GRAPH_PRINT_ABS_TIME | \
+ TRACE_GRAPH_PRINT_DURATION)
static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
{
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index b0b53b8e4c25..77575b386d97 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -156,20 +156,11 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
{
long *ptr = filp->private_data;
unsigned long val, flags;
- char buf[64];
int ret;
int cpu;
- if (count >= sizeof(buf))
- return -EINVAL;
-
- if (copy_from_user(&buf, ubuf, count))
- return -EFAULT;
-
- buf[count] = 0;
-
- ret = strict_strtoul(buf, 10, &val);
- if (ret < 0)
+ ret = kstrtoul_from_user(ubuf, count, 10, &val);
+ if (ret)
return ret;
local_irq_save(flags);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 3d0c56ad4792..a933e3a0398b 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -200,6 +200,8 @@ static int is_softlockup(unsigned long touch_ts)
}
#ifdef CONFIG_HARDLOCKUP_DETECTOR
+void __weak hw_nmi_watchdog_set_attr(struct perf_event_attr *wd_attr) { }
+
static struct perf_event_attr wd_hw_attr = {
.type = PERF_TYPE_HARDWARE,
.config = PERF_COUNT_HW_CPU_CYCLES,
@@ -209,7 +211,7 @@ static struct perf_event_attr wd_hw_attr = {
};
/* Callback function for perf event subsystem */
-static void watchdog_overflow_callback(struct perf_event *event, int nmi,
+static void watchdog_overflow_callback(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs)
{
@@ -368,10 +370,12 @@ static int watchdog_nmi_enable(int cpu)
if (event != NULL)
goto out_enable;
- /* Try to register using hardware perf events */
wd_attr = &wd_hw_attr;
wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
- event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
+ hw_nmi_watchdog_set_attr(wd_attr);
+
+ /* Try to register using hardware perf events */
+ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
if (!IS_ERR(event)) {
printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
goto out_save;