From 71a851b4d2a815adcfac09c1adda7ef6811fde66 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 10 Jul 2009 09:06:56 +0200
Subject: perf_counter: Stop open coding unclone_ctx

Instead of open coding the unclone context thingy, put it in
a common function.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 36 +++++++++++++++++-------------------
 1 file changed, 17 insertions(+), 19 deletions(-)

(limited to 'kernel/perf_counter.c')
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d55a50da2347..8bf997d86bf4 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -146,6 +146,14 @@ static void put_ctx(struct perf_counter_context *ctx)
 	}
 }
 
+static void unclone_ctx(struct perf_counter_context *ctx)
+{
+	if (ctx->parent_ctx) {
+		put_ctx(ctx->parent_ctx);
+		ctx->parent_ctx = NULL;
+	}
+}
+
 /*
  * Get the perf_counter_context for a task and lock it.
  * This has to cope with with the fact that until it is locked,
@@ -1463,10 +1471,8 @@ static void perf_counter_enable_on_exec(struct task_struct *task)
 	/*
 	 * Unclone this context if we enabled any counter.
 	 */
-	if (enabled && ctx->parent_ctx) {
-		put_ctx(ctx->parent_ctx);
-		ctx->parent_ctx = NULL;
-	}
+	if (enabled)
+		unclone_ctx(ctx);
 
 	spin_unlock(&ctx->lock);
 
@@ -1526,7 +1532,6 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
 
 static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 {
-	struct perf_counter_context *parent_ctx;
 	struct perf_counter_context *ctx;
 	struct perf_cpu_context *cpuctx;
 	struct task_struct *task;
@@ -1586,11 +1591,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
  retry:
 	ctx = perf_lock_task_context(task, &flags);
 	if (ctx) {
-		parent_ctx = ctx->parent_ctx;
-		if (parent_ctx) {
-			put_ctx(parent_ctx);
-			ctx->parent_ctx = NULL;		/* no longer a clone */
-		}
+		unclone_ctx(ctx);
 		spin_unlock_irqrestore(&ctx->lock, flags);
 	}
 
@@ -4255,15 +4256,12 @@ void perf_counter_exit_task(struct task_struct *child)
 	 */
 	spin_lock(&child_ctx->lock);
 	child->perf_counter_ctxp = NULL;
-	if (child_ctx->parent_ctx) {
-		/*
-		 * This context is a clone; unclone it so it can't get
-		 * swapped to another process while we're removing all
-		 * the counters from it.
-		 */
-		put_ctx(child_ctx->parent_ctx);
-		child_ctx->parent_ctx = NULL;
-	}
+	/*
+	 * If this context is a clone; unclone it so it can't get
+	 * swapped to another process while we're removing all
+	 * the counters from it.
+	 */
+	unclone_ctx(child_ctx);
 	spin_unlock(&child_ctx->lock);
 	local_irq_restore(flags);
 
-- 
cgit v1.2.3


From d4d7d0b9545721d3cabb19d15163bbc66b797707 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Mon, 6 Jul 2009 09:31:33 +0100
Subject: perf_counter: Fix the tracepoint channel to perfcounters

Fix a missed rename in EVENT_PROFILE support so that it gets
built and allows tracepoint tracing from the 'perf' tool.

Fix a typo in the (never before built & enabled) portion in
perf_counter.c as well, and update that code to the
attr.config changes as well.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Ben Gamari <bgamari.foss@gmail.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1246869094-21237-1-git-send-email-chris@chris-wilson.co.uk>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'kernel/perf_counter.c')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d55a50da2347..c6c38fb7766a 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3671,7 +3671,7 @@ static const struct pmu perf_ops_task_clock = {
 void perf_tpcounter_event(int event_id)
 {
 	struct perf_sample_data data = {
-		.regs = get_irq_regs();
+		.regs = get_irq_regs(),
 		.addr = 0,
 	};
 
@@ -3687,16 +3687,12 @@ extern void ftrace_profile_disable(int);
 
 static void tp_perf_counter_destroy(struct perf_counter *counter)
 {
-	ftrace_profile_disable(perf_event_id(&counter->attr));
+	ftrace_profile_disable(counter->attr.config);
 }
 
 static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
 {
-	int event_id = perf_event_id(&counter->attr);
-	int ret;
-
-	ret = ftrace_profile_enable(event_id);
-	if (ret)
+	if (ftrace_profile_enable(counter->attr.config))
 		return NULL;
 
 	counter->destroy = tp_perf_counter_destroy;
-- 
cgit v1.2.3


From 413ee3b48ab582ffea33e7e140c7a2c5ea657e9a Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 16 Jul 2009 15:15:52 +0200
Subject: perf_counter: Make sure we dont leak kernel memory to userspace

There are a few places we are leaking tiny amounts of kernel
memory to userspace. This happens when writing out strings
because we always align the end to 64 bits.

To avoid this we should always use an appropriately sized
temporary buffer and ensure it is zeroed.

Since d_path assembles the string from the end of the buffer
backwards, we need to add 64 bits after the buffer to allow for
alignment.

We also need to copy arch_vma_name to the temporary buffer,
because if we use it directly we may end up copying to
userspace a number of bytes after the end of the string
constant.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090716104817.273972048@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'kernel/perf_counter.c')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index c6c38fb7766a..f7a8ab9576e4 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2968,8 +2968,10 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event)
 	struct perf_cpu_context *cpuctx;
 	struct perf_counter_context *ctx;
 	unsigned int size;
-	char *comm = comm_event->task->comm;
+	char comm[TASK_COMM_LEN];
 
+	memset(comm, 0, sizeof(comm));
+	strncpy(comm, comm_event->task->comm, sizeof(comm));
 	size = ALIGN(strlen(comm)+1, sizeof(u64));
 
 	comm_event->comm = comm;
@@ -3088,8 +3090,15 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
 	char *buf = NULL;
 	const char *name;
 
+	memset(tmp, 0, sizeof(tmp));
+
 	if (file) {
-		buf = kzalloc(PATH_MAX, GFP_KERNEL);
+		/*
+		 * d_path works from the end of the buffer backwards, so we
+		 * need to add enough zero bytes after the string to handle
+		 * the 64bit alignment we do later.
+		 */
+		buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
 		if (!buf) {
 			name = strncpy(tmp, "//enomem", sizeof(tmp));
 			goto got_name;
@@ -3100,9 +3109,11 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
 			goto got_name;
 		}
 	} else {
-		name = arch_vma_name(mmap_event->vma);
-		if (name)
+		if (arch_vma_name(mmap_event->vma)) {
+			name = strncpy(tmp, arch_vma_name(mmap_event->vma),
+				       sizeof(tmp));
 			goto got_name;
+		}
 
 		if (!vma->vm_mm) {
 			name = strncpy(tmp, "[vdso]", sizeof(tmp));
-- 
cgit v1.2.3


From c9f73a3dd27e03411f18a58c0814d51392d2b17a Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Tue, 21 Jul 2009 00:55:05 -0700
Subject: perf: Fix stack data leak

the "reserved" field was not initialized to zero, resulting in 4 bytes
of stack data leaking to userspace....

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/perf_counter.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/perf_counter.c')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 5c6fae4f43d8..ff854fd89a81 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2666,6 +2666,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		header.size += sizeof(cpu_entry);
 
 		cpu_entry.cpu = raw_smp_processor_id();
+		cpu_entry.reserved = 0;
 	}
 
 	if (sample_type & PERF_SAMPLE_PERIOD)
-- 
cgit v1.2.3


From 573402db02746179b3f95f83a11a787501f52d0a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 22 Jul 2009 11:13:50 +0200
Subject: perf_counter: Plug more stack leaks

Per example of Arjan's patch, I went through and found a few more.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/perf_counter.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

(limited to 'kernel/perf_counter.c')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index ff854fd89a81..e1d6a3aa1333 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2897,8 +2897,11 @@ void perf_counter_fork(struct task_struct *task)
 		.event  = {
 			.header = {
 				.type = PERF_EVENT_FORK,
+				.misc = 0,
 				.size = sizeof(fork_event.event),
 			},
+			/* .pid  */
+			/* .ppid */
 		},
 	};
 
@@ -3008,8 +3011,16 @@ void perf_counter_comm(struct task_struct *task)
 
 	comm_event = (struct perf_comm_event){
 		.task	= task,
+		/* .comm      */
+		/* .comm_size */
 		.event  = {
-			.header = { .type = PERF_EVENT_COMM, },
+			.header = {
+				.type = PERF_EVENT_COMM,
+				.misc = 0,
+				/* .size */
+			},
+			/* .pid */
+			/* .tid */
 		},
 	};
 
@@ -3160,8 +3171,16 @@ void __perf_counter_mmap(struct vm_area_struct *vma)
 
 	mmap_event = (struct perf_mmap_event){
 		.vma	= vma,
+		/* .file_name */
+		/* .file_size */
 		.event  = {
-			.header = { .type = PERF_EVENT_MMAP, },
+			.header = {
+				.type = PERF_EVENT_MMAP,
+				.misc = 0,
+				/* .size */
+			},
+			/* .pid */
+			/* .tid */
 			.start  = vma->vm_start,
 			.len    = vma->vm_end - vma->vm_start,
 			.pgoff  = vma->vm_pgoff,
-- 
cgit v1.2.3


From 7f453c24b95a085fc7bd35d53b33abc4dc5a048b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 21 Jul 2009 13:19:40 +0200
Subject: perf_counter: PERF_SAMPLE_ID and inherited counters

Anton noted that for inherited counters the counter-id as provided by
PERF_SAMPLE_ID isn't mappable to the id found through PERF_RECORD_ID
because each inherited counter gets its own id.

His suggestion was to always return the parent counter id, since that
is the primary counter id as exposed. However, these inherited
counters have a unique identifier so that events like
PERF_EVENT_PERIOD and PERF_EVENT_THROTTLE can be specific about which
counter gets modified, which is important when trying to normalize the
sample streams.

This patch removes PERF_EVENT_PERIOD in favour of PERF_SAMPLE_PERIOD,
which is more useful anyway, since changing periods became a lot more
common than initially thought -- rendering PERF_EVENT_PERIOD the less
useful solution (also, PERF_SAMPLE_PERIOD reports the more accurate
value, since it reports the value used to trigger the overflow,
whereas PERF_EVENT_PERIOD simply reports the requested period changed,
which might only take effect on the next cycle).

This still leaves us PERF_EVENT_THROTTLE to consider, but since that
_should_ be a rare occurrence, and linking it to a primary id is the
most useful bit to diagnose the problem, we introduce a
PERF_SAMPLE_STREAM_ID, for those few cases where the full
reconstruction is important.

[Does change the ABI a little, but I see no other way out]

Suggested-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1248095846.15751.8781.camel@twins>
---
 kernel/perf_counter.c | 92 +++++++++++++++++----------------------------------
 1 file changed, 31 insertions(+), 61 deletions(-)

(limited to 'kernel/perf_counter.c')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index e1d6a3aa1333..7530588fa5c5 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -154,6 +154,20 @@ static void unclone_ctx(struct perf_counter_context *ctx)
 	}
 }
 
+/*
+ * If we inherit counters we want to return the parent counter id
+ * to userspace.
+ */
+static u64 primary_counter_id(struct perf_counter *counter)
+{
+	u64 id = counter->id;
+
+	if (counter->parent)
+		id = counter->parent->id;
+
+	return id;
+}
+
 /*
  * Get the perf_counter_context for a task and lock it.
  * This has to cope with with the fact that until it is locked,
@@ -1296,7 +1310,6 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
 #define MAX_INTERRUPTS (~0ULL)
 
 static void perf_log_throttle(struct perf_counter *counter, int enable);
-static void perf_log_period(struct perf_counter *counter, u64 period);
 
 static void perf_adjust_period(struct perf_counter *counter, u64 events)
 {
@@ -1315,8 +1328,6 @@ static void perf_adjust_period(struct perf_counter *counter, u64 events)
 	if (!sample_period)
 		sample_period = 1;
 
-	perf_log_period(counter, sample_period);
-
 	hwc->sample_period = sample_period;
 }
 
@@ -1705,7 +1716,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 		values[n++] = counter->total_time_running +
 			atomic64_read(&counter->child_total_time_running);
 	if (counter->attr.read_format & PERF_FORMAT_ID)
-		values[n++] = counter->id;
+		values[n++] = primary_counter_id(counter);
 	mutex_unlock(&counter->child_mutex);
 
 	if (count < n * sizeof(u64))
@@ -1812,8 +1823,6 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
 
 		counter->attr.sample_freq = value;
 	} else {
-		perf_log_period(counter, value);
-
 		counter->attr.sample_period = value;
 		counter->hw.sample_period = value;
 	}
@@ -2662,6 +2671,9 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 	if (sample_type & PERF_SAMPLE_ID)
 		header.size += sizeof(u64);
 
+	if (sample_type & PERF_SAMPLE_STREAM_ID)
+		header.size += sizeof(u64);
+
 	if (sample_type & PERF_SAMPLE_CPU) {
 		header.size += sizeof(cpu_entry);
 
@@ -2705,7 +2717,13 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 	if (sample_type & PERF_SAMPLE_ADDR)
 		perf_output_put(&handle, data->addr);
 
-	if (sample_type & PERF_SAMPLE_ID)
+	if (sample_type & PERF_SAMPLE_ID) {
+		u64 id = primary_counter_id(counter);
+
+		perf_output_put(&handle, id);
+	}
+
+	if (sample_type & PERF_SAMPLE_STREAM_ID)
 		perf_output_put(&handle, counter->id);
 
 	if (sample_type & PERF_SAMPLE_CPU)
@@ -2728,7 +2746,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 			if (sub != counter)
 				sub->pmu->read(sub);
 
-			group_entry.id = sub->id;
+			group_entry.id = primary_counter_id(sub);
 			group_entry.counter = atomic64_read(&sub->count);
 
 			perf_output_put(&handle, group_entry);
@@ -2788,15 +2806,8 @@ perf_counter_read_event(struct perf_counter *counter,
 	}
 
 	if (counter->attr.read_format & PERF_FORMAT_ID) {
-		u64 id;
-
 		event.header.size += sizeof(u64);
-		if (counter->parent)
-			id = counter->parent->id;
-		else
-			id = counter->id;
-
-		event.format[i++] = id;
+		event.format[i++] = primary_counter_id(counter);
 	}
 
 	ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
@@ -3190,49 +3201,6 @@ void __perf_counter_mmap(struct vm_area_struct *vma)
 	perf_counter_mmap_event(&mmap_event);
 }
 
-/*
- * Log sample_period changes so that analyzing tools can re-normalize the
- * event flow.
- */
-
-struct freq_event {
-	struct perf_event_header	header;
-	u64				time;
-	u64				id;
-	u64				period;
-};
-
-static void perf_log_period(struct perf_counter *counter, u64 period)
-{
-	struct perf_output_handle handle;
-	struct freq_event event;
-	int ret;
-
-	if (counter->hw.sample_period == period)
-		return;
-
-	if (counter->attr.sample_type & PERF_SAMPLE_PERIOD)
-		return;
-
-	event = (struct freq_event) {
-		.header = {
-			.type = PERF_EVENT_PERIOD,
-			.misc = 0,
-			.size = sizeof(event),
-		},
-		.time = sched_clock(),
-		.id = counter->id,
-		.period = period,
-	};
-
-	ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0);
-	if (ret)
-		return;
-
-	perf_output_put(&handle, event);
-	perf_output_end(&handle);
-}
-
 /*
  * IRQ throttle logging
  */
@@ -3246,14 +3214,16 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
 		struct perf_event_header	header;
 		u64				time;
 		u64				id;
+		u64				stream_id;
 	} throttle_event = {
 		.header = {
 			.type = PERF_EVENT_THROTTLE + 1,
 			.misc = 0,
 			.size = sizeof(throttle_event),
 		},
-		.time	= sched_clock(),
-		.id	= counter->id,
+		.time		= sched_clock(),
+		.id		= primary_counter_id(counter),
+		.stream_id	= counter->id,
 	};
 
 	ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
-- 
cgit v1.2.3


From 966ee4d6b887c14159043ac80b8c3661d2bbe5e2 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Wed, 22 Jul 2009 23:05:46 +1000
Subject: perf_counter: Fix throttle/unthrottle event logging

Right now we only print PERF_EVENT_THROTTLE + 1 (ie PERF_EVENT_UNTHROTTLE).
Fix this to print both a throttle and unthrottle event.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090722130546.GE9029@kryten>
---
 kernel/perf_counter.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel/perf_counter.c')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 7530588fa5c5..787d4daef185 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3217,7 +3217,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
 		u64				stream_id;
 	} throttle_event = {
 		.header = {
-			.type = PERF_EVENT_THROTTLE + 1,
+			.type = PERF_EVENT_THROTTLE,
 			.misc = 0,
 			.size = sizeof(throttle_event),
 		},
@@ -3226,6 +3226,9 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
 		.stream_id	= counter->id,
 	};
 
+	if (enable)
+		throttle_event.header.type = PERF_EVENT_UNTHROTTLE;
+
 	ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
 	if (ret)
 		return;
-- 
cgit v1.2.3


From 0dc3d523e8bc4718e0be2e4a742367d6e4be77cd Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Tue, 21 Jul 2009 00:55:05 -0700
Subject: perf: fix stack data leak

the "reserved" field was not initialized to zero, resulting in 4 bytes
of stack data leaking to userspace....

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/perf_counter.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/perf_counter.c')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index a641eb753b8c..7bc888dfd06a 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2665,6 +2665,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		header.size += sizeof(cpu_entry);
 
 		cpu_entry.cpu = raw_smp_processor_id();
+		cpu_entry.reserved = 0;
 	}
 
 	if (sample_type & PERF_SAMPLE_PERIOD)
-- 
cgit v1.2.3


From e53c0994709166b111fbe9162d1a16ece7dfc45b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 24 Jul 2009 14:42:10 +0200
Subject: perf_counter: Collapse inherit on read()

Currently the counter value returned by read() is the value of
the parent counter, to which child counters are only fed back
on child exit.

Thus read() can return rather erratic (and meaningless) numbers
depending on the state of the child processes.

Change this by always iterating the full child hierarchy on
read() and sum all counters.

Suggested-by: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'kernel/perf_counter.c')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 950931041954..48471d75ae01 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1688,6 +1688,18 @@ static int perf_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static u64 perf_counter_read_tree(struct perf_counter *counter)
+{
+	struct perf_counter *child;
+	u64 total = 0;
+
+	total += perf_counter_read(counter);
+	list_for_each_entry(child, &counter->child_list, child_list)
+		total += perf_counter_read(child);
+
+	return total;
+}
+
 /*
  * Read the performance counter - simple non blocking version for now
  */
@@ -1707,7 +1719,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 
 	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	mutex_lock(&counter->child_mutex);
-	values[0] = perf_counter_read(counter);
+	values[0] = perf_counter_read_tree(counter);
 	n = 1;
 	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
 		values[n++] = counter->total_time_enabled +
-- 
cgit v1.2.3


From 9f498cc5be7e013d8d6e4c616980ed0ffc8680d2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 23 Jul 2009 14:46:33 +0200
Subject: perf_counter: Full task tracing

In order to be able to distinguish between no samples due to
inactivity and no samples due to task ended, Arjan asked for
PERF_EVENT_EXIT events. This is useful to the boot delay
instrumentation (bootchart) app.

This patch changes the PERF_EVENT_FORK to be emitted on every
clone, and adds PERF_EVENT_EXIT to be emitted on task exit,
after the task's counters have been closed.

This task tracing is controlled through: attr.comm || attr.mmap
and through the new attr.task field.

Suggested-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Anton Blanchard <anton@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
[ cleaned up perf_counter.h a bit ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 87 ++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 58 insertions(+), 29 deletions(-)

(limited to 'kernel/perf_counter.c')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 48471d75ae01..199ed4771315 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -42,6 +42,7 @@ static int perf_overcommit __read_mostly = 1;
 static atomic_t nr_counters __read_mostly;
 static atomic_t nr_mmap_counters __read_mostly;
 static atomic_t nr_comm_counters __read_mostly;
+static atomic_t nr_task_counters __read_mostly;
 
 /*
  * perf counter paranoia level:
@@ -1654,6 +1655,8 @@ static void free_counter(struct perf_counter *counter)
 			atomic_dec(&nr_mmap_counters);
 		if (counter->attr.comm)
 			atomic_dec(&nr_comm_counters);
+		if (counter->attr.task)
+			atomic_dec(&nr_task_counters);
 	}
 
 	if (counter->destroy)
@@ -2831,10 +2834,12 @@ perf_counter_read_event(struct perf_counter *counter,
 }
 
 /*
- * fork tracking
+ * task tracking -- fork/exit
+ *
+ * enabled by: attr.comm | attr.mmap | attr.task
  */
 
-struct perf_fork_event {
+struct perf_task_event {
 	struct task_struct	*task;
 
 	struct {
@@ -2842,37 +2847,42 @@ struct perf_fork_event {
 
 		u32				pid;
 		u32				ppid;
+		u32				tid;
+		u32				ptid;
 	} event;
 };
 
-static void perf_counter_fork_output(struct perf_counter *counter,
-				     struct perf_fork_event *fork_event)
+static void perf_counter_task_output(struct perf_counter *counter,
+				     struct perf_task_event *task_event)
 {
 	struct perf_output_handle handle;
-	int size = fork_event->event.header.size;
-	struct task_struct *task = fork_event->task;
+	int size = task_event->event.header.size;
+	struct task_struct *task = task_event->task;
 	int ret = perf_output_begin(&handle, counter, size, 0, 0);
 
 	if (ret)
 		return;
 
-	fork_event->event.pid = perf_counter_pid(counter, task);
-	fork_event->event.ppid = perf_counter_pid(counter, task->real_parent);
+	task_event->event.pid = perf_counter_pid(counter, task);
+	task_event->event.ppid = perf_counter_pid(counter, task->real_parent);
 
-	perf_output_put(&handle, fork_event->event);
+	task_event->event.tid = perf_counter_tid(counter, task);
+	task_event->event.ptid = perf_counter_tid(counter, task->real_parent);
+
+	perf_output_put(&handle, task_event->event);
 	perf_output_end(&handle);
 }
 
-static int perf_counter_fork_match(struct perf_counter *counter)
+static int perf_counter_task_match(struct perf_counter *counter)
 {
-	if (counter->attr.comm || counter->attr.mmap)
+	if (counter->attr.comm || counter->attr.mmap || counter->attr.task)
 		return 1;
 
 	return 0;
 }
 
-static void perf_counter_fork_ctx(struct perf_counter_context *ctx,
-				  struct perf_fork_event *fork_event)
+static void perf_counter_task_ctx(struct perf_counter_context *ctx,
+				  struct perf_task_event *task_event)
 {
 	struct perf_counter *counter;
 
@@ -2881,19 +2891,19 @@ static void perf_counter_fork_ctx(struct perf_counter_context *ctx,
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
-		if (perf_counter_fork_match(counter))
-			perf_counter_fork_output(counter, fork_event);
+		if (perf_counter_task_match(counter))
+			perf_counter_task_output(counter, task_event);
 	}
 	rcu_read_unlock();
 }
 
-static void perf_counter_fork_event(struct perf_fork_event *fork_event)
+static void perf_counter_task_event(struct perf_task_event *task_event)
 {
 	struct perf_cpu_context *cpuctx;
 	struct perf_counter_context *ctx;
 
 	cpuctx = &get_cpu_var(perf_cpu_context);
-	perf_counter_fork_ctx(&cpuctx->ctx, fork_event);
+	perf_counter_task_ctx(&cpuctx->ctx, task_event);
 	put_cpu_var(perf_cpu_context);
 
 	rcu_read_lock();
@@ -2903,32 +2913,40 @@ static void perf_counter_fork_event(struct perf_fork_event *fork_event)
 	 */
 	ctx = rcu_dereference(current->perf_counter_ctxp);
 	if (ctx)
-		perf_counter_fork_ctx(ctx, fork_event);
+		perf_counter_task_ctx(ctx, task_event);
 	rcu_read_unlock();
 }
 
-void perf_counter_fork(struct task_struct *task)
+static void perf_counter_task(struct task_struct *task, int new)
 {
-	struct perf_fork_event fork_event;
+	struct perf_task_event task_event;
 
 	if (!atomic_read(&nr_comm_counters) &&
-	    !atomic_read(&nr_mmap_counters))
+	    !atomic_read(&nr_mmap_counters) &&
+	    !atomic_read(&nr_task_counters))
 		return;
 
-	fork_event = (struct perf_fork_event){
+	task_event = (struct perf_task_event){
 		.task	= task,
 		.event  = {
 			.header = {
-				.type = PERF_EVENT_FORK,
+				.type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
 				.misc = 0,
-				.size = sizeof(fork_event.event),
+				.size = sizeof(task_event.event),
 			},
 			/* .pid  */
 			/* .ppid */
+			/* .tid  */
+			/* .ptid */
 		},
 	};
 
-	perf_counter_fork_event(&fork_event);
+	perf_counter_task_event(&task_event);
+}
+
+void perf_counter_fork(struct task_struct *task)
+{
+	perf_counter_task(task, 1);
 }
 
 /*
@@ -3887,6 +3905,8 @@ done:
 			atomic_inc(&nr_mmap_counters);
 		if (counter->attr.comm)
 			atomic_inc(&nr_comm_counters);
+		if (counter->attr.task)
+			atomic_inc(&nr_task_counters);
 	}
 
 	return counter;
@@ -4248,8 +4268,10 @@ void perf_counter_exit_task(struct task_struct *child)
 	struct perf_counter_context *child_ctx;
 	unsigned long flags;
 
-	if (likely(!child->perf_counter_ctxp))
+	if (likely(!child->perf_counter_ctxp)) {
+		perf_counter_task(child, 0);
 		return;
+	}
 
 	local_irq_save(flags);
 	/*
@@ -4267,15 +4289,22 @@ void perf_counter_exit_task(struct task_struct *child)
 	 * incremented the context's refcount before we do put_ctx below.
 	 */
 	spin_lock(&child_ctx->lock);
-	child->perf_counter_ctxp = NULL;
 	/*
 	 * If this context is a clone; unclone it so it can't get
 	 * swapped to another process while we're removing all
 	 * the counters from it.
 	 */
 	unclone_ctx(child_ctx);
-	spin_unlock(&child_ctx->lock);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&child_ctx->lock, flags);
+
+	/*
+	 * Report the task dead after unscheduling the counters so that we
+	 * won't get any samples after PERF_EVENT_EXIT. We can however still
+	 * get a few PERF_EVENT_READ events.
+	 */
+	perf_counter_task(child, 0);
+
+	child->perf_counter_ctxp = NULL;
 
 	/*
 	 * We can recurse on the same lock type through:
-- 
cgit v1.2.3


From 1054598cab8674438675085fae459e960eb10799 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 6 Aug 2009 18:06:26 +0200
Subject: perf_counter: Fix double list iteration in per task precise stats

Brice Goglin reported this crash with per task precise stats:

> I finally managed to test the threaded perfcounter statistics (thanks a
> lot for implementing it). I am running 2.6.31-rc5 (with the AMD
> magny-cours patches but I don't think they matter here). I am trying to
> measure local/remote memory accesses per thread during the well-known
> stream benchmark. It's compiled with OpenMP using 16 threads on a
> quad-socket quad-core barcelona machine.
>
> Command line is:
>  /mnt/scratch/bgoglin/cpunode/linux-2.6.31/tools/perf/perf record -f -s
> -e r1000001e0 -e r1000002e0 -e r1000004e0 -e r1000008e0 ./stream
>
> It seems to work fine with a single -e <counter> on the command line
> while it crashes when there are at least 2 of them.
> It seems to work fine without -s as well.

A silly copy-paste resulted in a messed up iteration which would
cause the OOPS.

Reported-by: Brice Goglin <Brice.Goglin@inria.fr>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Tested-by: Brice Goglin <Brice.Goglin@inria.fr>
LKML-Reference: <1249574786.32113.550.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/perf_counter.c')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 199ed4771315..673c1aaf7332 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1104,7 +1104,7 @@ static void perf_counter_sync_stat(struct perf_counter_context *ctx,
 		__perf_counter_sync_stat(counter, next_counter);
 
 		counter = list_next_entry(counter, event_entry);
-		next_counter = list_next_entry(counter, event_entry);
+		next_counter = list_next_entry(next_counter, event_entry);
 	}
 }
 
-- 
cgit v1.2.3


From 3a6593050fbd8bbcaed3a44d01c31d907315c86c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 21 Jul 2009 17:34:57 +0200
Subject: perf_counter, ftrace: Fix perf_counter integration

Adds possible second part to the assign argument of TP_EVENT().

  TP_perf_assign(
	__perf_count(foo);
	__perf_addr(bar);
  )

Which, when specified make the swcounter increment with @foo instead
of the usual 1, and report @bar for PERF_SAMPLE_ADDR (data address
associated with the event) when this triggers a counter overflow.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel/perf_counter.c')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 673c1aaf7332..52eb4b68d34f 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3703,17 +3703,17 @@ static const struct pmu perf_ops_task_clock = {
 };
 
 #ifdef CONFIG_EVENT_PROFILE
-void perf_tpcounter_event(int event_id)
+void perf_tpcounter_event(int event_id, u64 addr, u64 count)
 {
 	struct perf_sample_data data = {
 		.regs = get_irq_regs(),
-		.addr = 0,
+		.addr = addr,
 	};
 
 	if (!data.regs)
 		data.regs = task_pt_regs(current);
 
-	do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data);
+	do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
 }
 EXPORT_SYMBOL_GPL(perf_tpcounter_event);
 
-- 
cgit v1.2.3


From f413cdb80ce00ec1a4d0ab949b5d96c81cae7f75 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 7 Aug 2009 01:25:54 +0200
Subject: perf_counter: Fix/complete ftrace event records sampling

This patch implements the kernel side support for ftrace event
record sampling.

A new counter sampling attribute is added:

   PERF_SAMPLE_TP_RECORD

which requests ftrace events record sampling. In this case
if a PERF_TYPE_TRACEPOINT counter is active and a tracepoint
fires, we emit the tracepoint binary record to the
perfcounter event buffer, as a sample.

Result, after setting PERF_SAMPLE_TP_RECORD attribute from perf
record:

 perf record -f -F 1 -a -e workqueue:workqueue_execution
 perf report -D

 0x21e18 [0x48]: event: 9
 .
 . ... raw event: size 72 bytes
 .  0000:  09 00 00 00 01 00 48 00 d0 c7 00 81 ff ff ff ff  ......H........
 .  0010:  0a 00 00 00 0a 00 00 00 21 00 00 00 00 00 00 00  ........!......
 .  0020:  2b 00 01 02 0a 00 00 00 0a 00 00 00 65 76 65 6e  +...........eve
 .  0030:  74 73 2f 31 00 00 00 00 00 00 00 00 0a 00 00 00  ts/1...........
 .  0040:  e0 b1 31 81 ff ff ff ff                          .......
.
0x21e18 [0x48]: PERF_EVENT_SAMPLE (IP, 1): 10: 0xffffffff8100c7d0 period: 33

The raw ftrace binary record starts at offset 0020.

Translation:

 struct trace_entry {
	type		= 0x2b = 43;
	flags		= 1;
	preempt_count	= 2;
	pid		= 0xa = 10;
	tgid		= 0xa = 10;
 }

 thread_comm = "events/1"
 thread_pid  = 0xa = 10;
 func	    = 0xffffffff8131b1e0 = flush_to_ldisc()

What will come next?

 - Userspace support ('perf trace'), 'flight data recorder' mode
   for perf trace, etc.

 - The unconditional copy from the profiling callback brings
   some costs however if someone wants no such sampling to
   occur, and needs to be fixed in the future. For that we need
   to have an instant access to the perf counter attribute.
   This is a matter of a flag to add in the struct ftrace_event.

 - Take care of the events recursivity! Don't ever try to record
   a lock event for example, it seems some locking is used in
   the profiling fast path and lead to a tracing recursivity.
   That will be fixed using raw spinlock or recursivity
   protection.

 - [...]

 - Profit! :-)

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'kernel/perf_counter.c')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 52eb4b68d34f..868102172aa4 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2646,6 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		u64 counter;
 	} group_entry;
 	struct perf_callchain_entry *callchain = NULL;
+	struct perf_tracepoint_record *tp;
 	int callchain_size = 0;
 	u64 time;
 	struct {
@@ -2714,6 +2715,11 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 			header.size += sizeof(u64);
 	}
 
+	if (sample_type & PERF_SAMPLE_TP_RECORD) {
+		tp = data->private;
+		header.size += tp->size;
+	}
+
 	ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
 	if (ret)
 		return;
@@ -2777,6 +2783,9 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		}
 	}
 
+	if (sample_type & PERF_SAMPLE_TP_RECORD)
+		perf_output_copy(&handle, tp->record, tp->size);
+
 	perf_output_end(&handle);
 }
 
@@ -3703,11 +3712,18 @@ static const struct pmu perf_ops_task_clock = {
 };
 
 #ifdef CONFIG_EVENT_PROFILE
-void perf_tpcounter_event(int event_id, u64 addr, u64 count)
+void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
+			  int entry_size)
 {
+	struct perf_tracepoint_record tp = {
+		.size = entry_size,
+		.record = record,
+	};
+
 	struct perf_sample_data data = {
 		.regs = get_irq_regs(),
 		.addr = addr,
+		.private = &tp,
 	};
 
 	if (!data.regs)
-- 
cgit v1.2.3


From 7b4b6658e152ed4568cfff48175d93645df081d1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 22 Jul 2009 09:29:32 +0200
Subject: perf_counter: Fix software counters for fast moving event sources

Reimplement the software counters to deal with fast moving
event sources (such as tracepoints). This means being able
to generate multiple overflows from a single 'event' as well
as support throttling.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 164 +++++++++++++++++++++++++++++---------------------
 1 file changed, 94 insertions(+), 70 deletions(-)

(limited to 'kernel/perf_counter.c')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 868102172aa4..615440ab9295 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3344,87 +3344,81 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,
  * Generic software counter infrastructure
  */
 
-static void perf_swcounter_update(struct perf_counter *counter)
+/*
+ * We directly increment counter->count and keep a second value in
+ * counter->hw.period_left to count intervals. This period counter
+ * is kept in the range [-sample_period, 0] so that we can use the
+ * sign as trigger.
+ */
+
+static u64 perf_swcounter_set_period(struct perf_counter *counter)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
-	u64 prev, now;
-	s64 delta;
+	u64 period = hwc->last_period;
+	u64 nr, offset;
+	s64 old, val;
+
+	hwc->last_period = hwc->sample_period;
 
 again:
-	prev = atomic64_read(&hwc->prev_count);
-	now = atomic64_read(&hwc->count);
-	if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
-		goto again;
+	old = val = atomic64_read(&hwc->period_left);
+	if (val < 0)
+		return 0;
 
-	delta = now - prev;
+	nr = div64_u64(period + val, period);
+	offset = nr * period;
+	val -= offset;
+	if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
+		goto again;
 
-	atomic64_add(delta, &counter->count);
-	atomic64_sub(delta, &hwc->period_left);
+	return nr;
 }
 
-static void perf_swcounter_set_period(struct perf_counter *counter)
+static void perf_swcounter_overflow(struct perf_counter *counter,
+				    int nmi, struct perf_sample_data *data)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
-	s64 left = atomic64_read(&hwc->period_left);
-	s64 period = hwc->sample_period;
+	u64 overflow;
 
-	if (unlikely(left <= -period)) {
-		left = period;
-		atomic64_set(&hwc->period_left, left);
-		hwc->last_period = period;
-	}
+	data->period = counter->hw.last_period;
+	overflow = perf_swcounter_set_period(counter);
 
-	if (unlikely(left <= 0)) {
-		left += period;
-		atomic64_add(period, &hwc->period_left);
-		hwc->last_period = period;
-	}
+	if (hwc->interrupts == MAX_INTERRUPTS)
+		return;
 
-	atomic64_set(&hwc->prev_count, -left);
-	atomic64_set(&hwc->count, -left);
+	for (; overflow; overflow--) {
+		if (perf_counter_overflow(counter, nmi, data)) {
+			/*
+			 * We inhibit the overflow from happening when
+			 * hwc->interrupts == MAX_INTERRUPTS.
+			 */
+			break;
+		}
+	}
 }
 
-static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
+static void perf_swcounter_unthrottle(struct perf_counter *counter)
 {
-	enum hrtimer_restart ret = HRTIMER_RESTART;
-	struct perf_sample_data data;
-	struct perf_counter *counter;
-	u64 period;
-
-	counter	= container_of(hrtimer, struct perf_counter, hw.hrtimer);
-	counter->pmu->read(counter);
-
-	data.addr = 0;
-	data.regs = get_irq_regs();
 	/*
-	 * In case we exclude kernel IPs or are somehow not in interrupt
-	 * context, provide the next best thing, the user IP.
+	 * Nothing to do, we already reset hwc->interrupts.
 	 */
-	if ((counter->attr.exclude_kernel || !data.regs) &&
-			!counter->attr.exclude_user)
-		data.regs = task_pt_regs(current);
+}
 
-	if (data.regs) {
-		if (perf_counter_overflow(counter, 0, &data))
-			ret = HRTIMER_NORESTART;
-	}
+static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
+			       int nmi, struct perf_sample_data *data)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
 
-	period = max_t(u64, 10000, counter->hw.sample_period);
-	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+	atomic64_add(nr, &counter->count);
 
-	return ret;
-}
+	if (!hwc->sample_period)
+		return;
 
-static void perf_swcounter_overflow(struct perf_counter *counter,
-				    int nmi, struct perf_sample_data *data)
-{
-	data->period = counter->hw.last_period;
+	if (!data->regs)
+		return;
 
-	perf_swcounter_update(counter);
-	perf_swcounter_set_period(counter);
-	if (perf_counter_overflow(counter, nmi, data))
-		/* soft-disable the counter */
-		;
+	if (!atomic64_add_negative(nr, &hwc->period_left))
+		perf_swcounter_overflow(counter, nmi, data);
 }
 
 static int perf_swcounter_is_counting(struct perf_counter *counter)
@@ -3488,15 +3482,6 @@ static int perf_swcounter_match(struct perf_counter *counter,
 	return 1;
 }
 
-static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
-			       int nmi, struct perf_sample_data *data)
-{
-	int neg = atomic64_add_negative(nr, &counter->hw.count);
-
-	if (counter->hw.sample_period && !neg && data->regs)
-		perf_swcounter_overflow(counter, nmi, data);
-}
-
 static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
 				     enum perf_type_id type,
 				     u32 event, u64 nr, int nmi,
@@ -3575,26 +3560,65 @@ void __perf_swcounter_event(u32 event, u64 nr, int nmi,
 
 static void perf_swcounter_read(struct perf_counter *counter)
 {
-	perf_swcounter_update(counter);
 }
 
 static int perf_swcounter_enable(struct perf_counter *counter)
 {
-	perf_swcounter_set_period(counter);
+	struct hw_perf_counter *hwc = &counter->hw;
+
+	if (hwc->sample_period) {
+		hwc->last_period = hwc->sample_period;
+		perf_swcounter_set_period(counter);
+	}
 	return 0;
 }
 
 static void perf_swcounter_disable(struct perf_counter *counter)
 {
-	perf_swcounter_update(counter);
 }
 
 static const struct pmu perf_ops_generic = {
 	.enable		= perf_swcounter_enable,
 	.disable	= perf_swcounter_disable,
 	.read		= perf_swcounter_read,
+	.unthrottle	= perf_swcounter_unthrottle,
 };
 
+/*
+ * hrtimer based swcounter callback
+ */
+
+static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
+{
+	enum hrtimer_restart ret = HRTIMER_RESTART;
+	struct perf_sample_data data;
+	struct perf_counter *counter;
+	u64 period;
+
+	counter	= container_of(hrtimer, struct perf_counter, hw.hrtimer);
+	counter->pmu->read(counter);
+
+	data.addr = 0;
+	data.regs = get_irq_regs();
+	/*
+	 * In case we exclude kernel IPs or are somehow not in interrupt
+	 * context, provide the next best thing, the user IP.
+	 */
+	if ((counter->attr.exclude_kernel || !data.regs) &&
+			!counter->attr.exclude_user)
+		data.regs = task_pt_regs(current);
+
+	if (data.regs) {
+		if (perf_counter_overflow(counter, 0, &data))
+			ret = HRTIMER_NORESTART;
+	}
+
+	period = max_t(u64, 10000, counter->hw.sample_period);
+	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+
+	return ret;
+}
+
 /*
  * Software counter: cpu wall time clock
  */
-- 
cgit v1.2.3


From 10b8e3066066708f304e0fc5cfe658e05abf943d Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 8 Aug 2009 04:26:35 +0200
Subject: perf_counter: Work around gcc warning by initializing tracepoint
 record unconditionally
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Despite that the tracepoint record is always present when the
PERF_SAMPLE_TP_RECORD flag is set, gcc raises a warning,
thinking it might not be initialized:

  kernel/perf_counter.c: In function ‘perf_counter_output’:
  kernel/perf_counter.c:2650: warning: ‘tp’ may be used uninitialized in this function

Then, initialize it to NULL and always check if it's not NULL
before dereference it.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1249698400-5441-2-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel/perf_counter.c')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 615440ab9295..117622cb73a3 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2646,7 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		u64 counter;
 	} group_entry;
 	struct perf_callchain_entry *callchain = NULL;
-	struct perf_tracepoint_record *tp;
+	struct perf_tracepoint_record *tp = NULL;
 	int callchain_size = 0;
 	u64 time;
 	struct {
@@ -2717,7 +2717,8 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 
 	if (sample_type & PERF_SAMPLE_TP_RECORD) {
 		tp = data->private;
-		header.size += tp->size;
+		if (tp)
+			header.size += tp->size;
 	}
 
 	ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
@@ -2783,7 +2784,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		}
 	}
 
-	if (sample_type & PERF_SAMPLE_TP_RECORD)
+	if ((sample_type & PERF_SAMPLE_TP_RECORD) && tp)
 		perf_output_copy(&handle, tp->record, tp->size);
 
 	perf_output_end(&handle);
-- 
cgit v1.2.3


From 3a43ce68ae1758fa6a839386025ef45acb6baa22 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 8 Aug 2009 04:26:37 +0200
Subject: perf_counter: Fix tracepoint sampling to be part of generic sampling

Based on Peter's comments, make tracepoint sampling generic
just like all the other sampling bits are. This is a rename
with no code changes:

- PERF_SAMPLE_TP_RECORD to PERF_SAMPLE_RAW
- struct perf_tracepoint_record to perf_raw_record

We want the system in place that transport tracepoints raw
samples events into the perf ring buffer to be generalized and
usable by any type of counter.

Reported-by; Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1249698400-5441-4-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel/perf_counter.c')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 117622cb73a3..002310540417 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2646,7 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		u64 counter;
 	} group_entry;
 	struct perf_callchain_entry *callchain = NULL;
-	struct perf_tracepoint_record *tp = NULL;
+	struct perf_raw_record *raw = NULL;
 	int callchain_size = 0;
 	u64 time;
 	struct {
@@ -2715,10 +2715,10 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 			header.size += sizeof(u64);
 	}
 
-	if (sample_type & PERF_SAMPLE_TP_RECORD) {
-		tp = data->private;
-		if (tp)
-			header.size += tp->size;
+	if (sample_type & PERF_SAMPLE_RAW) {
+		raw = data->raw;
+		if (raw)
+			header.size += raw->size;
 	}
 
 	ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
@@ -2784,8 +2784,8 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		}
 	}
 
-	if ((sample_type & PERF_SAMPLE_TP_RECORD) && tp)
-		perf_output_copy(&handle, tp->record, tp->size);
+	if ((sample_type & PERF_SAMPLE_RAW) && raw)
+		perf_output_copy(&handle, raw->data, raw->size);
 
 	perf_output_end(&handle);
 }
@@ -3740,15 +3740,15 @@ static const struct pmu perf_ops_task_clock = {
 void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
 			  int entry_size)
 {
-	struct perf_tracepoint_record tp = {
+	struct perf_raw_record raw = {
 		.size = entry_size,
-		.record = record,
+		.data = record,
 	};
 
 	struct perf_sample_data data = {
 		.regs = get_irq_regs(),
 		.addr = addr,
-		.private = &tp,
+		.raw = &raw,
 	};
 
 	if (!data.regs)
-- 
cgit v1.2.3


From 3a80b4a3539696f4b0574876326860323035a302 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 7 Aug 2009 19:49:01 +0200
Subject: perf_counter: Fix a race on perf_counter_ctx

While extending perfcounters with BTS hw-tracing, Markus
Metzger managed to trigger this warning:

   [  995.557128] WARNING: at kernel/perf_counter.c:1191 __perf_counter_task_sched_out+0x48/0x6b()

triggers because commit
9f498cc5be7e013d8d6e4c616980ed0ffc8680d2 (perf_counter: Full
task tracing) removed clearing of tsk->perf_counter_ctxp out
from under ctx->lock which introduced a race (against
perf_lock_task_context).

Move it back and deal with the exit notification by explicitly
passing along the former task context.

Reported-by: Markus T Metzger <markus.t.metzger@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1249667341.17467.5.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

(limited to 'kernel/perf_counter.c')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 002310540417..546e62d62941 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2850,7 +2850,8 @@ perf_counter_read_event(struct perf_counter *counter,
  */
 
 struct perf_task_event {
-	struct task_struct	*task;
+	struct task_struct		*task;
+	struct perf_counter_context	*task_ctx;
 
 	struct {
 		struct perf_event_header	header;
@@ -2910,24 +2911,23 @@ static void perf_counter_task_ctx(struct perf_counter_context *ctx,
 static void perf_counter_task_event(struct perf_task_event *task_event)
 {
 	struct perf_cpu_context *cpuctx;
-	struct perf_counter_context *ctx;
+	struct perf_counter_context *ctx = task_event->task_ctx;
 
 	cpuctx = &get_cpu_var(perf_cpu_context);
 	perf_counter_task_ctx(&cpuctx->ctx, task_event);
 	put_cpu_var(perf_cpu_context);
 
 	rcu_read_lock();
-	/*
-	 * doesn't really matter which of the child contexts the
-	 * events ends up in.
-	 */
-	ctx = rcu_dereference(current->perf_counter_ctxp);
+	if (!ctx)
+		ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
 	if (ctx)
 		perf_counter_task_ctx(ctx, task_event);
 	rcu_read_unlock();
 }
 
-static void perf_counter_task(struct task_struct *task, int new)
+static void perf_counter_task(struct task_struct *task,
+			      struct perf_counter_context *task_ctx,
+			      int new)
 {
 	struct perf_task_event task_event;
 
@@ -2937,8 +2937,9 @@ static void perf_counter_task(struct task_struct *task, int new)
 		return;
 
 	task_event = (struct perf_task_event){
-		.task	= task,
-		.event  = {
+		.task	  = task,
+		.task_ctx = task_ctx,
+		.event    = {
 			.header = {
 				.type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
 				.misc = 0,
@@ -2956,7 +2957,7 @@ static void perf_counter_task(struct task_struct *task, int new)
 
 void perf_counter_fork(struct task_struct *task)
 {
-	perf_counter_task(task, 1);
+	perf_counter_task(task, NULL, 1);
 }
 
 /*
@@ -4310,7 +4311,7 @@ void perf_counter_exit_task(struct task_struct *child)
 	unsigned long flags;
 
 	if (likely(!child->perf_counter_ctxp)) {
-		perf_counter_task(child, 0);
+		perf_counter_task(child, NULL, 0);
 		return;
 	}
 
@@ -4330,6 +4331,7 @@ void perf_counter_exit_task(struct task_struct *child)
 	 * incremented the context's refcount before we do put_ctx below.
 	 */
 	spin_lock(&child_ctx->lock);
+	child->perf_counter_ctxp = NULL;
 	/*
 	 * If this context is a clone; unclone it so it can't get
 	 * swapped to another process while we're removing all
@@ -4343,9 +4345,7 @@ void perf_counter_exit_task(struct task_struct *child)
 	 * won't get any samples after PERF_EVENT_EXIT. We can however still
 	 * get a few PERF_EVENT_READ events.
 	 */
-	perf_counter_task(child, 0);
-
-	child->perf_counter_ctxp = NULL;
+	perf_counter_task(child, child_ctx, 0);
 
 	/*
 	 * We can recurse on the same lock type through:
-- 
cgit v1.2.3


From a044560c3a1f0ad75ce685c1ed7604820b9ed319 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 10 Aug 2009 11:16:52 +0200
Subject: perf_counter: Correct PERF_SAMPLE_RAW output

PERF_SAMPLE_* output switches should unconditionally output the
correct format, as they are the only way to unambiguously parse
the PERF_EVENT_SAMPLE data.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1249896447.17467.74.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

(limited to 'kernel/perf_counter.c')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 546e62d62941..5229d1666fa5 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2646,7 +2646,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		u64 counter;
 	} group_entry;
 	struct perf_callchain_entry *callchain = NULL;
-	struct perf_raw_record *raw = NULL;
 	int callchain_size = 0;
 	u64 time;
 	struct {
@@ -2716,9 +2715,15 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 	}
 
 	if (sample_type & PERF_SAMPLE_RAW) {
-		raw = data->raw;
-		if (raw)
-			header.size += raw->size;
+		int size = sizeof(u32);
+
+		if (data->raw)
+			size += data->raw->size;
+		else
+			size += sizeof(u32);
+
+		WARN_ON_ONCE(size & (sizeof(u64)-1));
+		header.size += size;
 	}
 
 	ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
@@ -2784,8 +2789,21 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		}
 	}
 
-	if ((sample_type & PERF_SAMPLE_RAW) && raw)
-		perf_output_copy(&handle, raw->data, raw->size);
+	if (sample_type & PERF_SAMPLE_RAW) {
+		if (data->raw) {
+			perf_output_put(&handle, data->raw->size);
+			perf_output_copy(&handle, data->raw->data, data->raw->size);
+		} else {
+			struct {
+				u32	size;
+				u32	data;
+			} raw = {
+				.size = sizeof(u32),
+				.data = 0,
+			};
+			perf_output_put(&handle, raw);
+		}
+	}
 
 	perf_output_end(&handle);
 }
-- 
cgit v1.2.3


From a4e95fc2cbb31d70a65beffeaf8773f881328c34 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 10 Aug 2009 11:20:12 +0200
Subject: perf_counter: Require CAP_SYS_ADMIN for raw tracepoint data

Raw tracepoint data contains various kernel internals and
data from other users, so restrict this to CAP_SYS_ADMIN.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1249896452.17467.75.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel/perf_counter.c')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 5229d1666fa5..b0b20a07f394 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3787,6 +3787,14 @@ static void tp_perf_counter_destroy(struct perf_counter *counter)
 
 static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
 {
+	/*
+	 * Raw tracepoint data is a severe data leak, only allow root to
+	 * have these.
+	 */
+	if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
+			!capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
 	if (ftrace_profile_enable(counter->attr.config))
 		return NULL;
 
-- 
cgit v1.2.3


From 28402971d869e26271b25301011f667d3a5640c3 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 13 Aug 2009 10:13:22 +0200
Subject: perf_counter: Provide hw_perf_counter_setup_online() APIs

Provide weak aliases for hw_perf_counter_setup_online(). This is
used by the BTS patches (for v2.6.32), but it interacts with
fixes so propagate this upstream. (it has no effect as of yet)

Also export perf_counter_output() to architecture code.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'kernel/perf_counter.c')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index b0b20a07f394..e26d2fcfa320 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -88,6 +88,7 @@ void __weak hw_perf_disable(void)		{ barrier(); }
 void __weak hw_perf_enable(void)		{ barrier(); }
 
 void __weak hw_perf_counter_setup(int cpu)	{ barrier(); }
+void __weak hw_perf_counter_setup_online(int cpu)	{ barrier(); }
 
 int __weak
 hw_perf_group_sched_in(struct perf_counter *group_leader,
@@ -2630,7 +2631,7 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
 	return task_pid_nr_ns(p, counter->ns);
 }
 
-static void perf_counter_output(struct perf_counter *counter, int nmi,
+void perf_counter_output(struct perf_counter *counter, int nmi,
 				struct perf_sample_data *data)
 {
 	int ret;
@@ -4592,6 +4593,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 		perf_counter_init_cpu(cpu);
 		break;
 
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+		hw_perf_counter_setup_online(cpu);
+		break;
+
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		perf_counter_exit_cpu(cpu);
@@ -4616,6 +4622,8 @@ void __init perf_counter_init(void)
 {
 	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
 			(void *)(long)smp_processor_id());
+	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
+			(void *)(long)smp_processor_id());
 	register_cpu_notifier(&perf_cpu_nb);
 }
 
-- 
cgit v1.2.3


From bcfc2602e8541ac13b1def38e2591dca072cff7a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 13 Aug 2009 09:51:55 +0200
Subject: perf_counter: Fix swcounter context invariance

perf_swcounter_is_counting() uses a lock, which means we cannot
use swcounters from NMI or when holding that particular lock,
this is unintended.

The below removes the lock, this opens up race window, but not
worse than the swcounters already experience due to RCU
traversal of the context in perf_swcounter_ctx_event().

This also fixes the hard lockups while opening a lockdep
tracepoint counter.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Corey J Ashford <cjashfor@us.ibm.com>
LKML-Reference: <1250149915.10001.66.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 44 ++++++++++++++++++--------------------------
 1 file changed, 18 insertions(+), 26 deletions(-)

(limited to 'kernel/perf_counter.c')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index e26d2fcfa320..3dd4339589a0 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3444,40 +3444,32 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
 
 static int perf_swcounter_is_counting(struct perf_counter *counter)
 {
-	struct perf_counter_context *ctx;
-	unsigned long flags;
-	int count;
-
+	/*
+	 * The counter is active, we're good!
+	 */
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
 		return 1;
 
+	/*
+	 * The counter is off/error, not counting.
+	 */
 	if (counter->state != PERF_COUNTER_STATE_INACTIVE)
 		return 0;
 
 	/*
-	 * If the counter is inactive, it could be just because
-	 * its task is scheduled out, or because it's in a group
-	 * which could not go on the PMU.  We want to count in
-	 * the first case but not the second.  If the context is
-	 * currently active then an inactive software counter must
-	 * be the second case.  If it's not currently active then
-	 * we need to know whether the counter was active when the
-	 * context was last active, which we can determine by
-	 * comparing counter->tstamp_stopped with ctx->time.
-	 *
-	 * We are within an RCU read-side critical section,
-	 * which protects the existence of *ctx.
+	 * The counter is inactive, if the context is active
+	 * we're part of a group that didn't make it on the 'pmu',
+	 * not counting.
 	 */
-	ctx = counter->ctx;
-	spin_lock_irqsave(&ctx->lock, flags);
-	count = 1;
-	/* Re-check state now we have the lock */
-	if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
-	    counter->ctx->is_active ||
-	    counter->tstamp_stopped < ctx->time)
-		count = 0;
-	spin_unlock_irqrestore(&ctx->lock, flags);
-	return count;
+	if (counter->ctx->is_active)
+		return 0;
+
+	/*
+	 * We're inactive and the context is too, this means the
+	 * task is scheduled out, we're counting events that happen
+	 * to us, like migration events.
+	 */
+	return 1;
 }
 
 static int perf_swcounter_match(struct perf_counter *counter,
-- 
cgit v1.2.3


From 3dab77fb1bf89664bb1c9544607159dcab6f7d57 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 13 Aug 2009 11:47:53 +0200
Subject: perf: Rework/fix the whole read vs group stuff

Replace PERF_SAMPLE_GROUP with PERF_SAMPLE_READ and introduce
PERF_FORMAT_GROUP to deal with group reads in a more generic
way.

This allows you to get group reads out of read() as well.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey J Ashford <cjashfor@us.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
LKML-Reference: <20090813103655.117411814@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 274 +++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 202 insertions(+), 72 deletions(-)

(limited to 'kernel/perf_counter.c')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 3dd4339589a0..b8c6b97a20a3 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1692,7 +1692,32 @@ static int perf_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static u64 perf_counter_read_tree(struct perf_counter *counter)
+static int perf_counter_read_size(struct perf_counter *counter)
+{
+	int entry = sizeof(u64); /* value */
+	int size = 0;
+	int nr = 1;
+
+	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		size += sizeof(u64);
+
+	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		size += sizeof(u64);
+
+	if (counter->attr.read_format & PERF_FORMAT_ID)
+		entry += sizeof(u64);
+
+	if (counter->attr.read_format & PERF_FORMAT_GROUP) {
+		nr += counter->group_leader->nr_siblings;
+		size += sizeof(u64);
+	}
+
+	size += entry * nr;
+
+	return size;
+}
+
+static u64 perf_counter_read_value(struct perf_counter *counter)
 {
 	struct perf_counter *child;
 	u64 total = 0;
@@ -1704,14 +1729,96 @@ static u64 perf_counter_read_tree(struct perf_counter *counter)
 	return total;
 }
 
+static int perf_counter_read_entry(struct perf_counter *counter,
+				   u64 read_format, char __user *buf)
+{
+	int n = 0, count = 0;
+	u64 values[2];
+
+	values[n++] = perf_counter_read_value(counter);
+	if (read_format & PERF_FORMAT_ID)
+		values[n++] = primary_counter_id(counter);
+
+	count = n * sizeof(u64);
+
+	if (copy_to_user(buf, values, count))
+		return -EFAULT;
+
+	return count;
+}
+
+static int perf_counter_read_group(struct perf_counter *counter,
+				   u64 read_format, char __user *buf)
+{
+	struct perf_counter *leader = counter->group_leader, *sub;
+	int n = 0, size = 0, err = -EFAULT;
+	u64 values[3];
+
+	values[n++] = 1 + leader->nr_siblings;
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+		values[n++] = leader->total_time_enabled +
+			atomic64_read(&leader->child_total_time_enabled);
+	}
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+		values[n++] = leader->total_time_running +
+			atomic64_read(&leader->child_total_time_running);
+	}
+
+	size = n * sizeof(u64);
+
+	if (copy_to_user(buf, values, size))
+		return -EFAULT;
+
+	err = perf_counter_read_entry(leader, read_format, buf + size);
+	if (err < 0)
+		return err;
+
+	size += err;
+
+	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+		err = perf_counter_read_entry(counter, read_format,
+				buf + size);
+		if (err < 0)
+			return err;
+
+		size += err;
+	}
+
+	return size;
+}
+
+static int perf_counter_read_one(struct perf_counter *counter,
+				 u64 read_format, char __user *buf)
+{
+	u64 values[4];
+	int n = 0;
+
+	values[n++] = perf_counter_read_value(counter);
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+		values[n++] = counter->total_time_enabled +
+			atomic64_read(&counter->child_total_time_enabled);
+	}
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+		values[n++] = counter->total_time_running +
+			atomic64_read(&counter->child_total_time_running);
+	}
+	if (read_format & PERF_FORMAT_ID)
+		values[n++] = primary_counter_id(counter);
+
+	if (copy_to_user(buf, values, n * sizeof(u64)))
+		return -EFAULT;
+
+	return n * sizeof(u64);
+}
+
 /*
  * Read the performance counter - simple non blocking version for now
  */
 static ssize_t
 perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 {
-	u64 values[4];
-	int n;
+	u64 read_format = counter->attr.read_format;
+	int ret;
 
 	/*
 	 * Return end-of-file for a read on a counter that is in
@@ -1721,28 +1828,18 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 	if (counter->state == PERF_COUNTER_STATE_ERROR)
 		return 0;
 
+	if (count < perf_counter_read_size(counter))
+		return -ENOSPC;
+
 	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	mutex_lock(&counter->child_mutex);
-	values[0] = perf_counter_read_tree(counter);
-	n = 1;
-	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
-		values[n++] = counter->total_time_enabled +
-			atomic64_read(&counter->child_total_time_enabled);
-	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
-		values[n++] = counter->total_time_running +
-			atomic64_read(&counter->child_total_time_running);
-	if (counter->attr.read_format & PERF_FORMAT_ID)
-		values[n++] = primary_counter_id(counter);
+	if (read_format & PERF_FORMAT_GROUP)
+		ret = perf_counter_read_group(counter, read_format, buf);
+	else
+		ret = perf_counter_read_one(counter, read_format, buf);
 	mutex_unlock(&counter->child_mutex);
 
-	if (count < n * sizeof(u64))
-		return -EINVAL;
-	count = n * sizeof(u64);
-
-	if (copy_to_user(buf, values, count))
-		return -EFAULT;
-
-	return count;
+	return ret;
 }
 
 static ssize_t
@@ -2631,6 +2728,79 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
 	return task_pid_nr_ns(p, counter->ns);
 }
 
+static void perf_output_read_one(struct perf_output_handle *handle,
+				 struct perf_counter *counter)
+{
+	u64 read_format = counter->attr.read_format;
+	u64 values[4];
+	int n = 0;
+
+	values[n++] = atomic64_read(&counter->count);
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+		values[n++] = counter->total_time_enabled +
+			atomic64_read(&counter->child_total_time_enabled);
+	}
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+		values[n++] = counter->total_time_running +
+			atomic64_read(&counter->child_total_time_running);
+	}
+	if (read_format & PERF_FORMAT_ID)
+		values[n++] = primary_counter_id(counter);
+
+	perf_output_copy(handle, values, n * sizeof(u64));
+}
+
+/*
+ * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.
+ */
+static void perf_output_read_group(struct perf_output_handle *handle,
+			    struct perf_counter *counter)
+{
+	struct perf_counter *leader = counter->group_leader, *sub;
+	u64 read_format = counter->attr.read_format;
+	u64 values[5];
+	int n = 0;
+
+	values[n++] = 1 + leader->nr_siblings;
+
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		values[n++] = leader->total_time_enabled;
+
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		values[n++] = leader->total_time_running;
+
+	if (leader != counter)
+		leader->pmu->read(leader);
+
+	values[n++] = atomic64_read(&leader->count);
+	if (read_format & PERF_FORMAT_ID)
+		values[n++] = primary_counter_id(leader);
+
+	perf_output_copy(handle, values, n * sizeof(u64));
+
+	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+		n = 0;
+
+		if (sub != counter)
+			sub->pmu->read(sub);
+
+		values[n++] = atomic64_read(&sub->count);
+		if (read_format & PERF_FORMAT_ID)
+			values[n++] = primary_counter_id(sub);
+
+		perf_output_copy(handle, values, n * sizeof(u64));
+	}
+}
+
+static void perf_output_read(struct perf_output_handle *handle,
+			     struct perf_counter *counter)
+{
+	if (counter->attr.read_format & PERF_FORMAT_GROUP)
+		perf_output_read_group(handle, counter);
+	else
+		perf_output_read_one(handle, counter);
+}
+
 void perf_counter_output(struct perf_counter *counter, int nmi,
 				struct perf_sample_data *data)
 {
@@ -2642,10 +2812,6 @@ void perf_counter_output(struct perf_counter *counter, int nmi,
 	struct {
 		u32 pid, tid;
 	} tid_entry;
-	struct {
-		u64 id;
-		u64 counter;
-	} group_entry;
 	struct perf_callchain_entry *callchain = NULL;
 	int callchain_size = 0;
 	u64 time;
@@ -2700,10 +2866,8 @@ void perf_counter_output(struct perf_counter *counter, int nmi,
 	if (sample_type & PERF_SAMPLE_PERIOD)
 		header.size += sizeof(u64);
 
-	if (sample_type & PERF_SAMPLE_GROUP) {
-		header.size += sizeof(u64) +
-			counter->nr_siblings * sizeof(group_entry);
-	}
+	if (sample_type & PERF_SAMPLE_READ)
+		header.size += perf_counter_read_size(counter);
 
 	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
 		callchain = perf_callchain(data->regs);
@@ -2760,26 +2924,8 @@ void perf_counter_output(struct perf_counter *counter, int nmi,
 	if (sample_type & PERF_SAMPLE_PERIOD)
 		perf_output_put(&handle, data->period);
 
-	/*
-	 * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult.
-	 */
-	if (sample_type & PERF_SAMPLE_GROUP) {
-		struct perf_counter *leader, *sub;
-		u64 nr = counter->nr_siblings;
-
-		perf_output_put(&handle, nr);
-
-		leader = counter->group_leader;
-		list_for_each_entry(sub, &leader->sibling_list, list_entry) {
-			if (sub != counter)
-				sub->pmu->read(sub);
-
-			group_entry.id = primary_counter_id(sub);
-			group_entry.counter = atomic64_read(&sub->count);
-
-			perf_output_put(&handle, group_entry);
-		}
-	}
+	if (sample_type & PERF_SAMPLE_READ)
+		perf_output_read(&handle, counter);
 
 	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
 		if (callchain)
@@ -2818,8 +2964,6 @@ struct perf_read_event {
 
 	u32				pid;
 	u32				tid;
-	u64				value;
-	u64				format[3];
 };
 
 static void
@@ -2831,34 +2975,20 @@ perf_counter_read_event(struct perf_counter *counter,
 		.header = {
 			.type = PERF_EVENT_READ,
 			.misc = 0,
-			.size = sizeof(event) - sizeof(event.format),
+			.size = sizeof(event) + perf_counter_read_size(counter),
 		},
 		.pid = perf_counter_pid(counter, task),
 		.tid = perf_counter_tid(counter, task),
-		.value = atomic64_read(&counter->count),
 	};
-	int ret, i = 0;
-
-	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
-		event.header.size += sizeof(u64);
-		event.format[i++] = counter->total_time_enabled;
-	}
-
-	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
-		event.header.size += sizeof(u64);
-		event.format[i++] = counter->total_time_running;
-	}
-
-	if (counter->attr.read_format & PERF_FORMAT_ID) {
-		event.header.size += sizeof(u64);
-		event.format[i++] = primary_counter_id(counter);
-	}
+	int ret;
 
 	ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
 	if (ret)
 		return;
 
-	perf_output_copy(&handle, &event, event.header.size);
+	perf_output_put(&handle, event);
+	perf_output_read(&handle, counter);
+
 	perf_output_end(&handle);
 }
 
@@ -3921,9 +4051,9 @@ perf_counter_alloc(struct perf_counter_attr *attr,
 	atomic64_set(&hwc->period_left, hwc->sample_period);
 
 	/*
-	 * we currently do not support PERF_SAMPLE_GROUP on inherited counters
+	 * we currently do not support PERF_FORMAT_GROUP on inherited counters
 	 */
-	if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP))
+	if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
 		goto done;
 
 	switch (attr->type) {
-- 
cgit v1.2.3


From 970892a9031a5dc7217bd394fb9d89fa75a4a7bc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 13 Aug 2009 11:47:54 +0200
Subject: perf_counter: Fix an ipi-deadlock

perf_pending_counter() is called from IRQ context and will call
perf_counter_disable(), however perf_counter_disable() uses
smp_call_function_single() which doesn't fancy being used with
IRQs disabled due to IPI deadlocks.

Fix this by making it use the local __perf_counter_disable()
call and teaching the counter_sched_out() code about pending
disables as well.

This should cover the case where a counter migrates before the
pending queue gets processed.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey J Ashford <cjashfor@us.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
LKML-Reference: <20090813103655.244097721@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel/perf_counter.c')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index b8c6b97a20a3..3f841beefc7b 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -307,6 +307,10 @@ counter_sched_out(struct perf_counter *counter,
 		return;
 
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
+	if (counter->pending_disable) {
+		counter->pending_disable = 0;
+		counter->state = PERF_COUNTER_STATE_OFF;
+	}
 	counter->tstamp_stopped = ctx->time;
 	counter->pmu->disable(counter);
 	counter->oncpu = -1;
@@ -2343,7 +2347,7 @@ static void perf_pending_counter(struct perf_pending_entry *entry)
 
 	if (counter->pending_disable) {
 		counter->pending_disable = 0;
-		perf_counter_disable(counter);
+		__perf_counter_disable(counter);
 	}
 
 	if (counter->pending_wakeup) {
-- 
cgit v1.2.3


From 94d5d1b2d891f1fd5205f978246b7864d998b25c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 13 Aug 2009 16:14:42 +0200
Subject: perf_counter: Report the cloning task as parent on
 perf_counter_fork()

A bug in (9f498cc: perf_counter: Full task tracing) makes
profiling multi-threaded apps it go belly up.

[ output as: (PID:TID):(PPID:PTID) ]

 # ./perf report -D | grep FORK
0x4b0 [0x18]: PERF_EVENT_FORK: (3237:3237):(3236:3236)
0xa10 [0x18]: PERF_EVENT_FORK: (3237:3238):(3236:3236)
0xa70 [0x18]: PERF_EVENT_FORK: (3237:3239):(3236:3236)
0xad0 [0x18]: PERF_EVENT_FORK: (3237:3240):(3236:3236)
0xb18 [0x18]: PERF_EVENT_FORK: (3237:3241):(3236:3236)

Shows us that the test (27d028d perf report: Update for the new
FORK/EXIT events) in builtin-report.c:

        /*
         * A thread clone will have the same PID for both
         * parent and child.
         */
        if (thread == parent)
                return 0;

Will clearly fail.

The problem is that perf_counter_fork() reports the actual
parent, instead of the cloning thread.

Fixing that (with the below patch), yields:

 # ./perf report -D | grep FORK
0x4c8 [0x18]: PERF_EVENT_FORK: (1590:1590):(1589:1589)
0xbd8 [0x18]: PERF_EVENT_FORK: (1590:1591):(1590:1590)
0xc80 [0x18]: PERF_EVENT_FORK: (1590:1592):(1590:1590)
0x3338 [0x18]: PERF_EVENT_FORK: (1590:1593):(1590:1590)
0x66b0 [0x18]: PERF_EVENT_FORK: (1590:1594):(1590:1590)

Which both makes more sense and doesn't confuse perf report
anymore.

Reported-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: paulus@samba.org
Cc: Anton Blanchard <anton@samba.org>
Cc: Arjan van de Ven <arjan@infradead.org>
LKML-Reference: <1250172882.5241.62.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/perf_counter.c')

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 3f841beefc7b..534e20d14d63 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3028,10 +3028,10 @@ static void perf_counter_task_output(struct perf_counter *counter,
 		return;
 
 	task_event->event.pid = perf_counter_pid(counter, task);
-	task_event->event.ppid = perf_counter_pid(counter, task->real_parent);
+	task_event->event.ppid = perf_counter_pid(counter, current);
 
 	task_event->event.tid = perf_counter_tid(counter, task);
-	task_event->event.ptid = perf_counter_tid(counter, task->real_parent);
+	task_event->event.ptid = perf_counter_tid(counter, current);
 
 	perf_output_put(&handle, task_event->event);
 	perf_output_end(&handle);
-- 
cgit v1.2.3