From 4bdde044dc36ac7b01f7502394d52619af9d1927 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 24 Mar 2010 10:58:05 +0800
Subject: tracing: Convert some signal events to DEFINE_TRACE

Use DECLARE_EVENT_CLASS to remove duplicate code:

text    data     bss     dec     hex filename
  23639    6084       8   29731    7423 kernel/signal.o.orig
  22727    6084       8   28819    7093 kernel/signal.o

2 events are converted:

  signal_queue_overflow: signal_overflow_fail, signal_lose_info

No functional change.

Acked-by: Masami Hiramatsu <mhiramat@redhat.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4BA97FBD.8070703@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/events/signal.h | 52 ++++++++++++++++++-------------------------
 1 file changed, 22 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/signal.h b/include/trace/events/signal.h
index a510b75ac304..814566c99d29 100644
--- a/include/trace/events/signal.h
+++ b/include/trace/events/signal.h
@@ -100,18 +100,7 @@ TRACE_EVENT(signal_deliver,
 		  __entry->sa_handler, __entry->sa_flags)
 );
 
-/**
- * signal_overflow_fail - called when signal queue is overflow
- * @sig: signal number
- * @group: signal to process group or not (bool)
- * @info: pointer to struct siginfo
- *
- * Kernel fails to generate 'sig' signal with 'info' siginfo, because
- * siginfo queue is overflow, and the signal is dropped.
- * 'group' is not 0 if the signal will be sent to a process group.
- * 'sig' is always one of RT signals.
- */
-TRACE_EVENT(signal_overflow_fail,
+DECLARE_EVENT_CLASS(signal_queue_overflow,
 
 	TP_PROTO(int sig, int group, struct siginfo *info),
 
@@ -134,6 +123,24 @@ TRACE_EVENT(signal_overflow_fail,
 		  __entry->sig, __entry->group, __entry->errno, __entry->code)
 );
 
+/**
+ * signal_overflow_fail - called when signal queue is overflow
+ * @sig: signal number
+ * @group: signal to process group or not (bool)
+ * @info: pointer to struct siginfo
+ *
+ * Kernel fails to generate 'sig' signal with 'info' siginfo, because
+ * siginfo queue is overflow, and the signal is dropped.
+ * 'group' is not 0 if the signal will be sent to a process group.
+ * 'sig' is always one of RT signals.
+ */
+DEFINE_EVENT(signal_queue_overflow, signal_overflow_fail,
+
+	TP_PROTO(int sig, int group, struct siginfo *info),
+
+	TP_ARGS(sig, group, info)
+);
+
 /**
  * signal_lose_info - called when siginfo is lost
  * @sig: signal number
@@ -145,28 +152,13 @@ TRACE_EVENT(signal_overflow_fail,
  * 'group' is not 0 if the signal will be sent to a process group.
  * 'sig' is always one of non-RT signals.
  */
-TRACE_EVENT(signal_lose_info,
+DEFINE_EVENT(signal_queue_overflow, signal_lose_info,
 
 	TP_PROTO(int sig, int group, struct siginfo *info),
 
-	TP_ARGS(sig, group, info),
-
-	TP_STRUCT__entry(
-		__field(	int,	sig	)
-		__field(	int,	group	)
-		__field(	int,	errno	)
-		__field(	int,	code	)
-	),
-
-	TP_fast_assign(
-		__entry->sig	= sig;
-		__entry->group	= group;
-		TP_STORE_SIGINFO(__entry, info);
-	),
-
-	TP_printk("sig=%d group=%d errno=%d code=%d",
-		  __entry->sig, __entry->group, __entry->errno, __entry->code)
+	TP_ARGS(sig, group, info)
 );
+
 #endif /* _TRACE_SIGNAL_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From 50354a8a28d0c91695a2d6d25b5a821bfe557a07 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 24 Mar 2010 10:58:24 +0800
Subject: tracing: Update comments

Make some comments consistent with the code.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4BA97FD0.7090202@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/ftrace.h | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index ea6f9d4a20e9..75dd7787fb37 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -154,9 +154,11 @@
  *
  *	field = (typeof(field))entry;
  *
- *	p = get_cpu_var(ftrace_event_seq);
+ *	p = &get_cpu_var(ftrace_event_seq);
  *	trace_seq_init(p);
- *	ret = trace_seq_printf(s, <TP_printk> "\n");
+ *	ret = trace_seq_printf(s, "%s: ", <call>);
+ *	if (ret)
+ *		ret = trace_seq_printf(s, <TP_printk> "\n");
  *	put_cpu();
  *	if (!ret)
  *		return TRACE_TYPE_PARTIAL_LINE;
@@ -450,38 +452,38 @@ perf_trace_disable_##name(struct ftrace_event_call *unused)		\
  *
  * static void ftrace_raw_event_<call>(proto)
  * {
+ *	struct ftrace_data_offsets_<call> __maybe_unused __data_offsets;
  *	struct ring_buffer_event *event;
  *	struct ftrace_raw_<call> *entry; <-- defined in stage 1
  *	struct ring_buffer *buffer;
  *	unsigned long irq_flags;
+ *	int __data_size;
  *	int pc;
  *
  *	local_save_flags(irq_flags);
  *	pc = preempt_count();
  *
+ *	__data_size = ftrace_get_offsets_<call>(&__data_offsets, args);
+ *
  *	event = trace_current_buffer_lock_reserve(&buffer,
  *				  event_<call>.id,
- *				  sizeof(struct ftrace_raw_<call>),
+ *				  sizeof(*entry) + __data_size,
  *				  irq_flags, pc);
  *	if (!event)
  *		return;
  *	entry	= ring_buffer_event_data(event);
  *
- *	<assign>;  <-- Here we assign the entries by the __field and
- *			__array macros.
+ *	{ <assign>; }  <-- Here we assign the entries by the __field and
+ *			   __array macros.
  *
- *	trace_current_buffer_unlock_commit(buffer, event, irq_flags, pc);
+ *	if (!filter_current_check_discard(buffer, event_call, entry, event))
+ *		trace_current_buffer_unlock_commit(buffer,
+ *						   event, irq_flags, pc);
  * }
  *
  * static int ftrace_raw_reg_event_<call>(struct ftrace_event_call *unused)
  * {
- *	int ret;
- *
- *	ret = register_trace_<call>(ftrace_raw_event_<call>);
- *	if (!ret)
- *		pr_info("event trace: Could not activate trace point "
- *			"probe to <call>");
- *	return ret;
+ *	return register_trace_<call>(ftrace_raw_event_<call>);
  * }
  *
  * static void ftrace_unreg_event_<call>(struct ftrace_event_call *unused)
@@ -493,6 +495,8 @@ perf_trace_disable_##name(struct ftrace_event_call *unused)		\
  *	.trace			= ftrace_raw_output_<call>, <-- stage 2
  * };
  *
+ * static const char print_fmt_<call>[] = <TP_printk>;
+ *
  * static struct ftrace_event_call __used
  * __attribute__((__aligned__(4)))
  * __attribute__((section("_ftrace_events"))) event_<call> = {
@@ -501,6 +505,8 @@ perf_trace_disable_##name(struct ftrace_event_call *unused)		\
  *	.raw_init		= trace_event_raw_init,
  *	.regfunc		= ftrace_reg_event_<call>,
  *	.unregfunc		= ftrace_unreg_event_<call>,
+ *	.print_fmt		= print_fmt_<call>,
+ *	.define_fields		= ftrace_define_fields_<call>,
  * }
  *
  */
@@ -569,7 +575,6 @@ ftrace_raw_event_id_##call(struct ftrace_event_call *event_call,	\
 		return;							\
 	entry	= ring_buffer_event_data(event);			\
 									\
-									\
 	tstruct								\
 									\
 	{ assign; }							\
-- 
cgit v1.2.3


From ae832d1e03ac9bf09fb8a07fb37908ab40c7cd0e Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 24 Mar 2010 10:57:43 +0800
Subject: tracing: Remove side effect from module tracepoints that caused a GPF

Remove the @refcnt argument, because it has side-effects, and arguments with
side-effects are not skipped by the jump over disabled instrumentation and are
executed even when the tracepoint is disabled.

This was also causing a GPF as found by Randy Dunlap:

Subject: 2.6.33 GP fault only when built with tracing
LKML-Reference: <4BA2B69D.3000309@oracle.com>

Note, the current 2.6.34-rc has a fix for the actual cause of the GPF,
but this fixes one of its triggers.

Tested-by: Randy Dunlap <randy.dunlap@oracle.com>
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4BA97FA7.6040406@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/module.h        |  6 ++----
 include/trace/events/module.h | 14 +++++++-------
 2 files changed, 9 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/module.h b/include/linux/module.h
index 5e869ffd34aa..393ec39b580a 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -460,8 +460,7 @@ static inline void __module_get(struct module *module)
 	if (module) {
 		preempt_disable();
 		__this_cpu_inc(module->refptr->count);
-		trace_module_get(module, _THIS_IP_,
-				 __this_cpu_read(module->refptr->count));
+		trace_module_get(module, _THIS_IP_);
 		preempt_enable();
 	}
 }
@@ -475,8 +474,7 @@ static inline int try_module_get(struct module *module)
 
 		if (likely(module_is_live(module))) {
 			__this_cpu_inc(module->refptr->count);
-			trace_module_get(module, _THIS_IP_,
-				__this_cpu_read(module->refptr->count));
+			trace_module_get(module, _THIS_IP_);
 		}
 		else
 			ret = 0;
diff --git a/include/trace/events/module.h b/include/trace/events/module.h
index 4b0f48ba16a6..a585f8135bd9 100644
--- a/include/trace/events/module.h
+++ b/include/trace/events/module.h
@@ -53,9 +53,9 @@ TRACE_EVENT(module_free,
 
 DECLARE_EVENT_CLASS(module_refcnt,
 
-	TP_PROTO(struct module *mod, unsigned long ip, int refcnt),
+	TP_PROTO(struct module *mod, unsigned long ip),
 
-	TP_ARGS(mod, ip, refcnt),
+	TP_ARGS(mod, ip),
 
 	TP_STRUCT__entry(
 		__field(	unsigned long,	ip		)
@@ -65,7 +65,7 @@ DECLARE_EVENT_CLASS(module_refcnt,
 
 	TP_fast_assign(
 		__entry->ip	= ip;
-		__entry->refcnt	= refcnt;
+		__entry->refcnt	= __this_cpu_read(mod->refptr->count);
 		__assign_str(name, mod->name);
 	),
 
@@ -75,16 +75,16 @@ DECLARE_EVENT_CLASS(module_refcnt,
 
 DEFINE_EVENT(module_refcnt, module_get,
 
-	TP_PROTO(struct module *mod, unsigned long ip, int refcnt),
+	TP_PROTO(struct module *mod, unsigned long ip),
 
-	TP_ARGS(mod, ip, refcnt)
+	TP_ARGS(mod, ip)
 );
 
 DEFINE_EVENT(module_refcnt, module_put,
 
-	TP_PROTO(struct module *mod, unsigned long ip, int refcnt),
+	TP_PROTO(struct module *mod, unsigned long ip),
 
-	TP_ARGS(mod, ip, refcnt)
+	TP_ARGS(mod, ip)
 );
 
 TRACE_EVENT(module_request,
-- 
cgit v1.2.3


From eb0c53771fb2f5f66b0edb3ebce33be4bbf1c285 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 29 Mar 2010 14:25:18 -0400
Subject: tracing: Fix compile error in module tracepoints when MODULE_UNLOAD
 not set

If modules are configured in the build but unloading of modules is not,
then the refcnt is not defined. Place the get/put module tracepoints
under CONFIG_MODULE_UNLOAD since it references this field in the module
structure.

As a side-effect, this patch also reduces the code when MODULE_UNLOAD
is not set, because these unused tracepoints are not created.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/events/module.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/trace/events/module.h b/include/trace/events/module.h
index a585f8135bd9..f07b44a2b240 100644
--- a/include/trace/events/module.h
+++ b/include/trace/events/module.h
@@ -51,6 +51,9 @@ TRACE_EVENT(module_free,
 	TP_printk("%s", __get_str(name))
 );
 
+#ifdef CONFIG_MODULE_UNLOAD
+/* trace_module_get/put are only used if CONFIG_MODULE_UNLOAD is defined */
+
 DECLARE_EVENT_CLASS(module_refcnt,
 
 	TP_PROTO(struct module *mod, unsigned long ip),
@@ -86,6 +89,7 @@ DEFINE_EVENT(module_refcnt, module_put,
 
 	TP_ARGS(mod, ip)
 );
+#endif /* CONFIG_MODULE_UNLOAD */
 
 TRACE_EVENT(module_request,
 
-- 
cgit v1.2.3


From 66a8cb95ed04025664d1db4e952155ee1dccd048 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 31 Mar 2010 13:21:56 -0400
Subject: ring-buffer: Add place holder recording of dropped events

Currently, when the ring buffer drops events, it does not record
the fact that it did so. It does inform the writer that the event
was dropped by returning a NULL event, but it does not put in any
place holder where the event was dropped.

This is not a trivial thing to add because the ring buffer mostly
runs in overwrite (flight recorder) mode. That is, when the ring
buffer is full, new data will overwrite old data.

In a produce/consumer mode, where new data is simply dropped when
the ring buffer is full, it is trivial to add the placeholder
for dropped events. When there's more room to write new data, then
a special event can be added to notify the reader about the dropped
events.

But in overwrite mode, any new write can overwrite events. A place
holder can not be inserted into the ring buffer since there never
may be room. A reader could also come in at anytime and miss the
placeholder.

Luckily, the way the ring buffer works, the read side can find out
if events were lost or not, and how many events. Everytime a write
takes place, if it overwrites the header page (the next read) it
updates a "overrun" variable that keeps track of the number of
lost events. When a reader swaps out a page from the ring buffer,
it can record this number, perfom the swap, and then check to
see if the number changed, and take the diff if it has, which would be
the number of events dropped. This can be stored by the reader
and returned to callers of the reader.

Since the reader page swap will fail if the writer moved the head
page since the time the reader page set up the swap, this gives room
to record the overruns without worrying about races. If the reader
sets up the pages, records the overrun, than performs the swap,
if the swap succeeds, then the overrun variable has not been
updated since the setup before the swap.

For binary readers of the ring buffer, a flag is set in the header
of each sub page (sub buffer) of the ring buffer. This flag is embedded
in the size field of the data on the sub buffer, in the 31st bit (the size
can be 32 or 64 bits depending on the architecture), but only 27
bits needs to be used for the actual size (less actually).

We could add a new field in the sub buffer header to also record the
number of events dropped since the last read, but this will change the
format of the binary ring buffer a bit too much. Perhaps this change can
be made if the information on the number of events dropped is considered
important enough.

Note, the notification of dropped events is only used by consuming reads
or peeking at the ring buffer. Iterating over the ring buffer does not
keep this information because the necessary data is only available when
a page swap is made, and the iterator does not swap out pages.

Cc: Robert Richter <robert.richter@amd.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: "Luis Claudio R. Goncalves" <lclaudio@uudg.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 5fcc31ed5771..c8297761e414 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -120,9 +120,11 @@ int ring_buffer_write(struct ring_buffer *buffer,
 		      unsigned long length, void *data);
 
 struct ring_buffer_event *
-ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts);
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
+		 unsigned long *lost_events);
 struct ring_buffer_event *
-ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts);
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
+		    unsigned long *lost_events);
 
 struct ring_buffer_iter *
 ring_buffer_read_start(struct ring_buffer *buffer, int cpu);
-- 
cgit v1.2.3


From bc21b478425ac73f66a5ec0b375a5e0d12d609ce Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 31 Mar 2010 19:49:26 -0400
Subject: tracing: Show the lost events in the trace_pipe output

Now that the ring buffer can keep track of where events are lost.
Use this information to the output of trace_pipe:

       hackbench-3588  [001]  1326.701660: lock_acquire: ffffffff816591e0 read rcu_read_lock
       hackbench-3588  [001]  1326.701661: lock_acquire: ffff88003f4091f0 &(&dentry->d_lock)->rlock
       hackbench-3588  [001]  1326.701664: lock_release: ffff88003f4091f0 &(&dentry->d_lock)->rlock
CPU:1 [LOST 673 EVENTS]
       hackbench-3588  [001]  1326.702711: kmem_cache_free: call_site=ffffffff81102b85 ptr=ffff880026d96738
       hackbench-3588  [001]  1326.702712: lock_release: ffff88003e1480a8 &mm->mmap_sem
       hackbench-3588  [001]  1326.702713: lock_acquire: ffff88003e1480a8 &mm->mmap_sem

Even works with the function graph tracer:

 2) ! 170.098 us  |                                            }
 2)   4.036 us    |                                            rcu_irq_exit();
 2)   3.657 us    |                                            idle_cpu();
 2) ! 190.301 us  |                                          }
CPU:2 [LOST 2196 EVENTS]
 2)   0.853 us    |                            } /* cancel_dirty_page */
 2)               |                            remove_from_page_cache() {
 2)   1.578 us    |                              _raw_spin_lock_irq();
 2)               |                              __remove_from_page_cache() {

Note, it does not work with the iterator "trace" file, since it requires
the use of consuming the page from the ring buffer to determine how many
events were lost, which the iterator does not do.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace_event.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index c0f4b364c711..39e71b0a3bfd 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -58,6 +58,7 @@ struct trace_iterator {
 	/* The below is zeroed out in pipe_read */
 	struct trace_seq	seq;
 	struct trace_entry	*ent;
+	unsigned long		lost_events;
 	int			leftover;
 	int			cpu;
 	u64			ts;
-- 
cgit v1.2.3