From 2bf2160d8805de64308e2e7c3cd97813cb58ed2f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 23 Aug 2010 18:42:48 +0900 Subject: irq: Add tracepoint to softirq_raise Add a tracepoint for tracing when softirq action is raised. This and the existing tracepoints complete softirq's tracepoints: softirq_raise, softirq_entry and softirq_exit. And when this tracepoint is used in combination with the softirq_entry tracepoint we can determine the softirq raise latency. Signed-off-by: Lai Jiangshan Acked-by: Mathieu Desnoyers Acked-by: Neil Horman Cc: David Miller Cc: Kaneshige Kenji Cc: Izumo Taku Cc: Kosaki Motohiro Cc: Lai Jiangshan Cc: Scott Mcmillan Cc: Steven Rostedt Cc: Eric Dumazet LKML-Reference: <4C724298.4050509@jp.fujitsu.com> [ factorize softirq events with DECLARE_EVENT_CLASS ] Signed-off-by: Koki Sanagi Signed-off-by: Frederic Weisbecker --- include/trace/events/irq.h | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h index 0e4cfb694fe7..6fa7cbab7d93 100644 --- a/include/trace/events/irq.h +++ b/include/trace/events/irq.h @@ -5,7 +5,9 @@ #define _TRACE_IRQ_H #include -#include + +struct irqaction; +struct softirq_action; #define softirq_name(sirq) { sirq##_SOFTIRQ, #sirq } #define show_softirq_name(val) \ @@ -93,7 +95,10 @@ DECLARE_EVENT_CLASS(softirq, ), TP_fast_assign( - __entry->vec = (int)(h - vec); + if (vec) + __entry->vec = (int)(h - vec); + else + __entry->vec = (int)(long)h; ), TP_printk("vec=%d [action=%s]", __entry->vec, @@ -136,6 +141,23 @@ DEFINE_EVENT(softirq, softirq_exit, TP_ARGS(h, vec) ); +/** + * softirq_raise - called immediately when a softirq is raised + * @h: pointer to struct softirq_action + * @vec: pointer to first struct softirq_action in softirq_vec array + * + * The @h parameter contains a pointer to the softirq vector number which is + * raised. @vec is NULL and it means @h includes vector number not + * softirq_action. When used in combination with the softirq_entry tracepoint + * we can determine the softirq raise latency. + */ +DEFINE_EVENT(softirq, softirq_raise, + + TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + + TP_ARGS(h, vec) +); + #endif /* _TRACE_IRQ_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 3e4b10d7a4d2a78af64f8096dc7cdb3bebd65adb Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 23 Aug 2010 18:43:51 +0900 Subject: napi: Convert trace_napi_poll to TRACE_EVENT This patch converts trace_napi_poll from DECLARE_EVENT to TRACE_EVENT to improve the usability of napi_poll tracepoint. -0 [001] 241302.750777: napi_poll: napi poll on napi struct f6acc480 for device eth3 -0 [000] 241302.852389: napi_poll: napi poll on napi struct f5d0d70c for device eth1 The original patch is below: http://marc.info/?l=linux-kernel&m=126021713809450&w=2 [ sanagi.koki@jp.fujitsu.com: And add a fix by Steven Rostedt: http://marc.info/?l=linux-kernel&m=126150506519173&w=2 ] Signed-off-by: Neil Horman Acked-by: David S. Miller Acked-by: Neil Horman Cc: Mathieu Desnoyers Cc: Kaneshige Kenji Cc: Izumo Taku Cc: Kosaki Motohiro Cc: Lai Jiangshan Cc: Scott Mcmillan Cc: Steven Rostedt Cc: Eric Dumazet LKML-Reference: <4C7242D7.4050009@jp.fujitsu.com> Signed-off-by: Koki Sanagi Signed-off-by: Frederic Weisbecker --- include/trace/events/napi.h | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/napi.h b/include/trace/events/napi.h index 188deca2f3c7..8fe1e93f531d 100644 --- a/include/trace/events/napi.h +++ b/include/trace/events/napi.h @@ -6,10 +6,31 @@ #include #include +#include + +#define NO_DEV "(no_device)" + +TRACE_EVENT(napi_poll, -DECLARE_TRACE(napi_poll, TP_PROTO(struct napi_struct *napi), - TP_ARGS(napi)); + + TP_ARGS(napi), + + TP_STRUCT__entry( + __field( struct napi_struct *, napi) + __string( dev_name, napi->dev ? napi->dev->name : NO_DEV) + ), + + TP_fast_assign( + __entry->napi = napi; + __assign_str(dev_name, napi->dev ? napi->dev->name : NO_DEV); + ), + + TP_printk("napi poll on napi struct %p for device %s", + __entry->napi, __get_str(dev_name)) +); + +#undef NO_DEV #endif /* _TRACE_NAPI_H_ */ -- cgit v1.2.3 From cf66ba58b5cb8b1526e9dd2fb96ff8db048d4d44 Mon Sep 17 00:00:00 2001 From: Koki Sanagi Date: Mon, 23 Aug 2010 18:45:02 +0900 Subject: netdev: Add tracepoints to netdev layer This patch adds tracepoint to dev_queue_xmit, dev_hard_start_xmit, netif_rx and netif_receive_skb. These tracepoints help you to monitor network driver's input/output. -0 [001] 112447.902030: netif_rx: dev=eth1 skbaddr=f3ef0900 len=84 -0 [001] 112447.902039: netif_receive_skb: dev=eth1 skbaddr=f3ef0900 len=84 sshd-6828 [000] 112447.903257: net_dev_queue: dev=eth4 skbaddr=f3fca538 len=226 sshd-6828 [000] 112447.903260: net_dev_xmit: dev=eth4 skbaddr=f3fca538 len=226 rc=0 Signed-off-by: Koki Sanagi Acked-by: David S. Miller Acked-by: Neil Horman Cc: Mathieu Desnoyers Cc: Kaneshige Kenji Cc: Izumo Taku Cc: Kosaki Motohiro Cc: Lai Jiangshan Cc: Scott Mcmillan Cc: Steven Rostedt Cc: Eric Dumazet LKML-Reference: <4C72431E.3000901@jp.fujitsu.com> Signed-off-by: Frederic Weisbecker --- include/trace/events/net.h | 82 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 include/trace/events/net.h (limited to 'include/trace') diff --git a/include/trace/events/net.h b/include/trace/events/net.h new file mode 100644 index 000000000000..5f247f5ffc56 --- /dev/null +++ b/include/trace/events/net.h @@ -0,0 +1,82 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM net + +#if !defined(_TRACE_NET_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_NET_H + +#include +#include +#include +#include + +TRACE_EVENT(net_dev_xmit, + + TP_PROTO(struct sk_buff *skb, + int rc), + + TP_ARGS(skb, rc), + + TP_STRUCT__entry( + __field( void *, skbaddr ) + __field( unsigned int, len ) + __field( int, rc ) + __string( name, skb->dev->name ) + ), + + TP_fast_assign( + __entry->skbaddr = skb; + __entry->len = skb->len; + __entry->rc = rc; + __assign_str(name, skb->dev->name); + ), + + TP_printk("dev=%s skbaddr=%p len=%u rc=%d", + __get_str(name), __entry->skbaddr, __entry->len, __entry->rc) +); + +DECLARE_EVENT_CLASS(net_dev_template, + + TP_PROTO(struct sk_buff *skb), + + TP_ARGS(skb), + + TP_STRUCT__entry( + __field( void *, skbaddr ) + __field( unsigned int, len ) + __string( name, skb->dev->name ) + ), + + TP_fast_assign( + __entry->skbaddr = skb; + __entry->len = skb->len; + __assign_str(name, skb->dev->name); + ), + + TP_printk("dev=%s skbaddr=%p len=%u", + __get_str(name), __entry->skbaddr, __entry->len) +) + +DEFINE_EVENT(net_dev_template, net_dev_queue, + + TP_PROTO(struct sk_buff *skb), + + TP_ARGS(skb) +); + +DEFINE_EVENT(net_dev_template, netif_receive_skb, + + TP_PROTO(struct sk_buff *skb), + + TP_ARGS(skb) +); + +DEFINE_EVENT(net_dev_template, netif_rx, + + TP_PROTO(struct sk_buff *skb), + + TP_ARGS(skb) +); +#endif /* _TRACE_NET_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3 From 07dc22e7295f25526f110d704655ff0ea7687420 Mon Sep 17 00:00:00 2001 From: Koki Sanagi Date: Mon, 23 Aug 2010 18:46:12 +0900 Subject: skb: Add tracepoints to freeing skb This patch adds tracepoint to consume_skb and add trace_kfree_skb before __kfree_skb in skb_free_datagram_locked and net_tx_action. Combinating with tracepoint on dev_hard_start_xmit, we can check how long it takes to free transmitted packets. And using it, we can calculate how many packets driver had at that time. It is useful when a drop of transmitted packet is a problem. sshd-6828 [000] 112689.258154: consume_skb: skbaddr=f2d99bb8 Signed-off-by: Koki Sanagi Acked-by: David S. Miller Acked-by: Neil Horman Cc: Mathieu Desnoyers Cc: Kaneshige Kenji Cc: Izumo Taku Cc: Kosaki Motohiro Cc: Lai Jiangshan Cc: Scott Mcmillan Cc: Steven Rostedt Cc: Eric Dumazet LKML-Reference: <4C724364.50903@jp.fujitsu.com> Signed-off-by: Frederic Weisbecker --- include/trace/events/skb.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 4b2be6dc76f0..75ce9d500d8e 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -35,6 +35,23 @@ TRACE_EVENT(kfree_skb, __entry->skbaddr, __entry->protocol, __entry->location) ); +TRACE_EVENT(consume_skb, + + TP_PROTO(struct sk_buff *skb), + + TP_ARGS(skb), + + TP_STRUCT__entry( + __field( void *, skbaddr ) + ), + + TP_fast_assign( + __entry->skbaddr = skb; + ), + + TP_printk("skbaddr=%p", __entry->skbaddr) +); + TRACE_EVENT(skb_copy_datagram_iovec, TP_PROTO(const struct sk_buff *skb, int len), -- cgit v1.2.3 From 74704ac6ea402d50c9543cb28247e6d9f521f7ae Mon Sep 17 00:00:00 2001 From: Jean Pihet Date: Tue, 7 Sep 2010 09:21:32 +0200 Subject: tracing, perf: Add more power related events This patch adds new generic events for dynamic power management tracing: - clock events class: used for clock enable/disable and for clock rate change, - power_domain events class: used for power domains transitions. The OMAP architecture will be using the new events for PM debugging, however the new events are made generic enough to be used by all platforms. Signed-off-by: Jean Pihet Acked-by: Thomas Renninger Cc: discuss@lesswatts.org Cc: linux-pm@lists.linux-foundation.org Cc: Thomas Renninger Cc: Arjan van de Ven Cc: Peter Zijlstra Cc: Len Brown Cc: Kevin Hilman LKML-Reference: Signed-off-by: Ingo Molnar --- include/trace/events/power.h | 90 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 87 insertions(+), 3 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 35a2a6e7bf1e..286784d69b8f 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -10,12 +10,17 @@ #ifndef _TRACE_POWER_ENUM_ #define _TRACE_POWER_ENUM_ enum { - POWER_NONE = 0, - POWER_CSTATE = 1, - POWER_PSTATE = 2, + POWER_NONE = 0, + POWER_CSTATE = 1, /* C-State */ + POWER_PSTATE = 2, /* Fequency change or DVFS */ + POWER_SSTATE = 3, /* Suspend */ }; #endif +/* + * The power events are used for cpuidle & suspend (power_start, power_end) + * and for cpufreq (power_frequency) + */ DECLARE_EVENT_CLASS(power, TP_PROTO(unsigned int type, unsigned int state, unsigned int cpu_id), @@ -70,6 +75,85 @@ TRACE_EVENT(power_end, ); +/* + * The clock events are used for clock enable/disable and for + * clock rate change + */ +DECLARE_EVENT_CLASS(clock, + + TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id), + + TP_ARGS(name, state, cpu_id), + + TP_STRUCT__entry( + __string( name, name ) + __field( u64, state ) + __field( u64, cpu_id ) + ), + + TP_fast_assign( + __assign_str(name, name); + __entry->state = state; + __entry->cpu_id = cpu_id; + ), + + TP_printk("%s state=%lu cpu_id=%lu", __get_str(name), + (unsigned long)__entry->state, (unsigned long)__entry->cpu_id) +); + +DEFINE_EVENT(clock, clock_enable, + + TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id), + + TP_ARGS(name, state, cpu_id) +); + +DEFINE_EVENT(clock, clock_disable, + + TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id), + + TP_ARGS(name, state, cpu_id) +); + +DEFINE_EVENT(clock, clock_set_rate, + + TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id), + + TP_ARGS(name, state, cpu_id) +); + +/* + * The power domain events are used for power domains transitions + */ +DECLARE_EVENT_CLASS(power_domain, + + TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id), + + TP_ARGS(name, state, cpu_id), + + TP_STRUCT__entry( + __string( name, name ) + __field( u64, state ) + __field( u64, cpu_id ) + ), + + TP_fast_assign( + __assign_str(name, name); + __entry->state = state; + __entry->cpu_id = cpu_id; +), + + TP_printk("%s state=%lu cpu_id=%lu", __get_str(name), + (unsigned long)__entry->state, (unsigned long)__entry->cpu_id) +); + +DEFINE_EVENT(power_domain, power_domain_target, + + TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id), + + TP_ARGS(name, state, cpu_id) +); + #endif /* _TRACE_POWER_H */ /* This part must be outside protection */ -- cgit v1.2.3 From a8027073eb127cd207070891374b5c54c2ce3d23 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Sep 2010 15:13:34 -0400 Subject: tracing/sched: Add sched_pi_setprio tracepoint Add a tracepoint that shows the priority of a task being boosted via priority inheritance. Cc: Gregory Haskins Acked-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- include/trace/events/sched.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 9208c92aeab5..f6334782a593 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -362,6 +362,35 @@ TRACE_EVENT(sched_stat_runtime, (unsigned long long)__entry->vruntime) ); +/* + * Tracepoint for showing priority inheritance modifying a tasks + * priority. + */ +TRACE_EVENT(sched_pi_setprio, + + TP_PROTO(struct task_struct *tsk, int newprio), + + TP_ARGS(tsk, newprio), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, oldprio ) + __field( int, newprio ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->oldprio = tsk->prio; + __entry->newprio = newprio; + ), + + TP_printk("comm=%s pid=%d oldprio=%d newprio=%d", + __entry->comm, __entry->pid, + __entry->oldprio, __entry->newprio) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 97bd234701b2b39a0e749c1fe0e44f1d14c94292 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Oct 2010 10:41:14 +0200 Subject: workqueue: prepare for more tracepoints Define workqueue_work event class and use it for workqueue_execute_end trace point. Also, move trace/events/workqueue.h include downwards such that all struct definitions are visible to it. This is to prepare for more tracepoints and doesn't cause any functional change. Signed-off-by: Tejun Heo Cc: Frederic Weisbecker --- include/trace/events/workqueue.h | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/workqueue.h b/include/trace/events/workqueue.h index 49682d7e9d60..ec9d7244eb9f 100644 --- a/include/trace/events/workqueue.h +++ b/include/trace/events/workqueue.h @@ -7,13 +7,7 @@ #include #include -/** - * workqueue_execute_start - called immediately before the workqueue callback - * @work: pointer to struct work_struct - * - * Allows to track workqueue execution. - */ -TRACE_EVENT(workqueue_execute_start, +DECLARE_EVENT_CLASS(workqueue_work, TP_PROTO(struct work_struct *work), @@ -21,24 +15,22 @@ TRACE_EVENT(workqueue_execute_start, TP_STRUCT__entry( __field( void *, work ) - __field( void *, function) ), TP_fast_assign( __entry->work = work; - __entry->function = work->func; ), - TP_printk("work struct %p: function %pf", __entry->work, __entry->function) + TP_printk("work struct %p", __entry->work) ); /** - * workqueue_execute_end - called immediately before the workqueue callback + * workqueue_execute_start - called immediately before the workqueue callback * @work: pointer to struct work_struct * * Allows to track workqueue execution. */ -TRACE_EVENT(workqueue_execute_end, +TRACE_EVENT(workqueue_execute_start, TP_PROTO(struct work_struct *work), @@ -46,15 +38,29 @@ TRACE_EVENT(workqueue_execute_end, TP_STRUCT__entry( __field( void *, work ) + __field( void *, function) ), TP_fast_assign( __entry->work = work; + __entry->function = work->func; ), - TP_printk("work struct %p", __entry->work) + TP_printk("work struct %p: function %pf", __entry->work, __entry->function) ); +/** + * workqueue_execute_end - called immediately before the workqueue callback + * @work: pointer to struct work_struct + * + * Allows to track workqueue execution. + */ +DEFINE_EVENT(workqueue_work, workqueue_execute_end, + + TP_PROTO(struct work_struct *work), + + TP_ARGS(work) +); #endif /* _TRACE_WORKQUEUE_H */ -- cgit v1.2.3 From cdadf0097cdca06c497ffaeb5982e028c6e4ed38 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Oct 2010 10:49:55 +0200 Subject: workqueue: add queue_work and activate_work trace points These two tracepoints allow tracking when and how a work is queued and activated. This patch is based on Frederic's patch to add queue_work trace point. Signed-off-by: Tejun Heo Cc: Frederic Weisbecker --- include/trace/events/workqueue.h | 53 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/workqueue.h b/include/trace/events/workqueue.h index ec9d7244eb9f..7d497291c85d 100644 --- a/include/trace/events/workqueue.h +++ b/include/trace/events/workqueue.h @@ -24,6 +24,59 @@ DECLARE_EVENT_CLASS(workqueue_work, TP_printk("work struct %p", __entry->work) ); +/** + * workqueue_queue_work - called when a work gets queued + * @req_cpu: the requested cpu + * @cwq: pointer to struct cpu_workqueue_struct + * @work: pointer to struct work_struct + * + * This event occurs when a work is queued immediately or once a + * delayed work is actually queued on a workqueue (ie: once the delay + * has been reached). + */ +TRACE_EVENT(workqueue_queue_work, + + TP_PROTO(unsigned int req_cpu, struct cpu_workqueue_struct *cwq, + struct work_struct *work), + + TP_ARGS(req_cpu, cwq, work), + + TP_STRUCT__entry( + __field( void *, work ) + __field( void *, function) + __field( void *, workqueue) + __field( unsigned int, req_cpu ) + __field( unsigned int, cpu ) + ), + + TP_fast_assign( + __entry->work = work; + __entry->function = work->func; + __entry->workqueue = cwq->wq; + __entry->req_cpu = req_cpu; + __entry->cpu = cwq->gcwq->cpu; + ), + + TP_printk("work struct=%p function=%pf workqueue=%p req_cpu=%u cpu=%u", + __entry->work, __entry->function, __entry->workqueue, + __entry->req_cpu, __entry->cpu) +); + +/** + * workqueue_activate_work - called when a work gets activated + * @work: pointer to struct work_struct + * + * This event occurs when a queued work is put on the active queue, + * which happens immediately after queueing unless @max_active limit + * is reached. + */ +DEFINE_EVENT(workqueue_work, workqueue_activate_work, + + TP_PROTO(struct work_struct *work), + + TP_ARGS(work) +); + /** * workqueue_execute_start - called immediately before the workqueue callback * @work: pointer to struct work_struct -- cgit v1.2.3 From f4bc6bb2d562703eafc895c37e7be20906de139d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 19 Oct 2010 15:00:13 +0200 Subject: tracing: Cleanup the convoluted softirq tracepoints With the addition of trace_softirq_raise() the softirq tracepoint got even more convoluted. Why the tracepoints take two pointers to assign an integer is beyond my comprehension. But adding an extra case which treats the first pointer as an unsigned long when the second pointer is NULL including the back and forth type casting is just horrible. Convert the softirq tracepoints to take a single unsigned int argument for the softirq vector number and fix the call sites. Signed-off-by: Thomas Gleixner LKML-Reference: Acked-by: Peter Zijlstra Acked-by: mathieu.desnoyers@efficios.com Cc: Frederic Weisbecker Cc: Steven Rostedt --- include/trace/events/irq.h | 54 +++++++++++++++++----------------------------- 1 file changed, 20 insertions(+), 34 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h index 6fa7cbab7d93..1c09820df585 100644 --- a/include/trace/events/irq.h +++ b/include/trace/events/irq.h @@ -86,76 +86,62 @@ TRACE_EVENT(irq_handler_exit, DECLARE_EVENT_CLASS(softirq, - TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + TP_PROTO(unsigned int vec_nr), - TP_ARGS(h, vec), + TP_ARGS(vec_nr), TP_STRUCT__entry( - __field( int, vec ) + __field( unsigned int, vec ) ), TP_fast_assign( - if (vec) - __entry->vec = (int)(h - vec); - else - __entry->vec = (int)(long)h; + __entry->vec = vec_nr; ), - TP_printk("vec=%d [action=%s]", __entry->vec, + TP_printk("vec=%u [action=%s]", __entry->vec, show_softirq_name(__entry->vec)) ); /** * softirq_entry - called immediately before the softirq handler - * @h: pointer to struct softirq_action - * @vec: pointer to first struct softirq_action in softirq_vec array + * @vec_nr: softirq vector number * - * The @h parameter, contains a pointer to the struct softirq_action - * which has a pointer to the action handler that is called. By subtracting - * the @vec pointer from the @h pointer, we can determine the softirq - * number. Also, when used in combination with the softirq_exit tracepoint - * we can determine the softirq latency. + * When used in combination with the softirq_exit tracepoint + * we can determine the softirq handler runtine. */ DEFINE_EVENT(softirq, softirq_entry, - TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + TP_PROTO(unsigned int vec_nr), - TP_ARGS(h, vec) + TP_ARGS(vec_nr) ); /** * softirq_exit - called immediately after the softirq handler returns - * @h: pointer to struct softirq_action - * @vec: pointer to first struct softirq_action in softirq_vec array + * @vec_nr: softirq vector number * - * The @h parameter contains a pointer to the struct softirq_action - * that has handled the softirq. By subtracting the @vec pointer from - * the @h pointer, we can determine the softirq number. Also, when used in - * combination with the softirq_entry tracepoint we can determine the softirq - * latency. + * When used in combination with the softirq_entry tracepoint + * we can determine the softirq handler runtine. */ DEFINE_EVENT(softirq, softirq_exit, - TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + TP_PROTO(unsigned int vec_nr), - TP_ARGS(h, vec) + TP_ARGS(vec_nr) ); /** * softirq_raise - called immediately when a softirq is raised - * @h: pointer to struct softirq_action - * @vec: pointer to first struct softirq_action in softirq_vec array + * @vec_nr: softirq vector number * - * The @h parameter contains a pointer to the softirq vector number which is - * raised. @vec is NULL and it means @h includes vector number not - * softirq_action. When used in combination with the softirq_entry tracepoint - * we can determine the softirq raise latency. + * When used in combination with the softirq_entry tracepoint + * we can determine the softirq raise to run latency. */ DEFINE_EVENT(softirq, softirq_raise, - TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + TP_PROTO(unsigned int vec_nr), - TP_ARGS(h, vec) + TP_ARGS(vec_nr) ); #endif /* _TRACE_IRQ_H */ -- cgit v1.2.3 From 1b430beee5e388605dfb092b214ef0320f752cf6 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 26 Oct 2010 14:21:26 -0700 Subject: writeback: remove nonblocking/encountered_congestion references This removes more dead code that was somehow missed by commit 0d99519efef (writeback: remove unused nonblocking and congestion checks). There are no behavior change except for the removal of two entries from one of the ext4 tracing interface. The nonblocking checks in ->writepages are no longer used because the flusher now prefer to block on get_request_wait() than to skip inodes on IO congestion. The latter will lead to more seeky IO. The nonblocking checks in ->writepage are no longer used because it's redundant with the WB_SYNC_NONE check. We no long set ->nonblocking in VM page out and page migration, because a) it's effectively redundant with WB_SYNC_NONE in current code b) it's old semantic of "Don't get stuck on request queues" is mis-behavior: that would skip some dirty inodes on congestion and page out others, which is unfair in terms of LRU age. Inspired by Christoph Hellwig. Thanks! Signed-off-by: Wu Fengguang Cc: Theodore Ts'o Cc: David Howells Cc: Sage Weil Cc: Steve French Cc: Chris Mason Cc: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/ext4.h | 8 +++++--- include/trace/events/writeback.h | 2 -- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 01e9e0076a92..6bcb00645de4 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -242,18 +242,20 @@ TRACE_EVENT(ext4_da_writepages, __entry->pages_skipped = wbc->pages_skipped; __entry->range_start = wbc->range_start; __entry->range_end = wbc->range_end; - __entry->nonblocking = wbc->nonblocking; __entry->for_kupdate = wbc->for_kupdate; __entry->for_reclaim = wbc->for_reclaim; __entry->range_cyclic = wbc->range_cyclic; __entry->writeback_index = inode->i_mapping->writeback_index; ), - TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d writeback_index %lu", + TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld " + "range_start %llu range_end %llu " + "for_kupdate %d for_reclaim %d " + "range_cyclic %d writeback_index %lu", jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, __entry->nr_to_write, __entry->pages_skipped, __entry->range_start, - __entry->range_end, __entry->nonblocking, + __entry->range_end, __entry->for_kupdate, __entry->for_reclaim, __entry->range_cyclic, (unsigned long) __entry->writeback_index) diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index f345f66ae9d1..0bb01ab2e984 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -96,8 +96,6 @@ DECLARE_EVENT_CLASS(wbc_class, __field(long, nr_to_write) __field(long, pages_skipped) __field(int, sync_mode) - __field(int, nonblocking) - __field(int, encountered_congestion) __field(int, for_kupdate) __field(int, for_background) __field(int, for_reclaim) -- cgit v1.2.3 From e11da5b4fdf01d71d73c21cb92b00595b917d7fd Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 26 Oct 2010 14:21:40 -0700 Subject: tracing, vmscan: add trace events for LRU list shrinking There have been numerous reports of stalls that pointed at the problem being somewhere in the VM. There are multiple roots to the problems which means dealing with any of the root problems in isolation is tricky to justify on their own and they would still need integration testing. This patch series puts together two different patch sets which in combination should tackle some of the root causes of latency problems being reported. Patch 1 adds a tracepoint for shrink_inactive_list. For this series, the most important results is being able to calculate the scanning/reclaim ratio as a measure of the amount of work being done by page reclaim. Patch 2 accounts for time spent in congestion_wait. Patches 3-6 were originally developed by Kosaki Motohiro but reworked for this series. It has been noted that lumpy reclaim is far too aggressive and trashes the system somewhat. As SLUB uses high-order allocations, a large cost incurred by lumpy reclaim will be noticeable. It was also reported during transparent hugepage support testing that lumpy reclaim was trashing the system and these patches should mitigate that problem without disabling lumpy reclaim. Patch 7 adds wait_iff_congested() and replaces some callers of congestion_wait(). wait_iff_congested() only sleeps if there is a BDI that is currently congested. Patch 8 notes that any BDI being congested is not necessarily a problem because there could be multiple BDIs of varying speeds and numberous zones. It attempts to track when a zone being reclaimed contains many pages backed by a congested BDI and if so, reclaimers wait on the congestion queue. I ran a number of tests with monitoring on X86, X86-64 and PPC64. Each machine had 3G of RAM and the CPUs were X86: Intel P4 2-core X86-64: AMD Phenom 4-core PPC64: PPC970MP Each used a single disk and the onboard IO controller. Dirty ratio was left at 20. I'm just going to report for X86-64 and PPC64 in a vague attempt to keep this report short. Four kernels were tested each based on v2.6.36-rc4 traceonly-v2r2: Patches 1 and 2 to instrument vmscan reclaims and congestion_wait lowlumpy-v2r3: Patches 1-6 to test if lumpy reclaim is better waitcongest-v2r3: Patches 1-7 to only wait on congestion waitwriteback-v2r4: Patches 1-8 to detect when a zone is congested nocongest-v1r5: Patches 1-3 for testing wait_iff_congestion nodirect-v1r5: Patches 1-10 to disable filesystem writeback for better IO The tests run were as follows kernbench compile-based benchmark. Smoke test performance sysbench OLTP read-only benchmark. Will be re-run in the future as read-write micro-mapped-file-stream This is a micro-benchmark from Johannes Weiner that accesses a large sparse-file through mmap(). It was configured to run in only single-CPU mode but can be indicative of how well page reclaim identifies suitable pages. stress-highalloc Tries to allocate huge pages under heavy load. kernbench, iozone and sysbench did not report any performance regression on any machine. sysbench did pressure the system lightly and there was reclaim activity but there were no difference of major interest between the kernels. X86-64 micro-mapped-file-stream traceonly-v2r2 lowlumpy-v2r3 waitcongest-v2r3 waitwriteback-v2r4 pgalloc_dma 1639.00 ( 0.00%) 667.00 (-145.73%) 1167.00 ( -40.45%) 578.00 (-183.56%) pgalloc_dma32 2842410.00 ( 0.00%) 2842626.00 ( 0.01%) 2843043.00 ( 0.02%) 2843014.00 ( 0.02%) pgalloc_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pgsteal_dma 729.00 ( 0.00%) 85.00 (-757.65%) 609.00 ( -19.70%) 125.00 (-483.20%) pgsteal_dma32 2338721.00 ( 0.00%) 2447354.00 ( 4.44%) 2429536.00 ( 3.74%) 2436772.00 ( 4.02%) pgsteal_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pgscan_kswapd_dma 1469.00 ( 0.00%) 532.00 (-176.13%) 1078.00 ( -36.27%) 220.00 (-567.73%) pgscan_kswapd_dma32 4597713.00 ( 0.00%) 4503597.00 ( -2.09%) 4295673.00 ( -7.03%) 3891686.00 ( -18.14%) pgscan_kswapd_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pgscan_direct_dma 71.00 ( 0.00%) 134.00 ( 47.01%) 243.00 ( 70.78%) 352.00 ( 79.83%) pgscan_direct_dma32 305820.00 ( 0.00%) 280204.00 ( -9.14%) 600518.00 ( 49.07%) 957485.00 ( 68.06%) pgscan_direct_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pageoutrun 16296.00 ( 0.00%) 21254.00 ( 23.33%) 18447.00 ( 11.66%) 20067.00 ( 18.79%) allocstall 443.00 ( 0.00%) 273.00 ( -62.27%) 513.00 ( 13.65%) 1568.00 ( 71.75%) These are based on the raw figures taken from /proc/vmstat. It's a rough measure of reclaim activity. Note that allocstall counts are higher because we are entering direct reclaim more often as a result of not sleeping in congestion. In itself, it's not necessarily a bad thing. It's easier to get a view of what happened from the vmscan tracepoint report. FTrace Reclaim Statistics: vmscan traceonly-v2r2 lowlumpy-v2r3 waitcongest-v2r3 waitwriteback-v2r4 Direct reclaims 443 273 513 1568 Direct reclaim pages scanned 305968 280402 600825 957933 Direct reclaim pages reclaimed 43503 19005 30327 117191 Direct reclaim write file async I/O 0 0 0 0 Direct reclaim write anon async I/O 0 3 4 12 Direct reclaim write file sync I/O 0 0 0 0 Direct reclaim write anon sync I/O 0 0 0 0 Wake kswapd requests 187649 132338 191695 267701 Kswapd wakeups 3 1 4 1 Kswapd pages scanned 4599269 4454162 4296815 3891906 Kswapd pages reclaimed 2295947 2428434 2399818 2319706 Kswapd reclaim write file async I/O 1 0 1 1 Kswapd reclaim write anon async I/O 59 187 41 222 Kswapd reclaim write file sync I/O 0 0 0 0 Kswapd reclaim write anon sync I/O 0 0 0 0 Time stalled direct reclaim (seconds) 4.34 2.52 6.63 2.96 Time kswapd awake (seconds) 11.15 10.25 11.01 10.19 Total pages scanned 4905237 4734564 4897640 4849839 Total pages reclaimed 2339450 2447439 2430145 2436897 %age total pages scanned/reclaimed 47.69% 51.69% 49.62% 50.25% %age total pages scanned/written 0.00% 0.00% 0.00% 0.00% %age file pages scanned/written 0.00% 0.00% 0.00% 0.00% Percentage Time Spent Direct Reclaim 29.23% 19.02% 38.48% 20.25% Percentage Time kswapd Awake 78.58% 78.85% 76.83% 79.86% What is interesting here for nocongest in particular is that while direct reclaim scans more pages, the overall number of pages scanned remains the same and the ratio of pages scanned to pages reclaimed is more or less the same. In other words, while we are sleeping less, reclaim is not doing more work and as direct reclaim and kswapd is awake for less time, it would appear to be doing less work. FTrace Reclaim Statistics: congestion_wait Direct number congest waited 87 196 64 0 Direct time congest waited 4604ms 4732ms 5420ms 0ms Direct full congest waited 72 145 53 0 Direct number conditional waited 0 0 324 1315 Direct time conditional waited 0ms 0ms 0ms 0ms Direct full conditional waited 0 0 0 0 KSwapd number congest waited 20 10 15 7 KSwapd time congest waited 1264ms 536ms 884ms 284ms KSwapd full congest waited 10 4 6 2 KSwapd number conditional waited 0 0 0 0 KSwapd time conditional waited 0ms 0ms 0ms 0ms KSwapd full conditional waited 0 0 0 0 The vanilla kernel spent 8 seconds asleep in direct reclaim and no time at all asleep with the patches. MMTests Statistics: duration User/Sys Time Running Test (seconds) 10.51 10.73 10.6 11.66 Total Elapsed Time (seconds) 14.19 13.00 14.33 12.76 Overall, the tests completed faster. It is interesting to note that backing off further when a zone is congested and not just a BDI was more efficient overall. PPC64 micro-mapped-file-stream pgalloc_dma 3024660.00 ( 0.00%) 3027185.00 ( 0.08%) 3025845.00 ( 0.04%) 3026281.00 ( 0.05%) pgalloc_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pgsteal_dma 2508073.00 ( 0.00%) 2565351.00 ( 2.23%) 2463577.00 ( -1.81%) 2532263.00 ( 0.96%) pgsteal_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pgscan_kswapd_dma 4601307.00 ( 0.00%) 4128076.00 ( -11.46%) 3912317.00 ( -17.61%) 3377165.00 ( -36.25%) pgscan_kswapd_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pgscan_direct_dma 629825.00 ( 0.00%) 971622.00 ( 35.18%) 1063938.00 ( 40.80%) 1711935.00 ( 63.21%) pgscan_direct_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pageoutrun 27776.00 ( 0.00%) 20458.00 ( -35.77%) 18763.00 ( -48.04%) 18157.00 ( -52.98%) allocstall 977.00 ( 0.00%) 2751.00 ( 64.49%) 2098.00 ( 53.43%) 5136.00 ( 80.98%) Similar trends to x86-64. allocstalls are up but it's not necessarily bad. FTrace Reclaim Statistics: vmscan Direct reclaims 977 2709 2098 5136 Direct reclaim pages scanned 629825 963814 1063938 1711935 Direct reclaim pages reclaimed 75550 242538 150904 387647 Direct reclaim write file async I/O 0 0 0 2 Direct reclaim write anon async I/O 0 10 0 4 Direct reclaim write file sync I/O 0 0 0 0 Direct reclaim write anon sync I/O 0 0 0 0 Wake kswapd requests 392119 1201712 571935 571921 Kswapd wakeups 3 2 3 3 Kswapd pages scanned 4601307 4128076 3912317 3377165 Kswapd pages reclaimed 2432523 2318797 2312673 2144616 Kswapd reclaim write file async I/O 20 1 1 1 Kswapd reclaim write anon async I/O 57 132 11 121 Kswapd reclaim write file sync I/O 0 0 0 0 Kswapd reclaim write anon sync I/O 0 0 0 0 Time stalled direct reclaim (seconds) 6.19 7.30 13.04 10.88 Time kswapd awake (seconds) 21.73 26.51 25.55 23.90 Total pages scanned 5231132 5091890 4976255 5089100 Total pages reclaimed 2508073 2561335 2463577 2532263 %age total pages scanned/reclaimed 47.95% 50.30% 49.51% 49.76% %age total pages scanned/written 0.00% 0.00% 0.00% 0.00% %age file pages scanned/written 0.00% 0.00% 0.00% 0.00% Percentage Time Spent Direct Reclaim 18.89% 20.65% 32.65% 27.65% Percentage Time kswapd Awake 72.39% 80.68% 78.21% 77.40% Again, a similar trend that the congestion_wait changes mean that direct reclaim scans more pages but the overall number of pages scanned while slightly reduced, are very similar. The ratio of scanning/reclaimed remains roughly similar. The downside is that kswapd and direct reclaim was awake longer and for a larger percentage of the overall workload. It's possible there were big differences in the amount of time spent reclaiming slab pages between the different kernels which is plausible considering that the micro tests runs after fsmark and sysbench. Trace Reclaim Statistics: congestion_wait Direct number congest waited 845 1312 104 0 Direct time congest waited 19416ms 26560ms 7544ms 0ms Direct full congest waited 745 1105 72 0 Direct number conditional waited 0 0 1322 2935 Direct time conditional waited 0ms 0ms 12ms 312ms Direct full conditional waited 0 0 0 3 KSwapd number congest waited 39 102 75 63 KSwapd time congest waited 2484ms 6760ms 5756ms 3716ms KSwapd full congest waited 20 48 46 25 KSwapd number conditional waited 0 0 0 0 KSwapd time conditional waited 0ms 0ms 0ms 0ms KSwapd full conditional waited 0 0 0 0 The vanilla kernel spent 20 seconds asleep in direct reclaim and only 312ms asleep with the patches. The time kswapd spent congest waited was also reduced by a large factor. MMTests Statistics: duration ser/Sys Time Running Test (seconds) 26.58 28.05 26.9 28.47 Total Elapsed Time (seconds) 30.02 32.86 32.67 30.88 With all patches applies, the completion times are very similar. X86-64 STRESS-HIGHALLOC traceonly-v2r2 lowlumpy-v2r3 waitcongest-v2r3waitwriteback-v2r4 Pass 1 82.00 ( 0.00%) 84.00 ( 2.00%) 85.00 ( 3.00%) 85.00 ( 3.00%) Pass 2 90.00 ( 0.00%) 87.00 (-3.00%) 88.00 (-2.00%) 89.00 (-1.00%) At Rest 92.00 ( 0.00%) 90.00 (-2.00%) 90.00 (-2.00%) 91.00 (-1.00%) Success figures across the board are broadly similar. traceonly-v2r2 lowlumpy-v2r3 waitcongest-v2r3waitwriteback-v2r4 Direct reclaims 1045 944 886 887 Direct reclaim pages scanned 135091 119604 109382 101019 Direct reclaim pages reclaimed 88599 47535 47863 46671 Direct reclaim write file async I/O 494 283 465 280 Direct reclaim write anon async I/O 29357 13710 16656 13462 Direct reclaim write file sync I/O 154 2 2 3 Direct reclaim write anon sync I/O 14594 571 509 561 Wake kswapd requests 7491 933 872 892 Kswapd wakeups 814 778 731 780 Kswapd pages scanned 7290822 15341158 11916436 13703442 Kswapd pages reclaimed 3587336 3142496 3094392 3187151 Kswapd reclaim write file async I/O 91975 32317 28022 29628 Kswapd reclaim write anon async I/O 1992022 789307 829745 849769 Kswapd reclaim write file sync I/O 0 0 0 0 Kswapd reclaim write anon sync I/O 0 0 0 0 Time stalled direct reclaim (seconds) 4588.93 2467.16 2495.41 2547.07 Time kswapd awake (seconds) 2497.66 1020.16 1098.06 1176.82 Total pages scanned 7425913 15460762 12025818 13804461 Total pages reclaimed 3675935 3190031 3142255 3233822 %age total pages scanned/reclaimed 49.50% 20.63% 26.13% 23.43% %age total pages scanned/written 28.66% 5.41% 7.28% 6.47% %age file pages scanned/written 1.25% 0.21% 0.24% 0.22% Percentage Time Spent Direct Reclaim 57.33% 42.15% 42.41% 42.99% Percentage Time kswapd Awake 43.56% 27.87% 29.76% 31.25% Scanned/reclaimed ratios again look good with big improvements in efficiency. The Scanned/written ratios also look much improved. With a better scanned/written ration, there is an expectation that IO would be more efficient and indeed, the time spent in direct reclaim is much reduced by the full series and kswapd spends a little less time awake. Overall, indications here are that allocations were happening much faster and this can be seen with a graph of the latency figures as the allocations were taking place http://www.csn.ul.ie/~mel/postings/vmscanreduce-20101509/highalloc-interlatency-hydra-mean.ps FTrace Reclaim Statistics: congestion_wait Direct number congest waited 1333 204 169 4 Direct time congest waited 78896ms 8288ms 7260ms 200ms Direct full congest waited 756 92 69 2 Direct number conditional waited 0 0 26 186 Direct time conditional waited 0ms 0ms 0ms 2504ms Direct full conditional waited 0 0 0 25 KSwapd number congest waited 4 395 227 282 KSwapd time congest waited 384ms 25136ms 10508ms 18380ms KSwapd full congest waited 3 232 98 176 KSwapd number conditional waited 0 0 0 0 KSwapd time conditional waited 0ms 0ms 0ms 0ms KSwapd full conditional waited 0 0 0 0 KSwapd full conditional waited 318 0 312 9 Overall, the time spent speeping is reduced. kswapd is still hitting congestion_wait() but that is because there are callers remaining where it wasn't clear in advance if they should be changed to wait_iff_congested() or not. Overall the sleep imes are reduced though - from 79ish seconds to about 19. MMTests Statistics: duration User/Sys Time Running Test (seconds) 3415.43 3386.65 3388.39 3377.5 Total Elapsed Time (seconds) 5733.48 3660.33 3689.41 3765.39 With the full series, the time to complete the tests are reduced by 30% PPC64 STRESS-HIGHALLOC traceonly-v2r2 lowlumpy-v2r3 waitcongest-v2r3waitwriteback-v2r4 Pass 1 17.00 ( 0.00%) 34.00 (17.00%) 38.00 (21.00%) 43.00 (26.00%) Pass 2 25.00 ( 0.00%) 37.00 (12.00%) 42.00 (17.00%) 46.00 (21.00%) At Rest 49.00 ( 0.00%) 43.00 (-6.00%) 45.00 (-4.00%) 51.00 ( 2.00%) Success rates there are *way* up particularly considering that the 16MB huge pages on PPC64 mean that it's always much harder to allocate them. FTrace Reclaim Statistics: vmscan stress-highalloc stress-highalloc stress-highalloc stress-highalloc traceonly-v2r2 lowlumpy-v2r3 waitcongest-v2r3waitwriteback-v2r4 Direct reclaims 499 505 564 509 Direct reclaim pages scanned 223478 41898 51818 45605 Direct reclaim pages reclaimed 137730 21148 27161 23455 Direct reclaim write file async I/O 399 136 162 136 Direct reclaim write anon async I/O 46977 2865 4686 3998 Direct reclaim write file sync I/O 29 0 1 3 Direct reclaim write anon sync I/O 31023 159 237 239 Wake kswapd requests 420 351 360 326 Kswapd wakeups 185 294 249 277 Kswapd pages scanned 15703488 16392500 17821724 17598737 Kswapd pages reclaimed 5808466 2908858 3139386 3145435 Kswapd reclaim write file async I/O 159938 18400 18717 13473 Kswapd reclaim write anon async I/O 3467554 228957 322799 234278 Kswapd reclaim write file sync I/O 0 0 0 0 Kswapd reclaim write anon sync I/O 0 0 0 0 Time stalled direct reclaim (seconds) 9665.35 1707.81 2374.32 1871.23 Time kswapd awake (seconds) 9401.21 1367.86 1951.75 1328.88 Total pages scanned 15926966 16434398 17873542 17644342 Total pages reclaimed 5946196 2930006 3166547 3168890 %age total pages scanned/reclaimed 37.33% 17.83% 17.72% 17.96% %age total pages scanned/written 23.27% 1.52% 1.94% 1.43% %age file pages scanned/written 1.01% 0.11% 0.11% 0.08% Percentage Time Spent Direct Reclaim 44.55% 35.10% 41.42% 36.91% Percentage Time kswapd Awake 86.71% 43.58% 52.67% 41.14% While the scanning rates are slightly up, the scanned/reclaimed and scanned/written figures are much improved. The time spent in direct reclaim and with kswapd are massively reduced, mostly by the lowlumpy patches. FTrace Reclaim Statistics: congestion_wait Direct number congest waited 725 303 126 3 Direct time congest waited 45524ms 9180ms 5936ms 300ms Direct full congest waited 487 190 52 3 Direct number conditional waited 0 0 200 301 Direct time conditional waited 0ms 0ms 0ms 1904ms Direct full conditional waited 0 0 0 19 KSwapd number congest waited 0 2 23 4 KSwapd time congest waited 0ms 200ms 420ms 404ms KSwapd full congest waited 0 2 2 4 KSwapd number conditional waited 0 0 0 0 KSwapd time conditional waited 0ms 0ms 0ms 0ms KSwapd full conditional waited 0 0 0 0 Not as dramatic a story here but the time spent asleep is reduced and we can still see what wait_iff_congested is going to sleep when necessary. MMTests Statistics: duration User/Sys Time Running Test (seconds) 12028.09 3157.17 3357.79 3199.16 Total Elapsed Time (seconds) 10842.07 3138.72 3705.54 3229.85 The time to complete this test goes way down. With the full series, we are allocating over twice the number of huge pages in 30% of the time and there is a corresponding impact on the allocation latency graph available at. http://www.csn.ul.ie/~mel/postings/vmscanreduce-20101509/highalloc-interlatency-powyah-mean.ps This patch: Add a trace event for shrink_inactive_list() and updates the sample postprocessing script appropriately. It can be used to determine how many pages were reclaimed and for non-lumpy reclaim where exactly the pages were reclaimed from. Signed-off-by: Mel Gorman Cc: Johannes Weiner Cc: Minchan Kim Cc: Wu Fengguang Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/vmscan.h | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 370aa5a87322..ecf952192a93 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -10,6 +10,7 @@ #define RECLAIM_WB_ANON 0x0001u #define RECLAIM_WB_FILE 0x0002u +#define RECLAIM_WB_MIXED 0x0010u #define RECLAIM_WB_SYNC 0x0004u #define RECLAIM_WB_ASYNC 0x0008u @@ -17,6 +18,7 @@ (flags) ? __print_flags(flags, "|", \ {RECLAIM_WB_ANON, "RECLAIM_WB_ANON"}, \ {RECLAIM_WB_FILE, "RECLAIM_WB_FILE"}, \ + {RECLAIM_WB_MIXED, "RECLAIM_WB_MIXED"}, \ {RECLAIM_WB_SYNC, "RECLAIM_WB_SYNC"}, \ {RECLAIM_WB_ASYNC, "RECLAIM_WB_ASYNC"} \ ) : "RECLAIM_WB_NONE" @@ -26,6 +28,12 @@ (sync == PAGEOUT_IO_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ ) +#define trace_shrink_flags(file, sync) ( \ + (sync == PAGEOUT_IO_SYNC ? RECLAIM_WB_MIXED : \ + (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON)) | \ + (sync == PAGEOUT_IO_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ + ) + TRACE_EVENT(mm_vmscan_kswapd_sleep, TP_PROTO(int nid), @@ -269,6 +277,40 @@ TRACE_EVENT(mm_vmscan_writepage, show_reclaim_flags(__entry->reclaim_flags)) ); +TRACE_EVENT(mm_vmscan_lru_shrink_inactive, + + TP_PROTO(int nid, int zid, + unsigned long nr_scanned, unsigned long nr_reclaimed, + int priority, int reclaim_flags), + + TP_ARGS(nid, zid, nr_scanned, nr_reclaimed, priority, reclaim_flags), + + TP_STRUCT__entry( + __field(int, nid) + __field(int, zid) + __field(unsigned long, nr_scanned) + __field(unsigned long, nr_reclaimed) + __field(int, priority) + __field(int, reclaim_flags) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->zid = zid; + __entry->nr_scanned = nr_scanned; + __entry->nr_reclaimed = nr_reclaimed; + __entry->priority = priority; + __entry->reclaim_flags = reclaim_flags; + ), + + TP_printk("nid=%d zid=%d nr_scanned=%ld nr_reclaimed=%ld priority=%d flags=%s", + __entry->nid, __entry->zid, + __entry->nr_scanned, __entry->nr_reclaimed, + __entry->priority, + show_reclaim_flags(__entry->reclaim_flags)) +); + + #endif /* _TRACE_VMSCAN_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 52bb9198668968506f9d12bf35d7f5d3f094921e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 26 Oct 2010 14:21:41 -0700 Subject: writeback: account for time spent congestion_waited There is strong evidence to indicate a lot of time is being spent in congestion_wait(), some of it unnecessarily. This patch adds a tracepoint for congestion_wait to record when congestion_wait() was called, how long the timeout was for and how long it actually slept. Signed-off-by: Mel Gorman Reviewed-by: Minchan Kim Reviewed-by: Johannes Weiner Cc: Wu Fengguang Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/writeback.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 0bb01ab2e984..d2b2654606ec 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -151,6 +151,34 @@ DEFINE_WBC_EVENT(wbc_balance_dirty_written); DEFINE_WBC_EVENT(wbc_balance_dirty_wait); DEFINE_WBC_EVENT(wbc_writepage); +DECLARE_EVENT_CLASS(writeback_congest_waited_template, + + TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), + + TP_ARGS(usec_timeout, usec_delayed), + + TP_STRUCT__entry( + __field( unsigned int, usec_timeout ) + __field( unsigned int, usec_delayed ) + ), + + TP_fast_assign( + __entry->usec_timeout = usec_timeout; + __entry->usec_delayed = usec_delayed; + ), + + TP_printk("usec_timeout=%u usec_delayed=%u", + __entry->usec_timeout, + __entry->usec_delayed) +); + +DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait, + + TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), + + TP_ARGS(usec_timeout, usec_delayed) +); + #endif /* _TRACE_WRITEBACK_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 7d3579e8e61937cbba268ea9b218d006b6d64221 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 26 Oct 2010 14:21:42 -0700 Subject: vmscan: narrow the scenarios in whcih lumpy reclaim uses synchrounous reclaim shrink_page_list() can decide to give up reclaiming a page under a number of conditions such as 1. trylock_page() failure 2. page is unevictable 3. zone reclaim and page is mapped 4. PageWriteback() is true 5. page is swapbacked and swap is full 6. add_to_swap() failure 7. page is dirty and gfpmask don't have GFP_IO, GFP_FS 8. page is pinned 9. IO queue is congested 10. pageout() start IO, but not finished With lumpy reclaim, failures result in entering synchronous lumpy reclaim but this can be unnecessary. In cases (2), (3), (5), (6), (7) and (8), there is no point retrying. This patch causes lumpy reclaim to abort when it is known it will fail. Case (9) is more interesting. current behavior is, 1. start shrink_page_list(async) 2. found queue_congested() 3. skip pageout write 4. still start shrink_page_list(sync) 5. wait on a lot of pages 6. again, found queue_congested() 7. give up pageout write again So, it's useless time wasting. However, just skipping page reclaim is also notgood as x86 allocating a huge page needs 512 pages for example. It can have more dirty pages than queue congestion threshold (~=128). After this patch, pageout() behaves as follows; - If order > PAGE_ALLOC_COSTLY_ORDER Ignore queue congestion always. - If order <= PAGE_ALLOC_COSTLY_ORDER skip write page and disable lumpy reclaim. Signed-off-by: KOSAKI Motohiro Signed-off-by: Mel Gorman Reviewed-by: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Minchan Kim Cc: Wu Fengguang Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/vmscan.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index ecf952192a93..c255fcc587bf 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -25,13 +25,13 @@ #define trace_reclaim_flags(page, sync) ( \ (page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ - (sync == PAGEOUT_IO_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ + (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ ) #define trace_shrink_flags(file, sync) ( \ - (sync == PAGEOUT_IO_SYNC ? RECLAIM_WB_MIXED : \ + (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_MIXED : \ (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON)) | \ - (sync == PAGEOUT_IO_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ + (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ ) TRACE_EVENT(mm_vmscan_kswapd_sleep, -- cgit v1.2.3 From 0e093d99763eb4cea09f8ca4f1d01f34e121d10b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 26 Oct 2010 14:21:45 -0700 Subject: writeback: do not sleep on the congestion queue if there are no congested BDIs or if significant congestion is not being encountered in the current zone If congestion_wait() is called with no BDI congested, the caller will sleep for the full timeout and this may be an unnecessary sleep. This patch adds a wait_iff_congested() that checks congestion and only sleeps if a BDI is congested else, it calls cond_resched() to ensure the caller is not hogging the CPU longer than its quota but otherwise will not sleep. This is aimed at reducing some of the major desktop stalls reported during IO. For example, while kswapd is operating, it calls congestion_wait() but it could just have been reclaiming clean page cache pages with no congestion. Without this patch, it would sleep for a full timeout but after this patch, it'll just call schedule() if it has been on the CPU too long. Similar logic applies to direct reclaimers that are not making enough progress. Signed-off-by: Mel Gorman Cc: Johannes Weiner Cc: Minchan Kim Cc: Wu Fengguang Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/writeback.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index d2b2654606ec..89a2b2db4375 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -179,6 +179,13 @@ DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait, TP_ARGS(usec_timeout, usec_delayed) ); +DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested, + + TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), + + TP_ARGS(usec_timeout, usec_delayed) +); + #endif /* _TRACE_WRITEBACK_H */ /* This part must be outside protection */ -- cgit v1.2.3 From b853fd364810a241050778124842a8c415c72a69 Mon Sep 17 00:00:00 2001 From: Wen Congyang Date: Wed, 27 Oct 2010 21:27:12 -0400 Subject: ext4: avoid null dereference in trace_ext4_mballoc_discard ac->inode is set to null in function ext4_mb_release_group_pa(), and then trace_ext4_mballoc_discard(ac) is called, the kernel will panic. BUG: unable to handle kernel NULL pointer dereference at 000000a4 IP: [] ftrace_raw_event_ext4__mballoc+0x54/0xc0 [ext4] *pdpt = 0000000000abd001 *pde = 0000000000000000 Oops: 0000 [#1] SMP Pid: 550, comm: flush-8:16 Not tainted 2.6.36-rc1 #1 SE7320EP2/Altos G530 EIP: 0060:[] EFLAGS: 00010206 CPU: 1 EIP is at ftrace_raw_event_ext4__mballoc+0x54/0xc0 [ext4] EAX: f32ac840 EBX: f3f1cf88 ECX: f32ac840 EDX: 00000000 ESI: f32ac83c EDI: f880b9d8 EBP: 00000000 ESP: f4b77ae4 DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 Process flush-8:16 (pid: 550, ti=f4b76000 task=f613e540 task.ti=f4b76000) Call Trace: [] ? ext4_mb_release_group_pa+0x121/0x150 [ext4] [] ? ext4_mb_discard_group_preallocations+0x336/0x400 [ext4] [] ? ext4_mb_new_blocks+0x3d1/0x4f0 [ext4] [] ? __make_request+0x10b/0x440 [] ? ext4_ext_map_blocks+0x1334/0x1980 [ext4] [] ? rb_reserve_next_event+0xaa/0x3b0 [] ? ext4_map_blocks+0xd6/0x1d0 [ext4] [] ? mpage_da_map_blocks+0xc7/0x8a0 [ext4] [] ? find_get_pages_tag+0x38/0x110 [] ? __pagevec_release+0x15/0x20 [] ? ext4_da_writepages+0x2b5/0x5d0 [ext4] [] ? __writepage+0x0/0x30 [] ? do_writepages+0x14/0x30 [] ? writeback_single_inode+0xa0/0x240 [] ? writeback_sb_inodes+0xc1/0x180 [] ? writeback_inodes_wb+0x88/0x140 [] ? wb_writeback+0x20b/0x320 [] ? lock_timer_base+0x27/0x50 [] ? wb_do_writeback+0x150/0x190 [] ? bdi_writeback_thread+0x88/0x1f0 [] ? complete+0x40/0x60 [] ? bdi_writeback_thread+0x0/0x1f0 [] ? kthread+0x74/0x80 [] ? kthread+0x0/0x80 [] ? kernel_thread_helper+0x6/0x10 Signed-off-by: Wen Congyang Acked-by: Steven Rostedt Signed-off-by: "Theodore Ts'o" --- include/trace/events/ext4.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 01e9e0076a92..6a1fcff95f7c 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -796,8 +796,9 @@ DECLARE_EVENT_CLASS(ext4__mballoc, ), TP_fast_assign( - __entry->dev = ac->ac_inode->i_sb->s_dev; - __entry->ino = ac->ac_inode->i_ino; + __entry->dev = ac->ac_sb->s_dev; + __entry->ino = ac->ac_inode ? + ac->ac_inode->i_ino : 0; __entry->result_logical = ac->ac_b_ex.fe_logical; __entry->result_start = ac->ac_b_ex.fe_start; __entry->result_group = ac->ac_b_ex.fe_group; -- cgit v1.2.3 From 4d5476164a052e80d4ef430e368e76dbde96801f Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 27 Oct 2010 21:30:07 -0400 Subject: ext4: fix oops in trace_ext4_mb_release_group_pa Our QA reported an oops in the ext4_mb_release_group_pa tracing, and Josef Bacik pointed out that it was because we may have a non-null but uninitialized ac_inode in the allocation context. I can reproduce it when running xfstests with ext4 tracepoints on, on a CONFIG_SLAB_DEBUG kernel. We call trace_ext4_mb_release_group_pa from 2 places, ext4_mb_discard_group_preallocations and ext4_mb_discard_lg_preallocations In both cases we allocate an ac as a container just for tracing (!) and never fill in the ac_inode. There's no reason to be assigning, testing, or printing it as far as I can see, so just remove it from the tracepoint. Signed-off-by: Eric Sandeen Reviewed-by: Josef Bacik Signed-off-by: "Theodore Ts'o" --- include/trace/events/ext4.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 6a1fcff95f7c..fbdfa3a6bbd8 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -432,7 +432,6 @@ TRACE_EVENT(ext4_mb_release_group_pa, TP_STRUCT__entry( __field( dev_t, dev ) - __field( ino_t, ino ) __field( __u64, pa_pstart ) __field( __u32, pa_len ) @@ -440,8 +439,6 @@ TRACE_EVENT(ext4_mb_release_group_pa, TP_fast_assign( __entry->dev = sb->s_dev; - __entry->ino = (ac && ac->ac_inode) ? - ac->ac_inode->i_ino : 0; __entry->pa_pstart = pa->pa_pstart; __entry->pa_len = pa->pa_len; ), -- cgit v1.2.3 From 3e1e5f501632460184a98237d5460c521510535e Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 27 Oct 2010 21:30:07 -0400 Subject: ext4: don't use ext4_allocation_contexts for tracing Many tracepoints were populating an ext4_allocation_context to pass in, but this requires a slab allocation even when tracepoints are off. In fact, 4 of 5 of these allocations were only for tracing. In addition, we were only using a small fraction of the 144 bytes of this structure for this purpose. We can do away with all these alloc/frees of the ac and simply pass in the bits we care about, instead. I tested this by turning on tracing and running through xfstests on x86_64. I did not actually do anything with the trace output, however. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- include/trace/events/ext4.h | 51 ++++++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 22 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index fbdfa3a6bbd8..b5f4938d612d 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -396,11 +396,11 @@ DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_group_pa, TRACE_EVENT(ext4_mb_release_inode_pa, TP_PROTO(struct super_block *sb, - struct ext4_allocation_context *ac, + struct inode *inode, struct ext4_prealloc_space *pa, unsigned long long block, unsigned int count), - TP_ARGS(sb, ac, pa, block, count), + TP_ARGS(sb, inode, pa, block, count), TP_STRUCT__entry( __field( dev_t, dev ) @@ -412,8 +412,7 @@ TRACE_EVENT(ext4_mb_release_inode_pa, TP_fast_assign( __entry->dev = sb->s_dev; - __entry->ino = (ac && ac->ac_inode) ? - ac->ac_inode->i_ino : 0; + __entry->ino = inode->i_ino; __entry->block = block; __entry->count = count; ), @@ -425,10 +424,9 @@ TRACE_EVENT(ext4_mb_release_inode_pa, TRACE_EVENT(ext4_mb_release_group_pa, TP_PROTO(struct super_block *sb, - struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa), - TP_ARGS(sb, ac, pa), + TP_ARGS(sb, pa), TP_STRUCT__entry( __field( dev_t, dev ) @@ -779,47 +777,56 @@ TRACE_EVENT(ext4_mballoc_prealloc, ); DECLARE_EVENT_CLASS(ext4__mballoc, - TP_PROTO(struct ext4_allocation_context *ac), + TP_PROTO(struct super_block *sb, + struct inode *inode, + ext4_group_t group, + ext4_grpblk_t start, + ext4_grpblk_t len), - TP_ARGS(ac), + TP_ARGS(sb, inode, group, start, len), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( __u32, result_logical ) __field( int, result_start ) __field( __u32, result_group ) __field( int, result_len ) ), TP_fast_assign( - __entry->dev = ac->ac_sb->s_dev; - __entry->ino = ac->ac_inode ? - ac->ac_inode->i_ino : 0; - __entry->result_logical = ac->ac_b_ex.fe_logical; - __entry->result_start = ac->ac_b_ex.fe_start; - __entry->result_group = ac->ac_b_ex.fe_group; - __entry->result_len = ac->ac_b_ex.fe_len; + __entry->dev = sb->s_dev; + __entry->ino = inode ? inode->i_ino : 0; + __entry->result_start = start; + __entry->result_group = group; + __entry->result_len = len; ), - TP_printk("dev %s inode %lu extent %u/%d/%u@%u ", + TP_printk("dev %s inode %lu extent %u/%d/%u ", jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, __entry->result_group, __entry->result_start, - __entry->result_len, __entry->result_logical) + __entry->result_len) ); DEFINE_EVENT(ext4__mballoc, ext4_mballoc_discard, - TP_PROTO(struct ext4_allocation_context *ac), + TP_PROTO(struct super_block *sb, + struct inode *inode, + ext4_group_t group, + ext4_grpblk_t start, + ext4_grpblk_t len), - TP_ARGS(ac) + TP_ARGS(sb, inode, group, start, len) ); DEFINE_EVENT(ext4__mballoc, ext4_mballoc_free, - TP_PROTO(struct ext4_allocation_context *ac), + TP_PROTO(struct super_block *sb, + struct inode *inode, + ext4_group_t group, + ext4_grpblk_t start, + ext4_grpblk_t len), - TP_ARGS(ac) + TP_ARGS(sb, inode, group, start, len) ); TRACE_EVENT(ext4_forget, -- cgit v1.2.3 From a269029d0e2192046be4c07ed78a45022469ee4c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 27 Oct 2010 22:08:50 -0400 Subject: ext4,jbd2: convert tracepoints to use major/minor numbers Unfortunately perf can't deal with anything other than direct structure accesses in the TP_printk() section. It will drop dead when it sees jbd2_dev_to_name() in the "print fmt" section of the tracepoint. Addresses-Google-Bug: 3138508 Signed-off-by: "Theodore Ts'o" --- include/trace/events/ext4.h | 329 +++++++++++++++++++++++++++----------------- include/trace/events/jbd2.h | 78 ++++++----- 2 files changed, 251 insertions(+), 156 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index b5f4938d612d..8f59db107bbb 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -21,7 +21,8 @@ TRACE_EVENT(ext4_free_inode, TP_ARGS(inode), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( umode_t, mode ) __field( uid_t, uid ) @@ -30,7 +31,8 @@ TRACE_EVENT(ext4_free_inode, ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; __entry->mode = inode->i_mode; __entry->uid = inode->i_uid; @@ -38,9 +40,10 @@ TRACE_EVENT(ext4_free_inode, __entry->blocks = inode->i_blocks; ), - TP_printk("dev %s ino %lu mode 0%o uid %u gid %u blocks %llu", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, - __entry->mode, __entry->uid, __entry->gid, + TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %llu", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->mode, + __entry->uid, __entry->gid, (unsigned long long) __entry->blocks) ); @@ -50,20 +53,22 @@ TRACE_EVENT(ext4_request_inode, TP_ARGS(dir, mode), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, dir ) __field( umode_t, mode ) ), TP_fast_assign( - __entry->dev = dir->i_sb->s_dev; + __entry->dev_major = MAJOR(dir->i_sb->s_dev); + __entry->dev_minor = MINOR(dir->i_sb->s_dev); __entry->dir = dir->i_ino; __entry->mode = mode; ), - TP_printk("dev %s dir %lu mode 0%o", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->dir, - __entry->mode) + TP_printk("dev %d,%d dir %lu mode 0%o", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->dir, __entry->mode) ); TRACE_EVENT(ext4_allocate_inode, @@ -72,21 +77,24 @@ TRACE_EVENT(ext4_allocate_inode, TP_ARGS(inode, dir, mode), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( ino_t, dir ) __field( umode_t, mode ) ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; __entry->dir = dir->i_ino; __entry->mode = mode; ), - TP_printk("dev %s ino %lu dir %lu mode 0%o", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + TP_printk("dev %d,%d ino %lu dir %lu mode 0%o", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, (unsigned long) __entry->dir, __entry->mode) ); @@ -98,7 +106,8 @@ DECLARE_EVENT_CLASS(ext4__write_begin, TP_ARGS(inode, pos, len, flags), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( loff_t, pos ) __field( unsigned int, len ) @@ -106,15 +115,17 @@ DECLARE_EVENT_CLASS(ext4__write_begin, ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; __entry->pos = pos; __entry->len = len; __entry->flags = flags; ), - TP_printk("dev %s ino %lu pos %llu len %u flags %u", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + TP_printk("dev %d,%d ino %lu pos %llu len %u flags %u", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->pos, __entry->len, __entry->flags) ); @@ -141,7 +152,8 @@ DECLARE_EVENT_CLASS(ext4__write_end, TP_ARGS(inode, pos, len, copied), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( loff_t, pos ) __field( unsigned int, len ) @@ -149,16 +161,18 @@ DECLARE_EVENT_CLASS(ext4__write_end, ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; __entry->pos = pos; __entry->len = len; __entry->copied = copied; ), - TP_printk("dev %s ino %lu pos %llu len %u copied %u", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, - __entry->pos, __entry->len, __entry->copied) + TP_printk("dev %d,%d ino %lu pos %llu len %u copied %u", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->pos, + __entry->len, __entry->copied) ); DEFINE_EVENT(ext4__write_end, ext4_ordered_write_end, @@ -199,21 +213,23 @@ TRACE_EVENT(ext4_writepage, TP_ARGS(inode, page), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( pgoff_t, index ) ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; __entry->index = page->index; ), - TP_printk("dev %s ino %lu page_index %lu", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, - __entry->index) + TP_printk("dev %d,%d ino %lu page_index %lu", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->index) ); TRACE_EVENT(ext4_da_writepages, @@ -222,7 +238,8 @@ TRACE_EVENT(ext4_da_writepages, TP_ARGS(inode, wbc), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( long, nr_to_write ) __field( long, pages_skipped ) @@ -236,7 +253,8 @@ TRACE_EVENT(ext4_da_writepages, ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; __entry->nr_to_write = wbc->nr_to_write; __entry->pages_skipped = wbc->pages_skipped; @@ -249,8 +267,8 @@ TRACE_EVENT(ext4_da_writepages, __entry->writeback_index = inode->i_mapping->writeback_index; ), - TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d writeback_index %lu", - jbd2_dev_to_name(__entry->dev), + TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d writeback_index %lu", + __entry->dev_major, __entry->dev_minor, (unsigned long) __entry->ino, __entry->nr_to_write, __entry->pages_skipped, __entry->range_start, __entry->range_end, __entry->nonblocking, @@ -265,7 +283,8 @@ TRACE_EVENT(ext4_da_write_pages, TP_ARGS(inode, mpd), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( __u64, b_blocknr ) __field( __u32, b_size ) @@ -276,7 +295,8 @@ TRACE_EVENT(ext4_da_write_pages, ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; __entry->b_blocknr = mpd->b_blocknr; __entry->b_size = mpd->b_size; @@ -286,8 +306,9 @@ TRACE_EVENT(ext4_da_write_pages, __entry->pages_written = mpd->pages_written; ), - TP_printk("dev %s ino %lu b_blocknr %llu b_size %u b_state 0x%04x first_page %lu io_done %d pages_written %d", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + TP_printk("dev %d,%d ino %lu b_blocknr %llu b_size %u b_state 0x%04x first_page %lu io_done %d pages_written %d", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->b_blocknr, __entry->b_size, __entry->b_state, __entry->first_page, __entry->io_done, __entry->pages_written) @@ -300,7 +321,8 @@ TRACE_EVENT(ext4_da_writepages_result, TP_ARGS(inode, wbc, ret, pages_written), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( int, ret ) __field( int, pages_written ) @@ -310,7 +332,8 @@ TRACE_EVENT(ext4_da_writepages_result, ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; __entry->ret = ret; __entry->pages_written = pages_written; @@ -319,8 +342,8 @@ TRACE_EVENT(ext4_da_writepages_result, __entry->writeback_index = inode->i_mapping->writeback_index; ), - TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld more_io %d writeback_index %lu", - jbd2_dev_to_name(__entry->dev), + TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld more_io %d writeback_index %lu", + __entry->dev_major, __entry->dev_minor, (unsigned long) __entry->ino, __entry->ret, __entry->pages_written, __entry->pages_skipped, __entry->more_io, @@ -334,20 +357,23 @@ TRACE_EVENT(ext4_discard_blocks, TP_ARGS(sb, blk, count), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( __u64, blk ) __field( __u64, count ) ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev_major = MAJOR(sb->s_dev); + __entry->dev_minor = MINOR(sb->s_dev); __entry->blk = blk; __entry->count = count; ), - TP_printk("dev %s blk %llu count %llu", - jbd2_dev_to_name(__entry->dev), __entry->blk, __entry->count) + TP_printk("dev %d,%d blk %llu count %llu", + __entry->dev_major, __entry->dev_minor, + __entry->blk, __entry->count) ); DECLARE_EVENT_CLASS(ext4__mb_new_pa, @@ -357,7 +383,8 @@ DECLARE_EVENT_CLASS(ext4__mb_new_pa, TP_ARGS(ac, pa), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( __u64, pa_pstart ) __field( __u32, pa_len ) @@ -366,16 +393,18 @@ DECLARE_EVENT_CLASS(ext4__mb_new_pa, ), TP_fast_assign( - __entry->dev = ac->ac_sb->s_dev; + __entry->dev_major = MAJOR(ac->ac_sb->s_dev); + __entry->dev_minor = MINOR(ac->ac_sb->s_dev); __entry->ino = ac->ac_inode->i_ino; __entry->pa_pstart = pa->pa_pstart; __entry->pa_len = pa->pa_len; __entry->pa_lstart = pa->pa_lstart; ), - TP_printk("dev %s ino %lu pstart %llu len %u lstart %llu", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, - __entry->pa_pstart, __entry->pa_len, __entry->pa_lstart) + TP_printk("dev %d,%d ino %lu pstart %llu len %u lstart %llu", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->pa_pstart, + __entry->pa_len, __entry->pa_lstart) ); DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_inode_pa, @@ -403,7 +432,8 @@ TRACE_EVENT(ext4_mb_release_inode_pa, TP_ARGS(sb, inode, pa, block, count), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( __u64, block ) __field( __u32, count ) @@ -411,15 +441,16 @@ TRACE_EVENT(ext4_mb_release_inode_pa, ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev_major = MAJOR(sb->s_dev); + __entry->dev_minor = MINOR(sb->s_dev); __entry->ino = inode->i_ino; __entry->block = block; __entry->count = count; ), - TP_printk("dev %s ino %lu block %llu count %u", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, - __entry->block, __entry->count) + TP_printk("dev %d,%d ino %lu block %llu count %u", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->block, __entry->count) ); TRACE_EVENT(ext4_mb_release_group_pa, @@ -429,20 +460,23 @@ TRACE_EVENT(ext4_mb_release_group_pa, TP_ARGS(sb, pa), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( __u64, pa_pstart ) __field( __u32, pa_len ) ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev_major = MAJOR(sb->s_dev); + __entry->dev_minor = MINOR(sb->s_dev); __entry->pa_pstart = pa->pa_pstart; __entry->pa_len = pa->pa_len; ), - TP_printk("dev %s pstart %llu len %u", - jbd2_dev_to_name(__entry->dev), __entry->pa_pstart, __entry->pa_len) + TP_printk("dev %d,%d pstart %llu len %u", + __entry->dev_major, __entry->dev_minor, + __entry->pa_pstart, __entry->pa_len) ); TRACE_EVENT(ext4_discard_preallocations, @@ -451,18 +485,21 @@ TRACE_EVENT(ext4_discard_preallocations, TP_ARGS(inode), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; ), - TP_printk("dev %s ino %lu", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino) + TP_printk("dev %d,%d ino %lu", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino) ); TRACE_EVENT(ext4_mb_discard_preallocations, @@ -471,18 +508,20 @@ TRACE_EVENT(ext4_mb_discard_preallocations, TP_ARGS(sb, needed), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( int, needed ) ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev_major = MAJOR(sb->s_dev); + __entry->dev_minor = MINOR(sb->s_dev); __entry->needed = needed; ), - TP_printk("dev %s needed %d", - jbd2_dev_to_name(__entry->dev), __entry->needed) + TP_printk("dev %d,%d needed %d", + __entry->dev_major, __entry->dev_minor, __entry->needed) ); TRACE_EVENT(ext4_request_blocks, @@ -491,7 +530,8 @@ TRACE_EVENT(ext4_request_blocks, TP_ARGS(ar), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( unsigned int, flags ) __field( unsigned int, len ) @@ -504,7 +544,8 @@ TRACE_EVENT(ext4_request_blocks, ), TP_fast_assign( - __entry->dev = ar->inode->i_sb->s_dev; + __entry->dev_major = MAJOR(ar->inode->i_sb->s_dev); + __entry->dev_minor = MINOR(ar->inode->i_sb->s_dev); __entry->ino = ar->inode->i_ino; __entry->flags = ar->flags; __entry->len = ar->len; @@ -516,8 +557,9 @@ TRACE_EVENT(ext4_request_blocks, __entry->pright = ar->pright; ), - TP_printk("dev %s ino %lu flags %u len %u lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + TP_printk("dev %d,%d ino %lu flags %u len %u lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->flags, __entry->len, (unsigned long long) __entry->logical, (unsigned long long) __entry->goal, @@ -533,7 +575,8 @@ TRACE_EVENT(ext4_allocate_blocks, TP_ARGS(ar, block), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( __u64, block ) __field( unsigned int, flags ) @@ -547,7 +590,8 @@ TRACE_EVENT(ext4_allocate_blocks, ), TP_fast_assign( - __entry->dev = ar->inode->i_sb->s_dev; + __entry->dev_major = MAJOR(ar->inode->i_sb->s_dev); + __entry->dev_minor = MINOR(ar->inode->i_sb->s_dev); __entry->ino = ar->inode->i_ino; __entry->block = block; __entry->flags = ar->flags; @@ -560,9 +604,10 @@ TRACE_EVENT(ext4_allocate_blocks, __entry->pright = ar->pright; ), - TP_printk("dev %s ino %lu flags %u len %u block %llu lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, - __entry->flags, __entry->len, __entry->block, + TP_printk("dev %d,%d ino %lu flags %u len %u block %llu lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->flags, + __entry->len, __entry->block, (unsigned long long) __entry->logical, (unsigned long long) __entry->goal, (unsigned long long) __entry->lleft, @@ -578,7 +623,8 @@ TRACE_EVENT(ext4_free_blocks, TP_ARGS(inode, block, count, flags), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( umode_t, mode ) __field( __u64, block ) @@ -587,7 +633,8 @@ TRACE_EVENT(ext4_free_blocks, ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; __entry->mode = inode->i_mode; __entry->block = block; @@ -595,8 +642,9 @@ TRACE_EVENT(ext4_free_blocks, __entry->flags = flags; ), - TP_printk("dev %s ino %lu mode 0%o block %llu count %lu flags %d", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %d", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->mode, __entry->block, __entry->count, __entry->flags) ); @@ -607,7 +655,8 @@ TRACE_EVENT(ext4_sync_file, TP_ARGS(file, datasync), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( ino_t, parent ) __field( int, datasync ) @@ -616,14 +665,16 @@ TRACE_EVENT(ext4_sync_file, TP_fast_assign( struct dentry *dentry = file->f_path.dentry; - __entry->dev = dentry->d_inode->i_sb->s_dev; + __entry->dev_major = MAJOR(dentry->d_inode->i_sb->s_dev); + __entry->dev_minor = MINOR(dentry->d_inode->i_sb->s_dev); __entry->ino = dentry->d_inode->i_ino; __entry->datasync = datasync; __entry->parent = dentry->d_parent->d_inode->i_ino; ), - TP_printk("dev %s ino %ld parent %ld datasync %d ", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + TP_printk("dev %d,%d ino %ld parent %ld datasync %d ", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, (unsigned long) __entry->parent, __entry->datasync) ); @@ -633,18 +684,20 @@ TRACE_EVENT(ext4_sync_fs, TP_ARGS(sb, wait), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( int, wait ) ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev_major = MAJOR(sb->s_dev); + __entry->dev_minor = MINOR(sb->s_dev); __entry->wait = wait; ), - TP_printk("dev %s wait %d", jbd2_dev_to_name(__entry->dev), - __entry->wait) + TP_printk("dev %d,%d wait %d", __entry->dev_major, + __entry->dev_minor, __entry->wait) ); TRACE_EVENT(ext4_alloc_da_blocks, @@ -653,21 +706,24 @@ TRACE_EVENT(ext4_alloc_da_blocks, TP_ARGS(inode), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( unsigned int, data_blocks ) __field( unsigned int, meta_blocks ) ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; __entry->data_blocks = EXT4_I(inode)->i_reserved_data_blocks; __entry->meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; ), - TP_printk("dev %s ino %lu data_blocks %u meta_blocks %u", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + TP_printk("dev %d,%d ino %lu data_blocks %u meta_blocks %u", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->data_blocks, __entry->meta_blocks) ); @@ -677,7 +733,8 @@ TRACE_EVENT(ext4_mballoc_alloc, TP_ARGS(ac), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( __u16, found ) __field( __u16, groups ) @@ -700,7 +757,8 @@ TRACE_EVENT(ext4_mballoc_alloc, ), TP_fast_assign( - __entry->dev = ac->ac_inode->i_sb->s_dev; + __entry->dev_major = MAJOR(ac->ac_inode->i_sb->s_dev); + __entry->dev_minor = MINOR(ac->ac_inode->i_sb->s_dev); __entry->ino = ac->ac_inode->i_ino; __entry->found = ac->ac_found; __entry->flags = ac->ac_flags; @@ -722,10 +780,11 @@ TRACE_EVENT(ext4_mballoc_alloc, __entry->result_len = ac->ac_f_ex.fe_len; ), - TP_printk("dev %s inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u " + TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u " "result %u/%d/%u@%u blks %u grps %u cr %u flags 0x%04x " "tail %u broken %u", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->orig_group, __entry->orig_start, __entry->orig_len, __entry->orig_logical, __entry->goal_group, __entry->goal_start, @@ -743,7 +802,8 @@ TRACE_EVENT(ext4_mballoc_prealloc, TP_ARGS(ac), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( __u32, orig_logical ) __field( int, orig_start ) @@ -756,7 +816,8 @@ TRACE_EVENT(ext4_mballoc_prealloc, ), TP_fast_assign( - __entry->dev = ac->ac_inode->i_sb->s_dev; + __entry->dev_major = MAJOR(ac->ac_inode->i_sb->s_dev); + __entry->dev_minor = MINOR(ac->ac_inode->i_sb->s_dev); __entry->ino = ac->ac_inode->i_ino; __entry->orig_logical = ac->ac_o_ex.fe_logical; __entry->orig_start = ac->ac_o_ex.fe_start; @@ -768,8 +829,9 @@ TRACE_EVENT(ext4_mballoc_prealloc, __entry->result_len = ac->ac_b_ex.fe_len; ), - TP_printk("dev %s inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->orig_group, __entry->orig_start, __entry->orig_len, __entry->orig_logical, __entry->result_group, __entry->result_start, @@ -786,7 +848,8 @@ DECLARE_EVENT_CLASS(ext4__mballoc, TP_ARGS(sb, inode, group, start, len), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( int, result_start ) __field( __u32, result_group ) @@ -794,15 +857,17 @@ DECLARE_EVENT_CLASS(ext4__mballoc, ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev_major = MAJOR(sb->s_dev); + __entry->dev_minor = MINOR(sb->s_dev); __entry->ino = inode ? inode->i_ino : 0; __entry->result_start = start; __entry->result_group = group; __entry->result_len = len; ), - TP_printk("dev %s inode %lu extent %u/%d/%u ", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + TP_printk("dev %d,%d inode %lu extent %u/%d/%u ", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->result_group, __entry->result_start, __entry->result_len) ); @@ -835,7 +900,8 @@ TRACE_EVENT(ext4_forget, TP_ARGS(inode, is_metadata, block), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( umode_t, mode ) __field( int, is_metadata ) @@ -843,16 +909,18 @@ TRACE_EVENT(ext4_forget, ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; __entry->mode = inode->i_mode; __entry->is_metadata = is_metadata; __entry->block = block; ), - TP_printk("dev %s ino %lu mode 0%o is_metadata %d block %llu", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, - __entry->mode, __entry->is_metadata, __entry->block) + TP_printk("dev %d,%d ino %lu mode 0%o is_metadata %d block %llu", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->mode, + __entry->is_metadata, __entry->block) ); TRACE_EVENT(ext4_da_update_reserve_space, @@ -861,7 +929,8 @@ TRACE_EVENT(ext4_da_update_reserve_space, TP_ARGS(inode, used_blocks), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( umode_t, mode ) __field( __u64, i_blocks ) @@ -872,7 +941,8 @@ TRACE_EVENT(ext4_da_update_reserve_space, ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; __entry->mode = inode->i_mode; __entry->i_blocks = inode->i_blocks; @@ -882,9 +952,10 @@ TRACE_EVENT(ext4_da_update_reserve_space, __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks; ), - TP_printk("dev %s ino %lu mode 0%o i_blocks %llu used_blocks %d reserved_data_blocks %d reserved_meta_blocks %d allocated_meta_blocks %d", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, - __entry->mode, (unsigned long long) __entry->i_blocks, + TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d reserved_data_blocks %d reserved_meta_blocks %d allocated_meta_blocks %d", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->mode, + (unsigned long long) __entry->i_blocks, __entry->used_blocks, __entry->reserved_data_blocks, __entry->reserved_meta_blocks, __entry->allocated_meta_blocks) ); @@ -895,7 +966,8 @@ TRACE_EVENT(ext4_da_reserve_space, TP_ARGS(inode, md_needed), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( umode_t, mode ) __field( __u64, i_blocks ) @@ -905,7 +977,8 @@ TRACE_EVENT(ext4_da_reserve_space, ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; __entry->mode = inode->i_mode; __entry->i_blocks = inode->i_blocks; @@ -914,8 +987,9 @@ TRACE_EVENT(ext4_da_reserve_space, __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; ), - TP_printk("dev %s ino %lu mode 0%o i_blocks %llu md_needed %d reserved_data_blocks %d reserved_meta_blocks %d", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu md_needed %d reserved_data_blocks %d reserved_meta_blocks %d", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->mode, (unsigned long long) __entry->i_blocks, __entry->md_needed, __entry->reserved_data_blocks, __entry->reserved_meta_blocks) @@ -927,7 +1001,8 @@ TRACE_EVENT(ext4_da_release_space, TP_ARGS(inode, freed_blocks), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( umode_t, mode ) __field( __u64, i_blocks ) @@ -938,7 +1013,8 @@ TRACE_EVENT(ext4_da_release_space, ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; __entry->mode = inode->i_mode; __entry->i_blocks = inode->i_blocks; @@ -948,8 +1024,9 @@ TRACE_EVENT(ext4_da_release_space, __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks; ), - TP_printk("dev %s ino %lu mode 0%o i_blocks %llu freed_blocks %d reserved_data_blocks %d reserved_meta_blocks %d allocated_meta_blocks %d", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu freed_blocks %d reserved_data_blocks %d reserved_meta_blocks %d allocated_meta_blocks %d", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->mode, (unsigned long long) __entry->i_blocks, __entry->freed_blocks, __entry->reserved_data_blocks, __entry->reserved_meta_blocks, __entry->allocated_meta_blocks) @@ -961,18 +1038,20 @@ DECLARE_EVENT_CLASS(ext4__bitmap_load, TP_ARGS(sb, group), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( __u32, group ) ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev_major = MAJOR(sb->s_dev); + __entry->dev_minor = MINOR(sb->s_dev); __entry->group = group; ), - TP_printk("dev %s group %u", - jbd2_dev_to_name(__entry->dev), __entry->group) + TP_printk("dev %d,%d group %u", + __entry->dev_major, __entry->dev_minor, __entry->group) ); DEFINE_EVENT(ext4__bitmap_load, ext4_mb_bitmap_load, diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h index bf16545cc977..7447ea9305b5 100644 --- a/include/trace/events/jbd2.h +++ b/include/trace/events/jbd2.h @@ -17,17 +17,19 @@ TRACE_EVENT(jbd2_checkpoint, TP_ARGS(journal, result), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( int, result ) ), TP_fast_assign( - __entry->dev = journal->j_fs_dev->bd_dev; + __entry->dev_major = MAJOR(journal->j_fs_dev->bd_dev); + __entry->dev_minor = MINOR(journal->j_fs_dev->bd_dev); __entry->result = result; ), - TP_printk("dev %s result %d", - jbd2_dev_to_name(__entry->dev), __entry->result) + TP_printk("dev %d,%d result %d", + __entry->dev_major, __entry->dev_minor, __entry->result) ); DECLARE_EVENT_CLASS(jbd2_commit, @@ -37,20 +39,22 @@ DECLARE_EVENT_CLASS(jbd2_commit, TP_ARGS(journal, commit_transaction), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( char, sync_commit ) __field( int, transaction ) ), TP_fast_assign( - __entry->dev = journal->j_fs_dev->bd_dev; + __entry->dev_major = MAJOR(journal->j_fs_dev->bd_dev); + __entry->dev_minor = MINOR(journal->j_fs_dev->bd_dev); __entry->sync_commit = commit_transaction->t_synchronous_commit; __entry->transaction = commit_transaction->t_tid; ), - TP_printk("dev %s transaction %d sync %d", - jbd2_dev_to_name(__entry->dev), __entry->transaction, - __entry->sync_commit) + TP_printk("dev %d,%d transaction %d sync %d", + __entry->dev_major, __entry->dev_minor, + __entry->transaction, __entry->sync_commit) ); DEFINE_EVENT(jbd2_commit, jbd2_start_commit, @@ -87,22 +91,24 @@ TRACE_EVENT(jbd2_end_commit, TP_ARGS(journal, commit_transaction), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( char, sync_commit ) __field( int, transaction ) __field( int, head ) ), TP_fast_assign( - __entry->dev = journal->j_fs_dev->bd_dev; + __entry->dev_major = MAJOR(journal->j_fs_dev->bd_dev); + __entry->dev_minor = MINOR(journal->j_fs_dev->bd_dev); __entry->sync_commit = commit_transaction->t_synchronous_commit; __entry->transaction = commit_transaction->t_tid; __entry->head = journal->j_tail_sequence; ), - TP_printk("dev %s transaction %d sync %d head %d", - jbd2_dev_to_name(__entry->dev), __entry->transaction, - __entry->sync_commit, __entry->head) + TP_printk("dev %d,%d transaction %d sync %d head %d", + __entry->dev_major, __entry->dev_minor, + __entry->transaction, __entry->sync_commit, __entry->head) ); TRACE_EVENT(jbd2_submit_inode_data, @@ -111,17 +117,20 @@ TRACE_EVENT(jbd2_submit_inode_data, TP_ARGS(inode), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; ), - TP_printk("dev %s ino %lu", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino) + TP_printk("dev %d,%d ino %lu", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino) ); TRACE_EVENT(jbd2_run_stats, @@ -131,7 +140,8 @@ TRACE_EVENT(jbd2_run_stats, TP_ARGS(dev, tid, stats), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( unsigned long, tid ) __field( unsigned long, wait ) __field( unsigned long, running ) @@ -144,7 +154,8 @@ TRACE_EVENT(jbd2_run_stats, ), TP_fast_assign( - __entry->dev = dev; + __entry->dev_major = MAJOR(dev); + __entry->dev_minor = MINOR(dev); __entry->tid = tid; __entry->wait = stats->rs_wait; __entry->running = stats->rs_running; @@ -156,9 +167,9 @@ TRACE_EVENT(jbd2_run_stats, __entry->blocks_logged = stats->rs_blocks_logged; ), - TP_printk("dev %s tid %lu wait %u running %u locked %u flushing %u " + TP_printk("dev %d,%d tid %lu wait %u running %u locked %u flushing %u " "logging %u handle_count %u blocks %u blocks_logged %u", - jbd2_dev_to_name(__entry->dev), __entry->tid, + __entry->dev_major, __entry->dev_minor, __entry->tid, jiffies_to_msecs(__entry->wait), jiffies_to_msecs(__entry->running), jiffies_to_msecs(__entry->locked), @@ -175,7 +186,8 @@ TRACE_EVENT(jbd2_checkpoint_stats, TP_ARGS(dev, tid, stats), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( unsigned long, tid ) __field( unsigned long, chp_time ) __field( __u32, forced_to_close ) @@ -184,7 +196,8 @@ TRACE_EVENT(jbd2_checkpoint_stats, ), TP_fast_assign( - __entry->dev = dev; + __entry->dev_major = MAJOR(dev); + __entry->dev_minor = MINOR(dev); __entry->tid = tid; __entry->chp_time = stats->cs_chp_time; __entry->forced_to_close= stats->cs_forced_to_close; @@ -192,9 +205,9 @@ TRACE_EVENT(jbd2_checkpoint_stats, __entry->dropped = stats->cs_dropped; ), - TP_printk("dev %s tid %lu chp_time %u forced_to_close %u " + TP_printk("dev %d,%d tid %lu chp_time %u forced_to_close %u " "written %u dropped %u", - jbd2_dev_to_name(__entry->dev), __entry->tid, + __entry->dev_major, __entry->dev_minor, __entry->tid, jiffies_to_msecs(__entry->chp_time), __entry->forced_to_close, __entry->written, __entry->dropped) ); @@ -207,7 +220,8 @@ TRACE_EVENT(jbd2_cleanup_journal_tail, TP_ARGS(journal, first_tid, block_nr, freed), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( tid_t, tail_sequence ) __field( tid_t, first_tid ) __field(unsigned long, block_nr ) @@ -215,16 +229,18 @@ TRACE_EVENT(jbd2_cleanup_journal_tail, ), TP_fast_assign( - __entry->dev = journal->j_fs_dev->bd_dev; + __entry->dev_major = MAJOR(journal->j_fs_dev->bd_dev); + __entry->dev_minor = MINOR(journal->j_fs_dev->bd_dev); __entry->tail_sequence = journal->j_tail_sequence; __entry->first_tid = first_tid; __entry->block_nr = block_nr; __entry->freed = freed; ), - TP_printk("dev %s from %u to %u offset %lu freed %lu", - jbd2_dev_to_name(__entry->dev), __entry->tail_sequence, - __entry->first_tid, __entry->block_nr, __entry->freed) + TP_printk("dev %d,%d from %u to %u offset %lu freed %lu", + __entry->dev_major, __entry->dev_minor, + __entry->tail_sequence, __entry->first_tid, + __entry->block_nr, __entry->freed) ); #endif /* _TRACE_JBD2_H */ -- cgit v1.2.3 From 7ff9c073dd4d7200399076554f7ab9b876f196f6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 8 Nov 2010 13:51:33 -0500 Subject: ext4: Add new ext4 inode tracepoints Add ext4_evict_inode, ext4_drop_inode, ext4_mark_inode_dirty, and ext4_begin_ordered_truncate() Signed-off-by: "Theodore Ts'o" --- include/trace/events/ext4.h | 97 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 289010d3270b..e5e345fb2a5c 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -98,6 +98,103 @@ TRACE_EVENT(ext4_allocate_inode, (unsigned long) __entry->dir, __entry->mode) ); +TRACE_EVENT(ext4_evict_inode, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field( int, dev_major ) + __field( int, dev_minor ) + __field( ino_t, ino ) + __field( int, nlink ) + ), + + TP_fast_assign( + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->ino = inode->i_ino; + __entry->nlink = inode->i_nlink; + ), + + TP_printk("dev %d,%d ino %lu nlink %d", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->nlink) +); + +TRACE_EVENT(ext4_drop_inode, + TP_PROTO(struct inode *inode, int drop), + + TP_ARGS(inode, drop), + + TP_STRUCT__entry( + __field( int, dev_major ) + __field( int, dev_minor ) + __field( ino_t, ino ) + __field( int, drop ) + ), + + TP_fast_assign( + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->ino = inode->i_ino; + __entry->drop = drop; + ), + + TP_printk("dev %d,%d ino %lu drop %d", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->drop) +); + +TRACE_EVENT(ext4_mark_inode_dirty, + TP_PROTO(struct inode *inode, unsigned long IP), + + TP_ARGS(inode, IP), + + TP_STRUCT__entry( + __field( int, dev_major ) + __field( int, dev_minor ) + __field( ino_t, ino ) + __field(unsigned long, ip ) + ), + + TP_fast_assign( + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->ino = inode->i_ino; + __entry->ip = IP; + ), + + TP_printk("dev %d,%d ino %lu caller %pF", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, (void *)__entry->ip) +); + +TRACE_EVENT(ext4_begin_ordered_truncate, + TP_PROTO(struct inode *inode, loff_t new_size), + + TP_ARGS(inode, new_size), + + TP_STRUCT__entry( + __field( int, dev_major ) + __field( int, dev_minor ) + __field( ino_t, ino ) + __field( loff_t, new_size ) + ), + + TP_fast_assign( + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->ino = inode->i_ino; + __entry->new_size = new_size; + ), + + TP_printk("dev %d,%d ino %lu new_size %lld", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, + (long long) __entry->new_size) +); + DECLARE_EVENT_CLASS(ext4__write_begin, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, -- cgit v1.2.3