From b3aa1584e9f3449b0669ab2beb9b9bf99874e1d6 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Mon, 9 Apr 2012 10:44:57 -0700
Subject: workqueue: Fix workqueue_execute_end() comment

workqueue_execute_end() is called after the callback function,
not before.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/trace/events/workqueue.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/trace')

diff --git a/include/trace/events/workqueue.h b/include/trace/events/workqueue.h
index 7d497291c85d..4018f5058f27 100644
--- a/include/trace/events/workqueue.h
+++ b/include/trace/events/workqueue.h
@@ -103,7 +103,7 @@ TRACE_EVENT(workqueue_execute_start,
 );
 
 /**
- * workqueue_execute_end - called immediately before the workqueue callback
+ * workqueue_execute_end - called immediately after the workqueue callback
  * @work:	pointer to struct work_struct
  *
  * Allows to track workqueue execution.
-- 
cgit v1.2.3


From 2db938bee32e7469ca8ed9bfb3a05535f28c680d Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 21 Feb 2011 17:25:37 +0100
Subject: jbd: Refine commit writeout logic

Currently we write out all journal buffers in WRITE_SYNC mode. This improves
performance for fsync heavy workloads but hinders performance when writes
are mostly asynchronous, most noticably it slows down readers and users
complain about slow desktop response etc.

So submit writes as asynchronous in the normal case and only submit writes as
WRITE_SYNC if we detect someone is waiting for current transaction commit.

I've gathered some numbers to back this change. The first is the read latency
test. It measures time to read 1 MB after several seconds of sleeping in
presence of streaming writes.

Top 10 times (out of 90) in us:
Before		After
2131586		697473
1709932		557487
1564598		535642
1480462		347573
1478579		323153
1408496		222181
1388960		181273
1329565		181070
1252486		172832
1223265		172278

Average:
619377		82180

So the improvement in both maximum and average latency is massive.

I've measured fsync throughput by:
fs_mark -n 100 -t 1 -s 16384 -d /mnt/fsync/ -S 1 -L 4

in presence of streaming reader. The numbers (fsyncs/s) are:
Before		After
9.9		6.3
6.8		6.0
6.3		6.2
5.8		6.1

So fsync performance seems unharmed by this change.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/trace/events/jbd.h | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

(limited to 'include/trace')

diff --git a/include/trace/events/jbd.h b/include/trace/events/jbd.h
index aff64d82d713..9305e1b5edc3 100644
--- a/include/trace/events/jbd.h
+++ b/include/trace/events/jbd.h
@@ -36,19 +36,17 @@ DECLARE_EVENT_CLASS(jbd_commit,
 
 	TP_STRUCT__entry(
 		__field(	dev_t,	dev			)
-		__field(	char,	sync_commit		)
 		__field(	int,	transaction		)
 	),
 
 	TP_fast_assign(
 		__entry->dev		= journal->j_fs_dev->bd_dev;
-		__entry->sync_commit = commit_transaction->t_synchronous_commit;
 		__entry->transaction	= commit_transaction->t_tid;
 	),
 
-	TP_printk("dev %d,%d transaction %d sync %d",
+	TP_printk("dev %d,%d transaction %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->transaction, __entry->sync_commit)
+		  __entry->transaction)
 );
 
 DEFINE_EVENT(jbd_commit, jbd_start_commit,
@@ -87,19 +85,17 @@ TRACE_EVENT(jbd_drop_transaction,
 
 	TP_STRUCT__entry(
 		__field(	dev_t,	dev			)
-		__field(	char,	sync_commit		)
 		__field(	int,	transaction		)
 	),
 
 	TP_fast_assign(
 		__entry->dev		= journal->j_fs_dev->bd_dev;
-		__entry->sync_commit = commit_transaction->t_synchronous_commit;
 		__entry->transaction	= commit_transaction->t_tid;
 	),
 
-	TP_printk("dev %d,%d transaction %d sync %d",
+	TP_printk("dev %d,%d transaction %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->transaction, __entry->sync_commit)
+		  __entry->transaction)
 );
 
 TRACE_EVENT(jbd_end_commit,
@@ -109,21 +105,19 @@ TRACE_EVENT(jbd_end_commit,
 
 	TP_STRUCT__entry(
 		__field(	dev_t,	dev			)
-		__field(	char,	sync_commit		)
 		__field(	int,	transaction		)
 		__field(	int,	head			)
 	),
 
 	TP_fast_assign(
 		__entry->dev		= journal->j_fs_dev->bd_dev;
-		__entry->sync_commit = commit_transaction->t_synchronous_commit;
 		__entry->transaction	= commit_transaction->t_tid;
 		__entry->head		= journal->j_tail_sequence;
 	),
 
-	TP_printk("dev %d,%d transaction %d sync %d head %d",
+	TP_printk("dev %d,%d transaction %d head %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->transaction, __entry->sync_commit, __entry->head)
+		  __entry->transaction, __entry->head)
 );
 
 TRACE_EVENT(jbd_do_submit_data,
@@ -133,19 +127,17 @@ TRACE_EVENT(jbd_do_submit_data,
 
 	TP_STRUCT__entry(
 		__field(	dev_t,	dev			)
-		__field(	char,	sync_commit		)
 		__field(	int,	transaction		)
 	),
 
 	TP_fast_assign(
 		__entry->dev		= journal->j_fs_dev->bd_dev;
-		__entry->sync_commit = commit_transaction->t_synchronous_commit;
 		__entry->transaction	= commit_transaction->t_tid;
 	),
 
-	TP_printk("dev %d,%d transaction %d sync %d",
+	TP_printk("dev %d,%d transaction %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		   __entry->transaction, __entry->sync_commit)
+		   __entry->transaction)
 );
 
 TRACE_EVENT(jbd_cleanup_journal_tail,
-- 
cgit v1.2.3


From ec2e3031b65f23f66840b5c89c4b83076831a435 Mon Sep 17 00:00:00 2001
From: Liam Girdwood <lrg@ti.com>
Date: Wed, 18 Apr 2012 11:41:11 +0100
Subject: ASoC: dapm: Add API call to query valid DAPM paths

In preparation for ASoC DSP support.

Add a DAPM API call to determine whether a DAPM audio path is valid between
source and sink widgets. This also takes into account all kcontrol mux and mixer
settings in between the source and sink widgets to validate the audio path.

This will be used by the DSP core to determine the runtime DAI mappings
between FE and BE DAIs in order to run PCM operations.

Signed-off-by: Liam Girdwood <lrg@ti.com>
Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
---
 include/trace/events/asoc.h | 80 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)

(limited to 'include/trace')

diff --git a/include/trace/events/asoc.h b/include/trace/events/asoc.h
index ab26f8aa3c78..6d8efb1cc8ce 100644
--- a/include/trace/events/asoc.h
+++ b/include/trace/events/asoc.h
@@ -7,6 +7,8 @@
 #include <linux/ktime.h>
 #include <linux/tracepoint.h>
 
+#define DAPM_DIRECT "(direct)"
+
 struct snd_soc_jack;
 struct snd_soc_codec;
 struct snd_soc_platform;
@@ -241,6 +243,84 @@ TRACE_EVENT(snd_soc_dapm_walk_done,
 		  (int)__entry->path_checks, (int)__entry->neighbour_checks)
 );
 
+TRACE_EVENT(snd_soc_dapm_output_path,
+
+	TP_PROTO(struct snd_soc_dapm_widget *widget,
+		struct snd_soc_dapm_path *path),
+
+	TP_ARGS(widget, path),
+
+	TP_STRUCT__entry(
+		__string(	wname,	widget->name		)
+		__string(	pname,	path->name ? path->name : DAPM_DIRECT)
+		__string(	psname,	path->sink->name	)
+		__field(	int,	path_sink		)
+		__field(	int,	path_connect		)
+	),
+
+	TP_fast_assign(
+		__assign_str(wname, widget->name);
+		__assign_str(pname, path->name ? path->name : DAPM_DIRECT);
+		__assign_str(psname, path->sink->name);
+		__entry->path_connect = path->connect;
+		__entry->path_sink = (int)path->sink;
+	),
+
+	TP_printk("%c%s -> %s -> %s\n",
+		(int) __entry->path_sink &&
+		(int) __entry->path_connect ? '*' : ' ',
+		__get_str(wname), __get_str(pname), __get_str(psname))
+);
+
+TRACE_EVENT(snd_soc_dapm_input_path,
+
+	TP_PROTO(struct snd_soc_dapm_widget *widget,
+		struct snd_soc_dapm_path *path),
+
+	TP_ARGS(widget, path),
+
+	TP_STRUCT__entry(
+		__string(	wname,	widget->name		)
+		__string(	pname,	path->name ? path->name : DAPM_DIRECT)
+		__string(	psname,	path->source->name	)
+		__field(	int,	path_source		)
+		__field(	int,	path_connect		)
+	),
+
+	TP_fast_assign(
+		__assign_str(wname, widget->name);
+		__assign_str(pname, path->name ? path->name : DAPM_DIRECT);
+		__assign_str(psname, path->source->name);
+		__entry->path_connect = path->connect;
+		__entry->path_source = (int)path->source;
+	),
+
+	TP_printk("%c%s <- %s <- %s\n",
+		(int) __entry->path_source &&
+		(int) __entry->path_connect ? '*' : ' ',
+		__get_str(wname), __get_str(pname), __get_str(psname))
+);
+
+TRACE_EVENT(snd_soc_dapm_connected,
+
+	TP_PROTO(int paths, int stream),
+
+	TP_ARGS(paths, stream),
+
+	TP_STRUCT__entry(
+		__field(	int,	paths		)
+		__field(	int,	stream		)
+	),
+
+	TP_fast_assign(
+		__entry->paths = paths;
+		__entry->stream = stream;
+	),
+
+	TP_printk("%s: found %d paths\n",
+		__entry->stream ? "capture" : "playback", __entry->paths)
+);
+
 TRACE_EVENT(snd_soc_jack_irq,
 
 	TP_PROTO(const char *name),
-- 
cgit v1.2.3


From c97f3bdd26080c2cb2a648c37b6dcb8eac2f91e7 Mon Sep 17 00:00:00 2001
From: Liam Girdwood <lrg@ti.com>
Date: Mon, 23 Apr 2012 10:31:15 +0100
Subject: ASoC: dapm: Fix x86_64 build warning.

Fixes the following build warning on x86_64.

In file included from include/trace/ftrace.h:567:0,
                 from include/trace/define_trace.h:86,
                 from include/trace/events/asoc.h:410,
                 from sound/soc/soc-core.c:45:
include/trace/events/asoc.h: In function 'ftrace_raw_event_snd_soc_dapm_output_path':
include/trace/events/asoc.h:246:1: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
include/trace/events/asoc.h: In function 'ftrace_raw_event_snd_soc_dapm_input_path':
include/trace/events/asoc.h:275:1: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]

Signed-off-by: Liam Girdwood <lrg@ti.com>
Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
---
 include/trace/events/asoc.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/trace')

diff --git a/include/trace/events/asoc.h b/include/trace/events/asoc.h
index 6d8efb1cc8ce..5fc2dcdd21cd 100644
--- a/include/trace/events/asoc.h
+++ b/include/trace/events/asoc.h
@@ -263,7 +263,7 @@ TRACE_EVENT(snd_soc_dapm_output_path,
 		__assign_str(pname, path->name ? path->name : DAPM_DIRECT);
 		__assign_str(psname, path->sink->name);
 		__entry->path_connect = path->connect;
-		__entry->path_sink = (int)path->sink;
+		__entry->path_sink = (long)path->sink;
 	),
 
 	TP_printk("%c%s -> %s -> %s\n",
@@ -292,7 +292,7 @@ TRACE_EVENT(snd_soc_dapm_input_path,
 		__assign_str(pname, path->name ? path->name : DAPM_DIRECT);
 		__assign_str(psname, path->source->name);
 		__entry->path_connect = path->connect;
-		__entry->path_source = (int)path->source;
+		__entry->path_source = (long)path->source;
 	),
 
 	TP_printk("%c%s <- %s <- %s\n",
-- 
cgit v1.2.3


From 2fdbb31b662787f78bb78b3e4e18f1a072058ffc Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paul.mckenney@linaro.org>
Date: Thu, 23 Feb 2012 15:58:29 -0800
Subject: rcu: Add RCU_FAST_NO_HZ tracing for idle exit

Traces of rcu_prep_idle events can be confusing because
rcu_cleanup_after_idle() does no tracing.  This commit therefore adds
this tracing.

Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/trace/events/rcu.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/trace')

diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 337099783f37..aaa55e1b8c48 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -292,6 +292,7 @@ TRACE_EVENT(rcu_dyntick,
  *	"More callbacks": Still more callbacks, try again to clear them out.
  *	"Callbacks drained": All callbacks processed, off to dyntick idle!
  *	"Timer": Timer fired to cause CPU to continue processing callbacks.
+ *	"Cleanup after idle": Idle exited, timer canceled.
  */
 TRACE_EVENT(rcu_prep_idle,
 
-- 
cgit v1.2.3


From 6791e36c4a40e8930e08669e60077eea6770c429 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arve=20Hj=C3=B8nnev=C3=A5g?= <arve@android.com>
Date: Sun, 29 Apr 2012 22:53:02 +0200
Subject: PM / Sleep: Add wakeup_source_activate and wakeup_source_deactivate
 tracepoints
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add tracepoints to wakeup_source_activate and wakeup_source_deactivate.
Useful for checking that specific wakeup sources overlap as expected.

Signed-off-by: Arve Hjønnevåg <arve@android.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 include/trace/events/power.h | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'include/trace')

diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index cae9a94f025d..0c9783841a30 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -65,6 +65,40 @@ TRACE_EVENT(machine_suspend,
 	TP_printk("state=%lu", (unsigned long)__entry->state)
 );
 
+DECLARE_EVENT_CLASS(wakeup_source,
+
+	TP_PROTO(const char *name, unsigned int state),
+
+	TP_ARGS(name, state),
+
+	TP_STRUCT__entry(
+		__string(       name,           name            )
+		__field(        u64,            state           )
+	),
+
+	TP_fast_assign(
+		__assign_str(name, name);
+		__entry->state = state;
+	),
+
+	TP_printk("%s state=0x%lx", __get_str(name),
+		(unsigned long)__entry->state)
+);
+
+DEFINE_EVENT(wakeup_source, wakeup_source_activate,
+
+	TP_PROTO(const char *name, unsigned int state),
+
+	TP_ARGS(name, state)
+);
+
+DEFINE_EVENT(wakeup_source, wakeup_source_deactivate,
+
+	TP_PROTO(const char *name, unsigned int state),
+
+	TP_ARGS(name, state)
+);
+
 #ifdef CONFIG_EVENT_POWER_TRACING_DEPRECATED
 
 /*
-- 
cgit v1.2.3


From cc1676d917f32504dbadc858fa790bc524c9f0da Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 3 May 2012 14:47:56 +0200
Subject: writeback: Move requeueing when I_SYNC set to writeback_sb_inodes()

When writeback_single_inode() is called on inode which has I_SYNC already
set while doing WB_SYNC_NONE, inode is moved to b_more_io list. However
this makes sense only if the caller is flusher thread. For other callers of
writeback_single_inode() it doesn't really make sense and may be even wrong
- flusher thread may be doing WB_SYNC_ALL writeback in parallel.

So we move requeueing from writeback_single_inode() to writeback_sb_inodes().

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
---
 include/trace/events/writeback.h | 36 +++++++++++++++++++++++++++++-------
 1 file changed, 29 insertions(+), 7 deletions(-)

(limited to 'include/trace')

diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 7b81887b023f..b453d92c2253 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -372,6 +372,35 @@ TRACE_EVENT(balance_dirty_pages,
 	  )
 );
 
+TRACE_EVENT(writeback_sb_inodes_requeue,
+
+	TP_PROTO(struct inode *inode),
+	TP_ARGS(inode),
+
+	TP_STRUCT__entry(
+		__array(char, name, 32)
+		__field(unsigned long, ino)
+		__field(unsigned long, state)
+		__field(unsigned long, dirtied_when)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->name,
+		        dev_name(inode_to_bdi(inode)->dev), 32);
+		__entry->ino		= inode->i_ino;
+		__entry->state		= inode->i_state;
+		__entry->dirtied_when	= inode->dirtied_when;
+	),
+
+	TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu",
+		  __entry->name,
+		  __entry->ino,
+		  show_inode_state(__entry->state),
+		  __entry->dirtied_when,
+		  (jiffies - __entry->dirtied_when) / HZ
+	)
+);
+
 DECLARE_EVENT_CLASS(writeback_congest_waited_template,
 
 	TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),
@@ -450,13 +479,6 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
 	)
 );
 
-DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_requeue,
-	TP_PROTO(struct inode *inode,
-		 struct writeback_control *wbc,
-		 unsigned long nr_to_write),
-	TP_ARGS(inode, wbc, nr_to_write)
-);
-
 DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode,
 	TP_PROTO(struct inode *inode,
 		 struct writeback_control *wbc,
-- 
cgit v1.2.3


From 21e52e15666323078b8517a4312712579176b56f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paul.mckenney@linaro.org>
Date: Mon, 30 Apr 2012 14:16:19 -0700
Subject: rcu: Make RCU_FAST_NO_HZ handle timer migration

The current RCU_FAST_NO_HZ assumes that timers do not migrate unless a
CPU goes offline, in which case it assumes that the CPU will have to come
out of dyntick-idle mode (cancelling the timer) in order to go offline.
This is important because when RCU_FAST_NO_HZ permits a CPU to enter
dyntick-idle mode despite having RCU callbacks pending, it posts a timer
on that CPU to force a wakeup on that CPU.  This wakeup ensures that the
CPU will eventually handle the end of the grace period, including invoking
its RCU callbacks.

However, Pascal Chapperon's test setup shows that the timer handler
rcu_idle_gp_timer_func() really does get invoked in some cases.  This is
problematic because this can cause the CPU that entered dyntick-idle
mode despite still having RCU callbacks pending to remain in
dyntick-idle mode indefinitely, which means that its RCU callbacks might
never be invoked.  This situation can result in grace-period delays or
even system hangs, which matches Pascal's observations of slow boot-up
and shutdown (https://lkml.org/lkml/2012/4/5/142).  See also the bugzilla:

	https://bugzilla.redhat.com/show_bug.cgi?id=806548

This commit therefore causes the "should never be invoked" timer handler
rcu_idle_gp_timer_func() to use smp_call_function_single() to wake up
the CPU for which the timer was intended, allowing that CPU to invoke
its RCU callbacks in a timely manner.

Reported-by: Pascal Chapperon <pascal.chapperon@wanadoo.fr>
Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/trace/events/rcu.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/trace')

diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index aaa55e1b8c48..1480900c511c 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -292,6 +292,7 @@ TRACE_EVENT(rcu_dyntick,
  *	"More callbacks": Still more callbacks, try again to clear them out.
  *	"Callbacks drained": All callbacks processed, off to dyntick idle!
  *	"Timer": Timer fired to cause CPU to continue processing callbacks.
+ *	"Demigrate": Timer fired on wrong CPU, woke up correct CPU.
  *	"Cleanup after idle": Idle exited, timer canceled.
  */
 TRACE_EVENT(rcu_prep_idle,
-- 
cgit v1.2.3


From 9754e39c7bc51328f145e933bfb0df47cd67b6e9 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sat, 7 Apr 2012 12:33:03 +0200
Subject: jbd: Split updating of journal superblock and marking journal empty

There are three case of updating journal superblock. In the first case, we want
to mark journal as empty (setting s_sequence to 0), in the second case we want
to update log tail, in the third case we want to update s_errno. Split these
cases into separate functions. It makes the code slightly more straightforward
and later patches will make the distinction even more important.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/trace/events/jbd.h | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'include/trace')

diff --git a/include/trace/events/jbd.h b/include/trace/events/jbd.h
index 9305e1b5edc3..d9658a940a39 100644
--- a/include/trace/events/jbd.h
+++ b/include/trace/events/jbd.h
@@ -169,24 +169,20 @@ TRACE_EVENT(jbd_cleanup_journal_tail,
 		  __entry->block_nr, __entry->freed)
 );
 
-TRACE_EVENT(jbd_update_superblock_end,
-	TP_PROTO(journal_t *journal, int wait),
+TRACE_EVENT(journal_write_superblock,
+	TP_PROTO(journal_t *journal),
 
-	TP_ARGS(journal, wait),
+	TP_ARGS(journal),
 
 	TP_STRUCT__entry(
 		__field(	dev_t,	dev			)
-		__field(	int,	wait			)
 	),
 
 	TP_fast_assign(
 		__entry->dev		= journal->j_fs_dev->bd_dev;
-		__entry->wait		= wait;
 	),
 
-	TP_printk("dev %d,%d wait %d",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		   __entry->wait)
+	TP_printk("dev %d,%d", MAJOR(__entry->dev), MINOR(__entry->dev))
 );
 
 #endif /* _TRACE_JBD_H */
-- 
cgit v1.2.3


From fd2cbd4dfa3db477dd6226d387d3f1911d36a6a9 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sat, 7 Apr 2012 11:05:19 +0200
Subject: jbd: Write journal superblock with WRITE_FUA after checkpointing

If journal superblock is written only in disk's caches and other transaction
starts reusing space of the transaction cleaned from the log, it can happen
blocks of a new transaction reach the disk before journal superblock. When
power failure happens in such case, subsequent journal replay would still try
to replay the old transaction but some of it's blocks may be already
overwritten by the new transaction. For this reason we must use WRITE_FUA when
updating log tail and we must first write new log tail to disk and update
in-memory information only after that.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/trace/events/jbd.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/trace')

diff --git a/include/trace/events/jbd.h b/include/trace/events/jbd.h
index d9658a940a39..da6f2591c25e 100644
--- a/include/trace/events/jbd.h
+++ b/include/trace/events/jbd.h
@@ -170,19 +170,22 @@ TRACE_EVENT(jbd_cleanup_journal_tail,
 );
 
 TRACE_EVENT(journal_write_superblock,
-	TP_PROTO(journal_t *journal),
+	TP_PROTO(journal_t *journal, int write_op),
 
-	TP_ARGS(journal),
+	TP_ARGS(journal, write_op),
 
 	TP_STRUCT__entry(
 		__field(	dev_t,	dev			)
+		__field(	int,	write_op		)
 	),
 
 	TP_fast_assign(
 		__entry->dev		= journal->j_fs_dev->bd_dev;
+		__entry->write_op	= write_op;
 	),
 
-	TP_printk("dev %d,%d", MAJOR(__entry->dev), MINOR(__entry->dev))
+	TP_printk("dev %d,%d write_op %x", MAJOR(__entry->dev),
+		  MINOR(__entry->dev), __entry->write_op)
 );
 
 #endif /* _TRACE_JBD_H */
-- 
cgit v1.2.3


From 1523299d5817773e344d135d4b1c485f269400bc Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 7 Feb 2012 15:41:24 -0800
Subject: userns: Convert ext3 to use kuid/kgid where appropriate

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 include/trace/events/ext3.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/trace')

diff --git a/include/trace/events/ext3.h b/include/trace/events/ext3.h
index 7b53c0573dc9..15d11a39be47 100644
--- a/include/trace/events/ext3.h
+++ b/include/trace/events/ext3.h
@@ -24,8 +24,8 @@ TRACE_EVENT(ext3_free_inode,
 		__entry->dev	= inode->i_sb->s_dev;
 		__entry->ino	= inode->i_ino;
 		__entry->mode	= inode->i_mode;
-		__entry->uid	= inode->i_uid;
-		__entry->gid	= inode->i_gid;
+		__entry->uid	= i_uid_read(inode);
+		__entry->gid	= i_gid_read(inode);
 		__entry->blocks	= inode->i_blocks;
 	),
 
-- 
cgit v1.2.3


From 08cefc7ab839cf3ece44b8033968a4732eac06d8 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 7 Feb 2012 15:41:49 -0800
Subject: userns: Convert ext4 to user kuid/kgid where appropriate

Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 include/trace/events/ext4.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/trace')

diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 319538bf17d2..69d8a69ea831 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -36,8 +36,8 @@ TRACE_EVENT(ext4_free_inode,
 		__entry->dev	= inode->i_sb->s_dev;
 		__entry->ino	= inode->i_ino;
 		__entry->mode	= inode->i_mode;
-		__entry->uid	= inode->i_uid;
-		__entry->gid	= inode->i_gid;
+		__entry->uid	= i_uid_read(inode);
+		__entry->gid	= i_gid_read(inode);
 		__entry->blocks	= inode->i_blocks;
 	),
 
-- 
cgit v1.2.3


From e709ffd6169ccd259eb5874e853303e91e94e829 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Tue, 29 May 2012 15:06:18 -0700
Subject: mm: remove swap token code

The swap token code no longer fits in with the current VM model.  It
does not play well with cgroups or the better NUMA placement code in
development, since we have only one swap token globally.

It also has the potential to mess with scalability of the system, by
increasing the number of non-reclaimable pages on the active and
inactive anon LRU lists.

Last but not least, the swap token code has been broken for a year
without complaints, as reported by Konstantin Khlebnikov.  This suggests
we no longer have much use for it.

The days of sub-1G memory systems with heavy use of swap are over.  If
we ever need thrashing reducing code in the future, we will have to
implement something that does scale.

Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Hugh Dickins <hughd@google.com>
Acked-by: Bob Picco <bpicco@meloft.net>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/trace/events/vmscan.h | 82 -------------------------------------------
 1 file changed, 82 deletions(-)

(limited to 'include/trace')

diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index f64560e204bc..572195459d58 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -395,88 +395,6 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive,
 		show_reclaim_flags(__entry->reclaim_flags))
 );
 
-TRACE_EVENT(replace_swap_token,
-	TP_PROTO(struct mm_struct *old_mm,
-		 struct mm_struct *new_mm),
-
-	TP_ARGS(old_mm, new_mm),
-
-	TP_STRUCT__entry(
-		__field(struct mm_struct*,	old_mm)
-		__field(unsigned int,		old_prio)
-		__field(struct mm_struct*,	new_mm)
-		__field(unsigned int,		new_prio)
-	),
-
-	TP_fast_assign(
-		__entry->old_mm   = old_mm;
-		__entry->old_prio = old_mm ? old_mm->token_priority : 0;
-		__entry->new_mm   = new_mm;
-		__entry->new_prio = new_mm->token_priority;
-	),
-
-	TP_printk("old_token_mm=%p old_prio=%u new_token_mm=%p new_prio=%u",
-		  __entry->old_mm, __entry->old_prio,
-		  __entry->new_mm, __entry->new_prio)
-);
-
-DECLARE_EVENT_CLASS(put_swap_token_template,
-	TP_PROTO(struct mm_struct *swap_token_mm),
-
-	TP_ARGS(swap_token_mm),
-
-	TP_STRUCT__entry(
-		__field(struct mm_struct*, swap_token_mm)
-	),
-
-	TP_fast_assign(
-		__entry->swap_token_mm = swap_token_mm;
-	),
-
-	TP_printk("token_mm=%p", __entry->swap_token_mm)
-);
-
-DEFINE_EVENT(put_swap_token_template, put_swap_token,
-	TP_PROTO(struct mm_struct *swap_token_mm),
-	TP_ARGS(swap_token_mm)
-);
-
-DEFINE_EVENT_CONDITION(put_swap_token_template, disable_swap_token,
-	TP_PROTO(struct mm_struct *swap_token_mm),
-	TP_ARGS(swap_token_mm),
-	TP_CONDITION(swap_token_mm != NULL)
-);
-
-TRACE_EVENT_CONDITION(update_swap_token_priority,
-	TP_PROTO(struct mm_struct *mm,
-		 unsigned int old_prio,
-		 struct mm_struct *swap_token_mm),
-
-	TP_ARGS(mm, old_prio, swap_token_mm),
-
-	TP_CONDITION(mm->token_priority != old_prio),
-
-	TP_STRUCT__entry(
-		__field(struct mm_struct*, mm)
-		__field(unsigned int, old_prio)
-		__field(unsigned int, new_prio)
-		__field(struct mm_struct*, swap_token_mm)
-		__field(unsigned int, swap_token_prio)
-	),
-
-	TP_fast_assign(
-		__entry->mm		= mm;
-		__entry->old_prio	= old_prio;
-		__entry->new_prio	= mm->token_priority;
-		__entry->swap_token_mm	= swap_token_mm;
-		__entry->swap_token_prio = swap_token_mm ? swap_token_mm->token_priority : 0;
-	),
-
-	TP_printk("mm=%p old_prio=%u new_prio=%u swap_token_mm=%p token_prio=%u",
-		  __entry->mm, __entry->old_prio, __entry->new_prio,
-		  __entry->swap_token_mm, __entry->swap_token_prio)
-);
-
 #endif /* _TRACE_VMSCAN_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From c53919adc045bf803252e912f23028a68525753d Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Tue, 29 May 2012 15:06:19 -0700
Subject: mm: vmscan: remove lumpy reclaim

This series removes lumpy reclaim and some stalling logic that was
unintentionally being used by memory compaction.  The end result is that
stalling on dirty pages during page reclaim now depends on
wait_iff_congested().

Four kernels were compared

  3.3.0     vanilla
  3.4.0-rc2 vanilla
  3.4.0-rc2 lumpyremove-v2 is patch one from this series
  3.4.0-rc2 nosync-v2r3 is the full series

Removing lumpy reclaim saves almost 900 bytes of text whereas the full
series removes 1200 bytes.

     text     data      bss       dec     hex  filename
  6740375  1927944  2260992  10929311  a6c49f  vmlinux-3.4.0-rc2-vanilla
  6739479  1927944  2260992  10928415  a6c11f  vmlinux-3.4.0-rc2-lumpyremove-v2
  6739159  1927944  2260992  10928095  a6bfdf  vmlinux-3.4.0-rc2-nosync-v2

There are behaviour changes in the series and so tests were run with
monitoring of ftrace events.  This disrupts results so the performance
results are distorted but the new behaviour should be clearer.

fs-mark running in a threaded configuration showed little of interest as
it did not push reclaim aggressively

  FS-Mark Multi Threaded
                          3.3.0-vanilla       rc2-vanilla       lumpyremove-v2r3       nosync-v2r3
  Files/s  min           3.20 ( 0.00%)        3.20 ( 0.00%)        3.20 ( 0.00%)        3.20 ( 0.00%)
  Files/s  mean          3.20 ( 0.00%)        3.20 ( 0.00%)        3.20 ( 0.00%)        3.20 ( 0.00%)
  Files/s  stddev        0.00 ( 0.00%)        0.00 ( 0.00%)        0.00 ( 0.00%)        0.00 ( 0.00%)
  Files/s  max           3.20 ( 0.00%)        3.20 ( 0.00%)        3.20 ( 0.00%)        3.20 ( 0.00%)
  Overhead min      508667.00 ( 0.00%)   521350.00 (-2.49%)   544292.00 (-7.00%)   547168.00 (-7.57%)
  Overhead mean     551185.00 ( 0.00%)   652690.73 (-18.42%)   991208.40 (-79.83%)   570130.53 (-3.44%)
  Overhead stddev    18200.69 ( 0.00%)   331958.29 (-1723.88%)  1579579.43 (-8578.68%)     9576.81 (47.38%)
  Overhead max      576775.00 ( 0.00%)  1846634.00 (-220.17%)  6901055.00 (-1096.49%)   585675.00 (-1.54%)
  MMTests Statistics: duration
  Sys Time Running Test (seconds)             309.90    300.95    307.33    298.95
  User+Sys Time Running Test (seconds)        319.32    309.67    315.69    307.51
  Total Elapsed Time (seconds)               1187.85   1193.09   1191.98   1193.73

  MMTests Statistics: vmstat
  Page Ins                                       80532       82212       81420       79480
  Page Outs                                  111434984   111456240   111437376   111582628
  Swap Ins                                           0           0           0           0
  Swap Outs                                          0           0           0           0
  Direct pages scanned                           44881       27889       27453       34843
  Kswapd pages scanned                        25841428    25860774    25861233    25843212
  Kswapd pages reclaimed                      25841393    25860741    25861199    25843179
  Direct pages reclaimed                         44881       27889       27453       34843
  Kswapd efficiency                                99%         99%         99%         99%
  Kswapd velocity                            21754.791   21675.460   21696.029   21649.127
  Direct efficiency                               100%        100%        100%        100%
  Direct velocity                               37.783      23.375      23.031      29.188
  Percentage direct scans                           0%          0%          0%          0%

ftrace showed that there was no stalling on writeback or pages submitted
for IO from reclaim context.

postmark was similar and while it was more interesting, it also did not
push reclaim heavily.

  POSTMARK
                                       3.3.0-vanilla       rc2-vanilla  lumpyremove-v2r3       nosync-v2r3
  Transactions per second:               16.00 ( 0.00%)    20.00 (25.00%)    18.00 (12.50%)    17.00 ( 6.25%)
  Data megabytes read per second:        18.80 ( 0.00%)    24.27 (29.10%)    22.26 (18.40%)    20.54 ( 9.26%)
  Data megabytes written per second:     35.83 ( 0.00%)    46.25 (29.08%)    42.42 (18.39%)    39.14 ( 9.24%)
  Files created alone per second:        28.00 ( 0.00%)    38.00 (35.71%)    34.00 (21.43%)    30.00 ( 7.14%)
  Files create/transact per second:       8.00 ( 0.00%)    10.00 (25.00%)     9.00 (12.50%)     8.00 ( 0.00%)
  Files deleted alone per second:       556.00 ( 0.00%)  1224.00 (120.14%)  3062.00 (450.72%)  6124.00 (1001.44%)
  Files delete/transact per second:       8.00 ( 0.00%)    10.00 (25.00%)     9.00 (12.50%)     8.00 ( 0.00%)

  MMTests Statistics: duration
  Sys Time Running Test (seconds)             113.34    107.99    109.73    108.72
  User+Sys Time Running Test (seconds)        145.51    139.81    143.32    143.55
  Total Elapsed Time (seconds)               1159.16    899.23    980.17   1062.27

  MMTests Statistics: vmstat
  Page Ins                                    13710192    13729032    13727944    13760136
  Page Outs                                   43071140    42987228    42733684    42931624
  Swap Ins                                           0           0           0           0
  Swap Outs                                          0           0           0           0
  Direct pages scanned                               0           0           0           0
  Kswapd pages scanned                         9941613     9937443     9939085     9929154
  Kswapd pages reclaimed                       9940926     9936751     9938397     9928465
  Direct pages reclaimed                             0           0           0           0
  Kswapd efficiency                                99%         99%         99%         99%
  Kswapd velocity                             8576.567   11051.058   10140.164    9347.109
  Direct efficiency                               100%        100%        100%        100%
  Direct velocity                                0.000       0.000       0.000       0.000

It looks like here that the full series regresses performance but as
ftrace showed no usage of wait_iff_congested() or sync reclaim I am
assuming it's a disruption due to monitoring.  Other data such as memory
usage, page IO, swap IO all looked similar.

Running a benchmark with a plain DD showed nothing very interesting.
The full series stalled in wait_iff_congested() slightly less but stall
times on vanilla kernels were marginal.

Running a benchmark that hammered on file-backed mappings showed stalls
due to congestion but not in sync writebacks

  MICRO
                                       3.3.0-vanilla       rc2-vanilla  lumpyremove-v2r3       nosync-v2r3
  MMTests Statistics: duration
  Sys Time Running Test (seconds)             308.13    294.50    298.75    299.53
  User+Sys Time Running Test (seconds)        330.45    316.28    318.93    320.79
  Total Elapsed Time (seconds)               1814.90   1833.88   1821.14   1832.91

  MMTests Statistics: vmstat
  Page Ins                                      108712      120708       97224      110344
  Page Outs                                  155514576   156017404   155813676   156193256
  Swap Ins                                           0           0           0           0
  Swap Outs                                          0           0           0           0
  Direct pages scanned                         2599253     1550480     2512822     2414760
  Kswapd pages scanned                        69742364    71150694    68839041    69692533
  Kswapd pages reclaimed                      34824488    34773341    34796602    34799396
  Direct pages reclaimed                         53693       94750       61792       75205
  Kswapd efficiency                                49%         48%         50%         49%
  Kswapd velocity                            38427.662   38797.901   37799.972   38022.889
  Direct efficiency                                 2%          6%          2%          3%
  Direct velocity                             1432.174     845.464    1379.807    1317.446
  Percentage direct scans                           3%          2%          3%          3%
  Page writes by reclaim                             0           0           0           0
  Page writes file                                   0           0           0           0
  Page writes anon                                   0           0           0           0
  Page reclaim immediate                             0           0           0        1218
  Page rescued immediate                             0           0           0           0
  Slabs scanned                                  15360       16384       13312       16384
  Direct inode steals                                0           0           0           0
  Kswapd inode steals                             4340        4327        1630        4323

  FTrace Reclaim Statistics: congestion_wait
  Direct number congest     waited                 0          0          0          0
  Direct time   congest     waited               0ms        0ms        0ms        0ms
  Direct full   congest     waited                 0          0          0          0
  Direct number conditional waited               900        870        754        789
  Direct time   conditional waited               0ms        0ms        0ms       20ms
  Direct full   conditional waited                 0          0          0          0
  KSwapd number congest     waited              2106       2308       2116       1915
  KSwapd time   congest     waited          139924ms   157832ms   125652ms   132516ms
  KSwapd full   congest     waited              1346       1530       1202       1278
  KSwapd number conditional waited             12922      16320      10943      14670
  KSwapd time   conditional waited               0ms        0ms        0ms        0ms
  KSwapd full   conditional waited                 0          0          0          0

Reclaim statistics are not radically changed.  The stall times in kswapd
are massive but it is clear that it is due to calls to congestion_wait()
and that is almost certainly the call in balance_pgdat().  Otherwise
stalls due to dirty pages are non-existant.

I ran a benchmark that stressed high-order allocation.  This is very
artifical load but was used in the past to evaluate lumpy reclaim and
compaction.  Generally I look at allocation success rates and latency
figures.

  STRESS-HIGHALLOC
                   3.3.0-vanilla       rc2-vanilla  lumpyremove-v2r3       nosync-v2r3
  Pass 1          81.00 ( 0.00%)    28.00 (-53.00%)    24.00 (-57.00%)    28.00 (-53.00%)
  Pass 2          82.00 ( 0.00%)    39.00 (-43.00%)    38.00 (-44.00%)    43.00 (-39.00%)
  while Rested    88.00 ( 0.00%)    87.00 (-1.00%)    88.00 ( 0.00%)    88.00 ( 0.00%)

  MMTests Statistics: duration
  Sys Time Running Test (seconds)             740.93    681.42    685.14    684.87
  User+Sys Time Running Test (seconds)       2922.65   3269.52   3281.35   3279.44
  Total Elapsed Time (seconds)               1161.73   1152.49   1159.55   1161.44

  MMTests Statistics: vmstat
  Page Ins                                     4486020     2807256     2855944     2876244
  Page Outs                                    7261600     7973688     7975320     7986120
  Swap Ins                                       31694           0           0           0
  Swap Outs                                      98179           0           0           0
  Direct pages scanned                           53494       57731       34406      113015
  Kswapd pages scanned                         6271173     1287481     1278174     1219095
  Kswapd pages reclaimed                       2029240     1281025     1260708     1201583
  Direct pages reclaimed                          1468       14564       16649       92456
  Kswapd efficiency                                32%         99%         98%         98%
  Kswapd velocity                             5398.133    1117.130    1102.302    1049.641
  Direct efficiency                                 2%         25%         48%         81%
  Direct velocity                               46.047      50.092      29.672      97.306
  Percentage direct scans                           0%          4%          2%          8%
  Page writes by reclaim                       1616049           0           0           0
  Page writes file                             1517870           0           0           0
  Page writes anon                               98179           0           0           0
  Page reclaim immediate                        103778       27339        9796       17831
  Page rescued immediate                             0           0           0           0
  Slabs scanned                                1096704      986112      980992      998400
  Direct inode steals                              223      215040      216736      247881
  Kswapd inode steals                           175331       61548       68444       63066
  Kswapd skipped wait                            21991           0           1           0
  THP fault alloc                                    1         135         125         134
  THP collapse alloc                               393         311         228         236
  THP splits                                        25          13           7           8
  THP fault fallback                                 0           0           0           0
  THP collapse fail                                  3           5           7           7
  Compaction stalls                                865        1270        1422        1518
  Compaction success                               370         401         353         383
  Compaction failures                              495         869        1069        1135
  Compaction pages moved                        870155     3828868     4036106     4423626
  Compaction move failure                        26429       23865       29742       27514

Success rates are completely hosed for 3.4-rc2 which is almost certainly
due to commit fe2c2a106663 ("vmscan: reclaim at order 0 when compaction
is enabled").  I expected this would happen for kswapd and impair
allocation success rates (https://lkml.org/lkml/2012/1/25/166) but I did
not anticipate this much a difference: 80% less scanning, 37% less
reclaim by kswapd

In comparison, reclaim/compaction is not aggressive and gives up easily
which is the intended behaviour.  hugetlbfs uses __GFP_REPEAT and would
be much more aggressive about reclaim/compaction than THP allocations
are.  The stress test above is allocating like neither THP or hugetlbfs
but is much closer to THP.

Mainline is now impaired in terms of high order allocation under heavy
load although I do not know to what degree as I did not test with
__GFP_REPEAT.  Keep this in mind for bugs related to hugepage pool
resizing, THP allocation and high order atomic allocation failures from
network devices.

In terms of congestion throttling, I see the following for this test

  FTrace Reclaim Statistics: congestion_wait
  Direct number congest     waited                 3          0          0          0
  Direct time   congest     waited               0ms        0ms        0ms        0ms
  Direct full   congest     waited                 0          0          0          0
  Direct number conditional waited               957        512       1081       1075
  Direct time   conditional waited               0ms        0ms        0ms        0ms
  Direct full   conditional waited                 0          0          0          0
  KSwapd number congest     waited                36          4          3          5
  KSwapd time   congest     waited            3148ms      400ms      300ms      500ms
  KSwapd full   congest     waited                30          4          3          5
  KSwapd number conditional waited             88514        197        332        542
  KSwapd time   conditional waited            4980ms        0ms        0ms        0ms
  KSwapd full   conditional waited                49          0          0          0

The "conditional waited" times are the most interesting as this is
directly impacted by the number of dirty pages encountered during scan.
As lumpy reclaim is no longer scanning contiguous ranges, it is finding
fewer dirty pages.  This brings wait times from about 5 seconds to 0.
kswapd itself is still calling congestion_wait() so it'll still stall but
it's a lot less.

In terms of the type of IO we were doing, I see this

  FTrace Reclaim Statistics: mm_vmscan_writepage
  Direct writes anon  sync                         0          0          0          0
  Direct writes anon  async                        0          0          0          0
  Direct writes file  sync                         0          0          0          0
  Direct writes file  async                        0          0          0          0
  Direct writes mixed sync                         0          0          0          0
  Direct writes mixed async                        0          0          0          0
  KSwapd writes anon  sync                         0          0          0          0
  KSwapd writes anon  async                    91682          0          0          0
  KSwapd writes file  sync                         0          0          0          0
  KSwapd writes file  async                   822629          0          0          0
  KSwapd writes mixed sync                         0          0          0          0
  KSwapd writes mixed async                        0          0          0          0

In 3.2, kswapd was doing a bunch of async writes of pages but
reclaim/compaction was never reaching a point where it was doing sync
IO.  This does not guarantee that reclaim/compaction was not calling
wait_on_page_writeback() but I would consider it unlikely.  It indicates
that merging patches 2 and 3 to stop reclaim/compaction calling
wait_on_page_writeback() should be safe.

This patch:

Lumpy reclaim had a purpose but in the mind of some, it was to kick the
system so hard it trashed.  For others the purpose was to complicate
vmscan.c.  Over time it was giving softer shoes and a nicer attitude but
memory compaction needs to step up and replace it so this patch sends
lumpy reclaim to the farm.

The tracepoint format changes for isolating LRU pages with this patch
applied.  Furthermore reclaim/compaction can no longer queue dirty pages
in pageout() if the underlying BDI is congested.  Lumpy reclaim used
this logic and reclaim/compaction was using it in error.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ying Han <yinghan@google.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/trace/events/vmscan.h | 26 ++++----------------------
 1 file changed, 4 insertions(+), 22 deletions(-)

(limited to 'include/trace')

diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 572195459d58..bdaf32f8a874 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -263,22 +263,16 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
 		unsigned long nr_requested,
 		unsigned long nr_scanned,
 		unsigned long nr_taken,
-		unsigned long nr_lumpy_taken,
-		unsigned long nr_lumpy_dirty,
-		unsigned long nr_lumpy_failed,
 		isolate_mode_t isolate_mode,
 		int file),
 
-	TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode, file),
+	TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file),
 
 	TP_STRUCT__entry(
 		__field(int, order)
 		__field(unsigned long, nr_requested)
 		__field(unsigned long, nr_scanned)
 		__field(unsigned long, nr_taken)
-		__field(unsigned long, nr_lumpy_taken)
-		__field(unsigned long, nr_lumpy_dirty)
-		__field(unsigned long, nr_lumpy_failed)
 		__field(isolate_mode_t, isolate_mode)
 		__field(int, file)
 	),
@@ -288,22 +282,16 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
 		__entry->nr_requested = nr_requested;
 		__entry->nr_scanned = nr_scanned;
 		__entry->nr_taken = nr_taken;
-		__entry->nr_lumpy_taken = nr_lumpy_taken;
-		__entry->nr_lumpy_dirty = nr_lumpy_dirty;
-		__entry->nr_lumpy_failed = nr_lumpy_failed;
 		__entry->isolate_mode = isolate_mode;
 		__entry->file = file;
 	),
 
-	TP_printk("isolate_mode=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu contig_taken=%lu contig_dirty=%lu contig_failed=%lu file=%d",
+	TP_printk("isolate_mode=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu file=%d",
 		__entry->isolate_mode,
 		__entry->order,
 		__entry->nr_requested,
 		__entry->nr_scanned,
 		__entry->nr_taken,
-		__entry->nr_lumpy_taken,
-		__entry->nr_lumpy_dirty,
-		__entry->nr_lumpy_failed,
 		__entry->file)
 );
 
@@ -313,13 +301,10 @@ DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate,
 		unsigned long nr_requested,
 		unsigned long nr_scanned,
 		unsigned long nr_taken,
-		unsigned long nr_lumpy_taken,
-		unsigned long nr_lumpy_dirty,
-		unsigned long nr_lumpy_failed,
 		isolate_mode_t isolate_mode,
 		int file),
 
-	TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode, file)
+	TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file)
 
 );
 
@@ -329,13 +314,10 @@ DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_memcg_isolate,
 		unsigned long nr_requested,
 		unsigned long nr_scanned,
 		unsigned long nr_taken,
-		unsigned long nr_lumpy_taken,
-		unsigned long nr_lumpy_dirty,
-		unsigned long nr_lumpy_failed,
 		isolate_mode_t isolate_mode,
 		int file),
 
-	TP_ARGS(order, nr_requested, nr_scanned, nr_taken, nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, isolate_mode, file)
+	TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file)
 
 );
 
-- 
cgit v1.2.3


From 41ac1999c3e3563e1810b14878a869c79c9368bb Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Tue, 29 May 2012 15:06:19 -0700
Subject: mm: vmscan: do not stall on writeback during memory compaction

This patch stops reclaim/compaction entering sync reclaim as this was
only intended for lumpy reclaim and an oversight.  Page migration has
its own logic for stalling on writeback pages if necessary and memory
compaction is already using it.

Waiting on page writeback is bad for a number of reasons but the primary
one is that waiting on writeback to a slow device like USB can take a
considerable length of time.  Page reclaim instead uses
wait_iff_congested() to throttle if too many dirty pages are being
scanned.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ying Han <yinghan@google.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/trace/events/vmscan.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/trace')

diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index bdaf32f8a874..82f693395ac5 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -13,7 +13,7 @@
 #define RECLAIM_WB_ANON		0x0001u
 #define RECLAIM_WB_FILE		0x0002u
 #define RECLAIM_WB_MIXED	0x0010u
-#define RECLAIM_WB_SYNC		0x0004u
+#define RECLAIM_WB_SYNC		0x0004u /* Unused, all reclaim async */
 #define RECLAIM_WB_ASYNC	0x0008u
 
 #define show_reclaim_flags(flags)				\
@@ -27,13 +27,13 @@
 
 #define trace_reclaim_flags(page, sync) ( \
 	(page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
-	(sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC)   \
+	(RECLAIM_WB_ASYNC) \
 	)
 
-#define trace_shrink_flags(file, sync) ( \
-	(sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_MIXED : \
-			(file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON)) |  \
-	(sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \
+#define trace_shrink_flags(file, sync) \
+	( \
+		(file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
+		(RECLAIM_WB_ASYNC) \
 	)
 
 TRACE_EVENT(mm_vmscan_kswapd_sleep,
-- 
cgit v1.2.3


From 23b9da55c5b0feb484bd5e8615f4eb1ce4169453 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Tue, 29 May 2012 15:06:20 -0700
Subject: mm: vmscan: remove reclaim_mode_t

There is little motiviation for reclaim_mode_t once RECLAIM_MODE_[A]SYNC
and lumpy reclaim have been removed.  This patch gets rid of
reclaim_mode_t as well and improves the documentation about what
reclaim/compaction is and when it is triggered.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ying Han <yinghan@google.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/trace/events/vmscan.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/trace')

diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 82f693395ac5..bab3b87e4064 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -25,12 +25,12 @@
 		{RECLAIM_WB_ASYNC,	"RECLAIM_WB_ASYNC"}	\
 		) : "RECLAIM_WB_NONE"
 
-#define trace_reclaim_flags(page, sync) ( \
+#define trace_reclaim_flags(page) ( \
 	(page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
 	(RECLAIM_WB_ASYNC) \
 	)
 
-#define trace_shrink_flags(file, sync) \
+#define trace_shrink_flags(file) \
 	( \
 		(file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
 		(RECLAIM_WB_ASYNC) \
-- 
cgit v1.2.3