352 files changed, 13209 insertions, 4409 deletions
diff --git a/.mailmap b/.mailmap
index dfab12f809ed..eba9bf953ef5 100644
--- a/.mailmap
+++ b/.mailmap
@@ -66,6 +66,7 @@ Kenneth W Chen <kenneth.w.chen@intel.com>
 Koushik <raghavendra.koushik@neterion.com>
 Leonid I Ananiev <leonid.i.ananiev@intel.com>
 Linas Vepstas <linas@austin.ibm.com>
+Mark Brown <broonie@sirena.org.uk>
 Matthieu CASTET <castet.matthieu@free.fr>
 Michael Buesch <mb@bu3sch.de>
 Michael Buesch <mbuesch@freenet.de>
diff --git a/Documentation/cgroups.txt b/Documentation/cgroups/cgroups.txt
index d9014aa0eb68..d9014aa0eb68 100644
--- a/Documentation/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
diff --git a/Documentation/cgroups/freezer-subsystem.txt b/Documentation/cgroups/freezer-subsystem.txt
new file mode 100644
index 000000000000..c50ab58b72eb
--- /dev/null
+++ b/Documentation/cgroups/freezer-subsystem.txt
@@ -0,0 +1,99 @@
+	The cgroup freezer is useful to batch job management system which start
+and stop sets of tasks in order to schedule the resources of a machine
+according to the desires of a system administrator. This sort of program
+is often used on HPC clusters to schedule access to the cluster as a
+whole. The cgroup freezer uses cgroups to describe the set of tasks to
+be started/stopped by the batch job management system. It also provides
+a means to start and stop the tasks composing the job.
+
+	The cgroup freezer will also be useful for checkpointing running groups
+of tasks. The freezer allows the checkpoint code to obtain a consistent
+image of the tasks by attempting to force the tasks in a cgroup into a
+quiescent state. Once the tasks are quiescent another task can
+walk /proc or invoke a kernel interface to gather information about the
+quiesced tasks. Checkpointed tasks can be restarted later should a
+recoverable error occur. This also allows the checkpointed tasks to be
+migrated between nodes in a cluster by copying the gathered information
+to another node and restarting the tasks there.
+
+	Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping
+and resuming tasks in userspace. Both of these signals are observable
+from within the tasks we wish to freeze. While SIGSTOP cannot be caught,
+blocked, or ignored it can be seen by waiting or ptracing parent tasks.
+SIGCONT is especially unsuitable since it can be caught by the task. Any
+programs designed to watch for SIGSTOP and SIGCONT could be broken by
+attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can
+demonstrate this problem using nested bash shells:
+
+	$ echo $$
+	16644
+	$ bash
+	$ echo $$
+	16690
+
+	From a second, unrelated bash shell:
+	$ kill -SIGSTOP 16690
+	$ kill -SIGCONT 16990
+
+	<at this point 16990 exits and causes 16644 to exit too>
+
+	This happens because bash can observe both signals and choose how it
+responds to them.
+
+	Another example of a program which catches and responds to these
+signals is gdb. In fact any program designed to use ptrace is likely to
+have a problem with this method of stopping and resuming tasks.
+
+	 In contrast, the cgroup freezer uses the kernel freezer code to
+prevent the freeze/unfreeze cycle from becoming visible to the tasks
+being frozen. This allows the bash example above and gdb to run as
+expected.
+
+	The freezer subsystem in the container filesystem defines a file named
+freezer.state. Writing "FROZEN" to the state file will freeze all tasks in the
+cgroup. Subsequently writing "THAWED" will unfreeze the tasks in the cgroup.
+Reading will return the current state.
+
+* Examples of usage :
+
+   # mkdir /containers/freezer
+   # mount -t cgroup -ofreezer freezer  /containers
+   # mkdir /containers/0
+   # echo $some_pid > /containers/0/tasks
+
+to get status of the freezer subsystem :
+
+   # cat /containers/0/freezer.state
+   THAWED
+
+to freeze all tasks in the container :
+
+   # echo FROZEN > /containers/0/freezer.state
+   # cat /containers/0/freezer.state
+   FREEZING
+   # cat /containers/0/freezer.state
+   FROZEN
+
+to unfreeze all tasks in the container :
+
+   # echo THAWED > /containers/0/freezer.state
+   # cat /containers/0/freezer.state
+   THAWED
+
+This is the basic mechanism which should do the right thing for user space task
+in a simple scenario.
+
+It's important to note that freezing can be incomplete. In that case we return
+EBUSY. This means that some tasks in the cgroup are busy doing something that
+prevents us from completely freezing the cgroup at this time. After EBUSY,
+the cgroup will remain partially frozen -- reflected by freezer.state reporting
+"FREEZING" when read. The state will remain "FREEZING" until one of these
+things happens:
+
+	1) Userspace cancels the freezing operation by writing "THAWED" to
+		the freezer.state file
+	2) Userspace retries the freezing operation by writing "FROZEN" to
+		the freezer.state file (writing "FREEZING" is not legal
+		and returns EIO)
+	3) The tasks that blocked the cgroup from entering the "FROZEN"
+		state disappear from the cgroup's set of tasks.
diff --git a/Documentation/controllers/memory.txt b/Documentation/controllers/memory.txt
index 9b53d5827361..1c07547d3f81 100644
--- a/Documentation/controllers/memory.txt
+++ b/Documentation/controllers/memory.txt
@@ -112,14 +112,22 @@ the per cgroup LRU.
 
 2.2.1 Accounting details
 
-All mapped pages (RSS) and unmapped user pages (Page Cache) are accounted.
-RSS pages are accounted at the time of page_add_*_rmap() unless they've already
-been accounted for earlier. A file page will be accounted for as Page Cache;
-it's mapped into the page tables of a process, duplicate accounting is carefully
-avoided. Page Cache pages are accounted at the time of add_to_page_cache().
-The corresponding routines that remove a page from the page tables or removes
-a page from Page Cache is used to decrement the accounting counters of the
-cgroup.
+All mapped anon pages (RSS) and cache pages (Page Cache) are accounted.
+(some pages which never be reclaimable and will not be on global LRU
+ are not accounted. we just accounts pages under usual vm management.)
+
+RSS pages are accounted at page_fault unless they've already been accounted
+for earlier. A file page will be accounted for as Page Cache when it's
+inserted into inode (radix-tree). While it's mapped into the page tables of
+processes, duplicate accounting is carefully avoided.
+
+A RSS page is unaccounted when it's fully unmapped. A PageCache page is
+unaccounted when it's removed from radix-tree.
+
+At page migration, accounting information is kept.
+
+Note: we just account pages-on-lru because our purpose is to control amount
+of used pages. not-on-lru pages are tend to be out-of-control from vm view.
 
 2.3 Shared Page Accounting
 
diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
index 47e568a9370a..5c86c258c791 100644
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -48,7 +48,7 @@ hooks, beyond what is already present, required to manage dynamic
 job placement on large systems.
 
 Cpusets use the generic cgroup subsystem described in
-Documentation/cgroup.txt.
+Documentation/cgroups/cgroups.txt.
 
 Requests by a task, using the sched_setaffinity(2) system call to
 include CPUs in its CPU affinity mask, and using the mbind(2) and
diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt
index 295f26cd895a..9dd2a3bb2acc 100644
--- a/Documentation/filesystems/ext3.txt
+++ b/Documentation/filesystems/ext3.txt
@@ -96,6 +96,11 @@ errors=remount-ro(*)	Remount the filesystem read-only on an error.
 errors=continue		Keep going on a filesystem error.
 errors=panic		Panic and halt the machine if an error occurs.
 
+data_err=ignore(*)	Just print an error message if an error occurs
+			in a file data buffer in ordered mode.
+data_err=abort		Abort the journal if an error occurs in a file
+			data buffer in ordered mode.
+
 grpid			Give objects the same group ID as their creator.
 bsdgroups
 
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index c032bf39e8b9..bcceb99b81dd 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -1384,15 +1384,18 @@ causes the kernel to prefer to reclaim dentries and inodes.
 dirty_background_ratio
 ----------------------
 
-Contains, as a percentage of total system memory, the number of pages at which
-the pdflush background writeback daemon will start writing out dirty data.
+Contains, as a percentage of the dirtyable system memory (free pages + mapped
+pages + file cache, not including locked pages and HugePages), the number of
+pages at which the pdflush background writeback daemon will start writing out
+dirty data.
 
 dirty_ratio
 -----------------
 
-Contains, as a percentage of total system memory, the number of pages at which
-a process which is generating disk writes will itself start writing out dirty
-data.
+Contains, as a percentage of the dirtyable system memory (free pages + mapped
+pages + file cache, not including locked pages and HugePages), the number of
+pages at which a process which is generating disk writes will itself start
+writing out dirty data.
 
 dirty_writeback_centisecs
 -------------------------
@@ -2412,24 +2415,29 @@ will be dumped when the <pid> process is dumped. coredump_filter is a bitmask
 of memory types. If a bit of the bitmask is set, memory segments of the
 corresponding memory type are dumped, otherwise they are not dumped.
 
-The following 4 memory types are supported:
+The following 7 memory types are supported:
   - (bit 0) anonymous private memory
   - (bit 1) anonymous shared memory
   - (bit 2) file-backed private memory
   - (bit 3) file-backed shared memory
   - (bit 4) ELF header pages in file-backed private memory areas (it is
             effective only if the bit 2 is cleared)
+  - (bit 5) hugetlb private memory
+  - (bit 6) hugetlb shared memory
 
   Note that MMIO pages such as frame buffer are never dumped and vDSO pages
   are always dumped regardless of the bitmask status.
 
-Default value of coredump_filter is 0x3; this means all anonymous memory
-segments are dumped.
+  Note bit 0-4 doesn't effect any hugetlb memory. hugetlb memory are only
+  effected by bit 5-6.
+
+Default value of coredump_filter is 0x23; this means all anonymous memory
+segments and hugetlb private memory are dumped.
 
 If you don't want to dump all shared memory segments attached to pid 1234,
-write 1 to the process's proc file.
+write 0x21 to the process's proc file.
 
-  $ echo 0x1 > /proc/1234/coredump_filter
+  $ echo 0x21 > /proc/1234/coredump_filter
 
 When a new process is created, the process inherits the bitmask status from its
 parent. It is useful to set up coredump_filter before the program runs.
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index bcecfaa1e770..0f1544f67400 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -690,7 +690,7 @@ and is between 256 and 4096 characters. It is defined in the file
 			See Documentation/block/as-iosched.txt and
 			Documentation/block/deadline-iosched.txt for details.
 
-	elfcorehdr=	[X86-32, X86_64]
+	elfcorehdr=	[IA64,PPC,SH,X86-32,X86_64]
 			Specifies physical address of start of kernel core
 			image elf header. Generally kexec loader will
 			pass this option to capture kernel.
diff --git a/Documentation/mtd/nand_ecc.txt b/Documentation/mtd/nand_ecc.txt
new file mode 100644
index 000000000000..bdf93b7f0f24
--- /dev/null
+++ b/Documentation/mtd/nand_ecc.txt
@@ -0,0 +1,714 @@
+Introduction
+============
+
+Having looked at the linux mtd/nand driver and more specific at nand_ecc.c
+I felt there was room for optimisation. I bashed the code for a few hours
+performing tricks like table lookup removing superfluous code etc.
+After that the speed was increased by 35-40%.
+Still I was not too happy as I felt there was additional room for improvement.
+
+Bad! I was hooked.
+I decided to annotate my steps in this file. Perhaps it is useful to someone
+or someone learns something from it.
+
+
+The problem
+===========
+
+NAND flash (at least SLC one) typically has sectors of 256 bytes.
+However NAND flash is not extremely reliable so some error detection
+(and sometimes correction) is needed.
+
+This is done by means of a Hamming code. I'll try to explain it in
+laymans terms (and apologies to all the pro's in the field in case I do
+not use the right terminology, my coding theory class was almost 30
+years ago, and I must admit it was not one of my favourites).
+
+As I said before the ecc calculation is performed on sectors of 256
+bytes. This is done by calculating several parity bits over the rows and
+columns. The parity used is even parity which means that the parity bit = 1
+if the data over which the parity is calculated is 1 and the parity bit = 0
+if the data over which the parity is calculated is 0. So the total
+number of bits over the data over which the parity is calculated + the
+parity bit is even. (see wikipedia if you can't follow this).
+Parity is often calculated by means of an exclusive or operation,
+sometimes also referred to as xor. In C the operator for xor is ^
+
+Back to ecc.
+Let's give a small figure:
+
+byte   0:  bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0   rp0 rp2 rp4 ... rp14
+byte   1:  bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0   rp1 rp2 rp4 ... rp14
+byte   2:  bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0   rp0 rp3 rp4 ... rp14
+byte   3:  bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0   rp1 rp3 rp4 ... rp14
+byte   4:  bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0   rp0 rp2 rp5 ... rp14
+....
+byte 254:  bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0   rp0 rp3 rp5 ... rp15
+byte 255:  bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0   rp1 rp3 rp5 ... rp15
+           cp1  cp0  cp1  cp0  cp1  cp0  cp1  cp0
+           cp3  cp3  cp2  cp2  cp3  cp3  cp2  cp2
+           cp5  cp5  cp5  cp5  cp4  cp4  cp4  cp4
+
+This figure represents a sector of 256 bytes.
+cp is my abbreviaton for column parity, rp for row parity.
+
+Let's start to explain column parity.
+cp0 is the parity that belongs to all bit0, bit2, bit4, bit6.
+so the sum of all bit0, bit2, bit4 and bit6 values + cp0 itself is even.
+Similarly cp1 is the sum of all bit1, bit3, bit5 and bit7.
+cp2 is the parity over bit0, bit1, bit4 and bit5
+cp3 is the parity over bit2, bit3, bit6 and bit7.
+cp4 is the parity over bit0, bit1, bit2 and bit3.
+cp5 is the parity over bit4, bit5, bit6 and bit7.
+Note that each of cp0 .. cp5 is exactly one bit.
+
+Row parity actually works almost the same.
+rp0 is the parity of all even bytes (0, 2, 4, 6, ... 252, 254)
+rp1 is the parity of all odd bytes (1, 3, 5, 7, ..., 253, 255)
+rp2 is the parity of all bytes 0, 1, 4, 5, 8, 9, ...
+(so handle two bytes, then skip 2 bytes).
+rp3 is covers the half rp2 does not cover (bytes 2, 3, 6, 7, 10, 11, ...)
+for rp4 the rule is cover 4 bytes, skip 4 bytes, cover 4 bytes, skip 4 etc.
+so rp4 calculates parity over bytes 0, 1, 2, 3, 8, 9, 10, 11, 16, ...)
+and rp5 covers the other half, so bytes 4, 5, 6, 7, 12, 13, 14, 15, 20, ..
+The story now becomes quite boring. I guess you get the idea.
+rp6 covers 8 bytes then skips 8 etc
+rp7 skips 8 bytes then covers 8 etc
+rp8 covers 16 bytes then skips 16 etc
+rp9 skips 16 bytes then covers 16 etc
+rp10 covers 32 bytes then skips 32 etc
+rp11 skips 32 bytes then covers 32 etc
+rp12 covers 64 bytes then skips 64 etc
+rp13 skips 64 bytes then covers 64 etc
+rp14 covers 128 bytes then skips 128
+rp15 skips 128 bytes then covers 128
+
+In the end the parity bits are grouped together in three bytes as
+follows:
+ECC    Bit 7 Bit 6 Bit 5 Bit 4 Bit 3 Bit 2 Bit 1 Bit 0
+ECC 0   rp07  rp06  rp05  rp04  rp03  rp02  rp01  rp00
+ECC 1   rp15  rp14  rp13  rp12  rp11  rp10  rp09  rp08
+ECC 2   cp5   cp4   cp3   cp2   cp1   cp0      1     1
+
+I detected after writing this that ST application note AN1823
+(http://www.st.com/stonline/books/pdf/docs/10123.pdf) gives a much
+nicer picture.(but they use line parity as term where I use row parity)
+Oh well, I'm graphically challenged, so suffer with me for a moment :-)
+And I could not reuse the ST picture anyway for copyright reasons.
+
+
+Attempt 0
+=========
+
+Implementing the parity calculation is pretty simple.
+In C pseudocode:
+for (i = 0; i < 256; i++)
+{
+    if (i & 0x01)
+       rp1 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp1;
+    else
+       rp0 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp1;
+    if (i & 0x02)
+       rp3 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp3;
+    else
+       rp2 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp2;
+    if (i & 0x04)
+      rp5 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp5;
+    else
+      rp4 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp4;
+    if (i & 0x08)
+      rp7 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp7;
+    else
+      rp6 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp6;
+    if (i & 0x10)
+      rp9 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp9;
+    else
+      rp8 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp8;
+    if (i & 0x20)
+      rp11 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp11;
+    else
+    rp10 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp10;
+    if (i & 0x40)
+      rp13 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp13;
+    else
+      rp12 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp12;
+    if (i & 0x80)
+      rp15 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp15;
+    else
+      rp14 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp14;
+    cp0 = bit6 ^ bit4 ^ bit2 ^ bit0 ^ cp0;
+    cp1 = bit7 ^ bit5 ^ bit3 ^ bit1 ^ cp1;
+    cp2 = bit5 ^ bit4 ^ bit1 ^ bit0 ^ cp2;
+    cp3 = bit7 ^ bit6 ^ bit3 ^ bit2 ^ cp3
+    cp4 = bit3 ^ bit2 ^ bit1 ^ bit0 ^ cp4
+    cp5 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ cp5
+}
+
+
+Analysis 0
+==========
+
+C does have bitwise operators but not really operators to do the above
+efficiently (and most hardware has no such instructions either).
+Therefore without implementing this it was clear that the code above was
+not going to bring me a Nobel prize :-)
+
+Fortunately the exclusive or operation is commutative, so we can combine
+the values in any order. So instead of calculating all the bits
+individually, let us try to rearrange things.
+For the column parity this is easy. We can just xor the bytes and in the
+end filter out the relevant bits. This is pretty nice as it will bring
+all cp calculation out of the if loop.
+
+Similarly we can first xor the bytes for the various rows.
+This leads to:
+
+
+Attempt 1
+=========
+
+const char parity[256] = {
+    0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+    1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+    1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+    0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+    1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+    0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+    0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+    1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+    1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+    0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+    0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+    1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+    0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+    1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+    1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+    0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0
+};
+
+void ecc1(const unsigned char *buf, unsigned char *code)
+{
+    int i;
+    const unsigned char *bp = buf;
+    unsigned char cur;
+    unsigned char rp0, rp1, rp2, rp3, rp4, rp5, rp6, rp7;
+    unsigned char rp8, rp9, rp10, rp11, rp12, rp13, rp14, rp15;
+    unsigned char par;
+
+    par = 0;
+    rp0 = 0; rp1 = 0; rp2 = 0; rp3 = 0;
+    rp4 = 0; rp5 = 0; rp6 = 0; rp7 = 0;
+    rp8 = 0; rp9 = 0; rp10 = 0; rp11 = 0;
+    rp12 = 0; rp13 = 0; rp14 = 0; rp15 = 0;
+
+    for (i = 0; i < 256; i++)
+    {
+        cur = *bp++;
+        par ^= cur;
+        if (i & 0x01) rp1 ^= cur; else rp0 ^= cur;
+        if (i & 0x02) rp3 ^= cur; else rp2 ^= cur;
+        if (i & 0x04) rp5 ^= cur; else rp4 ^= cur;
+        if (i & 0x08) rp7 ^= cur; else rp6 ^= cur;
+        if (i & 0x10) rp9 ^= cur; else rp8 ^= cur;
+        if (i & 0x20) rp11 ^= cur; else rp10 ^= cur;
+        if (i & 0x40) rp13 ^= cur; else rp12 ^= cur;
+        if (i & 0x80) rp15 ^= cur; else rp14 ^= cur;
+    }
+    code[0] =
+        (parity[rp7] << 7) |
+        (parity[rp6] << 6) |
+        (parity[rp5] << 5) |
+        (parity[rp4] << 4) |
+        (parity[rp3] << 3) |
+        (parity[rp2] << 2) |
+        (parity[rp1] << 1) |
+        (parity[rp0]);
+    code[1] =
+        (parity[rp15] << 7) |
+        (parity[rp14] << 6) |
+        (parity[rp13] << 5) |
+        (parity[rp12] << 4) |
+        (parity[rp11] << 3) |
+        (parity[rp10] << 2) |
+        (parity[rp9]  << 1) |
+        (parity[rp8]);
+    code[2] =
+        (parity[par & 0xf0] << 7) |
+        (parity[par & 0x0f] << 6) |
+        (parity[par & 0xcc] << 5) |
+        (parity[par & 0x33] << 4) |
+        (parity[par & 0xaa] << 3) |
+        (parity[par & 0x55] << 2);
+    code[0] = ~code[0];
+    code[1] = ~code[1];
+    code[2] = ~code[2];
+}
+
+Still pretty straightforward. The last three invert statements are there to
+give a checksum of 0xff 0xff 0xff for an empty flash. In an empty flash
+all data is 0xff, so the checksum then matches.
+
+I also introduced the parity lookup. I expected this to be the fastest
+way to calculate the parity, but I will investigate alternatives later
+on.
+
+
+Analysis 1
+==========
+
+The code works, but is not terribly efficient. On my system it took
+almost 4 times as much time as the linux driver code. But hey, if it was
+*that* easy this would have been done long before.
+No pain. no gain.
+
+Fortunately there is plenty of room for improvement.
+
+In step 1 we moved from bit-wise calculation to byte-wise calculation.
+However in C we can also use the unsigned long data type and virtually
+every modern microprocessor supports 32 bit operations, so why not try
+to write our code in such a way that we process data in 32 bit chunks.
+
+Of course this means some modification as the row parity is byte by
+byte. A quick analysis:
+for the column parity we use the par variable. When extending to 32 bits
+we can in the end easily calculate p0 and p1 from it.
+(because par now consists of 4 bytes, contributing to rp1, rp0, rp1, rp0
+respectively)
+also rp2 and rp3 can be easily retrieved from par as rp3 covers the
+first two bytes and rp2 the last two bytes.
+
+Note that of course now the loop is executed only 64 times (256/4).
+And note that care must taken wrt byte ordering. The way bytes are
+ordered in a long is machine dependent, and might affect us.
+Anyway, if there is an issue: this code is developed on x86 (to be
+precise: a DELL PC with a D920 Intel CPU)
+
+And of course the performance might depend on alignment, but I expect
+that the I/O buffers in the nand driver are aligned properly (and
+otherwise that should be fixed to get maximum performance).
+
+Let's give it a try...
+
+
+Attempt 2
+=========
+
+extern const char parity[256];
+
+void ecc2(const unsigned char *buf, unsigned char *code)
+{
+    int i;
+    const unsigned long *bp = (unsigned long *)buf;
+    unsigned long cur;
+    unsigned long rp0, rp1, rp2, rp3, rp4, rp5, rp6, rp7;
+    unsigned long rp8, rp9, rp10, rp11, rp12, rp13, rp14, rp15;
+    unsigned long par;
+
+    par = 0;
+    rp0 = 0; rp1 = 0; rp2 = 0; rp3 = 0;
+    rp4 = 0; rp5 = 0; rp6 = 0; rp7 = 0;
+    rp8 = 0; rp9 = 0; rp10 = 0; rp11 = 0;
+    rp12 = 0; rp13 = 0; rp14 = 0; rp15 = 0;
+
+    for (i = 0; i < 64; i++)
+    {
+        cur = *bp++;
+        par ^= cur;
+        if (i & 0x01) rp5 ^= cur; else rp4 ^= cur;
+        if (i & 0x02) rp7 ^= cur; else rp6 ^= cur;
+        if (i & 0x04) rp9 ^= cur; else rp8 ^= cur;
+        if (i & 0x08) rp11 ^= cur; else rp10 ^= cur;
+        if (i & 0x10) rp13 ^= cur; else rp12 ^= cur;
+        if (i & 0x20) rp15 ^= cur; else rp14 ^= cur;
+    }
+    /*
+       we need to adapt the code generation for the fact that rp vars are now
+       long; also the column parity calculation needs to be changed.
+       we'll bring rp4 to 15 back to single byte entities by shifting and
+       xoring
+    */
+    rp4 ^= (rp4 >> 16); rp4 ^= (rp4 >> 8); rp4 &= 0xff;
+    rp5 ^= (rp5 >> 16); rp5 ^= (rp5 >> 8); rp5 &= 0xff;
+    rp6 ^= (rp6 >> 16); rp6 ^= (rp6 >> 8); rp6 &= 0xff;
+    rp7 ^= (rp7 >> 16); rp7 ^= (rp7 >> 8); rp7 &= 0xff;
+    rp8 ^= (rp8 >> 16); rp8 ^= (rp8 >> 8); rp8 &= 0xff;
+    rp9 ^= (rp9 >> 16); rp9 ^= (rp9 >> 8); rp9 &= 0xff;
+    rp10 ^= (rp10 >> 16); rp10 ^= (rp10 >> 8); rp10 &= 0xff;
+    rp11 ^= (rp11 >> 16); rp11 ^= (rp11 >> 8); rp11 &= 0xff;
+    rp12 ^= (rp12 >> 16); rp12 ^= (rp12 >> 8); rp12 &= 0xff;
+    rp13 ^= (rp13 >> 16); rp13 ^= (rp13 >> 8); rp13 &= 0xff;
+    rp14 ^= (rp14 >> 16); rp14 ^= (rp14 >> 8); rp14 &= 0xff;
+    rp15 ^= (rp15 >> 16); rp15 ^= (rp15 >> 8); rp15 &= 0xff;
+    rp3 = (par >> 16); rp3 ^= (rp3 >> 8); rp3 &= 0xff;
+    rp2 = par & 0xffff; rp2 ^= (rp2 >> 8); rp2 &= 0xff;
+    par ^= (par >> 16);
+    rp1 = (par >> 8); rp1 &= 0xff;
+    rp0 = (par & 0xff);
+    par ^= (par >> 8); par &= 0xff;
+
+    code[0] =
+        (parity[rp7] << 7) |
+        (parity[rp6] << 6) |
+        (parity[rp5] << 5) |
+        (parity[rp4] << 4) |
+        (parity[rp3] << 3) |
+        (parity[rp2] << 2) |
+        (parity[rp1] << 1) |
+        (parity[rp0]);
+    code[1] =
+        (parity[rp15] << 7) |
+        (parity[rp14] << 6) |
+        (parity[rp13] << 5) |
+        (parity[rp12] << 4) |
+        (parity[rp11] << 3) |
+        (parity[rp10] << 2) |
+        (parity[rp9]  << 1) |
+        (parity[rp8]);
+    code[2] =
+        (parity[par & 0xf0] << 7) |
+        (parity[par & 0x0f] << 6) |
+        (parity[par & 0xcc] << 5) |
+        (parity[par & 0x33] << 4) |
+        (parity[par & 0xaa] << 3) |
+        (parity[par & 0x55] << 2);
+    code[0] = ~code[0];
+    code[1] = ~code[1];
+    code[2] = ~code[2];
+}
+
+The parity array is not shown any more. Note also that for these
+examples I kinda deviated from my regular programming style by allowing
+multiple statements on a line, not using { } in then and else blocks
+with only a single statement and by using operators like ^=
+
+
+Analysis 2
+==========
+
+The code (of course) works, and hurray: we are a little bit faster than
+the linux driver code (about 15%). But wait, don't cheer too quickly.
+THere is more to be gained.
+If we look at e.g. rp14 and rp15 we see that we either xor our data with
+rp14 or with rp15. However we also have par which goes over all data.
+This means there is no need to calculate rp14 as it can be calculated from
+rp15 through rp14 = par ^ rp15;
+(or if desired we can avoid calculating rp15 and calculate it from
+rp14).  That is why some places refer to inverse parity.
+Of course the same thing holds for rp4/5, rp6/7, rp8/9, rp10/11 and rp12/13.
+Effectively this means we can eliminate the else clause from the if
+statements. Also we can optimise the calculation in the end a little bit
+by going from long to byte first. Actually we can even avoid the table
+lookups
+
+Attempt 3
+=========
+
+Odd replaced:
+        if (i & 0x01) rp5 ^= cur; else rp4 ^= cur;
+        if (i & 0x02) rp7 ^= cur; else rp6 ^= cur;
+        if (i & 0x04) rp9 ^= cur; else rp8 ^= cur;
+        if (i & 0x08) rp11 ^= cur; else rp10 ^= cur;
+        if (i & 0x10) rp13 ^= cur; else rp12 ^= cur;
+        if (i & 0x20) rp15 ^= cur; else rp14 ^= cur;
+with
+        if (i & 0x01) rp5 ^= cur;
+        if (i & 0x02) rp7 ^= cur;
+        if (i & 0x04) rp9 ^= cur;
+        if (i & 0x08) rp11 ^= cur;
+        if (i & 0x10) rp13 ^= cur;
+        if (i & 0x20) rp15 ^= cur;
+
+        and outside the loop added:
+    rp4  = par ^ rp5;
+    rp6  = par ^ rp7;
+    rp8  = par ^ rp9;
+    rp10  = par ^ rp11;
+    rp12  = par ^ rp13;
+    rp14  = par ^ rp15;
+
+And after that the code takes about 30% more time, although the number of
+statements is reduced. This is also reflected in the assembly code.
+
+
+Analysis 3
+==========
+
+Very weird. Guess it has to do with caching or instruction parallellism
+or so. I also tried on an eeePC (Celeron, clocked at 900 Mhz). Interesting
+observation was that this one is only 30% slower (according to time)
+executing the code as my 3Ghz D920 processor.
+
+Well, it was expected not to be easy so maybe instead move to a
+different track: let's move back to the code from attempt2 and do some
+loop unrolling. This will eliminate a few if statements. I'll try
+different amounts of unrolling to see what works best.
+
+
+Attempt 4
+=========
+
+Unrolled the loop 1, 2, 3 and 4 times.
+For 4 the code starts with:
+
+    for (i = 0; i < 4; i++)
+    {
+        cur = *bp++;
+        par ^= cur;
+        rp4 ^= cur;
+        rp6 ^= cur;
+        rp8 ^= cur;
+        rp10 ^= cur;
+        if (i & 0x1) rp13 ^= cur; else rp12 ^= cur;
+        if (i & 0x2) rp15 ^= cur; else rp14 ^= cur;
+        cur = *bp++;
+        par ^= cur;
+        rp5 ^= cur;
+        rp6 ^= cur;
+        ...
+
+
+Analysis 4
+==========
+
+Unrolling once gains about 15%
+Unrolling twice keeps the gain at about 15%
+Unrolling three times gives a gain of 30% compared to attempt 2.
+Unrolling four times gives a marginal improvement compared to unrolling
+three times.
+
+I decided to proceed with a four time unrolled loop anyway. It was my gut
+feeling that in the next steps I would obtain additional gain from it.
+
+The next step was triggered by the fact that par contains the xor of all
+bytes and rp4 and rp5 each contain the xor of half of the bytes.
+So in effect par = rp4 ^ rp5. But as xor is commutative we can also say
+that rp5 = par ^ rp4. So no need to keep both rp4 and rp5 around. We can
+eliminate rp5 (or rp4, but I already foresaw another optimisation).
+The same holds for rp6/7, rp8/9, rp10/11 rp12/13 and rp14/15.
+
+
+Attempt 5
+=========
+
+Effectively so all odd digit rp assignments in the loop were removed.
+This included the else clause of the if statements.
+Of course after the loop we need to correct things by adding code like:
+    rp5 = par ^ rp4;
+Also the initial assignments (rp5 = 0; etc) could be removed.
+Along the line I also removed the initialisation of rp0/1/2/3.
+
+
+Analysis 5
+==========
+
+Measurements showed this was a good move. The run-time roughly halved
+compared with attempt 4 with 4 times unrolled, and we only require 1/3rd
+of the processor time compared to the current code in the linux kernel.
+
+However, still I thought there was more. I didn't like all the if
+statements. Why not keep a running parity and only keep the last if
+statement. Time for yet another version!
+
+
+Attempt 6
+=========
+
+THe code within the for loop was changed to:
+
+    for (i = 0; i < 4; i++)
+    {
+        cur = *bp++; tmppar  = cur; rp4 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp6 ^= tmppar;
+        cur = *bp++; tmppar ^= cur; rp4 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp8 ^= tmppar;
+
+        cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp6 ^= cur;
+	    cur = *bp++; tmppar ^= cur; rp4 ^= cur;
+	    cur = *bp++; tmppar ^= cur; rp10 ^= tmppar;
+
+	    cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur; rp8 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp6 ^= cur; rp8 ^= cur;
+	    cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp8 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp8 ^= cur;
+
+        cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp6 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp4 ^= cur;
+        cur = *bp++; tmppar ^= cur;
+
+	    par ^= tmppar;
+        if ((i & 0x1) == 0) rp12 ^= tmppar;
+        if ((i & 0x2) == 0) rp14 ^= tmppar;
+    }
+
+As you can see tmppar is used to accumulate the parity within a for
+iteration. In the last 3 statements is is added to par and, if needed,
+to rp12 and rp14.
+
+While making the changes I also found that I could exploit that tmppar
+contains the running parity for this iteration. So instead of having:
+rp4 ^= cur; rp6 = cur;
+I removed the rp6 = cur; statement and did rp6 ^= tmppar; on next
+statement. A similar change was done for rp8 and rp10
+
+
+Analysis 6
+==========
+
+Measuring this code again showed big gain. When executing the original
+linux code 1 million times, this took about 1 second on my system.
+(using time to measure the performance). After this iteration I was back
+to 0.075 sec. Actually I had to decide to start measuring over 10
+million interations in order not to loose too much accuracy. This one
+definitely seemed to be the jackpot!
+
+There is a little bit more room for improvement though. There are three
+places with statements:
+rp4 ^= cur; rp6 ^= cur;
+It seems more efficient to also maintain a variable rp4_6 in the while
+loop; This eliminates 3 statements per loop. Of course after the loop we
+need to correct by adding:
+    rp4 ^= rp4_6;
+    rp6 ^= rp4_6
+Furthermore there are 4 sequential assingments to rp8. This can be
+encoded slightly more efficient by saving tmppar before those 4 lines
+and later do rp8 = rp8 ^ tmppar ^ notrp8;
+(where notrp8 is the value of rp8 before those 4 lines).
+Again a use of the commutative property of xor.
+Time for a new test!
+
+
+Attempt 7
+=========
+
+The new code now looks like:
+
+    for (i = 0; i < 4; i++)
+    {
+        cur = *bp++; tmppar  = cur; rp4 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp6 ^= tmppar;
+        cur = *bp++; tmppar ^= cur; rp4 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp8 ^= tmppar;
+
+        cur = *bp++; tmppar ^= cur; rp4_6 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp6 ^= cur;
+	    cur = *bp++; tmppar ^= cur; rp4 ^= cur;
+	    cur = *bp++; tmppar ^= cur; rp10 ^= tmppar;
+
+	    notrp8 = tmppar;
+	    cur = *bp++; tmppar ^= cur; rp4_6 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp6 ^= cur;
+	    cur = *bp++; tmppar ^= cur; rp4 ^= cur;
+        cur = *bp++; tmppar ^= cur;
+	    rp8 = rp8 ^ tmppar ^ notrp8;
+
+        cur = *bp++; tmppar ^= cur; rp4_6 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp6 ^= cur;
+        cur = *bp++; tmppar ^= cur; rp4 ^= cur;
+        cur = *bp++; tmppar ^= cur;
+
+	    par ^= tmppar;
+        if ((i & 0x1) == 0) rp12 ^= tmppar;
+        if ((i & 0x2) == 0) rp14 ^= tmppar;
+    }
+    rp4 ^= rp4_6;
+    rp6 ^= rp4_6;
+
+
+Not a big change, but every penny counts :-)
+
+
+Analysis 7
+==========
+
+Acutally this made things worse. Not very much, but I don't want to move
+into the wrong direction. Maybe something to investigate later. Could
+have to do with caching again.
+
+Guess that is what there is to win within the loop. Maybe unrolling one
+more time will help. I'll keep the optimisations from 7 for now.
+
+
+Attempt 8
+=========
+
+Unrolled the loop one more time.
+
+
+Analysis 8
+==========
+
+This makes things worse. Let's stick with attempt 6 and continue from there.
+Although it seems that the code within the loop cannot be optimised
+further there is still room to optimize the generation of the ecc codes.
+We can simply calcualate the total parity. If this is 0 then rp4 = rp5
+etc. If the parity is 1, then rp4 = !rp5;
+But if rp4 = rp5 we do not need rp5 etc. We can just write the even bits
+in the result byte and then do something like
+    code[0] |= (code[0] << 1);
+Lets test this.
+
+
+Attempt 9
+=========
+
+Changed the code but again this slightly degrades performance. Tried all
+kind of other things, like having dedicated parity arrays to avoid the
+shift after parity[rp7] << 7; No gain.
+Change the lookup using the parity array by using shift operators (e.g.
+replace parity[rp7] << 7 with:
+rp7 ^= (rp7 << 4);
+rp7 ^= (rp7 << 2);
+rp7 ^= (rp7 << 1);
+rp7 &= 0x80;
+No gain.
+
+The only marginal change was inverting the parity bits, so we can remove
+the last three invert statements.
+
+Ah well, pity this does not deliver more. Then again 10 million
+iterations using the linux driver code takes between 13 and 13.5
+seconds, whereas my code now takes about 0.73 seconds for those 10
+million iterations. So basically I've improved the performance by a
+factor 18 on my system. Not that bad. Of course on different hardware
+you will get different results. No warranties!
+
+But of course there is no such thing as a free lunch. The codesize almost
+tripled (from 562 bytes to 1434 bytes). Then again, it is not that much.
+
+
+Correcting errors
+=================
+
+For correcting errors I again used the ST application note as a starter,
+but I also peeked at the existing code.
+The algorithm itself is pretty straightforward. Just xor the given and
+the calculated ecc. If all bytes are 0 there is no problem. If 11 bits
+are 1 we have one correctable bit error. If there is 1 bit 1, we have an
+error in the given ecc code.
+It proved to be fastest to do some table lookups. Performance gain
+introduced by this is about a factor 2 on my system when a repair had to
+be done, and 1% or so if no repair had to be done.
+Code size increased from 330 bytes to 686 bytes for this function.
+(gcc 4.2, -O3)
+
+
+Conclusion
+==========
+
+The gain when calculating the ecc is tremendous. Om my development hardware
+a speedup of a factor of 18 for ecc calculation was achieved. On a test on an
+embedded system with a MIPS core a factor 7 was obtained.
+On  a test with a Linksys NSLU2 (ARMv5TE processor) the speedup was a factor
+5 (big endian mode, gcc 4.1.2, -O3)
+For correction not much gain could be obtained (as bitflips are rare). Then
+again there are also much less cycles spent there.
+
+It seems there is not much more gain possible in this, at least when
+programmed in C. Of course it might be possible to squeeze something more
+out of it with an assembler program, but due to pipeline behaviour etc
+this is very tricky (at least for intel hw).
+
+Author: Frans Meulenbroeks
+Copyright (C) 2008 Koninklijke Philips Electronics NV.
diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
index 5ce0952aa065..49378a9f2b5f 100644
--- a/Documentation/sysrq.txt
+++ b/Documentation/sysrq.txt
@@ -95,7 +95,8 @@ On all -  write a character to /proc/sysrq-trigger.  e.g.:
 
 'p'     - Will dump the current registers and flags to your console.
 
-'q'     - Will dump a list of all running timers.
+'q'     - Will dump a list of all running hrtimers.
+	  WARNING: Does not cover any other timers
 
 'r'     - Turns off keyboard raw mode and sets it to XLATE.
 
diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt
new file mode 100644
index 000000000000..125eed560e5a
--- /dev/null
+++ b/Documentation/vm/unevictable-lru.txt
@@ -0,0 +1,615 @@
+
+This document describes the Linux memory management "Unevictable LRU"
+infrastructure and the use of this infrastructure to manage several types
+of "unevictable" pages.  The document attempts to provide the overall
+rationale behind this mechanism and the rationale for some of the design
+decisions that drove the implementation.  The latter design rationale is
+discussed in the context of an implementation description.  Admittedly, one
+can obtain the implementation details--the "what does it do?"--by reading the
+code.  One hopes that the descriptions below add value by provide the answer
+to "why does it do that?".
+
+Unevictable LRU Infrastructure:
+
+The Unevictable LRU adds an additional LRU list to track unevictable pages
+and to hide these pages from vmscan.  This mechanism is based on a patch by
+Larry Woodman of Red Hat to address several scalability problems with page
+reclaim in Linux.  The problems have been observed at customer sites on large
+memory x86_64 systems.  For example, a non-numal x86_64 platform with 128GB
+of main memory will have over 32 million 4k pages in a single zone.  When a
+large fraction of these pages are not evictable for any reason [see below],
+vmscan will spend a lot of time scanning the LRU lists looking for the small
+fraction of pages that are evictable.  This can result in a situation where
+all cpus are spending 100% of their time in vmscan for hours or days on end,
+with the system completely unresponsive.
+
+The Unevictable LRU infrastructure addresses the following classes of
+unevictable pages:
+
++ page owned by ramfs
++ page mapped into SHM_LOCKed shared memory regions
++ page mapped into VM_LOCKED [mlock()ed] vmas
+
+The infrastructure might be able to handle other conditions that make pages
+unevictable, either by definition or by circumstance, in the future.
+
+
+The Unevictable LRU List
+
+The Unevictable LRU infrastructure consists of an additional, per-zone, LRU list
+called the "unevictable" list and an associated page flag, PG_unevictable, to
+indicate that the page is being managed on the unevictable list.  The
+PG_unevictable flag is analogous to, and mutually exclusive with, the PG_active
+flag in that it indicates on which LRU list a page resides when PG_lru is set.
+The unevictable LRU list is source configurable based on the UNEVICTABLE_LRU
+Kconfig option.
+
+The Unevictable LRU infrastructure maintains unevictable pages on an additional
+LRU list for a few reasons:
+
+1) We get to "treat unevictable pages just like we treat other pages in the
+   system, which means we get to use the same code to manipulate them, the
+   same code to isolate them (for migrate, etc.), the same code to keep track
+   of the statistics, etc..." [Rik van Riel]
+
+2) We want to be able to migrate unevictable pages between nodes--for memory
+   defragmentation, workload management and memory hotplug.  The linux kernel
+   can only migrate pages that it can successfully isolate from the lru lists.
+   If we were to maintain pages elsewise than on an lru-like list, where they
+   can be found by isolate_lru_page(), we would prevent their migration, unless
+   we reworked migration code to find the unevictable pages.
+
+
+The unevictable LRU list does not differentiate between file backed and swap
+backed [anon] pages.  This differentiation is only important while the pages
+are, in fact, evictable.
+
+The unevictable LRU list benefits from the "arrayification" of the per-zone
+LRU lists and statistics originally proposed and posted by Christoph Lameter.
+
+The unevictable list does not use the lru pagevec mechanism. Rather,
+unevictable pages are placed directly on the page's zone's unevictable
+list under the zone lru_lock.  The reason for this is to prevent stranding
+of pages on the unevictable list when one task has the page isolated from the
+lru and other tasks are changing the "evictability" state of the page.
+
+
+Unevictable LRU and Memory Controller Interaction
+
+The memory controller data structure automatically gets a per zone unevictable
+lru list as a result of the "arrayification" of the per-zone LRU lists.  The
+memory controller tracks the movement of pages to and from the unevictable list.
+When a memory control group comes under memory pressure, the controller will
+not attempt to reclaim pages on the unevictable list.  This has a couple of
+effects.  Because the pages are "hidden" from reclaim on the unevictable list,
+the reclaim process can be more efficient, dealing only with pages that have
+a chance of being reclaimed.  On the other hand, if too many of the pages
+charged to the control group are unevictable, the evictable portion of the
+working set of the tasks in the control group may not fit into the available
+memory.  This can cause the control group to thrash or to oom-kill tasks.
+
+
+Unevictable LRU:  Detecting Unevictable Pages
+
+The function page_evictable(page, vma) in vmscan.c determines whether a
+page is evictable or not.  For ramfs pages and pages in SHM_LOCKed regions,
+page_evictable() tests a new address space flag, AS_UNEVICTABLE, in the page's
+address space using a wrapper function.  Wrapper functions are used to set,
+clear and test the flag to reduce the requirement for #ifdef's throughout the
+source code.  AS_UNEVICTABLE is set on ramfs inode/mapping when it is created.
+This flag remains for the life of the inode.
+
+For shared memory regions, AS_UNEVICTABLE is set when an application
+successfully SHM_LOCKs the region and is removed when the region is
+SHM_UNLOCKed.  Note that shmctl(SHM_LOCK, ...) does not populate the page
+tables for the region as does, for example, mlock().   So, we make no special
+effort to push any pages in the SHM_LOCKed region to the unevictable list.
+Vmscan will do this when/if it encounters the pages during reclaim.  On
+SHM_UNLOCK, shmctl() scans the pages in the region and "rescues" them from the
+unevictable list if no other condition keeps them unevictable.  If a SHM_LOCKed
+region is destroyed, the pages are also "rescued" from the unevictable list in
+the process of freeing them.
+
+page_evictable() detects mlock()ed pages by testing an additional page flag,
+PG_mlocked via the PageMlocked() wrapper.  If the page is NOT mlocked, and a
+non-NULL vma is supplied, page_evictable() will check whether the vma is
+VM_LOCKED via is_mlocked_vma().  is_mlocked_vma() will SetPageMlocked() and
+update the appropriate statistics if the vma is VM_LOCKED.  This method allows
+efficient "culling" of pages in the fault path that are being faulted in to
+VM_LOCKED vmas.
+
+
+Unevictable Pages and Vmscan [shrink_*_list()]
+
+If unevictable pages are culled in the fault path, or moved to the unevictable
+list at mlock() or mmap() time, vmscan will never encounter the pages until
+they have become evictable again, for example, via munlock() and have been
+"rescued" from the unevictable list.  However, there may be situations where we
+decide, for the sake of expediency, to leave a unevictable page on one of the
+regular active/inactive LRU lists for vmscan to deal with.  Vmscan checks for
+such pages in all of the shrink_{active|inactive|page}_list() functions and
+will "cull" such pages that it encounters--that is, it diverts those pages to
+the unevictable list for the zone being scanned.
+
+There may be situations where a page is mapped into a VM_LOCKED vma, but the
+page is not marked as PageMlocked.  Such pages will make it all the way to
+shrink_page_list() where they will be detected when vmscan walks the reverse
+map in try_to_unmap().  If try_to_unmap() returns SWAP_MLOCK, shrink_page_list()
+will cull the page at that point.
+
+Note that for anonymous pages, shrink_page_list() attempts to add the page to
+the swap cache before it tries to unmap the page.  To avoid this unnecessary
+consumption of swap space, shrink_page_list() calls try_to_munlock() to check
+whether any VM_LOCKED vmas map the page without attempting to unmap the page.
+If try_to_munlock() returns SWAP_MLOCK, shrink_page_list() will cull the page
+without consuming swap space.  try_to_munlock() will be described below.
+
+To "cull" an unevictable page, vmscan simply puts the page back on the lru
+list using putback_lru_page()--the inverse operation to isolate_lru_page()--
+after dropping the page lock.  Because the condition which makes the page
+unevictable may change once the page is unlocked, putback_lru_page() will
+recheck the unevictable state of a page that it places on the unevictable lru
+list.  If the page has become unevictable, putback_lru_page() removes it from
+the list and retries, including the page_unevictable() test.  Because such a
+race is a rare event and movement of pages onto the unevictable list should be
+rare, these extra evictabilty checks should not occur in the majority of calls
+to putback_lru_page().
+
+
+Mlocked Page:  Prior Work
+
+The "Unevictable Mlocked Pages" infrastructure is based on work originally
+posted by Nick Piggin in an RFC patch entitled "mm: mlocked pages off LRU".
+Nick posted his patch as an alternative to a patch posted by Christoph
+Lameter to achieve the same objective--hiding mlocked pages from vmscan.
+In Nick's patch, he used one of the struct page lru list link fields as a count
+of VM_LOCKED vmas that map the page.  This use of the link field for a count
+prevented the management of the pages on an LRU list.  Thus, mlocked pages were
+not migratable as isolate_lru_page() could not find them and the lru list link
+field was not available to the migration subsystem.  Nick resolved this by
+putting mlocked pages back on the lru list before attempting to isolate them,
+thus abandoning the count of VM_LOCKED vmas.  When Nick's patch was integrated
+with the Unevictable LRU work, the count was replaced by walking the reverse
+map to determine whether any VM_LOCKED vmas mapped the page.  More on this
+below.
+
+
+Mlocked Pages:  Basic Management
+
+Mlocked pages--pages mapped into a VM_LOCKED vma--represent one class of
+unevictable pages.  When such a page has been "noticed" by the memory
+management subsystem, the page is marked with the PG_mlocked [PageMlocked()]
+flag.  A PageMlocked() page will be placed on the unevictable LRU list when
+it is added to the LRU.   Pages can be "noticed" by memory management in
+several places:
+
+1) in the mlock()/mlockall() system call handlers.
+2) in the mmap() system call handler when mmap()ing a region with the
+   MAP_LOCKED flag, or mmap()ing a region in a task that has called
+   mlockall() with the MCL_FUTURE flag.  Both of these conditions result
+   in the VM_LOCKED flag being set for the vma.
+3) in the fault path, if mlocked pages are "culled" in the fault path,
+   and when a VM_LOCKED stack segment is expanded.
+4) as mentioned above, in vmscan:shrink_page_list() with attempting to
+   reclaim a page in a VM_LOCKED vma--via try_to_unmap() or try_to_munlock().
+
+Mlocked pages become unlocked and rescued from the unevictable list when:
+
+1) mapped in a range unlocked via the munlock()/munlockall() system calls.
+2) munmapped() out of the last VM_LOCKED vma that maps the page, including
+   unmapping at task exit.
+3) when the page is truncated from the last VM_LOCKED vma of an mmap()ed file.
+4) before a page is COWed in a VM_LOCKED vma.
+
+
+Mlocked Pages:  mlock()/mlockall() System Call Handling
+
+Both [do_]mlock() and [do_]mlockall() system call handlers call mlock_fixup()
+for each vma in the range specified by the call.  In the case of mlockall(),
+this is the entire active address space of the task.  Note that mlock_fixup()
+is used for both mlock()ing and munlock()ing a range of memory.  A call to
+mlock() an already VM_LOCKED vma, or to munlock() a vma that is not VM_LOCKED
+is treated as a no-op--mlock_fixup() simply returns.
+
+If the vma passes some filtering described in "Mlocked Pages:  Filtering Vmas"
+below, mlock_fixup() will attempt to merge the vma with its neighbors or split
+off a subset of the vma if the range does not cover the entire vma.  Once the
+vma has been merged or split or neither, mlock_fixup() will call
+__mlock_vma_pages_range() to fault in the pages via get_user_pages() and
+to mark the pages as mlocked via mlock_vma_page().
+
+Note that the vma being mlocked might be mapped with PROT_NONE.  In this case,
+get_user_pages() will be unable to fault in the pages.  That's OK.  If pages
+do end up getting faulted into this VM_LOCKED vma, we'll handle them in the
+fault path or in vmscan.
+
+Also note that a page returned by get_user_pages() could be truncated or
+migrated out from under us, while we're trying to mlock it.  To detect
+this, __mlock_vma_pages_range() tests the page_mapping after acquiring
+the page lock.  If the page is still associated with its mapping, we'll
+go ahead and call mlock_vma_page().  If the mapping is gone, we just
+unlock the page and move on.  Worse case, this results in page mapped
+in a VM_LOCKED vma remaining on a normal LRU list without being
+PageMlocked().  Again, vmscan will detect and cull such pages.
+
+mlock_vma_page(), called with the page locked [N.B., not "mlocked"], will
+TestSetPageMlocked() for each page returned by get_user_pages().  We use
+TestSetPageMlocked() because the page might already be mlocked by another
+task/vma and we don't want to do extra work.  We especially do not want to
+count an mlocked page more than once in the statistics.  If the page was
+already mlocked, mlock_vma_page() is done.
+
+If the page was NOT already mlocked, mlock_vma_page() attempts to isolate the
+page from the LRU, as it is likely on the appropriate active or inactive list
+at that time.  If the isolate_lru_page() succeeds, mlock_vma_page() will
+putback the page--putback_lru_page()--which will notice that the page is now
+mlocked and divert the page to the zone's unevictable LRU list.  If
+mlock_vma_page() is unable to isolate the page from the LRU, vmscan will handle
+it later if/when it attempts to reclaim the page.
+
+
+Mlocked Pages:  Filtering Special Vmas
+
+mlock_fixup() filters several classes of "special" vmas:
+
+1) vmas with VM_IO|VM_PFNMAP set are skipped entirely.  The pages behind
+   these mappings are inherently pinned, so we don't need to mark them as
+   mlocked.  In any case, most of the pages have no struct page in which to
+   so mark the page.  Because of this, get_user_pages() will fail for these
+   vmas, so there is no sense in attempting to visit them.
+
+2) vmas mapping hugetlbfs page are already effectively pinned into memory.
+   We don't need nor want to mlock() these pages.  However, to preserve the
+   prior behavior of mlock()--before the unevictable/mlock changes--mlock_fixup()
+   will call make_pages_present() in the hugetlbfs vma range to allocate the
+   huge pages and populate the ptes.
+
+3) vmas with VM_DONTEXPAND|VM_RESERVED are generally user space mappings of
+   kernel pages, such as the vdso page, relay channel pages, etc.  These pages
+   are inherently unevictable and are not managed on the LRU lists.
+   mlock_fixup() treats these vmas the same as hugetlbfs vmas.  It calls
+   make_pages_present() to populate the ptes.
+
+Note that for all of these special vmas, mlock_fixup() does not set the
+VM_LOCKED flag.  Therefore, we won't have to deal with them later during
+munlock() or munmap()--for example, at task exit.  Neither does mlock_fixup()
+account these vmas against the task's "locked_vm".
+
+Mlocked Pages:  Downgrading the Mmap Semaphore.
+
+mlock_fixup() must be called with the mmap semaphore held for write, because
+it may have to merge or split vmas.  However, mlocking a large region of
+memory can take a long time--especially if vmscan must reclaim pages to
+satisfy the regions requirements.  Faulting in a large region with the mmap
+semaphore held for write can hold off other faults on the address space, in
+the case of a multi-threaded task.  It can also hold off scans of the task's
+address space via /proc.  While testing under heavy load, it was observed that
+the ps(1) command could be held off for many minutes while a large segment was
+mlock()ed down.
+
+To address this issue, and to make the system more responsive during mlock()ing
+of large segments, mlock_fixup() downgrades the mmap semaphore to read mode
+during the call to __mlock_vma_pages_range().  This works fine.  However, the
+callers of mlock_fixup() expect the semaphore to be returned in write mode.
+So, mlock_fixup() "upgrades" the semphore to write mode.  Linux does not
+support an atomic upgrade_sem() call, so mlock_fixup() must drop the semaphore
+and reacquire it in write mode.  In a multi-threaded task, it is possible for
+the task memory map to change while the semaphore is dropped.  Therefore,
+mlock_fixup() looks up the vma at the range start address after reacquiring
+the semaphore in write mode and verifies that it still covers the original
+range.  If not, mlock_fixup() returns an error [-EAGAIN].  All callers of
+mlock_fixup() have been changed to deal with this new error condition.
+
+Note:  when munlocking a region, all of the pages should already be resident--
+unless we have racing threads mlocking() and munlocking() regions.  So,
+unlocking should not have to wait for page allocations nor faults  of any kind.
+Therefore mlock_fixup() does not downgrade the semaphore for munlock().
+
+
+Mlocked Pages:  munlock()/munlockall() System Call Handling
+
+The munlock() and munlockall() system calls are handled by the same functions--
+do_mlock[all]()--as the mlock() and mlockall() system calls with the unlock
+vs lock operation indicated by an argument.  So, these system calls are also
+handled by mlock_fixup().  Again, if called for an already munlock()ed vma,
+mlock_fixup() simply returns.  Because of the vma filtering discussed above,
+VM_LOCKED will not be set in any "special" vmas.  So, these vmas will be
+ignored for munlock.
+
+If the vma is VM_LOCKED, mlock_fixup() again attempts to merge or split off
+the specified range.  The range is then munlocked via the function
+__mlock_vma_pages_range()--the same function used to mlock a vma range--
+passing a flag to indicate that munlock() is being performed.
+
+Because the vma access protections could have been changed to PROT_NONE after
+faulting in and mlocking some pages, get_user_pages() was unreliable for visiting
+these pages for munlocking.  Because we don't want to leave pages mlocked(),
+get_user_pages() was enhanced to accept a flag to ignore the permissions when
+fetching the pages--all of which should be resident as a result of previous
+mlock()ing.
+
+For munlock(), __mlock_vma_pages_range() unlocks individual pages by calling
+munlock_vma_page().  munlock_vma_page() unconditionally clears the PG_mlocked
+flag using TestClearPageMlocked().  As with mlock_vma_page(), munlock_vma_page()
+use the Test*PageMlocked() function to handle the case where the page might
+have already been unlocked by another task.  If the page was mlocked,
+munlock_vma_page() updates that zone statistics for the number of mlocked
+pages.  Note, however, that at this point we haven't checked whether the page
+is mapped by other VM_LOCKED vmas.
+
+We can't call try_to_munlock(), the function that walks the reverse map to check
+for other VM_LOCKED vmas, without first isolating the page from the LRU.
+try_to_munlock() is a variant of try_to_unmap() and thus requires that the page
+not be on an lru list.  [More on these below.]  However, the call to
+isolate_lru_page() could fail, in which case we couldn't try_to_munlock().
+So, we go ahead and clear PG_mlocked up front, as this might be the only chance
+we have.  If we can successfully isolate the page, we go ahead and
+try_to_munlock(), which will restore the PG_mlocked flag and update the zone
+page statistics if it finds another vma holding the page mlocked.  If we fail
+to isolate the page, we'll have left a potentially mlocked page on the LRU.
+This is fine, because we'll catch it later when/if vmscan tries to reclaim the
+page.  This should be relatively rare.
+
+Mlocked Pages:  Migrating Them...
+
+A page that is being migrated has been isolated from the lru lists and is
+held locked across unmapping of the page, updating the page's mapping
+[address_space] entry and copying the contents and state, until the
+page table entry has been replaced with an entry that refers to the new
+page.  Linux supports migration of mlocked pages and other unevictable
+pages.  This involves simply moving the PageMlocked and PageUnevictable states
+from the old page to the new page.
+
+Note that page migration can race with mlocking or munlocking of the same
+page.  This has been discussed from the mlock/munlock perspective in the
+respective sections above.  Both processes [migration, m[un]locking], hold
+the page locked.  This provides the first level of synchronization.  Page
+migration zeros out the page_mapping of the old page before unlocking it,
+so m[un]lock can skip these pages by testing the page mapping under page
+lock.
+
+When completing page migration, we place the new and old pages back onto the
+lru after dropping the page lock.  The "unneeded" page--old page on success,
+new page on failure--will be freed when the reference count held by the
+migration process is released.  To ensure that we don't strand pages on the
+unevictable list because of a race between munlock and migration, page
+migration uses the putback_lru_page() function to add migrated pages back to
+the lru.
+
+
+Mlocked Pages:  mmap(MAP_LOCKED) System Call Handling
+
+In addition the the mlock()/mlockall() system calls, an application can request
+that a region of memory be mlocked using the MAP_LOCKED flag with the mmap()
+call.  Furthermore, any mmap() call or brk() call that expands the heap by a
+task that has previously called mlockall() with the MCL_FUTURE flag will result
+in the newly mapped memory being mlocked.  Before the unevictable/mlock changes,
+the kernel simply called make_pages_present() to allocate pages and populate
+the page table.
+
+To mlock a range of memory under the unevictable/mlock infrastructure, the
+mmap() handler and task address space expansion functions call
+mlock_vma_pages_range() specifying the vma and the address range to mlock.
+mlock_vma_pages_range() filters vmas like mlock_fixup(), as described above in
+"Mlocked Pages:  Filtering Vmas".  It will clear the VM_LOCKED flag, which will
+have already been set by the caller, in filtered vmas.  Thus these vma's need
+not be visited for munlock when the region is unmapped.
+
+For "normal" vmas, mlock_vma_pages_range() calls __mlock_vma_pages_range() to
+fault/allocate the pages and mlock them.  Again, like mlock_fixup(),
+mlock_vma_pages_range() downgrades the mmap semaphore to read mode before
+attempting to fault/allocate and mlock the pages; and "upgrades" the semaphore
+back to write mode before returning.
+
+The callers of mlock_vma_pages_range() will have already added the memory
+range to be mlocked to the task's "locked_vm".  To account for filtered vmas,
+mlock_vma_pages_range() returns the number of pages NOT mlocked.  All of the
+callers then subtract a non-negative return value from the task's locked_vm.
+A negative return value represent an error--for example, from get_user_pages()
+attempting to fault in a vma with PROT_NONE access.  In this case, we leave
+the memory range accounted as locked_vm, as the protections could be changed
+later and pages allocated into that region.
+
+
+Mlocked Pages:  munmap()/exit()/exec() System Call Handling
+
+When unmapping an mlocked region of memory, whether by an explicit call to
+munmap() or via an internal unmap from exit() or exec() processing, we must
+munlock the pages if we're removing the last VM_LOCKED vma that maps the pages.
+Before the unevictable/mlock changes, mlocking did not mark the pages in any way,
+so unmapping them required no processing.
+
+To munlock a range of memory under the unevictable/mlock infrastructure, the
+munmap() hander and task address space tear down function call
+munlock_vma_pages_all().  The name reflects the observation that one always
+specifies the entire vma range when munlock()ing during unmap of a region.
+Because of the vma filtering when mlocking() regions, only "normal" vmas that
+actually contain mlocked pages will be passed to munlock_vma_pages_all().
+
+munlock_vma_pages_all() clears the VM_LOCKED vma flag and, like mlock_fixup()
+for the munlock case, calls __munlock_vma_pages_range() to walk the page table
+for the vma's memory range and munlock_vma_page() each resident page mapped by
+the vma.  This effectively munlocks the page, only if this is the last
+VM_LOCKED vma that maps the page.
+
+
+Mlocked Page:  try_to_unmap()
+
+[Note:  the code changes represented by this section are really quite small
+compared to the text to describe what happening and why, and to discuss the
+implications.]
+
+Pages can, of course, be mapped into multiple vmas.  Some of these vmas may
+have VM_LOCKED flag set.  It is possible for a page mapped into one or more
+VM_LOCKED vmas not to have the PG_mlocked flag set and therefore reside on one
+of the active or inactive LRU lists.  This could happen if, for example, a
+task in the process of munlock()ing the page could not isolate the page from
+the LRU.  As a result, vmscan/shrink_page_list() might encounter such a page
+as described in "Unevictable Pages and Vmscan [shrink_*_list()]".  To
+handle this situation, try_to_unmap() has been enhanced to check for VM_LOCKED
+vmas while it is walking a page's reverse map.
+
+try_to_unmap() is always called, by either vmscan for reclaim or for page
+migration, with the argument page locked and isolated from the LRU.  BUG_ON()
+assertions enforce this requirement.  Separate functions handle anonymous and
+mapped file pages, as these types of pages have different reverse map
+mechanisms.
+
+	try_to_unmap_anon()
+
+To unmap anonymous pages, each vma in the list anchored in the anon_vma must be
+visited--at least until a VM_LOCKED vma is encountered.  If the page is being
+unmapped for migration, VM_LOCKED vmas do not stop the process because mlocked
+pages are migratable.  However, for reclaim, if the page is mapped into a
+VM_LOCKED vma, the scan stops.  try_to_unmap() attempts to acquire the mmap
+semphore of the mm_struct to which the vma belongs in read mode.  If this is
+successful, try_to_unmap() will mlock the page via mlock_vma_page()--we
+wouldn't have gotten to try_to_unmap() if the page were already mlocked--and
+will return SWAP_MLOCK, indicating that the page is unevictable.  If the
+mmap semaphore cannot be acquired, we are not sure whether the page is really
+unevictable or not.  In this case, try_to_unmap() will return SWAP_AGAIN.
+
+	try_to_unmap_file() -- linear mappings
+
+Unmapping of a mapped file page works the same, except that the scan visits
+all vmas that maps the page's index/page offset in the page's mapping's
+reverse map priority search tree.  It must also visit each vma in the page's
+mapping's non-linear list, if the list is non-empty.  As for anonymous pages,
+on encountering a VM_LOCKED vma for a mapped file page, try_to_unmap() will
+attempt to acquire the associated mm_struct's mmap semaphore to mlock the page,
+returning SWAP_MLOCK if this is successful, and SWAP_AGAIN, if not.
+
+	try_to_unmap_file() -- non-linear mappings
+
+If a page's mapping contains a non-empty non-linear mapping vma list, then
+try_to_un{map|lock}() must also visit each vma in that list to determine
+whether the page is mapped in a VM_LOCKED vma.  Again, the scan must visit
+all vmas in the non-linear list to ensure that the pages is not/should not be
+mlocked.  If a VM_LOCKED vma is found in the list, the scan could terminate.
+However, there is no easy way to determine whether the page is actually mapped
+in a given vma--either for unmapping or testing whether the VM_LOCKED vma
+actually pins the page.
+
+So, try_to_unmap_file() handles non-linear mappings by scanning a certain
+number of pages--a "cluster"--in each non-linear vma associated with the page's
+mapping, for each file mapped page that vmscan tries to unmap.  If this happens
+to unmap the page we're trying to unmap, try_to_unmap() will notice this on
+return--(page_mapcount(page) == 0)--and return SWAP_SUCCESS.  Otherwise, it
+will return SWAP_AGAIN, causing vmscan to recirculate this page.  We take
+advantage of the cluster scan in try_to_unmap_cluster() as follows:
+
+For each non-linear vma, try_to_unmap_cluster() attempts to acquire the mmap
+semaphore of the associated mm_struct for read without blocking.  If this
+attempt is successful and the vma is VM_LOCKED, try_to_unmap_cluster() will
+retain the mmap semaphore for the scan; otherwise it drops it here.  Then,
+for each page in the cluster, if we're holding the mmap semaphore for a locked
+vma, try_to_unmap_cluster() calls mlock_vma_page() to mlock the page.  This
+call is a no-op if the page is already locked, but will mlock any pages in
+the non-linear mapping that happen to be unlocked.  If one of the pages so
+mlocked is the page passed in to try_to_unmap(), try_to_unmap_cluster() will
+return SWAP_MLOCK, rather than the default SWAP_AGAIN.  This will allow vmscan
+to cull the page, rather than recirculating it on the inactive list.  Again,
+if try_to_unmap_cluster() cannot acquire the vma's mmap sem, it returns
+SWAP_AGAIN, indicating that the page is mapped by a VM_LOCKED vma, but
+couldn't be mlocked.
+
+
+Mlocked pages:  try_to_munlock() Reverse Map Scan
+
+TODO/FIXME:  a better name might be page_mlocked()--analogous to the
+page_referenced() reverse map walker--especially if we continue to call this
+from shrink_page_list().  See related TODO/FIXME below.
+
+When munlock_vma_page()--see "Mlocked Pages:  munlock()/munlockall() System
+Call Handling" above--tries to munlock a page, or when shrink_page_list()
+encounters an anonymous page that is not yet in the swap cache, they need to
+determine whether or not the page is mapped by any VM_LOCKED vma, without
+actually attempting to unmap all ptes from the page.  For this purpose, the
+unevictable/mlock infrastructure introduced a variant of try_to_unmap() called
+try_to_munlock().
+
+try_to_munlock() calls the same functions as try_to_unmap() for anonymous and
+mapped file pages with an additional argument specifing unlock versus unmap
+processing.  Again, these functions walk the respective reverse maps looking
+for VM_LOCKED vmas.  When such a vma is found for anonymous pages and file
+pages mapped in linear VMAs, as in the try_to_unmap() case, the functions
+attempt to acquire the associated mmap semphore, mlock the page via
+mlock_vma_page() and return SWAP_MLOCK.  This effectively undoes the
+pre-clearing of the page's PG_mlocked done by munlock_vma_page() and informs
+shrink_page_list() that the anonymous page should be culled rather than added
+to the swap cache in preparation for a try_to_unmap() that will almost
+certainly fail.
+
+If try_to_unmap() is unable to acquire a VM_LOCKED vma's associated mmap
+semaphore, it will return SWAP_AGAIN.  This will allow shrink_page_list()
+to recycle the page on the inactive list and hope that it has better luck
+with the page next time.
+
+For file pages mapped into non-linear vmas, the try_to_munlock() logic works
+slightly differently.  On encountering a VM_LOCKED non-linear vma that might
+map the page, try_to_munlock() returns SWAP_AGAIN without actually mlocking
+the page.  munlock_vma_page() will just leave the page unlocked and let
+vmscan deal with it--the usual fallback position.
+
+Note that try_to_munlock()'s reverse map walk must visit every vma in a pages'
+reverse map to determine that a page is NOT mapped into any VM_LOCKED vma.
+However, the scan can terminate when it encounters a VM_LOCKED vma and can
+successfully acquire the vma's mmap semphore for read and mlock the page.
+Although try_to_munlock() can be called many [very many!] times when
+munlock()ing a large region or tearing down a large address space that has been
+mlocked via mlockall(), overall this is a fairly rare event.  In addition,
+although shrink_page_list() calls try_to_munlock() for every anonymous page that
+it handles that is not yet in the swap cache, on average anonymous pages will
+have very short reverse map lists.
+
+Mlocked Page:  Page Reclaim in shrink_*_list()
+
+shrink_active_list() culls any obviously unevictable pages--i.e.,
+!page_evictable(page, NULL)--diverting these to the unevictable lru
+list.  However, shrink_active_list() only sees unevictable pages that
+made it onto the active/inactive lru lists.  Note that these pages do not
+have PageUnevictable set--otherwise, they would be on the unevictable list and
+shrink_active_list would never see them.
+
+Some examples of these unevictable pages on the LRU lists are:
+
+1) ramfs pages that have been placed on the lru lists when first allocated.
+
+2) SHM_LOCKed shared memory pages.  shmctl(SHM_LOCK) does not attempt to
+   allocate or fault in the pages in the shared memory region.  This happens
+   when an application accesses the page the first time after SHM_LOCKing
+   the segment.
+
+3) Mlocked pages that could not be isolated from the lru and moved to the
+   unevictable list in mlock_vma_page().
+
+3) Pages mapped into multiple VM_LOCKED vmas, but try_to_munlock() couldn't
+   acquire the vma's mmap semaphore to test the flags and set PageMlocked.
+   munlock_vma_page() was forced to let the page back on to the normal
+   LRU list for vmscan to handle.
+
+shrink_inactive_list() also culls any unevictable pages that it finds
+on the inactive lists, again diverting them to the appropriate zone's unevictable
+lru list.  shrink_inactive_list() should only see SHM_LOCKed pages that became
+SHM_LOCKed after shrink_active_list() had moved them to the inactive list, or
+pages mapped into VM_LOCKED vmas that munlock_vma_page() couldn't isolate from
+the lru to recheck via try_to_munlock().  shrink_inactive_list() won't notice
+the latter, but will pass on to shrink_page_list().
+
+shrink_page_list() again culls obviously unevictable pages that it could
+encounter for similar reason to shrink_inactive_list().  As already discussed,
+shrink_page_list() proactively looks for anonymous pages that should have
+PG_mlocked set but don't--these would not be detected by page_evictable()--to
+avoid adding them to the swap cache unnecessarily.  File pages mapped into
+VM_LOCKED vmas but without PG_mlocked set will make it all the way to
+try_to_unmap().  shrink_page_list() will divert them to the unevictable list when
+try_to_unmap() returns SWAP_MLOCK, as discussed above.
+
+TODO/FIXME:  If we can enhance the swap cache to reliably remove entries
+with page_count(page) > 2, as long as all ptes are mapped to the page and
+not the swap entry, we can probably remove the call to try_to_munlock() in
+shrink_page_list() and just remove the page from the swap cache when
+try_to_unmap() returns SWAP_MLOCK.   Currently, remove_exclusive_swap_page()
+doesn't seem to allow that.
+
+
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index a0f642b6a4b9..6110197757a3 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -70,6 +70,7 @@ config AUTO_IRQ_AFFINITY
 	default y
 
 source "init/Kconfig"
+source "kernel/Kconfig.freezer"
 
 
 menu "System setup"
diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h
index 15fda4344424..d069526bd767 100644
--- a/arch/alpha/include/asm/thread_info.h
+++ b/arch/alpha/include/asm/thread_info.h
@@ -74,12 +74,14 @@ register struct thread_info *__current_thread_info __asm__("$8");
 #define TIF_UAC_SIGBUS		7
 #define TIF_MEMDIE		8
 #define TIF_RESTORE_SIGMASK	9	/* restore signal mask in do_signal */
+#define TIF_FREEZE		16	/* is freezing for suspend */
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_SIGPENDING		(1<<TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED	(1<<TIF_NEED_RESCHED)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 #define _TIF_RESTORE_SIGMASK	(1<<TIF_RESTORE_SIGMASK)
+#define _TIF_FREEZE		(1<<TIF_FREEZE)
 
 /* Work to do on interrupt/exception return.  */
 #define _TIF_WORK_MASK		(_TIF_SIGPENDING | _TIF_NEED_RESCHED)
diff --git a/arch/alpha/kernel/core_marvel.c b/arch/alpha/kernel/core_marvel.c
index 04dcc5e5d4c1..9cd8dca742a7 100644
--- a/arch/alpha/kernel/core_marvel.c
+++ b/arch/alpha/kernel/core_marvel.c
@@ -655,7 +655,7 @@ __marvel_rtc_io(u8 b, unsigned long addr, int write)
 
 	case 0x71:					/* RTC_PORT(1) */
 		rtc_access.index = index;
-		rtc_access.data = BCD_TO_BIN(b);
+		rtc_access.data = bcd2bin(b);
 		rtc_access.function = 0x48 + !write;	/* GET/PUT_TOY */
 
 #ifdef CONFIG_SMP
@@ -668,7 +668,7 @@ __marvel_rtc_io(u8 b, unsigned long addr, int write)
 #else
 		__marvel_access_rtc(&rtc_access);
 #endif
-		ret = BIN_TO_BCD(rtc_access.data);
+		ret = bin2bcd(rtc_access.data);
 		break;
 
 	default:
diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c
index 75480cab0893..e6a231435cba 100644
--- a/arch/alpha/kernel/time.c
+++ b/arch/alpha/kernel/time.c
@@ -346,12 +346,12 @@ time_init(void)
 	year = CMOS_READ(RTC_YEAR);
 
 	if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
-		BCD_TO_BIN(sec);
-		BCD_TO_BIN(min);
-		BCD_TO_BIN(hour);
-		BCD_TO_BIN(day);
-		BCD_TO_BIN(mon);
-		BCD_TO_BIN(year);
+		sec = bcd2bin(sec);
+		min = bcd2bin(min);
+		hour = bcd2bin(hour);
+		day = bcd2bin(day);
+		mon = bcd2bin(mon);
+		year = bcd2bin(year);
 	}
 
 	/* PC-like is standard; used for year >= 70 */
@@ -525,7 +525,7 @@ set_rtc_mmss(unsigned long nowtime)
 
 	cmos_minutes = CMOS_READ(RTC_MINUTES);
 	if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
-		BCD_TO_BIN(cmos_minutes);
+		cmos_minutes = bcd2bin(cmos_minutes);
 
 	/*
 	 * since we're only adjusting minutes and seconds,
@@ -543,8 +543,8 @@ set_rtc_mmss(unsigned long nowtime)
 
 	if (abs(real_minutes - cmos_minutes) < 30) {
 		if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
-			BIN_TO_BCD(real_seconds);
-			BIN_TO_BCD(real_minutes);
+			real_seconds = bin2bcd(real_seconds);
+			real_minutes = bin2bcd(real_minutes);
 		}
 		CMOS_WRITE(real_seconds,RTC_SECONDS);
 		CMOS_WRITE(real_minutes,RTC_MINUTES);
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 4853f9df37bd..df39d20f7425 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -192,6 +192,8 @@ config VECTORS_BASE
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 menu "System Type"
 
 choice
diff --git a/arch/arm/mach-pxa/include/mach/pxa3xx_nand.h b/arch/arm/mach-pxa/include/mach/pxa3xx_nand.h
index eb4b190b6657..eb35fca9aea5 100644
--- a/arch/arm/mach-pxa/include/mach/pxa3xx_nand.h
+++ b/arch/arm/mach-pxa/include/mach/pxa3xx_nand.h
@@ -4,6 +4,43 @@
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/partitions.h>
 
+struct pxa3xx_nand_timing {
+	unsigned int	tCH;  /* Enable signal hold time */
+	unsigned int	tCS;  /* Enable signal setup time */
+	unsigned int	tWH;  /* ND_nWE high duration */
+	unsigned int	tWP;  /* ND_nWE pulse time */
+	unsigned int	tRH;  /* ND_nRE high duration */
+	unsigned int	tRP;  /* ND_nRE pulse width */
+	unsigned int	tR;   /* ND_nWE high to ND_nRE low for read */
+	unsigned int	tWHR; /* ND_nWE high to ND_nRE low for status read */
+	unsigned int	tAR;  /* ND_ALE low to ND_nRE low delay */
+};
+
+struct pxa3xx_nand_cmdset {
+	uint16_t	read1;
+	uint16_t	read2;
+	uint16_t	program;
+	uint16_t	read_status;
+	uint16_t	read_id;
+	uint16_t	erase;
+	uint16_t	reset;
+	uint16_t	lock;
+	uint16_t	unlock;
+	uint16_t	lock_status;
+};
+
+struct pxa3xx_nand_flash {
+	const struct pxa3xx_nand_timing *timing; /* NAND Flash timing */
+	const struct pxa3xx_nand_cmdset *cmdset;
+
+	uint32_t page_per_block;/* Pages per block (PG_PER_BLK) */
+	uint32_t page_size;	/* Page size in bytes (PAGE_SZ) */
+	uint32_t flash_width;	/* Width of Flash memory (DWIDTH_M) */
+	uint32_t dfc_width;	/* Width of flash controller(DWIDTH_C) */
+	uint32_t num_blocks;	/* Number of physical blocks in Flash */
+	uint32_t chip_id;
+};
+
 struct pxa3xx_nand_platform_data {
 
 	/* the data flash bus is shared between the Static Memory
@@ -12,8 +49,11 @@ struct pxa3xx_nand_platform_data {
 	 */
 	int	enable_arbiter;
 
-	struct mtd_partition *parts;
-	unsigned int	nr_parts;
+	const struct mtd_partition		*parts;
+	unsigned int				nr_parts;
+
+	const struct pxa3xx_nand_flash * 	flash;
+	size_t					num_flash;
 };
 
 extern void pxa3xx_set_nand_info(struct pxa3xx_nand_platform_data *info);
diff --git a/arch/arm/plat-mxc/include/mach/mxc_nand.h b/arch/arm/plat-mxc/include/mach/mxc_nand.h
new file mode 100644
index 000000000000..2b972df22d12
--- /dev/null
+++ b/arch/arm/plat-mxc/include/mach/mxc_nand.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright 2004-2007 Freescale Semiconductor, Inc. All Rights Reserved.
+ * Copyright 2008 Sascha Hauer, kernel@pengutronix.de
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+#ifndef __ASM_ARCH_NAND_H
+#define __ASM_ARCH_NAND_H
+
+struct mxc_nand_platform_data {
+	int width;	/* data bus width in bytes */
+	int hw_ecc;	/* 0 if supress hardware ECC */
+};
+#endif /* __ASM_ARCH_NAND_H */
diff --git a/arch/arm/plat-omap/include/mach/onenand.h b/arch/arm/plat-omap/include/mach/onenand.h
index d57f20226b28..4649d302c263 100644
--- a/arch/arm/plat-omap/include/mach/onenand.h
+++ b/arch/arm/plat-omap/include/mach/onenand.h
@@ -16,6 +16,10 @@ struct omap_onenand_platform_data {
 	int			gpio_irq;
 	struct mtd_partition	*parts;
 	int			nr_parts;
-	int                     (*onenand_setup)(void __iomem *);
+	int                     (*onenand_setup)(void __iomem *, int freq);
 	int			dma_channel;
 };
+
+int omap2_onenand_rephase(void);
+
+#define ONENAND_MAX_PARTITIONS 8
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index 7c239a916275..33a5b2969eb4 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -72,6 +72,8 @@ config GENERIC_BUG
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 menu "System Type and features"
 
 source "kernel/time/Kconfig"
diff --git a/arch/avr32/include/asm/thread_info.h b/arch/avr32/include/asm/thread_info.h
index 294b25f9323d..4442f8d2d423 100644
--- a/arch/avr32/include/asm/thread_info.h
+++ b/arch/avr32/include/asm/thread_info.h
@@ -96,6 +96,7 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_MEMDIE		(1 << TIF_MEMDIE)
 #define _TIF_RESTORE_SIGMASK	(1 << TIF_RESTORE_SIGMASK)
 #define _TIF_CPU_GOING_TO_SLEEP (1 << TIF_CPU_GOING_TO_SLEEP)
+#define _TIF_FREEZE		(1 << TIF_FREEZE)
 
 /* Note: The masks below must never span more than 16 bits! */
 
diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig
index 8102c79aaa94..29e71ed6b8a7 100644
--- a/arch/blackfin/Kconfig
+++ b/arch/blackfin/Kconfig
@@ -64,8 +64,11 @@ config HARDWARE_PM
 	depends on OPROFILE
 
 source "init/Kconfig"
+
 source "kernel/Kconfig.preempt"
 
+source "kernel/Kconfig.freezer"
+
 menu "Blackfin Processor Options"
 
 comment "Processor and Board Settings"
diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig
index 9389d38f222f..07335e719bf8 100644
--- a/arch/cris/Kconfig
+++ b/arch/cris/Kconfig
@@ -62,6 +62,8 @@ config HZ
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 menu "General setup"
 
 source "fs/Kconfig.binfmt"
diff --git a/arch/cris/arch-v10/drivers/ds1302.c b/arch/cris/arch-v10/drivers/ds1302.c
index c9aa3904be05..3bdfaf43390c 100644
--- a/arch/cris/arch-v10/drivers/ds1302.c
+++ b/arch/cris/arch-v10/drivers/ds1302.c
@@ -215,12 +215,12 @@ get_rtc_time(struct rtc_time *rtc_tm)
 
 	local_irq_restore(flags);
 	
-	BCD_TO_BIN(rtc_tm->tm_sec);
-	BCD_TO_BIN(rtc_tm->tm_min);
-	BCD_TO_BIN(rtc_tm->tm_hour);
-	BCD_TO_BIN(rtc_tm->tm_mday);
-	BCD_TO_BIN(rtc_tm->tm_mon);
-	BCD_TO_BIN(rtc_tm->tm_year);
+	rtc_tm->tm_sec = bcd2bin(rtc_tm->tm_sec);
+	rtc_tm->tm_min = bcd2bin(rtc_tm->tm_min);
+	rtc_tm->tm_hour = bcd2bin(rtc_tm->tm_hour);
+	rtc_tm->tm_mday = bcd2bin(rtc_tm->tm_mday);
+	rtc_tm->tm_mon = bcd2bin(rtc_tm->tm_mon);
+	rtc_tm->tm_year = bcd2bin(rtc_tm->tm_year);
 
 	/*
 	 * Account for differences between how the RTC uses the values
@@ -295,12 +295,12 @@ rtc_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
 			else
 				yrs -= 1900;	/* RTC (70, 71, ... 99) */
 
-			BIN_TO_BCD(sec);
-			BIN_TO_BCD(min);
-			BIN_TO_BCD(hrs);
-			BIN_TO_BCD(day);
-			BIN_TO_BCD(mon);
-			BIN_TO_BCD(yrs);
+			sec = bin2bcd(sec);
+			min = bin2bcd(min);
+			hrs = bin2bcd(hrs);
+			day = bin2bcd(day);
+			mon = bin2bcd(mon);
+			yrs = bin2bcd(yrs);
 
 			local_irq_save(flags);
 			CMOS_WRITE(yrs, RTC_YEAR);
diff --git a/arch/cris/arch-v10/drivers/pcf8563.c b/arch/cris/arch-v10/drivers/pcf8563.c
index 8769dc914073..1e90c1a9c849 100644
--- a/arch/cris/arch-v10/drivers/pcf8563.c
+++ b/arch/cris/arch-v10/drivers/pcf8563.c
@@ -122,7 +122,7 @@ get_rtc_time(struct rtc_time *tm)
 		       "information is no longer guaranteed!\n", PCF8563_NAME);
 	}
 
-	tm->tm_year  = BCD_TO_BIN(tm->tm_year) +
+	tm->tm_year  = bcd2bin(tm->tm_year) +
 		       ((tm->tm_mon & 0x80) ? 100 : 0);
 	tm->tm_sec  &= 0x7F;
 	tm->tm_min  &= 0x7F;
@@ -131,11 +131,11 @@ get_rtc_time(struct rtc_time *tm)
 	tm->tm_wday &= 0x07; /* Not coded in BCD. */
 	tm->tm_mon  &= 0x1F;
 
-	BCD_TO_BIN(tm->tm_sec);
-	BCD_TO_BIN(tm->tm_min);
-	BCD_TO_BIN(tm->tm_hour);
-	BCD_TO_BIN(tm->tm_mday);
-	BCD_TO_BIN(tm->tm_mon);
+	tm->tm_sec = bcd2bin(tm->tm_sec);
+	tm->tm_min = bcd2bin(tm->tm_min);
+	tm->tm_hour = bcd2bin(tm->tm_hour);
+	tm->tm_mday = bcd2bin(tm->tm_mday);
+	tm->tm_mon = bcd2bin(tm->tm_mon);
 	tm->tm_mon--; /* Month is 1..12 in RTC but 0..11 in linux */
 }
 
@@ -282,12 +282,12 @@ int pcf8563_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 		century = (tm.tm_year >= 2000) ? 0x80 : 0;
 		tm.tm_year = tm.tm_year % 100;
 
-		BIN_TO_BCD(tm.tm_year);
-		BIN_TO_BCD(tm.tm_mon);
-		BIN_TO_BCD(tm.tm_mday);
-		BIN_TO_BCD(tm.tm_hour);
-		BIN_TO_BCD(tm.tm_min);
-		BIN_TO_BCD(tm.tm_sec);
+		tm.tm_year = bin2bcd(tm.tm_year);
+		tm.tm_mon = bin2bcd(tm.tm_mon);
+		tm.tm_mday = bin2bcd(tm.tm_mday);
+		tm.tm_hour = bin2bcd(tm.tm_hour);
+		tm.tm_min = bin2bcd(tm.tm_min);
+		tm.tm_sec = bin2bcd(tm.tm_sec);
 		tm.tm_mon |= century;
 
 		mutex_lock(&rtc_lock);
diff --git a/arch/cris/arch-v32/drivers/pcf8563.c b/arch/cris/arch-v32/drivers/pcf8563.c
index f263ab571221..f4478506e52c 100644
--- a/arch/cris/arch-v32/drivers/pcf8563.c
+++ b/arch/cris/arch-v32/drivers/pcf8563.c
@@ -118,7 +118,7 @@ get_rtc_time(struct rtc_time *tm)
 		       "information is no longer guaranteed!\n", PCF8563_NAME);
 	}
 
-	tm->tm_year  = BCD_TO_BIN(tm->tm_year) +
+	tm->tm_year  = bcd2bin(tm->tm_year) +
 		       ((tm->tm_mon & 0x80) ? 100 : 0);
 	tm->tm_sec  &= 0x7F;
 	tm->tm_min  &= 0x7F;
@@ -127,11 +127,11 @@ get_rtc_time(struct rtc_time *tm)
 	tm->tm_wday &= 0x07; /* Not coded in BCD. */
 	tm->tm_mon  &= 0x1F;
 
-	BCD_TO_BIN(tm->tm_sec);
-	BCD_TO_BIN(tm->tm_min);
-	BCD_TO_BIN(tm->tm_hour);
-	BCD_TO_BIN(tm->tm_mday);
-	BCD_TO_BIN(tm->tm_mon);
+	tm->tm_sec = bcd2bin(tm->tm_sec);
+	tm->tm_min = bcd2bin(tm->tm_min);
+	tm->tm_hour = bcd2bin(tm->tm_hour);
+	tm->tm_mday = bcd2bin(tm->tm_mday);
+	tm->tm_mon = bcd2bin(tm->tm_mon);
 	tm->tm_mon--; /* Month is 1..12 in RTC but 0..11 in linux */
 }
 
@@ -279,12 +279,12 @@ int pcf8563_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
 		century = (tm.tm_year >= 2000) ? 0x80 : 0;
 		tm.tm_year = tm.tm_year % 100;
 
-		BIN_TO_BCD(tm.tm_year);
-		BIN_TO_BCD(tm.tm_mon);
-		BIN_TO_BCD(tm.tm_mday);
-		BIN_TO_BCD(tm.tm_hour);
-		BIN_TO_BCD(tm.tm_min);
-		BIN_TO_BCD(tm.tm_sec);
+		tm.tm_year = bin2bcd(tm.tm_year);
+		tm.tm_mon = bin2bcd(tm.tm_mon);
+		tm.tm_mday = bin2bcd(tm.tm_mday);
+		tm.tm_hour = bin2bcd(tm.tm_hour);
+		tm.tm_min = bin2bcd(tm.tm_min);
+		tm.tm_sec = bin2bcd(tm.tm_sec);
 		tm.tm_mon |= century;
 
 		mutex_lock(&rtc_lock);
diff --git a/arch/cris/kernel/time.c b/arch/cris/kernel/time.c
index ff4c6aa75def..074fe7dea96b 100644
--- a/arch/cris/kernel/time.c
+++ b/arch/cris/kernel/time.c
@@ -127,7 +127,7 @@ int set_rtc_mmss(unsigned long nowtime)
 		return 0;
 
 	cmos_minutes = CMOS_READ(RTC_MINUTES);
-	BCD_TO_BIN(cmos_minutes);
+	cmos_minutes = bcd2bin(cmos_minutes);
 
 	/*
 	 * since we're only adjusting minutes and seconds,
@@ -142,8 +142,8 @@ int set_rtc_mmss(unsigned long nowtime)
 	real_minutes %= 60;
 
 	if (abs(real_minutes - cmos_minutes) < 30) {
-		BIN_TO_BCD(real_seconds);
-		BIN_TO_BCD(real_minutes);
+		real_seconds = bin2bcd(real_seconds);
+		real_minutes = bin2bcd(real_minutes);
 		CMOS_WRITE(real_seconds,RTC_SECONDS);
 		CMOS_WRITE(real_minutes,RTC_MINUTES);
 	} else {
@@ -170,12 +170,12 @@ get_cmos_time(void)
 	mon = CMOS_READ(RTC_MONTH);
 	year = CMOS_READ(RTC_YEAR);
 
-	BCD_TO_BIN(sec);
-	BCD_TO_BIN(min);
-	BCD_TO_BIN(hour);
-	BCD_TO_BIN(day);
-	BCD_TO_BIN(mon);
-	BCD_TO_BIN(year);
+	sec = bcd2bin(sec);
+	min = bcd2bin(min);
+	hour = bcd2bin(hour);
+	day = bcd2bin(day);
+	mon = bcd2bin(mon);
+	year = bcd2bin(year);
 
 	if ((year += 1900) < 1970)
 		year += 100;
diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig
index a5aac1b07562..9d1552a9ee2c 100644
--- a/arch/frv/Kconfig
+++ b/arch/frv/Kconfig
@@ -66,6 +66,8 @@ mainmenu "Fujitsu FR-V Kernel Configuration"
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 
 menu "Fujitsu FR-V system setup"
 
diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
index c7966746fbfe..bd1995403c67 100644
--- a/arch/h8300/Kconfig
+++ b/arch/h8300/Kconfig
@@ -90,6 +90,8 @@ config HZ
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 source "arch/h8300/Kconfig.cpu"
 
 menu "Executable file formats"
diff --git a/arch/h8300/include/asm/thread_info.h b/arch/h8300/include/asm/thread_info.h
index aafd4d322ec3..700014d2155f 100644
--- a/arch/h8300/include/asm/thread_info.h
+++ b/arch/h8300/include/asm/thread_info.h
@@ -89,6 +89,7 @@ static inline struct thread_info *current_thread_info(void)
 					   TIF_NEED_RESCHED */
 #define TIF_MEMDIE		4
 #define TIF_RESTORE_SIGMASK	5	/* restore signal mask in do_signal() */
+#define TIF_FREEZE		16	/* is freezing for suspend */
 
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
@@ -96,6 +97,7 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_NEED_RESCHED	(1<<TIF_NEED_RESCHED)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 #define _TIF_RESTORE_SIGMASK	(1<<TIF_RESTORE_SIGMASK)
+#define _TIF_FREEZE		(1<<TIF_FREEZE)
 
 #define _TIF_WORK_MASK		0x0000FFFE	/* work to do on interrupt/exception return */
 
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 3b7aa38254a8..912c57db2d21 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -7,6 +7,8 @@ mainmenu "IA-64 Linux Kernel Configuration"
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 menu "Processor type and features"
 
 config IA64
diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
index 4956be40d7b5..d98f0f4ff83f 100644
--- a/arch/ia64/hp/common/sba_iommu.c
+++ b/arch/ia64/hp/common/sba_iommu.c
@@ -2070,14 +2070,13 @@ sba_init(void)
 	if (!ia64_platform_is("hpzx1") && !ia64_platform_is("hpzx1_swiotlb"))
 		return 0;
 
-#if defined(CONFIG_IA64_GENERIC) && defined(CONFIG_CRASH_DUMP) && \
-        defined(CONFIG_PROC_FS)
+#if defined(CONFIG_IA64_GENERIC)
 	/* If we are booting a kdump kernel, the sba_iommu will
 	 * cause devices that were not shutdown properly to MCA
 	 * as soon as they are turned back on.  Our only option for
 	 * a successful kdump kernel boot is to use the swiotlb.
 	 */
-	if (elfcorehdr_addr < ELFCORE_ADDR_MAX) {
+	if (is_kdump_kernel()) {
 		if (swiotlb_late_init_with_default_size(64 * (1<<20)) != 0)
 			panic("Unable to initialize software I/O TLB:"
 				  " Try machvec=dig boot option");
diff --git a/arch/ia64/kernel/crash_dump.c b/arch/ia64/kernel/crash_dump.c
index da60e90eeeb1..23e91290e41f 100644
--- a/arch/ia64/kernel/crash_dump.c
+++ b/arch/ia64/kernel/crash_dump.c
@@ -8,10 +8,14 @@
 
 #include <linux/errno.h>
 #include <linux/types.h>
+#include <linux/crash_dump.h>
 
 #include <asm/page.h>
 #include <asm/uaccess.h>
 
+/* Stores the physical address of elf header of crash image. */
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+
 /**
  * copy_oldmem_page - copy one page from "oldmem"
  * @pfn: page frame number to be copied
diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
index 51b75cea7018..efaff15d8cf1 100644
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -1335,7 +1335,7 @@ kdump_find_rsvd_region (unsigned long size, struct rsvd_region *r, int n)
 }
 #endif
 
-#ifdef CONFIG_PROC_VMCORE
+#ifdef CONFIG_CRASH_DUMP
 /* locate the size find a the descriptor at a certain address */
 unsigned long __init
 vmcore_find_descriptor_size (unsigned long address)
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index de636b215677..916ba898237f 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -352,7 +352,7 @@ reserve_memory (void)
 	}
 #endif
 
-#ifdef CONFIG_PROC_VMCORE
+#ifdef CONFIG_CRASH_KERNEL
 	if (reserve_elfcorehdr(&rsvd_region[n].start,
 			       &rsvd_region[n].end) == 0)
 		n++;
@@ -478,7 +478,12 @@ static __init int setup_nomca(char *s)
 }
 early_param("nomca", setup_nomca);
 
-#ifdef CONFIG_PROC_VMCORE
+/*
+ * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
+ * is_kdump_kernel() to determine if we are booting after a panic. Hence
+ * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
+ */
+#ifdef CONFIG_CRASH_DUMP
 /* elfcorehdr= specifies the location of elf core header
  * stored by the crashed kernel.
  */
@@ -502,11 +507,11 @@ int __init reserve_elfcorehdr(unsigned long *start, unsigned long *end)
 	 * to work properly.
 	 */
 
-	if (elfcorehdr_addr >= ELFCORE_ADDR_MAX)
+	if (!is_vmcore_usable())
 		return -EINVAL;
 
 	if ((length = vmcore_find_descriptor_size(elfcorehdr_addr)) == 0) {
-		elfcorehdr_addr = ELFCORE_ADDR_MAX;
+		vmcore_unusable();
 		return -EINVAL;
 	}
 
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index f482a9098e32..054bcd9439aa 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -700,23 +700,6 @@ int arch_add_memory(int nid, u64 start, u64 size)
 
 	return ret;
 }
-#ifdef CONFIG_MEMORY_HOTREMOVE
-int remove_memory(u64 start, u64 size)
-{
-	unsigned long start_pfn, end_pfn;
-	unsigned long timeout = 120 * HZ;
-	int ret;
-	start_pfn = start >> PAGE_SHIFT;
-	end_pfn = start_pfn + (size >> PAGE_SHIFT);
-	ret = offline_pages(start_pfn, end_pfn, timeout);
-	if (ret)
-		goto out;
-	/* we can free mem_map at this point */
-out:
-	return ret;
-}
-EXPORT_SYMBOL_GPL(remove_memory);
-#endif /* CONFIG_MEMORY_HOTREMOVE */
 #endif
 
 /*
diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig
index 00289c178f89..dbaed4a63815 100644
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -42,6 +42,8 @@ config HZ
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 
 menu "Processor type and features"
 
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 677c93a490f6..836fb66f080d 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -62,6 +62,8 @@ mainmenu "Linux/68k Kernel Configuration"
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 menu "Platform dependent setup"
 
 config EISA
diff --git a/arch/m68k/bvme6000/rtc.c b/arch/m68k/bvme6000/rtc.c
index 808c9018b115..c50bec8aabb1 100644
--- a/arch/m68k/bvme6000/rtc.c
+++ b/arch/m68k/bvme6000/rtc.c
@@ -18,7 +18,6 @@
 #include <linux/poll.h>
 #include <linux/module.h>
 #include <linux/mc146818rtc.h>	/* For struct rtc_time and ioctls, etc */
-#include <linux/smp_lock.h>
 #include <linux/bcd.h>
 #include <asm/bvme6000hw.h>
 
diff --git a/arch/m68knommu/Kconfig b/arch/m68knommu/Kconfig
index 0a8998315e5e..76b66feb74df 100644
--- a/arch/m68knommu/Kconfig
+++ b/arch/m68knommu/Kconfig
@@ -75,6 +75,8 @@ config NO_IOPORT
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 menu "Processor type and features"
 
 choice
diff --git a/arch/m68knommu/include/asm/thread_info.h b/arch/m68knommu/include/asm/thread_info.h
index 0c9bc095f3f0..82529f424ea3 100644
--- a/arch/m68knommu/include/asm/thread_info.h
+++ b/arch/m68knommu/include/asm/thread_info.h
@@ -84,12 +84,14 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_POLLING_NRFLAG	3	/* true if poll_idle() is polling
 					   TIF_NEED_RESCHED */
 #define TIF_MEMDIE		4
+#define TIF_FREEZE		16	/* is freezing for suspend */
 
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_SIGPENDING		(1<<TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED	(1<<TIF_NEED_RESCHED)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
+#define _TIF_FREEZE		(1<<TIF_FREEZE)
 
 #define _TIF_WORK_MASK		0x0000FFFE	/* work to do on interrupt/exception return */
 
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index b905744d7915..5f149b030c0f 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -1885,6 +1885,8 @@ config PROBE_INITRD_HEADER
 	  add initrd or initramfs image to the kernel image.
 	  Otherwise, say N.
 
+source "kernel/Kconfig.freezer"
+
 menu "Bus options (PCI, PCMCIA, EISA, ISA, TC)"
 
 config HW_HAS_EISA
diff --git a/arch/mips/dec/time.c b/arch/mips/dec/time.c
index 3965fda94a89..1359c03ded51 100644
--- a/arch/mips/dec/time.c
+++ b/arch/mips/dec/time.c
@@ -45,12 +45,12 @@ unsigned long read_persistent_clock(void)
 	spin_unlock_irqrestore(&rtc_lock, flags);
 
 	if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
-		sec = BCD2BIN(sec);
-		min = BCD2BIN(min);
-		hour = BCD2BIN(hour);
-		day = BCD2BIN(day);
-		mon = BCD2BIN(mon);
-		year = BCD2BIN(year);
+		sec = bcd2bin(sec);
+		min = bcd2bin(min);
+		hour = bcd2bin(hour);
+		day = bcd2bin(day);
+		mon = bcd2bin(mon);
+		year = bcd2bin(year);
 	}
 
 	year += real_year - 72 + 2000;
@@ -83,7 +83,7 @@ int rtc_mips_set_mmss(unsigned long nowtime)
 
 	cmos_minutes = CMOS_READ(RTC_MINUTES);
 	if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
-		cmos_minutes = BCD2BIN(cmos_minutes);
+		cmos_minutes = bcd2bin(cmos_minutes);
 
 	/*
 	 * since we're only adjusting minutes and seconds,
@@ -99,8 +99,8 @@ int rtc_mips_set_mmss(unsigned long nowtime)
 
 	if (abs(real_minutes - cmos_minutes) < 30) {
 		if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
-			real_seconds = BIN2BCD(real_seconds);
-			real_minutes = BIN2BCD(real_minutes);
+			real_seconds = bin2bcd(real_seconds);
+			real_minutes = bin2bcd(real_minutes);
 		}
 		CMOS_WRITE(real_seconds, RTC_SECONDS);
 		CMOS_WRITE(real_minutes, RTC_MINUTES);
diff --git a/arch/mips/include/asm/mc146818-time.h b/arch/mips/include/asm/mc146818-time.h
index cdc379a0a94e..199b45733a95 100644
--- a/arch/mips/include/asm/mc146818-time.h
+++ b/arch/mips/include/asm/mc146818-time.h
@@ -44,7 +44,7 @@ static inline int mc146818_set_rtc_mmss(unsigned long nowtime)
 
 	cmos_minutes = CMOS_READ(RTC_MINUTES);
 	if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
-		BCD_TO_BIN(cmos_minutes);
+		cmos_minutes = bcd2bin(cmos_minutes);
 
 	/*
 	 * since we're only adjusting minutes and seconds,
@@ -60,8 +60,8 @@ static inline int mc146818_set_rtc_mmss(unsigned long nowtime)
 
 	if (abs(real_minutes - cmos_minutes) < 30) {
 		if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
-			BIN_TO_BCD(real_seconds);
-			BIN_TO_BCD(real_minutes);
+			real_seconds = bin2bcd(real_seconds);
+			real_minutes = bin2bcd(real_minutes);
 		}
 		CMOS_WRITE(real_seconds, RTC_SECONDS);
 		CMOS_WRITE(real_minutes, RTC_MINUTES);
@@ -103,12 +103,12 @@ static inline unsigned long mc146818_get_cmos_time(void)
 	} while (sec != CMOS_READ(RTC_SECONDS));
 
 	if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
-		BCD_TO_BIN(sec);
-		BCD_TO_BIN(min);
-		BCD_TO_BIN(hour);
-		BCD_TO_BIN(day);
-		BCD_TO_BIN(mon);
-		BCD_TO_BIN(year);
+		sec = bcd2bin(sec);
+		min = bcd2bin(min);
+		hour = bcd2bin(hour);
+		day = bcd2bin(day);
+		mon = bcd2bin(mon);
+		year = bcd2bin(year);
 	}
 	spin_unlock_irqrestore(&rtc_lock, flags);
 	year = mc146818_decode_year(year);
diff --git a/arch/mips/pmc-sierra/yosemite/setup.c b/arch/mips/pmc-sierra/yosemite/setup.c
index 6537d90a25bb..2d3c0dca275d 100644
--- a/arch/mips/pmc-sierra/yosemite/setup.c
+++ b/arch/mips/pmc-sierra/yosemite/setup.c
@@ -79,14 +79,14 @@ unsigned long read_persistent_clock(void)
 	/* Stop the update to the time */
 	m48t37_base->control = 0x40;
 
-	year = BCD2BIN(m48t37_base->year);
-	year += BCD2BIN(m48t37_base->century) * 100;
+	year = bcd2bin(m48t37_base->year);
+	year += bcd2bin(m48t37_base->century) * 100;
 
-	month = BCD2BIN(m48t37_base->month);
-	day = BCD2BIN(m48t37_base->date);
-	hour = BCD2BIN(m48t37_base->hour);
-	min = BCD2BIN(m48t37_base->min);
-	sec = BCD2BIN(m48t37_base->sec);
+	month = bcd2bin(m48t37_base->month);
+	day = bcd2bin(m48t37_base->date);
+	hour = bcd2bin(m48t37_base->hour);
+	min = bcd2bin(m48t37_base->min);
+	sec = bcd2bin(m48t37_base->sec);
 
 	/* Start the update to the time again */
 	m48t37_base->control = 0x00;
@@ -113,22 +113,22 @@ int rtc_mips_set_time(unsigned long tim)
 	m48t37_base->control = 0x80;
 
 	/* year */
-	m48t37_base->year = BIN2BCD(tm.tm_year % 100);
-	m48t37_base->century = BIN2BCD(tm.tm_year / 100);
+	m48t37_base->year = bin2bcd(tm.tm_year % 100);
+	m48t37_base->century = bin2bcd(tm.tm_year / 100);
 
 	/* month */
-	m48t37_base->month = BIN2BCD(tm.tm_mon);
+	m48t37_base->month = bin2bcd(tm.tm_mon);
 
 	/* day */
-	m48t37_base->date = BIN2BCD(tm.tm_mday);
+	m48t37_base->date = bin2bcd(tm.tm_mday);
 
 	/* hour/min/sec */
-	m48t37_base->hour = BIN2BCD(tm.tm_hour);
-	m48t37_base->min = BIN2BCD(tm.tm_min);
-	m48t37_base->sec = BIN2BCD(tm.tm_sec);
+	m48t37_base->hour = bin2bcd(tm.tm_hour);
+	m48t37_base->min = bin2bcd(tm.tm_min);
+	m48t37_base->sec = bin2bcd(tm.tm_sec);
 
 	/* day of week -- not really used, but let's keep it up-to-date */
-	m48t37_base->day = BIN2BCD(tm.tm_wday + 1);
+	m48t37_base->day = bin2bcd(tm.tm_wday + 1);
 
 	/* disable writing */
 	m48t37_base->control = 0x00;
diff --git a/arch/mips/sibyte/swarm/rtc_m41t81.c b/arch/mips/sibyte/swarm/rtc_m41t81.c
index 26fbff4c15b1..b732600b47f5 100644
--- a/arch/mips/sibyte/swarm/rtc_m41t81.c
+++ b/arch/mips/sibyte/swarm/rtc_m41t81.c
@@ -156,32 +156,32 @@ int m41t81_set_time(unsigned long t)
 	 */
 
 	spin_lock_irqsave(&rtc_lock, flags);
-	tm.tm_sec = BIN2BCD(tm.tm_sec);
+	tm.tm_sec = bin2bcd(tm.tm_sec);
 	m41t81_write(M41T81REG_SC, tm.tm_sec);
 
-	tm.tm_min = BIN2BCD(tm.tm_min);
+	tm.tm_min = bin2bcd(tm.tm_min);
 	m41t81_write(M41T81REG_MN, tm.tm_min);
 
-	tm.tm_hour = BIN2BCD(tm.tm_hour);
+	tm.tm_hour = bin2bcd(tm.tm_hour);
 	tm.tm_hour = (tm.tm_hour & 0x3f) | (m41t81_read(M41T81REG_HR) & 0xc0);
 	m41t81_write(M41T81REG_HR, tm.tm_hour);
 
 	/* tm_wday starts from 0 to 6 */
 	if (tm.tm_wday == 0) tm.tm_wday = 7;
-	tm.tm_wday = BIN2BCD(tm.tm_wday);
+	tm.tm_wday = bin2bcd(tm.tm_wday);
 	m41t81_write(M41T81REG_DY, tm.tm_wday);
 
-	tm.tm_mday = BIN2BCD(tm.tm_mday);
+	tm.tm_mday = bin2bcd(tm.tm_mday);
 	m41t81_write(M41T81REG_DT, tm.tm_mday);
 
 	/* tm_mon starts from 0, *ick* */
 	tm.tm_mon ++;
-	tm.tm_mon = BIN2BCD(tm.tm_mon);
+	tm.tm_mon = bin2bcd(tm.tm_mon);
 	m41t81_write(M41T81REG_MO, tm.tm_mon);
 
 	/* we don't do century, everything is beyond 2000 */
 	tm.tm_year %= 100;
-	tm.tm_year = BIN2BCD(tm.tm_year);
+	tm.tm_year = bin2bcd(tm.tm_year);
 	m41t81_write(M41T81REG_YR, tm.tm_year);
 	spin_unlock_irqrestore(&rtc_lock, flags);
 
@@ -209,12 +209,12 @@ unsigned long m41t81_get_time(void)
 	year = m41t81_read(M41T81REG_YR);
 	spin_unlock_irqrestore(&rtc_lock, flags);
 
-	sec = BCD2BIN(sec);
-	min = BCD2BIN(min);
-	hour = BCD2BIN(hour);
-	day = BCD2BIN(day);
-	mon = BCD2BIN(mon);
-	year = BCD2BIN(year);
+	sec = bcd2bin(sec);
+	min = bcd2bin(min);
+	hour = bcd2bin(hour);
+	day = bcd2bin(day);
+	mon = bcd2bin(mon);
+	year = bcd2bin(year);
 
 	year += 2000;
 
diff --git a/arch/mips/sibyte/swarm/rtc_xicor1241.c b/arch/mips/sibyte/swarm/rtc_xicor1241.c
index ff3e5dabb348..4438b2195c44 100644
--- a/arch/mips/sibyte/swarm/rtc_xicor1241.c
+++ b/arch/mips/sibyte/swarm/rtc_xicor1241.c
@@ -124,18 +124,18 @@ int xicor_set_time(unsigned long t)
 	xicor_write(X1241REG_SR, X1241REG_SR_WEL | X1241REG_SR_RWEL);
 
 	/* trivial ones */
-	tm.tm_sec = BIN2BCD(tm.tm_sec);
+	tm.tm_sec = bin2bcd(tm.tm_sec);
 	xicor_write(X1241REG_SC, tm.tm_sec);
 
-	tm.tm_min = BIN2BCD(tm.tm_min);
+	tm.tm_min = bin2bcd(tm.tm_min);
 	xicor_write(X1241REG_MN, tm.tm_min);
 
-	tm.tm_mday = BIN2BCD(tm.tm_mday);
+	tm.tm_mday = bin2bcd(tm.tm_mday);
 	xicor_write(X1241REG_DT, tm.tm_mday);
 
 	/* tm_mon starts from 0, *ick* */
 	tm.tm_mon ++;
-	tm.tm_mon = BIN2BCD(tm.tm_mon);
+	tm.tm_mon = bin2bcd(tm.tm_mon);
 	xicor_write(X1241REG_MO, tm.tm_mon);
 
 	/* year is split */
@@ -148,7 +148,7 @@ int xicor_set_time(unsigned long t)
 	tmp = xicor_read(X1241REG_HR);
 	if (tmp & X1241REG_HR_MIL) {
 		/* 24 hour format */
-		tm.tm_hour = BIN2BCD(tm.tm_hour);
+		tm.tm_hour = bin2bcd(tm.tm_hour);
 		tmp = (tmp & ~0x3f) | (tm.tm_hour & 0x3f);
 	} else {
 		/* 12 hour format, with 0x2 for pm */
@@ -157,7 +157,7 @@ int xicor_set_time(unsigned long t)
 			tmp |= 0x20;
 			tm.tm_hour -= 12;
 		}
-		tm.tm_hour = BIN2BCD(tm.tm_hour);
+		tm.tm_hour = bin2bcd(tm.tm_hour);
 		tmp |= tm.tm_hour;
 	}
 	xicor_write(X1241REG_HR, tmp);
@@ -191,13 +191,13 @@ unsigned long xicor_get_time(void)
 	y2k = xicor_read(X1241REG_Y2K);
 	spin_unlock_irqrestore(&rtc_lock, flags);
 
-	sec = BCD2BIN(sec);
-	min = BCD2BIN(min);
-	hour = BCD2BIN(hour);
-	day = BCD2BIN(day);
-	mon = BCD2BIN(mon);
-	year = BCD2BIN(year);
-	y2k = BCD2BIN(y2k);
+	sec = bcd2bin(sec);
+	min = bcd2bin(min);
+	hour = bcd2bin(hour);
+	day = bcd2bin(day);
+	mon = bcd2bin(mon);
+	year = bcd2bin(year);
+	y2k = bcd2bin(y2k);
 
 	year += (y2k * 100);
 
diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig
index dd557c9cf001..9a9f43358879 100644
--- a/arch/mn10300/Kconfig
+++ b/arch/mn10300/Kconfig
@@ -68,6 +68,8 @@ mainmenu "Matsushita MN10300/AM33 Kernel Configuration"
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 
 menu "Matsushita MN10300 system setup"
 
diff --git a/arch/mn10300/kernel/rtc.c b/arch/mn10300/kernel/rtc.c
index 042f792d8430..7978470b5749 100644
--- a/arch/mn10300/kernel/rtc.c
+++ b/arch/mn10300/kernel/rtc.c
@@ -67,7 +67,7 @@ static int set_rtc_mmss(unsigned long nowtime)
 
 	cmos_minutes = CMOS_READ(RTC_MINUTES);
 	if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
-		BCD_TO_BIN(cmos_minutes);
+		cmos_minutes = bcd2bin(cmos_minutes);
 
 	/*
 	 * since we're only adjusting minutes and seconds,
@@ -84,8 +84,8 @@ static int set_rtc_mmss(unsigned long nowtime)
 
 	if (abs(real_minutes - cmos_minutes) < 30) {
 		if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
-			BIN_TO_BCD(real_seconds);
-			BIN_TO_BCD(real_minutes);
+			real_seconds = bin2bcd(real_seconds);
+			real_minutes = bin2bcd(real_minutes);
 		}
 		CMOS_WRITE(real_seconds, RTC_SECONDS);
 		CMOS_WRITE(real_minutes, RTC_MINUTES);
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 8313fccced5e..2bd1f6ef5db0 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -90,6 +90,8 @@ config ARCH_MAY_HAVE_PC_FDC
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 
 menu "Processor type and features"
 
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 380baa1780e9..9391199d9e77 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -230,6 +230,8 @@ config PPC_OF_PLATFORM_PCI
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 source "arch/powerpc/sysdev/Kconfig"
 source "arch/powerpc/platforms/Kconfig"
 
diff --git a/arch/powerpc/include/asm/ps3av.h b/arch/powerpc/include/asm/ps3av.h
index fda98715cd35..5aa22cffdbd6 100644
--- a/arch/powerpc/include/asm/ps3av.h
+++ b/arch/powerpc/include/asm/ps3av.h
@@ -678,6 +678,8 @@ struct ps3av_pkt_avb_param {
 	u8 buf[PS3AV_PKT_AVB_PARAM_MAX_BUF_SIZE];
 };
 
+/* channel status */
+extern u8 ps3av_mode_cs_info[];
 
 /** command status **/
 #define PS3AV_STATUS_SUCCESS			0x0000	/* success */
@@ -735,6 +737,7 @@ extern int ps3av_get_mode(void);
 extern int ps3av_video_mode2res(u32, u32 *, u32 *);
 extern int ps3av_video_mute(int);
 extern int ps3av_audio_mute(int);
+extern int ps3av_audio_mute_analog(int);
 extern int ps3av_dev_open(void);
 extern int ps3av_dev_close(void);
 extern void ps3av_register_flip_ctl(void (*flip_ctl)(int on, void *data),
diff --git a/arch/powerpc/kernel/crash_dump.c b/arch/powerpc/kernel/crash_dump.c
index a323c9b32ee1..97e056379728 100644
--- a/arch/powerpc/kernel/crash_dump.c
+++ b/arch/powerpc/kernel/crash_dump.c
@@ -27,6 +27,9 @@
 #define DBG(fmt...)
 #endif
 
+/* Stores the physical address of elf header of crash image. */
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+
 void __init reserve_kdump_trampoline(void)
 {
 	lmb_reserve(0, KDUMP_RESERVE_LIMIT);
@@ -66,7 +69,11 @@ void __init setup_kdump_trampoline(void)
 	DBG(" <- setup_kdump_trampoline()\n");
 }
 
-#ifdef CONFIG_PROC_VMCORE
+/*
+ * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
+ * is_kdump_kernel() to determine if we are booting after a panic. Hence
+ * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
+ */
 static int __init parse_elfcorehdr(char *p)
 {
 	if (p)
@@ -75,7 +82,6 @@ static int __init parse_elfcorehdr(char *p)
 	return 1;
 }
 __setup("elfcorehdr=", parse_elfcorehdr);
-#endif
 
 static int __init parse_savemaxmem(char *p)
 {
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 98d7bf99533a..b9e1a1da6e52 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -134,23 +134,6 @@ int arch_add_memory(int nid, u64 start, u64 size)
 
 	return __add_pages(zone, start_pfn, nr_pages);
 }
-
-#ifdef CONFIG_MEMORY_HOTREMOVE
-int remove_memory(u64 start, u64 size)
-{
-	unsigned long start_pfn, end_pfn;
-	int ret;
-
-	start_pfn = start >> PAGE_SHIFT;
-	end_pfn = start_pfn + (size >> PAGE_SHIFT);
-	ret = offline_pages(start_pfn, end_pfn, 120 * HZ);
-	if (ret)
-		goto out;
-	/* Arch-specific calls go here - next patch */
-out:
-	return ret;
-}
-#endif /* CONFIG_MEMORY_HOTREMOVE */
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 /*
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index bc581d8a7cd9..70b7645ce745 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -78,6 +78,8 @@ config S390
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 menu "Base setup"
 
 comment "Processor type and features"
diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h
index ea40a9d690fc..de3fad60c682 100644
--- a/arch/s390/include/asm/thread_info.h
+++ b/arch/s390/include/asm/thread_info.h
@@ -99,6 +99,7 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_31BIT		18	/* 32bit process */ 
 #define TIF_MEMDIE		19
 #define TIF_RESTORE_SIGMASK	20	/* restore signal mask in do_signal() */
+#define TIF_FREEZE		21	/* thread is freezing for suspend */
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
@@ -112,6 +113,7 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_USEDFPU		(1<<TIF_USEDFPU)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 #define _TIF_31BIT		(1<<TIF_31BIT)
+#define _TIF_FREEZE		(1<<TIF_FREEZE)
 
 #endif /* __KERNEL__ */
 
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 1169130a97ef..158b0d6d7046 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -189,14 +189,3 @@ int arch_add_memory(int nid, u64 start, u64 size)
 	return rc;
 }
 #endif /* CONFIG_MEMORY_HOTPLUG */
-
-#ifdef CONFIG_MEMORY_HOTREMOVE
-int remove_memory(u64 start, u64 size)
-{
-	unsigned long start_pfn, end_pfn;
-
-	start_pfn = PFN_DOWN(start);
-	end_pfn = start_pfn + PFN_DOWN(size);
-	return offline_pages(start_pfn, end_pfn, 120 * HZ);
-}
-#endif /* CONFIG_MEMORY_HOTREMOVE */
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index b4aa2a03e19b..cb2c87df70ce 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -121,6 +121,8 @@ config IO_TRAPPED
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 menu "System type"
 
 #
diff --git a/arch/sh/kernel/crash_dump.c b/arch/sh/kernel/crash_dump.c
index 4a2ecbe27d8e..95d216255565 100644
--- a/arch/sh/kernel/crash_dump.c
+++ b/arch/sh/kernel/crash_dump.c
@@ -10,6 +10,9 @@
 #include <linux/io.h>
 #include <asm/uaccess.h>
 
+/* Stores the physical address of elf header of crash image. */
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+
 /**
  * copy_oldmem_page - copy one page from "oldmem"
  * @pfn: page frame number to be copied
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 97671dac12a6..e594559c8dba 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -37,6 +37,8 @@ config HZ
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 menu "General machine setup"
 
 config SMP
diff --git a/arch/sparc/include/asm/thread_info_32.h b/arch/sparc/include/asm/thread_info_32.h
index 29899fd5b1b2..80fe547c3f45 100644
--- a/arch/sparc/include/asm/thread_info_32.h
+++ b/arch/sparc/include/asm/thread_info_32.h
@@ -135,6 +135,7 @@ BTFIXUPDEF_CALL(void, free_thread_info, struct thread_info *)
 #define TIF_POLLING_NRFLAG	9	/* true if poll_idle() is polling
 					 * TIF_NEED_RESCHED */
 #define TIF_MEMDIE		10
+#define TIF_FREEZE		11	/* is freezing for suspend */
 
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
@@ -148,6 +149,7 @@ BTFIXUPDEF_CALL(void, free_thread_info, struct thread_info *)
 #define _TIF_DO_NOTIFY_RESUME_MASK	(_TIF_NOTIFY_RESUME | \
 					 _TIF_SIGPENDING | \
 					 _TIF_RESTORE_SIGMASK)
+#define _TIF_FREEZE		(1<<TIF_FREEZE)
 
 #endif /* __KERNEL__ */
 
diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h
index c0a737d7292c..639ac805448a 100644
--- a/arch/sparc/include/asm/thread_info_64.h
+++ b/arch/sparc/include/asm/thread_info_64.h
@@ -237,6 +237,7 @@ register struct thread_info *current_thread_info_reg asm("g6");
 #define TIF_ABI_PENDING		12
 #define TIF_MEMDIE		13
 #define TIF_POLLING_NRFLAG	14
+#define TIF_FREEZE		15	/* is freezing for suspend */
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
@@ -249,6 +250,7 @@ register struct thread_info *current_thread_info_reg asm("g6");
 #define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
 #define _TIF_ABI_PENDING	(1<<TIF_ABI_PENDING)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
+#define _TIF_FREEZE		(1<<TIF_FREEZE)
 
 #define _TIF_USER_WORK_MASK	((0xff << TI_FLAG_WSAVED_SHIFT) | \
 				 _TIF_DO_NOTIFY_RESUME_MASK | \
diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig
index 5446e2a499b1..035b15af90d8 100644
--- a/arch/sparc64/Kconfig
+++ b/arch/sparc64/Kconfig
@@ -96,6 +96,7 @@ config GENERIC_HARDIRQS_NO__DO_IRQ
 	def_bool y
 
 source "init/Kconfig"
+source "kernel/Kconfig.freezer"
 
 menu "Processor type and features"
 
diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index 6976812cfb18..393bccfe1785 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -229,6 +229,8 @@ endmenu
 
 source "init/Kconfig"
 
+source "kernel/Kconfig.freezer"
+
 source "drivers/block/Kconfig"
 
 source "arch/um/Kconfig.char"
diff --git a/arch/um/sys-i386/signal.c b/arch/um/sys-i386/signal.c
index fd0c25ad6af3..129647375a6c 100644
--- a/arch/um/sys-i386/signal.c
+++ b/arch/um/sys-i386/signal.c
@@ -179,7 +179,8 @@ static int copy_sc_from_user(struct pt_regs *regs,
 	if (have_fpx_regs) {
 		struct user_fxsr_struct fpx;
 
-		err = copy_from_user(&fpx, &sc.fpstate->_fxsr_env[0],
+		err = copy_from_user(&fpx,
+			&((struct _fpstate __user *)sc.fpstate)->_fxsr_env[0],
 				     sizeof(struct user_fxsr_struct));
 		if (err)
 			return 1;
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index bd3c2c53873e..49349ba77d80 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -193,6 +193,7 @@ config X86_TRAMPOLINE
 config KTIME_SCALAR
 	def_bool X86_32
 source "init/Kconfig"
+source "kernel/Kconfig.freezer"
 
 menu "Processor type and features"
 
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index 72d0c56c1b48..f7cdb3b457aa 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -13,6 +13,9 @@
 
 static void *kdump_buf_page;
 
+/* Stores the physical address of elf header of crash image. */
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+
 /**
  * copy_oldmem_page - copy one page from "oldmem"
  * @pfn: page frame number to be copied
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index e90a60ef10c2..045b36cada65 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -10,6 +10,9 @@
 #include <linux/uaccess.h>
 #include <linux/io.h>
 
+/* Stores the physical address of elf header of crash image. */
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+
 /**
  * copy_oldmem_page - copy one page from "oldmem"
  * @pfn: page frame number to be copied
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 0a23b5795b25..dd6f2b71561b 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -52,7 +52,7 @@ int mach_set_rtc_mmss(unsigned long nowtime)
 
 	cmos_minutes = CMOS_READ(RTC_MINUTES);
 	if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
-		BCD_TO_BIN(cmos_minutes);
+		cmos_minutes = bcd2bin(cmos_minutes);
 
 	/*
 	 * since we're only adjusting minutes and seconds,
@@ -69,8 +69,8 @@ int mach_set_rtc_mmss(unsigned long nowtime)
 
 	if (abs(real_minutes - cmos_minutes) < 30) {
 		if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
-			BIN_TO_BCD(real_seconds);
-			BIN_TO_BCD(real_minutes);
+			real_seconds = bin2bcd(real_seconds);
+			real_minutes = bin2bcd(real_minutes);
 		}
 		CMOS_WRITE(real_seconds,RTC_SECONDS);
 		CMOS_WRITE(real_minutes,RTC_MINUTES);
@@ -124,16 +124,16 @@ unsigned long mach_get_cmos_time(void)
 	WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY));
 
 	if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) {
-		BCD_TO_BIN(sec);
-		BCD_TO_BIN(min);
-		BCD_TO_BIN(hour);
-		BCD_TO_BIN(day);
-		BCD_TO_BIN(mon);
-		BCD_TO_BIN(year);
+		sec = bcd2bin(sec);
+		min = bcd2bin(min);
+		hour = bcd2bin(hour);
+		day = bcd2bin(day);
+		mon = bcd2bin(mon);
+		year = bcd2bin(year);
 	}
 
 	if (century) {
-		BCD_TO_BIN(century);
+		century = bcd2bin(century);
 		year += century * 100;
 		printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
 	} else
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 2255782e8d4b..b2c97874ec0f 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -561,7 +561,13 @@ static void __init reserve_standard_io_resources(void)
 
 }
 
-#ifdef CONFIG_PROC_VMCORE
+/*
+ * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
+ * is_kdump_kernel() to determine if we are booting after a panic. Hence
+ * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
+ */
+
+#ifdef CONFIG_CRASH_DUMP
 /* elfcorehdr= specifies the location of elf core header
  * stored by the crashed kernel. This option will be passed
  * by kexec loader to the capture kernel.
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index a9ec89c3fbca..407d8784f669 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -792,6 +792,8 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
 	/* Must avoid aliasing mappings in the highmem code */
 	kmap_flush_unused();
 
+	vm_unmap_aliases();
+
 	cpa.vaddr = addr;
 	cpa.numpages = numpages;
 	cpa.mask_set = mask_set;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 0013a729b41d..b61534c7a4c4 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -871,6 +871,7 @@ static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned l
 			/* make sure there are no stray mappings of
 			   this page */
 			kmap_flush_unused();
+			vm_unmap_aliases();
 	}
 }
 
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index ae173f6edd8b..d4d52f5a1cf7 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -846,6 +846,7 @@ static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
 		/* re-enable interrupts for kmap_flush_unused */
 		xen_mc_issue(0);
 		kmap_flush_unused();
+		vm_unmap_aliases();
 		xen_mc_batch();
 	}
 
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index 02e417d3d8e9..a213260b51e5 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -55,6 +55,7 @@ config HZ
 	default 100
 
 source "init/Kconfig"
+source "kernel/Kconfig.freezer"
 
 menu "Processor type and features"
 
diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c
index b1c723f9f58d..70f7f60929ca 100644
--- a/drivers/acpi/battery.c
+++ b/drivers/acpi/battery.c
@@ -431,7 +431,7 @@ static ssize_t acpi_battery_alarm_store(struct device *dev,
 }
 
 static struct device_attribute alarm_attr = {
-	.attr = {.name = "alarm", .mode = 0644, .owner = THIS_MODULE},
+	.attr = {.name = "alarm", .mode = 0644},
 	.show = acpi_battery_alarm_show,
 	.store = acpi_battery_alarm_store,
 };
diff --git a/drivers/acpi/sbs.c b/drivers/acpi/sbs.c
index 10a36512647c..7b011e7e29fe 100644
--- a/drivers/acpi/sbs.c
+++ b/drivers/acpi/sbs.c
@@ -463,7 +463,7 @@ static ssize_t acpi_battery_alarm_store(struct device *dev,
 }
 
 static struct device_attribute alarm_attr = {
-	.attr = {.name = "alarm", .mode = 0644, .owner = THIS_MODULE},
+	.attr = {.name = "alarm", .mode = 0644},
 	.show = acpi_battery_alarm_show,
 	.store = acpi_battery_alarm_store,
 };
diff --git a/drivers/acpi/sleep/proc.c b/drivers/acpi/sleep/proc.c
index bf5b04de02d1..631ee2ee2ca0 100644
--- a/drivers/acpi/sleep/proc.c
+++ b/drivers/acpi/sleep/proc.c
@@ -120,13 +120,13 @@ static int acpi_system_alarm_seq_show(struct seq_file *seq, void *offset)
 	spin_unlock_irqrestore(&rtc_lock, flags);
 
 	if (!(rtc_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
-		BCD_TO_BIN(sec);
-		BCD_TO_BIN(min);
-		BCD_TO_BIN(hr);
-		BCD_TO_BIN(day);
-		BCD_TO_BIN(mo);
-		BCD_TO_BIN(yr);
-		BCD_TO_BIN(cent);
+		sec = bcd2bin(sec);
+		min = bcd2bin(min);
+		hr = bcd2bin(hr);
+		day = bcd2bin(day);
+		mo = bcd2bin(mo);
+		yr = bcd2bin(yr);
+		cent = bcd2bin(cent);
 	}
 
 	/* we're trusting the FADT (see above) */
@@ -204,7 +204,7 @@ static u32 cmos_bcd_read(int offset, int rtc_control)
 {
 	u32 val = CMOS_READ(offset);
 	if (!(rtc_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
-		BCD_TO_BIN(val);
+		val = bcd2bin(val);
 	return val;
 }
 
@@ -212,7 +212,7 @@ static u32 cmos_bcd_read(int offset, int rtc_control)
 static void cmos_bcd_write(u32 val, int offset, int rtc_control)
 {
 	if (!(rtc_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
-		BIN_TO_BCD(val);
+		val = bin2bcd(val);
 	CMOS_WRITE(val, offset);
 }
 
diff --git a/drivers/acpi/system.c b/drivers/acpi/system.c
index 91dec448b3ed..24e80fd927e2 100644
--- a/drivers/acpi/system.c
+++ b/drivers/acpi/system.c
@@ -115,7 +115,6 @@ static void acpi_table_attr_init(struct acpi_table_attr *table_attr,
 	table_attr->attr.read = acpi_table_show;
 	table_attr->attr.attr.name = table_attr->name;
 	table_attr->attr.attr.mode = 0444;
-	table_attr->attr.attr.owner = THIS_MODULE;
 
 	return;
 }
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index af0d175c025d..5260e9e0df48 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -21,6 +21,8 @@
 #include <linux/memory_hotplug.h>
 #include <linux/mm.h>
 #include <linux/mutex.h>
+#include <linux/stat.h>
+
 #include <asm/atomic.h>
 #include <asm/uaccess.h>
 
@@ -325,7 +327,7 @@ memory_probe_store(struct class *class, const char *buf, size_t count)
 
 	return count;
 }
-static CLASS_ATTR(probe, 0700, NULL, memory_probe_store);
+static CLASS_ATTR(probe, S_IWUSR, NULL, memory_probe_store);
 
 static int memory_probe_init(void)
 {
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 5116b78c6325..f5207090885a 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -13,6 +13,7 @@
 #include <linux/nodemask.h>
 #include <linux/cpu.h>
 #include <linux/device.h>
+#include <linux/swap.h>
 
 static struct sysdev_class node_class = {
 	.name = "node",
@@ -61,34 +62,52 @@ static ssize_t node_read_meminfo(struct sys_device * dev,
 	si_meminfo_node(&i, nid);
 
 	n = sprintf(buf, "\n"
-		       "Node %d MemTotal:     %8lu kB\n"
-		       "Node %d MemFree:      %8lu kB\n"
-		       "Node %d MemUsed:      %8lu kB\n"
-		       "Node %d Active:       %8lu kB\n"
-		       "Node %d Inactive:     %8lu kB\n"
+		       "Node %d MemTotal:       %8lu kB\n"
+		       "Node %d MemFree:        %8lu kB\n"
+		       "Node %d MemUsed:        %8lu kB\n"
+		       "Node %d Active:         %8lu kB\n"
+		       "Node %d Inactive:       %8lu kB\n"
+		       "Node %d Active(anon):   %8lu kB\n"
+		       "Node %d Inactive(anon): %8lu kB\n"
+		       "Node %d Active(file):   %8lu kB\n"
+		       "Node %d Inactive(file): %8lu kB\n"
+#ifdef CONFIG_UNEVICTABLE_LRU
+		       "Node %d Unevictable:    %8lu kB\n"
+		       "Node %d Mlocked:        %8lu kB\n"
+#endif
 #ifdef CONFIG_HIGHMEM
-		       "Node %d HighTotal:    %8lu kB\n"
-		       "Node %d HighFree:     %8lu kB\n"
-		       "Node %d LowTotal:     %8lu kB\n"
-		       "Node %d LowFree:      %8lu kB\n"
+		       "Node %d HighTotal:      %8lu kB\n"
+		       "Node %d HighFree:       %8lu kB\n"
+		       "Node %d LowTotal:       %8lu kB\n"
+		       "Node %d LowFree:        %8lu kB\n"
 #endif
-		       "Node %d Dirty:        %8lu kB\n"
-		       "Node %d Writeback:    %8lu kB\n"
-		       "Node %d FilePages:    %8lu kB\n"
-		       "Node %d Mapped:       %8lu kB\n"
-		       "Node %d AnonPages:    %8lu kB\n"
-		       "Node %d PageTables:   %8lu kB\n"
-		       "Node %d NFS_Unstable: %8lu kB\n"
-		       "Node %d Bounce:       %8lu kB\n"
-		       "Node %d WritebackTmp: %8lu kB\n"
-		       "Node %d Slab:         %8lu kB\n"
-		       "Node %d SReclaimable: %8lu kB\n"
-		       "Node %d SUnreclaim:   %8lu kB\n",
+		       "Node %d Dirty:          %8lu kB\n"
+		       "Node %d Writeback:      %8lu kB\n"
+		       "Node %d FilePages:      %8lu kB\n"
+		       "Node %d Mapped:         %8lu kB\n"
+		       "Node %d AnonPages:      %8lu kB\n"
+		       "Node %d PageTables:     %8lu kB\n"
+		       "Node %d NFS_Unstable:   %8lu kB\n"
+		       "Node %d Bounce:         %8lu kB\n"
+		       "Node %d WritebackTmp:   %8lu kB\n"
+		       "Node %d Slab:           %8lu kB\n"
+		       "Node %d SReclaimable:   %8lu kB\n"
+		       "Node %d SUnreclaim:     %8lu kB\n",
 		       nid, K(i.totalram),
 		       nid, K(i.freeram),
 		       nid, K(i.totalram - i.freeram),
-		       nid, K(node_page_state(nid, NR_ACTIVE)),
-		       nid, K(node_page_state(nid, NR_INACTIVE)),
+		       nid, K(node_page_state(nid, NR_ACTIVE_ANON) +
+				node_page_state(nid, NR_ACTIVE_FILE)),
+		       nid, K(node_page_state(nid, NR_INACTIVE_ANON) +
+				node_page_state(nid, NR_INACTIVE_FILE)),
+		       nid, K(node_page_state(nid, NR_ACTIVE_ANON)),
+		       nid, K(node_page_state(nid, NR_INACTIVE_ANON)),
+		       nid, K(node_page_state(nid, NR_ACTIVE_FILE)),
+		       nid, K(node_page_state(nid, NR_INACTIVE_FILE)),
+#ifdef CONFIG_UNEVICTABLE_LRU
+		       nid, K(node_page_state(nid, NR_UNEVICTABLE)),
+		       nid, K(node_page_state(nid, NR_MLOCK)),
+#endif
 #ifdef CONFIG_HIGHMEM
 		       nid, K(i.totalhigh),
 		       nid, K(i.freehigh),
@@ -173,6 +192,8 @@ int register_node(struct node *node, int num, struct node *parent)
 		sysdev_create_file(&node->sysdev, &attr_meminfo);
 		sysdev_create_file(&node->sysdev, &attr_numastat);
 		sysdev_create_file(&node->sysdev, &attr_distance);
+
+		scan_unevictable_register_node(node);
 	}
 	return error;
 }
@@ -192,6 +213,8 @@ void unregister_node(struct node *node)
 	sysdev_remove_file(&node->sysdev, &attr_numastat);
 	sysdev_remove_file(&node->sysdev, &attr_distance);
 
+	scan_unevictable_unregister_node(node);
+
 	sysdev_unregister(&node->sysdev);
 }
 
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index b82654e883a7..d876ad861237 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -90,7 +90,7 @@ static DEVICE_ATTR(state, S_IRUGO, aoedisk_show_state, NULL);
 static DEVICE_ATTR(mac, S_IRUGO, aoedisk_show_mac, NULL);
 static DEVICE_ATTR(netif, S_IRUGO, aoedisk_show_netif, NULL);
 static struct device_attribute dev_attr_firmware_version = {
-	.attr = { .name = "firmware-version", .mode = S_IRUGO, .owner = THIS_MODULE },
+	.attr = { .name = "firmware-version", .mode = S_IRUGO },
 	.show = aoedisk_show_fwver,
 };
 
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 7b3351260d56..9034ca585afd 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -391,7 +391,7 @@ static ssize_t pid_show(struct device *dev,
 }
 
 static struct device_attribute pid_attr = {
-	.attr = { .name = "pid", .mode = S_IRUGO, .owner = THIS_MODULE },
+	.attr = { .name = "pid", .mode = S_IRUGO},
 	.show = pid_show,
 };
 
diff --git a/drivers/char/ds1286.c b/drivers/char/ds1286.c
index 5329d482b582..0a826d7be10e 100644
--- a/drivers/char/ds1286.c
+++ b/drivers/char/ds1286.c
@@ -210,8 +210,8 @@ static int ds1286_ioctl(struct inode *inode, struct file *file,
 		if (sec != 0)
 			return -EINVAL;
 
-		min = BIN2BCD(min);
-		min = BIN2BCD(hrs);
+		min = bin2bcd(min);
+		min = bin2bcd(hrs);
 
 		spin_lock(&ds1286_lock);
 		rtc_write(hrs, RTC_HOURS_ALARM);
@@ -353,7 +353,7 @@ static int ds1286_proc_output(char *buf)
 
 	ds1286_get_time(&tm);
 	hundredth = rtc_read(RTC_HUNDREDTH_SECOND);
-	BCD_TO_BIN(hundredth);
+	hundredth = bcd2bin(hundredth);
 
 	p += sprintf(p,
 	             "rtc_time\t: %02d:%02d:%02d.%02d\n"
@@ -477,12 +477,12 @@ static void ds1286_get_time(struct rtc_time *rtc_tm)
 	rtc_write(save_control, RTC_CMD);
 	spin_unlock_irqrestore(&ds1286_lock, flags);
 
-	BCD_TO_BIN(rtc_tm->tm_sec);
-	BCD_TO_BIN(rtc_tm->tm_min);
-	BCD_TO_BIN(rtc_tm->tm_hour);
-	BCD_TO_BIN(rtc_tm->tm_mday);
-	BCD_TO_BIN(rtc_tm->tm_mon);
-	BCD_TO_BIN(rtc_tm->tm_year);
+	rtc_tm->tm_sec = bcd2bin(rtc_tm->tm_sec);
+	rtc_tm->tm_min = bcd2bin(rtc_tm->tm_min);
+	rtc_tm->tm_hour = bcd2bin(rtc_tm->tm_hour);
+	rtc_tm->tm_mday = bcd2bin(rtc_tm->tm_mday);
+	rtc_tm->tm_mon = bcd2bin(rtc_tm->tm_mon);
+	rtc_tm->tm_year = bcd2bin(rtc_tm->tm_year);
 
 	/*
 	 * Account for differences between how the RTC uses the values
@@ -531,12 +531,12 @@ static int ds1286_set_time(struct rtc_time *rtc_tm)
 	if (yrs >= 100)
 		yrs -= 100;
 
-	BIN_TO_BCD(sec);
-	BIN_TO_BCD(min);
-	BIN_TO_BCD(hrs);
-	BIN_TO_BCD(day);
-	BIN_TO_BCD(mon);
-	BIN_TO_BCD(yrs);
+	sec = bin2bcd(sec);
+	min = bin2bcd(min);
+	hrs = bin2bcd(hrs);
+	day = bin2bcd(day);
+	mon = bin2bcd(mon);
+	yrs = bin2bcd(yrs);
 
 	spin_lock_irqsave(&ds1286_lock, flags);
 	save_control = rtc_read(RTC_CMD);
@@ -572,8 +572,8 @@ static void ds1286_get_alm_time(struct rtc_time *alm_tm)
 	cmd = rtc_read(RTC_CMD);
 	spin_unlock_irqrestore(&ds1286_lock, flags);
 
-	BCD_TO_BIN(alm_tm->tm_min);
-	BCD_TO_BIN(alm_tm->tm_hour);
+	alm_tm->tm_min = bcd2bin(alm_tm->tm_min);
+	alm_tm->tm_hour = bcd2bin(alm_tm->tm_hour);
 	alm_tm->tm_sec = 0;
 }
 
diff --git a/drivers/char/ds1302.c b/drivers/char/ds1302.c
index c5e67a623951..170693c93c73 100644
--- a/drivers/char/ds1302.c
+++ b/drivers/char/ds1302.c
@@ -131,12 +131,12 @@ get_rtc_time(struct rtc_time *rtc_tm)
 
 	local_irq_restore(flags);
 
-	BCD_TO_BIN(rtc_tm->tm_sec);
-	BCD_TO_BIN(rtc_tm->tm_min);
-	BCD_TO_BIN(rtc_tm->tm_hour);
-	BCD_TO_BIN(rtc_tm->tm_mday);
-	BCD_TO_BIN(rtc_tm->tm_mon);
-	BCD_TO_BIN(rtc_tm->tm_year);
+	rtc_tm->tm_sec = bcd2bin(rtc_tm->tm_sec);
+	rtc_tm->tm_min = bcd2bin(rtc_tm->tm_min);
+	rtc_tm->tm_hour = bcd2bin(rtc_tm->tm_hour);
+	rtc_tm->tm_mday = bcd2bin(rtc_tm->tm_mday);
+	rtc_tm->tm_mon = bcd2bin(rtc_tm->tm_mon);
+	rtc_tm->tm_year = bcd2bin(rtc_tm->tm_year);
 
 	/*
 	 * Account for differences between how the RTC uses the values
@@ -211,12 +211,12 @@ static long rtc_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 			else
 				yrs -= 1900;	/* RTC (70, 71, ... 99) */
 
-			BIN_TO_BCD(sec);
-			BIN_TO_BCD(min);
-			BIN_TO_BCD(hrs);
-			BIN_TO_BCD(day);
-			BIN_TO_BCD(mon);
-			BIN_TO_BCD(yrs);
+			sec = bin2bcd(sec);
+			min = bin2bcd(min);
+			hrs = bin2bcd(hrs);
+			day = bin2bcd(day);
+			mon = bin2bcd(mon);
+			yrs = bin2bcd(yrs);
 
 			lock_kernel();
 			local_irq_save(flags);
diff --git a/drivers/char/ip27-rtc.c b/drivers/char/ip27-rtc.c
index ec9d0443d92c..2abd881b4cbc 100644
--- a/drivers/char/ip27-rtc.c
+++ b/drivers/char/ip27-rtc.c
@@ -130,12 +130,12 @@ static long rtc_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		if (yrs >= 100)
 			yrs -= 100;
 
-		sec = BIN2BCD(sec);
-		min = BIN2BCD(min);
-		hrs = BIN2BCD(hrs);
-		day = BIN2BCD(day);
-		mon = BIN2BCD(mon);
-		yrs = BIN2BCD(yrs);
+		sec = bin2bcd(sec);
+		min = bin2bcd(min);
+		hrs = bin2bcd(hrs);
+		day = bin2bcd(day);
+		mon = bin2bcd(mon);
+		yrs = bin2bcd(yrs);
 
 		spin_lock_irq(&rtc_lock);
 		rtc->control |= M48T35_RTC_SET;
@@ -311,12 +311,12 @@ static void get_rtc_time(struct rtc_time *rtc_tm)
 	rtc->control &= ~M48T35_RTC_READ;
 	spin_unlock_irq(&rtc_lock);
 
-	rtc_tm->tm_sec = BCD2BIN(rtc_tm->tm_sec);
-	rtc_tm->tm_min = BCD2BIN(rtc_tm->tm_min);
-	rtc_tm->tm_hour = BCD2BIN(rtc_tm->tm_hour);
-	rtc_tm->tm_mday = BCD2BIN(rtc_tm->tm_mday);
-	rtc_tm->tm_mon = BCD2BIN(rtc_tm->tm_mon);
-	rtc_tm->tm_year = BCD2BIN(rtc_tm->tm_year);
+	rtc_tm->tm_sec = bcd2bin(rtc_tm->tm_sec);
+	rtc_tm->tm_min = bcd2bin(rtc_tm->tm_min);
+	rtc_tm->tm_hour = bcd2bin(rtc_tm->tm_hour);
+	rtc_tm->tm_mday = bcd2bin(rtc_tm->tm_mday);
+	rtc_tm->tm_mon = bcd2bin(rtc_tm->tm_mon);
+	rtc_tm->tm_year = bcd2bin(rtc_tm->tm_year);
 
 	/*
 	 * Account for differences between how the RTC uses the values
diff --git a/drivers/char/pc8736x_gpio.c b/drivers/char/pc8736x_gpio.c
index b930de50407a..3f7da8cf3a80 100644
--- a/drivers/char/pc8736x_gpio.c
+++ b/drivers/char/pc8736x_gpio.c
@@ -41,7 +41,8 @@ static u8 pc8736x_gpio_shadow[4];
 #define SIO_BASE2       0x4E	/* alt command-reg to check */
 
 #define SIO_SID		0x20	/* SuperI/O ID Register */
-#define SIO_SID_VALUE	0xe9	/* Expected value in SuperI/O ID Register */
+#define SIO_SID_PC87365	0xe5	/* Expected value in ID Register for PC87365 */
+#define SIO_SID_PC87366	0xe9	/* Expected value in ID Register for PC87366 */
 
 #define SIO_CF1		0x21	/* chip config, bit0 is chip enable */
 
@@ -91,13 +92,17 @@ static inline int superio_inb(int addr)
 
 static int pc8736x_superio_present(void)
 {
+	int id;
+
 	/* try the 2 possible values, read a hardware reg to verify */
 	superio_cmd = SIO_BASE1;
-	if (superio_inb(SIO_SID) == SIO_SID_VALUE)
+	id = superio_inb(SIO_SID);
+	if (id == SIO_SID_PC87365 || id == SIO_SID_PC87366)
 		return superio_cmd;
 
 	superio_cmd = SIO_BASE2;
-	if (superio_inb(SIO_SID) == SIO_SID_VALUE)
+	id = superio_inb(SIO_SID);
+	if (id == SIO_SID_PC87365 || id == SIO_SID_PC87366)
 		return superio_cmd;
 
 	return 0;
diff --git a/drivers/char/rtc.c b/drivers/char/rtc.c
index 17683de95717..32dc89720d58 100644
--- a/drivers/char/rtc.c
+++ b/drivers/char/rtc.c
@@ -518,17 +518,17 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel)
 		if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) ||
 							RTC_ALWAYS_BCD) {
 			if (sec < 60)
-				BIN_TO_BCD(sec);
+				sec = bin2bcd(sec);
 			else
 				sec = 0xff;
 
 			if (min < 60)
-				BIN_TO_BCD(min);
+				min = bin2bcd(min);
 			else
 				min = 0xff;
 
 			if (hrs < 24)
-				BIN_TO_BCD(hrs);
+				hrs = bin2bcd(hrs);
 			else
 				hrs = 0xff;
 		}
@@ -614,12 +614,12 @@ static int rtc_do_ioctl(unsigned int cmd, unsigned long arg, int kernel)
 
 		if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY)
 		    || RTC_ALWAYS_BCD) {
-			BIN_TO_BCD(sec);
-			BIN_TO_BCD(min);
-			BIN_TO_BCD(hrs);
-			BIN_TO_BCD(day);
-			BIN_TO_BCD(mon);
-			BIN_TO_BCD(yrs);
+			sec = bin2bcd(sec);
+			min = bin2bcd(min);
+			hrs = bin2bcd(hrs);
+			day = bin2bcd(day);
+			mon = bin2bcd(mon);
+			yrs = bin2bcd(yrs);
 		}
 
 		save_control = CMOS_READ(RTC_CONTROL);
@@ -1099,7 +1099,7 @@ no_irq:
 	spin_unlock_irq(&rtc_lock);
 
 	if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
-		BCD_TO_BIN(year);       /* This should never happen... */
+		year = bcd2bin(year);       /* This should never happen... */
 
 	if (year < 20) {
 		epoch = 2000;
@@ -1352,13 +1352,13 @@ static void rtc_get_rtc_time(struct rtc_time *rtc_tm)
 	spin_unlock_irqrestore(&rtc_lock, flags);
 
 	if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
-		BCD_TO_BIN(rtc_tm->tm_sec);
-		BCD_TO_BIN(rtc_tm->tm_min);
-		BCD_TO_BIN(rtc_tm->tm_hour);
-		BCD_TO_BIN(rtc_tm->tm_mday);
-		BCD_TO_BIN(rtc_tm->tm_mon);
-		BCD_TO_BIN(rtc_tm->tm_year);
-		BCD_TO_BIN(rtc_tm->tm_wday);
+		rtc_tm->tm_sec = bcd2bin(rtc_tm->tm_sec);
+		rtc_tm->tm_min = bcd2bin(rtc_tm->tm_min);
+		rtc_tm->tm_hour = bcd2bin(rtc_tm->tm_hour);
+		rtc_tm->tm_mday = bcd2bin(rtc_tm->tm_mday);
+		rtc_tm->tm_mon = bcd2bin(rtc_tm->tm_mon);
+		rtc_tm->tm_year = bcd2bin(rtc_tm->tm_year);
+		rtc_tm->tm_wday = bcd2bin(rtc_tm->tm_wday);
 	}
 
 #ifdef CONFIG_MACH_DECSTATION
@@ -1392,9 +1392,9 @@ static void get_rtc_alm_time(struct rtc_time *alm_tm)
 	spin_unlock_irq(&rtc_lock);
 
 	if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
-		BCD_TO_BIN(alm_tm->tm_sec);
-		BCD_TO_BIN(alm_tm->tm_min);
-		BCD_TO_BIN(alm_tm->tm_hour);
+		alm_tm->tm_sec = bcd2bin(alm_tm->tm_sec);
+		alm_tm->tm_min = bcd2bin(alm_tm->tm_min);
+		alm_tm->tm_hour = bcd2bin(alm_tm->tm_hour);
 	}
 }
 
diff --git a/drivers/char/sx.c b/drivers/char/sx.c
index 5b8d7a1aa3e6..ba4e86281fbf 100644
--- a/drivers/char/sx.c
+++ b/drivers/char/sx.c
@@ -2504,7 +2504,7 @@ static void __devexit sx_remove_card(struct sx_board *board,
 		del_timer(&board->timer);
 		if (pdev) {
 #ifdef CONFIG_PCI
-			pci_iounmap(pdev, board->base2);
+			iounmap(board->base2);
 			pci_release_region(pdev, IS_CF_BOARD(board) ? 3 : 2);
 #endif
 		} else {
@@ -2677,7 +2677,7 @@ static int __devinit sx_pci_probe(struct pci_dev *pdev,
 	}
 	board->hw_base = pci_resource_start(pdev, reg);
 	board->base2 =
-	board->base = pci_iomap(pdev, reg, WINDOW_LEN(board));
+	board->base = ioremap_nocache(board->hw_base, WINDOW_LEN(board));
 	if (!board->base) {
 		dev_err(&pdev->dev, "ioremap failed\n");
 		goto err_reg;
@@ -2703,7 +2703,7 @@ static int __devinit sx_pci_probe(struct pci_dev *pdev,
 
 	return 0;
 err_unmap:
-	pci_iounmap(pdev, board->base2);
+	iounmap(board->base2);
 err_reg:
 	pci_release_region(pdev, reg);
 err_flag:
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index dce4cc0e6953..d0c0d64ed366 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -168,7 +168,7 @@ static void sysrq_handle_show_timers(int key, struct tty_struct *tty)
 static struct sysrq_key_op sysrq_show_timers_op = {
 	.handler	= sysrq_handle_show_timers,
 	.help_msg	= "show-all-timers(Q)",
-	.action_msg	= "Show Pending Timers",
+	.action_msg	= "Show pending hrtimers (no others)",
 };
 
 static void sysrq_handle_mountro(int key, struct tty_struct *tty)
diff --git a/drivers/char/tpm/tpm.c b/drivers/char/tpm/tpm.c
index e70d13defde4..9c47dc48c9fd 100644
--- a/drivers/char/tpm/tpm.c
+++ b/drivers/char/tpm/tpm.c
@@ -1157,7 +1157,7 @@ EXPORT_SYMBOL_GPL(tpm_dev_vendor_release);
  * Once all references to platform device are down to 0,
  * release all allocated structures.
  */
-static void tpm_dev_release(struct device *dev)
+void tpm_dev_release(struct device *dev)
 {
 	struct tpm_chip *chip = dev_get_drvdata(dev);
 
diff --git a/drivers/edac/cell_edac.c b/drivers/edac/cell_edac.c
index 0e024fe2d8c4..887072f5dc8b 100644
--- a/drivers/edac/cell_edac.c
+++ b/drivers/edac/cell_edac.c
@@ -142,7 +142,7 @@ static void __devinit cell_edac_init_csrows(struct mem_ctl_info *mci)
 		csrow->nr_pages = (r.end - r.start + 1) >> PAGE_SHIFT;
 		csrow->last_page = csrow->first_page + csrow->nr_pages - 1;
 		csrow->mtype = MEM_XDR;
-		csrow->edac_mode = EDAC_FLAG_EC | EDAC_FLAG_SECDED;
+		csrow->edac_mode = EDAC_SECDED;
 		dev_dbg(mci->dev,
 			"Initialized on node %d, chanmask=0x%x,"
 			" first_page=0x%lx, nr_pages=0x%x\n",
diff --git a/drivers/firmware/iscsi_ibft.c b/drivers/firmware/iscsi_ibft.c
index deb154aa47c4..4353414a0b77 100644
--- a/drivers/firmware/iscsi_ibft.c
+++ b/drivers/firmware/iscsi_ibft.c
@@ -732,7 +732,6 @@ static int __init ibft_create_attribute(struct ibft_kobject *kobj_data,
 
 	attr->attr.name = name;
 	attr->attr.mode = S_IRUSR;
-	attr->attr.owner = THIS_MODULE;
 
 	attr->hdr = hdr;
 	attr->show = show;
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 9112830107a5..22edc4273ef6 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -248,7 +248,7 @@ static ssize_t gpio_value_show(struct device *dev,
 	if (!test_bit(FLAG_EXPORT, &desc->flags))
 		status = -EIO;
 	else
-		status = sprintf(buf, "%d\n", gpio_get_value_cansleep(gpio));
+		status = sprintf(buf, "%d\n", !!gpio_get_value_cansleep(gpio));
 
 	mutex_unlock(&sysfs_lock);
 	return status;
@@ -1105,7 +1105,7 @@ int gpio_get_value_cansleep(unsigned gpio)
 
 	might_sleep_if(extra_checks);
 	chip = gpio_to_chip(gpio);
-	return chip->get(chip, gpio - chip->base);
+	return chip->get ? chip->get(chip, gpio - chip->base) : 0;
 }
 EXPORT_SYMBOL_GPL(gpio_get_value_cansleep);
 
diff --git a/drivers/hwmon/applesmc.c b/drivers/hwmon/applesmc.c
index b06b8e090a27..bc011da79e14 100644
--- a/drivers/hwmon/applesmc.c
+++ b/drivers/hwmon/applesmc.c
@@ -49,6 +49,9 @@
 
 #define APPLESMC_MAX_DATA_LENGTH 32
 
+#define APPLESMC_MIN_WAIT	0x0040
+#define APPLESMC_MAX_WAIT	0x8000
+
 #define APPLESMC_STATUS_MASK	0x0f
 #define APPLESMC_READ_CMD	0x10
 #define APPLESMC_WRITE_CMD	0x11
@@ -57,8 +60,8 @@
 
 #define KEY_COUNT_KEY		"#KEY" /* r-o ui32 */
 
-#define LIGHT_SENSOR_LEFT_KEY	"ALV0" /* r-o {alv (6 bytes) */
-#define LIGHT_SENSOR_RIGHT_KEY	"ALV1" /* r-o {alv (6 bytes) */
+#define LIGHT_SENSOR_LEFT_KEY	"ALV0" /* r-o {alv (6-10 bytes) */
+#define LIGHT_SENSOR_RIGHT_KEY	"ALV1" /* r-o {alv (6-10 bytes) */
 #define BACKLIGHT_KEY		"LKSB" /* w-o {lkb (2 bytes) */
 
 #define CLAMSHELL_KEY		"MSLD" /* r-o ui8 (unused) */
@@ -104,6 +107,15 @@ static const char* temperature_sensors_sets[][36] = {
 /* Set 6: Macbook3 set */
 	{ "TB0T", "TC0D", "TC0P", "TM0P", "TN0P", "TTF0", "TW0P", "Th0H",
 	  "Th0S", "Th1H", NULL },
+/* Set 7: Macbook Air */
+	{ "TB0T", "TB1S", "TB1T", "TB2S", "TB2T", "TC0D", "TC0P", "TCFP",
+	  "TTF0", "TW0P", "Th0H", "Tp0P", "TpFP", "Ts0P", "Ts0S", NULL },
+/* Set 8: Macbook Pro 4,1 (Penryn) */
+	{ "TB0T", "TC0D", "TC0P", "TG0D", "TG0H", "TTF0", "TW0P", "Th0H",
+	  "Th1H", "Th2H", "Tm0P", "Ts0P", NULL },
+/* Set 9: Macbook Pro 3,1 (Santa Rosa) */
+	{ "TALP", "TB0T", "TC0D", "TC0P", "TG0D", "TG0H", "TTF0", "TW0P",
+	  "Th0H", "Th1H", "Th2H", "Tm0P", "Ts0P", NULL },
 };
 
 /* List of keys used to read/write fan speeds */
@@ -163,25 +175,25 @@ static unsigned int key_at_index;
 static struct workqueue_struct *applesmc_led_wq;
 
 /*
- * __wait_status - Wait up to 2ms for the status port to get a certain value
+ * __wait_status - Wait up to 32ms for the status port to get a certain value
  * (masked with 0x0f), returning zero if the value is obtained.  Callers must
  * hold applesmc_lock.
  */
 static int __wait_status(u8 val)
 {
-	unsigned int i;
+	int us;
 
 	val = val & APPLESMC_STATUS_MASK;
 
-	for (i = 0; i < 200; i++) {
+	for (us = APPLESMC_MIN_WAIT; us < APPLESMC_MAX_WAIT; us <<= 1) {
+		udelay(us);
 		if ((inb(APPLESMC_CMD_PORT) & APPLESMC_STATUS_MASK) == val) {
 			if (debug)
 				printk(KERN_DEBUG
-						"Waited %d us for status %x\n",
-						i*10, val);
+					"Waited %d us for status %x\n",
+					2 * us - APPLESMC_MIN_WAIT, val);
 			return 0;
 		}
-		udelay(10);
 	}
 
 	printk(KERN_WARNING "applesmc: wait status failed: %x != %x\n",
@@ -191,6 +203,25 @@ static int __wait_status(u8 val)
 }
 
 /*
+ * special treatment of command port - on newer macbooks, it seems necessary
+ * to resend the command byte before polling the status again. Callers must
+ * hold applesmc_lock.
+ */
+static int send_command(u8 cmd)
+{
+	int us;
+	for (us = APPLESMC_MIN_WAIT; us < APPLESMC_MAX_WAIT; us <<= 1) {
+		outb(cmd, APPLESMC_CMD_PORT);
+		udelay(us);
+		if ((inb(APPLESMC_CMD_PORT) & APPLESMC_STATUS_MASK) == 0x0c)
+			return 0;
+	}
+	printk(KERN_WARNING "applesmc: command failed: %x -> %x\n",
+		cmd, inb(APPLESMC_CMD_PORT));
+	return -EIO;
+}
+
+/*
  * applesmc_read_key - reads len bytes from a given key, and put them in buffer.
  * Returns zero on success or a negative error on failure. Callers must
  * hold applesmc_lock.
@@ -205,8 +236,7 @@ static int applesmc_read_key(const char* key, u8* buffer, u8 len)
 		return -EINVAL;
 	}
 
-	outb(APPLESMC_READ_CMD, APPLESMC_CMD_PORT);
-	if (__wait_status(0x0c))
+	if (send_command(APPLESMC_READ_CMD))
 		return -EIO;
 
 	for (i = 0; i < 4; i++) {
@@ -249,8 +279,7 @@ static int applesmc_write_key(const char* key, u8* buffer, u8 len)
 		return -EINVAL;
 	}
 
-	outb(APPLESMC_WRITE_CMD, APPLESMC_CMD_PORT);
-	if (__wait_status(0x0c))
+	if (send_command(APPLESMC_WRITE_CMD))
 		return -EIO;
 
 	for (i = 0; i < 4; i++) {
@@ -284,8 +313,7 @@ static int applesmc_get_key_at_index(int index, char* key)
 	readkey[2] = index >> 8;
 	readkey[3] = index;
 
-	outb(APPLESMC_GET_KEY_BY_INDEX_CMD, APPLESMC_CMD_PORT);
-	if (__wait_status(0x0c))
+	if (send_command(APPLESMC_GET_KEY_BY_INDEX_CMD))
 		return -EIO;
 
 	for (i = 0; i < 4; i++) {
@@ -315,8 +343,7 @@ static int applesmc_get_key_type(char* key, char* type)
 {
 	int i;
 
-	outb(APPLESMC_GET_KEY_TYPE_CMD, APPLESMC_CMD_PORT);
-	if (__wait_status(0x0c))
+	if (send_command(APPLESMC_GET_KEY_TYPE_CMD))
 		return -EIO;
 
 	for (i = 0; i < 4; i++) {
@@ -325,7 +352,7 @@ static int applesmc_get_key_type(char* key, char* type)
 			return -EIO;
 	}
 
-	outb(5, APPLESMC_DATA_PORT);
+	outb(6, APPLESMC_DATA_PORT);
 
 	for (i = 0; i < 6; i++) {
 		if (__wait_status(0x05))
@@ -527,17 +554,27 @@ out:
 static ssize_t applesmc_light_show(struct device *dev,
 				struct device_attribute *attr, char *sysfsbuf)
 {
+	static int data_length;
 	int ret;
 	u8 left = 0, right = 0;
-	u8 buffer[6];
+	u8 buffer[10], query[6];
 
 	mutex_lock(&applesmc_lock);
 
-	ret = applesmc_read_key(LIGHT_SENSOR_LEFT_KEY, buffer, 6);
+	if (!data_length) {
+		ret = applesmc_get_key_type(LIGHT_SENSOR_LEFT_KEY, query);
+		if (ret)
+			goto out;
+		data_length = clamp_val(query[0], 0, 10);
+		printk(KERN_INFO "applesmc: light sensor data length set to "
+			"%d\n", data_length);
+	}
+
+	ret = applesmc_read_key(LIGHT_SENSOR_LEFT_KEY, buffer, data_length);
 	left = buffer[2];
 	if (ret)
 		goto out;
-	ret = applesmc_read_key(LIGHT_SENSOR_RIGHT_KEY, buffer, 6);
+	ret = applesmc_read_key(LIGHT_SENSOR_RIGHT_KEY, buffer, data_length);
 	right = buffer[2];
 
 out:
@@ -1233,39 +1270,57 @@ static __initdata struct dmi_match_data applesmc_dmi_data[] = {
 	{ .accelerometer = 0, .light = 0, .temperature_set = 5 },
 /* MacBook3: accelerometer and temperature set 6 */
 	{ .accelerometer = 1, .light = 0, .temperature_set = 6 },
+/* MacBook Air: accelerometer, backlight and temperature set 7 */
+	{ .accelerometer = 1, .light = 1, .temperature_set = 7 },
+/* MacBook Pro 4: accelerometer, backlight and temperature set 8 */
+	{ .accelerometer = 1, .light = 1, .temperature_set = 8 },
+/* MacBook Pro 3: accelerometer, backlight and temperature set 9 */
+	{ .accelerometer = 1, .light = 1, .temperature_set = 9 },
 };
 
 /* Note that DMI_MATCH(...,"MacBook") will match "MacBookPro1,1".
  * So we need to put "Apple MacBook Pro" before "Apple MacBook". */
 static __initdata struct dmi_system_id applesmc_whitelist[] = {
+	{ applesmc_dmi_match, "Apple MacBook Air", {
+	  DMI_MATCH(DMI_BOARD_VENDOR, "Apple"),
+	  DMI_MATCH(DMI_PRODUCT_NAME, "MacBookAir") },
+		&applesmc_dmi_data[7]},
+	{ applesmc_dmi_match, "Apple MacBook Pro 4", {
+	  DMI_MATCH(DMI_BOARD_VENDOR, "Apple"),
+	  DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro4") },
+		&applesmc_dmi_data[8]},
+	{ applesmc_dmi_match, "Apple MacBook Pro 3", {
+	  DMI_MATCH(DMI_BOARD_VENDOR, "Apple"),
+	  DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro3") },
+		&applesmc_dmi_data[9]},
 	{ applesmc_dmi_match, "Apple MacBook Pro", {
 	  DMI_MATCH(DMI_BOARD_VENDOR,"Apple"),
 	  DMI_MATCH(DMI_PRODUCT_NAME,"MacBookPro") },
-		(void*)&applesmc_dmi_data[0]},
+		&applesmc_dmi_data[0]},
 	{ applesmc_dmi_match, "Apple MacBook (v2)", {
 	  DMI_MATCH(DMI_BOARD_VENDOR,"Apple"),
 	  DMI_MATCH(DMI_PRODUCT_NAME,"MacBook2") },
-		(void*)&applesmc_dmi_data[1]},
+		&applesmc_dmi_data[1]},
 	{ applesmc_dmi_match, "Apple MacBook (v3)", {
 	  DMI_MATCH(DMI_BOARD_VENDOR,"Apple"),
 	  DMI_MATCH(DMI_PRODUCT_NAME,"MacBook3") },
-		(void*)&applesmc_dmi_data[6]},
+		&applesmc_dmi_data[6]},
 	{ applesmc_dmi_match, "Apple MacBook", {
 	  DMI_MATCH(DMI_BOARD_VENDOR,"Apple"),
 	  DMI_MATCH(DMI_PRODUCT_NAME,"MacBook") },
-		(void*)&applesmc_dmi_data[2]},
+		&applesmc_dmi_data[2]},
 	{ applesmc_dmi_match, "Apple Macmini", {
 	  DMI_MATCH(DMI_BOARD_VENDOR,"Apple"),
 	  DMI_MATCH(DMI_PRODUCT_NAME,"Macmini") },
-		(void*)&applesmc_dmi_data[3]},
+		&applesmc_dmi_data[3]},
 	{ applesmc_dmi_match, "Apple MacPro2", {
 	  DMI_MATCH(DMI_BOARD_VENDOR,"Apple"),
 	  DMI_MATCH(DMI_PRODUCT_NAME,"MacPro2") },
-		(void*)&applesmc_dmi_data[4]},
+		&applesmc_dmi_data[4]},
 	{ applesmc_dmi_match, "Apple iMac", {
 	  DMI_MATCH(DMI_BOARD_VENDOR,"Apple"),
 	  DMI_MATCH(DMI_PRODUCT_NAME,"iMac") },
-		(void*)&applesmc_dmi_data[5]},
+		&applesmc_dmi_data[5]},
 	{ .ident = NULL }
 };
 
diff --git a/drivers/hwmon/pc87360.c b/drivers/hwmon/pc87360.c
index 9b462bb13fa3..5fbfa34c110e 100644
--- a/drivers/hwmon/pc87360.c
+++ b/drivers/hwmon/pc87360.c
@@ -75,7 +75,8 @@ MODULE_PARM_DESC(force_id, "Override the detected device ID");
 #define FSCM	0x09	/* Logical device: fans */
 #define VLM	0x0d	/* Logical device: voltages */
 #define TMS	0x0e	/* Logical device: temperatures */
-static const u8 logdev[3] = { FSCM, VLM, TMS };
+#define LDNI_MAX 3
+static const u8 logdev[LDNI_MAX] = { FSCM, VLM, TMS };
 
 #define LD_FAN		0
 #define LD_IN		1
@@ -489,11 +490,66 @@ static struct sensor_device_attribute in_max[] = {
 	SENSOR_ATTR(in10_max, S_IWUSR | S_IRUGO, show_in_max, set_in_max, 10),
 };
 
+/* (temp & vin) channel status register alarm bits (pdf sec.11.5.12) */
+#define CHAN_ALM_MIN	0x02	/* min limit crossed */
+#define CHAN_ALM_MAX	0x04	/* max limit exceeded */
+#define TEMP_ALM_CRIT	0x08	/* temp crit exceeded (temp only) */
+
+/* show_in_min/max_alarm() reads data from the per-channel status
+   register (sec 11.5.12), not the vin event status registers (sec
+   11.5.2) that (legacy) show_in_alarm() resds (via data->in_alarms) */
+
+static ssize_t show_in_min_alarm(struct device *dev,
+			struct device_attribute *devattr, char *buf)
+{
+	struct pc87360_data *data = pc87360_update_device(dev);
+	unsigned nr = to_sensor_dev_attr(devattr)->index;
+
+	return sprintf(buf, "%u\n", !!(data->in_status[nr] & CHAN_ALM_MIN));
+}
+static ssize_t show_in_max_alarm(struct device *dev,
+			struct device_attribute *devattr, char *buf)
+{
+	struct pc87360_data *data = pc87360_update_device(dev);
+	unsigned nr = to_sensor_dev_attr(devattr)->index;
+
+	return sprintf(buf, "%u\n", !!(data->in_status[nr] & CHAN_ALM_MAX));
+}
+
+static struct sensor_device_attribute in_min_alarm[] = {
+	SENSOR_ATTR(in0_min_alarm, S_IRUGO, show_in_min_alarm, NULL, 0),
+	SENSOR_ATTR(in1_min_alarm, S_IRUGO, show_in_min_alarm, NULL, 1),
+	SENSOR_ATTR(in2_min_alarm, S_IRUGO, show_in_min_alarm, NULL, 2),
+	SENSOR_ATTR(in3_min_alarm, S_IRUGO, show_in_min_alarm, NULL, 3),
+	SENSOR_ATTR(in4_min_alarm, S_IRUGO, show_in_min_alarm, NULL, 4),
+	SENSOR_ATTR(in5_min_alarm, S_IRUGO, show_in_min_alarm, NULL, 5),
+	SENSOR_ATTR(in6_min_alarm, S_IRUGO, show_in_min_alarm, NULL, 6),
+	SENSOR_ATTR(in7_min_alarm, S_IRUGO, show_in_min_alarm, NULL, 7),
+	SENSOR_ATTR(in8_min_alarm, S_IRUGO, show_in_min_alarm, NULL, 8),
+	SENSOR_ATTR(in9_min_alarm, S_IRUGO, show_in_min_alarm, NULL, 9),
+	SENSOR_ATTR(in10_min_alarm, S_IRUGO, show_in_min_alarm, NULL, 10),
+};
+static struct sensor_device_attribute in_max_alarm[] = {
+	SENSOR_ATTR(in0_max_alarm, S_IRUGO, show_in_max_alarm, NULL, 0),
+	SENSOR_ATTR(in1_max_alarm, S_IRUGO, show_in_max_alarm, NULL, 1),
+	SENSOR_ATTR(in2_max_alarm, S_IRUGO, show_in_max_alarm, NULL, 2),
+	SENSOR_ATTR(in3_max_alarm, S_IRUGO, show_in_max_alarm, NULL, 3),
+	SENSOR_ATTR(in4_max_alarm, S_IRUGO, show_in_max_alarm, NULL, 4),
+	SENSOR_ATTR(in5_max_alarm, S_IRUGO, show_in_max_alarm, NULL, 5),
+	SENSOR_ATTR(in6_max_alarm, S_IRUGO, show_in_max_alarm, NULL, 6),
+	SENSOR_ATTR(in7_max_alarm, S_IRUGO, show_in_max_alarm, NULL, 7),
+	SENSOR_ATTR(in8_max_alarm, S_IRUGO, show_in_max_alarm, NULL, 8),
+	SENSOR_ATTR(in9_max_alarm, S_IRUGO, show_in_max_alarm, NULL, 9),
+	SENSOR_ATTR(in10_max_alarm, S_IRUGO, show_in_max_alarm, NULL, 10),
+};
+
 #define VIN_UNIT_ATTRS(X) \
 	&in_input[X].dev_attr.attr,	\
 	&in_status[X].dev_attr.attr,	\
 	&in_min[X].dev_attr.attr,	\
-	&in_max[X].dev_attr.attr
+	&in_max[X].dev_attr.attr,	\
+	&in_min_alarm[X].dev_attr.attr,	\
+	&in_max_alarm[X].dev_attr.attr
 
 static ssize_t show_vid(struct device *dev, struct device_attribute *attr, char *buf)
 {
@@ -658,12 +714,68 @@ static struct sensor_device_attribute therm_crit[] = {
 		    show_therm_crit, set_therm_crit, 2+11),
 };
 
+/* show_therm_min/max_alarm() reads data from the per-channel voltage
+   status register (sec 11.5.12) */
+
+static ssize_t show_therm_min_alarm(struct device *dev,
+				struct device_attribute *devattr, char *buf)
+{
+	struct pc87360_data *data = pc87360_update_device(dev);
+	unsigned nr = to_sensor_dev_attr(devattr)->index;
+
+	return sprintf(buf, "%u\n", !!(data->in_status[nr] & CHAN_ALM_MIN));
+}
+static ssize_t show_therm_max_alarm(struct device *dev,
+				struct device_attribute *devattr, char *buf)
+{
+	struct pc87360_data *data = pc87360_update_device(dev);
+	unsigned nr = to_sensor_dev_attr(devattr)->index;
+
+	return sprintf(buf, "%u\n", !!(data->in_status[nr] & CHAN_ALM_MAX));
+}
+static ssize_t show_therm_crit_alarm(struct device *dev,
+				struct device_attribute *devattr, char *buf)
+{
+	struct pc87360_data *data = pc87360_update_device(dev);
+	unsigned nr = to_sensor_dev_attr(devattr)->index;
+
+	return sprintf(buf, "%u\n", !!(data->in_status[nr] & TEMP_ALM_CRIT));
+}
+
+static struct sensor_device_attribute therm_min_alarm[] = {
+	SENSOR_ATTR(temp4_min_alarm, S_IRUGO,
+		    show_therm_min_alarm, NULL, 0+11),
+	SENSOR_ATTR(temp5_min_alarm, S_IRUGO,
+		    show_therm_min_alarm, NULL, 1+11),
+	SENSOR_ATTR(temp6_min_alarm, S_IRUGO,
+		    show_therm_min_alarm, NULL, 2+11),
+};
+static struct sensor_device_attribute therm_max_alarm[] = {
+	SENSOR_ATTR(temp4_max_alarm, S_IRUGO,
+		    show_therm_max_alarm, NULL, 0+11),
+	SENSOR_ATTR(temp5_max_alarm, S_IRUGO,
+		    show_therm_max_alarm, NULL, 1+11),
+	SENSOR_ATTR(temp6_max_alarm, S_IRUGO,
+		    show_therm_max_alarm, NULL, 2+11),
+};
+static struct sensor_device_attribute therm_crit_alarm[] = {
+	SENSOR_ATTR(temp4_crit_alarm, S_IRUGO,
+		    show_therm_crit_alarm, NULL, 0+11),
+	SENSOR_ATTR(temp5_crit_alarm, S_IRUGO,
+		    show_therm_crit_alarm, NULL, 1+11),
+	SENSOR_ATTR(temp6_crit_alarm, S_IRUGO,
+		    show_therm_crit_alarm, NULL, 2+11),
+};
+
 #define THERM_UNIT_ATTRS(X) \
 	&therm_input[X].dev_attr.attr,	\
 	&therm_status[X].dev_attr.attr,	\
 	&therm_min[X].dev_attr.attr,	\
 	&therm_max[X].dev_attr.attr,	\
-	&therm_crit[X].dev_attr.attr
+	&therm_crit[X].dev_attr.attr,	\
+	&therm_min_alarm[X].dev_attr.attr, \
+	&therm_max_alarm[X].dev_attr.attr, \
+	&therm_crit_alarm[X].dev_attr.attr
 
 static struct attribute * pc8736x_therm_attr_array[] = {
 	THERM_UNIT_ATTRS(0),
@@ -790,12 +902,76 @@ static ssize_t show_temp_alarms(struct device *dev, struct device_attribute *att
 }
 static DEVICE_ATTR(alarms_temp, S_IRUGO, show_temp_alarms, NULL);
 
+/* show_temp_min/max_alarm() reads data from the per-channel status
+   register (sec 12.3.7), not the temp event status registers (sec
+   12.3.2) that show_temp_alarm() reads (via data->temp_alarms) */
+
+static ssize_t show_temp_min_alarm(struct device *dev,
+			struct device_attribute *devattr, char *buf)
+{
+	struct pc87360_data *data = pc87360_update_device(dev);
+	unsigned nr = to_sensor_dev_attr(devattr)->index;
+
+	return sprintf(buf, "%u\n", !!(data->temp_status[nr] & CHAN_ALM_MIN));
+}
+static ssize_t show_temp_max_alarm(struct device *dev,
+			struct device_attribute *devattr, char *buf)
+{
+	struct pc87360_data *data = pc87360_update_device(dev);
+	unsigned nr = to_sensor_dev_attr(devattr)->index;
+
+	return sprintf(buf, "%u\n", !!(data->temp_status[nr] & CHAN_ALM_MAX));
+}
+static ssize_t show_temp_crit_alarm(struct device *dev,
+			struct device_attribute *devattr, char *buf)
+{
+	struct pc87360_data *data = pc87360_update_device(dev);
+	unsigned nr = to_sensor_dev_attr(devattr)->index;
+
+	return sprintf(buf, "%u\n", !!(data->temp_status[nr] & TEMP_ALM_CRIT));
+}
+
+static struct sensor_device_attribute temp_min_alarm[] = {
+	SENSOR_ATTR(temp1_min_alarm, S_IRUGO, show_temp_min_alarm, NULL, 0),
+	SENSOR_ATTR(temp2_min_alarm, S_IRUGO, show_temp_min_alarm, NULL, 1),
+	SENSOR_ATTR(temp3_min_alarm, S_IRUGO, show_temp_min_alarm, NULL, 2),
+};
+static struct sensor_device_attribute temp_max_alarm[] = {
+	SENSOR_ATTR(temp1_max_alarm, S_IRUGO, show_temp_max_alarm, NULL, 0),
+	SENSOR_ATTR(temp2_max_alarm, S_IRUGO, show_temp_max_alarm, NULL, 1),
+	SENSOR_ATTR(temp3_max_alarm, S_IRUGO, show_temp_max_alarm, NULL, 2),
+};
+static struct sensor_device_attribute temp_crit_alarm[] = {
+	SENSOR_ATTR(temp1_crit_alarm, S_IRUGO, show_temp_crit_alarm, NULL, 0),
+	SENSOR_ATTR(temp2_crit_alarm, S_IRUGO, show_temp_crit_alarm, NULL, 1),
+	SENSOR_ATTR(temp3_crit_alarm, S_IRUGO, show_temp_crit_alarm, NULL, 2),
+};
+
+#define TEMP_FAULT	0x40	/* open diode */
+static ssize_t show_temp_fault(struct device *dev,
+			struct device_attribute *devattr, char *buf)
+{
+	struct pc87360_data *data = pc87360_update_device(dev);
+	unsigned nr = to_sensor_dev_attr(devattr)->index;
+
+	return sprintf(buf, "%u\n", !!(data->temp_status[nr] & TEMP_FAULT));
+}
+static struct sensor_device_attribute temp_fault[] = {
+	SENSOR_ATTR(temp1_fault, S_IRUGO, show_temp_fault, NULL, 0),
+	SENSOR_ATTR(temp2_fault, S_IRUGO, show_temp_fault, NULL, 1),
+	SENSOR_ATTR(temp3_fault, S_IRUGO, show_temp_fault, NULL, 2),
+};
+
 #define TEMP_UNIT_ATTRS(X) \
 	&temp_input[X].dev_attr.attr,	\
 	&temp_status[X].dev_attr.attr,	\
 	&temp_min[X].dev_attr.attr,	\
 	&temp_max[X].dev_attr.attr,	\
-	&temp_crit[X].dev_attr.attr
+	&temp_crit[X].dev_attr.attr,	\
+	&temp_min_alarm[X].dev_attr.attr, \
+	&temp_max_alarm[X].dev_attr.attr, \
+	&temp_crit_alarm[X].dev_attr.attr, \
+	&temp_fault[X].dev_attr.attr
 
 static struct attribute * pc8736x_temp_attr_array[] = {
 	TEMP_UNIT_ATTRS(0),
@@ -809,8 +985,8 @@ static const struct attribute_group pc8736x_temp_group = {
 	.attrs = pc8736x_temp_attr_array,
 };
 
-static ssize_t show_name(struct device *dev, struct device_attribute
-			 *devattr, char *buf)
+static ssize_t show_name(struct device *dev,
+			struct device_attribute *devattr, char *buf)
 {
 	struct pc87360_data *data = dev_get_drvdata(dev);
 	return sprintf(buf, "%s\n", data->name);
@@ -955,7 +1131,7 @@ static int __devinit pc87360_probe(struct platform_device *pdev)
 	mutex_init(&data->update_lock);
 	platform_set_drvdata(pdev, data);
 
-	for (i = 0; i < 3; i++) {
+	for (i = 0; i < LDNI_MAX; i++) {
 		if (((data->address[i] = extra_isa[i]))
 		 && !request_region(extra_isa[i], PC87360_EXTENT,
 		 		    pc87360_driver.driver.name)) {
@@ -1031,7 +1207,15 @@ static int __devinit pc87360_probe(struct platform_device *pdev)
 			    || (err = device_create_file(dev,
 					&temp_crit[i].dev_attr))
 			    || (err = device_create_file(dev,
-					&temp_status[i].dev_attr)))
+					&temp_status[i].dev_attr))
+			    || (err = device_create_file(dev,
+					&temp_min_alarm[i].dev_attr))
+			    || (err = device_create_file(dev,
+					&temp_max_alarm[i].dev_attr))
+			    || (err = device_create_file(dev,
+					&temp_crit_alarm[i].dev_attr))
+			    || (err = device_create_file(dev,
+					&temp_fault[i].dev_attr)))
 				goto ERROR3;
 		}
 		if ((err = device_create_file(dev, &dev_attr_alarms_temp)))
@@ -1131,6 +1315,16 @@ static void pc87360_write_value(struct pc87360_data *data, u8 ldi, u8 bank,
 	mutex_unlock(&(data->lock));
 }
 
+/* (temp & vin) channel conversion status register flags (pdf sec.11.5.12) */
+#define CHAN_CNVRTD	0x80	/* new data ready */
+#define CHAN_ENA	0x01	/* enabled channel (temp or vin) */
+#define CHAN_ALM_ENA	0x10	/* propagate to alarms-reg ?? (chk val!) */
+#define CHAN_READY	(CHAN_ENA|CHAN_CNVRTD) /* sample ready mask */
+
+#define TEMP_OTS_OE	0x20	/* OTS Output Enable */
+#define VIN_RW1C_MASK	(CHAN_READY|CHAN_ALM_MAX|CHAN_ALM_MIN)   /* 0x87 */
+#define TEMP_RW1C_MASK	(VIN_RW1C_MASK|TEMP_ALM_CRIT|TEMP_FAULT) /* 0xCF */
+
 static void pc87360_init_device(struct platform_device *pdev,
 				int use_thermistors)
 {
@@ -1152,11 +1346,12 @@ static void pc87360_init_device(struct platform_device *pdev,
 
 	nr = data->innr < 11 ? data->innr : 11;
 	for (i = 0; i < nr; i++) {
+		reg = pc87360_read_value(data, LD_IN, i,
+					 PC87365_REG_IN_STATUS);
+		dev_dbg(&pdev->dev, "bios in%d status:0x%02x\n", i, reg);
 		if (init >= init_in[i]) {
 			/* Forcibly enable voltage channel */
-			reg = pc87360_read_value(data, LD_IN, i,
-						 PC87365_REG_IN_STATUS);
-			if (!(reg & 0x01)) {
+			if (!(reg & CHAN_ENA)) {
 				dev_dbg(&pdev->dev, "Forcibly "
 					"enabling in%d\n", i);
 				pc87360_write_value(data, LD_IN, i,
@@ -1168,19 +1363,24 @@ static void pc87360_init_device(struct platform_device *pdev,
 
 	/* We can't blindly trust the Super-I/O space configuration bit,
 	   most BIOS won't set it properly */
+	dev_dbg(&pdev->dev, "bios thermistors:%d\n", use_thermistors);
 	for (i = 11; i < data->innr; i++) {
 		reg = pc87360_read_value(data, LD_IN, i,
 					 PC87365_REG_TEMP_STATUS);
-		use_thermistors = use_thermistors || (reg & 0x01);
+		use_thermistors = use_thermistors || (reg & CHAN_ENA);
+		/* thermistors are temp[4-6], measured on vin[11-14] */
+		dev_dbg(&pdev->dev, "bios temp%d_status:0x%02x\n", i-7, reg);
 	}
+	dev_dbg(&pdev->dev, "using thermistors:%d\n", use_thermistors);
 
 	i = use_thermistors ? 2 : 0;
 	for (; i < data->tempnr; i++) {
+		reg = pc87360_read_value(data, LD_TEMP, i,
+					 PC87365_REG_TEMP_STATUS);
+		dev_dbg(&pdev->dev, "bios temp%d_status:0x%02x\n", i+1, reg);
 		if (init >= init_temp[i]) {
 			/* Forcibly enable temperature channel */
-			reg = pc87360_read_value(data, LD_TEMP, i,
-						 PC87365_REG_TEMP_STATUS);
-			if (!(reg & 0x01)) {
+			if (!(reg & CHAN_ENA)) {
 				dev_dbg(&pdev->dev, "Forcibly "
 					"enabling temp%d\n", i+1);
 				pc87360_write_value(data, LD_TEMP, i,
@@ -1197,7 +1397,7 @@ static void pc87360_init_device(struct platform_device *pdev,
 				   diodes */
 				reg = pc87360_read_value(data, LD_TEMP,
 				      (i-11)/2, PC87365_REG_TEMP_STATUS);
-				if (reg & 0x01) {
+				if (reg & CHAN_ENA) {
 					dev_dbg(&pdev->dev, "Skipping "
 						"temp%d, pin already in use "
 						"by temp%d\n", i-7, (i-11)/2);
@@ -1207,7 +1407,7 @@ static void pc87360_init_device(struct platform_device *pdev,
 				/* Forcibly enable thermistor channel */
 				reg = pc87360_read_value(data, LD_IN, i,
 							 PC87365_REG_IN_STATUS);
-				if (!(reg & 0x01)) {
+				if (!(reg & CHAN_ENA)) {
 					dev_dbg(&pdev->dev, "Forcibly "
 						"enabling temp%d\n", i-7);
 					pc87360_write_value(data, LD_IN, i,
@@ -1221,7 +1421,8 @@ static void pc87360_init_device(struct platform_device *pdev,
 	if (data->innr) {
 		reg = pc87360_read_value(data, LD_IN, NO_BANK,
 					 PC87365_REG_IN_CONFIG);
-		if (reg & 0x01) {
+		dev_dbg(&pdev->dev, "bios vin-cfg:0x%02x\n", reg);
+		if (reg & CHAN_ENA) {
 			dev_dbg(&pdev->dev, "Forcibly "
 				"enabling monitoring (VLM)\n");
 			pc87360_write_value(data, LD_IN, NO_BANK,
@@ -1233,7 +1434,8 @@ static void pc87360_init_device(struct platform_device *pdev,
 	if (data->tempnr) {
 		reg = pc87360_read_value(data, LD_TEMP, NO_BANK,
 					 PC87365_REG_TEMP_CONFIG);
-		if (reg & 0x01) {
+		dev_dbg(&pdev->dev, "bios temp-cfg:0x%02x\n", reg);
+		if (reg & CHAN_ENA) {
 			dev_dbg(&pdev->dev, "Forcibly enabling "
 				"monitoring (TMS)\n");
 			pc87360_write_value(data, LD_TEMP, NO_BANK,
@@ -1336,11 +1538,11 @@ static struct pc87360_data *pc87360_update_device(struct device *dev)
 			pc87360_write_value(data, LD_IN, i,
 					    PC87365_REG_IN_STATUS,
 					    data->in_status[i]);
-			if ((data->in_status[i] & 0x81) == 0x81) {
+			if ((data->in_status[i] & CHAN_READY) == CHAN_READY) {
 				data->in[i] = pc87360_read_value(data, LD_IN,
 					      i, PC87365_REG_IN);
 			}
-			if (data->in_status[i] & 0x01) {
+			if (data->in_status[i] & CHAN_ENA) {
 				data->in_min[i] = pc87360_read_value(data,
 						  LD_IN, i,
 						  PC87365_REG_IN_MIN);
@@ -1373,12 +1575,12 @@ static struct pc87360_data *pc87360_update_device(struct device *dev)
 			pc87360_write_value(data, LD_TEMP, i,
 					    PC87365_REG_TEMP_STATUS,
 					    data->temp_status[i]);
-			if ((data->temp_status[i] & 0x81) == 0x81) {
+			if ((data->temp_status[i] & CHAN_READY) == CHAN_READY) {
 				data->temp[i] = pc87360_read_value(data,
 						LD_TEMP, i,
 						PC87365_REG_TEMP);
 			}
-			if (data->temp_status[i] & 0x01) {
+			if (data->temp_status[i] & CHAN_ENA) {
 				data->temp_min[i] = pc87360_read_value(data,
 						    LD_TEMP, i,
 						    PC87365_REG_TEMP_MIN);
diff --git a/drivers/i2c/chips/at24.c b/drivers/i2c/chips/at24.c
index 2a4acb269569..d4775528abc6 100644
--- a/drivers/i2c/chips/at24.c
+++ b/drivers/i2c/chips/at24.c
@@ -460,7 +460,6 @@ static int at24_probe(struct i2c_client *client, const struct i2c_device_id *id)
 	 */
 	at24->bin.attr.name = "eeprom";
 	at24->bin.attr.mode = chip.flags & AT24_FLAG_IRUGO ? S_IRUGO : S_IRUSR;
-	at24->bin.attr.owner = THIS_MODULE;
 	at24->bin.read = at24_bin_read;
 	at24->bin.size = chip.byte_len;
 
diff --git a/drivers/i2c/chips/ds1682.c b/drivers/i2c/chips/ds1682.c
index 23be4d42cb02..f3ee4a1abb77 100644
--- a/drivers/i2c/chips/ds1682.c
+++ b/drivers/i2c/chips/ds1682.c
@@ -190,7 +190,6 @@ static struct bin_attribute ds1682_eeprom_attr = {
 	.attr = {
 		.name = "eeprom",
 		.mode = S_IRUGO | S_IWUSR,
-		.owner = THIS_MODULE,
 	},
 	.size = DS1682_EEPROM_SIZE,
 	.read = ds1682_eeprom_read,
diff --git a/drivers/i2c/chips/menelaus.c b/drivers/i2c/chips/menelaus.c
index 176126d3a01d..4b364bae6b3e 100644
--- a/drivers/i2c/chips/menelaus.c
+++ b/drivers/i2c/chips/menelaus.c
@@ -832,52 +832,52 @@ static irqreturn_t menelaus_irq(int irq, void *_menelaus)
 
 static void menelaus_to_time(char *regs, struct rtc_time *t)
 {
-	t->tm_sec = BCD2BIN(regs[0]);
-	t->tm_min = BCD2BIN(regs[1]);
+	t->tm_sec = bcd2bin(regs[0]);
+	t->tm_min = bcd2bin(regs[1]);
 	if (the_menelaus->rtc_control & RTC_CTRL_MODE12) {
-		t->tm_hour = BCD2BIN(regs[2] & 0x1f) - 1;
+		t->tm_hour = bcd2bin(regs[2] & 0x1f) - 1;
 		if (regs[2] & RTC_HR_PM)
 			t->tm_hour += 12;
 	} else
-		t->tm_hour = BCD2BIN(regs[2] & 0x3f);
-	t->tm_mday = BCD2BIN(regs[3]);
-	t->tm_mon = BCD2BIN(regs[4]) - 1;
-	t->tm_year = BCD2BIN(regs[5]) + 100;
+		t->tm_hour = bcd2bin(regs[2] & 0x3f);
+	t->tm_mday = bcd2bin(regs[3]);
+	t->tm_mon = bcd2bin(regs[4]) - 1;
+	t->tm_year = bcd2bin(regs[5]) + 100;
 }
 
 static int time_to_menelaus(struct rtc_time *t, int regnum)
 {
 	int	hour, status;
 
-	status = menelaus_write_reg(regnum++, BIN2BCD(t->tm_sec));
+	status = menelaus_write_reg(regnum++, bin2bcd(t->tm_sec));
 	if (status < 0)
 		goto fail;
 
-	status = menelaus_write_reg(regnum++, BIN2BCD(t->tm_min));
+	status = menelaus_write_reg(regnum++, bin2bcd(t->tm_min));
 	if (status < 0)
 		goto fail;
 
 	if (the_menelaus->rtc_control & RTC_CTRL_MODE12) {
 		hour = t->tm_hour + 1;
 		if (hour > 12)
-			hour = RTC_HR_PM | BIN2BCD(hour - 12);
+			hour = RTC_HR_PM | bin2bcd(hour - 12);
 		else
-			hour = BIN2BCD(hour);
+			hour = bin2bcd(hour);
 	} else
-		hour = BIN2BCD(t->tm_hour);
+		hour = bin2bcd(t->tm_hour);
 	status = menelaus_write_reg(regnum++, hour);
 	if (status < 0)
 		goto fail;
 
-	status = menelaus_write_reg(regnum++, BIN2BCD(t->tm_mday));
+	status = menelaus_write_reg(regnum++, bin2bcd(t->tm_mday));
 	if (status < 0)
 		goto fail;
 
-	status = menelaus_write_reg(regnum++, BIN2BCD(t->tm_mon + 1));
+	status = menelaus_write_reg(regnum++, bin2bcd(t->tm_mon + 1));
 	if (status < 0)
 		goto fail;
 
-	status = menelaus_write_reg(regnum++, BIN2BCD(t->tm_year - 100));
+	status = menelaus_write_reg(regnum++, bin2bcd(t->tm_year - 100));
 	if (status < 0)
 		goto fail;
 
@@ -914,7 +914,7 @@ static int menelaus_read_time(struct device *dev, struct rtc_time *t)
 	}
 
 	menelaus_to_time(regs, t);
-	t->tm_wday = BCD2BIN(regs[6]);
+	t->tm_wday = bcd2bin(regs[6]);
 
 	return 0;
 }
@@ -927,7 +927,7 @@ static int menelaus_set_time(struct device *dev, struct rtc_time *t)
 	status = time_to_menelaus(t, MENELAUS_RTC_SEC);
 	if (status < 0)
 		return status;
-	status = menelaus_write_reg(MENELAUS_RTC_WKDAY, BIN2BCD(t->tm_wday));
+	status = menelaus_write_reg(MENELAUS_RTC_WKDAY, bin2bcd(t->tm_wday));
 	if (status < 0) {
 		dev_err(&the_menelaus->client->dev, "rtc write reg %02x "
 				"err %d\n", MENELAUS_RTC_WKDAY, status);
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index a78d35aecee3..f1e82a92e61e 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -122,7 +122,7 @@ struct cm_counter_attribute {
 
 #define CM_COUNTER_ATTR(_name, _index) \
 struct cm_counter_attribute cm_##_name##_counter_attr = { \
-	.attr = { .name = __stringify(_name), .mode = 0444, .owner = THIS_MODULE }, \
+	.attr = { .name = __stringify(_name), .mode = 0444 }, \
 	.index = _index \
 }
 
diff --git a/drivers/media/dvb/ttpci/av7110.c b/drivers/media/dvb/ttpci/av7110.c
index c7c770c28988..aa1ff524256e 100644
--- a/drivers/media/dvb/ttpci/av7110.c
+++ b/drivers/media/dvb/ttpci/av7110.c
@@ -36,7 +36,6 @@
 #include <linux/fs.h>
 #include <linux/timer.h>
 #include <linux/poll.h>
-#include <linux/byteorder/swabb.h>
 #include <linux/smp_lock.h>
 
 #include <linux/kernel.h>
@@ -52,6 +51,7 @@
 #include <linux/i2c.h>
 #include <linux/kthread.h>
 #include <asm/unaligned.h>
+#include <asm/byteorder.h>
 
 #include <asm/system.h>
 
diff --git a/drivers/media/video/cx18/cx18-driver.h b/drivers/media/video/cx18/cx18-driver.h
index fa8be0731a3f..a4b1708fafe7 100644
--- a/drivers/media/video/cx18/cx18-driver.h
+++ b/drivers/media/video/cx18/cx18-driver.h
@@ -41,6 +41,7 @@
 #include <linux/pagemap.h>
 #include <linux/workqueue.h>
 #include <linux/mutex.h>
+#include <asm/byteorder.h>
 
 #include <linux/dvb/video.h>
 #include <linux/dvb/audio.h>
diff --git a/drivers/media/video/ivtv/ivtv-driver.h b/drivers/media/video/ivtv/ivtv-driver.h
index bc29436e8a3c..3733b2afec5f 100644
--- a/drivers/media/video/ivtv/ivtv-driver.h
+++ b/drivers/media/video/ivtv/ivtv-driver.h
@@ -55,6 +55,7 @@
 #include <linux/mutex.h>
 #include <asm/uaccess.h>
 #include <asm/system.h>
+#include <asm/byteorder.h>
 
 #include <linux/dvb/video.h>
 #include <linux/dvb/audio.h>
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index 6e291bf8237a..5263913e0c69 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -1044,7 +1044,6 @@ static int mspro_block_read_attributes(struct memstick_dev *card)
 
 		s_attr->dev_attr.attr.name = s_attr->name;
 		s_attr->dev_attr.attr.mode = S_IRUGO;
-		s_attr->dev_attr.attr.owner = THIS_MODULE;
 		s_attr->dev_attr.show = mspro_block_attr_show(s_attr->id);
 
 		if (!rc)
diff --git a/drivers/misc/hp-wmi.c b/drivers/misc/hp-wmi.c
index 5dabfb69ee53..4b7c24c519c3 100644
--- a/drivers/misc/hp-wmi.c
+++ b/drivers/misc/hp-wmi.c
@@ -82,6 +82,7 @@ static struct key_entry hp_wmi_keymap[] = {
 	{KE_KEY, 0x03, KEY_BRIGHTNESSDOWN},
 	{KE_KEY, 0x20e6, KEY_PROG1},
 	{KE_KEY, 0x2142, KEY_MEDIA},
+	{KE_KEY, 0x213b, KEY_INFO},
 	{KE_KEY, 0x231b, KEY_HELP},
 	{KE_END, 0}
 };
diff --git a/drivers/mtd/Kconfig b/drivers/mtd/Kconfig
index 14f11f8b9e5f..a90d50c2c3e5 100644
--- a/drivers/mtd/Kconfig
+++ b/drivers/mtd/Kconfig
@@ -172,6 +172,11 @@ config MTD_CHAR
 	  memory chips, and also use ioctl() to obtain information about
 	  the device, or to erase parts of it.
 
+config HAVE_MTD_OTP
+	bool
+	help
+	  Enable access to OTP regions using MTD_CHAR.
+
 config MTD_BLKDEVS
 	tristate "Common interface to block layer for MTD 'translation layers'"
 	depends on BLOCK
diff --git a/drivers/mtd/chips/Kconfig b/drivers/mtd/chips/Kconfig
index 479d32b57a1e..9408099eec48 100644
--- a/drivers/mtd/chips/Kconfig
+++ b/drivers/mtd/chips/Kconfig
@@ -6,6 +6,7 @@ menu "RAM/ROM/Flash chip drivers"
 config MTD_CFI
 	tristate "Detect flash chips by Common Flash Interface (CFI) probe"
 	select MTD_GEN_PROBE
+	select MTD_CFI_UTIL
 	help
 	  The Common Flash Interface specification was developed by Intel,
 	  AMD and other flash manufactures that provides a universal method
@@ -154,6 +155,7 @@ config MTD_CFI_I8
 config MTD_OTP
 	bool "Protection Registers aka one-time programmable (OTP) bits"
 	depends on MTD_CFI_ADV_OPTIONS
+	select HAVE_MTD_OTP
 	default n
 	help
 	  This enables support for reading, writing and locking so called
@@ -187,7 +189,7 @@ config MTD_CFI_INTELEXT
 	  StrataFlash and other parts.
 
 config MTD_CFI_AMDSTD
-	tristate "Support for AMD/Fujitsu flash chips"
+	tristate "Support for AMD/Fujitsu/Spansion flash chips"
 	depends on MTD_GEN_PROBE
 	select MTD_CFI_UTIL
 	help
diff --git a/drivers/mtd/chips/cfi_cmdset_0001.c b/drivers/mtd/chips/cfi_cmdset_0001.c
index 5f1b472137a0..c93a8be5d5f1 100644
--- a/drivers/mtd/chips/cfi_cmdset_0001.c
+++ b/drivers/mtd/chips/cfi_cmdset_0001.c
@@ -478,6 +478,28 @@ struct mtd_info *cfi_cmdset_0001(struct map_info *map, int primary)
 		else
 			cfi->chips[i].erase_time = 2000000;
 
+		if (cfi->cfiq->WordWriteTimeoutTyp &&
+		    cfi->cfiq->WordWriteTimeoutMax)
+			cfi->chips[i].word_write_time_max =
+				1<<(cfi->cfiq->WordWriteTimeoutTyp +
+				    cfi->cfiq->WordWriteTimeoutMax);
+		else
+			cfi->chips[i].word_write_time_max = 50000 * 8;
+
+		if (cfi->cfiq->BufWriteTimeoutTyp &&
+		    cfi->cfiq->BufWriteTimeoutMax)
+			cfi->chips[i].buffer_write_time_max =
+				1<<(cfi->cfiq->BufWriteTimeoutTyp +
+				    cfi->cfiq->BufWriteTimeoutMax);
+
+		if (cfi->cfiq->BlockEraseTimeoutTyp &&
+		    cfi->cfiq->BlockEraseTimeoutMax)
+			cfi->chips[i].erase_time_max =
+				1000<<(cfi->cfiq->BlockEraseTimeoutTyp +
+				       cfi->cfiq->BlockEraseTimeoutMax);
+		else
+			cfi->chips[i].erase_time_max = 2000000 * 8;
+
 		cfi->chips[i].ref_point_counter = 0;
 		init_waitqueue_head(&(cfi->chips[i].wq));
 	}
@@ -703,6 +725,10 @@ static int chip_ready (struct map_info *map, struct flchip *chip, unsigned long
 	struct cfi_pri_intelext *cfip = cfi->cmdset_priv;
 	unsigned long timeo = jiffies + HZ;
 
+	/* Prevent setting state FL_SYNCING for chip in suspended state. */
+	if (mode == FL_SYNCING && chip->oldstate != FL_READY)
+		goto sleep;
+
 	switch (chip->state) {
 
 	case FL_STATUS:
@@ -808,8 +834,9 @@ static int get_chip(struct map_info *map, struct flchip *chip, unsigned long adr
 	DECLARE_WAITQUEUE(wait, current);
 
  retry:
-	if (chip->priv && (mode == FL_WRITING || mode == FL_ERASING
-			   || mode == FL_OTP_WRITE || mode == FL_SHUTDOWN)) {
+	if (chip->priv &&
+	    (mode == FL_WRITING || mode == FL_ERASING || mode == FL_OTP_WRITE
+	    || mode == FL_SHUTDOWN) && chip->state != FL_SYNCING) {
 		/*
 		 * OK. We have possibility for contention on the write/erase
 		 * operations which are global to the real chip and not per
@@ -859,6 +886,14 @@ static int get_chip(struct map_info *map, struct flchip *chip, unsigned long adr
 				return ret;
 			}
 			spin_lock(&shared->lock);
+
+			/* We should not own chip if it is already
+			 * in FL_SYNCING state. Put contender and retry. */
+			if (chip->state == FL_SYNCING) {
+				put_chip(map, contender, contender->start);
+				spin_unlock(contender->mutex);
+				goto retry;
+			}
 			spin_unlock(contender->mutex);
 		}
 
@@ -1012,7 +1047,7 @@ static void __xipram xip_enable(struct map_info *map, struct flchip *chip,
 
 static int __xipram xip_wait_for_operation(
 		struct map_info *map, struct flchip *chip,
-		unsigned long adr, unsigned int chip_op_time )
+		unsigned long adr, unsigned int chip_op_time_max)
 {
 	struct cfi_private *cfi = map->fldrv_priv;
 	struct cfi_pri_intelext *cfip = cfi->cmdset_priv;
@@ -1021,7 +1056,7 @@ static int __xipram xip_wait_for_operation(
 	flstate_t oldstate, newstate;
 
        	start = xip_currtime();
-	usec = chip_op_time * 8;
+	usec = chip_op_time_max;
 	if (usec == 0)
 		usec = 500000;
 	done = 0;
@@ -1131,8 +1166,8 @@ static int __xipram xip_wait_for_operation(
 #define XIP_INVAL_CACHED_RANGE(map, from, size)  \
 	INVALIDATE_CACHED_RANGE(map, from, size)
 
-#define INVAL_CACHE_AND_WAIT(map, chip, cmd_adr, inval_adr, inval_len, usec) \
-	xip_wait_for_operation(map, chip, cmd_adr, usec)
+#define INVAL_CACHE_AND_WAIT(map, chip, cmd_adr, inval_adr, inval_len, usec, usec_max) \
+	xip_wait_for_operation(map, chip, cmd_adr, usec_max)
 
 #else
 
@@ -1144,7 +1179,7 @@ static int __xipram xip_wait_for_operation(
 static int inval_cache_and_wait_for_operation(
 		struct map_info *map, struct flchip *chip,
 		unsigned long cmd_adr, unsigned long inval_adr, int inval_len,
-		unsigned int chip_op_time)
+		unsigned int chip_op_time, unsigned int chip_op_time_max)
 {
 	struct cfi_private *cfi = map->fldrv_priv;
 	map_word status, status_OK = CMD(0x80);
@@ -1156,8 +1191,7 @@ static int inval_cache_and_wait_for_operation(
 		INVALIDATE_CACHED_RANGE(map, inval_adr, inval_len);
 	spin_lock(chip->mutex);
 
-	/* set our timeout to 8 times the expected delay */
-	timeo = chip_op_time * 8;
+	timeo = chip_op_time_max;
 	if (!timeo)
 		timeo = 500000;
 	reset_timeo = timeo;
@@ -1217,8 +1251,8 @@ static int inval_cache_and_wait_for_operation(
 
 #endif
 
-#define WAIT_TIMEOUT(map, chip, adr, udelay) \
-	INVAL_CACHE_AND_WAIT(map, chip, adr, 0, 0, udelay);
+#define WAIT_TIMEOUT(map, chip, adr, udelay, udelay_max) \
+	INVAL_CACHE_AND_WAIT(map, chip, adr, 0, 0, udelay, udelay_max);
 
 
 static int do_point_onechip (struct map_info *map, struct flchip *chip, loff_t adr, size_t len)
@@ -1452,7 +1486,8 @@ static int __xipram do_write_oneword(struct map_info *map, struct flchip *chip,
 
 	ret = INVAL_CACHE_AND_WAIT(map, chip, adr,
 				   adr, map_bankwidth(map),
-				   chip->word_write_time);
+				   chip->word_write_time,
+				   chip->word_write_time_max);
 	if (ret) {
 		xip_enable(map, chip, adr);
 		printk(KERN_ERR "%s: word write error (status timeout)\n", map->name);
@@ -1623,7 +1658,7 @@ static int __xipram do_write_buffer(struct map_info *map, struct flchip *chip,
 
 	chip->state = FL_WRITING_TO_BUFFER;
 	map_write(map, write_cmd, cmd_adr);
-	ret = WAIT_TIMEOUT(map, chip, cmd_adr, 0);
+	ret = WAIT_TIMEOUT(map, chip, cmd_adr, 0, 0);
 	if (ret) {
 		/* Argh. Not ready for write to buffer */
 		map_word Xstatus = map_read(map, cmd_adr);
@@ -1640,7 +1675,7 @@ static int __xipram do_write_buffer(struct map_info *map, struct flchip *chip,
 
 	/* Figure out the number of words to write */
 	word_gap = (-adr & (map_bankwidth(map)-1));
-	words = (len - word_gap + map_bankwidth(map) - 1) / map_bankwidth(map);
+	words = DIV_ROUND_UP(len - word_gap, map_bankwidth(map));
 	if (!word_gap) {
 		words--;
 	} else {
@@ -1692,7 +1727,8 @@ static int __xipram do_write_buffer(struct map_info *map, struct flchip *chip,
 
 	ret = INVAL_CACHE_AND_WAIT(map, chip, cmd_adr,
 				   initial_adr, initial_len,
-				   chip->buffer_write_time);
+				   chip->buffer_write_time,
+				   chip->buffer_write_time_max);
 	if (ret) {
 		map_write(map, CMD(0x70), cmd_adr);
 		chip->state = FL_STATUS;
@@ -1827,7 +1863,8 @@ static int __xipram do_erase_oneblock(struct map_info *map, struct flchip *chip,
 
 	ret = INVAL_CACHE_AND_WAIT(map, chip, adr,
 				   adr, len,
-				   chip->erase_time);
+				   chip->erase_time,
+				   chip->erase_time_max);
 	if (ret) {
 		map_write(map, CMD(0x70), adr);
 		chip->state = FL_STATUS;
@@ -2006,7 +2043,7 @@ static int __xipram do_xxlock_oneblock(struct map_info *map, struct flchip *chip
 	 */
 	udelay = (!extp || !(extp->FeatureSupport & (1 << 5))) ? 1000000/HZ : 0;
 
-	ret = WAIT_TIMEOUT(map, chip, adr, udelay);
+	ret = WAIT_TIMEOUT(map, chip, adr, udelay, udelay * 100);
 	if (ret) {
 		map_write(map, CMD(0x70), adr);
 		chip->state = FL_STATUS;
diff --git a/drivers/mtd/chips/cfi_cmdset_0002.c b/drivers/mtd/chips/cfi_cmdset_0002.c
index a972cc6be436..3e6f5d8609e8 100644
--- a/drivers/mtd/chips/cfi_cmdset_0002.c
+++ b/drivers/mtd/chips/cfi_cmdset_0002.c
@@ -13,6 +13,8 @@
  * XIP support hooks by Vitaly Wool (based on code for Intel flash
  * by Nicolas Pitre)
  *
+ * 25/09/2008 Christopher Moore: TopBottom fixup for many Macronix with CFI V1.0
+ *
  * Occasionally maintained by Thayne Harbaugh tharbaugh at lnxi dot com
  *
  * This code is GPL
@@ -43,6 +45,7 @@
 
 #define MANUFACTURER_AMD	0x0001
 #define MANUFACTURER_ATMEL	0x001F
+#define MANUFACTURER_MACRONIX	0x00C2
 #define MANUFACTURER_SST	0x00BF
 #define SST49LF004B	        0x0060
 #define SST49LF040B	        0x0050
@@ -144,12 +147,44 @@ static void fixup_amd_bootblock(struct mtd_info *mtd, void* param)
 
 	if (((major << 8) | minor) < 0x3131) {
 		/* CFI version 1.0 => don't trust bootloc */
+
+		DEBUG(MTD_DEBUG_LEVEL1,
+			"%s: JEDEC Vendor ID is 0x%02X Device ID is 0x%02X\n",
+			map->name, cfi->mfr, cfi->id);
+
+		/* AFAICS all 29LV400 with a bottom boot block have a device ID
+		 * of 0x22BA in 16-bit mode and 0xBA in 8-bit mode.
+		 * These were badly detected as they have the 0x80 bit set
+		 * so treat them as a special case.
+		 */
+		if (((cfi->id == 0xBA) || (cfi->id == 0x22BA)) &&
+
+			/* Macronix added CFI to their 2nd generation
+			 * MX29LV400C B/T but AFAICS no other 29LV400 (AMD,
+			 * Fujitsu, Spansion, EON, ESI and older Macronix)
+			 * has CFI.
+			 *
+			 * Therefore also check the manufacturer.
+			 * This reduces the risk of false detection due to
+			 * the 8-bit device ID.
+			 */
+			(cfi->mfr == MANUFACTURER_MACRONIX)) {
+			DEBUG(MTD_DEBUG_LEVEL1,
+				"%s: Macronix MX29LV400C with bottom boot block"
+				" detected\n", map->name);
+			extp->TopBottom = 2;	/* bottom boot */
+		} else
 		if (cfi->id & 0x80) {
 			printk(KERN_WARNING "%s: JEDEC Device ID is 0x%02X. Assuming broken CFI table.\n", map->name, cfi->id);
 			extp->TopBottom = 3;	/* top boot */
 		} else {
 			extp->TopBottom = 2;	/* bottom boot */
 		}
+
+		DEBUG(MTD_DEBUG_LEVEL1,
+			"%s: AMD CFI PRI V%c.%c has no boot block field;"
+			" deduced %s from Device ID\n", map->name, major, minor,
+			extp->TopBottom == 2 ? "bottom" : "top");
 	}
 }
 #endif
@@ -178,10 +213,18 @@ static void fixup_convert_atmel_pri(struct mtd_info *mtd, void *param)
 	if (atmel_pri.Features & 0x02)
 		extp->EraseSuspend = 2;
 
-	if (atmel_pri.BottomBoot)
-		extp->TopBottom = 2;
-	else
-		extp->TopBottom = 3;
+	/* Some chips got it backwards... */
+	if (cfi->id == AT49BV6416) {
+		if (atmel_pri.BottomBoot)
+			extp->TopBottom = 3;
+		else
+			extp->TopBottom = 2;
+	} else {
+		if (atmel_pri.BottomBoot)
+			extp->TopBottom = 2;
+		else
+			extp->TopBottom = 3;
+	}
 
 	/* burst write mode not supported */
 	cfi->cfiq->BufWriteTimeoutTyp = 0;
@@ -243,6 +286,7 @@ static struct cfi_fixup cfi_fixup_table[] = {
 	{ CFI_MFR_ATMEL, CFI_ID_ANY, fixup_convert_atmel_pri, NULL },
 #ifdef AMD_BOOTLOC_BUG
 	{ CFI_MFR_AMD, CFI_ID_ANY, fixup_amd_bootblock, NULL },
+	{ MANUFACTURER_MACRONIX, CFI_ID_ANY, fixup_amd_bootblock, NULL },
 #endif
 	{ CFI_MFR_AMD, 0x0050, fixup_use_secsi, NULL, },
 	{ CFI_MFR_AMD, 0x0053, fixup_use_secsi, NULL, },
diff --git a/drivers/mtd/chips/cfi_probe.c b/drivers/mtd/chips/cfi_probe.c
index c418e92e1d92..e63e6749429a 100644
--- a/drivers/mtd/chips/cfi_probe.c
+++ b/drivers/mtd/chips/cfi_probe.c
@@ -44,17 +44,14 @@ do { \
 
 #define xip_enable(base, map, cfi) \
 do { \
-	cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL); \
-	cfi_send_gen_cmd(0xFF, 0, base, map, cfi, cfi->device_type, NULL); \
+	cfi_qry_mode_off(base, map, cfi);		\
 	xip_allowed(base, map); \
 } while (0)
 
 #define xip_disable_qry(base, map, cfi) \
 do { \
 	xip_disable(); \
-	cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL); \
-	cfi_send_gen_cmd(0xFF, 0, base, map, cfi, cfi->device_type, NULL); \
-	cfi_send_gen_cmd(0x98, 0x55, base, map, cfi, cfi->device_type, NULL); \
+	cfi_qry_mode_on(base, map, cfi); \
 } while (0)
 
 #else
@@ -70,32 +67,6 @@ do { \
    in: interleave,type,mode
    ret: table index, <0 for error
  */
-static int __xipram qry_present(struct map_info *map, __u32 base,
-				struct cfi_private *cfi)
-{
-	int osf = cfi->interleave * cfi->device_type;	// scale factor
-	map_word val[3];
-	map_word qry[3];
-
-	qry[0] = cfi_build_cmd('Q', map, cfi);
-	qry[1] = cfi_build_cmd('R', map, cfi);
-	qry[2] = cfi_build_cmd('Y', map, cfi);
-
-	val[0] = map_read(map, base + osf*0x10);
-	val[1] = map_read(map, base + osf*0x11);
-	val[2] = map_read(map, base + osf*0x12);
-
-	if (!map_word_equal(map, qry[0], val[0]))
-		return 0;
-
-	if (!map_word_equal(map, qry[1], val[1]))
-		return 0;
-
-	if (!map_word_equal(map, qry[2], val[2]))
-		return 0;
-
-	return 1; 	// "QRY" found
-}
 
 static int __xipram cfi_probe_chip(struct map_info *map, __u32 base,
 				   unsigned long *chip_map, struct cfi_private *cfi)
@@ -116,11 +87,7 @@ static int __xipram cfi_probe_chip(struct map_info *map, __u32 base,
 	}
 
 	xip_disable();
-	cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL);
-	cfi_send_gen_cmd(0xFF, 0, base, map, cfi, cfi->device_type, NULL);
-	cfi_send_gen_cmd(0x98, 0x55, base, map, cfi, cfi->device_type, NULL);
-
-	if (!qry_present(map,base,cfi)) {
+	if (!cfi_qry_mode_on(base, map, cfi)) {
 		xip_enable(base, map, cfi);
 		return 0;
 	}
@@ -141,14 +108,13 @@ static int __xipram cfi_probe_chip(struct map_info *map, __u32 base,
  		start = i << cfi->chipshift;
 		/* This chip should be in read mode if it's one
 		   we've already touched. */
-		if (qry_present(map, start, cfi)) {
+		if (cfi_qry_present(map, start, cfi)) {
 			/* Eep. This chip also had the QRY marker.
 			 * Is it an alias for the new one? */
-			cfi_send_gen_cmd(0xF0, 0, start, map, cfi, cfi->device_type, NULL);
-			cfi_send_gen_cmd(0xFF, 0, start, map, cfi, cfi->device_type, NULL);
+			cfi_qry_mode_off(start, map, cfi);
 
 			/* If the QRY marker goes away, it's an alias */
-			if (!qry_present(map, start, cfi)) {
+			if (!cfi_qry_present(map, start, cfi)) {
 				xip_allowed(base, map);
 				printk(KERN_DEBUG "%s: Found an alias at 0x%x for the chip at 0x%lx\n",
 				       map->name, base, start);
@@ -158,10 +124,9 @@ static int __xipram cfi_probe_chip(struct map_info *map, __u32 base,
 			 * unfortunate. Stick the new chip in read mode
 			 * too and if it's the same, assume it's an alias. */
 			/* FIXME: Use other modes to do a proper check */
-			cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL);
-			cfi_send_gen_cmd(0xFF, 0, start, map, cfi, cfi->device_type, NULL);
+			cfi_qry_mode_off(base, map, cfi);
 
-			if (qry_present(map, base, cfi)) {
+			if (cfi_qry_present(map, base, cfi)) {
 				xip_allowed(base, map);
 				printk(KERN_DEBUG "%s: Found an alias at 0x%x for the chip at 0x%lx\n",
 				       map->name, base, start);
@@ -176,8 +141,7 @@ static int __xipram cfi_probe_chip(struct map_info *map, __u32 base,
 	cfi->numchips++;
 
 	/* Put it back into Read Mode */
-	cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL);
-	cfi_send_gen_cmd(0xFF, 0, base, map, cfi, cfi->device_type, NULL);
+	cfi_qry_mode_off(base, map, cfi);
 	xip_allowed(base, map);
 
 	printk(KERN_INFO "%s: Found %d x%d devices at 0x%x in %d-bit bank\n",
@@ -237,9 +201,7 @@ static int __xipram cfi_chip_setup(struct map_info *map,
 			  cfi_read_query(map, base + 0xf * ofs_factor);
 
 	/* Put it back into Read Mode */
-	cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL);
-	/* ... even if it's an Intel chip */
-	cfi_send_gen_cmd(0xFF, 0, base, map, cfi, cfi->device_type, NULL);
+	cfi_qry_mode_off(base, map, cfi);
 	xip_allowed(base, map);
 
 	/* Do any necessary byteswapping */
diff --git a/drivers/mtd/chips/cfi_util.c b/drivers/mtd/chips/cfi_util.c
index 0ee457018016..34d40e25d312 100644
--- a/drivers/mtd/chips/cfi_util.c
+++ b/drivers/mtd/chips/cfi_util.c
@@ -24,6 +24,66 @@
 #include <linux/mtd/cfi.h>
 #include <linux/mtd/compatmac.h>
 
+int __xipram cfi_qry_present(struct map_info *map, __u32 base,
+			     struct cfi_private *cfi)
+{
+	int osf = cfi->interleave * cfi->device_type;	/* scale factor */
+	map_word val[3];
+	map_word qry[3];
+
+	qry[0] = cfi_build_cmd('Q', map, cfi);
+	qry[1] = cfi_build_cmd('R', map, cfi);
+	qry[2] = cfi_build_cmd('Y', map, cfi);
+
+	val[0] = map_read(map, base + osf*0x10);
+	val[1] = map_read(map, base + osf*0x11);
+	val[2] = map_read(map, base + osf*0x12);
+
+	if (!map_word_equal(map, qry[0], val[0]))
+		return 0;
+
+	if (!map_word_equal(map, qry[1], val[1]))
+		return 0;
+
+	if (!map_word_equal(map, qry[2], val[2]))
+		return 0;
+
+	return 1; 	/* "QRY" found */
+}
+EXPORT_SYMBOL_GPL(cfi_qry_present);
+
+int __xipram cfi_qry_mode_on(uint32_t base, struct map_info *map,
+			     struct cfi_private *cfi)
+{
+	cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL);
+	cfi_send_gen_cmd(0x98, 0x55, base, map, cfi, cfi->device_type, NULL);
+	if (cfi_qry_present(map, base, cfi))
+		return 1;
+	/* QRY not found probably we deal with some odd CFI chips */
+	/* Some revisions of some old Intel chips? */
+	cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL);
+	cfi_send_gen_cmd(0xFF, 0, base, map, cfi, cfi->device_type, NULL);
+	cfi_send_gen_cmd(0x98, 0x55, base, map, cfi, cfi->device_type, NULL);
+	if (cfi_qry_present(map, base, cfi))
+		return 1;
+	/* ST M29DW chips */
+	cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL);
+	cfi_send_gen_cmd(0x98, 0x555, base, map, cfi, cfi->device_type, NULL);
+	if (cfi_qry_present(map, base, cfi))
+		return 1;
+	/* QRY not found */
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cfi_qry_mode_on);
+
+void __xipram cfi_qry_mode_off(uint32_t base, struct map_info *map,
+			       struct cfi_private *cfi)
+{
+	cfi_send_gen_cmd(0xF0, 0, base, map, cfi, cfi->device_type, NULL);
+	cfi_send_gen_cmd(0xFF, 0, base, map, cfi, cfi->device_type, NULL);
+}
+EXPORT_SYMBOL_GPL(cfi_qry_mode_off);
+
 struct cfi_extquery *
 __xipram cfi_read_pri(struct map_info *map, __u16 adr, __u16 size, const char* name)
 {
@@ -48,8 +108,7 @@ __xipram cfi_read_pri(struct map_info *map, __u16 adr, __u16 size, const char* n
 #endif
 
 	/* Switch it into Query Mode */
-	cfi_send_gen_cmd(0x98, 0x55, base, map, cfi, cfi->device_type, NULL);
-
+	cfi_qry_mode_on(base, map, cfi);
 	/* Read in the Extended Query Table */
 	for (i=0; i<size; i++) {
 		((unsigned char *)extp)[i] =
@@ -57,8 +116,7 @@ __xipram cfi_read_pri(struct map_info *map, __u16 adr, __u16 size, const char* n
 	}
 
 	/* Make sure it returns to read mode */
-	cfi_send_gen_cmd(0xf0, 0, base, map, cfi, cfi->device_type, NULL);
-	cfi_send_gen_cmd(0xff, 0, base, map, cfi, cfi->device_type, NULL);
+	cfi_qry_mode_off(base, map, cfi);
 
 #ifdef CONFIG_MTD_XIP
 	(void) map_read(map, base);
diff --git a/drivers/mtd/chips/gen_probe.c b/drivers/mtd/chips/gen_probe.c
index f061885b2812..e2dc96441e05 100644
--- a/drivers/mtd/chips/gen_probe.c
+++ b/drivers/mtd/chips/gen_probe.c
@@ -111,7 +111,7 @@ static struct cfi_private *genprobe_ident_chips(struct map_info *map, struct chi
 		max_chips = 1;
 	}
 
-	mapsize = sizeof(long) * ( (max_chips + BITS_PER_LONG-1) / BITS_PER_LONG );
+	mapsize = sizeof(long) * DIV_ROUND_UP(max_chips, BITS_PER_LONG);
 	chip_map = kzalloc(mapsize, GFP_KERNEL);
 	if (!chip_map) {
 		printk(KERN_WARNING "%s: kmalloc failed for CFI chip map\n", map->name);
diff --git a/drivers/mtd/cmdlinepart.c b/drivers/mtd/cmdlinepart.c
index 71bc07f149b7..50a340388e74 100644
--- a/drivers/mtd/cmdlinepart.c
+++ b/drivers/mtd/cmdlinepart.c
@@ -7,6 +7,7 @@
  *
  * mtdparts=<mtddef>[;<mtddef]
  * <mtddef>  := <mtd-id>:<partdef>[,<partdef>]
+ *              where <mtd-id> is the name from the "cat /proc/mtd" command
  * <partdef> := <size>[@offset][<name>][ro][lk]
  * <mtd-id>  := unique name used in mapping driver/device (mtd->name)
  * <size>    := standard linux memsize OR "-" to denote all remaining space
diff --git a/drivers/mtd/devices/Kconfig b/drivers/mtd/devices/Kconfig
index 9c613f06623c..6fde0a2e3567 100644
--- a/drivers/mtd/devices/Kconfig
+++ b/drivers/mtd/devices/Kconfig
@@ -59,6 +59,27 @@ config MTD_DATAFLASH
 	  Sometimes DataFlash chips are packaged inside MMC-format
 	  cards; at this writing, the MMC stack won't handle those.
 
+config MTD_DATAFLASH_WRITE_VERIFY
+	bool "Verify DataFlash page writes"
+	depends on MTD_DATAFLASH
+	help
+	  This adds an extra check when data is written to the flash.
+	  It may help if you are verifying chip setup (timings etc) on
+	  your board.  There is a rare possibility that even though the
+	  device thinks the write was successful, a bit could have been
+	  flipped accidentally due to device wear or something else.
+
+config MTD_DATAFLASH_OTP
+	bool "DataFlash OTP support (Security Register)"
+	depends on MTD_DATAFLASH
+	select HAVE_MTD_OTP
+	help
+	  Newer DataFlash chips (revisions C and D) support 128 bytes of
+	  one-time-programmable (OTP) data.  The first half may be written
+	  (once) with up to 64 bytes of data, such as a serial number or
+	  other key product data.  The second half is programmed with a
+	  unique-to-each-chip bit pattern at the factory.
+
 config MTD_M25P80
 	tristate "Support most SPI Flash chips (AT26DF, M25P, W25X, ...)"
 	depends on SPI_MASTER && EXPERIMENTAL
diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c
index b35c3333e210..76a76751da36 100644
--- a/drivers/mtd/devices/m25p80.c
+++ b/drivers/mtd/devices/m25p80.c
@@ -39,6 +39,7 @@
 #define	OPCODE_PP		0x02	/* Page program (up to 256 bytes) */
 #define	OPCODE_BE_4K 		0x20	/* Erase 4KiB block */
 #define	OPCODE_BE_32K		0x52	/* Erase 32KiB block */
+#define	OPCODE_BE		0xc7	/* Erase whole flash block */
 #define	OPCODE_SE		0xd8	/* Sector erase (usually 64KiB) */
 #define	OPCODE_RDID		0x9f	/* Read JEDEC ID */
 
@@ -161,6 +162,31 @@ static int wait_till_ready(struct m25p *flash)
 	return 1;
 }
 
+/*
+ * Erase the whole flash memory
+ *
+ * Returns 0 if successful, non-zero otherwise.
+ */
+static int erase_block(struct m25p *flash)
+{
+	DEBUG(MTD_DEBUG_LEVEL3, "%s: %s %dKiB\n",
+			flash->spi->dev.bus_id, __func__,
+			flash->mtd.size / 1024);
+
+	/* Wait until finished previous write command. */
+	if (wait_till_ready(flash))
+		return 1;
+
+	/* Send write enable, then erase commands. */
+	write_enable(flash);
+
+	/* Set up command buffer. */
+	flash->command[0] = OPCODE_BE;
+
+	spi_write(flash->spi, flash->command, 1);
+
+	return 0;
+}
 
 /*
  * Erase one sector of flash memory at offset ``offset'' which is any
@@ -229,15 +255,21 @@ static int m25p80_erase(struct mtd_info *mtd, struct erase_info *instr)
 	 */
 
 	/* now erase those sectors */
-	while (len) {
-		if (erase_sector(flash, addr)) {
-			instr->state = MTD_ERASE_FAILED;
-			mutex_unlock(&flash->lock);
-			return -EIO;
-		}
+	if (len == flash->mtd.size && erase_block(flash)) {
+		instr->state = MTD_ERASE_FAILED;
+		mutex_unlock(&flash->lock);
+		return -EIO;
+	} else {
+		while (len) {
+			if (erase_sector(flash, addr)) {
+				instr->state = MTD_ERASE_FAILED;
+				mutex_unlock(&flash->lock);
+				return -EIO;
+			}
 
-		addr += mtd->erasesize;
-		len -= mtd->erasesize;
+			addr += mtd->erasesize;
+			len -= mtd->erasesize;
+		}
 	}
 
 	mutex_unlock(&flash->lock);
@@ -437,6 +469,7 @@ struct flash_info {
 	 * then a two byte device id.
 	 */
 	u32		jedec_id;
+	u16             ext_id;
 
 	/* The size listed here is what works with OPCODE_SE, which isn't
 	 * necessarily called a "sector" by the vendor.
@@ -456,72 +489,75 @@ struct flash_info {
 static struct flash_info __devinitdata m25p_data [] = {
 
 	/* Atmel -- some are (confusingly) marketed as "DataFlash" */
-	{ "at25fs010",  0x1f6601, 32 * 1024, 4, SECT_4K, },
-	{ "at25fs040",  0x1f6604, 64 * 1024, 8, SECT_4K, },
+	{ "at25fs010",  0x1f6601, 0, 32 * 1024, 4, SECT_4K, },
+	{ "at25fs040",  0x1f6604, 0, 64 * 1024, 8, SECT_4K, },
 
-	{ "at25df041a", 0x1f4401, 64 * 1024, 8, SECT_4K, },
-	{ "at25df641",  0x1f4800, 64 * 1024, 128, SECT_4K, },
+	{ "at25df041a", 0x1f4401, 0, 64 * 1024, 8, SECT_4K, },
+	{ "at25df641",  0x1f4800, 0, 64 * 1024, 128, SECT_4K, },
 
-	{ "at26f004",   0x1f0400, 64 * 1024, 8, SECT_4K, },
-	{ "at26df081a", 0x1f4501, 64 * 1024, 16, SECT_4K, },
-	{ "at26df161a", 0x1f4601, 64 * 1024, 32, SECT_4K, },
-	{ "at26df321",  0x1f4701, 64 * 1024, 64, SECT_4K, },
+	{ "at26f004",   0x1f0400, 0, 64 * 1024, 8, SECT_4K, },
+	{ "at26df081a", 0x1f4501, 0, 64 * 1024, 16, SECT_4K, },
+	{ "at26df161a", 0x1f4601, 0, 64 * 1024, 32, SECT_4K, },
+	{ "at26df321",  0x1f4701, 0, 64 * 1024, 64, SECT_4K, },
 
 	/* Spansion -- single (large) sector size only, at least
 	 * for the chips listed here (without boot sectors).
 	 */
-	{ "s25sl004a", 0x010212, 64 * 1024, 8, },
-	{ "s25sl008a", 0x010213, 64 * 1024, 16, },
-	{ "s25sl016a", 0x010214, 64 * 1024, 32, },
-	{ "s25sl032a", 0x010215, 64 * 1024, 64, },
-	{ "s25sl064a", 0x010216, 64 * 1024, 128, },
+	{ "s25sl004a", 0x010212, 0, 64 * 1024, 8, },
+	{ "s25sl008a", 0x010213, 0, 64 * 1024, 16, },
+	{ "s25sl016a", 0x010214, 0, 64 * 1024, 32, },
+	{ "s25sl032a", 0x010215, 0, 64 * 1024, 64, },
+	{ "s25sl064a", 0x010216, 0, 64 * 1024, 128, },
+        { "s25sl12800", 0x012018, 0x0300, 256 * 1024, 64, },
+	{ "s25sl12801", 0x012018, 0x0301, 64 * 1024, 256, },
 
 	/* SST -- large erase sizes are "overlays", "sectors" are 4K */
-	{ "sst25vf040b", 0xbf258d, 64 * 1024, 8, SECT_4K, },
-	{ "sst25vf080b", 0xbf258e, 64 * 1024, 16, SECT_4K, },
-	{ "sst25vf016b", 0xbf2541, 64 * 1024, 32, SECT_4K, },
-	{ "sst25vf032b", 0xbf254a, 64 * 1024, 64, SECT_4K, },
+	{ "sst25vf040b", 0xbf258d, 0, 64 * 1024, 8, SECT_4K, },
+	{ "sst25vf080b", 0xbf258e, 0, 64 * 1024, 16, SECT_4K, },
+	{ "sst25vf016b", 0xbf2541, 0, 64 * 1024, 32, SECT_4K, },
+	{ "sst25vf032b", 0xbf254a, 0, 64 * 1024, 64, SECT_4K, },
 
 	/* ST Microelectronics -- newer production may have feature updates */
-	{ "m25p05",  0x202010,  32 * 1024, 2, },
-	{ "m25p10",  0x202011,  32 * 1024, 4, },
-	{ "m25p20",  0x202012,  64 * 1024, 4, },
-	{ "m25p40",  0x202013,  64 * 1024, 8, },
-	{ "m25p80",         0,  64 * 1024, 16, },
-	{ "m25p16",  0x202015,  64 * 1024, 32, },
-	{ "m25p32",  0x202016,  64 * 1024, 64, },
-	{ "m25p64",  0x202017,  64 * 1024, 128, },
-	{ "m25p128", 0x202018, 256 * 1024, 64, },
-
-	{ "m45pe80", 0x204014,  64 * 1024, 16, },
-	{ "m45pe16", 0x204015,  64 * 1024, 32, },
-
-	{ "m25pe80", 0x208014,  64 * 1024, 16, },
-	{ "m25pe16", 0x208015,  64 * 1024, 32, SECT_4K, },
+	{ "m25p05",  0x202010,  0, 32 * 1024, 2, },
+	{ "m25p10",  0x202011,  0, 32 * 1024, 4, },
+	{ "m25p20",  0x202012,  0, 64 * 1024, 4, },
+	{ "m25p40",  0x202013,  0, 64 * 1024, 8, },
+	{ "m25p80",         0,  0, 64 * 1024, 16, },
+	{ "m25p16",  0x202015,  0, 64 * 1024, 32, },
+	{ "m25p32",  0x202016,  0, 64 * 1024, 64, },
+	{ "m25p64",  0x202017,  0, 64 * 1024, 128, },
+	{ "m25p128", 0x202018, 0, 256 * 1024, 64, },
+
+	{ "m45pe80", 0x204014,  0, 64 * 1024, 16, },
+	{ "m45pe16", 0x204015,  0, 64 * 1024, 32, },
+
+	{ "m25pe80", 0x208014,  0, 64 * 1024, 16, },
+	{ "m25pe16", 0x208015,  0, 64 * 1024, 32, SECT_4K, },
 
 	/* Winbond -- w25x "blocks" are 64K, "sectors" are 4KiB */
-	{ "w25x10", 0xef3011, 64 * 1024, 2, SECT_4K, },
-	{ "w25x20", 0xef3012, 64 * 1024, 4, SECT_4K, },
-	{ "w25x40", 0xef3013, 64 * 1024, 8, SECT_4K, },
-	{ "w25x80", 0xef3014, 64 * 1024, 16, SECT_4K, },
-	{ "w25x16", 0xef3015, 64 * 1024, 32, SECT_4K, },
-	{ "w25x32", 0xef3016, 64 * 1024, 64, SECT_4K, },
-	{ "w25x64", 0xef3017, 64 * 1024, 128, SECT_4K, },
+	{ "w25x10", 0xef3011, 0, 64 * 1024, 2, SECT_4K, },
+	{ "w25x20", 0xef3012, 0, 64 * 1024, 4, SECT_4K, },
+	{ "w25x40", 0xef3013, 0, 64 * 1024, 8, SECT_4K, },
+	{ "w25x80", 0xef3014, 0, 64 * 1024, 16, SECT_4K, },
+	{ "w25x16", 0xef3015, 0, 64 * 1024, 32, SECT_4K, },
+	{ "w25x32", 0xef3016, 0, 64 * 1024, 64, SECT_4K, },
+	{ "w25x64", 0xef3017, 0, 64 * 1024, 128, SECT_4K, },
 };
 
 static struct flash_info *__devinit jedec_probe(struct spi_device *spi)
 {
 	int			tmp;
 	u8			code = OPCODE_RDID;
-	u8			id[3];
+	u8			id[5];
 	u32			jedec;
+	u16                     ext_jedec;
 	struct flash_info	*info;
 
 	/* JEDEC also defines an optional "extended device information"
 	 * string for after vendor-specific data, after the three bytes
 	 * we use here.  Supporting some chips might require using it.
 	 */
-	tmp = spi_write_then_read(spi, &code, 1, id, 3);
+	tmp = spi_write_then_read(spi, &code, 1, id, 5);
 	if (tmp < 0) {
 		DEBUG(MTD_DEBUG_LEVEL0, "%s: error %d reading JEDEC ID\n",
 			spi->dev.bus_id, tmp);
@@ -533,10 +569,14 @@ static struct flash_info *__devinit jedec_probe(struct spi_device *spi)
 	jedec = jedec << 8;
 	jedec |= id[2];
 
+	ext_jedec = id[3] << 8 | id[4];
+
 	for (tmp = 0, info = m25p_data;
 			tmp < ARRAY_SIZE(m25p_data);
 			tmp++, info++) {
 		if (info->jedec_id == jedec)
+			if (ext_jedec != 0 && info->ext_id != ext_jedec)
+				continue;
 			return info;
 	}
 	dev_err(&spi->dev, "unrecognized JEDEC id %06x\n", jedec);
diff --git a/drivers/mtd/devices/mtd_dataflash.c b/drivers/mtd/devices/mtd_dataflash.c
index 8bd0dea6885f..6dd9aff8bb2d 100644
--- a/drivers/mtd/devices/mtd_dataflash.c
+++ b/drivers/mtd/devices/mtd_dataflash.c
@@ -30,12 +30,10 @@
  * doesn't (yet) use these for any kind of i/o overlap or prefetching.
  *
  * Sometimes DataFlash is packaged in MMC-format cards, although the
- * MMC stack can't use SPI (yet), or distinguish between MMC and DataFlash
+ * MMC stack can't (yet?) distinguish between MMC and DataFlash
  * protocols during enumeration.
  */
 
-#define CONFIG_DATAFLASH_WRITE_VERIFY
-
 /* reads can bypass the buffers */
 #define OP_READ_CONTINUOUS	0xE8
 #define OP_READ_PAGE		0xD2
@@ -80,7 +78,8 @@
  */
 #define OP_READ_ID		0x9F
 #define OP_READ_SECURITY	0x77
-#define OP_WRITE_SECURITY	0x9A	/* OTP bits */
+#define OP_WRITE_SECURITY_REVC	0x9A
+#define OP_WRITE_SECURITY	0x9B	/* revision D */
 
 
 struct dataflash {
@@ -402,7 +401,7 @@ static int dataflash_write(struct mtd_info *mtd, loff_t to, size_t len,
 		(void) dataflash_waitready(priv->spi);
 
 
-#ifdef	CONFIG_DATAFLASH_WRITE_VERIFY
+#ifdef CONFIG_MTD_DATAFLASH_VERIFY_WRITE
 
 		/* (3) Compare to Buffer1 */
 		addr = pageaddr << priv->page_offset;
@@ -431,7 +430,7 @@ static int dataflash_write(struct mtd_info *mtd, loff_t to, size_t len,
 		} else
 			status = 0;
 
-#endif	/* CONFIG_DATAFLASH_WRITE_VERIFY */
+#endif	/* CONFIG_MTD_DATAFLASH_VERIFY_WRITE */
 
 		remaining = remaining - writelen;
 		pageaddr++;
@@ -451,16 +450,192 @@ static int dataflash_write(struct mtd_info *mtd, loff_t to, size_t len,
 
 /* ......................................................................... */
 
+#ifdef CONFIG_MTD_DATAFLASH_OTP
+
+static int dataflash_get_otp_info(struct mtd_info *mtd,
+		struct otp_info *info, size_t len)
+{
+	/* Report both blocks as identical:  bytes 0..64, locked.
+	 * Unless the user block changed from all-ones, we can't
+	 * tell whether it's still writable; so we assume it isn't.
+	 */
+	info->start = 0;
+	info->length = 64;
+	info->locked = 1;
+	return sizeof(*info);
+}
+
+static ssize_t otp_read(struct spi_device *spi, unsigned base,
+		uint8_t *buf, loff_t off, size_t len)
+{
+	struct spi_message	m;
+	size_t			l;
+	uint8_t			*scratch;
+	struct spi_transfer	t;
+	int			status;
+
+	if (off > 64)
+		return -EINVAL;
+
+	if ((off + len) > 64)
+		len = 64 - off;
+	if (len == 0)
+		return len;
+
+	spi_message_init(&m);
+
+	l = 4 + base + off + len;
+	scratch = kzalloc(l, GFP_KERNEL);
+	if (!scratch)
+		return -ENOMEM;
+
+	/* OUT: OP_READ_SECURITY, 3 don't-care bytes, zeroes
+	 * IN:  ignore 4 bytes, data bytes 0..N (max 127)
+	 */
+	scratch[0] = OP_READ_SECURITY;
+
+	memset(&t, 0, sizeof t);
+	t.tx_buf = scratch;
+	t.rx_buf = scratch;
+	t.len = l;
+	spi_message_add_tail(&t, &m);
+
+	dataflash_waitready(spi);
+
+	status = spi_sync(spi, &m);
+	if (status >= 0) {
+		memcpy(buf, scratch + 4 + base + off, len);
+		status = len;
+	}
+
+	kfree(scratch);
+	return status;
+}
+
+static int dataflash_read_fact_otp(struct mtd_info *mtd,
+		loff_t from, size_t len, size_t *retlen, u_char *buf)
+{
+	struct dataflash	*priv = (struct dataflash *)mtd->priv;
+	int			status;
+
+	/* 64 bytes, from 0..63 ... start at 64 on-chip */
+	mutex_lock(&priv->lock);
+	status = otp_read(priv->spi, 64, buf, from, len);
+	mutex_unlock(&priv->lock);
+
+	if (status < 0)
+		return status;
+	*retlen = status;
+	return 0;
+}
+
+static int dataflash_read_user_otp(struct mtd_info *mtd,
+		loff_t from, size_t len, size_t *retlen, u_char *buf)
+{
+	struct dataflash	*priv = (struct dataflash *)mtd->priv;
+	int			status;
+
+	/* 64 bytes, from 0..63 ... start at 0 on-chip */
+	mutex_lock(&priv->lock);
+	status = otp_read(priv->spi, 0, buf, from, len);
+	mutex_unlock(&priv->lock);
+
+	if (status < 0)
+		return status;
+	*retlen = status;
+	return 0;
+}
+
+static int dataflash_write_user_otp(struct mtd_info *mtd,
+		loff_t from, size_t len, size_t *retlen, u_char *buf)
+{
+	struct spi_message	m;
+	const size_t		l = 4 + 64;
+	uint8_t			*scratch;
+	struct spi_transfer	t;
+	struct dataflash	*priv = (struct dataflash *)mtd->priv;
+	int			status;
+
+	if (len > 64)
+		return -EINVAL;
+
+	/* Strictly speaking, we *could* truncate the write ... but
+	 * let's not do that for the only write that's ever possible.
+	 */
+	if ((from + len) > 64)
+		return -EINVAL;
+
+	/* OUT: OP_WRITE_SECURITY, 3 zeroes, 64 data-or-zero bytes
+	 * IN:  ignore all
+	 */
+	scratch = kzalloc(l, GFP_KERNEL);
+	if (!scratch)
+		return -ENOMEM;
+	scratch[0] = OP_WRITE_SECURITY;
+	memcpy(scratch + 4 + from, buf, len);
+
+	spi_message_init(&m);
+
+	memset(&t, 0, sizeof t);
+	t.tx_buf = scratch;
+	t.len = l;
+	spi_message_add_tail(&t, &m);
+
+	/* Write the OTP bits, if they've not yet been written.
+	 * This modifies SRAM buffer1.
+	 */
+	mutex_lock(&priv->lock);
+	dataflash_waitready(priv->spi);
+	status = spi_sync(priv->spi, &m);
+	mutex_unlock(&priv->lock);
+
+	kfree(scratch);
+
+	if (status >= 0) {
+		status = 0;
+		*retlen = len;
+	}
+	return status;
+}
+
+static char *otp_setup(struct mtd_info *device, char revision)
+{
+	device->get_fact_prot_info = dataflash_get_otp_info;
+	device->read_fact_prot_reg = dataflash_read_fact_otp;
+	device->get_user_prot_info = dataflash_get_otp_info;
+	device->read_user_prot_reg = dataflash_read_user_otp;
+
+	/* rev c parts (at45db321c and at45db1281 only!) use a
+	 * different write procedure; not (yet?) implemented.
+	 */
+	if (revision > 'c')
+		device->write_user_prot_reg = dataflash_write_user_otp;
+
+	return ", OTP";
+}
+
+#else
+
+static char *otp_setup(struct mtd_info *device, char revision)
+{
+	return " (OTP)";
+}
+
+#endif
+
+/* ......................................................................... */
+
 /*
  * Register DataFlash device with MTD subsystem.
  */
 static int __devinit
-add_dataflash(struct spi_device *spi, char *name,
-		int nr_pages, int pagesize, int pageoffset)
+add_dataflash_otp(struct spi_device *spi, char *name,
+		int nr_pages, int pagesize, int pageoffset, char revision)
 {
 	struct dataflash		*priv;
 	struct mtd_info			*device;
 	struct flash_platform_data	*pdata = spi->dev.platform_data;
+	char				*otp_tag = "";
 
 	priv = kzalloc(sizeof *priv, GFP_KERNEL);
 	if (!priv)
@@ -489,8 +664,12 @@ add_dataflash(struct spi_device *spi, char *name,
 	device->write = dataflash_write;
 	device->priv = priv;
 
-	dev_info(&spi->dev, "%s (%d KBytes) pagesize %d bytes\n",
-			name, DIV_ROUND_UP(device->size, 1024), pagesize);
+	if (revision >= 'c')
+		otp_tag = otp_setup(device, revision);
+
+	dev_info(&spi->dev, "%s (%d KBytes) pagesize %d bytes%s\n",
+			name, DIV_ROUND_UP(device->size, 1024),
+			pagesize, otp_tag);
 	dev_set_drvdata(&spi->dev, priv);
 
 	if (mtd_has_partitions()) {
@@ -519,6 +698,14 @@ add_dataflash(struct spi_device *spi, char *name,
 	return add_mtd_device(device) == 1 ? -ENODEV : 0;
 }
 
+static inline int __devinit
+add_dataflash(struct spi_device *spi, char *name,
+		int nr_pages, int pagesize, int pageoffset)
+{
+	return add_dataflash_otp(spi, name, nr_pages, pagesize,
+			pageoffset, 0);
+}
+
 struct flash_info {
 	char		*name;
 
@@ -664,13 +851,16 @@ static int __devinit dataflash_probe(struct spi_device *spi)
 	 * Try to detect dataflash by JEDEC ID.
 	 * If it succeeds we know we have either a C or D part.
 	 * D will support power of 2 pagesize option.
+	 * Both support the security register, though with different
+	 * write procedures.
 	 */
 	info = jedec_probe(spi);
 	if (IS_ERR(info))
 		return PTR_ERR(info);
 	if (info != NULL)
-		return add_dataflash(spi, info->name, info->nr_pages,
-				 info->pagesize, info->pageoffset);
+		return add_dataflash_otp(spi, info->name, info->nr_pages,
+				info->pagesize, info->pageoffset,
+				(info->flags & SUP_POW2PS) ? 'd' : 'c');
 
 	/*
 	 * Older chips support only legacy commands, identifing
diff --git a/drivers/mtd/inftlcore.c b/drivers/mtd/inftlcore.c
index c4f9d3378b24..50ce13887f63 100644
--- a/drivers/mtd/inftlcore.c
+++ b/drivers/mtd/inftlcore.c
@@ -388,6 +388,10 @@ static u16 INFTL_foldchain(struct INFTLrecord *inftl, unsigned thisVUC, unsigned
 		if (thisEUN == targetEUN)
 			break;
 
+		/* Unlink the last block from the chain. */
+		inftl->PUtable[prevEUN] = BLOCK_NIL;
+
+		/* Now try to erase it. */
 		if (INFTL_formatblock(inftl, thisEUN) < 0) {
 			/*
 			 * Could not erase : mark block as reserved.
@@ -396,7 +400,6 @@ static u16 INFTL_foldchain(struct INFTLrecord *inftl, unsigned thisVUC, unsigned
 		} else {
 			/* Correctly erased : mark it as free */
 			inftl->PUtable[thisEUN] = BLOCK_FREE;
-			inftl->PUtable[prevEUN] = BLOCK_NIL;
 			inftl->numfreeEUNs++;
 		}
 	}
diff --git a/drivers/mtd/maps/Kconfig b/drivers/mtd/maps/Kconfig
index df8e00bba07b..5ea169362164 100644
--- a/drivers/mtd/maps/Kconfig
+++ b/drivers/mtd/maps/Kconfig
@@ -332,30 +332,6 @@ config MTD_CFI_FLAGADM
 	  Mapping for the Flaga digital module. If you don't have one, ignore
 	  this setting.
 
-config MTD_WALNUT
-	tristate "Flash device mapped on IBM 405GP Walnut"
-	depends on MTD_JEDECPROBE && WALNUT && !PPC_MERGE
-	help
-	  This enables access routines for the flash chips on the IBM 405GP
-	  Walnut board. If you have one of these boards and would like to
-	  use the flash chips on it, say 'Y'.
-
-config MTD_EBONY
-	tristate "Flash devices mapped on IBM 440GP Ebony"
-	depends on MTD_JEDECPROBE && EBONY && !PPC_MERGE
-	help
-	  This enables access routines for the flash chips on the IBM 440GP
-	  Ebony board. If you have one of these boards and would like to
-	  use the flash chips on it, say 'Y'.
-
-config MTD_OCOTEA
-	tristate "Flash devices mapped on IBM 440GX Ocotea"
-	depends on MTD_CFI && OCOTEA && !PPC_MERGE
-	help
-	  This enables access routines for the flash chips on the IBM 440GX
-	  Ocotea board. If you have one of these boards and would like to
-	  use the flash chips on it, say 'Y'.
-
 config MTD_REDWOOD
 	tristate "CFI Flash devices mapped on IBM Redwood"
 	depends on MTD_CFI && ( REDWOOD_4 || REDWOOD_5 || REDWOOD_6 )
@@ -458,13 +434,6 @@ config MTD_CEIVA
 	  PhotoMax Digital Picture Frame.
 	  If you have such a device, say 'Y'.
 
-config MTD_NOR_TOTO
-	tristate "NOR Flash device on TOTO board"
-	depends on ARCH_OMAP && OMAP_TOTO
-	help
-	  This enables access to the NOR flash on the Texas Instruments
-	  TOTO board.
-
 config MTD_H720X
 	tristate "Hynix evaluation board mappings"
 	depends on MTD_CFI && ( ARCH_H7201 || ARCH_H7202 )
@@ -522,7 +491,7 @@ config MTD_BFIN_ASYNC
 
 config MTD_UCLINUX
 	tristate "Generic uClinux RAM/ROM filesystem support"
-	depends on MTD_PARTITIONS && !MMU
+	depends on MTD_PARTITIONS && MTD_RAM && !MMU
 	help
 	  Map driver to support image based filesystems for uClinux.
 
diff --git a/drivers/mtd/maps/Makefile b/drivers/mtd/maps/Makefile
index 6cda6df973e5..6d9ba35caf11 100644
--- a/drivers/mtd/maps/Makefile
+++ b/drivers/mtd/maps/Makefile
@@ -50,12 +50,8 @@ obj-$(CONFIG_MTD_REDWOOD)	+= redwood.o
 obj-$(CONFIG_MTD_UCLINUX)	+= uclinux.o
 obj-$(CONFIG_MTD_NETtel)	+= nettel.o
 obj-$(CONFIG_MTD_SCB2_FLASH)	+= scb2_flash.o
-obj-$(CONFIG_MTD_EBONY)		+= ebony.o
-obj-$(CONFIG_MTD_OCOTEA)	+= ocotea.o
-obj-$(CONFIG_MTD_WALNUT)        += walnut.o
 obj-$(CONFIG_MTD_H720X)		+= h720x-flash.o
 obj-$(CONFIG_MTD_SBC8240)	+= sbc8240.o
-obj-$(CONFIG_MTD_NOR_TOTO)	+= omap-toto-flash.o
 obj-$(CONFIG_MTD_IXP4XX)	+= ixp4xx.o
 obj-$(CONFIG_MTD_IXP2000)	+= ixp2000.o
 obj-$(CONFIG_MTD_WRSBC8260)	+= wr_sbc82xx_flash.o
diff --git a/drivers/mtd/maps/ebony.c b/drivers/mtd/maps/ebony.c
deleted file mode 100644
index d92b7c70d3ed..000000000000
--- a/drivers/mtd/maps/ebony.c
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Mapping for Ebony user flash
- *
- * Matt Porter <mporter@kernel.crashing.org>
- *
- * Copyright 2002-2004 MontaVista Software Inc.
- *
- * This program is free software; you can redistribute  it and/or modify it
- * under  the terms of  the GNU General  Public License as published by the
- * Free Software Foundation;  either version 2 of the  License, or (at your
- * option) any later version.
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/map.h>
-#include <linux/mtd/partitions.h>
-#include <asm/io.h>
-#include <asm/ibm44x.h>
-#include <platforms/4xx/ebony.h>
-
-static struct mtd_info *flash;
-
-static struct map_info ebony_small_map = {
-	.name =		"Ebony small flash",
-	.size =		EBONY_SMALL_FLASH_SIZE,
-	.bankwidth =	1,
-};
-
-static struct map_info ebony_large_map = {
-	.name =		"Ebony large flash",
-	.size =		EBONY_LARGE_FLASH_SIZE,
-	.bankwidth =	1,
-};
-
-static struct mtd_partition ebony_small_partitions[] = {
-	{
-		.name =   "OpenBIOS",
-		.offset = 0x0,
-		.size =   0x80000,
-	}
-};
-
-static struct mtd_partition ebony_large_partitions[] = {
-	{
-		.name =   "fs",
-		.offset = 0,
-		.size =   0x380000,
-	},
-	{
-		.name =   "firmware",
-		.offset = 0x380000,
-		.size =   0x80000,
-	}
-};
-
-int __init init_ebony(void)
-{
-	u8 fpga0_reg;
-	u8 __iomem *fpga0_adr;
-	unsigned long long small_flash_base, large_flash_base;
-
-	fpga0_adr = ioremap64(EBONY_FPGA_ADDR, 16);
-	if (!fpga0_adr)
-		return -ENOMEM;
-
-	fpga0_reg = readb(fpga0_adr);
-	iounmap(fpga0_adr);
-
-	if (EBONY_BOOT_SMALL_FLASH(fpga0_reg) &&
-			!EBONY_FLASH_SEL(fpga0_reg))
-		small_flash_base = EBONY_SMALL_FLASH_HIGH2;
-	else if (EBONY_BOOT_SMALL_FLASH(fpga0_reg) &&
-			EBONY_FLASH_SEL(fpga0_reg))
-		small_flash_base = EBONY_SMALL_FLASH_HIGH1;
-	else if (!EBONY_BOOT_SMALL_FLASH(fpga0_reg) &&
-			!EBONY_FLASH_SEL(fpga0_reg))
-		small_flash_base = EBONY_SMALL_FLASH_LOW2;
-	else
-		small_flash_base = EBONY_SMALL_FLASH_LOW1;
-
-	if (EBONY_BOOT_SMALL_FLASH(fpga0_reg) &&
-			!EBONY_ONBRD_FLASH_EN(fpga0_reg))
-		large_flash_base = EBONY_LARGE_FLASH_LOW;
-	else
-		large_flash_base = EBONY_LARGE_FLASH_HIGH;
-
-	ebony_small_map.phys = small_flash_base;
-	ebony_small_map.virt = ioremap64(small_flash_base,
-					 ebony_small_map.size);
-
-	if (!ebony_small_map.virt) {
-		printk("Failed to ioremap flash\n");
-		return -EIO;
-	}
-
-	simple_map_init(&ebony_small_map);
-
-	flash = do_map_probe("jedec_probe", &ebony_small_map);
-	if (flash) {
-		flash->owner = THIS_MODULE;
-		add_mtd_partitions(flash, ebony_small_partitions,
-					ARRAY_SIZE(ebony_small_partitions));
-	} else {
-		printk("map probe failed for flash\n");
-		iounmap(ebony_small_map.virt);
-		return -ENXIO;
-	}
-
-	ebony_large_map.phys = large_flash_base;
-	ebony_large_map.virt = ioremap64(large_flash_base,
-					 ebony_large_map.size);
-
-	if (!ebony_large_map.virt) {
-		printk("Failed to ioremap flash\n");
-		iounmap(ebony_small_map.virt);
-		return -EIO;
-	}
-
-	simple_map_init(&ebony_large_map);
-
-	flash = do_map_probe("jedec_probe", &ebony_large_map);
-	if (flash) {
-		flash->owner = THIS_MODULE;
-		add_mtd_partitions(flash, ebony_large_partitions,
-					ARRAY_SIZE(ebony_large_partitions));
-	} else {
-		printk("map probe failed for flash\n");
-		iounmap(ebony_small_map.virt);
-		iounmap(ebony_large_map.virt);
-		return -ENXIO;
-	}
-
-	return 0;
-}
-
-static void __exit cleanup_ebony(void)
-{
-	if (flash) {
-		del_mtd_partitions(flash);
-		map_destroy(flash);
-	}
-
-	if (ebony_small_map.virt) {
-		iounmap(ebony_small_map.virt);
-		ebony_small_map.virt = NULL;
-	}
-
-	if (ebony_large_map.virt) {
-		iounmap(ebony_large_map.virt);
-		ebony_large_map.virt = NULL;
-	}
-}
-
-module_init(init_ebony);
-module_exit(cleanup_ebony);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Matt Porter <mporter@kernel.crashing.org>");
-MODULE_DESCRIPTION("MTD map and partitions for IBM 440GP Ebony boards");
diff --git a/drivers/mtd/maps/ocotea.c b/drivers/mtd/maps/ocotea.c
deleted file mode 100644
index 5522eac8c980..000000000000
--- a/drivers/mtd/maps/ocotea.c
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * Mapping for Ocotea user flash
- *
- * Matt Porter <mporter@kernel.crashing.org>
- *
- * Copyright 2002-2004 MontaVista Software Inc.
- *
- * This program is free software; you can redistribute  it and/or modify it
- * under  the terms of  the GNU General  Public License as published by the
- * Free Software Foundation;  either version 2 of the  License, or (at your
- * option) any later version.
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/map.h>
-#include <linux/mtd/partitions.h>
-#include <asm/io.h>
-#include <asm/ibm44x.h>
-#include <platforms/4xx/ocotea.h>
-
-static struct mtd_info *flash;
-
-static struct map_info ocotea_small_map = {
-	.name =		"Ocotea small flash",
-	.size =		OCOTEA_SMALL_FLASH_SIZE,
-	.buswidth =	1,
-};
-
-static struct map_info ocotea_large_map = {
-	.name =		"Ocotea large flash",
-	.size =		OCOTEA_LARGE_FLASH_SIZE,
-	.buswidth =	1,
-};
-
-static struct mtd_partition ocotea_small_partitions[] = {
-	{
-		.name =   "pibs",
-		.offset = 0x0,
-		.size =   0x100000,
-	}
-};
-
-static struct mtd_partition ocotea_large_partitions[] = {
-	{
-		.name =   "fs",
-		.offset = 0,
-		.size =   0x300000,
-	},
-	{
-		.name =   "firmware",
-		.offset = 0x300000,
-		.size =   0x100000,
-	}
-};
-
-int __init init_ocotea(void)
-{
-	u8 fpga0_reg;
-	u8 *fpga0_adr;
-	unsigned long long small_flash_base, large_flash_base;
-
-	fpga0_adr = ioremap64(OCOTEA_FPGA_ADDR, 16);
-	if (!fpga0_adr)
-		return -ENOMEM;
-
-	fpga0_reg = readb((unsigned long)fpga0_adr);
-	iounmap(fpga0_adr);
-
-	if (OCOTEA_BOOT_LARGE_FLASH(fpga0_reg)) {
-		small_flash_base = OCOTEA_SMALL_FLASH_HIGH;
-		large_flash_base = OCOTEA_LARGE_FLASH_LOW;
-	}
-	else {
-		small_flash_base = OCOTEA_SMALL_FLASH_LOW;
-		large_flash_base = OCOTEA_LARGE_FLASH_HIGH;
-	}
-
-	ocotea_small_map.phys = small_flash_base;
-	ocotea_small_map.virt = ioremap64(small_flash_base,
-					 ocotea_small_map.size);
-
-	if (!ocotea_small_map.virt) {
-		printk("Failed to ioremap flash\n");
-		return -EIO;
-	}
-
-	simple_map_init(&ocotea_small_map);
-
-	flash = do_map_probe("map_rom", &ocotea_small_map);
-	if (flash) {
-		flash->owner = THIS_MODULE;
-		add_mtd_partitions(flash, ocotea_small_partitions,
-					ARRAY_SIZE(ocotea_small_partitions));
-	} else {
-		printk("map probe failed for flash\n");
-		iounmap(ocotea_small_map.virt);
-		return -ENXIO;
-	}
-
-	ocotea_large_map.phys = large_flash_base;
-	ocotea_large_map.virt = ioremap64(large_flash_base,
-					 ocotea_large_map.size);
-
-	if (!ocotea_large_map.virt) {
-		printk("Failed to ioremap flash\n");
-		iounmap(ocotea_small_map.virt);
-		return -EIO;
-	}
-
-	simple_map_init(&ocotea_large_map);
-
-	flash = do_map_probe("cfi_probe", &ocotea_large_map);
-	if (flash) {
-		flash->owner = THIS_MODULE;
-		add_mtd_partitions(flash, ocotea_large_partitions,
-					ARRAY_SIZE(ocotea_large_partitions));
-	} else {
-		printk("map probe failed for flash\n");
-		iounmap(ocotea_small_map.virt);
-		iounmap(ocotea_large_map.virt);
-		return -ENXIO;
-	}
-
-	return 0;
-}
-
-static void __exit cleanup_ocotea(void)
-{
-	if (flash) {
-		del_mtd_partitions(flash);
-		map_destroy(flash);
-	}
-
-	if (ocotea_small_map.virt) {
-		iounmap((void *)ocotea_small_map.virt);
-		ocotea_small_map.virt = 0;
-	}
-
-	if (ocotea_large_map.virt) {
-		iounmap((void *)ocotea_large_map.virt);
-		ocotea_large_map.virt = 0;
-	}
-}
-
-module_init(init_ocotea);
-module_exit(cleanup_ocotea);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Matt Porter <mporter@kernel.crashing.org>");
-MODULE_DESCRIPTION("MTD map and partitions for IBM 440GX Ocotea boards");
diff --git a/drivers/mtd/maps/omap-toto-flash.c b/drivers/mtd/maps/omap-toto-flash.c
deleted file mode 100644
index 0a60ebbc2175..000000000000
--- a/drivers/mtd/maps/omap-toto-flash.c
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- * NOR Flash memory access on TI Toto board
- *
- * jzhang@ti.com (C) 2003 Texas Instruments.
- *
- *  (C) 2002 MontVista Software, Inc.
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/map.h>
-#include <linux/mtd/partitions.h>
-
-#include <asm/hardware.h>
-#include <asm/io.h>
-
-
-#ifndef CONFIG_ARCH_OMAP
-#error This is for OMAP architecture only
-#endif
-
-//these lines need be moved to a hardware header file
-#define OMAP_TOTO_FLASH_BASE 0xd8000000
-#define OMAP_TOTO_FLASH_SIZE 0x80000
-
-static struct map_info omap_toto_map_flash = {
-	.name =		"OMAP Toto flash",
-	.bankwidth =	2,
-	.virt =		(void __iomem *)OMAP_TOTO_FLASH_BASE,
-};
-
-
-static struct mtd_partition toto_flash_partitions[] = {
-	{
-		.name =		"BootLoader",
-		.size =		0x00040000,     /* hopefully u-boot will stay 128k + 128*/
-		.offset =	0,
-		.mask_flags =	MTD_WRITEABLE,  /* force read-only */
-	}, {
-		.name =		"ReservedSpace",
-		.size =		0x00030000,
-		.offset =	MTDPART_OFS_APPEND,
-		//mask_flags:	MTD_WRITEABLE,  /* force read-only */
-	}, {
-		.name =		"EnvArea",      /* bottom 64KiB for env vars */
-		.size =		MTDPART_SIZ_FULL,
-		.offset =	MTDPART_OFS_APPEND,
-	}
-};
-
-static struct mtd_partition *parsed_parts;
-
-static struct mtd_info *flash_mtd;
-
-static int __init init_flash (void)
-{
-
-	struct mtd_partition *parts;
-	int nb_parts = 0;
-	int parsed_nr_parts = 0;
-	const char *part_type;
-
-	/*
-	 * Static partition definition selection
-	 */
-	part_type = "static";
-
- 	parts = toto_flash_partitions;
-	nb_parts = ARRAY_SIZE(toto_flash_partitions);
-	omap_toto_map_flash.size = OMAP_TOTO_FLASH_SIZE;
-	omap_toto_map_flash.phys = virt_to_phys(OMAP_TOTO_FLASH_BASE);
-
-	simple_map_init(&omap_toto_map_flash);
-	/*
-	 * Now let's probe for the actual flash.  Do it here since
-	 * specific machine settings might have been set above.
-	 */
-	printk(KERN_NOTICE "OMAP toto flash: probing %d-bit flash bus\n",
-		omap_toto_map_flash.bankwidth*8);
-	flash_mtd = do_map_probe("jedec_probe", &omap_toto_map_flash);
-	if (!flash_mtd)
-		return -ENXIO;
-
- 	if (parsed_nr_parts > 0) {
-		parts = parsed_parts;
-		nb_parts = parsed_nr_parts;
-	}
-
-	if (nb_parts == 0) {
-		printk(KERN_NOTICE "OMAP toto flash: no partition info available,"
-			"registering whole flash at once\n");
-		if (add_mtd_device(flash_mtd)){
-            return -ENXIO;
-        }
-	} else {
-		printk(KERN_NOTICE "Using %s partition definition\n",
-			part_type);
-		return add_mtd_partitions(flash_mtd, parts, nb_parts);
-	}
-	return 0;
-}
-
-int __init omap_toto_mtd_init(void)
-{
-	int status;
-
- 	if (status = init_flash()) {
-		printk(KERN_ERR "OMAP Toto Flash: unable to init map for toto flash\n");
-	}
-    return status;
-}
-
-static void  __exit omap_toto_mtd_cleanup(void)
-{
-	if (flash_mtd) {
-		del_mtd_partitions(flash_mtd);
-		map_destroy(flash_mtd);
-		kfree(parsed_parts);
-	}
-}
-
-module_init(omap_toto_mtd_init);
-module_exit(omap_toto_mtd_cleanup);
-
-MODULE_AUTHOR("Jian Zhang");
-MODULE_DESCRIPTION("OMAP Toto board map driver");
-MODULE_LICENSE("GPL");
diff --git a/drivers/mtd/maps/pci.c b/drivers/mtd/maps/pci.c
index 5c6a25c90380..48f4cf5cb9d1 100644
--- a/drivers/mtd/maps/pci.c
+++ b/drivers/mtd/maps/pci.c
@@ -203,15 +203,8 @@ intel_dc21285_init(struct pci_dev *dev, struct map_pci_info *map)
 		 * not enabled, should we be allocating a new resource for it
 		 * or simply enabling it?
 		 */
-		if (!(pci_resource_flags(dev, PCI_ROM_RESOURCE) &
-				    IORESOURCE_ROM_ENABLE)) {
-		     	u32 val;
-			pci_resource_flags(dev, PCI_ROM_RESOURCE) |= IORESOURCE_ROM_ENABLE;
-			pci_read_config_dword(dev, PCI_ROM_ADDRESS, &val);
-			val |= PCI_ROM_ADDRESS_ENABLE;
-			pci_write_config_dword(dev, PCI_ROM_ADDRESS, val);
-			printk("%s: enabling expansion ROM\n", pci_name(dev));
-		}
+		pci_enable_rom(dev);
+		printk("%s: enabling expansion ROM\n", pci_name(dev));
 	}
 
 	if (!len || !base)
@@ -232,18 +225,13 @@ intel_dc21285_init(struct pci_dev *dev, struct map_pci_info *map)
 static void
 intel_dc21285_exit(struct pci_dev *dev, struct map_pci_info *map)
 {
-	u32 val;
-
 	if (map->base)
 		iounmap(map->base);
 
 	/*
 	 * We need to undo the PCI BAR2/PCI ROM BAR address alteration.
 	 */
-	pci_resource_flags(dev, PCI_ROM_RESOURCE) &= ~IORESOURCE_ROM_ENABLE;
-	pci_read_config_dword(dev, PCI_ROM_ADDRESS, &val);
-	val &= ~PCI_ROM_ADDRESS_ENABLE;
-	pci_write_config_dword(dev, PCI_ROM_ADDRESS, val);
+	pci_disable_rom(dev);
 }
 
 static unsigned long
diff --git a/drivers/mtd/maps/physmap_of.c b/drivers/mtd/maps/physmap_of.c
index 49acd4171893..5fcfec034a94 100644
--- a/drivers/mtd/maps/physmap_of.c
+++ b/drivers/mtd/maps/physmap_of.c
@@ -230,8 +230,7 @@ static int __devinit of_flash_probe(struct of_device *dev,
 
 #ifdef CONFIG_MTD_OF_PARTS
 	if (err == 0) {
-		err = of_mtd_parse_partitions(&dev->dev, info->mtd,
-		                              dp, &info->parts);
+		err = of_mtd_parse_partitions(&dev->dev, dp, &info->parts);
 		if (err < 0)
 			return err;
 	}
diff --git a/drivers/mtd/maps/walnut.c b/drivers/mtd/maps/walnut.c
deleted file mode 100644
index e243476c8171..000000000000
--- a/drivers/mtd/maps/walnut.c
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Mapping for Walnut flash
- * (used ebony.c as a "framework")
- *
- * Heikki Lindholm <holindho@infradead.org>
- *
- *
- * This program is free software; you can redistribute  it and/or modify it
- * under  the terms of  the GNU General  Public License as published by the
- * Free Software Foundation;  either version 2 of the  License, or (at your
- * option) any later version.
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/map.h>
-#include <linux/mtd/partitions.h>
-#include <asm/io.h>
-#include <asm/ibm4xx.h>
-#include <platforms/4xx/walnut.h>
-
-/* these should be in platforms/4xx/walnut.h ? */
-#define WALNUT_FLASH_ONBD_N(x)		(x & 0x02)
-#define WALNUT_FLASH_SRAM_SEL(x)	(x & 0x01)
-#define WALNUT_FLASH_LOW		0xFFF00000
-#define WALNUT_FLASH_HIGH		0xFFF80000
-#define WALNUT_FLASH_SIZE		0x80000
-
-static struct mtd_info *flash;
-
-static struct map_info walnut_map = {
-	.name =		"Walnut flash",
-	.size =		WALNUT_FLASH_SIZE,
-	.bankwidth =	1,
-};
-
-/* Actually, OpenBIOS is the last 128 KiB of the flash - better
- * partitioning could be made */
-static struct mtd_partition walnut_partitions[] = {
-	{
-		.name =   "OpenBIOS",
-		.offset = 0x0,
-		.size =   WALNUT_FLASH_SIZE,
-		/*.mask_flags = MTD_WRITEABLE, */ /* force read-only */
-	}
-};
-
-int __init init_walnut(void)
-{
-	u8 fpga_brds1;
-	void *fpga_brds1_adr;
-	void *fpga_status_adr;
-	unsigned long flash_base;
-
-	/* this should already be mapped (platform/4xx/walnut.c) */
-	fpga_status_adr = ioremap(WALNUT_FPGA_BASE, 8);
-	if (!fpga_status_adr)
-		return -ENOMEM;
-
-	fpga_brds1_adr = fpga_status_adr+5;
-	fpga_brds1 = readb(fpga_brds1_adr);
-	/* iounmap(fpga_status_adr); */
-
-	if (WALNUT_FLASH_ONBD_N(fpga_brds1)) {
-		printk("The on-board flash is disabled (U79 sw 5)!");
-		iounmap(fpga_status_adr);
-		return -EIO;
-	}
-	if (WALNUT_FLASH_SRAM_SEL(fpga_brds1))
-		flash_base = WALNUT_FLASH_LOW;
-	else
-		flash_base = WALNUT_FLASH_HIGH;
-
-	walnut_map.phys = flash_base;
-	walnut_map.virt =
-		(void __iomem *)ioremap(flash_base, walnut_map.size);
-
-	if (!walnut_map.virt) {
-		printk("Failed to ioremap flash.\n");
-		iounmap(fpga_status_adr);
-		return -EIO;
-	}
-
-	simple_map_init(&walnut_map);
-
-	flash = do_map_probe("jedec_probe", &walnut_map);
-	if (flash) {
-		flash->owner = THIS_MODULE;
-		add_mtd_partitions(flash, walnut_partitions,
-					ARRAY_SIZE(walnut_partitions));
-	} else {
-		printk("map probe failed for flash\n");
-		iounmap(fpga_status_adr);
-		return -ENXIO;
-	}
-
-	iounmap(fpga_status_adr);
-	return 0;
-}
-
-static void __exit cleanup_walnut(void)
-{
-	if (flash) {
-		del_mtd_partitions(flash);
-		map_destroy(flash);
-	}
-
-	if (walnut_map.virt) {
-		iounmap((void *)walnut_map.virt);
-		walnut_map.virt = 0;
-	}
-}
-
-module_init(init_walnut);
-module_exit(cleanup_walnut);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Heikki Lindholm <holindho@infradead.org>");
-MODULE_DESCRIPTION("MTD map and partitions for IBM 405GP Walnut boards");
diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
index 1c74762dec89..963840e9b5bf 100644
--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -348,7 +348,7 @@ static void mtdchar_erase_callback (struct erase_info *instr)
 	wake_up((wait_queue_head_t *)instr->priv);
 }
 
-#if defined(CONFIG_MTD_OTP) || defined(CONFIG_MTD_ONENAND_OTP)
+#ifdef CONFIG_HAVE_MTD_OTP
 static int otp_select_filemode(struct mtd_file_info *mfi, int mode)
 {
 	struct mtd_info *mtd = mfi->mtd;
@@ -665,7 +665,7 @@ static int mtd_ioctl(struct inode *inode, struct file *file,
 		break;
 	}
 
-#if defined(CONFIG_MTD_OTP) || defined(CONFIG_MTD_ONENAND_OTP)
+#ifdef CONFIG_HAVE_MTD_OTP
 	case OTPSELECT:
 	{
 		int mode;
diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c
index 2972a5edb73d..789842d0e6f2 100644
--- a/drivers/mtd/mtdconcat.c
+++ b/drivers/mtd/mtdconcat.c
@@ -444,7 +444,7 @@ static int concat_erase(struct mtd_info *mtd, struct erase_info *instr)
 			return -EINVAL;
 	}
 
-	instr->fail_addr = 0xffffffff;
+	instr->fail_addr = MTD_FAIL_ADDR_UNKNOWN;
 
 	/* make a local copy of instr to avoid modifying the caller's struct */
 	erase = kmalloc(sizeof (struct erase_info), GFP_KERNEL);
@@ -493,7 +493,7 @@ static int concat_erase(struct mtd_info *mtd, struct erase_info *instr)
 			/* sanity check: should never happen since
 			 * block alignment has been checked above */
 			BUG_ON(err == -EINVAL);
-			if (erase->fail_addr != 0xffffffff)
+			if (erase->fail_addr != MTD_FAIL_ADDR_UNKNOWN)
 				instr->fail_addr = erase->fail_addr + offset;
 			break;
 		}
diff --git a/drivers/mtd/mtdoops.c b/drivers/mtd/mtdoops.c
index 5a680e1e61f1..aebb3b27edbd 100644
--- a/drivers/mtd/mtdoops.c
+++ b/drivers/mtd/mtdoops.c
@@ -33,6 +33,7 @@
 #include <linux/interrupt.h>
 #include <linux/mtd/mtd.h>
 
+#define MTDOOPS_KERNMSG_MAGIC 0x5d005d00
 #define OOPS_PAGE_SIZE 4096
 
 static struct mtdoops_context {
@@ -99,7 +100,7 @@ static void mtdoops_inc_counter(struct mtdoops_context *cxt)
 	int ret;
 
 	cxt->nextpage++;
-	if (cxt->nextpage > cxt->oops_pages)
+	if (cxt->nextpage >= cxt->oops_pages)
 		cxt->nextpage = 0;
 	cxt->nextcount++;
 	if (cxt->nextcount == 0xffffffff)
@@ -141,7 +142,7 @@ static void mtdoops_workfunc_erase(struct work_struct *work)
 	mod = (cxt->nextpage * OOPS_PAGE_SIZE) % mtd->erasesize;
 	if (mod != 0) {
 		cxt->nextpage = cxt->nextpage + ((mtd->erasesize - mod) / OOPS_PAGE_SIZE);
-		if (cxt->nextpage > cxt->oops_pages)
+		if (cxt->nextpage >= cxt->oops_pages)
 			cxt->nextpage = 0;
 	}
 
@@ -158,7 +159,7 @@ badblock:
 				cxt->nextpage * OOPS_PAGE_SIZE);
 		i++;
 		cxt->nextpage = cxt->nextpage + (mtd->erasesize / OOPS_PAGE_SIZE);
-		if (cxt->nextpage > cxt->oops_pages)
+		if (cxt->nextpage >= cxt->oops_pages)
 			cxt->nextpage = 0;
 		if (i == (cxt->oops_pages / (mtd->erasesize / OOPS_PAGE_SIZE))) {
 			printk(KERN_ERR "mtdoops: All blocks bad!\n");
@@ -224,40 +225,40 @@ static void find_next_position(struct mtdoops_context *cxt)
 {
 	struct mtd_info *mtd = cxt->mtd;
 	int ret, page, maxpos = 0;
-	u32 count, maxcount = 0xffffffff;
+	u32 count[2], maxcount = 0xffffffff;
 	size_t retlen;
 
 	for (page = 0; page < cxt->oops_pages; page++) {
-		ret = mtd->read(mtd, page * OOPS_PAGE_SIZE, 4, &retlen, (u_char *) &count);
-		if ((retlen != 4) || ((ret < 0) && (ret != -EUCLEAN))) {
-			printk(KERN_ERR "mtdoops: Read failure at %d (%td of 4 read)"
+		ret = mtd->read(mtd, page * OOPS_PAGE_SIZE, 8, &retlen, (u_char *) &count[0]);
+		if ((retlen != 8) || ((ret < 0) && (ret != -EUCLEAN))) {
+			printk(KERN_ERR "mtdoops: Read failure at %d (%td of 8 read)"
 				", err %d.\n", page * OOPS_PAGE_SIZE, retlen, ret);
 			continue;
 		}
 
-		if (count == 0xffffffff)
+		if (count[1] != MTDOOPS_KERNMSG_MAGIC)
+			continue;
+		if (count[0] == 0xffffffff)
 			continue;
 		if (maxcount == 0xffffffff) {
-			maxcount = count;
+			maxcount = count[0];
 			maxpos = page;
-		} else if ((count < 0x40000000) && (maxcount > 0xc0000000)) {
-			maxcount = count;
+		} else if ((count[0] < 0x40000000) && (maxcount > 0xc0000000)) {
+			maxcount = count[0];
 			maxpos = page;
-		} else if ((count > maxcount) && (count < 0xc0000000)) {
-			maxcount = count;
+		} else if ((count[0] > maxcount) && (count[0] < 0xc0000000)) {
+			maxcount = count[0];
 			maxpos = page;
-		} else if ((count > maxcount) && (count > 0xc0000000)
+		} else if ((count[0] > maxcount) && (count[0] > 0xc0000000)
 					&& (maxcount > 0x80000000)) {
-			maxcount = count;
+			maxcount = count[0];
 			maxpos = page;
 		}
 	}
 	if (maxcount == 0xffffffff) {
 		cxt->nextpage = 0;
 		cxt->nextcount = 1;
-		cxt->ready = 1;
-		printk(KERN_DEBUG "mtdoops: Ready %d, %d (first init)\n",
-				cxt->nextpage, cxt->nextcount);
+		schedule_work(&cxt->work_erase);
 		return;
 	}
 
@@ -358,8 +359,9 @@ mtdoops_console_write(struct console *co, const char *s, unsigned int count)
 
 	if (cxt->writecount == 0) {
 		u32 *stamp = cxt->oops_buf;
-		*stamp = cxt->nextcount;
-		cxt->writecount = 4;
+		*stamp++ = cxt->nextcount;
+		*stamp = MTDOOPS_KERNMSG_MAGIC;
+		cxt->writecount = 8;
 	}
 
 	if ((count + cxt->writecount) > OOPS_PAGE_SIZE)
diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c
index 9a06dc93ee0d..3728913fa5fa 100644
--- a/drivers/mtd/mtdpart.c
+++ b/drivers/mtd/mtdpart.c
@@ -214,7 +214,7 @@ static int part_erase(struct mtd_info *mtd, struct erase_info *instr)
 	instr->addr += part->offset;
 	ret = part->master->erase(part->master, instr);
 	if (ret) {
-		if (instr->fail_addr != 0xffffffff)
+		if (instr->fail_addr != MTD_FAIL_ADDR_UNKNOWN)
 			instr->fail_addr -= part->offset;
 		instr->addr -= part->offset;
 	}
@@ -226,7 +226,7 @@ void mtd_erase_callback(struct erase_info *instr)
 	if (instr->mtd->erase == part_erase) {
 		struct mtd_part *part = PART(instr->mtd);
 
-		if (instr->fail_addr != 0xffffffff)
+		if (instr->fail_addr != MTD_FAIL_ADDR_UNKNOWN)
 			instr->fail_addr -= part->offset;
 		instr->addr -= part->offset;
 	}
diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig
index 41f361c49b32..1c2e9450d663 100644
--- a/drivers/mtd/nand/Kconfig
+++ b/drivers/mtd/nand/Kconfig
@@ -56,6 +56,12 @@ config MTD_NAND_H1900
 	help
 	  This enables the driver for the iPAQ h1900 flash.
 
+config MTD_NAND_GPIO
+	tristate "GPIO NAND Flash driver"
+	depends on GENERIC_GPIO && ARM
+	help
+	  This enables a GPIO based NAND flash driver.
+
 config MTD_NAND_SPIA
 	tristate "NAND Flash device on SPIA board"
 	depends on ARCH_P720T
@@ -68,12 +74,6 @@ config MTD_NAND_AMS_DELTA
 	help
 	  Support for NAND flash on Amstrad E3 (Delta).
 
-config MTD_NAND_TOTO
-	tristate "NAND Flash device on TOTO board"
-	depends on ARCH_OMAP && BROKEN
-	help
-	  Support for NAND flash on Texas Instruments Toto platform.
-
 config MTD_NAND_TS7250
 	tristate "NAND Flash device on TS-7250 board"
 	depends on MACH_TS72XX
@@ -163,13 +163,6 @@ config MTD_NAND_S3C2410_HWECC
 	  incorrect ECC generation, and if using these, the default of
 	  software ECC is preferable.
 
-config MTD_NAND_NDFC
-	tristate "NDFC NanD Flash Controller"
-	depends on 4xx && !PPC_MERGE
-	select MTD_NAND_ECC_SMC
-	help
-	 NDFC Nand Flash Controllers are integrated in IBM/AMCC's 4xx SoCs
-
 config MTD_NAND_S3C2410_CLKSTOP
 	bool "S3C2410 NAND IDLE clock stop"
 	depends on MTD_NAND_S3C2410
@@ -340,6 +333,13 @@ config MTD_NAND_PXA3xx
 	  This enables the driver for the NAND flash device found on
 	  PXA3xx processors
 
+config MTD_NAND_PXA3xx_BUILTIN
+	bool "Use builtin definitions for some NAND chips (deprecated)"
+	depends on MTD_NAND_PXA3xx
+	help
+	  This enables builtin definitions for some NAND chips. This
+	  is deprecated in favor of platform specific data.
+
 config MTD_NAND_CM_X270
 	tristate "Support for NAND Flash on CM-X270 modules"
 	depends on MTD_NAND && MACH_ARMCORE
@@ -400,10 +400,24 @@ config MTD_NAND_FSL_ELBC
 
 config MTD_NAND_FSL_UPM
 	tristate "Support for NAND on Freescale UPM"
-	depends on MTD_NAND && OF_GPIO && (PPC_83xx || PPC_85xx)
+	depends on MTD_NAND && (PPC_83xx || PPC_85xx)
 	select FSL_LBC
 	help
 	  Enables support for NAND Flash chips wired onto Freescale PowerPC
 	  processor localbus with User-Programmable Machine support.
 
+config MTD_NAND_MXC
+	tristate "MXC NAND support"
+	depends on ARCH_MX2
+	help
+	  This enables the driver for the NAND flash controller on the
+	  MXC processors.
+
+config MTD_NAND_SH_FLCTL
+	tristate "Support for NAND on Renesas SuperH FLCTL"
+	depends on MTD_NAND && SUPERH && CPU_SUBTYPE_SH7723
+	help
+	  Several Renesas SuperH CPU has FLCTL. This option enables support
+	  for NAND Flash using FLCTL. This driver support SH7723.
+
 endif # MTD_NAND
diff --git a/drivers/mtd/nand/Makefile b/drivers/mtd/nand/Makefile
index b786c5da82da..b661586afbfc 100644
--- a/drivers/mtd/nand/Makefile
+++ b/drivers/mtd/nand/Makefile
@@ -8,7 +8,6 @@ obj-$(CONFIG_MTD_NAND_IDS)		+= nand_ids.o
 obj-$(CONFIG_MTD_NAND_CAFE)		+= cafe_nand.o
 obj-$(CONFIG_MTD_NAND_SPIA)		+= spia.o
 obj-$(CONFIG_MTD_NAND_AMS_DELTA)	+= ams-delta.o
-obj-$(CONFIG_MTD_NAND_TOTO)		+= toto.o
 obj-$(CONFIG_MTD_NAND_AUTCPU12)		+= autcpu12.o
 obj-$(CONFIG_MTD_NAND_EDB7312)		+= edb7312.o
 obj-$(CONFIG_MTD_NAND_AU1550)		+= au1550nd.o
@@ -24,6 +23,7 @@ obj-$(CONFIG_MTD_NAND_NANDSIM)		+= nandsim.o
 obj-$(CONFIG_MTD_NAND_CS553X)		+= cs553x_nand.o
 obj-$(CONFIG_MTD_NAND_NDFC)		+= ndfc.o
 obj-$(CONFIG_MTD_NAND_ATMEL)		+= atmel_nand.o
+obj-$(CONFIG_MTD_NAND_GPIO)		+= gpio.o
 obj-$(CONFIG_MTD_NAND_CM_X270)		+= cmx270_nand.o
 obj-$(CONFIG_MTD_NAND_BASLER_EXCITE)	+= excite_nandflash.o
 obj-$(CONFIG_MTD_NAND_PXA3xx)		+= pxa3xx_nand.o
@@ -34,5 +34,7 @@ obj-$(CONFIG_MTD_NAND_PASEMI)		+= pasemi_nand.o
 obj-$(CONFIG_MTD_NAND_ORION)		+= orion_nand.o
 obj-$(CONFIG_MTD_NAND_FSL_ELBC)		+= fsl_elbc_nand.o
 obj-$(CONFIG_MTD_NAND_FSL_UPM)		+= fsl_upm.o
+obj-$(CONFIG_MTD_NAND_SH_FLCTL)		+= sh_flctl.o
+obj-$(CONFIG_MTD_NAND_MXC)		+= mxc_nand.o
 
 nand-objs := nand_base.o nand_bbt.o
diff --git a/drivers/mtd/nand/atmel_nand.c b/drivers/mtd/nand/atmel_nand.c
index 3387e0d5076b..c98c1570a40b 100644
--- a/drivers/mtd/nand/atmel_nand.c
+++ b/drivers/mtd/nand/atmel_nand.c
@@ -174,48 +174,6 @@ static void atmel_write_buf16(struct mtd_info *mtd, const u8 *buf, int len)
 }
 
 /*
- * write oob for small pages
- */
-static int atmel_nand_write_oob_512(struct mtd_info *mtd,
-		struct nand_chip *chip, int page)
-{
-	int chunk = chip->ecc.bytes + chip->ecc.prepad + chip->ecc.postpad;
-	int eccsize = chip->ecc.size, length = mtd->oobsize;
-	int len, pos, status = 0;
-	const uint8_t *bufpoi = chip->oob_poi;
-
-	pos = eccsize + chunk;
-
-	chip->cmdfunc(mtd, NAND_CMD_SEQIN, pos, page);
-	len = min_t(int, length, chunk);
-	chip->write_buf(mtd, bufpoi, len);
-	bufpoi += len;
-	length -= len;
-	if (length > 0)
-		chip->write_buf(mtd, bufpoi, length);
-
-	chip->cmdfunc(mtd, NAND_CMD_PAGEPROG, -1, -1);
-	status = chip->waitfunc(mtd, chip);
-
-	return status & NAND_STATUS_FAIL ? -EIO : 0;
-
-}
-
-/*
- * read oob for small pages
- */
-static int atmel_nand_read_oob_512(struct mtd_info *mtd,
-		struct nand_chip *chip,	int page, int sndcmd)
-{
-	if (sndcmd) {
-		chip->cmdfunc(mtd, NAND_CMD_READOOB, 0, page);
-		sndcmd = 0;
-	}
-	chip->read_buf(mtd, chip->oob_poi, mtd->oobsize);
-	return sndcmd;
-}
-
-/*
  * Calculate HW ECC
  *
  * function called after a write
@@ -235,14 +193,14 @@ static int atmel_nand_calculate(struct mtd_info *mtd,
 	/* get the first 2 ECC bytes */
 	ecc_value = ecc_readl(host->ecc, PR);
 
-	ecc_code[eccpos[0]] = ecc_value & 0xFF;
-	ecc_code[eccpos[1]] = (ecc_value >> 8) & 0xFF;
+	ecc_code[0] = ecc_value & 0xFF;
+	ecc_code[1] = (ecc_value >> 8) & 0xFF;
 
 	/* get the last 2 ECC bytes */
 	ecc_value = ecc_readl(host->ecc, NPR) & ATMEL_ECC_NPARITY;
 
-	ecc_code[eccpos[2]] = ecc_value & 0xFF;
-	ecc_code[eccpos[3]] = (ecc_value >> 8) & 0xFF;
+	ecc_code[2] = ecc_value & 0xFF;
+	ecc_code[3] = (ecc_value >> 8) & 0xFF;
 
 	return 0;
 }
@@ -476,14 +434,12 @@ static int __init atmel_nand_probe(struct platform_device *pdev)
 			res = -EIO;
 			goto err_ecc_ioremap;
 		}
-		nand_chip->ecc.mode = NAND_ECC_HW_SYNDROME;
+		nand_chip->ecc.mode = NAND_ECC_HW;
 		nand_chip->ecc.calculate = atmel_nand_calculate;
 		nand_chip->ecc.correct = atmel_nand_correct;
 		nand_chip->ecc.hwctl = atmel_nand_hwctl;
 		nand_chip->ecc.read_page = atmel_nand_read_page;
 		nand_chip->ecc.bytes = 4;
-		nand_chip->ecc.prepad = 0;
-		nand_chip->ecc.postpad = 0;
 	}
 
 	nand_chip->chip_delay = 20;		/* 20us command delay time */
@@ -514,7 +470,7 @@ static int __init atmel_nand_probe(struct platform_device *pdev)
 		goto err_scan_ident;
 	}
 
-	if (nand_chip->ecc.mode == NAND_ECC_HW_SYNDROME) {
+	if (nand_chip->ecc.mode == NAND_ECC_HW) {
 		/* ECC is calculated for the whole page (1 step) */
 		nand_chip->ecc.size = mtd->writesize;
 
@@ -522,8 +478,6 @@ static int __init atmel_nand_probe(struct platform_device *pdev)
 		switch (mtd->writesize) {
 		case 512:
 			nand_chip->ecc.layout = &atmel_oobinfo_small;
-			nand_chip->ecc.read_oob = atmel_nand_read_oob_512;
-			nand_chip->ecc.write_oob = atmel_nand_write_oob_512;
 			ecc_writel(host->ecc, MR, ATMEL_ECC_PAGESIZE_528);
 			break;
 		case 1024:
diff --git a/drivers/mtd/nand/cs553x_nand.c b/drivers/mtd/nand/cs553x_nand.c
index 3370a800fd36..9f1b451005ca 100644
--- a/drivers/mtd/nand/cs553x_nand.c
+++ b/drivers/mtd/nand/cs553x_nand.c
@@ -289,8 +289,10 @@ static int __init cs553x_init(void)
 	int i;
 	uint64_t val;
 
+#ifdef CONFIG_MTD_PARTITIONS
 	int mtd_parts_nb = 0;
 	struct mtd_partition *mtd_parts = NULL;
+#endif
 
 	/* If the CPU isn't a Geode GX or LX, abort */
 	if (!is_geode())
diff --git a/drivers/mtd/nand/fsl_elbc_nand.c b/drivers/mtd/nand/fsl_elbc_nand.c
index 98ad3cefcaf4..4aa5bd6158da 100644
--- a/drivers/mtd/nand/fsl_elbc_nand.c
+++ b/drivers/mtd/nand/fsl_elbc_nand.c
@@ -918,8 +918,7 @@ static int __devinit fsl_elbc_chip_probe(struct fsl_elbc_ctrl *ctrl,
 
 #ifdef CONFIG_MTD_OF_PARTS
 	if (ret == 0) {
-		ret = of_mtd_parse_partitions(priv->dev, &priv->mtd,
-		                              node, &parts);
+		ret = of_mtd_parse_partitions(priv->dev, node, &parts);
 		if (ret < 0)
 			goto err;
 	}
diff --git a/drivers/mtd/nand/fsl_upm.c b/drivers/mtd/nand/fsl_upm.c
index 1ebfd87f00b4..024e3fffd4bb 100644
--- a/drivers/mtd/nand/fsl_upm.c
+++ b/drivers/mtd/nand/fsl_upm.c
@@ -13,6 +13,7 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/delay.h>
 #include <linux/mtd/nand.h>
 #include <linux/mtd/nand_ecc.h>
 #include <linux/mtd/partitions.h>
@@ -36,8 +37,6 @@ struct fsl_upm_nand {
 	uint8_t upm_cmd_offset;
 	void __iomem *io_base;
 	int rnb_gpio;
-	const uint32_t *wait_pattern;
-	const uint32_t *wait_write;
 	int chip_delay;
 };
 
@@ -61,10 +60,11 @@ static void fun_wait_rnb(struct fsl_upm_nand *fun)
 	if (fun->rnb_gpio >= 0) {
 		while (--cnt && !fun_chip_ready(&fun->mtd))
 			cpu_relax();
+		if (!cnt)
+			dev_err(fun->dev, "tired waiting for RNB\n");
+	} else {
+		ndelay(100);
 	}
-
-	if (!cnt)
-		dev_err(fun->dev, "tired waiting for RNB\n");
 }
 
 static void fun_cmd_ctrl(struct mtd_info *mtd, int cmd, unsigned int ctrl)
@@ -89,8 +89,7 @@ static void fun_cmd_ctrl(struct mtd_info *mtd, int cmd, unsigned int ctrl)
 
 	fsl_upm_run_pattern(&fun->upm, fun->io_base, cmd);
 
-	if (fun->wait_pattern)
-		fun_wait_rnb(fun);
+	fun_wait_rnb(fun);
 }
 
 static uint8_t fun_read_byte(struct mtd_info *mtd)
@@ -116,14 +115,16 @@ static void fun_write_buf(struct mtd_info *mtd, const uint8_t *buf, int len)
 
 	for (i = 0; i < len; i++) {
 		out_8(fun->chip.IO_ADDR_W, buf[i]);
-		if (fun->wait_write)
-			fun_wait_rnb(fun);
+		fun_wait_rnb(fun);
 	}
 }
 
-static int __devinit fun_chip_init(struct fsl_upm_nand *fun)
+static int __devinit fun_chip_init(struct fsl_upm_nand *fun,
+				   const struct device_node *upm_np,
+				   const struct resource *io_res)
 {
 	int ret;
+	struct device_node *flash_np;
 #ifdef CONFIG_MTD_PARTITIONS
 	static const char *part_types[] = { "cmdlinepart", NULL, };
 #endif
@@ -143,18 +144,37 @@ static int __devinit fun_chip_init(struct fsl_upm_nand *fun)
 	fun->mtd.priv = &fun->chip;
 	fun->mtd.owner = THIS_MODULE;
 
+	flash_np = of_get_next_child(upm_np, NULL);
+	if (!flash_np)
+		return -ENODEV;
+
+	fun->mtd.name = kasprintf(GFP_KERNEL, "%x.%s", io_res->start,
+				  flash_np->name);
+	if (!fun->mtd.name) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
 	ret = nand_scan(&fun->mtd, 1);
 	if (ret)
-		return ret;
-
-	fun->mtd.name = fun->dev->bus_id;
+		goto err;
 
 #ifdef CONFIG_MTD_PARTITIONS
 	ret = parse_mtd_partitions(&fun->mtd, part_types, &fun->parts, 0);
+
+#ifdef CONFIG_MTD_OF_PARTS
+	if (ret == 0)
+		ret = of_mtd_parse_partitions(fun->dev, &fun->mtd,
+					      flash_np, &fun->parts);
+#endif
 	if (ret > 0)
-		return add_mtd_partitions(&fun->mtd, fun->parts, ret);
+		ret = add_mtd_partitions(&fun->mtd, fun->parts, ret);
+	else
 #endif
-	return add_mtd_device(&fun->mtd);
+		ret = add_mtd_device(&fun->mtd);
+err:
+	of_node_put(flash_np);
+	return ret;
 }
 
 static int __devinit fun_probe(struct of_device *ofdev,
@@ -211,6 +231,12 @@ static int __devinit fun_probe(struct of_device *ofdev,
 		goto err2;
 	}
 
+	prop = of_get_property(ofdev->node, "chip-delay", NULL);
+	if (prop)
+		fun->chip_delay = *prop;
+	else
+		fun->chip_delay = 50;
+
 	fun->io_base = devm_ioremap_nocache(&ofdev->dev, io_res.start,
 					  io_res.end - io_res.start + 1);
 	if (!fun->io_base) {
@@ -220,17 +246,8 @@ static int __devinit fun_probe(struct of_device *ofdev,
 
 	fun->dev = &ofdev->dev;
 	fun->last_ctrl = NAND_CLE;
-	fun->wait_pattern = of_get_property(ofdev->node, "fsl,wait-pattern",
-					    NULL);
-	fun->wait_write = of_get_property(ofdev->node, "fsl,wait-write", NULL);
-
-	prop = of_get_property(ofdev->node, "chip-delay", NULL);
-	if (prop)
-		fun->chip_delay = *prop;
-	else
-		fun->chip_delay = 50;
 
-	ret = fun_chip_init(fun);
+	ret = fun_chip_init(fun, ofdev->node, &io_res);
 	if (ret)
 		goto err2;
 
@@ -251,6 +268,7 @@ static int __devexit fun_remove(struct of_device *ofdev)
 	struct fsl_upm_nand *fun = dev_get_drvdata(&ofdev->dev);
 
 	nand_release(&fun->mtd);
+	kfree(fun->mtd.name);
 
 	if (fun->rnb_gpio >= 0)
 		gpio_free(fun->rnb_gpio);
diff --git a/drivers/mtd/nand/gpio.c b/drivers/mtd/nand/gpio.c
new file mode 100644
index 000000000000..8f902e75aa85
--- /dev/null
+++ b/drivers/mtd/nand/gpio.c
@@ -0,0 +1,375 @@
+/*
+ * drivers/mtd/nand/gpio.c
+ *
+ * Updated, and converted to generic GPIO based driver by Russell King.
+ *
+ * Written by Ben Dooks <ben@simtec.co.uk>
+ *   Based on 2.4 version by Mark Whittaker
+ *
+ * © 2004 Simtec Electronics
+ *
+ * Device driver for NAND connected via GPIO
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/gpio.h>
+#include <linux/io.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/nand.h>
+#include <linux/mtd/partitions.h>
+#include <linux/mtd/nand-gpio.h>
+
+struct gpiomtd {
+	void __iomem		*io_sync;
+	struct mtd_info		mtd_info;
+	struct nand_chip	nand_chip;
+	struct gpio_nand_platdata plat;
+};
+
+#define gpio_nand_getpriv(x) container_of(x, struct gpiomtd, mtd_info)
+
+
+#ifdef CONFIG_ARM
+/* gpio_nand_dosync()
+ *
+ * Make sure the GPIO state changes occur in-order with writes to NAND
+ * memory region.
+ * Needed on PXA due to bus-reordering within the SoC itself (see section on
+ * I/O ordering in PXA manual (section 2.3, p35)
+ */
+static void gpio_nand_dosync(struct gpiomtd *gpiomtd)
+{
+	unsigned long tmp;
+
+	if (gpiomtd->io_sync) {
+		/*
+		 * Linux memory barriers don't cater for what's required here.
+		 * What's required is what's here - a read from a separate
+		 * region with a dependency on that read.
+		 */
+		tmp = readl(gpiomtd->io_sync);
+		asm volatile("mov %1, %0\n" : "=r" (tmp) : "r" (tmp));
+	}
+}
+#else
+static inline void gpio_nand_dosync(struct gpiomtd *gpiomtd) {}
+#endif
+
+static void gpio_nand_cmd_ctrl(struct mtd_info *mtd, int cmd, unsigned int ctrl)
+{
+	struct gpiomtd *gpiomtd = gpio_nand_getpriv(mtd);
+
+	gpio_nand_dosync(gpiomtd);
+
+	if (ctrl & NAND_CTRL_CHANGE) {
+		gpio_set_value(gpiomtd->plat.gpio_nce, !(ctrl & NAND_NCE));
+		gpio_set_value(gpiomtd->plat.gpio_cle, !!(ctrl & NAND_CLE));
+		gpio_set_value(gpiomtd->plat.gpio_ale, !!(ctrl & NAND_ALE));
+		gpio_nand_dosync(gpiomtd);
+	}
+	if (cmd == NAND_CMD_NONE)
+		return;
+
+	writeb(cmd, gpiomtd->nand_chip.IO_ADDR_W);
+	gpio_nand_dosync(gpiomtd);
+}
+
+static void gpio_nand_writebuf(struct mtd_info *mtd, const u_char *buf, int len)
+{
+	struct nand_chip *this = mtd->priv;
+
+	writesb(this->IO_ADDR_W, buf, len);
+}
+
+static void gpio_nand_readbuf(struct mtd_info *mtd, u_char *buf, int len)
+{
+	struct nand_chip *this = mtd->priv;
+
+	readsb(this->IO_ADDR_R, buf, len);
+}
+
+static int gpio_nand_verifybuf(struct mtd_info *mtd, const u_char *buf, int len)
+{
+	struct nand_chip *this = mtd->priv;
+	unsigned char read, *p = (unsigned char *) buf;
+	int i, err = 0;
+
+	for (i = 0; i < len; i++) {
+		read = readb(this->IO_ADDR_R);
+		if (read != p[i]) {
+			pr_debug("%s: err at %d (read %04x vs %04x)\n",
+			       __func__, i, read, p[i]);
+			err = -EFAULT;
+		}
+	}
+	return err;
+}
+
+static void gpio_nand_writebuf16(struct mtd_info *mtd, const u_char *buf,
+				 int len)
+{
+	struct nand_chip *this = mtd->priv;
+
+	if (IS_ALIGNED((unsigned long)buf, 2)) {
+		writesw(this->IO_ADDR_W, buf, len>>1);
+	} else {
+		int i;
+		unsigned short *ptr = (unsigned short *)buf;
+
+		for (i = 0; i < len; i += 2, ptr++)
+			writew(*ptr, this->IO_ADDR_W);
+	}
+}
+
+static void gpio_nand_readbuf16(struct mtd_info *mtd, u_char *buf, int len)
+{
+	struct nand_chip *this = mtd->priv;
+
+	if (IS_ALIGNED((unsigned long)buf, 2)) {
+		readsw(this->IO_ADDR_R, buf, len>>1);
+	} else {
+		int i;
+		unsigned short *ptr = (unsigned short *)buf;
+
+		for (i = 0; i < len; i += 2, ptr++)
+			*ptr = readw(this->IO_ADDR_R);
+	}
+}
+
+static int gpio_nand_verifybuf16(struct mtd_info *mtd, const u_char *buf,
+				 int len)
+{
+	struct nand_chip *this = mtd->priv;
+	unsigned short read, *p = (unsigned short *) buf;
+	int i, err = 0;
+	len >>= 1;
+
+	for (i = 0; i < len; i++) {
+		read = readw(this->IO_ADDR_R);
+		if (read != p[i]) {
+			pr_debug("%s: err at %d (read %04x vs %04x)\n",
+			       __func__, i, read, p[i]);
+			err = -EFAULT;
+		}
+	}
+	return err;
+}
+
+
+static int gpio_nand_devready(struct mtd_info *mtd)
+{
+	struct gpiomtd *gpiomtd = gpio_nand_getpriv(mtd);
+	return gpio_get_value(gpiomtd->plat.gpio_rdy);
+}
+
+static int __devexit gpio_nand_remove(struct platform_device *dev)
+{
+	struct gpiomtd *gpiomtd = platform_get_drvdata(dev);
+	struct resource *res;
+
+	nand_release(&gpiomtd->mtd_info);
+
+	res = platform_get_resource(dev, IORESOURCE_MEM, 1);
+	iounmap(gpiomtd->io_sync);
+	if (res)
+		release_mem_region(res->start, res->end - res->start + 1);
+
+	res = platform_get_resource(dev, IORESOURCE_MEM, 0);
+	iounmap(gpiomtd->nand_chip.IO_ADDR_R);
+	release_mem_region(res->start, res->end - res->start + 1);
+
+	if (gpio_is_valid(gpiomtd->plat.gpio_nwp))
+		gpio_set_value(gpiomtd->plat.gpio_nwp, 0);
+	gpio_set_value(gpiomtd->plat.gpio_nce, 1);
+
+	gpio_free(gpiomtd->plat.gpio_cle);
+	gpio_free(gpiomtd->plat.gpio_ale);
+	gpio_free(gpiomtd->plat.gpio_nce);
+	if (gpio_is_valid(gpiomtd->plat.gpio_nwp))
+		gpio_free(gpiomtd->plat.gpio_nwp);
+	gpio_free(gpiomtd->plat.gpio_rdy);
+
+	kfree(gpiomtd);
+
+	return 0;
+}
+
+static void __iomem *request_and_remap(struct resource *res, size_t size,
+					const char *name, int *err)
+{
+	void __iomem *ptr;
+
+	if (!request_mem_region(res->start, res->end - res->start + 1, name)) {
+		*err = -EBUSY;
+		return NULL;
+	}
+
+	ptr = ioremap(res->start, size);
+	if (!ptr) {
+		release_mem_region(res->start, res->end - res->start + 1);
+		*err = -ENOMEM;
+	}
+	return ptr;
+}
+
+static int __devinit gpio_nand_probe(struct platform_device *dev)
+{
+	struct gpiomtd *gpiomtd;
+	struct nand_chip *this;
+	struct resource *res0, *res1;
+	int ret;
+
+	if (!dev->dev.platform_data)
+		return -EINVAL;
+
+	res0 = platform_get_resource(dev, IORESOURCE_MEM, 0);
+	if (!res0)
+		return -EINVAL;
+
+	gpiomtd = kzalloc(sizeof(*gpiomtd), GFP_KERNEL);
+	if (gpiomtd == NULL) {
+		dev_err(&dev->dev, "failed to create NAND MTD\n");
+		return -ENOMEM;
+	}
+
+	this = &gpiomtd->nand_chip;
+	this->IO_ADDR_R = request_and_remap(res0, 2, "NAND", &ret);
+	if (!this->IO_ADDR_R) {
+		dev_err(&dev->dev, "unable to map NAND\n");
+		goto err_map;
+	}
+
+	res1 = platform_get_resource(dev, IORESOURCE_MEM, 1);
+	if (res1) {
+		gpiomtd->io_sync = request_and_remap(res1, 4, "NAND sync", &ret);
+		if (!gpiomtd->io_sync) {
+			dev_err(&dev->dev, "unable to map sync NAND\n");
+			goto err_sync;
+		}
+	}
+
+	memcpy(&gpiomtd->plat, dev->dev.platform_data, sizeof(gpiomtd->plat));
+
+	ret = gpio_request(gpiomtd->plat.gpio_nce, "NAND NCE");
+	if (ret)
+		goto err_nce;
+	gpio_direction_output(gpiomtd->plat.gpio_nce, 1);
+	if (gpio_is_valid(gpiomtd->plat.gpio_nwp)) {
+		ret = gpio_request(gpiomtd->plat.gpio_nwp, "NAND NWP");
+		if (ret)
+			goto err_nwp;
+		gpio_direction_output(gpiomtd->plat.gpio_nwp, 1);
+	}
+	ret = gpio_request(gpiomtd->plat.gpio_ale, "NAND ALE");
+	if (ret)
+		goto err_ale;
+	gpio_direction_output(gpiomtd->plat.gpio_ale, 0);
+	ret = gpio_request(gpiomtd->plat.gpio_cle, "NAND CLE");
+	if (ret)
+		goto err_cle;
+	gpio_direction_output(gpiomtd->plat.gpio_cle, 0);
+	ret = gpio_request(gpiomtd->plat.gpio_rdy, "NAND RDY");
+	if (ret)
+		goto err_rdy;
+	gpio_direction_input(gpiomtd->plat.gpio_rdy);
+
+
+	this->IO_ADDR_W  = this->IO_ADDR_R;
+	this->ecc.mode   = NAND_ECC_SOFT;
+	this->options    = gpiomtd->plat.options;
+	this->chip_delay = gpiomtd->plat.chip_delay;
+
+	/* install our routines */
+	this->cmd_ctrl   = gpio_nand_cmd_ctrl;
+	this->dev_ready  = gpio_nand_devready;
+
+	if (this->options & NAND_BUSWIDTH_16) {
+		this->read_buf   = gpio_nand_readbuf16;
+		this->write_buf  = gpio_nand_writebuf16;
+		this->verify_buf = gpio_nand_verifybuf16;
+	} else {
+		this->read_buf   = gpio_nand_readbuf;
+		this->write_buf  = gpio_nand_writebuf;
+		this->verify_buf = gpio_nand_verifybuf;
+	}
+
+	/* set the mtd private data for the nand driver */
+	gpiomtd->mtd_info.priv = this;
+	gpiomtd->mtd_info.owner = THIS_MODULE;
+
+	if (nand_scan(&gpiomtd->mtd_info, 1)) {
+		dev_err(&dev->dev, "no nand chips found?\n");
+		ret = -ENXIO;
+		goto err_wp;
+	}
+
+	if (gpiomtd->plat.adjust_parts)
+		gpiomtd->plat.adjust_parts(&gpiomtd->plat,
+					   gpiomtd->mtd_info.size);
+
+	add_mtd_partitions(&gpiomtd->mtd_info, gpiomtd->plat.parts,
+			   gpiomtd->plat.num_parts);
+	platform_set_drvdata(dev, gpiomtd);
+
+	return 0;
+
+err_wp:
+	if (gpio_is_valid(gpiomtd->plat.gpio_nwp))
+		gpio_set_value(gpiomtd->plat.gpio_nwp, 0);
+	gpio_free(gpiomtd->plat.gpio_rdy);
+err_rdy:
+	gpio_free(gpiomtd->plat.gpio_cle);
+err_cle:
+	gpio_free(gpiomtd->plat.gpio_ale);
+err_ale:
+	if (gpio_is_valid(gpiomtd->plat.gpio_nwp))
+		gpio_free(gpiomtd->plat.gpio_nwp);
+err_nwp:
+	gpio_free(gpiomtd->plat.gpio_nce);
+err_nce:
+	iounmap(gpiomtd->io_sync);
+	if (res1)
+		release_mem_region(res1->start, res1->end - res1->start + 1);
+err_sync:
+	iounmap(gpiomtd->nand_chip.IO_ADDR_R);
+	release_mem_region(res0->start, res0->end - res0->start + 1);
+err_map:
+	kfree(gpiomtd);
+	return ret;
+}
+
+static struct platform_driver gpio_nand_driver = {
+	.probe		= gpio_nand_probe,
+	.remove		= gpio_nand_remove,
+	.driver		= {
+		.name	= "gpio-nand",
+	},
+};
+
+static int __init gpio_nand_init(void)
+{
+	printk(KERN_INFO "GPIO NAND driver, © 2004 Simtec Electronics\n");
+
+	return platform_driver_register(&gpio_nand_driver);
+}
+
+static void __exit gpio_nand_exit(void)
+{
+	platform_driver_unregister(&gpio_nand_driver);
+}
+
+module_init(gpio_nand_init);
+module_exit(gpio_nand_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ben Dooks <ben@simtec.co.uk>");
+MODULE_DESCRIPTION("GPIO NAND Driver");
diff --git a/drivers/mtd/nand/mxc_nand.c b/drivers/mtd/nand/mxc_nand.c
new file mode 100644
index 000000000000..21fd4f1c4806
--- /dev/null
+++ b/drivers/mtd/nand/mxc_nand.c
@@ -0,0 +1,1077 @@
+/*
+ * Copyright 2004-2007 Freescale Semiconductor, Inc. All Rights Reserved.
+ * Copyright 2008 Sascha Hauer, kernel@pengutronix.de
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/nand.h>
+#include <linux/mtd/partitions.h>
+#include <linux/interrupt.h>
+#include <linux/device.h>
+#include <linux/platform_device.h>
+#include <linux/clk.h>
+#include <linux/err.h>
+#include <linux/io.h>
+
+#include <asm/mach/flash.h>
+#include <mach/mxc_nand.h>
+
+#define DRIVER_NAME "mxc_nand"
+
+/* Addresses for NFC registers */
+#define NFC_BUF_SIZE		0xE00
+#define NFC_BUF_ADDR		0xE04
+#define NFC_FLASH_ADDR		0xE06
+#define NFC_FLASH_CMD		0xE08
+#define NFC_CONFIG		0xE0A
+#define NFC_ECC_STATUS_RESULT	0xE0C
+#define NFC_RSLTMAIN_AREA	0xE0E
+#define NFC_RSLTSPARE_AREA	0xE10
+#define NFC_WRPROT		0xE12
+#define NFC_UNLOCKSTART_BLKADDR	0xE14
+#define NFC_UNLOCKEND_BLKADDR	0xE16
+#define NFC_NF_WRPRST		0xE18
+#define NFC_CONFIG1		0xE1A
+#define NFC_CONFIG2		0xE1C
+
+/* Addresses for NFC RAM BUFFER Main area 0 */
+#define MAIN_AREA0		0x000
+#define MAIN_AREA1		0x200
+#define MAIN_AREA2		0x400
+#define MAIN_AREA3		0x600
+
+/* Addresses for NFC SPARE BUFFER Spare area 0 */
+#define SPARE_AREA0		0x800
+#define SPARE_AREA1		0x810
+#define SPARE_AREA2		0x820
+#define SPARE_AREA3		0x830
+
+/* Set INT to 0, FCMD to 1, rest to 0 in NFC_CONFIG2 Register
+ * for Command operation */
+#define NFC_CMD            0x1
+
+/* Set INT to 0, FADD to 1, rest to 0 in NFC_CONFIG2 Register
+ * for Address operation */
+#define NFC_ADDR           0x2
+
+/* Set INT to 0, FDI to 1, rest to 0 in NFC_CONFIG2 Register
+ * for Input operation */
+#define NFC_INPUT          0x4
+
+/* Set INT to 0, FDO to 001, rest to 0 in NFC_CONFIG2 Register
+ * for Data Output operation */
+#define NFC_OUTPUT         0x8
+
+/* Set INT to 0, FD0 to 010, rest to 0 in NFC_CONFIG2 Register
+ * for Read ID operation */
+#define NFC_ID             0x10
+
+/* Set INT to 0, FDO to 100, rest to 0 in NFC_CONFIG2 Register
+ * for Read Status operation */
+#define NFC_STATUS         0x20
+
+/* Set INT to 1, rest to 0 in NFC_CONFIG2 Register for Read
+ * Status operation */
+#define NFC_INT            0x8000
+
+#define NFC_SP_EN           (1 << 2)
+#define NFC_ECC_EN          (1 << 3)
+#define NFC_INT_MSK         (1 << 4)
+#define NFC_BIG             (1 << 5)
+#define NFC_RST             (1 << 6)
+#define NFC_CE              (1 << 7)
+#define NFC_ONE_CYCLE       (1 << 8)
+
+struct mxc_nand_host {
+	struct mtd_info		mtd;
+	struct nand_chip	nand;
+	struct mtd_partition	*parts;
+	struct device		*dev;
+
+	void __iomem		*regs;
+	int			spare_only;
+	int			status_request;
+	int			pagesize_2k;
+	uint16_t		col_addr;
+	struct clk		*clk;
+	int			clk_act;
+	int			irq;
+
+	wait_queue_head_t	irq_waitq;
+};
+
+/* Define delays in microsec for NAND device operations */
+#define TROP_US_DELAY   2000
+/* Macros to get byte and bit positions of ECC */
+#define COLPOS(x)  ((x) >> 3)
+#define BITPOS(x) ((x) & 0xf)
+
+/* Define single bit Error positions in Main & Spare area */
+#define MAIN_SINGLEBIT_ERROR 0x4
+#define SPARE_SINGLEBIT_ERROR 0x1
+
+/* OOB placement block for use with hardware ecc generation */
+static struct nand_ecclayout nand_hw_eccoob_8 = {
+	.eccbytes = 5,
+	.eccpos = {6, 7, 8, 9, 10},
+	.oobfree = {{0, 5}, {11, 5}, }
+};
+
+static struct nand_ecclayout nand_hw_eccoob_16 = {
+	.eccbytes = 5,
+	.eccpos = {6, 7, 8, 9, 10},
+	.oobfree = {{0, 6}, {12, 4}, }
+};
+
+#ifdef CONFIG_MTD_PARTITIONS
+static const char *part_probes[] = { "RedBoot", "cmdlinepart", NULL };
+#endif
+
+static irqreturn_t mxc_nfc_irq(int irq, void *dev_id)
+{
+	struct mxc_nand_host *host = dev_id;
+
+	uint16_t tmp;
+
+	tmp = readw(host->regs + NFC_CONFIG1);
+	tmp |= NFC_INT_MSK; /* Disable interrupt */
+	writew(tmp, host->regs + NFC_CONFIG1);
+
+	wake_up(&host->irq_waitq);
+
+	return IRQ_HANDLED;
+}
+
+/* This function polls the NANDFC to wait for the basic operation to
+ * complete by checking the INT bit of config2 register.
+ */
+static void wait_op_done(struct mxc_nand_host *host, int max_retries,
+				uint16_t param, int useirq)
+{
+	uint32_t tmp;
+
+	if (useirq) {
+		if ((readw(host->regs + NFC_CONFIG2) & NFC_INT) == 0) {
+
+			tmp = readw(host->regs + NFC_CONFIG1);
+			tmp  &= ~NFC_INT_MSK;	/* Enable interrupt */
+			writew(tmp, host->regs + NFC_CONFIG1);
+
+			wait_event(host->irq_waitq,
+				readw(host->regs + NFC_CONFIG2) & NFC_INT);
+
+			tmp = readw(host->regs + NFC_CONFIG2);
+			tmp  &= ~NFC_INT;
+			writew(tmp, host->regs + NFC_CONFIG2);
+		}
+	} else {
+		while (max_retries-- > 0) {
+			if (readw(host->regs + NFC_CONFIG2) & NFC_INT) {
+				tmp = readw(host->regs + NFC_CONFIG2);
+				tmp  &= ~NFC_INT;
+				writew(tmp, host->regs + NFC_CONFIG2);
+				break;
+			}
+			udelay(1);
+		}
+		if (max_retries <= 0)
+			DEBUG(MTD_DEBUG_LEVEL0, "%s(%d): INT not set\n",
+			      __func__, param);
+	}
+}
+
+/* This function issues the specified command to the NAND device and
+ * waits for completion. */
+static void send_cmd(struct mxc_nand_host *host, uint16_t cmd, int useirq)
+{
+	DEBUG(MTD_DEBUG_LEVEL3, "send_cmd(host, 0x%x, %d)\n", cmd, useirq);
+
+	writew(cmd, host->regs + NFC_FLASH_CMD);
+	writew(NFC_CMD, host->regs + NFC_CONFIG2);
+
+	/* Wait for operation to complete */
+	wait_op_done(host, TROP_US_DELAY, cmd, useirq);
+}
+
+/* This function sends an address (or partial address) to the
+ * NAND device. The address is used to select the source/destination for
+ * a NAND command. */
+static void send_addr(struct mxc_nand_host *host, uint16_t addr, int islast)
+{
+	DEBUG(MTD_DEBUG_LEVEL3, "send_addr(host, 0x%x %d)\n", addr, islast);
+
+	writew(addr, host->regs + NFC_FLASH_ADDR);
+	writew(NFC_ADDR, host->regs + NFC_CONFIG2);
+
+	/* Wait for operation to complete */
+	wait_op_done(host, TROP_US_DELAY, addr, islast);
+}
+
+/* This function requests the NANDFC to initate the transfer
+ * of data currently in the NANDFC RAM buffer to the NAND device. */
+static void send_prog_page(struct mxc_nand_host *host, uint8_t buf_id,
+			int spare_only)
+{
+	DEBUG(MTD_DEBUG_LEVEL3, "send_prog_page (%d)\n", spare_only);
+
+	/* NANDFC buffer 0 is used for page read/write */
+	writew(buf_id, host->regs + NFC_BUF_ADDR);
+
+	/* Configure spare or page+spare access */
+	if (!host->pagesize_2k) {
+		uint16_t config1 = readw(host->regs + NFC_CONFIG1);
+		if (spare_only)
+			config1 |= NFC_SP_EN;
+		else
+			config1 &= ~(NFC_SP_EN);
+		writew(config1, host->regs + NFC_CONFIG1);
+	}
+
+	writew(NFC_INPUT, host->regs + NFC_CONFIG2);
+
+	/* Wait for operation to complete */
+	wait_op_done(host, TROP_US_DELAY, spare_only, true);
+}
+
+/* Requests NANDFC to initated the transfer of data from the
+ * NAND device into in the NANDFC ram buffer. */
+static void send_read_page(struct mxc_nand_host *host, uint8_t buf_id,
+		int spare_only)
+{
+	DEBUG(MTD_DEBUG_LEVEL3, "send_read_page (%d)\n", spare_only);
+
+	/* NANDFC buffer 0 is used for page read/write */
+	writew(buf_id, host->regs + NFC_BUF_ADDR);
+
+	/* Configure spare or page+spare access */
+	if (!host->pagesize_2k) {
+		uint32_t config1 = readw(host->regs + NFC_CONFIG1);
+		if (spare_only)
+			config1 |= NFC_SP_EN;
+		else
+			config1 &= ~NFC_SP_EN;
+		writew(config1, host->regs + NFC_CONFIG1);
+	}
+
+	writew(NFC_OUTPUT, host->regs + NFC_CONFIG2);
+
+	/* Wait for operation to complete */
+	wait_op_done(host, TROP_US_DELAY, spare_only, true);
+}
+
+/* Request the NANDFC to perform a read of the NAND device ID. */
+static void send_read_id(struct mxc_nand_host *host)
+{
+	struct nand_chip *this = &host->nand;
+	uint16_t tmp;
+
+	/* NANDFC buffer 0 is used for device ID output */
+	writew(0x0, host->regs + NFC_BUF_ADDR);
+
+	/* Read ID into main buffer */
+	tmp = readw(host->regs + NFC_CONFIG1);
+	tmp &= ~NFC_SP_EN;
+	writew(tmp, host->regs + NFC_CONFIG1);
+
+	writew(NFC_ID, host->regs + NFC_CONFIG2);
+
+	/* Wait for operation to complete */
+	wait_op_done(host, TROP_US_DELAY, 0, true);
+
+	if (this->options & NAND_BUSWIDTH_16) {
+		void __iomem *main_buf = host->regs + MAIN_AREA0;
+		/* compress the ID info */
+		writeb(readb(main_buf + 2), main_buf + 1);
+		writeb(readb(main_buf + 4), main_buf + 2);
+		writeb(readb(main_buf + 6), main_buf + 3);
+		writeb(readb(main_buf + 8), main_buf + 4);
+		writeb(readb(main_buf + 10), main_buf + 5);
+	}
+}
+
+/* This function requests the NANDFC to perform a read of the
+ * NAND device status and returns the current status. */
+static uint16_t get_dev_status(struct mxc_nand_host *host)
+{
+	void __iomem *main_buf = host->regs + MAIN_AREA1;
+	uint32_t store;
+	uint16_t ret, tmp;
+	/* Issue status request to NAND device */
+
+	/* store the main area1 first word, later do recovery */
+	store = readl(main_buf);
+	/* NANDFC buffer 1 is used for device status to prevent
+	 * corruption of read/write buffer on status requests. */
+	writew(1, host->regs + NFC_BUF_ADDR);
+
+	/* Read status into main buffer */
+	tmp = readw(host->regs + NFC_CONFIG1);
+	tmp &= ~NFC_SP_EN;
+	writew(tmp, host->regs + NFC_CONFIG1);
+
+	writew(NFC_STATUS, host->regs + NFC_CONFIG2);
+
+	/* Wait for operation to complete */
+	wait_op_done(host, TROP_US_DELAY, 0, true);
+
+	/* Status is placed in first word of main buffer */
+	/* get status, then recovery area 1 data */
+	ret = readw(main_buf);
+	writel(store, main_buf);
+
+	return ret;
+}
+
+/* This functions is used by upper layer to checks if device is ready */
+static int mxc_nand_dev_ready(struct mtd_info *mtd)
+{
+	/*
+	 * NFC handles R/B internally. Therefore, this function
+	 * always returns status as ready.
+	 */
+	return 1;
+}
+
+static void mxc_nand_enable_hwecc(struct mtd_info *mtd, int mode)
+{
+	/*
+	 * If HW ECC is enabled, we turn it on during init. There is
+	 * no need to enable again here.
+	 */
+}
+
+static int mxc_nand_correct_data(struct mtd_info *mtd, u_char *dat,
+				 u_char *read_ecc, u_char *calc_ecc)
+{
+	struct nand_chip *nand_chip = mtd->priv;
+	struct mxc_nand_host *host = nand_chip->priv;
+
+	/*
+	 * 1-Bit errors are automatically corrected in HW.  No need for
+	 * additional correction.  2-Bit errors cannot be corrected by
+	 * HW ECC, so we need to return failure
+	 */
+	uint16_t ecc_status = readw(host->regs + NFC_ECC_STATUS_RESULT);
+
+	if (((ecc_status & 0x3) == 2) || ((ecc_status >> 2) == 2)) {
+		DEBUG(MTD_DEBUG_LEVEL0,
+		      "MXC_NAND: HWECC uncorrectable 2-bit ECC error\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+static int mxc_nand_calculate_ecc(struct mtd_info *mtd, const u_char *dat,
+				  u_char *ecc_code)
+{
+	return 0;
+}
+
+static u_char mxc_nand_read_byte(struct mtd_info *mtd)
+{
+	struct nand_chip *nand_chip = mtd->priv;
+	struct mxc_nand_host *host = nand_chip->priv;
+	uint8_t ret = 0;
+	uint16_t col, rd_word;
+	uint16_t __iomem *main_buf = host->regs + MAIN_AREA0;
+	uint16_t __iomem *spare_buf = host->regs + SPARE_AREA0;
+
+	/* Check for status request */
+	if (host->status_request)
+		return get_dev_status(host) & 0xFF;
+
+	/* Get column for 16-bit access */
+	col = host->col_addr >> 1;
+
+	/* If we are accessing the spare region */
+	if (host->spare_only)
+		rd_word = readw(&spare_buf[col]);
+	else
+		rd_word = readw(&main_buf[col]);
+
+	/* Pick upper/lower byte of word from RAM buffer */
+	if (host->col_addr & 0x1)
+		ret = (rd_word >> 8) & 0xFF;
+	else
+		ret = rd_word & 0xFF;
+
+	/* Update saved column address */
+	host->col_addr++;
+
+	return ret;
+}
+
+static uint16_t mxc_nand_read_word(struct mtd_info *mtd)
+{
+	struct nand_chip *nand_chip = mtd->priv;
+	struct mxc_nand_host *host = nand_chip->priv;
+	uint16_t col, rd_word, ret;
+	uint16_t __iomem *p;
+
+	DEBUG(MTD_DEBUG_LEVEL3,
+	      "mxc_nand_read_word(col = %d)\n", host->col_addr);
+
+	col = host->col_addr;
+	/* Adjust saved column address */
+	if (col < mtd->writesize && host->spare_only)
+		col += mtd->writesize;
+
+	if (col < mtd->writesize)
+		p = (host->regs + MAIN_AREA0) + (col >> 1);
+	else
+		p = (host->regs + SPARE_AREA0) + ((col - mtd->writesize) >> 1);
+
+	if (col & 1) {
+		rd_word = readw(p);
+		ret = (rd_word >> 8) & 0xff;
+		rd_word = readw(&p[1]);
+		ret |= (rd_word << 8) & 0xff00;
+
+	} else
+		ret = readw(p);
+
+	/* Update saved column address */
+	host->col_addr = col + 2;
+
+	return ret;
+}
+
+/* Write data of length len to buffer buf. The data to be
+ * written on NAND Flash is first copied to RAMbuffer. After the Data Input
+ * Operation by the NFC, the data is written to NAND Flash */
+static void mxc_nand_write_buf(struct mtd_info *mtd,
+				const u_char *buf, int len)
+{
+	struct nand_chip *nand_chip = mtd->priv;
+	struct mxc_nand_host *host = nand_chip->priv;
+	int n, col, i = 0;
+
+	DEBUG(MTD_DEBUG_LEVEL3,
+	      "mxc_nand_write_buf(col = %d, len = %d)\n", host->col_addr,
+	      len);
+
+	col = host->col_addr;
+
+	/* Adjust saved column address */
+	if (col < mtd->writesize && host->spare_only)
+		col += mtd->writesize;
+
+	n = mtd->writesize + mtd->oobsize - col;
+	n = min(len, n);
+
+	DEBUG(MTD_DEBUG_LEVEL3,
+	      "%s:%d: col = %d, n = %d\n", __func__, __LINE__, col, n);
+
+	while (n) {
+		void __iomem *p;
+
+		if (col < mtd->writesize)
+			p = host->regs + MAIN_AREA0 + (col & ~3);
+		else
+			p = host->regs + SPARE_AREA0 -
+						mtd->writesize + (col & ~3);
+
+		DEBUG(MTD_DEBUG_LEVEL3, "%s:%d: p = %p\n", __func__,
+		      __LINE__, p);
+
+		if (((col | (int)&buf[i]) & 3) || n < 16) {
+			uint32_t data = 0;
+
+			if (col & 3 || n < 4)
+				data = readl(p);
+
+			switch (col & 3) {
+			case 0:
+				if (n) {
+					data = (data & 0xffffff00) |
+					    (buf[i++] << 0);
+					n--;
+					col++;
+				}
+			case 1:
+				if (n) {
+					data = (data & 0xffff00ff) |
+					    (buf[i++] << 8);
+					n--;
+					col++;
+				}
+			case 2:
+				if (n) {
+					data = (data & 0xff00ffff) |
+					    (buf[i++] << 16);
+					n--;
+					col++;
+				}
+			case 3:
+				if (n) {
+					data = (data & 0x00ffffff) |
+					    (buf[i++] << 24);
+					n--;
+					col++;
+				}
+			}
+
+			writel(data, p);
+		} else {
+			int m = mtd->writesize - col;
+
+			if (col >= mtd->writesize)
+				m += mtd->oobsize;
+
+			m = min(n, m) & ~3;
+
+			DEBUG(MTD_DEBUG_LEVEL3,
+			      "%s:%d: n = %d, m = %d, i = %d, col = %d\n",
+			      __func__,  __LINE__, n, m, i, col);
+
+			memcpy(p, &buf[i], m);
+			col += m;
+			i += m;
+			n -= m;
+		}
+	}
+	/* Update saved column address */
+	host->col_addr = col;
+}
+
+/* Read the data buffer from the NAND Flash. To read the data from NAND
+ * Flash first the data output cycle is initiated by the NFC, which copies
+ * the data to RAMbuffer. This data of length len is then copied to buffer buf.
+ */
+static void mxc_nand_read_buf(struct mtd_info *mtd, u_char *buf, int len)
+{
+	struct nand_chip *nand_chip = mtd->priv;
+	struct mxc_nand_host *host = nand_chip->priv;
+	int n, col, i = 0;
+
+	DEBUG(MTD_DEBUG_LEVEL3,
+	      "mxc_nand_read_buf(col = %d, len = %d)\n", host->col_addr, len);
+
+	col = host->col_addr;
+
+	/* Adjust saved column address */
+	if (col < mtd->writesize && host->spare_only)
+		col += mtd->writesize;
+
+	n = mtd->writesize + mtd->oobsize - col;
+	n = min(len, n);
+
+	while (n) {
+		void __iomem *p;
+
+		if (col < mtd->writesize)
+			p = host->regs + MAIN_AREA0 + (col & ~3);
+		else
+			p = host->regs + SPARE_AREA0 -
+					mtd->writesize + (col & ~3);
+
+		if (((col | (int)&buf[i]) & 3) || n < 16) {
+			uint32_t data;
+
+			data = readl(p);
+			switch (col & 3) {
+			case 0:
+				if (n) {
+					buf[i++] = (uint8_t) (data);
+					n--;
+					col++;
+				}
+			case 1:
+				if (n) {
+					buf[i++] = (uint8_t) (data >> 8);
+					n--;
+					col++;
+				}
+			case 2:
+				if (n) {
+					buf[i++] = (uint8_t) (data >> 16);
+					n--;
+					col++;
+				}
+			case 3:
+				if (n) {
+					buf[i++] = (uint8_t) (data >> 24);
+					n--;
+					col++;
+				}
+			}
+		} else {
+			int m = mtd->writesize - col;
+
+			if (col >= mtd->writesize)
+				m += mtd->oobsize;
+
+			m = min(n, m) & ~3;
+			memcpy(&buf[i], p, m);
+			col += m;
+			i += m;
+			n -= m;
+		}
+	}
+	/* Update saved column address */
+	host->col_addr = col;
+
+}
+
+/* Used by the upper layer to verify the data in NAND Flash
+ * with the data in the buf. */
+static int mxc_nand_verify_buf(struct mtd_info *mtd,
+				const u_char *buf, int len)
+{
+	return -EFAULT;
+}
+
+/* This function is used by upper layer for select and
+ * deselect of the NAND chip */
+static void mxc_nand_select_chip(struct mtd_info *mtd, int chip)
+{
+	struct nand_chip *nand_chip = mtd->priv;
+	struct mxc_nand_host *host = nand_chip->priv;
+
+#ifdef CONFIG_MTD_NAND_MXC_FORCE_CE
+	if (chip > 0) {
+		DEBUG(MTD_DEBUG_LEVEL0,
+		      "ERROR:  Illegal chip select (chip = %d)\n", chip);
+		return;
+	}
+
+	if (chip == -1) {
+		writew(readw(host->regs + NFC_CONFIG1) & ~NFC_CE,
+				host->regs + NFC_CONFIG1);
+		return;
+	}
+
+	writew(readw(host->regs + NFC_CONFIG1) | NFC_CE,
+			host->regs + NFC_CONFIG1);
+#endif
+
+	switch (chip) {
+	case -1:
+		/* Disable the NFC clock */
+		if (host->clk_act) {
+			clk_disable(host->clk);
+			host->clk_act = 0;
+		}
+		break;
+	case 0:
+		/* Enable the NFC clock */
+		if (!host->clk_act) {
+			clk_enable(host->clk);
+			host->clk_act = 1;
+		}
+		break;
+
+	default:
+		break;
+	}
+}
+
+/* Used by the upper layer to write command to NAND Flash for
+ * different operations to be carried out on NAND Flash */
+static void mxc_nand_command(struct mtd_info *mtd, unsigned command,
+				int column, int page_addr)
+{
+	struct nand_chip *nand_chip = mtd->priv;
+	struct mxc_nand_host *host = nand_chip->priv;
+	int useirq = true;
+
+	DEBUG(MTD_DEBUG_LEVEL3,
+	      "mxc_nand_command (cmd = 0x%x, col = 0x%x, page = 0x%x)\n",
+	      command, column, page_addr);
+
+	/* Reset command state information */
+	host->status_request = false;
+
+	/* Command pre-processing step */
+	switch (command) {
+
+	case NAND_CMD_STATUS:
+		host->col_addr = 0;
+		host->status_request = true;
+		break;
+
+	case NAND_CMD_READ0:
+		host->col_addr = column;
+		host->spare_only = false;
+		useirq = false;
+		break;
+
+	case NAND_CMD_READOOB:
+		host->col_addr = column;
+		host->spare_only = true;
+		useirq = false;
+		if (host->pagesize_2k)
+			command = NAND_CMD_READ0; /* only READ0 is valid */
+		break;
+
+	case NAND_CMD_SEQIN:
+		if (column >= mtd->writesize) {
+			/*
+			 * FIXME: before send SEQIN command for write OOB,
+			 * We must read one page out.
+			 * For K9F1GXX has no READ1 command to set current HW
+			 * pointer to spare area, we must write the whole page
+			 * including OOB together.
+			 */
+			if (host->pagesize_2k)
+				/* call ourself to read a page */
+				mxc_nand_command(mtd, NAND_CMD_READ0, 0,
+						page_addr);
+
+			host->col_addr = column - mtd->writesize;
+			host->spare_only = true;
+
+			/* Set program pointer to spare region */
+			if (!host->pagesize_2k)
+				send_cmd(host, NAND_CMD_READOOB, false);
+		} else {
+			host->spare_only = false;
+			host->col_addr = column;
+
+			/* Set program pointer to page start */
+			if (!host->pagesize_2k)
+				send_cmd(host, NAND_CMD_READ0, false);
+		}
+		useirq = false;
+		break;
+
+	case NAND_CMD_PAGEPROG:
+		send_prog_page(host, 0, host->spare_only);
+
+		if (host->pagesize_2k) {
+			/* data in 4 areas datas */
+			send_prog_page(host, 1, host->spare_only);
+			send_prog_page(host, 2, host->spare_only);
+			send_prog_page(host, 3, host->spare_only);
+		}
+
+		break;
+
+	case NAND_CMD_ERASE1:
+		useirq = false;
+		break;
+	}
+
+	/* Write out the command to the device. */
+	send_cmd(host, command, useirq);
+
+	/* Write out column address, if necessary */
+	if (column != -1) {
+		/*
+		 * MXC NANDFC can only perform full page+spare or
+		 * spare-only read/write.  When the upper layers
+		 * layers perform a read/write buf operation,
+		 * we will used the saved column adress to index into
+		 * the full page.
+		 */
+		send_addr(host, 0, page_addr == -1);
+		if (host->pagesize_2k)
+			/* another col addr cycle for 2k page */
+			send_addr(host, 0, false);
+	}
+
+	/* Write out page address, if necessary */
+	if (page_addr != -1) {
+		/* paddr_0 - p_addr_7 */
+		send_addr(host, (page_addr & 0xff), false);
+
+		if (host->pagesize_2k) {
+			send_addr(host, (page_addr >> 8) & 0xFF, false);
+			if (mtd->size >= 0x40000000)
+				send_addr(host, (page_addr >> 16) & 0xff, true);
+		} else {
+			/* One more address cycle for higher density devices */
+			if (mtd->size >= 0x4000000) {
+				/* paddr_8 - paddr_15 */
+				send_addr(host, (page_addr >> 8) & 0xff, false);
+				send_addr(host, (page_addr >> 16) & 0xff, true);
+			} else
+				/* paddr_8 - paddr_15 */
+				send_addr(host, (page_addr >> 8) & 0xff, true);
+		}
+	}
+
+	/* Command post-processing step */
+	switch (command) {
+
+	case NAND_CMD_RESET:
+		break;
+
+	case NAND_CMD_READOOB:
+	case NAND_CMD_READ0:
+		if (host->pagesize_2k) {
+			/* send read confirm command */
+			send_cmd(host, NAND_CMD_READSTART, true);
+			/* read for each AREA */
+			send_read_page(host, 0, host->spare_only);
+			send_read_page(host, 1, host->spare_only);
+			send_read_page(host, 2, host->spare_only);
+			send_read_page(host, 3, host->spare_only);
+		} else
+			send_read_page(host, 0, host->spare_only);
+		break;
+
+	case NAND_CMD_READID:
+		send_read_id(host);
+		break;
+
+	case NAND_CMD_PAGEPROG:
+		break;
+
+	case NAND_CMD_STATUS:
+		break;
+
+	case NAND_CMD_ERASE2:
+		break;
+	}
+}
+
+static int __init mxcnd_probe(struct platform_device *pdev)
+{
+	struct nand_chip *this;
+	struct mtd_info *mtd;
+	struct mxc_nand_platform_data *pdata = pdev->dev.platform_data;
+	struct mxc_nand_host *host;
+	struct resource *res;
+	uint16_t tmp;
+	int err = 0, nr_parts = 0;
+
+	/* Allocate memory for MTD device structure and private data */
+	host = kzalloc(sizeof(struct mxc_nand_host), GFP_KERNEL);
+	if (!host)
+		return -ENOMEM;
+
+	host->dev = &pdev->dev;
+	/* structures must be linked */
+	this = &host->nand;
+	mtd = &host->mtd;
+	mtd->priv = this;
+	mtd->owner = THIS_MODULE;
+
+	/* 50 us command delay time */
+	this->chip_delay = 5;
+
+	this->priv = host;
+	this->dev_ready = mxc_nand_dev_ready;
+	this->cmdfunc = mxc_nand_command;
+	this->select_chip = mxc_nand_select_chip;
+	this->read_byte = mxc_nand_read_byte;
+	this->read_word = mxc_nand_read_word;
+	this->write_buf = mxc_nand_write_buf;
+	this->read_buf = mxc_nand_read_buf;
+	this->verify_buf = mxc_nand_verify_buf;
+
+	host->clk = clk_get(&pdev->dev, "nfc_clk");
+	if (IS_ERR(host->clk))
+		goto eclk;
+
+	clk_enable(host->clk);
+	host->clk_act = 1;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		err = -ENODEV;
+		goto eres;
+	}
+
+	host->regs = ioremap(res->start, res->end - res->start + 1);
+	if (!host->regs) {
+		err = -EIO;
+		goto eres;
+	}
+
+	tmp = readw(host->regs + NFC_CONFIG1);
+	tmp |= NFC_INT_MSK;
+	writew(tmp, host->regs + NFC_CONFIG1);
+
+	init_waitqueue_head(&host->irq_waitq);
+
+	host->irq = platform_get_irq(pdev, 0);
+
+	err = request_irq(host->irq, mxc_nfc_irq, 0, "mxc_nd", host);
+	if (err)
+		goto eirq;
+
+	if (pdata->hw_ecc) {
+		this->ecc.calculate = mxc_nand_calculate_ecc;
+		this->ecc.hwctl = mxc_nand_enable_hwecc;
+		this->ecc.correct = mxc_nand_correct_data;
+		this->ecc.mode = NAND_ECC_HW;
+		this->ecc.size = 512;
+		this->ecc.bytes = 3;
+		this->ecc.layout = &nand_hw_eccoob_8;
+		tmp = readw(host->regs + NFC_CONFIG1);
+		tmp |= NFC_ECC_EN;
+		writew(tmp, host->regs + NFC_CONFIG1);
+	} else {
+		this->ecc.size = 512;
+		this->ecc.bytes = 3;
+		this->ecc.layout = &nand_hw_eccoob_8;
+		this->ecc.mode = NAND_ECC_SOFT;
+		tmp = readw(host->regs + NFC_CONFIG1);
+		tmp &= ~NFC_ECC_EN;
+		writew(tmp, host->regs + NFC_CONFIG1);
+	}
+
+	/* Reset NAND */
+	this->cmdfunc(mtd, NAND_CMD_RESET, -1, -1);
+
+	/* preset operation */
+	/* Unlock the internal RAM Buffer */
+	writew(0x2, host->regs + NFC_CONFIG);
+
+	/* Blocks to be unlocked */
+	writew(0x0, host->regs + NFC_UNLOCKSTART_BLKADDR);
+	writew(0x4000, host->regs + NFC_UNLOCKEND_BLKADDR);
+
+	/* Unlock Block Command for given address range */
+	writew(0x4, host->regs + NFC_WRPROT);
+
+	/* NAND bus width determines access funtions used by upper layer */
+	if (pdata->width == 2) {
+		this->options |= NAND_BUSWIDTH_16;
+		this->ecc.layout = &nand_hw_eccoob_16;
+	}
+
+	host->pagesize_2k = 0;
+
+	/* Scan to find existence of the device */
+	if (nand_scan(mtd, 1)) {
+		DEBUG(MTD_DEBUG_LEVEL0,
+		      "MXC_ND: Unable to find any NAND device.\n");
+		err = -ENXIO;
+		goto escan;
+	}
+
+	/* Register the partitions */
+#ifdef CONFIG_MTD_PARTITIONS
+	nr_parts =
+	    parse_mtd_partitions(mtd, part_probes, &host->parts, 0);
+	if (nr_parts > 0)
+		add_mtd_partitions(mtd, host->parts, nr_parts);
+	else
+#endif
+	{
+		pr_info("Registering %s as whole device\n", mtd->name);
+		add_mtd_device(mtd);
+	}
+
+	platform_set_drvdata(pdev, host);
+
+	return 0;
+
+escan:
+	free_irq(host->irq, NULL);
+eirq:
+	iounmap(host->regs);
+eres:
+	clk_put(host->clk);
+eclk:
+	kfree(host);
+
+	return err;
+}
+
+static int __devexit mxcnd_remove(struct platform_device *pdev)
+{
+	struct mxc_nand_host *host = platform_get_drvdata(pdev);
+
+	clk_put(host->clk);
+
+	platform_set_drvdata(pdev, NULL);
+
+	nand_release(&host->mtd);
+	free_irq(host->irq, NULL);
+	iounmap(host->regs);
+	kfree(host);
+
+	return 0;
+}
+
+#ifdef CONFIG_PM
+static int mxcnd_suspend(struct platform_device *pdev, pm_message_t state)
+{
+	struct mtd_info *info = platform_get_drvdata(pdev);
+	int ret = 0;
+
+	DEBUG(MTD_DEBUG_LEVEL0, "MXC_ND : NAND suspend\n");
+	if (info)
+		ret = info->suspend(info);
+
+	/* Disable the NFC clock */
+	clk_disable(nfc_clk);	/* FIXME */
+
+	return ret;
+}
+
+static int mxcnd_resume(struct platform_device *pdev)
+{
+	struct mtd_info *info = platform_get_drvdata(pdev);
+	int ret = 0;
+
+	DEBUG(MTD_DEBUG_LEVEL0, "MXC_ND : NAND resume\n");
+	/* Enable the NFC clock */
+	clk_enable(nfc_clk);	/* FIXME */
+
+	if (info)
+		info->resume(info);
+
+	return ret;
+}
+
+#else
+# define mxcnd_suspend   NULL
+# define mxcnd_resume    NULL
+#endif				/* CONFIG_PM */
+
+static struct platform_driver mxcnd_driver = {
+	.driver = {
+		   .name = DRIVER_NAME,
+		   },
+	.remove = __exit_p(mxcnd_remove),
+	.suspend = mxcnd_suspend,
+	.resume = mxcnd_resume,
+};
+
+static int __init mxc_nd_init(void)
+{
+	/* Register the device driver structure. */
+	pr_info("MXC MTD nand Driver\n");
+	if (platform_driver_probe(&mxcnd_driver, mxcnd_probe) != 0) {
+		printk(KERN_ERR "Driver register failed for mxcnd_driver\n");
+		return -ENODEV;
+	}
+	return 0;
+}
+
+static void __exit mxc_nd_cleanup(void)
+{
+	/* Unregister the device structure */
+	platform_driver_unregister(&mxcnd_driver);
+}
+
+module_init(mxc_nd_init);
+module_exit(mxc_nd_cleanup);
+
+MODULE_AUTHOR("Freescale Semiconductor, Inc.");
+MODULE_DESCRIPTION("MXC NAND MTD driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index d1129bae6c27..0a9c9cd33f96 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -801,9 +801,9 @@ static int nand_read_page_swecc(struct mtd_info *mtd, struct nand_chip *chip,
  * nand_read_subpage - [REPLACABLE] software ecc based sub-page read function
  * @mtd:	mtd info structure
  * @chip:	nand chip info structure
- * @dataofs	offset of requested data within the page
- * @readlen	data length
- * @buf:	buffer to store read data
+ * @data_offs:	offset of requested data within the page
+ * @readlen:	data length
+ * @bufpoi:	buffer to store read data
  */
 static int nand_read_subpage(struct mtd_info *mtd, struct nand_chip *chip, uint32_t data_offs, uint32_t readlen, uint8_t *bufpoi)
 {
@@ -2042,7 +2042,7 @@ int nand_erase_nand(struct mtd_info *mtd, struct erase_info *instr,
 		return -EINVAL;
 	}
 
-	instr->fail_addr = 0xffffffff;
+	instr->fail_addr = MTD_FAIL_ADDR_UNKNOWN;
 
 	/* Grab the lock and see if the device is available */
 	nand_get_device(chip, mtd, FL_ERASING);
@@ -2318,6 +2318,12 @@ static struct nand_flash_dev *nand_get_flash_type(struct mtd_info *mtd,
 	/* Select the device */
 	chip->select_chip(mtd, 0);
 
+	/*
+	 * Reset the chip, required by some chips (e.g. Micron MT29FxGxxxxx)
+	 * after power-up
+	 */
+	chip->cmdfunc(mtd, NAND_CMD_RESET, -1, -1);
+
 	/* Send the command for reading device ID */
 	chip->cmdfunc(mtd, NAND_CMD_READID, 0x00, -1);
 
@@ -2488,6 +2494,8 @@ int nand_scan_ident(struct mtd_info *mtd, int maxchips)
 	/* Check for a chip array */
 	for (i = 1; i < maxchips; i++) {
 		chip->select_chip(mtd, i);
+		/* See comment in nand_get_flash_type for reset */
+		chip->cmdfunc(mtd, NAND_CMD_RESET, -1, -1);
 		/* Send the command for reading device ID */
 		chip->cmdfunc(mtd, NAND_CMD_READID, 0x00, -1);
 		/* Read manufacturer and device IDs */
diff --git a/drivers/mtd/nand/nand_ecc.c b/drivers/mtd/nand/nand_ecc.c
index 918a806a8471..868147acce2c 100644
--- a/drivers/mtd/nand/nand_ecc.c
+++ b/drivers/mtd/nand/nand_ecc.c
@@ -1,13 +1,18 @@
 /*
- * This file contains an ECC algorithm from Toshiba that detects and
- * corrects 1 bit errors in a 256 byte block of data.
+ * This file contains an ECC algorithm that detects and corrects 1 bit
+ * errors in a 256 byte block of data.
  *
  * drivers/mtd/nand/nand_ecc.c
  *
- * Copyright (C) 2000-2004 Steven J. Hill (sjhill@realitydiluted.com)
- *                         Toshiba America Electronics Components, Inc.
+ * Copyright © 2008 Koninklijke Philips Electronics NV.
+ *                  Author: Frans Meulenbroeks
  *
- * Copyright (C) 2006 Thomas Gleixner <tglx@linutronix.de>
+ * Completely replaces the previous ECC implementation which was written by:
+ *   Steven J. Hill (sjhill@realitydiluted.com)
+ *   Thomas Gleixner (tglx@linutronix.de)
+ *
+ * Information on how this algorithm works and how it was developed
+ * can be found in Documentation/mtd/nand_ecc.txt
  *
  * This file is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the
@@ -23,174 +28,475 @@
  * with this file; if not, write to the Free Software Foundation, Inc.,
  * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
  *
- * As a special exception, if other files instantiate templates or use
- * macros or inline functions from these files, or you compile these
- * files and link them with other works to produce a work based on these
- * files, these files do not by themselves cause the resulting work to be
- * covered by the GNU General Public License. However the source code for
- * these files must still be made available in accordance with section (3)
- * of the GNU General Public License.
- *
- * This exception does not invalidate any other reasons why a work based on
- * this file might be covered by the GNU General Public License.
  */
 
+/*
+ * The STANDALONE macro is useful when running the code outside the kernel
+ * e.g. when running the code in a testbed or a benchmark program.
+ * When STANDALONE is used, the module related macros are commented out
+ * as well as the linux include files.
+ * Instead a private definition of mtd_info is given to satisfy the compiler
+ * (the code does not use mtd_info, so the code does not care)
+ */
+#ifndef STANDALONE
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/nand.h>
 #include <linux/mtd/nand_ecc.h>
+#include <asm/byteorder.h>
+#else
+#include <stdint.h>
+struct mtd_info;
+#define EXPORT_SYMBOL(x)  /* x */
+
+#define MODULE_LICENSE(x)	/* x */
+#define MODULE_AUTHOR(x)	/* x */
+#define MODULE_DESCRIPTION(x)	/* x */
+
+#define printk printf
+#define KERN_ERR		""
+#endif
+
+/*
+ * invparity is a 256 byte table that contains the odd parity
+ * for each byte. So if the number of bits in a byte is even,
+ * the array element is 1, and when the number of bits is odd
+ * the array eleemnt is 0.
+ */
+static const char invparity[256] = {
+	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+	0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+	1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1
+};
+
+/*
+ * bitsperbyte contains the number of bits per byte
+ * this is only used for testing and repairing parity
+ * (a precalculated value slightly improves performance)
+ */
+static const char bitsperbyte[256] = {
+	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
+};
 
 /*
- * Pre-calculated 256-way 1 byte column parity
+ * addressbits is a lookup table to filter out the bits from the xor-ed
+ * ecc data that identify the faulty location.
+ * this is only used for repairing parity
+ * see the comments in nand_correct_data for more details
  */
-static const u_char nand_ecc_precalc_table[] = {
-	0x00, 0x55, 0x56, 0x03, 0x59, 0x0c, 0x0f, 0x5a, 0x5a, 0x0f, 0x0c, 0x59, 0x03, 0x56, 0x55, 0x00,
-	0x65, 0x30, 0x33, 0x66, 0x3c, 0x69, 0x6a, 0x3f, 0x3f, 0x6a, 0x69, 0x3c, 0x66, 0x33, 0x30, 0x65,
-	0x66, 0x33, 0x30, 0x65, 0x3f, 0x6a, 0x69, 0x3c, 0x3c, 0x69, 0x6a, 0x3f, 0x65, 0x30, 0x33, 0x66,
-	0x03, 0x56, 0x55, 0x00, 0x5a, 0x0f, 0x0c, 0x59, 0x59, 0x0c, 0x0f, 0x5a, 0x00, 0x55, 0x56, 0x03,
-	0x69, 0x3c, 0x3f, 0x6a, 0x30, 0x65, 0x66, 0x33, 0x33, 0x66, 0x65, 0x30, 0x6a, 0x3f, 0x3c, 0x69,
-	0x0c, 0x59, 0x5a, 0x0f, 0x55, 0x00, 0x03, 0x56, 0x56, 0x03, 0x00, 0x55, 0x0f, 0x5a, 0x59, 0x0c,
-	0x0f, 0x5a, 0x59, 0x0c, 0x56, 0x03, 0x00, 0x55, 0x55, 0x00, 0x03, 0x56, 0x0c, 0x59, 0x5a, 0x0f,
-	0x6a, 0x3f, 0x3c, 0x69, 0x33, 0x66, 0x65, 0x30, 0x30, 0x65, 0x66, 0x33, 0x69, 0x3c, 0x3f, 0x6a,
-	0x6a, 0x3f, 0x3c, 0x69, 0x33, 0x66, 0x65, 0x30, 0x30, 0x65, 0x66, 0x33, 0x69, 0x3c, 0x3f, 0x6a,
-	0x0f, 0x5a, 0x59, 0x0c, 0x56, 0x03, 0x00, 0x55, 0x55, 0x00, 0x03, 0x56, 0x0c, 0x59, 0x5a, 0x0f,
-	0x0c, 0x59, 0x5a, 0x0f, 0x55, 0x00, 0x03, 0x56, 0x56, 0x03, 0x00, 0x55, 0x0f, 0x5a, 0x59, 0x0c,
-	0x69, 0x3c, 0x3f, 0x6a, 0x30, 0x65, 0x66, 0x33, 0x33, 0x66, 0x65, 0x30, 0x6a, 0x3f, 0x3c, 0x69,
-	0x03, 0x56, 0x55, 0x00, 0x5a, 0x0f, 0x0c, 0x59, 0x59, 0x0c, 0x0f, 0x5a, 0x00, 0x55, 0x56, 0x03,
-	0x66, 0x33, 0x30, 0x65, 0x3f, 0x6a, 0x69, 0x3c, 0x3c, 0x69, 0x6a, 0x3f, 0x65, 0x30, 0x33, 0x66,
-	0x65, 0x30, 0x33, 0x66, 0x3c, 0x69, 0x6a, 0x3f, 0x3f, 0x6a, 0x69, 0x3c, 0x66, 0x33, 0x30, 0x65,
-	0x00, 0x55, 0x56, 0x03, 0x59, 0x0c, 0x0f, 0x5a, 0x5a, 0x0f, 0x0c, 0x59, 0x03, 0x56, 0x55, 0x00
+static const char addressbits[256] = {
+	0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01,
+	0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03,
+	0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01,
+	0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03,
+	0x04, 0x04, 0x05, 0x05, 0x04, 0x04, 0x05, 0x05,
+	0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07,
+	0x04, 0x04, 0x05, 0x05, 0x04, 0x04, 0x05, 0x05,
+	0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07,
+	0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01,
+	0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03,
+	0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01,
+	0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03,
+	0x04, 0x04, 0x05, 0x05, 0x04, 0x04, 0x05, 0x05,
+	0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07,
+	0x04, 0x04, 0x05, 0x05, 0x04, 0x04, 0x05, 0x05,
+	0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07,
+	0x08, 0x08, 0x09, 0x09, 0x08, 0x08, 0x09, 0x09,
+	0x0a, 0x0a, 0x0b, 0x0b, 0x0a, 0x0a, 0x0b, 0x0b,
+	0x08, 0x08, 0x09, 0x09, 0x08, 0x08, 0x09, 0x09,
+	0x0a, 0x0a, 0x0b, 0x0b, 0x0a, 0x0a, 0x0b, 0x0b,
+	0x0c, 0x0c, 0x0d, 0x0d, 0x0c, 0x0c, 0x0d, 0x0d,
+	0x0e, 0x0e, 0x0f, 0x0f, 0x0e, 0x0e, 0x0f, 0x0f,
+	0x0c, 0x0c, 0x0d, 0x0d, 0x0c, 0x0c, 0x0d, 0x0d,
+	0x0e, 0x0e, 0x0f, 0x0f, 0x0e, 0x0e, 0x0f, 0x0f,
+	0x08, 0x08, 0x09, 0x09, 0x08, 0x08, 0x09, 0x09,
+	0x0a, 0x0a, 0x0b, 0x0b, 0x0a, 0x0a, 0x0b, 0x0b,
+	0x08, 0x08, 0x09, 0x09, 0x08, 0x08, 0x09, 0x09,
+	0x0a, 0x0a, 0x0b, 0x0b, 0x0a, 0x0a, 0x0b, 0x0b,
+	0x0c, 0x0c, 0x0d, 0x0d, 0x0c, 0x0c, 0x0d, 0x0d,
+	0x0e, 0x0e, 0x0f, 0x0f, 0x0e, 0x0e, 0x0f, 0x0f,
+	0x0c, 0x0c, 0x0d, 0x0d, 0x0c, 0x0c, 0x0d, 0x0d,
+	0x0e, 0x0e, 0x0f, 0x0f, 0x0e, 0x0e, 0x0f, 0x0f
 };
 
 /**
- * nand_calculate_ecc - [NAND Interface] Calculate 3-byte ECC for 256-byte block
+ * nand_calculate_ecc - [NAND Interface] Calculate 3-byte ECC for 256/512-byte
+ *			 block
  * @mtd:	MTD block structure
- * @dat:	raw data
- * @ecc_code:	buffer for ECC
+ * @buf:	input buffer with raw data
+ * @code:	output buffer with ECC
  */
-int nand_calculate_ecc(struct mtd_info *mtd, const u_char *dat,
-		       u_char *ecc_code)
+int nand_calculate_ecc(struct mtd_info *mtd, const unsigned char *buf,
+		       unsigned char *code)
 {
-	uint8_t idx, reg1, reg2, reg3, tmp1, tmp2;
 	int i;
+	const uint32_t *bp = (uint32_t *)buf;
+	/* 256 or 512 bytes/ecc  */
+	const uint32_t eccsize_mult =
+			(((struct nand_chip *)mtd->priv)->ecc.size) >> 8;
+	uint32_t cur;		/* current value in buffer */
+	/* rp0..rp15..rp17 are the various accumulated parities (per byte) */
+	uint32_t rp0, rp1, rp2, rp3, rp4, rp5, rp6, rp7;
+	uint32_t rp8, rp9, rp10, rp11, rp12, rp13, rp14, rp15, rp16;
+	uint32_t uninitialized_var(rp17);	/* to make compiler happy */
+	uint32_t par;		/* the cumulative parity for all data */
+	uint32_t tmppar;	/* the cumulative parity for this iteration;
+				   for rp12, rp14 and rp16 at the end of the
+				   loop */
+
+	par = 0;
+	rp4 = 0;
+	rp6 = 0;
+	rp8 = 0;
+	rp10 = 0;
+	rp12 = 0;
+	rp14 = 0;
+	rp16 = 0;
+
+	/*
+	 * The loop is unrolled a number of times;
+	 * This avoids if statements to decide on which rp value to update
+	 * Also we process the data by longwords.
+	 * Note: passing unaligned data might give a performance penalty.
+	 * It is assumed that the buffers are aligned.
+	 * tmppar is the cumulative sum of this iteration.
+	 * needed for calculating rp12, rp14, rp16 and par
+	 * also used as a performance improvement for rp6, rp8 and rp10
+	 */
+	for (i = 0; i < eccsize_mult << 2; i++) {
+		cur = *bp++;
+		tmppar = cur;
+		rp4 ^= cur;
+		cur = *bp++;
+		tmppar ^= cur;
+		rp6 ^= tmppar;
+		cur = *bp++;
+		tmppar ^= cur;
+		rp4 ^= cur;
+		cur = *bp++;
+		tmppar ^= cur;
+		rp8 ^= tmppar;
 
-	/* Initialize variables */
-	reg1 = reg2 = reg3 = 0;
+		cur = *bp++;
+		tmppar ^= cur;
+		rp4 ^= cur;
+		rp6 ^= cur;
+		cur = *bp++;
+		tmppar ^= cur;
+		rp6 ^= cur;
+		cur = *bp++;
+		tmppar ^= cur;
+		rp4 ^= cur;
+		cur = *bp++;
+		tmppar ^= cur;
+		rp10 ^= tmppar;
 
-	/* Build up column parity */
-	for(i = 0; i < 256; i++) {
-		/* Get CP0 - CP5 from table */
-		idx = nand_ecc_precalc_table[*dat++];
-		reg1 ^= (idx & 0x3f);
+		cur = *bp++;
+		tmppar ^= cur;
+		rp4 ^= cur;
+		rp6 ^= cur;
+		rp8 ^= cur;
+		cur = *bp++;
+		tmppar ^= cur;
+		rp6 ^= cur;
+		rp8 ^= cur;
+		cur = *bp++;
+		tmppar ^= cur;
+		rp4 ^= cur;
+		rp8 ^= cur;
+		cur = *bp++;
+		tmppar ^= cur;
+		rp8 ^= cur;
 
-		/* All bit XOR = 1 ? */
-		if (idx & 0x40) {
-			reg3 ^= (uint8_t) i;
-			reg2 ^= ~((uint8_t) i);
-		}
+		cur = *bp++;
+		tmppar ^= cur;
+		rp4 ^= cur;
+		rp6 ^= cur;
+		cur = *bp++;
+		tmppar ^= cur;
+		rp6 ^= cur;
+		cur = *bp++;
+		tmppar ^= cur;
+		rp4 ^= cur;
+		cur = *bp++;
+		tmppar ^= cur;
+
+		par ^= tmppar;
+		if ((i & 0x1) == 0)
+			rp12 ^= tmppar;
+		if ((i & 0x2) == 0)
+			rp14 ^= tmppar;
+		if (eccsize_mult == 2 && (i & 0x4) == 0)
+			rp16 ^= tmppar;
 	}
 
-	/* Create non-inverted ECC code from line parity */
-	tmp1  = (reg3 & 0x80) >> 0; /* B7 -> B7 */
-	tmp1 |= (reg2 & 0x80) >> 1; /* B7 -> B6 */
-	tmp1 |= (reg3 & 0x40) >> 1; /* B6 -> B5 */
-	tmp1 |= (reg2 & 0x40) >> 2; /* B6 -> B4 */
-	tmp1 |= (reg3 & 0x20) >> 2; /* B5 -> B3 */
-	tmp1 |= (reg2 & 0x20) >> 3; /* B5 -> B2 */
-	tmp1 |= (reg3 & 0x10) >> 3; /* B4 -> B1 */
-	tmp1 |= (reg2 & 0x10) >> 4; /* B4 -> B0 */
-
-	tmp2  = (reg3 & 0x08) << 4; /* B3 -> B7 */
-	tmp2 |= (reg2 & 0x08) << 3; /* B3 -> B6 */
-	tmp2 |= (reg3 & 0x04) << 3; /* B2 -> B5 */
-	tmp2 |= (reg2 & 0x04) << 2; /* B2 -> B4 */
-	tmp2 |= (reg3 & 0x02) << 2; /* B1 -> B3 */
-	tmp2 |= (reg2 & 0x02) << 1; /* B1 -> B2 */
-	tmp2 |= (reg3 & 0x01) << 1; /* B0 -> B1 */
-	tmp2 |= (reg2 & 0x01) << 0; /* B7 -> B0 */
-
-	/* Calculate final ECC code */
-#ifdef CONFIG_MTD_NAND_ECC_SMC
-	ecc_code[0] = ~tmp2;
-	ecc_code[1] = ~tmp1;
+	/*
+	 * handle the fact that we use longword operations
+	 * we'll bring rp4..rp14..rp16 back to single byte entities by
+	 * shifting and xoring first fold the upper and lower 16 bits,
+	 * then the upper and lower 8 bits.
+	 */
+	rp4 ^= (rp4 >> 16);
+	rp4 ^= (rp4 >> 8);
+	rp4 &= 0xff;
+	rp6 ^= (rp6 >> 16);
+	rp6 ^= (rp6 >> 8);
+	rp6 &= 0xff;
+	rp8 ^= (rp8 >> 16);
+	rp8 ^= (rp8 >> 8);
+	rp8 &= 0xff;
+	rp10 ^= (rp10 >> 16);
+	rp10 ^= (rp10 >> 8);
+	rp10 &= 0xff;
+	rp12 ^= (rp12 >> 16);
+	rp12 ^= (rp12 >> 8);
+	rp12 &= 0xff;
+	rp14 ^= (rp14 >> 16);
+	rp14 ^= (rp14 >> 8);
+	rp14 &= 0xff;
+	if (eccsize_mult == 2) {
+		rp16 ^= (rp16 >> 16);
+		rp16 ^= (rp16 >> 8);
+		rp16 &= 0xff;
+	}
+
+	/*
+	 * we also need to calculate the row parity for rp0..rp3
+	 * This is present in par, because par is now
+	 * rp3 rp3 rp2 rp2 in little endian and
+	 * rp2 rp2 rp3 rp3 in big endian
+	 * as well as
+	 * rp1 rp0 rp1 rp0 in little endian and
+	 * rp0 rp1 rp0 rp1 in big endian
+	 * First calculate rp2 and rp3
+	 */
+#ifdef __BIG_ENDIAN
+	rp2 = (par >> 16);
+	rp2 ^= (rp2 >> 8);
+	rp2 &= 0xff;
+	rp3 = par & 0xffff;
+	rp3 ^= (rp3 >> 8);
+	rp3 &= 0xff;
 #else
-	ecc_code[0] = ~tmp1;
-	ecc_code[1] = ~tmp2;
+	rp3 = (par >> 16);
+	rp3 ^= (rp3 >> 8);
+	rp3 &= 0xff;
+	rp2 = par & 0xffff;
+	rp2 ^= (rp2 >> 8);
+	rp2 &= 0xff;
 #endif
-	ecc_code[2] = ((~reg1) << 2) | 0x03;
 
-	return 0;
-}
-EXPORT_SYMBOL(nand_calculate_ecc);
+	/* reduce par to 16 bits then calculate rp1 and rp0 */
+	par ^= (par >> 16);
+#ifdef __BIG_ENDIAN
+	rp0 = (par >> 8) & 0xff;
+	rp1 = (par & 0xff);
+#else
+	rp1 = (par >> 8) & 0xff;
+	rp0 = (par & 0xff);
+#endif
 
-static inline int countbits(uint32_t byte)
-{
-	int res = 0;
+	/* finally reduce par to 8 bits */
+	par ^= (par >> 8);
+	par &= 0xff;
 
-	for (;byte; byte >>= 1)
-		res += byte & 0x01;
-	return res;
+	/*
+	 * and calculate rp5..rp15..rp17
+	 * note that par = rp4 ^ rp5 and due to the commutative property
+	 * of the ^ operator we can say:
+	 * rp5 = (par ^ rp4);
+	 * The & 0xff seems superfluous, but benchmarking learned that
+	 * leaving it out gives slightly worse results. No idea why, probably
+	 * it has to do with the way the pipeline in pentium is organized.
+	 */
+	rp5 = (par ^ rp4) & 0xff;
+	rp7 = (par ^ rp6) & 0xff;
+	rp9 = (par ^ rp8) & 0xff;
+	rp11 = (par ^ rp10) & 0xff;
+	rp13 = (par ^ rp12) & 0xff;
+	rp15 = (par ^ rp14) & 0xff;
+	if (eccsize_mult == 2)
+		rp17 = (par ^ rp16) & 0xff;
+
+	/*
+	 * Finally calculate the ecc bits.
+	 * Again here it might seem that there are performance optimisations
+	 * possible, but benchmarks showed that on the system this is developed
+	 * the code below is the fastest
+	 */
+#ifdef CONFIG_MTD_NAND_ECC_SMC
+	code[0] =
+	    (invparity[rp7] << 7) |
+	    (invparity[rp6] << 6) |
+	    (invparity[rp5] << 5) |
+	    (invparity[rp4] << 4) |
+	    (invparity[rp3] << 3) |
+	    (invparity[rp2] << 2) |
+	    (invparity[rp1] << 1) |
+	    (invparity[rp0]);
+	code[1] =
+	    (invparity[rp15] << 7) |
+	    (invparity[rp14] << 6) |
+	    (invparity[rp13] << 5) |
+	    (invparity[rp12] << 4) |
+	    (invparity[rp11] << 3) |
+	    (invparity[rp10] << 2) |
+	    (invparity[rp9] << 1)  |
+	    (invparity[rp8]);
+#else
+	code[1] =
+	    (invparity[rp7] << 7) |
+	    (invparity[rp6] << 6) |
+	    (invparity[rp5] << 5) |
+	    (invparity[rp4] << 4) |
+	    (invparity[rp3] << 3) |
+	    (invparity[rp2] << 2) |
+	    (invparity[rp1] << 1) |
+	    (invparity[rp0]);
+	code[0] =
+	    (invparity[rp15] << 7) |
+	    (invparity[rp14] << 6) |
+	    (invparity[rp13] << 5) |
+	    (invparity[rp12] << 4) |
+	    (invparity[rp11] << 3) |
+	    (invparity[rp10] << 2) |
+	    (invparity[rp9] << 1)  |
+	    (invparity[rp8]);
+#endif
+	if (eccsize_mult == 1)
+		code[2] =
+		    (invparity[par & 0xf0] << 7) |
+		    (invparity[par & 0x0f] << 6) |
+		    (invparity[par & 0xcc] << 5) |
+		    (invparity[par & 0x33] << 4) |
+		    (invparity[par & 0xaa] << 3) |
+		    (invparity[par & 0x55] << 2) |
+		    3;
+	else
+		code[2] =
+		    (invparity[par & 0xf0] << 7) |
+		    (invparity[par & 0x0f] << 6) |
+		    (invparity[par & 0xcc] << 5) |
+		    (invparity[par & 0x33] << 4) |
+		    (invparity[par & 0xaa] << 3) |
+		    (invparity[par & 0x55] << 2) |
+		    (invparity[rp17] << 1) |
+		    (invparity[rp16] << 0);
+	return 0;
 }
+EXPORT_SYMBOL(nand_calculate_ecc);
 
 /**
  * nand_correct_data - [NAND Interface] Detect and correct bit error(s)
  * @mtd:	MTD block structure
- * @dat:	raw data read from the chip
+ * @buf:	raw data read from the chip
  * @read_ecc:	ECC from the chip
  * @calc_ecc:	the ECC calculated from raw data
  *
- * Detect and correct a 1 bit error for 256 byte block
+ * Detect and correct a 1 bit error for 256/512 byte block
  */
-int nand_correct_data(struct mtd_info *mtd, u_char *dat,
-		      u_char *read_ecc, u_char *calc_ecc)
+int nand_correct_data(struct mtd_info *mtd, unsigned char *buf,
+		      unsigned char *read_ecc, unsigned char *calc_ecc)
 {
-	uint8_t s0, s1, s2;
+	unsigned char b0, b1, b2;
+	unsigned char byte_addr, bit_addr;
+	/* 256 or 512 bytes/ecc  */
+	const uint32_t eccsize_mult =
+			(((struct nand_chip *)mtd->priv)->ecc.size) >> 8;
 
+	/*
+	 * b0 to b2 indicate which bit is faulty (if any)
+	 * we might need the xor result  more than once,
+	 * so keep them in a local var
+	*/
 #ifdef CONFIG_MTD_NAND_ECC_SMC
-	s0 = calc_ecc[0] ^ read_ecc[0];
-	s1 = calc_ecc[1] ^ read_ecc[1];
-	s2 = calc_ecc[2] ^ read_ecc[2];
+	b0 = read_ecc[0] ^ calc_ecc[0];
+	b1 = read_ecc[1] ^ calc_ecc[1];
 #else
-	s1 = calc_ecc[0] ^ read_ecc[0];
-	s0 = calc_ecc[1] ^ read_ecc[1];
-	s2 = calc_ecc[2] ^ read_ecc[2];
+	b0 = read_ecc[1] ^ calc_ecc[1];
+	b1 = read_ecc[0] ^ calc_ecc[0];
 #endif
-	if ((s0 | s1 | s2) == 0)
-		return 0;
-
-	/* Check for a single bit error */
-	if( ((s0 ^ (s0 >> 1)) & 0x55) == 0x55 &&
-	    ((s1 ^ (s1 >> 1)) & 0x55) == 0x55 &&
-	    ((s2 ^ (s2 >> 1)) & 0x54) == 0x54) {
+	b2 = read_ecc[2] ^ calc_ecc[2];
 
-		uint32_t byteoffs, bitnum;
+	/* check if there are any bitfaults */
 
-		byteoffs = (s1 << 0) & 0x80;
-		byteoffs |= (s1 << 1) & 0x40;
-		byteoffs |= (s1 << 2) & 0x20;
-		byteoffs |= (s1 << 3) & 0x10;
+	/* repeated if statements are slightly more efficient than switch ... */
+	/* ordered in order of likelihood */
 
-		byteoffs |= (s0 >> 4) & 0x08;
-		byteoffs |= (s0 >> 3) & 0x04;
-		byteoffs |= (s0 >> 2) & 0x02;
-		byteoffs |= (s0 >> 1) & 0x01;
-
-		bitnum = (s2 >> 5) & 0x04;
-		bitnum |= (s2 >> 4) & 0x02;
-		bitnum |= (s2 >> 3) & 0x01;
-
-		dat[byteoffs] ^= (1 << bitnum);
+	if ((b0 | b1 | b2) == 0)
+		return 0;	/* no error */
 
+	if ((((b0 ^ (b0 >> 1)) & 0x55) == 0x55) &&
+	    (((b1 ^ (b1 >> 1)) & 0x55) == 0x55) &&
+	    ((eccsize_mult == 1 && ((b2 ^ (b2 >> 1)) & 0x54) == 0x54) ||
+	     (eccsize_mult == 2 && ((b2 ^ (b2 >> 1)) & 0x55) == 0x55))) {
+	/* single bit error */
+		/*
+		 * rp17/rp15/13/11/9/7/5/3/1 indicate which byte is the faulty
+		 * byte, cp 5/3/1 indicate the faulty bit.
+		 * A lookup table (called addressbits) is used to filter
+		 * the bits from the byte they are in.
+		 * A marginal optimisation is possible by having three
+		 * different lookup tables.
+		 * One as we have now (for b0), one for b2
+		 * (that would avoid the >> 1), and one for b1 (with all values
+		 * << 4). However it was felt that introducing two more tables
+		 * hardly justify the gain.
+		 *
+		 * The b2 shift is there to get rid of the lowest two bits.
+		 * We could also do addressbits[b2] >> 1 but for the
+		 * performace it does not make any difference
+		 */
+		if (eccsize_mult == 1)
+			byte_addr = (addressbits[b1] << 4) + addressbits[b0];
+		else
+			byte_addr = (addressbits[b2 & 0x3] << 8) +
+				    (addressbits[b1] << 4) + addressbits[b0];
+		bit_addr = addressbits[b2 >> 2];
+		/* flip the bit */
+		buf[byte_addr] ^= (1 << bit_addr);
 		return 1;
-	}
 
-	if(countbits(s0 | ((uint32_t)s1 << 8) | ((uint32_t)s2 <<16)) == 1)
-		return 1;
+	}
+	/* count nr of bits; use table lookup, faster than calculating it */
+	if ((bitsperbyte[b0] + bitsperbyte[b1] + bitsperbyte[b2]) == 1)
+		return 1;	/* error in ecc data; no action needed */
 
-	return -EBADMSG;
+	printk(KERN_ERR "uncorrectable error : ");
+	return -1;
 }
 EXPORT_SYMBOL(nand_correct_data);
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Steven J. Hill <sjhill@realitydiluted.com>");
+MODULE_AUTHOR("Frans Meulenbroeks <fransmeulenbroeks@gmail.com>");
 MODULE_DESCRIPTION("Generic NAND ECC support");
diff --git a/drivers/mtd/nand/nandsim.c b/drivers/mtd/nand/nandsim.c
index 556e8131ecdc..ae7c57781a68 100644
--- a/drivers/mtd/nand/nandsim.c
+++ b/drivers/mtd/nand/nandsim.c
@@ -38,7 +38,6 @@
 #include <linux/delay.h>
 #include <linux/list.h>
 #include <linux/random.h>
-#include <asm/div64.h>
 
 /* Default simulator parameters values */
 #if !defined(CONFIG_NANDSIM_FIRST_ID_BYTE)  || \
diff --git a/drivers/mtd/nand/pxa3xx_nand.c b/drivers/mtd/nand/pxa3xx_nand.c
index a64ad15b8fdd..c0fa9c9edf08 100644
--- a/drivers/mtd/nand/pxa3xx_nand.c
+++ b/drivers/mtd/nand/pxa3xx_nand.c
@@ -115,55 +115,11 @@ enum {
 	STATE_PIO_WRITING,
 };
 
-struct pxa3xx_nand_timing {
-	unsigned int	tCH;  /* Enable signal hold time */
-	unsigned int	tCS;  /* Enable signal setup time */
-	unsigned int	tWH;  /* ND_nWE high duration */
-	unsigned int	tWP;  /* ND_nWE pulse time */
-	unsigned int	tRH;  /* ND_nRE high duration */
-	unsigned int	tRP;  /* ND_nRE pulse width */
-	unsigned int	tR;   /* ND_nWE high to ND_nRE low for read */
-	unsigned int	tWHR; /* ND_nWE high to ND_nRE low for status read */
-	unsigned int	tAR;  /* ND_ALE low to ND_nRE low delay */
-};
-
-struct pxa3xx_nand_cmdset {
-	uint16_t	read1;
-	uint16_t	read2;
-	uint16_t	program;
-	uint16_t	read_status;
-	uint16_t	read_id;
-	uint16_t	erase;
-	uint16_t	reset;
-	uint16_t	lock;
-	uint16_t	unlock;
-	uint16_t	lock_status;
-};
-
-struct pxa3xx_nand_flash {
-	struct pxa3xx_nand_timing *timing; /* NAND Flash timing */
-	struct pxa3xx_nand_cmdset *cmdset;
-
-	uint32_t page_per_block;/* Pages per block (PG_PER_BLK) */
-	uint32_t page_size;	/* Page size in bytes (PAGE_SZ) */
-	uint32_t flash_width;	/* Width of Flash memory (DWIDTH_M) */
-	uint32_t dfc_width;	/* Width of flash controller(DWIDTH_C) */
-	uint32_t num_blocks;	/* Number of physical blocks in Flash */
-	uint32_t chip_id;
-
-	/* NOTE: these are automatically calculated, do not define */
-	size_t		oob_size;
-	size_t		read_id_bytes;
-
-	unsigned int	col_addr_cycles;
-	unsigned int	row_addr_cycles;
-};
-
 struct pxa3xx_nand_info {
 	struct nand_chip	nand_chip;
 
 	struct platform_device	 *pdev;
-	struct pxa3xx_nand_flash *flash_info;
+	const struct pxa3xx_nand_flash *flash_info;
 
 	struct clk		*clk;
 	void __iomem		*mmio_base;
@@ -202,12 +158,20 @@ struct pxa3xx_nand_info {
 	uint32_t		ndcb0;
 	uint32_t		ndcb1;
 	uint32_t		ndcb2;
+
+	/* calculated from pxa3xx_nand_flash data */
+	size_t		oob_size;
+	size_t		read_id_bytes;
+
+	unsigned int	col_addr_cycles;
+	unsigned int	row_addr_cycles;
 };
 
 static int use_dma = 1;
 module_param(use_dma, bool, 0444);
 MODULE_PARM_DESC(use_dma, "enable DMA for data transfering to/from NAND HW");
 
+#ifdef CONFIG_MTD_NAND_PXA3xx_BUILTIN
 static struct pxa3xx_nand_cmdset smallpage_cmdset = {
 	.read1		= 0x0000,
 	.read2		= 0x0050,
@@ -291,11 +255,35 @@ static struct pxa3xx_nand_flash micron1GbX16 = {
 	.chip_id	= 0xb12c,
 };
 
+static struct pxa3xx_nand_timing stm2GbX16_timing = {
+	.tCH = 10,
+	.tCS = 35,
+	.tWH = 15,
+	.tWP = 25,
+	.tRH = 15,
+	.tRP = 25,
+	.tR = 25000,
+	.tWHR = 60,
+	.tAR = 10,
+};
+
+static struct pxa3xx_nand_flash stm2GbX16 = {
+	.timing = &stm2GbX16_timing,
+	.page_per_block = 64,
+	.page_size = 2048,
+	.flash_width = 16,
+	.dfc_width = 16,
+	.num_blocks = 2048,
+	.chip_id = 0xba20,
+};
+
 static struct pxa3xx_nand_flash *builtin_flash_types[] = {
 	&samsung512MbX16,
 	&micron1GbX8,
 	&micron1GbX16,
+	&stm2GbX16,
 };
+#endif /* CONFIG_MTD_NAND_PXA3xx_BUILTIN */
 
 #define NDTR0_tCH(c)	(min((c), 7) << 19)
 #define NDTR0_tCS(c)	(min((c), 7) << 16)
@@ -312,7 +300,7 @@ static struct pxa3xx_nand_flash *builtin_flash_types[] = {
 #define ns2cycle(ns, clk)	(int)(((ns) * (clk / 1000000) / 1000) + 1)
 
 static void pxa3xx_nand_set_timing(struct pxa3xx_nand_info *info,
-				   struct pxa3xx_nand_timing *t)
+				   const struct pxa3xx_nand_timing *t)
 {
 	unsigned long nand_clk = clk_get_rate(info->clk);
 	uint32_t ndtr0, ndtr1;
@@ -354,8 +342,8 @@ static int wait_for_event(struct pxa3xx_nand_info *info, uint32_t event)
 static int prepare_read_prog_cmd(struct pxa3xx_nand_info *info,
 			uint16_t cmd, int column, int page_addr)
 {
-	struct pxa3xx_nand_flash *f = info->flash_info;
-	struct pxa3xx_nand_cmdset *cmdset = f->cmdset;
+	const struct pxa3xx_nand_flash *f = info->flash_info;
+	const struct pxa3xx_nand_cmdset *cmdset = f->cmdset;
 
 	/* calculate data size */
 	switch (f->page_size) {
@@ -373,14 +361,14 @@ static int prepare_read_prog_cmd(struct pxa3xx_nand_info *info,
 	info->ndcb0 = cmd | ((cmd & 0xff00) ? NDCB0_DBC : 0);
 	info->ndcb1 = 0;
 	info->ndcb2 = 0;
-	info->ndcb0 |= NDCB0_ADDR_CYC(f->row_addr_cycles + f->col_addr_cycles);
+	info->ndcb0 |= NDCB0_ADDR_CYC(info->row_addr_cycles + info->col_addr_cycles);
 
-	if (f->col_addr_cycles == 2) {
+	if (info->col_addr_cycles == 2) {
 		/* large block, 2 cycles for column address
 		 * row address starts from 3rd cycle
 		 */
 		info->ndcb1 |= (page_addr << 16) | (column & 0xffff);
-		if (f->row_addr_cycles == 3)
+		if (info->row_addr_cycles == 3)
 			info->ndcb2 = (page_addr >> 16) & 0xff;
 	} else
 		/* small block, 1 cycles for column address
@@ -406,7 +394,7 @@ static int prepare_erase_cmd(struct pxa3xx_nand_info *info,
 
 static int prepare_other_cmd(struct pxa3xx_nand_info *info, uint16_t cmd)
 {
-	struct pxa3xx_nand_cmdset *cmdset = info->flash_info->cmdset;
+	const struct pxa3xx_nand_cmdset *cmdset = info->flash_info->cmdset;
 
 	info->ndcb0 = cmd | ((cmd & 0xff00) ? NDCB0_DBC : 0);
 	info->ndcb1 = 0;
@@ -641,8 +629,8 @@ static void pxa3xx_nand_cmdfunc(struct mtd_info *mtd, unsigned command,
 				int column, int page_addr)
 {
 	struct pxa3xx_nand_info *info = mtd->priv;
-	struct pxa3xx_nand_flash *flash_info = info->flash_info;
-	struct pxa3xx_nand_cmdset *cmdset = flash_info->cmdset;
+	const struct pxa3xx_nand_flash *flash_info = info->flash_info;
+	const struct pxa3xx_nand_cmdset *cmdset = flash_info->cmdset;
 	int ret;
 
 	info->use_dma = (use_dma) ? 1 : 0;
@@ -720,7 +708,7 @@ static void pxa3xx_nand_cmdfunc(struct mtd_info *mtd, unsigned command,
 		info->use_dma = 0;	/* force PIO read */
 		info->buf_start = 0;
 		info->buf_count = (command == NAND_CMD_READID) ?
-				flash_info->read_id_bytes : 1;
+				info->read_id_bytes : 1;
 
 		if (prepare_other_cmd(info, (command == NAND_CMD_READID) ?
 				cmdset->read_id : cmdset->read_status))
@@ -861,8 +849,8 @@ static int pxa3xx_nand_ecc_correct(struct mtd_info *mtd,
 
 static int __readid(struct pxa3xx_nand_info *info, uint32_t *id)
 {
-	struct pxa3xx_nand_flash *f = info->flash_info;
-	struct pxa3xx_nand_cmdset *cmdset = f->cmdset;
+	const struct pxa3xx_nand_flash *f = info->flash_info;
+	const struct pxa3xx_nand_cmdset *cmdset = f->cmdset;
 	uint32_t ndcr;
 	uint8_t  id_buff[8];
 
@@ -891,7 +879,7 @@ fail_timeout:
 }
 
 static int pxa3xx_nand_config_flash(struct pxa3xx_nand_info *info,
-				    struct pxa3xx_nand_flash *f)
+				    const struct pxa3xx_nand_flash *f)
 {
 	struct platform_device *pdev = info->pdev;
 	struct pxa3xx_nand_platform_data *pdata = pdev->dev.platform_data;
@@ -904,25 +892,25 @@ static int pxa3xx_nand_config_flash(struct pxa3xx_nand_info *info,
 		return -EINVAL;
 
 	/* calculate flash information */
-	f->oob_size = (f->page_size == 2048) ? 64 : 16;
-	f->read_id_bytes = (f->page_size == 2048) ? 4 : 2;
+	info->oob_size = (f->page_size == 2048) ? 64 : 16;
+	info->read_id_bytes = (f->page_size == 2048) ? 4 : 2;
 
 	/* calculate addressing information */
-	f->col_addr_cycles = (f->page_size == 2048) ? 2 : 1;
+	info->col_addr_cycles = (f->page_size == 2048) ? 2 : 1;
 
 	if (f->num_blocks * f->page_per_block > 65536)
-		f->row_addr_cycles = 3;
+		info->row_addr_cycles = 3;
 	else
-		f->row_addr_cycles = 2;
+		info->row_addr_cycles = 2;
 
 	ndcr |= (pdata->enable_arbiter) ? NDCR_ND_ARB_EN : 0;
-	ndcr |= (f->col_addr_cycles == 2) ? NDCR_RA_START : 0;
+	ndcr |= (info->col_addr_cycles == 2) ? NDCR_RA_START : 0;
 	ndcr |= (f->page_per_block == 64) ? NDCR_PG_PER_BLK : 0;
 	ndcr |= (f->page_size == 2048) ? NDCR_PAGE_SZ : 0;
 	ndcr |= (f->flash_width == 16) ? NDCR_DWIDTH_M : 0;
 	ndcr |= (f->dfc_width == 16) ? NDCR_DWIDTH_C : 0;
 
-	ndcr |= NDCR_RD_ID_CNT(f->read_id_bytes);
+	ndcr |= NDCR_RD_ID_CNT(info->read_id_bytes);
 	ndcr |= NDCR_SPARE_EN; /* enable spare by default */
 
 	info->reg_ndcr = ndcr;
@@ -932,12 +920,27 @@ static int pxa3xx_nand_config_flash(struct pxa3xx_nand_info *info,
 	return 0;
 }
 
-static int pxa3xx_nand_detect_flash(struct pxa3xx_nand_info *info)
+static int pxa3xx_nand_detect_flash(struct pxa3xx_nand_info *info,
+				    const struct pxa3xx_nand_platform_data *pdata)
 {
-	struct pxa3xx_nand_flash *f;
-	uint32_t id;
+	const struct pxa3xx_nand_flash *f;
+	uint32_t id = -1;
 	int i;
 
+	for (i = 0; i<pdata->num_flash; ++i) {
+		f = pdata->flash + i;
+
+		if (pxa3xx_nand_config_flash(info, f))
+			continue;
+
+		if (__readid(info, &id))
+			continue;
+
+		if (id == f->chip_id)
+			return 0;
+	}
+
+#ifdef CONFIG_MTD_NAND_PXA3xx_BUILTIN
 	for (i = 0; i < ARRAY_SIZE(builtin_flash_types); i++) {
 
 		f = builtin_flash_types[i];
@@ -951,7 +954,11 @@ static int pxa3xx_nand_detect_flash(struct pxa3xx_nand_info *info)
 		if (id == f->chip_id)
 			return 0;
 	}
+#endif
 
+	dev_warn(&info->pdev->dev,
+		 "failed to detect configured nand flash; found %04x instead of\n",
+		 id);
 	return -ENODEV;
 }
 
@@ -1014,7 +1021,7 @@ static struct nand_ecclayout hw_largepage_ecclayout = {
 static void pxa3xx_nand_init_mtd(struct mtd_info *mtd,
 				 struct pxa3xx_nand_info *info)
 {
-	struct pxa3xx_nand_flash *f = info->flash_info;
+	const struct pxa3xx_nand_flash *f = info->flash_info;
 	struct nand_chip *this = &info->nand_chip;
 
 	this->options = (f->flash_width == 16) ? NAND_BUSWIDTH_16: 0;
@@ -1135,7 +1142,7 @@ static int pxa3xx_nand_probe(struct platform_device *pdev)
 		goto fail_free_buf;
 	}
 
-	ret = pxa3xx_nand_detect_flash(info);
+	ret = pxa3xx_nand_detect_flash(info, pdata);
 	if (ret) {
 		dev_err(&pdev->dev, "failed to detect flash\n");
 		ret = -ENODEV;
diff --git a/drivers/mtd/nand/sh_flctl.c b/drivers/mtd/nand/sh_flctl.c
new file mode 100644
index 000000000000..821acb08ff1c
--- /dev/null
+++ b/drivers/mtd/nand/sh_flctl.c
@@ -0,0 +1,878 @@
+/*
+ * SuperH FLCTL nand controller
+ *
+ * Copyright © 2008 Renesas Solutions Corp.
+ * Copyright © 2008 Atom Create Engineering Co., Ltd.
+ *
+ * Based on fsl_elbc_nand.c, Copyright © 2006-2007 Freescale Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/platform_device.h>
+
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/nand.h>
+#include <linux/mtd/partitions.h>
+#include <linux/mtd/sh_flctl.h>
+
+static struct nand_ecclayout flctl_4secc_oob_16 = {
+	.eccbytes = 10,
+	.eccpos = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
+	.oobfree = {
+		{.offset = 12,
+		. length = 4} },
+};
+
+static struct nand_ecclayout flctl_4secc_oob_64 = {
+	.eccbytes = 10,
+	.eccpos = {48, 49, 50, 51, 52, 53, 54, 55, 56, 57},
+	.oobfree = {
+		{.offset = 60,
+		. length = 4} },
+};
+
+static uint8_t scan_ff_pattern[] = { 0xff, 0xff };
+
+static struct nand_bbt_descr flctl_4secc_smallpage = {
+	.options = NAND_BBT_SCAN2NDPAGE,
+	.offs = 11,
+	.len = 1,
+	.pattern = scan_ff_pattern,
+};
+
+static struct nand_bbt_descr flctl_4secc_largepage = {
+	.options = 0,
+	.offs = 58,
+	.len = 2,
+	.pattern = scan_ff_pattern,
+};
+
+static void empty_fifo(struct sh_flctl *flctl)
+{
+	writel(0x000c0000, FLINTDMACR(flctl));	/* FIFO Clear */
+	writel(0x00000000, FLINTDMACR(flctl));	/* Clear Error flags */
+}
+
+static void start_translation(struct sh_flctl *flctl)
+{
+	writeb(TRSTRT, FLTRCR(flctl));
+}
+
+static void wait_completion(struct sh_flctl *flctl)
+{
+	uint32_t timeout = LOOP_TIMEOUT_MAX;
+
+	while (timeout--) {
+		if (readb(FLTRCR(flctl)) & TREND) {
+			writeb(0x0, FLTRCR(flctl));
+			return;
+		}
+		udelay(1);
+	}
+
+	printk(KERN_ERR "wait_completion(): Timeout occured \n");
+	writeb(0x0, FLTRCR(flctl));
+}
+
+static void set_addr(struct mtd_info *mtd, int column, int page_addr)
+{
+	struct sh_flctl *flctl = mtd_to_flctl(mtd);
+	uint32_t addr = 0;
+
+	if (column == -1) {
+		addr = page_addr;	/* ERASE1 */
+	} else if (page_addr != -1) {
+		/* SEQIN, READ0, etc.. */
+		if (flctl->page_size) {
+			addr = column & 0x0FFF;
+			addr |= (page_addr & 0xff) << 16;
+			addr |= ((page_addr >> 8) & 0xff) << 24;
+			/* big than 128MB */
+			if (flctl->rw_ADRCNT == ADRCNT2_E) {
+				uint32_t 	addr2;
+				addr2 = (page_addr >> 16) & 0xff;
+				writel(addr2, FLADR2(flctl));
+			}
+		} else {
+			addr = column;
+			addr |= (page_addr & 0xff) << 8;
+			addr |= ((page_addr >> 8) & 0xff) << 16;
+			addr |= ((page_addr >> 16) & 0xff) << 24;
+		}
+	}
+	writel(addr, FLADR(flctl));
+}
+
+static void wait_rfifo_ready(struct sh_flctl *flctl)
+{
+	uint32_t timeout = LOOP_TIMEOUT_MAX;
+
+	while (timeout--) {
+		uint32_t val;
+		/* check FIFO */
+		val = readl(FLDTCNTR(flctl)) >> 16;
+		if (val & 0xFF)
+			return;
+		udelay(1);
+	}
+	printk(KERN_ERR "wait_rfifo_ready(): Timeout occured \n");
+}
+
+static void wait_wfifo_ready(struct sh_flctl *flctl)
+{
+	uint32_t len, timeout = LOOP_TIMEOUT_MAX;
+
+	while (timeout--) {
+		/* check FIFO */
+		len = (readl(FLDTCNTR(flctl)) >> 16) & 0xFF;
+		if (len >= 4)
+			return;
+		udelay(1);
+	}
+	printk(KERN_ERR "wait_wfifo_ready(): Timeout occured \n");
+}
+
+static int wait_recfifo_ready(struct sh_flctl *flctl)
+{
+	uint32_t timeout = LOOP_TIMEOUT_MAX;
+	int checked[4];
+	void __iomem *ecc_reg[4];
+	int i;
+	uint32_t data, size;
+
+	memset(checked, 0, sizeof(checked));
+
+	while (timeout--) {
+		size = readl(FLDTCNTR(flctl)) >> 24;
+		if (size & 0xFF)
+			return 0;	/* success */
+
+		if (readl(FL4ECCCR(flctl)) & _4ECCFA)
+			return 1;	/* can't correct */
+
+		udelay(1);
+		if (!(readl(FL4ECCCR(flctl)) & _4ECCEND))
+			continue;
+
+		/* start error correction */
+		ecc_reg[0] = FL4ECCRESULT0(flctl);
+		ecc_reg[1] = FL4ECCRESULT1(flctl);
+		ecc_reg[2] = FL4ECCRESULT2(flctl);
+		ecc_reg[3] = FL4ECCRESULT3(flctl);
+
+		for (i = 0; i < 3; i++) {
+			data = readl(ecc_reg[i]);
+			if (data != INIT_FL4ECCRESULT_VAL && !checked[i]) {
+				uint8_t org;
+				int index;
+
+				index = data >> 16;
+				org = flctl->done_buff[index];
+				flctl->done_buff[index] = org ^ (data & 0xFF);
+				checked[i] = 1;
+			}
+		}
+
+		writel(0, FL4ECCCR(flctl));
+	}
+
+	printk(KERN_ERR "wait_recfifo_ready(): Timeout occured \n");
+	return 1;	/* timeout */
+}
+
+static void wait_wecfifo_ready(struct sh_flctl *flctl)
+{
+	uint32_t timeout = LOOP_TIMEOUT_MAX;
+	uint32_t len;
+
+	while (timeout--) {
+		/* check FLECFIFO */
+		len = (readl(FLDTCNTR(flctl)) >> 24) & 0xFF;
+		if (len >= 4)
+			return;
+		udelay(1);
+	}
+	printk(KERN_ERR "wait_wecfifo_ready(): Timeout occured \n");
+}
+
+static void read_datareg(struct sh_flctl *flctl, int offset)
+{
+	unsigned long data;
+	unsigned long *buf = (unsigned long *)&flctl->done_buff[offset];
+
+	wait_completion(flctl);
+
+	data = readl(FLDATAR(flctl));
+	*buf = le32_to_cpu(data);
+}
+
+static void read_fiforeg(struct sh_flctl *flctl, int rlen, int offset)
+{
+	int i, len_4align;
+	unsigned long *buf = (unsigned long *)&flctl->done_buff[offset];
+	void *fifo_addr = (void *)FLDTFIFO(flctl);
+
+	len_4align = (rlen + 3) / 4;
+
+	for (i = 0; i < len_4align; i++) {
+		wait_rfifo_ready(flctl);
+		buf[i] = readl(fifo_addr);
+		buf[i] = be32_to_cpu(buf[i]);
+	}
+}
+
+static int read_ecfiforeg(struct sh_flctl *flctl, uint8_t *buff)
+{
+	int i;
+	unsigned long *ecc_buf = (unsigned long *)buff;
+	void *fifo_addr = (void *)FLECFIFO(flctl);
+
+	for (i = 0; i < 4; i++) {
+		if (wait_recfifo_ready(flctl))
+			return 1;
+		ecc_buf[i] = readl(fifo_addr);
+		ecc_buf[i] = be32_to_cpu(ecc_buf[i]);
+	}
+
+	return 0;
+}
+
+static void write_fiforeg(struct sh_flctl *flctl, int rlen, int offset)
+{
+	int i, len_4align;
+	unsigned long *data = (unsigned long *)&flctl->done_buff[offset];
+	void *fifo_addr = (void *)FLDTFIFO(flctl);
+
+	len_4align = (rlen + 3) / 4;
+	for (i = 0; i < len_4align; i++) {
+		wait_wfifo_ready(flctl);
+		writel(cpu_to_be32(data[i]), fifo_addr);
+	}
+}
+
+static void set_cmd_regs(struct mtd_info *mtd, uint32_t cmd, uint32_t flcmcdr_val)
+{
+	struct sh_flctl *flctl = mtd_to_flctl(mtd);
+	uint32_t flcmncr_val = readl(FLCMNCR(flctl));
+	uint32_t flcmdcr_val, addr_len_bytes = 0;
+
+	/* Set SNAND bit if page size is 2048byte */
+	if (flctl->page_size)
+		flcmncr_val |= SNAND_E;
+	else
+		flcmncr_val &= ~SNAND_E;
+
+	/* default FLCMDCR val */
+	flcmdcr_val = DOCMD1_E | DOADR_E;
+
+	/* Set for FLCMDCR */
+	switch (cmd) {
+	case NAND_CMD_ERASE1:
+		addr_len_bytes = flctl->erase_ADRCNT;
+		flcmdcr_val |= DOCMD2_E;
+		break;
+	case NAND_CMD_READ0:
+	case NAND_CMD_READOOB:
+		addr_len_bytes = flctl->rw_ADRCNT;
+		flcmdcr_val |= CDSRC_E;
+		break;
+	case NAND_CMD_SEQIN:
+		/* This case is that cmd is READ0 or READ1 or READ00 */
+		flcmdcr_val &= ~DOADR_E;	/* ONLY execute 1st cmd */
+		break;
+	case NAND_CMD_PAGEPROG:
+		addr_len_bytes = flctl->rw_ADRCNT;
+		flcmdcr_val |= DOCMD2_E | CDSRC_E | SELRW;
+		break;
+	case NAND_CMD_READID:
+		flcmncr_val &= ~SNAND_E;
+		addr_len_bytes = ADRCNT_1;
+		break;
+	case NAND_CMD_STATUS:
+	case NAND_CMD_RESET:
+		flcmncr_val &= ~SNAND_E;
+		flcmdcr_val &= ~(DOADR_E | DOSR_E);
+		break;
+	default:
+		break;
+	}
+
+	/* Set address bytes parameter */
+	flcmdcr_val |= addr_len_bytes;
+
+	/* Now actually write */
+	writel(flcmncr_val, FLCMNCR(flctl));
+	writel(flcmdcr_val, FLCMDCR(flctl));
+	writel(flcmcdr_val, FLCMCDR(flctl));
+}
+
+static int flctl_read_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
+				uint8_t *buf)
+{
+	int i, eccsize = chip->ecc.size;
+	int eccbytes = chip->ecc.bytes;
+	int eccsteps = chip->ecc.steps;
+	uint8_t *p = buf;
+	struct sh_flctl *flctl = mtd_to_flctl(mtd);
+
+	for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize)
+		chip->read_buf(mtd, p, eccsize);
+
+	for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize) {
+		if (flctl->hwecc_cant_correct[i])
+			mtd->ecc_stats.failed++;
+		else
+			mtd->ecc_stats.corrected += 0;
+	}
+
+	return 0;
+}
+
+static void flctl_write_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip,
+				   const uint8_t *buf)
+{
+	int i, eccsize = chip->ecc.size;
+	int eccbytes = chip->ecc.bytes;
+	int eccsteps = chip->ecc.steps;
+	const uint8_t *p = buf;
+
+	for (i = 0; eccsteps; eccsteps--, i += eccbytes, p += eccsize)
+		chip->write_buf(mtd, p, eccsize);
+}
+
+static void execmd_read_page_sector(struct mtd_info *mtd, int page_addr)
+{
+	struct sh_flctl *flctl = mtd_to_flctl(mtd);
+	int sector, page_sectors;
+
+	if (flctl->page_size)
+		page_sectors = 4;
+	else
+		page_sectors = 1;
+
+	writel(readl(FLCMNCR(flctl)) | ACM_SACCES_MODE | _4ECCCORRECT,
+		 FLCMNCR(flctl));
+
+	set_cmd_regs(mtd, NAND_CMD_READ0,
+		(NAND_CMD_READSTART << 8) | NAND_CMD_READ0);
+
+	for (sector = 0; sector < page_sectors; sector++) {
+		int ret;
+
+		empty_fifo(flctl);
+		writel(readl(FLCMDCR(flctl)) | 1, FLCMDCR(flctl));
+		writel(page_addr << 2 | sector, FLADR(flctl));
+
+		start_translation(flctl);
+		read_fiforeg(flctl, 512, 512 * sector);
+
+		ret = read_ecfiforeg(flctl,
+			&flctl->done_buff[mtd->writesize + 16 * sector]);
+
+		if (ret)
+			flctl->hwecc_cant_correct[sector] = 1;
+
+		writel(0x0, FL4ECCCR(flctl));
+		wait_completion(flctl);
+	}
+	writel(readl(FLCMNCR(flctl)) & ~(ACM_SACCES_MODE | _4ECCCORRECT),
+			FLCMNCR(flctl));
+}
+
+static void execmd_read_oob(struct mtd_info *mtd, int page_addr)
+{
+	struct sh_flctl *flctl = mtd_to_flctl(mtd);
+
+	set_cmd_regs(mtd, NAND_CMD_READ0,
+		(NAND_CMD_READSTART << 8) | NAND_CMD_READ0);
+
+	empty_fifo(flctl);
+	if (flctl->page_size) {
+		int i;
+		/* In case that the page size is 2k */
+		for (i = 0; i < 16 * 3; i++)
+			flctl->done_buff[i] = 0xFF;
+
+		set_addr(mtd, 3 * 528 + 512, page_addr);
+		writel(16, FLDTCNTR(flctl));
+
+		start_translation(flctl);
+		read_fiforeg(flctl, 16, 16 * 3);
+		wait_completion(flctl);
+	} else {
+		/* In case that the page size is 512b */
+		set_addr(mtd, 512, page_addr);
+		writel(16, FLDTCNTR(flctl));
+
+		start_translation(flctl);
+		read_fiforeg(flctl, 16, 0);
+		wait_completion(flctl);
+	}
+}
+
+static void execmd_write_page_sector(struct mtd_info *mtd)
+{
+	struct sh_flctl *flctl = mtd_to_flctl(mtd);
+	int i, page_addr = flctl->seqin_page_addr;
+	int sector, page_sectors;
+
+	if (flctl->page_size)
+		page_sectors = 4;
+	else
+		page_sectors = 1;
+
+	writel(readl(FLCMNCR(flctl)) | ACM_SACCES_MODE, FLCMNCR(flctl));
+
+	set_cmd_regs(mtd, NAND_CMD_PAGEPROG,
+			(NAND_CMD_PAGEPROG << 8) | NAND_CMD_SEQIN);
+
+	for (sector = 0; sector < page_sectors; sector++) {
+		empty_fifo(flctl);
+		writel(readl(FLCMDCR(flctl)) | 1, FLCMDCR(flctl));
+		writel(page_addr << 2 | sector, FLADR(flctl));
+
+		start_translation(flctl);
+		write_fiforeg(flctl, 512, 512 * sector);
+
+		for (i = 0; i < 4; i++) {
+			wait_wecfifo_ready(flctl); /* wait for write ready */
+			writel(0xFFFFFFFF, FLECFIFO(flctl));
+		}
+		wait_completion(flctl);
+	}
+
+	writel(readl(FLCMNCR(flctl)) & ~ACM_SACCES_MODE, FLCMNCR(flctl));
+}
+
+static void execmd_write_oob(struct mtd_info *mtd)
+{
+	struct sh_flctl *flctl = mtd_to_flctl(mtd);
+	int page_addr = flctl->seqin_page_addr;
+	int sector, page_sectors;
+
+	if (flctl->page_size) {
+		sector = 3;
+		page_sectors = 4;
+	} else {
+		sector = 0;
+		page_sectors = 1;
+	}
+
+	set_cmd_regs(mtd, NAND_CMD_PAGEPROG,
+			(NAND_CMD_PAGEPROG << 8) | NAND_CMD_SEQIN);
+
+	for (; sector < page_sectors; sector++) {
+		empty_fifo(flctl);
+		set_addr(mtd, sector * 528 + 512, page_addr);
+		writel(16, FLDTCNTR(flctl));	/* set read size */
+
+		start_translation(flctl);
+		write_fiforeg(flctl, 16, 16 * sector);
+		wait_completion(flctl);
+	}
+}
+
+static void flctl_cmdfunc(struct mtd_info *mtd, unsigned int command,
+			int column, int page_addr)
+{
+	struct sh_flctl *flctl = mtd_to_flctl(mtd);
+	uint32_t read_cmd = 0;
+
+	flctl->read_bytes = 0;
+	if (command != NAND_CMD_PAGEPROG)
+		flctl->index = 0;
+
+	switch (command) {
+	case NAND_CMD_READ1:
+	case NAND_CMD_READ0:
+		if (flctl->hwecc) {
+			/* read page with hwecc */
+			execmd_read_page_sector(mtd, page_addr);
+			break;
+		}
+		empty_fifo(flctl);
+		if (flctl->page_size)
+			set_cmd_regs(mtd, command, (NAND_CMD_READSTART << 8)
+				| command);
+		else
+			set_cmd_regs(mtd, command, command);
+
+		set_addr(mtd, 0, page_addr);
+
+		flctl->read_bytes = mtd->writesize + mtd->oobsize;
+		flctl->index += column;
+		goto read_normal_exit;
+
+	case NAND_CMD_READOOB:
+		if (flctl->hwecc) {
+			/* read page with hwecc */
+			execmd_read_oob(mtd, page_addr);
+			break;
+		}
+
+		empty_fifo(flctl);
+		if (flctl->page_size) {
+			set_cmd_regs(mtd, command, (NAND_CMD_READSTART << 8)
+				| NAND_CMD_READ0);
+			set_addr(mtd, mtd->writesize, page_addr);
+		} else {
+			set_cmd_regs(mtd, command, command);
+			set_addr(mtd, 0, page_addr);
+		}
+		flctl->read_bytes = mtd->oobsize;
+		goto read_normal_exit;
+
+	case NAND_CMD_READID:
+		empty_fifo(flctl);
+		set_cmd_regs(mtd, command, command);
+		set_addr(mtd, 0, 0);
+
+		flctl->read_bytes = 4;
+		writel(flctl->read_bytes, FLDTCNTR(flctl)); /* set read size */
+		start_translation(flctl);
+		read_datareg(flctl, 0);	/* read and end */
+		break;
+
+	case NAND_CMD_ERASE1:
+		flctl->erase1_page_addr = page_addr;
+		break;
+
+	case NAND_CMD_ERASE2:
+		set_cmd_regs(mtd, NAND_CMD_ERASE1,
+			(command << 8) | NAND_CMD_ERASE1);
+		set_addr(mtd, -1, flctl->erase1_page_addr);
+		start_translation(flctl);
+		wait_completion(flctl);
+		break;
+
+	case NAND_CMD_SEQIN:
+		if (!flctl->page_size) {
+			/* output read command */
+			if (column >= mtd->writesize) {
+				column -= mtd->writesize;
+				read_cmd = NAND_CMD_READOOB;
+			} else if (column < 256) {
+				read_cmd = NAND_CMD_READ0;
+			} else {
+				column -= 256;
+				read_cmd = NAND_CMD_READ1;
+			}
+		}
+		flctl->seqin_column = column;
+		flctl->seqin_page_addr = page_addr;
+		flctl->seqin_read_cmd = read_cmd;
+		break;
+
+	case NAND_CMD_PAGEPROG:
+		empty_fifo(flctl);
+		if (!flctl->page_size) {
+			set_cmd_regs(mtd, NAND_CMD_SEQIN,
+					flctl->seqin_read_cmd);
+			set_addr(mtd, -1, -1);
+			writel(0, FLDTCNTR(flctl));	/* set 0 size */
+			start_translation(flctl);
+			wait_completion(flctl);
+		}
+		if (flctl->hwecc) {
+			/* write page with hwecc */
+			if (flctl->seqin_column == mtd->writesize)
+				execmd_write_oob(mtd);
+			else if (!flctl->seqin_column)
+				execmd_write_page_sector(mtd);
+			else
+				printk(KERN_ERR "Invalid address !?\n");
+			break;
+		}
+		set_cmd_regs(mtd, command, (command << 8) | NAND_CMD_SEQIN);
+		set_addr(mtd, flctl->seqin_column, flctl->seqin_page_addr);
+		writel(flctl->index, FLDTCNTR(flctl));	/* set write size */
+		start_translation(flctl);
+		write_fiforeg(flctl, flctl->index, 0);
+		wait_completion(flctl);
+		break;
+
+	case NAND_CMD_STATUS:
+		set_cmd_regs(mtd, command, command);
+		set_addr(mtd, -1, -1);
+
+		flctl->read_bytes = 1;
+		writel(flctl->read_bytes, FLDTCNTR(flctl)); /* set read size */
+		start_translation(flctl);
+		read_datareg(flctl, 0); /* read and end */
+		break;
+
+	case NAND_CMD_RESET:
+		set_cmd_regs(mtd, command, command);
+		set_addr(mtd, -1, -1);
+
+		writel(0, FLDTCNTR(flctl));	/* set 0 size */
+		start_translation(flctl);
+		wait_completion(flctl);
+		break;
+
+	default:
+		break;
+	}
+	return;
+
+read_normal_exit:
+	writel(flctl->read_bytes, FLDTCNTR(flctl));	/* set read size */
+	start_translation(flctl);
+	read_fiforeg(flctl, flctl->read_bytes, 0);
+	wait_completion(flctl);
+	return;
+}
+
+static void flctl_select_chip(struct mtd_info *mtd, int chipnr)
+{
+	struct sh_flctl *flctl = mtd_to_flctl(mtd);
+	uint32_t flcmncr_val = readl(FLCMNCR(flctl));
+
+	switch (chipnr) {
+	case -1:
+		flcmncr_val &= ~CE0_ENABLE;
+		writel(flcmncr_val, FLCMNCR(flctl));
+		break;
+	case 0:
+		flcmncr_val |= CE0_ENABLE;
+		writel(flcmncr_val, FLCMNCR(flctl));
+		break;
+	default:
+		BUG();
+	}
+}
+
+static void flctl_write_buf(struct mtd_info *mtd, const uint8_t *buf, int len)
+{
+	struct sh_flctl *flctl = mtd_to_flctl(mtd);
+	int i, index = flctl->index;
+
+	for (i = 0; i < len; i++)
+		flctl->done_buff[index + i] = buf[i];
+	flctl->index += len;
+}
+
+static uint8_t flctl_read_byte(struct mtd_info *mtd)
+{
+	struct sh_flctl *flctl = mtd_to_flctl(mtd);
+	int index = flctl->index;
+	uint8_t data;
+
+	data = flctl->done_buff[index];
+	flctl->index++;
+	return data;
+}
+
+static void flctl_read_buf(struct mtd_info *mtd, uint8_t *buf, int len)
+{
+	int i;
+
+	for (i = 0; i < len; i++)
+		buf[i] = flctl_read_byte(mtd);
+}
+
+static int flctl_verify_buf(struct mtd_info *mtd, const u_char *buf, int len)
+{
+	int i;
+
+	for (i = 0; i < len; i++)
+		if (buf[i] != flctl_read_byte(mtd))
+			return -EFAULT;
+	return 0;
+}
+
+static void flctl_register_init(struct sh_flctl *flctl, unsigned long val)
+{
+	writel(val, FLCMNCR(flctl));
+}
+
+static int flctl_chip_init_tail(struct mtd_info *mtd)
+{
+	struct sh_flctl *flctl = mtd_to_flctl(mtd);
+	struct nand_chip *chip = &flctl->chip;
+
+	if (mtd->writesize == 512) {
+		flctl->page_size = 0;
+		if (chip->chipsize > (32 << 20)) {
+			/* big than 32MB */
+			flctl->rw_ADRCNT = ADRCNT_4;
+			flctl->erase_ADRCNT = ADRCNT_3;
+		} else if (chip->chipsize > (2 << 16)) {
+			/* big than 128KB */
+			flctl->rw_ADRCNT = ADRCNT_3;
+			flctl->erase_ADRCNT = ADRCNT_2;
+		} else {
+			flctl->rw_ADRCNT = ADRCNT_2;
+			flctl->erase_ADRCNT = ADRCNT_1;
+		}
+	} else {
+		flctl->page_size = 1;
+		if (chip->chipsize > (128 << 20)) {
+			/* big than 128MB */
+			flctl->rw_ADRCNT = ADRCNT2_E;
+			flctl->erase_ADRCNT = ADRCNT_3;
+		} else if (chip->chipsize > (8 << 16)) {
+			/* big than 512KB */
+			flctl->rw_ADRCNT = ADRCNT_4;
+			flctl->erase_ADRCNT = ADRCNT_2;
+		} else {
+			flctl->rw_ADRCNT = ADRCNT_3;
+			flctl->erase_ADRCNT = ADRCNT_1;
+		}
+	}
+
+	if (flctl->hwecc) {
+		if (mtd->writesize == 512) {
+			chip->ecc.layout = &flctl_4secc_oob_16;
+			chip->badblock_pattern = &flctl_4secc_smallpage;
+		} else {
+			chip->ecc.layout = &flctl_4secc_oob_64;
+			chip->badblock_pattern = &flctl_4secc_largepage;
+		}
+
+		chip->ecc.size = 512;
+		chip->ecc.bytes = 10;
+		chip->ecc.read_page = flctl_read_page_hwecc;
+		chip->ecc.write_page = flctl_write_page_hwecc;
+		chip->ecc.mode = NAND_ECC_HW;
+
+		/* 4 symbols ECC enabled */
+		writel(readl(FLCMNCR(flctl)) | _4ECCEN | ECCPOS2 | ECCPOS_02,
+				FLCMNCR(flctl));
+	} else {
+		chip->ecc.mode = NAND_ECC_SOFT;
+	}
+
+	return 0;
+}
+
+static int __init flctl_probe(struct platform_device *pdev)
+{
+	struct resource *res;
+	struct sh_flctl *flctl;
+	struct mtd_info *flctl_mtd;
+	struct nand_chip *nand;
+	struct sh_flctl_platform_data *pdata;
+	int ret;
+
+	pdata = pdev->dev.platform_data;
+	if (pdata == NULL) {
+		printk(KERN_ERR "sh_flctl platform_data not found.\n");
+		return -ENODEV;
+	}
+
+	flctl = kzalloc(sizeof(struct sh_flctl), GFP_KERNEL);
+	if (!flctl) {
+		printk(KERN_ERR "Unable to allocate NAND MTD dev structure.\n");
+		return -ENOMEM;
+	}
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		printk(KERN_ERR "%s: resource not found.\n", __func__);
+		ret = -ENODEV;
+		goto err;
+	}
+
+	flctl->reg = ioremap(res->start, res->end - res->start + 1);
+	if (flctl->reg == NULL) {
+		printk(KERN_ERR "%s: ioremap error.\n", __func__);
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	platform_set_drvdata(pdev, flctl);
+	flctl_mtd = &flctl->mtd;
+	nand = &flctl->chip;
+	flctl_mtd->priv = nand;
+	flctl->hwecc = pdata->has_hwecc;
+
+	flctl_register_init(flctl, pdata->flcmncr_val);
+
+	nand->options = NAND_NO_AUTOINCR;
+
+	/* Set address of hardware control function */
+	/* 20 us command delay time */
+	nand->chip_delay = 20;
+
+	nand->read_byte = flctl_read_byte;
+	nand->write_buf = flctl_write_buf;
+	nand->read_buf = flctl_read_buf;
+	nand->verify_buf = flctl_verify_buf;
+	nand->select_chip = flctl_select_chip;
+	nand->cmdfunc = flctl_cmdfunc;
+
+	ret = nand_scan_ident(flctl_mtd, 1);
+	if (ret)
+		goto err;
+
+	ret = flctl_chip_init_tail(flctl_mtd);
+	if (ret)
+		goto err;
+
+	ret = nand_scan_tail(flctl_mtd);
+	if (ret)
+		goto err;
+
+	add_mtd_partitions(flctl_mtd, pdata->parts, pdata->nr_parts);
+
+	return 0;
+
+err:
+	kfree(flctl);
+	return ret;
+}
+
+static int __exit flctl_remove(struct platform_device *pdev)
+{
+	struct sh_flctl *flctl = platform_get_drvdata(pdev);
+
+	nand_release(&flctl->mtd);
+	kfree(flctl);
+
+	return 0;
+}
+
+static struct platform_driver flctl_driver = {
+	.probe		= flctl_probe,
+	.remove		= flctl_remove,
+	.driver = {
+		.name	= "sh_flctl",
+		.owner	= THIS_MODULE,
+	},
+};
+
+static int __init flctl_nand_init(void)
+{
+	return platform_driver_register(&flctl_driver);
+}
+
+static void __exit flctl_nand_cleanup(void)
+{
+	platform_driver_unregister(&flctl_driver);
+}
+
+module_init(flctl_nand_init);
+module_exit(flctl_nand_cleanup);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Yoshihiro Shimoda");
+MODULE_DESCRIPTION("SuperH FLCTL driver");
+MODULE_ALIAS("platform:sh_flctl");
diff --git a/drivers/mtd/nand/toto.c b/drivers/mtd/nand/toto.c
deleted file mode 100644
index bbf492e6830d..000000000000
--- a/drivers/mtd/nand/toto.c
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- *  drivers/mtd/nand/toto.c
- *
- *  Copyright (c) 2003 Texas Instruments
- *
- *  Derived from drivers/mtd/autcpu12.c
- *
- *  Copyright (c) 2002 Thomas Gleixner <tgxl@linutronix.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- *  Overview:
- *   This is a device driver for the NAND flash device found on the
- *   TI fido board. It supports 32MiB and 64MiB cards
- */
-
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/delay.h>
-#include <linux/mtd/mtd.h>
-#include <linux/mtd/nand.h>
-#include <linux/mtd/partitions.h>
-#include <asm/io.h>
-#include <asm/arch/hardware.h>
-#include <asm/sizes.h>
-#include <asm/arch/toto.h>
-#include <asm/arch-omap1510/hardware.h>
-#include <asm/arch/gpio.h>
-
-#define CONFIG_NAND_WORKAROUND 1
-
-/*
- * MTD structure for TOTO board
- */
-static struct mtd_info *toto_mtd = NULL;
-
-static unsigned long toto_io_base = OMAP_FLASH_1_BASE;
-
-/*
- * Define partitions for flash devices
- */
-
-static struct mtd_partition partition_info64M[] = {
-	{ .name =	"toto kernel partition 1",
-	  .offset =	0,
-	  .size	=	2 * SZ_1M },
-	{ .name =	"toto file sys partition 2",
-	  .offset =	2 * SZ_1M,
-	  .size =	14 * SZ_1M },
-	{ .name =	"toto user partition 3",
-	  .offset =	16 * SZ_1M,
-	  .size =	16 * SZ_1M },
-	{ .name =	"toto devboard extra partition 4",
-	  .offset =	32 * SZ_1M,
-	  .size =	32 * SZ_1M },
-};
-
-static struct mtd_partition partition_info32M[] = {
-	{ .name =	"toto kernel partition 1",
-	  .offset =	0,
-	  .size =	2 * SZ_1M },
-	{ .name =	"toto file sys partition 2",
-	  .offset =	2 * SZ_1M,
-	  .size =	14 * SZ_1M },
-	{ .name =	"toto user partition 3",
-	  .offset =	16 * SZ_1M,
-	  .size =	16 * SZ_1M },
-};
-
-#define NUM_PARTITIONS32M 3
-#define NUM_PARTITIONS64M 4
-
-/*
- *	hardware specific access to control-lines
- *
- *	ctrl:
- *	NAND_NCE: bit 0 -> bit 14 (0x4000)
- *	NAND_CLE: bit 1 -> bit 12 (0x1000)
- *	NAND_ALE: bit 2 -> bit 1  (0x0002)
- */
-static void toto_hwcontrol(struct mtd_info *mtd, int cmd,
-			   unsigned int ctrl)
-{
-	struct nand_chip *chip = mtd->priv;
-
-	if (ctrl & NAND_CTRL_CHANGE) {
-		unsigned long bits;
-
-		/* hopefully enough time for tc make proceding write to clear */
-		udelay(1);
-
-		bits = (~ctrl & NAND_NCE) << 14;
-		bits |= (ctrl & NAND_CLE) << 12;
-		bits |= (ctrl & NAND_ALE) >> 1;
-
-#warning Wild guess as gpiosetout() is nowhere defined in the kernel source - tglx
-		gpiosetout(0x5002, bits);
-
-#ifdef CONFIG_NAND_WORKAROUND
-		/* "some" dev boards busted, blue wired to rts2 :( */
-		rts2setout(2, (ctrl & NAND_CLE) << 1);
-#endif
-		/* allow time to ensure gpio state to over take memory write */
-		udelay(1);
-	}
-
-	if (cmd != NAND_CMD_NONE)
-		writeb(cmd, chip->IO_ADDR_W);
-}
-
-/*
- * Main initialization routine
- */
-static int __init toto_init(void)
-{
-	struct nand_chip *this;
-	int err = 0;
-
-	/* Allocate memory for MTD device structure and private data */
-	toto_mtd = kmalloc(sizeof(struct mtd_info) + sizeof(struct nand_chip), GFP_KERNEL);
-	if (!toto_mtd) {
-		printk(KERN_WARNING "Unable to allocate toto NAND MTD device structure.\n");
-		err = -ENOMEM;
-		goto out;
-	}
-
-	/* Get pointer to private data */
-	this = (struct nand_chip *)(&toto_mtd[1]);
-
-	/* Initialize structures */
-	memset(toto_mtd, 0, sizeof(struct mtd_info));
-	memset(this, 0, sizeof(struct nand_chip));
-
-	/* Link the private data with the MTD structure */
-	toto_mtd->priv = this;
-	toto_mtd->owner = THIS_MODULE;
-
-	/* Set address of NAND IO lines */
-	this->IO_ADDR_R = toto_io_base;
-	this->IO_ADDR_W = toto_io_base;
-	this->cmd_ctrl = toto_hwcontrol;
-	this->dev_ready = NULL;
-	/* 25 us command delay time */
-	this->chip_delay = 30;
-	this->ecc.mode = NAND_ECC_SOFT;
-
-	/* Scan to find existance of the device */
-	if (nand_scan(toto_mtd, 1)) {
-		err = -ENXIO;
-		goto out_mtd;
-	}
-
-	/* Register the partitions */
-	switch (toto_mtd->size) {
-	case SZ_64M:
-		add_mtd_partitions(toto_mtd, partition_info64M, NUM_PARTITIONS64M);
-		break;
-	case SZ_32M:
-		add_mtd_partitions(toto_mtd, partition_info32M, NUM_PARTITIONS32M);
-		break;
-	default:{
-			printk(KERN_WARNING "Unsupported Nand device\n");
-			err = -ENXIO;
-			goto out_buf;
-		}
-	}
-
-	gpioreserve(NAND_MASK);	/* claim our gpios */
-	archflashwp(0, 0);	/* open up flash for writing */
-
-	goto out;
-
- out_mtd:
-	kfree(toto_mtd);
- out:
-	return err;
-}
-
-module_init(toto_init);
-
-/*
- * Clean up routine
- */
-static void __exit toto_cleanup(void)
-{
-	/* Release resources, unregister device */
-	nand_release(toto_mtd);
-
-	/* Free the MTD device structure */
-	kfree(toto_mtd);
-
-	/* stop flash writes */
-	archflashwp(0, 1);
-
-	/* release gpios to system */
-	gpiorelease(NAND_MASK);
-}
-
-module_exit(toto_cleanup);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Richard Woodruff <r-woodruff2@ti.com>");
-MODULE_DESCRIPTION("Glue layer for NAND flash on toto board");
diff --git a/drivers/mtd/ofpart.c b/drivers/mtd/ofpart.c
index 4f80c2fd89af..9e45b3f39c0e 100644
--- a/drivers/mtd/ofpart.c
+++ b/drivers/mtd/ofpart.c
@@ -20,7 +20,6 @@
 #include <linux/mtd/partitions.h>
 
 int __devinit of_mtd_parse_partitions(struct device *dev,
-                                      struct mtd_info *mtd,
                                       struct device_node *node,
                                       struct mtd_partition **pparts)
 {
diff --git a/drivers/mtd/onenand/Kconfig b/drivers/mtd/onenand/Kconfig
index cb41cbca64f7..79fa79e8f8de 100644
--- a/drivers/mtd/onenand/Kconfig
+++ b/drivers/mtd/onenand/Kconfig
@@ -27,8 +27,16 @@ config MTD_ONENAND_GENERIC
 	help
 	  Support for OneNAND flash via platform device driver.
 
+config MTD_ONENAND_OMAP2
+	tristate "OneNAND on OMAP2/OMAP3 support"
+	depends on MTD_ONENAND && (ARCH_OMAP2 || ARCH_OMAP3)
+	help
+	  Support for a OneNAND flash device connected to an OMAP2/OMAP3 CPU
+	  via the GPMC memory controller.
+
 config MTD_ONENAND_OTP
 	bool "OneNAND OTP Support"
+	select HAVE_MTD_OTP
 	help
 	  One Block of the NAND Flash Array memory is reserved as
 	  a One-Time Programmable Block memory area.
diff --git a/drivers/mtd/onenand/Makefile b/drivers/mtd/onenand/Makefile
index 4d2eacfd7e11..64b6cc61a520 100644
--- a/drivers/mtd/onenand/Makefile
+++ b/drivers/mtd/onenand/Makefile
@@ -7,6 +7,7 @@ obj-$(CONFIG_MTD_ONENAND)		+= onenand.o
 
 # Board specific.
 obj-$(CONFIG_MTD_ONENAND_GENERIC)	+= generic.o
+obj-$(CONFIG_MTD_ONENAND_OMAP2)		+= omap2.o
 
 # Simulator
 obj-$(CONFIG_MTD_ONENAND_SIM)		+= onenand_sim.o
diff --git a/drivers/mtd/onenand/omap2.c b/drivers/mtd/onenand/omap2.c
new file mode 100644
index 000000000000..8387e05daae2
--- /dev/null
+++ b/drivers/mtd/onenand/omap2.c
@@ -0,0 +1,802 @@
+/*
+ *  linux/drivers/mtd/onenand/omap2.c
+ *
+ *  OneNAND driver for OMAP2 / OMAP3
+ *
+ *  Copyright © 2005-2006 Nokia Corporation
+ *
+ *  Author: Jarkko Lavinen <jarkko.lavinen@nokia.com> and Juha Yrjölä
+ *  IRQ and DMA support written by Timo Teras
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; see the file COPYING. If not, write to the Free Software
+ * Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/onenand.h>
+#include <linux/mtd/partitions.h>
+#include <linux/platform_device.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+
+#include <asm/io.h>
+#include <asm/mach/flash.h>
+#include <asm/arch/gpmc.h>
+#include <asm/arch/onenand.h>
+#include <asm/arch/gpio.h>
+#include <asm/arch/gpmc.h>
+#include <asm/arch/pm.h>
+
+#include <linux/dma-mapping.h>
+#include <asm/dma-mapping.h>
+#include <asm/arch/dma.h>
+
+#include <asm/arch/board.h>
+
+#define DRIVER_NAME "omap2-onenand"
+
+#define ONENAND_IO_SIZE		SZ_128K
+#define ONENAND_BUFRAM_SIZE	(1024 * 5)
+
+struct omap2_onenand {
+	struct platform_device *pdev;
+	int gpmc_cs;
+	unsigned long phys_base;
+	int gpio_irq;
+	struct mtd_info mtd;
+	struct mtd_partition *parts;
+	struct onenand_chip onenand;
+	struct completion irq_done;
+	struct completion dma_done;
+	int dma_channel;
+	int freq;
+	int (*setup)(void __iomem *base, int freq);
+};
+
+static void omap2_onenand_dma_cb(int lch, u16 ch_status, void *data)
+{
+	struct omap2_onenand *c = data;
+
+	complete(&c->dma_done);
+}
+
+static irqreturn_t omap2_onenand_interrupt(int irq, void *dev_id)
+{
+	struct omap2_onenand *c = dev_id;
+
+	complete(&c->irq_done);
+
+	return IRQ_HANDLED;
+}
+
+static inline unsigned short read_reg(struct omap2_onenand *c, int reg)
+{
+	return readw(c->onenand.base + reg);
+}
+
+static inline void write_reg(struct omap2_onenand *c, unsigned short value,
+			     int reg)
+{
+	writew(value, c->onenand.base + reg);
+}
+
+static void wait_err(char *msg, int state, unsigned int ctrl, unsigned int intr)
+{
+	printk(KERN_ERR "onenand_wait: %s! state %d ctrl 0x%04x intr 0x%04x\n",
+	       msg, state, ctrl, intr);
+}
+
+static void wait_warn(char *msg, int state, unsigned int ctrl,
+		      unsigned int intr)
+{
+	printk(KERN_WARNING "onenand_wait: %s! state %d ctrl 0x%04x "
+	       "intr 0x%04x\n", msg, state, ctrl, intr);
+}
+
+static int omap2_onenand_wait(struct mtd_info *mtd, int state)
+{
+	struct omap2_onenand *c = container_of(mtd, struct omap2_onenand, mtd);
+	unsigned int intr = 0;
+	unsigned int ctrl;
+	unsigned long timeout;
+	u32 syscfg;
+
+	if (state == FL_RESETING) {
+		int i;
+
+		for (i = 0; i < 20; i++) {
+			udelay(1);
+			intr = read_reg(c, ONENAND_REG_INTERRUPT);
+			if (intr & ONENAND_INT_MASTER)
+				break;
+		}
+		ctrl = read_reg(c, ONENAND_REG_CTRL_STATUS);
+		if (ctrl & ONENAND_CTRL_ERROR) {
+			wait_err("controller error", state, ctrl, intr);
+			return -EIO;
+		}
+		if (!(intr & ONENAND_INT_RESET)) {
+			wait_err("timeout", state, ctrl, intr);
+			return -EIO;
+		}
+		return 0;
+	}
+
+	if (state != FL_READING) {
+		int result;
+
+		/* Turn interrupts on */
+		syscfg = read_reg(c, ONENAND_REG_SYS_CFG1);
+		if (!(syscfg & ONENAND_SYS_CFG1_IOBE)) {
+			syscfg |= ONENAND_SYS_CFG1_IOBE;
+			write_reg(c, syscfg, ONENAND_REG_SYS_CFG1);
+			if (cpu_is_omap34xx())
+				/* Add a delay to let GPIO settle */
+				syscfg = read_reg(c, ONENAND_REG_SYS_CFG1);
+		}
+
+		INIT_COMPLETION(c->irq_done);
+		if (c->gpio_irq) {
+			result = omap_get_gpio_datain(c->gpio_irq);
+			if (result == -1) {
+				ctrl = read_reg(c, ONENAND_REG_CTRL_STATUS);
+				intr = read_reg(c, ONENAND_REG_INTERRUPT);
+				wait_err("gpio error", state, ctrl, intr);
+				return -EIO;
+			}
+		} else
+			result = 0;
+		if (result == 0) {
+			int retry_cnt = 0;
+retry:
+			result = wait_for_completion_timeout(&c->irq_done,
+						    msecs_to_jiffies(20));
+			if (result == 0) {
+				/* Timeout after 20ms */
+				ctrl = read_reg(c, ONENAND_REG_CTRL_STATUS);
+				if (ctrl & ONENAND_CTRL_ONGO) {
+					/*
+					 * The operation seems to be still going
+					 * so give it some more time.
+					 */
+					retry_cnt += 1;
+					if (retry_cnt < 3)
+						goto retry;
+					intr = read_reg(c,
+							ONENAND_REG_INTERRUPT);
+					wait_err("timeout", state, ctrl, intr);
+					return -EIO;
+				}
+				intr = read_reg(c, ONENAND_REG_INTERRUPT);
+				if ((intr & ONENAND_INT_MASTER) == 0)
+					wait_warn("timeout", state, ctrl, intr);
+			}
+		}
+	} else {
+		int retry_cnt = 0;
+
+		/* Turn interrupts off */
+		syscfg = read_reg(c, ONENAND_REG_SYS_CFG1);
+		syscfg &= ~ONENAND_SYS_CFG1_IOBE;
+		write_reg(c, syscfg, ONENAND_REG_SYS_CFG1);
+
+		timeout = jiffies + msecs_to_jiffies(20);
+		while (1) {
+			if (time_before(jiffies, timeout)) {
+				intr = read_reg(c, ONENAND_REG_INTERRUPT);
+				if (intr & ONENAND_INT_MASTER)
+					break;
+			} else {
+				/* Timeout after 20ms */
+				ctrl = read_reg(c, ONENAND_REG_CTRL_STATUS);
+				if (ctrl & ONENAND_CTRL_ONGO) {
+					/*
+					 * The operation seems to be still going
+					 * so give it some more time.
+					 */
+					retry_cnt += 1;
+					if (retry_cnt < 3) {
+						timeout = jiffies +
+							  msecs_to_jiffies(20);
+						continue;
+					}
+				}
+				break;
+			}
+		}
+	}
+
+	intr = read_reg(c, ONENAND_REG_INTERRUPT);
+	ctrl = read_reg(c, ONENAND_REG_CTRL_STATUS);
+
+	if (intr & ONENAND_INT_READ) {
+		int ecc = read_reg(c, ONENAND_REG_ECC_STATUS);
+
+		if (ecc) {
+			unsigned int addr1, addr8;
+
+			addr1 = read_reg(c, ONENAND_REG_START_ADDRESS1);
+			addr8 = read_reg(c, ONENAND_REG_START_ADDRESS8);
+			if (ecc & ONENAND_ECC_2BIT_ALL) {
+				printk(KERN_ERR "onenand_wait: ECC error = "
+				       "0x%04x, addr1 %#x, addr8 %#x\n",
+				       ecc, addr1, addr8);
+				mtd->ecc_stats.failed++;
+				return -EBADMSG;
+			} else if (ecc & ONENAND_ECC_1BIT_ALL) {
+				printk(KERN_NOTICE "onenand_wait: correctable "
+				       "ECC error = 0x%04x, addr1 %#x, "
+				       "addr8 %#x\n", ecc, addr1, addr8);
+				mtd->ecc_stats.corrected++;
+			}
+		}
+	} else if (state == FL_READING) {
+		wait_err("timeout", state, ctrl, intr);
+		return -EIO;
+	}
+
+	if (ctrl & ONENAND_CTRL_ERROR) {
+		wait_err("controller error", state, ctrl, intr);
+		if (ctrl & ONENAND_CTRL_LOCK)
+			printk(KERN_ERR "onenand_wait: "
+					"Device is write protected!!!\n");
+		return -EIO;
+	}
+
+	if (ctrl & 0xFE9F)
+		wait_warn("unexpected controller status", state, ctrl, intr);
+
+	return 0;
+}
+
+static inline int omap2_onenand_bufferram_offset(struct mtd_info *mtd, int area)
+{
+	struct onenand_chip *this = mtd->priv;
+
+	if (ONENAND_CURRENT_BUFFERRAM(this)) {
+		if (area == ONENAND_DATARAM)
+			return mtd->writesize;
+		if (area == ONENAND_SPARERAM)
+			return mtd->oobsize;
+	}
+
+	return 0;
+}
+
+#if defined(CONFIG_ARCH_OMAP3) || defined(MULTI_OMAP2)
+
+static int omap3_onenand_read_bufferram(struct mtd_info *mtd, int area,
+					unsigned char *buffer, int offset,
+					size_t count)
+{
+	struct omap2_onenand *c = container_of(mtd, struct omap2_onenand, mtd);
+	struct onenand_chip *this = mtd->priv;
+	dma_addr_t dma_src, dma_dst;
+	int bram_offset;
+	unsigned long timeout;
+	void *buf = (void *)buffer;
+	size_t xtra;
+	volatile unsigned *done;
+
+	bram_offset = omap2_onenand_bufferram_offset(mtd, area) + area + offset;
+	if (bram_offset & 3 || (size_t)buf & 3 || count < 384)
+		goto out_copy;
+
+	if (buf >= high_memory) {
+		struct page *p1;
+
+		if (((size_t)buf & PAGE_MASK) !=
+		    ((size_t)(buf + count - 1) & PAGE_MASK))
+			goto out_copy;
+		p1 = vmalloc_to_page(buf);
+		if (!p1)
+			goto out_copy;
+		buf = page_address(p1) + ((size_t)buf & ~PAGE_MASK);
+	}
+
+	xtra = count & 3;
+	if (xtra) {
+		count -= xtra;
+		memcpy(buf + count, this->base + bram_offset + count, xtra);
+	}
+
+	dma_src = c->phys_base + bram_offset;
+	dma_dst = dma_map_single(&c->pdev->dev, buf, count, DMA_FROM_DEVICE);
+	if (dma_mapping_error(&c->pdev->dev, dma_dst)) {
+		dev_err(&c->pdev->dev,
+			"Couldn't DMA map a %d byte buffer\n",
+			count);
+		goto out_copy;
+	}
+
+	omap_set_dma_transfer_params(c->dma_channel, OMAP_DMA_DATA_TYPE_S32,
+				     count >> 2, 1, 0, 0, 0);
+	omap_set_dma_src_params(c->dma_channel, 0, OMAP_DMA_AMODE_POST_INC,
+				dma_src, 0, 0);
+	omap_set_dma_dest_params(c->dma_channel, 0, OMAP_DMA_AMODE_POST_INC,
+				 dma_dst, 0, 0);
+
+	INIT_COMPLETION(c->dma_done);
+	omap_start_dma(c->dma_channel);
+
+	timeout = jiffies + msecs_to_jiffies(20);
+	done = &c->dma_done.done;
+	while (time_before(jiffies, timeout))
+		if (*done)
+			break;
+
+	dma_unmap_single(&c->pdev->dev, dma_dst, count, DMA_FROM_DEVICE);
+
+	if (!*done) {
+		dev_err(&c->pdev->dev, "timeout waiting for DMA\n");
+		goto out_copy;
+	}
+
+	return 0;
+
+out_copy:
+	memcpy(buf, this->base + bram_offset, count);
+	return 0;
+}
+
+static int omap3_onenand_write_bufferram(struct mtd_info *mtd, int area,
+					 const unsigned char *buffer,
+					 int offset, size_t count)
+{
+	struct omap2_onenand *c = container_of(mtd, struct omap2_onenand, mtd);
+	struct onenand_chip *this = mtd->priv;
+	dma_addr_t dma_src, dma_dst;
+	int bram_offset;
+	unsigned long timeout;
+	void *buf = (void *)buffer;
+	volatile unsigned *done;
+
+	bram_offset = omap2_onenand_bufferram_offset(mtd, area) + area + offset;
+	if (bram_offset & 3 || (size_t)buf & 3 || count < 384)
+		goto out_copy;
+
+	/* panic_write() may be in an interrupt context */
+	if (in_interrupt())
+		goto out_copy;
+
+	if (buf >= high_memory) {
+		struct page *p1;
+
+		if (((size_t)buf & PAGE_MASK) !=
+		    ((size_t)(buf + count - 1) & PAGE_MASK))
+			goto out_copy;
+		p1 = vmalloc_to_page(buf);
+		if (!p1)
+			goto out_copy;
+		buf = page_address(p1) + ((size_t)buf & ~PAGE_MASK);
+	}
+
+	dma_src = dma_map_single(&c->pdev->dev, buf, count, DMA_TO_DEVICE);
+	dma_dst = c->phys_base + bram_offset;
+	if (dma_mapping_error(&c->pdev->dev, dma_dst)) {
+		dev_err(&c->pdev->dev,
+			"Couldn't DMA map a %d byte buffer\n",
+			count);
+		return -1;
+	}
+
+	omap_set_dma_transfer_params(c->dma_channel, OMAP_DMA_DATA_TYPE_S32,
+				     count >> 2, 1, 0, 0, 0);
+	omap_set_dma_src_params(c->dma_channel, 0, OMAP_DMA_AMODE_POST_INC,
+				dma_src, 0, 0);
+	omap_set_dma_dest_params(c->dma_channel, 0, OMAP_DMA_AMODE_POST_INC,
+				 dma_dst, 0, 0);
+
+	INIT_COMPLETION(c->dma_done);
+	omap_start_dma(c->dma_channel);
+
+	timeout = jiffies + msecs_to_jiffies(20);
+	done = &c->dma_done.done;
+	while (time_before(jiffies, timeout))
+		if (*done)
+			break;
+
+	dma_unmap_single(&c->pdev->dev, dma_dst, count, DMA_TO_DEVICE);
+
+	if (!*done) {
+		dev_err(&c->pdev->dev, "timeout waiting for DMA\n");
+		goto out_copy;
+	}
+
+	return 0;
+
+out_copy:
+	memcpy(this->base + bram_offset, buf, count);
+	return 0;
+}
+
+#else
+
+int omap3_onenand_read_bufferram(struct mtd_info *mtd, int area,
+				 unsigned char *buffer, int offset,
+				 size_t count);
+
+int omap3_onenand_write_bufferram(struct mtd_info *mtd, int area,
+				  const unsigned char *buffer,
+				  int offset, size_t count);
+
+#endif
+
+#if defined(CONFIG_ARCH_OMAP2) || defined(MULTI_OMAP2)
+
+static int omap2_onenand_read_bufferram(struct mtd_info *mtd, int area,
+					unsigned char *buffer, int offset,
+					size_t count)
+{
+	struct omap2_onenand *c = container_of(mtd, struct omap2_onenand, mtd);
+	struct onenand_chip *this = mtd->priv;
+	dma_addr_t dma_src, dma_dst;
+	int bram_offset;
+
+	bram_offset = omap2_onenand_bufferram_offset(mtd, area) + area + offset;
+	/* DMA is not used.  Revisit PM requirements before enabling it. */
+	if (1 || (c->dma_channel < 0) ||
+	    ((void *) buffer >= (void *) high_memory) || (bram_offset & 3) ||
+	    (((unsigned int) buffer) & 3) || (count < 1024) || (count & 3)) {
+		memcpy(buffer, (__force void *)(this->base + bram_offset),
+		       count);
+		return 0;
+	}
+
+	dma_src = c->phys_base + bram_offset;
+	dma_dst = dma_map_single(&c->pdev->dev, buffer, count,
+				 DMA_FROM_DEVICE);
+	if (dma_mapping_error(&c->pdev->dev, dma_dst)) {
+		dev_err(&c->pdev->dev,
+			"Couldn't DMA map a %d byte buffer\n",
+			count);
+		return -1;
+	}
+
+	omap_set_dma_transfer_params(c->dma_channel, OMAP_DMA_DATA_TYPE_S32,
+				     count / 4, 1, 0, 0, 0);
+	omap_set_dma_src_params(c->dma_channel, 0, OMAP_DMA_AMODE_POST_INC,
+				dma_src, 0, 0);
+	omap_set_dma_dest_params(c->dma_channel, 0, OMAP_DMA_AMODE_POST_INC,
+				 dma_dst, 0, 0);
+
+	INIT_COMPLETION(c->dma_done);
+	omap_start_dma(c->dma_channel);
+	wait_for_completion(&c->dma_done);
+
+	dma_unmap_single(&c->pdev->dev, dma_dst, count, DMA_FROM_DEVICE);
+
+	return 0;
+}
+
+static int omap2_onenand_write_bufferram(struct mtd_info *mtd, int area,
+					 const unsigned char *buffer,
+					 int offset, size_t count)
+{
+	struct omap2_onenand *c = container_of(mtd, struct omap2_onenand, mtd);
+	struct onenand_chip *this = mtd->priv;
+	dma_addr_t dma_src, dma_dst;
+	int bram_offset;
+
+	bram_offset = omap2_onenand_bufferram_offset(mtd, area) + area + offset;
+	/* DMA is not used.  Revisit PM requirements before enabling it. */
+	if (1 || (c->dma_channel < 0) ||
+	    ((void *) buffer >= (void *) high_memory) || (bram_offset & 3) ||
+	    (((unsigned int) buffer) & 3) || (count < 1024) || (count & 3)) {
+		memcpy((__force void *)(this->base + bram_offset), buffer,
+		       count);
+		return 0;
+	}
+
+	dma_src = dma_map_single(&c->pdev->dev, (void *) buffer, count,
+				 DMA_TO_DEVICE);
+	dma_dst = c->phys_base + bram_offset;
+	if (dma_mapping_error(&c->pdev->dev, dma_dst)) {
+		dev_err(&c->pdev->dev,
+			"Couldn't DMA map a %d byte buffer\n",
+			count);
+		return -1;
+	}
+
+	omap_set_dma_transfer_params(c->dma_channel, OMAP_DMA_DATA_TYPE_S16,
+				     count / 2, 1, 0, 0, 0);
+	omap_set_dma_src_params(c->dma_channel, 0, OMAP_DMA_AMODE_POST_INC,
+				dma_src, 0, 0);
+	omap_set_dma_dest_params(c->dma_channel, 0, OMAP_DMA_AMODE_POST_INC,
+				 dma_dst, 0, 0);
+
+	INIT_COMPLETION(c->dma_done);
+	omap_start_dma(c->dma_channel);
+	wait_for_completion(&c->dma_done);
+
+	dma_unmap_single(&c->pdev->dev, dma_dst, count, DMA_TO_DEVICE);
+
+	return 0;
+}
+
+#else
+
+int omap2_onenand_read_bufferram(struct mtd_info *mtd, int area,
+				 unsigned char *buffer, int offset,
+				 size_t count);
+
+int omap2_onenand_write_bufferram(struct mtd_info *mtd, int area,
+				  const unsigned char *buffer,
+				  int offset, size_t count);
+
+#endif
+
+static struct platform_driver omap2_onenand_driver;
+
+static int __adjust_timing(struct device *dev, void *data)
+{
+	int ret = 0;
+	struct omap2_onenand *c;
+
+	c = dev_get_drvdata(dev);
+
+	BUG_ON(c->setup == NULL);
+
+	/* DMA is not in use so this is all that is needed */
+	/* Revisit for OMAP3! */
+	ret = c->setup(c->onenand.base, c->freq);
+
+	return ret;
+}
+
+int omap2_onenand_rephase(void)
+{
+	return driver_for_each_device(&omap2_onenand_driver.driver, NULL,
+				      NULL, __adjust_timing);
+}
+
+static void __devexit omap2_onenand_shutdown(struct platform_device *pdev)
+{
+	struct omap2_onenand *c = dev_get_drvdata(&pdev->dev);
+
+	/* With certain content in the buffer RAM, the OMAP boot ROM code
+	 * can recognize the flash chip incorrectly. Zero it out before
+	 * soft reset.
+	 */
+	memset((__force void *)c->onenand.base, 0, ONENAND_BUFRAM_SIZE);
+}
+
+static int __devinit omap2_onenand_probe(struct platform_device *pdev)
+{
+	struct omap_onenand_platform_data *pdata;
+	struct omap2_onenand *c;
+	int r;
+
+	pdata = pdev->dev.platform_data;
+	if (pdata == NULL) {
+		dev_err(&pdev->dev, "platform data missing\n");
+		return -ENODEV;
+	}
+
+	c = kzalloc(sizeof(struct omap2_onenand), GFP_KERNEL);
+	if (!c)
+		return -ENOMEM;
+
+	init_completion(&c->irq_done);
+	init_completion(&c->dma_done);
+	c->gpmc_cs = pdata->cs;
+	c->gpio_irq = pdata->gpio_irq;
+	c->dma_channel = pdata->dma_channel;
+	if (c->dma_channel < 0) {
+		/* if -1, don't use DMA */
+		c->gpio_irq = 0;
+	}
+
+	r = gpmc_cs_request(c->gpmc_cs, ONENAND_IO_SIZE, &c->phys_base);
+	if (r < 0) {
+		dev_err(&pdev->dev, "Cannot request GPMC CS\n");
+		goto err_kfree;
+	}
+
+	if (request_mem_region(c->phys_base, ONENAND_IO_SIZE,
+			       pdev->dev.driver->name) == NULL) {
+		dev_err(&pdev->dev, "Cannot reserve memory region at 0x%08lx, "
+			"size: 0x%x\n",	c->phys_base, ONENAND_IO_SIZE);
+		r = -EBUSY;
+		goto err_free_cs;
+	}
+	c->onenand.base = ioremap(c->phys_base, ONENAND_IO_SIZE);
+	if (c->onenand.base == NULL) {
+		r = -ENOMEM;
+		goto err_release_mem_region;
+	}
+
+	if (pdata->onenand_setup != NULL) {
+		r = pdata->onenand_setup(c->onenand.base, c->freq);
+		if (r < 0) {
+			dev_err(&pdev->dev, "Onenand platform setup failed: "
+				"%d\n", r);
+			goto err_iounmap;
+		}
+		c->setup = pdata->onenand_setup;
+	}
+
+	if (c->gpio_irq) {
+		if ((r = omap_request_gpio(c->gpio_irq)) < 0) {
+			dev_err(&pdev->dev,  "Failed to request GPIO%d for "
+				"OneNAND\n", c->gpio_irq);
+			goto err_iounmap;
+	}
+	omap_set_gpio_direction(c->gpio_irq, 1);
+
+	if ((r = request_irq(OMAP_GPIO_IRQ(c->gpio_irq),
+			     omap2_onenand_interrupt, IRQF_TRIGGER_RISING,
+			     pdev->dev.driver->name, c)) < 0)
+		goto err_release_gpio;
+	}
+
+	if (c->dma_channel >= 0) {
+		r = omap_request_dma(0, pdev->dev.driver->name,
+				     omap2_onenand_dma_cb, (void *) c,
+				     &c->dma_channel);
+		if (r == 0) {
+			omap_set_dma_write_mode(c->dma_channel,
+						OMAP_DMA_WRITE_NON_POSTED);
+			omap_set_dma_src_data_pack(c->dma_channel, 1);
+			omap_set_dma_src_burst_mode(c->dma_channel,
+						    OMAP_DMA_DATA_BURST_8);
+			omap_set_dma_dest_data_pack(c->dma_channel, 1);
+			omap_set_dma_dest_burst_mode(c->dma_channel,
+						     OMAP_DMA_DATA_BURST_8);
+		} else {
+			dev_info(&pdev->dev,
+				 "failed to allocate DMA for OneNAND, "
+				 "using PIO instead\n");
+			c->dma_channel = -1;
+		}
+	}
+
+	dev_info(&pdev->dev, "initializing on CS%d, phys base 0x%08lx, virtual "
+		 "base %p\n", c->gpmc_cs, c->phys_base,
+		 c->onenand.base);
+
+	c->pdev = pdev;
+	c->mtd.name = pdev->dev.bus_id;
+	c->mtd.priv = &c->onenand;
+	c->mtd.owner = THIS_MODULE;
+
+	if (c->dma_channel >= 0) {
+		struct onenand_chip *this = &c->onenand;
+
+		this->wait = omap2_onenand_wait;
+		if (cpu_is_omap34xx()) {
+			this->read_bufferram = omap3_onenand_read_bufferram;
+			this->write_bufferram = omap3_onenand_write_bufferram;
+		} else {
+			this->read_bufferram = omap2_onenand_read_bufferram;
+			this->write_bufferram = omap2_onenand_write_bufferram;
+		}
+	}
+
+	if ((r = onenand_scan(&c->mtd, 1)) < 0)
+		goto err_release_dma;
+
+	switch ((c->onenand.version_id >> 4) & 0xf) {
+	case 0:
+		c->freq = 40;
+		break;
+	case 1:
+		c->freq = 54;
+		break;
+	case 2:
+		c->freq = 66;
+		break;
+	case 3:
+		c->freq = 83;
+		break;
+	}
+
+#ifdef CONFIG_MTD_PARTITIONS
+	if (pdata->parts != NULL)
+		r = add_mtd_partitions(&c->mtd, pdata->parts,
+				       pdata->nr_parts);
+	else
+#endif
+		r = add_mtd_device(&c->mtd);
+	if (r < 0)
+		goto err_release_onenand;
+
+	platform_set_drvdata(pdev, c);
+
+	return 0;
+
+err_release_onenand:
+	onenand_release(&c->mtd);
+err_release_dma:
+	if (c->dma_channel != -1)
+		omap_free_dma(c->dma_channel);
+	if (c->gpio_irq)
+		free_irq(OMAP_GPIO_IRQ(c->gpio_irq), c);
+err_release_gpio:
+	if (c->gpio_irq)
+		omap_free_gpio(c->gpio_irq);
+err_iounmap:
+	iounmap(c->onenand.base);
+err_release_mem_region:
+	release_mem_region(c->phys_base, ONENAND_IO_SIZE);
+err_free_cs:
+	gpmc_cs_free(c->gpmc_cs);
+err_kfree:
+	kfree(c);
+
+	return r;
+}
+
+static int __devexit omap2_onenand_remove(struct platform_device *pdev)
+{
+	struct omap2_onenand *c = dev_get_drvdata(&pdev->dev);
+
+	BUG_ON(c == NULL);
+
+#ifdef CONFIG_MTD_PARTITIONS
+	if (c->parts)
+		del_mtd_partitions(&c->mtd);
+	else
+		del_mtd_device(&c->mtd);
+#else
+	del_mtd_device(&c->mtd);
+#endif
+
+	onenand_release(&c->mtd);
+	if (c->dma_channel != -1)
+		omap_free_dma(c->dma_channel);
+	omap2_onenand_shutdown(pdev);
+	platform_set_drvdata(pdev, NULL);
+	if (c->gpio_irq) {
+		free_irq(OMAP_GPIO_IRQ(c->gpio_irq), c);
+		omap_free_gpio(c->gpio_irq);
+	}
+	iounmap(c->onenand.base);
+	release_mem_region(c->phys_base, ONENAND_IO_SIZE);
+	kfree(c);
+
+	return 0;
+}
+
+static struct platform_driver omap2_onenand_driver = {
+	.probe		= omap2_onenand_probe,
+	.remove		= omap2_onenand_remove,
+	.shutdown	= omap2_onenand_shutdown,
+	.driver		= {
+		.name	= DRIVER_NAME,
+		.owner  = THIS_MODULE,
+	},
+};
+
+static int __init omap2_onenand_init(void)
+{
+	printk(KERN_INFO "OneNAND driver initializing\n");
+	return platform_driver_register(&omap2_onenand_driver);
+}
+
+static void __exit omap2_onenand_exit(void)
+{
+	platform_driver_unregister(&omap2_onenand_driver);
+}
+
+module_init(omap2_onenand_init);
+module_exit(omap2_onenand_exit);
+
+MODULE_ALIAS(DRIVER_NAME);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jarkko Lavinen <jarkko.lavinen@nokia.com>");
+MODULE_DESCRIPTION("Glue layer for OneNAND flash on OMAP2 / OMAP3");
diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c
index 926cf3a4135d..90ed319f26e6 100644
--- a/drivers/mtd/onenand/onenand_base.c
+++ b/drivers/mtd/onenand/onenand_base.c
@@ -1794,7 +1794,7 @@ static int onenand_erase(struct mtd_info *mtd, struct erase_info *instr)
 		return -EINVAL;
 	}
 
-	instr->fail_addr = 0xffffffff;
+	instr->fail_addr = MTD_FAIL_ADDR_UNKNOWN;
 
 	/* Grab the lock and see if the device is available */
 	onenand_get_device(mtd, FL_ERASING);
diff --git a/drivers/mtd/ssfdc.c b/drivers/mtd/ssfdc.c
index a5f3d60047d4..33a5d6ed6f18 100644
--- a/drivers/mtd/ssfdc.c
+++ b/drivers/mtd/ssfdc.c
@@ -321,8 +321,7 @@ static void ssfdcr_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd)
 	DEBUG(MTD_DEBUG_LEVEL1,
 		"SSFDC_RO: cis_block=%d,erase_size=%d,map_len=%d,n_zones=%d\n",
 		ssfdc->cis_block, ssfdc->erase_size, ssfdc->map_len,
-		(ssfdc->map_len + MAX_PHYS_BLK_PER_ZONE - 1) /
-		MAX_PHYS_BLK_PER_ZONE);
+		DIV_ROUND_UP(ssfdc->map_len, MAX_PHYS_BLK_PER_ZONE));
 
 	/* Set geometry */
 	ssfdc->heads = 16;
diff --git a/drivers/mtd/ubi/cdev.c b/drivers/mtd/ubi/cdev.c
index 03c759b4eeb5..b30a0b83d7f1 100644
--- a/drivers/mtd/ubi/cdev.c
+++ b/drivers/mtd/ubi/cdev.c
@@ -104,12 +104,9 @@ static int vol_cdev_open(struct inode *inode, struct file *file)
 	struct ubi_volume_desc *desc;
 	int vol_id = iminor(inode) - 1, mode, ubi_num;
 
-	lock_kernel();
 	ubi_num = ubi_major2num(imajor(inode));
-	if (ubi_num < 0) {
-		unlock_kernel();
+	if (ubi_num < 0)
 		return ubi_num;
-	}
 
 	if (file->f_mode & FMODE_WRITE)
 		mode = UBI_READWRITE;
@@ -119,7 +116,6 @@ static int vol_cdev_open(struct inode *inode, struct file *file)
 	dbg_gen("open volume %d, mode %d", vol_id, mode);
 
 	desc = ubi_open_volume(ubi_num, vol_id, mode);
-	unlock_kernel();
 	if (IS_ERR(desc))
 		return PTR_ERR(desc);
 
diff --git a/drivers/mtd/ubi/scan.c b/drivers/mtd/ubi/scan.c
index 967bb4406df9..4f2daa5bbecf 100644
--- a/drivers/mtd/ubi/scan.c
+++ b/drivers/mtd/ubi/scan.c
@@ -387,7 +387,7 @@ int ubi_scan_add_used(struct ubi_device *ubi, struct ubi_scan_info *si,
 		pnum, vol_id, lnum, ec, sqnum, bitflips);
 
 	sv = add_volume(si, vol_id, pnum, vid_hdr);
-	if (IS_ERR(sv) < 0)
+	if (IS_ERR(sv))
 		return PTR_ERR(sv);
 
 	if (si->max_sqnum < sqnum)
diff --git a/drivers/mtd/ubi/vtbl.c b/drivers/mtd/ubi/vtbl.c
index 217d0e111b2a..333c8941552f 100644
--- a/drivers/mtd/ubi/vtbl.c
+++ b/drivers/mtd/ubi/vtbl.c
@@ -244,8 +244,8 @@ static int vtbl_check(const struct ubi_device *ubi,
 		}
 
 		if (reserved_pebs > ubi->good_peb_count) {
-			dbg_err("too large reserved_pebs, good PEBs %d",
-				ubi->good_peb_count);
+			dbg_err("too large reserved_pebs %d, good PEBs %d",
+				reserved_pebs, ubi->good_peb_count);
 			err = 9;
 			goto bad;
 		}
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index fc5f2dbf5323..8b51e10b7783 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -563,7 +563,7 @@ static int __iommu_flush_context(struct intel_iommu *iommu,
 
 	spin_unlock_irqrestore(&iommu->register_lock, flag);
 
-	/* flush context entry will implictly flush write buffer */
+	/* flush context entry will implicitly flush write buffer */
 	return 0;
 }
 
@@ -656,7 +656,7 @@ static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
 		pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
 			DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
-	/* flush context entry will implictly flush write buffer */
+	/* flush iotlb entry will implicitly flush write buffer */
 	return 0;
 }
 
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index c9884bba22de..dbe9f39f4436 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1358,11 +1358,10 @@ int pci_request_region(struct pci_dev *pdev, int bar, const char *res_name)
 	return 0;
 
 err_out:
-	dev_warn(&pdev->dev, "BAR %d: can't reserve %s region [%#llx-%#llx]\n",
+	dev_warn(&pdev->dev, "BAR %d: can't reserve %s region %pR\n",
 		 bar,
 		 pci_resource_flags(pdev, bar) & IORESOURCE_IO ? "I/O" : "mem",
-		 (unsigned long long)pci_resource_start(pdev, bar),
-		 (unsigned long long)pci_resource_end(pdev, bar));
+		 &pdev->resource[bar]);
 	return -EBUSY;
 }
 
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index dd9161a054e1..d3db8b249729 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -304,9 +304,8 @@ static int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
 		} else {
 			res->start = l64;
 			res->end = l64 + sz64;
-			printk(KERN_DEBUG "PCI: %s reg %x 64bit mmio: [%llx, %llx]\n",
-				pci_name(dev), pos, (unsigned long long)res->start,
-				(unsigned long long)res->end);
+			printk(KERN_DEBUG "PCI: %s reg %x 64bit mmio: %pR\n",
+				pci_name(dev), pos, res);
 		}
 	} else {
 		sz = pci_size(l, sz, mask);
@@ -316,9 +315,10 @@ static int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
 
 		res->start = l;
 		res->end = l + sz;
-		printk(KERN_DEBUG "PCI: %s reg %x %s: [%llx, %llx]\n", pci_name(dev),
-			pos, (res->flags & IORESOURCE_IO) ? "io port":"32bit mmio",
-			(unsigned long long)res->start, (unsigned long long)res->end);
+		printk(KERN_DEBUG "PCI: %s reg %x %s: %pR\n",
+		       pci_name(dev), pos,
+		       (res->flags & IORESOURCE_IO) ? "io port":"32bit mmio",
+		       res);
 	}
 
  out:
@@ -389,9 +389,8 @@ void __devinit pci_read_bridge_bases(struct pci_bus *child)
 			res->start = base;
 		if (!res->end)
 			res->end = limit + 0xfff;
-		printk(KERN_DEBUG "PCI: bridge %s io port: [%llx, %llx]\n",
-			pci_name(dev), (unsigned long long) res->start,
-			(unsigned long long) res->end);
+		printk(KERN_DEBUG "PCI: bridge %s io port: %pR\n",
+		       pci_name(dev), res);
 	}
 
 	res = child->resource[1];
@@ -403,9 +402,8 @@ void __devinit pci_read_bridge_bases(struct pci_bus *child)
 		res->flags = (mem_base_lo & PCI_MEMORY_RANGE_TYPE_MASK) | IORESOURCE_MEM;
 		res->start = base;
 		res->end = limit + 0xfffff;
-		printk(KERN_DEBUG "PCI: bridge %s 32bit mmio: [%llx, %llx]\n",
-			pci_name(dev), (unsigned long long) res->start,
-			(unsigned long long) res->end);
+		printk(KERN_DEBUG "PCI: bridge %s 32bit mmio: %pR\n",
+		       pci_name(dev), res);
 	}
 
 	res = child->resource[2];
@@ -441,9 +439,9 @@ void __devinit pci_read_bridge_bases(struct pci_bus *child)
 		res->flags = (mem_base_lo & PCI_MEMORY_RANGE_TYPE_MASK) | IORESOURCE_MEM | IORESOURCE_PREFETCH;
 		res->start = base;
 		res->end = limit + 0xfffff;
-		printk(KERN_DEBUG "PCI: bridge %s %sbit mmio pref: [%llx, %llx]\n",
-			pci_name(dev), (res->flags & PCI_PREF_RANGE_TYPE_64) ? "64" : "32",
-			(unsigned long long) res->start, (unsigned long long) res->end);
+		printk(KERN_DEBUG "PCI: bridge %s %sbit mmio pref: %pR\n",
+		       pci_name(dev),
+		       (res->flags & PCI_PREF_RANGE_TYPE_64) ? "64":"32", res);
 	}
 }
 
diff --git a/drivers/pci/rom.c b/drivers/pci/rom.c
index bd5c0e031398..1f5f6143f35c 100644
--- a/drivers/pci/rom.c
+++ b/drivers/pci/rom.c
@@ -21,7 +21,7 @@
  * between the ROM and other resources, so enabling it may disable access
  * to MMIO registers or other card memory.
  */
-static int pci_enable_rom(struct pci_dev *pdev)
+int pci_enable_rom(struct pci_dev *pdev)
 {
 	struct resource *res = pdev->resource + PCI_ROM_RESOURCE;
 	struct pci_bus_region region;
@@ -45,7 +45,7 @@ static int pci_enable_rom(struct pci_dev *pdev)
  * Disable ROM decoding on a PCI device by turning off the last bit in the
  * ROM BAR.
  */
-static void pci_disable_rom(struct pci_dev *pdev)
+void pci_disable_rom(struct pci_dev *pdev)
 {
 	u32 rom_addr;
 	pci_read_config_dword(pdev, pdev->rom_base_reg, &rom_addr);
@@ -260,3 +260,5 @@ void pci_cleanup_rom(struct pci_dev *pdev)
 
 EXPORT_SYMBOL(pci_map_rom);
 EXPORT_SYMBOL(pci_unmap_rom);
+EXPORT_SYMBOL_GPL(pci_enable_rom);
+EXPORT_SYMBOL_GPL(pci_disable_rom);
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index d5e2106760f8..471a429d7a20 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -356,10 +356,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, unsigned long
 			order = __ffs(align) - 20;
 			if (order > 11) {
 				dev_warn(&dev->dev, "BAR %d bad alignment %llx: "
-				       "%#016llx-%#016llx\n", i,
-				       (unsigned long long)align,
-				       (unsigned long long)r->start,
-				       (unsigned long long)r->end);
+					 "%pR\n", i, (unsigned long long)align, r);
 				r->flags = 0;
 				continue;
 			}
@@ -539,11 +536,9 @@ static void pci_bus_dump_res(struct pci_bus *bus)
                 if (!res)
                         continue;
 
-		printk(KERN_INFO "bus: %02x index %x %s: [%llx, %llx]\n",
-			bus->number, i,
-			(res->flags & IORESOURCE_IO) ? "io port" : "mmio",
-			(unsigned long long) res->start,
-			(unsigned long long) res->end);
+		printk(KERN_INFO "bus: %02x index %x %s: %pR\n",
+		       bus->number, i,
+		       (res->flags & IORESOURCE_IO) ? "io port" : "mmio", res);
         }
 }
 
diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c
index 1a5fc83c71b3..d4b5c690eaa7 100644
--- a/drivers/pci/setup-res.c
+++ b/drivers/pci/setup-res.c
@@ -49,10 +49,8 @@ void pci_update_resource(struct pci_dev *dev, struct resource *res, int resno)
 
 	pcibios_resource_to_bus(dev, &region, res);
 
-	dev_dbg(&dev->dev, "BAR %d: got res [%#llx-%#llx] bus [%#llx-%#llx] "
-		"flags %#lx\n", resno,
-		 (unsigned long long)res->start,
-		 (unsigned long long)res->end,
+	dev_dbg(&dev->dev, "BAR %d: got res %pR bus [%#llx-%#llx] "
+		"flags %#lx\n", resno, res,
 		 (unsigned long long)region.start,
 		 (unsigned long long)region.end,
 		 (unsigned long)res->flags);
@@ -114,13 +112,11 @@ int pci_claim_resource(struct pci_dev *dev, int resource)
 		err = insert_resource(root, res);
 
 	if (err) {
-		dev_err(&dev->dev, "BAR %d: %s of %s [%#llx-%#llx]\n",
+		dev_err(&dev->dev, "BAR %d: %s of %s %pR\n",
 			resource,
 			root ? "address space collision on" :
 				"no parent found for",
-			dtype,
-			(unsigned long long)res->start,
-			(unsigned long long)res->end);
+			dtype, res);
 	}
 
 	return err;
@@ -139,9 +135,8 @@ int pci_assign_resource(struct pci_dev *dev, int resno)
 	align = resource_alignment(res);
 	if (!align) {
 		dev_err(&dev->dev, "BAR %d: can't allocate resource (bogus "
-			"alignment) [%#llx-%#llx] flags %#lx\n",
-			resno, (unsigned long long)res->start,
-			(unsigned long long)res->end, res->flags);
+			"alignment) %pR flags %#lx\n",
+			resno, res, res->flags);
 		return -EINVAL;
 	}
 
@@ -162,11 +157,8 @@ int pci_assign_resource(struct pci_dev *dev, int resno)
 	}
 
 	if (ret) {
-		dev_err(&dev->dev, "BAR %d: can't allocate %s resource "
-			"[%#llx-%#llx]\n", resno,
-			res->flags & IORESOURCE_IO ? "I/O" : "mem",
-			(unsigned long long)res->start,
-			(unsigned long long)res->end);
+		dev_err(&dev->dev, "BAR %d: can't allocate %s resource %pR\n",
+			resno, res->flags & IORESOURCE_IO ? "I/O" : "mem", res);
 	} else {
 		res->flags &= ~IORESOURCE_STARTALIGN;
 		if (resno < PCI_BRIDGE_RESOURCES)
@@ -202,11 +194,8 @@ int pci_assign_resource_fixed(struct pci_dev *dev, int resno)
 	}
 
 	if (ret) {
-		dev_err(&dev->dev, "BAR %d: can't allocate %s resource "
-			"[%#llx-%#llx\n]", resno,
-			res->flags & IORESOURCE_IO ? "I/O" : "mem",
-			(unsigned long long)res->start,
-			(unsigned long long)res->end);
+		dev_err(&dev->dev, "BAR %d: can't allocate %s resource %pR\n",
+			resno, res->flags & IORESOURCE_IO ? "I/O" : "mem", res);
 	} else if (resno < PCI_BRIDGE_RESOURCES) {
 		pci_update_resource(dev, res, resno);
 	}
@@ -237,9 +226,8 @@ void pdev_sort_resources(struct pci_dev *dev, struct resource_list *head)
 		r_align = resource_alignment(r);
 		if (!r_align) {
 			dev_warn(&dev->dev, "BAR %d: bogus alignment "
-				"[%#llx-%#llx] flags %#lx\n",
-				i, (unsigned long long)r->start,
-				(unsigned long long)r->end, r->flags);
+				"%pR flags %#lx\n",
+				i, r, r->flags);
 			continue;
 		}
 		for (list = head; ; list = list->next) {
@@ -287,9 +275,7 @@ int pci_enable_resources(struct pci_dev *dev, int mask)
 
 		if (!r->parent) {
 			dev_err(&dev->dev, "device not available because of "
-				"BAR %d [%#llx-%#llx] collisions\n", i,
-				(unsigned long long) r->start,
-				(unsigned long long) r->end);
+				"BAR %d %pR collisions\n", i, r);
 			return -EINVAL;
 		}
 
diff --git a/drivers/power/power_supply_sysfs.c b/drivers/power/power_supply_sysfs.c
index fe2aeb11939b..23ae8460f5c1 100644
--- a/drivers/power/power_supply_sysfs.c
+++ b/drivers/power/power_supply_sysfs.c
@@ -30,7 +30,7 @@
 
 #define POWER_SUPPLY_ATTR(_name)					\
 {									\
-	.attr = { .name = #_name, .mode = 0444, .owner = THIS_MODULE },	\
+	.attr = { .name = #_name, .mode = 0444 },	\
 	.show = power_supply_show_property,				\
 	.store = NULL,							\
 }
diff --git a/drivers/ps3/ps3av.c b/drivers/ps3/ps3av.c
index 6f2f90ebb020..06848b254d57 100644
--- a/drivers/ps3/ps3av.c
+++ b/drivers/ps3/ps3av.c
@@ -915,6 +915,22 @@ int ps3av_video_mute(int mute)
 
 EXPORT_SYMBOL_GPL(ps3av_video_mute);
 
+/* mute analog output only */
+int ps3av_audio_mute_analog(int mute)
+{
+	int i, res;
+
+	for (i = 0; i < ps3av->av_hw_conf.num_of_avmulti; i++) {
+		res = ps3av_cmd_av_audio_mute(1,
+			&ps3av->av_port[i + ps3av->av_hw_conf.num_of_hdmi],
+			mute);
+		if (res < 0)
+			return -1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ps3av_audio_mute_analog);
+
 int ps3av_audio_mute(int mute)
 {
 	return ps3av_set_audio_mute(mute ? PS3AV_CMD_MUTE_ON
diff --git a/drivers/ps3/ps3av_cmd.c b/drivers/ps3/ps3av_cmd.c
index 7f880c26122f..11eb50318fec 100644
--- a/drivers/ps3/ps3av_cmd.c
+++ b/drivers/ps3/ps3av_cmd.c
@@ -660,9 +660,10 @@ u32 ps3av_cmd_set_av_audio_param(void *p, u32 port,
 }
 
 /* default cs val */
-static const u8 ps3av_mode_cs_info[] = {
+u8 ps3av_mode_cs_info[] = {
 	0x00, 0x09, 0x00, 0x02, 0x01, 0x00, 0x00, 0x00
 };
+EXPORT_SYMBOL_GPL(ps3av_mode_cs_info);
 
 #define CS_44	0x00
 #define CS_48	0x02
@@ -677,7 +678,7 @@ void ps3av_cmd_set_audio_mode(struct ps3av_pkt_audio_mode *audio, u32 avport,
 			      u32 ch, u32 fs, u32 word_bits, u32 format,
 			      u32 source)
 {
-	int spdif_through, spdif_bitstream;
+	int spdif_through;
 	int i;
 
 	if (!(ch | fs | format | word_bits | source)) {
@@ -687,7 +688,6 @@ void ps3av_cmd_set_audio_mode(struct ps3av_pkt_audio_mode *audio, u32 avport,
 		format = PS3AV_CMD_AUDIO_FORMAT_PCM;
 		source = PS3AV_CMD_AUDIO_SOURCE_SERIAL;
 	}
-	spdif_through = spdif_bitstream = 0;	/* XXX not supported */
 
 	/* audio mode */
 	memset(audio, 0, sizeof(*audio));
@@ -777,16 +777,17 @@ void ps3av_cmd_set_audio_mode(struct ps3av_pkt_audio_mode *audio, u32 avport,
 		break;
 	}
 
+	/* non-audio bit */
+	spdif_through = audio->audio_cs_info[0] & 0x02;
+
 	/* pass through setting */
 	if (spdif_through &&
 	    (avport == PS3AV_CMD_AVPORT_SPDIF_0 ||
-	     avport == PS3AV_CMD_AVPORT_SPDIF_1)) {
+	     avport == PS3AV_CMD_AVPORT_SPDIF_1 ||
+	     avport == PS3AV_CMD_AVPORT_HDMI_0 ||
+	     avport == PS3AV_CMD_AVPORT_HDMI_1)) {
 		audio->audio_word_bits = PS3AV_CMD_AUDIO_WORD_BITS_16;
-		audio->audio_source = PS3AV_CMD_AUDIO_SOURCE_SPDIF;
-		if (spdif_bitstream) {
-			audio->audio_format = PS3AV_CMD_AUDIO_FORMAT_BITSTREAM;
-			audio->audio_cs_info[0] |= CS_BIT;
-		}
+		audio->audio_format = PS3AV_CMD_AUDIO_FORMAT_BITSTREAM;
 	}
 }
 
diff --git a/drivers/rtc/rtc-at91rm9200.c b/drivers/rtc/rtc-at91rm9200.c
index 37082616482b..b5bf93706913 100644
--- a/drivers/rtc/rtc-at91rm9200.c
+++ b/drivers/rtc/rtc-at91rm9200.c
@@ -53,21 +53,21 @@ static void at91_rtc_decodetime(unsigned int timereg, unsigned int calreg,
 	} while ((time != at91_sys_read(timereg)) ||
 			(date != at91_sys_read(calreg)));
 
-	tm->tm_sec  = BCD2BIN((time & AT91_RTC_SEC) >> 0);
-	tm->tm_min  = BCD2BIN((time & AT91_RTC_MIN) >> 8);
-	tm->tm_hour = BCD2BIN((time & AT91_RTC_HOUR) >> 16);
+	tm->tm_sec  = bcd2bin((time & AT91_RTC_SEC) >> 0);
+	tm->tm_min  = bcd2bin((time & AT91_RTC_MIN) >> 8);
+	tm->tm_hour = bcd2bin((time & AT91_RTC_HOUR) >> 16);
 
 	/*
 	 * The Calendar Alarm register does not have a field for
 	 * the year - so these will return an invalid value.  When an
 	 * alarm is set, at91_alarm_year wille store the current year.
 	 */
-	tm->tm_year  = BCD2BIN(date & AT91_RTC_CENT) * 100;	/* century */
-	tm->tm_year += BCD2BIN((date & AT91_RTC_YEAR) >> 8);	/* year */
+	tm->tm_year  = bcd2bin(date & AT91_RTC_CENT) * 100;	/* century */
+	tm->tm_year += bcd2bin((date & AT91_RTC_YEAR) >> 8);	/* year */
 
-	tm->tm_wday = BCD2BIN((date & AT91_RTC_DAY) >> 21) - 1;	/* day of the week [0-6], Sunday=0 */
-	tm->tm_mon  = BCD2BIN((date & AT91_RTC_MONTH) >> 16) - 1;
-	tm->tm_mday = BCD2BIN((date & AT91_RTC_DATE) >> 24);
+	tm->tm_wday = bcd2bin((date & AT91_RTC_DAY) >> 21) - 1;	/* day of the week [0-6], Sunday=0 */
+	tm->tm_mon  = bcd2bin((date & AT91_RTC_MONTH) >> 16) - 1;
+	tm->tm_mday = bcd2bin((date & AT91_RTC_DATE) >> 24);
 }
 
 /*
@@ -106,16 +106,16 @@ static int at91_rtc_settime(struct device *dev, struct rtc_time *tm)
 	at91_sys_write(AT91_RTC_IDR, AT91_RTC_ACKUPD);
 
 	at91_sys_write(AT91_RTC_TIMR,
-			  BIN2BCD(tm->tm_sec) << 0
-			| BIN2BCD(tm->tm_min) << 8
-			| BIN2BCD(tm->tm_hour) << 16);
+			  bin2bcd(tm->tm_sec) << 0
+			| bin2bcd(tm->tm_min) << 8
+			| bin2bcd(tm->tm_hour) << 16);
 
 	at91_sys_write(AT91_RTC_CALR,
-			  BIN2BCD((tm->tm_year + 1900) / 100)	/* century */
-			| BIN2BCD(tm->tm_year % 100) << 8	/* year */
-			| BIN2BCD(tm->tm_mon + 1) << 16		/* tm_mon starts at zero */
-			| BIN2BCD(tm->tm_wday + 1) << 21	/* day of the week [0-6], Sunday=0 */
-			| BIN2BCD(tm->tm_mday) << 24);
+			  bin2bcd((tm->tm_year + 1900) / 100)	/* century */
+			| bin2bcd(tm->tm_year % 100) << 8	/* year */
+			| bin2bcd(tm->tm_mon + 1) << 16		/* tm_mon starts at zero */
+			| bin2bcd(tm->tm_wday + 1) << 21	/* day of the week [0-6], Sunday=0 */
+			| bin2bcd(tm->tm_mday) << 24);
 
 	/* Restart Time/Calendar */
 	cr = at91_sys_read(AT91_RTC_CR);
@@ -162,13 +162,13 @@ static int at91_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm)
 
 	at91_sys_write(AT91_RTC_IDR, AT91_RTC_ALARM);
 	at91_sys_write(AT91_RTC_TIMALR,
-		  BIN2BCD(tm.tm_sec) << 0
-		| BIN2BCD(tm.tm_min) << 8
-		| BIN2BCD(tm.tm_hour) << 16
+		  bin2bcd(tm.tm_sec) << 0
+		| bin2bcd(tm.tm_min) << 8
+		| bin2bcd(tm.tm_hour) << 16
 		| AT91_RTC_HOUREN | AT91_RTC_MINEN | AT91_RTC_SECEN);
 	at91_sys_write(AT91_RTC_CALALR,
-		  BIN2BCD(tm.tm_mon + 1) << 16		/* tm_mon starts at zero */
-		| BIN2BCD(tm.tm_mday) << 24
+		  bin2bcd(tm.tm_mon + 1) << 16		/* tm_mon starts at zero */
+		| bin2bcd(tm.tm_mday) << 24
 		| AT91_RTC_DATEEN | AT91_RTC_MTHEN);
 
 	if (alrm->enabled) {
diff --git a/drivers/rtc/rtc-bq4802.c b/drivers/rtc/rtc-bq4802.c
index 189a018bdf34..d00a274df8fc 100644
--- a/drivers/rtc/rtc-bq4802.c
+++ b/drivers/rtc/rtc-bq4802.c
@@ -71,14 +71,14 @@ static int bq4802_read_time(struct device *dev, struct rtc_time *tm)
 
 	spin_unlock_irqrestore(&p->lock, flags);
 
-	BCD_TO_BIN(tm->tm_sec);
-	BCD_TO_BIN(tm->tm_min);
-	BCD_TO_BIN(tm->tm_hour);
-	BCD_TO_BIN(tm->tm_mday);
-	BCD_TO_BIN(tm->tm_mon);
-	BCD_TO_BIN(tm->tm_year);
-	BCD_TO_BIN(tm->tm_wday);
-	BCD_TO_BIN(century);
+	tm->tm_sec = bcd2bin(tm->tm_sec);
+	tm->tm_min = bcd2bin(tm->tm_min);
+	tm->tm_hour = bcd2bin(tm->tm_hour);
+	tm->tm_mday = bcd2bin(tm->tm_mday);
+	tm->tm_mon = bcd2bin(tm->tm_mon);
+	tm->tm_year = bcd2bin(tm->tm_year);
+	tm->tm_wday = bcd2bin(tm->tm_wday);
+	century = bcd2bin(century);
 
 	tm->tm_year += (century * 100);
 	tm->tm_year -= 1900;
@@ -106,13 +106,13 @@ static int bq4802_set_time(struct device *dev, struct rtc_time *tm)
 	min = tm->tm_min;
 	sec = tm->tm_sec;
 
-	BIN_TO_BCD(sec);
-	BIN_TO_BCD(min);
-	BIN_TO_BCD(hrs);
-	BIN_TO_BCD(day);
-	BIN_TO_BCD(mon);
-	BIN_TO_BCD(yrs);
-	BIN_TO_BCD(century);
+	sec = bin2bcd(sec);
+	min = bin2bcd(min);
+	hrs = bin2bcd(hrs);
+	day = bin2bcd(day);
+	mon = bin2bcd(mon);
+	yrs = bin2bcd(yrs);
+	century = bin2bcd(century);
 
 	spin_lock_irqsave(&p->lock, flags);
 
diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c
index 963ad0b6a4e9..5549231179a2 100644
--- a/drivers/rtc/rtc-cmos.c
+++ b/drivers/rtc/rtc-cmos.c
@@ -143,6 +143,43 @@ static inline int hpet_unregister_irq_handler(irq_handler_t handler)
 
 /*----------------------------------------------------------------*/
 
+#ifdef RTC_PORT
+
+/* Most newer x86 systems have two register banks, the first used
+ * for RTC and NVRAM and the second only for NVRAM.  Caller must
+ * own rtc_lock ... and we won't worry about access during NMI.
+ */
+#define can_bank2	true
+
+static inline unsigned char cmos_read_bank2(unsigned char addr)
+{
+	outb(addr, RTC_PORT(2));
+	return inb(RTC_PORT(3));
+}
+
+static inline void cmos_write_bank2(unsigned char val, unsigned char addr)
+{
+	outb(addr, RTC_PORT(2));
+	outb(val, RTC_PORT(2));
+}
+
+#else
+
+#define can_bank2	false
+
+static inline unsigned char cmos_read_bank2(unsigned char addr)
+{
+	return 0;
+}
+
+static inline void cmos_write_bank2(unsigned char val, unsigned char addr)
+{
+}
+
+#endif
+
+/*----------------------------------------------------------------*/
+
 static int cmos_read_time(struct device *dev, struct rtc_time *t)
 {
 	/* REVISIT:  if the clock has a "century" register, use
@@ -203,26 +240,26 @@ static int cmos_read_alarm(struct device *dev, struct rtc_wkalrm *t)
 	/* REVISIT this assumes PC style usage:  always BCD */
 
 	if (((unsigned)t->time.tm_sec) < 0x60)
-		t->time.tm_sec = BCD2BIN(t->time.tm_sec);
+		t->time.tm_sec = bcd2bin(t->time.tm_sec);
 	else
 		t->time.tm_sec = -1;
 	if (((unsigned)t->time.tm_min) < 0x60)
-		t->time.tm_min = BCD2BIN(t->time.tm_min);
+		t->time.tm_min = bcd2bin(t->time.tm_min);
 	else
 		t->time.tm_min = -1;
 	if (((unsigned)t->time.tm_hour) < 0x24)
-		t->time.tm_hour = BCD2BIN(t->time.tm_hour);
+		t->time.tm_hour = bcd2bin(t->time.tm_hour);
 	else
 		t->time.tm_hour = -1;
 
 	if (cmos->day_alrm) {
 		if (((unsigned)t->time.tm_mday) <= 0x31)
-			t->time.tm_mday = BCD2BIN(t->time.tm_mday);
+			t->time.tm_mday = bcd2bin(t->time.tm_mday);
 		else
 			t->time.tm_mday = -1;
 		if (cmos->mon_alrm) {
 			if (((unsigned)t->time.tm_mon) <= 0x12)
-				t->time.tm_mon = BCD2BIN(t->time.tm_mon) - 1;
+				t->time.tm_mon = bcd2bin(t->time.tm_mon) - 1;
 			else
 				t->time.tm_mon = -1;
 		}
@@ -294,19 +331,19 @@ static int cmos_set_alarm(struct device *dev, struct rtc_wkalrm *t)
 	/* Writing 0xff means "don't care" or "match all".  */
 
 	mon = t->time.tm_mon + 1;
-	mon = (mon <= 12) ? BIN2BCD(mon) : 0xff;
+	mon = (mon <= 12) ? bin2bcd(mon) : 0xff;
 
 	mday = t->time.tm_mday;
-	mday = (mday >= 1 && mday <= 31) ? BIN2BCD(mday) : 0xff;
+	mday = (mday >= 1 && mday <= 31) ? bin2bcd(mday) : 0xff;
 
 	hrs = t->time.tm_hour;
-	hrs = (hrs < 24) ? BIN2BCD(hrs) : 0xff;
+	hrs = (hrs < 24) ? bin2bcd(hrs) : 0xff;
 
 	min = t->time.tm_min;
-	min = (min < 60) ? BIN2BCD(min) : 0xff;
+	min = (min < 60) ? bin2bcd(min) : 0xff;
 
 	sec = t->time.tm_sec;
-	sec = (sec < 60) ? BIN2BCD(sec) : 0xff;
+	sec = (sec < 60) ? bin2bcd(sec) : 0xff;
 
 	spin_lock_irq(&rtc_lock);
 
@@ -491,12 +528,21 @@ cmos_nvram_read(struct kobject *kobj, struct bin_attribute *attr,
 
 	if (unlikely(off >= attr->size))
 		return 0;
+	if (unlikely(off < 0))
+		return -EINVAL;
 	if ((off + count) > attr->size)
 		count = attr->size - off;
 
+	off += NVRAM_OFFSET;
 	spin_lock_irq(&rtc_lock);
-	for (retval = 0, off += NVRAM_OFFSET; count--; retval++, off++)
-		*buf++ = CMOS_READ(off);
+	for (retval = 0; count; count--, off++, retval++) {
+		if (off < 128)
+			*buf++ = CMOS_READ(off);
+		else if (can_bank2)
+			*buf++ = cmos_read_bank2(off);
+		else
+			break;
+	}
 	spin_unlock_irq(&rtc_lock);
 
 	return retval;
@@ -512,6 +558,8 @@ cmos_nvram_write(struct kobject *kobj, struct bin_attribute *attr,
 	cmos = dev_get_drvdata(container_of(kobj, struct device, kobj));
 	if (unlikely(off >= attr->size))
 		return -EFBIG;
+	if (unlikely(off < 0))
+		return -EINVAL;
 	if ((off + count) > attr->size)
 		count = attr->size - off;
 
@@ -520,15 +568,20 @@ cmos_nvram_write(struct kobject *kobj, struct bin_attribute *attr,
 	 * here.  If userspace is smart enough to know what fields of
 	 * NVRAM to update, updating checksums is also part of its job.
 	 */
+	off += NVRAM_OFFSET;
 	spin_lock_irq(&rtc_lock);
-	for (retval = 0, off += NVRAM_OFFSET; count--; retval++, off++) {
+	for (retval = 0; count; count--, off++, retval++) {
 		/* don't trash RTC registers */
 		if (off == cmos->day_alrm
 				|| off == cmos->mon_alrm
 				|| off == cmos->century)
 			buf++;
-		else
+		else if (off < 128)
 			CMOS_WRITE(*buf++, off);
+		else if (can_bank2)
+			cmos_write_bank2(*buf++, off);
+		else
+			break;
 	}
 	spin_unlock_irq(&rtc_lock);
 
@@ -539,7 +592,6 @@ static struct bin_attribute nvram = {
 	.attr = {
 		.name	= "nvram",
 		.mode	= S_IRUGO | S_IWUSR,
-		.owner	= THIS_MODULE,
 	},
 
 	.read	= cmos_nvram_read,
@@ -631,8 +683,8 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq)
 
 	/* Heuristic to deduce NVRAM size ... do what the legacy NVRAM
 	 * driver did, but don't reject unknown configs.   Old hardware
-	 * won't address 128 bytes, and for now we ignore the way newer
-	 * chips can address 256 bytes (using two more i/o ports).
+	 * won't address 128 bytes.  Newer chips have multiple banks,
+	 * though they may not be listed in one I/O resource.
 	 */
 #if	defined(CONFIG_ATARI)
 	address_space = 64;
@@ -642,6 +694,8 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq)
 #warning Assuming 128 bytes of RTC+NVRAM address space, not 64 bytes.
 	address_space = 128;
 #endif
+	if (can_bank2 && ports->end > (ports->start + 1))
+		address_space = 256;
 
 	/* For ACPI systems extension info comes from the FADT.  On others,
 	 * board specific setup provides it as appropriate.  Systems where
@@ -740,7 +794,7 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq)
 		goto cleanup2;
 	}
 
-	pr_info("%s: alarms up to one %s%s%s\n",
+	pr_info("%s: alarms up to one %s%s, %zd bytes nvram, %s irqs\n",
 			cmos_rtc.rtc->dev.bus_id,
 			is_valid_irq(rtc_irq)
 				?  (cmos_rtc.mon_alrm
@@ -749,6 +803,7 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq)
 						? "month" : "day"))
 				: "no",
 			cmos_rtc.century ? ", y3k" : "",
+			nvram.size,
 			is_hpet_enabled() ? ", hpet irqs" : "");
 
 	return 0;
diff --git a/drivers/rtc/rtc-ds1216.c b/drivers/rtc/rtc-ds1216.c
index 0b17770b032b..9a234a4ec06d 100644
--- a/drivers/rtc/rtc-ds1216.c
+++ b/drivers/rtc/rtc-ds1216.c
@@ -86,19 +86,19 @@ static int ds1216_rtc_read_time(struct device *dev, struct rtc_time *tm)
 	ds1216_switch_ds_to_clock(priv->ioaddr);
 	ds1216_read(priv->ioaddr, (u8 *)&regs);
 
-	tm->tm_sec = BCD2BIN(regs.sec);
-	tm->tm_min = BCD2BIN(regs.min);
+	tm->tm_sec = bcd2bin(regs.sec);
+	tm->tm_min = bcd2bin(regs.min);
 	if (regs.hour & DS1216_HOUR_1224) {
 		/* AM/PM mode */
-		tm->tm_hour = BCD2BIN(regs.hour & 0x1f);
+		tm->tm_hour = bcd2bin(regs.hour & 0x1f);
 		if (regs.hour & DS1216_HOUR_AMPM)
 			tm->tm_hour += 12;
 	} else
-		tm->tm_hour = BCD2BIN(regs.hour & 0x3f);
+		tm->tm_hour = bcd2bin(regs.hour & 0x3f);
 	tm->tm_wday = (regs.wday & 7) - 1;
-	tm->tm_mday = BCD2BIN(regs.mday & 0x3f);
-	tm->tm_mon = BCD2BIN(regs.month & 0x1f);
-	tm->tm_year = BCD2BIN(regs.year);
+	tm->tm_mday = bcd2bin(regs.mday & 0x3f);
+	tm->tm_mon = bcd2bin(regs.month & 0x1f);
+	tm->tm_year = bcd2bin(regs.year);
 	if (tm->tm_year < 70)
 		tm->tm_year += 100;
 	return 0;
@@ -114,19 +114,19 @@ static int ds1216_rtc_set_time(struct device *dev, struct rtc_time *tm)
 	ds1216_read(priv->ioaddr, (u8 *)&regs);
 
 	regs.tsec = 0; /* clear 0.1 and 0.01 seconds */
-	regs.sec = BIN2BCD(tm->tm_sec);
-	regs.min = BIN2BCD(tm->tm_min);
+	regs.sec = bin2bcd(tm->tm_sec);
+	regs.min = bin2bcd(tm->tm_min);
 	regs.hour &= DS1216_HOUR_1224;
 	if (regs.hour && tm->tm_hour > 12) {
 		regs.hour |= DS1216_HOUR_AMPM;
 		tm->tm_hour -= 12;
 	}
-	regs.hour |= BIN2BCD(tm->tm_hour);
+	regs.hour |= bin2bcd(tm->tm_hour);
 	regs.wday &= ~7;
 	regs.wday |= tm->tm_wday;
-	regs.mday = BIN2BCD(tm->tm_mday);
-	regs.month = BIN2BCD(tm->tm_mon);
-	regs.year = BIN2BCD(tm->tm_year % 100);
+	regs.mday = bin2bcd(tm->tm_mday);
+	regs.month = bin2bcd(tm->tm_mon);
+	regs.year = bin2bcd(tm->tm_year % 100);
 
 	ds1216_switch_ds_to_clock(priv->ioaddr);
 	ds1216_write(priv->ioaddr, (u8 *)&regs);
diff --git a/drivers/rtc/rtc-ds1302.c b/drivers/rtc/rtc-ds1302.c
index 8f4e96bb229a..184556620778 100644
--- a/drivers/rtc/rtc-ds1302.c
+++ b/drivers/rtc/rtc-ds1302.c
@@ -107,13 +107,13 @@ static int ds1302_rtc_read_time(struct device *dev, struct rtc_time *tm)
 
 	spin_lock_irq(&rtc->lock);
 
-	tm->tm_sec	= BCD2BIN(ds1302_readbyte(RTC_ADDR_SEC));
-	tm->tm_min	= BCD2BIN(ds1302_readbyte(RTC_ADDR_MIN));
-	tm->tm_hour	= BCD2BIN(ds1302_readbyte(RTC_ADDR_HOUR));
-	tm->tm_wday	= BCD2BIN(ds1302_readbyte(RTC_ADDR_DAY));
-	tm->tm_mday	= BCD2BIN(ds1302_readbyte(RTC_ADDR_DATE));
-	tm->tm_mon	= BCD2BIN(ds1302_readbyte(RTC_ADDR_MON)) - 1;
-	tm->tm_year	= BCD2BIN(ds1302_readbyte(RTC_ADDR_YEAR));
+	tm->tm_sec	= bcd2bin(ds1302_readbyte(RTC_ADDR_SEC));
+	tm->tm_min	= bcd2bin(ds1302_readbyte(RTC_ADDR_MIN));
+	tm->tm_hour	= bcd2bin(ds1302_readbyte(RTC_ADDR_HOUR));
+	tm->tm_wday	= bcd2bin(ds1302_readbyte(RTC_ADDR_DAY));
+	tm->tm_mday	= bcd2bin(ds1302_readbyte(RTC_ADDR_DATE));
+	tm->tm_mon	= bcd2bin(ds1302_readbyte(RTC_ADDR_MON)) - 1;
+	tm->tm_year	= bcd2bin(ds1302_readbyte(RTC_ADDR_YEAR));
 
 	if (tm->tm_year < 70)
 		tm->tm_year += 100;
@@ -141,13 +141,13 @@ static int ds1302_rtc_set_time(struct device *dev, struct rtc_time *tm)
 	/* Stop RTC */
 	ds1302_writebyte(RTC_ADDR_SEC, ds1302_readbyte(RTC_ADDR_SEC) | 0x80);
 
-	ds1302_writebyte(RTC_ADDR_SEC, BIN2BCD(tm->tm_sec));
-	ds1302_writebyte(RTC_ADDR_MIN, BIN2BCD(tm->tm_min));
-	ds1302_writebyte(RTC_ADDR_HOUR, BIN2BCD(tm->tm_hour));
-	ds1302_writebyte(RTC_ADDR_DAY, BIN2BCD(tm->tm_wday));
-	ds1302_writebyte(RTC_ADDR_DATE, BIN2BCD(tm->tm_mday));
-	ds1302_writebyte(RTC_ADDR_MON, BIN2BCD(tm->tm_mon + 1));
-	ds1302_writebyte(RTC_ADDR_YEAR, BIN2BCD(tm->tm_year % 100));
+	ds1302_writebyte(RTC_ADDR_SEC, bin2bcd(tm->tm_sec));
+	ds1302_writebyte(RTC_ADDR_MIN, bin2bcd(tm->tm_min));
+	ds1302_writebyte(RTC_ADDR_HOUR, bin2bcd(tm->tm_hour));
+	ds1302_writebyte(RTC_ADDR_DAY, bin2bcd(tm->tm_wday));
+	ds1302_writebyte(RTC_ADDR_DATE, bin2bcd(tm->tm_mday));
+	ds1302_writebyte(RTC_ADDR_MON, bin2bcd(tm->tm_mon + 1));
+	ds1302_writebyte(RTC_ADDR_YEAR, bin2bcd(tm->tm_year % 100));
 
 	/* Start RTC */
 	ds1302_writebyte(RTC_ADDR_SEC, ds1302_readbyte(RTC_ADDR_SEC) & ~0x80);
diff --git a/drivers/rtc/rtc-ds1305.c b/drivers/rtc/rtc-ds1305.c
index b91d02a3ace9..fc372df6534b 100644
--- a/drivers/rtc/rtc-ds1305.c
+++ b/drivers/rtc/rtc-ds1305.c
@@ -114,10 +114,10 @@ static unsigned bcd2hour(u8 bcd)
 			hour = 12;
 			bcd &= ~DS1305_HR_PM;
 		}
-		hour += BCD2BIN(bcd);
+		hour += bcd2bin(bcd);
 		return hour - 1;
 	}
-	return BCD2BIN(bcd);
+	return bcd2bin(bcd);
 }
 
 static u8 hour2bcd(bool hr12, int hour)
@@ -125,11 +125,11 @@ static u8 hour2bcd(bool hr12, int hour)
 	if (hr12) {
 		hour++;
 		if (hour <= 12)
-			return DS1305_HR_12 | BIN2BCD(hour);
+			return DS1305_HR_12 | bin2bcd(hour);
 		hour -= 12;
-		return DS1305_HR_12 | DS1305_HR_PM | BIN2BCD(hour);
+		return DS1305_HR_12 | DS1305_HR_PM | bin2bcd(hour);
 	}
-	return BIN2BCD(hour);
+	return bin2bcd(hour);
 }
 
 /*----------------------------------------------------------------------*/
@@ -206,13 +206,13 @@ static int ds1305_get_time(struct device *dev, struct rtc_time *time)
 		buf[4], buf[5], buf[6]);
 
 	/* Decode the registers */
-	time->tm_sec = BCD2BIN(buf[DS1305_SEC]);
-	time->tm_min = BCD2BIN(buf[DS1305_MIN]);
+	time->tm_sec = bcd2bin(buf[DS1305_SEC]);
+	time->tm_min = bcd2bin(buf[DS1305_MIN]);
 	time->tm_hour = bcd2hour(buf[DS1305_HOUR]);
 	time->tm_wday = buf[DS1305_WDAY] - 1;
-	time->tm_mday = BCD2BIN(buf[DS1305_MDAY]);
-	time->tm_mon = BCD2BIN(buf[DS1305_MON]) - 1;
-	time->tm_year = BCD2BIN(buf[DS1305_YEAR]) + 100;
+	time->tm_mday = bcd2bin(buf[DS1305_MDAY]);
+	time->tm_mon = bcd2bin(buf[DS1305_MON]) - 1;
+	time->tm_year = bcd2bin(buf[DS1305_YEAR]) + 100;
 
 	dev_vdbg(dev, "%s secs=%d, mins=%d, "
 		"hours=%d, mday=%d, mon=%d, year=%d, wday=%d\n",
@@ -239,13 +239,13 @@ static int ds1305_set_time(struct device *dev, struct rtc_time *time)
 	/* Write registers starting at the first time/date address. */
 	*bp++ = DS1305_WRITE | DS1305_SEC;
 
-	*bp++ = BIN2BCD(time->tm_sec);
-	*bp++ = BIN2BCD(time->tm_min);
+	*bp++ = bin2bcd(time->tm_sec);
+	*bp++ = bin2bcd(time->tm_min);
 	*bp++ = hour2bcd(ds1305->hr12, time->tm_hour);
 	*bp++ = (time->tm_wday < 7) ? (time->tm_wday + 1) : 1;
-	*bp++ = BIN2BCD(time->tm_mday);
-	*bp++ = BIN2BCD(time->tm_mon + 1);
-	*bp++ = BIN2BCD(time->tm_year - 100);
+	*bp++ = bin2bcd(time->tm_mday);
+	*bp++ = bin2bcd(time->tm_mon + 1);
+	*bp++ = bin2bcd(time->tm_year - 100);
 
 	dev_dbg(dev, "%s: %02x %02x %02x, %02x %02x %02x %02x\n",
 		"write", buf[1], buf[2], buf[3],
@@ -329,8 +329,8 @@ static int ds1305_get_alarm(struct device *dev, struct rtc_wkalrm *alm)
 	 * fill in the rest ... and also handle rollover to tomorrow when
 	 * that's needed.
 	 */
-	alm->time.tm_sec = BCD2BIN(buf[DS1305_SEC]);
-	alm->time.tm_min = BCD2BIN(buf[DS1305_MIN]);
+	alm->time.tm_sec = bcd2bin(buf[DS1305_SEC]);
+	alm->time.tm_min = bcd2bin(buf[DS1305_MIN]);
 	alm->time.tm_hour = bcd2hour(buf[DS1305_HOUR]);
 	alm->time.tm_mday = -1;
 	alm->time.tm_mon = -1;
@@ -387,8 +387,8 @@ static int ds1305_set_alarm(struct device *dev, struct rtc_wkalrm *alm)
 
 	/* write alarm */
 	buf[0] = DS1305_WRITE | DS1305_ALM0(DS1305_SEC);
-	buf[1 + DS1305_SEC] = BIN2BCD(alm->time.tm_sec);
-	buf[1 + DS1305_MIN] = BIN2BCD(alm->time.tm_min);
+	buf[1 + DS1305_SEC] = bin2bcd(alm->time.tm_sec);
+	buf[1 + DS1305_MIN] = bin2bcd(alm->time.tm_min);
 	buf[1 + DS1305_HOUR] = hour2bcd(ds1305->hr12, alm->time.tm_hour);
 	buf[1 + DS1305_WDAY] = DS1305_ALM_DISABLE;
 
@@ -606,7 +606,6 @@ ds1305_nvram_write(struct kobject *kobj, struct bin_attribute *attr,
 static struct bin_attribute nvram = {
 	.attr.name	= "nvram",
 	.attr.mode	= S_IRUGO | S_IWUSR,
-	.attr.owner	= THIS_MODULE,
 	.read		= ds1305_nvram_read,
 	.write		= ds1305_nvram_write,
 	.size		= DS1305_NVRAM_LEN,
diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c
index 4fcf0734a6ef..162330b9d1dc 100644
--- a/drivers/rtc/rtc-ds1307.c
+++ b/drivers/rtc/rtc-ds1307.c
@@ -222,17 +222,17 @@ static int ds1307_get_time(struct device *dev, struct rtc_time *t)
 			ds1307->regs[4], ds1307->regs[5],
 			ds1307->regs[6]);
 
-	t->tm_sec = BCD2BIN(ds1307->regs[DS1307_REG_SECS] & 0x7f);
-	t->tm_min = BCD2BIN(ds1307->regs[DS1307_REG_MIN] & 0x7f);
+	t->tm_sec = bcd2bin(ds1307->regs[DS1307_REG_SECS] & 0x7f);
+	t->tm_min = bcd2bin(ds1307->regs[DS1307_REG_MIN] & 0x7f);
 	tmp = ds1307->regs[DS1307_REG_HOUR] & 0x3f;
-	t->tm_hour = BCD2BIN(tmp);
-	t->tm_wday = BCD2BIN(ds1307->regs[DS1307_REG_WDAY] & 0x07) - 1;
-	t->tm_mday = BCD2BIN(ds1307->regs[DS1307_REG_MDAY] & 0x3f);
+	t->tm_hour = bcd2bin(tmp);
+	t->tm_wday = bcd2bin(ds1307->regs[DS1307_REG_WDAY] & 0x07) - 1;
+	t->tm_mday = bcd2bin(ds1307->regs[DS1307_REG_MDAY] & 0x3f);
 	tmp = ds1307->regs[DS1307_REG_MONTH] & 0x1f;
-	t->tm_mon = BCD2BIN(tmp) - 1;
+	t->tm_mon = bcd2bin(tmp) - 1;
 
 	/* assume 20YY not 19YY, and ignore DS1337_BIT_CENTURY */
-	t->tm_year = BCD2BIN(ds1307->regs[DS1307_REG_YEAR]) + 100;
+	t->tm_year = bcd2bin(ds1307->regs[DS1307_REG_YEAR]) + 100;
 
 	dev_dbg(dev, "%s secs=%d, mins=%d, "
 		"hours=%d, mday=%d, mon=%d, year=%d, wday=%d\n",
@@ -258,16 +258,16 @@ static int ds1307_set_time(struct device *dev, struct rtc_time *t)
 		t->tm_mon, t->tm_year, t->tm_wday);
 
 	*buf++ = 0;		/* first register addr */
-	buf[DS1307_REG_SECS] = BIN2BCD(t->tm_sec);
-	buf[DS1307_REG_MIN] = BIN2BCD(t->tm_min);
-	buf[DS1307_REG_HOUR] = BIN2BCD(t->tm_hour);
-	buf[DS1307_REG_WDAY] = BIN2BCD(t->tm_wday + 1);
-	buf[DS1307_REG_MDAY] = BIN2BCD(t->tm_mday);
-	buf[DS1307_REG_MONTH] = BIN2BCD(t->tm_mon + 1);
+	buf[DS1307_REG_SECS] = bin2bcd(t->tm_sec);
+	buf[DS1307_REG_MIN] = bin2bcd(t->tm_min);
+	buf[DS1307_REG_HOUR] = bin2bcd(t->tm_hour);
+	buf[DS1307_REG_WDAY] = bin2bcd(t->tm_wday + 1);
+	buf[DS1307_REG_MDAY] = bin2bcd(t->tm_mday);
+	buf[DS1307_REG_MONTH] = bin2bcd(t->tm_mon + 1);
 
 	/* assume 20YY not 19YY */
 	tmp = t->tm_year - 100;
-	buf[DS1307_REG_YEAR] = BIN2BCD(tmp);
+	buf[DS1307_REG_YEAR] = bin2bcd(tmp);
 
 	switch (ds1307->type) {
 	case ds_1337:
@@ -551,7 +551,6 @@ static struct bin_attribute nvram = {
 	.attr = {
 		.name	= "nvram",
 		.mode	= S_IRUGO | S_IWUSR,
-		.owner	= THIS_MODULE,
 	},
 
 	.read	= ds1307_nvram_read,
@@ -709,18 +708,18 @@ read_rtc:
 	}
 
 	tmp = ds1307->regs[DS1307_REG_SECS];
-	tmp = BCD2BIN(tmp & 0x7f);
+	tmp = bcd2bin(tmp & 0x7f);
 	if (tmp > 60)
 		goto exit_bad;
-	tmp = BCD2BIN(ds1307->regs[DS1307_REG_MIN] & 0x7f);
+	tmp = bcd2bin(ds1307->regs[DS1307_REG_MIN] & 0x7f);
 	if (tmp > 60)
 		goto exit_bad;
 
-	tmp = BCD2BIN(ds1307->regs[DS1307_REG_MDAY] & 0x3f);
+	tmp = bcd2bin(ds1307->regs[DS1307_REG_MDAY] & 0x3f);
 	if (tmp == 0 || tmp > 31)
 		goto exit_bad;
 
-	tmp = BCD2BIN(ds1307->regs[DS1307_REG_MONTH] & 0x1f);
+	tmp = bcd2bin(ds1307->regs[DS1307_REG_MONTH] & 0x1f);
 	if (tmp == 0 || tmp > 12)
 		goto exit_bad;
 
@@ -739,14 +738,14 @@ read_rtc:
 		/* Be sure we're in 24 hour mode.  Multi-master systems
 		 * take note...
 		 */
-		tmp = BCD2BIN(tmp & 0x1f);
+		tmp = bcd2bin(tmp & 0x1f);
 		if (tmp == 12)
 			tmp = 0;
 		if (ds1307->regs[DS1307_REG_HOUR] & DS1307_BIT_PM)
 			tmp += 12;
 		i2c_smbus_write_byte_data(client,
 				DS1307_REG_HOUR,
-				BIN2BCD(tmp));
+				bin2bcd(tmp));
 	}
 
 	ds1307->rtc = rtc_device_register(client->name, &client->dev,
diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c
index 86981d34fbb6..25caada78398 100644
--- a/drivers/rtc/rtc-ds1511.c
+++ b/drivers/rtc/rtc-ds1511.c
@@ -153,8 +153,8 @@ ds1511_wdog_set(unsigned long deciseconds)
 	/*
 	 * set the wdog values in the wdog registers
 	 */
-	rtc_write(BIN2BCD(deciseconds % 100), DS1511_WD_MSEC);
-	rtc_write(BIN2BCD(deciseconds / 100), DS1511_WD_SEC);
+	rtc_write(bin2bcd(deciseconds % 100), DS1511_WD_MSEC);
+	rtc_write(bin2bcd(deciseconds / 100), DS1511_WD_SEC);
 	/*
 	 * set wdog enable and wdog 'steering' bit to issue a reset
 	 */
@@ -220,13 +220,13 @@ static int ds1511_rtc_set_time(struct device *dev, struct rtc_time *rtc_tm)
 	/*
 	 * each register is a different number of valid bits
 	 */
-	sec = BIN2BCD(sec) & 0x7f;
-	min = BIN2BCD(min) & 0x7f;
-	hrs = BIN2BCD(hrs) & 0x3f;
-	day = BIN2BCD(day) & 0x3f;
-	mon = BIN2BCD(mon) & 0x1f;
-	yrs = BIN2BCD(yrs) & 0xff;
-	cen = BIN2BCD(cen) & 0xff;
+	sec = bin2bcd(sec) & 0x7f;
+	min = bin2bcd(min) & 0x7f;
+	hrs = bin2bcd(hrs) & 0x3f;
+	day = bin2bcd(day) & 0x3f;
+	mon = bin2bcd(mon) & 0x1f;
+	yrs = bin2bcd(yrs) & 0xff;
+	cen = bin2bcd(cen) & 0xff;
 
 	spin_lock_irqsave(&ds1511_lock, flags);
 	rtc_disable_update();
@@ -264,14 +264,14 @@ static int ds1511_rtc_read_time(struct device *dev, struct rtc_time *rtc_tm)
 	rtc_enable_update();
 	spin_unlock_irqrestore(&ds1511_lock, flags);
 
-	rtc_tm->tm_sec = BCD2BIN(rtc_tm->tm_sec);
-	rtc_tm->tm_min = BCD2BIN(rtc_tm->tm_min);
-	rtc_tm->tm_hour = BCD2BIN(rtc_tm->tm_hour);
-	rtc_tm->tm_mday = BCD2BIN(rtc_tm->tm_mday);
-	rtc_tm->tm_wday = BCD2BIN(rtc_tm->tm_wday);
-	rtc_tm->tm_mon = BCD2BIN(rtc_tm->tm_mon);
-	rtc_tm->tm_year = BCD2BIN(rtc_tm->tm_year);
-	century = BCD2BIN(century) * 100;
+	rtc_tm->tm_sec = bcd2bin(rtc_tm->tm_sec);
+	rtc_tm->tm_min = bcd2bin(rtc_tm->tm_min);
+	rtc_tm->tm_hour = bcd2bin(rtc_tm->tm_hour);
+	rtc_tm->tm_mday = bcd2bin(rtc_tm->tm_mday);
+	rtc_tm->tm_wday = bcd2bin(rtc_tm->tm_wday);
+	rtc_tm->tm_mon = bcd2bin(rtc_tm->tm_mon);
+	rtc_tm->tm_year = bcd2bin(rtc_tm->tm_year);
+	century = bcd2bin(century) * 100;
 
 	/*
 	 * Account for differences between how the RTC uses the values
@@ -304,16 +304,16 @@ ds1511_rtc_update_alarm(struct rtc_plat_data *pdata)
 
 	spin_lock_irqsave(&pdata->rtc->irq_lock, flags);
 	rtc_write(pdata->alrm_mday < 0 || (pdata->irqen & RTC_UF) ?
-	       0x80 : BIN2BCD(pdata->alrm_mday) & 0x3f,
+	       0x80 : bin2bcd(pdata->alrm_mday) & 0x3f,
 	       RTC_ALARM_DATE);
 	rtc_write(pdata->alrm_hour < 0 || (pdata->irqen & RTC_UF) ?
-	       0x80 : BIN2BCD(pdata->alrm_hour) & 0x3f,
+	       0x80 : bin2bcd(pdata->alrm_hour) & 0x3f,
 	       RTC_ALARM_HOUR);
 	rtc_write(pdata->alrm_min < 0 || (pdata->irqen & RTC_UF) ?
-	       0x80 : BIN2BCD(pdata->alrm_min) & 0x7f,
+	       0x80 : bin2bcd(pdata->alrm_min) & 0x7f,
 	       RTC_ALARM_MIN);
 	rtc_write(pdata->alrm_sec < 0 || (pdata->irqen & RTC_UF) ?
-	       0x80 : BIN2BCD(pdata->alrm_sec) & 0x7f,
+	       0x80 : bin2bcd(pdata->alrm_sec) & 0x7f,
 	       RTC_ALARM_SEC);
 	rtc_write(rtc_read(RTC_CMD) | (pdata->irqen ? RTC_TIE : 0), RTC_CMD);
 	rtc_read(RTC_CMD1);	/* clear interrupts */
@@ -481,7 +481,6 @@ static struct bin_attribute ds1511_nvram_attr = {
 	.attr = {
 		.name = "nvram",
 		.mode = S_IRUGO | S_IWUGO,
-		.owner = THIS_MODULE,
 	},
 	.size = DS1511_RAM_MAX,
 	.read = ds1511_nvram_read,
diff --git a/drivers/rtc/rtc-ds1553.c b/drivers/rtc/rtc-ds1553.c
index 4ef59285b489..b9475cd20210 100644
--- a/drivers/rtc/rtc-ds1553.c
+++ b/drivers/rtc/rtc-ds1553.c
@@ -78,17 +78,17 @@ static int ds1553_rtc_set_time(struct device *dev, struct rtc_time *tm)
 	void __iomem *ioaddr = pdata->ioaddr;
 	u8 century;
 
-	century = BIN2BCD((tm->tm_year + 1900) / 100);
+	century = bin2bcd((tm->tm_year + 1900) / 100);
 
 	writeb(RTC_WRITE, pdata->ioaddr + RTC_CONTROL);
 
-	writeb(BIN2BCD(tm->tm_year % 100), ioaddr + RTC_YEAR);
-	writeb(BIN2BCD(tm->tm_mon + 1), ioaddr + RTC_MONTH);
-	writeb(BIN2BCD(tm->tm_wday) & RTC_DAY_MASK, ioaddr + RTC_DAY);
-	writeb(BIN2BCD(tm->tm_mday), ioaddr + RTC_DATE);
-	writeb(BIN2BCD(tm->tm_hour), ioaddr + RTC_HOURS);
-	writeb(BIN2BCD(tm->tm_min), ioaddr + RTC_MINUTES);
-	writeb(BIN2BCD(tm->tm_sec) & RTC_SECONDS_MASK, ioaddr + RTC_SECONDS);
+	writeb(bin2bcd(tm->tm_year % 100), ioaddr + RTC_YEAR);
+	writeb(bin2bcd(tm->tm_mon + 1), ioaddr + RTC_MONTH);
+	writeb(bin2bcd(tm->tm_wday) & RTC_DAY_MASK, ioaddr + RTC_DAY);
+	writeb(bin2bcd(tm->tm_mday), ioaddr + RTC_DATE);
+	writeb(bin2bcd(tm->tm_hour), ioaddr + RTC_HOURS);
+	writeb(bin2bcd(tm->tm_min), ioaddr + RTC_MINUTES);
+	writeb(bin2bcd(tm->tm_sec) & RTC_SECONDS_MASK, ioaddr + RTC_SECONDS);
 
 	/* RTC_CENTURY and RTC_CONTROL share same register */
 	writeb(RTC_WRITE | (century & RTC_CENTURY_MASK), ioaddr + RTC_CENTURY);
@@ -118,14 +118,14 @@ static int ds1553_rtc_read_time(struct device *dev, struct rtc_time *tm)
 	year = readb(ioaddr + RTC_YEAR);
 	century = readb(ioaddr + RTC_CENTURY) & RTC_CENTURY_MASK;
 	writeb(0, ioaddr + RTC_CONTROL);
-	tm->tm_sec = BCD2BIN(second);
-	tm->tm_min = BCD2BIN(minute);
-	tm->tm_hour = BCD2BIN(hour);
-	tm->tm_mday = BCD2BIN(day);
-	tm->tm_wday = BCD2BIN(week);
-	tm->tm_mon = BCD2BIN(month) - 1;
+	tm->tm_sec = bcd2bin(second);
+	tm->tm_min = bcd2bin(minute);
+	tm->tm_hour = bcd2bin(hour);
+	tm->tm_mday = bcd2bin(day);
+	tm->tm_wday = bcd2bin(week);
+	tm->tm_mon = bcd2bin(month) - 1;
 	/* year is 1900 + tm->tm_year */
-	tm->tm_year = BCD2BIN(year) + BCD2BIN(century) * 100 - 1900;
+	tm->tm_year = bcd2bin(year) + bcd2bin(century) * 100 - 1900;
 
 	if (rtc_valid_tm(tm) < 0) {
 		dev_err(dev, "retrieved date/time is not valid.\n");
@@ -141,16 +141,16 @@ static void ds1553_rtc_update_alarm(struct rtc_plat_data *pdata)
 
 	spin_lock_irqsave(&pdata->rtc->irq_lock, flags);
 	writeb(pdata->alrm_mday < 0 || (pdata->irqen & RTC_UF) ?
-	       0x80 : BIN2BCD(pdata->alrm_mday),
+	       0x80 : bin2bcd(pdata->alrm_mday),
 	       ioaddr + RTC_DATE_ALARM);
 	writeb(pdata->alrm_hour < 0 || (pdata->irqen & RTC_UF) ?
-	       0x80 : BIN2BCD(pdata->alrm_hour),
+	       0x80 : bin2bcd(pdata->alrm_hour),
 	       ioaddr + RTC_HOURS_ALARM);
 	writeb(pdata->alrm_min < 0 || (pdata->irqen & RTC_UF) ?
-	       0x80 : BIN2BCD(pdata->alrm_min),
+	       0x80 : bin2bcd(pdata->alrm_min),
 	       ioaddr + RTC_MINUTES_ALARM);
 	writeb(pdata->alrm_sec < 0 || (pdata->irqen & RTC_UF) ?
-	       0x80 : BIN2BCD(pdata->alrm_sec),
+	       0x80 : bin2bcd(pdata->alrm_sec),
 	       ioaddr + RTC_SECONDS_ALARM);
 	writeb(pdata->irqen ? RTC_INTS_AE : 0, ioaddr + RTC_INTERRUPTS);
 	readb(ioaddr + RTC_FLAGS);	/* clear interrupts */
diff --git a/drivers/rtc/rtc-ds1742.c b/drivers/rtc/rtc-ds1742.c
index 24d35ede2dbf..8bc8501bffc8 100644
--- a/drivers/rtc/rtc-ds1742.c
+++ b/drivers/rtc/rtc-ds1742.c
@@ -66,17 +66,17 @@ static int ds1742_rtc_set_time(struct device *dev, struct rtc_time *tm)
 	void __iomem *ioaddr = pdata->ioaddr_rtc;
 	u8 century;
 
-	century = BIN2BCD((tm->tm_year + 1900) / 100);
+	century = bin2bcd((tm->tm_year + 1900) / 100);
 
 	writeb(RTC_WRITE, ioaddr + RTC_CONTROL);
 
-	writeb(BIN2BCD(tm->tm_year % 100), ioaddr + RTC_YEAR);
-	writeb(BIN2BCD(tm->tm_mon + 1), ioaddr + RTC_MONTH);
-	writeb(BIN2BCD(tm->tm_wday) & RTC_DAY_MASK, ioaddr + RTC_DAY);
-	writeb(BIN2BCD(tm->tm_mday), ioaddr + RTC_DATE);
-	writeb(BIN2BCD(tm->tm_hour), ioaddr + RTC_HOURS);
-	writeb(BIN2BCD(tm->tm_min), ioaddr + RTC_MINUTES);
-	writeb(BIN2BCD(tm->tm_sec) & RTC_SECONDS_MASK, ioaddr + RTC_SECONDS);
+	writeb(bin2bcd(tm->tm_year % 100), ioaddr + RTC_YEAR);
+	writeb(bin2bcd(tm->tm_mon + 1), ioaddr + RTC_MONTH);
+	writeb(bin2bcd(tm->tm_wday) & RTC_DAY_MASK, ioaddr + RTC_DAY);
+	writeb(bin2bcd(tm->tm_mday), ioaddr + RTC_DATE);
+	writeb(bin2bcd(tm->tm_hour), ioaddr + RTC_HOURS);
+	writeb(bin2bcd(tm->tm_min), ioaddr + RTC_MINUTES);
+	writeb(bin2bcd(tm->tm_sec) & RTC_SECONDS_MASK, ioaddr + RTC_SECONDS);
 
 	/* RTC_CENTURY and RTC_CONTROL share same register */
 	writeb(RTC_WRITE | (century & RTC_CENTURY_MASK), ioaddr + RTC_CENTURY);
@@ -106,14 +106,14 @@ static int ds1742_rtc_read_time(struct device *dev, struct rtc_time *tm)
 	year = readb(ioaddr + RTC_YEAR);
 	century = readb(ioaddr + RTC_CENTURY) & RTC_CENTURY_MASK;
 	writeb(0, ioaddr + RTC_CONTROL);
-	tm->tm_sec = BCD2BIN(second);
-	tm->tm_min = BCD2BIN(minute);
-	tm->tm_hour = BCD2BIN(hour);
-	tm->tm_mday = BCD2BIN(day);
-	tm->tm_wday = BCD2BIN(week);
-	tm->tm_mon = BCD2BIN(month) - 1;
+	tm->tm_sec = bcd2bin(second);
+	tm->tm_min = bcd2bin(minute);
+	tm->tm_hour = bcd2bin(hour);
+	tm->tm_mday = bcd2bin(day);
+	tm->tm_wday = bcd2bin(week);
+	tm->tm_mon = bcd2bin(month) - 1;
 	/* year is 1900 + tm->tm_year */
-	tm->tm_year = BCD2BIN(year) + BCD2BIN(century) * 100 - 1900;
+	tm->tm_year = bcd2bin(year) + bcd2bin(century) * 100 - 1900;
 
 	if (rtc_valid_tm(tm) < 0) {
 		dev_err(dev, "retrieved date/time is not valid.\n");
diff --git a/drivers/rtc/rtc-fm3130.c b/drivers/rtc/rtc-fm3130.c
index abfdfcbaa059..3a7be11cc6b9 100644
--- a/drivers/rtc/rtc-fm3130.c
+++ b/drivers/rtc/rtc-fm3130.c
@@ -131,17 +131,17 @@ static int fm3130_get_time(struct device *dev, struct rtc_time *t)
 			fm3130->regs[0xc], fm3130->regs[0xd],
 			fm3130->regs[0xe]);
 
-	t->tm_sec = BCD2BIN(fm3130->regs[FM3130_RTC_SECONDS] & 0x7f);
-	t->tm_min = BCD2BIN(fm3130->regs[FM3130_RTC_MINUTES] & 0x7f);
+	t->tm_sec = bcd2bin(fm3130->regs[FM3130_RTC_SECONDS] & 0x7f);
+	t->tm_min = bcd2bin(fm3130->regs[FM3130_RTC_MINUTES] & 0x7f);
 	tmp = fm3130->regs[FM3130_RTC_HOURS] & 0x3f;
-	t->tm_hour = BCD2BIN(tmp);
-	t->tm_wday = BCD2BIN(fm3130->regs[FM3130_RTC_DAY] & 0x07) - 1;
-	t->tm_mday = BCD2BIN(fm3130->regs[FM3130_RTC_DATE] & 0x3f);
+	t->tm_hour = bcd2bin(tmp);
+	t->tm_wday = bcd2bin(fm3130->regs[FM3130_RTC_DAY] & 0x07) - 1;
+	t->tm_mday = bcd2bin(fm3130->regs[FM3130_RTC_DATE] & 0x3f);
 	tmp = fm3130->regs[FM3130_RTC_MONTHS] & 0x1f;
-	t->tm_mon = BCD2BIN(tmp) - 1;
+	t->tm_mon = bcd2bin(tmp) - 1;
 
 	/* assume 20YY not 19YY, and ignore CF bit */
-	t->tm_year = BCD2BIN(fm3130->regs[FM3130_RTC_YEARS]) + 100;
+	t->tm_year = bcd2bin(fm3130->regs[FM3130_RTC_YEARS]) + 100;
 
 	dev_dbg(dev, "%s secs=%d, mins=%d, "
 		"hours=%d, mday=%d, mon=%d, year=%d, wday=%d\n",
@@ -167,16 +167,16 @@ static int fm3130_set_time(struct device *dev, struct rtc_time *t)
 		t->tm_mon, t->tm_year, t->tm_wday);
 
 	/* first register addr */
-	buf[FM3130_RTC_SECONDS] = BIN2BCD(t->tm_sec);
-	buf[FM3130_RTC_MINUTES] = BIN2BCD(t->tm_min);
-	buf[FM3130_RTC_HOURS] = BIN2BCD(t->tm_hour);
-	buf[FM3130_RTC_DAY] = BIN2BCD(t->tm_wday + 1);
-	buf[FM3130_RTC_DATE] = BIN2BCD(t->tm_mday);
-	buf[FM3130_RTC_MONTHS] = BIN2BCD(t->tm_mon + 1);
+	buf[FM3130_RTC_SECONDS] = bin2bcd(t->tm_sec);
+	buf[FM3130_RTC_MINUTES] = bin2bcd(t->tm_min);
+	buf[FM3130_RTC_HOURS] = bin2bcd(t->tm_hour);
+	buf[FM3130_RTC_DAY] = bin2bcd(t->tm_wday + 1);
+	buf[FM3130_RTC_DATE] = bin2bcd(t->tm_mday);
+	buf[FM3130_RTC_MONTHS] = bin2bcd(t->tm_mon + 1);
 
 	/* assume 20YY not 19YY */
 	tmp = t->tm_year - 100;
-	buf[FM3130_RTC_YEARS] = BIN2BCD(tmp);
+	buf[FM3130_RTC_YEARS] = bin2bcd(tmp);
 
 	dev_dbg(dev, "%s: %02x %02x %02x %02x %02x %02x %02x"
 		"%02x %02x %02x %02x %02x %02x %02x %02x\n",
@@ -222,11 +222,11 @@ static int fm3130_read_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 			fm3130->regs[FM3130_ALARM_MONTHS]);
 
 
-	tm->tm_sec	= BCD2BIN(fm3130->regs[FM3130_ALARM_SECONDS] & 0x7F);
-	tm->tm_min	= BCD2BIN(fm3130->regs[FM3130_ALARM_MINUTES] & 0x7F);
-	tm->tm_hour	= BCD2BIN(fm3130->regs[FM3130_ALARM_HOURS] & 0x3F);
-	tm->tm_mday	= BCD2BIN(fm3130->regs[FM3130_ALARM_DATE] & 0x3F);
-	tm->tm_mon	= BCD2BIN(fm3130->regs[FM3130_ALARM_MONTHS] & 0x1F);
+	tm->tm_sec	= bcd2bin(fm3130->regs[FM3130_ALARM_SECONDS] & 0x7F);
+	tm->tm_min	= bcd2bin(fm3130->regs[FM3130_ALARM_MINUTES] & 0x7F);
+	tm->tm_hour	= bcd2bin(fm3130->regs[FM3130_ALARM_HOURS] & 0x3F);
+	tm->tm_mday	= bcd2bin(fm3130->regs[FM3130_ALARM_DATE] & 0x3F);
+	tm->tm_mon	= bcd2bin(fm3130->regs[FM3130_ALARM_MONTHS] & 0x1F);
 	if (tm->tm_mon > 0)
 		tm->tm_mon -= 1; /* RTC is 1-12, tm_mon is 0-11 */
 	dev_dbg(dev, "%s secs=%d, mins=%d, "
@@ -252,23 +252,23 @@ static int fm3130_set_alarm(struct device *dev, struct rtc_wkalrm *alrm)
 
 	if (tm->tm_sec != -1)
 		fm3130->regs[FM3130_ALARM_SECONDS] =
-			BIN2BCD(tm->tm_sec) | 0x80;
+			bin2bcd(tm->tm_sec) | 0x80;
 
 	if (tm->tm_min != -1)
 		fm3130->regs[FM3130_ALARM_MINUTES] =
-			BIN2BCD(tm->tm_min) | 0x80;
+			bin2bcd(tm->tm_min) | 0x80;
 
 	if (tm->tm_hour != -1)
 		fm3130->regs[FM3130_ALARM_HOURS] =
-			BIN2BCD(tm->tm_hour) | 0x80;
+			bin2bcd(tm->tm_hour) | 0x80;
 
 	if (tm->tm_mday != -1)
 		fm3130->regs[FM3130_ALARM_DATE] =
-			BIN2BCD(tm->tm_mday) | 0x80;
+			bin2bcd(tm->tm_mday) | 0x80;
 
 	if (tm->tm_mon != -1)
 		fm3130->regs[FM3130_ALARM_MONTHS] =
-			BIN2BCD(tm->tm_mon + 1) | 0x80;
+			bin2bcd(tm->tm_mon + 1) | 0x80;
 
 	dev_dbg(dev, "alarm write %02x %02x %02x %02x %02x\n",
 			fm3130->regs[FM3130_ALARM_SECONDS],
@@ -414,18 +414,18 @@ static int __devinit fm3130_probe(struct i2c_client *client,
 	/* TODO */
 	/* TODO need to sanity check alarm */
 	tmp = fm3130->regs[FM3130_RTC_SECONDS];
-	tmp = BCD2BIN(tmp & 0x7f);
+	tmp = bcd2bin(tmp & 0x7f);
 	if (tmp > 60)
 		goto exit_bad;
-	tmp = BCD2BIN(fm3130->regs[FM3130_RTC_MINUTES] & 0x7f);
+	tmp = bcd2bin(fm3130->regs[FM3130_RTC_MINUTES] & 0x7f);
 	if (tmp > 60)
 		goto exit_bad;
 
-	tmp = BCD2BIN(fm3130->regs[FM3130_RTC_DATE] & 0x3f);
+	tmp = bcd2bin(fm3130->regs[FM3130_RTC_DATE] & 0x3f);
 	if (tmp == 0 || tmp > 31)
 		goto exit_bad;
 
-	tmp = BCD2BIN(fm3130->regs[FM3130_RTC_MONTHS] & 0x1f);
+	tmp = bcd2bin(fm3130->regs[FM3130_RTC_MONTHS] & 0x1f);
 	if (tmp == 0 || tmp > 12)
 		goto exit_bad;
 
diff --git a/drivers/rtc/rtc-isl1208.c b/drivers/rtc/rtc-isl1208.c
index a81adab6e515..2cd77ab8fc66 100644
--- a/drivers/rtc/rtc-isl1208.c
+++ b/drivers/rtc/rtc-isl1208.c
@@ -259,26 +259,26 @@ isl1208_i2c_read_time(struct i2c_client *client, struct rtc_time *tm)
 		return sr;
 	}
 
-	tm->tm_sec = BCD2BIN(regs[ISL1208_REG_SC]);
-	tm->tm_min = BCD2BIN(regs[ISL1208_REG_MN]);
+	tm->tm_sec = bcd2bin(regs[ISL1208_REG_SC]);
+	tm->tm_min = bcd2bin(regs[ISL1208_REG_MN]);
 
 	/* HR field has a more complex interpretation */
 	{
 		const u8 _hr = regs[ISL1208_REG_HR];
 		if (_hr & ISL1208_REG_HR_MIL)	/* 24h format */
-			tm->tm_hour = BCD2BIN(_hr & 0x3f);
+			tm->tm_hour = bcd2bin(_hr & 0x3f);
 		else {
 			/* 12h format */
-			tm->tm_hour = BCD2BIN(_hr & 0x1f);
+			tm->tm_hour = bcd2bin(_hr & 0x1f);
 			if (_hr & ISL1208_REG_HR_PM)	/* PM flag set */
 				tm->tm_hour += 12;
 		}
 	}
 
-	tm->tm_mday = BCD2BIN(regs[ISL1208_REG_DT]);
-	tm->tm_mon = BCD2BIN(regs[ISL1208_REG_MO]) - 1;	/* rtc starts at 1 */
-	tm->tm_year = BCD2BIN(regs[ISL1208_REG_YR]) + 100;
-	tm->tm_wday = BCD2BIN(regs[ISL1208_REG_DW]);
+	tm->tm_mday = bcd2bin(regs[ISL1208_REG_DT]);
+	tm->tm_mon = bcd2bin(regs[ISL1208_REG_MO]) - 1;	/* rtc starts at 1 */
+	tm->tm_year = bcd2bin(regs[ISL1208_REG_YR]) + 100;
+	tm->tm_wday = bcd2bin(regs[ISL1208_REG_DW]);
 
 	return 0;
 }
@@ -305,13 +305,13 @@ isl1208_i2c_read_alarm(struct i2c_client *client, struct rtc_wkalrm *alarm)
 	}
 
 	/* MSB of each alarm register is an enable bit */
-	tm->tm_sec = BCD2BIN(regs[ISL1208_REG_SCA - ISL1208_REG_SCA] & 0x7f);
-	tm->tm_min = BCD2BIN(regs[ISL1208_REG_MNA - ISL1208_REG_SCA] & 0x7f);
-	tm->tm_hour = BCD2BIN(regs[ISL1208_REG_HRA - ISL1208_REG_SCA] & 0x3f);
-	tm->tm_mday = BCD2BIN(regs[ISL1208_REG_DTA - ISL1208_REG_SCA] & 0x3f);
+	tm->tm_sec = bcd2bin(regs[ISL1208_REG_SCA - ISL1208_REG_SCA] & 0x7f);
+	tm->tm_min = bcd2bin(regs[ISL1208_REG_MNA - ISL1208_REG_SCA] & 0x7f);
+	tm->tm_hour = bcd2bin(regs[ISL1208_REG_HRA - ISL1208_REG_SCA] & 0x3f);
+	tm->tm_mday = bcd2bin(regs[ISL1208_REG_DTA - ISL1208_REG_SCA] & 0x3f);
 	tm->tm_mon =
-		BCD2BIN(regs[ISL1208_REG_MOA - ISL1208_REG_SCA] & 0x1f) - 1;
-	tm->tm_wday = BCD2BIN(regs[ISL1208_REG_DWA - ISL1208_REG_SCA] & 0x03);
+		bcd2bin(regs[ISL1208_REG_MOA - ISL1208_REG_SCA] & 0x1f) - 1;
+	tm->tm_wday = bcd2bin(regs[ISL1208_REG_DWA - ISL1208_REG_SCA] & 0x03);
 
 	return 0;
 }
@@ -328,15 +328,15 @@ isl1208_i2c_set_time(struct i2c_client *client, struct rtc_time const *tm)
 	int sr;
 	u8 regs[ISL1208_RTC_SECTION_LEN] = { 0, };
 
-	regs[ISL1208_REG_SC] = BIN2BCD(tm->tm_sec);
-	regs[ISL1208_REG_MN] = BIN2BCD(tm->tm_min);
-	regs[ISL1208_REG_HR] = BIN2BCD(tm->tm_hour) | ISL1208_REG_HR_MIL;
+	regs[ISL1208_REG_SC] = bin2bcd(tm->tm_sec);
+	regs[ISL1208_REG_MN] = bin2bcd(tm->tm_min);
+	regs[ISL1208_REG_HR] = bin2bcd(tm->tm_hour) | ISL1208_REG_HR_MIL;
 
-	regs[ISL1208_REG_DT] = BIN2BCD(tm->tm_mday);
-	regs[ISL1208_REG_MO] = BIN2BCD(tm->tm_mon + 1);
-	regs[ISL1208_REG_YR] = BIN2BCD(tm->tm_year - 100);
+	regs[ISL1208_REG_DT] = bin2bcd(tm->tm_mday);
+	regs[ISL1208_REG_MO] = bin2bcd(tm->tm_mon + 1);
+	regs[ISL1208_REG_YR] = bin2bcd(tm->tm_year - 100);
 
-	regs[ISL1208_REG_DW] = BIN2BCD(tm->tm_wday & 7);
+	regs[ISL1208_REG_DW] = bin2bcd(tm->tm_wday & 7);
 
 	sr = isl1208_i2c_get_sr(client);
 	if (sr < 0) {
diff --git a/drivers/rtc/rtc-m41t80.c b/drivers/rtc/rtc-m41t80.c
index 470fb2d29545..893f7dece239 100644
--- a/drivers/rtc/rtc-m41t80.c
+++ b/drivers/rtc/rtc-m41t80.c
@@ -110,15 +110,15 @@ static int m41t80_get_datetime(struct i2c_client *client,
 		return -EIO;
 	}
 
-	tm->tm_sec = BCD2BIN(buf[M41T80_REG_SEC] & 0x7f);
-	tm->tm_min = BCD2BIN(buf[M41T80_REG_MIN] & 0x7f);
-	tm->tm_hour = BCD2BIN(buf[M41T80_REG_HOUR] & 0x3f);
-	tm->tm_mday = BCD2BIN(buf[M41T80_REG_DAY] & 0x3f);
+	tm->tm_sec = bcd2bin(buf[M41T80_REG_SEC] & 0x7f);
+	tm->tm_min = bcd2bin(buf[M41T80_REG_MIN] & 0x7f);
+	tm->tm_hour = bcd2bin(buf[M41T80_REG_HOUR] & 0x3f);
+	tm->tm_mday = bcd2bin(buf[M41T80_REG_DAY] & 0x3f);
 	tm->tm_wday = buf[M41T80_REG_WDAY] & 0x07;
-	tm->tm_mon = BCD2BIN(buf[M41T80_REG_MON] & 0x1f) - 1;
+	tm->tm_mon = bcd2bin(buf[M41T80_REG_MON] & 0x1f) - 1;
 
 	/* assume 20YY not 19YY, and ignore the Century Bit */
-	tm->tm_year = BCD2BIN(buf[M41T80_REG_YEAR]) + 100;
+	tm->tm_year = bcd2bin(buf[M41T80_REG_YEAR]) + 100;
 	return 0;
 }
 
@@ -161,19 +161,19 @@ static int m41t80_set_datetime(struct i2c_client *client, struct rtc_time *tm)
 	/* Merge time-data and register flags into buf[0..7] */
 	buf[M41T80_REG_SSEC] = 0;
 	buf[M41T80_REG_SEC] =
-		BIN2BCD(tm->tm_sec) | (buf[M41T80_REG_SEC] & ~0x7f);
+		bin2bcd(tm->tm_sec) | (buf[M41T80_REG_SEC] & ~0x7f);
 	buf[M41T80_REG_MIN] =
-		BIN2BCD(tm->tm_min) | (buf[M41T80_REG_MIN] & ~0x7f);
+		bin2bcd(tm->tm_min) | (buf[M41T80_REG_MIN] & ~0x7f);
 	buf[M41T80_REG_HOUR] =
-		BIN2BCD(tm->tm_hour) | (buf[M41T80_REG_HOUR] & ~0x3f) ;
+		bin2bcd(tm->tm_hour) | (buf[M41T80_REG_HOUR] & ~0x3f) ;
 	buf[M41T80_REG_WDAY] =
 		(tm->tm_wday & 0x07) | (buf[M41T80_REG_WDAY] & ~0x07);
 	buf[M41T80_REG_DAY] =
-		BIN2BCD(tm->tm_mday) | (buf[M41T80_REG_DAY] & ~0x3f);
+		bin2bcd(tm->tm_mday) | (buf[M41T80_REG_DAY] & ~0x3f);
 	buf[M41T80_REG_MON] =
-		BIN2BCD(tm->tm_mon + 1) | (buf[M41T80_REG_MON] & ~0x1f);
+		bin2bcd(tm->tm_mon + 1) | (buf[M41T80_REG_MON] & ~0x1f);
 	/* assume 20YY not 19YY */
-	buf[M41T80_REG_YEAR] = BIN2BCD(tm->tm_year % 100);
+	buf[M41T80_REG_YEAR] = bin2bcd(tm->tm_year % 100);
 
 	if (i2c_transfer(client->adapter, msgs, 1) != 1) {
 		dev_err(&client->dev, "write error\n");
@@ -288,15 +288,15 @@ static int m41t80_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *t)
 
 	wbuf[0] = M41T80_REG_ALARM_MON; /* offset into rtc's regs */
 	reg[M41T80_REG_ALARM_SEC] |= t->time.tm_sec >= 0 ?
-		BIN2BCD(t->time.tm_sec) : 0x80;
+		bin2bcd(t->time.tm_sec) : 0x80;
 	reg[M41T80_REG_ALARM_MIN] |= t->time.tm_min >= 0 ?
-		BIN2BCD(t->time.tm_min) : 0x80;
+		bin2bcd(t->time.tm_min) : 0x80;
 	reg[M41T80_REG_ALARM_HOUR] |= t->time.tm_hour >= 0 ?
-		BIN2BCD(t->time.tm_hour) : 0x80;
+		bin2bcd(t->time.tm_hour) : 0x80;
 	reg[M41T80_REG_ALARM_DAY] |= t->time.tm_mday >= 0 ?
-		BIN2BCD(t->time.tm_mday) : 0x80;
+		bin2bcd(t->time.tm_mday) : 0x80;
 	if (t->time.tm_mon >= 0)
-		reg[M41T80_REG_ALARM_MON] |= BIN2BCD(t->time.tm_mon + 1);
+		reg[M41T80_REG_ALARM_MON] |= bin2bcd(t->time.tm_mon + 1);
 	else
 		reg[M41T80_REG_ALARM_DAY] |= 0x40;
 
@@ -347,15 +347,15 @@ static int m41t80_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *t)
 	t->time.tm_mday = -1;
 	t->time.tm_mon = -1;
 	if (!(reg[M41T80_REG_ALARM_SEC] & 0x80))
-		t->time.tm_sec = BCD2BIN(reg[M41T80_REG_ALARM_SEC] & 0x7f);
+		t->time.tm_sec = bcd2bin(reg[M41T80_REG_ALARM_SEC] & 0x7f);
 	if (!(reg[M41T80_REG_ALARM_MIN] & 0x80))
-		t->time.tm_min = BCD2BIN(reg[M41T80_REG_ALARM_MIN] & 0x7f);
+		t->time.tm_min = bcd2bin(reg[M41T80_REG_ALARM_MIN] & 0x7f);
 	if (!(reg[M41T80_REG_ALARM_HOUR] & 0x80))
-		t->time.tm_hour = BCD2BIN(reg[M41T80_REG_ALARM_HOUR] & 0x3f);
+		t->time.tm_hour = bcd2bin(reg[M41T80_REG_ALARM_HOUR] & 0x3f);
 	if (!(reg[M41T80_REG_ALARM_DAY] & 0x80))
-		t->time.tm_mday = BCD2BIN(reg[M41T80_REG_ALARM_DAY] & 0x3f);
+		t->time.tm_mday = bcd2bin(reg[M41T80_REG_ALARM_DAY] & 0x3f);
 	if (!(reg[M41T80_REG_ALARM_DAY] & 0x40))
-		t->time.tm_mon = BCD2BIN(reg[M41T80_REG_ALARM_MON] & 0x1f) - 1;
+		t->time.tm_mon = bcd2bin(reg[M41T80_REG_ALARM_MON] & 0x1f) - 1;
 	t->time.tm_year = -1;
 	t->time.tm_wday = -1;
 	t->time.tm_yday = -1;
diff --git a/drivers/rtc/rtc-m41t94.c b/drivers/rtc/rtc-m41t94.c
index 9b19499c829e..c3a18c58daf6 100644
--- a/drivers/rtc/rtc-m41t94.c
+++ b/drivers/rtc/rtc-m41t94.c
@@ -41,17 +41,17 @@ static int m41t94_set_time(struct device *dev, struct rtc_time *tm)
 		tm->tm_mon, tm->tm_year, tm->tm_wday);
 
 	buf[0] = 0x80 | M41T94_REG_SECONDS; /* write time + date */
-	buf[M41T94_REG_SECONDS] = BIN2BCD(tm->tm_sec);
-	buf[M41T94_REG_MINUTES] = BIN2BCD(tm->tm_min);
-	buf[M41T94_REG_HOURS]   = BIN2BCD(tm->tm_hour);
-	buf[M41T94_REG_WDAY]    = BIN2BCD(tm->tm_wday + 1);
-	buf[M41T94_REG_DAY]     = BIN2BCD(tm->tm_mday);
-	buf[M41T94_REG_MONTH]   = BIN2BCD(tm->tm_mon + 1);
+	buf[M41T94_REG_SECONDS] = bin2bcd(tm->tm_sec);
+	buf[M41T94_REG_MINUTES] = bin2bcd(tm->tm_min);
+	buf[M41T94_REG_HOURS]   = bin2bcd(tm->tm_hour);
+	buf[M41T94_REG_WDAY]    = bin2bcd(tm->tm_wday + 1);
+	buf[M41T94_REG_DAY]     = bin2bcd(tm->tm_mday);
+	buf[M41T94_REG_MONTH]   = bin2bcd(tm->tm_mon + 1);
 
 	buf[M41T94_REG_HOURS] |= M41T94_BIT_CEB;
 	if (tm->tm_year >= 100)
 		buf[M41T94_REG_HOURS] |= M41T94_BIT_CB;
-	buf[M41T94_REG_YEAR] = BIN2BCD(tm->tm_year % 100);
+	buf[M41T94_REG_YEAR] = bin2bcd(tm->tm_year % 100);
 
 	return spi_write(spi, buf, 8);
 }
@@ -82,14 +82,14 @@ static int m41t94_read_time(struct device *dev, struct rtc_time *tm)
 		spi_write(spi, buf, 2);
 	}
 
-	tm->tm_sec  = BCD2BIN(spi_w8r8(spi, M41T94_REG_SECONDS));
-	tm->tm_min  = BCD2BIN(spi_w8r8(spi, M41T94_REG_MINUTES));
+	tm->tm_sec  = bcd2bin(spi_w8r8(spi, M41T94_REG_SECONDS));
+	tm->tm_min  = bcd2bin(spi_w8r8(spi, M41T94_REG_MINUTES));
 	hour = spi_w8r8(spi, M41T94_REG_HOURS);
-	tm->tm_hour = BCD2BIN(hour & 0x3f);
-	tm->tm_wday = BCD2BIN(spi_w8r8(spi, M41T94_REG_WDAY)) - 1;
-	tm->tm_mday = BCD2BIN(spi_w8r8(spi, M41T94_REG_DAY));
-	tm->tm_mon  = BCD2BIN(spi_w8r8(spi, M41T94_REG_MONTH)) - 1;
-	tm->tm_year = BCD2BIN(spi_w8r8(spi, M41T94_REG_YEAR));
+	tm->tm_hour = bcd2bin(hour & 0x3f);
+	tm->tm_wday = bcd2bin(spi_w8r8(spi, M41T94_REG_WDAY)) - 1;
+	tm->tm_mday = bcd2bin(spi_w8r8(spi, M41T94_REG_DAY));
+	tm->tm_mon  = bcd2bin(spi_w8r8(spi, M41T94_REG_MONTH)) - 1;
+	tm->tm_year = bcd2bin(spi_w8r8(spi, M41T94_REG_YEAR));
 	if ((hour & M41T94_BIT_CB) || !(hour & M41T94_BIT_CEB))
 		tm->tm_year += 100;
 
diff --git a/drivers/rtc/rtc-m48t59.c b/drivers/rtc/rtc-m48t59.c
index ce4eff6a8d51..04b63dab6932 100644
--- a/drivers/rtc/rtc-m48t59.c
+++ b/drivers/rtc/rtc-m48t59.c
@@ -76,10 +76,10 @@ static int m48t59_rtc_read_time(struct device *dev, struct rtc_time *tm)
 	/* Issue the READ command */
 	M48T59_SET_BITS(M48T59_CNTL_READ, M48T59_CNTL);
 
-	tm->tm_year	= BCD2BIN(M48T59_READ(M48T59_YEAR));
+	tm->tm_year	= bcd2bin(M48T59_READ(M48T59_YEAR));
 	/* tm_mon is 0-11 */
-	tm->tm_mon	= BCD2BIN(M48T59_READ(M48T59_MONTH)) - 1;
-	tm->tm_mday	= BCD2BIN(M48T59_READ(M48T59_MDAY));
+	tm->tm_mon	= bcd2bin(M48T59_READ(M48T59_MONTH)) - 1;
+	tm->tm_mday	= bcd2bin(M48T59_READ(M48T59_MDAY));
 
 	val = M48T59_READ(M48T59_WDAY);
 	if ((pdata->type == M48T59RTC_TYPE_M48T59) &&
@@ -88,10 +88,10 @@ static int m48t59_rtc_read_time(struct device *dev, struct rtc_time *tm)
 		tm->tm_year += 100;	/* one century */
 	}
 
-	tm->tm_wday	= BCD2BIN(val & 0x07);
-	tm->tm_hour	= BCD2BIN(M48T59_READ(M48T59_HOUR) & 0x3F);
-	tm->tm_min	= BCD2BIN(M48T59_READ(M48T59_MIN) & 0x7F);
-	tm->tm_sec	= BCD2BIN(M48T59_READ(M48T59_SEC) & 0x7F);
+	tm->tm_wday	= bcd2bin(val & 0x07);
+	tm->tm_hour	= bcd2bin(M48T59_READ(M48T59_HOUR) & 0x3F);
+	tm->tm_min	= bcd2bin(M48T59_READ(M48T59_MIN) & 0x7F);
+	tm->tm_sec	= bcd2bin(M48T59_READ(M48T59_SEC) & 0x7F);
 
 	/* Clear the READ bit */
 	M48T59_CLEAR_BITS(M48T59_CNTL_READ, M48T59_CNTL);
@@ -119,17 +119,17 @@ static int m48t59_rtc_set_time(struct device *dev, struct rtc_time *tm)
 	/* Issue the WRITE command */
 	M48T59_SET_BITS(M48T59_CNTL_WRITE, M48T59_CNTL);
 
-	M48T59_WRITE((BIN2BCD(tm->tm_sec) & 0x7F), M48T59_SEC);
-	M48T59_WRITE((BIN2BCD(tm->tm_min) & 0x7F), M48T59_MIN);
-	M48T59_WRITE((BIN2BCD(tm->tm_hour) & 0x3F), M48T59_HOUR);
-	M48T59_WRITE((BIN2BCD(tm->tm_mday) & 0x3F), M48T59_MDAY);
+	M48T59_WRITE((bin2bcd(tm->tm_sec) & 0x7F), M48T59_SEC);
+	M48T59_WRITE((bin2bcd(tm->tm_min) & 0x7F), M48T59_MIN);
+	M48T59_WRITE((bin2bcd(tm->tm_hour) & 0x3F), M48T59_HOUR);
+	M48T59_WRITE((bin2bcd(tm->tm_mday) & 0x3F), M48T59_MDAY);
 	/* tm_mon is 0-11 */
-	M48T59_WRITE((BIN2BCD(tm->tm_mon + 1) & 0x1F), M48T59_MONTH);
-	M48T59_WRITE(BIN2BCD(tm->tm_year % 100), M48T59_YEAR);
+	M48T59_WRITE((bin2bcd(tm->tm_mon + 1) & 0x1F), M48T59_MONTH);
+	M48T59_WRITE(bin2bcd(tm->tm_year % 100), M48T59_YEAR);
 
 	if (pdata->type == M48T59RTC_TYPE_M48T59 && (tm->tm_year / 100))
 		val = (M48T59_WDAY_CEB | M48T59_WDAY_CB);
-	val |= (BIN2BCD(tm->tm_wday) & 0x07);
+	val |= (bin2bcd(tm->tm_wday) & 0x07);
 	M48T59_WRITE(val, M48T59_WDAY);
 
 	/* Clear the WRITE bit */
@@ -158,18 +158,18 @@ static int m48t59_rtc_readalarm(struct device *dev, struct rtc_wkalrm *alrm)
 	/* Issue the READ command */
 	M48T59_SET_BITS(M48T59_CNTL_READ, M48T59_CNTL);
 
-	tm->tm_year = BCD2BIN(M48T59_READ(M48T59_YEAR));
+	tm->tm_year = bcd2bin(M48T59_READ(M48T59_YEAR));
 	/* tm_mon is 0-11 */
-	tm->tm_mon = BCD2BIN(M48T59_READ(M48T59_MONTH)) - 1;
+	tm->tm_mon = bcd2bin(M48T59_READ(M48T59_MONTH)) - 1;
 
 	val = M48T59_READ(M48T59_WDAY);
 	if ((val & M48T59_WDAY_CEB) && (val & M48T59_WDAY_CB))
 		tm->tm_year += 100;	/* one century */
 
-	tm->tm_mday = BCD2BIN(M48T59_READ(M48T59_ALARM_DATE));
-	tm->tm_hour = BCD2BIN(M48T59_READ(M48T59_ALARM_HOUR));
-	tm->tm_min = BCD2BIN(M48T59_READ(M48T59_ALARM_MIN));
-	tm->tm_sec = BCD2BIN(M48T59_READ(M48T59_ALARM_SEC));
+	tm->tm_mday = bcd2bin(M48T59_READ(M48T59_ALARM_DATE));
+	tm->tm_hour = bcd2bin(M48T59_READ(M48T59_ALARM_HOUR));
+	tm->tm_min = bcd2bin(M48T59_READ(M48T59_ALARM_MIN));
+	tm->tm_sec = bcd2bin(M48T59_READ(M48T59_ALARM_SEC));
 
 	/* Clear the READ bit */
 	M48T59_CLEAR_BITS(M48T59_CNTL_READ, M48T59_CNTL);
@@ -201,18 +201,18 @@ static int m48t59_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm)
 	 * 0xff means "always match"
 	 */
 	mday = tm->tm_mday;
-	mday = (mday >= 1 && mday <= 31) ? BIN2BCD(mday) : 0xff;
+	mday = (mday >= 1 && mday <= 31) ? bin2bcd(mday) : 0xff;
 	if (mday == 0xff)
 		mday = M48T59_READ(M48T59_MDAY);
 
 	hour = tm->tm_hour;
-	hour = (hour < 24) ? BIN2BCD(hour) : 0x00;
+	hour = (hour < 24) ? bin2bcd(hour) : 0x00;
 
 	min = tm->tm_min;
-	min = (min < 60) ? BIN2BCD(min) : 0x00;
+	min = (min < 60) ? bin2bcd(min) : 0x00;
 
 	sec = tm->tm_sec;
-	sec = (sec < 60) ? BIN2BCD(sec) : 0x00;
+	sec = (sec < 60) ? bin2bcd(sec) : 0x00;
 
 	spin_lock_irqsave(&m48t59->lock, flags);
 	/* Issue the WRITE command */
@@ -360,7 +360,6 @@ static struct bin_attribute m48t59_nvram_attr = {
 	.attr = {
 		.name = "nvram",
 		.mode = S_IRUGO | S_IWUSR,
-		.owner = THIS_MODULE,
 	},
 	.read = m48t59_nvram_read,
 	.write = m48t59_nvram_write,
diff --git a/drivers/rtc/rtc-m48t86.c b/drivers/rtc/rtc-m48t86.c
index 3f7f99a5d96a..7c045cffa9ff 100644
--- a/drivers/rtc/rtc-m48t86.c
+++ b/drivers/rtc/rtc-m48t86.c
@@ -62,14 +62,14 @@ static int m48t86_rtc_read_time(struct device *dev, struct rtc_time *tm)
 		tm->tm_wday	= ops->readbyte(M48T86_REG_DOW);
 	} else {
 		/* bcd mode */
-		tm->tm_sec	= BCD2BIN(ops->readbyte(M48T86_REG_SEC));
-		tm->tm_min	= BCD2BIN(ops->readbyte(M48T86_REG_MIN));
-		tm->tm_hour	= BCD2BIN(ops->readbyte(M48T86_REG_HOUR) & 0x3F);
-		tm->tm_mday	= BCD2BIN(ops->readbyte(M48T86_REG_DOM));
+		tm->tm_sec	= bcd2bin(ops->readbyte(M48T86_REG_SEC));
+		tm->tm_min	= bcd2bin(ops->readbyte(M48T86_REG_MIN));
+		tm->tm_hour	= bcd2bin(ops->readbyte(M48T86_REG_HOUR) & 0x3F);
+		tm->tm_mday	= bcd2bin(ops->readbyte(M48T86_REG_DOM));
 		/* tm_mon is 0-11 */
-		tm->tm_mon	= BCD2BIN(ops->readbyte(M48T86_REG_MONTH)) - 1;
-		tm->tm_year	= BCD2BIN(ops->readbyte(M48T86_REG_YEAR)) + 100;
-		tm->tm_wday	= BCD2BIN(ops->readbyte(M48T86_REG_DOW));
+		tm->tm_mon	= bcd2bin(ops->readbyte(M48T86_REG_MONTH)) - 1;
+		tm->tm_year	= bcd2bin(ops->readbyte(M48T86_REG_YEAR)) + 100;
+		tm->tm_wday	= bcd2bin(ops->readbyte(M48T86_REG_DOW));
 	}
 
 	/* correct the hour if the clock is in 12h mode */
@@ -103,13 +103,13 @@ static int m48t86_rtc_set_time(struct device *dev, struct rtc_time *tm)
 		ops->writebyte(tm->tm_wday, M48T86_REG_DOW);
 	} else {
 		/* bcd mode */
-		ops->writebyte(BIN2BCD(tm->tm_sec), M48T86_REG_SEC);
-		ops->writebyte(BIN2BCD(tm->tm_min), M48T86_REG_MIN);
-		ops->writebyte(BIN2BCD(tm->tm_hour), M48T86_REG_HOUR);
-		ops->writebyte(BIN2BCD(tm->tm_mday), M48T86_REG_DOM);
-		ops->writebyte(BIN2BCD(tm->tm_mon + 1), M48T86_REG_MONTH);
-		ops->writebyte(BIN2BCD(tm->tm_year % 100), M48T86_REG_YEAR);
-		ops->writebyte(BIN2BCD(tm->tm_wday), M48T86_REG_DOW);
+		ops->writebyte(bin2bcd(tm->tm_sec), M48T86_REG_SEC);
+		ops->writebyte(bin2bcd(tm->tm_min), M48T86_REG_MIN);
+		ops->writebyte(bin2bcd(tm->tm_hour), M48T86_REG_HOUR);
+		ops->writebyte(bin2bcd(tm->tm_mday), M48T86_REG_DOM);
+		ops->writebyte(bin2bcd(tm->tm_mon + 1), M48T86_REG_MONTH);
+		ops->writebyte(bin2bcd(tm->tm_year % 100), M48T86_REG_YEAR);
+		ops->writebyte(bin2bcd(tm->tm_wday), M48T86_REG_DOW);
 	}
 
 	/* update ended */
diff --git a/drivers/rtc/rtc-max6900.c b/drivers/rtc/rtc-max6900.c
index 12c9cd25cad8..80782798763f 100644
--- a/drivers/rtc/rtc-max6900.c
+++ b/drivers/rtc/rtc-max6900.c
@@ -150,14 +150,14 @@ static int max6900_i2c_read_time(struct i2c_client *client, struct rtc_time *tm)
 	if (rc < 0)
 		return rc;
 
-	tm->tm_sec = BCD2BIN(regs[MAX6900_REG_SC]);
-	tm->tm_min = BCD2BIN(regs[MAX6900_REG_MN]);
-	tm->tm_hour = BCD2BIN(regs[MAX6900_REG_HR] & 0x3f);
-	tm->tm_mday = BCD2BIN(regs[MAX6900_REG_DT]);
-	tm->tm_mon = BCD2BIN(regs[MAX6900_REG_MO]) - 1;
-	tm->tm_year = BCD2BIN(regs[MAX6900_REG_YR]) +
-	    BCD2BIN(regs[MAX6900_REG_CENTURY]) * 100 - 1900;
-	tm->tm_wday = BCD2BIN(regs[MAX6900_REG_DW]);
+	tm->tm_sec = bcd2bin(regs[MAX6900_REG_SC]);
+	tm->tm_min = bcd2bin(regs[MAX6900_REG_MN]);
+	tm->tm_hour = bcd2bin(regs[MAX6900_REG_HR] & 0x3f);
+	tm->tm_mday = bcd2bin(regs[MAX6900_REG_DT]);
+	tm->tm_mon = bcd2bin(regs[MAX6900_REG_MO]) - 1;
+	tm->tm_year = bcd2bin(regs[MAX6900_REG_YR]) +
+		      bcd2bin(regs[MAX6900_REG_CENTURY]) * 100 - 1900;
+	tm->tm_wday = bcd2bin(regs[MAX6900_REG_DW]);
 
 	return 0;
 }
@@ -184,14 +184,14 @@ max6900_i2c_set_time(struct i2c_client *client, struct rtc_time const *tm)
 	if (rc < 0)
 		return rc;
 
-	regs[MAX6900_REG_SC] = BIN2BCD(tm->tm_sec);
-	regs[MAX6900_REG_MN] = BIN2BCD(tm->tm_min);
-	regs[MAX6900_REG_HR] = BIN2BCD(tm->tm_hour);
-	regs[MAX6900_REG_DT] = BIN2BCD(tm->tm_mday);
-	regs[MAX6900_REG_MO] = BIN2BCD(tm->tm_mon + 1);
-	regs[MAX6900_REG_DW] = BIN2BCD(tm->tm_wday);
-	regs[MAX6900_REG_YR] = BIN2BCD(tm->tm_year % 100);
-	regs[MAX6900_REG_CENTURY] = BIN2BCD((tm->tm_year + 1900) / 100);
+	regs[MAX6900_REG_SC] = bin2bcd(tm->tm_sec);
+	regs[MAX6900_REG_MN] = bin2bcd(tm->tm_min);
+	regs[MAX6900_REG_HR] = bin2bcd(tm->tm_hour);
+	regs[MAX6900_REG_DT] = bin2bcd(tm->tm_mday);
+	regs[MAX6900_REG_MO] = bin2bcd(tm->tm_mon + 1);
+	regs[MAX6900_REG_DW] = bin2bcd(tm->tm_wday);
+	regs[MAX6900_REG_YR] = bin2bcd(tm->tm_year % 100);
+	regs[MAX6900_REG_CENTURY] = bin2bcd((tm->tm_year + 1900) / 100);
 	/* set write protect */
 	regs[MAX6900_REG_CT] = MAX6900_REG_CT_WP;
 
diff --git a/drivers/rtc/rtc-max6902.c b/drivers/rtc/rtc-max6902.c
index 78b2551fb19d..2f6507df7b49 100644
--- a/drivers/rtc/rtc-max6902.c
+++ b/drivers/rtc/rtc-max6902.c
@@ -124,15 +124,15 @@ static int max6902_get_datetime(struct device *dev, struct rtc_time *dt)
 
 	/* The chip sends data in this order:
 	 * Seconds, Minutes, Hours, Date, Month, Day, Year */
-	dt->tm_sec	= BCD2BIN(chip->buf[1]);
-	dt->tm_min	= BCD2BIN(chip->buf[2]);
-	dt->tm_hour	= BCD2BIN(chip->buf[3]);
-	dt->tm_mday	= BCD2BIN(chip->buf[4]);
-	dt->tm_mon	= BCD2BIN(chip->buf[5]) - 1;
-	dt->tm_wday	= BCD2BIN(chip->buf[6]);
-	dt->tm_year = BCD2BIN(chip->buf[7]);
+	dt->tm_sec	= bcd2bin(chip->buf[1]);
+	dt->tm_min	= bcd2bin(chip->buf[2]);
+	dt->tm_hour	= bcd2bin(chip->buf[3]);
+	dt->tm_mday	= bcd2bin(chip->buf[4]);
+	dt->tm_mon	= bcd2bin(chip->buf[5]) - 1;
+	dt->tm_wday	= bcd2bin(chip->buf[6]);
+	dt->tm_year = bcd2bin(chip->buf[7]);
 
-	century = BCD2BIN(tmp) * 100;
+	century = bcd2bin(tmp) * 100;
 
 	dt->tm_year += century;
 	dt->tm_year -= 1900;
@@ -168,15 +168,15 @@ static int max6902_set_datetime(struct device *dev, struct rtc_time *dt)
 	/* Remove write protection */
 	max6902_set_reg(dev, 0xF, 0);
 
-	max6902_set_reg(dev, 0x01, BIN2BCD(dt->tm_sec));
-	max6902_set_reg(dev, 0x03, BIN2BCD(dt->tm_min));
-	max6902_set_reg(dev, 0x05, BIN2BCD(dt->tm_hour));
+	max6902_set_reg(dev, 0x01, bin2bcd(dt->tm_sec));
+	max6902_set_reg(dev, 0x03, bin2bcd(dt->tm_min));
+	max6902_set_reg(dev, 0x05, bin2bcd(dt->tm_hour));
 
-	max6902_set_reg(dev, 0x07, BIN2BCD(dt->tm_mday));
-	max6902_set_reg(dev, 0x09, BIN2BCD(dt->tm_mon+1));
-	max6902_set_reg(dev, 0x0B, BIN2BCD(dt->tm_wday));
-	max6902_set_reg(dev, 0x0D, BIN2BCD(dt->tm_year%100));
-	max6902_set_reg(dev, 0x13, BIN2BCD(dt->tm_year/100));
+	max6902_set_reg(dev, 0x07, bin2bcd(dt->tm_mday));
+	max6902_set_reg(dev, 0x09, bin2bcd(dt->tm_mon+1));
+	max6902_set_reg(dev, 0x0B, bin2bcd(dt->tm_wday));
+	max6902_set_reg(dev, 0x0D, bin2bcd(dt->tm_year%100));
+	max6902_set_reg(dev, 0x13, bin2bcd(dt->tm_year/100));
 
 	/* Compulab used a delay here. However, the datasheet
 	 * does not mention a delay being required anywhere... */
diff --git a/drivers/rtc/rtc-omap.c b/drivers/rtc/rtc-omap.c
index 8876605d4d4b..2cbeb0794f14 100644
--- a/drivers/rtc/rtc-omap.c
+++ b/drivers/rtc/rtc-omap.c
@@ -186,30 +186,30 @@ static int tm2bcd(struct rtc_time *tm)
 	if (rtc_valid_tm(tm) != 0)
 		return -EINVAL;
 
-	tm->tm_sec = BIN2BCD(tm->tm_sec);
-	tm->tm_min = BIN2BCD(tm->tm_min);
-	tm->tm_hour = BIN2BCD(tm->tm_hour);
-	tm->tm_mday = BIN2BCD(tm->tm_mday);
+	tm->tm_sec = bin2bcd(tm->tm_sec);
+	tm->tm_min = bin2bcd(tm->tm_min);
+	tm->tm_hour = bin2bcd(tm->tm_hour);
+	tm->tm_mday = bin2bcd(tm->tm_mday);
 
-	tm->tm_mon = BIN2BCD(tm->tm_mon + 1);
+	tm->tm_mon = bin2bcd(tm->tm_mon + 1);
 
 	/* epoch == 1900 */
 	if (tm->tm_year < 100 || tm->tm_year > 199)
 		return -EINVAL;
-	tm->tm_year = BIN2BCD(tm->tm_year - 100);
+	tm->tm_year = bin2bcd(tm->tm_year - 100);
 
 	return 0;
 }
 
 static void bcd2tm(struct rtc_time *tm)
 {
-	tm->tm_sec = BCD2BIN(tm->tm_sec);
-	tm->tm_min = BCD2BIN(tm->tm_min);
-	tm->tm_hour = BCD2BIN(tm->tm_hour);
-	tm->tm_mday = BCD2BIN(tm->tm_mday);
-	tm->tm_mon = BCD2BIN(tm->tm_mon) - 1;
+	tm->tm_sec = bcd2bin(tm->tm_sec);
+	tm->tm_min = bcd2bin(tm->tm_min);
+	tm->tm_hour = bcd2bin(tm->tm_hour);
+	tm->tm_mday = bcd2bin(tm->tm_mday);
+	tm->tm_mon = bcd2bin(tm->tm_mon) - 1;
 	/* epoch == 1900 */
-	tm->tm_year = BCD2BIN(tm->tm_year) + 100;
+	tm->tm_year = bcd2bin(tm->tm_year) + 100;
 }
 
 
diff --git a/drivers/rtc/rtc-pcf8563.c b/drivers/rtc/rtc-pcf8563.c
index a829f20ad6d6..b725913ccbe8 100644
--- a/drivers/rtc/rtc-pcf8563.c
+++ b/drivers/rtc/rtc-pcf8563.c
@@ -97,13 +97,13 @@ static int pcf8563_get_datetime(struct i2c_client *client, struct rtc_time *tm)
 		buf[8]);
 
 
-	tm->tm_sec = BCD2BIN(buf[PCF8563_REG_SC] & 0x7F);
-	tm->tm_min = BCD2BIN(buf[PCF8563_REG_MN] & 0x7F);
-	tm->tm_hour = BCD2BIN(buf[PCF8563_REG_HR] & 0x3F); /* rtc hr 0-23 */
-	tm->tm_mday = BCD2BIN(buf[PCF8563_REG_DM] & 0x3F);
+	tm->tm_sec = bcd2bin(buf[PCF8563_REG_SC] & 0x7F);
+	tm->tm_min = bcd2bin(buf[PCF8563_REG_MN] & 0x7F);
+	tm->tm_hour = bcd2bin(buf[PCF8563_REG_HR] & 0x3F); /* rtc hr 0-23 */
+	tm->tm_mday = bcd2bin(buf[PCF8563_REG_DM] & 0x3F);
 	tm->tm_wday = buf[PCF8563_REG_DW] & 0x07;
-	tm->tm_mon = BCD2BIN(buf[PCF8563_REG_MO] & 0x1F) - 1; /* rtc mn 1-12 */
-	tm->tm_year = BCD2BIN(buf[PCF8563_REG_YR]);
+	tm->tm_mon = bcd2bin(buf[PCF8563_REG_MO] & 0x1F) - 1; /* rtc mn 1-12 */
+	tm->tm_year = bcd2bin(buf[PCF8563_REG_YR]);
 	if (tm->tm_year < 70)
 		tm->tm_year += 100;	/* assume we are in 1970...2069 */
 	/* detect the polarity heuristically. see note above. */
@@ -138,17 +138,17 @@ static int pcf8563_set_datetime(struct i2c_client *client, struct rtc_time *tm)
 		tm->tm_mday, tm->tm_mon, tm->tm_year, tm->tm_wday);
 
 	/* hours, minutes and seconds */
-	buf[PCF8563_REG_SC] = BIN2BCD(tm->tm_sec);
-	buf[PCF8563_REG_MN] = BIN2BCD(tm->tm_min);
-	buf[PCF8563_REG_HR] = BIN2BCD(tm->tm_hour);
+	buf[PCF8563_REG_SC] = bin2bcd(tm->tm_sec);
+	buf[PCF8563_REG_MN] = bin2bcd(tm->tm_min);
+	buf[PCF8563_REG_HR] = bin2bcd(tm->tm_hour);
 
-	buf[PCF8563_REG_DM] = BIN2BCD(tm->tm_mday);
+	buf[PCF8563_REG_DM] = bin2bcd(tm->tm_mday);
 
 	/* month, 1 - 12 */
-	buf[PCF8563_REG_MO] = BIN2BCD(tm->tm_mon + 1);
+	buf[PCF8563_REG_MO] = bin2bcd(tm->tm_mon + 1);
 
 	/* year and century */
-	buf[PCF8563_REG_YR] = BIN2BCD(tm->tm_year % 100);
+	buf[PCF8563_REG_YR] = bin2bcd(tm->tm_year % 100);
 	if (pcf8563->c_polarity ? (tm->tm_year >= 100) : (tm->tm_year < 100))
 		buf[PCF8563_REG_MO] |= PCF8563_MO_C;
 
diff --git a/drivers/rtc/rtc-pcf8583.c b/drivers/rtc/rtc-pcf8583.c
index d388c662bf4b..7d33cda3f8f6 100644
--- a/drivers/rtc/rtc-pcf8583.c
+++ b/drivers/rtc/rtc-pcf8583.c
@@ -76,11 +76,11 @@ static int pcf8583_get_datetime(struct i2c_client *client, struct rtc_time *dt)
 		buf[4] &= 0x3f;
 		buf[5] &= 0x1f;
 
-		dt->tm_sec = BCD2BIN(buf[1]);
-		dt->tm_min = BCD2BIN(buf[2]);
-		dt->tm_hour = BCD2BIN(buf[3]);
-		dt->tm_mday = BCD2BIN(buf[4]);
-		dt->tm_mon = BCD2BIN(buf[5]) - 1;
+		dt->tm_sec = bcd2bin(buf[1]);
+		dt->tm_min = bcd2bin(buf[2]);
+		dt->tm_hour = bcd2bin(buf[3]);
+		dt->tm_mday = bcd2bin(buf[4]);
+		dt->tm_mon = bcd2bin(buf[5]) - 1;
 	}
 
 	return ret == 2 ? 0 : -EIO;
@@ -94,14 +94,14 @@ static int pcf8583_set_datetime(struct i2c_client *client, struct rtc_time *dt,
 	buf[0] = 0;
 	buf[1] = get_ctrl(client) | 0x80;
 	buf[2] = 0;
-	buf[3] = BIN2BCD(dt->tm_sec);
-	buf[4] = BIN2BCD(dt->tm_min);
-	buf[5] = BIN2BCD(dt->tm_hour);
+	buf[3] = bin2bcd(dt->tm_sec);
+	buf[4] = bin2bcd(dt->tm_min);
+	buf[5] = bin2bcd(dt->tm_hour);
 
 	if (datetoo) {
 		len = 8;
-		buf[6] = BIN2BCD(dt->tm_mday) | (dt->tm_year << 6);
-		buf[7] = BIN2BCD(dt->tm_mon + 1)  | (dt->tm_wday << 5);
+		buf[6] = bin2bcd(dt->tm_mday) | (dt->tm_year << 6);
+		buf[7] = bin2bcd(dt->tm_mon + 1)  | (dt->tm_wday << 5);
 	}
 
 	ret = i2c_master_send(client, (char *)buf, len);
diff --git a/drivers/rtc/rtc-r9701.c b/drivers/rtc/rtc-r9701.c
index 395985b339c9..42028f233bef 100644
--- a/drivers/rtc/rtc-r9701.c
+++ b/drivers/rtc/rtc-r9701.c
@@ -80,13 +80,13 @@ static int r9701_get_datetime(struct device *dev, struct rtc_time *dt)
 
 	memset(dt, 0, sizeof(*dt));
 
-	dt->tm_sec = BCD2BIN(buf[0]); /* RSECCNT */
-	dt->tm_min = BCD2BIN(buf[1]); /* RMINCNT */
-	dt->tm_hour = BCD2BIN(buf[2]); /* RHRCNT */
+	dt->tm_sec = bcd2bin(buf[0]); /* RSECCNT */
+	dt->tm_min = bcd2bin(buf[1]); /* RMINCNT */
+	dt->tm_hour = bcd2bin(buf[2]); /* RHRCNT */
 
-	dt->tm_mday = BCD2BIN(buf[3]); /* RDAYCNT */
-	dt->tm_mon = BCD2BIN(buf[4]) - 1; /* RMONCNT */
-	dt->tm_year = BCD2BIN(buf[5]) + 100; /* RYRCNT */
+	dt->tm_mday = bcd2bin(buf[3]); /* RDAYCNT */
+	dt->tm_mon = bcd2bin(buf[4]) - 1; /* RMONCNT */
+	dt->tm_year = bcd2bin(buf[5]) + 100; /* RYRCNT */
 
 	/* the rtc device may contain illegal values on power up
 	 * according to the data sheet. make sure they are valid.
@@ -103,12 +103,12 @@ static int r9701_set_datetime(struct device *dev, struct rtc_time *dt)
 	if (year >= 2100 || year < 2000)
 		return -EINVAL;
 
-	ret = write_reg(dev, RHRCNT, BIN2BCD(dt->tm_hour));
-	ret = ret ? ret : write_reg(dev, RMINCNT, BIN2BCD(dt->tm_min));
-	ret = ret ? ret : write_reg(dev, RSECCNT, BIN2BCD(dt->tm_sec));
-	ret = ret ? ret : write_reg(dev, RDAYCNT, BIN2BCD(dt->tm_mday));
-	ret = ret ? ret : write_reg(dev, RMONCNT, BIN2BCD(dt->tm_mon + 1));
-	ret = ret ? ret : write_reg(dev, RYRCNT, BIN2BCD(dt->tm_year - 100));
+	ret = write_reg(dev, RHRCNT, bin2bcd(dt->tm_hour));
+	ret = ret ? ret : write_reg(dev, RMINCNT, bin2bcd(dt->tm_min));
+	ret = ret ? ret : write_reg(dev, RSECCNT, bin2bcd(dt->tm_sec));
+	ret = ret ? ret : write_reg(dev, RDAYCNT, bin2bcd(dt->tm_mday));
+	ret = ret ? ret : write_reg(dev, RMONCNT, bin2bcd(dt->tm_mon + 1));
+	ret = ret ? ret : write_reg(dev, RYRCNT, bin2bcd(dt->tm_year - 100));
 	ret = ret ? ret : write_reg(dev, RWKCNT, 1 << dt->tm_wday);
 
 	return ret;
diff --git a/drivers/rtc/rtc-rs5c313.c b/drivers/rtc/rtc-rs5c313.c
index 1c14d4497c4d..e6ea3f5ee1eb 100644
--- a/drivers/rtc/rtc-rs5c313.c
+++ b/drivers/rtc/rtc-rs5c313.c
@@ -235,33 +235,33 @@ static int rs5c313_rtc_read_time(struct device *dev, struct rtc_time *tm)
 
 	data = rs5c313_read_reg(RS5C313_ADDR_SEC);
 	data |= (rs5c313_read_reg(RS5C313_ADDR_SEC10) << 4);
-	tm->tm_sec = BCD2BIN(data);
+	tm->tm_sec = bcd2bin(data);
 
 	data = rs5c313_read_reg(RS5C313_ADDR_MIN);
 	data |= (rs5c313_read_reg(RS5C313_ADDR_MIN10) << 4);
-	tm->tm_min = BCD2BIN(data);
+	tm->tm_min = bcd2bin(data);
 
 	data = rs5c313_read_reg(RS5C313_ADDR_HOUR);
 	data |= (rs5c313_read_reg(RS5C313_ADDR_HOUR10) << 4);
-	tm->tm_hour = BCD2BIN(data);
+	tm->tm_hour = bcd2bin(data);
 
 	data = rs5c313_read_reg(RS5C313_ADDR_DAY);
 	data |= (rs5c313_read_reg(RS5C313_ADDR_DAY10) << 4);
-	tm->tm_mday = BCD2BIN(data);
+	tm->tm_mday = bcd2bin(data);
 
 	data = rs5c313_read_reg(RS5C313_ADDR_MON);
 	data |= (rs5c313_read_reg(RS5C313_ADDR_MON10) << 4);
-	tm->tm_mon = BCD2BIN(data) - 1;
+	tm->tm_mon = bcd2bin(data) - 1;
 
 	data = rs5c313_read_reg(RS5C313_ADDR_YEAR);
 	data |= (rs5c313_read_reg(RS5C313_ADDR_YEAR10) << 4);
-	tm->tm_year = BCD2BIN(data);
+	tm->tm_year = bcd2bin(data);
 
 	if (tm->tm_year < 70)
 		tm->tm_year += 100;
 
 	data = rs5c313_read_reg(RS5C313_ADDR_WEEK);
-	tm->tm_wday = BCD2BIN(data);
+	tm->tm_wday = bcd2bin(data);
 
 	RS5C313_CEDISABLE;
 	ndelay(700);		/* CE:L */
@@ -294,31 +294,31 @@ static int rs5c313_rtc_set_time(struct device *dev, struct rtc_time *tm)
 		}
 	}
 
-	data = BIN2BCD(tm->tm_sec);
+	data = bin2bcd(tm->tm_sec);
 	rs5c313_write_reg(RS5C313_ADDR_SEC, data);
 	rs5c313_write_reg(RS5C313_ADDR_SEC10, (data >> 4));
 
-	data = BIN2BCD(tm->tm_min);
+	data = bin2bcd(tm->tm_min);
 	rs5c313_write_reg(RS5C313_ADDR_MIN, data );
 	rs5c313_write_reg(RS5C313_ADDR_MIN10, (data >> 4));
 
-	data = BIN2BCD(tm->tm_hour);
+	data = bin2bcd(tm->tm_hour);
 	rs5c313_write_reg(RS5C313_ADDR_HOUR, data);
 	rs5c313_write_reg(RS5C313_ADDR_HOUR10, (data >> 4));
 
-	data = BIN2BCD(tm->tm_mday);
+	data = bin2bcd(tm->tm_mday);
 	rs5c313_write_reg(RS5C313_ADDR_DAY, data);
 	rs5c313_write_reg(RS5C313_ADDR_DAY10, (data>> 4));
 
-	data = BIN2BCD(tm->tm_mon + 1);
+	data = bin2bcd(tm->tm_mon + 1);
 	rs5c313_write_reg(RS5C313_ADDR_MON, data);
 	rs5c313_write_reg(RS5C313_ADDR_MON10, (data >> 4));
 
-	data = BIN2BCD(tm->tm_year % 100);
+	data = bin2bcd(tm->tm_year % 100);
 	rs5c313_write_reg(RS5C313_ADDR_YEAR, data);
 	rs5c313_write_reg(RS5C313_ADDR_YEAR10, (data >> 4));
 
-	data = BIN2BCD(tm->tm_wday);
+	data = bin2bcd(tm->tm_wday);
 	rs5c313_write_reg(RS5C313_ADDR_WEEK, data);
 
 	RS5C313_CEDISABLE;	/* CE:H */
diff --git a/drivers/rtc/rtc-rs5c348.c b/drivers/rtc/rtc-rs5c348.c
index 839462659afa..dd1e2bc7a472 100644
--- a/drivers/rtc/rtc-rs5c348.c
+++ b/drivers/rtc/rtc-rs5c348.c
@@ -74,20 +74,20 @@ rs5c348_rtc_set_time(struct device *dev, struct rtc_time *tm)
 	txbuf[3] = 0;	/* dummy */
 	txbuf[4] = RS5C348_CMD_MW(RS5C348_REG_SECS); /* cmd, sec, ... */
 	txp = &txbuf[5];
-	txp[RS5C348_REG_SECS] = BIN2BCD(tm->tm_sec);
-	txp[RS5C348_REG_MINS] = BIN2BCD(tm->tm_min);
+	txp[RS5C348_REG_SECS] = bin2bcd(tm->tm_sec);
+	txp[RS5C348_REG_MINS] = bin2bcd(tm->tm_min);
 	if (pdata->rtc_24h) {
-		txp[RS5C348_REG_HOURS] = BIN2BCD(tm->tm_hour);
+		txp[RS5C348_REG_HOURS] = bin2bcd(tm->tm_hour);
 	} else {
 		/* hour 0 is AM12, noon is PM12 */
-		txp[RS5C348_REG_HOURS] = BIN2BCD((tm->tm_hour + 11) % 12 + 1) |
+		txp[RS5C348_REG_HOURS] = bin2bcd((tm->tm_hour + 11) % 12 + 1) |
 			(tm->tm_hour >= 12 ? RS5C348_BIT_PM : 0);
 	}
-	txp[RS5C348_REG_WDAY] = BIN2BCD(tm->tm_wday);
-	txp[RS5C348_REG_DAY] = BIN2BCD(tm->tm_mday);
-	txp[RS5C348_REG_MONTH] = BIN2BCD(tm->tm_mon + 1) |
+	txp[RS5C348_REG_WDAY] = bin2bcd(tm->tm_wday);
+	txp[RS5C348_REG_DAY] = bin2bcd(tm->tm_mday);
+	txp[RS5C348_REG_MONTH] = bin2bcd(tm->tm_mon + 1) |
 		(tm->tm_year >= 100 ? RS5C348_BIT_Y2K : 0);
-	txp[RS5C348_REG_YEAR] = BIN2BCD(tm->tm_year % 100);
+	txp[RS5C348_REG_YEAR] = bin2bcd(tm->tm_year % 100);
 	/* write in one transfer to avoid data inconsistency */
 	ret = spi_write_then_read(spi, txbuf, sizeof(txbuf), NULL, 0);
 	udelay(62);	/* Tcsr 62us */
@@ -116,20 +116,20 @@ rs5c348_rtc_read_time(struct device *dev, struct rtc_time *tm)
 	if (ret < 0)
 		return ret;
 
-	tm->tm_sec = BCD2BIN(rxbuf[RS5C348_REG_SECS] & RS5C348_SECS_MASK);
-	tm->tm_min = BCD2BIN(rxbuf[RS5C348_REG_MINS] & RS5C348_MINS_MASK);
-	tm->tm_hour = BCD2BIN(rxbuf[RS5C348_REG_HOURS] & RS5C348_HOURS_MASK);
+	tm->tm_sec = bcd2bin(rxbuf[RS5C348_REG_SECS] & RS5C348_SECS_MASK);
+	tm->tm_min = bcd2bin(rxbuf[RS5C348_REG_MINS] & RS5C348_MINS_MASK);
+	tm->tm_hour = bcd2bin(rxbuf[RS5C348_REG_HOURS] & RS5C348_HOURS_MASK);
 	if (!pdata->rtc_24h) {
 		tm->tm_hour %= 12;
 		if (rxbuf[RS5C348_REG_HOURS] & RS5C348_BIT_PM)
 			tm->tm_hour += 12;
 	}
-	tm->tm_wday = BCD2BIN(rxbuf[RS5C348_REG_WDAY] & RS5C348_WDAY_MASK);
-	tm->tm_mday = BCD2BIN(rxbuf[RS5C348_REG_DAY] & RS5C348_DAY_MASK);
+	tm->tm_wday = bcd2bin(rxbuf[RS5C348_REG_WDAY] & RS5C348_WDAY_MASK);
+	tm->tm_mday = bcd2bin(rxbuf[RS5C348_REG_DAY] & RS5C348_DAY_MASK);
 	tm->tm_mon =
-		BCD2BIN(rxbuf[RS5C348_REG_MONTH] & RS5C348_MONTH_MASK) - 1;
+		bcd2bin(rxbuf[RS5C348_REG_MONTH] & RS5C348_MONTH_MASK) - 1;
 	/* year is 1900 + tm->tm_year */
-	tm->tm_year = BCD2BIN(rxbuf[RS5C348_REG_YEAR]) +
+	tm->tm_year = bcd2bin(rxbuf[RS5C348_REG_YEAR]) +
 		((rxbuf[RS5C348_REG_MONTH] & RS5C348_BIT_Y2K) ? 100 : 0);
 
 	if (rtc_valid_tm(tm) < 0) {
diff --git a/drivers/rtc/rtc-rs5c372.c b/drivers/rtc/rtc-rs5c372.c
index 8b561958fb1e..2f2c68d476da 100644
--- a/drivers/rtc/rtc-rs5c372.c
+++ b/drivers/rtc/rtc-rs5c372.c
@@ -148,9 +148,9 @@ static unsigned rs5c_reg2hr(struct rs5c372 *rs5c, unsigned reg)
 	unsigned	hour;
 
 	if (rs5c->time24)
-		return BCD2BIN(reg & 0x3f);
+		return bcd2bin(reg & 0x3f);
 
-	hour = BCD2BIN(reg & 0x1f);
+	hour = bcd2bin(reg & 0x1f);
 	if (hour == 12)
 		hour = 0;
 	if (reg & 0x20)
@@ -161,15 +161,15 @@ static unsigned rs5c_reg2hr(struct rs5c372 *rs5c, unsigned reg)
 static unsigned rs5c_hr2reg(struct rs5c372 *rs5c, unsigned hour)
 {
 	if (rs5c->time24)
-		return BIN2BCD(hour);
+		return bin2bcd(hour);
 
 	if (hour > 12)
-		return 0x20 | BIN2BCD(hour - 12);
+		return 0x20 | bin2bcd(hour - 12);
 	if (hour == 12)
-		return 0x20 | BIN2BCD(12);
+		return 0x20 | bin2bcd(12);
 	if (hour == 0)
-		return BIN2BCD(12);
-	return BIN2BCD(hour);
+		return bin2bcd(12);
+	return bin2bcd(hour);
 }
 
 static int rs5c372_get_datetime(struct i2c_client *client, struct rtc_time *tm)
@@ -180,18 +180,18 @@ static int rs5c372_get_datetime(struct i2c_client *client, struct rtc_time *tm)
 	if (status < 0)
 		return status;
 
-	tm->tm_sec = BCD2BIN(rs5c->regs[RS5C372_REG_SECS] & 0x7f);
-	tm->tm_min = BCD2BIN(rs5c->regs[RS5C372_REG_MINS] & 0x7f);
+	tm->tm_sec = bcd2bin(rs5c->regs[RS5C372_REG_SECS] & 0x7f);
+	tm->tm_min = bcd2bin(rs5c->regs[RS5C372_REG_MINS] & 0x7f);
 	tm->tm_hour = rs5c_reg2hr(rs5c, rs5c->regs[RS5C372_REG_HOURS]);
 
-	tm->tm_wday = BCD2BIN(rs5c->regs[RS5C372_REG_WDAY] & 0x07);
-	tm->tm_mday = BCD2BIN(rs5c->regs[RS5C372_REG_DAY] & 0x3f);
+	tm->tm_wday = bcd2bin(rs5c->regs[RS5C372_REG_WDAY] & 0x07);
+	tm->tm_mday = bcd2bin(rs5c->regs[RS5C372_REG_DAY] & 0x3f);
 
 	/* tm->tm_mon is zero-based */
-	tm->tm_mon = BCD2BIN(rs5c->regs[RS5C372_REG_MONTH] & 0x1f) - 1;
+	tm->tm_mon = bcd2bin(rs5c->regs[RS5C372_REG_MONTH] & 0x1f) - 1;
 
 	/* year is 1900 + tm->tm_year */
-	tm->tm_year = BCD2BIN(rs5c->regs[RS5C372_REG_YEAR]) + 100;
+	tm->tm_year = bcd2bin(rs5c->regs[RS5C372_REG_YEAR]) + 100;
 
 	dev_dbg(&client->dev, "%s: tm is secs=%d, mins=%d, hours=%d, "
 		"mday=%d, mon=%d, year=%d, wday=%d\n",
@@ -216,13 +216,13 @@ static int rs5c372_set_datetime(struct i2c_client *client, struct rtc_time *tm)
 		tm->tm_mday, tm->tm_mon, tm->tm_year, tm->tm_wday);
 
 	addr   = RS5C_ADDR(RS5C372_REG_SECS);
-	buf[0] = BIN2BCD(tm->tm_sec);
-	buf[1] = BIN2BCD(tm->tm_min);
+	buf[0] = bin2bcd(tm->tm_sec);
+	buf[1] = bin2bcd(tm->tm_min);
 	buf[2] = rs5c_hr2reg(rs5c, tm->tm_hour);
-	buf[3] = BIN2BCD(tm->tm_wday);
-	buf[4] = BIN2BCD(tm->tm_mday);
-	buf[5] = BIN2BCD(tm->tm_mon + 1);
-	buf[6] = BIN2BCD(tm->tm_year - 100);
+	buf[3] = bin2bcd(tm->tm_wday);
+	buf[4] = bin2bcd(tm->tm_mday);
+	buf[5] = bin2bcd(tm->tm_mon + 1);
+	buf[6] = bin2bcd(tm->tm_year - 100);
 
 	if (i2c_smbus_write_i2c_block_data(client, addr, sizeof(buf), buf) < 0) {
 		dev_err(&client->dev, "%s: write error\n", __func__);
@@ -367,7 +367,7 @@ static int rs5c_read_alarm(struct device *dev, struct rtc_wkalrm *t)
 
 	/* report alarm time */
 	t->time.tm_sec = 0;
-	t->time.tm_min = BCD2BIN(rs5c->regs[RS5C_REG_ALARM_A_MIN] & 0x7f);
+	t->time.tm_min = bcd2bin(rs5c->regs[RS5C_REG_ALARM_A_MIN] & 0x7f);
 	t->time.tm_hour = rs5c_reg2hr(rs5c, rs5c->regs[RS5C_REG_ALARM_A_HOURS]);
 	t->time.tm_mday = -1;
 	t->time.tm_mon = -1;
@@ -413,7 +413,7 @@ static int rs5c_set_alarm(struct device *dev, struct rtc_wkalrm *t)
 	}
 
 	/* set alarm */
-	buf[0] = BIN2BCD(t->time.tm_min);
+	buf[0] = bin2bcd(t->time.tm_min);
 	buf[1] = rs5c_hr2reg(rs5c, t->time.tm_hour);
 	buf[2] = 0x7f;	/* any/all days */
 
diff --git a/drivers/rtc/rtc-s35390a.c b/drivers/rtc/rtc-s35390a.c
index a6fa1f2f2ca6..def4d396d0b0 100644
--- a/drivers/rtc/rtc-s35390a.c
+++ b/drivers/rtc/rtc-s35390a.c
@@ -104,12 +104,12 @@ static int s35390a_disable_test_mode(struct s35390a *s35390a)
 static char s35390a_hr2reg(struct s35390a *s35390a, int hour)
 {
 	if (s35390a->twentyfourhour)
-		return BIN2BCD(hour);
+		return bin2bcd(hour);
 
 	if (hour < 12)
-		return BIN2BCD(hour);
+		return bin2bcd(hour);
 
-	return 0x40 | BIN2BCD(hour - 12);
+	return 0x40 | bin2bcd(hour - 12);
 }
 
 static int s35390a_reg2hr(struct s35390a *s35390a, char reg)
@@ -117,9 +117,9 @@ static int s35390a_reg2hr(struct s35390a *s35390a, char reg)
 	unsigned hour;
 
 	if (s35390a->twentyfourhour)
-		return BCD2BIN(reg & 0x3f);
+		return bcd2bin(reg & 0x3f);
 
-	hour = BCD2BIN(reg & 0x3f);
+	hour = bcd2bin(reg & 0x3f);
 	if (reg & 0x40)
 		hour += 12;
 
@@ -137,13 +137,13 @@ static int s35390a_set_datetime(struct i2c_client *client, struct rtc_time *tm)
 		tm->tm_min, tm->tm_hour, tm->tm_mday, tm->tm_mon, tm->tm_year,
 		tm->tm_wday);
 
-	buf[S35390A_BYTE_YEAR] = BIN2BCD(tm->tm_year - 100);
-	buf[S35390A_BYTE_MONTH] = BIN2BCD(tm->tm_mon + 1);
-	buf[S35390A_BYTE_DAY] = BIN2BCD(tm->tm_mday);
-	buf[S35390A_BYTE_WDAY] = BIN2BCD(tm->tm_wday);
+	buf[S35390A_BYTE_YEAR] = bin2bcd(tm->tm_year - 100);
+	buf[S35390A_BYTE_MONTH] = bin2bcd(tm->tm_mon + 1);
+	buf[S35390A_BYTE_DAY] = bin2bcd(tm->tm_mday);
+	buf[S35390A_BYTE_WDAY] = bin2bcd(tm->tm_wday);
 	buf[S35390A_BYTE_HOURS] = s35390a_hr2reg(s35390a, tm->tm_hour);
-	buf[S35390A_BYTE_MINS] = BIN2BCD(tm->tm_min);
-	buf[S35390A_BYTE_SECS] = BIN2BCD(tm->tm_sec);
+	buf[S35390A_BYTE_MINS] = bin2bcd(tm->tm_min);
+	buf[S35390A_BYTE_SECS] = bin2bcd(tm->tm_sec);
 
 	/* This chip expects the bits of each byte to be in reverse order */
 	for (i = 0; i < 7; ++i)
@@ -168,13 +168,13 @@ static int s35390a_get_datetime(struct i2c_client *client, struct rtc_time *tm)
 	for (i = 0; i < 7; ++i)
 		buf[i] = bitrev8(buf[i]);
 
-	tm->tm_sec = BCD2BIN(buf[S35390A_BYTE_SECS]);
-	tm->tm_min = BCD2BIN(buf[S35390A_BYTE_MINS]);
+	tm->tm_sec = bcd2bin(buf[S35390A_BYTE_SECS]);
+	tm->tm_min = bcd2bin(buf[S35390A_BYTE_MINS]);
 	tm->tm_hour = s35390a_reg2hr(s35390a, buf[S35390A_BYTE_HOURS]);
-	tm->tm_wday = BCD2BIN(buf[S35390A_BYTE_WDAY]);
-	tm->tm_mday = BCD2BIN(buf[S35390A_BYTE_DAY]);
-	tm->tm_mon = BCD2BIN(buf[S35390A_BYTE_MONTH]) - 1;
-	tm->tm_year = BCD2BIN(buf[S35390A_BYTE_YEAR]) + 100;
+	tm->tm_wday = bcd2bin(buf[S35390A_BYTE_WDAY]);
+	tm->tm_mday = bcd2bin(buf[S35390A_BYTE_DAY]);
+	tm->tm_mon = bcd2bin(buf[S35390A_BYTE_MONTH]) - 1;
+	tm->tm_year = bcd2bin(buf[S35390A_BYTE_YEAR]) + 100;
 
 	dev_dbg(&client->dev, "%s: tm is secs=%d, mins=%d, hours=%d, mday=%d, "
 		"mon=%d, year=%d, wday=%d\n", __func__, tm->tm_sec,
diff --git a/drivers/rtc/rtc-s3c.c b/drivers/rtc/rtc-s3c.c
index e7d19b6c265a..910bc704939c 100644
--- a/drivers/rtc/rtc-s3c.c
+++ b/drivers/rtc/rtc-s3c.c
@@ -134,12 +134,12 @@ static int s3c_rtc_gettime(struct device *dev, struct rtc_time *rtc_tm)
 		 rtc_tm->tm_year, rtc_tm->tm_mon, rtc_tm->tm_mday,
 		 rtc_tm->tm_hour, rtc_tm->tm_min, rtc_tm->tm_sec);
 
-	BCD_TO_BIN(rtc_tm->tm_sec);
-	BCD_TO_BIN(rtc_tm->tm_min);
-	BCD_TO_BIN(rtc_tm->tm_hour);
-	BCD_TO_BIN(rtc_tm->tm_mday);
-	BCD_TO_BIN(rtc_tm->tm_mon);
-	BCD_TO_BIN(rtc_tm->tm_year);
+	rtc_tm->tm_sec = bcd2bin(rtc_tm->tm_sec);
+	rtc_tm->tm_min = bcd2bin(rtc_tm->tm_min);
+	rtc_tm->tm_hour = bcd2bin(rtc_tm->tm_hour);
+	rtc_tm->tm_mday = bcd2bin(rtc_tm->tm_mday);
+	rtc_tm->tm_mon = bcd2bin(rtc_tm->tm_mon);
+	rtc_tm->tm_year = bcd2bin(rtc_tm->tm_year);
 
 	rtc_tm->tm_year += 100;
 	rtc_tm->tm_mon -= 1;
@@ -163,12 +163,12 @@ static int s3c_rtc_settime(struct device *dev, struct rtc_time *tm)
 		return -EINVAL;
 	}
 
-	writeb(BIN2BCD(tm->tm_sec),  base + S3C2410_RTCSEC);
-	writeb(BIN2BCD(tm->tm_min),  base + S3C2410_RTCMIN);
-	writeb(BIN2BCD(tm->tm_hour), base + S3C2410_RTCHOUR);
-	writeb(BIN2BCD(tm->tm_mday), base + S3C2410_RTCDATE);
-	writeb(BIN2BCD(tm->tm_mon + 1), base + S3C2410_RTCMON);
-	writeb(BIN2BCD(year), base + S3C2410_RTCYEAR);
+	writeb(bin2bcd(tm->tm_sec),  base + S3C2410_RTCSEC);
+	writeb(bin2bcd(tm->tm_min),  base + S3C2410_RTCMIN);
+	writeb(bin2bcd(tm->tm_hour), base + S3C2410_RTCHOUR);
+	writeb(bin2bcd(tm->tm_mday), base + S3C2410_RTCDATE);
+	writeb(bin2bcd(tm->tm_mon + 1), base + S3C2410_RTCMON);
+	writeb(bin2bcd(year), base + S3C2410_RTCYEAR);
 
 	return 0;
 }
@@ -199,34 +199,34 @@ static int s3c_rtc_getalarm(struct device *dev, struct rtc_wkalrm *alrm)
 	/* decode the alarm enable field */
 
 	if (alm_en & S3C2410_RTCALM_SECEN)
-		BCD_TO_BIN(alm_tm->tm_sec);
+		alm_tm->tm_sec = bcd2bin(alm_tm->tm_sec);
 	else
 		alm_tm->tm_sec = 0xff;
 
 	if (alm_en & S3C2410_RTCALM_MINEN)
-		BCD_TO_BIN(alm_tm->tm_min);
+		alm_tm->tm_min = bcd2bin(alm_tm->tm_min);
 	else
 		alm_tm->tm_min = 0xff;
 
 	if (alm_en & S3C2410_RTCALM_HOUREN)
-		BCD_TO_BIN(alm_tm->tm_hour);
+		alm_tm->tm_hour = bcd2bin(alm_tm->tm_hour);
 	else
 		alm_tm->tm_hour = 0xff;
 
 	if (alm_en & S3C2410_RTCALM_DAYEN)
-		BCD_TO_BIN(alm_tm->tm_mday);
+		alm_tm->tm_mday = bcd2bin(alm_tm->tm_mday);
 	else
 		alm_tm->tm_mday = 0xff;
 
 	if (alm_en & S3C2410_RTCALM_MONEN) {
-		BCD_TO_BIN(alm_tm->tm_mon);
+		alm_tm->tm_mon = bcd2bin(alm_tm->tm_mon);
 		alm_tm->tm_mon -= 1;
 	} else {
 		alm_tm->tm_mon = 0xff;
 	}
 
 	if (alm_en & S3C2410_RTCALM_YEAREN)
-		BCD_TO_BIN(alm_tm->tm_year);
+		alm_tm->tm_year = bcd2bin(alm_tm->tm_year);
 	else
 		alm_tm->tm_year = 0xffff;
 
@@ -250,17 +250,17 @@ static int s3c_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm)
 
 	if (tm->tm_sec < 60 && tm->tm_sec >= 0) {
 		alrm_en |= S3C2410_RTCALM_SECEN;
-		writeb(BIN2BCD(tm->tm_sec), base + S3C2410_ALMSEC);
+		writeb(bin2bcd(tm->tm_sec), base + S3C2410_ALMSEC);
 	}
 
 	if (tm->tm_min < 60 && tm->tm_min >= 0) {
 		alrm_en |= S3C2410_RTCALM_MINEN;
-		writeb(BIN2BCD(tm->tm_min), base + S3C2410_ALMMIN);
+		writeb(bin2bcd(tm->tm_min), base + S3C2410_ALMMIN);
 	}
 
 	if (tm->tm_hour < 24 && tm->tm_hour >= 0) {
 		alrm_en |= S3C2410_RTCALM_HOUREN;
-		writeb(BIN2BCD(tm->tm_hour), base + S3C2410_ALMHOUR);
+		writeb(bin2bcd(tm->tm_hour), base + S3C2410_ALMHOUR);
 	}
 
 	pr_debug("setting S3C2410_RTCALM to %08x\n", alrm_en);
diff --git a/drivers/rtc/rtc-sh.c b/drivers/rtc/rtc-sh.c
index 3f393c82e32c..aaf9d6a337cc 100644
--- a/drivers/rtc/rtc-sh.c
+++ b/drivers/rtc/rtc-sh.c
@@ -324,23 +324,23 @@ static int sh_rtc_read_time(struct device *dev, struct rtc_time *tm)
 
 		sec128 = readb(rtc->regbase + R64CNT);
 
-		tm->tm_sec	= BCD2BIN(readb(rtc->regbase + RSECCNT));
-		tm->tm_min	= BCD2BIN(readb(rtc->regbase + RMINCNT));
-		tm->tm_hour	= BCD2BIN(readb(rtc->regbase + RHRCNT));
-		tm->tm_wday	= BCD2BIN(readb(rtc->regbase + RWKCNT));
-		tm->tm_mday	= BCD2BIN(readb(rtc->regbase + RDAYCNT));
-		tm->tm_mon	= BCD2BIN(readb(rtc->regbase + RMONCNT)) - 1;
+		tm->tm_sec	= bcd2bin(readb(rtc->regbase + RSECCNT));
+		tm->tm_min	= bcd2bin(readb(rtc->regbase + RMINCNT));
+		tm->tm_hour	= bcd2bin(readb(rtc->regbase + RHRCNT));
+		tm->tm_wday	= bcd2bin(readb(rtc->regbase + RWKCNT));
+		tm->tm_mday	= bcd2bin(readb(rtc->regbase + RDAYCNT));
+		tm->tm_mon	= bcd2bin(readb(rtc->regbase + RMONCNT)) - 1;
 
 		if (rtc->capabilities & RTC_CAP_4_DIGIT_YEAR) {
 			yr  = readw(rtc->regbase + RYRCNT);
-			yr100 = BCD2BIN(yr >> 8);
+			yr100 = bcd2bin(yr >> 8);
 			yr &= 0xff;
 		} else {
 			yr  = readb(rtc->regbase + RYRCNT);
-			yr100 = BCD2BIN((yr == 0x99) ? 0x19 : 0x20);
+			yr100 = bcd2bin((yr == 0x99) ? 0x19 : 0x20);
 		}
 
-		tm->tm_year = (yr100 * 100 + BCD2BIN(yr)) - 1900;
+		tm->tm_year = (yr100 * 100 + bcd2bin(yr)) - 1900;
 
 		sec2 = readb(rtc->regbase + R64CNT);
 		cf_bit = readb(rtc->regbase + RCR1) & RCR1_CF;
@@ -382,20 +382,20 @@ static int sh_rtc_set_time(struct device *dev, struct rtc_time *tm)
 	tmp &= ~RCR2_START;
 	writeb(tmp, rtc->regbase + RCR2);
 
-	writeb(BIN2BCD(tm->tm_sec),  rtc->regbase + RSECCNT);
-	writeb(BIN2BCD(tm->tm_min),  rtc->regbase + RMINCNT);
-	writeb(BIN2BCD(tm->tm_hour), rtc->regbase + RHRCNT);
-	writeb(BIN2BCD(tm->tm_wday), rtc->regbase + RWKCNT);
-	writeb(BIN2BCD(tm->tm_mday), rtc->regbase + RDAYCNT);
-	writeb(BIN2BCD(tm->tm_mon + 1), rtc->regbase + RMONCNT);
+	writeb(bin2bcd(tm->tm_sec),  rtc->regbase + RSECCNT);
+	writeb(bin2bcd(tm->tm_min),  rtc->regbase + RMINCNT);
+	writeb(bin2bcd(tm->tm_hour), rtc->regbase + RHRCNT);
+	writeb(bin2bcd(tm->tm_wday), rtc->regbase + RWKCNT);
+	writeb(bin2bcd(tm->tm_mday), rtc->regbase + RDAYCNT);
+	writeb(bin2bcd(tm->tm_mon + 1), rtc->regbase + RMONCNT);
 
 	if (rtc->capabilities & RTC_CAP_4_DIGIT_YEAR) {
-		year = (BIN2BCD((tm->tm_year + 1900) / 100) << 8) |
-			BIN2BCD(tm->tm_year % 100);
+		year = (bin2bcd((tm->tm_year + 1900) / 100) << 8) |
+			bin2bcd(tm->tm_year % 100);
 		writew(year, rtc->regbase + RYRCNT);
 	} else {
 		year = tm->tm_year % 100;
-		writeb(BIN2BCD(year), rtc->regbase + RYRCNT);
+		writeb(bin2bcd(year), rtc->regbase + RYRCNT);
 	}
 
 	/* Start RTC */
@@ -417,7 +417,7 @@ static inline int sh_rtc_read_alarm_value(struct sh_rtc *rtc, int reg_off)
 	byte = readb(rtc->regbase + reg_off);
 	if (byte & AR_ENB) {
 		byte &= ~AR_ENB;	/* strip the enable bit */
-		value = BCD2BIN(byte);
+		value = bcd2bin(byte);
 	}
 
 	return value;
@@ -455,7 +455,7 @@ static inline void sh_rtc_write_alarm_value(struct sh_rtc *rtc,
 	if (value < 0)
 		writeb(0, rtc->regbase + reg_off);
 	else
-		writeb(BIN2BCD(value) | AR_ENB,  rtc->regbase + reg_off);
+		writeb(bin2bcd(value) | AR_ENB,  rtc->regbase + reg_off);
 }
 
 static int sh_rtc_check_alarm(struct rtc_time *tm)
diff --git a/drivers/rtc/rtc-stk17ta8.c b/drivers/rtc/rtc-stk17ta8.c
index 9a7e920315fa..f4cd46e15af9 100644
--- a/drivers/rtc/rtc-stk17ta8.c
+++ b/drivers/rtc/rtc-stk17ta8.c
@@ -82,14 +82,14 @@ static int stk17ta8_rtc_set_time(struct device *dev, struct rtc_time *tm)
 	flags = readb(pdata->ioaddr + RTC_FLAGS);
 	writeb(flags | RTC_WRITE, pdata->ioaddr + RTC_FLAGS);
 
-	writeb(BIN2BCD(tm->tm_year % 100), ioaddr + RTC_YEAR);
-	writeb(BIN2BCD(tm->tm_mon + 1), ioaddr + RTC_MONTH);
-	writeb(BIN2BCD(tm->tm_wday) & RTC_DAY_MASK, ioaddr + RTC_DAY);
-	writeb(BIN2BCD(tm->tm_mday), ioaddr + RTC_DATE);
-	writeb(BIN2BCD(tm->tm_hour), ioaddr + RTC_HOURS);
-	writeb(BIN2BCD(tm->tm_min), ioaddr + RTC_MINUTES);
-	writeb(BIN2BCD(tm->tm_sec) & RTC_SECONDS_MASK, ioaddr + RTC_SECONDS);
-	writeb(BIN2BCD((tm->tm_year + 1900) / 100), ioaddr + RTC_CENTURY);
+	writeb(bin2bcd(tm->tm_year % 100), ioaddr + RTC_YEAR);
+	writeb(bin2bcd(tm->tm_mon + 1), ioaddr + RTC_MONTH);
+	writeb(bin2bcd(tm->tm_wday) & RTC_DAY_MASK, ioaddr + RTC_DAY);
+	writeb(bin2bcd(tm->tm_mday), ioaddr + RTC_DATE);
+	writeb(bin2bcd(tm->tm_hour), ioaddr + RTC_HOURS);
+	writeb(bin2bcd(tm->tm_min), ioaddr + RTC_MINUTES);
+	writeb(bin2bcd(tm->tm_sec) & RTC_SECONDS_MASK, ioaddr + RTC_SECONDS);
+	writeb(bin2bcd((tm->tm_year + 1900) / 100), ioaddr + RTC_CENTURY);
 
 	writeb(flags & ~RTC_WRITE, pdata->ioaddr + RTC_FLAGS);
 	return 0;
@@ -120,14 +120,14 @@ static int stk17ta8_rtc_read_time(struct device *dev, struct rtc_time *tm)
 	year = readb(ioaddr + RTC_YEAR);
 	century = readb(ioaddr + RTC_CENTURY);
 	writeb(flags & ~RTC_READ, ioaddr + RTC_FLAGS);
-	tm->tm_sec = BCD2BIN(second);
-	tm->tm_min = BCD2BIN(minute);
-	tm->tm_hour = BCD2BIN(hour);
-	tm->tm_mday = BCD2BIN(day);
-	tm->tm_wday = BCD2BIN(week);
-	tm->tm_mon = BCD2BIN(month) - 1;
+	tm->tm_sec = bcd2bin(second);
+	tm->tm_min = bcd2bin(minute);
+	tm->tm_hour = bcd2bin(hour);
+	tm->tm_mday = bcd2bin(day);
+	tm->tm_wday = bcd2bin(week);
+	tm->tm_mon = bcd2bin(month) - 1;
 	/* year is 1900 + tm->tm_year */
-	tm->tm_year = BCD2BIN(year) + BCD2BIN(century) * 100 - 1900;
+	tm->tm_year = bcd2bin(year) + bcd2bin(century) * 100 - 1900;
 
 	if (rtc_valid_tm(tm) < 0) {
 		dev_err(dev, "retrieved date/time is not valid.\n");
@@ -148,16 +148,16 @@ static void stk17ta8_rtc_update_alarm(struct rtc_plat_data *pdata)
 	writeb(flags | RTC_WRITE, ioaddr + RTC_FLAGS);
 
 	writeb(pdata->alrm_mday < 0 || (pdata->irqen & RTC_UF) ?
-	       0x80 : BIN2BCD(pdata->alrm_mday),
+	       0x80 : bin2bcd(pdata->alrm_mday),
 	       ioaddr + RTC_DATE_ALARM);
 	writeb(pdata->alrm_hour < 0 || (pdata->irqen & RTC_UF) ?
-	       0x80 : BIN2BCD(pdata->alrm_hour),
+	       0x80 : bin2bcd(pdata->alrm_hour),
 	       ioaddr + RTC_HOURS_ALARM);
 	writeb(pdata->alrm_min < 0 || (pdata->irqen & RTC_UF) ?
-	       0x80 : BIN2BCD(pdata->alrm_min),
+	       0x80 : bin2bcd(pdata->alrm_min),
 	       ioaddr + RTC_MINUTES_ALARM);
 	writeb(pdata->alrm_sec < 0 || (pdata->irqen & RTC_UF) ?
-	       0x80 : BIN2BCD(pdata->alrm_sec),
+	       0x80 : bin2bcd(pdata->alrm_sec),
 	       ioaddr + RTC_SECONDS_ALARM);
 	writeb(pdata->irqen ? RTC_INTS_AIE : 0, ioaddr + RTC_INTERRUPTS);
 	readb(ioaddr + RTC_FLAGS);	/* clear interrupts */
@@ -280,7 +280,6 @@ static struct bin_attribute stk17ta8_nvram_attr = {
 	.attr = {
 		.name = "nvram",
 		.mode = S_IRUGO | S_IWUSR,
-		.owner = THIS_MODULE,
 	},
 	.size = RTC_OFFSET,
 	.read = stk17ta8_nvram_read,
diff --git a/drivers/rtc/rtc-v3020.c b/drivers/rtc/rtc-v3020.c
index 10025d840268..14d4f036a768 100644
--- a/drivers/rtc/rtc-v3020.c
+++ b/drivers/rtc/rtc-v3020.c
@@ -92,19 +92,19 @@ static int v3020_read_time(struct device *dev, struct rtc_time *dt)
 
 	/* ...and then read constant values. */
 	tmp = v3020_get_reg(chip, V3020_SECONDS);
-	dt->tm_sec	= BCD2BIN(tmp);
+	dt->tm_sec	= bcd2bin(tmp);
 	tmp = v3020_get_reg(chip, V3020_MINUTES);
-	dt->tm_min	= BCD2BIN(tmp);
+	dt->tm_min	= bcd2bin(tmp);
 	tmp = v3020_get_reg(chip, V3020_HOURS);
-	dt->tm_hour	= BCD2BIN(tmp);
+	dt->tm_hour	= bcd2bin(tmp);
 	tmp = v3020_get_reg(chip, V3020_MONTH_DAY);
-	dt->tm_mday	= BCD2BIN(tmp);
+	dt->tm_mday	= bcd2bin(tmp);
 	tmp = v3020_get_reg(chip, V3020_MONTH);
-	dt->tm_mon    = BCD2BIN(tmp) - 1;
+	dt->tm_mon    = bcd2bin(tmp) - 1;
 	tmp = v3020_get_reg(chip, V3020_WEEK_DAY);
-	dt->tm_wday	= BCD2BIN(tmp);
+	dt->tm_wday	= bcd2bin(tmp);
 	tmp = v3020_get_reg(chip, V3020_YEAR);
-	dt->tm_year = BCD2BIN(tmp)+100;
+	dt->tm_year = bcd2bin(tmp)+100;
 
 #ifdef DEBUG
 	printk("\n%s : Read RTC values\n",__func__);
@@ -136,13 +136,13 @@ static int v3020_set_time(struct device *dev, struct rtc_time *dt)
 #endif
 
 	/* Write all the values to ram... */
-	v3020_set_reg(chip, V3020_SECONDS, 	BIN2BCD(dt->tm_sec));
-	v3020_set_reg(chip, V3020_MINUTES, 	BIN2BCD(dt->tm_min));
-	v3020_set_reg(chip, V3020_HOURS, 	BIN2BCD(dt->tm_hour));
-	v3020_set_reg(chip, V3020_MONTH_DAY,	BIN2BCD(dt->tm_mday));
-	v3020_set_reg(chip, V3020_MONTH,     BIN2BCD(dt->tm_mon + 1));
-	v3020_set_reg(chip, V3020_WEEK_DAY, 	BIN2BCD(dt->tm_wday));
-	v3020_set_reg(chip, V3020_YEAR, 	BIN2BCD(dt->tm_year % 100));
+	v3020_set_reg(chip, V3020_SECONDS, 	bin2bcd(dt->tm_sec));
+	v3020_set_reg(chip, V3020_MINUTES, 	bin2bcd(dt->tm_min));
+	v3020_set_reg(chip, V3020_HOURS, 	bin2bcd(dt->tm_hour));
+	v3020_set_reg(chip, V3020_MONTH_DAY,	bin2bcd(dt->tm_mday));
+	v3020_set_reg(chip, V3020_MONTH,     bin2bcd(dt->tm_mon + 1));
+	v3020_set_reg(chip, V3020_WEEK_DAY, 	bin2bcd(dt->tm_wday));
+	v3020_set_reg(chip, V3020_YEAR, 	bin2bcd(dt->tm_year % 100));
 
 	/* ...and set the clock. */
 	v3020_set_reg(chip, V3020_CMD_RAM2CLOCK, 0);
diff --git a/drivers/rtc/rtc-x1205.c b/drivers/rtc/rtc-x1205.c
index 7dcfba1bbfe1..310c10795e9a 100644
--- a/drivers/rtc/rtc-x1205.c
+++ b/drivers/rtc/rtc-x1205.c
@@ -118,13 +118,13 @@ static int x1205_get_datetime(struct i2c_client *client, struct rtc_time *tm,
 		for (i = 0; i <= 4; i++)
 			buf[i] &= 0x7F;
 
-	tm->tm_sec = BCD2BIN(buf[CCR_SEC]);
-	tm->tm_min = BCD2BIN(buf[CCR_MIN]);
-	tm->tm_hour = BCD2BIN(buf[CCR_HOUR] & 0x3F); /* hr is 0-23 */
-	tm->tm_mday = BCD2BIN(buf[CCR_MDAY]);
-	tm->tm_mon = BCD2BIN(buf[CCR_MONTH]) - 1; /* mon is 0-11 */
-	tm->tm_year = BCD2BIN(buf[CCR_YEAR])
-			+ (BCD2BIN(buf[CCR_Y2K]) * 100) - 1900;
+	tm->tm_sec = bcd2bin(buf[CCR_SEC]);
+	tm->tm_min = bcd2bin(buf[CCR_MIN]);
+	tm->tm_hour = bcd2bin(buf[CCR_HOUR] & 0x3F); /* hr is 0-23 */
+	tm->tm_mday = bcd2bin(buf[CCR_MDAY]);
+	tm->tm_mon = bcd2bin(buf[CCR_MONTH]) - 1; /* mon is 0-11 */
+	tm->tm_year = bcd2bin(buf[CCR_YEAR])
+			+ (bcd2bin(buf[CCR_Y2K]) * 100) - 1900;
 	tm->tm_wday = buf[CCR_WDAY];
 
 	dev_dbg(&client->dev, "%s: tm is secs=%d, mins=%d, hours=%d, "
@@ -174,11 +174,11 @@ static int x1205_set_datetime(struct i2c_client *client, struct rtc_time *tm,
 		__func__,
 		tm->tm_sec, tm->tm_min, tm->tm_hour);
 
-	buf[CCR_SEC] = BIN2BCD(tm->tm_sec);
-	buf[CCR_MIN] = BIN2BCD(tm->tm_min);
+	buf[CCR_SEC] = bin2bcd(tm->tm_sec);
+	buf[CCR_MIN] = bin2bcd(tm->tm_min);
 
 	/* set hour and 24hr bit */
-	buf[CCR_HOUR] = BIN2BCD(tm->tm_hour) | X1205_HR_MIL;
+	buf[CCR_HOUR] = bin2bcd(tm->tm_hour) | X1205_HR_MIL;
 
 	/* should we also set the date? */
 	if (datetoo) {
@@ -187,15 +187,15 @@ static int x1205_set_datetime(struct i2c_client *client, struct rtc_time *tm,
 			__func__,
 			tm->tm_mday, tm->tm_mon, tm->tm_year, tm->tm_wday);
 
-		buf[CCR_MDAY] = BIN2BCD(tm->tm_mday);
+		buf[CCR_MDAY] = bin2bcd(tm->tm_mday);
 
 		/* month, 1 - 12 */
-		buf[CCR_MONTH] = BIN2BCD(tm->tm_mon + 1);
+		buf[CCR_MONTH] = bin2bcd(tm->tm_mon + 1);
 
 		/* year, since the rtc epoch*/
-		buf[CCR_YEAR] = BIN2BCD(tm->tm_year % 100);
+		buf[CCR_YEAR] = bin2bcd(tm->tm_year % 100);
 		buf[CCR_WDAY] = tm->tm_wday & 0x07;
-		buf[CCR_Y2K] = BIN2BCD(tm->tm_year / 100);
+		buf[CCR_Y2K] = bin2bcd(tm->tm_year / 100);
 	}
 
 	/* If writing alarm registers, set compare bits on registers 0-4 */
@@ -437,7 +437,7 @@ static int x1205_validate_client(struct i2c_client *client)
 			return -EIO;
 		}
 
-		value = BCD2BIN(reg & probe_limits_pattern[i].mask);
+		value = bcd2bin(reg & probe_limits_pattern[i].mask);
 
 		if (value > probe_limits_pattern[i].max ||
 			value < probe_limits_pattern[i].min) {
diff --git a/drivers/scsi/arcmsr/arcmsr_attr.c b/drivers/scsi/arcmsr/arcmsr_attr.c
index 69f8346aa288..5877f29a6005 100644
--- a/drivers/scsi/arcmsr/arcmsr_attr.c
+++ b/drivers/scsi/arcmsr/arcmsr_attr.c
@@ -189,7 +189,6 @@ static struct bin_attribute arcmsr_sysfs_message_read_attr = {
 	.attr = {
 		.name = "mu_read",
 		.mode = S_IRUSR ,
-		.owner = THIS_MODULE,
 	},
 	.size = 1032,
 	.read = arcmsr_sysfs_iop_message_read,
@@ -199,7 +198,6 @@ static struct bin_attribute arcmsr_sysfs_message_write_attr = {
 	.attr = {
 		.name = "mu_write",
 		.mode = S_IWUSR,
-		.owner = THIS_MODULE,
 	},
 	.size = 1032,
 	.write = arcmsr_sysfs_iop_message_write,
@@ -209,7 +207,6 @@ static struct bin_attribute arcmsr_sysfs_message_clear_attr = {
 	.attr = {
 		.name = "mu_clear",
 		.mode = S_IWUSR,
-		.owner = THIS_MODULE,
 	},
 	.size = 1,
 	.write = arcmsr_sysfs_iop_message_clear,
diff --git a/drivers/scsi/sr_vendor.c b/drivers/scsi/sr_vendor.c
index 4eb3da996b36..4ad3e017213f 100644
--- a/drivers/scsi/sr_vendor.c
+++ b/drivers/scsi/sr_vendor.c
@@ -223,9 +223,9 @@ int sr_cd_check(struct cdrom_device_info *cdi)
 				no_multi = 1;
 				break;
 			}
-			min = BCD2BIN(buffer[15]);
-			sec = BCD2BIN(buffer[16]);
-			frame = BCD2BIN(buffer[17]);
+			min = bcd2bin(buffer[15]);
+			sec = bcd2bin(buffer[16]);
+			frame = bcd2bin(buffer[17]);
 			sector = min * CD_SECS * CD_FRAMES + sec * CD_FRAMES + frame;
 			break;
 		}
@@ -252,9 +252,9 @@ int sr_cd_check(struct cdrom_device_info *cdi)
 			}
 			if (rc != 0)
 				break;
-			min = BCD2BIN(buffer[1]);
-			sec = BCD2BIN(buffer[2]);
-			frame = BCD2BIN(buffer[3]);
+			min = bcd2bin(buffer[1]);
+			sec = bcd2bin(buffer[2]);
+			frame = bcd2bin(buffer[3]);
 			sector = min * CD_SECS * CD_FRAMES + sec * CD_FRAMES + frame;
 			if (sector)
 				sector -= CD_MSF_OFFSET;
diff --git a/drivers/serial/8250_gsc.c b/drivers/serial/8250_gsc.c
index 0416ad3bc127..418b4fe9a0a1 100644
--- a/drivers/serial/8250_gsc.c
+++ b/drivers/serial/8250_gsc.c
@@ -111,7 +111,7 @@ static struct parisc_driver serial_driver = {
 	.probe		= serial_init_chip,
 };
 
-int __init probe_serial_gsc(void)
+static int __init probe_serial_gsc(void)
 {
 	register_parisc_driver(&lasi_driver);
 	register_parisc_driver(&serial_driver);
diff --git a/drivers/serial/serial_txx9.c b/drivers/serial/serial_txx9.c
index 8fcb4c5b9a26..7313c2edcb83 100644
--- a/drivers/serial/serial_txx9.c
+++ b/drivers/serial/serial_txx9.c
@@ -1039,7 +1039,7 @@ static int __devinit serial_txx9_probe(struct platform_device *dev)
 		ret = serial_txx9_register_port(&port);
 		if (ret < 0) {
 			dev_err(&dev->dev, "unable to register port at index %d "
-				"(IO%x MEM%llx IRQ%d): %d\n", i,
+				"(IO%lx MEM%llx IRQ%d): %d\n", i,
 				p->iobase, (unsigned long long)p->mapbase,
 				p->irq, ret);
 		}
diff --git a/drivers/serial/sn_console.c b/drivers/serial/sn_console.c
index b73e3c0056cd..d5276c012f78 100644
--- a/drivers/serial/sn_console.c
+++ b/drivers/serial/sn_console.c
@@ -61,7 +61,7 @@
 #define SN_SAL_BUFFER_SIZE (64 * (1 << 10))
 
 #define SN_SAL_UART_FIFO_DEPTH 16
-#define SN_SAL_UART_FIFO_SPEED_CPS 9600/10
+#define SN_SAL_UART_FIFO_SPEED_CPS (9600/10)
 
 /* sn_transmit_chars() calling args */
 #define TRANSMIT_BUFFERED	0
diff --git a/drivers/staging/go7007/Kconfig b/drivers/staging/go7007/Kconfig
index 57a121c338c4..593fdb767aad 100644
--- a/drivers/staging/go7007/Kconfig
+++ b/drivers/staging/go7007/Kconfig
@@ -1,10 +1,12 @@
 config VIDEO_GO7007
 	tristate "Go 7007 support"
 	depends on VIDEO_DEV && PCI && I2C && INPUT
+	depends on SND
 	select VIDEOBUF_DMA_SG
 	select VIDEO_IR
 	select VIDEO_TUNER
 	select VIDEO_TVEEPROM
+	select SND_PCM
 	select CRC32
 	default N
 	---help---
diff --git a/drivers/staging/sxg/Kconfig b/drivers/staging/sxg/Kconfig
index 1ae350806600..6e6cf0b9ef99 100644
--- a/drivers/staging/sxg/Kconfig
+++ b/drivers/staging/sxg/Kconfig
@@ -1,6 +1,7 @@
 config SXG
 	tristate "Alacritech SLIC Technology Non-Accelerated 10Gbe support"
 	depends on PCI && NETDEV_10000
+	depends on X86
 	default n
 	help
 	  This driver supports the Alacritech SLIC Technology Non-Accelerated
diff --git a/drivers/telephony/phonedev.c b/drivers/telephony/phonedev.c
index 4d74ba36c3a1..37caf4d69037 100644
--- a/drivers/telephony/phonedev.c
+++ b/drivers/telephony/phonedev.c
@@ -54,7 +54,6 @@ static int phone_open(struct inode *inode, struct file *file)
 	if (minor >= PHONE_NUM_DEVICES)
 		return -ENODEV;
 
-	lock_kernel();
 	mutex_lock(&phone_lock);
 	p = phone_device[minor];
 	if (p)
@@ -81,7 +80,6 @@ static int phone_open(struct inode *inode, struct file *file)
 	fops_put(old_fops);
 end:
 	mutex_unlock(&phone_lock);
-	unlock_kernel();
 	return err;
 }
 
diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c
index 217c5118ae9e..cd5f20da738a 100644
--- a/drivers/video/fbmem.c
+++ b/drivers/video/fbmem.c
@@ -1002,101 +1002,132 @@ fb_blank(struct fb_info *info, int blank)
  	return ret;
 }
 
-static int 
-fb_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
+static long
+fb_ioctl(struct file *file, unsigned int cmd,
 	 unsigned long arg)
 {
+	struct inode *inode = file->f_path.dentry->d_inode;
 	int fbidx = iminor(inode);
-	struct fb_info *info = registered_fb[fbidx];
-	struct fb_ops *fb = info->fbops;
+	struct fb_info *info;
+	struct fb_ops *fb;
 	struct fb_var_screeninfo var;
 	struct fb_fix_screeninfo fix;
 	struct fb_con2fbmap con2fb;
 	struct fb_cmap_user cmap;
 	struct fb_event event;
 	void __user *argp = (void __user *)arg;
-	int i;
-	
-	if (!fb)
+	long ret = 0;
+
+	info = registered_fb[fbidx];
+	mutex_lock(&info->lock);
+	fb = info->fbops;
+
+	if (!fb) {
+		mutex_unlock(&info->lock);
 		return -ENODEV;
+	}
 	switch (cmd) {
 	case FBIOGET_VSCREENINFO:
-		return copy_to_user(argp, &info->var,
+		ret = copy_to_user(argp, &info->var,
 				    sizeof(var)) ? -EFAULT : 0;
+		break;
 	case FBIOPUT_VSCREENINFO:
-		if (copy_from_user(&var, argp, sizeof(var)))
-			return -EFAULT;
+		if (copy_from_user(&var, argp, sizeof(var))) {
+			ret =  -EFAULT;
+			break;
+		}
 		acquire_console_sem();
 		info->flags |= FBINFO_MISC_USEREVENT;
-		i = fb_set_var(info, &var);
+		ret = fb_set_var(info, &var);
 		info->flags &= ~FBINFO_MISC_USEREVENT;
 		release_console_sem();
-		if (i) return i;
-		if (copy_to_user(argp, &var, sizeof(var)))
-			return -EFAULT;
-		return 0;
+		if (ret == 0 && copy_to_user(argp, &var, sizeof(var)))
+			ret = -EFAULT;
+		break;
 	case FBIOGET_FSCREENINFO:
-		return copy_to_user(argp, &info->fix,
+		ret = copy_to_user(argp, &info->fix,
 				    sizeof(fix)) ? -EFAULT : 0;
+		break;
 	case FBIOPUTCMAP:
 		if (copy_from_user(&cmap, argp, sizeof(cmap)))
-			return -EFAULT;
-		return (fb_set_user_cmap(&cmap, info));
+			ret = -EFAULT;
+		else
+			ret = fb_set_user_cmap(&cmap, info);
+		break;
 	case FBIOGETCMAP:
 		if (copy_from_user(&cmap, argp, sizeof(cmap)))
-			return -EFAULT;
-		return fb_cmap_to_user(&info->cmap, &cmap);
+			ret = -EFAULT;
+		else
+			ret = fb_cmap_to_user(&info->cmap, &cmap);
+		break;
 	case FBIOPAN_DISPLAY:
-		if (copy_from_user(&var, argp, sizeof(var)))
-			return -EFAULT;
+		if (copy_from_user(&var, argp, sizeof(var))) {
+			ret = -EFAULT;
+			break;
+		}
 		acquire_console_sem();
-		i = fb_pan_display(info, &var);
+		ret = fb_pan_display(info, &var);
 		release_console_sem();
-		if (i)
-			return i;
-		if (copy_to_user(argp, &var, sizeof(var)))
-			return -EFAULT;
-		return 0;
+		if (ret == 0 && copy_to_user(argp, &var, sizeof(var)))
+			ret = -EFAULT;
+		break;
 	case FBIO_CURSOR:
-		return -EINVAL;
+		ret = -EINVAL;
+		break;
 	case FBIOGET_CON2FBMAP:
 		if (copy_from_user(&con2fb, argp, sizeof(con2fb)))
-			return -EFAULT;
-		if (con2fb.console < 1 || con2fb.console > MAX_NR_CONSOLES)
-		    return -EINVAL;
-		con2fb.framebuffer = -1;
-		event.info = info;
-		event.data = &con2fb;
-		fb_notifier_call_chain(FB_EVENT_GET_CONSOLE_MAP, &event);
-		return copy_to_user(argp, &con2fb,
+			ret = -EFAULT;
+		else if (con2fb.console < 1 || con2fb.console > MAX_NR_CONSOLES)
+			ret = -EINVAL;
+		else {
+			con2fb.framebuffer = -1;
+			event.info = info;
+			event.data = &con2fb;
+			fb_notifier_call_chain(FB_EVENT_GET_CONSOLE_MAP,
+								&event);
+			ret = copy_to_user(argp, &con2fb,
 				    sizeof(con2fb)) ? -EFAULT : 0;
+		}
+		break;
 	case FBIOPUT_CON2FBMAP:
-		if (copy_from_user(&con2fb, argp, sizeof(con2fb)))
-			return - EFAULT;
-		if (con2fb.console < 1 || con2fb.console > MAX_NR_CONSOLES)
-		    return -EINVAL;
-		if (con2fb.framebuffer < 0 || con2fb.framebuffer >= FB_MAX)
-		    return -EINVAL;
-		if (!registered_fb[con2fb.framebuffer])
-		    request_module("fb%d", con2fb.framebuffer);
+		if (copy_from_user(&con2fb, argp, sizeof(con2fb))) {
+			ret = -EFAULT;
+			break;
+		}
+		if (con2fb.console < 1 || con2fb.console > MAX_NR_CONSOLES) {
+			ret = -EINVAL;
+			break;
+		}
+		if (con2fb.framebuffer < 0 || con2fb.framebuffer >= FB_MAX) {
+			ret = -EINVAL;
+			break;
+		}
 		if (!registered_fb[con2fb.framebuffer])
-		    return -EINVAL;
+			request_module("fb%d", con2fb.framebuffer);
+		if (!registered_fb[con2fb.framebuffer]) {
+			ret = -EINVAL;
+			break;
+		}
 		event.info = info;
 		event.data = &con2fb;
-		return fb_notifier_call_chain(FB_EVENT_SET_CONSOLE_MAP,
+		ret = fb_notifier_call_chain(FB_EVENT_SET_CONSOLE_MAP,
 					      &event);
+		break;
 	case FBIOBLANK:
 		acquire_console_sem();
 		info->flags |= FBINFO_MISC_USEREVENT;
-		i = fb_blank(info, arg);
+		ret = fb_blank(info, arg);
 		info->flags &= ~FBINFO_MISC_USEREVENT;
 		release_console_sem();
-		return i;
+		break;;
 	default:
 		if (fb->fb_ioctl == NULL)
-			return -EINVAL;
-		return fb->fb_ioctl(info, cmd, arg);
+			ret = -ENOTTY;
+		else
+			ret = fb->fb_ioctl(info, cmd, arg);
 	}
+	mutex_unlock(&info->lock);
+	return ret;
 }
 
 #ifdef CONFIG_COMPAT
@@ -1150,7 +1181,7 @@ static int fb_getput_cmap(struct inode *inode, struct file *file,
 	    put_user(compat_ptr(data), &cmap->transp))
 		return -EFAULT;
 
-	err = fb_ioctl(inode, file, cmd, (unsigned long) cmap);
+	err = fb_ioctl(file, cmd, (unsigned long) cmap);
 
 	if (!err) {
 		if (copy_in_user(&cmap32->start,
@@ -1204,7 +1235,7 @@ static int fb_get_fscreeninfo(struct inode *inode, struct file *file,
 
 	old_fs = get_fs();
 	set_fs(KERNEL_DS);
-	err = fb_ioctl(inode, file, cmd, (unsigned long) &fix);
+	err = fb_ioctl(file, cmd, (unsigned long) &fix);
 	set_fs(old_fs);
 
 	if (!err)
@@ -1222,7 +1253,7 @@ fb_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	struct fb_ops *fb = info->fbops;
 	long ret = -ENOIOCTLCMD;
 
-	lock_kernel();
+	mutex_lock(&info->lock);
 	switch(cmd) {
 	case FBIOGET_VSCREENINFO:
 	case FBIOPUT_VSCREENINFO:
@@ -1231,7 +1262,7 @@ fb_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case FBIOPUT_CON2FBMAP:
 		arg = (unsigned long) compat_ptr(arg);
 	case FBIOBLANK:
-		ret = fb_ioctl(inode, file, cmd, arg);
+		ret = fb_ioctl(file, cmd, arg);
 		break;
 
 	case FBIOGET_FSCREENINFO:
@@ -1248,7 +1279,7 @@ fb_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 			ret = fb->fb_compat_ioctl(info, cmd, arg);
 		break;
 	}
-	unlock_kernel();
+	mutex_unlock(&info->lock);
 	return ret;
 }
 #endif
@@ -1270,13 +1301,13 @@ fb_mmap(struct file *file, struct vm_area_struct * vma)
 		return -ENODEV;
 	if (fb->fb_mmap) {
 		int res;
-		lock_kernel();
+		mutex_lock(&info->lock);
 		res = fb->fb_mmap(info, vma);
-		unlock_kernel();
+		mutex_unlock(&info->lock);
 		return res;
 	}
 
-	lock_kernel();
+	mutex_lock(&info->lock);
 
 	/* frame buffer memory */
 	start = info->fix.smem_start;
@@ -1285,13 +1316,13 @@ fb_mmap(struct file *file, struct vm_area_struct * vma)
 		/* memory mapped io */
 		off -= len;
 		if (info->var.accel_flags) {
-			unlock_kernel();
+			mutex_unlock(&info->lock);
 			return -EINVAL;
 		}
 		start = info->fix.mmio_start;
 		len = PAGE_ALIGN((start & ~PAGE_MASK) + info->fix.mmio_len);
 	}
-	unlock_kernel();
+	mutex_unlock(&info->lock);
 	start &= PAGE_MASK;
 	if ((vma->vm_end - vma->vm_start + off) > len)
 		return -EINVAL;
@@ -1315,13 +1346,13 @@ fb_open(struct inode *inode, struct file *file)
 
 	if (fbidx >= FB_MAX)
 		return -ENODEV;
-	lock_kernel();
-	if (!(info = registered_fb[fbidx]))
+	info = registered_fb[fbidx];
+	if (!info)
 		request_module("fb%d", fbidx);
-	if (!(info = registered_fb[fbidx])) {
-		res = -ENODEV;
-		goto out;
-	}
+	info = registered_fb[fbidx];
+	if (!info)
+		return -ENODEV;
+	mutex_lock(&info->lock);
 	if (!try_module_get(info->fbops->owner)) {
 		res = -ENODEV;
 		goto out;
@@ -1337,7 +1368,7 @@ fb_open(struct inode *inode, struct file *file)
 		fb_deferred_io_open(info, inode, file);
 #endif
 out:
-	unlock_kernel();
+	mutex_unlock(&info->lock);
 	return res;
 }
 
@@ -1346,11 +1377,11 @@ fb_release(struct inode *inode, struct file *file)
 {
 	struct fb_info * const info = file->private_data;
 
-	lock_kernel();
+	mutex_lock(&info->lock);
 	if (info->fbops->fb_release)
 		info->fbops->fb_release(info,1);
 	module_put(info->fbops->owner);
-	unlock_kernel();
+	mutex_unlock(&info->lock);
 	return 0;
 }
 
@@ -1358,7 +1389,7 @@ static const struct file_operations fb_fops = {
 	.owner =	THIS_MODULE,
 	.read =		fb_read,
 	.write =	fb_write,
-	.ioctl =	fb_ioctl,
+	.unlocked_ioctl = fb_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl = fb_compat_ioctl,
 #endif
@@ -1429,6 +1460,7 @@ register_framebuffer(struct fb_info *fb_info)
 		if (!registered_fb[i])
 			break;
 	fb_info->node = i;
+	mutex_init(&fb_info->lock);
 
 	fb_info->dev = device_create(fb_class, fb_info->device,
 				     MKDEV(FB_MAJOR, i), NULL, "fb%d", i);
diff --git a/drivers/w1/slaves/w1_ds2760.c b/drivers/w1/slaves/w1_ds2760.c
index ed6b0576208c..1f09d4e4144c 100644
--- a/drivers/w1/slaves/w1_ds2760.c
+++ b/drivers/w1/slaves/w1_ds2760.c
@@ -80,7 +80,6 @@ static struct bin_attribute w1_ds2760_bin_attr = {
 	.attr = {
 		.name = "w1_slave",
 		.mode = S_IRUGO,
-		.owner = THIS_MODULE,
 	},
 	.size = DS2760_DATA_SIZE,
 	.read = w1_ds2760_read_bin,
diff --git a/fs/Kconfig b/fs/Kconfig
index d0a1174fb516..4eca61c201f0 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1168,195 +1168,7 @@ config EFS_FS
 	  To compile the EFS file system support as a module, choose M here: the
 	  module will be called efs.
 
-config JFFS2_FS
-	tristate "Journalling Flash File System v2 (JFFS2) support"
-	select CRC32
-	depends on MTD
-	help
-	  JFFS2 is the second generation of the Journalling Flash File System
-	  for use on diskless embedded devices. It provides improved wear
-	  levelling, compression and support for hard links. You cannot use
-	  this on normal block devices, only on 'MTD' devices.
-
-	  Further information on the design and implementation of JFFS2 is
-	  available at <http://sources.redhat.com/jffs2/>.
-
-config JFFS2_FS_DEBUG
-	int "JFFS2 debugging verbosity (0 = quiet, 2 = noisy)"
-	depends on JFFS2_FS
-	default "0"
-	help
-	  This controls the amount of debugging messages produced by the JFFS2
-	  code. Set it to zero for use in production systems. For evaluation,
-	  testing and debugging, it's advisable to set it to one. This will
-	  enable a few assertions and will print debugging messages at the
-	  KERN_DEBUG loglevel, where they won't normally be visible. Level 2
-	  is unlikely to be useful - it enables extra debugging in certain
-	  areas which at one point needed debugging, but when the bugs were
-	  located and fixed, the detailed messages were relegated to level 2.
-
-	  If reporting bugs, please try to have available a full dump of the
-	  messages at debug level 1 while the misbehaviour was occurring.
-
-config JFFS2_FS_WRITEBUFFER
-	bool "JFFS2 write-buffering support"
-	depends on JFFS2_FS
-	default y
-	help
-	  This enables the write-buffering support in JFFS2.
-
-	  This functionality is required to support JFFS2 on the following
-	  types of flash devices:
-	    - NAND flash
-	    - NOR flash with transparent ECC
-	    - DataFlash
-
-config JFFS2_FS_WBUF_VERIFY
-	bool "Verify JFFS2 write-buffer reads"
-	depends on JFFS2_FS_WRITEBUFFER
-	default n
-	help
-	  This causes JFFS2 to read back every page written through the
-	  write-buffer, and check for errors.
-
-config JFFS2_SUMMARY
-	bool "JFFS2 summary support (EXPERIMENTAL)"
-	depends on JFFS2_FS && EXPERIMENTAL
-	default n
-	help
-	  This feature makes it possible to use summary information
-	  for faster filesystem mount.
-
-	  The summary information can be inserted into a filesystem image
-	  by the utility 'sumtool'.
-
-	  If unsure, say 'N'.
-
-config JFFS2_FS_XATTR
-	bool "JFFS2 XATTR support (EXPERIMENTAL)"
-	depends on JFFS2_FS && EXPERIMENTAL
-	default n
-	help
-	  Extended attributes are name:value pairs associated with inodes by
-	  the kernel or by users (see the attr(5) manual page, or visit
-	  <http://acl.bestbits.at/> for details).
-
-	  If unsure, say N.
-
-config JFFS2_FS_POSIX_ACL
-	bool "JFFS2 POSIX Access Control Lists"
-	depends on JFFS2_FS_XATTR
-	default y
-	select FS_POSIX_ACL
-	help
-	  Posix Access Control Lists (ACLs) support permissions for users and
-	  groups beyond the owner/group/world scheme.
-
-	  To learn more about Access Control Lists, visit the Posix ACLs for
-	  Linux website <http://acl.bestbits.at/>.
-
-	  If you don't know what Access Control Lists are, say N
-
-config JFFS2_FS_SECURITY
-	bool "JFFS2 Security Labels"
-	depends on JFFS2_FS_XATTR
-	default y
-	help
-	  Security labels support alternative access control models
-	  implemented by security modules like SELinux.  This option
-	  enables an extended attribute handler for file security
-	  labels in the jffs2 filesystem.
-
-	  If you are not using a security module that requires using
-	  extended attributes for file security labels, say N.
-
-config JFFS2_COMPRESSION_OPTIONS
-	bool "Advanced compression options for JFFS2"
-	depends on JFFS2_FS
-	default n
-	help
-	  Enabling this option allows you to explicitly choose which
-	  compression modules, if any, are enabled in JFFS2. Removing
-	  compressors can mean you cannot read existing file systems,
-	  and enabling experimental compressors can mean that you
-	  write a file system which cannot be read by a standard kernel.
-
-	  If unsure, you should _definitely_ say 'N'.
-
-config JFFS2_ZLIB
-	bool "JFFS2 ZLIB compression support" if JFFS2_COMPRESSION_OPTIONS
-	select ZLIB_INFLATE
-	select ZLIB_DEFLATE
-	depends on JFFS2_FS
-	default y
-	help
-	  Zlib is designed to be a free, general-purpose, legally unencumbered,
-	  lossless data-compression library for use on virtually any computer
-	  hardware and operating system. See <http://www.gzip.org/zlib/> for
-	  further information.
-
-	  Say 'Y' if unsure.
-
-config JFFS2_LZO
-	bool "JFFS2 LZO compression support" if JFFS2_COMPRESSION_OPTIONS
-	select LZO_COMPRESS
-	select LZO_DECOMPRESS
-	depends on JFFS2_FS
-	default n
-	help
-	  minilzo-based compression. Generally works better than Zlib.
-
-	  This feature was added in July, 2007. Say 'N' if you need
-	  compatibility with older bootloaders or kernels.
-
-config JFFS2_RTIME
-	bool "JFFS2 RTIME compression support" if JFFS2_COMPRESSION_OPTIONS
-	depends on JFFS2_FS
-	default y
-	help
-	  Rtime does manage to recompress already-compressed data. Say 'Y' if unsure.
-
-config JFFS2_RUBIN
-	bool "JFFS2 RUBIN compression support" if JFFS2_COMPRESSION_OPTIONS
-	depends on JFFS2_FS
-	default n
-	help
-	  RUBINMIPS and DYNRUBIN compressors. Say 'N' if unsure.
-
-choice
-	prompt "JFFS2 default compression mode" if JFFS2_COMPRESSION_OPTIONS
-	default JFFS2_CMODE_PRIORITY
-	depends on JFFS2_FS
-	help
-	  You can set here the default compression mode of JFFS2 from
-	  the available compression modes. Don't touch if unsure.
-
-config JFFS2_CMODE_NONE
-	bool "no compression"
-	help
-	  Uses no compression.
-
-config JFFS2_CMODE_PRIORITY
-	bool "priority"
-	help
-	  Tries the compressors in a predefined order and chooses the first
-	  successful one.
-
-config JFFS2_CMODE_SIZE
-	bool "size (EXPERIMENTAL)"
-	help
-	  Tries all compressors and chooses the one which has the smallest
-	  result.
-
-config JFFS2_CMODE_FAVOURLZO
-	bool "Favour LZO"
-	help
-	  Tries all compressors and chooses the one which has the smallest
-	  result but gives some preference to LZO (which has faster
-	  decompression) at the expense of size.
-
-endchoice
-
+source "fs/jffs2/Kconfig"
 # UBIFS File system configuration
 source "fs/ubifs/Kconfig"
 
@@ -1913,148 +1725,7 @@ config SMB_NLS_REMOTE
 
 	  smbmount from samba 2.2.0 or later supports this.
 
-config CIFS
-	tristate "CIFS support (advanced network filesystem, SMBFS successor)"
-	depends on INET
-	select NLS
-	help
-	  This is the client VFS module for the Common Internet File System
-	  (CIFS) protocol which is the successor to the Server Message Block 
-	  (SMB) protocol, the native file sharing mechanism for most early
-	  PC operating systems.  The CIFS protocol is fully supported by 
-	  file servers such as Windows 2000 (including Windows 2003, NT 4  
-	  and Windows XP) as well by Samba (which provides excellent CIFS
-	  server support for Linux and many other operating systems). Limited
-	  support for OS/2 and Windows ME and similar servers is provided as
-	  well.
-
-	  The cifs module provides an advanced network file system
-	  client for mounting to CIFS compliant servers.  It includes
-	  support for DFS (hierarchical name space), secure per-user
-	  session establishment via Kerberos or NTLM or NTLMv2,
-	  safe distributed caching (oplock), optional packet
-	  signing, Unicode and other internationalization improvements.
-	  If you need to mount to Samba or Windows from this machine, say Y.
-
-config CIFS_STATS
-        bool "CIFS statistics"
-        depends on CIFS
-        help
-          Enabling this option will cause statistics for each server share
-	  mounted by the cifs client to be displayed in /proc/fs/cifs/Stats
-
-config CIFS_STATS2
-	bool "Extended statistics"
-	depends on CIFS_STATS
-	help
-	  Enabling this option will allow more detailed statistics on SMB
-	  request timing to be displayed in /proc/fs/cifs/DebugData and also
-	  allow optional logging of slow responses to dmesg (depending on the
-	  value of /proc/fs/cifs/cifsFYI, see fs/cifs/README for more details).
-	  These additional statistics may have a minor effect on performance
-	  and memory utilization.
-
-	  Unless you are a developer or are doing network performance analysis
-	  or tuning, say N.
-
-config CIFS_WEAK_PW_HASH
-	bool "Support legacy servers which use weaker LANMAN security"
-	depends on CIFS
-	help
-	  Modern CIFS servers including Samba and most Windows versions
-	  (since 1997) support stronger NTLM (and even NTLMv2 and Kerberos)
-	  security mechanisms. These hash the password more securely
-	  than the mechanisms used in the older LANMAN version of the
-	  SMB protocol but LANMAN based authentication is needed to
-	  establish sessions with some old SMB servers.
-
-	  Enabling this option allows the cifs module to mount to older
-	  LANMAN based servers such as OS/2 and Windows 95, but such
-	  mounts may be less secure than mounts using NTLM or more recent
-	  security mechanisms if you are on a public network.  Unless you
-	  have a need to access old SMB servers (and are on a private
-	  network) you probably want to say N.  Even if this support
-	  is enabled in the kernel build, LANMAN authentication will not be
-	  used automatically. At runtime LANMAN mounts are disabled but
-	  can be set to required (or optional) either in
-	  /proc/fs/cifs (see fs/cifs/README for more detail) or via an
-	  option on the mount command. This support is disabled by
-	  default in order to reduce the possibility of a downgrade
-	  attack.
-
-	  If unsure, say N.
-
-config CIFS_UPCALL
-	  bool "Kerberos/SPNEGO advanced session setup"
-	  depends on CIFS && KEYS
-	  help
-	    Enables an upcall mechanism for CIFS which accesses
-	    userspace helper utilities to provide SPNEGO packaged (RFC 4178)
-	    Kerberos tickets which are needed to mount to certain secure servers
-	    (for which more secure Kerberos authentication is required). If
-	    unsure, say N.
-
-config CIFS_XATTR
-        bool "CIFS extended attributes"
-        depends on CIFS
-        help
-          Extended attributes are name:value pairs associated with inodes by
-          the kernel or by users (see the attr(5) manual page, or visit
-          <http://acl.bestbits.at/> for details).  CIFS maps the name of
-          extended attributes beginning with the user namespace prefix
-          to SMB/CIFS EAs. EAs are stored on Windows servers without the
-          user namespace prefix, but their names are seen by Linux cifs clients
-          prefaced by the user namespace prefix. The system namespace
-          (used by some filesystems to store ACLs) is not supported at
-          this time.
-
-          If unsure, say N.
-
-config CIFS_POSIX
-        bool "CIFS POSIX Extensions"
-        depends on CIFS_XATTR
-        help
-          Enabling this option will cause the cifs client to attempt to
-	  negotiate a newer dialect with servers, such as Samba 3.0.5
-	  or later, that optionally can handle more POSIX like (rather
-	  than Windows like) file behavior.  It also enables
-	  support for POSIX ACLs (getfacl and setfacl) to servers
-	  (such as Samba 3.10 and later) which can negotiate
-	  CIFS POSIX ACL support.  If unsure, say N.
-
-config CIFS_DEBUG2
-	bool "Enable additional CIFS debugging routines"
-	depends on CIFS
-	help
-	   Enabling this option adds a few more debugging routines
-	   to the cifs code which slightly increases the size of
-	   the cifs module and can cause additional logging of debug
-	   messages in some error paths, slowing performance. This
-	   option can be turned off unless you are debugging
-	   cifs problems.  If unsure, say N.
-
-config CIFS_EXPERIMENTAL
-	  bool "CIFS Experimental Features (EXPERIMENTAL)"
-	  depends on CIFS && EXPERIMENTAL
-	  help
-	    Enables cifs features under testing. These features are
-	    experimental and currently include DFS support and directory 
-	    change notification ie fcntl(F_DNOTIFY), as well as the upcall
-	    mechanism which will be used for Kerberos session negotiation
-	    and uid remapping.  Some of these features also may depend on 
-	    setting a value of 1 to the pseudo-file /proc/fs/cifs/Experimental
-	    (which is disabled by default). See the file fs/cifs/README 
-	    for more details.  If unsure, say N.
-
-config CIFS_DFS_UPCALL
-	  bool "DFS feature support (EXPERIMENTAL)"
-	  depends on CIFS_EXPERIMENTAL
-	  depends on KEYS
-	  help
-	    Enables an upcall mechanism for CIFS which contacts userspace
-	    helper utilities to provide server name resolution (host names to
-	    IP addresses) which is needed for implicit mounts of DFS junction
-	    points. If unsure, say N.
+source "fs/cifs/Kconfig"
 
 config NCP_FS
 	tristate "NCP file system support (to mount NetWare volumes)"
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 801db1341811..ce9fb3fbfae4 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -40,6 +40,28 @@ config BINFMT_ELF_FDPIC
 
 	  It is also possible to run FDPIC ELF binaries on MMU linux also.
 
+config CORE_DUMP_DEFAULT_ELF_HEADERS
+	bool "Write ELF core dumps with partial segments"
+	default n
+	depends on BINFMT_ELF
+	help
+	  ELF core dump files describe each memory mapping of the crashed
+	  process, and can contain or omit the memory contents of each one.
+	  The contents of an unmodified text mapping are omitted by default.
+
+	  For an unmodified text mapping of an ELF object, including just
+	  the first page of the file in a core dump makes it possible to
+	  identify the build ID bits in the file, without paying the i/o
+	  cost and disk space to dump all the text.  However, versions of
+	  GDB before 6.7 are confused by ELF core dump files in this format.
+
+	  The core dump behavior can be controlled per process using
+	  the /proc/PID/coredump_filter pseudo-file; this setting is
+	  inherited.  See Documentation/filesystems/proc.txt for details.
+
+	  This config option changes the default setting of coredump_filter
+	  seen at boot time.  If unsure, say N.
+
 config BINFMT_FLAT
 	bool "Kernel support for flat binaries"
 	depends on !MMU && (!FRV || BROKEN)
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index c76afa26edf7..e2159063198a 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1156,16 +1156,24 @@ static int dump_seek(struct file *file, loff_t off)
 static unsigned long vma_dump_size(struct vm_area_struct *vma,
 				   unsigned long mm_flags)
 {
+#define FILTER(type)	(mm_flags & (1UL << MMF_DUMP_##type))
+
 	/* The vma can be set up to tell us the answer directly.  */
 	if (vma->vm_flags & VM_ALWAYSDUMP)
 		goto whole;
 
+	/* Hugetlb memory check */
+	if (vma->vm_flags & VM_HUGETLB) {
+		if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
+			goto whole;
+		if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE))
+			goto whole;
+	}
+
 	/* Do not dump I/O mapped devices or special mappings */
 	if (vma->vm_flags & (VM_IO | VM_RESERVED))
 		return 0;
 
-#define FILTER(type)	(mm_flags & (1UL << MMF_DUMP_##type))
-
 	/* By default, dump shared memory if mapped from an anonymous file. */
 	if (vma->vm_flags & VM_SHARED) {
 		if (vma->vm_file->f_path.dentry->d_inode->i_nlink == 0 ?
diff --git a/fs/buffer.c b/fs/buffer.c
index ac78d4c19b3b..6569fda5cfed 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -76,8 +76,7 @@ EXPORT_SYMBOL(__lock_buffer);
 
 void unlock_buffer(struct buffer_head *bh)
 {
-	smp_mb__before_clear_bit();
-	clear_buffer_locked(bh);
+	clear_bit_unlock(BH_Lock, &bh->b_state);
 	smp_mb__after_clear_bit();
 	wake_up_bit(&bh->b_state, BH_Lock);
 }
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
new file mode 100644
index 000000000000..341a98965bd0
--- /dev/null
+++ b/fs/cifs/Kconfig
@@ -0,0 +1,142 @@
+config CIFS
+	tristate "CIFS support (advanced network filesystem, SMBFS successor)"
+	depends on INET
+	select NLS
+	help
+	  This is the client VFS module for the Common Internet File System
+	  (CIFS) protocol which is the successor to the Server Message Block
+	  (SMB) protocol, the native file sharing mechanism for most early
+	  PC operating systems.  The CIFS protocol is fully supported by
+	  file servers such as Windows 2000 (including Windows 2003, NT 4
+	  and Windows XP) as well by Samba (which provides excellent CIFS
+	  server support for Linux and many other operating systems). Limited
+	  support for OS/2 and Windows ME and similar servers is provided as
+	  well.
+
+	  The cifs module provides an advanced network file system
+	  client for mounting to CIFS compliant servers.  It includes
+	  support for DFS (hierarchical name space), secure per-user
+	  session establishment via Kerberos or NTLM or NTLMv2,
+	  safe distributed caching (oplock), optional packet
+	  signing, Unicode and other internationalization improvements.
+	  If you need to mount to Samba or Windows from this machine, say Y.
+
+config CIFS_STATS
+        bool "CIFS statistics"
+        depends on CIFS
+        help
+          Enabling this option will cause statistics for each server share
+	  mounted by the cifs client to be displayed in /proc/fs/cifs/Stats
+
+config CIFS_STATS2
+	bool "Extended statistics"
+	depends on CIFS_STATS
+	help
+	  Enabling this option will allow more detailed statistics on SMB
+	  request timing to be displayed in /proc/fs/cifs/DebugData and also
+	  allow optional logging of slow responses to dmesg (depending on the
+	  value of /proc/fs/cifs/cifsFYI, see fs/cifs/README for more details).
+	  These additional statistics may have a minor effect on performance
+	  and memory utilization.
+
+	  Unless you are a developer or are doing network performance analysis
+	  or tuning, say N.
+
+config CIFS_WEAK_PW_HASH
+	bool "Support legacy servers which use weaker LANMAN security"
+	depends on CIFS
+	help
+	  Modern CIFS servers including Samba and most Windows versions
+	  (since 1997) support stronger NTLM (and even NTLMv2 and Kerberos)
+	  security mechanisms. These hash the password more securely
+	  than the mechanisms used in the older LANMAN version of the
+	  SMB protocol but LANMAN based authentication is needed to
+	  establish sessions with some old SMB servers.
+
+	  Enabling this option allows the cifs module to mount to older
+	  LANMAN based servers such as OS/2 and Windows 95, but such
+	  mounts may be less secure than mounts using NTLM or more recent
+	  security mechanisms if you are on a public network.  Unless you
+	  have a need to access old SMB servers (and are on a private
+	  network) you probably want to say N.  Even if this support
+	  is enabled in the kernel build, LANMAN authentication will not be
+	  used automatically. At runtime LANMAN mounts are disabled but
+	  can be set to required (or optional) either in
+	  /proc/fs/cifs (see fs/cifs/README for more detail) or via an
+	  option on the mount command. This support is disabled by
+	  default in order to reduce the possibility of a downgrade
+	  attack.
+
+	  If unsure, say N.
+
+config CIFS_UPCALL
+	  bool "Kerberos/SPNEGO advanced session setup"
+	  depends on CIFS && KEYS
+	  help
+	    Enables an upcall mechanism for CIFS which accesses
+	    userspace helper utilities to provide SPNEGO packaged (RFC 4178)
+	    Kerberos tickets which are needed to mount to certain secure servers
+	    (for which more secure Kerberos authentication is required). If
+	    unsure, say N.
+
+config CIFS_XATTR
+        bool "CIFS extended attributes"
+        depends on CIFS
+        help
+          Extended attributes are name:value pairs associated with inodes by
+          the kernel or by users (see the attr(5) manual page, or visit
+          <http://acl.bestbits.at/> for details).  CIFS maps the name of
+          extended attributes beginning with the user namespace prefix
+          to SMB/CIFS EAs. EAs are stored on Windows servers without the
+          user namespace prefix, but their names are seen by Linux cifs clients
+          prefaced by the user namespace prefix. The system namespace
+          (used by some filesystems to store ACLs) is not supported at
+          this time.
+
+          If unsure, say N.
+
+config CIFS_POSIX
+        bool "CIFS POSIX Extensions"
+        depends on CIFS_XATTR
+        help
+          Enabling this option will cause the cifs client to attempt to
+	  negotiate a newer dialect with servers, such as Samba 3.0.5
+	  or later, that optionally can handle more POSIX like (rather
+	  than Windows like) file behavior.  It also enables
+	  support for POSIX ACLs (getfacl and setfacl) to servers
+	  (such as Samba 3.10 and later) which can negotiate
+	  CIFS POSIX ACL support.  If unsure, say N.
+
+config CIFS_DEBUG2
+	bool "Enable additional CIFS debugging routines"
+	depends on CIFS
+	help
+	   Enabling this option adds a few more debugging routines
+	   to the cifs code which slightly increases the size of
+	   the cifs module and can cause additional logging of debug
+	   messages in some error paths, slowing performance. This
+	   option can be turned off unless you are debugging
+	   cifs problems.  If unsure, say N.
+
+config CIFS_EXPERIMENTAL
+	  bool "CIFS Experimental Features (EXPERIMENTAL)"
+	  depends on CIFS && EXPERIMENTAL
+	  help
+	    Enables cifs features under testing. These features are
+	    experimental and currently include DFS support and directory
+	    change notification ie fcntl(F_DNOTIFY), as well as the upcall
+	    mechanism which will be used for Kerberos session negotiation
+	    and uid remapping.  Some of these features also may depend on
+	    setting a value of 1 to the pseudo-file /proc/fs/cifs/Experimental
+	    (which is disabled by default). See the file fs/cifs/README
+	    for more details.  If unsure, say N.
+
+config CIFS_DFS_UPCALL
+	  bool "DFS feature support (EXPERIMENTAL)"
+	  depends on CIFS_EXPERIMENTAL
+	  depends on KEYS
+	  help
+	    Enables an upcall mechanism for CIFS which contacts userspace
+	    helper utilities to provide server name resolution (host names to
+	    IP addresses) which is needed for implicit mounts of DFS junction
+	    points. If unsure, say N.
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index c4a8a0605125..62d8bd8f14c0 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1791,7 +1791,7 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
 		SetPageUptodate(page);
 		unlock_page(page);
 		if (!pagevec_add(plru_pvec, page))
-			__pagevec_lru_add(plru_pvec);
+			__pagevec_lru_add_file(plru_pvec);
 		data += PAGE_CACHE_SIZE;
 	}
 	return;
@@ -1925,7 +1925,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 		bytes_read = 0;
 	}
 
-	pagevec_lru_add(&lru_pvec);
+	pagevec_lru_add_file(&lru_pvec);
 
 /* need to free smb_read_data buf before exit */
 	if (smb_read_data) {
diff --git a/fs/exec.c b/fs/exec.c
index a41e7902ed0b..4e834f16d9da 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1386,7 +1386,7 @@ EXPORT_SYMBOL(set_binfmt);
  * name into corename, which must have space for at least
  * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
  */
-static int format_corename(char *corename, int nr_threads, long signr)
+static int format_corename(char *corename, long signr)
 {
 	const char *pat_ptr = core_pattern;
 	int ispipe = (*pat_ptr == '|');
@@ -1493,8 +1493,7 @@ static int format_corename(char *corename, int nr_threads, long signr)
 	 * If core_pattern does not include a %p (as is the default)
 	 * and core_uses_pid is set, then .%pid will be appended to
 	 * the filename. Do not do this for piped commands. */
-	if (!ispipe && !pid_in_pattern
-	    && (core_uses_pid || nr_threads)) {
+	if (!ispipe && !pid_in_pattern && core_uses_pid) {
 		rc = snprintf(out_ptr, out_end - out_ptr,
 			      ".%d", task_tgid_vnr(current));
 		if (rc > out_end - out_ptr)
@@ -1757,7 +1756,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 	 * uses lock_kernel()
 	 */
  	lock_kernel();
-	ispipe = format_corename(corename, retval, signr);
+	ispipe = format_corename(corename, signr);
 	unlock_kernel();
 	/*
 	 * Don't bother to check the RLIMIT_CORE value if core_pattern points
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 92fd0338a6eb..f5b57a2ca35a 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1547,6 +1547,7 @@ retry_alloc:
 	 * turn off reservation for this allocation
 	 */
 	if (my_rsv && (free_blocks < windowsz)
+		&& (free_blocks > 0)
 		&& (rsv_is_empty(&my_rsv->rsv_window)))
 		my_rsv = NULL;
 
@@ -1585,7 +1586,7 @@ retry_alloc:
 		 * free blocks is less than half of the reservation
 		 * window size.
 		 */
-		if (free_blocks <= (windowsz/2))
+		if (my_rsv && (free_blocks <= (windowsz/2)))
 			continue;
 
 		brelse(bitmap_bh);
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 2eea96ec78ed..4c82531ea0a8 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -102,6 +102,7 @@ static int ext3_readdir(struct file * filp,
 	int err;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	int ret = 0;
+	int dir_has_error = 0;
 
 	sb = inode->i_sb;
 
@@ -148,9 +149,12 @@ static int ext3_readdir(struct file * filp,
 		 * of recovering data when there's a bad sector
 		 */
 		if (!bh) {
-			ext3_error (sb, "ext3_readdir",
-				"directory #%lu contains a hole at offset %lu",
-				inode->i_ino, (unsigned long)filp->f_pos);
+			if (!dir_has_error) {
+				ext3_error(sb, __func__, "directory #%lu "
+					"contains a hole at offset %lld",
+					inode->i_ino, filp->f_pos);
+				dir_has_error = 1;
+			}
 			/* corrupt size?  Maybe no more blocks to read */
 			if (filp->f_pos > inode->i_blocks << 9)
 				break;
@@ -410,7 +414,7 @@ static int call_filldir(struct file * filp, void * dirent,
 				get_dtype(sb, fname->file_type));
 		if (error) {
 			filp->f_pos = curr_pos;
-			info->extra_fname = fname->next;
+			info->extra_fname = fname;
 			return error;
 		}
 		fname = fname->next;
@@ -449,11 +453,21 @@ static int ext3_dx_readdir(struct file * filp,
 	 * If there are any leftover names on the hash collision
 	 * chain, return them first.
 	 */
-	if (info->extra_fname &&
-	    call_filldir(filp, dirent, filldir, info->extra_fname))
-		goto finished;
+	if (info->extra_fname) {
+		if (call_filldir(filp, dirent, filldir, info->extra_fname))
+			goto finished;
 
-	if (!info->curr_node)
+		info->extra_fname = NULL;
+		info->curr_node = rb_next(info->curr_node);
+		if (!info->curr_node) {
+			if (info->next_hash == ~0) {
+				filp->f_pos = EXT3_HTREE_EOF;
+				goto finished;
+			}
+			info->curr_hash = info->next_hash;
+			info->curr_minor_hash = 0;
+		}
+	} else if (!info->curr_node)
 		info->curr_node = rb_first(&info->root);
 
 	while (1) {
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index ebfec4d0148e..f8424ad89971 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1186,6 +1186,13 @@ write_begin_failed:
 		ext3_journal_stop(handle);
 		unlock_page(page);
 		page_cache_release(page);
+		/*
+		 * block_write_begin may have instantiated a few blocks
+		 * outside i_size.  Trim these off again. Don't need
+		 * i_size_read because we hold i_mutex.
+		 */
+		if (pos + len > inode->i_size)
+			vmtruncate(inode, inode->i_size);
 	}
 	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 77278e947e94..78fdf3836370 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -790,7 +790,8 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
 
 	if (reserved_gdb || gdb_off == 0) {
 		if (!EXT3_HAS_COMPAT_FEATURE(sb,
-					     EXT3_FEATURE_COMPAT_RESIZE_INODE)){
+					     EXT3_FEATURE_COMPAT_RESIZE_INODE)
+		    || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
 			ext3_warning(sb, __func__,
 				     "No reserved GDT blocks, can't resize");
 			return -EPERM;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 399a96a6c556..3a260af5544d 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -625,6 +625,9 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)
 		seq_puts(seq, ",data=writeback");
 
+	if (test_opt(sb, DATA_ERR_ABORT))
+		seq_puts(seq, ",data_err=abort");
+
 	ext3_show_quota_options(seq, sb);
 
 	return 0;
@@ -754,6 +757,7 @@ enum {
 	Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
 	Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+	Opt_data_err_abort, Opt_data_err_ignore,
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
@@ -796,6 +800,8 @@ static const match_table_t tokens = {
 	{Opt_data_journal, "data=journal"},
 	{Opt_data_ordered, "data=ordered"},
 	{Opt_data_writeback, "data=writeback"},
+	{Opt_data_err_abort, "data_err=abort"},
+	{Opt_data_err_ignore, "data_err=ignore"},
 	{Opt_offusrjquota, "usrjquota="},
 	{Opt_usrjquota, "usrjquota=%s"},
 	{Opt_offgrpjquota, "grpjquota="},
@@ -1011,6 +1017,12 @@ static int parse_options (char *options, struct super_block *sb,
 				sbi->s_mount_opt |= data_opt;
 			}
 			break;
+		case Opt_data_err_abort:
+			set_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+			break;
+		case Opt_data_err_ignore:
+			clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+			break;
 #ifdef CONFIG_QUOTA
 		case Opt_usrjquota:
 			qtype = USRQUOTA;
@@ -1986,6 +1998,10 @@ static void ext3_init_journal_params(struct super_block *sb, journal_t *journal)
 		journal->j_flags |= JFS_BARRIER;
 	else
 		journal->j_flags &= ~JFS_BARRIER;
+	if (test_opt(sb, DATA_ERR_ABORT))
+		journal->j_flags |= JFS_ABORT_ON_SYNCDATA_ERR;
+	else
+		journal->j_flags &= ~JFS_ABORT_ON_SYNCDATA_ERR;
 	spin_unlock(&journal->j_state_lock);
 }
 
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index fec8f61227ff..0022eec63cda 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -199,6 +199,9 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
 		goto done;
 	}
 
+	if (inode->i_ino == HFSPLUS_EXT_CNID)
+		return -EIO;
+
 	mutex_lock(&HFSPLUS_I(inode).extents_lock);
 	res = hfsplus_ext_read_extent(inode, ablock);
 	if (!res) {
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index b085d64a2b67..963be644297a 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -254,6 +254,8 @@ static int hfsplus_file_open(struct inode *inode, struct file *file)
 {
 	if (HFSPLUS_IS_RSRC(inode))
 		inode = HFSPLUS_I(inode).rsrc_inode;
+	if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
+		return -EOVERFLOW;
 	atomic_inc(&HFSPLUS_I(inode).opencnt);
 	return 0;
 }
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index ae08c057e751..25719d902c51 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -482,6 +482,8 @@ void journal_commit_transaction(journal_t *journal)
 		printk(KERN_WARNING
 			"JBD: Detected IO errors while flushing file data "
 			"on %s\n", bdevname(journal->j_fs_dev, b));
+		if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
+			journal_abort(journal, err);
 		err = 0;
 	}
 
@@ -518,9 +520,10 @@ void journal_commit_transaction(journal_t *journal)
 		jh = commit_transaction->t_buffers;
 
 		/* If we're in abort mode, we just un-journal the buffer and
-		   release it for background writing. */
+		   release it. */
 
 		if (is_journal_aborted(journal)) {
+			clear_buffer_jbddirty(jh2bh(jh));
 			JBUFFER_TRACE(jh, "journal is aborting: refile");
 			journal_refile_buffer(journal, jh);
 			/* If that was the last one, we need to clean up
@@ -762,6 +765,9 @@ wait_for_iobuf:
 		/* AKPM: bforget here */
 	}
 
+	if (err)
+		journal_abort(journal, err);
+
 	jbd_debug(3, "JBD: commit phase 6\n");
 
 	if (journal_write_commit_record(journal, commit_transaction))
@@ -852,6 +858,8 @@ restart_loop:
 		if (buffer_jbddirty(bh)) {
 			JBUFFER_TRACE(jh, "add to new checkpointing trans");
 			__journal_insert_checkpoint(jh, commit_transaction);
+			if (is_journal_aborted(journal))
+				clear_buffer_jbddirty(bh);
 			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
 			__journal_refile_buffer(jh);
 			jbd_unlock_bh_state(bh);
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 0540ca27a446..d15cd6e7251e 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -954,9 +954,10 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
 	journal_t *journal = handle->h_transaction->t_journal;
 	int need_brelse = 0;
 	struct journal_head *jh;
+	int ret = 0;
 
 	if (is_handle_aborted(handle))
-		return 0;
+		return ret;
 
 	jh = journal_add_journal_head(bh);
 	JBUFFER_TRACE(jh, "entry");
@@ -1067,7 +1068,16 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
 				   time if it is redirtied */
 			}
 
-			/* journal_clean_data_list() may have got there first */
+			/*
+			 * We cannot remove the buffer with io error from the
+			 * committing transaction, because otherwise it would
+			 * miss the error and the commit would not abort.
+			 */
+			if (unlikely(!buffer_uptodate(bh))) {
+				ret = -EIO;
+				goto no_journal;
+			}
+
 			if (jh->b_transaction != NULL) {
 				JBUFFER_TRACE(jh, "unfile from commit");
 				__journal_temp_unlink_buffer(jh);
@@ -1108,7 +1118,7 @@ no_journal:
 	}
 	JBUFFER_TRACE(jh, "exit");
 	journal_put_journal_head(jh);
-	return 0;
+	return ret;
 }
 
 /**
diff --git a/fs/jffs2/Kconfig b/fs/jffs2/Kconfig
new file mode 100644
index 000000000000..6ae169cd8faa
--- /dev/null
+++ b/fs/jffs2/Kconfig
@@ -0,0 +1,188 @@
+config JFFS2_FS
+	tristate "Journalling Flash File System v2 (JFFS2) support"
+	select CRC32
+	depends on MTD
+	help
+	  JFFS2 is the second generation of the Journalling Flash File System
+	  for use on diskless embedded devices. It provides improved wear
+	  levelling, compression and support for hard links. You cannot use
+	  this on normal block devices, only on 'MTD' devices.
+
+	  Further information on the design and implementation of JFFS2 is
+	  available at <http://sources.redhat.com/jffs2/>.
+
+config JFFS2_FS_DEBUG
+	int "JFFS2 debugging verbosity (0 = quiet, 2 = noisy)"
+	depends on JFFS2_FS
+	default "0"
+	help
+	  This controls the amount of debugging messages produced by the JFFS2
+	  code. Set it to zero for use in production systems. For evaluation,
+	  testing and debugging, it's advisable to set it to one. This will
+	  enable a few assertions and will print debugging messages at the
+	  KERN_DEBUG loglevel, where they won't normally be visible. Level 2
+	  is unlikely to be useful - it enables extra debugging in certain
+	  areas which at one point needed debugging, but when the bugs were
+	  located and fixed, the detailed messages were relegated to level 2.
+
+	  If reporting bugs, please try to have available a full dump of the
+	  messages at debug level 1 while the misbehaviour was occurring.
+
+config JFFS2_FS_WRITEBUFFER
+	bool "JFFS2 write-buffering support"
+	depends on JFFS2_FS
+	default y
+	help
+	  This enables the write-buffering support in JFFS2.
+
+	  This functionality is required to support JFFS2 on the following
+	  types of flash devices:
+	    - NAND flash
+	    - NOR flash with transparent ECC
+	    - DataFlash
+
+config JFFS2_FS_WBUF_VERIFY
+	bool "Verify JFFS2 write-buffer reads"
+	depends on JFFS2_FS_WRITEBUFFER
+	default n
+	help
+	  This causes JFFS2 to read back every page written through the
+	  write-buffer, and check for errors.
+
+config JFFS2_SUMMARY
+	bool "JFFS2 summary support (EXPERIMENTAL)"
+	depends on JFFS2_FS && EXPERIMENTAL
+	default n
+	help
+	  This feature makes it possible to use summary information
+	  for faster filesystem mount.
+
+	  The summary information can be inserted into a filesystem image
+	  by the utility 'sumtool'.
+
+	  If unsure, say 'N'.
+
+config JFFS2_FS_XATTR
+	bool "JFFS2 XATTR support (EXPERIMENTAL)"
+	depends on JFFS2_FS && EXPERIMENTAL
+	default n
+	help
+	  Extended attributes are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page, or visit
+	  <http://acl.bestbits.at/> for details).
+
+	  If unsure, say N.
+
+config JFFS2_FS_POSIX_ACL
+	bool "JFFS2 POSIX Access Control Lists"
+	depends on JFFS2_FS_XATTR
+	default y
+	select FS_POSIX_ACL
+	help
+	  Posix Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the Posix ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
+
+config JFFS2_FS_SECURITY
+	bool "JFFS2 Security Labels"
+	depends on JFFS2_FS_XATTR
+	default y
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux.  This option
+	  enables an extended attribute handler for file security
+	  labels in the jffs2 filesystem.
+
+	  If you are not using a security module that requires using
+	  extended attributes for file security labels, say N.
+
+config JFFS2_COMPRESSION_OPTIONS
+	bool "Advanced compression options for JFFS2"
+	depends on JFFS2_FS
+	default n
+	help
+	  Enabling this option allows you to explicitly choose which
+	  compression modules, if any, are enabled in JFFS2. Removing
+	  compressors can mean you cannot read existing file systems,
+	  and enabling experimental compressors can mean that you
+	  write a file system which cannot be read by a standard kernel.
+
+	  If unsure, you should _definitely_ say 'N'.
+
+config JFFS2_ZLIB
+	bool "JFFS2 ZLIB compression support" if JFFS2_COMPRESSION_OPTIONS
+	select ZLIB_INFLATE
+	select ZLIB_DEFLATE
+	depends on JFFS2_FS
+	default y
+	help
+	  Zlib is designed to be a free, general-purpose, legally unencumbered,
+	  lossless data-compression library for use on virtually any computer
+	  hardware and operating system. See <http://www.gzip.org/zlib/> for
+	  further information.
+
+	  Say 'Y' if unsure.
+
+config JFFS2_LZO
+	bool "JFFS2 LZO compression support" if JFFS2_COMPRESSION_OPTIONS
+	select LZO_COMPRESS
+	select LZO_DECOMPRESS
+	depends on JFFS2_FS
+	default n
+	help
+	  minilzo-based compression. Generally works better than Zlib.
+
+	  This feature was added in July, 2007. Say 'N' if you need
+	  compatibility with older bootloaders or kernels.
+
+config JFFS2_RTIME
+	bool "JFFS2 RTIME compression support" if JFFS2_COMPRESSION_OPTIONS
+	depends on JFFS2_FS
+	default y
+	help
+	  Rtime does manage to recompress already-compressed data. Say 'Y' if unsure.
+
+config JFFS2_RUBIN
+	bool "JFFS2 RUBIN compression support" if JFFS2_COMPRESSION_OPTIONS
+	depends on JFFS2_FS
+	default n
+	help
+	  RUBINMIPS and DYNRUBIN compressors. Say 'N' if unsure.
+
+choice
+	prompt "JFFS2 default compression mode" if JFFS2_COMPRESSION_OPTIONS
+	default JFFS2_CMODE_PRIORITY
+	depends on JFFS2_FS
+	help
+	  You can set here the default compression mode of JFFS2 from
+	  the available compression modes. Don't touch if unsure.
+
+config JFFS2_CMODE_NONE
+	bool "no compression"
+	help
+	  Uses no compression.
+
+config JFFS2_CMODE_PRIORITY
+	bool "priority"
+	help
+	  Tries the compressors in a predefined order and chooses the first
+	  successful one.
+
+config JFFS2_CMODE_SIZE
+	bool "size (EXPERIMENTAL)"
+	help
+	  Tries all compressors and chooses the one which has the smallest
+	  result.
+
+config JFFS2_CMODE_FAVOURLZO
+	bool "Favour LZO"
+	help
+	  Tries all compressors and chooses the one which has the smallest
+	  result but gives some preference to LZO (which has faster
+	  decompression) at the expense of size.
+
+endchoice
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index 86739ee53b37..f25e70c1b51c 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -53,8 +53,8 @@ static int jffs2_is_best_compression(struct jffs2_compressor *this,
 }
 
 /* jffs2_compress:
- * @data: Pointer to uncompressed data
- * @cdata: Pointer to returned pointer to buffer for compressed data
+ * @data_in: Pointer to uncompressed data
+ * @cpage_out: Pointer to returned pointer to buffer for compressed data
  * @datalen: On entry, holds the amount of data available for compression.
  *	On exit, expected to hold the amount of data actually compressed.
  * @cdatalen: On entry, holds the amount of space available for compressed
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index cd219ef55254..b1aaae823a52 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -311,7 +311,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
 	/* FIXME: If you care. We'd need to use frags for the target
 	   if it grows much more than this */
 	if (targetlen > 254)
-		return -EINVAL;
+		return -ENAMETOOLONG;
 
 	ri = jffs2_alloc_raw_inode();
 
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index dddb2a6c9e2c..259461b910af 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -68,7 +68,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
 	instr->len = c->sector_size;
 	instr->callback = jffs2_erase_callback;
 	instr->priv = (unsigned long)(&instr[1]);
-	instr->fail_addr = 0xffffffff;
+	instr->fail_addr = MTD_FAIL_ADDR_UNKNOWN;
 
 	((struct erase_priv_struct *)instr->priv)->jeb = jeb;
 	((struct erase_priv_struct *)instr->priv)->c = c;
@@ -175,7 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock
 {
 	/* For NAND, if the failure did not occur at the device level for a
 	   specific physical page, don't bother updating the bad block table. */
-	if (jffs2_cleanmarker_oob(c) && (bad_offset != 0xffffffff)) {
+	if (jffs2_cleanmarker_oob(c) && (bad_offset != MTD_FAIL_ADDR_UNKNOWN)) {
 		/* We had a device-level failure to erase.  Let's see if we've
 		   failed too many times. */
 		if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) {
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 086c43830221..249305d65d5b 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -207,6 +207,8 @@ int jffs2_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_files = 0;
 	buf->f_ffree = 0;
 	buf->f_namelen = JFFS2_MAX_NAME_LEN;
+	buf->f_fsid.val[0] = JFFS2_SUPER_MAGIC;
+	buf->f_fsid.val[1] = c->mtd->index;
 
 	spin_lock(&c->erase_completion_lock);
 	avail = c->dirty_size + c->free_size;
@@ -440,14 +442,14 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i
 
 	memset(ri, 0, sizeof(*ri));
 	/* Set OS-specific defaults for new inodes */
-	ri->uid = cpu_to_je16(current->fsuid);
+	ri->uid = cpu_to_je16(current_fsuid());
 
 	if (dir_i->i_mode & S_ISGID) {
 		ri->gid = cpu_to_je16(dir_i->i_gid);
 		if (S_ISDIR(mode))
 			mode |= S_ISGID;
 	} else {
-		ri->gid = cpu_to_je16(current->fsgid);
+		ri->gid = cpu_to_je16(current_fsgid());
 	}
 
 	/* POSIX ACLs have to be processed now, at least partly.
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index a9bf9603c1ba..0875b60b4bf7 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -261,6 +261,10 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c)
 
 	jffs2_sum_reset_collected(c->summary); /* reset collected summary */
 
+	/* adjust write buffer offset, else we get a non contiguous write bug */
+	if (!(c->wbuf_ofs % c->sector_size) && !c->wbuf_len)
+		c->wbuf_ofs = 0xffffffff;
+
 	D1(printk(KERN_DEBUG "jffs2_find_nextblock(): new nextblock = 0x%08x\n", c->nextblock->offset));
 
 	return 0;
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 0e78b00035e4..d9a721e6db70 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -679,10 +679,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
 
 	memset(c->wbuf,0xff,c->wbuf_pagesize);
 	/* adjust write buffer offset, else we get a non contiguous write bug */
-	if (SECTOR_ADDR(c->wbuf_ofs) == SECTOR_ADDR(c->wbuf_ofs+c->wbuf_pagesize))
-		c->wbuf_ofs += c->wbuf_pagesize;
-	else
-		c->wbuf_ofs = 0xffffffff;
+	c->wbuf_ofs += c->wbuf_pagesize;
 	c->wbuf_len = 0;
 	return 0;
 }
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 2ab70d46ecbc..efdba2e802d7 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1517,7 +1517,7 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
 	if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0,
 							GFP_KERNEL)) {
 		pagevec_add(&lru_pvec, page);
-		pagevec_lru_add(&lru_pvec);
+		pagevec_lru_add_file(&lru_pvec);
 		SetPageUptodate(page);
 		unlock_page(page);
 	} else
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index d020866d4232..3140a4429af1 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -439,7 +439,7 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
 			pages[nr] = *cached_page;
 			page_cache_get(*cached_page);
 			if (unlikely(!pagevec_add(lru_pvec, *cached_page)))
-				__pagevec_lru_add(lru_pvec);
+				__pagevec_lru_add_file(lru_pvec);
 			*cached_page = NULL;
 		}
 		index++;
@@ -2084,7 +2084,7 @@ err_out:
 						OSYNC_METADATA|OSYNC_DATA);
 		}
   	}
-	pagevec_lru_add(&lru_pvec);
+	pagevec_lru_add_file(&lru_pvec);
 	ntfs_debug("Done.  Returning %s (written 0x%lx, status %li).",
 			written ? "written" : "status", (unsigned long)written,
 			(long)status);
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 59ea42e1ef03..61b25f4eabe6 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -136,6 +136,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 	unsigned long allowed;
 	struct vmalloc_info vmi;
 	long cached;
+	unsigned long pages[NR_LRU_LISTS];
+	int lru;
 
 /*
  * display in kilobytes.
@@ -154,51 +156,70 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 
 	get_vmalloc_info(&vmi);
 
+	for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
+		pages[lru] = global_page_state(NR_LRU_BASE + lru);
+
 	/*
 	 * Tagged format, for easy grepping and expansion.
 	 */
 	len = sprintf(page,
-		"MemTotal:     %8lu kB\n"
-		"MemFree:      %8lu kB\n"
-		"Buffers:      %8lu kB\n"
-		"Cached:       %8lu kB\n"
-		"SwapCached:   %8lu kB\n"
-		"Active:       %8lu kB\n"
-		"Inactive:     %8lu kB\n"
+		"MemTotal:       %8lu kB\n"
+		"MemFree:        %8lu kB\n"
+		"Buffers:        %8lu kB\n"
+		"Cached:         %8lu kB\n"
+		"SwapCached:     %8lu kB\n"
+		"Active:         %8lu kB\n"
+		"Inactive:       %8lu kB\n"
+		"Active(anon):   %8lu kB\n"
+		"Inactive(anon): %8lu kB\n"
+		"Active(file):   %8lu kB\n"
+		"Inactive(file): %8lu kB\n"
+#ifdef CONFIG_UNEVICTABLE_LRU
+		"Unevictable:    %8lu kB\n"
+		"Mlocked:        %8lu kB\n"
+#endif
 #ifdef CONFIG_HIGHMEM
-		"HighTotal:    %8lu kB\n"
-		"HighFree:     %8lu kB\n"
-		"LowTotal:     %8lu kB\n"
-		"LowFree:      %8lu kB\n"
+		"HighTotal:      %8lu kB\n"
+		"HighFree:       %8lu kB\n"
+		"LowTotal:       %8lu kB\n"
+		"LowFree:        %8lu kB\n"
 #endif
-		"SwapTotal:    %8lu kB\n"
-		"SwapFree:     %8lu kB\n"
-		"Dirty:        %8lu kB\n"
-		"Writeback:    %8lu kB\n"
-		"AnonPages:    %8lu kB\n"
-		"Mapped:       %8lu kB\n"
-		"Slab:         %8lu kB\n"
-		"SReclaimable: %8lu kB\n"
-		"SUnreclaim:   %8lu kB\n"
-		"PageTables:   %8lu kB\n"
+		"SwapTotal:      %8lu kB\n"
+		"SwapFree:       %8lu kB\n"
+		"Dirty:          %8lu kB\n"
+		"Writeback:      %8lu kB\n"
+		"AnonPages:      %8lu kB\n"
+		"Mapped:         %8lu kB\n"
+		"Slab:           %8lu kB\n"
+		"SReclaimable:   %8lu kB\n"
+		"SUnreclaim:     %8lu kB\n"
+		"PageTables:     %8lu kB\n"
 #ifdef CONFIG_QUICKLIST
-		"Quicklists:   %8lu kB\n"
+		"Quicklists:     %8lu kB\n"
 #endif
-		"NFS_Unstable: %8lu kB\n"
-		"Bounce:       %8lu kB\n"
-		"WritebackTmp: %8lu kB\n"
-		"CommitLimit:  %8lu kB\n"
-		"Committed_AS: %8lu kB\n"
-		"VmallocTotal: %8lu kB\n"
-		"VmallocUsed:  %8lu kB\n"
-		"VmallocChunk: %8lu kB\n",
+		"NFS_Unstable:   %8lu kB\n"
+		"Bounce:         %8lu kB\n"
+		"WritebackTmp:   %8lu kB\n"
+		"CommitLimit:    %8lu kB\n"
+		"Committed_AS:   %8lu kB\n"
+		"VmallocTotal:   %8lu kB\n"
+		"VmallocUsed:    %8lu kB\n"
+		"VmallocChunk:   %8lu kB\n",
 		K(i.totalram),
 		K(i.freeram),
 		K(i.bufferram),
 		K(cached),
 		K(total_swapcache_pages),
-		K(global_page_state(NR_ACTIVE)),
-		K(global_page_state(NR_INACTIVE)),
+		K(pages[LRU_ACTIVE_ANON]   + pages[LRU_ACTIVE_FILE]),
+		K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]),
+		K(pages[LRU_ACTIVE_ANON]),
+		K(pages[LRU_INACTIVE_ANON]),
+		K(pages[LRU_ACTIVE_FILE]),
+		K(pages[LRU_INACTIVE_FILE]),
+#ifdef CONFIG_UNEVICTABLE_LRU
+		K(pages[LRU_UNEVICTABLE]),
+		K(global_page_state(NR_MLOCK)),
+#endif
 #ifdef CONFIG_HIGHMEM
 		K(i.totalhigh),
 		K(i.freehigh),
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 841368b87a29..cd9ca67f841b 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -32,9 +32,6 @@ static size_t elfcorebuf_sz;
 /* Total size of vmcore file. */
 static u64 vmcore_size;
 
-/* Stores the physical address of elf header of crash image. */
-unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
-
 struct proc_dir_entry *proc_vmcore = NULL;
 
 /* Reads a page from the oldmem device from given offset. */
@@ -647,7 +644,7 @@ static int __init vmcore_init(void)
 	int rc = 0;
 
 	/* If elfcorehdr= has been passed in cmdline, then capture the dump.*/
-	if (!(elfcorehdr_addr < ELFCORE_ADDR_MAX))
+	if (!(is_vmcore_usable()))
 		return rc;
 	rc = parse_crash_elf_headers();
 	if (rc) {
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 5145cb9125af..76acdbc34611 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -112,12 +112,12 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
 			goto add_error;
 
 		if (!pagevec_add(&lru_pvec, page))
-			__pagevec_lru_add(&lru_pvec);
+			__pagevec_lru_add_file(&lru_pvec);
 
 		unlock_page(page);
 	}
 
-	pagevec_lru_add(&lru_pvec);
+	pagevec_lru_add_file(&lru_pvec);
 	return 0;
 
  fsize_exceeded:
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index b13123424e49..f031d1c925f0 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -61,6 +61,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
 		inode->i_mapping->a_ops = &ramfs_aops;
 		inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
 		mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
+		mapping_set_unevictable(inode->i_mapping);
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		switch (mode & S_IFMT) {
 		default:
diff --git a/fs/seq_file.c b/fs/seq_file.c
index bd20f7f5a933..eba2eabcd2b8 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -452,17 +452,34 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc)
 
 int seq_bitmap(struct seq_file *m, unsigned long *bits, unsigned int nr_bits)
 {
-	size_t len = bitmap_scnprintf_len(nr_bits);
+	if (m->count < m->size) {
+		int len = bitmap_scnprintf(m->buf + m->count,
+				m->size - m->count, bits, nr_bits);
+		if (m->count + len < m->size) {
+			m->count += len;
+			return 0;
+		}
+	}
+	m->count = m->size;
+	return -1;
+}
+EXPORT_SYMBOL(seq_bitmap);
 
-	if (m->count + len < m->size) {
-		bitmap_scnprintf(m->buf + m->count, m->size - m->count,
-				 bits, nr_bits);
-		m->count += len;
-		return 0;
+int seq_bitmap_list(struct seq_file *m, unsigned long *bits,
+		unsigned int nr_bits)
+{
+	if (m->count < m->size) {
+		int len = bitmap_scnlistprintf(m->buf + m->count,
+				m->size - m->count, bits, nr_bits);
+		if (m->count + len < m->size) {
+			m->count += len;
+			return 0;
+		}
 	}
 	m->count = m->size;
 	return -1;
 }
+EXPORT_SYMBOL(seq_bitmap_list);
 
 static void *single_start(struct seq_file *p, loff_t *pos)
 {
diff --git a/include/asm-cris/thread_info.h b/include/asm-cris/thread_info.h
index 7efe1000f99d..cee97f14af3b 100644
--- a/include/asm-cris/thread_info.h
+++ b/include/asm-cris/thread_info.h
@@ -88,6 +88,7 @@ struct thread_info {
 #define TIF_RESTORE_SIGMASK	9	/* restore signal mask in do_signal() */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
 #define TIF_MEMDIE		17
+#define TIF_FREEZE		18	/* is freezing for suspend */
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
@@ -95,6 +96,7 @@ struct thread_info {
 #define _TIF_NEED_RESCHED	(1<<TIF_NEED_RESCHED)
 #define _TIF_RESTORE_SIGMASK	(1<<TIF_RESTORE_SIGMASK)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
+#define _TIF_FREEZE		(1<<TIF_FREEZE)
 
 #define _TIF_WORK_MASK		0x0000FFFE	/* work to do on interrupt/exception return */
 #define _TIF_ALLWORK_MASK	0x0000FFFF	/* work to do on any return to u-space */
diff --git a/include/asm-generic/rtc.h b/include/asm-generic/rtc.h
index 71ef3f0b9685..89061c1a67d4 100644
--- a/include/asm-generic/rtc.h
+++ b/include/asm-generic/rtc.h
@@ -84,12 +84,12 @@ static inline unsigned int get_rtc_time(struct rtc_time *time)
 
 	if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
 	{
-		BCD_TO_BIN(time->tm_sec);
-		BCD_TO_BIN(time->tm_min);
-		BCD_TO_BIN(time->tm_hour);
-		BCD_TO_BIN(time->tm_mday);
-		BCD_TO_BIN(time->tm_mon);
-		BCD_TO_BIN(time->tm_year);
+		time->tm_sec = bcd2bin(time->tm_sec);
+		time->tm_min = bcd2bin(time->tm_min);
+		time->tm_hour = bcd2bin(time->tm_hour);
+		time->tm_mday = bcd2bin(time->tm_mday);
+		time->tm_mon = bcd2bin(time->tm_mon);
+		time->tm_year = bcd2bin(time->tm_year);
 	}
 
 #ifdef CONFIG_MACH_DECSTATION
@@ -159,12 +159,12 @@ static inline int set_rtc_time(struct rtc_time *time)
 
 	if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY)
 	    || RTC_ALWAYS_BCD) {
-		BIN_TO_BCD(sec);
-		BIN_TO_BCD(min);
-		BIN_TO_BCD(hrs);
-		BIN_TO_BCD(day);
-		BIN_TO_BCD(mon);
-		BIN_TO_BCD(yrs);
+		sec = bin2bcd(sec);
+		min = bin2bcd(min);
+		hrs = bin2bcd(hrs);
+		day = bin2bcd(day);
+		mon = bin2bcd(mon);
+		yrs = bin2bcd(yrs);
 	}
 
 	save_control = CMOS_READ(RTC_CONTROL);
diff --git a/include/asm-m68k/thread_info.h b/include/asm-m68k/thread_info.h
index abc002798a2b..af0fda46e94b 100644
--- a/include/asm-m68k/thread_info.h
+++ b/include/asm-m68k/thread_info.h
@@ -52,5 +52,6 @@ struct thread_info {
 #define TIF_DELAYED_TRACE	14	/* single step a syscall */
 #define TIF_SYSCALL_TRACE	15	/* syscall trace active */
 #define TIF_MEMDIE		16
+#define TIF_FREEZE		17	/* thread is freezing for suspend */
 
 #endif	/* _ASM_M68K_THREAD_INFO_H */
diff --git a/include/asm-parisc/thread_info.h b/include/asm-parisc/thread_info.h
index 9f812741c355..0407959da489 100644
--- a/include/asm-parisc/thread_info.h
+++ b/include/asm-parisc/thread_info.h
@@ -58,6 +58,7 @@ struct thread_info {
 #define TIF_32BIT               4       /* 32 bit binary */
 #define TIF_MEMDIE		5
 #define TIF_RESTORE_SIGMASK	6	/* restore saved signal mask */
+#define TIF_FREEZE		7	/* is freezing for suspend */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
@@ -65,6 +66,7 @@ struct thread_info {
 #define _TIF_POLLING_NRFLAG	(1 << TIF_POLLING_NRFLAG)
 #define _TIF_32BIT		(1 << TIF_32BIT)
 #define _TIF_RESTORE_SIGMASK	(1 << TIF_RESTORE_SIGMASK)
+#define _TIF_FREEZE		(1 << TIF_FREEZE)
 
 #define _TIF_USER_WORK_MASK     (_TIF_SIGPENDING | \
                                  _TIF_NEED_RESCHED | _TIF_RESTORE_SIGMASK)
diff --git a/include/asm-um/thread_info.h b/include/asm-um/thread_info.h
index e07e72846c7a..62274ab9471f 100644
--- a/include/asm-um/thread_info.h
+++ b/include/asm-um/thread_info.h
@@ -69,6 +69,7 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_MEMDIE	 	5
 #define TIF_SYSCALL_AUDIT	6
 #define TIF_RESTORE_SIGMASK	7
+#define TIF_FREEZE		16	/* is freezing for suspend */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
@@ -77,5 +78,6 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_MEMDIE		(1 << TIF_MEMDIE)
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_RESTORE_SIGMASK	(1 << TIF_RESTORE_SIGMASK)
+#define _TIF_FREEZE		(1 << TIF_FREEZE)
 
 #endif
diff --git a/include/asm-xtensa/thread_info.h b/include/asm-xtensa/thread_info.h
index 7e4131dd546c..0f4fe1faf9ba 100644
--- a/include/asm-xtensa/thread_info.h
+++ b/include/asm-xtensa/thread_info.h
@@ -134,6 +134,7 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_MEMDIE		5
 #define TIF_RESTORE_SIGMASK	6	/* restore signal mask in do_signal() */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
+#define TIF_FREEZE		17	/* is freezing for suspend */
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_SIGPENDING		(1<<TIF_SIGPENDING)
@@ -142,6 +143,7 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_IRET		(1<<TIF_IRET)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 #define _TIF_RESTORE_SIGMASK	(1<<TIF_RESTORE_SIGMASK)
+#define _TIF_FREEZE		(1<<TIF_FREEZE)
 
 #define _TIF_WORK_MASK		0x0000FFFE	/* work to do on interrupt/exception return */
 #define _TIF_ALLWORK_MASK	0x0000FFFF	/* work to do on any return to u-space */
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index bf9aca548f14..e531783e5d78 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -183,6 +183,7 @@ unifdef-y += auto_fs.h
 unifdef-y += auxvec.h
 unifdef-y += binfmts.h
 unifdef-y += blktrace_api.h
+unifdef-y += byteorder.h
 unifdef-y += capability.h
 unifdef-y += capi.h
 unifdef-y += cciss_ioctl.h
@@ -340,6 +341,7 @@ unifdef-y += soundcard.h
 unifdef-y += stat.h
 unifdef-y += stddef.h
 unifdef-y += string.h
+unifdef-y += swab.h
 unifdef-y += synclink.h
 unifdef-y += sysctl.h
 unifdef-y += tcp.h
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 0a24d5550eb3..bee52abb8a4d 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -175,6 +175,8 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
  * BDI_CAP_READ_MAP:       Can be mapped for reading
  * BDI_CAP_WRITE_MAP:      Can be mapped for writing
  * BDI_CAP_EXEC_MAP:       Can be mapped for execution
+ *
+ * BDI_CAP_SWAP_BACKED:    Count shmem/tmpfs objects as swap-backed.
  */
 #define BDI_CAP_NO_ACCT_DIRTY	0x00000001
 #define BDI_CAP_NO_WRITEBACK	0x00000002
@@ -184,6 +186,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
 #define BDI_CAP_WRITE_MAP	0x00000020
 #define BDI_CAP_EXEC_MAP	0x00000040
 #define BDI_CAP_NO_ACCT_WB	0x00000080
+#define BDI_CAP_SWAP_BACKED	0x00000100
 
 #define BDI_CAP_VMFLAGS \
 	(BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
@@ -248,6 +251,11 @@ static inline bool bdi_cap_account_writeback(struct backing_dev_info *bdi)
 				      BDI_CAP_NO_WRITEBACK));
 }
 
+static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi)
+{
+	return bdi->capabilities & BDI_CAP_SWAP_BACKED;
+}
+
 static inline bool mapping_cap_writeback_dirty(struct address_space *mapping)
 {
 	return bdi_cap_writeback_dirty(mapping->backing_dev_info);
@@ -258,4 +266,9 @@ static inline bool mapping_cap_account_dirty(struct address_space *mapping)
 	return bdi_cap_account_dirty(mapping->backing_dev_info);
 }
 
+static inline bool mapping_cap_swap_backed(struct address_space *mapping)
+{
+	return bdi_cap_swap_backed(mapping->backing_dev_info);
+}
+
 #endif		/* _LINUX_BACKING_DEV_H */
diff --git a/include/linux/bcd.h b/include/linux/bcd.h
index 7ac518e3c152..22ea563ba3eb 100644
--- a/include/linux/bcd.h
+++ b/include/linux/bcd.h
@@ -1,12 +1,3 @@
-/* Permission is hereby granted to copy, modify and redistribute this code
- * in terms of the GNU Library General Public License, Version 2 or later,
- * at your option.
- */
-
-/* macros to translate to/from binary and binary-coded decimal (frequently
- * found in RTC chips).
- */
-
 #ifndef _BCD_H
 #define _BCD_H
 
@@ -15,11 +6,4 @@
 unsigned bcd2bin(unsigned char val) __attribute_const__;
 unsigned char bin2bcd(unsigned val) __attribute_const__;
 
-#define BCD2BIN(val)	bcd2bin(val)
-#define BIN2BCD(val)	bin2bcd(val)
-
-/* backwards compat */
-#define BCD_TO_BIN(val) ((val)=BCD2BIN(val))
-#define BIN_TO_BCD(val) ((val)=BIN2BCD(val))
-
 #endif /* _BCD_H */
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 89781fd48859..1abfe664c444 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -110,7 +110,6 @@ extern int __bitmap_weight(const unsigned long *bitmap, int bits);
 
 extern int bitmap_scnprintf(char *buf, unsigned int len,
 			const unsigned long *src, int nbits);
-extern int bitmap_scnprintf_len(unsigned int nr_bits);
 extern int __bitmap_parse(const char *buf, unsigned int buflen, int is_user,
 			unsigned long *dst, int nbits);
 extern int bitmap_parse_user(const char __user *ubuf, unsigned int ulen,
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index eadaab44015f..3ce64b90118c 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -322,7 +322,7 @@ static inline void wait_on_buffer(struct buffer_head *bh)
 
 static inline int trylock_buffer(struct buffer_head *bh)
 {
-	return likely(!test_and_set_bit(BH_Lock, &bh->b_state));
+	return likely(!test_and_set_bit_lock(BH_Lock, &bh->b_state));
 }
 
 static inline void lock_buffer(struct buffer_head *bh)
diff --git a/include/linux/byteorder/Kbuild b/include/linux/byteorder/Kbuild
index 1133d5f9d818..fbaa7f9cee32 100644
--- a/include/linux/byteorder/Kbuild
+++ b/include/linux/byteorder/Kbuild
@@ -1,3 +1,4 @@
 unifdef-y += big_endian.h
 unifdef-y += little_endian.h
 unifdef-y += swab.h
+unifdef-y += swabb.h
diff --git a/include/linux/byteorder/big_endian.h b/include/linux/byteorder/big_endian.h
index 44f95b92393b..1cba3f3efe5f 100644
--- a/include/linux/byteorder/big_endian.h
+++ b/include/linux/byteorder/big_endian.h
@@ -10,6 +10,7 @@
 
 #include <linux/types.h>
 #include <linux/byteorder/swab.h>
+#include <linux/byteorder/swabb.h>
 
 #define __constant_htonl(x) ((__force __be32)(__u32)(x))
 #define __constant_ntohl(x) ((__force __u32)(__be32)(x))
diff --git a/include/linux/byteorder/little_endian.h b/include/linux/byteorder/little_endian.h
index 4cc170a31762..cedc1b5a289c 100644
--- a/include/linux/byteorder/little_endian.h
+++ b/include/linux/byteorder/little_endian.h
@@ -10,6 +10,7 @@
 
 #include <linux/types.h>
 #include <linux/byteorder/swab.h>
+#include <linux/byteorder/swabb.h>
 
 #define __constant_htonl(x) ((__force __be32)___constant_swab32((x)))
 #define __constant_ntohl(x) ___constant_swab32((__force __be32)(x))
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 30934e4bfaab..8b00f6643e93 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -9,12 +9,12 @@
  */
 
 #include <linux/sched.h>
-#include <linux/kref.h>
 #include <linux/cpumask.h>
 #include <linux/nodemask.h>
 #include <linux/rcupdate.h>
 #include <linux/cgroupstats.h>
 #include <linux/prio_heap.h>
+#include <linux/rwsem.h>
 
 #ifdef CONFIG_CGROUPS
 
@@ -137,6 +137,15 @@ struct cgroup {
 	 * release_list_lock
 	 */
 	struct list_head release_list;
+
+	/* pids_mutex protects the fields below */
+	struct rw_semaphore pids_mutex;
+	/* Array of process ids in the cgroup */
+	pid_t *tasks_pids;
+	/* How many files are using the current tasks_pids array */
+	int pids_use_count;
+	/* Length of the current tasks_pids array */
+	int pids_length;
 };
 
 /* A css_set is a structure holding pointers to a set of
@@ -149,7 +158,7 @@ struct cgroup {
 struct css_set {
 
 	/* Reference count */
-	struct kref ref;
+	atomic_t refcount;
 
 	/*
 	 * List running through all cgroup groups in the same hash
@@ -394,6 +403,9 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it);
 int cgroup_scan_tasks(struct cgroup_scanner *scan);
 int cgroup_attach_task(struct cgroup *, struct task_struct *);
 
+void cgroup_mm_owner_callbacks(struct task_struct *old,
+			       struct task_struct *new);
+
 #else /* !CONFIG_CGROUPS */
 
 static inline int cgroup_init_early(void) { return 0; }
@@ -412,15 +424,9 @@ static inline int cgroupstats_build(struct cgroupstats *stats,
 	return -EINVAL;
 }
 
+static inline void cgroup_mm_owner_callbacks(struct task_struct *old,
+					     struct task_struct *new) {}
+
 #endif /* !CONFIG_CGROUPS */
 
-#ifdef CONFIG_MM_OWNER
-extern void
-cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new);
-#else /* !CONFIG_MM_OWNER */
-static inline void
-cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
-{
-}
-#endif /* CONFIG_MM_OWNER */
 #endif /* _LINUX_CGROUP_H */
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index e2877454ec82..9c22396e8b50 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -48,3 +48,9 @@ SUBSYS(devices)
 #endif
 
 /* */
+
+#ifdef CONFIG_CGROUP_FREEZER
+SUBSYS(freezer)
+#endif
+
+/* */
diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index 025e4f575103..0acf3b737e2e 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -8,12 +8,9 @@
 #include <linux/proc_fs.h>
 
 #define ELFCORE_ADDR_MAX	(-1ULL)
+#define ELFCORE_ADDR_ERR	(-2ULL)
 
-#ifdef CONFIG_PROC_VMCORE
 extern unsigned long long elfcorehdr_addr;
-#else
-static const unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
-#endif
 
 extern ssize_t copy_oldmem_page(unsigned long, char *, size_t,
 						unsigned long, int);
@@ -28,10 +25,43 @@ extern struct proc_dir_entry *proc_vmcore;
 
 #define vmcore_elf_check_arch(x) (elf_check_arch(x) || vmcore_elf_check_arch_cross(x))
 
+/*
+ * is_kdump_kernel() checks whether this kernel is booting after a panic of
+ * previous kernel or not. This is determined by checking if previous kernel
+ * has passed the elf core header address on command line.
+ *
+ * This is not just a test if CONFIG_CRASH_DUMP is enabled or not. It will
+ * return 1 if CONFIG_CRASH_DUMP=y and if kernel is booting after a panic of
+ * previous kernel.
+ */
+
 static inline int is_kdump_kernel(void)
 {
 	return (elfcorehdr_addr != ELFCORE_ADDR_MAX) ? 1 : 0;
 }
+
+/* is_vmcore_usable() checks if the kernel is booting after a panic and
+ * the vmcore region is usable.
+ *
+ * This makes use of the fact that due to alignment -2ULL is not
+ * a valid pointer, much in the vain of IS_ERR(), except
+ * dealing directly with an unsigned long long rather than a pointer.
+ */
+
+static inline int is_vmcore_usable(void)
+{
+	return is_kdump_kernel() && elfcorehdr_addr != ELFCORE_ADDR_ERR ? 1 : 0;
+}
+
+/* vmcore_unusable() marks the vmcore as unusable,
+ * without disturbing the logic of is_kdump_kernel()
+ */
+
+static inline void vmcore_unusable(void)
+{
+	if (is_kdump_kernel())
+		elfcorehdr_addr = ELFCORE_ADDR_ERR;
+}
 #else /* !CONFIG_CRASH_DUMP */
 static inline int is_kdump_kernel(void) { return 0; }
 #endif /* CONFIG_CRASH_DUMP */
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index 159d9b476cd7..d14f02918483 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -380,6 +380,8 @@ struct ext3_inode {
 #define EXT3_MOUNT_QUOTA		0x80000 /* Some quota option set */
 #define EXT3_MOUNT_USRQUOTA		0x100000 /* "old" user quota */
 #define EXT3_MOUNT_GRPQUOTA		0x200000 /* "old" group quota */
+#define EXT3_MOUNT_DATA_ERR_ABORT	0x400000 /* Abort on file data write
+						  * error in ordered mode */
 
 /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 531ccd5f5960..75a81eaf3430 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -808,6 +808,7 @@ struct fb_tile_ops {
 struct fb_info {
 	int node;
 	int flags;
+	struct mutex lock;		/* Lock for open/release/ioctl funcs */
 	struct fb_var_screeninfo var;	/* Current var */
 	struct fb_fix_screeninfo fix;	/* Current fix */
 	struct fb_monspecs monspecs;	/* Current Monitor specs */
diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index deddeedf3257..8f225339eee9 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -6,7 +6,7 @@
 #include <linux/sched.h>
 #include <linux/wait.h>
 
-#ifdef CONFIG_PM_SLEEP
+#ifdef CONFIG_FREEZER
 /*
  * Check if a process has been frozen
  */
@@ -39,28 +39,18 @@ static inline void clear_freeze_flag(struct task_struct *p)
 	clear_tsk_thread_flag(p, TIF_FREEZE);
 }
 
+static inline bool should_send_signal(struct task_struct *p)
+{
+	return !(p->flags & PF_FREEZER_NOSIG);
+}
+
 /*
  * Wake up a frozen process
- *
- * task_lock() is taken to prevent the race with refrigerator() which may
- * occur if the freezing of tasks fails.  Namely, without the lock, if the
- * freezing of tasks failed, thaw_tasks() might have run before a task in
- * refrigerator() could call frozen_process(), in which case the task would be
- * frozen and no one would thaw it.
  */
-static inline int thaw_process(struct task_struct *p)
-{
-	task_lock(p);
-	if (frozen(p)) {
-		p->flags &= ~PF_FROZEN;
-		task_unlock(p);
-		wake_up_process(p);
-		return 1;
-	}
-	clear_freeze_flag(p);
-	task_unlock(p);
-	return 0;
-}
+extern int __thaw_process(struct task_struct *p);
+
+/* Takes and releases task alloc lock using task_lock() */
+extern int thaw_process(struct task_struct *p);
 
 extern void refrigerator(void);
 extern int freeze_processes(void);
@@ -75,6 +65,15 @@ static inline int try_to_freeze(void)
 		return 0;
 }
 
+extern bool freeze_task(struct task_struct *p, bool sig_only);
+extern void cancel_freezing(struct task_struct *p);
+
+#ifdef CONFIG_CGROUP_FREEZER
+extern int cgroup_frozen(struct task_struct *task);
+#else /* !CONFIG_CGROUP_FREEZER */
+static inline int cgroup_frozen(struct task_struct *task) { return 0; }
+#endif /* !CONFIG_CGROUP_FREEZER */
+
 /*
  * The PF_FREEZER_SKIP flag should be set by a vfork parent right before it
  * calls wait_for_completion(&vfork) and reset right after it returns from this
@@ -166,7 +165,7 @@ static inline void set_freezable_with_signal(void)
 	} while (try_to_freeze());					\
 	__retval;							\
 })
-#else /* !CONFIG_PM_SLEEP */
+#else /* !CONFIG_FREEZER */
 static inline int frozen(struct task_struct *p) { return 0; }
 static inline int freezing(struct task_struct *p) { return 0; }
 static inline void set_freeze_flag(struct task_struct *p) {}
@@ -191,6 +190,6 @@ static inline void set_freezable_with_signal(void) {}
 #define wait_event_freezable_timeout(wq, condition, timeout)		\
 		wait_event_interruptible_timeout(wq, condition, timeout)
 
-#endif /* !CONFIG_PM_SLEEP */
+#endif /* !CONFIG_FREEZER */
 
 #endif	/* FREEZER_H_INCLUDED */
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index 7ebbcb1c9ba4..35d4f6342fac 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -816,6 +816,9 @@ struct journal_s
 #define JFS_FLUSHED	0x008	/* The journal superblock has been flushed */
 #define JFS_LOADED	0x010	/* The journal superblock has been loaded */
 #define JFS_BARRIER	0x020	/* Use IDE barriers */
+#define JFS_ABORT_ON_SYNCDATA_ERR	0x040  /* Abort the journal on file
+						* data write error in ordered
+						* mode */
 
 /*
  * Function declarations for the journaling transaction and buffer
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index fdf3967e1397..1fbe14d39521 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -27,16 +27,13 @@ struct mm_struct;
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 
-#define page_reset_bad_cgroup(page)	((page)->page_cgroup = 0)
-
-extern struct page_cgroup *page_get_page_cgroup(struct page *page);
 extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
 				gfp_t gfp_mask);
 extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 					gfp_t gfp_mask);
+extern void mem_cgroup_move_lists(struct page *page, enum lru_list lru);
 extern void mem_cgroup_uncharge_page(struct page *page);
 extern void mem_cgroup_uncharge_cache_page(struct page *page);
-extern void mem_cgroup_move_lists(struct page *page, bool active);
 extern int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask);
 
 extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
@@ -44,7 +41,7 @@ extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 					unsigned long *scanned, int order,
 					int mode, struct zone *z,
 					struct mem_cgroup *mem_cont,
-					int active);
+					int active, int file);
 extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask);
 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem);
 
@@ -69,21 +66,11 @@ extern void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem,
 extern void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem,
 							int priority);
 
-extern long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem,
-				struct zone *zone, int priority);
-extern long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
-				struct zone *zone, int priority);
-
-#else /* CONFIG_CGROUP_MEM_RES_CTLR */
-static inline void page_reset_bad_cgroup(struct page *page)
-{
-}
+extern long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
+					int priority, enum lru_list lru);
 
-static inline struct page_cgroup *page_get_page_cgroup(struct page *page)
-{
-	return NULL;
-}
 
+#else /* CONFIG_CGROUP_MEM_RES_CTLR */
 static inline int mem_cgroup_charge(struct page *page,
 					struct mm_struct *mm, gfp_t gfp_mask)
 {
@@ -159,14 +146,9 @@ static inline void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem,
 {
 }
 
-static inline long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem,
-					struct zone *zone, int priority)
-{
-	return 0;
-}
-
-static inline long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
-					struct zone *zone, int priority)
+static inline long mem_cgroup_calc_reclaim(struct mem_cgroup *mem,
+					struct zone *zone, int priority,
+					enum lru_list lru)
 {
 	return 0;
 }
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 03aea612d284..3f34005068d4 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -7,7 +7,6 @@
 typedef struct page *new_page_t(struct page *, unsigned long private, int **);
 
 #ifdef CONFIG_MIGRATION
-extern int isolate_lru_page(struct page *p, struct list_head *pagelist);
 extern int putback_lru_pages(struct list_head *l);
 extern int migrate_page(struct address_space *,
 			struct page *, struct page *);
@@ -21,8 +20,6 @@ extern int migrate_vmas(struct mm_struct *mm,
 		const nodemask_t *from, const nodemask_t *to,
 		unsigned long flags);
 #else
-static inline int isolate_lru_page(struct page *p, struct list_head *list)
-					{ return -ENOSYS; }
 static inline int putback_lru_pages(struct list_head *l) { return 0; }
 static inline int migrate_pages(struct list_head *l, new_page_t x,
 		unsigned long private) { return -ENOSYS; }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c61ba10768ea..ffee2f743418 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -132,6 +132,11 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_RandomReadHint(v)		((v)->vm_flags & VM_RAND_READ)
 
 /*
+ * special vmas that are non-mergable, non-mlock()able
+ */
+#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
+
+/*
  * mapping from the currently active vm_flags protection bits (the
  * low four bits) to a page protection mask..
  */
@@ -700,10 +705,10 @@ static inline int page_mapped(struct page *page)
 extern void show_free_areas(void);
 
 #ifdef CONFIG_SHMEM
-int shmem_lock(struct file *file, int lock, struct user_struct *user);
+extern int shmem_lock(struct file *file, int lock, struct user_struct *user);
 #else
 static inline int shmem_lock(struct file *file, int lock,
-			     struct user_struct *user)
+			    struct user_struct *user)
 {
 	return 0;
 }
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 895bc4e93039..c948350c378e 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -1,40 +1,100 @@
-static inline void
-add_page_to_active_list(struct zone *zone, struct page *page)
-{
-	list_add(&page->lru, &zone->active_list);
-	__inc_zone_state(zone, NR_ACTIVE);
-}
+#ifndef LINUX_MM_INLINE_H
+#define LINUX_MM_INLINE_H
 
-static inline void
-add_page_to_inactive_list(struct zone *zone, struct page *page)
+/**
+ * page_is_file_cache - should the page be on a file LRU or anon LRU?
+ * @page: the page to test
+ *
+ * Returns LRU_FILE if @page is page cache page backed by a regular filesystem,
+ * or 0 if @page is anonymous, tmpfs or otherwise ram or swap backed.
+ * Used by functions that manipulate the LRU lists, to sort a page
+ * onto the right LRU list.
+ *
+ * We would like to get this info without a page flag, but the state
+ * needs to survive until the page is last deleted from the LRU, which
+ * could be as far down as __page_cache_release.
+ */
+static inline int page_is_file_cache(struct page *page)
 {
-	list_add(&page->lru, &zone->inactive_list);
-	__inc_zone_state(zone, NR_INACTIVE);
+	if (PageSwapBacked(page))
+		return 0;
+
+	/* The page is page cache backed by a normal filesystem. */
+	return LRU_FILE;
 }
 
 static inline void
-del_page_from_active_list(struct zone *zone, struct page *page)
+add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l)
 {
-	list_del(&page->lru);
-	__dec_zone_state(zone, NR_ACTIVE);
+	list_add(&page->lru, &zone->lru[l].list);
+	__inc_zone_state(zone, NR_LRU_BASE + l);
 }
 
 static inline void
-del_page_from_inactive_list(struct zone *zone, struct page *page)
+del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l)
 {
 	list_del(&page->lru);
-	__dec_zone_state(zone, NR_INACTIVE);
+	__dec_zone_state(zone, NR_LRU_BASE + l);
 }
 
 static inline void
 del_page_from_lru(struct zone *zone, struct page *page)
 {
+	enum lru_list l = LRU_BASE;
+
 	list_del(&page->lru);
-	if (PageActive(page)) {
-		__ClearPageActive(page);
-		__dec_zone_state(zone, NR_ACTIVE);
+	if (PageUnevictable(page)) {
+		__ClearPageUnevictable(page);
+		l = LRU_UNEVICTABLE;
 	} else {
-		__dec_zone_state(zone, NR_INACTIVE);
+		if (PageActive(page)) {
+			__ClearPageActive(page);
+			l += LRU_ACTIVE;
+		}
+		l += page_is_file_cache(page);
+	}
+	__dec_zone_state(zone, NR_LRU_BASE + l);
+}
+
+/**
+ * page_lru - which LRU list should a page be on?
+ * @page: the page to test
+ *
+ * Returns the LRU list a page should be on, as an index
+ * into the array of LRU lists.
+ */
+static inline enum lru_list page_lru(struct page *page)
+{
+	enum lru_list lru = LRU_BASE;
+
+	if (PageUnevictable(page))
+		lru = LRU_UNEVICTABLE;
+	else {
+		if (PageActive(page))
+			lru += LRU_ACTIVE;
+		lru += page_is_file_cache(page);
 	}
+
+	return lru;
 }
 
+/**
+ * inactive_anon_is_low - check if anonymous pages need to be deactivated
+ * @zone: zone to check
+ *
+ * Returns true if the zone does not have enough inactive anon pages,
+ * meaning some active anon pages need to be deactivated.
+ */
+static inline int inactive_anon_is_low(struct zone *zone)
+{
+	unsigned long active, inactive;
+
+	active = zone_page_state(zone, NR_ACTIVE_ANON);
+	inactive = zone_page_state(zone, NR_INACTIVE_ANON);
+
+	if (inactive * zone->inactive_ratio < active)
+		return 1;
+
+	return 0;
+}
+#endif
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 9d49fa36bbef..fe825471d5aa 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -94,9 +94,6 @@ struct page {
 	void *virtual;			/* Kernel virtual address (NULL if
 					   not kmapped, ie. highmem) */
 #endif /* WANT_PAGE_VIRTUAL */
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-	unsigned long page_cgroup;
-#endif
 };
 
 /*
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 428328a05fa1..35a7b5e19465 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -81,21 +81,31 @@ struct zone_padding {
 enum zone_stat_item {
 	/* First 128 byte cacheline (assuming 64 bit words) */
 	NR_FREE_PAGES,
-	NR_INACTIVE,
-	NR_ACTIVE,
+	NR_LRU_BASE,
+	NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
+	NR_ACTIVE_ANON,		/*  "     "     "   "       "         */
+	NR_INACTIVE_FILE,	/*  "     "     "   "       "         */
+	NR_ACTIVE_FILE,		/*  "     "     "   "       "         */
+#ifdef CONFIG_UNEVICTABLE_LRU
+	NR_UNEVICTABLE,		/*  "     "     "   "       "         */
+	NR_MLOCK,		/* mlock()ed pages found and moved off LRU */
+#else
+	NR_UNEVICTABLE = NR_ACTIVE_FILE, /* avoid compiler errors in dead code */
+	NR_MLOCK = NR_ACTIVE_FILE,
+#endif
 	NR_ANON_PAGES,	/* Mapped anonymous pages */
 	NR_FILE_MAPPED,	/* pagecache pages mapped into pagetables.
 			   only modified from process context */
 	NR_FILE_PAGES,
 	NR_FILE_DIRTY,
 	NR_WRITEBACK,
-	/* Second 128 byte cacheline */
 	NR_SLAB_RECLAIMABLE,
 	NR_SLAB_UNRECLAIMABLE,
 	NR_PAGETABLE,		/* used for pagetables */
 	NR_UNSTABLE_NFS,	/* NFS unstable pages */
 	NR_BOUNCE,
 	NR_VMSCAN_WRITE,
+	/* Second 128 byte cacheline */
 	NR_WRITEBACK_TEMP,	/* Writeback using temporary buffers */
 #ifdef CONFIG_NUMA
 	NUMA_HIT,		/* allocated in intended node */
@@ -107,6 +117,55 @@ enum zone_stat_item {
 #endif
 	NR_VM_ZONE_STAT_ITEMS };
 
+/*
+ * We do arithmetic on the LRU lists in various places in the code,
+ * so it is important to keep the active lists LRU_ACTIVE higher in
+ * the array than the corresponding inactive lists, and to keep
+ * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists.
+ *
+ * This has to be kept in sync with the statistics in zone_stat_item
+ * above and the descriptions in vmstat_text in mm/vmstat.c
+ */
+#define LRU_BASE 0
+#define LRU_ACTIVE 1
+#define LRU_FILE 2
+
+enum lru_list {
+	LRU_INACTIVE_ANON = LRU_BASE,
+	LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
+	LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
+	LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
+#ifdef CONFIG_UNEVICTABLE_LRU
+	LRU_UNEVICTABLE,
+#else
+	LRU_UNEVICTABLE = LRU_ACTIVE_FILE, /* avoid compiler errors in dead code */
+#endif
+	NR_LRU_LISTS
+};
+
+#define for_each_lru(l) for (l = 0; l < NR_LRU_LISTS; l++)
+
+#define for_each_evictable_lru(l) for (l = 0; l <= LRU_ACTIVE_FILE; l++)
+
+static inline int is_file_lru(enum lru_list l)
+{
+	return (l == LRU_INACTIVE_FILE || l == LRU_ACTIVE_FILE);
+}
+
+static inline int is_active_lru(enum lru_list l)
+{
+	return (l == LRU_ACTIVE_ANON || l == LRU_ACTIVE_FILE);
+}
+
+static inline int is_unevictable_lru(enum lru_list l)
+{
+#ifdef CONFIG_UNEVICTABLE_LRU
+	return (l == LRU_UNEVICTABLE);
+#else
+	return 0;
+#endif
+}
+
 struct per_cpu_pages {
 	int count;		/* number of pages in the list */
 	int high;		/* high watermark, emptying needed */
@@ -251,10 +310,22 @@ struct zone {
 
 	/* Fields commonly accessed by the page reclaim scanner */
 	spinlock_t		lru_lock;	
-	struct list_head	active_list;
-	struct list_head	inactive_list;
-	unsigned long		nr_scan_active;
-	unsigned long		nr_scan_inactive;
+	struct {
+		struct list_head list;
+		unsigned long nr_scan;
+	} lru[NR_LRU_LISTS];
+
+	/*
+	 * The pageout code in vmscan.c keeps track of how many of the
+	 * mem/swap backed and file backed pages are refeferenced.
+	 * The higher the rotated/scanned ratio, the more valuable
+	 * that cache is.
+	 *
+	 * The anon LRU stats live in [0], file LRU stats in [1]
+	 */
+	unsigned long		recent_rotated[2];
+	unsigned long		recent_scanned[2];
+
 	unsigned long		pages_scanned;	   /* since last reclaim */
 	unsigned long		flags;		   /* zone flags, see below */
 
@@ -276,6 +347,12 @@ struct zone {
 	 */
 	int prev_priority;
 
+	/*
+	 * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
+	 * this zone's LRU.  Maintained by the pageout code.
+	 */
+	unsigned int inactive_ratio;
+
 
 	ZONE_PADDING(_pad2_)
 	/* Rarely used or read-mostly fields */
@@ -524,8 +601,11 @@ typedef struct pglist_data {
 	struct zone node_zones[MAX_NR_ZONES];
 	struct zonelist node_zonelists[MAX_ZONELISTS];
 	int nr_zones;
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
+#ifdef CONFIG_FLAT_NODE_MEM_MAP	/* means !SPARSEMEM */
 	struct page *node_mem_map;
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+	struct page_cgroup *node_page_cgroup;
+#endif
 #endif
 	struct bootmem_data *bdata;
 #ifdef CONFIG_MEMORY_HOTPLUG
@@ -854,6 +934,7 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn)
 #endif
 
 struct page;
+struct page_cgroup;
 struct mem_section {
 	/*
 	 * This is, logically, a pointer to an array of struct
@@ -871,6 +952,14 @@ struct mem_section {
 
 	/* See declaration of similar field in struct zone */
 	unsigned long *pageblock_flags;
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+	/*
+	 * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use
+	 * section. (see memcontrol.h/page_cgroup.h about this.)
+	 */
+	struct page_cgroup *page_cgroup;
+	unsigned long pad;
+#endif
 };
 
 #ifdef CONFIG_SPARSEMEM_EXTREME
diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h
index d6fb115f5a07..ee5124ec319e 100644
--- a/include/linux/mtd/cfi.h
+++ b/include/linux/mtd/cfi.h
@@ -12,6 +12,7 @@
 #include <linux/mtd/flashchip.h>
 #include <linux/mtd/map.h>
 #include <linux/mtd/cfi_endian.h>
+#include <linux/mtd/xip.h>
 
 #ifdef CONFIG_MTD_CFI_I1
 #define cfi_interleave(cfi) 1
@@ -430,7 +431,6 @@ static inline uint32_t cfi_send_gen_cmd(u_char cmd, uint32_t cmd_addr, uint32_t
 {
 	map_word val;
 	uint32_t addr = base + cfi_build_cmd_addr(cmd_addr, cfi_interleave(cfi), type);
-
 	val = cfi_build_cmd(cmd, map, cfi);
 
 	if (prev_val)
@@ -483,6 +483,13 @@ static inline void cfi_udelay(int us)
 	}
 }
 
+int __xipram cfi_qry_present(struct map_info *map, __u32 base,
+			     struct cfi_private *cfi);
+int __xipram cfi_qry_mode_on(uint32_t base, struct map_info *map,
+			     struct cfi_private *cfi);
+void __xipram cfi_qry_mode_off(uint32_t base, struct map_info *map,
+			       struct cfi_private *cfi);
+
 struct cfi_extquery *cfi_read_pri(struct map_info *map, uint16_t adr, uint16_t size,
 			     const char* name);
 struct cfi_fixup {
diff --git a/include/linux/mtd/flashchip.h b/include/linux/mtd/flashchip.h
index 08dd131301c1..d4f38c5fd44e 100644
--- a/include/linux/mtd/flashchip.h
+++ b/include/linux/mtd/flashchip.h
@@ -73,6 +73,10 @@ struct flchip {
 	int buffer_write_time;
 	int erase_time;
 
+	int word_write_time_max;
+	int buffer_write_time_max;
+	int erase_time_max;
+
 	void *priv;
 };
 
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index 922636548558..eae26bb6430a 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -25,8 +25,10 @@
 #define MTD_ERASE_DONE          0x08
 #define MTD_ERASE_FAILED        0x10
 
+#define MTD_FAIL_ADDR_UNKNOWN 0xffffffff
+
 /* If the erase fails, fail_addr might indicate exactly which block failed.  If
-   fail_addr = 0xffffffff, the failure was not at the device level or was not
+   fail_addr = MTD_FAIL_ADDR_UNKNOWN, the failure was not at the device level or was not
    specific to any particular block. */
 struct erase_info {
 	struct mtd_info *mtd;
diff --git a/include/linux/mtd/nand-gpio.h b/include/linux/mtd/nand-gpio.h
new file mode 100644
index 000000000000..51534e50f7fc
--- /dev/null
+++ b/include/linux/mtd/nand-gpio.h
@@ -0,0 +1,19 @@
+#ifndef __LINUX_MTD_NAND_GPIO_H
+#define __LINUX_MTD_NAND_GPIO_H
+
+#include <linux/mtd/nand.h>
+
+struct gpio_nand_platdata {
+	int	gpio_nce;
+	int	gpio_nwp;
+	int	gpio_cle;
+	int	gpio_ale;
+	int	gpio_rdy;
+	void	(*adjust_parts)(struct gpio_nand_platdata *, size_t);
+	struct mtd_partition *parts;
+	unsigned int num_parts;
+	unsigned int options;
+	int	chip_delay;
+};
+
+#endif
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 81774e5facf4..733d3f3b4eb8 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -248,6 +248,7 @@ struct nand_hw_control {
  * @read_page_raw:	function to read a raw page without ECC
  * @write_page_raw:	function to write a raw page without ECC
  * @read_page:	function to read a page according to the ecc generator requirements
+ * @read_subpage:	function to read parts of the page covered by ECC.
  * @write_page:	function to write a page according to the ecc generator requirements
  * @read_oob:	function to read chip OOB data
  * @write_oob:	function to write chip OOB data
diff --git a/include/linux/mtd/onenand_regs.h b/include/linux/mtd/onenand_regs.h
index d1b310c92eb4..0c6bbe28f38c 100644
--- a/include/linux/mtd/onenand_regs.h
+++ b/include/linux/mtd/onenand_regs.h
@@ -152,6 +152,8 @@
 #define ONENAND_SYS_CFG1_INT		(1 << 6)
 #define ONENAND_SYS_CFG1_IOBE		(1 << 5)
 #define ONENAND_SYS_CFG1_RDY_CONF	(1 << 4)
+#define ONENAND_SYS_CFG1_HF		(1 << 2)
+#define ONENAND_SYS_CFG1_SYNC_WRITE	(1 << 1)
 
 /*
  * Controller Status Register F240h (R)
diff --git a/include/linux/mtd/partitions.h b/include/linux/mtd/partitions.h
index 5014f7a9f5df..c92b4d439609 100644
--- a/include/linux/mtd/partitions.h
+++ b/include/linux/mtd/partitions.h
@@ -73,7 +73,6 @@ struct device;
 struct device_node;
 
 int __devinit of_mtd_parse_partitions(struct device *dev,
-                                      struct mtd_info *mtd,
                                       struct device_node *node,
                                       struct mtd_partition **pparts);
 
diff --git a/include/linux/mtd/sh_flctl.h b/include/linux/mtd/sh_flctl.h
new file mode 100644
index 000000000000..e77c1cea404d
--- /dev/null
+++ b/include/linux/mtd/sh_flctl.h
@@ -0,0 +1,125 @@
+/*
+ * SuperH FLCTL nand controller
+ *
+ * Copyright © 2008 Renesas Solutions Corp.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#ifndef __SH_FLCTL_H__
+#define __SH_FLCTL_H__
+
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/nand.h>
+#include <linux/mtd/partitions.h>
+
+/* FLCTL registers */
+#define FLCMNCR(f)		(f->reg + 0x0)
+#define FLCMDCR(f)		(f->reg + 0x4)
+#define FLCMCDR(f)		(f->reg + 0x8)
+#define FLADR(f)		(f->reg + 0xC)
+#define FLADR2(f)		(f->reg + 0x3C)
+#define FLDATAR(f)		(f->reg + 0x10)
+#define FLDTCNTR(f)		(f->reg + 0x14)
+#define FLINTDMACR(f)		(f->reg + 0x18)
+#define FLBSYTMR(f)		(f->reg + 0x1C)
+#define FLBSYCNT(f)		(f->reg + 0x20)
+#define FLDTFIFO(f)		(f->reg + 0x24)
+#define FLECFIFO(f)		(f->reg + 0x28)
+#define FLTRCR(f)		(f->reg + 0x2C)
+#define	FL4ECCRESULT0(f)	(f->reg + 0x80)
+#define	FL4ECCRESULT1(f)	(f->reg + 0x84)
+#define	FL4ECCRESULT2(f)	(f->reg + 0x88)
+#define	FL4ECCRESULT3(f)	(f->reg + 0x8C)
+#define	FL4ECCCR(f)		(f->reg + 0x90)
+#define	FL4ECCCNT(f)		(f->reg + 0x94)
+#define	FLERRADR(f)		(f->reg + 0x98)
+
+/* FLCMNCR control bits */
+#define ECCPOS2		(0x1 << 25)
+#define _4ECCCNTEN	(0x1 << 24)
+#define _4ECCEN		(0x1 << 23)
+#define _4ECCCORRECT	(0x1 << 22)
+#define SNAND_E		(0x1 << 18)	/* SNAND (0=512 1=2048)*/
+#define QTSEL_E		(0x1 << 17)
+#define ENDIAN		(0x1 << 16)	/* 1 = little endian */
+#define FCKSEL_E	(0x1 << 15)
+#define ECCPOS_00	(0x00 << 12)
+#define ECCPOS_01	(0x01 << 12)
+#define ECCPOS_02	(0x02 << 12)
+#define ACM_SACCES_MODE	(0x01 << 10)
+#define NANWF_E		(0x1 << 9)
+#define SE_D		(0x1 << 8)	/* Spare area disable */
+#define	CE1_ENABLE	(0x1 << 4)	/* Chip Enable 1 */
+#define	CE0_ENABLE	(0x1 << 3)	/* Chip Enable 0 */
+#define	TYPESEL_SET	(0x1 << 0)
+
+/* FLCMDCR control bits */
+#define ADRCNT2_E	(0x1 << 31)	/* 5byte address enable */
+#define ADRMD_E		(0x1 << 26)	/* Sector address access */
+#define CDSRC_E		(0x1 << 25)	/* Data buffer selection */
+#define DOSR_E		(0x1 << 24)	/* Status read check */
+#define SELRW		(0x1 << 21)	/*  0:read 1:write */
+#define DOADR_E		(0x1 << 20)	/* Address stage execute */
+#define ADRCNT_1	(0x00 << 18)	/* Address data bytes: 1byte */
+#define ADRCNT_2	(0x01 << 18)	/* Address data bytes: 2byte */
+#define ADRCNT_3	(0x02 << 18)	/* Address data bytes: 3byte */
+#define ADRCNT_4	(0x03 << 18)	/* Address data bytes: 4byte */
+#define DOCMD2_E	(0x1 << 17)	/* 2nd cmd stage execute */
+#define DOCMD1_E	(0x1 << 16)	/* 1st cmd stage execute */
+
+/* FLTRCR control bits */
+#define TRSTRT		(0x1 << 0)	/* translation start */
+#define TREND		(0x1 << 1)	/* translation end */
+
+/* FL4ECCCR control bits */
+#define	_4ECCFA		(0x1 << 2)	/* 4 symbols correct fault */
+#define	_4ECCEND	(0x1 << 1)	/* 4 symbols end */
+#define	_4ECCEXST	(0x1 << 0)	/* 4 symbols exist */
+
+#define INIT_FL4ECCRESULT_VAL	0x03FF03FF
+#define LOOP_TIMEOUT_MAX	0x00010000
+
+#define mtd_to_flctl(mtd)	container_of(mtd, struct sh_flctl, mtd)
+
+struct sh_flctl {
+	struct mtd_info		mtd;
+	struct nand_chip	chip;
+	void __iomem		*reg;
+
+	uint8_t	done_buff[2048 + 64];	/* max size 2048 + 64 */
+	int	read_bytes;
+	int	index;
+	int	seqin_column;		/* column in SEQIN cmd */
+	int	seqin_page_addr;	/* page_addr in SEQIN cmd */
+	uint32_t seqin_read_cmd;		/* read cmd in SEQIN cmd */
+	int	erase1_page_addr;	/* page_addr in ERASE1 cmd */
+	uint32_t erase_ADRCNT;		/* bits of FLCMDCR in ERASE1 cmd */
+	uint32_t rw_ADRCNT;	/* bits of FLCMDCR in READ WRITE cmd */
+
+	int	hwecc_cant_correct[4];
+
+	unsigned page_size:1;	/* NAND page size (0 = 512, 1 = 2048) */
+	unsigned hwecc:1;	/* Hardware ECC (0 = disabled, 1 = enabled) */
+};
+
+struct sh_flctl_platform_data {
+	struct mtd_partition	*parts;
+	int			nr_parts;
+	unsigned long		flcmncr_val;
+
+	unsigned has_hwecc:1;
+};
+
+#endif	/* __SH_FLCTL_H__ */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index c74d3e875314..b12f93a3c345 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -93,6 +93,11 @@ enum pageflags {
 	PG_mappedtodisk,	/* Has blocks allocated on-disk */
 	PG_reclaim,		/* To be reclaimed asap */
 	PG_buddy,		/* Page is free, on buddy lists */
+	PG_swapbacked,		/* Page is backed by RAM/swap */
+#ifdef CONFIG_UNEVICTABLE_LRU
+	PG_unevictable,		/* Page is "unevictable"  */
+	PG_mlocked,		/* Page is vma mlocked */
+#endif
 #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
 	PG_uncached,		/* Page has been mapped as uncached */
 #endif
@@ -161,6 +166,18 @@ static inline int Page##uname(struct page *page) 			\
 #define TESTSCFLAG(uname, lname)					\
 	TESTSETFLAG(uname, lname) TESTCLEARFLAG(uname, lname)
 
+#define SETPAGEFLAG_NOOP(uname)						\
+static inline void SetPage##uname(struct page *page) {  }
+
+#define CLEARPAGEFLAG_NOOP(uname)					\
+static inline void ClearPage##uname(struct page *page) {  }
+
+#define __CLEARPAGEFLAG_NOOP(uname)					\
+static inline void __ClearPage##uname(struct page *page) {  }
+
+#define TESTCLEARFLAG_FALSE(uname)					\
+static inline int TestClearPage##uname(struct page *page) { return 0; }
+
 struct page;	/* forward declaration */
 
 TESTPAGEFLAG(Locked, locked)
@@ -169,6 +186,7 @@ PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced)
 PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty)
 PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru)
 PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active)
+	TESTCLEARFLAG(Active, active)
 __PAGEFLAG(Slab, slab)
 PAGEFLAG(Checked, checked)		/* Used by some filesystems */
 PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned)	/* Xen */
@@ -176,6 +194,7 @@ PAGEFLAG(SavePinned, savepinned);			/* Xen */
 PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved)
 PAGEFLAG(Private, private) __CLEARPAGEFLAG(Private, private)
 	__SETPAGEFLAG(Private, private)
+PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked)
 
 __PAGEFLAG(SlobPage, slob_page)
 __PAGEFLAG(SlobFree, slob_free)
@@ -211,6 +230,25 @@ PAGEFLAG(SwapCache, swapcache)
 PAGEFLAG_FALSE(SwapCache)
 #endif
 
+#ifdef CONFIG_UNEVICTABLE_LRU
+PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable)
+	TESTCLEARFLAG(Unevictable, unevictable)
+
+#define MLOCK_PAGES 1
+PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked)
+	TESTSCFLAG(Mlocked, mlocked)
+
+#else
+
+#define MLOCK_PAGES 0
+PAGEFLAG_FALSE(Mlocked)
+	SETPAGEFLAG_NOOP(Mlocked) TESTCLEARFLAG_FALSE(Mlocked)
+
+PAGEFLAG_FALSE(Unevictable) TESTCLEARFLAG_FALSE(Unevictable)
+	SETPAGEFLAG_NOOP(Unevictable) CLEARPAGEFLAG_NOOP(Unevictable)
+	__CLEARPAGEFLAG_NOOP(Unevictable)
+#endif
+
 #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
 PAGEFLAG(Uncached, uncached)
 #else
@@ -326,15 +364,25 @@ static inline void __ClearPageTail(struct page *page)
 
 #endif /* !PAGEFLAGS_EXTENDED */
 
+#ifdef CONFIG_UNEVICTABLE_LRU
+#define __PG_UNEVICTABLE	(1 << PG_unevictable)
+#define __PG_MLOCKED		(1 << PG_mlocked)
+#else
+#define __PG_UNEVICTABLE	0
+#define __PG_MLOCKED		0
+#endif
+
 #define PAGE_FLAGS	(1 << PG_lru   | 1 << PG_private   | 1 << PG_locked | \
 			 1 << PG_buddy | 1 << PG_writeback | \
-			 1 << PG_slab  | 1 << PG_swapcache | 1 << PG_active)
+			 1 << PG_slab  | 1 << PG_swapcache | 1 << PG_active | \
+			 __PG_UNEVICTABLE | __PG_MLOCKED)
 
 /*
  * Flags checked in bad_page().  Pages on the free list should not have
  * these flags set.  It they are, there is a problem.
  */
-#define PAGE_FLAGS_CLEAR_WHEN_BAD (PAGE_FLAGS | 1 << PG_reclaim | 1 << PG_dirty)
+#define PAGE_FLAGS_CLEAR_WHEN_BAD (PAGE_FLAGS | \
+		1 << PG_reclaim | 1 << PG_dirty | 1 << PG_swapbacked)
 
 /*
  * Flags checked when a page is freed.  Pages being freed should not have
@@ -347,7 +395,8 @@ static inline void __ClearPageTail(struct page *page)
  * Pages being prepped should not have these flags set.  It they are, there
  * is a problem.
  */
-#define PAGE_FLAGS_CHECK_AT_PREP (PAGE_FLAGS | 1 << PG_reserved | 1 << PG_dirty)
+#define PAGE_FLAGS_CHECK_AT_PREP (PAGE_FLAGS | \
+		1 << PG_reserved | 1 << PG_dirty | 1 << PG_swapbacked)
 
 #endif /* !__GENERATING_BOUNDS_H */
 #endif	/* PAGE_FLAGS_H */
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
new file mode 100644
index 000000000000..0fd39f2231ec
--- /dev/null
+++ b/include/linux/page_cgroup.h
@@ -0,0 +1,103 @@
+#ifndef __LINUX_PAGE_CGROUP_H
+#define __LINUX_PAGE_CGROUP_H
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#include <linux/bit_spinlock.h>
+/*
+ * Page Cgroup can be considered as an extended mem_map.
+ * A page_cgroup page is associated with every page descriptor. The
+ * page_cgroup helps us identify information about the cgroup
+ * All page cgroups are allocated at boot or memory hotplug event,
+ * then the page cgroup for pfn always exists.
+ */
+struct page_cgroup {
+	unsigned long flags;
+	struct mem_cgroup *mem_cgroup;
+	struct page *page;
+	struct list_head lru;		/* per cgroup LRU list */
+};
+
+void __init pgdat_page_cgroup_init(struct pglist_data *pgdat);
+void __init page_cgroup_init(void);
+struct page_cgroup *lookup_page_cgroup(struct page *page);
+
+enum {
+	/* flags for mem_cgroup */
+	PCG_LOCK,  /* page cgroup is locked */
+	PCG_CACHE, /* charged as cache */
+	PCG_USED, /* this object is in use. */
+	/* flags for LRU placement */
+	PCG_ACTIVE, /* page is active in this cgroup */
+	PCG_FILE, /* page is file system backed */
+	PCG_UNEVICTABLE, /* page is unevictableable */
+};
+
+#define TESTPCGFLAG(uname, lname)			\
+static inline int PageCgroup##uname(struct page_cgroup *pc)	\
+	{ return test_bit(PCG_##lname, &pc->flags); }
+
+#define SETPCGFLAG(uname, lname)			\
+static inline void SetPageCgroup##uname(struct page_cgroup *pc)\
+	{ set_bit(PCG_##lname, &pc->flags);  }
+
+#define CLEARPCGFLAG(uname, lname)			\
+static inline void ClearPageCgroup##uname(struct page_cgroup *pc)	\
+	{ clear_bit(PCG_##lname, &pc->flags);  }
+
+/* Cache flag is set only once (at allocation) */
+TESTPCGFLAG(Cache, CACHE)
+
+TESTPCGFLAG(Used, USED)
+CLEARPCGFLAG(Used, USED)
+
+/* LRU management flags (from global-lru definition) */
+TESTPCGFLAG(File, FILE)
+SETPCGFLAG(File, FILE)
+CLEARPCGFLAG(File, FILE)
+
+TESTPCGFLAG(Active, ACTIVE)
+SETPCGFLAG(Active, ACTIVE)
+CLEARPCGFLAG(Active, ACTIVE)
+
+TESTPCGFLAG(Unevictable, UNEVICTABLE)
+SETPCGFLAG(Unevictable, UNEVICTABLE)
+CLEARPCGFLAG(Unevictable, UNEVICTABLE)
+
+static inline int page_cgroup_nid(struct page_cgroup *pc)
+{
+	return page_to_nid(pc->page);
+}
+
+static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
+{
+	return page_zonenum(pc->page);
+}
+
+static inline void lock_page_cgroup(struct page_cgroup *pc)
+{
+	bit_spin_lock(PCG_LOCK, &pc->flags);
+}
+
+static inline int trylock_page_cgroup(struct page_cgroup *pc)
+{
+	return bit_spin_trylock(PCG_LOCK, &pc->flags);
+}
+
+static inline void unlock_page_cgroup(struct page_cgroup *pc)
+{
+	bit_spin_unlock(PCG_LOCK, &pc->flags);
+}
+
+#else /* CONFIG_CGROUP_MEM_RES_CTLR */
+struct page_cgroup;
+
+static inline void pgdat_page_cgroup_init(struct pglist_data *pgdat)
+{
+}
+
+static inline struct page_cgroup *lookup_page_cgroup(struct page *page)
+{
+	return NULL;
+}
+#endif
+#endif
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 5da31c12101c..709742be02f0 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -32,6 +32,34 @@ static inline void mapping_set_error(struct address_space *mapping, int error)
 	}
 }
 
+#ifdef CONFIG_UNEVICTABLE_LRU
+#define AS_UNEVICTABLE	(__GFP_BITS_SHIFT + 2)	/* e.g., ramdisk, SHM_LOCK */
+
+static inline void mapping_set_unevictable(struct address_space *mapping)
+{
+	set_bit(AS_UNEVICTABLE, &mapping->flags);
+}
+
+static inline void mapping_clear_unevictable(struct address_space *mapping)
+{
+	clear_bit(AS_UNEVICTABLE, &mapping->flags);
+}
+
+static inline int mapping_unevictable(struct address_space *mapping)
+{
+	if (likely(mapping))
+		return test_bit(AS_UNEVICTABLE, &mapping->flags);
+	return !!mapping;
+}
+#else
+static inline void mapping_set_unevictable(struct address_space *mapping) { }
+static inline void mapping_clear_unevictable(struct address_space *mapping) { }
+static inline int mapping_unevictable(struct address_space *mapping)
+{
+	return 0;
+}
+#endif
+
 static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
 {
 	return (__force gfp_t)mapping->flags & __GFP_BITS_MASK;
@@ -271,19 +299,19 @@ extern int __lock_page_killable(struct page *page);
 extern void __lock_page_nosync(struct page *page);
 extern void unlock_page(struct page *page);
 
-static inline void set_page_locked(struct page *page)
+static inline void __set_page_locked(struct page *page)
 {
-	set_bit(PG_locked, &page->flags);
+	__set_bit(PG_locked, &page->flags);
 }
 
-static inline void clear_page_locked(struct page *page)
+static inline void __clear_page_locked(struct page *page)
 {
-	clear_bit(PG_locked, &page->flags);
+	__clear_bit(PG_locked, &page->flags);
 }
 
 static inline int trylock_page(struct page *page)
 {
-	return !test_and_set_bit(PG_locked, &page->flags);
+	return (likely(!test_and_set_bit_lock(PG_locked, &page->flags)));
 }
 
 /*
@@ -410,17 +438,17 @@ extern void __remove_from_page_cache(struct page *page);
 
 /*
  * Like add_to_page_cache_locked, but used to add newly allocated pages:
- * the page is new, so we can just run set_page_locked() against it.
+ * the page is new, so we can just run __set_page_locked() against it.
  */
 static inline int add_to_page_cache(struct page *page,
 		struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
 {
 	int error;
 
-	set_page_locked(page);
+	__set_page_locked(page);
 	error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
 	if (unlikely(error))
-		clear_page_locked(page);
+		__clear_page_locked(page);
 	return error;
 }
 
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 8eb7fa76c1d0..e90a2cb02915 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -23,9 +23,9 @@ struct pagevec {
 void __pagevec_release(struct pagevec *pvec);
 void __pagevec_release_nonlru(struct pagevec *pvec);
 void __pagevec_free(struct pagevec *pvec);
-void __pagevec_lru_add(struct pagevec *pvec);
-void __pagevec_lru_add_active(struct pagevec *pvec);
+void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru);
 void pagevec_strip(struct pagevec *pvec);
+void pagevec_swap_free(struct pagevec *pvec);
 unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
 		pgoff_t start, unsigned nr_pages);
 unsigned pagevec_lookup_tag(struct pagevec *pvec,
@@ -81,10 +81,36 @@ static inline void pagevec_free(struct pagevec *pvec)
 		__pagevec_free(pvec);
 }
 
-static inline void pagevec_lru_add(struct pagevec *pvec)
+static inline void __pagevec_lru_add_anon(struct pagevec *pvec)
+{
+	____pagevec_lru_add(pvec, LRU_INACTIVE_ANON);
+}
+
+static inline void __pagevec_lru_add_active_anon(struct pagevec *pvec)
+{
+	____pagevec_lru_add(pvec, LRU_ACTIVE_ANON);
+}
+
+static inline void __pagevec_lru_add_file(struct pagevec *pvec)
+{
+	____pagevec_lru_add(pvec, LRU_INACTIVE_FILE);
+}
+
+static inline void __pagevec_lru_add_active_file(struct pagevec *pvec)
+{
+	____pagevec_lru_add(pvec, LRU_ACTIVE_FILE);
+}
+
+static inline void pagevec_lru_add_file(struct pagevec *pvec)
+{
+	if (pagevec_count(pvec))
+		__pagevec_lru_add_file(pvec);
+}
+
+static inline void pagevec_lru_add_anon(struct pagevec *pvec)
 {
 	if (pagevec_count(pvec))
-		__pagevec_lru_add(pvec);
+		__pagevec_lru_add_anon(pvec);
 }
 
 #endif /* _LINUX_PAGEVEC_H */
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 98dc6243a706..acf8f24037cd 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -631,6 +631,8 @@ int __must_check pci_assign_resource(struct pci_dev *dev, int i);
 int pci_select_bars(struct pci_dev *dev, unsigned long flags);
 
 /* ROM control related routines */
+int pci_enable_rom(struct pci_dev *pdev);
+void pci_disable_rom(struct pci_dev *pdev);
 void __iomem __must_check *pci_map_rom(struct pci_dev *pdev, size_t *size);
 void pci_unmap_rom(struct pci_dev *pdev, void __iomem *rom);
 size_t pci_get_rom_size(void __iomem *rom, size_t size);
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index ea7416c901d1..22641d5d45df 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -94,7 +94,6 @@ extern void ptrace_notify(int exit_code);
 extern void __ptrace_link(struct task_struct *child,
 			  struct task_struct *new_parent);
 extern void __ptrace_unlink(struct task_struct *child);
-extern void ptrace_untrace(struct task_struct *child);
 #define PTRACE_MODE_READ   1
 #define PTRACE_MODE_ATTACH 2
 /* Returns 0 on success, -errno on denial. */
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index fed6f5e0b411..89f0564b10c8 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -39,18 +39,6 @@ struct anon_vma {
 
 #ifdef CONFIG_MMU
 
-extern struct kmem_cache *anon_vma_cachep;
-
-static inline struct anon_vma *anon_vma_alloc(void)
-{
-	return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
-}
-
-static inline void anon_vma_free(struct anon_vma *anon_vma)
-{
-	kmem_cache_free(anon_vma_cachep, anon_vma);
-}
-
 static inline void anon_vma_lock(struct vm_area_struct *vma)
 {
 	struct anon_vma *anon_vma = vma->anon_vma;
@@ -75,6 +63,9 @@ void anon_vma_unlink(struct vm_area_struct *);
 void anon_vma_link(struct vm_area_struct *);
 void __anon_vma_link(struct vm_area_struct *);
 
+extern struct anon_vma *page_lock_anon_vma(struct page *page);
+extern void page_unlock_anon_vma(struct anon_vma *anon_vma);
+
 /*
  * rmap interfaces called when adding or removing pte of page
  */
@@ -117,6 +108,19 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
  */
 int page_mkclean(struct page *);
 
+#ifdef CONFIG_UNEVICTABLE_LRU
+/*
+ * called in munlock()/munmap() path to check for other vmas holding
+ * the page mlocked.
+ */
+int try_to_munlock(struct page *);
+#else
+static inline int try_to_munlock(struct page *page)
+{
+	return 0;	/* a.k.a. SWAP_SUCCESS */
+}
+#endif
+
 #else	/* !CONFIG_MMU */
 
 #define anon_vma_init()		do {} while (0)
@@ -140,5 +144,6 @@ static inline int page_mkclean(struct page *page)
 #define SWAP_SUCCESS	0
 #define SWAP_AGAIN	1
 #define SWAP_FAIL	2
+#define SWAP_MLOCK	3
 
 #endif	/* _LINUX_RMAP_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c226c7b82946..f52dbd3587a7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -403,12 +403,21 @@ extern int get_dumpable(struct mm_struct *mm);
 #define MMF_DUMP_MAPPED_PRIVATE	4
 #define MMF_DUMP_MAPPED_SHARED	5
 #define MMF_DUMP_ELF_HEADERS	6
+#define MMF_DUMP_HUGETLB_PRIVATE 7
+#define MMF_DUMP_HUGETLB_SHARED  8
 #define MMF_DUMP_FILTER_SHIFT	MMF_DUMPABLE_BITS
-#define MMF_DUMP_FILTER_BITS	5
+#define MMF_DUMP_FILTER_BITS	7
 #define MMF_DUMP_FILTER_MASK \
 	(((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
 #define MMF_DUMP_FILTER_DEFAULT \
-	((1 << MMF_DUMP_ANON_PRIVATE) |	(1 << MMF_DUMP_ANON_SHARED))
+	((1 << MMF_DUMP_ANON_PRIVATE) |	(1 << MMF_DUMP_ANON_SHARED) |\
+	 (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)
+
+#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS
+# define MMF_DUMP_MASK_DEFAULT_ELF	(1 << MMF_DUMP_ELF_HEADERS)
+#else
+# define MMF_DUMP_MASK_DEFAULT_ELF	0
+#endif
 
 struct sighand_struct {
 	atomic_t		count;
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index a1783b229ef4..dc50bcc282a8 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -60,6 +60,19 @@ static inline int seq_nodemask(struct seq_file *m, nodemask_t *mask)
 	return seq_bitmap(m, mask->bits, MAX_NUMNODES);
 }
 
+int seq_bitmap_list(struct seq_file *m, unsigned long *bits,
+		unsigned int nr_bits);
+
+static inline int seq_cpumask_list(struct seq_file *m, cpumask_t *mask)
+{
+	return seq_bitmap_list(m, mask->bits, NR_CPUS);
+}
+
+static inline int seq_nodemask_list(struct seq_file *m, nodemask_t *mask)
+{
+	return seq_bitmap_list(m, mask->bits, MAX_NUMNODES);
+}
+
 int single_open(struct file *, int (*)(struct seq_file *, void *), void *);
 int single_release(struct inode *, struct file *);
 void *__seq_open_private(struct file *, const struct seq_operations *, int);
diff --git a/include/linux/swab.h b/include/linux/swab.h
index 270d5c208a89..bbed279f3b32 100644
--- a/include/linux/swab.h
+++ b/include/linux/swab.h
@@ -47,8 +47,6 @@ static inline __attribute_const__ __u16 ___swab16(__u16 val)
 {
 #ifdef __arch_swab16
 	return __arch_swab16(val);
-#elif defined(__arch_swab16p)
-	return __arch_swab16p(&val);
 #else
 	return __const_swab16(val);
 #endif
@@ -58,8 +56,6 @@ static inline __attribute_const__ __u32 ___swab32(__u32 val)
 {
 #ifdef __arch_swab32
 	return __arch_swab32(val);
-#elif defined(__arch_swab32p)
-	return __arch_swab32p(&val);
 #else
 	return __const_swab32(val);
 #endif
@@ -69,8 +65,6 @@ static inline __attribute_const__ __u64 ___swab64(__u64 val)
 {
 #ifdef __arch_swab64
 	return __arch_swab64(val);
-#elif defined(__arch_swab64p)
-	return __arch_swab64p(&val);
 #elif defined(__SWAB_64_THRU_32__)
 	__u32 h = val >> 32;
 	__u32 l = val & ((1ULL << 32) - 1);
@@ -84,8 +78,6 @@ static inline __attribute_const__ __u32 ___swahw32(__u32 val)
 {
 #ifdef __arch_swahw32
 	return __arch_swahw32(val);
-#elif defined(__arch_swahw32p)
-	return __arch_swahw32p(&val);
 #else
 	return __const_swahw32(val);
 #endif
@@ -95,8 +87,6 @@ static inline __attribute_const__ __u32 ___swahb32(__u32 val)
 {
 #ifdef __arch_swahb32
 	return __arch_swahb32(val);
-#elif defined(__arch_swahb32p)
-	return __arch_swahb32p(&val);
 #else
 	return __const_swahb32(val);
 #endif
diff --git a/include/linux/swap.h b/include/linux/swap.h
index de40f169a4e4..a3af95b2cb6d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -7,6 +7,7 @@
 #include <linux/list.h>
 #include <linux/memcontrol.h>
 #include <linux/sched.h>
+#include <linux/node.h>
 
 #include <asm/atomic.h>
 #include <asm/page.h>
@@ -171,8 +172,10 @@ extern unsigned int nr_free_pagecache_pages(void);
 
 
 /* linux/mm/swap.c */
-extern void lru_cache_add(struct page *);
-extern void lru_cache_add_active(struct page *);
+extern void __lru_cache_add(struct page *, enum lru_list lru);
+extern void lru_cache_add_lru(struct page *, enum lru_list lru);
+extern void lru_cache_add_active_or_unevictable(struct page *,
+					struct vm_area_struct *);
 extern void activate_page(struct page *);
 extern void mark_page_accessed(struct page *);
 extern void lru_add_drain(void);
@@ -180,12 +183,38 @@ extern int lru_add_drain_all(void);
 extern void rotate_reclaimable_page(struct page *page);
 extern void swap_setup(void);
 
+extern void add_page_to_unevictable_list(struct page *page);
+
+/**
+ * lru_cache_add: add a page to the page lists
+ * @page: the page to add
+ */
+static inline void lru_cache_add_anon(struct page *page)
+{
+	__lru_cache_add(page, LRU_INACTIVE_ANON);
+}
+
+static inline void lru_cache_add_active_anon(struct page *page)
+{
+	__lru_cache_add(page, LRU_ACTIVE_ANON);
+}
+
+static inline void lru_cache_add_file(struct page *page)
+{
+	__lru_cache_add(page, LRU_INACTIVE_FILE);
+}
+
+static inline void lru_cache_add_active_file(struct page *page)
+{
+	__lru_cache_add(page, LRU_ACTIVE_FILE);
+}
+
 /* linux/mm/vmscan.c */
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask);
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
 							gfp_t gfp_mask);
-extern int __isolate_lru_page(struct page *page, int mode);
+extern int __isolate_lru_page(struct page *page, int mode, int file);
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
@@ -204,6 +233,34 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
 }
 #endif
 
+#ifdef CONFIG_UNEVICTABLE_LRU
+extern int page_evictable(struct page *page, struct vm_area_struct *vma);
+extern void scan_mapping_unevictable_pages(struct address_space *);
+
+extern unsigned long scan_unevictable_pages;
+extern int scan_unevictable_handler(struct ctl_table *, int, struct file *,
+					void __user *, size_t *, loff_t *);
+extern int scan_unevictable_register_node(struct node *node);
+extern void scan_unevictable_unregister_node(struct node *node);
+#else
+static inline int page_evictable(struct page *page,
+						struct vm_area_struct *vma)
+{
+	return 1;
+}
+
+static inline void scan_mapping_unevictable_pages(struct address_space *mapping)
+{
+}
+
+static inline int scan_unevictable_register_node(struct node *node)
+{
+	return 0;
+}
+
+static inline void scan_unevictable_unregister_node(struct node *node) { }
+#endif
+
 extern int kswapd_run(int nid);
 
 #ifdef CONFIG_MMU
@@ -251,6 +308,7 @@ extern sector_t swapdev_block(int, pgoff_t);
 extern struct swap_info_struct *get_swap_info_struct(unsigned);
 extern int can_share_swap_page(struct page *);
 extern int remove_exclusive_swap_page(struct page *);
+extern int remove_exclusive_swap_page_ref(struct page *);
 struct backing_dev_info;
 
 /* linux/mm/thrash.c */
@@ -339,6 +397,11 @@ static inline int remove_exclusive_swap_page(struct page *p)
 	return 0;
 }
 
+static inline int remove_exclusive_swap_page_ref(struct page *page)
+{
+	return 0;
+}
+
 static inline swp_entry_t get_swap_page(void)
 {
 	swp_entry_t entry;
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index b330e289d71f..9d68fed50f11 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -21,8 +21,9 @@ struct kobject;
 struct module;
 
 /* FIXME
- * The *owner field is no longer used, but leave around
- * until the tree gets cleaned up fully.
+ * The *owner field is no longer used.
+ * x86 tree has been cleaned up. The owner
+ * attribute is still left for other arches.
  */
 struct attribute {
 	const char		*name;
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 328eb4022727..4c28c4d564e2 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -2,6 +2,7 @@
 #define _LINUX_VMALLOC_H
 
 #include <linux/spinlock.h>
+#include <linux/init.h>
 #include <asm/page.h>		/* pgprot_t */
 
 struct vm_area_struct;		/* vma defining user mapping in mm_types.h */
@@ -23,7 +24,6 @@ struct vm_area_struct;		/* vma defining user mapping in mm_types.h */
 #endif
 
 struct vm_struct {
-	/* keep next,addr,size together to speedup lookups */
 	struct vm_struct	*next;
 	void			*addr;
 	unsigned long		size;
@@ -37,6 +37,19 @@ struct vm_struct {
 /*
  *	Highlevel APIs for driver use
  */
+extern void vm_unmap_ram(const void *mem, unsigned int count);
+extern void *vm_map_ram(struct page **pages, unsigned int count,
+				int node, pgprot_t prot);
+extern void vm_unmap_aliases(void);
+
+#ifdef CONFIG_MMU
+extern void __init vmalloc_init(void);
+#else
+static inline void vmalloc_init(void)
+{
+}
+#endif
+
 extern void *vmalloc(unsigned long size);
 extern void *vmalloc_user(unsigned long size);
 extern void *vmalloc_node(unsigned long size, int node);
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 58334d439516..9cd3ab0f554d 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -41,6 +41,16 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 #ifdef CONFIG_HUGETLB_PAGE
 		HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL,
 #endif
+#ifdef CONFIG_UNEVICTABLE_LRU
+		UNEVICTABLE_PGCULLED,	/* culled to noreclaim list */
+		UNEVICTABLE_PGSCANNED,	/* scanned for reclaimability */
+		UNEVICTABLE_PGRESCUED,	/* rescued from noreclaim list */
+		UNEVICTABLE_PGMLOCKED,
+		UNEVICTABLE_PGMUNLOCKED,
+		UNEVICTABLE_PGCLEARED,	/* on COW, page truncate */
+		UNEVICTABLE_PGSTRANDED,	/* unable to isolate on unlock */
+		UNEVICTABLE_MLOCKFREED,
+#endif
 		NR_VM_EVENT_ITEMS
 };
 
@@ -159,6 +169,16 @@ static inline unsigned long zone_page_state(struct zone *zone,
 	return x;
 }
 
+extern unsigned long global_lru_pages(void);
+
+static inline unsigned long zone_lru_pages(struct zone *zone)
+{
+	return (zone_page_state(zone, NR_ACTIVE_ANON)
+		+ zone_page_state(zone, NR_ACTIVE_FILE)
+		+ zone_page_state(zone, NR_INACTIVE_ANON)
+		+ zone_page_state(zone, NR_INACTIVE_FILE));
+}
+
 #ifdef CONFIG_NUMA
 /*
  * Determine the per node value of a stat item. This function
diff --git a/include/net/netns/x_tables.h b/include/net/netns/x_tables.h
index 0cb63ed2c1fc..b8093971ccb4 100644
--- a/include/net/netns/x_tables.h
+++ b/include/net/netns/x_tables.h
@@ -2,9 +2,9 @@
 #define __NETNS_X_TABLES_H
 
 #include <linux/list.h>
-#include <linux/net.h>
+#include <linux/netfilter.h>
 
 struct netns_xt {
-	struct list_head tables[NPROTO];
+	struct list_head tables[NFPROTO_NUMPROTO];
 };
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index 5ceff3249a2d..8828ed0b2051 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -299,6 +299,13 @@ config CGROUP_NS
           for instance virtual servers and checkpoint/restart
           jobs.
 
+config CGROUP_FREEZER
+        bool "control group freezer subsystem"
+        depends on CGROUPS
+        help
+          Provides a way to freeze and unfreeze all tasks in a
+	  cgroup.
+
 config CGROUP_DEVICE
 	bool "Device controller for cgroups"
 	depends on CGROUPS && EXPERIMENTAL
diff --git a/init/main.c b/init/main.c
index 27f6bf6108e9..4371d11721f6 100644
--- a/init/main.c
+++ b/init/main.c
@@ -27,6 +27,7 @@
 #include <linux/gfp.h>
 #include <linux/percpu.h>
 #include <linux/kmod.h>
+#include <linux/vmalloc.h>
 #include <linux/kernel_stat.h>
 #include <linux/start_kernel.h>
 #include <linux/security.h>
@@ -642,6 +643,7 @@ asmlinkage void __init start_kernel(void)
 		initrd_start = 0;
 	}
 #endif
+	vmalloc_init();
 	vfs_caches_init_early();
 	cpuset_init_early();
 	mem_init();
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 96fb36cd9874..68eb857cfdea 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -52,6 +52,14 @@
 #define HARD_MSGMAX 	(131072/sizeof(void*))
 #define DFLT_MSGSIZEMAX 8192	/* max message size */
 
+/*
+ * Define the ranges various user-specified maximum values can
+ * be set to.
+ */
+#define MIN_MSGMAX	1		/* min value for msg_max */
+#define MAX_MSGMAX	HARD_MSGMAX	/* max value for msg_max */
+#define MIN_MSGSIZEMAX	128		/* min value for msgsize_max */
+#define MAX_MSGSIZEMAX	(8192*128)	/* max value for msgsize_max */
 
 struct ext_wait_queue {		/* queue of sleeping tasks */
 	struct task_struct *task;
@@ -134,8 +142,8 @@ static struct inode *mqueue_get_inode(struct super_block *sb, int mode,
 			info->qsize = 0;
 			info->user = NULL;	/* set when all is ok */
 			memset(&info->attr, 0, sizeof(info->attr));
-			info->attr.mq_maxmsg = DFLT_MSGMAX;
-			info->attr.mq_msgsize = DFLT_MSGSIZEMAX;
+			info->attr.mq_maxmsg = msg_max;
+			info->attr.mq_msgsize = msgsize_max;
 			if (attr) {
 				info->attr.mq_maxmsg = attr->mq_maxmsg;
 				info->attr.mq_msgsize = attr->mq_msgsize;
@@ -1191,11 +1199,11 @@ static struct file_system_type mqueue_fs_type = {
 	.kill_sb = kill_litter_super,
 };
 
-static int msg_max_limit_min = DFLT_MSGMAX;
-static int msg_max_limit_max = HARD_MSGMAX;
+static int msg_max_limit_min = MIN_MSGMAX;
+static int msg_max_limit_max = MAX_MSGMAX;
 
-static int msg_maxsize_limit_min = DFLT_MSGSIZEMAX;
-static int msg_maxsize_limit_max = INT_MAX;
+static int msg_maxsize_limit_min = MIN_MSGSIZEMAX;
+static int msg_maxsize_limit_max = MAX_MSGSIZEMAX;
 
 static ctl_table mq_sysctls[] = {
 	{
diff --git a/ipc/shm.c b/ipc/shm.c
index e77ec698cf40..0add3fa5f547 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -737,6 +737,10 @@ asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf)
 	case SHM_LOCK:
 	case SHM_UNLOCK:
 	{
+		struct file *uninitialized_var(shm_file);
+
+		lru_add_drain_all();  /* drain pagevecs to lru lists */
+
 		shp = shm_lock_check(ns, shmid);
 		if (IS_ERR(shp)) {
 			err = PTR_ERR(shp);
diff --git a/kernel/Kconfig.freezer b/kernel/Kconfig.freezer
new file mode 100644
index 000000000000..a3bb4cb52539
--- /dev/null
+++ b/kernel/Kconfig.freezer
@@ -0,0 +1,2 @@
+config FREEZER
+	def_bool PM_SLEEP || CGROUP_FREEZER
diff --git a/kernel/Makefile b/kernel/Makefile
index 4e1d7df7c3e2..066550aa61c5 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -24,6 +24,7 @@ CFLAGS_REMOVE_sched_clock.o = -pg
 CFLAGS_REMOVE_sched.o = -mno-spe -pg
 endif
 
+obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
 obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
@@ -55,6 +56,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CGROUPS) += cgroup.o
 obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
+obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
 obj-$(CONFIG_UTS_NS) += utsname.o
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 8c6e1c17e6d3..046c1609606b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -241,7 +241,6 @@ static void unlink_css_set(struct css_set *cg)
 	struct cg_cgroup_link *link;
 	struct cg_cgroup_link *saved_link;
 
-	write_lock(&css_set_lock);
 	hlist_del(&cg->hlist);
 	css_set_count--;
 
@@ -251,16 +250,25 @@ static void unlink_css_set(struct css_set *cg)
 		list_del(&link->cgrp_link_list);
 		kfree(link);
 	}
-
-	write_unlock(&css_set_lock);
 }
 
-static void __release_css_set(struct kref *k, int taskexit)
+static void __put_css_set(struct css_set *cg, int taskexit)
 {
 	int i;
-	struct css_set *cg = container_of(k, struct css_set, ref);
-
+	/*
+	 * Ensure that the refcount doesn't hit zero while any readers
+	 * can see it. Similar to atomic_dec_and_lock(), but for an
+	 * rwlock
+	 */
+	if (atomic_add_unless(&cg->refcount, -1, 1))
+		return;
+	write_lock(&css_set_lock);
+	if (!atomic_dec_and_test(&cg->refcount)) {
+		write_unlock(&css_set_lock);
+		return;
+	}
 	unlink_css_set(cg);
+	write_unlock(&css_set_lock);
 
 	rcu_read_lock();
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
@@ -276,32 +284,22 @@ static void __release_css_set(struct kref *k, int taskexit)
 	kfree(cg);
 }
 
-static void release_css_set(struct kref *k)
-{
-	__release_css_set(k, 0);
-}
-
-static void release_css_set_taskexit(struct kref *k)
-{
-	__release_css_set(k, 1);
-}
-
 /*
  * refcounted get/put for css_set objects
  */
 static inline void get_css_set(struct css_set *cg)
 {
-	kref_get(&cg->ref);
+	atomic_inc(&cg->refcount);
 }
 
 static inline void put_css_set(struct css_set *cg)
 {
-	kref_put(&cg->ref, release_css_set);
+	__put_css_set(cg, 0);
 }
 
 static inline void put_css_set_taskexit(struct css_set *cg)
 {
-	kref_put(&cg->ref, release_css_set_taskexit);
+	__put_css_set(cg, 1);
 }
 
 /*
@@ -427,7 +425,7 @@ static struct css_set *find_css_set(
 		return NULL;
 	}
 
-	kref_init(&res->ref);
+	atomic_set(&res->refcount, 1);
 	INIT_LIST_HEAD(&res->cg_links);
 	INIT_LIST_HEAD(&res->tasks);
 	INIT_HLIST_NODE(&res->hlist);
@@ -870,6 +868,14 @@ static struct super_operations cgroup_ops = {
 	.remount_fs = cgroup_remount,
 };
 
+static void init_cgroup_housekeeping(struct cgroup *cgrp)
+{
+	INIT_LIST_HEAD(&cgrp->sibling);
+	INIT_LIST_HEAD(&cgrp->children);
+	INIT_LIST_HEAD(&cgrp->css_sets);
+	INIT_LIST_HEAD(&cgrp->release_list);
+	init_rwsem(&cgrp->pids_mutex);
+}
 static void init_cgroup_root(struct cgroupfs_root *root)
 {
 	struct cgroup *cgrp = &root->top_cgroup;
@@ -878,10 +884,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
 	root->number_of_cgroups = 1;
 	cgrp->root = root;
 	cgrp->top_cgroup = cgrp;
-	INIT_LIST_HEAD(&cgrp->sibling);
-	INIT_LIST_HEAD(&cgrp->children);
-	INIT_LIST_HEAD(&cgrp->css_sets);
-	INIT_LIST_HEAD(&cgrp->release_list);
+	init_cgroup_housekeeping(cgrp);
 }
 
 static int cgroup_test_super(struct super_block *sb, void *data)
@@ -1728,7 +1731,7 @@ int cgroup_task_count(const struct cgroup *cgrp)
 
 	read_lock(&css_set_lock);
 	list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
-		count += atomic_read(&link->cg->ref.refcount);
+		count += atomic_read(&link->cg->refcount);
 	}
 	read_unlock(&css_set_lock);
 	return count;
@@ -1997,16 +2000,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
  * but we cannot guarantee that the information we produce is correct
  * unless we produce it entirely atomically.
  *
- * Upon tasks file open(), a struct ctr_struct is allocated, that
- * will have a pointer to an array (also allocated here).  The struct
- * ctr_struct * is stored in file->private_data.  Its resources will
- * be freed by release() when the file is closed.  The array is used
- * to sprintf the PIDs and then used by read().
  */
-struct ctr_struct {
-	char *buf;
-	int bufsz;
-};
 
 /*
  * Load into 'pidarray' up to 'npids' of the tasks using cgroup
@@ -2088,42 +2082,132 @@ static int cmppid(const void *a, const void *b)
 	return *(pid_t *)a - *(pid_t *)b;
 }
 
+
 /*
- * Convert array 'a' of 'npids' pid_t's to a string of newline separated
- * decimal pids in 'buf'.  Don't write more than 'sz' chars, but return
- * count 'cnt' of how many chars would be written if buf were large enough.
+ * seq_file methods for the "tasks" file. The seq_file position is the
+ * next pid to display; the seq_file iterator is a pointer to the pid
+ * in the cgroup->tasks_pids array.
  */
-static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
+
+static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
 {
-	int cnt = 0;
-	int i;
+	/*
+	 * Initially we receive a position value that corresponds to
+	 * one more than the last pid shown (or 0 on the first call or
+	 * after a seek to the start). Use a binary-search to find the
+	 * next pid to display, if any
+	 */
+	struct cgroup *cgrp = s->private;
+	int index = 0, pid = *pos;
+	int *iter;
 
-	for (i = 0; i < npids; i++)
-		cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]);
-	return cnt;
+	down_read(&cgrp->pids_mutex);
+	if (pid) {
+		int end = cgrp->pids_length;
+		int i;
+		while (index < end) {
+			int mid = (index + end) / 2;
+			if (cgrp->tasks_pids[mid] == pid) {
+				index = mid;
+				break;
+			} else if (cgrp->tasks_pids[mid] <= pid)
+				index = mid + 1;
+			else
+				end = mid;
+		}
+	}
+	/* If we're off the end of the array, we're done */
+	if (index >= cgrp->pids_length)
+		return NULL;
+	/* Update the abstract position to be the actual pid that we found */
+	iter = cgrp->tasks_pids + index;
+	*pos = *iter;
+	return iter;
+}
+
+static void cgroup_tasks_stop(struct seq_file *s, void *v)
+{
+	struct cgroup *cgrp = s->private;
+	up_read(&cgrp->pids_mutex);
 }
 
+static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct cgroup *cgrp = s->private;
+	int *p = v;
+	int *end = cgrp->tasks_pids + cgrp->pids_length;
+
+	/*
+	 * Advance to the next pid in the array. If this goes off the
+	 * end, we're done
+	 */
+	p++;
+	if (p >= end) {
+		return NULL;
+	} else {
+		*pos = *p;
+		return p;
+	}
+}
+
+static int cgroup_tasks_show(struct seq_file *s, void *v)
+{
+	return seq_printf(s, "%d\n", *(int *)v);
+}
+
+static struct seq_operations cgroup_tasks_seq_operations = {
+	.start = cgroup_tasks_start,
+	.stop = cgroup_tasks_stop,
+	.next = cgroup_tasks_next,
+	.show = cgroup_tasks_show,
+};
+
+static void release_cgroup_pid_array(struct cgroup *cgrp)
+{
+	down_write(&cgrp->pids_mutex);
+	BUG_ON(!cgrp->pids_use_count);
+	if (!--cgrp->pids_use_count) {
+		kfree(cgrp->tasks_pids);
+		cgrp->tasks_pids = NULL;
+		cgrp->pids_length = 0;
+	}
+	up_write(&cgrp->pids_mutex);
+}
+
+static int cgroup_tasks_release(struct inode *inode, struct file *file)
+{
+	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+
+	if (!(file->f_mode & FMODE_READ))
+		return 0;
+
+	release_cgroup_pid_array(cgrp);
+	return seq_release(inode, file);
+}
+
+static struct file_operations cgroup_tasks_operations = {
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.write = cgroup_file_write,
+	.release = cgroup_tasks_release,
+};
+
 /*
- * Handle an open on 'tasks' file.  Prepare a buffer listing the
+ * Handle an open on 'tasks' file.  Prepare an array containing the
  * process id's of tasks currently attached to the cgroup being opened.
- *
- * Does not require any specific cgroup mutexes, and does not take any.
  */
+
 static int cgroup_tasks_open(struct inode *unused, struct file *file)
 {
 	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
-	struct ctr_struct *ctr;
 	pid_t *pidarray;
 	int npids;
-	char c;
+	int retval;
 
+	/* Nothing to do for write-only files */
 	if (!(file->f_mode & FMODE_READ))
 		return 0;
 
-	ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
-	if (!ctr)
-		goto err0;
-
 	/*
 	 * If cgroup gets more users after we read count, we won't have
 	 * enough space - tough.  This race is indistinguishable to the
@@ -2131,57 +2215,31 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
 	 * show up until sometime later on.
 	 */
 	npids = cgroup_task_count(cgrp);
-	if (npids) {
-		pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
-		if (!pidarray)
-			goto err1;
-
-		npids = pid_array_load(pidarray, npids, cgrp);
-		sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
-
-		/* Call pid_array_to_buf() twice, first just to get bufsz */
-		ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
-		ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
-		if (!ctr->buf)
-			goto err2;
-		ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
-
-		kfree(pidarray);
-	} else {
-		ctr->buf = NULL;
-		ctr->bufsz = 0;
-	}
-	file->private_data = ctr;
-	return 0;
-
-err2:
-	kfree(pidarray);
-err1:
-	kfree(ctr);
-err0:
-	return -ENOMEM;
-}
+	pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
+	if (!pidarray)
+		return -ENOMEM;
+	npids = pid_array_load(pidarray, npids, cgrp);
+	sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
 
-static ssize_t cgroup_tasks_read(struct cgroup *cgrp,
-				    struct cftype *cft,
-				    struct file *file, char __user *buf,
-				    size_t nbytes, loff_t *ppos)
-{
-	struct ctr_struct *ctr = file->private_data;
+	/*
+	 * Store the array in the cgroup, freeing the old
+	 * array if necessary
+	 */
+	down_write(&cgrp->pids_mutex);
+	kfree(cgrp->tasks_pids);
+	cgrp->tasks_pids = pidarray;
+	cgrp->pids_length = npids;
+	cgrp->pids_use_count++;
+	up_write(&cgrp->pids_mutex);
 
-	return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
-}
+	file->f_op = &cgroup_tasks_operations;
 
-static int cgroup_tasks_release(struct inode *unused_inode,
-					struct file *file)
-{
-	struct ctr_struct *ctr;
-
-	if (file->f_mode & FMODE_READ) {
-		ctr = file->private_data;
-		kfree(ctr->buf);
-		kfree(ctr);
+	retval = seq_open(file, &cgroup_tasks_seq_operations);
+	if (retval) {
+		release_cgroup_pid_array(cgrp);
+		return retval;
 	}
+	((struct seq_file *)file->private_data)->private = cgrp;
 	return 0;
 }
 
@@ -2210,7 +2268,6 @@ static struct cftype files[] = {
 	{
 		.name = "tasks",
 		.open = cgroup_tasks_open,
-		.read = cgroup_tasks_read,
 		.write_u64 = cgroup_tasks_write,
 		.release = cgroup_tasks_release,
 		.private = FILE_TASKLIST,
@@ -2300,10 +2357,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
 	mutex_lock(&cgroup_mutex);
 
-	INIT_LIST_HEAD(&cgrp->sibling);
-	INIT_LIST_HEAD(&cgrp->children);
-	INIT_LIST_HEAD(&cgrp->css_sets);
-	INIT_LIST_HEAD(&cgrp->release_list);
+	init_cgroup_housekeeping(cgrp);
 
 	cgrp->parent = parent;
 	cgrp->root = parent->root;
@@ -2495,8 +2549,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 int __init cgroup_init_early(void)
 {
 	int i;
-	kref_init(&init_css_set.ref);
-	kref_get(&init_css_set.ref);
+	atomic_set(&init_css_set.refcount, 1);
 	INIT_LIST_HEAD(&init_css_set.cg_links);
 	INIT_LIST_HEAD(&init_css_set.tasks);
 	INIT_HLIST_NODE(&init_css_set.hlist);
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
index c3dc3aba4c02..daca6209202d 100644
--- a/kernel/cgroup_debug.c
+++ b/kernel/cgroup_debug.c
@@ -57,7 +57,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cont,
 	u64 count;
 
 	rcu_read_lock();
-	count = atomic_read(&current->cgroups->ref.refcount);
+	count = atomic_read(&current->cgroups->refcount);
 	rcu_read_unlock();
 	return count;
 }
@@ -90,7 +90,7 @@ static struct cftype files[] =  {
 	{
 		.name = "releasable",
 		.read_u64 = releasable_read,
-	}
+	},
 };
 
 static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
new file mode 100644
index 000000000000..e95056954498
--- /dev/null
+++ b/kernel/cgroup_freezer.c
@@ -0,0 +1,379 @@
+/*
+ * cgroup_freezer.c -  control group freezer subsystem
+ *
+ * Copyright IBM Corporation, 2007
+ *
+ * Author : Cedric Le Goater <clg@fr.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#include <linux/module.h>
+#include <linux/cgroup.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/freezer.h>
+#include <linux/seq_file.h>
+
+enum freezer_state {
+	CGROUP_THAWED = 0,
+	CGROUP_FREEZING,
+	CGROUP_FROZEN,
+};
+
+struct freezer {
+	struct cgroup_subsys_state css;
+	enum freezer_state state;
+	spinlock_t lock; /* protects _writes_ to state */
+};
+
+static inline struct freezer *cgroup_freezer(
+		struct cgroup *cgroup)
+{
+	return container_of(
+		cgroup_subsys_state(cgroup, freezer_subsys_id),
+		struct freezer, css);
+}
+
+static inline struct freezer *task_freezer(struct task_struct *task)
+{
+	return container_of(task_subsys_state(task, freezer_subsys_id),
+			    struct freezer, css);
+}
+
+int cgroup_frozen(struct task_struct *task)
+{
+	struct freezer *freezer;
+	enum freezer_state state;
+
+	task_lock(task);
+	freezer = task_freezer(task);
+	state = freezer->state;
+	task_unlock(task);
+
+	return state == CGROUP_FROZEN;
+}
+
+/*
+ * cgroups_write_string() limits the size of freezer state strings to
+ * CGROUP_LOCAL_BUFFER_SIZE
+ */
+static const char *freezer_state_strs[] = {
+	"THAWED",
+	"FREEZING",
+	"FROZEN",
+};
+
+/*
+ * State diagram
+ * Transitions are caused by userspace writes to the freezer.state file.
+ * The values in parenthesis are state labels. The rest are edge labels.
+ *
+ * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN)
+ *    ^ ^                    |                     |
+ *    | \_______THAWED_______/                     |
+ *    \__________________________THAWED____________/
+ */
+
+struct cgroup_subsys freezer_subsys;
+
+/* Locks taken and their ordering
+ * ------------------------------
+ * css_set_lock
+ * cgroup_mutex (AKA cgroup_lock)
+ * task->alloc_lock (AKA task_lock)
+ * freezer->lock
+ * task->sighand->siglock
+ *
+ * cgroup code forces css_set_lock to be taken before task->alloc_lock
+ *
+ * freezer_create(), freezer_destroy():
+ * cgroup_mutex [ by cgroup core ]
+ *
+ * can_attach():
+ * cgroup_mutex
+ *
+ * cgroup_frozen():
+ * task->alloc_lock (to get task's cgroup)
+ *
+ * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
+ * task->alloc_lock (to get task's cgroup)
+ * freezer->lock
+ *  sighand->siglock (if the cgroup is freezing)
+ *
+ * freezer_read():
+ * cgroup_mutex
+ *  freezer->lock
+ *   read_lock css_set_lock (cgroup iterator start)
+ *
+ * freezer_write() (freeze):
+ * cgroup_mutex
+ *  freezer->lock
+ *   read_lock css_set_lock (cgroup iterator start)
+ *    sighand->siglock
+ *
+ * freezer_write() (unfreeze):
+ * cgroup_mutex
+ *  freezer->lock
+ *   read_lock css_set_lock (cgroup iterator start)
+ *    task->alloc_lock (to prevent races with freeze_task())
+ *     sighand->siglock
+ */
+static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
+						  struct cgroup *cgroup)
+{
+	struct freezer *freezer;
+
+	freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL);
+	if (!freezer)
+		return ERR_PTR(-ENOMEM);
+
+	spin_lock_init(&freezer->lock);
+	freezer->state = CGROUP_THAWED;
+	return &freezer->css;
+}
+
+static void freezer_destroy(struct cgroup_subsys *ss,
+			    struct cgroup *cgroup)
+{
+	kfree(cgroup_freezer(cgroup));
+}
+
+/* Task is frozen or will freeze immediately when next it gets woken */
+static bool is_task_frozen_enough(struct task_struct *task)
+{
+	return frozen(task) ||
+		(task_is_stopped_or_traced(task) && freezing(task));
+}
+
+/*
+ * The call to cgroup_lock() in the freezer.state write method prevents
+ * a write to that file racing against an attach, and hence the
+ * can_attach() result will remain valid until the attach completes.
+ */
+static int freezer_can_attach(struct cgroup_subsys *ss,
+			      struct cgroup *new_cgroup,
+			      struct task_struct *task)
+{
+	struct freezer *freezer;
+	int retval;
+
+	/* Anything frozen can't move or be moved to/from */
+
+	if (is_task_frozen_enough(task))
+		return -EBUSY;
+
+	freezer = cgroup_freezer(new_cgroup);
+	if (freezer->state == CGROUP_FROZEN)
+		return -EBUSY;
+
+	retval = 0;
+	task_lock(task);
+	freezer = task_freezer(task);
+	if (freezer->state == CGROUP_FROZEN)
+		retval = -EBUSY;
+	task_unlock(task);
+	return retval;
+}
+
+static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
+{
+	struct freezer *freezer;
+
+	task_lock(task);
+	freezer = task_freezer(task);
+	task_unlock(task);
+
+	BUG_ON(freezer->state == CGROUP_FROZEN);
+	spin_lock_irq(&freezer->lock);
+	/* Locking avoids race with FREEZING -> THAWED transitions. */
+	if (freezer->state == CGROUP_FREEZING)
+		freeze_task(task, true);
+	spin_unlock_irq(&freezer->lock);
+}
+
+/*
+ * caller must hold freezer->lock
+ */
+static void update_freezer_state(struct cgroup *cgroup,
+				 struct freezer *freezer)
+{
+	struct cgroup_iter it;
+	struct task_struct *task;
+	unsigned int nfrozen = 0, ntotal = 0;
+
+	cgroup_iter_start(cgroup, &it);
+	while ((task = cgroup_iter_next(cgroup, &it))) {
+		ntotal++;
+		if (is_task_frozen_enough(task))
+			nfrozen++;
+	}
+
+	/*
+	 * Transition to FROZEN when no new tasks can be added ensures
+	 * that we never exist in the FROZEN state while there are unfrozen
+	 * tasks.
+	 */
+	if (nfrozen == ntotal)
+		freezer->state = CGROUP_FROZEN;
+	else if (nfrozen > 0)
+		freezer->state = CGROUP_FREEZING;
+	else
+		freezer->state = CGROUP_THAWED;
+	cgroup_iter_end(cgroup, &it);
+}
+
+static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
+			struct seq_file *m)
+{
+	struct freezer *freezer;
+	enum freezer_state state;
+
+	if (!cgroup_lock_live_group(cgroup))
+		return -ENODEV;
+
+	freezer = cgroup_freezer(cgroup);
+	spin_lock_irq(&freezer->lock);
+	state = freezer->state;
+	if (state == CGROUP_FREEZING) {
+		/* We change from FREEZING to FROZEN lazily if the cgroup was
+		 * only partially frozen when we exitted write. */
+		update_freezer_state(cgroup, freezer);
+		state = freezer->state;
+	}
+	spin_unlock_irq(&freezer->lock);
+	cgroup_unlock();
+
+	seq_puts(m, freezer_state_strs[state]);
+	seq_putc(m, '\n');
+	return 0;
+}
+
+static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
+{
+	struct cgroup_iter it;
+	struct task_struct *task;
+	unsigned int num_cant_freeze_now = 0;
+
+	freezer->state = CGROUP_FREEZING;
+	cgroup_iter_start(cgroup, &it);
+	while ((task = cgroup_iter_next(cgroup, &it))) {
+		if (!freeze_task(task, true))
+			continue;
+		if (is_task_frozen_enough(task))
+			continue;
+		if (!freezing(task) && !freezer_should_skip(task))
+			num_cant_freeze_now++;
+	}
+	cgroup_iter_end(cgroup, &it);
+
+	return num_cant_freeze_now ? -EBUSY : 0;
+}
+
+static int unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
+{
+	struct cgroup_iter it;
+	struct task_struct *task;
+
+	cgroup_iter_start(cgroup, &it);
+	while ((task = cgroup_iter_next(cgroup, &it))) {
+		int do_wake;
+
+		task_lock(task);
+		do_wake = __thaw_process(task);
+		task_unlock(task);
+		if (do_wake)
+			wake_up_process(task);
+	}
+	cgroup_iter_end(cgroup, &it);
+	freezer->state = CGROUP_THAWED;
+
+	return 0;
+}
+
+static int freezer_change_state(struct cgroup *cgroup,
+				enum freezer_state goal_state)
+{
+	struct freezer *freezer;
+	int retval = 0;
+
+	freezer = cgroup_freezer(cgroup);
+	spin_lock_irq(&freezer->lock);
+	update_freezer_state(cgroup, freezer);
+	if (goal_state == freezer->state)
+		goto out;
+	switch (freezer->state) {
+	case CGROUP_THAWED:
+		retval = try_to_freeze_cgroup(cgroup, freezer);
+		break;
+	case CGROUP_FREEZING:
+		if (goal_state == CGROUP_FROZEN) {
+			/* Userspace is retrying after
+			 * "/bin/echo FROZEN > freezer.state" returned -EBUSY */
+			retval = try_to_freeze_cgroup(cgroup, freezer);
+			break;
+		}
+		/* state == FREEZING and goal_state == THAWED, so unfreeze */
+	case CGROUP_FROZEN:
+		retval = unfreeze_cgroup(cgroup, freezer);
+		break;
+	default:
+		break;
+	}
+out:
+	spin_unlock_irq(&freezer->lock);
+
+	return retval;
+}
+
+static int freezer_write(struct cgroup *cgroup,
+			 struct cftype *cft,
+			 const char *buffer)
+{
+	int retval;
+	enum freezer_state goal_state;
+
+	if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0)
+		goal_state = CGROUP_THAWED;
+	else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0)
+		goal_state = CGROUP_FROZEN;
+	else
+		return -EIO;
+
+	if (!cgroup_lock_live_group(cgroup))
+		return -ENODEV;
+	retval = freezer_change_state(cgroup, goal_state);
+	cgroup_unlock();
+	return retval;
+}
+
+static struct cftype files[] = {
+	{
+		.name = "state",
+		.read_seq_string = freezer_read,
+		.write_string = freezer_write,
+	},
+};
+
+static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
+{
+	return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
+}
+
+struct cgroup_subsys freezer_subsys = {
+	.name		= "freezer",
+	.create		= freezer_create,
+	.destroy	= freezer_destroy,
+	.populate	= freezer_populate,
+	.subsys_id	= freezer_subsys_id,
+	.can_attach	= freezer_can_attach,
+	.attach		= NULL,
+	.fork		= freezer_fork,
+	.exit		= NULL,
+};
diff --git a/kernel/configs.c b/kernel/configs.c
index 4c345210ed8c..abaee684ecbf 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -54,9 +54,6 @@
 
 #ifdef CONFIG_IKCONFIG_PROC
 
-/**************************************************/
-/* globals and useful constants                   */
-
 static ssize_t
 ikconfig_read_current(struct file *file, char __user *buf,
 		      size_t len, loff_t * offset)
@@ -71,9 +68,6 @@ static const struct file_operations ikconfig_file_ops = {
 	.read = ikconfig_read_current,
 };
 
-/***************************************************/
-/* ikconfig_init: start up everything we need to */
-
 static int __init ikconfig_init(void)
 {
 	struct proc_dir_entry *entry;
@@ -89,9 +83,6 @@ static int __init ikconfig_init(void)
 	return 0;
 }
 
-/***************************************************/
-/* ikconfig_cleanup: clean up our mess           */
-
 static void __exit ikconfig_cleanup(void)
 {
 	remove_proc_entry("config.gz", NULL);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index eab7bd6628e0..3e00526f52ec 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1172,7 +1172,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 {
 	struct cpuset trialcs;
 	int err;
-	int cpus_nonempty, balance_flag_changed;
+	int balance_flag_changed;
 
 	trialcs = *cs;
 	if (turning_on)
@@ -1184,7 +1184,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	if (err < 0)
 		return err;
 
-	cpus_nonempty = !cpus_empty(trialcs.cpus_allowed);
 	balance_flag_changed = (is_sched_load_balance(cs) !=
 		 			is_sched_load_balance(&trialcs));
 
@@ -1192,7 +1191,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	cs->flags = trialcs.flags;
 	mutex_unlock(&callback_mutex);
 
-	if (cpus_nonempty && balance_flag_changed)
+	if (!cpus_empty(trialcs.cpus_allowed) && balance_flag_changed)
 		async_rebuild_sched_domains();
 
 	return 0;
@@ -2437,19 +2436,15 @@ const struct file_operations proc_cpuset_operations = {
 void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
 {
 	seq_printf(m, "Cpus_allowed:\t");
-	m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count,
-					task->cpus_allowed);
+	seq_cpumask(m, &task->cpus_allowed);
 	seq_printf(m, "\n");
 	seq_printf(m, "Cpus_allowed_list:\t");
-	m->count += cpulist_scnprintf(m->buf + m->count, m->size - m->count,
-					task->cpus_allowed);
+	seq_cpumask_list(m, &task->cpus_allowed);
 	seq_printf(m, "\n");
 	seq_printf(m, "Mems_allowed:\t");
-	m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count,
-					task->mems_allowed);
+	seq_nodemask(m, &task->mems_allowed);
 	seq_printf(m, "\n");
 	seq_printf(m, "Mems_allowed_list:\t");
-	m->count += nodelist_scnprintf(m->buf + m->count, m->size - m->count,
-					task->mems_allowed);
+	seq_nodemask_list(m, &task->mems_allowed);
 	seq_printf(m, "\n");
 }
diff --git a/kernel/freezer.c b/kernel/freezer.c
new file mode 100644
index 000000000000..ba6248b323ef
--- /dev/null
+++ b/kernel/freezer.c
@@ -0,0 +1,154 @@
+/*
+ * kernel/freezer.c - Function to freeze a process
+ *
+ * Originally from kernel/power/process.c
+ */
+
+#include <linux/interrupt.h>
+#include <linux/suspend.h>
+#include <linux/module.h>
+#include <linux/syscalls.h>
+#include <linux/freezer.h>
+
+/*
+ * freezing is complete, mark current process as frozen
+ */
+static inline void frozen_process(void)
+{
+	if (!unlikely(current->flags & PF_NOFREEZE)) {
+		current->flags |= PF_FROZEN;
+		wmb();
+	}
+	clear_freeze_flag(current);
+}
+
+/* Refrigerator is place where frozen processes are stored :-). */
+void refrigerator(void)
+{
+	/* Hmm, should we be allowed to suspend when there are realtime
+	   processes around? */
+	long save;
+
+	task_lock(current);
+	if (freezing(current)) {
+		frozen_process();
+		task_unlock(current);
+	} else {
+		task_unlock(current);
+		return;
+	}
+	save = current->state;
+	pr_debug("%s entered refrigerator\n", current->comm);
+
+	spin_lock_irq(&current->sighand->siglock);
+	recalc_sigpending(); /* We sent fake signal, clean it up */
+	spin_unlock_irq(&current->sighand->siglock);
+
+	for (;;) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (!frozen(current))
+			break;
+		schedule();
+	}
+	pr_debug("%s left refrigerator\n", current->comm);
+	__set_current_state(save);
+}
+EXPORT_SYMBOL(refrigerator);
+
+static void fake_signal_wake_up(struct task_struct *p)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&p->sighand->siglock, flags);
+	signal_wake_up(p, 0);
+	spin_unlock_irqrestore(&p->sighand->siglock, flags);
+}
+
+/**
+ *	freeze_task - send a freeze request to given task
+ *	@p: task to send the request to
+ *	@sig_only: if set, the request will only be sent if the task has the
+ *		PF_FREEZER_NOSIG flag unset
+ *	Return value: 'false', if @sig_only is set and the task has
+ *		PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise
+ *
+ *	The freeze request is sent by setting the tasks's TIF_FREEZE flag and
+ *	either sending a fake signal to it or waking it up, depending on whether
+ *	or not it has PF_FREEZER_NOSIG set.  If @sig_only is set and the task
+ *	has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
+ *	TIF_FREEZE flag will not be set.
+ */
+bool freeze_task(struct task_struct *p, bool sig_only)
+{
+	/*
+	 * We first check if the task is freezing and next if it has already
+	 * been frozen to avoid the race with frozen_process() which first marks
+	 * the task as frozen and next clears its TIF_FREEZE.
+	 */
+	if (!freezing(p)) {
+		rmb();
+		if (frozen(p))
+			return false;
+
+		if (!sig_only || should_send_signal(p))
+			set_freeze_flag(p);
+		else
+			return false;
+	}
+
+	if (should_send_signal(p)) {
+		if (!signal_pending(p))
+			fake_signal_wake_up(p);
+	} else if (sig_only) {
+		return false;
+	} else {
+		wake_up_state(p, TASK_INTERRUPTIBLE);
+	}
+
+	return true;
+}
+
+void cancel_freezing(struct task_struct *p)
+{
+	unsigned long flags;
+
+	if (freezing(p)) {
+		pr_debug("  clean up: %s\n", p->comm);
+		clear_freeze_flag(p);
+		spin_lock_irqsave(&p->sighand->siglock, flags);
+		recalc_sigpending_and_wake(p);
+		spin_unlock_irqrestore(&p->sighand->siglock, flags);
+	}
+}
+
+/*
+ * Wake up a frozen process
+ *
+ * task_lock() is needed to prevent the race with refrigerator() which may
+ * occur if the freezing of tasks fails.  Namely, without the lock, if the
+ * freezing of tasks failed, thaw_tasks() might have run before a task in
+ * refrigerator() could call frozen_process(), in which case the task would be
+ * frozen and no one would thaw it.
+ */
+int __thaw_process(struct task_struct *p)
+{
+	if (frozen(p)) {
+		p->flags &= ~PF_FROZEN;
+		return 1;
+	}
+	clear_freeze_flag(p);
+	return 0;
+}
+
+int thaw_process(struct task_struct *p)
+{
+	task_lock(p);
+	if (__thaw_process(p) == 1) {
+		task_unlock(p);
+		wake_up_process(p);
+		return 1;
+	}
+	task_unlock(p);
+	return 0;
+}
+EXPORT_SYMBOL(thaw_process);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index aef265325cd3..777ac458ac99 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1371,6 +1371,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_SYMBOL(node_online_map);
 	VMCOREINFO_SYMBOL(swapper_pg_dir);
 	VMCOREINFO_SYMBOL(_stext);
+	VMCOREINFO_SYMBOL(vmlist);
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 	VMCOREINFO_SYMBOL(mem_map);
@@ -1406,6 +1407,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_OFFSET(free_area, free_list);
 	VMCOREINFO_OFFSET(list_head, next);
 	VMCOREINFO_OFFSET(list_head, prev);
+	VMCOREINFO_OFFSET(vm_struct, addr);
 	VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
 	VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
 	VMCOREINFO_NUMBER(NR_FREE_PAGES);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 96cff2f8710b..14ec64fe175a 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -171,12 +171,11 @@ EXPORT_SYMBOL(kthread_create);
  */
 void kthread_bind(struct task_struct *k, unsigned int cpu)
 {
-	if (k->state != TASK_UNINTERRUPTIBLE) {
+	/* Must have done schedule() in kthread() before we set_task_cpu */
+	if (!wait_task_inactive(k, TASK_UNINTERRUPTIBLE)) {
 		WARN_ON(1);
 		return;
 	}
-	/* Must have done schedule() in kthread() before we set_task_cpu */
-	wait_task_inactive(k, 0);
 	set_task_cpu(k, cpu);
 	k->cpus_allowed = cpumask_of_cpu(cpu);
 	k->rt.nr_cpus_allowed = 1;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 278946aecaf0..ca634019497a 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -28,121 +28,6 @@ static inline int freezeable(struct task_struct * p)
 	return 1;
 }
 
-/*
- * freezing is complete, mark current process as frozen
- */
-static inline void frozen_process(void)
-{
-	if (!unlikely(current->flags & PF_NOFREEZE)) {
-		current->flags |= PF_FROZEN;
-		wmb();
-	}
-	clear_freeze_flag(current);
-}
-
-/* Refrigerator is place where frozen processes are stored :-). */
-void refrigerator(void)
-{
-	/* Hmm, should we be allowed to suspend when there are realtime
-	   processes around? */
-	long save;
-
-	task_lock(current);
-	if (freezing(current)) {
-		frozen_process();
-		task_unlock(current);
-	} else {
-		task_unlock(current);
-		return;
-	}
-	save = current->state;
-	pr_debug("%s entered refrigerator\n", current->comm);
-
-	spin_lock_irq(&current->sighand->siglock);
-	recalc_sigpending(); /* We sent fake signal, clean it up */
-	spin_unlock_irq(&current->sighand->siglock);
-
-	for (;;) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		if (!frozen(current))
-			break;
-		schedule();
-	}
-	pr_debug("%s left refrigerator\n", current->comm);
-	__set_current_state(save);
-}
-
-static void fake_signal_wake_up(struct task_struct *p)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&p->sighand->siglock, flags);
-	signal_wake_up(p, 0);
-	spin_unlock_irqrestore(&p->sighand->siglock, flags);
-}
-
-static inline bool should_send_signal(struct task_struct *p)
-{
-	return !(p->flags & PF_FREEZER_NOSIG);
-}
-
-/**
- *	freeze_task - send a freeze request to given task
- *	@p: task to send the request to
- *	@sig_only: if set, the request will only be sent if the task has the
- *		PF_FREEZER_NOSIG flag unset
- *	Return value: 'false', if @sig_only is set and the task has
- *		PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise
- *
- *	The freeze request is sent by setting the tasks's TIF_FREEZE flag and
- *	either sending a fake signal to it or waking it up, depending on whether
- *	or not it has PF_FREEZER_NOSIG set.  If @sig_only is set and the task
- *	has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
- *	TIF_FREEZE flag will not be set.
- */
-static bool freeze_task(struct task_struct *p, bool sig_only)
-{
-	/*
-	 * We first check if the task is freezing and next if it has already
-	 * been frozen to avoid the race with frozen_process() which first marks
-	 * the task as frozen and next clears its TIF_FREEZE.
-	 */
-	if (!freezing(p)) {
-		rmb();
-		if (frozen(p))
-			return false;
-
-		if (!sig_only || should_send_signal(p))
-			set_freeze_flag(p);
-		else
-			return false;
-	}
-
-	if (should_send_signal(p)) {
-		if (!signal_pending(p))
-			fake_signal_wake_up(p);
-	} else if (sig_only) {
-		return false;
-	} else {
-		wake_up_state(p, TASK_INTERRUPTIBLE);
-	}
-
-	return true;
-}
-
-static void cancel_freezing(struct task_struct *p)
-{
-	unsigned long flags;
-
-	if (freezing(p)) {
-		pr_debug("  clean up: %s\n", p->comm);
-		clear_freeze_flag(p);
-		spin_lock_irqsave(&p->sighand->siglock, flags);
-		recalc_sigpending_and_wake(p);
-		spin_unlock_irqrestore(&p->sighand->siglock, flags);
-	}
-}
-
 static int try_to_freeze_tasks(bool sig_only)
 {
 	struct task_struct *g, *p;
@@ -250,6 +135,9 @@ static void thaw_tasks(bool nosig_only)
 		if (nosig_only && should_send_signal(p))
 			continue;
 
+		if (cgroup_frozen(p))
+			continue;
+
 		thaw_process(p);
 	} while_each_thread(g, p);
 	read_unlock(&tasklist_lock);
@@ -264,4 +152,3 @@ void thaw_processes(void)
 	printk("done.\n");
 }
 
-EXPORT_SYMBOL(refrigerator);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 356699a96d56..1e68e4c39e2c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -45,7 +45,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
  * TASK_TRACED, resume it now.
  * Requires that irqs be disabled.
  */
-void ptrace_untrace(struct task_struct *child)
+static void ptrace_untrace(struct task_struct *child)
 {
 	spin_lock(&child->sighand->siglock);
 	if (task_is_traced(child)) {
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index ca4bbbe04aa4..59236e8b9daa 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -54,9 +54,9 @@
 #include <linux/cpu.h>
 #include <linux/random.h>
 #include <linux/delay.h>
-#include <linux/byteorder/swabb.h>
 #include <linux/cpumask.h>
 #include <linux/rcupreempt_trace.h>
+#include <asm/byteorder.h>
 
 /*
  * PREEMPT_RCU data structures.
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 617d41e4d6a0..b3cc73931d1f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -833,6 +833,16 @@ static struct ctl_table kern_table[] = {
 		.proc_handler   = &proc_dointvec,
 	},
 #endif
+#ifdef CONFIG_UNEVICTABLE_LRU
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "scan_unevictable_pages",
+		.data		= &scan_unevictable_pages,
+		.maxlen		= sizeof(scan_unevictable_pages),
+		.mode		= 0644,
+		.proc_handler	= &scan_unevictable_handler,
+	},
+#endif
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 06fb57c86de0..482df94ea21e 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -316,17 +316,6 @@ int bitmap_scnprintf(char *buf, unsigned int buflen,
 EXPORT_SYMBOL(bitmap_scnprintf);
 
 /**
- * bitmap_scnprintf_len - return buffer length needed to convert
- * bitmap to an ASCII hex string
- * @nr_bits: number of bits to be converted
- */
-int bitmap_scnprintf_len(unsigned int nr_bits)
-{
-	unsigned int nr_nibbles = ALIGN(nr_bits, 4) / 4;
-	return nr_nibbles + ALIGN(nr_nibbles, CHUNKSZ / 4) / (CHUNKSZ / 4) - 1;
-}
-
-/**
  * __bitmap_parse - convert an ASCII hex string into a bitmap.
  * @buf: pointer to buffer containing string.
  * @buflen: buffer size in bytes.  If string is smaller than this
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index cceecb6a963d..a013bbc23717 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -24,6 +24,7 @@
 #include <linux/kernel.h>
 #include <linux/kallsyms.h>
 #include <linux/uaccess.h>
+#include <linux/ioport.h>
 
 #include <asm/page.h>		/* for PAGE_SIZE */
 #include <asm/div64.h>
@@ -550,18 +551,51 @@ static char *symbol_string(char *buf, char *end, void *ptr, int field_width, int
 #endif
 }
 
+static char *resource_string(char *buf, char *end, struct resource *res, int field_width, int precision, int flags)
+{
+#ifndef IO_RSRC_PRINTK_SIZE
+#define IO_RSRC_PRINTK_SIZE	4
+#endif
+
+#ifndef MEM_RSRC_PRINTK_SIZE
+#define MEM_RSRC_PRINTK_SIZE	8
+#endif
+
+	/* room for the actual numbers, the two "0x", -, [, ] and the final zero */
+	char sym[4*sizeof(resource_size_t) + 8];
+	char *p = sym, *pend = sym + sizeof(sym);
+	int size = -1;
+
+	if (res->flags & IORESOURCE_IO)
+		size = IO_RSRC_PRINTK_SIZE;
+	else if (res->flags & IORESOURCE_MEM)
+		size = MEM_RSRC_PRINTK_SIZE;
+
+	*p++ = '[';
+	p = number(p, pend, res->start, 16, size, -1, SPECIAL | SMALL | ZEROPAD);
+	*p++ = '-';
+	p = number(p, pend, res->end, 16, size, -1, SPECIAL | SMALL | ZEROPAD);
+	*p++ = ']';
+	*p = 0;
+
+	return string(buf, end, sym, field_width, precision, flags);
+}
+
 /*
  * Show a '%p' thing.  A kernel extension is that the '%p' is followed
  * by an extra set of alphanumeric characters that are extended format
  * specifiers.
  *
- * Right now we just handle 'F' (for symbolic Function descriptor pointers)
- * and 'S' (for Symbolic direct pointers), but this can easily be
- * extended in the future (network address types etc).
+ * Right now we handle:
+ *
+ * - 'F' For symbolic function descriptor pointers
+ * - 'S' For symbolic direct pointers
+ * - 'R' For a struct resource pointer, it prints the range of
+ *       addresses (not the name nor the flags)
  *
- * The difference between 'S' and 'F' is that on ia64 and ppc64 function
- * pointers are really function descriptors, which contain a pointer the
- * real address. 
+ * Note: The difference between 'S' and 'F' is that on ia64 and ppc64
+ * function pointers are really function descriptors, which contain a
+ * pointer to the real address.
  */
 static char *pointer(const char *fmt, char *buf, char *end, void *ptr, int field_width, int precision, int flags)
 {
@@ -571,6 +605,8 @@ static char *pointer(const char *fmt, char *buf, char *end, void *ptr, int field
 		/* Fallthrough */
 	case 'S':
 		return symbol_string(buf, end, ptr, field_width, precision, flags);
+	case 'R':
+		return resource_string(buf, end, ptr, field_width, precision, flags);
 	}
 	flags |= SMALL;
 	if (field_width == -1) {
@@ -590,6 +626,7 @@ static char *pointer(const char *fmt, char *buf, char *end, void *ptr, int field
  * This function follows C99 vsnprintf, but has some extensions:
  * %pS output the name of a text symbol
  * %pF output the name of a function pointer
+ * %pR output the address range in a struct resource
  *
  * The return value is the number of characters which would
  * be generated for the given input, excluding the trailing
diff --git a/mm/Kconfig b/mm/Kconfig
index 1a501a4de95c..5b5790f8a816 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -209,5 +209,16 @@ config VIRT_TO_BUS
 	def_bool y
 	depends on !ARCH_NO_VIRT_TO_BUS
 
+config UNEVICTABLE_LRU
+	bool "Add LRU list to track non-evictable pages"
+	default y
+	depends on MMU
+	help
+	  Keeps unevictable pages off of the active and inactive pageout
+	  lists, so kswapd will not waste CPU time or have its balancing
+	  algorithms thrown off by scanning these pages.  Selecting this
+	  will use one page flag and increase the code size a little,
+	  say Y unless you know what you are doing.
+
 config MMU_NOTIFIER
 	bool
diff --git a/mm/Makefile b/mm/Makefile
index da4ccf015aea..c06b45a1ff5f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -33,5 +33,4 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
-obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
-
+obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
diff --git a/mm/filemap.c b/mm/filemap.c
index 903bf316912a..ab8553658af3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
 #include <linux/cpuset.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
+#include <linux/mm_inline.h> /* for page_is_file_cache() */
 #include "internal.h"
 
 /*
@@ -115,12 +116,12 @@ void __remove_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 
-	mem_cgroup_uncharge_cache_page(page);
 	radix_tree_delete(&mapping->page_tree, page->index);
 	page->mapping = NULL;
 	mapping->nrpages--;
 	__dec_zone_page_state(page, NR_FILE_PAGES);
 	BUG_ON(page_mapped(page));
+	mem_cgroup_uncharge_cache_page(page);
 
 	/*
 	 * Some filesystems seem to re-dirty the page even after
@@ -492,9 +493,24 @@ EXPORT_SYMBOL(add_to_page_cache_locked);
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 				pgoff_t offset, gfp_t gfp_mask)
 {
-	int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
-	if (ret == 0)
-		lru_cache_add(page);
+	int ret;
+
+	/*
+	 * Splice_read and readahead add shmem/tmpfs pages into the page cache
+	 * before shmem_readpage has a chance to mark them as SwapBacked: they
+	 * need to go on the active_anon lru below, and mem_cgroup_cache_charge
+	 * (called in add_to_page_cache) needs to know where they're going too.
+	 */
+	if (mapping_cap_swap_backed(mapping))
+		SetPageSwapBacked(page);
+
+	ret = add_to_page_cache(page, mapping, offset, gfp_mask);
+	if (ret == 0) {
+		if (page_is_file_cache(page))
+			lru_cache_add_file(page);
+		else
+			lru_cache_add_active_anon(page);
+	}
 	return ret;
 }
 
@@ -557,17 +573,14 @@ EXPORT_SYMBOL(wait_on_page_bit);
  * mechananism between PageLocked pages and PageWriteback pages is shared.
  * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
  *
- * The first mb is necessary to safely close the critical section opened by the
- * test_and_set_bit() to lock the page; the second mb is necessary to enforce
- * ordering between the clear_bit and the read of the waitqueue (to avoid SMP
- * races with a parallel wait_on_page_locked()).
+ * The mb is necessary to enforce ordering between the clear_bit and the read
+ * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
  */
 void unlock_page(struct page *page)
 {
-	smp_mb__before_clear_bit();
-	if (!test_and_clear_bit(PG_locked, &page->flags))
-		BUG();
-	smp_mb__after_clear_bit(); 
+	VM_BUG_ON(!PageLocked(page));
+	clear_bit_unlock(PG_locked, &page->flags);
+	smp_mb__after_clear_bit();
 	wake_up_page(page, PG_locked);
 }
 EXPORT_SYMBOL(unlock_page);
diff --git a/mm/fremap.c b/mm/fremap.c
index 7881638e4a12..7d12ca70ef7b 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -21,6 +21,8 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
+#include "internal.h"
+
 static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long addr, pte_t *ptep)
 {
@@ -215,15 +217,31 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
 		spin_unlock(&mapping->i_mmap_lock);
 	}
 
+	if (vma->vm_flags & VM_LOCKED) {
+		/*
+		 * drop PG_Mlocked flag for over-mapped range
+		 */
+		unsigned int saved_flags = vma->vm_flags;
+		munlock_vma_pages_range(vma, start, start + size);
+		vma->vm_flags = saved_flags;
+	}
+
 	mmu_notifier_invalidate_range_start(mm, start, start + size);
 	err = populate_range(mm, vma, start, size, pgoff);
 	mmu_notifier_invalidate_range_end(mm, start, start + size);
 	if (!err && !(flags & MAP_NONBLOCK)) {
-		if (unlikely(has_write_lock)) {
-			downgrade_write(&mm->mmap_sem);
-			has_write_lock = 0;
+		if (vma->vm_flags & VM_LOCKED) {
+			/*
+			 * might be mapping previously unmapped range of file
+			 */
+			mlock_vma_pages_range(vma, start, start + size);
+		} else {
+			if (unlikely(has_write_lock)) {
+				downgrade_write(&mm->mmap_sem);
+				has_write_lock = 0;
+			}
+			make_pages_present(start, start+size);
 		}
-		make_pages_present(start, start+size);
 	}
 
 	/*
@@ -240,4 +258,3 @@ out:
 
 	return err;
 }
-
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 38633864a93e..ce8cbb29860b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -262,7 +262,7 @@ struct resv_map {
 	struct list_head regions;
 };
 
-struct resv_map *resv_map_alloc(void)
+static struct resv_map *resv_map_alloc(void)
 {
 	struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
 	if (!resv_map)
@@ -274,7 +274,7 @@ struct resv_map *resv_map_alloc(void)
 	return resv_map;
 }
 
-void resv_map_release(struct kref *ref)
+static void resv_map_release(struct kref *ref)
 {
 	struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
 
@@ -289,7 +289,7 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
 	if (!(vma->vm_flags & VM_SHARED))
 		return (struct resv_map *)(get_vma_private_data(vma) &
 							~HPAGE_RESV_MASK);
-	return 0;
+	return NULL;
 }
 
 static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
@@ -1459,11 +1459,11 @@ int hugetlb_report_meminfo(char *buf)
 {
 	struct hstate *h = &default_hstate;
 	return sprintf(buf,
-			"HugePages_Total: %5lu\n"
-			"HugePages_Free:  %5lu\n"
-			"HugePages_Rsvd:  %5lu\n"
-			"HugePages_Surp:  %5lu\n"
-			"Hugepagesize:    %5lu kB\n",
+			"HugePages_Total:   %5lu\n"
+			"HugePages_Free:    %5lu\n"
+			"HugePages_Rsvd:    %5lu\n"
+			"HugePages_Surp:    %5lu\n"
+			"Hugepagesize:   %8lu kB\n",
 			h->nr_huge_pages,
 			h->free_huge_pages,
 			h->resv_huge_pages,
@@ -1747,10 +1747,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
  * from other VMAs and let the children be SIGKILLed if they are faulting the
  * same region.
  */
-int unmap_ref_private(struct mm_struct *mm,
-					struct vm_area_struct *vma,
-					struct page *page,
-					unsigned long address)
+static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
+				struct page *page, unsigned long address)
 {
 	struct vm_area_struct *iter_vma;
 	struct address_space *mapping;
@@ -2073,6 +2071,14 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
 	return NULL;
 }
 
+static int huge_zeropage_ok(pte_t *ptep, int write, int shared)
+{
+	if (!ptep || write || shared)
+		return 0;
+	else
+		return huge_pte_none(huge_ptep_get(ptep));
+}
+
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			struct page **pages, struct vm_area_struct **vmas,
 			unsigned long *position, int *length, int i,
@@ -2082,6 +2088,8 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long vaddr = *position;
 	int remainder = *length;
 	struct hstate *h = hstate_vma(vma);
+	int zeropage_ok = 0;
+	int shared = vma->vm_flags & VM_SHARED;
 
 	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
@@ -2094,8 +2102,11 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * first, for the page indexing below to work.
 		 */
 		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
+		if (huge_zeropage_ok(pte, write, shared))
+			zeropage_ok = 1;
 
-		if (!pte || huge_pte_none(huge_ptep_get(pte)) ||
+		if (!pte ||
+		    (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) ||
 		    (write && !pte_write(huge_ptep_get(pte)))) {
 			int ret;
 
@@ -2115,8 +2126,11 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		page = pte_page(huge_ptep_get(pte));
 same_page:
 		if (pages) {
-			get_page(page);
-			pages[i] = page + pfn_offset;
+			if (zeropage_ok)
+				pages[i] = ZERO_PAGE(0);
+			else
+				pages[i] = page + pfn_offset;
+			get_page(pages[i]);
 		}
 
 		if (vmas)
diff --git a/mm/internal.h b/mm/internal.h
index 1f43f7416972..e4e728bdf324 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -39,6 +39,15 @@ static inline void __put_page(struct page *page)
 	atomic_dec(&page->_count);
 }
 
+/*
+ * in mm/vmscan.c:
+ */
+extern int isolate_lru_page(struct page *page);
+extern void putback_lru_page(struct page *page);
+
+/*
+ * in mm/page_alloc.c
+ */
 extern void __free_pages_bootmem(struct page *page, unsigned int order);
 
 /*
@@ -52,6 +61,120 @@ static inline unsigned long page_order(struct page *page)
 	return page_private(page);
 }
 
+extern long mlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end);
+extern void munlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end);
+static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
+{
+	munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
+}
+
+#ifdef CONFIG_UNEVICTABLE_LRU
+/*
+ * unevictable_migrate_page() called only from migrate_page_copy() to
+ * migrate unevictable flag to new page.
+ * Note that the old page has been isolated from the LRU lists at this
+ * point so we don't need to worry about LRU statistics.
+ */
+static inline void unevictable_migrate_page(struct page *new, struct page *old)
+{
+	if (TestClearPageUnevictable(old))
+		SetPageUnevictable(new);
+}
+#else
+static inline void unevictable_migrate_page(struct page *new, struct page *old)
+{
+}
+#endif
+
+#ifdef CONFIG_UNEVICTABLE_LRU
+/*
+ * Called only in fault path via page_evictable() for a new page
+ * to determine if it's being mapped into a LOCKED vma.
+ * If so, mark page as mlocked.
+ */
+static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page)
+{
+	VM_BUG_ON(PageLRU(page));
+
+	if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
+		return 0;
+
+	if (!TestSetPageMlocked(page)) {
+		inc_zone_page_state(page, NR_MLOCK);
+		count_vm_event(UNEVICTABLE_PGMLOCKED);
+	}
+	return 1;
+}
+
+/*
+ * must be called with vma's mmap_sem held for read, and page locked.
+ */
+extern void mlock_vma_page(struct page *page);
+
+/*
+ * Clear the page's PageMlocked().  This can be useful in a situation where
+ * we want to unconditionally remove a page from the pagecache -- e.g.,
+ * on truncation or freeing.
+ *
+ * It is legal to call this function for any page, mlocked or not.
+ * If called for a page that is still mapped by mlocked vmas, all we do
+ * is revert to lazy LRU behaviour -- semantics are not broken.
+ */
+extern void __clear_page_mlock(struct page *page);
+static inline void clear_page_mlock(struct page *page)
+{
+	if (unlikely(TestClearPageMlocked(page)))
+		__clear_page_mlock(page);
+}
+
+/*
+ * mlock_migrate_page - called only from migrate_page_copy() to
+ * migrate the Mlocked page flag; update statistics.
+ */
+static inline void mlock_migrate_page(struct page *newpage, struct page *page)
+{
+	if (TestClearPageMlocked(page)) {
+		unsigned long flags;
+
+		local_irq_save(flags);
+		__dec_zone_page_state(page, NR_MLOCK);
+		SetPageMlocked(newpage);
+		__inc_zone_page_state(newpage, NR_MLOCK);
+		local_irq_restore(flags);
+	}
+}
+
+/*
+ * free_page_mlock() -- clean up attempts to free and mlocked() page.
+ * Page should not be on lru, so no need to fix that up.
+ * free_pages_check() will verify...
+ */
+static inline void free_page_mlock(struct page *page)
+{
+	if (unlikely(TestClearPageMlocked(page))) {
+		unsigned long flags;
+
+		local_irq_save(flags);
+		__dec_zone_page_state(page, NR_MLOCK);
+		__count_vm_event(UNEVICTABLE_MLOCKFREED);
+		local_irq_restore(flags);
+	}
+}
+
+#else /* CONFIG_UNEVICTABLE_LRU */
+static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
+{
+	return 0;
+}
+static inline void clear_page_mlock(struct page *page) { }
+static inline void mlock_vma_page(struct page *page) { }
+static inline void mlock_migrate_page(struct page *new, struct page *old) { }
+static inline void free_page_mlock(struct page *page) { }
+
+#endif /* CONFIG_UNEVICTABLE_LRU */
+
 /*
  * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
  * so all functions starting at paging_init should be marked __init
@@ -120,4 +243,12 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
 }
 #endif /* CONFIG_SPARSEMEM */
 
+#define GUP_FLAGS_WRITE                  0x1
+#define GUP_FLAGS_FORCE                  0x2
+#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
+
+int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		     unsigned long start, int len, int flags,
+		     struct page **pages, struct vm_area_struct **vmas);
+
 #endif
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 36896f3eb7f5..d4a92b63e98e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -32,11 +32,12 @@
 #include <linux/fs.h>
 #include <linux/seq_file.h>
 #include <linux/vmalloc.h>
+#include <linux/mm_inline.h>
+#include <linux/page_cgroup.h>
 
 #include <asm/uaccess.h>
 
 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
-static struct kmem_cache *page_cgroup_cache __read_mostly;
 #define MEM_CGROUP_RECLAIM_RETRIES	5
 
 /*
@@ -65,11 +66,10 @@ struct mem_cgroup_stat {
 /*
  * For accounting under irq disable, no need for increment preempt count.
  */
-static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat,
+static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
 		enum mem_cgroup_stat_index idx, int val)
 {
-	int cpu = smp_processor_id();
-	stat->cpustat[cpu].count[idx] += val;
+	stat->count[idx] += val;
 }
 
 static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
@@ -85,22 +85,13 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
 /*
  * per-zone information in memory controller.
  */
-
-enum mem_cgroup_zstat_index {
-	MEM_CGROUP_ZSTAT_ACTIVE,
-	MEM_CGROUP_ZSTAT_INACTIVE,
-
-	NR_MEM_CGROUP_ZSTAT,
-};
-
 struct mem_cgroup_per_zone {
 	/*
 	 * spin_lock to protect the per cgroup LRU
 	 */
 	spinlock_t		lru_lock;
-	struct list_head	active_list;
-	struct list_head	inactive_list;
-	unsigned long count[NR_MEM_CGROUP_ZSTAT];
+	struct list_head	lists[NR_LRU_LISTS];
+	unsigned long		count[NR_LRU_LISTS];
 };
 /* Macro for accessing counter */
 #define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
@@ -144,69 +135,52 @@ struct mem_cgroup {
 };
 static struct mem_cgroup init_mem_cgroup;
 
-/*
- * We use the lower bit of the page->page_cgroup pointer as a bit spin
- * lock.  We need to ensure that page->page_cgroup is at least two
- * byte aligned (based on comments from Nick Piggin).  But since
- * bit_spin_lock doesn't actually set that lock bit in a non-debug
- * uniprocessor kernel, we should avoid setting it here too.
- */
-#define PAGE_CGROUP_LOCK_BIT 	0x0
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
-#define PAGE_CGROUP_LOCK 	(1 << PAGE_CGROUP_LOCK_BIT)
-#else
-#define PAGE_CGROUP_LOCK	0x0
-#endif
-
-/*
- * A page_cgroup page is associated with every page descriptor. The
- * page_cgroup helps us identify information about the cgroup
- */
-struct page_cgroup {
-	struct list_head lru;		/* per cgroup LRU list */
-	struct page *page;
-	struct mem_cgroup *mem_cgroup;
-	int flags;
-};
-#define PAGE_CGROUP_FLAG_CACHE	(0x1)	/* charged as cache */
-#define PAGE_CGROUP_FLAG_ACTIVE (0x2)	/* page is active in this cgroup */
-
-static int page_cgroup_nid(struct page_cgroup *pc)
-{
-	return page_to_nid(pc->page);
-}
-
-static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
-{
-	return page_zonenum(pc->page);
-}
-
 enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 	MEM_CGROUP_CHARGE_TYPE_MAPPED,
+	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
 	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
+	NR_CHARGE_TYPE,
+};
+
+/* only for here (for easy reading.) */
+#define PCGF_CACHE	(1UL << PCG_CACHE)
+#define PCGF_USED	(1UL << PCG_USED)
+#define PCGF_ACTIVE	(1UL << PCG_ACTIVE)
+#define PCGF_LOCK	(1UL << PCG_LOCK)
+#define PCGF_FILE	(1UL << PCG_FILE)
+static const unsigned long
+pcg_default_flags[NR_CHARGE_TYPE] = {
+	PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */
+	PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */
+	PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
+	0, /* FORCE */
 };
 
 /*
  * Always modified under lru lock. Then, not necessary to preempt_disable()
  */
-static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags,
-					bool charge)
+static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
+					 struct page_cgroup *pc,
+					 bool charge)
 {
 	int val = (charge)? 1 : -1;
 	struct mem_cgroup_stat *stat = &mem->stat;
+	struct mem_cgroup_stat_cpu *cpustat;
 
 	VM_BUG_ON(!irqs_disabled());
-	if (flags & PAGE_CGROUP_FLAG_CACHE)
-		__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val);
+
+	cpustat = &stat->cpustat[smp_processor_id()];
+	if (PageCgroupCache(pc))
+		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
 	else
-		__mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val);
+		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);
 
 	if (charge)
-		__mem_cgroup_stat_add_safe(stat,
+		__mem_cgroup_stat_add_safe(cpustat,
 				MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
 	else
-		__mem_cgroup_stat_add_safe(stat,
+		__mem_cgroup_stat_add_safe(cpustat,
 				MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
 }
 
@@ -227,7 +201,7 @@ page_cgroup_zoneinfo(struct page_cgroup *pc)
 }
 
 static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
-					enum mem_cgroup_zstat_index idx)
+					enum lru_list idx)
 {
 	int nid, zid;
 	struct mem_cgroup_per_zone *mz;
@@ -262,85 +236,77 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 				struct mem_cgroup, css);
 }
 
-static inline int page_cgroup_locked(struct page *page)
-{
-	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
-
-static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
-{
-	VM_BUG_ON(!page_cgroup_locked(page));
-	page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
-}
-
-struct page_cgroup *page_get_page_cgroup(struct page *page)
-{
-	return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
-}
-
-static void lock_page_cgroup(struct page *page)
-{
-	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
-
-static int try_lock_page_cgroup(struct page *page)
-{
-	return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
-
-static void unlock_page_cgroup(struct page *page)
-{
-	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
-
 static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
 			struct page_cgroup *pc)
 {
-	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
+	int lru = LRU_BASE;
+
+	if (PageCgroupUnevictable(pc))
+		lru = LRU_UNEVICTABLE;
+	else {
+		if (PageCgroupActive(pc))
+			lru += LRU_ACTIVE;
+		if (PageCgroupFile(pc))
+			lru += LRU_FILE;
+	}
 
-	if (from)
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
-	else
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
+	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
 
-	mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false);
+	mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false);
 	list_del(&pc->lru);
 }
 
 static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
 				struct page_cgroup *pc)
 {
-	int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
-
-	if (!to) {
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
-		list_add(&pc->lru, &mz->inactive_list);
-	} else {
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
-		list_add(&pc->lru, &mz->active_list);
+	int lru = LRU_BASE;
+
+	if (PageCgroupUnevictable(pc))
+		lru = LRU_UNEVICTABLE;
+	else {
+		if (PageCgroupActive(pc))
+			lru += LRU_ACTIVE;
+		if (PageCgroupFile(pc))
+			lru += LRU_FILE;
 	}
-	mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true);
+
+	MEM_CGROUP_ZSTAT(mz, lru) += 1;
+	list_add(&pc->lru, &mz->lists[lru]);
+
+	mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true);
 }
 
-static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
+static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru)
 {
-	int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
 	struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
+	int active    = PageCgroupActive(pc);
+	int file      = PageCgroupFile(pc);
+	int unevictable = PageCgroupUnevictable(pc);
+	enum lru_list from = unevictable ? LRU_UNEVICTABLE :
+				(LRU_FILE * !!file + !!active);
 
-	if (from)
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
-	else
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
+	if (lru == from)
+		return;
 
-	if (active) {
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
-		pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
-		list_move(&pc->lru, &mz->active_list);
+	MEM_CGROUP_ZSTAT(mz, from) -= 1;
+	/*
+	 * However this is done under mz->lru_lock, another flags, which
+	 * are not related to LRU, will be modified from out-of-lock.
+	 * We have to use atomic set/clear flags.
+	 */
+	if (is_unevictable_lru(lru)) {
+		ClearPageCgroupActive(pc);
+		SetPageCgroupUnevictable(pc);
 	} else {
-		MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
-		pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;
-		list_move(&pc->lru, &mz->inactive_list);
+		if (is_active_lru(lru))
+			SetPageCgroupActive(pc);
+		else
+			ClearPageCgroupActive(pc);
+		ClearPageCgroupUnevictable(pc);
 	}
+
+	MEM_CGROUP_ZSTAT(mz, lru) += 1;
+	list_move(&pc->lru, &mz->lists[lru]);
 }
 
 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
@@ -356,7 +322,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 /*
  * This routine assumes that the appropriate zone's lru lock is already held
  */
-void mem_cgroup_move_lists(struct page *page, bool active)
+void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
 {
 	struct page_cgroup *pc;
 	struct mem_cgroup_per_zone *mz;
@@ -372,17 +338,16 @@ void mem_cgroup_move_lists(struct page *page, bool active)
 	 * safely get to page_cgroup without it, so just try_lock it:
 	 * mem_cgroup_isolate_pages allows for page left on wrong list.
 	 */
-	if (!try_lock_page_cgroup(page))
+	pc = lookup_page_cgroup(page);
+	if (!trylock_page_cgroup(pc))
 		return;
-
-	pc = page_get_page_cgroup(page);
-	if (pc) {
+	if (pc && PageCgroupUsed(pc)) {
 		mz = page_cgroup_zoneinfo(pc);
 		spin_lock_irqsave(&mz->lru_lock, flags);
-		__mem_cgroup_move_lists(pc, active);
+		__mem_cgroup_move_lists(pc, lru);
 		spin_unlock_irqrestore(&mz->lru_lock, flags);
 	}
-	unlock_page_cgroup(page);
+	unlock_page_cgroup(pc);
 }
 
 /*
@@ -403,21 +368,6 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
 }
 
 /*
- * This function is called from vmscan.c. In page reclaiming loop. balance
- * between active and inactive list is calculated. For memory controller
- * page reclaiming, we should use using mem_cgroup's imbalance rather than
- * zone's global lru imbalance.
- */
-long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem)
-{
-	unsigned long active, inactive;
-	/* active and inactive are the number of pages. 'long' is ok.*/
-	active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE);
-	inactive = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_INACTIVE);
-	return (long) (active / (inactive + 1));
-}
-
-/*
  * prev_priority control...this will be used in memory reclaim path.
  */
 int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
@@ -444,28 +394,17 @@ void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
  * (see include/linux/mmzone.h)
  */
 
-long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem,
-				   struct zone *zone, int priority)
+long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
+					int priority, enum lru_list lru)
 {
-	long nr_active;
+	long nr_pages;
 	int nid = zone->zone_pgdat->node_id;
 	int zid = zone_idx(zone);
 	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
 
-	nr_active = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE);
-	return (nr_active >> priority);
-}
+	nr_pages = MEM_CGROUP_ZSTAT(mz, lru);
 
-long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
-					struct zone *zone, int priority)
-{
-	long nr_inactive;
-	int nid = zone->zone_pgdat->node_id;
-	int zid = zone_idx(zone);
-	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
-
-	nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE);
-	return (nr_inactive >> priority);
+	return (nr_pages >> priority);
 }
 
 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
@@ -473,7 +412,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 					unsigned long *scanned, int order,
 					int mode, struct zone *z,
 					struct mem_cgroup *mem_cont,
-					int active)
+					int active, int file)
 {
 	unsigned long nr_taken = 0;
 	struct page *page;
@@ -484,38 +423,38 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 	int nid = z->zone_pgdat->node_id;
 	int zid = zone_idx(z);
 	struct mem_cgroup_per_zone *mz;
+	int lru = LRU_FILE * !!file + !!active;
 
 	BUG_ON(!mem_cont);
 	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
-	if (active)
-		src = &mz->active_list;
-	else
-		src = &mz->inactive_list;
-
+	src = &mz->lists[lru];
 
 	spin_lock(&mz->lru_lock);
 	scan = 0;
 	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
 		if (scan >= nr_to_scan)
 			break;
+		if (unlikely(!PageCgroupUsed(pc)))
+			continue;
 		page = pc->page;
 
 		if (unlikely(!PageLRU(page)))
 			continue;
 
-		if (PageActive(page) && !active) {
-			__mem_cgroup_move_lists(pc, true);
-			continue;
-		}
-		if (!PageActive(page) && active) {
-			__mem_cgroup_move_lists(pc, false);
+		/*
+		 * TODO: play better with lumpy reclaim, grabbing anything.
+		 */
+		if (PageUnevictable(page) ||
+		    (PageActive(page) && !active) ||
+		    (!PageActive(page) && active)) {
+			__mem_cgroup_move_lists(pc, page_lru(page));
 			continue;
 		}
 
 		scan++;
 		list_move(&pc->lru, &pc_list);
 
-		if (__isolate_lru_page(page, mode) == 0) {
+		if (__isolate_lru_page(page, mode, file) == 0) {
 			list_move(&page->lru, dst);
 			nr_taken++;
 		}
@@ -540,26 +479,27 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 {
 	struct mem_cgroup *mem;
 	struct page_cgroup *pc;
-	unsigned long flags;
 	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct mem_cgroup_per_zone *mz;
+	unsigned long flags;
 
-	pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
-	if (unlikely(pc == NULL))
-		goto err;
-
+	pc = lookup_page_cgroup(page);
+	/* can happen at boot */
+	if (unlikely(!pc))
+		return 0;
+	prefetchw(pc);
 	/*
 	 * We always charge the cgroup the mm_struct belongs to.
 	 * The mm_struct's mem_cgroup changes on task migration if the
 	 * thread group leader migrates. It's possible that mm is not
 	 * set, if so charge the init_mm (happens for pagecache usage).
 	 */
+
 	if (likely(!memcg)) {
 		rcu_read_lock();
 		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
 		if (unlikely(!mem)) {
 			rcu_read_unlock();
-			kmem_cache_free(page_cgroup_cache, pc);
 			return 0;
 		}
 		/*
@@ -572,7 +512,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 		css_get(&memcg->css);
 	}
 
-	while (res_counter_charge(&mem->res, PAGE_SIZE)) {
+	while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
 		if (!(gfp_mask & __GFP_WAIT))
 			goto out;
 
@@ -595,39 +535,33 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 		}
 	}
 
-	pc->mem_cgroup = mem;
-	pc->page = page;
-	/*
-	 * If a page is accounted as a page cache, insert to inactive list.
-	 * If anon, insert to active list.
-	 */
-	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
-		pc->flags = PAGE_CGROUP_FLAG_CACHE;
-	else
-		pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
 
-	lock_page_cgroup(page);
-	if (unlikely(page_get_page_cgroup(page))) {
-		unlock_page_cgroup(page);
+	lock_page_cgroup(pc);
+	if (unlikely(PageCgroupUsed(pc))) {
+		unlock_page_cgroup(pc);
 		res_counter_uncharge(&mem->res, PAGE_SIZE);
 		css_put(&mem->css);
-		kmem_cache_free(page_cgroup_cache, pc);
+
 		goto done;
 	}
-	page_assign_page_cgroup(page, pc);
+	pc->mem_cgroup = mem;
+	/*
+	 * If a page is accounted as a page cache, insert to inactive list.
+	 * If anon, insert to active list.
+	 */
+	pc->flags = pcg_default_flags[ctype];
 
 	mz = page_cgroup_zoneinfo(pc);
+
 	spin_lock_irqsave(&mz->lru_lock, flags);
 	__mem_cgroup_add_list(mz, pc);
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
+	unlock_page_cgroup(pc);
 
-	unlock_page_cgroup(page);
 done:
 	return 0;
 out:
 	css_put(&mem->css);
-	kmem_cache_free(page_cgroup_cache, pc);
-err:
 	return -ENOMEM;
 }
 
@@ -635,7 +569,8 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
 {
 	if (mem_cgroup_subsys.disabled)
 		return 0;
-
+	if (PageCompound(page))
+		return 0;
 	/*
 	 * If already mapped, we don't have to account.
 	 * If page cache, page->mapping has address_space.
@@ -656,7 +591,8 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 {
 	if (mem_cgroup_subsys.disabled)
 		return 0;
-
+	if (PageCompound(page))
+		return 0;
 	/*
 	 * Corner case handling. This is called from add_to_page_cache()
 	 * in usual. But some FS (shmem) precharges this page before calling it
@@ -669,22 +605,27 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 	if (!(gfp_mask & __GFP_WAIT)) {
 		struct page_cgroup *pc;
 
-		lock_page_cgroup(page);
-		pc = page_get_page_cgroup(page);
-		if (pc) {
-			VM_BUG_ON(pc->page != page);
-			VM_BUG_ON(!pc->mem_cgroup);
-			unlock_page_cgroup(page);
+
+		pc = lookup_page_cgroup(page);
+		if (!pc)
+			return 0;
+		lock_page_cgroup(pc);
+		if (PageCgroupUsed(pc)) {
+			unlock_page_cgroup(pc);
 			return 0;
 		}
-		unlock_page_cgroup(page);
+		unlock_page_cgroup(pc);
 	}
 
 	if (unlikely(!mm))
 		mm = &init_mm;
 
-	return mem_cgroup_charge_common(page, mm, gfp_mask,
+	if (page_is_file_cache(page))
+		return mem_cgroup_charge_common(page, mm, gfp_mask,
 				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
+	else
+		return mem_cgroup_charge_common(page, mm, gfp_mask,
+				MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
 }
 
 /*
@@ -704,44 +645,46 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	/*
 	 * Check if our page_cgroup is valid
 	 */
-	lock_page_cgroup(page);
-	pc = page_get_page_cgroup(page);
-	if (unlikely(!pc))
-		goto unlock;
-
-	VM_BUG_ON(pc->page != page);
+	pc = lookup_page_cgroup(page);
+	if (unlikely(!pc || !PageCgroupUsed(pc)))
+		return;
 
-	if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
-	    && ((pc->flags & PAGE_CGROUP_FLAG_CACHE)
-		|| page_mapped(page)))
-		goto unlock;
+	lock_page_cgroup(pc);
+	if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page))
+	     || !PageCgroupUsed(pc)) {
+		/* This happens at race in zap_pte_range() and do_swap_page()*/
+		unlock_page_cgroup(pc);
+		return;
+	}
+	ClearPageCgroupUsed(pc);
+	mem = pc->mem_cgroup;
 
 	mz = page_cgroup_zoneinfo(pc);
 	spin_lock_irqsave(&mz->lru_lock, flags);
 	__mem_cgroup_remove_list(mz, pc);
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
+	unlock_page_cgroup(pc);
 
-	page_assign_page_cgroup(page, NULL);
-	unlock_page_cgroup(page);
-
-	mem = pc->mem_cgroup;
 	res_counter_uncharge(&mem->res, PAGE_SIZE);
 	css_put(&mem->css);
 
-	kmem_cache_free(page_cgroup_cache, pc);
 	return;
-unlock:
-	unlock_page_cgroup(page);
 }
 
 void mem_cgroup_uncharge_page(struct page *page)
 {
+	/* early check. */
+	if (page_mapped(page))
+		return;
+	if (page->mapping && !PageAnon(page))
+		return;
 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
 }
 
 void mem_cgroup_uncharge_cache_page(struct page *page)
 {
 	VM_BUG_ON(page_mapped(page));
+	VM_BUG_ON(page->mapping);
 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
 }
 
@@ -758,15 +701,19 @@ int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
 	if (mem_cgroup_subsys.disabled)
 		return 0;
 
-	lock_page_cgroup(page);
-	pc = page_get_page_cgroup(page);
-	if (pc) {
+	pc = lookup_page_cgroup(page);
+	lock_page_cgroup(pc);
+	if (PageCgroupUsed(pc)) {
 		mem = pc->mem_cgroup;
 		css_get(&mem->css);
-		if (pc->flags & PAGE_CGROUP_FLAG_CACHE)
-			ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
+		if (PageCgroupCache(pc)) {
+			if (page_is_file_cache(page))
+				ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
+			else
+				ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
+		}
 	}
-	unlock_page_cgroup(page);
+	unlock_page_cgroup(pc);
 	if (mem) {
 		ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
 			ctype, mem);
@@ -791,7 +738,7 @@ void mem_cgroup_end_migration(struct page *newpage)
 	 */
 	if (!newpage->mapping)
 		__mem_cgroup_uncharge_common(newpage,
-					 MEM_CGROUP_CHARGE_TYPE_FORCE);
+				MEM_CGROUP_CHARGE_TYPE_FORCE);
 	else if (PageAnon(newpage))
 		mem_cgroup_uncharge_page(newpage);
 }
@@ -863,7 +810,7 @@ int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
 #define FORCE_UNCHARGE_BATCH	(128)
 static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
 			    struct mem_cgroup_per_zone *mz,
-			    int active)
+			    enum lru_list lru)
 {
 	struct page_cgroup *pc;
 	struct page *page;
@@ -871,15 +818,14 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
 	unsigned long flags;
 	struct list_head *list;
 
-	if (active)
-		list = &mz->active_list;
-	else
-		list = &mz->inactive_list;
+	list = &mz->lists[lru];
 
 	spin_lock_irqsave(&mz->lru_lock, flags);
 	while (!list_empty(list)) {
 		pc = list_entry(list->prev, struct page_cgroup, lru);
 		page = pc->page;
+		if (!PageCgroupUsed(pc))
+			break;
 		get_page(page);
 		spin_unlock_irqrestore(&mz->lru_lock, flags);
 		/*
@@ -894,8 +840,10 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
 				count = FORCE_UNCHARGE_BATCH;
 				cond_resched();
 			}
-		} else
-			cond_resched();
+		} else {
+			spin_lock_irqsave(&mz->lru_lock, flags);
+			break;
+		}
 		spin_lock_irqsave(&mz->lru_lock, flags);
 	}
 	spin_unlock_irqrestore(&mz->lru_lock, flags);
@@ -919,15 +867,17 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem)
 	while (mem->res.usage > 0) {
 		if (atomic_read(&mem->css.cgroup->count) > 0)
 			goto out;
+		/* This is for making all *used* pages to be on LRU. */
+		lru_add_drain_all();
 		for_each_node_state(node, N_POSSIBLE)
 			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
 				struct mem_cgroup_per_zone *mz;
+				enum lru_list l;
 				mz = mem_cgroup_zoneinfo(mem, node, zid);
-				/* drop all page_cgroup in active_list */
-				mem_cgroup_force_empty_list(mem, mz, 1);
-				/* drop all page_cgroup in inactive_list */
-				mem_cgroup_force_empty_list(mem, mz, 0);
+				for_each_lru(l)
+					mem_cgroup_force_empty_list(mem, mz, l);
 			}
+		cond_resched();
 	}
 	ret = 0;
 out:
@@ -1012,14 +962,27 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
 	}
 	/* showing # of active pages */
 	{
-		unsigned long active, inactive;
-
-		inactive = mem_cgroup_get_all_zonestat(mem_cont,
-						MEM_CGROUP_ZSTAT_INACTIVE);
-		active = mem_cgroup_get_all_zonestat(mem_cont,
-						MEM_CGROUP_ZSTAT_ACTIVE);
-		cb->fill(cb, "active", (active) * PAGE_SIZE);
-		cb->fill(cb, "inactive", (inactive) * PAGE_SIZE);
+		unsigned long active_anon, inactive_anon;
+		unsigned long active_file, inactive_file;
+		unsigned long unevictable;
+
+		inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
+						LRU_INACTIVE_ANON);
+		active_anon = mem_cgroup_get_all_zonestat(mem_cont,
+						LRU_ACTIVE_ANON);
+		inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
+						LRU_INACTIVE_FILE);
+		active_file = mem_cgroup_get_all_zonestat(mem_cont,
+						LRU_ACTIVE_FILE);
+		unevictable = mem_cgroup_get_all_zonestat(mem_cont,
+							LRU_UNEVICTABLE);
+
+		cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
+		cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
+		cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
+		cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
+		cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);
+
 	}
 	return 0;
 }
@@ -1062,6 +1025,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 {
 	struct mem_cgroup_per_node *pn;
 	struct mem_cgroup_per_zone *mz;
+	enum lru_list l;
 	int zone, tmp = node;
 	/*
 	 * This routine is called against possible nodes.
@@ -1082,9 +1046,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 		mz = &pn->zoneinfo[zone];
-		INIT_LIST_HEAD(&mz->active_list);
-		INIT_LIST_HEAD(&mz->inactive_list);
 		spin_lock_init(&mz->lru_lock);
+		for_each_lru(l)
+			INIT_LIST_HEAD(&mz->lists[l]);
 	}
 	return 0;
 }
@@ -1124,8 +1088,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	int node;
 
 	if (unlikely((cont->parent) == NULL)) {
+		page_cgroup_init();
 		mem = &init_mem_cgroup;
-		page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
 	} else {
 		mem = mem_cgroup_alloc();
 		if (!mem)
diff --git a/mm/memory.c b/mm/memory.c
index 1002f473f497..3a6c4a658325 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -64,6 +64,8 @@
 
 #include "internal.h"
 
+#include "internal.h"
+
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 /* use the per-pgdat data instead for discontigmem - mbligh */
 unsigned long max_mapnr;
@@ -1129,12 +1131,17 @@ static inline int use_zero_page(struct vm_area_struct *vma)
 	return !vma->vm_ops || !vma->vm_ops->fault;
 }
 
-int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-		unsigned long start, int len, int write, int force,
+
+
+int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		     unsigned long start, int len, int flags,
 		struct page **pages, struct vm_area_struct **vmas)
 {
 	int i;
-	unsigned int vm_flags;
+	unsigned int vm_flags = 0;
+	int write = !!(flags & GUP_FLAGS_WRITE);
+	int force = !!(flags & GUP_FLAGS_FORCE);
+	int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
 
 	if (len <= 0)
 		return 0;
@@ -1158,7 +1165,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			pud_t *pud;
 			pmd_t *pmd;
 			pte_t *pte;
-			if (write) /* user gate pages are read-only */
+
+			/* user gate pages are read-only */
+			if (!ignore && write)
 				return i ? : -EFAULT;
 			if (pg > TASK_SIZE)
 				pgd = pgd_offset_k(pg);
@@ -1190,8 +1199,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			continue;
 		}
 
-		if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
-				|| !(vm_flags & vma->vm_flags))
+		if (!vma ||
+		    (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
+		    (!ignore && !(vm_flags & vma->vm_flags)))
 			return i ? : -EFAULT;
 
 		if (is_vm_hugetlb_page(vma)) {
@@ -1266,6 +1276,23 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 	} while (len);
 	return i;
 }
+
+int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		unsigned long start, int len, int write, int force,
+		struct page **pages, struct vm_area_struct **vmas)
+{
+	int flags = 0;
+
+	if (write)
+		flags |= GUP_FLAGS_WRITE;
+	if (force)
+		flags |= GUP_FLAGS_FORCE;
+
+	return __get_user_pages(tsk, mm,
+				start, len, flags,
+				pages, vmas);
+}
+
 EXPORT_SYMBOL(get_user_pages);
 
 pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
@@ -1296,18 +1323,14 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
 	pte_t *pte;
 	spinlock_t *ptl;
 
-	retval = mem_cgroup_charge(page, mm, GFP_KERNEL);
-	if (retval)
-		goto out;
-
 	retval = -EINVAL;
 	if (PageAnon(page))
-		goto out_uncharge;
+		goto out;
 	retval = -ENOMEM;
 	flush_dcache_page(page);
 	pte = get_locked_pte(mm, addr, &ptl);
 	if (!pte)
-		goto out_uncharge;
+		goto out;
 	retval = -EBUSY;
 	if (!pte_none(*pte))
 		goto out_unlock;
@@ -1323,8 +1346,6 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
 	return retval;
 out_unlock:
 	pte_unmap_unlock(pte, ptl);
-out_uncharge:
-	mem_cgroup_uncharge_page(page);
 out:
 	return retval;
 }
@@ -1858,6 +1879,15 @@ gotten:
 	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 	if (!new_page)
 		goto oom;
+	/*
+	 * Don't let another task, with possibly unlocked vma,
+	 * keep the mlocked page.
+	 */
+	if (vma->vm_flags & VM_LOCKED) {
+		lock_page(old_page);	/* for LRU manipulation */
+		clear_page_mlock(old_page);
+		unlock_page(old_page);
+	}
 	cow_user_page(new_page, old_page, address, vma);
 	__SetPageUptodate(new_page);
 
@@ -1886,11 +1916,13 @@ gotten:
 		 * thread doing COW.
 		 */
 		ptep_clear_flush_notify(vma, address, page_table);
-		set_pte_at(mm, address, page_table, entry);
-		update_mmu_cache(vma, address, entry);
-		lru_cache_add_active(new_page);
+		SetPageSwapBacked(new_page);
+		lru_cache_add_active_or_unevictable(new_page, vma);
 		page_add_new_anon_rmap(new_page, vma, address);
 
+//TODO:  is this safe?  do_anonymous_page() does it this way.
+		set_pte_at(mm, address, page_table, entry);
+		update_mmu_cache(vma, address, entry);
 		if (old_page) {
 			/*
 			 * Only after switching the pte to the new page may
@@ -2288,16 +2320,17 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		count_vm_event(PGMAJFAULT);
 	}
 
+	mark_page_accessed(page);
+
+	lock_page(page);
+	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+
 	if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
-		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
 		ret = VM_FAULT_OOM;
+		unlock_page(page);
 		goto out;
 	}
 
-	mark_page_accessed(page);
-	lock_page(page);
-	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
-
 	/*
 	 * Back out if somebody else already faulted in this pte.
 	 */
@@ -2324,7 +2357,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	page_add_anon_rmap(page, vma, address);
 
 	swap_free(entry);
-	if (vm_swap_full())
+	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
 		remove_exclusive_swap_page(page);
 	unlock_page(page);
 
@@ -2382,7 +2415,8 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!pte_none(*page_table))
 		goto release;
 	inc_mm_counter(mm, anon_rss);
-	lru_cache_add_active(page);
+	SetPageSwapBacked(page);
+	lru_cache_add_active_or_unevictable(page, vma);
 	page_add_new_anon_rmap(page, vma, address);
 	set_pte_at(mm, address, page_table, entry);
 
@@ -2423,6 +2457,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct page *page;
 	pte_t entry;
 	int anon = 0;
+	int charged = 0;
 	struct page *dirty_page = NULL;
 	struct vm_fault vmf;
 	int ret;
@@ -2463,6 +2498,18 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 				ret = VM_FAULT_OOM;
 				goto out;
 			}
+			if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
+				ret = VM_FAULT_OOM;
+				page_cache_release(page);
+				goto out;
+			}
+			charged = 1;
+			/*
+			 * Don't let another task, with possibly unlocked vma,
+			 * keep the mlocked page.
+			 */
+			if (vma->vm_flags & VM_LOCKED)
+				clear_page_mlock(vmf.page);
 			copy_user_highpage(page, vmf.page, address, vma);
 			__SetPageUptodate(page);
 		} else {
@@ -2497,11 +2544,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	}
 
-	if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
-		ret = VM_FAULT_OOM;
-		goto out;
-	}
-
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 
 	/*
@@ -2520,11 +2562,11 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		entry = mk_pte(page, vma->vm_page_prot);
 		if (flags & FAULT_FLAG_WRITE)
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-		set_pte_at(mm, address, page_table, entry);
 		if (anon) {
-                        inc_mm_counter(mm, anon_rss);
-                        lru_cache_add_active(page);
-                        page_add_new_anon_rmap(page, vma, address);
+			inc_mm_counter(mm, anon_rss);
+			SetPageSwapBacked(page);
+			lru_cache_add_active_or_unevictable(page, vma);
+			page_add_new_anon_rmap(page, vma, address);
 		} else {
 			inc_mm_counter(mm, file_rss);
 			page_add_file_rmap(page);
@@ -2533,11 +2575,14 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 				get_page(dirty_page);
 			}
 		}
+//TODO:  is this safe?  do_anonymous_page() does it this way.
+		set_pte_at(mm, address, page_table, entry);
 
 		/* no need to invalidate: a not-present page won't be cached */
 		update_mmu_cache(vma, address, entry);
 	} else {
-		mem_cgroup_uncharge_page(page);
+		if (charged)
+			mem_cgroup_uncharge_page(page);
 		if (anon)
 			page_cache_release(page);
 		else
@@ -2772,19 +2817,9 @@ int make_pages_present(unsigned long addr, unsigned long end)
 	len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
 	ret = get_user_pages(current, current->mm, addr,
 			len, write, 0, NULL, NULL);
-	if (ret < 0) {
-		/*
-		   SUS require strange return value to mlock
-		    - invalid addr generate to ENOMEM.
-		    - out of memory should generate EAGAIN.
-		*/
-		if (ret == -EFAULT)
-			ret = -ENOMEM;
-		else if (ret == -ENOMEM)
-			ret = -EAGAIN;
+	if (ret < 0)
 		return ret;
-	}
-	return ret == len ? 0 : -ENOMEM;
+	return ret == len ? 0 : -EFAULT;
 }
 
 #if !defined(__HAVE_ARCH_GATE_AREA)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 89fee2dcb039..6837a1014372 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -26,6 +26,7 @@
 #include <linux/delay.h>
 #include <linux/migrate.h>
 #include <linux/page-isolation.h>
+#include <linux/pfn.h>
 
 #include <asm/tlbflush.h>
 
@@ -323,11 +324,11 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
 	BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
 	BUG_ON(nr_pages % PAGES_PER_SECTION);
 
-	release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE);
-
 	sections_to_remove = nr_pages / PAGES_PER_SECTION;
 	for (i = 0; i < sections_to_remove; i++) {
 		unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
+		release_mem_region(pfn << PAGE_SHIFT,
+				   PAGES_PER_SECTION << PAGE_SHIFT);
 		ret = __remove_section(zone, __pfn_to_section(pfn));
 		if (ret)
 			break;
@@ -657,8 +658,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 		 * We can skip free pages. And we can only deal with pages on
 		 * LRU.
 		 */
-		ret = isolate_lru_page(page, &source);
+		ret = isolate_lru_page(page);
 		if (!ret) { /* Success */
+			list_add_tail(&page->lru, &source);
 			move_pages--;
 		} else {
 			/* Becasue we don't have big zone->lock. we should
@@ -849,10 +851,19 @@ failed_removal:
 
 	return ret;
 }
+
+int remove_memory(u64 start, u64 size)
+{
+	unsigned long start_pfn, end_pfn;
+
+	start_pfn = PFN_DOWN(start);
+	end_pfn = start_pfn + PFN_DOWN(size);
+	return offline_pages(start_pfn, end_pfn, 120 * HZ);
+}
 #else
 int remove_memory(u64 start, u64 size)
 {
 	return -EINVAL;
 }
-EXPORT_SYMBOL_GPL(remove_memory);
 #endif /* CONFIG_MEMORY_HOTREMOVE */
+EXPORT_SYMBOL_GPL(remove_memory);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 83369058ec13..36f42573a335 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -93,6 +93,8 @@
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
 
+#include "internal.h"
+
 /* Internal flags */
 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for continuous vmas */
 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
@@ -762,8 +764,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
 	/*
 	 * Avoid migrating a page that is shared with others.
 	 */
-	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
-		isolate_lru_page(page, pagelist);
+	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
+		if (!isolate_lru_page(page)) {
+			list_add_tail(&page->lru, pagelist);
+		}
+	}
 }
 
 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
@@ -2197,7 +2202,7 @@ static void gather_stats(struct page *page, void *private, int pte_dirty)
 	if (PageSwapCache(page))
 		md->swapcache++;
 
-	if (PageActive(page))
+	if (PageActive(page) || PageUnevictable(page))
 		md->active++;
 
 	if (PageWriteback(page))
diff --git a/mm/migrate.c b/mm/migrate.c
index 2a80136b23bb..6602941bfab0 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -37,36 +37,6 @@
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
 
 /*
- * Isolate one page from the LRU lists. If successful put it onto
- * the indicated list with elevated page count.
- *
- * Result:
- *  -EBUSY: page not on LRU list
- *  0: page removed from LRU list and added to the specified list.
- */
-int isolate_lru_page(struct page *page, struct list_head *pagelist)
-{
-	int ret = -EBUSY;
-
-	if (PageLRU(page)) {
-		struct zone *zone = page_zone(page);
-
-		spin_lock_irq(&zone->lru_lock);
-		if (PageLRU(page) && get_page_unless_zero(page)) {
-			ret = 0;
-			ClearPageLRU(page);
-			if (PageActive(page))
-				del_page_from_active_list(zone, page);
-			else
-				del_page_from_inactive_list(zone, page);
-			list_add_tail(&page->lru, pagelist);
-		}
-		spin_unlock_irq(&zone->lru_lock);
-	}
-	return ret;
-}
-
-/*
  * migrate_prep() needs to be called before we start compiling a list of pages
  * to be migrated using isolate_lru_page().
  */
@@ -83,23 +53,9 @@ int migrate_prep(void)
 	return 0;
 }
 
-static inline void move_to_lru(struct page *page)
-{
-	if (PageActive(page)) {
-		/*
-		 * lru_cache_add_active checks that
-		 * the PG_active bit is off.
-		 */
-		ClearPageActive(page);
-		lru_cache_add_active(page);
-	} else {
-		lru_cache_add(page);
-	}
-	put_page(page);
-}
-
 /*
- * Add isolated pages on the list back to the LRU.
+ * Add isolated pages on the list back to the LRU under page lock
+ * to avoid leaking evictable pages back onto unevictable list.
  *
  * returns the number of pages put back.
  */
@@ -111,7 +67,7 @@ int putback_lru_pages(struct list_head *l)
 
 	list_for_each_entry_safe(page, page2, l, lru) {
 		list_del(&page->lru);
-		move_to_lru(page);
+		putback_lru_page(page);
 		count++;
 	}
 	return count;
@@ -374,8 +330,6 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	__inc_zone_page_state(newpage, NR_FILE_PAGES);
 
 	spin_unlock_irq(&mapping->tree_lock);
-	if (!PageSwapCache(newpage))
-		mem_cgroup_uncharge_cache_page(page);
 
 	return 0;
 }
@@ -385,6 +339,8 @@ static int migrate_page_move_mapping(struct address_space *mapping,
  */
 static void migrate_page_copy(struct page *newpage, struct page *page)
 {
+	int anon;
+
 	copy_highpage(newpage, page);
 
 	if (PageError(page))
@@ -393,8 +349,11 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
 		SetPageReferenced(newpage);
 	if (PageUptodate(page))
 		SetPageUptodate(newpage);
-	if (PageActive(page))
+	if (TestClearPageActive(page)) {
+		VM_BUG_ON(PageUnevictable(page));
 		SetPageActive(newpage);
+	} else
+		unevictable_migrate_page(newpage, page);
 	if (PageChecked(page))
 		SetPageChecked(newpage);
 	if (PageMappedToDisk(page))
@@ -412,14 +371,20 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
 		__set_page_dirty_nobuffers(newpage);
  	}
 
+	mlock_migrate_page(newpage, page);
+
 #ifdef CONFIG_SWAP
 	ClearPageSwapCache(page);
 #endif
-	ClearPageActive(page);
 	ClearPagePrivate(page);
 	set_page_private(page, 0);
+	/* page->mapping contains a flag for PageAnon() */
+	anon = PageAnon(page);
 	page->mapping = NULL;
 
+	if (!anon) /* This page was removed from radix-tree. */
+		mem_cgroup_uncharge_cache_page(page);
+
 	/*
 	 * If any waiters have accumulated on the new page then
 	 * wake them up.
@@ -594,6 +559,10 @@ static int fallback_migrate_page(struct address_space *mapping,
  *
  * The new page will have replaced the old page if this function
  * is successful.
+ *
+ * Return value:
+ *   < 0 - error code
+ *  == 0 - success
  */
 static int move_to_new_page(struct page *newpage, struct page *page)
 {
@@ -611,6 +580,8 @@ static int move_to_new_page(struct page *newpage, struct page *page)
 	/* Prepare mapping for the new page.*/
 	newpage->index = page->index;
 	newpage->mapping = page->mapping;
+	if (PageSwapBacked(page))
+		SetPageSwapBacked(newpage);
 
 	mapping = page_mapping(page);
 	if (!mapping)
@@ -654,9 +625,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 	if (!newpage)
 		return -ENOMEM;
 
-	if (page_count(page) == 1)
+	if (page_count(page) == 1) {
 		/* page was freed from under us. So we are done. */
 		goto move_newpage;
+	}
 
 	charge = mem_cgroup_prepare_migration(page, newpage);
 	if (charge == -ENOMEM) {
@@ -730,7 +702,6 @@ rcu_unlock:
 		rcu_read_unlock();
 
 unlock:
-
 	unlock_page(page);
 
 	if (rc != -EAGAIN) {
@@ -741,17 +712,19 @@ unlock:
  		 * restored.
  		 */
  		list_del(&page->lru);
- 		move_to_lru(page);
+		putback_lru_page(page);
 	}
 
 move_newpage:
 	if (!charge)
 		mem_cgroup_end_migration(newpage);
+
 	/*
 	 * Move the new page to the LRU. If migration was not successful
 	 * then this will free the page.
 	 */
-	move_to_lru(newpage);
+	putback_lru_page(newpage);
+
 	if (result) {
 		if (rc)
 			*result = rc;
@@ -858,9 +831,11 @@ static struct page *new_page_node(struct page *p, unsigned long private,
  * Move a set of pages as indicated in the pm array. The addr
  * field must be set to the virtual address of the page to be moved
  * and the node number must contain a valid target node.
+ * The pm array ends with node = MAX_NUMNODES.
  */
-static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
-				int migrate_all)
+static int do_move_page_to_node_array(struct mm_struct *mm,
+				      struct page_to_node *pm,
+				      int migrate_all)
 {
 	int err;
 	struct page_to_node *pp;
@@ -914,7 +889,9 @@ static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
 				!migrate_all)
 			goto put_and_set;
 
-		err = isolate_lru_page(page, &pagelist);
+		err = isolate_lru_page(page);
+		if (!err)
+			list_add_tail(&page->lru, &pagelist);
 put_and_set:
 		/*
 		 * Either remove the duplicate refcount from
@@ -926,36 +903,118 @@ set_status:
 		pp->status = err;
 	}
 
+	err = 0;
 	if (!list_empty(&pagelist))
 		err = migrate_pages(&pagelist, new_page_node,
 				(unsigned long)pm);
-	else
-		err = -ENOENT;
 
 	up_read(&mm->mmap_sem);
 	return err;
 }
 
 /*
- * Determine the nodes of a list of pages. The addr in the pm array
- * must have been set to the virtual address of which we want to determine
- * the node number.
+ * Migrate an array of page address onto an array of nodes and fill
+ * the corresponding array of status.
  */
-static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm)
+static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
+			 unsigned long nr_pages,
+			 const void __user * __user *pages,
+			 const int __user *nodes,
+			 int __user *status, int flags)
 {
+	struct page_to_node *pm = NULL;
+	nodemask_t task_nodes;
+	int err = 0;
+	int i;
+
+	task_nodes = cpuset_mems_allowed(task);
+
+	/* Limit nr_pages so that the multiplication may not overflow */
+	if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
+		err = -E2BIG;
+		goto out;
+	}
+
+	pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
+	if (!pm) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	/*
+	 * Get parameters from user space and initialize the pm
+	 * array. Return various errors if the user did something wrong.
+	 */
+	for (i = 0; i < nr_pages; i++) {
+		const void __user *p;
+
+		err = -EFAULT;
+		if (get_user(p, pages + i))
+			goto out_pm;
+
+		pm[i].addr = (unsigned long)p;
+		if (nodes) {
+			int node;
+
+			if (get_user(node, nodes + i))
+				goto out_pm;
+
+			err = -ENODEV;
+			if (!node_state(node, N_HIGH_MEMORY))
+				goto out_pm;
+
+			err = -EACCES;
+			if (!node_isset(node, task_nodes))
+				goto out_pm;
+
+			pm[i].node = node;
+		} else
+			pm[i].node = 0;	/* anything to not match MAX_NUMNODES */
+	}
+	/* End marker */
+	pm[nr_pages].node = MAX_NUMNODES;
+
+	err = do_move_page_to_node_array(mm, pm, flags & MPOL_MF_MOVE_ALL);
+	if (err >= 0)
+		/* Return status information */
+		for (i = 0; i < nr_pages; i++)
+			if (put_user(pm[i].status, status + i))
+				err = -EFAULT;
+
+out_pm:
+	vfree(pm);
+out:
+	return err;
+}
+
+/*
+ * Determine the nodes of an array of pages and store it in an array of status.
+ */
+static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
+			 const void __user * __user *pages,
+			 int __user *status)
+{
+	unsigned long i;
+	int err;
+
 	down_read(&mm->mmap_sem);
 
-	for ( ; pm->node != MAX_NUMNODES; pm++) {
+	for (i = 0; i < nr_pages; i++) {
+		const void __user *p;
+		unsigned long addr;
 		struct vm_area_struct *vma;
 		struct page *page;
-		int err;
 
 		err = -EFAULT;
-		vma = find_vma(mm, pm->addr);
+		if (get_user(p, pages+i))
+			goto out;
+		addr = (unsigned long) p;
+
+		vma = find_vma(mm, addr);
 		if (!vma)
 			goto set_status;
 
-		page = follow_page(vma, pm->addr, 0);
+		page = follow_page(vma, addr, 0);
 
 		err = PTR_ERR(page);
 		if (IS_ERR(page))
@@ -968,11 +1027,13 @@ static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm)
 
 		err = page_to_nid(page);
 set_status:
-		pm->status = err;
+		put_user(err, status+i);
 	}
+	err = 0;
 
+out:
 	up_read(&mm->mmap_sem);
-	return 0;
+	return err;
 }
 
 /*
@@ -984,12 +1045,9 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
 			const int __user *nodes,
 			int __user *status, int flags)
 {
-	int err = 0;
-	int i;
 	struct task_struct *task;
-	nodemask_t task_nodes;
 	struct mm_struct *mm;
-	struct page_to_node *pm = NULL;
+	int err;
 
 	/* Check flags */
 	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
@@ -1021,75 +1079,21 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
 	    (current->uid != task->suid) && (current->uid != task->uid) &&
 	    !capable(CAP_SYS_NICE)) {
 		err = -EPERM;
-		goto out2;
+		goto out;
 	}
 
  	err = security_task_movememory(task);
  	if (err)
- 		goto out2;
-
-
-	task_nodes = cpuset_mems_allowed(task);
-
-	/* Limit nr_pages so that the multiplication may not overflow */
-	if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
-		err = -E2BIG;
-		goto out2;
-	}
-
-	pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
-	if (!pm) {
-		err = -ENOMEM;
-		goto out2;
-	}
-
-	/*
-	 * Get parameters from user space and initialize the pm
-	 * array. Return various errors if the user did something wrong.
-	 */
-	for (i = 0; i < nr_pages; i++) {
-		const void __user *p;
-
-		err = -EFAULT;
-		if (get_user(p, pages + i))
-			goto out;
-
-		pm[i].addr = (unsigned long)p;
-		if (nodes) {
-			int node;
-
-			if (get_user(node, nodes + i))
-				goto out;
-
-			err = -ENODEV;
-			if (!node_state(node, N_HIGH_MEMORY))
-				goto out;
-
-			err = -EACCES;
-			if (!node_isset(node, task_nodes))
-				goto out;
+		goto out;
 
-			pm[i].node = node;
-		} else
-			pm[i].node = 0;	/* anything to not match MAX_NUMNODES */
+	if (nodes) {
+		err = do_pages_move(mm, task, nr_pages, pages, nodes, status,
+				    flags);
+	} else {
+		err = do_pages_stat(mm, nr_pages, pages, status);
 	}
-	/* End marker */
-	pm[nr_pages].node = MAX_NUMNODES;
-
-	if (nodes)
-		err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL);
-	else
-		err = do_pages_stat(mm, pm);
-
-	if (err >= 0)
-		/* Return status information */
-		for (i = 0; i < nr_pages; i++)
-			if (put_user(pm[i].status, status + i))
-				err = -EFAULT;
 
 out:
-	vfree(pm);
-out2:
 	mmput(mm);
 	return err;
 }
diff --git a/mm/mlock.c b/mm/mlock.c
index 01fbe93eff5c..008ea70b7afa 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -8,10 +8,18 @@
 #include <linux/capability.h>
 #include <linux/mman.h>
 #include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/pagemap.h>
 #include <linux/mempolicy.h>
 #include <linux/syscalls.h>
 #include <linux/sched.h>
 #include <linux/module.h>
+#include <linux/rmap.h>
+#include <linux/mmzone.h>
+#include <linux/hugetlb.h>
+
+#include "internal.h"
 
 int can_do_mlock(void)
 {
@@ -23,17 +31,381 @@ int can_do_mlock(void)
 }
 EXPORT_SYMBOL(can_do_mlock);
 
+#ifdef CONFIG_UNEVICTABLE_LRU
+/*
+ * Mlocked pages are marked with PageMlocked() flag for efficient testing
+ * in vmscan and, possibly, the fault path; and to support semi-accurate
+ * statistics.
+ *
+ * An mlocked page [PageMlocked(page)] is unevictable.  As such, it will
+ * be placed on the LRU "unevictable" list, rather than the [in]active lists.
+ * The unevictable list is an LRU sibling list to the [in]active lists.
+ * PageUnevictable is set to indicate the unevictable state.
+ *
+ * When lazy mlocking via vmscan, it is important to ensure that the
+ * vma's VM_LOCKED status is not concurrently being modified, otherwise we
+ * may have mlocked a page that is being munlocked. So lazy mlock must take
+ * the mmap_sem for read, and verify that the vma really is locked
+ * (see mm/rmap.c).
+ */
+
+/*
+ *  LRU accounting for clear_page_mlock()
+ */
+void __clear_page_mlock(struct page *page)
+{
+	VM_BUG_ON(!PageLocked(page));
+
+	if (!page->mapping) {	/* truncated ? */
+		return;
+	}
+
+	dec_zone_page_state(page, NR_MLOCK);
+	count_vm_event(UNEVICTABLE_PGCLEARED);
+	if (!isolate_lru_page(page)) {
+		putback_lru_page(page);
+	} else {
+		/*
+		 * Page not on the LRU yet.  Flush all pagevecs and retry.
+		 */
+		lru_add_drain_all();
+		if (!isolate_lru_page(page))
+			putback_lru_page(page);
+		else if (PageUnevictable(page))
+			count_vm_event(UNEVICTABLE_PGSTRANDED);
+
+	}
+}
+
+/*
+ * Mark page as mlocked if not already.
+ * If page on LRU, isolate and putback to move to unevictable list.
+ */
+void mlock_vma_page(struct page *page)
+{
+	BUG_ON(!PageLocked(page));
+
+	if (!TestSetPageMlocked(page)) {
+		inc_zone_page_state(page, NR_MLOCK);
+		count_vm_event(UNEVICTABLE_PGMLOCKED);
+		if (!isolate_lru_page(page))
+			putback_lru_page(page);
+	}
+}
+
+/*
+ * called from munlock()/munmap() path with page supposedly on the LRU.
+ *
+ * Note:  unlike mlock_vma_page(), we can't just clear the PageMlocked
+ * [in try_to_munlock()] and then attempt to isolate the page.  We must
+ * isolate the page to keep others from messing with its unevictable
+ * and mlocked state while trying to munlock.  However, we pre-clear the
+ * mlocked state anyway as we might lose the isolation race and we might
+ * not get another chance to clear PageMlocked.  If we successfully
+ * isolate the page and try_to_munlock() detects other VM_LOCKED vmas
+ * mapping the page, it will restore the PageMlocked state, unless the page
+ * is mapped in a non-linear vma.  So, we go ahead and SetPageMlocked(),
+ * perhaps redundantly.
+ * If we lose the isolation race, and the page is mapped by other VM_LOCKED
+ * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap()
+ * either of which will restore the PageMlocked state by calling
+ * mlock_vma_page() above, if it can grab the vma's mmap sem.
+ */
+static void munlock_vma_page(struct page *page)
+{
+	BUG_ON(!PageLocked(page));
+
+	if (TestClearPageMlocked(page)) {
+		dec_zone_page_state(page, NR_MLOCK);
+		if (!isolate_lru_page(page)) {
+			int ret = try_to_munlock(page);
+			/*
+			 * did try_to_unlock() succeed or punt?
+			 */
+			if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN)
+				count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+
+			putback_lru_page(page);
+		} else {
+			/*
+			 * We lost the race.  let try_to_unmap() deal
+			 * with it.  At least we get the page state and
+			 * mlock stats right.  However, page is still on
+			 * the noreclaim list.  We'll fix that up when
+			 * the page is eventually freed or we scan the
+			 * noreclaim list.
+			 */
+			if (PageUnevictable(page))
+				count_vm_event(UNEVICTABLE_PGSTRANDED);
+			else
+				count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+		}
+	}
+}
+
+/**
+ * __mlock_vma_pages_range() -  mlock/munlock a range of pages in the vma.
+ * @vma:   target vma
+ * @start: start address
+ * @end:   end address
+ * @mlock: 0 indicate munlock, otherwise mlock.
+ *
+ * If @mlock == 0, unlock an mlocked range;
+ * else mlock the range of pages.  This takes care of making the pages present ,
+ * too.
+ *
+ * return 0 on success, negative error code on error.
+ *
+ * vma->vm_mm->mmap_sem must be held for at least read.
+ */
+static long __mlock_vma_pages_range(struct vm_area_struct *vma,
+				   unsigned long start, unsigned long end,
+				   int mlock)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long addr = start;
+	struct page *pages[16]; /* 16 gives a reasonable batch */
+	int nr_pages = (end - start) / PAGE_SIZE;
+	int ret;
+	int gup_flags = 0;
+
+	VM_BUG_ON(start & ~PAGE_MASK);
+	VM_BUG_ON(end   & ~PAGE_MASK);
+	VM_BUG_ON(start < vma->vm_start);
+	VM_BUG_ON(end   > vma->vm_end);
+	VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) &&
+		  (atomic_read(&mm->mm_users) != 0));
+
+	/*
+	 * mlock:   don't page populate if page has PROT_NONE permission.
+	 * munlock: the pages always do munlock althrough
+	 *          its has PROT_NONE permission.
+	 */
+	if (!mlock)
+		gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS;
+
+	if (vma->vm_flags & VM_WRITE)
+		gup_flags |= GUP_FLAGS_WRITE;
+
+	lru_add_drain_all();	/* push cached pages to LRU */
+
+	while (nr_pages > 0) {
+		int i;
+
+		cond_resched();
+
+		/*
+		 * get_user_pages makes pages present if we are
+		 * setting mlock. and this extra reference count will
+		 * disable migration of this page.  However, page may
+		 * still be truncated out from under us.
+		 */
+		ret = __get_user_pages(current, mm, addr,
+				min_t(int, nr_pages, ARRAY_SIZE(pages)),
+				gup_flags, pages, NULL);
+		/*
+		 * This can happen for, e.g., VM_NONLINEAR regions before
+		 * a page has been allocated and mapped at a given offset,
+		 * or for addresses that map beyond end of a file.
+		 * We'll mlock the the pages if/when they get faulted in.
+		 */
+		if (ret < 0)
+			break;
+		if (ret == 0) {
+			/*
+			 * We know the vma is there, so the only time
+			 * we cannot get a single page should be an
+			 * error (ret < 0) case.
+			 */
+			WARN_ON(1);
+			break;
+		}
+
+		lru_add_drain();	/* push cached pages to LRU */
+
+		for (i = 0; i < ret; i++) {
+			struct page *page = pages[i];
+
+			lock_page(page);
+			/*
+			 * Because we lock page here and migration is blocked
+			 * by the elevated reference, we need only check for
+			 * page truncation (file-cache only).
+			 */
+			if (page->mapping) {
+				if (mlock)
+					mlock_vma_page(page);
+				else
+					munlock_vma_page(page);
+			}
+			unlock_page(page);
+			put_page(page);		/* ref from get_user_pages() */
+
+			/*
+			 * here we assume that get_user_pages() has given us
+			 * a list of virtually contiguous pages.
+			 */
+			addr += PAGE_SIZE;	/* for next get_user_pages() */
+			nr_pages--;
+		}
+		ret = 0;
+	}
+
+	lru_add_drain_all();	/* to update stats */
+
+	return ret;	/* count entire vma as locked_vm */
+}
+
+/*
+ * convert get_user_pages() return value to posix mlock() error
+ */
+static int __mlock_posix_error_return(long retval)
+{
+	if (retval == -EFAULT)
+		retval = -ENOMEM;
+	else if (retval == -ENOMEM)
+		retval = -EAGAIN;
+	return retval;
+}
+
+#else /* CONFIG_UNEVICTABLE_LRU */
+
+/*
+ * Just make pages present if VM_LOCKED.  No-op if unlocking.
+ */
+static long __mlock_vma_pages_range(struct vm_area_struct *vma,
+				   unsigned long start, unsigned long end,
+				   int mlock)
+{
+	if (mlock && (vma->vm_flags & VM_LOCKED))
+		return make_pages_present(start, end);
+	return 0;
+}
+
+static inline int __mlock_posix_error_return(long retval)
+{
+	return 0;
+}
+
+#endif /* CONFIG_UNEVICTABLE_LRU */
+
+/**
+ * mlock_vma_pages_range() - mlock pages in specified vma range.
+ * @vma - the vma containing the specfied address range
+ * @start - starting address in @vma to mlock
+ * @end   - end address [+1] in @vma to mlock
+ *
+ * For mmap()/mremap()/expansion of mlocked vma.
+ *
+ * return 0 on success for "normal" vmas.
+ *
+ * return number of pages [> 0] to be removed from locked_vm on success
+ * of "special" vmas.
+ *
+ * return negative error if vma spanning @start-@range disappears while
+ * mmap semaphore is dropped.  Unlikely?
+ */
+long mlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	int nr_pages = (end - start) / PAGE_SIZE;
+	BUG_ON(!(vma->vm_flags & VM_LOCKED));
+
+	/*
+	 * filter unlockable vmas
+	 */
+	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+		goto no_mlock;
+
+	if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
+			is_vm_hugetlb_page(vma) ||
+			vma == get_gate_vma(current))) {
+		long error;
+		downgrade_write(&mm->mmap_sem);
+
+		error = __mlock_vma_pages_range(vma, start, end, 1);
+
+		up_read(&mm->mmap_sem);
+		/* vma can change or disappear */
+		down_write(&mm->mmap_sem);
+		vma = find_vma(mm, start);
+		/* non-NULL vma must contain @start, but need to check @end */
+		if (!vma ||  end > vma->vm_end)
+			return -ENOMEM;
+
+		return 0;	/* hide other errors from mmap(), et al */
+	}
+
+	/*
+	 * User mapped kernel pages or huge pages:
+	 * make these pages present to populate the ptes, but
+	 * fall thru' to reset VM_LOCKED--no need to unlock, and
+	 * return nr_pages so these don't get counted against task's
+	 * locked limit.  huge pages are already counted against
+	 * locked vm limit.
+	 */
+	make_pages_present(start, end);
+
+no_mlock:
+	vma->vm_flags &= ~VM_LOCKED;	/* and don't come back! */
+	return nr_pages;		/* error or pages NOT mlocked */
+}
+
+
+/*
+ * munlock_vma_pages_range() - munlock all pages in the vma range.'
+ * @vma - vma containing range to be munlock()ed.
+ * @start - start address in @vma of the range
+ * @end - end of range in @vma.
+ *
+ *  For mremap(), munmap() and exit().
+ *
+ * Called with @vma VM_LOCKED.
+ *
+ * Returns with VM_LOCKED cleared.  Callers must be prepared to
+ * deal with this.
+ *
+ * We don't save and restore VM_LOCKED here because pages are
+ * still on lru.  In unmap path, pages might be scanned by reclaim
+ * and re-mlocked by try_to_{munlock|unmap} before we unmap and
+ * free them.  This will result in freeing mlocked pages.
+ */
+void munlock_vma_pages_range(struct vm_area_struct *vma,
+			   unsigned long start, unsigned long end)
+{
+	vma->vm_flags &= ~VM_LOCKED;
+	__mlock_vma_pages_range(vma, start, end, 0);
+}
+
+/*
+ * mlock_fixup  - handle mlock[all]/munlock[all] requests.
+ *
+ * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
+ * munlock is a no-op.  However, for some special vmas, we go ahead and
+ * populate the ptes via make_pages_present().
+ *
+ * For vmas that pass the filters, merge/split as appropriate.
+ */
 static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 	unsigned long start, unsigned long end, unsigned int newflags)
 {
-	struct mm_struct * mm = vma->vm_mm;
+	struct mm_struct *mm = vma->vm_mm;
 	pgoff_t pgoff;
-	int pages;
+	int nr_pages;
 	int ret = 0;
-
-	if (newflags == vma->vm_flags) {
-		*prev = vma;
-		goto out;
+	int lock = newflags & VM_LOCKED;
+
+	if (newflags == vma->vm_flags ||
+			(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+		goto out;	/* don't set VM_LOCKED,  don't count */
+
+	if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
+			is_vm_hugetlb_page(vma) ||
+			vma == get_gate_vma(current)) {
+		if (lock)
+			make_pages_present(start, end);
+		goto out;	/* don't set VM_LOCKED,  don't count */
 	}
 
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
@@ -44,8 +416,6 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		goto success;
 	}
 
-	*prev = vma;
-
 	if (start != vma->vm_start) {
 		ret = split_vma(mm, vma, start, 1);
 		if (ret)
@@ -60,24 +430,61 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 
 success:
 	/*
+	 * Keep track of amount of locked VM.
+	 */
+	nr_pages = (end - start) >> PAGE_SHIFT;
+	if (!lock)
+		nr_pages = -nr_pages;
+	mm->locked_vm += nr_pages;
+
+	/*
 	 * vm_flags is protected by the mmap_sem held in write mode.
 	 * It's okay if try_to_unmap_one unmaps a page just after we
-	 * set VM_LOCKED, make_pages_present below will bring it back.
+	 * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
 	 */
 	vma->vm_flags = newflags;
 
-	/*
-	 * Keep track of amount of locked VM.
-	 */
-	pages = (end - start) >> PAGE_SHIFT;
-	if (newflags & VM_LOCKED) {
-		pages = -pages;
-		if (!(newflags & VM_IO))
-			ret = make_pages_present(start, end);
+	if (lock) {
+		/*
+		 * mmap_sem is currently held for write.  Downgrade the write
+		 * lock to a read lock so that other faults, mmap scans, ...
+		 * while we fault in all pages.
+		 */
+		downgrade_write(&mm->mmap_sem);
+
+		ret = __mlock_vma_pages_range(vma, start, end, 1);
+
+		/*
+		 * Need to reacquire mmap sem in write mode, as our callers
+		 * expect this.  We have no support for atomically upgrading
+		 * a sem to write, so we need to check for ranges while sem
+		 * is unlocked.
+		 */
+		up_read(&mm->mmap_sem);
+		/* vma can change or disappear */
+		down_write(&mm->mmap_sem);
+		*prev = find_vma(mm, start);
+		/* non-NULL *prev must contain @start, but need to check @end */
+		if (!(*prev) || end > (*prev)->vm_end)
+			ret = -ENOMEM;
+		else if (ret > 0) {
+			mm->locked_vm -= ret;
+			ret = 0;
+		} else
+			ret = __mlock_posix_error_return(ret); /* translate if needed */
+	} else {
+		/*
+		 * TODO:  for unlocking, pages will already be resident, so
+		 * we don't need to wait for allocations/reclaim/pagein, ...
+		 * However, unlocking a very large region can still take a
+		 * while.  Should we downgrade the semaphore for both lock
+		 * AND unlock ?
+		 */
+		__mlock_vma_pages_range(vma, start, end, 0);
 	}
 
-	mm->locked_vm -= pages;
 out:
+	*prev = vma;
 	return ret;
 }
 
diff --git a/mm/mmap.c b/mm/mmap.c
index e7a5a68a9c2e..74f4d158022e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -410,7 +410,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
 	rb_insert_color(&vma->vm_rb, &mm->mm_rb);
 }
 
-static inline void __vma_link_file(struct vm_area_struct *vma)
+static void __vma_link_file(struct vm_area_struct *vma)
 {
 	struct file * file;
 
@@ -662,8 +662,6 @@ again:			remove_next = 1 + (end > next->vm_end);
  * If the vma has a ->close operation then the driver probably needs to release
  * per-vma resources, so we don't attempt to merge those.
  */
-#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
-
 static inline int is_mergeable_vma(struct vm_area_struct *vma,
 			struct file *file, unsigned long vm_flags)
 {
@@ -972,6 +970,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 			return -EPERM;
 		vm_flags |= VM_LOCKED;
 	}
+
 	/* mlock MCL_FUTURE? */
 	if (vm_flags & VM_LOCKED) {
 		unsigned long locked, lock_limit;
@@ -1139,10 +1138,12 @@ munmap_back:
 	 * The VM_SHARED test is necessary because shmem_zero_setup
 	 * will create the file object for a shared anonymous map below.
 	 */
-	if (!file && !(vm_flags & VM_SHARED) &&
-	    vma_merge(mm, prev, addr, addr + len, vm_flags,
-					NULL, NULL, pgoff, NULL))
-		goto out;
+	if (!file && !(vm_flags & VM_SHARED)) {
+		vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
+					NULL, NULL, pgoff, NULL);
+		if (vma)
+			goto out;
+	}
 
 	/*
 	 * Determine the object being mapped and call the appropriate
@@ -1224,10 +1225,14 @@ out:
 	mm->total_vm += len >> PAGE_SHIFT;
 	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
 	if (vm_flags & VM_LOCKED) {
-		mm->locked_vm += len >> PAGE_SHIFT;
-		make_pages_present(addr, addr + len);
-	}
-	if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
+		/*
+		 * makes pages present; downgrades, drops, reacquires mmap_sem
+		 */
+		long nr_pages = mlock_vma_pages_range(vma, addr, addr + len);
+		if (nr_pages < 0)
+			return nr_pages;	/* vma gone! */
+		mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages;
+	} else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
 		make_pages_present(addr, addr + len);
 	return addr;
 
@@ -1586,7 +1591,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
  * vma is the last one with address > vma->vm_end.  Have to extend vma.
  */
 #ifndef CONFIG_IA64
-static inline
+static
 #endif
 int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 {
@@ -1636,7 +1641,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 /*
  * vma is the first one with address < vma->vm_start.  Have to extend vma.
  */
-static inline int expand_downwards(struct vm_area_struct *vma,
+static int expand_downwards(struct vm_area_struct *vma,
 				   unsigned long address)
 {
 	int error;
@@ -1698,10 +1703,12 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
 	vma = find_vma_prev(mm, addr, &prev);
 	if (vma && (vma->vm_start <= addr))
 		return vma;
-	if (!prev || expand_stack(prev, addr))
+	if (expand_stack(prev, addr))
 		return NULL;
-	if (prev->vm_flags & VM_LOCKED)
-		make_pages_present(addr, prev->vm_end);
+	if (prev->vm_flags & VM_LOCKED) {
+		if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0)
+			return NULL;	/* vma gone! */
+	}
 	return prev;
 }
 #else
@@ -1727,8 +1734,10 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
 	start = vma->vm_start;
 	if (expand_stack(vma, addr))
 		return NULL;
-	if (vma->vm_flags & VM_LOCKED)
-		make_pages_present(addr, start);
+	if (vma->vm_flags & VM_LOCKED) {
+		if (mlock_vma_pages_range(vma, addr, start) < 0)
+			return NULL;	/* vma gone! */
+	}
 	return vma;
 }
 #endif
@@ -1747,8 +1756,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
 		long nrpages = vma_pages(vma);
 
 		mm->total_vm -= nrpages;
-		if (vma->vm_flags & VM_LOCKED)
-			mm->locked_vm -= nrpages;
 		vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
 		vma = remove_vma(vma);
 	} while (vma);
@@ -1914,6 +1921,20 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
 	vma = prev? prev->vm_next: mm->mmap;
 
 	/*
+	 * unlock any mlock()ed ranges before detaching vmas
+	 */
+	if (mm->locked_vm) {
+		struct vm_area_struct *tmp = vma;
+		while (tmp && tmp->vm_start < end) {
+			if (tmp->vm_flags & VM_LOCKED) {
+				mm->locked_vm -= vma_pages(tmp);
+				munlock_vma_pages_all(tmp);
+			}
+			tmp = tmp->vm_next;
+		}
+	}
+
+	/*
 	 * Remove the vma's, and unmap the actual pages
 	 */
 	detach_vmas_to_be_unmapped(mm, vma, prev, end);
@@ -2025,8 +2046,9 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
 		return -ENOMEM;
 
 	/* Can we just expand an old private anonymous mapping? */
-	if (vma_merge(mm, prev, addr, addr + len, flags,
-					NULL, NULL, pgoff, NULL))
+	vma = vma_merge(mm, prev, addr, addr + len, flags,
+					NULL, NULL, pgoff, NULL);
+	if (vma)
 		goto out;
 
 	/*
@@ -2048,8 +2070,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
 out:
 	mm->total_vm += len >> PAGE_SHIFT;
 	if (flags & VM_LOCKED) {
-		mm->locked_vm += len >> PAGE_SHIFT;
-		make_pages_present(addr, addr + len);
+		if (!mlock_vma_pages_range(vma, addr, addr + len))
+			mm->locked_vm += (len >> PAGE_SHIFT);
 	}
 	return addr;
 }
@@ -2060,7 +2082,7 @@ EXPORT_SYMBOL(do_brk);
 void exit_mmap(struct mm_struct *mm)
 {
 	struct mmu_gather *tlb;
-	struct vm_area_struct *vma = mm->mmap;
+	struct vm_area_struct *vma;
 	unsigned long nr_accounted = 0;
 	unsigned long end;
 
@@ -2068,6 +2090,15 @@ void exit_mmap(struct mm_struct *mm)
 	arch_exit_mmap(mm);
 	mmu_notifier_release(mm);
 
+	if (mm->locked_vm) {
+		vma = mm->mmap;
+		while (vma) {
+			if (vma->vm_flags & VM_LOCKED)
+				munlock_vma_pages_all(vma);
+			vma = vma->vm_next;
+		}
+	}
+	vma = mm->mmap;
 	lru_add_drain();
 	flush_cache_mm(mm);
 	tlb = tlb_gather_mmu(mm, 1);
diff --git a/mm/mremap.c b/mm/mremap.c
index 1a7743923c8c..58a2908f42f5 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -24,6 +24,8 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
+#include "internal.h"
+
 static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
@@ -238,8 +240,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	if (vm_flags & VM_LOCKED) {
 		mm->locked_vm += new_len >> PAGE_SHIFT;
 		if (new_len > old_len)
-			make_pages_present(new_addr + old_len,
-					   new_addr + new_len);
+			mlock_vma_pages_range(new_vma, new_addr + old_len,
+						       new_addr + new_len);
 	}
 
 	return new_addr;
@@ -379,7 +381,7 @@ unsigned long do_mremap(unsigned long addr,
 			vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
 			if (vma->vm_flags & VM_LOCKED) {
 				mm->locked_vm += pages;
-				make_pages_present(addr + old_len,
+				mlock_vma_pages_range(vma, addr + old_len,
 						   addr + new_len);
 			}
 			ret = addr;
diff --git a/mm/nommu.c b/mm/nommu.c
index ed75bc962fbe..2696b24f2bb3 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -34,6 +34,8 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
+#include "internal.h"
+
 void *high_memory;
 struct page *mem_map;
 unsigned long max_mapnr;
@@ -128,20 +130,16 @@ unsigned int kobjsize(const void *objp)
 	return PAGE_SIZE << compound_order(page);
 }
 
-/*
- * get a list of pages in an address range belonging to the specified process
- * and indicate the VMA that covers each page
- * - this is potentially dodgy as we may end incrementing the page count of a
- *   slab page or a secondary page from a compound page
- * - don't permit access to VMAs that don't support it, such as I/O mappings
- */
-int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
-	unsigned long start, int len, int write, int force,
-	struct page **pages, struct vm_area_struct **vmas)
+int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		     unsigned long start, int len, int flags,
+		struct page **pages, struct vm_area_struct **vmas)
 {
 	struct vm_area_struct *vma;
 	unsigned long vm_flags;
 	int i;
+	int write = !!(flags & GUP_FLAGS_WRITE);
+	int force = !!(flags & GUP_FLAGS_FORCE);
+	int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
 
 	/* calculate required read or write permissions.
 	 * - if 'force' is set, we only require the "MAY" flags.
@@ -156,7 +154,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 
 		/* protect what we can, including chardevs */
 		if (vma->vm_flags & (VM_IO | VM_PFNMAP) ||
-		    !(vm_flags & vma->vm_flags))
+		    (!ignore && !(vm_flags & vma->vm_flags)))
 			goto finish_or_fault;
 
 		if (pages) {
@@ -174,6 +172,30 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 finish_or_fault:
 	return i ? : -EFAULT;
 }
+
+
+/*
+ * get a list of pages in an address range belonging to the specified process
+ * and indicate the VMA that covers each page
+ * - this is potentially dodgy as we may end incrementing the page count of a
+ *   slab page or a secondary page from a compound page
+ * - don't permit access to VMAs that don't support it, such as I/O mappings
+ */
+int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+	unsigned long start, int len, int write, int force,
+	struct page **pages, struct vm_area_struct **vmas)
+{
+	int flags = 0;
+
+	if (write)
+		flags |= GUP_FLAGS_WRITE;
+	if (force)
+		flags |= GUP_FLAGS_FORCE;
+
+	return __get_user_pages(tsk, mm,
+				start, len, flags,
+				pages, vmas);
+}
 EXPORT_SYMBOL(get_user_pages);
 
 DEFINE_RWLOCK(vmlist_lock);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b40f6d5f8fe9..2970e35fd03f 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -329,9 +329,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
 		struct zone *z =
 			&NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
 
-		x += zone_page_state(z, NR_FREE_PAGES)
-			+ zone_page_state(z, NR_INACTIVE)
-			+ zone_page_state(z, NR_ACTIVE);
+		x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z);
 	}
 	/*
 	 * Make sure that the number of highmem pages is never larger
@@ -355,9 +353,7 @@ unsigned long determine_dirtyable_memory(void)
 {
 	unsigned long x;
 
-	x = global_page_state(NR_FREE_PAGES)
-		+ global_page_state(NR_INACTIVE)
-		+ global_page_state(NR_ACTIVE);
+	x = global_page_state(NR_FREE_PAGES) + global_lru_pages();
 
 	if (!vm_highmem_is_dirtyable)
 		x -= highmem_dirtyable_memory(x);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9eb9eb928285..d0a240fbb8bf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -44,7 +44,7 @@
 #include <linux/backing-dev.h>
 #include <linux/fault-inject.h>
 #include <linux/page-isolation.h>
-#include <linux/memcontrol.h>
+#include <linux/page_cgroup.h>
 #include <linux/debugobjects.h>
 
 #include <asm/tlbflush.h>
@@ -223,17 +223,12 @@ static inline int bad_range(struct zone *zone, struct page *page)
 
 static void bad_page(struct page *page)
 {
-	void *pc = page_get_page_cgroup(page);
-
 	printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG
 		"page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
 		current->comm, page, (int)(2*sizeof(unsigned long)),
 		(unsigned long)page->flags, page->mapping,
 		page_mapcount(page), page_count(page));
-	if (pc) {
-		printk(KERN_EMERG "cgroup:%p\n", pc);
-		page_reset_bad_cgroup(page);
-	}
+
 	printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
 		KERN_EMERG "Backtrace:\n");
 	dump_stack();
@@ -454,14 +449,16 @@ static inline void __free_one_page(struct page *page,
 
 static inline int free_pages_check(struct page *page)
 {
+	free_page_mlock(page);
 	if (unlikely(page_mapcount(page) |
 		(page->mapping != NULL)  |
-		(page_get_page_cgroup(page) != NULL) |
 		(page_count(page) != 0)  |
 		(page->flags & PAGE_FLAGS_CHECK_AT_FREE)))
 		bad_page(page);
 	if (PageDirty(page))
 		__ClearPageDirty(page);
+	if (PageSwapBacked(page))
+		__ClearPageSwapBacked(page);
 	/*
 	 * For now, we report if PG_reserved was found set, but do not
 	 * clear it, and do not free the page.  But we shall soon need
@@ -600,7 +597,6 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
 {
 	if (unlikely(page_mapcount(page) |
 		(page->mapping != NULL)  |
-		(page_get_page_cgroup(page) != NULL) |
 		(page_count(page) != 0)  |
 		(page->flags & PAGE_FLAGS_CHECK_AT_PREP)))
 		bad_page(page);
@@ -614,7 +610,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
 
 	page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim |
 			1 << PG_referenced | 1 << PG_arch_1 |
-			1 << PG_owner_priv_1 | 1 << PG_mappedtodisk);
+			1 << PG_owner_priv_1 | 1 << PG_mappedtodisk
+#ifdef CONFIG_UNEVICTABLE_LRU
+			| 1 << PG_mlocked
+#endif
+			);
 	set_page_private(page, 0);
 	set_page_refcounted(page);
 
@@ -1862,10 +1862,21 @@ void show_free_areas(void)
 		}
 	}
 
-	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n"
+	printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n"
+		" inactive_file:%lu"
+//TODO:  check/adjust line lengths
+#ifdef CONFIG_UNEVICTABLE_LRU
+		" unevictable:%lu"
+#endif
+		" dirty:%lu writeback:%lu unstable:%lu\n"
 		" free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
-		global_page_state(NR_ACTIVE),
-		global_page_state(NR_INACTIVE),
+		global_page_state(NR_ACTIVE_ANON),
+		global_page_state(NR_ACTIVE_FILE),
+		global_page_state(NR_INACTIVE_ANON),
+		global_page_state(NR_INACTIVE_FILE),
+#ifdef CONFIG_UNEVICTABLE_LRU
+		global_page_state(NR_UNEVICTABLE),
+#endif
 		global_page_state(NR_FILE_DIRTY),
 		global_page_state(NR_WRITEBACK),
 		global_page_state(NR_UNSTABLE_NFS),
@@ -1888,8 +1899,13 @@ void show_free_areas(void)
 			" min:%lukB"
 			" low:%lukB"
 			" high:%lukB"
-			" active:%lukB"
-			" inactive:%lukB"
+			" active_anon:%lukB"
+			" inactive_anon:%lukB"
+			" active_file:%lukB"
+			" inactive_file:%lukB"
+#ifdef CONFIG_UNEVICTABLE_LRU
+			" unevictable:%lukB"
+#endif
 			" present:%lukB"
 			" pages_scanned:%lu"
 			" all_unreclaimable? %s"
@@ -1899,8 +1915,13 @@ void show_free_areas(void)
 			K(zone->pages_min),
 			K(zone->pages_low),
 			K(zone->pages_high),
-			K(zone_page_state(zone, NR_ACTIVE)),
-			K(zone_page_state(zone, NR_INACTIVE)),
+			K(zone_page_state(zone, NR_ACTIVE_ANON)),
+			K(zone_page_state(zone, NR_INACTIVE_ANON)),
+			K(zone_page_state(zone, NR_ACTIVE_FILE)),
+			K(zone_page_state(zone, NR_INACTIVE_FILE)),
+#ifdef CONFIG_UNEVICTABLE_LRU
+			K(zone_page_state(zone, NR_UNEVICTABLE)),
+#endif
 			K(zone->present_pages),
 			zone->pages_scanned,
 			(zone_is_all_unreclaimable(zone) ? "yes" : "no")
@@ -3410,10 +3431,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 	pgdat->nr_zones = 0;
 	init_waitqueue_head(&pgdat->kswapd_wait);
 	pgdat->kswapd_max_order = 0;
+	pgdat_page_cgroup_init(pgdat);
 	
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		struct zone *zone = pgdat->node_zones + j;
 		unsigned long size, realsize, memmap_pages;
+		enum lru_list l;
 
 		size = zone_spanned_pages_in_node(nid, j, zones_size);
 		realsize = size - zone_absent_pages_in_node(nid, j,
@@ -3428,8 +3451,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 			PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
 		if (realsize >= memmap_pages) {
 			realsize -= memmap_pages;
-			mminit_dprintk(MMINIT_TRACE, "memmap_init",
-				"%s zone: %lu pages used for memmap\n",
+			printk(KERN_DEBUG
+				"  %s zone: %lu pages used for memmap\n",
 				zone_names[j], memmap_pages);
 		} else
 			printk(KERN_WARNING
@@ -3439,8 +3462,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 		/* Account for reserved pages */
 		if (j == 0 && realsize > dma_reserve) {
 			realsize -= dma_reserve;
-			mminit_dprintk(MMINIT_TRACE, "memmap_init",
-					"%s zone: %lu pages reserved\n",
+			printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",
 					zone_names[0], dma_reserve);
 		}
 
@@ -3465,10 +3487,14 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 		zone->prev_priority = DEF_PRIORITY;
 
 		zone_pcp_init(zone);
-		INIT_LIST_HEAD(&zone->active_list);
-		INIT_LIST_HEAD(&zone->inactive_list);
-		zone->nr_scan_active = 0;
-		zone->nr_scan_inactive = 0;
+		for_each_lru(l) {
+			INIT_LIST_HEAD(&zone->lru[l].list);
+			zone->lru[l].nr_scan = 0;
+		}
+		zone->recent_rotated[0] = 0;
+		zone->recent_rotated[1] = 0;
+		zone->recent_scanned[0] = 0;
+		zone->recent_scanned[1] = 0;
 		zap_zone_vm_stats(zone);
 		zone->flags = 0;
 		if (!size)
@@ -4210,7 +4236,7 @@ void setup_per_zone_pages_min(void)
 	for_each_zone(zone) {
 		u64 tmp;
 
-		spin_lock_irqsave(&zone->lru_lock, flags);
+		spin_lock_irqsave(&zone->lock, flags);
 		tmp = (u64)pages_min * zone->present_pages;
 		do_div(tmp, lowmem_pages);
 		if (is_highmem(zone)) {
@@ -4242,13 +4268,53 @@ void setup_per_zone_pages_min(void)
 		zone->pages_low   = zone->pages_min + (tmp >> 2);
 		zone->pages_high  = zone->pages_min + (tmp >> 1);
 		setup_zone_migrate_reserve(zone);
-		spin_unlock_irqrestore(&zone->lru_lock, flags);
+		spin_unlock_irqrestore(&zone->lock, flags);
 	}
 
 	/* update totalreserve_pages */
 	calculate_totalreserve_pages();
 }
 
+/**
+ * setup_per_zone_inactive_ratio - called when min_free_kbytes changes.
+ *
+ * The inactive anon list should be small enough that the VM never has to
+ * do too much work, but large enough that each inactive page has a chance
+ * to be referenced again before it is swapped out.
+ *
+ * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
+ * INACTIVE_ANON pages on this zone's LRU, maintained by the
+ * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
+ * the anonymous pages are kept on the inactive list.
+ *
+ * total     target    max
+ * memory    ratio     inactive anon
+ * -------------------------------------
+ *   10MB       1         5MB
+ *  100MB       1        50MB
+ *    1GB       3       250MB
+ *   10GB      10       0.9GB
+ *  100GB      31         3GB
+ *    1TB     101        10GB
+ *   10TB     320        32GB
+ */
+void setup_per_zone_inactive_ratio(void)
+{
+	struct zone *zone;
+
+	for_each_zone(zone) {
+		unsigned int gb, ratio;
+
+		/* Zone size in gigabytes */
+		gb = zone->present_pages >> (30 - PAGE_SHIFT);
+		ratio = int_sqrt(10 * gb);
+		if (!ratio)
+			ratio = 1;
+
+		zone->inactive_ratio = ratio;
+	}
+}
+
 /*
  * Initialise min_free_kbytes.
  *
@@ -4286,6 +4352,7 @@ static int __init init_per_zone_pages_min(void)
 		min_free_kbytes = 65536;
 	setup_per_zone_pages_min();
 	setup_per_zone_lowmem_reserve();
+	setup_per_zone_inactive_ratio();
 	return 0;
 }
 module_init(init_per_zone_pages_min)
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
new file mode 100644
index 000000000000..5d86550701f2
--- /dev/null
+++ b/mm/page_cgroup.c
@@ -0,0 +1,237 @@
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/bootmem.h>
+#include <linux/bit_spinlock.h>
+#include <linux/page_cgroup.h>
+#include <linux/hash.h>
+#include <linux/memory.h>
+
+static void __meminit
+__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
+{
+	pc->flags = 0;
+	pc->mem_cgroup = NULL;
+	pc->page = pfn_to_page(pfn);
+}
+static unsigned long total_usage;
+
+#if !defined(CONFIG_SPARSEMEM)
+
+
+void __init pgdat_page_cgroup_init(struct pglist_data *pgdat)
+{
+	pgdat->node_page_cgroup = NULL;
+}
+
+struct page_cgroup *lookup_page_cgroup(struct page *page)
+{
+	unsigned long pfn = page_to_pfn(page);
+	unsigned long offset;
+	struct page_cgroup *base;
+
+	base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
+	if (unlikely(!base))
+		return NULL;
+
+	offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
+	return base + offset;
+}
+
+static int __init alloc_node_page_cgroup(int nid)
+{
+	struct page_cgroup *base, *pc;
+	unsigned long table_size;
+	unsigned long start_pfn, nr_pages, index;
+
+	start_pfn = NODE_DATA(nid)->node_start_pfn;
+	nr_pages = NODE_DATA(nid)->node_spanned_pages;
+
+	table_size = sizeof(struct page_cgroup) * nr_pages;
+
+	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
+			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+	if (!base)
+		return -ENOMEM;
+	for (index = 0; index < nr_pages; index++) {
+		pc = base + index;
+		__init_page_cgroup(pc, start_pfn + index);
+	}
+	NODE_DATA(nid)->node_page_cgroup = base;
+	total_usage += table_size;
+	return 0;
+}
+
+void __init page_cgroup_init(void)
+{
+
+	int nid, fail;
+
+	for_each_online_node(nid)  {
+		fail = alloc_node_page_cgroup(nid);
+		if (fail)
+			goto fail;
+	}
+	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
+	printk(KERN_INFO "please try cgroup_disable=memory option if you"
+	" don't want\n");
+	return;
+fail:
+	printk(KERN_CRIT "allocation of page_cgroup was failed.\n");
+	printk(KERN_CRIT "please try cgroup_disable=memory boot option\n");
+	panic("Out of memory");
+}
+
+#else /* CONFIG_FLAT_NODE_MEM_MAP */
+
+struct page_cgroup *lookup_page_cgroup(struct page *page)
+{
+	unsigned long pfn = page_to_pfn(page);
+	struct mem_section *section = __pfn_to_section(pfn);
+
+	return section->page_cgroup + pfn;
+}
+
+int __meminit init_section_page_cgroup(unsigned long pfn)
+{
+	struct mem_section *section;
+	struct page_cgroup *base, *pc;
+	unsigned long table_size;
+	int nid, index;
+
+	section = __pfn_to_section(pfn);
+
+	if (section->page_cgroup)
+		return 0;
+
+	nid = page_to_nid(pfn_to_page(pfn));
+
+	table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
+	base = kmalloc_node(table_size, GFP_KERNEL, nid);
+	if (!base)
+		base = vmalloc_node(table_size, nid);
+
+	if (!base) {
+		printk(KERN_ERR "page cgroup allocation failure\n");
+		return -ENOMEM;
+	}
+
+	for (index = 0; index < PAGES_PER_SECTION; index++) {
+		pc = base + index;
+		__init_page_cgroup(pc, pfn + index);
+	}
+
+	section = __pfn_to_section(pfn);
+	section->page_cgroup = base - pfn;
+	total_usage += table_size;
+	return 0;
+}
+#ifdef CONFIG_MEMORY_HOTPLUG
+void __free_page_cgroup(unsigned long pfn)
+{
+	struct mem_section *ms;
+	struct page_cgroup *base;
+
+	ms = __pfn_to_section(pfn);
+	if (!ms || !ms->page_cgroup)
+		return;
+	base = ms->page_cgroup + pfn;
+	ms->page_cgroup = NULL;
+	if (is_vmalloc_addr(base))
+		vfree(base);
+	else
+		kfree(base);
+}
+
+int online_page_cgroup(unsigned long start_pfn,
+			unsigned long nr_pages,
+			int nid)
+{
+	unsigned long start, end, pfn;
+	int fail = 0;
+
+	start = start_pfn & (PAGES_PER_SECTION - 1);
+	end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
+
+	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
+		if (!pfn_present(pfn))
+			continue;
+		fail = init_section_page_cgroup(pfn);
+	}
+	if (!fail)
+		return 0;
+
+	/* rollback */
+	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
+		__free_page_cgroup(pfn);
+
+	return -ENOMEM;
+}
+
+int offline_page_cgroup(unsigned long start_pfn,
+		unsigned long nr_pages, int nid)
+{
+	unsigned long start, end, pfn;
+
+	start = start_pfn & (PAGES_PER_SECTION - 1);
+	end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
+
+	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
+		__free_page_cgroup(pfn);
+	return 0;
+
+}
+
+static int page_cgroup_callback(struct notifier_block *self,
+			       unsigned long action, void *arg)
+{
+	struct memory_notify *mn = arg;
+	int ret = 0;
+	switch (action) {
+	case MEM_GOING_ONLINE:
+		ret = online_page_cgroup(mn->start_pfn,
+				   mn->nr_pages, mn->status_change_nid);
+		break;
+	case MEM_CANCEL_ONLINE:
+	case MEM_OFFLINE:
+		offline_page_cgroup(mn->start_pfn,
+				mn->nr_pages, mn->status_change_nid);
+		break;
+	case MEM_GOING_OFFLINE:
+		break;
+	case MEM_ONLINE:
+	case MEM_CANCEL_OFFLINE:
+		break;
+	}
+	ret = notifier_from_errno(ret);
+	return ret;
+}
+
+#endif
+
+void __init page_cgroup_init(void)
+{
+	unsigned long pfn;
+	int fail = 0;
+
+	for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
+		if (!pfn_present(pfn))
+			continue;
+		fail = init_section_page_cgroup(pfn);
+	}
+	if (fail) {
+		printk(KERN_CRIT "try cgroup_disable=memory boot option\n");
+		panic("Out of memory");
+	} else {
+		hotplug_memory_notifier(page_cgroup_callback, 0);
+	}
+	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
+	printk(KERN_INFO "please try cgroup_disable=memory option if you don't"
+	" want\n");
+}
+
+void __init pgdat_page_cgroup_init(struct pglist_data *pgdat)
+{
+	return;
+}
+
+#endif
diff --git a/mm/readahead.c b/mm/readahead.c
index 6cbd9a72fde2..bec83c15a78f 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -229,7 +229,7 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
  */
 unsigned long max_sane_readahead(unsigned long nr)
 {
-	return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE)
+	return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE)
 		+ node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
 }
 
diff --git a/mm/rmap.c b/mm/rmap.c
index 0383acfcb068..10993942d6c9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -53,9 +53,47 @@
 
 #include <asm/tlbflush.h>
 
-struct kmem_cache *anon_vma_cachep;
+#include "internal.h"
 
-/* This must be called under the mmap_sem. */
+static struct kmem_cache *anon_vma_cachep;
+
+static inline struct anon_vma *anon_vma_alloc(void)
+{
+	return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
+}
+
+static inline void anon_vma_free(struct anon_vma *anon_vma)
+{
+	kmem_cache_free(anon_vma_cachep, anon_vma);
+}
+
+/**
+ * anon_vma_prepare - attach an anon_vma to a memory region
+ * @vma: the memory region in question
+ *
+ * This makes sure the memory mapping described by 'vma' has
+ * an 'anon_vma' attached to it, so that we can associate the
+ * anonymous pages mapped into it with that anon_vma.
+ *
+ * The common case will be that we already have one, but if
+ * if not we either need to find an adjacent mapping that we
+ * can re-use the anon_vma from (very common when the only
+ * reason for splitting a vma has been mprotect()), or we
+ * allocate a new one.
+ *
+ * Anon-vma allocations are very subtle, because we may have
+ * optimistically looked up an anon_vma in page_lock_anon_vma()
+ * and that may actually touch the spinlock even in the newly
+ * allocated vma (it depends on RCU to make sure that the
+ * anon_vma isn't actually destroyed).
+ *
+ * As a result, we need to do proper anon_vma locking even
+ * for the new allocation. At the same time, we do not want
+ * to do any locking for the common case of already having
+ * an anon_vma.
+ *
+ * This must be called with the mmap_sem held for reading.
+ */
 int anon_vma_prepare(struct vm_area_struct *vma)
 {
 	struct anon_vma *anon_vma = vma->anon_vma;
@@ -63,20 +101,17 @@ int anon_vma_prepare(struct vm_area_struct *vma)
 	might_sleep();
 	if (unlikely(!anon_vma)) {
 		struct mm_struct *mm = vma->vm_mm;
-		struct anon_vma *allocated, *locked;
+		struct anon_vma *allocated;
 
 		anon_vma = find_mergeable_anon_vma(vma);
-		if (anon_vma) {
-			allocated = NULL;
-			locked = anon_vma;
-			spin_lock(&locked->lock);
-		} else {
+		allocated = NULL;
+		if (!anon_vma) {
 			anon_vma = anon_vma_alloc();
 			if (unlikely(!anon_vma))
 				return -ENOMEM;
 			allocated = anon_vma;
-			locked = NULL;
 		}
+		spin_lock(&anon_vma->lock);
 
 		/* page_table_lock to protect against threads */
 		spin_lock(&mm->page_table_lock);
@@ -87,8 +122,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
 		}
 		spin_unlock(&mm->page_table_lock);
 
-		if (locked)
-			spin_unlock(&locked->lock);
+		spin_unlock(&anon_vma->lock);
 		if (unlikely(allocated))
 			anon_vma_free(allocated);
 	}
@@ -157,7 +191,7 @@ void __init anon_vma_init(void)
  * Getting a lock on a stable anon_vma from a page off the LRU is
  * tricky: page_lock_anon_vma rely on RCU to guard against the races.
  */
-static struct anon_vma *page_lock_anon_vma(struct page *page)
+struct anon_vma *page_lock_anon_vma(struct page *page)
 {
 	struct anon_vma *anon_vma;
 	unsigned long anon_mapping;
@@ -177,7 +211,7 @@ out:
 	return NULL;
 }
 
-static void page_unlock_anon_vma(struct anon_vma *anon_vma)
+void page_unlock_anon_vma(struct anon_vma *anon_vma)
 {
 	spin_unlock(&anon_vma->lock);
 	rcu_read_unlock();
@@ -268,6 +302,32 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
 	return NULL;
 }
 
+/**
+ * page_mapped_in_vma - check whether a page is really mapped in a VMA
+ * @page: the page to test
+ * @vma: the VMA to test
+ *
+ * Returns 1 if the page is mapped into the page tables of the VMA, 0
+ * if the page is not mapped into the page tables of this VMA.  Only
+ * valid for normal file or anonymous VMAs.
+ */
+static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
+{
+	unsigned long address;
+	pte_t *pte;
+	spinlock_t *ptl;
+
+	address = vma_address(page, vma);
+	if (address == -EFAULT)		/* out of vma range */
+		return 0;
+	pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
+	if (!pte)			/* the page is not in this mm */
+		return 0;
+	pte_unmap_unlock(pte, ptl);
+
+	return 1;
+}
+
 /*
  * Subfunctions of page_referenced: page_referenced_one called
  * repeatedly from either page_referenced_anon or page_referenced_file.
@@ -289,10 +349,17 @@ static int page_referenced_one(struct page *page,
 	if (!pte)
 		goto out;
 
+	/*
+	 * Don't want to elevate referenced for mlocked page that gets this far,
+	 * in order that it progresses to try_to_unmap and is moved to the
+	 * unevictable list.
+	 */
 	if (vma->vm_flags & VM_LOCKED) {
-		referenced++;
 		*mapcount = 1;	/* break early from loop */
-	} else if (ptep_clear_flush_young_notify(vma, address, pte))
+		goto out_unmap;
+	}
+
+	if (ptep_clear_flush_young_notify(vma, address, pte))
 		referenced++;
 
 	/* Pretend the page is referenced if the task has the
@@ -301,6 +368,7 @@ static int page_referenced_one(struct page *page,
 			rwsem_is_locked(&mm->mmap_sem))
 		referenced++;
 
+out_unmap:
 	(*mapcount)--;
 	pte_unmap_unlock(pte, ptl);
 out:
@@ -390,11 +458,6 @@ static int page_referenced_file(struct page *page,
 		 */
 		if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
 			continue;
-		if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
-				  == (VM_LOCKED|VM_MAYSHARE)) {
-			referenced++;
-			break;
-		}
 		referenced += page_referenced_one(page, vma, &mapcount);
 		if (!mapcount)
 			break;
@@ -674,8 +737,8 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
 			page_clear_dirty(page);
 			set_page_dirty(page);
 		}
-
-		mem_cgroup_uncharge_page(page);
+		if (PageAnon(page))
+			mem_cgroup_uncharge_page(page);
 		__dec_zone_page_state(page,
 			PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
 		/*
@@ -717,11 +780,16 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 	 * If it's recently referenced (perhaps page_referenced
 	 * skipped over this mm) then we should reactivate it.
 	 */
-	if (!migration && ((vma->vm_flags & VM_LOCKED) ||
-			(ptep_clear_flush_young_notify(vma, address, pte)))) {
-		ret = SWAP_FAIL;
-		goto out_unmap;
-	}
+	if (!migration) {
+		if (vma->vm_flags & VM_LOCKED) {
+			ret = SWAP_MLOCK;
+			goto out_unmap;
+		}
+		if (ptep_clear_flush_young_notify(vma, address, pte)) {
+			ret = SWAP_FAIL;
+			goto out_unmap;
+		}
+  	}
 
 	/* Nuke the page table entry. */
 	flush_cache_page(vma, address, page_to_pfn(page));
@@ -802,12 +870,17 @@ out:
  * For very sparsely populated VMAs this is a little inefficient - chances are
  * there there won't be many ptes located within the scan cluster.  In this case
  * maybe we could scan further - to the end of the pte page, perhaps.
+ *
+ * Mlocked pages:  check VM_LOCKED under mmap_sem held for read, if we can
+ * acquire it without blocking.  If vma locked, mlock the pages in the cluster,
+ * rather than unmapping them.  If we encounter the "check_page" that vmscan is
+ * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN.
  */
 #define CLUSTER_SIZE	min(32*PAGE_SIZE, PMD_SIZE)
 #define CLUSTER_MASK	(~(CLUSTER_SIZE - 1))
 
-static void try_to_unmap_cluster(unsigned long cursor,
-	unsigned int *mapcount, struct vm_area_struct *vma)
+static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
+		struct vm_area_struct *vma, struct page *check_page)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pgd_t *pgd;
@@ -819,6 +892,8 @@ static void try_to_unmap_cluster(unsigned long cursor,
 	struct page *page;
 	unsigned long address;
 	unsigned long end;
+	int ret = SWAP_AGAIN;
+	int locked_vma = 0;
 
 	address = (vma->vm_start + cursor) & CLUSTER_MASK;
 	end = address + CLUSTER_SIZE;
@@ -829,15 +904,26 @@ static void try_to_unmap_cluster(unsigned long cursor,
 
 	pgd = pgd_offset(mm, address);
 	if (!pgd_present(*pgd))
-		return;
+		return ret;
 
 	pud = pud_offset(pgd, address);
 	if (!pud_present(*pud))
-		return;
+		return ret;
 
 	pmd = pmd_offset(pud, address);
 	if (!pmd_present(*pmd))
-		return;
+		return ret;
+
+	/*
+	 * MLOCK_PAGES => feature is configured.
+	 * if we can acquire the mmap_sem for read, and vma is VM_LOCKED,
+	 * keep the sem while scanning the cluster for mlocking pages.
+	 */
+	if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) {
+		locked_vma = (vma->vm_flags & VM_LOCKED);
+		if (!locked_vma)
+			up_read(&vma->vm_mm->mmap_sem); /* don't need it */
+	}
 
 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 
@@ -850,6 +936,13 @@ static void try_to_unmap_cluster(unsigned long cursor,
 		page = vm_normal_page(vma, address, *pte);
 		BUG_ON(!page || PageAnon(page));
 
+		if (locked_vma) {
+			mlock_vma_page(page);   /* no-op if already mlocked */
+			if (page == check_page)
+				ret = SWAP_MLOCK;
+			continue;	/* don't unmap */
+		}
+
 		if (ptep_clear_flush_young_notify(vma, address, pte))
 			continue;
 
@@ -871,39 +964,104 @@ static void try_to_unmap_cluster(unsigned long cursor,
 		(*mapcount)--;
 	}
 	pte_unmap_unlock(pte - 1, ptl);
+	if (locked_vma)
+		up_read(&vma->vm_mm->mmap_sem);
+	return ret;
 }
 
-static int try_to_unmap_anon(struct page *page, int migration)
+/*
+ * common handling for pages mapped in VM_LOCKED vmas
+ */
+static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
+{
+	int mlocked = 0;
+
+	if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
+		if (vma->vm_flags & VM_LOCKED) {
+			mlock_vma_page(page);
+			mlocked++;	/* really mlocked the page */
+		}
+		up_read(&vma->vm_mm->mmap_sem);
+	}
+	return mlocked;
+}
+
+/**
+ * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
+ * rmap method
+ * @page: the page to unmap/unlock
+ * @unlock:  request for unlock rather than unmap [unlikely]
+ * @migration:  unmapping for migration - ignored if @unlock
+ *
+ * Find all the mappings of a page using the mapping pointer and the vma chains
+ * contained in the anon_vma struct it points to.
+ *
+ * This function is only called from try_to_unmap/try_to_munlock for
+ * anonymous pages.
+ * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
+ * where the page was found will be held for write.  So, we won't recheck
+ * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
+ * 'LOCKED.
+ */
+static int try_to_unmap_anon(struct page *page, int unlock, int migration)
 {
 	struct anon_vma *anon_vma;
 	struct vm_area_struct *vma;
+	unsigned int mlocked = 0;
 	int ret = SWAP_AGAIN;
 
+	if (MLOCK_PAGES && unlikely(unlock))
+		ret = SWAP_SUCCESS;	/* default for try_to_munlock() */
+
 	anon_vma = page_lock_anon_vma(page);
 	if (!anon_vma)
 		return ret;
 
 	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
-		ret = try_to_unmap_one(page, vma, migration);
-		if (ret == SWAP_FAIL || !page_mapped(page))
-			break;
+		if (MLOCK_PAGES && unlikely(unlock)) {
+			if (!((vma->vm_flags & VM_LOCKED) &&
+			      page_mapped_in_vma(page, vma)))
+				continue;  /* must visit all unlocked vmas */
+			ret = SWAP_MLOCK;  /* saw at least one mlocked vma */
+		} else {
+			ret = try_to_unmap_one(page, vma, migration);
+			if (ret == SWAP_FAIL || !page_mapped(page))
+				break;
+		}
+		if (ret == SWAP_MLOCK) {
+			mlocked = try_to_mlock_page(page, vma);
+			if (mlocked)
+				break;	/* stop if actually mlocked page */
+		}
 	}
 
 	page_unlock_anon_vma(anon_vma);
+
+	if (mlocked)
+		ret = SWAP_MLOCK;	/* actually mlocked the page */
+	else if (ret == SWAP_MLOCK)
+		ret = SWAP_AGAIN;	/* saw VM_LOCKED vma */
+
 	return ret;
 }
 
 /**
- * try_to_unmap_file - unmap file page using the object-based rmap method
- * @page: the page to unmap
- * @migration: migration flag
+ * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
+ * @page: the page to unmap/unlock
+ * @unlock:  request for unlock rather than unmap [unlikely]
+ * @migration:  unmapping for migration - ignored if @unlock
  *
  * Find all the mappings of a page using the mapping pointer and the vma chains
  * contained in the address_space struct it points to.
  *
- * This function is only called from try_to_unmap for object-based pages.
+ * This function is only called from try_to_unmap/try_to_munlock for
+ * object-based pages.
+ * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
+ * where the page was found will be held for write.  So, we won't recheck
+ * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
+ * 'LOCKED.
  */
-static int try_to_unmap_file(struct page *page, int migration)
+static int try_to_unmap_file(struct page *page, int unlock, int migration)
 {
 	struct address_space *mapping = page->mapping;
 	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -914,20 +1072,44 @@ static int try_to_unmap_file(struct page *page, int migration)
 	unsigned long max_nl_cursor = 0;
 	unsigned long max_nl_size = 0;
 	unsigned int mapcount;
+	unsigned int mlocked = 0;
+
+	if (MLOCK_PAGES && unlikely(unlock))
+		ret = SWAP_SUCCESS;	/* default for try_to_munlock() */
 
 	spin_lock(&mapping->i_mmap_lock);
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
-		ret = try_to_unmap_one(page, vma, migration);
-		if (ret == SWAP_FAIL || !page_mapped(page))
-			goto out;
+		if (MLOCK_PAGES && unlikely(unlock)) {
+			if (!(vma->vm_flags & VM_LOCKED))
+				continue;	/* must visit all vmas */
+			ret = SWAP_MLOCK;
+		} else {
+			ret = try_to_unmap_one(page, vma, migration);
+			if (ret == SWAP_FAIL || !page_mapped(page))
+				goto out;
+		}
+		if (ret == SWAP_MLOCK) {
+			mlocked = try_to_mlock_page(page, vma);
+			if (mlocked)
+				break;  /* stop if actually mlocked page */
+		}
 	}
 
+	if (mlocked)
+		goto out;
+
 	if (list_empty(&mapping->i_mmap_nonlinear))
 		goto out;
 
 	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 						shared.vm_set.list) {
-		if ((vma->vm_flags & VM_LOCKED) && !migration)
+		if (MLOCK_PAGES && unlikely(unlock)) {
+			if (!(vma->vm_flags & VM_LOCKED))
+				continue;	/* must visit all vmas */
+			ret = SWAP_MLOCK;	/* leave mlocked == 0 */
+			goto out;		/* no need to look further */
+		}
+		if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED))
 			continue;
 		cursor = (unsigned long) vma->vm_private_data;
 		if (cursor > max_nl_cursor)
@@ -937,7 +1119,7 @@ static int try_to_unmap_file(struct page *page, int migration)
 			max_nl_size = cursor;
 	}
 
-	if (max_nl_size == 0) {	/* any nonlinears locked or reserved */
+	if (max_nl_size == 0) {	/* all nonlinears locked or reserved ? */
 		ret = SWAP_FAIL;
 		goto out;
 	}
@@ -961,12 +1143,16 @@ static int try_to_unmap_file(struct page *page, int migration)
 	do {
 		list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
 						shared.vm_set.list) {
-			if ((vma->vm_flags & VM_LOCKED) && !migration)
+			if (!MLOCK_PAGES && !migration &&
+			    (vma->vm_flags & VM_LOCKED))
 				continue;
 			cursor = (unsigned long) vma->vm_private_data;
 			while ( cursor < max_nl_cursor &&
 				cursor < vma->vm_end - vma->vm_start) {
-				try_to_unmap_cluster(cursor, &mapcount, vma);
+				ret = try_to_unmap_cluster(cursor, &mapcount,
+								vma, page);
+				if (ret == SWAP_MLOCK)
+					mlocked = 2;	/* to return below */
 				cursor += CLUSTER_SIZE;
 				vma->vm_private_data = (void *) cursor;
 				if ((int)mapcount <= 0)
@@ -987,6 +1173,10 @@ static int try_to_unmap_file(struct page *page, int migration)
 		vma->vm_private_data = NULL;
 out:
 	spin_unlock(&mapping->i_mmap_lock);
+	if (mlocked)
+		ret = SWAP_MLOCK;	/* actually mlocked the page */
+	else if (ret == SWAP_MLOCK)
+		ret = SWAP_AGAIN;	/* saw VM_LOCKED vma */
 	return ret;
 }
 
@@ -1002,6 +1192,7 @@ out:
  * SWAP_SUCCESS	- we succeeded in removing all mappings
  * SWAP_AGAIN	- we missed a mapping, try again later
  * SWAP_FAIL	- the page is unswappable
+ * SWAP_MLOCK	- page is mlocked.
  */
 int try_to_unmap(struct page *page, int migration)
 {
@@ -1010,12 +1201,36 @@ int try_to_unmap(struct page *page, int migration)
 	BUG_ON(!PageLocked(page));
 
 	if (PageAnon(page))
-		ret = try_to_unmap_anon(page, migration);
+		ret = try_to_unmap_anon(page, 0, migration);
 	else
-		ret = try_to_unmap_file(page, migration);
-
-	if (!page_mapped(page))
+		ret = try_to_unmap_file(page, 0, migration);
+	if (ret != SWAP_MLOCK && !page_mapped(page))
 		ret = SWAP_SUCCESS;
 	return ret;
 }
 
+#ifdef CONFIG_UNEVICTABLE_LRU
+/**
+ * try_to_munlock - try to munlock a page
+ * @page: the page to be munlocked
+ *
+ * Called from munlock code.  Checks all of the VMAs mapping the page
+ * to make sure nobody else has this page mlocked. The page will be
+ * returned with PG_mlocked cleared if no other vmas have it mlocked.
+ *
+ * Return values are:
+ *
+ * SWAP_SUCCESS	- no vma's holding page mlocked.
+ * SWAP_AGAIN	- page mapped in mlocked vma -- couldn't acquire mmap sem
+ * SWAP_MLOCK	- page is now mlocked.
+ */
+int try_to_munlock(struct page *page)
+{
+	VM_BUG_ON(!PageLocked(page) || PageLRU(page));
+
+	if (PageAnon(page))
+		return try_to_unmap_anon(page, 1, 0);
+	else
+		return try_to_unmap_file(page, 1, 0);
+}
+#endif
diff --git a/mm/shmem.c b/mm/shmem.c
index d87958a5f03e..d38d7e61fcd0 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -199,7 +199,7 @@ static struct vm_operations_struct shmem_vm_ops;
 
 static struct backing_dev_info shmem_backing_dev_info  __read_mostly = {
 	.ra_pages	= 0,	/* No readahead */
-	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
+	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
 	.unplug_io_fn	= default_unplug_io_fn,
 };
 
@@ -1367,6 +1367,7 @@ repeat:
 				error = -ENOMEM;
 				goto failed;
 			}
+			SetPageSwapBacked(filepage);
 
 			/* Precharge page while we can wait, compensate after */
 			error = mem_cgroup_cache_charge(filepage, current->mm,
@@ -1476,12 +1477,16 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
 		if (!user_shm_lock(inode->i_size, user))
 			goto out_nomem;
 		info->flags |= VM_LOCKED;
+		mapping_set_unevictable(file->f_mapping);
 	}
 	if (!lock && (info->flags & VM_LOCKED) && user) {
 		user_shm_unlock(inode->i_size, user);
 		info->flags &= ~VM_LOCKED;
+		mapping_clear_unevictable(file->f_mapping);
+		scan_mapping_unevictable_pages(file->f_mapping);
 	}
 	retval = 0;
+
 out_nomem:
 	spin_unlock(&info->lock);
 	return retval;
diff --git a/mm/swap.c b/mm/swap.c
index 9e0cb3118079..2152e48a7b8f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -31,11 +31,12 @@
 #include <linux/backing-dev.h>
 #include <linux/memcontrol.h>
 
+#include "internal.h"
+
 /* How many pages do we try to swap or page in/out together? */
 int page_cluster;
 
-static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs);
-static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs);
+static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs);
 static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
 
 /*
@@ -116,8 +117,9 @@ static void pagevec_move_tail(struct pagevec *pvec)
 			zone = pagezone;
 			spin_lock(&zone->lru_lock);
 		}
-		if (PageLRU(page) && !PageActive(page)) {
-			list_move_tail(&page->lru, &zone->inactive_list);
+		if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
+			int lru = page_is_file_cache(page);
+			list_move_tail(&page->lru, &zone->lru[lru].list);
 			pgmoved++;
 		}
 	}
@@ -136,7 +138,7 @@ static void pagevec_move_tail(struct pagevec *pvec)
 void  rotate_reclaimable_page(struct page *page)
 {
 	if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
-	    PageLRU(page)) {
+	    !PageUnevictable(page) && PageLRU(page)) {
 		struct pagevec *pvec;
 		unsigned long flags;
 
@@ -157,12 +159,19 @@ void activate_page(struct page *page)
 	struct zone *zone = page_zone(page);
 
 	spin_lock_irq(&zone->lru_lock);
-	if (PageLRU(page) && !PageActive(page)) {
-		del_page_from_inactive_list(zone, page);
+	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
+		int file = page_is_file_cache(page);
+		int lru = LRU_BASE + file;
+		del_page_from_lru_list(zone, page, lru);
+
 		SetPageActive(page);
-		add_page_to_active_list(zone, page);
+		lru += LRU_ACTIVE;
+		add_page_to_lru_list(zone, page, lru);
 		__count_vm_event(PGACTIVATE);
-		mem_cgroup_move_lists(page, true);
+		mem_cgroup_move_lists(page, lru);
+
+		zone->recent_rotated[!!file]++;
+		zone->recent_scanned[!!file]++;
 	}
 	spin_unlock_irq(&zone->lru_lock);
 }
@@ -176,7 +185,8 @@ void activate_page(struct page *page)
  */
 void mark_page_accessed(struct page *page)
 {
-	if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
+	if (!PageActive(page) && !PageUnevictable(page) &&
+			PageReferenced(page) && PageLRU(page)) {
 		activate_page(page);
 		ClearPageReferenced(page);
 	} else if (!PageReferenced(page)) {
@@ -186,28 +196,73 @@ void mark_page_accessed(struct page *page)
 
 EXPORT_SYMBOL(mark_page_accessed);
 
-/**
- * lru_cache_add: add a page to the page lists
- * @page: the page to add
- */
-void lru_cache_add(struct page *page)
+void __lru_cache_add(struct page *page, enum lru_list lru)
 {
-	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
+	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];
 
 	page_cache_get(page);
 	if (!pagevec_add(pvec, page))
-		__pagevec_lru_add(pvec);
+		____pagevec_lru_add(pvec, lru);
 	put_cpu_var(lru_add_pvecs);
 }
 
-void lru_cache_add_active(struct page *page)
+/**
+ * lru_cache_add_lru - add a page to a page list
+ * @page: the page to be added to the LRU.
+ * @lru: the LRU list to which the page is added.
+ */
+void lru_cache_add_lru(struct page *page, enum lru_list lru)
 {
-	struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs);
+	if (PageActive(page)) {
+		VM_BUG_ON(PageUnevictable(page));
+		ClearPageActive(page);
+	} else if (PageUnevictable(page)) {
+		VM_BUG_ON(PageActive(page));
+		ClearPageUnevictable(page);
+	}
 
-	page_cache_get(page);
-	if (!pagevec_add(pvec, page))
-		__pagevec_lru_add_active(pvec);
-	put_cpu_var(lru_add_active_pvecs);
+	VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page));
+	__lru_cache_add(page, lru);
+}
+
+/**
+ * add_page_to_unevictable_list - add a page to the unevictable list
+ * @page:  the page to be added to the unevictable list
+ *
+ * Add page directly to its zone's unevictable list.  To avoid races with
+ * tasks that might be making the page evictable, through eg. munlock,
+ * munmap or exit, while it's not on the lru, we want to add the page
+ * while it's locked or otherwise "invisible" to other tasks.  This is
+ * difficult to do when using the pagevec cache, so bypass that.
+ */
+void add_page_to_unevictable_list(struct page *page)
+{
+	struct zone *zone = page_zone(page);
+
+	spin_lock_irq(&zone->lru_lock);
+	SetPageUnevictable(page);
+	SetPageLRU(page);
+	add_page_to_lru_list(zone, page, LRU_UNEVICTABLE);
+	spin_unlock_irq(&zone->lru_lock);
+}
+
+/**
+ * lru_cache_add_active_or_unevictable
+ * @page:  the page to be added to LRU
+ * @vma:   vma in which page is mapped for determining reclaimability
+ *
+ * place @page on active or unevictable LRU list, depending on
+ * page_evictable().  Note that if the page is not evictable,
+ * it goes directly back onto it's zone's unevictable list.  It does
+ * NOT use a per cpu pagevec.
+ */
+void lru_cache_add_active_or_unevictable(struct page *page,
+					struct vm_area_struct *vma)
+{
+	if (page_evictable(page, vma))
+		lru_cache_add_lru(page, LRU_ACTIVE + page_is_file_cache(page));
+	else
+		add_page_to_unevictable_list(page);
 }
 
 /*
@@ -217,15 +272,15 @@ void lru_cache_add_active(struct page *page)
  */
 static void drain_cpu_pagevecs(int cpu)
 {
+	struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu);
 	struct pagevec *pvec;
+	int lru;
 
-	pvec = &per_cpu(lru_add_pvecs, cpu);
-	if (pagevec_count(pvec))
-		__pagevec_lru_add(pvec);
-
-	pvec = &per_cpu(lru_add_active_pvecs, cpu);
-	if (pagevec_count(pvec))
-		__pagevec_lru_add_active(pvec);
+	for_each_lru(lru) {
+		pvec = &pvecs[lru - LRU_BASE];
+		if (pagevec_count(pvec))
+			____pagevec_lru_add(pvec, lru);
+	}
 
 	pvec = &per_cpu(lru_rotate_pvecs, cpu);
 	if (pagevec_count(pvec)) {
@@ -244,7 +299,7 @@ void lru_add_drain(void)
 	put_cpu();
 }
 
-#ifdef CONFIG_NUMA
+#if defined(CONFIG_NUMA) || defined(CONFIG_UNEVICTABLE_LRU)
 static void lru_add_drain_per_cpu(struct work_struct *dummy)
 {
 	lru_add_drain();
@@ -308,6 +363,7 @@ void release_pages(struct page **pages, int nr, int cold)
 
 		if (PageLRU(page)) {
 			struct zone *pagezone = page_zone(page);
+
 			if (pagezone != zone) {
 				if (zone)
 					spin_unlock_irqrestore(&zone->lru_lock,
@@ -380,10 +436,11 @@ void __pagevec_release_nonlru(struct pagevec *pvec)
  * Add the passed pages to the LRU, then drop the caller's refcount
  * on them.  Reinitialises the caller's pagevec.
  */
-void __pagevec_lru_add(struct pagevec *pvec)
+void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
 {
 	int i;
 	struct zone *zone = NULL;
+	VM_BUG_ON(is_unevictable_lru(lru));
 
 	for (i = 0; i < pagevec_count(pvec); i++) {
 		struct page *page = pvec->pages[i];
@@ -395,9 +452,13 @@ void __pagevec_lru_add(struct pagevec *pvec)
 			zone = pagezone;
 			spin_lock_irq(&zone->lru_lock);
 		}
+		VM_BUG_ON(PageActive(page));
+		VM_BUG_ON(PageUnevictable(page));
 		VM_BUG_ON(PageLRU(page));
 		SetPageLRU(page);
-		add_page_to_inactive_list(zone, page);
+		if (is_active_lru(lru))
+			SetPageActive(page);
+		add_page_to_lru_list(zone, page, lru);
 	}
 	if (zone)
 		spin_unlock_irq(&zone->lru_lock);
@@ -405,48 +466,45 @@ void __pagevec_lru_add(struct pagevec *pvec)
 	pagevec_reinit(pvec);
 }
 
-EXPORT_SYMBOL(__pagevec_lru_add);
+EXPORT_SYMBOL(____pagevec_lru_add);
 
-void __pagevec_lru_add_active(struct pagevec *pvec)
+/*
+ * Try to drop buffers from the pages in a pagevec
+ */
+void pagevec_strip(struct pagevec *pvec)
 {
 	int i;
-	struct zone *zone = NULL;
 
 	for (i = 0; i < pagevec_count(pvec); i++) {
 		struct page *page = pvec->pages[i];
-		struct zone *pagezone = page_zone(page);
 
-		if (pagezone != zone) {
-			if (zone)
-				spin_unlock_irq(&zone->lru_lock);
-			zone = pagezone;
-			spin_lock_irq(&zone->lru_lock);
+		if (PagePrivate(page) && trylock_page(page)) {
+			if (PagePrivate(page))
+				try_to_release_page(page, 0);
+			unlock_page(page);
 		}
-		VM_BUG_ON(PageLRU(page));
-		SetPageLRU(page);
-		VM_BUG_ON(PageActive(page));
-		SetPageActive(page);
-		add_page_to_active_list(zone, page);
 	}
-	if (zone)
-		spin_unlock_irq(&zone->lru_lock);
-	release_pages(pvec->pages, pvec->nr, pvec->cold);
-	pagevec_reinit(pvec);
 }
 
-/*
- * Try to drop buffers from the pages in a pagevec
+/**
+ * pagevec_swap_free - try to free swap space from the pages in a pagevec
+ * @pvec: pagevec with swapcache pages to free the swap space of
+ *
+ * The caller needs to hold an extra reference to each page and
+ * not hold the page lock on the pages.  This function uses a
+ * trylock on the page lock so it may not always free the swap
+ * space associated with a page.
  */
-void pagevec_strip(struct pagevec *pvec)
+void pagevec_swap_free(struct pagevec *pvec)
 {
 	int i;
 
 	for (i = 0; i < pagevec_count(pvec); i++) {
 		struct page *page = pvec->pages[i];
 
-		if (PagePrivate(page) && trylock_page(page)) {
-			if (PagePrivate(page))
-				try_to_release_page(page, 0);
+		if (PageSwapCache(page) && trylock_page(page)) {
+			if (PageSwapCache(page))
+				remove_exclusive_swap_page_ref(page);
 			unlock_page(page);
 		}
 	}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 797c3831cbec..3353c9029cef 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -33,7 +33,7 @@ static const struct address_space_operations swap_aops = {
 };
 
 static struct backing_dev_info swap_backing_dev_info = {
-	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
+	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
 	.unplug_io_fn	= swap_unplug_io_fn,
 };
 
@@ -75,6 +75,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
 	BUG_ON(!PageLocked(page));
 	BUG_ON(PageSwapCache(page));
 	BUG_ON(PagePrivate(page));
+	BUG_ON(!PageSwapBacked(page));
 	error = radix_tree_preload(gfp_mask);
 	if (!error) {
 		page_cache_get(page);
@@ -302,17 +303,19 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		 * re-using the just freed swap entry for an existing page.
 		 * May fail (-ENOMEM) if radix-tree node allocation failed.
 		 */
-		set_page_locked(new_page);
+		__set_page_locked(new_page);
+		SetPageSwapBacked(new_page);
 		err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
 		if (likely(!err)) {
 			/*
 			 * Initiate read into locked page and return.
 			 */
-			lru_cache_add_active(new_page);
+			lru_cache_add_anon(new_page);
 			swap_readpage(NULL, new_page);
 			return new_page;
 		}
-		clear_page_locked(new_page);
+		ClearPageSwapBacked(new_page);
+		__clear_page_locked(new_page);
 		swap_free(entry);
 	} while (err != -ENOMEM);
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1e330f2998fa..90cb67a5417c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -344,7 +344,7 @@ int can_share_swap_page(struct page *page)
  * Work out if there are any other processes sharing this
  * swap cache page. Free it if you can. Return success.
  */
-int remove_exclusive_swap_page(struct page *page)
+static int remove_exclusive_swap_page_count(struct page *page, int count)
 {
 	int retval;
 	struct swap_info_struct * p;
@@ -357,7 +357,7 @@ int remove_exclusive_swap_page(struct page *page)
 		return 0;
 	if (PageWriteback(page))
 		return 0;
-	if (page_count(page) != 2) /* 2: us + cache */
+	if (page_count(page) != count) /* us + cache + ptes */
 		return 0;
 
 	entry.val = page_private(page);
@@ -370,7 +370,7 @@ int remove_exclusive_swap_page(struct page *page)
 	if (p->swap_map[swp_offset(entry)] == 1) {
 		/* Recheck the page count with the swapcache lock held.. */
 		spin_lock_irq(&swapper_space.tree_lock);
-		if ((page_count(page) == 2) && !PageWriteback(page)) {
+		if ((page_count(page) == count) && !PageWriteback(page)) {
 			__delete_from_swap_cache(page);
 			SetPageDirty(page);
 			retval = 1;
@@ -388,6 +388,25 @@ int remove_exclusive_swap_page(struct page *page)
 }
 
 /*
+ * Most of the time the page should have two references: one for the
+ * process and one for the swap cache.
+ */
+int remove_exclusive_swap_page(struct page *page)
+{
+	return remove_exclusive_swap_page_count(page, 2);
+}
+
+/*
+ * The pageout code holds an extra reference to the page.  That raises
+ * the reference count to test for to 2 for a page that is only in the
+ * swap cache plus 1 for each process that maps the page.
+ */
+int remove_exclusive_swap_page_ref(struct page *page)
+{
+	return remove_exclusive_swap_page_count(page, 2 + page_mapcount(page));
+}
+
+/*
  * Free the swap entry like above, but also try to
  * free the page cache entry if it is the last user.
  */
@@ -403,7 +422,7 @@ void free_swap_and_cache(swp_entry_t entry)
 	if (p) {
 		if (swap_entry_free(p, swp_offset(entry)) == 1) {
 			page = find_get_page(&swapper_space, entry.val);
-			if (page && unlikely(!trylock_page(page))) {
+			if (page && !trylock_page(page)) {
 				page_cache_release(page);
 				page = NULL;
 			}
diff --git a/mm/truncate.c b/mm/truncate.c
index e83e4b114ef1..1229211104f8 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -18,6 +18,7 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/buffer_head.h>	/* grr. try_to_release_page,
 				   do_invalidatepage */
+#include "internal.h"
 
 
 /**
@@ -103,6 +104,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
 
 	cancel_dirty_page(page, PAGE_CACHE_SIZE);
 
+	clear_page_mlock(page);
 	remove_from_page_cache(page);
 	ClearPageMappedToDisk(page);
 	page_cache_release(page);	/* pagecache ref */
@@ -127,6 +129,7 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
 	if (PagePrivate(page) && !try_to_release_page(page, 0))
 		return 0;
 
+	clear_page_mlock(page);
 	ret = remove_mapping(mapping, page);
 
 	return ret;
@@ -352,6 +355,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
 	if (PageDirty(page))
 		goto failed;
 
+	clear_page_mlock(page);
 	BUG_ON(PagePrivate(page));
 	__remove_from_page_cache(page);
 	spin_unlock_irq(&mapping->tree_lock);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index bba06c41fc59..712ae47af0bf 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -8,6 +8,7 @@
  *  Numa awareness, Christoph Lameter, SGI, June 2005
  */
 
+#include <linux/vmalloc.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/highmem.h>
@@ -18,16 +19,17 @@
 #include <linux/debugobjects.h>
 #include <linux/vmalloc.h>
 #include <linux/kallsyms.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/radix-tree.h>
+#include <linux/rcupdate.h>
 
+#include <asm/atomic.h>
 #include <asm/uaccess.h>
 #include <asm/tlbflush.h>
 
 
-DEFINE_RWLOCK(vmlist_lock);
-struct vm_struct *vmlist;
-
-static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
-			    int node, void *caller);
+/*** Page table manipulation functions ***/
 
 static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
 {
@@ -40,8 +42,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 }
 
-static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr,
-						unsigned long end)
+static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -55,8 +56,7 @@ static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr,
 	} while (pmd++, addr = next, addr != end);
 }
 
-static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr,
-						unsigned long end)
+static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -70,12 +70,10 @@ static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr,
 	} while (pud++, addr = next, addr != end);
 }
 
-void unmap_kernel_range(unsigned long addr, unsigned long size)
+static void vunmap_page_range(unsigned long addr, unsigned long end)
 {
 	pgd_t *pgd;
 	unsigned long next;
-	unsigned long start = addr;
-	unsigned long end = addr + size;
 
 	BUG_ON(addr >= end);
 	pgd = pgd_offset_k(addr);
@@ -86,35 +84,36 @@ void unmap_kernel_range(unsigned long addr, unsigned long size)
 			continue;
 		vunmap_pud_range(pgd, addr, next);
 	} while (pgd++, addr = next, addr != end);
-	flush_tlb_kernel_range(start, end);
-}
-
-static void unmap_vm_area(struct vm_struct *area)
-{
-	unmap_kernel_range((unsigned long)area->addr, area->size);
 }
 
 static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
-			unsigned long end, pgprot_t prot, struct page ***pages)
+		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
 {
 	pte_t *pte;
 
+	/*
+	 * nr is a running index into the array which helps higher level
+	 * callers keep track of where we're up to.
+	 */
+
 	pte = pte_alloc_kernel(pmd, addr);
 	if (!pte)
 		return -ENOMEM;
 	do {
-		struct page *page = **pages;
-		WARN_ON(!pte_none(*pte));
-		if (!page)
+		struct page *page = pages[*nr];
+
+		if (WARN_ON(!pte_none(*pte)))
+			return -EBUSY;
+		if (WARN_ON(!page))
 			return -ENOMEM;
 		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
-		(*pages)++;
+		(*nr)++;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	return 0;
 }
 
-static inline int vmap_pmd_range(pud_t *pud, unsigned long addr,
-			unsigned long end, pgprot_t prot, struct page ***pages)
+static int vmap_pmd_range(pud_t *pud, unsigned long addr,
+		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -124,14 +123,14 @@ static inline int vmap_pmd_range(pud_t *pud, unsigned long addr,
 		return -ENOMEM;
 	do {
 		next = pmd_addr_end(addr, end);
-		if (vmap_pte_range(pmd, addr, next, prot, pages))
+		if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
 			return -ENOMEM;
 	} while (pmd++, addr = next, addr != end);
 	return 0;
 }
 
-static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr,
-			unsigned long end, pgprot_t prot, struct page ***pages)
+static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
+		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -141,44 +140,49 @@ static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr,
 		return -ENOMEM;
 	do {
 		next = pud_addr_end(addr, end);
-		if (vmap_pmd_range(pud, addr, next, prot, pages))
+		if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
 			return -ENOMEM;
 	} while (pud++, addr = next, addr != end);
 	return 0;
 }
 
-int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
+/*
+ * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
+ * will have pfns corresponding to the "pages" array.
+ *
+ * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
+ */
+static int vmap_page_range(unsigned long addr, unsigned long end,
+				pgprot_t prot, struct page **pages)
 {
 	pgd_t *pgd;
 	unsigned long next;
-	unsigned long addr = (unsigned long) area->addr;
-	unsigned long end = addr + area->size - PAGE_SIZE;
-	int err;
+	int err = 0;
+	int nr = 0;
 
 	BUG_ON(addr >= end);
 	pgd = pgd_offset_k(addr);
 	do {
 		next = pgd_addr_end(addr, end);
-		err = vmap_pud_range(pgd, addr, next, prot, pages);
+		err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
-	flush_cache_vmap((unsigned long) area->addr, end);
-	return err;
+	flush_cache_vmap(addr, end);
+
+	if (unlikely(err))
+		return err;
+	return nr;
 }
-EXPORT_SYMBOL_GPL(map_vm_area);
 
 /*
- * Map a vmalloc()-space virtual address to the physical page.
+ * Walk a vmap address to the struct page it maps.
  */
 struct page *vmalloc_to_page(const void *vmalloc_addr)
 {
 	unsigned long addr = (unsigned long) vmalloc_addr;
 	struct page *page = NULL;
 	pgd_t *pgd = pgd_offset_k(addr);
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *ptep, pte;
 
 	/*
 	 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
@@ -188,10 +192,12 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
 			!is_module_address(addr));
 
 	if (!pgd_none(*pgd)) {
-		pud = pud_offset(pgd, addr);
+		pud_t *pud = pud_offset(pgd, addr);
 		if (!pud_none(*pud)) {
-			pmd = pmd_offset(pud, addr);
+			pmd_t *pmd = pmd_offset(pud, addr);
 			if (!pmd_none(*pmd)) {
+				pte_t *ptep, pte;
+
 				ptep = pte_offset_map(pmd, addr);
 				pte = *ptep;
 				if (pte_present(pte))
@@ -213,13 +219,751 @@ unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
 }
 EXPORT_SYMBOL(vmalloc_to_pfn);
 
-static struct vm_struct *
-__get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start,
-		unsigned long end, int node, gfp_t gfp_mask, void *caller)
+
+/*** Global kva allocator ***/
+
+#define VM_LAZY_FREE	0x01
+#define VM_LAZY_FREEING	0x02
+#define VM_VM_AREA	0x04
+
+struct vmap_area {
+	unsigned long va_start;
+	unsigned long va_end;
+	unsigned long flags;
+	struct rb_node rb_node;		/* address sorted rbtree */
+	struct list_head list;		/* address sorted list */
+	struct list_head purge_list;	/* "lazy purge" list */
+	void *private;
+	struct rcu_head rcu_head;
+};
+
+static DEFINE_SPINLOCK(vmap_area_lock);
+static struct rb_root vmap_area_root = RB_ROOT;
+static LIST_HEAD(vmap_area_list);
+
+static struct vmap_area *__find_vmap_area(unsigned long addr)
 {
-	struct vm_struct **p, *tmp, *area;
-	unsigned long align = 1;
+	struct rb_node *n = vmap_area_root.rb_node;
+
+	while (n) {
+		struct vmap_area *va;
+
+		va = rb_entry(n, struct vmap_area, rb_node);
+		if (addr < va->va_start)
+			n = n->rb_left;
+		else if (addr > va->va_start)
+			n = n->rb_right;
+		else
+			return va;
+	}
+
+	return NULL;
+}
+
+static void __insert_vmap_area(struct vmap_area *va)
+{
+	struct rb_node **p = &vmap_area_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct rb_node *tmp;
+
+	while (*p) {
+		struct vmap_area *tmp;
+
+		parent = *p;
+		tmp = rb_entry(parent, struct vmap_area, rb_node);
+		if (va->va_start < tmp->va_end)
+			p = &(*p)->rb_left;
+		else if (va->va_end > tmp->va_start)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&va->rb_node, parent, p);
+	rb_insert_color(&va->rb_node, &vmap_area_root);
+
+	/* address-sort this list so it is usable like the vmlist */
+	tmp = rb_prev(&va->rb_node);
+	if (tmp) {
+		struct vmap_area *prev;
+		prev = rb_entry(tmp, struct vmap_area, rb_node);
+		list_add_rcu(&va->list, &prev->list);
+	} else
+		list_add_rcu(&va->list, &vmap_area_list);
+}
+
+static void purge_vmap_area_lazy(void);
+
+/*
+ * Allocate a region of KVA of the specified size and alignment, within the
+ * vstart and vend.
+ */
+static struct vmap_area *alloc_vmap_area(unsigned long size,
+				unsigned long align,
+				unsigned long vstart, unsigned long vend,
+				int node, gfp_t gfp_mask)
+{
+	struct vmap_area *va;
+	struct rb_node *n;
+	unsigned long addr;
+	int purged = 0;
+
+	BUG_ON(size & ~PAGE_MASK);
+
+	addr = ALIGN(vstart, align);
+
+	va = kmalloc_node(sizeof(struct vmap_area),
+			gfp_mask & GFP_RECLAIM_MASK, node);
+	if (unlikely(!va))
+		return ERR_PTR(-ENOMEM);
+
+retry:
+	spin_lock(&vmap_area_lock);
+	/* XXX: could have a last_hole cache */
+	n = vmap_area_root.rb_node;
+	if (n) {
+		struct vmap_area *first = NULL;
+
+		do {
+			struct vmap_area *tmp;
+			tmp = rb_entry(n, struct vmap_area, rb_node);
+			if (tmp->va_end >= addr) {
+				if (!first && tmp->va_start < addr + size)
+					first = tmp;
+				n = n->rb_left;
+			} else {
+				first = tmp;
+				n = n->rb_right;
+			}
+		} while (n);
+
+		if (!first)
+			goto found;
+
+		if (first->va_end < addr) {
+			n = rb_next(&first->rb_node);
+			if (n)
+				first = rb_entry(n, struct vmap_area, rb_node);
+			else
+				goto found;
+		}
+
+		while (addr + size >= first->va_start && addr + size <= vend) {
+			addr = ALIGN(first->va_end + PAGE_SIZE, align);
+
+			n = rb_next(&first->rb_node);
+			if (n)
+				first = rb_entry(n, struct vmap_area, rb_node);
+			else
+				goto found;
+		}
+	}
+found:
+	if (addr + size > vend) {
+		spin_unlock(&vmap_area_lock);
+		if (!purged) {
+			purge_vmap_area_lazy();
+			purged = 1;
+			goto retry;
+		}
+		if (printk_ratelimit())
+			printk(KERN_WARNING "vmap allocation failed: "
+				 "use vmalloc=<size> to increase size.\n");
+		return ERR_PTR(-EBUSY);
+	}
+
+	BUG_ON(addr & (align-1));
+
+	va->va_start = addr;
+	va->va_end = addr + size;
+	va->flags = 0;
+	__insert_vmap_area(va);
+	spin_unlock(&vmap_area_lock);
+
+	return va;
+}
+
+static void rcu_free_va(struct rcu_head *head)
+{
+	struct vmap_area *va = container_of(head, struct vmap_area, rcu_head);
+
+	kfree(va);
+}
+
+static void __free_vmap_area(struct vmap_area *va)
+{
+	BUG_ON(RB_EMPTY_NODE(&va->rb_node));
+	rb_erase(&va->rb_node, &vmap_area_root);
+	RB_CLEAR_NODE(&va->rb_node);
+	list_del_rcu(&va->list);
+
+	call_rcu(&va->rcu_head, rcu_free_va);
+}
+
+/*
+ * Free a region of KVA allocated by alloc_vmap_area
+ */
+static void free_vmap_area(struct vmap_area *va)
+{
+	spin_lock(&vmap_area_lock);
+	__free_vmap_area(va);
+	spin_unlock(&vmap_area_lock);
+}
+
+/*
+ * Clear the pagetable entries of a given vmap_area
+ */
+static void unmap_vmap_area(struct vmap_area *va)
+{
+	vunmap_page_range(va->va_start, va->va_end);
+}
+
+/*
+ * lazy_max_pages is the maximum amount of virtual address space we gather up
+ * before attempting to purge with a TLB flush.
+ *
+ * There is a tradeoff here: a larger number will cover more kernel page tables
+ * and take slightly longer to purge, but it will linearly reduce the number of
+ * global TLB flushes that must be performed. It would seem natural to scale
+ * this number up linearly with the number of CPUs (because vmapping activity
+ * could also scale linearly with the number of CPUs), however it is likely
+ * that in practice, workloads might be constrained in other ways that mean
+ * vmap activity will not scale linearly with CPUs. Also, I want to be
+ * conservative and not introduce a big latency on huge systems, so go with
+ * a less aggressive log scale. It will still be an improvement over the old
+ * code, and it will be simple to change the scale factor if we find that it
+ * becomes a problem on bigger systems.
+ */
+static unsigned long lazy_max_pages(void)
+{
+	unsigned int log;
+
+	log = fls(num_online_cpus());
+
+	return log * (32UL * 1024 * 1024 / PAGE_SIZE);
+}
+
+static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
+
+/*
+ * Purges all lazily-freed vmap areas.
+ *
+ * If sync is 0 then don't purge if there is already a purge in progress.
+ * If force_flush is 1, then flush kernel TLBs between *start and *end even
+ * if we found no lazy vmap areas to unmap (callers can use this to optimise
+ * their own TLB flushing).
+ * Returns with *start = min(*start, lowest purged address)
+ *              *end = max(*end, highest purged address)
+ */
+static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
+					int sync, int force_flush)
+{
+	static DEFINE_SPINLOCK(purge_lock);
+	LIST_HEAD(valist);
+	struct vmap_area *va;
+	int nr = 0;
+
+	/*
+	 * If sync is 0 but force_flush is 1, we'll go sync anyway but callers
+	 * should not expect such behaviour. This just simplifies locking for
+	 * the case that isn't actually used at the moment anyway.
+	 */
+	if (!sync && !force_flush) {
+		if (!spin_trylock(&purge_lock))
+			return;
+	} else
+		spin_lock(&purge_lock);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(va, &vmap_area_list, list) {
+		if (va->flags & VM_LAZY_FREE) {
+			if (va->va_start < *start)
+				*start = va->va_start;
+			if (va->va_end > *end)
+				*end = va->va_end;
+			nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
+			unmap_vmap_area(va);
+			list_add_tail(&va->purge_list, &valist);
+			va->flags |= VM_LAZY_FREEING;
+			va->flags &= ~VM_LAZY_FREE;
+		}
+	}
+	rcu_read_unlock();
+
+	if (nr) {
+		BUG_ON(nr > atomic_read(&vmap_lazy_nr));
+		atomic_sub(nr, &vmap_lazy_nr);
+	}
+
+	if (nr || force_flush)
+		flush_tlb_kernel_range(*start, *end);
+
+	if (nr) {
+		spin_lock(&vmap_area_lock);
+		list_for_each_entry(va, &valist, purge_list)
+			__free_vmap_area(va);
+		spin_unlock(&vmap_area_lock);
+	}
+	spin_unlock(&purge_lock);
+}
+
+/*
+ * Kick off a purge of the outstanding lazy areas.
+ */
+static void purge_vmap_area_lazy(void)
+{
+	unsigned long start = ULONG_MAX, end = 0;
+
+	__purge_vmap_area_lazy(&start, &end, 0, 0);
+}
+
+/*
+ * Free and unmap a vmap area
+ */
+static void free_unmap_vmap_area(struct vmap_area *va)
+{
+	va->flags |= VM_LAZY_FREE;
+	atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
+	if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
+		purge_vmap_area_lazy();
+}
+
+static struct vmap_area *find_vmap_area(unsigned long addr)
+{
+	struct vmap_area *va;
+
+	spin_lock(&vmap_area_lock);
+	va = __find_vmap_area(addr);
+	spin_unlock(&vmap_area_lock);
+
+	return va;
+}
+
+static void free_unmap_vmap_area_addr(unsigned long addr)
+{
+	struct vmap_area *va;
+
+	va = find_vmap_area(addr);
+	BUG_ON(!va);
+	free_unmap_vmap_area(va);
+}
+
+
+/*** Per cpu kva allocator ***/
+
+/*
+ * vmap space is limited especially on 32 bit architectures. Ensure there is
+ * room for at least 16 percpu vmap blocks per CPU.
+ */
+/*
+ * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
+ * to #define VMALLOC_SPACE		(VMALLOC_END-VMALLOC_START). Guess
+ * instead (we just need a rough idea)
+ */
+#if BITS_PER_LONG == 32
+#define VMALLOC_SPACE		(128UL*1024*1024)
+#else
+#define VMALLOC_SPACE		(128UL*1024*1024*1024)
+#endif
+
+#define VMALLOC_PAGES		(VMALLOC_SPACE / PAGE_SIZE)
+#define VMAP_MAX_ALLOC		BITS_PER_LONG	/* 256K with 4K pages */
+#define VMAP_BBMAP_BITS_MAX	1024	/* 4MB with 4K pages */
+#define VMAP_BBMAP_BITS_MIN	(VMAP_MAX_ALLOC*2)
+#define VMAP_MIN(x, y)		((x) < (y) ? (x) : (y)) /* can't use min() */
+#define VMAP_MAX(x, y)		((x) > (y) ? (x) : (y)) /* can't use max() */
+#define VMAP_BBMAP_BITS		VMAP_MIN(VMAP_BBMAP_BITS_MAX,		\
+					VMAP_MAX(VMAP_BBMAP_BITS_MIN,	\
+						VMALLOC_PAGES / NR_CPUS / 16))
+
+#define VMAP_BLOCK_SIZE		(VMAP_BBMAP_BITS * PAGE_SIZE)
+
+struct vmap_block_queue {
+	spinlock_t lock;
+	struct list_head free;
+	struct list_head dirty;
+	unsigned int nr_dirty;
+};
+
+struct vmap_block {
+	spinlock_t lock;
+	struct vmap_area *va;
+	struct vmap_block_queue *vbq;
+	unsigned long free, dirty;
+	DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
+	DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
+	union {
+		struct {
+			struct list_head free_list;
+			struct list_head dirty_list;
+		};
+		struct rcu_head rcu_head;
+	};
+};
+
+/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
+static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
+
+/*
+ * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
+ * in the free path. Could get rid of this if we change the API to return a
+ * "cookie" from alloc, to be passed to free. But no big deal yet.
+ */
+static DEFINE_SPINLOCK(vmap_block_tree_lock);
+static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
+
+/*
+ * We should probably have a fallback mechanism to allocate virtual memory
+ * out of partially filled vmap blocks. However vmap block sizing should be
+ * fairly reasonable according to the vmalloc size, so it shouldn't be a
+ * big problem.
+ */
+
+static unsigned long addr_to_vb_idx(unsigned long addr)
+{
+	addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
+	addr /= VMAP_BLOCK_SIZE;
+	return addr;
+}
+
+static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
+{
+	struct vmap_block_queue *vbq;
+	struct vmap_block *vb;
+	struct vmap_area *va;
+	unsigned long vb_idx;
+	int node, err;
+
+	node = numa_node_id();
+
+	vb = kmalloc_node(sizeof(struct vmap_block),
+			gfp_mask & GFP_RECLAIM_MASK, node);
+	if (unlikely(!vb))
+		return ERR_PTR(-ENOMEM);
+
+	va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
+					VMALLOC_START, VMALLOC_END,
+					node, gfp_mask);
+	if (unlikely(IS_ERR(va))) {
+		kfree(vb);
+		return ERR_PTR(PTR_ERR(va));
+	}
+
+	err = radix_tree_preload(gfp_mask);
+	if (unlikely(err)) {
+		kfree(vb);
+		free_vmap_area(va);
+		return ERR_PTR(err);
+	}
+
+	spin_lock_init(&vb->lock);
+	vb->va = va;
+	vb->free = VMAP_BBMAP_BITS;
+	vb->dirty = 0;
+	bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS);
+	bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
+	INIT_LIST_HEAD(&vb->free_list);
+	INIT_LIST_HEAD(&vb->dirty_list);
+
+	vb_idx = addr_to_vb_idx(va->va_start);
+	spin_lock(&vmap_block_tree_lock);
+	err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
+	spin_unlock(&vmap_block_tree_lock);
+	BUG_ON(err);
+	radix_tree_preload_end();
+
+	vbq = &get_cpu_var(vmap_block_queue);
+	vb->vbq = vbq;
+	spin_lock(&vbq->lock);
+	list_add(&vb->free_list, &vbq->free);
+	spin_unlock(&vbq->lock);
+	put_cpu_var(vmap_cpu_blocks);
+
+	return vb;
+}
+
+static void rcu_free_vb(struct rcu_head *head)
+{
+	struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head);
+
+	kfree(vb);
+}
+
+static void free_vmap_block(struct vmap_block *vb)
+{
+	struct vmap_block *tmp;
+	unsigned long vb_idx;
+
+	spin_lock(&vb->vbq->lock);
+	if (!list_empty(&vb->free_list))
+		list_del(&vb->free_list);
+	if (!list_empty(&vb->dirty_list))
+		list_del(&vb->dirty_list);
+	spin_unlock(&vb->vbq->lock);
+
+	vb_idx = addr_to_vb_idx(vb->va->va_start);
+	spin_lock(&vmap_block_tree_lock);
+	tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
+	spin_unlock(&vmap_block_tree_lock);
+	BUG_ON(tmp != vb);
+
+	free_unmap_vmap_area(vb->va);
+	call_rcu(&vb->rcu_head, rcu_free_vb);
+}
+
+static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
+{
+	struct vmap_block_queue *vbq;
+	struct vmap_block *vb;
+	unsigned long addr = 0;
+	unsigned int order;
+
+	BUG_ON(size & ~PAGE_MASK);
+	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
+	order = get_order(size);
+
+again:
+	rcu_read_lock();
+	vbq = &get_cpu_var(vmap_block_queue);
+	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
+		int i;
+
+		spin_lock(&vb->lock);
+		i = bitmap_find_free_region(vb->alloc_map,
+						VMAP_BBMAP_BITS, order);
+
+		if (i >= 0) {
+			addr = vb->va->va_start + (i << PAGE_SHIFT);
+			BUG_ON(addr_to_vb_idx(addr) !=
+					addr_to_vb_idx(vb->va->va_start));
+			vb->free -= 1UL << order;
+			if (vb->free == 0) {
+				spin_lock(&vbq->lock);
+				list_del_init(&vb->free_list);
+				spin_unlock(&vbq->lock);
+			}
+			spin_unlock(&vb->lock);
+			break;
+		}
+		spin_unlock(&vb->lock);
+	}
+	put_cpu_var(vmap_cpu_blocks);
+	rcu_read_unlock();
+
+	if (!addr) {
+		vb = new_vmap_block(gfp_mask);
+		if (IS_ERR(vb))
+			return vb;
+		goto again;
+	}
+
+	return (void *)addr;
+}
+
+static void vb_free(const void *addr, unsigned long size)
+{
+	unsigned long offset;
+	unsigned long vb_idx;
+	unsigned int order;
+	struct vmap_block *vb;
+
+	BUG_ON(size & ~PAGE_MASK);
+	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
+	order = get_order(size);
+
+	offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
+
+	vb_idx = addr_to_vb_idx((unsigned long)addr);
+	rcu_read_lock();
+	vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
+	rcu_read_unlock();
+	BUG_ON(!vb);
+
+	spin_lock(&vb->lock);
+	bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order);
+	if (!vb->dirty) {
+		spin_lock(&vb->vbq->lock);
+		list_add(&vb->dirty_list, &vb->vbq->dirty);
+		spin_unlock(&vb->vbq->lock);
+	}
+	vb->dirty += 1UL << order;
+	if (vb->dirty == VMAP_BBMAP_BITS) {
+		BUG_ON(vb->free || !list_empty(&vb->free_list));
+		spin_unlock(&vb->lock);
+		free_vmap_block(vb);
+	} else
+		spin_unlock(&vb->lock);
+}
+
+/**
+ * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
+ *
+ * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
+ * to amortize TLB flushing overheads. What this means is that any page you
+ * have now, may, in a former life, have been mapped into kernel virtual
+ * address by the vmap layer and so there might be some CPUs with TLB entries
+ * still referencing that page (additional to the regular 1:1 kernel mapping).
+ *
+ * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
+ * be sure that none of the pages we have control over will have any aliases
+ * from the vmap layer.
+ */
+void vm_unmap_aliases(void)
+{
+	unsigned long start = ULONG_MAX, end = 0;
+	int cpu;
+	int flush = 0;
+
+	for_each_possible_cpu(cpu) {
+		struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
+		struct vmap_block *vb;
+
+		rcu_read_lock();
+		list_for_each_entry_rcu(vb, &vbq->free, free_list) {
+			int i;
+
+			spin_lock(&vb->lock);
+			i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS);
+			while (i < VMAP_BBMAP_BITS) {
+				unsigned long s, e;
+				int j;
+				j = find_next_zero_bit(vb->dirty_map,
+					VMAP_BBMAP_BITS, i);
+
+				s = vb->va->va_start + (i << PAGE_SHIFT);
+				e = vb->va->va_start + (j << PAGE_SHIFT);
+				vunmap_page_range(s, e);
+				flush = 1;
+
+				if (s < start)
+					start = s;
+				if (e > end)
+					end = e;
+
+				i = j;
+				i = find_next_bit(vb->dirty_map,
+							VMAP_BBMAP_BITS, i);
+			}
+			spin_unlock(&vb->lock);
+		}
+		rcu_read_unlock();
+	}
+
+	__purge_vmap_area_lazy(&start, &end, 1, flush);
+}
+EXPORT_SYMBOL_GPL(vm_unmap_aliases);
+
+/**
+ * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
+ * @mem: the pointer returned by vm_map_ram
+ * @count: the count passed to that vm_map_ram call (cannot unmap partial)
+ */
+void vm_unmap_ram(const void *mem, unsigned int count)
+{
+	unsigned long size = count << PAGE_SHIFT;
+	unsigned long addr = (unsigned long)mem;
+
+	BUG_ON(!addr);
+	BUG_ON(addr < VMALLOC_START);
+	BUG_ON(addr > VMALLOC_END);
+	BUG_ON(addr & (PAGE_SIZE-1));
+
+	debug_check_no_locks_freed(mem, size);
+
+	if (likely(count <= VMAP_MAX_ALLOC))
+		vb_free(mem, size);
+	else
+		free_unmap_vmap_area_addr(addr);
+}
+EXPORT_SYMBOL(vm_unmap_ram);
+
+/**
+ * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
+ * @pages: an array of pointers to the pages to be mapped
+ * @count: number of pages
+ * @node: prefer to allocate data structures on this node
+ * @prot: memory protection to use. PAGE_KERNEL for regular RAM
+ * @returns: a pointer to the address that has been mapped, or NULL on failure
+ */
+void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
+{
+	unsigned long size = count << PAGE_SHIFT;
 	unsigned long addr;
+	void *mem;
+
+	if (likely(count <= VMAP_MAX_ALLOC)) {
+		mem = vb_alloc(size, GFP_KERNEL);
+		if (IS_ERR(mem))
+			return NULL;
+		addr = (unsigned long)mem;
+	} else {
+		struct vmap_area *va;
+		va = alloc_vmap_area(size, PAGE_SIZE,
+				VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
+		if (IS_ERR(va))
+			return NULL;
+
+		addr = va->va_start;
+		mem = (void *)addr;
+	}
+	if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
+		vm_unmap_ram(mem, count);
+		return NULL;
+	}
+	return mem;
+}
+EXPORT_SYMBOL(vm_map_ram);
+
+void __init vmalloc_init(void)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct vmap_block_queue *vbq;
+
+		vbq = &per_cpu(vmap_block_queue, i);
+		spin_lock_init(&vbq->lock);
+		INIT_LIST_HEAD(&vbq->free);
+		INIT_LIST_HEAD(&vbq->dirty);
+		vbq->nr_dirty = 0;
+	}
+}
+
+void unmap_kernel_range(unsigned long addr, unsigned long size)
+{
+	unsigned long end = addr + size;
+	vunmap_page_range(addr, end);
+	flush_tlb_kernel_range(addr, end);
+}
+
+int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
+{
+	unsigned long addr = (unsigned long)area->addr;
+	unsigned long end = addr + area->size - PAGE_SIZE;
+	int err;
+
+	err = vmap_page_range(addr, end, prot, *pages);
+	if (err > 0) {
+		*pages += err;
+		err = 0;
+	}
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(map_vm_area);
+
+/*** Old vmalloc interfaces ***/
+DEFINE_RWLOCK(vmlist_lock);
+struct vm_struct *vmlist;
+
+static struct vm_struct *__get_vm_area_node(unsigned long size,
+		unsigned long flags, unsigned long start, unsigned long end,
+		int node, gfp_t gfp_mask, void *caller)
+{
+	static struct vmap_area *va;
+	struct vm_struct *area;
+	struct vm_struct *tmp, **p;
+	unsigned long align = 1;
 
 	BUG_ON(in_interrupt());
 	if (flags & VM_IOREMAP) {
@@ -232,13 +976,12 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start,
 
 		align = 1ul << bit;
 	}
-	addr = ALIGN(start, align);
+
 	size = PAGE_ALIGN(size);
 	if (unlikely(!size))
 		return NULL;
 
 	area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
-
 	if (unlikely(!area))
 		return NULL;
 
@@ -247,48 +990,32 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start,
 	 */
 	size += PAGE_SIZE;
 
-	write_lock(&vmlist_lock);
-	for (p = &vmlist; (tmp = *p) != NULL ;p = &tmp->next) {
-		if ((unsigned long)tmp->addr < addr) {
-			if((unsigned long)tmp->addr + tmp->size >= addr)
-				addr = ALIGN(tmp->size + 
-					     (unsigned long)tmp->addr, align);
-			continue;
-		}
-		if ((size + addr) < addr)
-			goto out;
-		if (size + addr <= (unsigned long)tmp->addr)
-			goto found;
-		addr = ALIGN(tmp->size + (unsigned long)tmp->addr, align);
-		if (addr > end - size)
-			goto out;
+	va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
+	if (IS_ERR(va)) {
+		kfree(area);
+		return NULL;
 	}
-	if ((size + addr) < addr)
-		goto out;
-	if (addr > end - size)
-		goto out;
-
-found:
-	area->next = *p;
-	*p = area;
 
 	area->flags = flags;
-	area->addr = (void *)addr;
+	area->addr = (void *)va->va_start;
 	area->size = size;
 	area->pages = NULL;
 	area->nr_pages = 0;
 	area->phys_addr = 0;
 	area->caller = caller;
+	va->private = area;
+	va->flags |= VM_VM_AREA;
+
+	write_lock(&vmlist_lock);
+	for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
+		if (tmp->addr >= area->addr)
+			break;
+	}
+	area->next = *p;
+	*p = area;
 	write_unlock(&vmlist_lock);
 
 	return area;
-
-out:
-	write_unlock(&vmlist_lock);
-	kfree(area);
-	if (printk_ratelimit())
-		printk(KERN_WARNING "allocation failed: out of vmalloc space - use vmalloc=<size> to increase size.\n");
-	return NULL;
 }
 
 struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
@@ -328,39 +1055,15 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
 				  gfp_mask, __builtin_return_address(0));
 }
 
-/* Caller must hold vmlist_lock */
-static struct vm_struct *__find_vm_area(const void *addr)
+static struct vm_struct *find_vm_area(const void *addr)
 {
-	struct vm_struct *tmp;
+	struct vmap_area *va;
 
-	for (tmp = vmlist; tmp != NULL; tmp = tmp->next) {
-		 if (tmp->addr == addr)
-			break;
-	}
-
-	return tmp;
-}
-
-/* Caller must hold vmlist_lock */
-static struct vm_struct *__remove_vm_area(const void *addr)
-{
-	struct vm_struct **p, *tmp;
+	va = find_vmap_area((unsigned long)addr);
+	if (va && va->flags & VM_VM_AREA)
+		return va->private;
 
-	for (p = &vmlist ; (tmp = *p) != NULL ;p = &tmp->next) {
-		 if (tmp->addr == addr)
-			 goto found;
-	}
 	return NULL;
-
-found:
-	unmap_vm_area(tmp);
-	*p = tmp->next;
-
-	/*
-	 * Remove the guard page.
-	 */
-	tmp->size -= PAGE_SIZE;
-	return tmp;
 }
 
 /**
@@ -373,11 +1076,24 @@ found:
  */
 struct vm_struct *remove_vm_area(const void *addr)
 {
-	struct vm_struct *v;
-	write_lock(&vmlist_lock);
-	v = __remove_vm_area(addr);
-	write_unlock(&vmlist_lock);
-	return v;
+	struct vmap_area *va;
+
+	va = find_vmap_area((unsigned long)addr);
+	if (va && va->flags & VM_VM_AREA) {
+		struct vm_struct *vm = va->private;
+		struct vm_struct *tmp, **p;
+		free_unmap_vmap_area(va);
+		vm->size -= PAGE_SIZE;
+
+		write_lock(&vmlist_lock);
+		for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
+			;
+		*p = tmp->next;
+		write_unlock(&vmlist_lock);
+
+		return vm;
+	}
+	return NULL;
 }
 
 static void __vunmap(const void *addr, int deallocate_pages)
@@ -487,6 +1203,8 @@ void *vmap(struct page **pages, unsigned int count,
 }
 EXPORT_SYMBOL(vmap);
 
+static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
+			    int node, void *caller);
 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 				 pgprot_t prot, int node, void *caller)
 {
@@ -613,10 +1331,8 @@ void *vmalloc_user(unsigned long size)
 
 	ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
 	if (ret) {
-		write_lock(&vmlist_lock);
-		area = __find_vm_area(ret);
+		area = find_vm_area(ret);
 		area->flags |= VM_USERMAP;
-		write_unlock(&vmlist_lock);
 	}
 	return ret;
 }
@@ -696,10 +1412,8 @@ void *vmalloc_32_user(unsigned long size)
 
 	ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL);
 	if (ret) {
-		write_lock(&vmlist_lock);
-		area = __find_vm_area(ret);
+		area = find_vm_area(ret);
 		area->flags |= VM_USERMAP;
-		write_unlock(&vmlist_lock);
 	}
 	return ret;
 }
@@ -800,26 +1514,25 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
 	struct vm_struct *area;
 	unsigned long uaddr = vma->vm_start;
 	unsigned long usize = vma->vm_end - vma->vm_start;
-	int ret;
 
 	if ((PAGE_SIZE-1) & (unsigned long)addr)
 		return -EINVAL;
 
-	read_lock(&vmlist_lock);
-	area = __find_vm_area(addr);
+	area = find_vm_area(addr);
 	if (!area)
-		goto out_einval_locked;
+		return -EINVAL;
 
 	if (!(area->flags & VM_USERMAP))
-		goto out_einval_locked;
+		return -EINVAL;
 
 	if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE)
-		goto out_einval_locked;
-	read_unlock(&vmlist_lock);
+		return -EINVAL;
 
 	addr += pgoff << PAGE_SHIFT;
 	do {
 		struct page *page = vmalloc_to_page(addr);
+		int ret;
+
 		ret = vm_insert_page(vma, uaddr, page);
 		if (ret)
 			return ret;
@@ -832,11 +1545,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
 	/* Prevent "things" like memory migration? VM_flags need a cleanup... */
 	vma->vm_flags |= VM_RESERVED;
 
-	return ret;
-
-out_einval_locked:
-	read_unlock(&vmlist_lock);
-	return -EINVAL;
+	return 0;
 }
 EXPORT_SYMBOL(remap_vmalloc_range);
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1ff1a58e7c10..3b5860294bb6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -39,6 +39,7 @@
 #include <linux/freezer.h>
 #include <linux/memcontrol.h>
 #include <linux/delayacct.h>
+#include <linux/sysctl.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -78,7 +79,7 @@ struct scan_control {
 	unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
 			unsigned long *scanned, int order, int mode,
 			struct zone *z, struct mem_cgroup *mem_cont,
-			int active);
+			int active, int file);
 };
 
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -470,6 +471,85 @@ int remove_mapping(struct address_space *mapping, struct page *page)
 	return 0;
 }
 
+/**
+ * putback_lru_page - put previously isolated page onto appropriate LRU list
+ * @page: page to be put back to appropriate lru list
+ *
+ * Add previously isolated @page to appropriate LRU list.
+ * Page may still be unevictable for other reasons.
+ *
+ * lru_lock must not be held, interrupts must be enabled.
+ */
+#ifdef CONFIG_UNEVICTABLE_LRU
+void putback_lru_page(struct page *page)
+{
+	int lru;
+	int active = !!TestClearPageActive(page);
+	int was_unevictable = PageUnevictable(page);
+
+	VM_BUG_ON(PageLRU(page));
+
+redo:
+	ClearPageUnevictable(page);
+
+	if (page_evictable(page, NULL)) {
+		/*
+		 * For evictable pages, we can use the cache.
+		 * In event of a race, worst case is we end up with an
+		 * unevictable page on [in]active list.
+		 * We know how to handle that.
+		 */
+		lru = active + page_is_file_cache(page);
+		lru_cache_add_lru(page, lru);
+	} else {
+		/*
+		 * Put unevictable pages directly on zone's unevictable
+		 * list.
+		 */
+		lru = LRU_UNEVICTABLE;
+		add_page_to_unevictable_list(page);
+	}
+	mem_cgroup_move_lists(page, lru);
+
+	/*
+	 * page's status can change while we move it among lru. If an evictable
+	 * page is on unevictable list, it never be freed. To avoid that,
+	 * check after we added it to the list, again.
+	 */
+	if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) {
+		if (!isolate_lru_page(page)) {
+			put_page(page);
+			goto redo;
+		}
+		/* This means someone else dropped this page from LRU
+		 * So, it will be freed or putback to LRU again. There is
+		 * nothing to do here.
+		 */
+	}
+
+	if (was_unevictable && lru != LRU_UNEVICTABLE)
+		count_vm_event(UNEVICTABLE_PGRESCUED);
+	else if (!was_unevictable && lru == LRU_UNEVICTABLE)
+		count_vm_event(UNEVICTABLE_PGCULLED);
+
+	put_page(page);		/* drop ref from isolate */
+}
+
+#else /* CONFIG_UNEVICTABLE_LRU */
+
+void putback_lru_page(struct page *page)
+{
+	int lru;
+	VM_BUG_ON(PageLRU(page));
+
+	lru = !!TestClearPageActive(page) + page_is_file_cache(page);
+	lru_cache_add_lru(page, lru);
+	mem_cgroup_move_lists(page, lru);
+	put_page(page);
+}
+#endif /* CONFIG_UNEVICTABLE_LRU */
+
+
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
@@ -503,6 +583,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
 		sc->nr_scanned++;
 
+		if (unlikely(!page_evictable(page, NULL)))
+			goto cull_mlocked;
+
 		if (!sc->may_swap && page_mapped(page))
 			goto keep_locked;
 
@@ -539,9 +622,19 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * Anonymous process memory has backing store?
 		 * Try to allocate it some swap space here.
 		 */
-		if (PageAnon(page) && !PageSwapCache(page))
+		if (PageAnon(page) && !PageSwapCache(page)) {
+			switch (try_to_munlock(page)) {
+			case SWAP_FAIL:		/* shouldn't happen */
+			case SWAP_AGAIN:
+				goto keep_locked;
+			case SWAP_MLOCK:
+				goto cull_mlocked;
+			case SWAP_SUCCESS:
+				; /* fall thru'; add to swap cache */
+			}
 			if (!add_to_swap(page, GFP_ATOMIC))
 				goto activate_locked;
+		}
 #endif /* CONFIG_SWAP */
 
 		mapping = page_mapping(page);
@@ -556,6 +649,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				goto activate_locked;
 			case SWAP_AGAIN:
 				goto keep_locked;
+			case SWAP_MLOCK:
+				goto cull_mlocked;
 			case SWAP_SUCCESS:
 				; /* try to free the page below */
 			}
@@ -602,7 +697,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * possible for a page to have PageDirty set, but it is actually
 		 * clean (all its buffers are clean).  This happens if the
 		 * buffers were written out directly, with submit_bh(). ext3
-		 * will do this, as well as the blockdev mapping. 
+		 * will do this, as well as the blockdev mapping.
 		 * try_to_release_page() will discover that cleanness and will
 		 * drop the buffers and mark the page clean - it can be freed.
 		 *
@@ -637,7 +732,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		if (!mapping || !__remove_mapping(mapping, page))
 			goto keep_locked;
 
-		unlock_page(page);
+		/*
+		 * At this point, we have no other references and there is
+		 * no way to pick any more up (removed from LRU, removed
+		 * from pagecache). Can use non-atomic bitops now (and
+		 * we obviously don't have to worry about waking up a process
+		 * waiting on the page lock, because there are no references.
+		 */
+		__clear_page_locked(page);
 free_it:
 		nr_reclaimed++;
 		if (!pagevec_add(&freed_pvec, page)) {
@@ -646,14 +748,23 @@ free_it:
 		}
 		continue;
 
+cull_mlocked:
+		unlock_page(page);
+		putback_lru_page(page);
+		continue;
+
 activate_locked:
+		/* Not a candidate for swapping, so reclaim swap space. */
+		if (PageSwapCache(page) && vm_swap_full())
+			remove_exclusive_swap_page_ref(page);
+		VM_BUG_ON(PageActive(page));
 		SetPageActive(page);
 		pgactivate++;
 keep_locked:
 		unlock_page(page);
 keep:
 		list_add(&page->lru, &ret_pages);
-		VM_BUG_ON(PageLRU(page));
+		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
 	}
 	list_splice(&ret_pages, page_list);
 	if (pagevec_count(&freed_pvec))
@@ -677,7 +788,7 @@ keep:
  *
  * returns 0 on success, -ve errno on failure.
  */
-int __isolate_lru_page(struct page *page, int mode)
+int __isolate_lru_page(struct page *page, int mode, int file)
 {
 	int ret = -EINVAL;
 
@@ -693,6 +804,17 @@ int __isolate_lru_page(struct page *page, int mode)
 	if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
 		return ret;
 
+	if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file))
+		return ret;
+
+	/*
+	 * When this function is being called for lumpy reclaim, we
+	 * initially look into all LRU pages, active, inactive and
+	 * unevictable; only give shrink_page_list evictable pages.
+	 */
+	if (PageUnevictable(page))
+		return ret;
+
 	ret = -EBUSY;
 	if (likely(get_page_unless_zero(page))) {
 		/*
@@ -723,12 +845,13 @@ int __isolate_lru_page(struct page *page, int mode)
  * @scanned:	The number of pages that were scanned.
  * @order:	The caller's attempted allocation order
  * @mode:	One of the LRU isolation modes
+ * @file:	True [1] if isolating file [!anon] pages
  *
  * returns how many pages were moved onto *@dst.
  */
 static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 		struct list_head *src, struct list_head *dst,
-		unsigned long *scanned, int order, int mode)
+		unsigned long *scanned, int order, int mode, int file)
 {
 	unsigned long nr_taken = 0;
 	unsigned long scan;
@@ -745,7 +868,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 
 		VM_BUG_ON(!PageLRU(page));
 
-		switch (__isolate_lru_page(page, mode)) {
+		switch (__isolate_lru_page(page, mode, file)) {
 		case 0:
 			list_move(&page->lru, dst);
 			nr_taken++;
@@ -788,10 +911,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 				break;
 
 			cursor_page = pfn_to_page(pfn);
+
 			/* Check that we have not crossed a zone boundary. */
 			if (unlikely(page_zone_id(cursor_page) != zone_id))
 				continue;
-			switch (__isolate_lru_page(cursor_page, mode)) {
+			switch (__isolate_lru_page(cursor_page, mode, file)) {
 			case 0:
 				list_move(&cursor_page->lru, dst);
 				nr_taken++;
@@ -802,7 +926,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 				/* else it is being freed elsewhere */
 				list_move(&cursor_page->lru, src);
 			default:
-				break;
+				break;	/* ! on LRU or wrong list */
 			}
 		}
 	}
@@ -816,40 +940,93 @@ static unsigned long isolate_pages_global(unsigned long nr,
 					unsigned long *scanned, int order,
 					int mode, struct zone *z,
 					struct mem_cgroup *mem_cont,
-					int active)
+					int active, int file)
 {
+	int lru = LRU_BASE;
 	if (active)
-		return isolate_lru_pages(nr, &z->active_list, dst,
-						scanned, order, mode);
-	else
-		return isolate_lru_pages(nr, &z->inactive_list, dst,
-						scanned, order, mode);
+		lru += LRU_ACTIVE;
+	if (file)
+		lru += LRU_FILE;
+	return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
+								mode, !!file);
 }
 
 /*
  * clear_active_flags() is a helper for shrink_active_list(), clearing
  * any active bits from the pages in the list.
  */
-static unsigned long clear_active_flags(struct list_head *page_list)
+static unsigned long clear_active_flags(struct list_head *page_list,
+					unsigned int *count)
 {
 	int nr_active = 0;
+	int lru;
 	struct page *page;
 
-	list_for_each_entry(page, page_list, lru)
+	list_for_each_entry(page, page_list, lru) {
+		lru = page_is_file_cache(page);
 		if (PageActive(page)) {
+			lru += LRU_ACTIVE;
 			ClearPageActive(page);
 			nr_active++;
 		}
+		count[lru]++;
+	}
 
 	return nr_active;
 }
 
+/**
+ * isolate_lru_page - tries to isolate a page from its LRU list
+ * @page: page to isolate from its LRU list
+ *
+ * Isolates a @page from an LRU list, clears PageLRU and adjusts the
+ * vmstat statistic corresponding to whatever LRU list the page was on.
+ *
+ * Returns 0 if the page was removed from an LRU list.
+ * Returns -EBUSY if the page was not on an LRU list.
+ *
+ * The returned page will have PageLRU() cleared.  If it was found on
+ * the active list, it will have PageActive set.  If it was found on
+ * the unevictable list, it will have the PageUnevictable bit set. That flag
+ * may need to be cleared by the caller before letting the page go.
+ *
+ * The vmstat statistic corresponding to the list on which the page was
+ * found will be decremented.
+ *
+ * Restrictions:
+ * (1) Must be called with an elevated refcount on the page. This is a
+ *     fundamentnal difference from isolate_lru_pages (which is called
+ *     without a stable reference).
+ * (2) the lru_lock must not be held.
+ * (3) interrupts must be enabled.
+ */
+int isolate_lru_page(struct page *page)
+{
+	int ret = -EBUSY;
+
+	if (PageLRU(page)) {
+		struct zone *zone = page_zone(page);
+
+		spin_lock_irq(&zone->lru_lock);
+		if (PageLRU(page) && get_page_unless_zero(page)) {
+			int lru = page_lru(page);
+			ret = 0;
+			ClearPageLRU(page);
+
+			del_page_from_lru_list(zone, page, lru);
+		}
+		spin_unlock_irq(&zone->lru_lock);
+	}
+	return ret;
+}
+
 /*
  * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
  * of reclaimed pages
  */
 static unsigned long shrink_inactive_list(unsigned long max_scan,
-				struct zone *zone, struct scan_control *sc)
+			struct zone *zone, struct scan_control *sc,
+			int priority, int file)
 {
 	LIST_HEAD(page_list);
 	struct pagevec pvec;
@@ -866,20 +1043,43 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		unsigned long nr_scan;
 		unsigned long nr_freed;
 		unsigned long nr_active;
+		unsigned int count[NR_LRU_LISTS] = { 0, };
+		int mode = ISOLATE_INACTIVE;
+
+		/*
+		 * If we need a large contiguous chunk of memory, or have
+		 * trouble getting a small set of contiguous pages, we
+		 * will reclaim both active and inactive pages.
+		 *
+		 * We use the same threshold as pageout congestion_wait below.
+		 */
+		if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
+			mode = ISOLATE_BOTH;
+		else if (sc->order && priority < DEF_PRIORITY - 2)
+			mode = ISOLATE_BOTH;
 
 		nr_taken = sc->isolate_pages(sc->swap_cluster_max,
-			     &page_list, &nr_scan, sc->order,
-			     (sc->order > PAGE_ALLOC_COSTLY_ORDER)?
-					     ISOLATE_BOTH : ISOLATE_INACTIVE,
-				zone, sc->mem_cgroup, 0);
-		nr_active = clear_active_flags(&page_list);
+			     &page_list, &nr_scan, sc->order, mode,
+				zone, sc->mem_cgroup, 0, file);
+		nr_active = clear_active_flags(&page_list, count);
 		__count_vm_events(PGDEACTIVATE, nr_active);
 
-		__mod_zone_page_state(zone, NR_ACTIVE, -nr_active);
-		__mod_zone_page_state(zone, NR_INACTIVE,
-						-(nr_taken - nr_active));
-		if (scan_global_lru(sc))
+		__mod_zone_page_state(zone, NR_ACTIVE_FILE,
+						-count[LRU_ACTIVE_FILE]);
+		__mod_zone_page_state(zone, NR_INACTIVE_FILE,
+						-count[LRU_INACTIVE_FILE]);
+		__mod_zone_page_state(zone, NR_ACTIVE_ANON,
+						-count[LRU_ACTIVE_ANON]);
+		__mod_zone_page_state(zone, NR_INACTIVE_ANON,
+						-count[LRU_INACTIVE_ANON]);
+
+		if (scan_global_lru(sc)) {
 			zone->pages_scanned += nr_scan;
+			zone->recent_scanned[0] += count[LRU_INACTIVE_ANON];
+			zone->recent_scanned[0] += count[LRU_ACTIVE_ANON];
+			zone->recent_scanned[1] += count[LRU_INACTIVE_FILE];
+			zone->recent_scanned[1] += count[LRU_ACTIVE_FILE];
+		}
 		spin_unlock_irq(&zone->lru_lock);
 
 		nr_scanned += nr_scan;
@@ -899,7 +1099,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 			 * The attempt at page out may have made some
 			 * of the pages active, mark them inactive again.
 			 */
-			nr_active = clear_active_flags(&page_list);
+			nr_active = clear_active_flags(&page_list, count);
 			count_vm_events(PGDEACTIVATE, nr_active);
 
 			nr_freed += shrink_page_list(&page_list, sc,
@@ -924,14 +1124,24 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		 * Put back any unfreeable pages.
 		 */
 		while (!list_empty(&page_list)) {
+			int lru;
 			page = lru_to_page(&page_list);
 			VM_BUG_ON(PageLRU(page));
-			SetPageLRU(page);
 			list_del(&page->lru);
-			if (PageActive(page))
-				add_page_to_active_list(zone, page);
-			else
-				add_page_to_inactive_list(zone, page);
+			if (unlikely(!page_evictable(page, NULL))) {
+				spin_unlock_irq(&zone->lru_lock);
+				putback_lru_page(page);
+				spin_lock_irq(&zone->lru_lock);
+				continue;
+			}
+			SetPageLRU(page);
+			lru = page_lru(page);
+			add_page_to_lru_list(zone, page, lru);
+			mem_cgroup_move_lists(page, lru);
+			if (PageActive(page) && scan_global_lru(sc)) {
+				int file = !!page_is_file_cache(page);
+				zone->recent_rotated[file]++;
+			}
 			if (!pagevec_add(&pvec, page)) {
 				spin_unlock_irq(&zone->lru_lock);
 				__pagevec_release(&pvec);
@@ -962,115 +1172,7 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority)
 
 static inline int zone_is_near_oom(struct zone *zone)
 {
-	return zone->pages_scanned >= (zone_page_state(zone, NR_ACTIVE)
-				+ zone_page_state(zone, NR_INACTIVE))*3;
-}
-
-/*
- * Determine we should try to reclaim mapped pages.
- * This is called only when sc->mem_cgroup is NULL.
- */
-static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone,
-				int priority)
-{
-	long mapped_ratio;
-	long distress;
-	long swap_tendency;
-	long imbalance;
-	int reclaim_mapped = 0;
-	int prev_priority;
-
-	if (scan_global_lru(sc) && zone_is_near_oom(zone))
-		return 1;
-	/*
-	 * `distress' is a measure of how much trouble we're having
-	 * reclaiming pages.  0 -> no problems.  100 -> great trouble.
-	 */
-	if (scan_global_lru(sc))
-		prev_priority = zone->prev_priority;
-	else
-		prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup);
-
-	distress = 100 >> min(prev_priority, priority);
-
-	/*
-	 * The point of this algorithm is to decide when to start
-	 * reclaiming mapped memory instead of just pagecache.  Work out
-	 * how much memory
-	 * is mapped.
-	 */
-	if (scan_global_lru(sc))
-		mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
-				global_page_state(NR_ANON_PAGES)) * 100) /
-					vm_total_pages;
-	else
-		mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup);
-
-	/*
-	 * Now decide how much we really want to unmap some pages.  The
-	 * mapped ratio is downgraded - just because there's a lot of
-	 * mapped memory doesn't necessarily mean that page reclaim
-	 * isn't succeeding.
-	 *
-	 * The distress ratio is important - we don't want to start
-	 * going oom.
-	 *
-	 * A 100% value of vm_swappiness overrides this algorithm
-	 * altogether.
-	 */
-	swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
-
-	/*
-	 * If there's huge imbalance between active and inactive
-	 * (think active 100 times larger than inactive) we should
-	 * become more permissive, or the system will take too much
-	 * cpu before it start swapping during memory pressure.
-	 * Distress is about avoiding early-oom, this is about
-	 * making swappiness graceful despite setting it to low
-	 * values.
-	 *
-	 * Avoid div by zero with nr_inactive+1, and max resulting
-	 * value is vm_total_pages.
-	 */
-	if (scan_global_lru(sc)) {
-		imbalance  = zone_page_state(zone, NR_ACTIVE);
-		imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
-	} else
-		imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup);
-
-	/*
-	 * Reduce the effect of imbalance if swappiness is low,
-	 * this means for a swappiness very low, the imbalance
-	 * must be much higher than 100 for this logic to make
-	 * the difference.
-	 *
-	 * Max temporary value is vm_total_pages*100.
-	 */
-	imbalance *= (vm_swappiness + 1);
-	imbalance /= 100;
-
-	/*
-	 * If not much of the ram is mapped, makes the imbalance
-	 * less relevant, it's high priority we refill the inactive
-	 * list with mapped pages only in presence of high ratio of
-	 * mapped pages.
-	 *
-	 * Max temporary value is vm_total_pages*100.
-	 */
-	imbalance *= mapped_ratio;
-	imbalance /= 100;
-
-	/* apply imbalance feedback to swap_tendency */
-	swap_tendency += imbalance;
-
-	/*
-	 * Now use this metric to decide whether to start moving mapped
-	 * memory onto the inactive list.
-	 */
-	if (swap_tendency >= 100)
-		reclaim_mapped = 1;
-
-	return reclaim_mapped;
+	return zone->pages_scanned >= (zone_lru_pages(zone) * 3);
 }
 
 /*
@@ -1093,53 +1195,71 @@ static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone,
 
 
 static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
-				struct scan_control *sc, int priority)
+			struct scan_control *sc, int priority, int file)
 {
 	unsigned long pgmoved;
 	int pgdeactivate = 0;
 	unsigned long pgscanned;
 	LIST_HEAD(l_hold);	/* The pages which were snipped off */
-	LIST_HEAD(l_inactive);	/* Pages to go onto the inactive_list */
-	LIST_HEAD(l_active);	/* Pages to go onto the active_list */
+	LIST_HEAD(l_inactive);
 	struct page *page;
 	struct pagevec pvec;
-	int reclaim_mapped = 0;
-
-	if (sc->may_swap)
-		reclaim_mapped = calc_reclaim_mapped(sc, zone, priority);
+	enum lru_list lru;
 
 	lru_add_drain();
 	spin_lock_irq(&zone->lru_lock);
 	pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
 					ISOLATE_ACTIVE, zone,
-					sc->mem_cgroup, 1);
+					sc->mem_cgroup, 1, file);
 	/*
 	 * zone->pages_scanned is used for detect zone's oom
 	 * mem_cgroup remembers nr_scan by itself.
 	 */
-	if (scan_global_lru(sc))
+	if (scan_global_lru(sc)) {
 		zone->pages_scanned += pgscanned;
+		zone->recent_scanned[!!file] += pgmoved;
+	}
 
-	__mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
+	if (file)
+		__mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
+	else
+		__mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved);
 	spin_unlock_irq(&zone->lru_lock);
 
+	pgmoved = 0;
 	while (!list_empty(&l_hold)) {
 		cond_resched();
 		page = lru_to_page(&l_hold);
 		list_del(&page->lru);
-		if (page_mapped(page)) {
-			if (!reclaim_mapped ||
-			    (total_swap_pages == 0 && PageAnon(page)) ||
-			    page_referenced(page, 0, sc->mem_cgroup)) {
-				list_add(&page->lru, &l_active);
-				continue;
-			}
+
+		if (unlikely(!page_evictable(page, NULL))) {
+			putback_lru_page(page);
+			continue;
 		}
+
+		/* page_referenced clears PageReferenced */
+		if (page_mapping_inuse(page) &&
+		    page_referenced(page, 0, sc->mem_cgroup))
+			pgmoved++;
+
 		list_add(&page->lru, &l_inactive);
 	}
 
+	/*
+	 * Count referenced pages from currently used mappings as
+	 * rotated, even though they are moved to the inactive list.
+	 * This helps balance scan pressure between file and anonymous
+	 * pages in get_scan_ratio.
+	 */
+	zone->recent_rotated[!!file] += pgmoved;
+
+	/*
+	 * Move the pages to the [file or anon] inactive list.
+	 */
 	pagevec_init(&pvec, 1);
+
 	pgmoved = 0;
+	lru = LRU_BASE + file * LRU_FILE;
 	spin_lock_irq(&zone->lru_lock);
 	while (!list_empty(&l_inactive)) {
 		page = lru_to_page(&l_inactive);
@@ -1149,11 +1269,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		VM_BUG_ON(!PageActive(page));
 		ClearPageActive(page);
 
-		list_move(&page->lru, &zone->inactive_list);
-		mem_cgroup_move_lists(page, false);
+		list_move(&page->lru, &zone->lru[lru].list);
+		mem_cgroup_move_lists(page, lru);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
-			__mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
+			__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
 			spin_unlock_irq(&zone->lru_lock);
 			pgdeactivate += pgmoved;
 			pgmoved = 0;
@@ -1163,104 +1283,189 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 			spin_lock_irq(&zone->lru_lock);
 		}
 	}
-	__mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
+	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
 	pgdeactivate += pgmoved;
 	if (buffer_heads_over_limit) {
 		spin_unlock_irq(&zone->lru_lock);
 		pagevec_strip(&pvec);
 		spin_lock_irq(&zone->lru_lock);
 	}
-
-	pgmoved = 0;
-	while (!list_empty(&l_active)) {
-		page = lru_to_page(&l_active);
-		prefetchw_prev_lru_page(page, &l_active, flags);
-		VM_BUG_ON(PageLRU(page));
-		SetPageLRU(page);
-		VM_BUG_ON(!PageActive(page));
-
-		list_move(&page->lru, &zone->active_list);
-		mem_cgroup_move_lists(page, true);
-		pgmoved++;
-		if (!pagevec_add(&pvec, page)) {
-			__mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
-			pgmoved = 0;
-			spin_unlock_irq(&zone->lru_lock);
-			__pagevec_release(&pvec);
-			spin_lock_irq(&zone->lru_lock);
-		}
-	}
-	__mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
-
 	__count_zone_vm_events(PGREFILL, zone, pgscanned);
 	__count_vm_events(PGDEACTIVATE, pgdeactivate);
 	spin_unlock_irq(&zone->lru_lock);
+	if (vm_swap_full())
+		pagevec_swap_free(&pvec);
 
 	pagevec_release(&pvec);
 }
 
+static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
+	struct zone *zone, struct scan_control *sc, int priority)
+{
+	int file = is_file_lru(lru);
+
+	if (lru == LRU_ACTIVE_FILE) {
+		shrink_active_list(nr_to_scan, zone, sc, priority, file);
+		return 0;
+	}
+
+	if (lru == LRU_ACTIVE_ANON &&
+	    (!scan_global_lru(sc) || inactive_anon_is_low(zone))) {
+		shrink_active_list(nr_to_scan, zone, sc, priority, file);
+		return 0;
+	}
+	return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
+}
+
+/*
+ * Determine how aggressively the anon and file LRU lists should be
+ * scanned.  The relative value of each set of LRU lists is determined
+ * by looking at the fraction of the pages scanned we did rotate back
+ * onto the active list instead of evict.
+ *
+ * percent[0] specifies how much pressure to put on ram/swap backed
+ * memory, while percent[1] determines pressure on the file LRUs.
+ */
+static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
+					unsigned long *percent)
+{
+	unsigned long anon, file, free;
+	unsigned long anon_prio, file_prio;
+	unsigned long ap, fp;
+
+	anon  = zone_page_state(zone, NR_ACTIVE_ANON) +
+		zone_page_state(zone, NR_INACTIVE_ANON);
+	file  = zone_page_state(zone, NR_ACTIVE_FILE) +
+		zone_page_state(zone, NR_INACTIVE_FILE);
+	free  = zone_page_state(zone, NR_FREE_PAGES);
+
+	/* If we have no swap space, do not bother scanning anon pages. */
+	if (nr_swap_pages <= 0) {
+		percent[0] = 0;
+		percent[1] = 100;
+		return;
+	}
+
+	/* If we have very few page cache pages, force-scan anon pages. */
+	if (unlikely(file + free <= zone->pages_high)) {
+		percent[0] = 100;
+		percent[1] = 0;
+		return;
+	}
+
+	/*
+	 * OK, so we have swap space and a fair amount of page cache
+	 * pages.  We use the recently rotated / recently scanned
+	 * ratios to determine how valuable each cache is.
+	 *
+	 * Because workloads change over time (and to avoid overflow)
+	 * we keep these statistics as a floating average, which ends
+	 * up weighing recent references more than old ones.
+	 *
+	 * anon in [0], file in [1]
+	 */
+	if (unlikely(zone->recent_scanned[0] > anon / 4)) {
+		spin_lock_irq(&zone->lru_lock);
+		zone->recent_scanned[0] /= 2;
+		zone->recent_rotated[0] /= 2;
+		spin_unlock_irq(&zone->lru_lock);
+	}
+
+	if (unlikely(zone->recent_scanned[1] > file / 4)) {
+		spin_lock_irq(&zone->lru_lock);
+		zone->recent_scanned[1] /= 2;
+		zone->recent_rotated[1] /= 2;
+		spin_unlock_irq(&zone->lru_lock);
+	}
+
+	/*
+	 * With swappiness at 100, anonymous and file have the same priority.
+	 * This scanning priority is essentially the inverse of IO cost.
+	 */
+	anon_prio = sc->swappiness;
+	file_prio = 200 - sc->swappiness;
+
+	/*
+	 *                  anon       recent_rotated[0]
+	 * %anon = 100 * ----------- / ----------------- * IO cost
+	 *               anon + file      rotate_sum
+	 */
+	ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1);
+	ap /= zone->recent_rotated[0] + 1;
+
+	fp = (file_prio + 1) * (zone->recent_scanned[1] + 1);
+	fp /= zone->recent_rotated[1] + 1;
+
+	/* Normalize to percentages */
+	percent[0] = 100 * ap / (ap + fp + 1);
+	percent[1] = 100 - percent[0];
+}
+
+
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
 static unsigned long shrink_zone(int priority, struct zone *zone,
 				struct scan_control *sc)
 {
-	unsigned long nr_active;
-	unsigned long nr_inactive;
+	unsigned long nr[NR_LRU_LISTS];
 	unsigned long nr_to_scan;
 	unsigned long nr_reclaimed = 0;
+	unsigned long percent[2];	/* anon @ 0; file @ 1 */
+	enum lru_list l;
 
-	if (scan_global_lru(sc)) {
-		/*
-		 * Add one to nr_to_scan just to make sure that the kernel
-		 * will slowly sift through the active list.
-		 */
-		zone->nr_scan_active +=
-			(zone_page_state(zone, NR_ACTIVE) >> priority) + 1;
-		nr_active = zone->nr_scan_active;
-		zone->nr_scan_inactive +=
-			(zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
-		nr_inactive = zone->nr_scan_inactive;
-		if (nr_inactive >= sc->swap_cluster_max)
-			zone->nr_scan_inactive = 0;
-		else
-			nr_inactive = 0;
-
-		if (nr_active >= sc->swap_cluster_max)
-			zone->nr_scan_active = 0;
-		else
-			nr_active = 0;
-	} else {
-		/*
-		 * This reclaim occurs not because zone memory shortage but
-		 * because memory controller hits its limit.
-		 * Then, don't modify zone reclaim related data.
-		 */
-		nr_active = mem_cgroup_calc_reclaim_active(sc->mem_cgroup,
-					zone, priority);
-
-		nr_inactive = mem_cgroup_calc_reclaim_inactive(sc->mem_cgroup,
-					zone, priority);
-	}
+	get_scan_ratio(zone, sc, percent);
 
+	for_each_evictable_lru(l) {
+		if (scan_global_lru(sc)) {
+			int file = is_file_lru(l);
+			int scan;
 
-	while (nr_active || nr_inactive) {
-		if (nr_active) {
-			nr_to_scan = min(nr_active,
-					(unsigned long)sc->swap_cluster_max);
-			nr_active -= nr_to_scan;
-			shrink_active_list(nr_to_scan, zone, sc, priority);
+			scan = zone_page_state(zone, NR_LRU_BASE + l);
+			if (priority) {
+				scan >>= priority;
+				scan = (scan * percent[file]) / 100;
+			}
+			zone->lru[l].nr_scan += scan;
+			nr[l] = zone->lru[l].nr_scan;
+			if (nr[l] >= sc->swap_cluster_max)
+				zone->lru[l].nr_scan = 0;
+			else
+				nr[l] = 0;
+		} else {
+			/*
+			 * This reclaim occurs not because zone memory shortage
+			 * but because memory controller hits its limit.
+			 * Don't modify zone reclaim related data.
+			 */
+			nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone,
+								priority, l);
 		}
+	}
 
-		if (nr_inactive) {
-			nr_to_scan = min(nr_inactive,
+	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
+					nr[LRU_INACTIVE_FILE]) {
+		for_each_evictable_lru(l) {
+			if (nr[l]) {
+				nr_to_scan = min(nr[l],
 					(unsigned long)sc->swap_cluster_max);
-			nr_inactive -= nr_to_scan;
-			nr_reclaimed += shrink_inactive_list(nr_to_scan, zone,
-								sc);
+				nr[l] -= nr_to_scan;
+
+				nr_reclaimed += shrink_list(l, nr_to_scan,
+							zone, sc, priority);
+			}
 		}
 	}
 
+	/*
+	 * Even if we did not try to evict anon pages at all, we want to
+	 * rebalance the anon lru active/inactive ratio.
+	 */
+	if (!scan_global_lru(sc) || inactive_anon_is_low(zone))
+		shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
+	else if (!scan_global_lru(sc))
+		shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
+
 	throttle_vm_writeout(sc->gfp_mask);
 	return nr_reclaimed;
 }
@@ -1321,7 +1526,7 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
 
 	return nr_reclaimed;
 }
- 
+
 /*
  * This is the main entry point to direct page reclaim.
  *
@@ -1364,8 +1569,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 				continue;
 
-			lru_pages += zone_page_state(zone, NR_ACTIVE)
-					+ zone_page_state(zone, NR_INACTIVE);
+			lru_pages += zone_lru_pages(zone);
 		}
 	}
 
@@ -1555,6 +1759,14 @@ loop_again:
 			    priority != DEF_PRIORITY)
 				continue;
 
+			/*
+			 * Do some background aging of the anon list, to give
+			 * pages a chance to be referenced before reclaiming.
+			 */
+			if (inactive_anon_is_low(zone))
+				shrink_active_list(SWAP_CLUSTER_MAX, zone,
+							&sc, priority, 0);
+
 			if (!zone_watermark_ok(zone, order, zone->pages_high,
 					       0, 0)) {
 				end_zone = i;
@@ -1567,8 +1779,7 @@ loop_again:
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
 
-			lru_pages += zone_page_state(zone, NR_ACTIVE)
-					+ zone_page_state(zone, NR_INACTIVE);
+			lru_pages += zone_lru_pages(zone);
 		}
 
 		/*
@@ -1612,8 +1823,7 @@ loop_again:
 			if (zone_is_all_unreclaimable(zone))
 				continue;
 			if (nr_slab == 0 && zone->pages_scanned >=
-				(zone_page_state(zone, NR_ACTIVE)
-				+ zone_page_state(zone, NR_INACTIVE)) * 6)
+						(zone_lru_pages(zone) * 6))
 					zone_set_flag(zone,
 						      ZONE_ALL_UNRECLAIMABLE);
 			/*
@@ -1667,7 +1877,7 @@ out:
 
 /*
  * The background pageout daemon, started as a kernel thread
- * from the init process. 
+ * from the init process.
  *
  * This basically trickles out pages so that we have _some_
  * free memory available even if there is no other activity
@@ -1761,6 +1971,14 @@ void wakeup_kswapd(struct zone *zone, int order)
 	wake_up_interruptible(&pgdat->kswapd_wait);
 }
 
+unsigned long global_lru_pages(void)
+{
+	return global_page_state(NR_ACTIVE_ANON)
+		+ global_page_state(NR_ACTIVE_FILE)
+		+ global_page_state(NR_INACTIVE_ANON)
+		+ global_page_state(NR_INACTIVE_FILE);
+}
+
 #ifdef CONFIG_PM
 /*
  * Helper function for shrink_all_memory().  Tries to reclaim 'nr_pages' pages
@@ -1774,6 +1992,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
 {
 	struct zone *zone;
 	unsigned long nr_to_scan, ret = 0;
+	enum lru_list l;
 
 	for_each_zone(zone) {
 
@@ -1783,38 +2002,31 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
 		if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
 			continue;
 
-		/* For pass = 0 we don't shrink the active list */
-		if (pass > 0) {
-			zone->nr_scan_active +=
-				(zone_page_state(zone, NR_ACTIVE) >> prio) + 1;
-			if (zone->nr_scan_active >= nr_pages || pass > 3) {
-				zone->nr_scan_active = 0;
+		for_each_evictable_lru(l) {
+			/* For pass = 0, we don't shrink the active list */
+			if (pass == 0 &&
+				(l == LRU_ACTIVE || l == LRU_ACTIVE_FILE))
+				continue;
+
+			zone->lru[l].nr_scan +=
+				(zone_page_state(zone, NR_LRU_BASE + l)
+								>> prio) + 1;
+			if (zone->lru[l].nr_scan >= nr_pages || pass > 3) {
+				zone->lru[l].nr_scan = 0;
 				nr_to_scan = min(nr_pages,
-					zone_page_state(zone, NR_ACTIVE));
-				shrink_active_list(nr_to_scan, zone, sc, prio);
+					zone_page_state(zone,
+							NR_LRU_BASE + l));
+				ret += shrink_list(l, nr_to_scan, zone,
+								sc, prio);
+				if (ret >= nr_pages)
+					return ret;
 			}
 		}
-
-		zone->nr_scan_inactive +=
-			(zone_page_state(zone, NR_INACTIVE) >> prio) + 1;
-		if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
-			zone->nr_scan_inactive = 0;
-			nr_to_scan = min(nr_pages,
-				zone_page_state(zone, NR_INACTIVE));
-			ret += shrink_inactive_list(nr_to_scan, zone, sc);
-			if (ret >= nr_pages)
-				return ret;
-		}
 	}
 
 	return ret;
 }
 
-static unsigned long count_lru_pages(void)
-{
-	return global_page_state(NR_ACTIVE) + global_page_state(NR_INACTIVE);
-}
-
 /*
  * Try to free `nr_pages' of memory, system-wide, and return the number of
  * freed pages.
@@ -1840,7 +2052,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 
 	current->reclaim_state = &reclaim_state;
 
-	lru_pages = count_lru_pages();
+	lru_pages = global_lru_pages();
 	nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
 	/* If slab caches are huge, it's better to hit them first */
 	while (nr_slab >= lru_pages) {
@@ -1883,7 +2095,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 
 			reclaim_state.reclaimed_slab = 0;
 			shrink_slab(sc.nr_scanned, sc.gfp_mask,
-					count_lru_pages());
+					global_lru_pages());
 			ret += reclaim_state.reclaimed_slab;
 			if (ret >= nr_pages)
 				goto out;
@@ -1900,7 +2112,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 	if (!ret) {
 		do {
 			reclaim_state.reclaimed_slab = 0;
-			shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages());
+			shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages());
 			ret += reclaim_state.reclaimed_slab;
 		} while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
 	}
@@ -2128,3 +2340,285 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	return ret;
 }
 #endif
+
+#ifdef CONFIG_UNEVICTABLE_LRU
+/*
+ * page_evictable - test whether a page is evictable
+ * @page: the page to test
+ * @vma: the VMA in which the page is or will be mapped, may be NULL
+ *
+ * Test whether page is evictable--i.e., should be placed on active/inactive
+ * lists vs unevictable list.  The vma argument is !NULL when called from the
+ * fault path to determine how to instantate a new page.
+ *
+ * Reasons page might not be evictable:
+ * (1) page's mapping marked unevictable
+ * (2) page is part of an mlocked VMA
+ *
+ */
+int page_evictable(struct page *page, struct vm_area_struct *vma)
+{
+
+	if (mapping_unevictable(page_mapping(page)))
+		return 0;
+
+	if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page)))
+		return 0;
+
+	return 1;
+}
+
+static void show_page_path(struct page *page)
+{
+	char buf[256];
+	if (page_is_file_cache(page)) {
+		struct address_space *mapping = page->mapping;
+		struct dentry *dentry;
+		pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+
+		spin_lock(&mapping->i_mmap_lock);
+		dentry = d_find_alias(mapping->host);
+		printk(KERN_INFO "rescued: %s %lu\n",
+		       dentry_path(dentry, buf, 256), pgoff);
+		spin_unlock(&mapping->i_mmap_lock);
+	} else {
+#if defined(CONFIG_MM_OWNER) && defined(CONFIG_MMU)
+		struct anon_vma *anon_vma;
+		struct vm_area_struct *vma;
+
+		anon_vma = page_lock_anon_vma(page);
+		if (!anon_vma)
+			return;
+
+		list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+			printk(KERN_INFO "rescued: anon %s\n",
+			       vma->vm_mm->owner->comm);
+			break;
+		}
+		page_unlock_anon_vma(anon_vma);
+#endif
+	}
+}
+
+
+/**
+ * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list
+ * @page: page to check evictability and move to appropriate lru list
+ * @zone: zone page is in
+ *
+ * Checks a page for evictability and moves the page to the appropriate
+ * zone lru list.
+ *
+ * Restrictions: zone->lru_lock must be held, page must be on LRU and must
+ * have PageUnevictable set.
+ */
+static void check_move_unevictable_page(struct page *page, struct zone *zone)
+{
+	VM_BUG_ON(PageActive(page));
+
+retry:
+	ClearPageUnevictable(page);
+	if (page_evictable(page, NULL)) {
+		enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page);
+
+		show_page_path(page);
+
+		__dec_zone_state(zone, NR_UNEVICTABLE);
+		list_move(&page->lru, &zone->lru[l].list);
+		__inc_zone_state(zone, NR_INACTIVE_ANON + l);
+		__count_vm_event(UNEVICTABLE_PGRESCUED);
+	} else {
+		/*
+		 * rotate unevictable list
+		 */
+		SetPageUnevictable(page);
+		list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
+		if (page_evictable(page, NULL))
+			goto retry;
+	}
+}
+
+/**
+ * scan_mapping_unevictable_pages - scan an address space for evictable pages
+ * @mapping: struct address_space to scan for evictable pages
+ *
+ * Scan all pages in mapping.  Check unevictable pages for
+ * evictability and move them to the appropriate zone lru list.
+ */
+void scan_mapping_unevictable_pages(struct address_space *mapping)
+{
+	pgoff_t next = 0;
+	pgoff_t end   = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >>
+			 PAGE_CACHE_SHIFT;
+	struct zone *zone;
+	struct pagevec pvec;
+
+	if (mapping->nrpages == 0)
+		return;
+
+	pagevec_init(&pvec, 0);
+	while (next < end &&
+		pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+		int i;
+		int pg_scanned = 0;
+
+		zone = NULL;
+
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+			pgoff_t page_index = page->index;
+			struct zone *pagezone = page_zone(page);
+
+			pg_scanned++;
+			if (page_index > next)
+				next = page_index;
+			next++;
+
+			if (pagezone != zone) {
+				if (zone)
+					spin_unlock_irq(&zone->lru_lock);
+				zone = pagezone;
+				spin_lock_irq(&zone->lru_lock);
+			}
+
+			if (PageLRU(page) && PageUnevictable(page))
+				check_move_unevictable_page(page, zone);
+		}
+		if (zone)
+			spin_unlock_irq(&zone->lru_lock);
+		pagevec_release(&pvec);
+
+		count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned);
+	}
+
+}
+
+/**
+ * scan_zone_unevictable_pages - check unevictable list for evictable pages
+ * @zone - zone of which to scan the unevictable list
+ *
+ * Scan @zone's unevictable LRU lists to check for pages that have become
+ * evictable.  Move those that have to @zone's inactive list where they
+ * become candidates for reclaim, unless shrink_inactive_zone() decides
+ * to reactivate them.  Pages that are still unevictable are rotated
+ * back onto @zone's unevictable list.
+ */
+#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
+void scan_zone_unevictable_pages(struct zone *zone)
+{
+	struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
+	unsigned long scan;
+	unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE);
+
+	while (nr_to_scan > 0) {
+		unsigned long batch_size = min(nr_to_scan,
+						SCAN_UNEVICTABLE_BATCH_SIZE);
+
+		spin_lock_irq(&zone->lru_lock);
+		for (scan = 0;  scan < batch_size; scan++) {
+			struct page *page = lru_to_page(l_unevictable);
+
+			if (!trylock_page(page))
+				continue;
+
+			prefetchw_prev_lru_page(page, l_unevictable, flags);
+
+			if (likely(PageLRU(page) && PageUnevictable(page)))
+				check_move_unevictable_page(page, zone);
+
+			unlock_page(page);
+		}
+		spin_unlock_irq(&zone->lru_lock);
+
+		nr_to_scan -= batch_size;
+	}
+}
+
+
+/**
+ * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages
+ *
+ * A really big hammer:  scan all zones' unevictable LRU lists to check for
+ * pages that have become evictable.  Move those back to the zones'
+ * inactive list where they become candidates for reclaim.
+ * This occurs when, e.g., we have unswappable pages on the unevictable lists,
+ * and we add swap to the system.  As such, it runs in the context of a task
+ * that has possibly/probably made some previously unevictable pages
+ * evictable.
+ */
+void scan_all_zones_unevictable_pages(void)
+{
+	struct zone *zone;
+
+	for_each_zone(zone) {
+		scan_zone_unevictable_pages(zone);
+	}
+}
+
+/*
+ * scan_unevictable_pages [vm] sysctl handler.  On demand re-scan of
+ * all nodes' unevictable lists for evictable pages
+ */
+unsigned long scan_unevictable_pages;
+
+int scan_unevictable_handler(struct ctl_table *table, int write,
+			   struct file *file, void __user *buffer,
+			   size_t *length, loff_t *ppos)
+{
+	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+
+	if (write && *(unsigned long *)table->data)
+		scan_all_zones_unevictable_pages();
+
+	scan_unevictable_pages = 0;
+	return 0;
+}
+
+/*
+ * per node 'scan_unevictable_pages' attribute.  On demand re-scan of
+ * a specified node's per zone unevictable lists for evictable pages.
+ */
+
+static ssize_t read_scan_unevictable_node(struct sys_device *dev,
+					  struct sysdev_attribute *attr,
+					  char *buf)
+{
+	return sprintf(buf, "0\n");	/* always zero; should fit... */
+}
+
+static ssize_t write_scan_unevictable_node(struct sys_device *dev,
+					   struct sysdev_attribute *attr,
+					const char *buf, size_t count)
+{
+	struct zone *node_zones = NODE_DATA(dev->id)->node_zones;
+	struct zone *zone;
+	unsigned long res;
+	unsigned long req = strict_strtoul(buf, 10, &res);
+
+	if (!req)
+		return 1;	/* zero is no-op */
+
+	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
+		if (!populated_zone(zone))
+			continue;
+		scan_zone_unevictable_pages(zone);
+	}
+	return 1;
+}
+
+
+static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
+			read_scan_unevictable_node,
+			write_scan_unevictable_node);
+
+int scan_unevictable_register_node(struct node *node)
+{
+	return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages);
+}
+
+void scan_unevictable_unregister_node(struct node *node)
+{
+	sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
+}
+
+#endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
index d7826af2fb07..9343227c5c60 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -619,8 +619,14 @@ const struct seq_operations pagetypeinfo_op = {
 static const char * const vmstat_text[] = {
 	/* Zoned VM counters */
 	"nr_free_pages",
-	"nr_inactive",
-	"nr_active",
+	"nr_inactive_anon",
+	"nr_active_anon",
+	"nr_inactive_file",
+	"nr_active_file",
+#ifdef CONFIG_UNEVICTABLE_LRU
+	"nr_unevictable",
+	"nr_mlock",
+#endif
 	"nr_anon_pages",
 	"nr_mapped",
 	"nr_file_pages",
@@ -675,6 +681,16 @@ static const char * const vmstat_text[] = {
 	"htlb_buddy_alloc_success",
 	"htlb_buddy_alloc_fail",
 #endif
+#ifdef CONFIG_UNEVICTABLE_LRU
+	"unevictable_pgs_culled",
+	"unevictable_pgs_scanned",
+	"unevictable_pgs_rescued",
+	"unevictable_pgs_mlocked",
+	"unevictable_pgs_munlocked",
+	"unevictable_pgs_cleared",
+	"unevictable_pgs_stranded",
+	"unevictable_pgs_mlockfreed",
+#endif
 #endif
 };
 
@@ -688,7 +704,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   "\n        min      %lu"
 		   "\n        low      %lu"
 		   "\n        high     %lu"
-		   "\n        scanned  %lu (a: %lu i: %lu)"
+		   "\n        scanned  %lu (aa: %lu ia: %lu af: %lu if: %lu)"
 		   "\n        spanned  %lu"
 		   "\n        present  %lu",
 		   zone_page_state(zone, NR_FREE_PAGES),
@@ -696,7 +712,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   zone->pages_low,
 		   zone->pages_high,
 		   zone->pages_scanned,
-		   zone->nr_scan_active, zone->nr_scan_inactive,
+		   zone->lru[LRU_ACTIVE_ANON].nr_scan,
+		   zone->lru[LRU_INACTIVE_ANON].nr_scan,
+		   zone->lru[LRU_ACTIVE_FILE].nr_scan,
+		   zone->lru[LRU_INACTIVE_FILE].nr_scan,
 		   zone->spanned_pages,
 		   zone->present_pages);
 
@@ -733,10 +752,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 	seq_printf(m,
 		   "\n  all_unreclaimable: %u"
 		   "\n  prev_priority:     %i"
-		   "\n  start_pfn:         %lu",
+		   "\n  start_pfn:         %lu"
+		   "\n  inactive_ratio:    %u",
 			   zone_is_all_unreclaimable(zone),
 		   zone->prev_priority,
-		   zone->zone_start_pfn);
+		   zone->zone_start_pfn,
+		   zone->inactive_ratio);
 	seq_putc(m, '\n');
 }
 
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index a4abed5b4c44..fa5cda4e552a 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -719,7 +719,7 @@ static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff *skb,
 		return NF_ACCEPT;
 	}
 	*d = (struct net_device *)in;
-	NF_HOOK(NF_ARP, NF_ARP_FORWARD, skb, (struct net_device *)in,
+	NF_HOOK(NFPROTO_ARP, NF_ARP_FORWARD, skb, (struct net_device *)in,
 		(struct net_device *)out, br_nf_forward_finish);
 
 	return NF_STOLEN;
diff --git a/net/core/dev.c b/net/core/dev.c
index 868ec0ba8b77..b8a4fd0806af 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -924,10 +924,10 @@ int dev_change_name(struct net_device *dev, const char *newname)
 		strlcpy(dev->name, newname, IFNAMSIZ);
 
 rollback:
-	err = device_rename(&dev->dev, dev->name);
-	if (err) {
+	ret = device_rename(&dev->dev, dev->name);
+	if (ret) {
 		memcpy(dev->name, oldname, IFNAMSIZ);
-		return err;
+		return ret;
 	}
 
 	write_lock_bh(&dev_base_lock);
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 11062780bb02..d4ce1224e008 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -259,7 +259,7 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req)
 	fl.fl6_flowlabel = 0;
 	fl.oif = ireq6->iif;
 	fl.fl_ip_dport = inet_rsk(req)->rmt_port;
-	fl.fl_ip_sport = inet_sk(sk)->sport;
+	fl.fl_ip_sport = inet_rsk(req)->loc_port;
 	security_req_classify_flow(req, &fl);
 
 	opt = np->opt;
@@ -558,7 +558,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 		ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr);
 		fl.oif = sk->sk_bound_dev_if;
 		fl.fl_ip_dport = inet_rsk(req)->rmt_port;
-		fl.fl_ip_sport = inet_sk(sk)->sport;
+		fl.fl_ip_sport = inet_rsk(req)->loc_port;
 		security_sk_classify_flow(sk, &fl);
 
 		if (ip6_dst_lookup(sk, &dst, &fl))
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index b2804e2d1b8c..e6bf99e3e41a 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -309,6 +309,7 @@ void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb)
 	struct dccp_request_sock *dreq = dccp_rsk(req);
 
 	inet_rsk(req)->rmt_port	  = dccp_hdr(skb)->dccph_sport;
+	inet_rsk(req)->loc_port	  = dccp_hdr(skb)->dccph_dport;
 	inet_rsk(req)->acked	  = 0;
 	req->rcv_wnd		  = sysctl_dccp_feat_sequence_window;
 	dreq->dreq_timestamp_echo = 0;
diff --git a/net/dccp/output.c b/net/dccp/output.c
index d06945c7d3df..809d803d5006 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -347,7 +347,7 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
 	/* Build and checksum header */
 	dh = dccp_zeroed_hdr(skb, dccp_header_size);
 
-	dh->dccph_sport	= inet_sk(sk)->sport;
+	dh->dccph_sport	= inet_rsk(req)->loc_port;
 	dh->dccph_dport	= inet_rsk(req)->rmt_port;
 	dh->dccph_doff	= (dccp_header_size +
 			   DCCP_SKB_CB(skb)->dccpd_opt_len) / 4;
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index b043eda60b04..1a9dd66511fc 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -663,7 +663,7 @@ out:
 void arp_xmit(struct sk_buff *skb)
 {
 	/* Send it off, maybe filter it using firewalling first.  */
-	NF_HOOK(NF_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit);
+	NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit);
 }
 
 /*
@@ -928,7 +928,7 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
 
 	memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
 
-	return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process);
+	return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, skb, dev, NULL, arp_process);
 
 freeskb:
 	kfree_skb(skb);
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index ffeaffc3fffe..8303e4b406c0 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -742,6 +742,7 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
 			*obj = kmalloc(sizeof(struct snmp_object) + len,
 				       GFP_ATOMIC);
 			if (*obj == NULL) {
+				kfree(p);
 				kfree(id);
 				if (net_ratelimit())
 					printk("OOM in bsalg (%d)\n", __LINE__);
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index ec394cf5a19b..676c80b5b14b 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -204,6 +204,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 
 	req->mss = mss;
 	ireq->rmt_port = th->source;
+	ireq->loc_port = th->dest;
 	ipv6_addr_copy(&ireq6->rmt_addr, &ipv6_hdr(skb)->saddr);
 	ipv6_addr_copy(&ireq6->loc_addr, &ipv6_hdr(skb)->daddr);
 	if (ipv6_opt_accepted(sk, skb) ||
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index e5310c9b84dc..b6b356b7912a 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -476,7 +476,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req)
 	fl.fl6_flowlabel = 0;
 	fl.oif = treq->iif;
 	fl.fl_ip_dport = inet_rsk(req)->rmt_port;
-	fl.fl_ip_sport = inet_sk(sk)->sport;
+	fl.fl_ip_sport = inet_rsk(req)->loc_port;
 	security_req_classify_flow(req, &fl);
 
 	opt = np->opt;
@@ -1309,7 +1309,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		ipv6_addr_copy(&fl.fl6_src, &treq->loc_addr);
 		fl.oif = sk->sk_bound_dev_if;
 		fl.fl_ip_dport = inet_rsk(req)->rmt_port;
-		fl.fl_ip_sport = inet_sk(sk)->sport;
+		fl.fl_ip_sport = inet_rsk(req)->loc_port;
 		security_req_classify_flow(req, &fl);
 
 		if (ip6_dst_lookup(sk, &dst, &fl))
@@ -1865,7 +1865,7 @@ static void get_openreq6(struct seq_file *seq,
 		   i,
 		   src->s6_addr32[0], src->s6_addr32[1],
 		   src->s6_addr32[2], src->s6_addr32[3],
-		   ntohs(inet_sk(sk)->sport),
+		   ntohs(inet_rsk(req)->loc_port),
 		   dest->s6_addr32[0], dest->s6_addr32[1],
 		   dest->s6_addr32[2], dest->s6_addr32[3],
 		   ntohs(inet_rsk(req)->rmt_port),
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 78892cf2b021..25dcef9f2194 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -271,7 +271,6 @@ config NF_CONNTRACK_TFTP
 config NF_CT_NETLINK
 	tristate 'Connection tracking netlink interface'
 	select NETFILTER_NETLINK
-	depends on NF_NAT=n || NF_NAT
 	default m if NETFILTER_ADVANCED=n
 	help
 	  This option enables support for a netlink-based userspace interface
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index 05048e403266..79a698052218 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -25,11 +25,13 @@ menuconfig IP_VS
 if IP_VS
 
 config	IP_VS_IPV6
-	bool "IPv6 support for IPVS (DANGEROUS)"
+	bool "IPv6 support for IPVS"
 	depends on EXPERIMENTAL && (IPV6 = y || IP_VS = IPV6)
 	---help---
 	  Add IPv6 support to IPVS. This is incomplete and might be dangerous.
 
+	  See http://www.mindbasket.com/ipvs for more information.
+
 	  Say N if unsure.
 
 config	IP_VS_DEBUG
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 2e4ad9671e19..a040d46f85d6 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -813,6 +813,7 @@ out:
 	return err;
 }
 
+#ifdef CONFIG_NF_NAT_NEEDED
 static int
 ctnetlink_parse_nat_setup(struct nf_conn *ct,
 			  enum nf_nat_manip_type manip,
@@ -840,6 +841,7 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct,
 
 	return parse_nat_setup(ct, manip, attr);
 }
+#endif
 
 static int
 ctnetlink_change_status(struct nf_conn *ct, struct nlattr *cda[])
diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c
index 2cc1fff49307..f9977b3311f7 100644
--- a/net/netfilter/xt_NFQUEUE.c
+++ b/net/netfilter/xt_NFQUEUE.c
@@ -48,7 +48,7 @@ static struct xt_target nfqueue_tg_reg[] __read_mostly = {
 	},
 	{
 		.name		= "NFQUEUE",
-		.family		= NF_ARP,
+		.family		= NFPROTO_ARP,
 		.target		= nfqueue_tg,
 		.targetsize	= sizeof(struct xt_NFQ_info),
 		.me		= THIS_MODULE,
diff --git a/net/netfilter/xt_iprange.c b/net/netfilter/xt_iprange.c
index 6f62c36948d9..7ac54eab0b00 100644
--- a/net/netfilter/xt_iprange.c
+++ b/net/netfilter/xt_iprange.c
@@ -61,7 +61,7 @@ iprange_mt4(const struct sk_buff *skb, const struct xt_match_param *par)
 	if (info->flags & IPRANGE_SRC) {
 		m  = ntohl(iph->saddr) < ntohl(info->src_min.ip);
 		m |= ntohl(iph->saddr) > ntohl(info->src_max.ip);
-		m ^= info->flags & IPRANGE_SRC_INV;
+		m ^= !!(info->flags & IPRANGE_SRC_INV);
 		if (m) {
 			pr_debug("src IP " NIPQUAD_FMT " NOT in range %s"
 			         NIPQUAD_FMT "-" NIPQUAD_FMT "\n",
@@ -75,7 +75,7 @@ iprange_mt4(const struct sk_buff *skb, const struct xt_match_param *par)
 	if (info->flags & IPRANGE_DST) {
 		m  = ntohl(iph->daddr) < ntohl(info->dst_min.ip);
 		m |= ntohl(iph->daddr) > ntohl(info->dst_max.ip);
-		m ^= info->flags & IPRANGE_DST_INV;
+		m ^= !!(info->flags & IPRANGE_DST_INV);
 		if (m) {
 			pr_debug("dst IP " NIPQUAD_FMT " NOT in range %s"
 			         NIPQUAD_FMT "-" NIPQUAD_FMT "\n",
@@ -114,14 +114,14 @@ iprange_mt6(const struct sk_buff *skb, const struct xt_match_param *par)
 	if (info->flags & IPRANGE_SRC) {
 		m  = iprange_ipv6_sub(&iph->saddr, &info->src_min.in6) < 0;
 		m |= iprange_ipv6_sub(&iph->saddr, &info->src_max.in6) > 0;
-		m ^= info->flags & IPRANGE_SRC_INV;
+		m ^= !!(info->flags & IPRANGE_SRC_INV);
 		if (m)
 			return false;
 	}
 	if (info->flags & IPRANGE_DST) {
 		m  = iprange_ipv6_sub(&iph->daddr, &info->dst_min.in6) < 0;
 		m |= iprange_ipv6_sub(&iph->daddr, &info->dst_max.in6) > 0;
-		m ^= info->flags & IPRANGE_DST_INV;
+		m ^= !!(info->flags & IPRANGE_DST_INV);
 		if (m)
 			return false;
 	}
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 4ebd4ca9a991..280c471bcdf4 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -318,15 +318,15 @@ static bool recent_mt_check(const struct xt_mtchk_param *par)
 	for (i = 0; i < ip_list_hash_size; i++)
 		INIT_LIST_HEAD(&t->iphash[i]);
 #ifdef CONFIG_PROC_FS
-	t->proc = proc_create(t->name, ip_list_perms, recent_proc_dir,
-		  &recent_mt_fops);
+	t->proc = proc_create_data(t->name, ip_list_perms, recent_proc_dir,
+		  &recent_mt_fops, t);
 	if (t->proc == NULL) {
 		kfree(t);
 		goto out;
 	}
 #ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT
-	t->proc_old = proc_create(t->name, ip_list_perms, proc_old_dir,
-		      &recent_old_fops);
+	t->proc_old = proc_create_data(t->name, ip_list_perms, proc_old_dir,
+		      &recent_old_fops, t);
 	if (t->proc_old == NULL) {
 		remove_proc_entry(t->name, proc_old_dir);
 		kfree(t);
@@ -334,11 +334,9 @@ static bool recent_mt_check(const struct xt_mtchk_param *par)
 	}
 	t->proc_old->uid   = ip_list_uid;
 	t->proc_old->gid   = ip_list_gid;
-	t->proc_old->data  = t;
 #endif
 	t->proc->uid       = ip_list_uid;
 	t->proc->gid       = ip_list_gid;
-	t->proc->data      = t;
 #endif
 	spin_lock_bh(&recent_lock);
 	list_add_tail(&t->list, &tables);
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 7b5572d6beb5..93cd30ce6501 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -326,6 +326,7 @@ struct Qdisc_ops noop_qdisc_ops __read_mostly = {
 
 static struct netdev_queue noop_netdev_queue = {
 	.qdisc		=	&noop_qdisc,
+	.qdisc_sleeping	=	&noop_qdisc,
 };
 
 struct Qdisc noop_qdisc = {
@@ -352,6 +353,7 @@ static struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
 static struct Qdisc noqueue_qdisc;
 static struct netdev_queue noqueue_netdev_queue = {
 	.qdisc		=	&noqueue_qdisc,
+	.qdisc_sleeping	=	&noqueue_qdisc,
 };
 
 static struct Qdisc noqueue_qdisc = {
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index 46f23971f7e4..5ba78701adc3 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -1,5 +1,5 @@
 /*
- * dev_cgroup.c - device cgroup subsystem
+ * device_cgroup.c - device cgroup subsystem
  *
  * Copyright 2007 IBM Corp
  */
@@ -10,6 +10,7 @@
 #include <linux/list.h>
 #include <linux/uaccess.h>
 #include <linux/seq_file.h>
+#include <linux/rcupdate.h>
 
 #define ACC_MKNOD 1
 #define ACC_READ  2
@@ -22,18 +23,8 @@
 
 /*
  * whitelist locking rules:
- * cgroup_lock() cannot be taken under dev_cgroup->lock.
- * dev_cgroup->lock can be taken with or without cgroup_lock().
- *
- * modifications always require cgroup_lock
- * modifications to a list which is visible require the
- *   dev_cgroup->lock *and* cgroup_lock()
- * walking the list requires dev_cgroup->lock or cgroup_lock().
- *
- * reasoning: dev_whitelist_copy() needs to kmalloc, so needs
- *   a mutex, which the cgroup_lock() is.  Since modifying
- *   a visible list requires both locks, either lock can be
- *   taken for walking the list.
+ * hold cgroup_lock() for update/read.
+ * hold rcu_read_lock() for read.
  */
 
 struct dev_whitelist_item {
@@ -47,7 +38,6 @@ struct dev_whitelist_item {
 struct dev_cgroup {
 	struct cgroup_subsys_state css;
 	struct list_head whitelist;
-	spinlock_t lock;
 };
 
 static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s)
@@ -84,13 +74,9 @@ static int dev_whitelist_copy(struct list_head *dest, struct list_head *orig)
 	struct dev_whitelist_item *wh, *tmp, *new;
 
 	list_for_each_entry(wh, orig, list) {
-		new = kmalloc(sizeof(*wh), GFP_KERNEL);
+		new = kmemdup(wh, sizeof(*wh), GFP_KERNEL);
 		if (!new)
 			goto free_and_exit;
-		new->major = wh->major;
-		new->minor = wh->minor;
-		new->type = wh->type;
-		new->access = wh->access;
 		list_add_tail(&new->list, dest);
 	}
 
@@ -107,19 +93,16 @@ free_and_exit:
 /* Stupid prototype - don't bother combining existing entries */
 /*
  * called under cgroup_lock()
- * since the list is visible to other tasks, we need the spinlock also
  */
 static int dev_whitelist_add(struct dev_cgroup *dev_cgroup,
 			struct dev_whitelist_item *wh)
 {
 	struct dev_whitelist_item *whcopy, *walk;
 
-	whcopy = kmalloc(sizeof(*whcopy), GFP_KERNEL);
+	whcopy = kmemdup(wh, sizeof(*wh), GFP_KERNEL);
 	if (!whcopy)
 		return -ENOMEM;
 
-	memcpy(whcopy, wh, sizeof(*whcopy));
-	spin_lock(&dev_cgroup->lock);
 	list_for_each_entry(walk, &dev_cgroup->whitelist, list) {
 		if (walk->type != wh->type)
 			continue;
@@ -135,7 +118,6 @@ static int dev_whitelist_add(struct dev_cgroup *dev_cgroup,
 
 	if (whcopy != NULL)
 		list_add_tail_rcu(&whcopy->list, &dev_cgroup->whitelist);
-	spin_unlock(&dev_cgroup->lock);
 	return 0;
 }
 
@@ -149,14 +131,12 @@ static void whitelist_item_free(struct rcu_head *rcu)
 
 /*
  * called under cgroup_lock()
- * since the list is visible to other tasks, we need the spinlock also
  */
 static void dev_whitelist_rm(struct dev_cgroup *dev_cgroup,
 			struct dev_whitelist_item *wh)
 {
 	struct dev_whitelist_item *walk, *tmp;
 
-	spin_lock(&dev_cgroup->lock);
 	list_for_each_entry_safe(walk, tmp, &dev_cgroup->whitelist, list) {
 		if (walk->type == DEV_ALL)
 			goto remove;
@@ -174,7 +154,6 @@ remove:
 			call_rcu(&walk->rcu, whitelist_item_free);
 		}
 	}
-	spin_unlock(&dev_cgroup->lock);
 }
 
 /*
@@ -214,7 +193,6 @@ static struct cgroup_subsys_state *devcgroup_create(struct cgroup_subsys *ss,
 		}
 	}
 
-	spin_lock_init(&dev_cgroup->lock);
 	return &dev_cgroup->css;
 }
 
@@ -330,15 +308,11 @@ static int parent_has_perm(struct dev_cgroup *childcg,
 {
 	struct cgroup *pcg = childcg->css.cgroup->parent;
 	struct dev_cgroup *parent;
-	int ret;
 
 	if (!pcg)
 		return 1;
 	parent = cgroup_to_devcgroup(pcg);
-	spin_lock(&parent->lock);
-	ret = may_access_whitelist(parent, wh);
-	spin_unlock(&parent->lock);
-	return ret;
+	return may_access_whitelist(parent, wh);
 }
 
 /*
@@ -357,17 +331,14 @@ static int parent_has_perm(struct dev_cgroup *childcg,
 static int devcgroup_update_access(struct dev_cgroup *devcgroup,
 				   int filetype, const char *buffer)
 {
-	struct dev_cgroup *cur_devcgroup;
 	const char *b;
 	char *endp;
-	int retval = 0, count;
+	int count;
 	struct dev_whitelist_item wh;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	cur_devcgroup = task_devcgroup(current);
-
 	memset(&wh, 0, sizeof(wh));
 	b = buffer;
 
@@ -437,7 +408,6 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
 	}
 
 handle:
-	retval = 0;
 	switch (filetype) {
 	case DEVCG_ALLOW:
 		if (!parent_has_perm(devcgroup, &wh))
diff --git a/sound/core/pcm_misc.c b/sound/core/pcm_misc.c
index 89b7f549bebd..ea2bf82c9373 100644
--- a/sound/core/pcm_misc.c
+++ b/sound/core/pcm_misc.c
@@ -319,6 +319,7 @@ EXPORT_SYMBOL(snd_pcm_format_physical_width);
 /**
  * snd_pcm_format_size - return the byte size of samples on the given format
  * @format: the format to check
+ * @samples: sampling rate
  *
  * Returns the byte size of the given samples for the format, or a
  * negative error code if unknown format.
diff --git a/sound/drivers/dummy.c b/sound/drivers/dummy.c
index e5e749f3e0ef..73be7e14a603 100644
--- a/sound/drivers/dummy.c
+++ b/sound/drivers/dummy.c
@@ -51,7 +51,7 @@ static int emu10k1_playback_constraints(struct snd_pcm_runtime *runtime)
 	if (err < 0)
 		return err;
 	err = snd_pcm_hw_constraint_minmax(runtime, SNDRV_PCM_HW_PARAM_BUFFER_BYTES, 256, UINT_MAX);
-	if (err) < 0)
+	if (err < 0)
 		return err;
 	return 0;
 }
diff --git a/sound/pci/ca0106/ca0106_main.c b/sound/pci/ca0106/ca0106_main.c
index a7d89662acf6..88fbf285d2b7 100644
--- a/sound/pci/ca0106/ca0106_main.c
+++ b/sound/pci/ca0106/ca0106_main.c
@@ -759,7 +759,6 @@ static int snd_ca0106_pcm_prepare_playback(struct snd_pcm_substream *substream)
 			       SPCS_CHANNELNUM_LEFT | SPCS_SOURCENUM_UNSPEC |
 			       SPCS_GENERATIONSTATUS | 0x00001200 |
 			       0x00000000 | SPCS_EMPHASIS_NONE | SPCS_COPYRIGHT );
-	}
 #endif
 
 	return 0;
diff --git a/sound/ppc/snd_ps3.c b/sound/ppc/snd_ps3.c
index 20d0e328288a..8f9e3859c37c 100644
--- a/sound/ppc/snd_ps3.c
+++ b/sound/ppc/snd_ps3.c
@@ -666,6 +666,7 @@ static int snd_ps3_init_avsetting(struct snd_ps3_card_info *card)
 	card->avs.avs_audio_width = PS3AV_CMD_AUDIO_WORD_BITS_16;
 	card->avs.avs_audio_format = PS3AV_CMD_AUDIO_FORMAT_PCM;
 	card->avs.avs_audio_source = PS3AV_CMD_AUDIO_SOURCE_SERIAL;
+	memcpy(card->avs.avs_cs_info, ps3av_mode_cs_info, 8);
 
 	ret = snd_ps3_change_avsetting(card);
 
@@ -685,6 +686,7 @@ static int snd_ps3_set_avsetting(struct snd_pcm_substream *substream)
 {
 	struct snd_ps3_card_info *card = snd_pcm_substream_chip(substream);
 	struct snd_ps3_avsetting_info avs;
+	int ret;
 
 	avs = card->avs;
 
@@ -729,19 +731,92 @@ static int snd_ps3_set_avsetting(struct snd_pcm_substream *substream)
 		return 1;
 	}
 
-	if ((card->avs.avs_audio_width != avs.avs_audio_width) ||
-	    (card->avs.avs_audio_rate != avs.avs_audio_rate)) {
-		card->avs = avs;
-		snd_ps3_change_avsetting(card);
+	memcpy(avs.avs_cs_info, ps3av_mode_cs_info, 8);
 
+	if (memcmp(&card->avs, &avs, sizeof(avs))) {
 		pr_debug("%s: after freq=%d width=%d\n", __func__,
 			 card->avs.avs_audio_rate, card->avs.avs_audio_width);
 
-		return 0;
+		card->avs = avs;
+		snd_ps3_change_avsetting(card);
+		ret = 0;
 	} else
+		ret = 1;
+
+	/* check CS non-audio bit and mute accordingly */
+	if (avs.avs_cs_info[0] & 0x02)
+		ps3av_audio_mute_analog(1); /* mute if non-audio */
+	else
+		ps3av_audio_mute_analog(0);
+
+	return ret;
+}
+
+/*
+ * SPDIF status bits controls
+ */
+static int snd_ps3_spdif_mask_info(struct snd_kcontrol *kcontrol,
+				   struct snd_ctl_elem_info *uinfo)
+{
+	uinfo->type = SNDRV_CTL_ELEM_TYPE_IEC958;
+	uinfo->count = 1;
+	return 0;
+}
+
+/* FIXME: ps3av_set_audio_mode() assumes only consumer mode */
+static int snd_ps3_spdif_cmask_get(struct snd_kcontrol *kcontrol,
+				   struct snd_ctl_elem_value *ucontrol)
+{
+	memset(ucontrol->value.iec958.status, 0xff, 8);
+	return 0;
+}
+
+static int snd_ps3_spdif_pmask_get(struct snd_kcontrol *kcontrol,
+				   struct snd_ctl_elem_value *ucontrol)
+{
+	return 0;
+}
+
+static int snd_ps3_spdif_default_get(struct snd_kcontrol *kcontrol,
+				     struct snd_ctl_elem_value *ucontrol)
+{
+	memcpy(ucontrol->value.iec958.status, ps3av_mode_cs_info, 8);
+	return 0;
+}
+
+static int snd_ps3_spdif_default_put(struct snd_kcontrol *kcontrol,
+				     struct snd_ctl_elem_value *ucontrol)
+{
+	if (memcmp(ps3av_mode_cs_info, ucontrol->value.iec958.status, 8)) {
+		memcpy(ps3av_mode_cs_info, ucontrol->value.iec958.status, 8);
 		return 1;
+	}
+	return 0;
 }
 
+static struct snd_kcontrol_new spdif_ctls[] = {
+	{
+		.access = SNDRV_CTL_ELEM_ACCESS_READ,
+		.iface = SNDRV_CTL_ELEM_IFACE_PCM,
+		.name = SNDRV_CTL_NAME_IEC958("",PLAYBACK,CON_MASK),
+		.info = snd_ps3_spdif_mask_info,
+		.get = snd_ps3_spdif_cmask_get,
+	},
+	{
+		.access = SNDRV_CTL_ELEM_ACCESS_READ,
+		.iface = SNDRV_CTL_ELEM_IFACE_PCM,
+		.name = SNDRV_CTL_NAME_IEC958("",PLAYBACK,PRO_MASK),
+		.info = snd_ps3_spdif_mask_info,
+		.get = snd_ps3_spdif_pmask_get,
+	},
+	{
+		.iface = SNDRV_CTL_ELEM_IFACE_PCM,
+		.name = SNDRV_CTL_NAME_IEC958("",PLAYBACK,DEFAULT),
+		.info = snd_ps3_spdif_mask_info,
+		.get = snd_ps3_spdif_default_get,
+		.put = snd_ps3_spdif_default_put,
+	},
+};
 
 
 static int snd_ps3_map_mmio(void)
@@ -842,7 +917,7 @@ static void snd_ps3_audio_set_base_addr(uint64_t ioaddr_start)
 
 static int __init snd_ps3_driver_probe(struct ps3_system_bus_device *dev)
 {
-	int ret;
+	int i, ret;
 	u64 lpar_addr, lpar_size;
 
 	BUG_ON(!firmware_has_feature(FW_FEATURE_PS3_LV1));
@@ -903,6 +978,15 @@ static int __init snd_ps3_driver_probe(struct ps3_system_bus_device *dev)
 	strcpy(the_card.card->driver, "PS3");
 	strcpy(the_card.card->shortname, "PS3");
 	strcpy(the_card.card->longname, "PS3 sound");
+
+	/* create control elements */
+	for (i = 0; i < ARRAY_SIZE(spdif_ctls); i++) {
+		ret = snd_ctl_add(the_card.card,
+				  snd_ctl_new1(&spdif_ctls[i], &the_card));
+		if (ret < 0)
+			goto clean_card;
+	}
+
 	/* create PCM devices instance */
 	/* NOTE:this driver works assuming pcm:substream = 1:1 */
 	ret = snd_pcm_new(the_card.card,
diff --git a/sound/ppc/snd_ps3.h b/sound/ppc/snd_ps3.h
index 4b7e6fbbe500..326fb29e82d8 100644
--- a/sound/ppc/snd_ps3.h
+++ b/sound/ppc/snd_ps3.h
@@ -51,6 +51,7 @@ struct snd_ps3_avsetting_info {
 	uint32_t avs_audio_width;
 	uint32_t avs_audio_format; /* fixed */
 	uint32_t avs_audio_source; /* fixed */
+	unsigned char avs_cs_info[8];
 };
 /*
  * PS3 audio 'card' instance
diff --git a/sound/soc/omap/omap-mcbsp.c b/sound/soc/omap/omap-mcbsp.c
index 0a063a98a661..853b33ae3435 100644
--- a/sound/soc/omap/omap-mcbsp.c
+++ b/sound/soc/omap/omap-mcbsp.c
@@ -43,6 +43,7 @@
 struct omap_mcbsp_data {
 	unsigned int			bus_id;
 	struct omap_mcbsp_reg_cfg	regs;
+	unsigned int			fmt;
 	/*
 	 * Flags indicating is the bus already activated and configured by
 	 * another substream
@@ -200,6 +201,7 @@ static int omap_mcbsp_dai_hw_params(struct snd_pcm_substream *substream,
 	struct omap_mcbsp_data *mcbsp_data = to_mcbsp(cpu_dai->private_data);
 	struct omap_mcbsp_reg_cfg *regs = &mcbsp_data->regs;
 	int dma, bus_id = mcbsp_data->bus_id, id = cpu_dai->id;
+	int wlen;
 	unsigned long port;
 
 	if (cpu_class_is_omap1()) {
@@ -244,19 +246,29 @@ static int omap_mcbsp_dai_hw_params(struct snd_pcm_substream *substream,
 	switch (params_format(params)) {
 	case SNDRV_PCM_FORMAT_S16_LE:
 		/* Set word lengths */
+		wlen = 16;
 		regs->rcr2	|= RWDLEN2(OMAP_MCBSP_WORD_16);
 		regs->rcr1	|= RWDLEN1(OMAP_MCBSP_WORD_16);
 		regs->xcr2	|= XWDLEN2(OMAP_MCBSP_WORD_16);
 		regs->xcr1	|= XWDLEN1(OMAP_MCBSP_WORD_16);
-		/* Set FS period and length in terms of bit clock periods */
-		regs->srgr2	|= FPER(16 * 2 - 1);
-		regs->srgr1	|= FWID(16 - 1);
 		break;
 	default:
 		/* Unsupported PCM format */
 		return -EINVAL;
 	}
 
+	/* Set FS period and length in terms of bit clock periods */
+	switch (mcbsp_data->fmt & SND_SOC_DAIFMT_FORMAT_MASK) {
+	case SND_SOC_DAIFMT_I2S:
+		regs->srgr2	|= FPER(wlen * 2 - 1);
+		regs->srgr1	|= FWID(wlen - 1);
+		break;
+	case SND_SOC_DAIFMT_DSP_A:
+		regs->srgr2	|= FPER(wlen * 2 - 1);
+		regs->srgr1	|= FWID(0);
+		break;
+	}
+
 	omap_mcbsp_config(bus_id, &mcbsp_data->regs);
 	mcbsp_data->configured = 1;
 
@@ -272,10 +284,12 @@ static int omap_mcbsp_dai_set_dai_fmt(struct snd_soc_dai *cpu_dai,
 {
 	struct omap_mcbsp_data *mcbsp_data = to_mcbsp(cpu_dai->private_data);
 	struct omap_mcbsp_reg_cfg *regs = &mcbsp_data->regs;
+	unsigned int temp_fmt = fmt;
 
 	if (mcbsp_data->configured)
 		return 0;
 
+	mcbsp_data->fmt = fmt;
 	memset(regs, 0, sizeof(*regs));
 	/* Generic McBSP register settings */
 	regs->spcr2	|= XINTM(3) | FREE;
@@ -293,6 +307,8 @@ static int omap_mcbsp_dai_set_dai_fmt(struct snd_soc_dai *cpu_dai,
 		/* 0-bit data delay */
 		regs->rcr2      |= RDATDLY(0);
 		regs->xcr2      |= XDATDLY(0);
+		/* Invert bit clock and FS polarity configuration for DSP_A */
+		temp_fmt ^= SND_SOC_DAIFMT_IB_IF;
 		break;
 	default:
 		/* Unsupported data format */
@@ -316,7 +332,7 @@ static int omap_mcbsp_dai_set_dai_fmt(struct snd_soc_dai *cpu_dai,
 	}
 
 	/* Set bit clock (CLKX/CLKR) and FS polarities */
-	switch (fmt & SND_SOC_DAIFMT_INV_MASK) {
+	switch (temp_fmt & SND_SOC_DAIFMT_INV_MASK) {
 	case SND_SOC_DAIFMT_NB_NF:
 		/*
 		 * Normal BCLK + FS.